95 files changed, 5286 insertions, 10051 deletions
diff --git a/README.md b/README.md
deleted file mode 100644
index deeb78ae21..0000000000
--- a/README.md
+++ /dev/null
@@ -1,30 +0,0 @@
-![GODOT](/logo.png)
-
-### The Engine
-
-Godot is a fully featured, open source, MIT licensed, game engine. It focuses on having great tools, and a visual oriented workflow that can export to PC, Mobile and Web platforms with no hassle.
-The editor, language and APIs are feature rich, yet simple to learn, allowing you to become productive in a matter of hours.
-
-### About
-
-Godot has been developed by Juan Linietsky and Ariel Manzur for several years, and was born as an in-house engine, used to publish several work-for-hire titles.
-Development is sponsored by OKAM Studio (http://www.okamstudio.com).
-
-### Godot is BETA. Collaborate!!
-
-Having been developed as in-house means that the user experience may still not be ideal for everyone. The features needed to make a great game are there, but we really need your help to fix all the rough edges and improve usability (via feedback and/or code contributions).
-We know we are close to having an awesome, open source, game engine with nothing to envy from the best commercial offerings, but we can't do this alone. This is why Godot is now open source, so everyone can help us reach this goal.
-
-### Binary Downloads, Documentation, Community, etc.
-
-Binary downloads, documentation, community, etc. can be found in Godot homepage:
-
-http://www.godotengine.org
-
-### Compiling from Source
-
-Compilation instructions for every platform can be found in the Wiki:
-http://www.godotengine.org/wiki/doku.php?id=advanced
-
-
-
diff --git a/doc/base/classes.xml b/doc/base/classes.xml
index 9a7259df7f..51b135d248 100644
--- a/doc/base/classes.xml
+++ b/doc/base/classes.xml
@@ -4743,13 +4743,20 @@
 	</brief_description>
 	<description>
 	Camera node for 2D scenes. It forces the screen (current layer) to scroll following this node. This makes it easier (and faster) to program scrollable scenes than manually changing the position of [CanvasItem] based nodes.
+	This node is intended to be a simple helper to get things going quickly
+	and it may happen often that more functionality is desired to change
+	how the camera works. To make your own custom camera node, simply
+	inherit from [Node2D] and change the transform of the canvas by
+	calling get_viewport().set_canvas_transform(m) in [Viewport].
+	
 	</description>
 	<methods>
 		<method name="set_offset"  >
 			<argument index="0" name="offset" type="Vector2">
 			</argument>
 			<description>
-			Set the scroll offset.
+			Set the scroll offset. Useful for looking around or
+			camera shake animations.
 			</description>
 		</method>
 		<method name="get_offset" qualifiers="const" >
diff --git a/drivers/webp/SCsub b/drivers/webp/SCsub
index 6d9707677f..3ae046ff79 100644
--- a/drivers/webp/SCsub
+++ b/drivers/webp/SCsub
@@ -2,64 +2,58 @@ Import('env')
 
 
 webp_sources = [
-	"webp/mux/muxread.c",\
-	"webp/mux/muxedit.c",\
-	"webp/demux/demux.c",\
-	"webp/mux/muxinternal.c",\
-	"webp/enc/alpha.c",\
-	"webp/enc/tree.c",\
-	"webp/enc/webpenc.c",\
-	"webp/enc/cost.c",\
-	"webp/enc/quant.c",\
-	"webp/enc/token.c",\
-	"webp/enc/iterator.c",\
-	"webp/enc/syntax.c",\
-	"webp/enc/frame.c",\
-	"webp/enc/picture.c",\
-	"webp/enc/analysis.c",\
-	"webp/enc/backward_references.c",\
-	"webp/enc/filter.c",\
-	"webp/enc/vp8l.c",\
-	"webp/enc/layer.c",\
-	"webp/enc/histogram.c",\
-	"webp/enc/config.c",\
-	"webp/dec/alpha.c",\
-	"webp/dec/vp8.c",\
-	"webp/dec/tree.c",\
-	"webp/dec/webp.c",\
-	"webp/dec/idec.c",\
-	"webp/dec/quant.c",\
-	"webp/dec/frame.c",\
-	"webp/dec/buffer.c",\
-	"webp/dec/io.c",\
-	"webp/dec/vp8l.c",\
-	"webp/dec/layer.c",
+	"webp/mux/muxedit.c",
+	"webp/mux/muxread.c",
+	"webp/mux/muxinternal.c",
+	"webp/mux/demux.c",
+	"webp/enc/tree.c",
+	"webp/enc/analysis.c",
+	"webp/enc/backward_references.c",
+	"webp/enc/alpha.c",
+	"webp/enc/picture.c",
+	"webp/enc/frame.c",
+	"webp/enc/webpenc.c",
+	"webp/enc/cost.c",
+	"webp/enc/filter.c",
+	"webp/enc/vp8l.c",
+	"webp/enc/quant.c",
+	"webp/enc/histogram.c",
+	"webp/enc/syntax.c",
+	"webp/enc/config.c",
+	"webp/enc/layer.c",
+	"webp/enc/iterator.c",
+	"webp/dsp/dec_sse2.c",
+	"webp/dsp/upsampling_sse2.c",
 	"webp/dsp/dec_neon.c",
-	"webp/dsp/upsampling_sse2.c",\
-	"webp/dsp/dec_sse2.c",\
-	"webp/dsp/enc_neon.c",\
-	"webp/dsp/dec.c",\
-	"webp/dsp/upsampling.c",\
-	"webp/dsp/enc_sse2.c",\
-	"webp/dsp/enc.c",\
-	"webp/dsp/cpu.c",\
-	"webp/dsp/lossless.c",\
-	"webp/dsp/upsampling_neon.c",\
-	"webp/dsp/yuv.c",\
-	"webp/utils/bit_reader.c",\
-	"webp/utils/thread.c",\
-	"webp/utils/alpha_processing.c",\
-	"webp/utils/random.c",\
-	"webp/utils/quant_levels.c",\
-	"webp/utils/huffman.c",\
-	"webp/utils/filters.c",\
-	"webp/utils/rescaler.c",\
-	"webp/utils/quant_levels_dec.c",\
-	"webp/utils/color_cache.c",\
-	"webp/utils/utils.c",\
-	"webp/utils/huffman_encode.c",\
-	"webp/utils/bit_writer.c",\
-	"webp/image_loader_webp.cpp",\
+	"webp/dsp/enc.c",
+	"webp/dsp/enc_sse2.c",
+	"webp/dsp/upsampling.c",
+	"webp/dsp/lossless.c",
+	"webp/dsp/cpu.c",
+	"webp/dsp/dec.c",
+	"webp/dsp/yuv.c",
+	"webp/utils/bit_reader.c",
+	"webp/utils/filters.c",
+	"webp/utils/bit_writer.c",
+	"webp/utils/thread.c",
+	"webp/utils/quant_levels.c",
+	"webp/utils/color_cache.c",
+	"webp/utils/rescaler.c",
+	"webp/utils/utils.c",
+	"webp/utils/huffman.c",
+	"webp/utils/huffman_encode.c",
+	"webp/dec/tree.c",
+	"webp/dec/alpha.c",
+	"webp/dec/frame.c",
+	"webp/dec/vp8l.c",
+	"webp/dec/vp8.c",
+	"webp/dec/quant.c",
+	"webp/dec/webp.c",
+	"webp/dec/buffer.c",
+	"webp/dec/io.c",
+	"webp/dec/layer.c",
+	"webp/dec/idec.c",
+	"webp/image_loader_webp.cpp"
 ]
 
 env.drivers_sources+=webp_sources
@@ -68,4 +62,3 @@ env.drivers_sources+=webp_sources
 
 Export('env')
 
-
diff --git a/drivers/webp/dec/alpha.c b/drivers/webp/dec/alpha.c
index 93729a035f..6e65de9030 100644
--- a/drivers/webp/dec/alpha.c
+++ b/drivers/webp/dec/alpha.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Alpha-plane decompression.
@@ -12,150 +10,131 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <stdlib.h>
-#include "./alphai.h"
 #include "./vp8i.h"
 #include "./vp8li.h"
-#include "../utils/quant_levels_dec.h"
+#include "../utils/filters.h"
+#include "../utils/quant_levels.h"
 #include "../webp/format_constants.h"
 
-//------------------------------------------------------------------------------
-// ALPHDecoder object.
-
-ALPHDecoder* ALPHNew(void) {
-  ALPHDecoder* const dec = (ALPHDecoder*)calloc(1, sizeof(*dec));
-  return dec;
-}
-
-void ALPHDelete(ALPHDecoder* const dec) {
-  if (dec != NULL) {
-    VP8LDelete(dec->vp8l_dec_);
-    dec->vp8l_dec_ = NULL;
-    free(dec);
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+// TODO(skal): move to dsp/ ?
+static void CopyPlane(const uint8_t* src, int src_stride,
+                      uint8_t* dst, int dst_stride, int width, int height) {
+  while (height-- > 0) {
+    memcpy(dst, src, width);
+    src += src_stride;
+    dst += dst_stride;
   }
 }
 
 //------------------------------------------------------------------------------
-// Decoding.
-
-// Initialize alpha decoding by parsing the alpha header and decoding the image
-// header for alpha data stored using lossless compression.
-// Returns false in case of error in alpha header (data too short, invalid
-// compression method or filter, error in lossless header data etc).
-static int ALPHInit(ALPHDecoder* const dec, const uint8_t* data,
-                    size_t data_size, int width, int height, uint8_t* output) {
-  int ok = 0;
-  const uint8_t* const alpha_data = data + ALPHA_HEADER_LEN;
-  const size_t alpha_data_size = data_size - ALPHA_HEADER_LEN;
+// Decodes the alpha chunk 'data' of size 'data_size' (ALPHA_HEADER_LEN header
+// byte(s) followed by the payload) into 'output'. The 'output' buffer must be
+// pre-allocated with dimension 'height' x 'stride', as that of the image.
+//
+// Returns 1 on successfully decoding the compressed alpha and
+//         0 if either:
+//           error in bit-stream header (invalid compression mode or filter), or
+//           data too short, allocation failure or compression method error.
+
+static int DecodeAlpha(const uint8_t* data, size_t data_size,
+                       int width, int height, int stride, uint8_t* output) {
+  uint8_t* decoded_data = NULL;
+  const size_t decoded_size = (size_t)width * height;  // avoid int overflow
+  uint8_t* unfiltered_data = NULL;
+  WEBP_FILTER_TYPE filter;
+  int pre_processing;
   int rsrv;
+  int ok = 0;
+  int method;
 
-  assert(width > 0 && height > 0);
+  assert(width > 0 && height > 0 && stride >= width);
   assert(data != NULL && output != NULL);
 
-  dec->width_ = width;
-  dec->height_ = height;
-
   if (data_size <= ALPHA_HEADER_LEN) {
     return 0;
   }
 
-  dec->method_ = (data[0] >> 0) & 0x03;
-  dec->filter_ = (data[0] >> 2) & 0x03;
-  dec->pre_processing_ = (data[0] >> 4) & 0x03;
+  method = (data[0] >> 0) & 0x03;
+  filter = (data[0] >> 2) & 0x03;
+  pre_processing = (data[0] >> 4) & 0x03;
   rsrv = (data[0] >> 6) & 0x03;
-  if (dec->method_ < ALPHA_NO_COMPRESSION ||
-      dec->method_ > ALPHA_LOSSLESS_COMPRESSION ||
-      dec->filter_ >= WEBP_FILTER_LAST ||
-      dec->pre_processing_ > ALPHA_PREPROCESSED_LEVELS ||
+  if (method < ALPHA_NO_COMPRESSION ||
+      method > ALPHA_LOSSLESS_COMPRESSION ||
+      filter >= WEBP_FILTER_LAST ||
+      pre_processing > ALPHA_PREPROCESSED_LEVELS ||
       rsrv != 0) {
     return 0;
   }
 
-  if (dec->method_ == ALPHA_NO_COMPRESSION) {
-    const size_t alpha_decoded_size = dec->width_ * dec->height_;
-    ok = (alpha_data_size >= alpha_decoded_size);
+  if (method == ALPHA_NO_COMPRESSION) {
+    ok = (data_size - ALPHA_HEADER_LEN >= decoded_size);  // payload only
+    decoded_data = (uint8_t*)data + ALPHA_HEADER_LEN;
   } else {
-    assert(dec->method_ == ALPHA_LOSSLESS_COMPRESSION);
-    ok = VP8LDecodeAlphaHeader(dec, alpha_data, alpha_data_size, output);
+    decoded_data = (uint8_t*)malloc(decoded_size);
+    if (decoded_data == NULL) return 0;
+    ok = VP8LDecodeAlphaImageStream(width, height,
+                                    data + ALPHA_HEADER_LEN,
+                                    data_size - ALPHA_HEADER_LEN,
+                                    decoded_data);
   }
-  return ok;
-}
 
-// Decodes, unfilters and dequantizes *at least* 'num_rows' rows of alpha
-// starting from row number 'row'. It assumes that rows up to (row - 1) have
-// already been decoded.
-// Returns false in case of bitstream error.
-static int ALPHDecode(VP8Decoder* const dec, int row, int num_rows) {
-  ALPHDecoder* const alph_dec = dec->alph_dec_;
-  const int width = alph_dec->width_;
-  const int height = alph_dec->height_;
-  WebPUnfilterFunc unfilter_func = WebPUnfilters[alph_dec->filter_];
-  uint8_t* const output = dec->alpha_plane_;
-  if (alph_dec->method_ == ALPHA_NO_COMPRESSION) {
-    const size_t offset = row * width;
-    const size_t num_pixels = num_rows * width;
-    assert(dec->alpha_data_size_ >= ALPHA_HEADER_LEN + offset + num_pixels);
-    memcpy(dec->alpha_plane_ + offset,
-           dec->alpha_data_ + ALPHA_HEADER_LEN + offset, num_pixels);
-  } else {  // alph_dec->method_ == ALPHA_LOSSLESS_COMPRESSION
-    assert(alph_dec->vp8l_dec_ != NULL);
-    if (!VP8LDecodeAlphaImageStream(alph_dec, row + num_rows)) {
-      return 0;
+  if (ok) {
+    WebPFilterFunc unfilter_func = WebPUnfilters[filter];
+    if (unfilter_func != NULL) {
+      unfiltered_data = (uint8_t*)malloc(decoded_size);
+      if (unfiltered_data == NULL) {
+        ok = 0;
+        goto Error;
+      }
+      // TODO(vikas): Implement on-the-fly decoding & filter mechanism to decode
+      // and apply filter per image-row.
+      unfilter_func(decoded_data, width, height, 1, width, unfiltered_data);
+      // Construct raw_data (height x stride) from alpha data (height x width).
+      CopyPlane(unfiltered_data, width, output, stride, width, height);
+      free(unfiltered_data);
+    } else {
+      // Construct raw_data (height x stride) from alpha data (height x width).
+      CopyPlane(decoded_data, width, output, stride, width, height);
     }
-  }
-
-  if (unfilter_func != NULL) {
-    unfilter_func(width, height, width, row, num_rows, output);
-  }
-
-  if (alph_dec->pre_processing_ == ALPHA_PREPROCESSED_LEVELS) {
-    if (!DequantizeLevels(output, width, height, row, num_rows)) {
-      return 0;
+    if (pre_processing == ALPHA_PREPROCESSED_LEVELS) {
+      ok = DequantizeLevels(decoded_data, width, height);
     }
   }
 
-  if (row + num_rows == dec->pic_hdr_.height_) {
-    dec->is_alpha_decoded_ = 1;
+ Error:
+  if (method != ALPHA_NO_COMPRESSION) {
+    free(decoded_data);
   }
-  return 1;
+  return ok;
 }
 
 //------------------------------------------------------------------------------
-// Main entry point.
 
 const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
                                       int row, int num_rows) {
-  const int width = dec->pic_hdr_.width_;
-  const int height = dec->pic_hdr_.height_;
+  const int stride = dec->pic_hdr_.width_;
 
-  if (row < 0 || num_rows <= 0 || row + num_rows > height) {
+  if (row < 0 || num_rows < 0 || row + num_rows > dec->pic_hdr_.height_) {
     return NULL;    // sanity check.
   }
 
   if (row == 0) {
-    // Initialize decoding.
-    assert(dec->alpha_plane_ != NULL);
-    dec->alph_dec_ = ALPHNew();
-    if (dec->alph_dec_ == NULL) return NULL;
-    if (!ALPHInit(dec->alph_dec_, dec->alpha_data_, dec->alpha_data_size_,
-                  width, height, dec->alpha_plane_)) {
-      ALPHDelete(dec->alph_dec_);
-      dec->alph_dec_ = NULL;
-      return NULL;
-    }
-  }
-
-  if (!dec->is_alpha_decoded_) {
-    int ok = 0;
-    assert(dec->alph_dec_ != NULL);
-    ok = ALPHDecode(dec, row, num_rows);
-    if (!ok || dec->is_alpha_decoded_) {
-      ALPHDelete(dec->alph_dec_);
-      dec->alph_dec_ = NULL;
+    // Decode everything during the first call.
+    if (!DecodeAlpha(dec->alpha_data_, (size_t)dec->alpha_data_size_,
+                     dec->pic_hdr_.width_, dec->pic_hdr_.height_, stride,
+                     dec->alpha_plane_)) {
+      return NULL;  // Error.
     }
-    if (!ok) return NULL;  // Error.
   }
 
   // Return a pointer to the current decoded row.
-  return dec->alpha_plane_ + row * width;
+  return dec->alpha_plane_ + row * stride;
 }
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/dec/alphai.h b/drivers/webp/dec/alphai.h
deleted file mode 100644
index 5fa230ca82..0000000000
--- a/drivers/webp/dec/alphai.h
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright 2013 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Alpha decoder: internal header.
-//
-// Author: Urvang (urvang@google.com)
-
-#ifndef WEBP_DEC_ALPHAI_H_
-#define WEBP_DEC_ALPHAI_H_
-
-#include "./webpi.h"
-#include "../utils/filters.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct VP8LDecoder;  // Defined in dec/vp8li.h.
-
-typedef struct ALPHDecoder ALPHDecoder;
-struct ALPHDecoder {
-  int width_;
-  int height_;
-  int method_;
-  WEBP_FILTER_TYPE filter_;
-  int pre_processing_;
-  struct VP8LDecoder* vp8l_dec_;
-  VP8Io io_;
-  int use_8b_decode;  // Although alpha channel requires only 1 byte per
-                      // pixel, sometimes VP8LDecoder may need to allocate
-                      // 4 bytes per pixel internally during decode.
-};
-
-//------------------------------------------------------------------------------
-// internal functions. Not public.
-
-// Allocates a new alpha decoder instance.
-ALPHDecoder* ALPHNew(void);
-
-// Clears and deallocates an alpha decoder instance.
-void ALPHDelete(ALPHDecoder* const dec);
-
-//------------------------------------------------------------------------------
-
-#ifdef __cplusplus
-}    // extern "C"
-#endif
-
-#endif  /* WEBP_DEC_ALPHAI_H_ */
diff --git a/drivers/webp/dec/buffer.c b/drivers/webp/dec/buffer.c
index 1e852efe74..c159f6f248 100644
--- a/drivers/webp/dec/buffer.c
+++ b/drivers/webp/dec/buffer.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Everything about WebPDecBuffer
@@ -17,6 +15,10 @@
 #include "./webpi.h"
 #include "../utils/utils.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // WebPDecBuffer
 
@@ -208,3 +210,6 @@ void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst) {
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/dec/decode_vp8.h b/drivers/webp/dec/decode_vp8.h
index b9337bbec0..12c77bcbf6 100644
--- a/drivers/webp/dec/decode_vp8.h
+++ b/drivers/webp/dec/decode_vp8.h
@@ -1,10 +1,8 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 //  Low-level API for VP8 decoder
@@ -16,7 +14,7 @@
 
 #include "../webp/decode.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
@@ -132,8 +130,7 @@ static WEBP_INLINE int VP8InitIo(VP8Io* const io) {
   return VP8InitIoInternal(io, WEBP_DECODER_ABI_VERSION);
 }
 
-// Decode the VP8 frame header. Returns true if ok.
-// Note: 'io->data' must be pointing to the start of the VP8 frame header.
+// Start decoding a new picture. Returns true if ok.
 int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io);
 
 // Decode a picture. Will call VP8GetHeaders() if it wasn't done already.
@@ -178,7 +175,7 @@ WEBP_EXTERN(int) VP8LGetInfo(
     const uint8_t* data, size_t data_size,  // data available so far
     int* const width, int* const height, int* const has_alpha);
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/dec/frame.c b/drivers/webp/dec/frame.c
index e1eea94ebe..9c91a48e17 100644
--- a/drivers/webp/dec/frame.c
+++ b/drivers/webp/dec/frame.c
@@ -1,10 +1,8 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Frame-reconstruction function. Memory allocation.
@@ -15,10 +13,11 @@
 #include "./vp8i.h"
 #include "../utils/utils.h"
 
-#define ALIGN_MASK (32 - 1)
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
 
-static void ReconstructRow(const VP8Decoder* const dec,
-                           const VP8ThreadContext* ctx);  // TODO(skal): remove
+#define ALIGN_MASK (32 - 1)
 
 //------------------------------------------------------------------------------
 // Filtering
@@ -30,18 +29,25 @@ static void ReconstructRow(const VP8Decoder* const dec,
 //                 U/V, so it's 8 samples total (because of the 2x upsampling).
 static const uint8_t kFilterExtraRows[3] = { 0, 2, 8 };
 
+static WEBP_INLINE int hev_thresh_from_level(int level, int keyframe) {
+  if (keyframe) {
+    return (level >= 40) ? 2 : (level >= 15) ? 1 : 0;
+  } else {
+    return (level >= 40) ? 3 : (level >= 20) ? 2 : (level >= 15) ? 1 : 0;
+  }
+}
+
 static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) {
   const VP8ThreadContext* const ctx = &dec->thread_ctx_;
-  const int cache_id = ctx->id_;
   const int y_bps = dec->cache_y_stride_;
-  const VP8FInfo* const f_info = ctx->f_info_ + mb_x;
-  uint8_t* const y_dst = dec->cache_y_ + cache_id * 16 * y_bps + mb_x * 16;
+  VP8FInfo* const f_info = ctx->f_info_ + mb_x;
+  uint8_t* const y_dst = dec->cache_y_ + ctx->id_ * 16 * y_bps + mb_x * 16;
+  const int level = f_info->f_level_;
   const int ilevel = f_info->f_ilevel_;
-  const int limit = f_info->f_limit_;
-  if (limit == 0) {
+  const int limit = 2 * level + ilevel;
+  if (level == 0) {
     return;
   }
-  assert(limit >= 3);
   if (dec->filter_type_ == 1) {   // simple
     if (mb_x > 0) {
       VP8SimpleHFilter16(y_dst, y_bps, limit + 4);
@@ -57,9 +63,10 @@ static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) {
     }
   } else {    // complex
     const int uv_bps = dec->cache_uv_stride_;
-    uint8_t* const u_dst = dec->cache_u_ + cache_id * 8 * uv_bps + mb_x * 8;
-    uint8_t* const v_dst = dec->cache_v_ + cache_id * 8 * uv_bps + mb_x * 8;
-    const int hev_thresh = f_info->hev_thresh_;
+    uint8_t* const u_dst = dec->cache_u_ + ctx->id_ * 8 * uv_bps + mb_x * 8;
+    uint8_t* const v_dst = dec->cache_v_ + ctx->id_ * 8 * uv_bps + mb_x * 8;
+    const int hev_thresh =
+        hev_thresh_from_level(level, dec->frm_hdr_.key_frame_);
     if (mb_x > 0) {
       VP8HFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
       VP8HFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
@@ -90,132 +97,53 @@ static void FilterRow(const VP8Decoder* const dec) {
 }
 
 //------------------------------------------------------------------------------
-// Precompute the filtering strength for each segment and each i4x4/i16x16 mode.
 
-static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
+void VP8StoreBlock(VP8Decoder* const dec) {
   if (dec->filter_type_ > 0) {
-    int s;
-    const VP8FilterHeader* const hdr = &dec->filter_hdr_;
-    for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
-      int i4x4;
-      // First, compute the initial level
-      int base_level;
-      if (dec->segment_hdr_.use_segment_) {
-        base_level = dec->segment_hdr_.filter_strength_[s];
-        if (!dec->segment_hdr_.absolute_delta_) {
-          base_level += hdr->level_;
-        }
-      } else {
-        base_level = hdr->level_;
-      }
-      for (i4x4 = 0; i4x4 <= 1; ++i4x4) {
-        VP8FInfo* const info = &dec->fstrengths_[s][i4x4];
-        int level = base_level;
-        if (hdr->use_lf_delta_) {
-          // TODO(skal): only CURRENT is handled for now.
-          level += hdr->ref_lf_delta_[0];
-          if (i4x4) {
-            level += hdr->mode_lf_delta_[0];
-          }
-        }
-        level = (level < 0) ? 0 : (level > 63) ? 63 : level;
-        if (level > 0) {
-          int ilevel = level;
-          if (hdr->sharpness_ > 0) {
-            if (hdr->sharpness_ > 4) {
-              ilevel >>= 2;
-            } else {
-              ilevel >>= 1;
-            }
-            if (ilevel > 9 - hdr->sharpness_) {
-              ilevel = 9 - hdr->sharpness_;
-            }
-          }
-          if (ilevel < 1) ilevel = 1;
-          info->f_ilevel_ = ilevel;
-          info->f_limit_ = 2 * level + ilevel;
-          info->hev_thresh_ = (level >= 40) ? 2 : (level >= 15) ? 1 : 0;
-        } else {
-          info->f_limit_ = 0;  // no filtering
-        }
-        info->f_inner_ = i4x4;
+    VP8FInfo* const info = dec->f_info_ + dec->mb_x_;
+    const int skip = dec->mb_info_[dec->mb_x_].skip_;
+    int level = dec->filter_levels_[dec->segment_];
+    if (dec->filter_hdr_.use_lf_delta_) {
+      // TODO(skal): only CURRENT is handled for now.
+      level += dec->filter_hdr_.ref_lf_delta_[0];
+      if (dec->is_i4x4_) {
+        level += dec->filter_hdr_.mode_lf_delta_[0];
       }
     }
-  }
-}
+    level = (level < 0) ? 0 : (level > 63) ? 63 : level;
+    info->f_level_ = level;
 
-//------------------------------------------------------------------------------
-// Dithering
-
-#define DITHER_AMP_TAB_SIZE 12
-static const int kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
-  // roughly, it's dqm->uv_mat_[1]
-  8, 7, 6, 4, 4, 2, 2, 2, 1, 1, 1, 1
-};
-
-void VP8InitDithering(const WebPDecoderOptions* const options,
-                      VP8Decoder* const dec) {
-  assert(dec != NULL);
-  if (options != NULL) {
-    const int d = options->dithering_strength;
-    const int max_amp = (1 << VP8_RANDOM_DITHER_FIX) - 1;
-    const int f = (d < 0) ? 0 : (d > 100) ? max_amp : (d * max_amp / 100);
-    if (f > 0) {
-      int s;
-      int all_amp = 0;
-      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
-        VP8QuantMatrix* const dqm = &dec->dqm_[s];
-        if (dqm->uv_quant_ < DITHER_AMP_TAB_SIZE) {
-          // TODO(skal): should we specially dither more for uv_quant_ < 0?
-          const int idx = (dqm->uv_quant_ < 0) ? 0 : dqm->uv_quant_;
-          dqm->dither_ = (f * kQuantToDitherAmp[idx]) >> 3;
-        }
-        all_amp |= dqm->dither_;
+    if (dec->filter_hdr_.sharpness_ > 0) {
+      if (dec->filter_hdr_.sharpness_ > 4) {
+        level >>= 2;
+      } else {
+        level >>= 1;
       }
-      if (all_amp != 0) {
-        VP8InitRandom(&dec->dithering_rg_, 1.0f);
-        dec->dither_ = 1;
+      if (level > 9 - dec->filter_hdr_.sharpness_) {
+        level = 9 - dec->filter_hdr_.sharpness_;
       }
     }
-  }
-}
 
-// minimal amp that will provide a non-zero dithering effect
-#define MIN_DITHER_AMP 4
-#define DITHER_DESCALE 4
-#define DITHER_DESCALE_ROUNDER (1 << (DITHER_DESCALE - 1))
-#define DITHER_AMP_BITS 8
-#define DITHER_AMP_CENTER (1 << DITHER_AMP_BITS)
-
-static void Dither8x8(VP8Random* const rg, uint8_t* dst, int bps, int amp) {
-  int i, j;
-  for (j = 0; j < 8; ++j) {
-    for (i = 0; i < 8; ++i) {
-      // TODO: could be made faster with SSE2
-      const int bits =
-          VP8RandomBits2(rg, DITHER_AMP_BITS + 1, amp) - DITHER_AMP_CENTER;
-      // Convert to range: [-2,2] for dither=50, [-4,4] for dither=100
-      const int delta = (bits + DITHER_DESCALE_ROUNDER) >> DITHER_DESCALE;
-      const int v = (int)dst[i] + delta;
-      dst[i] = (v < 0) ? 0 : (v > 255) ? 255u : (uint8_t)v;
-    }
-    dst += bps;
+    info->f_ilevel_ = (level < 1) ? 1 : level;
+    info->f_inner_ = (!skip || dec->is_i4x4_);
   }
-}
-
-static void DitherRow(VP8Decoder* const dec) {
-  int mb_x;
-  assert(dec->dither_);
-  for (mb_x = dec->tl_mb_x_; mb_x < dec->br_mb_x_; ++mb_x) {
-    const VP8ThreadContext* const ctx = &dec->thread_ctx_;
-    const VP8MBData* const data = ctx->mb_data_ + mb_x;
-    const int cache_id = ctx->id_;
-    const int uv_bps = dec->cache_uv_stride_;
-    if (data->dither_ >= MIN_DITHER_AMP) {
-      uint8_t* const u_dst = dec->cache_u_ + cache_id * 8 * uv_bps + mb_x * 8;
-      uint8_t* const v_dst = dec->cache_v_ + cache_id * 8 * uv_bps + mb_x * 8;
-      Dither8x8(&dec->dithering_rg_, u_dst, uv_bps, data->dither_);
-      Dither8x8(&dec->dithering_rg_, v_dst, uv_bps, data->dither_);
+  {
+    // Transfer samples to row cache
+    int y;
+    const int y_offset = dec->cache_id_ * 16 * dec->cache_y_stride_;
+    const int uv_offset = dec->cache_id_ * 8 * dec->cache_uv_stride_;
+    uint8_t* const ydst = dec->cache_y_ + dec->mb_x_ * 16 + y_offset;
+    uint8_t* const udst = dec->cache_u_ + dec->mb_x_ * 8 + uv_offset;
+    uint8_t* const vdst = dec->cache_v_ + dec->mb_x_ * 8 + uv_offset;
+    for (y = 0; y < 16; ++y) {
+      memcpy(ydst + y * dec->cache_y_stride_,
+             dec->yuv_b_ + Y_OFF + y * BPS, 16);
+    }
+    for (y = 0; y < 8; ++y) {
+      memcpy(udst + y * dec->cache_uv_stride_,
+           dec->yuv_b_ + U_OFF + y * BPS, 8);
+      memcpy(vdst + y * dec->cache_uv_stride_,
+           dec->yuv_b_ + V_OFF + y * BPS, 8);
     }
   }
 }
@@ -237,35 +165,25 @@ static void DitherRow(VP8Decoder* const dec) {
 static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
   int ok = 1;
   const VP8ThreadContext* const ctx = &dec->thread_ctx_;
-  const int cache_id = ctx->id_;
   const int extra_y_rows = kFilterExtraRows[dec->filter_type_];
   const int ysize = extra_y_rows * dec->cache_y_stride_;
   const int uvsize = (extra_y_rows / 2) * dec->cache_uv_stride_;
-  const int y_offset = cache_id * 16 * dec->cache_y_stride_;
-  const int uv_offset = cache_id * 8 * dec->cache_uv_stride_;
+  const int y_offset = ctx->id_ * 16 * dec->cache_y_stride_;
+  const int uv_offset = ctx->id_ * 8 * dec->cache_uv_stride_;
   uint8_t* const ydst = dec->cache_y_ - ysize + y_offset;
   uint8_t* const udst = dec->cache_u_ - uvsize + uv_offset;
   uint8_t* const vdst = dec->cache_v_ - uvsize + uv_offset;
-  const int mb_y = ctx->mb_y_;
-  const int is_first_row = (mb_y == 0);
-  const int is_last_row = (mb_y >= dec->br_mb_y_ - 1);
-
-  if (dec->mt_method_ == 2) {
-    ReconstructRow(dec, ctx);
-  }
+  const int first_row = (ctx->mb_y_ == 0);
+  const int last_row = (ctx->mb_y_ >= dec->br_mb_y_ - 1);
+  int y_start = MACROBLOCK_VPOS(ctx->mb_y_);
+  int y_end = MACROBLOCK_VPOS(ctx->mb_y_ + 1);
 
   if (ctx->filter_row_) {
     FilterRow(dec);
   }
 
-  if (dec->dither_) {
-    DitherRow(dec);
-  }
-
-  if (io->put != NULL) {
-    int y_start = MACROBLOCK_VPOS(mb_y);
-    int y_end = MACROBLOCK_VPOS(mb_y + 1);
-    if (!is_first_row) {
+  if (io->put) {
+    if (!first_row) {
       y_start -= extra_y_rows;
       io->y = ydst;
       io->u = udst;
@@ -276,7 +194,7 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
       io->v = dec->cache_v_ + uv_offset;
     }
 
-    if (!is_last_row) {
+    if (!last_row) {
       y_end -= extra_y_rows;
     }
     if (y_end > io->crop_bottom) {
@@ -284,8 +202,11 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
     }
     io->a = NULL;
     if (dec->alpha_data_ != NULL && y_start < y_end) {
-      // TODO(skal): testing presence of alpha with dec->alpha_data_ is not a
-      // good idea.
+      // TODO(skal): several things to correct here:
+      // * testing presence of alpha with dec->alpha_data_ is not a good idea
+      // * we're actually decompressing the full plane only once. It should be
+      //   more obvious from signature.
+      // * we could free alpha_data_ right after this call, but we don't own it.
       io->a = VP8DecompressAlphaRows(dec, y_start, y_end - y_start);
       if (io->a == NULL) {
         return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
@@ -317,8 +238,8 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
     }
   }
   // rotate top samples if needed
-  if (cache_id + 1 == dec->num_caches_) {
-    if (!is_last_row) {
+  if (ctx->id_ + 1 == dec->num_caches_) {
+    if (!last_row) {
       memcpy(dec->cache_y_ - ysize, ydst + 16 * dec->cache_y_stride_, ysize);
       memcpy(dec->cache_u_ - uvsize, udst + 8 * dec->cache_uv_stride_, uvsize);
       memcpy(dec->cache_v_ - uvsize, vdst + 8 * dec->cache_uv_stride_, uvsize);
@@ -335,14 +256,10 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
 int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
   int ok = 1;
   VP8ThreadContext* const ctx = &dec->thread_ctx_;
-  const int filter_row =
-      (dec->filter_type_ > 0) &&
-      (dec->mb_y_ >= dec->tl_mb_y_) && (dec->mb_y_ <= dec->br_mb_y_);
-  if (dec->mt_method_ == 0) {
+  if (!dec->use_threads_) {
     // ctx->id_ and ctx->f_info_ are already set
     ctx->mb_y_ = dec->mb_y_;
-    ctx->filter_row_ = filter_row;
-    ReconstructRow(dec, ctx);
+    ctx->filter_row_ = dec->filter_row_;
     ok = FinishRow(dec, io);
   } else {
     WebPWorker* const worker = &dec->worker_;
@@ -353,21 +270,13 @@ int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
       ctx->io_ = *io;
       ctx->id_ = dec->cache_id_;
       ctx->mb_y_ = dec->mb_y_;
-      ctx->filter_row_ = filter_row;
-      if (dec->mt_method_ == 2) {  // swap macroblock data
-        VP8MBData* const tmp = ctx->mb_data_;
-        ctx->mb_data_ = dec->mb_data_;
-        dec->mb_data_ = tmp;
-      } else {
-        // perform reconstruction directly in main thread
-        ReconstructRow(dec, ctx);
-      }
-      if (filter_row) {            // swap filter info
+      ctx->filter_row_ = dec->filter_row_;
+      if (ctx->filter_row_) {    // just swap filter info
         VP8FInfo* const tmp = ctx->f_info_;
         ctx->f_info_ = dec->f_info_;
         dec->f_info_ = tmp;
       }
-      WebPWorkerLaunch(worker);    // (reconstruct)+filter in parallel
+      WebPWorkerLaunch(worker);
       if (++dec->cache_id_ == dec->num_caches_) {
         dec->cache_id_ = 0;
       }
@@ -381,8 +290,8 @@ int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
 
 VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
   // Call setup() first. This may trigger additional decoding features on 'io'.
-  // Note: Afterward, we must call teardown() no matter what.
-  if (io->setup != NULL && !io->setup(io)) {
+  // Note: Afterward, we must call teardown() no matter what.
+  if (io->setup && !io->setup(io)) {
     VP8SetError(dec, VP8_STATUS_USER_ABORT, "Frame setup failed");
     return dec->status_;
   }
@@ -395,7 +304,7 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
 
   // Define the area where we can skip in-loop filtering, in case of cropping.
   //
-  // 'Simple' filter reads two luma samples outside of the macroblock
+  // 'Simple' filter reads two luma samples outside of the macroblock
   // and filters one. It doesn't filter the chroma samples. Hence, we can
   // avoid doing the in-loop filtering before crop_top/crop_left position.
   // For the 'Complex' filter, 3 samples are read and up to 3 are filtered.
@@ -430,17 +339,16 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
       dec->br_mb_y_ = dec->mb_h_;
     }
   }
-  PrecomputeFilterStrengths(dec);
   return VP8_STATUS_OK;
 }
 
 int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) {
   int ok = 1;
-  if (dec->mt_method_ > 0) {
+  if (dec->use_threads_) {
     ok = WebPWorkerSync(&dec->worker_);
   }
 
-  if (io->teardown != NULL) {
+  if (io->teardown) {
     io->teardown(io);
   }
   return ok;
@@ -476,7 +384,7 @@ int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) {
 // Initialize multi/single-thread worker
 static int InitThreadContext(VP8Decoder* const dec) {
   dec->cache_id_ = 0;
-  if (dec->mt_method_ > 0) {
+  if (dec->use_threads_) {
     WebPWorker* const worker = &dec->worker_;
     if (!WebPWorkerReset(worker)) {
       return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
@@ -493,28 +401,6 @@ static int InitThreadContext(VP8Decoder* const dec) {
   return 1;
 }
 
-int VP8GetThreadMethod(const WebPDecoderOptions* const options,
-                       const WebPHeaderStructure* const headers,
-                       int width, int height) {
-  if (options == NULL || options->use_threads == 0) {
-    return 0;
-  }
-  (void)headers;
-  (void)width;
-  (void)height;
-  assert(!headers->is_lossless);
-#if defined(WEBP_USE_THREAD)
-  if (width < MIN_WIDTH_FOR_THREADS) return 0;
-  // TODO(skal): tune the heuristic further
-#if 0
-  if (height < 2 * width) return 2;
-#endif
-  return 2;
-#else   // !WEBP_USE_THREAD
-  return 0;
-#endif
-}
-
 #undef MT_CACHE_LINES
 #undef ST_CACHE_LINES
 
@@ -526,15 +412,14 @@ static int AllocateMemory(VP8Decoder* const dec) {
   const int mb_w = dec->mb_w_;
   // Note: we use 'size_t' when there's no overflow risk, uint64_t otherwise.
   const size_t intra_pred_mode_size = 4 * mb_w * sizeof(uint8_t);
-  const size_t top_size = sizeof(VP8TopSamples) * mb_w;
+  const size_t top_size = (16 + 8 + 8) * mb_w;
   const size_t mb_info_size = (mb_w + 1) * sizeof(VP8MB);
   const size_t f_info_size =
       (dec->filter_type_ > 0) ?
-          mb_w * (dec->mt_method_ > 0 ? 2 : 1) * sizeof(VP8FInfo)
+          mb_w * (dec->use_threads_ ? 2 : 1) * sizeof(VP8FInfo)
         : 0;
   const size_t yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_);
-  const size_t mb_data_size =
-      (dec->mt_method_ == 2 ? 2 : 1) * mb_w * sizeof(*dec->mb_data_);
+  const size_t coeffs_size = 384 * sizeof(*dec->coeffs_);
   const size_t cache_height = (16 * num_caches
                             + kFilterExtraRows[dec->filter_type_]) * 3 / 2;
   const size_t cache_size = top_size * cache_height;
@@ -543,7 +428,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
       (uint64_t)dec->pic_hdr_.width_ * dec->pic_hdr_.height_ : 0ULL;
   const uint64_t needed = (uint64_t)intra_pred_mode_size
                         + top_size + mb_info_size + f_info_size
-                        + yuv_size + mb_data_size
+                        + yuv_size + coeffs_size
                         + cache_size + alpha_size + ALIGN_MASK;
   uint8_t* mem;
 
@@ -564,8 +449,12 @@ static int AllocateMemory(VP8Decoder* const dec) {
   dec->intra_t_ = (uint8_t*)mem;
   mem += intra_pred_mode_size;
 
-  dec->yuv_t_ = (VP8TopSamples*)mem;
-  mem += top_size;
+  dec->y_t_ = (uint8_t*)mem;
+  mem += 16 * mb_w;
+  dec->u_t_ = (uint8_t*)mem;
+  mem += 8 * mb_w;
+  dec->v_t_ = (uint8_t*)mem;
+  mem += 8 * mb_w;
 
   dec->mb_info_ = ((VP8MB*)mem) + 1;
   mem += mb_info_size;
@@ -574,7 +463,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
   mem += f_info_size;
   dec->thread_ctx_.id_ = 0;
   dec->thread_ctx_.f_info_ = dec->f_info_;
-  if (dec->mt_method_ > 0) {
+  if (dec->use_threads_) {
     // secondary cache line. The deblocking process need to make use of the
     // filtering strength from previous macroblock row, while the new ones
     // are being decoded in parallel. We'll just swap the pointers.
@@ -586,12 +475,8 @@ static int AllocateMemory(VP8Decoder* const dec) {
   dec->yuv_b_ = (uint8_t*)mem;
   mem += yuv_size;
 
-  dec->mb_data_ = (VP8MBData*)mem;
-  dec->thread_ctx_.mb_data_ = (VP8MBData*)mem;
-  if (dec->mt_method_ == 2) {
-    dec->thread_ctx_.mb_data_ += mb_w;
-  }
-  mem += mb_data_size;
+  dec->coeffs_ = (int16_t*)mem;
+  mem += coeffs_size;
 
   dec->cache_y_stride_ = 16 * mb_w;
   dec->cache_uv_stride_ = 8 * mb_w;
@@ -611,11 +496,9 @@ static int AllocateMemory(VP8Decoder* const dec) {
   // alpha plane
   dec->alpha_plane_ = alpha_size ? (uint8_t*)mem : NULL;
   mem += alpha_size;
-  assert(mem <= (uint8_t*)dec->mem_ + dec->mem_size_);
 
-  // note: left/top-info is initialized once for all.
+  // note: left-info is initialized once for all.
   memset(dec->mb_info_ - 1, 0, mb_info_size);
-  VP8InitScanline(dec);   // initialize left too.
 
   // initialize top
   memset(dec->intra_t_, B_DC_PRED, intra_pred_mode_size);
@@ -652,163 +535,138 @@ static const int kScan[16] = {
   0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS
 };
 
-static int CheckMode(int mb_x, int mb_y, int mode) {
+static WEBP_INLINE int CheckMode(VP8Decoder* const dec, int mode) {
   if (mode == B_DC_PRED) {
-    if (mb_x == 0) {
-      return (mb_y == 0) ? B_DC_PRED_NOTOPLEFT : B_DC_PRED_NOLEFT;
+    if (dec->mb_x_ == 0) {
+      return (dec->mb_y_ == 0) ? B_DC_PRED_NOTOPLEFT : B_DC_PRED_NOLEFT;
     } else {
-      return (mb_y == 0) ? B_DC_PRED_NOTOP : B_DC_PRED;
+      return (dec->mb_y_ == 0) ? B_DC_PRED_NOTOP : B_DC_PRED;
     }
   }
   return mode;
 }
 
-static void Copy32b(uint8_t* dst, uint8_t* src) {
-  memcpy(dst, src, 4);
-}
-
-static WEBP_INLINE void DoTransform(uint32_t bits, const int16_t* const src,
-                                    uint8_t* const dst) {
-  switch (bits >> 30) {
-    case 3:
-      VP8Transform(src, dst, 0);
-      break;
-    case 2:
-      VP8TransformAC3(src, dst);
-      break;
-    case 1:
-      VP8TransformDC(src, dst);
-      break;
-    default:
-      break;
-  }
-}
-
-static void DoUVTransform(uint32_t bits, const int16_t* const src,
-                          uint8_t* const dst) {
-  if (bits & 0xff) {    // any non-zero coeff at all?
-    if (bits & 0xaa) {  // any non-zero AC coefficient?
-      VP8TransformUV(src, dst);   // note we don't use the AC3 variant for U/V
-    } else {
-      VP8TransformDCUV(src, dst);
-    }
-  }
+static WEBP_INLINE void Copy32b(uint8_t* dst, uint8_t* src) {
+  *(uint32_t*)dst = *(uint32_t*)src;
 }
 
-static void ReconstructRow(const VP8Decoder* const dec,
-                           const VP8ThreadContext* ctx) {
-  int j;
-  int mb_x;
-  const int mb_y = ctx->mb_y_;
-  const int cache_id = ctx->id_;
+void VP8ReconstructBlock(VP8Decoder* const dec) {
   uint8_t* const y_dst = dec->yuv_b_ + Y_OFF;
   uint8_t* const u_dst = dec->yuv_b_ + U_OFF;
   uint8_t* const v_dst = dec->yuv_b_ + V_OFF;
-  for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
-    const VP8MBData* const block = ctx->mb_data_ + mb_x;
 
-    // Rotate in the left samples from previously decoded block. We move four
-    // pixels at a time for alignment reason, and because of in-loop filter.
-    if (mb_x > 0) {
-      for (j = -1; j < 16; ++j) {
-        Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]);
-      }
-      for (j = -1; j < 8; ++j) {
-        Copy32b(&u_dst[j * BPS - 4], &u_dst[j * BPS + 4]);
-        Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]);
-      }
-    } else {
-      for (j = 0; j < 16; ++j) {
-        y_dst[j * BPS - 1] = 129;
-      }
-      for (j = 0; j < 8; ++j) {
-        u_dst[j * BPS - 1] = 129;
-        v_dst[j * BPS - 1] = 129;
-      }
-      // Init top-left sample on left column too
-      if (mb_y > 0) {
-        y_dst[-1 - BPS] = u_dst[-1 - BPS] = v_dst[-1 - BPS] = 129;
-      }
+  // Rotate in the left samples from previously decoded block. We move four
+  // pixels at a time for alignment reason, and because of in-loop filter.
+  if (dec->mb_x_ > 0) {
+    int j;
+    for (j = -1; j < 16; ++j) {
+      Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]);
+    }
+    for (j = -1; j < 8; ++j) {
+      Copy32b(&u_dst[j * BPS - 4], &u_dst[j * BPS + 4]);
+      Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]);
+    }
+  } else {
+    int j;
+    for (j = 0; j < 16; ++j) {
+      y_dst[j * BPS - 1] = 129;
+    }
+    for (j = 0; j < 8; ++j) {
+      u_dst[j * BPS - 1] = 129;
+      v_dst[j * BPS - 1] = 129;
+    }
+    // Init top-left sample on left column too
+    if (dec->mb_y_ > 0) {
+      y_dst[-1 - BPS] = u_dst[-1 - BPS] = v_dst[-1 - BPS] = 129;
+    }
+  }
+  {
+    // bring top samples into the cache
+    uint8_t* const top_y = dec->y_t_ + dec->mb_x_ * 16;
+    uint8_t* const top_u = dec->u_t_ + dec->mb_x_ * 8;
+    uint8_t* const top_v = dec->v_t_ + dec->mb_x_ * 8;
+    const int16_t* coeffs = dec->coeffs_;
+    int n;
+
+    if (dec->mb_y_ > 0) {
+      memcpy(y_dst - BPS, top_y, 16);
+      memcpy(u_dst - BPS, top_u, 8);
+      memcpy(v_dst - BPS, top_v, 8);
+    } else if (dec->mb_x_ == 0) {
+      // we only need to do this init once at block (0,0).
+      // Afterward, it remains valid for the whole topmost row.
+      memset(y_dst - BPS - 1, 127, 16 + 4 + 1);
+      memset(u_dst - BPS - 1, 127, 8 + 1);
+      memset(v_dst - BPS - 1, 127, 8 + 1);
     }
-    {
-      // bring top samples into the cache
-      VP8TopSamples* const top_yuv = dec->yuv_t_ + mb_x;
-      const int16_t* const coeffs = block->coeffs_;
-      uint32_t bits = block->non_zero_y_;
-      int n;
-
-      if (mb_y > 0) {
-        memcpy(y_dst - BPS, top_yuv[0].y, 16);
-        memcpy(u_dst - BPS, top_yuv[0].u, 8);
-        memcpy(v_dst - BPS, top_yuv[0].v, 8);
-      } else if (mb_x == 0) {
-        // we only need to do this init once at block (0,0).
-        // Afterward, it remains valid for the whole topmost row.
-        memset(y_dst - BPS - 1, 127, 16 + 4 + 1);
-        memset(u_dst - BPS - 1, 127, 8 + 1);
-        memset(v_dst - BPS - 1, 127, 8 + 1);
-      }
 
-      // predict and add residuals
-      if (block->is_i4x4_) {   // 4x4
-        uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16);
+    // predict and add residuals
 
-        if (mb_y > 0) {
-          if (mb_x >= dec->mb_w_ - 1) {    // on rightmost border
-            memset(top_right, top_yuv[0].y[15], sizeof(*top_right));
-          } else {
-            memcpy(top_right, top_yuv[1].y, sizeof(*top_right));
-          }
-        }
-        // replicate the top-right pixels below
-        top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0];
+    if (dec->is_i4x4_) {   // 4x4
+      uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16);
 
-        // predict and add residuals for all 4x4 blocks in turn.
-        for (n = 0; n < 16; ++n, bits <<= 2) {
-          uint8_t* const dst = y_dst + kScan[n];
-          VP8PredLuma4[block->imodes_[n]](dst);
-          DoTransform(bits, coeffs + n * 16, dst);
-        }
-      } else {    // 16x16
-        const int pred_func = CheckMode(mb_x, mb_y,
-                                        block->imodes_[0]);
-        VP8PredLuma16[pred_func](y_dst);
-        if (bits != 0) {
-          for (n = 0; n < 16; ++n, bits <<= 2) {
-            DoTransform(bits, coeffs + n * 16, y_dst + kScan[n]);
-          }
+      if (dec->mb_y_ > 0) {
+        if (dec->mb_x_ >= dec->mb_w_ - 1) {    // on rightmost border
+          top_right[0] = top_y[15] * 0x01010101u;
+        } else {
+          memcpy(top_right, top_y + 16, sizeof(*top_right));
         }
       }
-      {
-        // Chroma
-        const uint32_t bits_uv = block->non_zero_uv_;
-        const int pred_func = CheckMode(mb_x, mb_y, block->uvmode_);
-        VP8PredChroma8[pred_func](u_dst);
-        VP8PredChroma8[pred_func](v_dst);
-        DoUVTransform(bits_uv >> 0, coeffs + 16 * 16, u_dst);
-        DoUVTransform(bits_uv >> 8, coeffs + 20 * 16, v_dst);
+      // replicate the top-right pixels below
+      top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0];
+
+      // predict and add residuals for all 4x4 blocks in turn.
+      for (n = 0; n < 16; n++) {
+        uint8_t* const dst = y_dst + kScan[n];
+        VP8PredLuma4[dec->imodes_[n]](dst);
+        if (dec->non_zero_ac_ & (1 << n)) {
+          VP8Transform(coeffs + n * 16, dst, 0);
+        } else if (dec->non_zero_ & (1 << n)) {  // only DC is present
+          VP8TransformDC(coeffs + n * 16, dst);
+        }
       }
-
-      // stash away top samples for next block
-      if (mb_y < dec->mb_h_ - 1) {
-        memcpy(top_yuv[0].y, y_dst + 15 * BPS, 16);
-        memcpy(top_yuv[0].u, u_dst +  7 * BPS,  8);
-        memcpy(top_yuv[0].v, v_dst +  7 * BPS,  8);
+    } else {    // 16x16
+      const int pred_func = CheckMode(dec, dec->imodes_[0]);
+      VP8PredLuma16[pred_func](y_dst);
+      if (dec->non_zero_) {
+        for (n = 0; n < 16; n++) {
+          uint8_t* const dst = y_dst + kScan[n];
+          if (dec->non_zero_ac_ & (1 << n)) {
+            VP8Transform(coeffs + n * 16, dst, 0);
+          } else if (dec->non_zero_ & (1 << n)) {  // only DC is present
+            VP8TransformDC(coeffs + n * 16, dst);
+          }
+        }
       }
     }
-    // Transfer reconstructed samples from yuv_b_ cache to final destination.
     {
-      const int y_offset = cache_id * 16 * dec->cache_y_stride_;
-      const int uv_offset = cache_id * 8 * dec->cache_uv_stride_;
-      uint8_t* const y_out = dec->cache_y_ + mb_x * 16 + y_offset;
-      uint8_t* const u_out = dec->cache_u_ + mb_x * 8 + uv_offset;
-      uint8_t* const v_out = dec->cache_v_ + mb_x * 8 + uv_offset;
-      for (j = 0; j < 16; ++j) {
-        memcpy(y_out + j * dec->cache_y_stride_, y_dst + j * BPS, 16);
+      // Chroma
+      const int pred_func = CheckMode(dec, dec->uvmode_);
+      VP8PredChroma8[pred_func](u_dst);
+      VP8PredChroma8[pred_func](v_dst);
+
+      if (dec->non_zero_ & 0x0f0000) {   // chroma-U
+        const int16_t* const u_coeffs = dec->coeffs_ + 16 * 16;
+        if (dec->non_zero_ac_ & 0x0f0000) {
+          VP8TransformUV(u_coeffs, u_dst);
+        } else {
+          VP8TransformDCUV(u_coeffs, u_dst);
+        }
       }
-      for (j = 0; j < 8; ++j) {
-        memcpy(u_out + j * dec->cache_uv_stride_, u_dst + j * BPS, 8);
-        memcpy(v_out + j * dec->cache_uv_stride_, v_dst + j * BPS, 8);
+      if (dec->non_zero_ & 0xf00000) {   // chroma-V
+        const int16_t* const v_coeffs = dec->coeffs_ + 20 * 16;
+        if (dec->non_zero_ac_ & 0xf00000) {
+          VP8TransformUV(v_coeffs, v_dst);
+        } else {
+          VP8TransformDCUV(v_coeffs, v_dst);
+        }
+      }
+
+      // stash away top samples for next block
+      if (dec->mb_y_ < dec->mb_h_ - 1) {
+        memcpy(top_y, y_dst + 15 * BPS, 16);
+        memcpy(top_u, u_dst +  7 * BPS,  8);
+        memcpy(top_v, v_dst +  7 * BPS,  8);
       }
     }
   }
@@ -816,3 +674,6 @@ static void ReconstructRow(const VP8Decoder* const dec,
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/dec/idec.c b/drivers/webp/dec/idec.c
index 40d5ff6e0d..7df790ced8 100644
--- a/drivers/webp/dec/idec.c
+++ b/drivers/webp/dec/idec.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Incremental decoding
@@ -15,11 +13,14 @@
 #include <string.h>
 #include <stdlib.h>
 
-#include "./alphai.h"
 #include "./webpi.h"
 #include "./vp8i.h"
 #include "../utils/utils.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 // In append mode, buffer allocations increase as multiples of this value.
 // Needs to be a power of 2.
 #define CHUNK_SIZE 4096
@@ -28,13 +29,11 @@
 //------------------------------------------------------------------------------
 // Data structures for memory and states
 
-// Decoding states. State normally flows as:
-// WEBP_HEADER->VP8_HEADER->VP8_PARTS0->VP8_DATA->DONE for a lossy image, and
-// WEBP_HEADER->VP8L_HEADER->VP8L_DATA->DONE for a lossless image.
+// Decoding states. State normally flows like HEADER->PARTS0->DATA->DONE.
 // If there is any error the decoder goes into state ERROR.
 typedef enum {
-  STATE_WEBP_HEADER,  // All the data before that of the VP8/VP8L chunk.
-  STATE_VP8_HEADER,   // The VP8 Frame header (within the VP8 chunk).
+  STATE_PRE_VP8,  // All data before that of the first VP8 chunk.
+  STATE_VP8_FRAME_HEADER,  // For VP8 Frame header (within VP8 chunk).
   STATE_VP8_PARTS0,
   STATE_VP8_DATA,
   STATE_VP8L_HEADER,
@@ -98,23 +97,6 @@ static WEBP_INLINE size_t MemDataSize(const MemBuffer* mem) {
   return (mem->end_ - mem->start_);
 }
 
-// Check if we need to preserve the compressed alpha data, as it may not have
-// been decoded yet.
-static int NeedCompressedAlpha(const WebPIDecoder* const idec) {
-  if (idec->state_ == STATE_WEBP_HEADER) {
-    // We haven't parsed the headers yet, so we don't know whether the image is
-    // lossy or lossless. This also means that we haven't parsed the ALPH chunk.
-    return 0;
-  }
-  if (idec->is_lossless_) {
-    return 0;  // ALPH chunk is not present for lossless images.
-  } else {
-    const VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
-    assert(dec != NULL);  // Must be true as idec->state_ != STATE_WEBP_HEADER.
-    return (dec->alpha_data_ != NULL) && !dec->is_alpha_decoded_;
-  }
-}
-
 static void DoRemap(WebPIDecoder* const idec, ptrdiff_t offset) {
   MemBuffer* const mem = &idec->mem_;
   const uint8_t* const new_base = mem->buf_ + mem->start_;
@@ -140,22 +122,6 @@ static void DoRemap(WebPIDecoder* const idec, ptrdiff_t offset) {
       }
       assert(last_part >= 0);
       dec->parts_[last_part].buf_end_ = mem->buf_ + mem->end_;
-      if (NeedCompressedAlpha(idec)) {
-        ALPHDecoder* const alph_dec = dec->alph_dec_;
-        dec->alpha_data_ += offset;
-        if (alph_dec != NULL) {
-          if (alph_dec->method_ == ALPHA_LOSSLESS_COMPRESSION) {
-            VP8LDecoder* const alph_vp8l_dec = alph_dec->vp8l_dec_;
-            assert(alph_vp8l_dec != NULL);
-            assert(dec->alpha_data_size_ >= ALPHA_HEADER_LEN);
-            VP8LBitReaderSetBuffer(&alph_vp8l_dec->br_,
-                                   dec->alpha_data_ + ALPHA_HEADER_LEN,
-                                   dec->alpha_data_size_ - ALPHA_HEADER_LEN);
-          } else {  // alph_dec->method_ == ALPHA_NO_COMPRESSION
-            // Nothing special to do in this case.
-          }
-        }
-      }
     } else {    // Resize lossless bitreader
       VP8LDecoder* const dec = (VP8LDecoder*)idec->dec_;
       VP8LBitReaderSetBuffer(&dec->br_, new_base, MemDataSize(mem));
@@ -167,12 +133,8 @@ static void DoRemap(WebPIDecoder* const idec, ptrdiff_t offset) {
 // size if required and also updates VP8BitReader's if new memory is allocated.
 static int AppendToMemBuffer(WebPIDecoder* const idec,
                              const uint8_t* const data, size_t data_size) {
-  VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
   MemBuffer* const mem = &idec->mem_;
-  const int need_compressed_alpha = NeedCompressedAlpha(idec);
-  const uint8_t* const old_start = mem->buf_ + mem->start_;
-  const uint8_t* const old_base =
-      need_compressed_alpha ? dec->alpha_data_ : old_start;
+  const uint8_t* const old_base = mem->buf_ + mem->start_;
   assert(mem->mode_ == MEM_MODE_APPEND);
   if (data_size > MAX_CHUNK_PAYLOAD) {
     // security safeguard: trying to allocate more than what the format
@@ -181,8 +143,7 @@ static int AppendToMemBuffer(WebPIDecoder* const idec,
   }
 
   if (mem->end_ + data_size > mem->buf_size_) {  // Need some free memory
-    const size_t new_mem_start = old_start - old_base;
-    const size_t current_size = MemDataSize(mem) + new_mem_start;
+    const size_t current_size = MemDataSize(mem);
     const uint64_t new_size = (uint64_t)current_size + data_size;
     const uint64_t extra_size = (new_size + CHUNK_SIZE - 1) & ~(CHUNK_SIZE - 1);
     uint8_t* const new_buf =
@@ -192,7 +153,7 @@ static int AppendToMemBuffer(WebPIDecoder* const idec,
     free(mem->buf_);
     mem->buf_ = new_buf;
     mem->buf_size_ = (size_t)extra_size;
-    mem->start_ = new_mem_start;
+    mem->start_ = 0;
     mem->end_ = current_size;
   }
 
@@ -200,15 +161,14 @@ static int AppendToMemBuffer(WebPIDecoder* const idec,
   mem->end_ += data_size;
   assert(mem->end_ <= mem->buf_size_);
 
-  DoRemap(idec, mem->buf_ + mem->start_ - old_start);
+  DoRemap(idec, mem->buf_ + mem->start_ - old_base);
   return 1;
 }
 
 static int RemapMemBuffer(WebPIDecoder* const idec,
                           const uint8_t* const data, size_t data_size) {
   MemBuffer* const mem = &idec->mem_;
-  const uint8_t* const old_buf = mem->buf_;
-  const uint8_t* const old_start = old_buf + mem->start_;
+  const uint8_t* const old_base = mem->buf_ + mem->start_;
   assert(mem->mode_ == MEM_MODE_MAP);
 
   if (data_size < mem->buf_size_) return 0;  // can't remap to a shorter buffer!
@@ -216,7 +176,7 @@ static int RemapMemBuffer(WebPIDecoder* const idec,
   mem->buf_ = (uint8_t*)data;
   mem->end_ = mem->buf_size_ = data_size;
 
-  DoRemap(idec, mem->buf_ + mem->start_ - old_start);
+  DoRemap(idec, mem->buf_ + mem->start_ - old_base);
   return 1;
 }
 
@@ -282,7 +242,7 @@ static void RestoreContext(const MBContext* context, VP8Decoder* const dec,
 static VP8StatusCode IDecError(WebPIDecoder* const idec, VP8StatusCode error) {
   if (idec->state_ == STATE_VP8_DATA) {
     VP8Io* const io = &idec->io_;
-    if (io->teardown != NULL) {
+    if (io->teardown) {
       io->teardown(io);
     }
   }
@@ -325,9 +285,15 @@ static VP8StatusCode DecodeWebPHeaders(WebPIDecoder* const idec) {
       return VP8_STATUS_OUT_OF_MEMORY;
     }
     idec->dec_ = dec;
+#ifdef WEBP_USE_THREAD
+    dec->use_threads_ = (idec->params_.options != NULL) &&
+                        (idec->params_.options->use_threads > 0);
+#else
+    dec->use_threads_ = 0;
+#endif
     dec->alpha_data_ = headers.alpha_data;
     dec->alpha_data_size_ = headers.alpha_data_size;
-    ChangeState(idec, STATE_VP8_HEADER, headers.offset);
+    ChangeState(idec, STATE_VP8_FRAME_HEADER, headers.offset);
   } else {
     VP8LDecoder* const dec = VP8LNew();
     if (dec == NULL) {
@@ -342,14 +308,13 @@ static VP8StatusCode DecodeWebPHeaders(WebPIDecoder* const idec) {
 static VP8StatusCode DecodeVP8FrameHeader(WebPIDecoder* const idec) {
   const uint8_t* data = idec->mem_.buf_ + idec->mem_.start_;
   const size_t curr_size = MemDataSize(&idec->mem_);
-  int width, height;
   uint32_t bits;
 
   if (curr_size < VP8_FRAME_HEADER_SIZE) {
     // Not enough data bytes to extract VP8 Frame Header.
     return VP8_STATUS_SUSPENDED;
   }
-  if (!VP8GetInfo(data, curr_size, idec->chunk_size_, &width, &height)) {
+  if (!VP8GetInfo(data, curr_size, idec->chunk_size_, NULL, NULL)) {
     return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR);
   }
 
@@ -416,10 +381,7 @@ static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) {
   if (dec->status_ != VP8_STATUS_OK) {
     return IDecError(idec, dec->status_);
   }
-  // This change must be done before calling VP8InitFrame()
-  dec->mt_method_ = VP8GetThreadMethod(params->options, NULL,
-                                       io->width, io->height);
-  VP8InitDithering(params->options, dec);
+
   if (!CopyParts0Data(idec)) {
     return IDecError(idec, VP8_STATUS_OUT_OF_MEMORY);
   }
@@ -445,11 +407,16 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
   VP8Io* const io = &idec->io_;
 
   assert(dec->ready_);
+
   for (; dec->mb_y_ < dec->mb_h_; ++dec->mb_y_) {
     VP8BitReader* token_br = &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
-    for (; dec->mb_x_ < dec->mb_w_; ++dec->mb_x_) {
+    if (dec->mb_x_ == 0) {
+      VP8InitScanline(dec);
+    }
+    for (; dec->mb_x_ < dec->mb_w_;  dec->mb_x_++) {
       MBContext context;
       SaveContext(dec, token_br, &context);
+
       if (!VP8DecodeMB(dec, token_br)) {
         RestoreContext(&context, dec, token_br);
         // We shouldn't fail when MAX_MB data was available
@@ -458,18 +425,20 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
         }
         return VP8_STATUS_SUSPENDED;
       }
+      VP8ReconstructBlock(dec);
+      // Store data and save block's filtering params
+      VP8StoreBlock(dec);
+
       // Release buffer only if there is only one partition
       if (dec->num_parts_ == 1) {
         idec->mem_.start_ = token_br->buf_ - idec->mem_.buf_;
         assert(idec->mem_.start_ <= idec->mem_.end_);
       }
     }
-    VP8InitScanline(dec);   // Prepare for next scanline
-
-    // Reconstruct, filter and emit the row.
     if (!VP8ProcessRow(dec, io)) {
       return IDecError(idec, VP8_STATUS_USER_ABORT);
     }
+    dec->mb_x_ = 0;
   }
   // Synchronize the thread and check for errors.
   if (!VP8ExitCritical(dec, io)) {
@@ -481,8 +450,7 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
   return VP8_STATUS_OK;
 }
 
-static VP8StatusCode ErrorStatusLossless(WebPIDecoder* const idec,
-                                         VP8StatusCode status) {
+static int ErrorStatusLossless(WebPIDecoder* const idec, VP8StatusCode status) {
   if (status == VP8_STATUS_SUSPENDED || status == VP8_STATUS_NOT_ENOUGH_DATA) {
     return VP8_STATUS_SUSPENDED;
   }
@@ -539,14 +507,14 @@ static VP8StatusCode DecodeVP8LData(WebPIDecoder* const idec) {
 static VP8StatusCode IDecode(WebPIDecoder* idec) {
   VP8StatusCode status = VP8_STATUS_SUSPENDED;
 
-  if (idec->state_ == STATE_WEBP_HEADER) {
+  if (idec->state_ == STATE_PRE_VP8) {
     status = DecodeWebPHeaders(idec);
   } else {
     if (idec->dec_ == NULL) {
       return VP8_STATUS_SUSPENDED;    // can't continue if we have no decoder.
     }
   }
-  if (idec->state_ == STATE_VP8_HEADER) {
+  if (idec->state_ == STATE_VP8_FRAME_HEADER) {
     status = DecodeVP8FrameHeader(idec);
   }
   if (idec->state_ == STATE_VP8_PARTS0) {
@@ -573,7 +541,7 @@ WebPIDecoder* WebPINewDecoder(WebPDecBuffer* output_buffer) {
     return NULL;
   }
 
-  idec->state_ = STATE_WEBP_HEADER;
+  idec->state_ = STATE_PRE_VP8;
   idec->chunk_size_ = 0;
 
   InitMemBuffer(&idec->mem_);
@@ -581,8 +549,7 @@ WebPIDecoder* WebPINewDecoder(WebPDecBuffer* output_buffer) {
   VP8InitIo(&idec->io_);
 
   WebPResetDecParams(&idec->params_);
-  idec->params_.output = (output_buffer != NULL) ? output_buffer
-                                                 : &idec->output_;
+  idec->params_.output = output_buffer ? output_buffer : &idec->output_;
   WebPInitCustomIo(&idec->params_, &idec->io_);  // Plug the I/O functions.
 
   return idec;
@@ -614,13 +581,9 @@ void WebPIDelete(WebPIDecoder* idec) {
   if (idec == NULL) return;
   if (idec->dec_ != NULL) {
     if (!idec->is_lossless_) {
-      if (idec->state_ == STATE_VP8_DATA) {
-        // Synchronize the thread, clean-up and check for errors.
-        VP8ExitCritical((VP8Decoder*)idec->dec_, &idec->io_);
-      }
-      VP8Delete((VP8Decoder*)idec->dec_);
+      VP8Delete(idec->dec_);
     } else {
-      VP8LDelete((VP8LDecoder*)idec->dec_);
+      VP8LDelete(idec->dec_);
     }
   }
   ClearMemBuffer(&idec->mem_);
@@ -633,22 +596,12 @@ void WebPIDelete(WebPIDecoder* idec) {
 
 WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
                           size_t output_buffer_size, int output_stride) {
-  const int is_external_memory = (output_buffer != NULL);
   WebPIDecoder* idec;
-
   if (mode >= MODE_YUV) return NULL;
-  if (!is_external_memory) {    // Overwrite parameters to sane values.
-    output_buffer_size = 0;
-    output_stride = 0;
-  } else {  // A buffer was passed. Validate the other params.
-    if (output_stride == 0 || output_buffer_size == 0) {
-      return NULL;   // invalid parameter.
-    }
-  }
   idec = WebPINewDecoder(NULL);
   if (idec == NULL) return NULL;
   idec->output_.colorspace = mode;
-  idec->output_.is_external_memory = is_external_memory;
+  idec->output_.is_external_memory = 1;
   idec->output_.u.RGBA.rgba = output_buffer;
   idec->output_.u.RGBA.stride = output_stride;
   idec->output_.u.RGBA.size = output_buffer_size;
@@ -659,30 +612,10 @@ WebPIDecoder* WebPINewYUVA(uint8_t* luma, size_t luma_size, int luma_stride,
                            uint8_t* u, size_t u_size, int u_stride,
                            uint8_t* v, size_t v_size, int v_stride,
                            uint8_t* a, size_t a_size, int a_stride) {
-  const int is_external_memory = (luma != NULL);
-  WebPIDecoder* idec;
-  WEBP_CSP_MODE colorspace;
-
-  if (!is_external_memory) {    // Overwrite parameters to sane values.
-    luma_size = u_size = v_size = a_size = 0;
-    luma_stride = u_stride = v_stride = a_stride = 0;
-    u = v = a = NULL;
-    colorspace = MODE_YUVA;
-  } else {  // A luma buffer was passed. Validate the other parameters.
-    if (u == NULL || v == NULL) return NULL;
-    if (luma_size == 0 || u_size == 0 || v_size == 0) return NULL;
-    if (luma_stride == 0 || u_stride == 0 || v_stride == 0) return NULL;
-    if (a != NULL) {
-      if (a_size == 0 || a_stride == 0) return NULL;
-    }
-    colorspace = (a == NULL) ? MODE_YUV : MODE_YUVA;
-  }
-
-  idec = WebPINewDecoder(NULL);
+  WebPIDecoder* const idec = WebPINewDecoder(NULL);
   if (idec == NULL) return NULL;
-
-  idec->output_.colorspace = colorspace;
-  idec->output_.is_external_memory = is_external_memory;
+  idec->output_.colorspace = (a == NULL) ? MODE_YUV : MODE_YUVA;
+  idec->output_.is_external_memory = 1;
   idec->output_.u.YUVA.y = luma;
   idec->output_.u.YUVA.y_stride = luma_stride;
   idec->output_.u.YUVA.y_size = luma_size;
@@ -835,7 +768,7 @@ int WebPISetIOHooks(WebPIDecoder* const idec,
                     VP8IoSetupHook setup,
                     VP8IoTeardownHook teardown,
                     void* user_data) {
-  if (idec == NULL || idec->state_ > STATE_WEBP_HEADER) {
+  if (idec == NULL || idec->state_ > STATE_PRE_VP8) {
     return 0;
   }
 
@@ -847,3 +780,6 @@ int WebPISetIOHooks(WebPIDecoder* const idec,
   return 1;
 }
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/dec/io.c b/drivers/webp/dec/io.c
index 1ba376ed27..594804c2e6 100644
--- a/drivers/webp/dec/io.c
+++ b/drivers/webp/dec/io.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // functions for sample output.
@@ -18,6 +16,10 @@
 #include "../dsp/dsp.h"
 #include "../dsp/yuv.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // Main YUV<->RGB conversion functions
 
@@ -115,7 +117,7 @@ static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {
 
   if (y == 0) {
     // First line is special cased. We mirror the u/v samples at boundary.
-    upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, mb_w);
+    upsample(NULL, cur_y, cur_u, cur_v, cur_u, cur_v, NULL, dst, mb_w);
   } else {
     // We can finish the left-over line from previous call.
     upsample(p->tmp_y, cur_y, top_u, top_v, cur_u, cur_v,
@@ -599,7 +601,7 @@ static int CustomPut(const VP8Io* io) {
     return 0;
   }
   num_lines_out = p->emit(io, p);
-  if (p->emit_alpha != NULL) {
+  if (p->emit_alpha) {
     p->emit_alpha(io, p);
   }
   p->last_y += num_lines_out;
@@ -626,3 +628,6 @@ void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io) {
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/dec/layer.c b/drivers/webp/dec/layer.c
index dacb9e23cd..a3a5bdcfe8 100644
--- a/drivers/webp/dec/layer.c
+++ b/drivers/webp/dec/layer.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Enhancement layer (for YUV444/422)
@@ -16,6 +14,10 @@
 
 #include "./vp8i.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 
 int VP8DecodeLayer(VP8Decoder* const dec) {
@@ -28,3 +30,6 @@ int VP8DecodeLayer(VP8Decoder* const dec) {
   return 1;
 }
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/dec/quant.c b/drivers/webp/dec/quant.c
index 5b648f942c..d54097af0d 100644
--- a/drivers/webp/dec/quant.c
+++ b/drivers/webp/dec/quant.c
@@ -1,10 +1,8 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Quantizer initialization
@@ -13,6 +11,10 @@
 
 #include "./vp8i.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 static WEBP_INLINE int clip(int v, int M) {
   return v < 0 ? 0 : v > M ? M : v;
 }
@@ -100,11 +102,12 @@ void VP8ParseQuant(VP8Decoder* const dec) {
 
       m->uv_mat_[0] = kDcTable[clip(q + dquv_dc, 117)];
       m->uv_mat_[1] = kAcTable[clip(q + dquv_ac, 127)];
-
-      m->uv_quant_ = q + dquv_ac;   // for dithering strength evaluation
     }
   }
 }
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/dec/tree.c b/drivers/webp/dec/tree.c
index bf9b7c551a..82484e4c55 100644
--- a/drivers/webp/dec/tree.c
+++ b/drivers/webp/dec/tree.c
@@ -1,10 +1,8 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Coding trees and probas
@@ -15,6 +13,10 @@
 
 #define USE_GENERIC_TREE
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 #ifdef USE_GENERIC_TREE
 static const int8_t kYModesIntra4[18] = {
   -B_DC_PRED, 1,
@@ -29,12 +31,61 @@ static const int8_t kYModesIntra4[18] = {
 };
 #endif
 
+#ifndef ONLY_KEYFRAME_CODE
+
+// inter prediction modes
+enum {
+  LEFT4 = 0, ABOVE4 = 1, ZERO4 = 2, NEW4 = 3,
+  NEARESTMV, NEARMV, ZEROMV, NEWMV, SPLITMV };
+
+static const int8_t kYModesInter[8] = {
+  -DC_PRED, 1,
+    2, 3,
+      -V_PRED, -H_PRED,
+      -TM_PRED, -B_PRED
+};
+
+static const int8_t kMBSplit[6] = {
+  -3, 1,
+    -2, 2,
+      -0, -1
+};
+
+static const int8_t kMVRef[8] = {
+  -ZEROMV, 1,
+    -NEARESTMV, 2,
+      -NEARMV, 3,
+        -NEWMV, -SPLITMV
+};
+
+static const int8_t kMVRef4[6] = {
+  -LEFT4, 1,
+    -ABOVE4, 2,
+      -ZERO4, -NEW4
+};
+#endif
+
 //------------------------------------------------------------------------------
 // Default probabilities
 
+// Inter
+#ifndef ONLY_KEYFRAME_CODE
+static const uint8_t kYModeProbaInter0[4] = { 112, 86, 140, 37 };
+static const uint8_t kUVModeProbaInter0[3] = { 162, 101, 204 };
+static const uint8_t kMVProba0[2][NUM_MV_PROBAS] = {
+  { 162, 128, 225, 146, 172, 147, 214,  39,
+    156, 128, 129, 132,  75, 145, 178, 206,
+    239, 254, 254 },
+  { 164, 128, 204, 170, 119, 235, 140, 230,
+    228, 128, 130, 130,  74, 148, 180, 203,
+    236, 254, 254 }
+};
+#endif
+
 // Paragraph 13.5
 static const uint8_t
   CoeffsProba0[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
+  // generated using vp8_default_coef_probs() in entropy.c:129
   { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
       { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
       { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
@@ -275,25 +326,28 @@ static const uint8_t kBModesProba[NUM_BMODES][NUM_BMODES][NUM_BMODES - 1] = {
 
 void VP8ResetProba(VP8Proba* const proba) {
   memset(proba->segments_, 255u, sizeof(proba->segments_));
-  // proba->bands_[][] is initialized later
+  memcpy(proba->coeffs_, CoeffsProba0, sizeof(CoeffsProba0));
+#ifndef ONLY_KEYFRAME_CODE
+  memcpy(proba->mv_, kMVProba0, sizeof(kMVProba0));
+  memcpy(proba->ymode_, kYModeProbaInter0, sizeof(kYModeProbaInter0));
+  memcpy(proba->uvmode_, kUVModeProbaInter0, sizeof(kUVModeProbaInter0));
+#endif
 }
 
-void VP8ParseIntraMode(VP8BitReader* const br, VP8Decoder* const dec) {
+void VP8ParseIntraMode(VP8BitReader* const br,  VP8Decoder* const dec) {
   uint8_t* const top = dec->intra_t_ + 4 * dec->mb_x_;
   uint8_t* const left = dec->intra_l_;
-  VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
-
-  block->is_i4x4_ = !VP8GetBit(br, 145);   // decide for B_PRED first
-  if (!block->is_i4x4_) {
-    // Hardcoded 16x16 intra-mode decision tree.
+  // Hardcoded 16x16 intra-mode decision tree.
+  dec->is_i4x4_ = !VP8GetBit(br, 145);   // decide for B_PRED first
+  if (!dec->is_i4x4_) {
     const int ymode =
         VP8GetBit(br, 156) ? (VP8GetBit(br, 128) ? TM_PRED : H_PRED)
                            : (VP8GetBit(br, 163) ? V_PRED : DC_PRED);
-    block->imodes_[0] = ymode;
-    memset(top, ymode, 4 * sizeof(*top));
-    memset(left, ymode, 4 * sizeof(*left));
+    dec->imodes_[0] = ymode;
+    memset(top, ymode, 4 * sizeof(top[0]));
+    memset(left, ymode, 4 * sizeof(left[0]));
   } else {
-    uint8_t* modes = block->imodes_;
+    uint8_t* modes = dec->imodes_;
     int y;
     for (y = 0; y < 4; ++y) {
       int ymode = left[y];
@@ -302,10 +356,10 @@ void VP8ParseIntraMode(VP8BitReader* const br, VP8Decoder* const dec) {
         const uint8_t* const prob = kBModesProba[top[x]][ymode];
 #ifdef USE_GENERIC_TREE
         // Generic tree-parsing
-        int i = kYModesIntra4[VP8GetBit(br, prob[0])];
-        while (i > 0) {
+        int i = 0;
+        do {
           i = kYModesIntra4[2 * i + VP8GetBit(br, prob[i])];
-        }
+        } while (i > 0);
         ymode = -i;
 #else
         // Hardcoded tree parsing
@@ -320,16 +374,15 @@ void VP8ParseIntraMode(VP8BitReader* const br, VP8Decoder* const dec) {
                             (!VP8GetBit(br, prob[8]) ? B_HD_PRED : B_HU_PRED)));
 #endif    // USE_GENERIC_TREE
         top[x] = ymode;
+        *modes++ = ymode;
       }
-      memcpy(modes, top, 4 * sizeof(*top));
-      modes += 4;
       left[y] = ymode;
     }
   }
   // Hardcoded UVMode decision tree
-  block->uvmode_ = !VP8GetBit(br, 142) ? DC_PRED
-                 : !VP8GetBit(br, 114) ? V_PRED
-                 : VP8GetBit(br, 183) ? TM_PRED : H_PRED;
+  dec->uvmode_ = !VP8GetBit(br, 142) ? DC_PRED
+               : !VP8GetBit(br, 114) ? V_PRED
+               : VP8GetBit(br, 183) ? TM_PRED : H_PRED;
 }
 
 //------------------------------------------------------------------------------
@@ -471,6 +524,17 @@ static const uint8_t
   }
 };
 
+#ifndef ONLY_KEYFRAME_CODE
+static const uint8_t MVUpdateProba[2][NUM_MV_PROBAS] = {
+  { 237, 246, 253, 253, 254, 254, 254, 254,
+    254, 254, 254, 254, 254, 254, 250, 250,
+    252, 254, 254 },
+  { 231, 243, 245, 253, 254, 254, 254, 254,
+    254, 254, 254, 254, 254, 254, 251, 251,
+    254, 254, 254 }
+};
+#endif
+
 // Paragraph 9.9
 void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
   VP8Proba* const proba = &dec->proba_;
@@ -479,9 +543,9 @@ void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
     for (b = 0; b < NUM_BANDS; ++b) {
       for (c = 0; c < NUM_CTX; ++c) {
         for (p = 0; p < NUM_PROBAS; ++p) {
-          const int v = VP8GetBit(br, CoeffsUpdateProba[t][b][c][p]) ?
-                        VP8GetValue(br, 8) : CoeffsProba0[t][b][c][p];
-          proba->bands_[t][b].probas_[c][p] = v;
+          if (VP8GetBit(br, CoeffsUpdateProba[t][b][c][p])) {
+            proba->coeffs_[t][b][c][p] = VP8GetValue(br, 8);
+          }
         }
       }
     }
@@ -490,5 +554,36 @@ void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
   if (dec->use_skip_proba_) {
     dec->skip_p_ = VP8GetValue(br, 8);
   }
+#ifndef ONLY_KEYFRAME_CODE
+  if (!dec->frm_hdr_.key_frame_) {
+    int i;
+    dec->intra_p_ = VP8GetValue(br, 8);
+    dec->last_p_ = VP8GetValue(br, 8);
+    dec->golden_p_ = VP8GetValue(br, 8);
+    if (VP8Get(br)) {   // update y-mode
+      for (i = 0; i < 4; ++i) {
+        proba->ymode_[i] = VP8GetValue(br, 8);
+      }
+    }
+    if (VP8Get(br)) {   // update uv-mode
+      for (i = 0; i < 3; ++i) {
+        proba->uvmode_[i] = VP8GetValue(br, 8);
+      }
+    }
+    // update MV
+    for (i = 0; i < 2; ++i) {
+      int k;
+      for (k = 0; k < NUM_MV_PROBAS; ++k) {
+        if (VP8GetBit(br, MVUpdateProba[i][k])) {
+          const int v = VP8GetValue(br, 7);
+          proba->mv_[i][k] = v ? v << 1 : 1;
+        }
+      }
+    }
+  }
+#endif
 }
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/dec/vp8.c b/drivers/webp/dec/vp8.c
index bfd0e8f9d3..b0ccfa2a06 100644
--- a/drivers/webp/dec/vp8.c
+++ b/drivers/webp/dec/vp8.c
@@ -1,10 +1,8 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // main entry for the decoder
@@ -13,12 +11,15 @@
 
 #include <stdlib.h>
 
-#include "./alphai.h"
 #include "./vp8i.h"
 #include "./vp8li.h"
 #include "./webpi.h"
 #include "../utils/bit_reader.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 
 int WebPGetDecoderVersion(void) {
@@ -120,9 +121,6 @@ int VP8GetInfo(const uint8_t* data, size_t data_size, size_t chunk_size,
     if (((bits >> 5)) >= chunk_size) {  // partition_length
       return 0;         // inconsistent size information.
     }
-    if (w == 0 || h == 0) {
-      return 0;         // We don't support both width and height to be zero.
-    }
 
     if (width) {
       *width = w;
@@ -238,6 +236,20 @@ static int ParseFilterHeader(VP8BitReader* br, VP8Decoder* const dec) {
     }
   }
   dec->filter_type_ = (hdr->level_ == 0) ? 0 : hdr->simple_ ? 1 : 2;
+  if (dec->filter_type_ > 0) {    // precompute filter levels per segment
+    if (dec->segment_hdr_.use_segment_) {
+      int s;
+      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+        int strength = dec->segment_hdr_.filter_strength_[s];
+        if (!dec->segment_hdr_.absolute_delta_) {
+          strength += hdr->level_;
+        }
+        dec->filter_levels_[s] = strength;
+      }
+    } else {
+      dec->filter_levels_[0] = hdr->level_;
+    }
+  }
   return !br->eof_;
 }
 
@@ -249,6 +261,7 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
   VP8PictureHeader* pic_hdr;
   VP8BitReader* br;
   VP8StatusCode status;
+  WebPHeaderStructure headers;
 
   if (dec == NULL) {
     return 0;
@@ -258,8 +271,33 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
     return VP8SetError(dec, VP8_STATUS_INVALID_PARAM,
                        "null VP8Io passed to VP8GetHeaders()");
   }
-  buf = io->data;
-  buf_size = io->data_size;
+
+  // Process Pre-VP8 chunks.
+  headers.data = io->data;
+  headers.data_size = io->data_size;
+  status = WebPParseHeaders(&headers);
+  if (status != VP8_STATUS_OK) {
+    return VP8SetError(dec, status, "Incorrect/incomplete header.");
+  }
+  if (headers.is_lossless) {
+    return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
+                       "Unexpected lossless format encountered.");
+  }
+
+  if (dec->alpha_data_ == NULL) {
+    assert(dec->alpha_data_size_ == 0);
+    // We have NOT set alpha data yet. Set it now.
+    // (This is to ensure that dec->alpha_data_ is NOT reset to NULL if
+    // WebPParseHeaders() is called more than once, as in incremental decoding
+    // case.)
+    dec->alpha_data_ = headers.alpha_data;
+    dec->alpha_data_size_ = headers.alpha_data_size;
+  }
+
+  // Process the VP8 frame header.
+  buf = headers.data + headers.offset;
+  buf_size = headers.data_size - headers.offset;
+  assert(headers.data_size >= headers.offset);  // WebPParseHeaders' guarantee
   if (buf_size < 4) {
     return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
                        "Truncated header.");
@@ -355,11 +393,38 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
 
   // Frame buffer marking
   if (!frm_hdr->key_frame_) {
+    // Paragraph 9.7
+#ifndef ONLY_KEYFRAME_CODE
+    dec->buffer_flags_ = VP8Get(br) << 0;   // update golden
+    dec->buffer_flags_ |= VP8Get(br) << 1;  // update alt ref
+    if (!(dec->buffer_flags_ & 1)) {
+      dec->buffer_flags_ |= VP8GetValue(br, 2) << 2;
+    }
+    if (!(dec->buffer_flags_ & 2)) {
+      dec->buffer_flags_ |= VP8GetValue(br, 2) << 4;
+    }
+    dec->buffer_flags_ |= VP8Get(br) << 6;    // sign bias golden
+    dec->buffer_flags_ |= VP8Get(br) << 7;    // sign bias alt ref
+#else
     return VP8SetError(dec, VP8_STATUS_UNSUPPORTED_FEATURE,
                        "Not a key frame.");
+#endif
+  } else {
+    dec->buffer_flags_ = 0x003 | 0x100;
   }
 
-  VP8Get(br);   // ignore the value of update_proba_
+  // Paragraph 9.8
+#ifndef ONLY_KEYFRAME_CODE
+  dec->update_proba_ = VP8Get(br);
+  if (!dec->update_proba_) {    // save for later restore
+    dec->proba_saved_ = dec->proba_;
+  }
+  dec->buffer_flags_ &= 1 << 8;
+  dec->buffer_flags_ |=
+      (frm_hdr->key_frame_ || VP8Get(br)) << 8;    // refresh last frame
+#else
+  VP8Get(br);   // just ignore the value of update_proba_
+#endif
 
   VP8ParseProba(br, dec);
 
@@ -393,7 +458,7 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
 //------------------------------------------------------------------------------
 // Residual decoding (Paragraph 13.2 / 13.3)
 
-static const int kBands[16 + 1] = {
+static const uint8_t kBands[16 + 1] = {
   0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
   0  // extra entry as sentinel
 };
@@ -408,163 +473,168 @@ static const uint8_t kZigzag[16] = {
   0, 1, 4, 8,  5, 2, 3, 6,  9, 12, 13, 10,  7, 11, 14, 15
 };
 
-// See section 13-2: http://tools.ietf.org/html/rfc6386#section-13.2
-static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) {
-  int v;
-  if (!VP8GetBit(br, p[3])) {
-    if (!VP8GetBit(br, p[4])) {
-      v = 2;
-    } else {
-      v = 3 + VP8GetBit(br, p[5]);
-    }
-  } else {
-    if (!VP8GetBit(br, p[6])) {
-      if (!VP8GetBit(br, p[7])) {
-        v = 5 + VP8GetBit(br, 159);
-      } else {
-        v = 7 + 2 * VP8GetBit(br, 165);
-        v += VP8GetBit(br, 145);
-      }
-    } else {
-      const uint8_t* tab;
-      const int bit1 = VP8GetBit(br, p[8]);
-      const int bit0 = VP8GetBit(br, p[9 + bit1]);
-      const int cat = 2 * bit1 + bit0;
-      v = 0;
-      for (tab = kCat3456[cat]; *tab; ++tab) {
-        v += v + VP8GetBit(br, *tab);
-      }
-      v += 3 + (8 << cat);
-    }
-  }
-  return v;
-}
+typedef const uint8_t (*ProbaArray)[NUM_CTX][NUM_PROBAS];  // for const-casting
 
 // Returns the position of the last non-zero coeff plus one
-static int GetCoeffs(VP8BitReader* const br, const VP8BandProbas* const prob,
+// (and 0 if there's no coeff at all)
+static int GetCoeffs(VP8BitReader* const br, ProbaArray prob,
                      int ctx, const quant_t dq, int n, int16_t* out) {
   // n is either 0 or 1 here. kBands[n] is not necessary for extracting '*p'.
-  const uint8_t* p = prob[n].probas_[ctx];
-  for (; n < 16; ++n) {
-    if (!VP8GetBit(br, p[0])) {
-      return n;  // previous coeff was last non-zero coeff
-    }
-    while (!VP8GetBit(br, p[1])) {       // sequence of zero coeffs
-      p = prob[kBands[++n]].probas_[0];
-      if (n == 16) return 16;
-    }
-    {        // non zero coeff
-      const VP8ProbaArray* const p_ctx = &prob[kBands[n + 1]].probas_[0];
-      int v;
+  const uint8_t* p = prob[n][ctx];
+  if (!VP8GetBit(br, p[0])) {   // first EOB is more a 'CBP' bit.
+    return 0;
+  }
+  while (1) {
+    ++n;
+    if (!VP8GetBit(br, p[1])) {
+      p = prob[kBands[n]][0];
+    } else {  // non zero coeff
+      int v, j;
       if (!VP8GetBit(br, p[2])) {
+        p = prob[kBands[n]][1];
         v = 1;
-        p = p_ctx[1];
       } else {
-        v = GetLargeValue(br, p);
-        p = p_ctx[2];
+        if (!VP8GetBit(br, p[3])) {
+          if (!VP8GetBit(br, p[4])) {
+            v = 2;
+          } else {
+            v = 3 + VP8GetBit(br, p[5]);
+          }
+        } else {
+          if (!VP8GetBit(br, p[6])) {
+            if (!VP8GetBit(br, p[7])) {
+              v = 5 + VP8GetBit(br, 159);
+            } else {
+              v = 7 + 2 * VP8GetBit(br, 165);
+              v += VP8GetBit(br, 145);
+            }
+          } else {
+            const uint8_t* tab;
+            const int bit1 = VP8GetBit(br, p[8]);
+            const int bit0 = VP8GetBit(br, p[9 + bit1]);
+            const int cat = 2 * bit1 + bit0;
+            v = 0;
+            for (tab = kCat3456[cat]; *tab; ++tab) {
+              v += v + VP8GetBit(br, *tab);
+            }
+            v += 3 + (8 << cat);
+          }
+        }
+        p = prob[kBands[n]][2];
       }
-      out[kZigzag[n]] = VP8GetSigned(br, v) * dq[n > 0];
+      j = kZigzag[n - 1];
+      out[j] = VP8GetSigned(br, v) * dq[j > 0];
+      if (n == 16 || !VP8GetBit(br, p[0])) {   // EOB
+        return n;
+      }
+    }
+    if (n == 16) {
+      return 16;
     }
   }
-  return 16;
 }
 
-static WEBP_INLINE uint32_t NzCodeBits(uint32_t nz_coeffs, int nz, int dc_nz) {
-  nz_coeffs <<= 2;
-  nz_coeffs |= (nz > 3) ? 3 : (nz > 1) ? 2 : dc_nz;
-  return nz_coeffs;
-}
-
-static int ParseResiduals(VP8Decoder* const dec,
-                          VP8MB* const mb, VP8BitReader* const token_br) {
-  VP8BandProbas (* const bands)[NUM_BANDS] = dec->proba_.bands_;
-  const VP8BandProbas* ac_proba;
-  const VP8QuantMatrix* const q = &dec->dqm_[dec->segment_];
-  VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
-  int16_t* dst = block->coeffs_;
+// Alias-safe way of converting 4bytes to 32bits.
+typedef union {
+  uint8_t  i8[4];
+  uint32_t i32;
+} PackedNz;
+
+// Table to unpack four bits into four bytes
+static const PackedNz kUnpackTab[16] = {
+  {{0, 0, 0, 0}},  {{1, 0, 0, 0}},  {{0, 1, 0, 0}},  {{1, 1, 0, 0}},
+  {{0, 0, 1, 0}},  {{1, 0, 1, 0}},  {{0, 1, 1, 0}},  {{1, 1, 1, 0}},
+  {{0, 0, 0, 1}},  {{1, 0, 0, 1}},  {{0, 1, 0, 1}},  {{1, 1, 0, 1}},
+  {{0, 0, 1, 1}},  {{1, 0, 1, 1}},  {{0, 1, 1, 1}},  {{1, 1, 1, 1}} };
+
+// Macro to pack four LSB of four bytes into four bits.
+#if defined(__PPC__) || defined(_M_PPC) || defined(_ARCH_PPC) || \
+    defined(__BIG_ENDIAN__)
+#define PACK_CST 0x08040201U
+#else
+#define PACK_CST 0x01020408U
+#endif
+#define PACK(X, S) ((((X).i32 * PACK_CST) & 0xff000000) >> (S))
+
+static void ParseResiduals(VP8Decoder* const dec,
+                           VP8MB* const mb, VP8BitReader* const token_br) {
+  int out_t_nz, out_l_nz, first;
+  ProbaArray ac_prob;
+  const VP8QuantMatrix* q = &dec->dqm_[dec->segment_];
+  int16_t* dst = dec->coeffs_;
   VP8MB* const left_mb = dec->mb_info_ - 1;
-  uint8_t tnz, lnz;
-  uint32_t non_zero_y = 0;
-  uint32_t non_zero_uv = 0;
+  PackedNz nz_ac, nz_dc;
+  PackedNz tnz, lnz;
+  uint32_t non_zero_ac = 0;
+  uint32_t non_zero_dc = 0;
   int x, y, ch;
-  uint32_t out_t_nz, out_l_nz;
-  int first;
 
+  nz_dc.i32 = nz_ac.i32 = 0;
   memset(dst, 0, 384 * sizeof(*dst));
-  if (!block->is_i4x4_) {    // parse DC
+  if (!dec->is_i4x4_) {    // parse DC
     int16_t dc[16] = { 0 };
-    const int ctx = mb->nz_dc_ + left_mb->nz_dc_;
-    const int nz = GetCoeffs(token_br, bands[1], ctx, q->y2_mat_, 0, dc);
-    mb->nz_dc_ = left_mb->nz_dc_ = (nz > 0);
-    if (nz > 1) {   // more than just the DC -> perform the full transform
-      VP8TransformWHT(dc, dst);
-    } else {        // only DC is non-zero -> inlined simplified transform
-      int i;
-      const int dc0 = (dc[0] + 3) >> 3;
-      for (i = 0; i < 16 * 16; i += 16) dst[i] = dc0;
-    }
+    const int ctx = mb->dc_nz_ + left_mb->dc_nz_;
+    mb->dc_nz_ = left_mb->dc_nz_ =
+        (GetCoeffs(token_br, (ProbaArray)dec->proba_.coeffs_[1],
+                   ctx, q->y2_mat_, 0, dc) > 0);
     first = 1;
-    ac_proba = bands[0];
+    ac_prob = (ProbaArray)dec->proba_.coeffs_[0];
+    VP8TransformWHT(dc, dst);
   } else {
     first = 0;
-    ac_proba = bands[3];
+    ac_prob = (ProbaArray)dec->proba_.coeffs_[3];
   }
 
-  tnz = mb->nz_ & 0x0f;
-  lnz = left_mb->nz_ & 0x0f;
+  tnz = kUnpackTab[mb->nz_ & 0xf];
+  lnz = kUnpackTab[left_mb->nz_ & 0xf];
   for (y = 0; y < 4; ++y) {
-    int l = lnz & 1;
-    uint32_t nz_coeffs = 0;
+    int l = lnz.i8[y];
     for (x = 0; x < 4; ++x) {
-      const int ctx = l + (tnz & 1);
-      const int nz = GetCoeffs(token_br, ac_proba, ctx, q->y1_mat_, first, dst);
-      l = (nz > first);
-      tnz = (tnz >> 1) | (l << 7);
-      nz_coeffs = NzCodeBits(nz_coeffs, nz, dst[0] != 0);
+      const int ctx = l + tnz.i8[x];
+      const int nz = GetCoeffs(token_br, ac_prob, ctx,
+                               q->y1_mat_, first, dst);
+      tnz.i8[x] = l = (nz > 0);
+      nz_dc.i8[x] = (dst[0] != 0);
+      nz_ac.i8[x] = (nz > 1);
       dst += 16;
     }
-    tnz >>= 4;
-    lnz = (lnz >> 1) | (l << 7);
-    non_zero_y = (non_zero_y << 8) | nz_coeffs;
+    lnz.i8[y] = l;
+    non_zero_dc |= PACK(nz_dc, 24 - y * 4);
+    non_zero_ac |= PACK(nz_ac, 24 - y * 4);
   }
-  out_t_nz = tnz;
-  out_l_nz = lnz >> 4;
+  out_t_nz = PACK(tnz, 24);
+  out_l_nz = PACK(lnz, 24);
 
+  tnz = kUnpackTab[mb->nz_ >> 4];
+  lnz = kUnpackTab[left_mb->nz_ >> 4];
   for (ch = 0; ch < 4; ch += 2) {
-    uint32_t nz_coeffs = 0;
-    tnz = mb->nz_ >> (4 + ch);
-    lnz = left_mb->nz_ >> (4 + ch);
     for (y = 0; y < 2; ++y) {
-      int l = lnz & 1;
+      int l = lnz.i8[ch + y];
       for (x = 0; x < 2; ++x) {
-        const int ctx = l + (tnz & 1);
-        const int nz = GetCoeffs(token_br, bands[2], ctx, q->uv_mat_, 0, dst);
-        l = (nz > 0);
-        tnz = (tnz >> 1) | (l << 3);
-        nz_coeffs = NzCodeBits(nz_coeffs, nz, dst[0] != 0);
+        const int ctx = l + tnz.i8[ch + x];
+        const int nz =
+            GetCoeffs(token_br, (ProbaArray)dec->proba_.coeffs_[2],
+                      ctx, q->uv_mat_, 0, dst);
+        tnz.i8[ch + x] = l = (nz > 0);
+        nz_dc.i8[y * 2 + x] = (dst[0] != 0);
+        nz_ac.i8[y * 2 + x] = (nz > 1);
         dst += 16;
       }
-      tnz >>= 2;
-      lnz = (lnz >> 1) | (l << 5);
+      lnz.i8[ch + y] = l;
     }
-    // Note: we don't really need the per-4x4 details for U/V blocks.
-    non_zero_uv |= nz_coeffs << (4 * ch);
-    out_t_nz |= (tnz << 4) << ch;
-    out_l_nz |= (lnz & 0xf0) << ch;
+    non_zero_dc |= PACK(nz_dc, 8 - ch * 2);
+    non_zero_ac |= PACK(nz_ac, 8 - ch * 2);
   }
+  out_t_nz |= PACK(tnz, 20);
+  out_l_nz |= PACK(lnz, 20);
   mb->nz_ = out_t_nz;
   left_mb->nz_ = out_l_nz;
 
-  block->non_zero_y_ = non_zero_y;
-  block->non_zero_uv_ = non_zero_uv;
-
-  // We look at the mode-code of each block and check if some blocks have less
-  // than three non-zero coeffs (code < 2). This is to avoid dithering flat and
-  // empty blocks.
-  block->dither_ = (non_zero_uv & 0xaaaa) ? 0 : q->dither_;
-
-  return !(non_zero_y | non_zero_uv);  // will be used for further optimization
+  dec->non_zero_ac_ = non_zero_ac;
+  dec->non_zero_ = non_zero_ac | non_zero_dc;
+  mb->skip_ = !dec->non_zero_;
 }
+#undef PACK
 
 //------------------------------------------------------------------------------
 // Main loop
@@ -572,9 +642,7 @@ static int ParseResiduals(VP8Decoder* const dec,
 int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
   VP8BitReader* const br = &dec->br_;
   VP8MB* const left = dec->mb_info_ - 1;
-  VP8MB* const mb = dec->mb_info_ + dec->mb_x_;
-  VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
-  int skip;
+  VP8MB* const info = dec->mb_info_ + dec->mb_x_;
 
   // Note: we don't save segment map (yet), as we don't expect
   // to decode more than 1 keyframe.
@@ -584,64 +652,67 @@ int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
         VP8GetBit(br, dec->proba_.segments_[1]) :
         2 + VP8GetBit(br, dec->proba_.segments_[2]);
   }
-  skip = dec->use_skip_proba_ ? VP8GetBit(br, dec->skip_p_) : 0;
+  info->skip_ = dec->use_skip_proba_ ? VP8GetBit(br, dec->skip_p_) : 0;
 
   VP8ParseIntraMode(br, dec);
   if (br->eof_) {
     return 0;
   }
 
-  if (!skip) {
-    skip = ParseResiduals(dec, mb, token_br);
+  if (!info->skip_) {
+    ParseResiduals(dec, info, token_br);
   } else {
-    left->nz_ = mb->nz_ = 0;
-    if (!block->is_i4x4_) {
-      left->nz_dc_ = mb->nz_dc_ = 0;
+    left->nz_ = info->nz_ = 0;
+    if (!dec->is_i4x4_) {
+      left->dc_nz_ = info->dc_nz_ = 0;
     }
-    block->non_zero_y_ = 0;
-    block->non_zero_uv_ = 0;
+    dec->non_zero_ = 0;
+    dec->non_zero_ac_ = 0;
   }
 
-  if (dec->filter_type_ > 0) {  // store filter info
-    VP8FInfo* const finfo = dec->f_info_ + dec->mb_x_;
-    *finfo = dec->fstrengths_[dec->segment_][block->is_i4x4_];
-    finfo->f_inner_ |= !skip;
-  }
-
-  return !token_br->eof_;
+  return (!token_br->eof_);
 }
 
 void VP8InitScanline(VP8Decoder* const dec) {
   VP8MB* const left = dec->mb_info_ - 1;
   left->nz_ = 0;
-  left->nz_dc_ = 0;
+  left->dc_nz_ = 0;
   memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_));
-  dec->mb_x_ = 0;
+  dec->filter_row_ =
+    (dec->filter_type_ > 0) &&
+    (dec->mb_y_ >= dec->tl_mb_y_) && (dec->mb_y_ <= dec->br_mb_y_);
 }
 
 static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
   for (dec->mb_y_ = 0; dec->mb_y_ < dec->br_mb_y_; ++dec->mb_y_) {
-    // Parse bitstream for this row.
     VP8BitReader* const token_br =
         &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
-    for (; dec->mb_x_ < dec->mb_w_; ++dec->mb_x_) {
+    VP8InitScanline(dec);
+    for (dec->mb_x_ = 0; dec->mb_x_ < dec->mb_w_;  dec->mb_x_++) {
       if (!VP8DecodeMB(dec, token_br)) {
         return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
                            "Premature end-of-file encountered.");
       }
-    }
-    VP8InitScanline(dec);   // Prepare for next scanline
+      VP8ReconstructBlock(dec);
 
-    // Reconstruct, filter and emit the row.
+      // Store data and save block's filtering params
+      VP8StoreBlock(dec);
+    }
     if (!VP8ProcessRow(dec, io)) {
       return VP8SetError(dec, VP8_STATUS_USER_ABORT, "Output aborted.");
     }
   }
-  if (dec->mt_method_ > 0) {
-    if (!WebPWorkerSync(&dec->worker_)) return 0;
+  if (dec->use_threads_ && !WebPWorkerSync(&dec->worker_)) {
+    return 0;
   }
 
   // Finish
+#ifndef ONLY_KEYFRAME_CODE
+  if (!dec->update_proba_) {
+    dec->proba_ = dec->proba_saved_;
+  }
+#endif
+
 #ifdef WEBP_EXPERIMENTAL_FEATURES
   if (dec->layer_data_size_ > 0) {
     if (!VP8DecodeLayer(dec)) {
@@ -697,12 +768,12 @@ void VP8Clear(VP8Decoder* const dec) {
   if (dec == NULL) {
     return;
   }
-  if (dec->mt_method_ > 0) {
+  if (dec->use_threads_) {
     WebPWorkerEnd(&dec->worker_);
   }
-  ALPHDelete(dec->alph_dec_);
-  dec->alph_dec_ = NULL;
-  free(dec->mem_);
+  if (dec->mem_) {
+    free(dec->mem_);
+  }
   dec->mem_ = NULL;
   dec->mem_size_ = 0;
   memset(&dec->br_, 0, sizeof(dec->br_));
@@ -711,3 +782,6 @@ void VP8Clear(VP8Decoder* const dec) {
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/dec/vp8i.h b/drivers/webp/dec/vp8i.h
index 3f4cf297d9..4382edfd8e 100644
--- a/drivers/webp/dec/vp8i.h
+++ b/drivers/webp/dec/vp8i.h
@@ -1,10 +1,8 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // VP8 decoder: internal header.
@@ -17,11 +15,10 @@
 #include <string.h>     // for memcpy()
 #include "./vp8li.h"
 #include "../utils/bit_reader.h"
-#include "../utils/random.h"
 #include "../utils/thread.h"
 #include "../dsp/dsp.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
@@ -30,9 +27,11 @@ extern "C" {
 
 // version numbers
 #define DEC_MAJ_VERSION 0
-#define DEC_MIN_VERSION 4
+#define DEC_MIN_VERSION 2
 #define DEC_REV_VERSION 0
 
+#define ONLY_KEYFRAME_CODE      // to remove any code related to P-Frames
+
 // intra prediction modes
 enum { B_DC_PRED = 0,   // 4x4 modes
        B_TM_PRED,
@@ -99,9 +98,6 @@ enum { MB_FEATURE_TREE_PROBS = 3,
 #define U_OFF    (Y_OFF + BPS * 16 + BPS)
 #define V_OFF    (U_OFF + 16)
 
-// minimal width under which lossy multi-threading is always disabled
-#define MIN_WIDTH_FOR_THREADS 512
-
 //------------------------------------------------------------------------------
 // Headers
 
@@ -130,19 +126,15 @@ typedef struct {
   int8_t filter_strength_[NUM_MB_SEGMENTS];  // filter strength for segments
 } VP8SegmentHeader;
 
-
-// probas associated to one of the contexts
-typedef uint8_t VP8ProbaArray[NUM_PROBAS];
-
-typedef struct {   // all the probas associated to one band
-  VP8ProbaArray probas_[NUM_CTX];
-} VP8BandProbas;
-
 // Struct collecting all frame-persistent probabilities.
 typedef struct {
   uint8_t segments_[MB_FEATURE_TREE_PROBS];
   // Type: 0:Intra16-AC  1:Intra16-DC   2:Chroma   3:Intra4
-  VP8BandProbas bands_[NUM_TYPES][NUM_BANDS];
+  uint8_t coeffs_[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS];
+#ifndef ONLY_KEYFRAME_CODE
+  uint8_t ymode_[4], uvmode_[3];
+  uint8_t mv_[2][NUM_MV_PROBAS];
+#endif
 } VP8Proba;
 
 // Filter parameters
@@ -159,59 +151,32 @@ typedef struct {
 // Informations about the macroblocks.
 
 typedef struct {  // filter specs
-  uint8_t f_limit_;      // filter limit in [3..189], or 0 if no filtering
-  uint8_t f_ilevel_;     // inner limit in [1..63]
-  uint8_t f_inner_;      // do inner filtering?
-  uint8_t hev_thresh_;   // high edge variance threshold in [0..2]
+  unsigned int f_level_:6;      // filter strength: 0..63
+  unsigned int f_ilevel_:6;     // inner limit: 1..63
+  unsigned int f_inner_:1;      // do inner filtering?
 } VP8FInfo;
 
-typedef struct {  // Top/Left Contexts used for syntax-parsing
-  uint8_t nz_;        // non-zero AC/DC coeffs (4bit for luma + 4bit for chroma)
-  uint8_t nz_dc_;     // non-zero DC coeff (1bit)
+typedef struct {  // used for syntax-parsing
+  unsigned int nz_;          // non-zero AC/DC coeffs
+  unsigned int dc_nz_:1;     // non-zero DC coeffs
+  unsigned int skip_:1;      // set if the block has no non-zero coeffs
 } VP8MB;
 
 // Dequantization matrices
 typedef int quant_t[2];      // [DC / AC].  Can be 'uint16_t[2]' too (~slower).
 typedef struct {
   quant_t y1_mat_, y2_mat_, uv_mat_;
-
-  int uv_quant_;   // U/V quantizer value
-  int dither_;     // dithering amplitude (0 = off, max=255)
 } VP8QuantMatrix;
 
-// Data needed to reconstruct a macroblock
-typedef struct {
-  int16_t coeffs_[384];   // 384 coeffs = (16+4+4) * 4*4
-  uint8_t is_i4x4_;       // true if intra4x4
-  uint8_t imodes_[16];    // one 16x16 mode (#0) or sixteen 4x4 modes
-  uint8_t uvmode_;        // chroma prediction mode
-  // bit-wise info about the content of each sub-4x4 blocks (in decoding order).
-  // Each of the 4x4 blocks for y/u/v is associated with a 2b code according to:
-  //   code=0 -> no coefficient
-  //   code=1 -> only DC
-  //   code=2 -> first three coefficients are non-zero
-  //   code=3 -> more than three coefficients are non-zero
-  // This allows to call specialized transform functions.
-  uint32_t non_zero_y_;
-  uint32_t non_zero_uv_;
-  uint8_t dither_;      // local dithering strength (deduced from non_zero_*)
-} VP8MBData;
-
 // Persistent information needed by the parallel processing
 typedef struct {
-  int id_;              // cache row to process (in [0..2])
-  int mb_y_;            // macroblock position of the row
-  int filter_row_;      // true if row-filtering is needed
-  VP8FInfo* f_info_;    // filter strengths (swapped with dec->f_info_)
-  VP8MBData* mb_data_;  // reconstruction data (swapped with dec->mb_data_)
-  VP8Io io_;            // copy of the VP8Io to pass to put()
+  int id_;            // cache row to process (in [0..2])
+  int mb_y_;          // macroblock position of the row
+  int filter_row_;    // true if row-filtering is needed
+  VP8FInfo* f_info_;  // filter strengths
+  VP8Io io_;          // copy of the VP8Io to pass to put()
 } VP8ThreadContext;
 
-// Saved top samples, per macroblock. Fits into a cache-line.
-typedef struct {
-  uint8_t y[16], u[8], v[8];
-} VP8TopSamples;
-
 //------------------------------------------------------------------------------
 // VP8Decoder: the main opaque structure handed over to user
 
@@ -231,8 +196,7 @@ struct VP8Decoder {
 
   // Worker
   WebPWorker worker_;
-  int mt_method_;      // multi-thread method: 0=off, 1=[parse+recon][filter]
-                       // 2=[parse][recon+filter]
+  int use_threads_;    // use multi-thread
   int cache_id_;       // current cache row
   int num_caches_;     // number of cached rows of 16 pixels (1, 2 or 3)
   VP8ThreadContext thread_ctx_;  // Thread context
@@ -249,9 +213,12 @@ struct VP8Decoder {
   // per-partition boolean decoders.
   VP8BitReader parts_[MAX_NUM_PARTITIONS];
 
-  // Dithering strength, deduced from decoding options
-  int dither_;                // whether to use dithering or not
-  VP8Random dithering_rg_;    // random generator for dithering
+  // buffer refresh flags
+  //   bit 0: refresh Gold, bit 1: refresh Alt
+  //   bit 2-3: copy to Gold, bit 4-5: copy to Alt
+  //   bit 6: Gold sign bias, bit 7: Alt sign bias
+  //   bit 8: refresh last frame
+  uint32_t buffer_flags_;
 
   // dequantization (one set of DC/AC dequant factor per segment)
   VP8QuantMatrix dqm_[NUM_MB_SEGMENTS];
@@ -260,19 +227,24 @@ struct VP8Decoder {
   VP8Proba proba_;
   int use_skip_proba_;
   uint8_t skip_p_;
+#ifndef ONLY_KEYFRAME_CODE
+  uint8_t intra_p_, last_p_, golden_p_;
+  VP8Proba proba_saved_;
+  int update_proba_;
+#endif
 
   // Boundary data cache and persistent buffers.
-  uint8_t* intra_t_;      // top intra modes values: 4 * mb_w_
-  uint8_t  intra_l_[4];   // left intra modes values
-
-  uint8_t segment_;       // segment of the currently parsed block
-  VP8TopSamples* yuv_t_;  // top y/u/v samples
+  uint8_t* intra_t_;     // top intra modes values: 4 * mb_w_
+  uint8_t  intra_l_[4];  // left intra modes values
+  uint8_t* y_t_;         // top luma samples: 16 * mb_w_
+  uint8_t* u_t_, *v_t_;  // top u/v samples: 8 * mb_w_ each
 
-  VP8MB* mb_info_;        // contextual macroblock info (mb_w_ + 1)
-  VP8FInfo* f_info_;      // filter strength info
-  uint8_t* yuv_b_;        // main block for Y/U/V (size = YUV_SIZE)
+  VP8MB* mb_info_;       // contextual macroblock info (mb_w_ + 1)
+  VP8FInfo* f_info_;     // filter strength info
+  uint8_t* yuv_b_;       // main block for Y/U/V (size = YUV_SIZE)
+  int16_t* coeffs_;      // 384 coeffs = (16+4+4) * 4*4
 
-  uint8_t* cache_y_;      // macroblock row for storing unfiltered samples
+  uint8_t* cache_y_;     // macroblock row for storing unfiltered samples
   uint8_t* cache_u_;
   uint8_t* cache_v_;
   int cache_y_stride_;
@@ -284,20 +256,28 @@ struct VP8Decoder {
 
   // Per macroblock non-persistent infos.
   int mb_x_, mb_y_;       // current position, in macroblock units
-  VP8MBData* mb_data_;    // parsed reconstruction data
+  uint8_t is_i4x4_;       // true if intra4x4
+  uint8_t imodes_[16];    // one 16x16 mode (#0) or sixteen 4x4 modes
+  uint8_t uvmode_;        // chroma prediction mode
+  uint8_t segment_;       // block's segment
+
+  // bit-wise info about the content of each sub-4x4 blocks: there are 16 bits
+  // for luma (bits #0->#15), then 4 bits for chroma-u (#16->#19) and 4 bits for
+  // chroma-v (#20->#23), each corresponding to one 4x4 block in decoding order.
+  // If the bit is set, the 4x4 block contains some non-zero coefficients.
+  uint32_t non_zero_;
+  uint32_t non_zero_ac_;
 
   // Filtering side-info
-  int filter_type_;                          // 0=off, 1=simple, 2=complex
-  VP8FInfo fstrengths_[NUM_MB_SEGMENTS][2];  // precalculated per-segment/type
+  int filter_type_;                         // 0=off, 1=simple, 2=complex
+  int filter_row_;                          // per-row flag
+  uint8_t filter_levels_[NUM_MB_SEGMENTS];  // precalculated per-segment
 
-  // Alpha
-  struct ALPHDecoder* alph_dec_;  // alpha-plane decoder object
-  const uint8_t* alpha_data_;     // compressed alpha data (if present)
+  // extensions
+  const uint8_t* alpha_data_;   // compressed alpha data (if present)
   size_t alpha_data_size_;
-  int is_alpha_decoded_;  // true if alpha_data_ is decoded in alpha_plane_
   uint8_t* alpha_plane_;        // output. Persistent, contains the whole data.
 
-  // extensions
   int layer_colorspace_;
   const uint8_t* layer_data_;   // compressed layer data (if present)
   size_t layer_data_size_;
@@ -320,6 +300,8 @@ void VP8ParseQuant(VP8Decoder* const dec);
 
 // in frame.c
 int VP8InitFrame(VP8Decoder* const dec, VP8Io* io);
+// Predict a block and add residual
+void VP8ReconstructBlock(VP8Decoder* const dec);
 // Call io->setup() and finish setting up scan parameters.
 // After this call returns, one must always call VP8ExitCritical() with the
 // same parameters. Both functions should be used in pair. Returns VP8_STATUS_OK
@@ -328,16 +310,10 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io);
 // Must always be called in pair with VP8EnterCritical().
 // Returns false in case of error.
 int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io);
-// Return the multi-threading method to use (0=off), depending
-// on options and bitstream size. Only for lossy decoding.
-int VP8GetThreadMethod(const WebPDecoderOptions* const options,
-                       const WebPHeaderStructure* const headers,
-                       int width, int height);
-// Initialize dithering post-process if needed.
-void VP8InitDithering(const WebPDecoderOptions* const options,
-                      VP8Decoder* const dec);
-// Process the last decoded row (filtering + output).
+// Process the last decoded row (filtering + output)
 int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io);
+// Store a block, along with filtering params
+void VP8StoreBlock(VP8Decoder* const dec);
 // To be called at the start of a new scanline, to initialize predictors.
 void VP8InitScanline(VP8Decoder* const dec);
 // Decode one macroblock. Returns false if there is not enough data.
@@ -352,7 +328,7 @@ int VP8DecodeLayer(VP8Decoder* const dec);
 
 //------------------------------------------------------------------------------
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/dec/vp8l.c b/drivers/webp/dec/vp8l.c
index ea0254d7a8..897e4395c7 100644
--- a/drivers/webp/dec/vp8l.c
+++ b/drivers/webp/dec/vp8l.c
@@ -1,10 +1,8 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // main entry for the decoder
@@ -14,14 +12,16 @@
 
 #include <stdio.h>
 #include <stdlib.h>
-#include "./alphai.h"
 #include "./vp8li.h"
 #include "../dsp/lossless.h"
 #include "../dsp/yuv.h"
-#include "../utils/alpha_processing.h"
 #include "../utils/huffman.h"
 #include "../utils/utils.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 #define NUM_ARGB_CACHE_ROWS          16
 
 static const int kCodeLengthLiterals = 16;
@@ -57,19 +57,19 @@ static const uint8_t kCodeLengthCodeOrder[NUM_CODE_LENGTH_CODES] = {
 };
 
 #define CODE_TO_PLANE_CODES        120
-static const uint8_t kCodeToPlane[CODE_TO_PLANE_CODES] = {
-  0x18, 0x07, 0x17, 0x19, 0x28, 0x06, 0x27, 0x29, 0x16, 0x1a,
-  0x26, 0x2a, 0x38, 0x05, 0x37, 0x39, 0x15, 0x1b, 0x36, 0x3a,
-  0x25, 0x2b, 0x48, 0x04, 0x47, 0x49, 0x14, 0x1c, 0x35, 0x3b,
-  0x46, 0x4a, 0x24, 0x2c, 0x58, 0x45, 0x4b, 0x34, 0x3c, 0x03,
-  0x57, 0x59, 0x13, 0x1d, 0x56, 0x5a, 0x23, 0x2d, 0x44, 0x4c,
-  0x55, 0x5b, 0x33, 0x3d, 0x68, 0x02, 0x67, 0x69, 0x12, 0x1e,
-  0x66, 0x6a, 0x22, 0x2e, 0x54, 0x5c, 0x43, 0x4d, 0x65, 0x6b,
-  0x32, 0x3e, 0x78, 0x01, 0x77, 0x79, 0x53, 0x5d, 0x11, 0x1f,
-  0x64, 0x6c, 0x42, 0x4e, 0x76, 0x7a, 0x21, 0x2f, 0x75, 0x7b,
-  0x31, 0x3f, 0x63, 0x6d, 0x52, 0x5e, 0x00, 0x74, 0x7c, 0x41,
-  0x4f, 0x10, 0x20, 0x62, 0x6e, 0x30, 0x73, 0x7d, 0x51, 0x5f,
-  0x40, 0x72, 0x7e, 0x61, 0x6f, 0x50, 0x71, 0x7f, 0x60, 0x70
+static const uint8_t code_to_plane_lut[CODE_TO_PLANE_CODES] = {
+   0x18, 0x07, 0x17, 0x19, 0x28, 0x06, 0x27, 0x29, 0x16, 0x1a,
+   0x26, 0x2a, 0x38, 0x05, 0x37, 0x39, 0x15, 0x1b, 0x36, 0x3a,
+   0x25, 0x2b, 0x48, 0x04, 0x47, 0x49, 0x14, 0x1c, 0x35, 0x3b,
+   0x46, 0x4a, 0x24, 0x2c, 0x58, 0x45, 0x4b, 0x34, 0x3c, 0x03,
+   0x57, 0x59, 0x13, 0x1d, 0x56, 0x5a, 0x23, 0x2d, 0x44, 0x4c,
+   0x55, 0x5b, 0x33, 0x3d, 0x68, 0x02, 0x67, 0x69, 0x12, 0x1e,
+   0x66, 0x6a, 0x22, 0x2e, 0x54, 0x5c, 0x43, 0x4d, 0x65, 0x6b,
+   0x32, 0x3e, 0x78, 0x01, 0x77, 0x79, 0x53, 0x5d, 0x11, 0x1f,
+   0x64, 0x6c, 0x42, 0x4e, 0x76, 0x7a, 0x21, 0x2f, 0x75, 0x7b,
+   0x31, 0x3f, 0x63, 0x6d, 0x52, 0x5e, 0x00, 0x74, 0x7c, 0x41,
+   0x4f, 0x10, 0x20, 0x62, 0x6e, 0x30, 0x73, 0x7d, 0x51, 0x5f,
+   0x40, 0x72, 0x7e, 0x61, 0x6f, 0x50, 0x71, 0x7f, 0x60, 0x70
 };
 
 static int DecodeImageStream(int xsize, int ysize,
@@ -80,19 +80,20 @@ static int DecodeImageStream(int xsize, int ysize,
 //------------------------------------------------------------------------------
 
 int VP8LCheckSignature(const uint8_t* const data, size_t size) {
-  return (size >= VP8L_FRAME_HEADER_SIZE &&
-          data[0] == VP8L_MAGIC_BYTE &&
-          (data[4] >> 5) == 0);  // version
+  return (size >= 1) && (data[0] == VP8L_MAGIC_BYTE);
 }
 
 static int ReadImageInfo(VP8LBitReader* const br,
                          int* const width, int* const height,
                          int* const has_alpha) {
-  if (VP8LReadBits(br, 8) != VP8L_MAGIC_BYTE) return 0;
+  const uint8_t signature = VP8LReadBits(br, 8);
+  if (!VP8LCheckSignature(&signature, 1)) {
+    return 0;
+  }
   *width = VP8LReadBits(br, VP8L_IMAGE_SIZE_BITS) + 1;
   *height = VP8LReadBits(br, VP8L_IMAGE_SIZE_BITS) + 1;
   *has_alpha = VP8LReadBits(br, 1);
-  if (VP8LReadBits(br, VP8L_VERSION_BITS) != 0) return 0;
+  VP8LReadBits(br, VP8L_VERSION_BITS);  // Read/ignore the version number.
   return 1;
 }
 
@@ -100,8 +101,6 @@ int VP8LGetInfo(const uint8_t* data, size_t data_size,
                 int* const width, int* const height, int* const has_alpha) {
   if (data == NULL || data_size < VP8L_FRAME_HEADER_SIZE) {
     return 0;         // not enough data
-  } else if (!VP8LCheckSignature(data, data_size)) {
-    return 0;         // bad signature
   } else {
     int w, h, a;
     VP8LBitReader br;
@@ -139,45 +138,42 @@ static WEBP_INLINE int PlaneCodeToDistance(int xsize, int plane_code) {
   if (plane_code > CODE_TO_PLANE_CODES) {
     return plane_code - CODE_TO_PLANE_CODES;
   } else {
-    const int dist_code = kCodeToPlane[plane_code - 1];
+    const int dist_code = code_to_plane_lut[plane_code - 1];
     const int yoffset = dist_code >> 4;
     const int xoffset = 8 - (dist_code & 0xf);
     const int dist = yoffset * xsize + xoffset;
-    return (dist >= 1) ? dist : 1;  // dist<1 can happen if xsize is very small
+    return (dist >= 1) ? dist : 1;
   }
 }
 
 //------------------------------------------------------------------------------
 // Decodes the next Huffman code from bit-stream.
 // FillBitWindow(br) needs to be called at minimum every second call
-// to ReadSymbol, in order to pre-fetch enough bits.
-static WEBP_INLINE int ReadSymbol(const HuffmanTree* tree,
-                                  VP8LBitReader* const br) {
+// to ReadSymbolUnsafe.
+static int ReadSymbolUnsafe(const HuffmanTree* tree, VP8LBitReader* const br) {
   const HuffmanTreeNode* node = tree->root_;
-  uint32_t bits = VP8LPrefetchBits(br);
-  int bitpos = br->bit_pos_;
-  // Check if we find the bit combination from the Huffman lookup table.
-  const int lut_ix = bits & (HUFF_LUT - 1);
-  const int lut_bits = tree->lut_bits_[lut_ix];
-  if (lut_bits <= HUFF_LUT_BITS) {
-    VP8LSetBitPos(br, bitpos + lut_bits);
-    return tree->lut_symbol_[lut_ix];
-  }
-  node += tree->lut_jump_[lut_ix];
-  bitpos += HUFF_LUT_BITS;
-  bits >>= HUFF_LUT_BITS;
-
-  // Decode the value from a binary tree.
   assert(node != NULL);
-  do {
-    node = HuffmanTreeNextNode(node, bits & 1);
-    bits >>= 1;
-    ++bitpos;
-  } while (HuffmanTreeNodeIsNotLeaf(node));
-  VP8LSetBitPos(br, bitpos);
+  while (!HuffmanTreeNodeIsLeaf(node)) {
+    node = HuffmanTreeNextNode(node, VP8LReadOneBitUnsafe(br));
+  }
   return node->symbol_;
 }
 
+static WEBP_INLINE int ReadSymbol(const HuffmanTree* tree,
+                                  VP8LBitReader* const br) {
+  const int read_safe = (br->pos_ + 8 > br->len_);
+  if (!read_safe) {
+    return ReadSymbolUnsafe(tree, br);
+  } else {
+    const HuffmanTreeNode* node = tree->root_;
+    assert(node != NULL);
+    while (!HuffmanTreeNodeIsLeaf(node)) {
+      node = HuffmanTreeNextNode(node, VP8LReadOneBit(br));
+    }
+    return node->symbol_;
+  }
+}
+
 static int ReadHuffmanCodeLengths(
     VP8LDecoder* const dec, const int* const code_length_code_lengths,
     int num_symbols, int* const code_lengths) {
@@ -331,10 +327,10 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
     hdr->huffman_subsample_bits_ = huffman_precision;
     for (i = 0; i < huffman_pixs; ++i) {
       // The huffman data is stored in red and green bytes.
-      const int group = (huffman_image[i] >> 8) & 0xffff;
-      huffman_image[i] = group;
-      if (group >= num_htree_groups) {
-        num_htree_groups = group + 1;
+      const int index = (huffman_image[i] >> 8) & 0xffff;
+      huffman_image[i] = index;
+      if (index >= num_htree_groups) {
+        num_htree_groups = index + 1;
       }
     }
   }
@@ -415,13 +411,12 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
 // We have special "export" function since we need to convert from BGRA
 static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace,
                   int rgba_stride, uint8_t* const rgba) {
-  uint32_t* const src = (uint32_t*)rescaler->dst;
+  const uint32_t* const src = (const uint32_t*)rescaler->dst;
   const int dst_width = rescaler->dst_width;
   int num_lines_out = 0;
   while (WebPRescalerHasPendingOutput(rescaler)) {
     uint8_t* const dst = rgba + num_lines_out * rgba_stride;
     WebPRescalerExportRow(rescaler);
-    WebPMultARGBRow(src, dst_width, 1);
     VP8LConvertFromBGRA(src, dst_width, colorspace, dst);
     ++num_lines_out;
   }
@@ -429,22 +424,18 @@ static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace,
 }
 
 // Emit scaled rows.
-static int EmitRescaledRowsRGBA(const VP8LDecoder* const dec,
-                                uint8_t* in, int in_stride, int mb_h,
-                                uint8_t* const out, int out_stride) {
+static int EmitRescaledRows(const VP8LDecoder* const dec,
+                            const uint32_t* const data, int in_stride, int mb_h,
+                            uint8_t* const out, int out_stride) {
   const WEBP_CSP_MODE colorspace = dec->output_->colorspace;
+  const uint8_t* const in = (const uint8_t*)data;
   int num_lines_in = 0;
   int num_lines_out = 0;
   while (num_lines_in < mb_h) {
-    uint8_t* const row_in = in + num_lines_in * in_stride;
+    const uint8_t* const row_in = in + num_lines_in * in_stride;
     uint8_t* const row_out = out + num_lines_out * out_stride;
-    const int lines_left = mb_h - num_lines_in;
-    const int needed_lines = WebPRescaleNeededLines(dec->rescaler, lines_left);
-    assert(needed_lines > 0 && needed_lines <= lines_left);
-    WebPMultARGBRows(row_in, in_stride,
-                     dec->rescaler->src_width, needed_lines, 0);
-    WebPRescalerImport(dec->rescaler, lines_left, row_in, in_stride);
-    num_lines_in += needed_lines;
+    num_lines_in += WebPRescalerImport(dec->rescaler, mb_h - num_lines_in,
+                                       row_in, in_stride);
     num_lines_out += Export(dec->rescaler, colorspace, out_stride, row_out);
   }
   return num_lines_out;
@@ -452,10 +443,11 @@ static int EmitRescaledRowsRGBA(const VP8LDecoder* const dec,
 
 // Emit rows without any scaling.
 static int EmitRows(WEBP_CSP_MODE colorspace,
-                    const uint8_t* row_in, int in_stride,
+                    const uint32_t* const data, int in_stride,
                     int mb_w, int mb_h,
                     uint8_t* const out, int out_stride) {
   int lines = mb_h;
+  const uint8_t* row_in = (const uint8_t*)data;
   uint8_t* row_out = out;
   while (lines-- > 0) {
     VP8LConvertFromBGRA((const uint32_t*)row_in, mb_w, colorspace, row_out);
@@ -477,8 +469,7 @@ static void ConvertToYUVA(const uint32_t* const src, int width, int y_pos,
     uint8_t* const y = buf->y + y_pos * buf->y_stride;
     for (i = 0; i < width; ++i) {
       const uint32_t p = src[i];
-      y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff,
-                       YUV_HALF);
+      y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff);
     }
   }
 
@@ -497,11 +488,11 @@ static void ConvertToYUVA(const uint32_t* const src, int width, int y_pos,
       const int g = ((v0 >>  7) & 0x1fe) + ((v1 >>  7) & 0x1fe);
       const int b = ((v0 <<  1) & 0x1fe) + ((v1 <<  1) & 0x1fe);
       if (!(y_pos & 1)) {  // even lines: store values
-        u[i] = VP8RGBToU(r, g, b, YUV_HALF << 2);
-        v[i] = VP8RGBToV(r, g, b, YUV_HALF << 2);
+        u[i] = VP8RGBToU(r, g, b);
+        v[i] = VP8RGBToV(r, g, b);
       } else {             // odd lines: average with previous values
-        const int tmp_u = VP8RGBToU(r, g, b, YUV_HALF << 2);
-        const int tmp_v = VP8RGBToV(r, g, b, YUV_HALF << 2);
+        const int tmp_u = VP8RGBToU(r, g, b);
+        const int tmp_v = VP8RGBToV(r, g, b);
         // Approximated average-of-four. But it's an acceptable diff.
         u[i] = (u[i] + tmp_u + 1) >> 1;
         v[i] = (v[i] + tmp_v + 1) >> 1;
@@ -513,11 +504,11 @@ static void ConvertToYUVA(const uint32_t* const src, int width, int y_pos,
       const int g = (v0 >>  6) & 0x3fc;
       const int b = (v0 <<  2) & 0x3fc;
       if (!(y_pos & 1)) {  // even lines
-        u[i] = VP8RGBToU(r, g, b, YUV_HALF << 2);
-        v[i] = VP8RGBToV(r, g, b, YUV_HALF << 2);
+        u[i] = VP8RGBToU(r, g, b);
+        v[i] = VP8RGBToV(r, g, b);
       } else {             // odd lines (note: we could just skip this)
-        const int tmp_u = VP8RGBToU(r, g, b, YUV_HALF << 2);
-        const int tmp_v = VP8RGBToV(r, g, b, YUV_HALF << 2);
+        const int tmp_u = VP8RGBToU(r, g, b);
+        const int tmp_v = VP8RGBToV(r, g, b);
         u[i] = (u[i] + tmp_u + 1) >> 1;
         v[i] = (v[i] + tmp_v + 1) >> 1;
       }
@@ -533,12 +524,11 @@ static void ConvertToYUVA(const uint32_t* const src, int width, int y_pos,
 
 static int ExportYUVA(const VP8LDecoder* const dec, int y_pos) {
   WebPRescaler* const rescaler = dec->rescaler;
-  uint32_t* const src = (uint32_t*)rescaler->dst;
+  const uint32_t* const src = (const uint32_t*)rescaler->dst;
   const int dst_width = rescaler->dst_width;
   int num_lines_out = 0;
   while (WebPRescalerHasPendingOutput(rescaler)) {
     WebPRescalerExportRow(rescaler);
-    WebPMultARGBRow(src, dst_width, 1);
     ConvertToYUVA(src, dst_width, y_pos, dec->output_);
     ++y_pos;
     ++num_lines_out;
@@ -547,28 +537,28 @@ static int ExportYUVA(const VP8LDecoder* const dec, int y_pos) {
 }
 
 static int EmitRescaledRowsYUVA(const VP8LDecoder* const dec,
-                                uint8_t* in, int in_stride, int mb_h) {
+                                const uint32_t* const data,
+                                int in_stride, int mb_h) {
+  const uint8_t* const in = (const uint8_t*)data;
   int num_lines_in = 0;
   int y_pos = dec->last_out_row_;
   while (num_lines_in < mb_h) {
-    const int lines_left = mb_h - num_lines_in;
-    const int needed_lines = WebPRescaleNeededLines(dec->rescaler, lines_left);
-    WebPMultARGBRows(in, in_stride, dec->rescaler->src_width, needed_lines, 0);
-    WebPRescalerImport(dec->rescaler, lines_left, in, in_stride);
-    num_lines_in += needed_lines;
-    in += needed_lines * in_stride;
+    const uint8_t* const row_in = in + num_lines_in * in_stride;
+    num_lines_in += WebPRescalerImport(dec->rescaler, mb_h - num_lines_in,
+                                       row_in, in_stride);
     y_pos += ExportYUVA(dec, y_pos);
   }
   return y_pos;
 }
 
 static int EmitRowsYUVA(const VP8LDecoder* const dec,
-                        const uint8_t* in, int in_stride,
+                        const uint32_t* const data, int in_stride,
                         int mb_w, int num_rows) {
   int y_pos = dec->last_out_row_;
+  const uint8_t* row_in = (const uint8_t*)data;
   while (num_rows-- > 0) {
-    ConvertToYUVA((const uint32_t*)in, mb_w, y_pos, dec->output_);
-    in += in_stride;
+    ConvertToYUVA((const uint32_t*)row_in, mb_w, y_pos, dec->output_);
+    row_in += in_stride;
     ++y_pos;
   }
   return y_pos;
@@ -579,11 +569,11 @@ static int EmitRowsYUVA(const VP8LDecoder* const dec,
 
 // Sets io->mb_y, io->mb_h & io->mb_w according to start row, end row and
 // crop options. Also updates the input data pointer, so that it points to the
-// start of the cropped window. Note that pixels are in ARGB format even if
-// 'in_data' is uint8_t*.
+// start of the cropped window.
+// Note that 'pixel_stride' is in units of 'uint32_t' (and not bytes).
 // Returns true if the crop window is not empty.
 static int SetCropWindow(VP8Io* const io, int y_start, int y_end,
-                         uint8_t** const in_data, int pixel_stride) {
+                         const uint32_t** const in_data, int pixel_stride) {
   assert(y_start < y_end);
   assert(io->crop_left < io->crop_right);
   if (y_end > io->crop_bottom) {
@@ -592,11 +582,11 @@ static int SetCropWindow(VP8Io* const io, int y_start, int y_end,
   if (y_start < io->crop_top) {
     const int delta = io->crop_top - y_start;
     y_start = io->crop_top;
-    *in_data += delta * pixel_stride;
+    *in_data += pixel_stride * delta;
   }
   if (y_start >= y_end) return 0;  // Crop window is empty.
 
-  *in_data += io->crop_left * sizeof(uint32_t);
+  *in_data += io->crop_left;
 
   io->mb_y = y_start - io->crop_top;
   io->mb_w = io->crop_right - io->crop_left;
@@ -644,24 +634,10 @@ static void ApplyInverseTransforms(VP8LDecoder* const dec, int num_rows,
   }
 }
 
-// Special method for paletted alpha data.
-static void ApplyInverseTransformsAlpha(VP8LDecoder* const dec, int num_rows,
-                                        const uint8_t* const rows) {
-  const int start_row = dec->last_row_;
-  const int end_row = start_row + num_rows;
-  const uint8_t* rows_in = rows;
-  uint8_t* rows_out = (uint8_t*)dec->io_->opaque + dec->io_->width * start_row;
-  VP8LTransform* const transform = &dec->transforms_[0];
-  assert(dec->next_transform_ == 1);
-  assert(transform->type_ == COLOR_INDEXING_TRANSFORM);
-  VP8LColorIndexInverseTransformAlpha(transform, start_row, end_row, rows_in,
-                                      rows_out);
-}
-
 // Processes (transforms, scales & color-converts) the rows decoded after the
 // last call.
 static void ProcessRows(VP8LDecoder* const dec, int row) {
-  const uint32_t* const rows = dec->pixels_ + dec->width_ * dec->last_row_;
+  const uint32_t* const rows = dec->argb_ + dec->width_ * dec->last_row_;
   const int num_rows = row - dec->last_row_;
 
   if (num_rows <= 0) return;  // Nothing to be done.
@@ -670,18 +646,18 @@ static void ProcessRows(VP8LDecoder* const dec, int row) {
   // Emit output.
   {
     VP8Io* const io = dec->io_;
-    uint8_t* rows_data = (uint8_t*)dec->argb_cache_;
-    const int in_stride = io->width * sizeof(uint32_t);  // in unit of RGBA
-    if (!SetCropWindow(io, dec->last_row_, row, &rows_data, in_stride)) {
+    const uint32_t* rows_data = dec->argb_cache_;
+    if (!SetCropWindow(io, dec->last_row_, row, &rows_data, io->width)) {
       // Nothing to output (this time).
     } else {
       const WebPDecBuffer* const output = dec->output_;
+      const int in_stride = io->width * sizeof(*rows_data);
       if (output->colorspace < MODE_YUV) {  // convert to RGBA
         const WebPRGBABuffer* const buf = &output->u.RGBA;
         uint8_t* const rgba = buf->rgba + dec->last_out_row_ * buf->stride;
         const int num_rows_out = io->use_scaling ?
-            EmitRescaledRowsRGBA(dec, rows_data, in_stride, io->mb_h,
-                                 rgba, buf->stride) :
+            EmitRescaledRows(dec, rows_data, in_stride, io->mb_h,
+                             rgba, buf->stride) :
             EmitRows(output->colorspace, rows_data, in_stride,
                      io->mb_w, io->mb_h, rgba, buf->stride);
         // Update 'last_out_row_'.
@@ -700,163 +676,50 @@ static void ProcessRows(VP8LDecoder* const dec, int row) {
   assert(dec->last_row_ <= dec->height_);
 }
 
-// Row-processing for the special case when alpha data contains only one
-// transform (color indexing), and trivial non-green literals.
-static int Is8bOptimizable(const VP8LMetadata* const hdr) {
-  int i;
-  if (hdr->color_cache_size_ > 0) return 0;
-  // When the Huffman tree contains only one symbol, we can skip the
-  // call to ReadSymbol() for red/blue/alpha channels.
-  for (i = 0; i < hdr->num_htree_groups_; ++i) {
-    const HuffmanTree* const htrees = hdr->htree_groups_[i].htrees_;
-    if (htrees[RED].num_nodes_ > 1) return 0;
-    if (htrees[BLUE].num_nodes_ > 1) return 0;
-    if (htrees[ALPHA].num_nodes_ > 1) return 0;
-  }
-  return 1;
-}
-
-static void ExtractPalettedAlphaRows(VP8LDecoder* const dec, int row) {
-  const int num_rows = row - dec->last_row_;
-  const uint8_t* const in =
-      (uint8_t*)dec->pixels_ + dec->width_ * dec->last_row_;
-  if (num_rows > 0) {
-    ApplyInverseTransformsAlpha(dec, num_rows, in);
-  }
-  dec->last_row_ = dec->last_out_row_ = row;
-}
-
-static int DecodeAlphaData(VP8LDecoder* const dec, uint8_t* const data,
-                           int width, int height, int last_row) {
-  int ok = 1;
-  int row = dec->last_pixel_ / width;
-  int col = dec->last_pixel_ % width;
-  VP8LBitReader* const br = &dec->br_;
-  VP8LMetadata* const hdr = &dec->hdr_;
-  const HTreeGroup* htree_group = GetHtreeGroupForPos(hdr, col, row);
-  int pos = dec->last_pixel_;         // current position
-  const int end = width * height;     // End of data
-  const int last = width * last_row;  // Last pixel to decode
-  const int len_code_limit = NUM_LITERAL_CODES + NUM_LENGTH_CODES;
-  const int mask = hdr->huffman_mask_;
-  assert(htree_group != NULL);
-  assert(last_row <= height);
-  assert(Is8bOptimizable(hdr));
-
-  while (!br->eos_ && pos < last) {
-    int code;
-    // Only update when changing tile.
-    if ((col & mask) == 0) {
-      htree_group = GetHtreeGroupForPos(hdr, col, row);
-    }
-    VP8LFillBitWindow(br);
-    code = ReadSymbol(&htree_group->htrees_[GREEN], br);
-    if (code < NUM_LITERAL_CODES) {  // Literal
-      data[pos] = code;
-      ++pos;
-      ++col;
-      if (col >= width) {
-        col = 0;
-        ++row;
-        if (row % NUM_ARGB_CACHE_ROWS == 0) {
-          ExtractPalettedAlphaRows(dec, row);
-        }
-      }
-    } else if (code < len_code_limit) {  // Backward reference
-      int dist_code, dist;
-      const int length_sym = code - NUM_LITERAL_CODES;
-      const int length = GetCopyLength(length_sym, br);
-      const int dist_symbol = ReadSymbol(&htree_group->htrees_[DIST], br);
-      VP8LFillBitWindow(br);
-      dist_code = GetCopyDistance(dist_symbol, br);
-      dist = PlaneCodeToDistance(width, dist_code);
-      if (pos >= dist && end - pos >= length) {
-        int i;
-        for (i = 0; i < length; ++i) data[pos + i] = data[pos + i - dist];
-      } else {
-        ok = 0;
-        goto End;
-      }
-      pos += length;
-      col += length;
-      while (col >= width) {
-        col -= width;
-        ++row;
-        if (row % NUM_ARGB_CACHE_ROWS == 0) {
-          ExtractPalettedAlphaRows(dec, row);
-        }
-      }
-      if (pos < last && (col & mask)) {
-        htree_group = GetHtreeGroupForPos(hdr, col, row);
-      }
-    } else {  // Not reached
-      ok = 0;
-      goto End;
-    }
-    ok = !br->error_;
-    if (!ok) goto End;
-  }
-  // Process the remaining rows corresponding to last row-block.
-  ExtractPalettedAlphaRows(dec, row);
-
- End:
-  if (br->error_ || !ok || (br->eos_ && pos < end)) {
-    ok = 0;
-    dec->status_ = br->eos_ ? VP8_STATUS_SUSPENDED
-                            : VP8_STATUS_BITSTREAM_ERROR;
-  } else {
-    dec->last_pixel_ = (int)pos;
-    if (pos == end) dec->state_ = READ_DATA;
-  }
-  return ok;
-}
-
-static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
-                           int width, int height, int last_row,
+static int DecodeImageData(VP8LDecoder* const dec,
+                           uint32_t* const data, int width, int height,
                            ProcessRowsFunc process_func) {
   int ok = 1;
-  int row = dec->last_pixel_ / width;
-  int col = dec->last_pixel_ % width;
+  int col = 0, row = 0;
   VP8LBitReader* const br = &dec->br_;
   VP8LMetadata* const hdr = &dec->hdr_;
-  HTreeGroup* htree_group = GetHtreeGroupForPos(hdr, col, row);
-  uint32_t* src = data + dec->last_pixel_;
-  uint32_t* last_cached = src;
-  uint32_t* const src_end = data + width * height;     // End of data
-  uint32_t* const src_last = data + width * last_row;  // Last pixel to decode
+  HTreeGroup* htree_group = hdr->htree_groups_;
+  uint32_t* src = data;
+  uint32_t* last_cached = data;
+  uint32_t* const src_end = data + width * height;
   const int len_code_limit = NUM_LITERAL_CODES + NUM_LENGTH_CODES;
   const int color_cache_limit = len_code_limit + hdr->color_cache_size_;
   VP8LColorCache* const color_cache =
       (hdr->color_cache_size_ > 0) ? &hdr->color_cache_ : NULL;
   const int mask = hdr->huffman_mask_;
+
   assert(htree_group != NULL);
-  assert(src_last <= src_end);
 
-  while (!br->eos_ && src < src_last) {
+  while (!br->eos_ && src < src_end) {
     int code;
-    // Only update when changing tile. Note we could use this test:
-    // if "((((prev_col ^ col) | prev_row ^ row)) > mask)" -> tile changed
-    // but that's actually slower and needs storing the previous col/row.
+    // Only update when changing tile. Note we could use the following test:
+    //   if "((((prev_col ^ col) | prev_row ^ row)) > mask)" -> tile changed
+    // but that's actually slower and requires storing the previous col/row
     if ((col & mask) == 0) {
       htree_group = GetHtreeGroupForPos(hdr, col, row);
     }
     VP8LFillBitWindow(br);
     code = ReadSymbol(&htree_group->htrees_[GREEN], br);
-    if (code < NUM_LITERAL_CODES) {  // Literal
+    if (code < NUM_LITERAL_CODES) {   // Literal.
       int red, green, blue, alpha;
       red = ReadSymbol(&htree_group->htrees_[RED], br);
       green = code;
       VP8LFillBitWindow(br);
       blue = ReadSymbol(&htree_group->htrees_[BLUE], br);
       alpha = ReadSymbol(&htree_group->htrees_[ALPHA], br);
-      *src = (alpha << 24) | (red << 16) | (green << 8) | blue;
-    AdvanceByOne:
+      *src = (alpha << 24) + (red << 16) + (green << 8) + blue;
+ AdvanceByOne:
       ++src;
       ++col;
       if (col >= width) {
         col = 0;
         ++row;
-        if ((row % NUM_ARGB_CACHE_ROWS == 0) && (process_func != NULL)) {
+        if ((process_func != NULL) && (row % NUM_ARGB_CACHE_ROWS == 0)) {
           process_func(dec, row);
         }
         if (color_cache != NULL) {
@@ -865,7 +728,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
           }
         }
       }
-    } else if (code < len_code_limit) {  // Backward reference
+    } else if (code < len_code_limit) {           // Backward reference
       int dist_code, dist;
       const int length_sym = code - NUM_LITERAL_CODES;
       const int length = GetCopyLength(length_sym, br);
@@ -873,10 +736,11 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
       VP8LFillBitWindow(br);
       dist_code = GetCopyDistance(dist_symbol, br);
       dist = PlaneCodeToDistance(width, dist_code);
-      if (src - data < (ptrdiff_t)dist || src_end - src < (ptrdiff_t)length) {
+      if (src - data < dist || src_end - src < length) {
         ok = 0;
         goto End;
-      } else {
+      }
+      {
         int i;
         for (i = 0; i < length; ++i) src[i] = src[i - dist];
         src += length;
@@ -885,19 +749,19 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
       while (col >= width) {
         col -= width;
         ++row;
-        if ((row % NUM_ARGB_CACHE_ROWS == 0) && (process_func != NULL)) {
+        if ((process_func != NULL) && (row % NUM_ARGB_CACHE_ROWS == 0)) {
           process_func(dec, row);
         }
       }
-      if (src < src_last) {
-        if (col & mask) htree_group = GetHtreeGroupForPos(hdr, col, row);
+      if (src < src_end) {
+        htree_group = GetHtreeGroupForPos(hdr, col, row);
         if (color_cache != NULL) {
           while (last_cached < src) {
             VP8LColorCacheInsert(color_cache, *last_cached++);
           }
         }
       }
-    } else if (code < color_cache_limit) {  // Color cache
+    } else if (code < color_cache_limit) {    // Color cache.
       const int key = code - len_code_limit;
       assert(color_cache != NULL);
       while (last_cached < src) {
@@ -905,7 +769,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
       }
       *src = VP8LColorCacheLookup(color_cache, key);
       goto AdvanceByOne;
-    } else {  // Not reached
+    } else {    // Not reached.
       ok = 0;
       goto End;
     }
@@ -918,12 +782,12 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
  End:
   if (br->error_ || !ok || (br->eos_ && src < src_end)) {
     ok = 0;
-    dec->status_ = br->eos_ ? VP8_STATUS_SUSPENDED
-                            : VP8_STATUS_BITSTREAM_ERROR;
-  } else {
-    dec->last_pixel_ = (int)(src - data);
-    if (src == src_end) dec->state_ = READ_DATA;
+    dec->status_ = (!br->eos_) ?
+        VP8_STATUS_BITSTREAM_ERROR : VP8_STATUS_SUSPENDED;
+  } else if (src == src_end) {
+    dec->state_ = READ_DATA;
   }
+
   return ok;
 }
 
@@ -1040,9 +904,6 @@ VP8LDecoder* VP8LNew(void) {
   dec->status_ = VP8_STATUS_OK;
   dec->action_ = READ_DIM;
   dec->state_ = READ_DIM;
-
-  VP8LDspInit();  // Init critical function pointers.
-
   return dec;
 }
 
@@ -1051,8 +912,8 @@ void VP8LClear(VP8LDecoder* const dec) {
   if (dec == NULL) return;
   ClearMetadata(&dec->hdr_);
 
-  free(dec->pixels_);
-  dec->pixels_ = NULL;
+  free(dec->argb_);
+  dec->argb_ = NULL;
   for (i = 0; i < dec->next_transform_; ++i) {
     ClearTransform(&dec->transforms_[i]);
   }
@@ -1148,8 +1009,7 @@ static int DecodeImageStream(int xsize, int ysize,
   }
 
   // Use the Huffman trees to decode the LZ77 encoded data.
-  ok = DecodeImageData(dec, data, transform_xsize, transform_ysize,
-                       transform_ysize, NULL);
+  ok = DecodeImageData(dec, data, transform_xsize, transform_ysize, NULL);
   ok = ok && !br->error_;
 
  End:
@@ -1171,52 +1031,41 @@ static int DecodeImageStream(int xsize, int ysize,
       assert(data == NULL);
       assert(is_level0);
     }
-    dec->last_pixel_ = 0;  // Reset for future DECODE_DATA_FUNC() calls.
     if (!is_level0) ClearMetadata(hdr);  // Clean up temporary data behind.
   }
   return ok;
 }
 
 //------------------------------------------------------------------------------
-// Allocate internal buffers dec->pixels_ and dec->argb_cache_.
-static int AllocateInternalBuffers32b(VP8LDecoder* const dec, int final_width) {
+// Allocate dec->argb_ and dec->argb_cache_ using dec->width_ and dec->height_
+
+static int AllocateARGBBuffers(VP8LDecoder* const dec, int final_width) {
   const uint64_t num_pixels = (uint64_t)dec->width_ * dec->height_;
   // Scratch buffer corresponding to top-prediction row for transforming the
-  // first row in the row-blocks. Not needed for paletted alpha.
-  const uint64_t cache_top_pixels = (uint16_t)final_width;
-  // Scratch buffer for temporary BGRA storage. Not needed for paletted alpha.
+  // first row in the row-blocks.
+  const uint64_t cache_top_pixels = final_width;
+  // Scratch buffer for temporary BGRA storage.
   const uint64_t cache_pixels = (uint64_t)final_width * NUM_ARGB_CACHE_ROWS;
   const uint64_t total_num_pixels =
       num_pixels + cache_top_pixels + cache_pixels;
 
   assert(dec->width_ <= final_width);
-  dec->pixels_ = (uint32_t*)WebPSafeMalloc(total_num_pixels, sizeof(uint32_t));
-  if (dec->pixels_ == NULL) {
+  dec->argb_ = (uint32_t*)WebPSafeMalloc(total_num_pixels, sizeof(*dec->argb_));
+  if (dec->argb_ == NULL) {
     dec->argb_cache_ = NULL;    // for sanity check
     dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
     return 0;
   }
-  dec->argb_cache_ = dec->pixels_ + num_pixels + cache_top_pixels;
-  return 1;
-}
-
-static int AllocateInternalBuffers8b(VP8LDecoder* const dec) {
-  const uint64_t total_num_pixels = (uint64_t)dec->width_ * dec->height_;
-  dec->argb_cache_ = NULL;    // for sanity check
-  dec->pixels_ = (uint32_t*)WebPSafeMalloc(total_num_pixels, sizeof(uint8_t));
-  if (dec->pixels_ == NULL) {
-    dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
-    return 0;
-  }
+  dec->argb_cache_ = dec->argb_ + num_pixels + cache_top_pixels;
   return 1;
 }
 
 //------------------------------------------------------------------------------
-
 // Special row-processing that only stores the alpha data.
+
 static void ExtractAlphaRows(VP8LDecoder* const dec, int row) {
   const int num_rows = row - dec->last_row_;
-  const uint32_t* const in = dec->pixels_ + dec->width_ * dec->last_row_;
+  const uint32_t* const in = dec->argb_ + dec->width_ * dec->last_row_;
 
   if (num_rows <= 0) return;  // Nothing to be done.
   ApplyInverseTransforms(dec, num_rows, in);
@@ -1230,76 +1079,44 @@ static void ExtractAlphaRows(VP8LDecoder* const dec, int row) {
     int i;
     for (i = 0; i < cache_pixs; ++i) dst[i] = (src[i] >> 8) & 0xff;
   }
+
   dec->last_row_ = dec->last_out_row_ = row;
 }
 
-int VP8LDecodeAlphaHeader(ALPHDecoder* const alph_dec,
-                          const uint8_t* const data, size_t data_size,
-                          uint8_t* const output) {
+int VP8LDecodeAlphaImageStream(int width, int height, const uint8_t* const data,
+                               size_t data_size, uint8_t* const output) {
+  VP8Io io;
   int ok = 0;
-  VP8LDecoder* dec;
-  VP8Io* io;
-  assert(alph_dec != NULL);
-  alph_dec->vp8l_dec_ = VP8LNew();
-  if (alph_dec->vp8l_dec_ == NULL) return 0;
-  dec = alph_dec->vp8l_dec_;
-
-  dec->width_ = alph_dec->width_;
-  dec->height_ = alph_dec->height_;
-  dec->io_ = &alph_dec->io_;
-  io = dec->io_;
+  VP8LDecoder* const dec = VP8LNew();
+  if (dec == NULL) return 0;
 
-  VP8InitIo(io);
-  WebPInitCustomIo(NULL, io);  // Just a sanity Init. io won't be used.
-  io->opaque = output;
-  io->width = alph_dec->width_;
-  io->height = alph_dec->height_;
+  dec->width_ = width;
+  dec->height_ = height;
+  dec->io_ = &io;
+
+  VP8InitIo(&io);
+  WebPInitCustomIo(NULL, &io);    // Just a sanity Init. io won't be used.
+  io.opaque = output;
+  io.width = width;
+  io.height = height;
 
   dec->status_ = VP8_STATUS_OK;
   VP8LInitBitReader(&dec->br_, data, data_size);
 
   dec->action_ = READ_HDR;
-  if (!DecodeImageStream(alph_dec->width_, alph_dec->height_, 1, dec, NULL)) {
-    goto Err;
-  }
-
-  // Special case: if alpha data uses only the color indexing transform and
-  // doesn't use color cache (a frequent case), we will use DecodeAlphaData()
-  // method that only needs allocation of 1 byte per pixel (alpha channel).
-  if (dec->next_transform_ == 1 &&
-      dec->transforms_[0].type_ == COLOR_INDEXING_TRANSFORM &&
-      Is8bOptimizable(&dec->hdr_)) {
-    alph_dec->use_8b_decode = 1;
-    ok = AllocateInternalBuffers8b(dec);
-  } else {
-    // Allocate internal buffers (note that dec->width_ may have changed here).
-    alph_dec->use_8b_decode = 0;
-    ok = AllocateInternalBuffers32b(dec, alph_dec->width_);
-  }
+  if (!DecodeImageStream(width, height, 1, dec, NULL)) goto Err;
 
-  if (!ok) goto Err;
+  // Allocate output (note that dec->width_ may have changed here).
+  if (!AllocateARGBBuffers(dec, width)) goto Err;
 
+  // Decode (with special row processing).
   dec->action_ = READ_DATA;
-  return 1;
+  ok = DecodeImageData(dec, dec->argb_, dec->width_, dec->height_,
+                       ExtractAlphaRows);
 
  Err:
-  VP8LDelete(alph_dec->vp8l_dec_);
-  alph_dec->vp8l_dec_ = NULL;
-  return 0;
-}
-
-int VP8LDecodeAlphaImageStream(ALPHDecoder* const alph_dec, int last_row) {
-  VP8LDecoder* const dec = alph_dec->vp8l_dec_;
-  assert(dec != NULL);
-  assert(dec->action_ == READ_DATA);
-  assert(last_row <= dec->height_);
-
-  // Decode (with special row processing).
-  return alph_dec->use_8b_decode ?
-      DecodeAlphaData(dec, (uint8_t*)dec->pixels_, dec->width_, dec->height_,
-                      last_row) :
-      DecodeImageData(dec, dec->pixels_, dec->width_, dec->height_,
-                      last_row, ExtractAlphaRows);
+  VP8LDelete(dec);
+  return ok;
 }
 
 //------------------------------------------------------------------------------
@@ -1329,9 +1146,9 @@ int VP8LDecodeHeader(VP8LDecoder* const dec, VP8Io* const io) {
   return 1;
 
  Error:
-  VP8LClear(dec);
-  assert(dec->status_ != VP8_STATUS_OK);
-  return 0;
+   VP8LClear(dec);
+   assert(dec->status_ != VP8_STATUS_OK);
+   return 0;
 }
 
 int VP8LDecodeImage(VP8LDecoder* const dec) {
@@ -1354,14 +1171,14 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
     goto Err;
   }
 
-  if (!AllocateInternalBuffers32b(dec, io->width)) goto Err;
+  if (!AllocateARGBBuffers(dec, io->width)) goto Err;
 
   if (io->use_scaling && !AllocateAndInitRescaler(dec, io)) goto Err;
 
   // Decode.
   dec->action_ = READ_DATA;
-  if (!DecodeImageData(dec, dec->pixels_, dec->width_, dec->height_,
-                       dec->height_, ProcessRows)) {
+  if (!DecodeImageData(dec, dec->argb_, dec->width_, dec->height_,
+                       ProcessRows)) {
     goto Err;
   }
 
@@ -1378,3 +1195,6 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/dec/vp8li.h b/drivers/webp/dec/vp8li.h
index afa294db1e..ee29eb5faf 100644
--- a/drivers/webp/dec/vp8li.h
+++ b/drivers/webp/dec/vp8li.h
@@ -1,10 +1,8 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Lossless decoder: internal header.
@@ -22,7 +20,7 @@
 #include "../utils/huffman.h"
 #include "../webp/format_constants.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
@@ -57,8 +55,7 @@ typedef struct {
   HTreeGroup     *htree_groups_;
 } VP8LMetadata;
 
-typedef struct VP8LDecoder VP8LDecoder;
-struct VP8LDecoder {
+typedef struct {
   VP8StatusCode    status_;
   VP8LDecodeState  action_;
   VP8LDecodeState  state_;
@@ -66,8 +63,7 @@ struct VP8LDecoder {
 
   const WebPDecBuffer *output_;    // shortcut to io->opaque->output
 
-  uint32_t        *pixels_;        // Internal data: either uint8_t* for alpha
-                                   // or uint32_t* for BGRA.
+  uint32_t        *argb_;          // Internal data: always in BGRA color mode.
   uint32_t        *argb_cache_;    // Scratch buffer for temporary BGRA storage.
 
   VP8LBitReader    br_;
@@ -75,9 +71,6 @@ struct VP8LDecoder {
   int              width_;
   int              height_;
   int              last_row_;      // last input row decoded so far.
-  int              last_pixel_;    // last pixel decoded so far. However, it may
-                                   // not be transformed, scaled and
-                                   // color-converted yet.
   int              last_out_row_;  // last row output so far.
 
   VP8LMetadata     hdr_;
@@ -89,27 +82,18 @@ struct VP8LDecoder {
 
   uint8_t         *rescaler_memory;  // Working memory for rescaling work.
   WebPRescaler    *rescaler;         // Common rescaler for all channels.
-};
+} VP8LDecoder;
 
 //------------------------------------------------------------------------------
 // internal functions. Not public.
 
-struct ALPHDecoder;  // Defined in dec/alphai.h.
-
 // in vp8l.c
 
-// Decodes image header for alpha data stored using lossless compression.
-// Returns false in case of error.
-int VP8LDecodeAlphaHeader(struct ALPHDecoder* const alph_dec,
-                          const uint8_t* const data, size_t data_size,
-                          uint8_t* const output);
-
-// Decodes *at least* 'last_row' rows of alpha. If some of the initial rows are
-// already decoded in previous call(s), it will resume decoding from where it
-// was paused.
-// Returns false in case of bitstream error.
-int VP8LDecodeAlphaImageStream(struct ALPHDecoder* const alph_dec,
-                               int last_row);
+// Decodes a raw image stream (without header) and stores the alpha data
+// into *output, which must be of size width x height. Returns false in case
+// of error.
+int VP8LDecodeAlphaImageStream(int width, int height, const uint8_t* const data,
+                               size_t data_size, uint8_t* const output);
 
 // Allocates and initialize a new lossless decoder instance.
 VP8LDecoder* VP8LNew(void);
@@ -130,7 +114,7 @@ void VP8LDelete(VP8LDecoder* const dec);
 
 //------------------------------------------------------------------------------
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/dec/webp.c b/drivers/webp/dec/webp.c
index fda88bda26..edd348cbe7 100644
--- a/drivers/webp/dec/webp.c
+++ b/drivers/webp/dec/webp.c
@@ -1,10 +1,8 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Main decoding functions for WEBP images.
@@ -16,7 +14,11 @@
 #include "./vp8i.h"
 #include "./vp8li.h"
 #include "./webpi.h"
-#include "../webp/mux_types.h"  // ALPHA_FLAG
+#include "../webp/format_constants.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
 
 //------------------------------------------------------------------------------
 // RIFF layout is:
@@ -38,8 +40,8 @@
 //   20..23  VP8X flags bit-map corresponding to the chunk-types present.
 //   24..26  Width of the Canvas Image.
 //   27..29  Height of the Canvas Image.
-// There can be extra chunks after the "VP8X" chunk (ICCP, FRGM, ANMF, VP8,
-// VP8L, XMP, EXIF  ...)
+// There can be extra chunks after the "VP8X" chunk (ICCP, TILE, FRM, VP8,
+// META  ...)
 // All sizes are in little-endian order.
 // Note: chunk data size must be padded to multiple of 2 when written.
 
@@ -74,9 +76,6 @@ static VP8StatusCode ParseRIFF(const uint8_t** const data,
       if (size < TAG_SIZE + CHUNK_HEADER_SIZE) {
         return VP8_STATUS_BITSTREAM_ERROR;
       }
-      if (size > MAX_CHUNK_PAYLOAD) {
-        return VP8_STATUS_BITSTREAM_ERROR;
-      }
       // We have a RIFF container. Skip it.
       *riff_size = size;
       *data += RIFF_HEADER_SIZE;
@@ -178,9 +177,6 @@ static VP8StatusCode ParseOptionalChunks(const uint8_t** const data,
     }
 
     chunk_size = get_le32(buf + TAG_SIZE);
-    if (chunk_size > MAX_CHUNK_PAYLOAD) {
-      return VP8_STATUS_BITSTREAM_ERROR;          // Not a valid chunk size.
-    }
     // For odd-sized chunk-payload, there's one byte padding at the end.
     disk_chunk_size = (CHUNK_HEADER_SIZE + chunk_size + 1) & ~1;
     total_size += disk_chunk_size;
@@ -190,15 +186,6 @@ static VP8StatusCode ParseOptionalChunks(const uint8_t** const data,
       return VP8_STATUS_BITSTREAM_ERROR;          // Not a valid chunk size.
     }
 
-    // Start of a (possibly incomplete) VP8/VP8L chunk implies that we have
-    // parsed all the optional chunks.
-    // Note: This check must occur before the check 'buf_size < disk_chunk_size'
-    // below to allow incomplete VP8/VP8L chunks.
-    if (!memcmp(buf, "VP8 ", TAG_SIZE) ||
-        !memcmp(buf, "VP8L", TAG_SIZE)) {
-      return VP8_STATUS_OK;
-    }
-
     if (buf_size < disk_chunk_size) {             // Insufficient data.
       return VP8_STATUS_NOT_ENOUGH_DATA;
     }
@@ -206,6 +193,9 @@ static VP8StatusCode ParseOptionalChunks(const uint8_t** const data,
     if (!memcmp(buf, "ALPH", TAG_SIZE)) {         // A valid ALPH header.
       *alpha_data = buf + CHUNK_HEADER_SIZE;
       *alpha_size = chunk_size;
+    } else if (!memcmp(buf, "VP8 ", TAG_SIZE) ||
+               !memcmp(buf, "VP8L", TAG_SIZE)) {  // A valid VP8/VP8L header.
+      return VP8_STATUS_OK;  // Found.
     }
 
     // We have a full and valid chunk; skip it.
@@ -280,18 +270,9 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
                                           int* const width,
                                           int* const height,
                                           int* const has_alpha,
-                                          int* const has_animation,
-                                          int* const format,
                                           WebPHeaderStructure* const headers) {
-  int canvas_width = 0;
-  int canvas_height = 0;
-  int image_width = 0;
-  int image_height = 0;
   int found_riff = 0;
   int found_vp8x = 0;
-  int animation_present = 0;
-  int fragments_present = 0;
-
   VP8StatusCode status;
   WebPHeaderStructure hdrs;
 
@@ -312,35 +293,22 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
   // Skip over VP8X.
   {
     uint32_t flags = 0;
-    status = ParseVP8X(&data, &data_size, &found_vp8x,
-                       &canvas_width, &canvas_height, &flags);
+    status = ParseVP8X(&data, &data_size, &found_vp8x, width, height, &flags);
     if (status != VP8_STATUS_OK) {
       return status;  // Wrong VP8X / insufficient data.
     }
-    animation_present = !!(flags & ANIMATION_FLAG);
-    fragments_present = !!(flags & FRAGMENTS_FLAG);
     if (!found_riff && found_vp8x) {
       // Note: This restriction may be removed in the future, if it becomes
       // necessary to send VP8X chunk to the decoder.
       return VP8_STATUS_BITSTREAM_ERROR;
     }
-    if (has_alpha != NULL) *has_alpha = !!(flags & ALPHA_FLAG);
-    if (has_animation != NULL) *has_animation = animation_present;
-    if (format != NULL) *format = 0;   // default = undefined
-
-    image_width = canvas_width;
-    image_height = canvas_height;
-    if (found_vp8x && (animation_present || fragments_present) &&
-        headers == NULL) {
-      status = VP8_STATUS_OK;
-      goto ReturnWidthHeight;  // Just return features from VP8X header.
+    if (has_alpha != NULL) *has_alpha = !!(flags & ALPHA_FLAG_BIT);
+    if (found_vp8x && headers == NULL) {
+      return VP8_STATUS_OK;  // Return features from VP8X header.
     }
   }
 
-  if (data_size < TAG_SIZE) {
-    status = VP8_STATUS_NOT_ENOUGH_DATA;
-    goto ReturnWidthHeight;
-  }
+  if (data_size < TAG_SIZE) return VP8_STATUS_NOT_ENOUGH_DATA;
 
   // Skip over optional chunks if data started with "RIFF + VP8X" or "ALPH".
   if ((found_riff && found_vp8x) ||
@@ -348,7 +316,7 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
     status = ParseOptionalChunks(&data, &data_size, hdrs.riff_size,
                                  &hdrs.alpha_data, &hdrs.alpha_data_size);
     if (status != VP8_STATUS_OK) {
-      goto ReturnWidthHeight;  // Invalid chunk size / insufficient data.
+      return status;  // Found an invalid chunk size / insufficient data.
     }
   }
 
@@ -356,41 +324,35 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
   status = ParseVP8Header(&data, &data_size, hdrs.riff_size,
                           &hdrs.compressed_size, &hdrs.is_lossless);
   if (status != VP8_STATUS_OK) {
-    goto ReturnWidthHeight;  // Wrong VP8/VP8L chunk-header / insufficient data.
+    return status;  // Wrong VP8/VP8L chunk-header / insufficient data.
   }
   if (hdrs.compressed_size > MAX_CHUNK_PAYLOAD) {
     return VP8_STATUS_BITSTREAM_ERROR;
   }
 
-  if (format != NULL && !(animation_present || fragments_present)) {
-    *format = hdrs.is_lossless ? 2 : 1;
-  }
-
   if (!hdrs.is_lossless) {
     if (data_size < VP8_FRAME_HEADER_SIZE) {
-      status = VP8_STATUS_NOT_ENOUGH_DATA;
-      goto ReturnWidthHeight;
+      return VP8_STATUS_NOT_ENOUGH_DATA;
     }
     // Validates raw VP8 data.
-    if (!VP8GetInfo(data, data_size, (uint32_t)hdrs.compressed_size,
-                    &image_width, &image_height)) {
+    if (!VP8GetInfo(data, data_size,
+                    (uint32_t)hdrs.compressed_size, width, height)) {
       return VP8_STATUS_BITSTREAM_ERROR;
     }
   } else {
     if (data_size < VP8L_FRAME_HEADER_SIZE) {
-      status = VP8_STATUS_NOT_ENOUGH_DATA;
-      goto ReturnWidthHeight;
+      return VP8_STATUS_NOT_ENOUGH_DATA;
     }
     // Validates raw VP8L data.
-    if (!VP8LGetInfo(data, data_size, &image_width, &image_height, has_alpha)) {
+    if (!VP8LGetInfo(data, data_size, width, height, has_alpha)) {
       return VP8_STATUS_BITSTREAM_ERROR;
     }
   }
-  // Validates image size coherency.
-  if (found_vp8x) {
-    if (canvas_width != image_width || canvas_height != image_height) {
-      return VP8_STATUS_BITSTREAM_ERROR;
-    }
+
+  if (has_alpha != NULL) {
+    // If the data did not contain a VP8X/VP8L chunk the only definitive way
+    // to set this is by looking for alpha data (from an ALPH chunk).
+    *has_alpha |= (hdrs.alpha_data != NULL);
   }
   if (headers != NULL) {
     *headers = hdrs;
@@ -398,44 +360,21 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
     assert((uint64_t)(data - headers->data) < MAX_CHUNK_PAYLOAD);
     assert(headers->offset == headers->data_size - data_size);
   }
- ReturnWidthHeight:
-  if (status == VP8_STATUS_OK ||
-      (status == VP8_STATUS_NOT_ENOUGH_DATA && found_vp8x && headers == NULL)) {
-    if (has_alpha != NULL) {
-      // If the data did not contain a VP8X/VP8L chunk the only definitive way
-      // to set this is by looking for alpha data (from an ALPH chunk).
-      *has_alpha |= (hdrs.alpha_data != NULL);
-    }
-    if (width != NULL) *width = image_width;
-    if (height != NULL) *height = image_height;
-    return VP8_STATUS_OK;
-  } else {
-    return status;
-  }
+  return VP8_STATUS_OK;  // Return features from VP8 header.
 }
 
 VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers) {
-  VP8StatusCode status;
-  int has_animation = 0;
   assert(headers != NULL);
   // fill out headers, ignore width/height/has_alpha.
-  status = ParseHeadersInternal(headers->data, headers->data_size,
-                                NULL, NULL, NULL, &has_animation,
-                                NULL, headers);
-  if (status == VP8_STATUS_OK || status == VP8_STATUS_NOT_ENOUGH_DATA) {
-    // TODO(jzern): full support of animation frames will require API additions.
-    if (has_animation) {
-      status = VP8_STATUS_UNSUPPORTED_FEATURE;
-    }
-  }
-  return status;
+  return ParseHeadersInternal(headers->data, headers->data_size,
+                              NULL, NULL, NULL, headers);
 }
 
 //------------------------------------------------------------------------------
 // WebPDecParams
 
 void WebPResetDecParams(WebPDecParams* const params) {
-  if (params != NULL) {
+  if (params) {
     memset(params, 0, sizeof(*params));
   }
 }
@@ -468,6 +407,11 @@ static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
     if (dec == NULL) {
       return VP8_STATUS_OUT_OF_MEMORY;
     }
+#ifdef WEBP_USE_THREAD
+    dec->use_threads_ = params->options && (params->options->use_threads > 0);
+#else
+    dec->use_threads_ = 0;
+#endif
     dec->alpha_data_ = headers.alpha_data;
     dec->alpha_data_size_ = headers.alpha_data_size;
 
@@ -479,10 +423,6 @@ static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
       status = WebPAllocateDecBuffer(io.width, io.height, params->options,
                                      params->output);
       if (status == VP8_STATUS_OK) {  // Decode
-        // This change must be done before calling VP8Decode()
-        dec->mt_method_ = VP8GetThreadMethod(params->options, &headers,
-                                             io.width, io.height);
-        VP8InitDithering(params->options, dec);
         if (!VP8Decode(dec, &io)) {
           status = dec->status_;
         }
@@ -669,6 +609,7 @@ uint8_t* WebPDecodeYUV(const uint8_t* data, size_t data_size,
 static void DefaultFeatures(WebPBitstreamFeatures* const features) {
   assert(features != NULL);
   memset(features, 0, sizeof(*features));
+  features->bitstream_version = 0;
 }
 
 static VP8StatusCode GetFeatures(const uint8_t* const data, size_t data_size,
@@ -678,11 +619,10 @@ static VP8StatusCode GetFeatures(const uint8_t* const data, size_t data_size,
   }
   DefaultFeatures(features);
 
-  // Only parse enough of the data to retrieve the features.
+  // Only parse enough of the data to retrieve width/height/has_alpha.
   return ParseHeadersInternal(data, data_size,
                               &features->width, &features->height,
-                              &features->has_alpha, &features->has_animation,
-                              &features->format, NULL);
+                              &features->has_alpha, NULL);
 }
 
 //------------------------------------------------------------------------------
@@ -726,13 +666,19 @@ int WebPInitDecoderConfigInternal(WebPDecoderConfig* config,
 VP8StatusCode WebPGetFeaturesInternal(const uint8_t* data, size_t data_size,
                                       WebPBitstreamFeatures* features,
                                       int version) {
+  VP8StatusCode status;
   if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DECODER_ABI_VERSION)) {
     return VP8_STATUS_INVALID_PARAM;   // version mismatch
   }
   if (features == NULL) {
     return VP8_STATUS_INVALID_PARAM;
   }
-  return GetFeatures(data, data_size, features);
+
+  status = GetFeatures(data, data_size, features);
+  if (status == VP8_STATUS_NOT_ENOUGH_DATA) {
+    return VP8_STATUS_BITSTREAM_ERROR;  // Not-enough-data treated as error.
+  }
+  return status;
 }
 
 VP8StatusCode WebPDecode(const uint8_t* data, size_t data_size,
@@ -820,3 +766,6 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/dec/webpi.h b/drivers/webp/dec/webpi.h
index d915f5ef6f..44e5744411 100644
--- a/drivers/webp/dec/webpi.h
+++ b/drivers/webp/dec/webpi.h
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Internal header: WebP decoding parameters and custom IO on buffer
@@ -14,7 +12,7 @@
 #ifndef WEBP_DEC_WEBPI_H_
 #define WEBP_DEC_WEBPI_H_
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
@@ -63,10 +61,10 @@ typedef struct {
 } WebPHeaderStructure;
 
 // Skips over all valid chunks prior to the first VP8/VP8L frame header.
-// Returns: VP8_STATUS_OK, VP8_STATUS_BITSTREAM_ERROR (invalid header/chunk),
-// VP8_STATUS_NOT_ENOUGH_DATA (partial input) or VP8_STATUS_UNSUPPORTED_FEATURE
-// in the case of non-decodable features (animation for instance).
-// In 'headers', compressed_size, offset, alpha_data, alpha_size, and lossless
+// Returns VP8_STATUS_OK on success,
+//         VP8_STATUS_BITSTREAM_ERROR if an invalid header/chunk is found, and
+//         VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data.
+// In 'headers', compressed_size, offset, alpha_data, alpha_size and lossless
 // fields are updated appropriately upon success.
 VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers);
 
@@ -109,7 +107,7 @@ void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst);
 
 //------------------------------------------------------------------------------
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/decode.h b/drivers/webp/decode.h
index 0c3b62e215..43b6c58f4f 100644
--- a/drivers/webp/decode.h
+++ b/drivers/webp/decode.h
@@ -1,10 +1,8 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 //  Main decoding functions for WebP images.
@@ -16,23 +14,11 @@
 
 #include "./types.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-#define WEBP_DECODER_ABI_VERSION 0x0203    // MAJOR(8b) + MINOR(8b)
-
-// Note: forward declaring enumerations is not allowed in (strict) C and C++,
-// the types are left here for reference.
-// typedef enum VP8StatusCode VP8StatusCode;
-// typedef enum WEBP_CSP_MODE WEBP_CSP_MODE;
-typedef struct WebPRGBABuffer WebPRGBABuffer;
-typedef struct WebPYUVABuffer WebPYUVABuffer;
-typedef struct WebPDecBuffer WebPDecBuffer;
-typedef struct WebPIDecoder WebPIDecoder;
-typedef struct WebPBitstreamFeatures WebPBitstreamFeatures;
-typedef struct WebPDecoderOptions WebPDecoderOptions;
-typedef struct WebPDecoderConfig WebPDecoderConfig;
+#define WEBP_DECODER_ABI_VERSION 0x0200    // MAJOR(8b) + MINOR(8b)
 
 // Return the decoder's version number, packed in hexadecimal using 8bits for
 // each of major/minor/revision. E.g: v2.5.7 is 0x020507.
@@ -132,28 +118,20 @@ WEBP_EXTERN(uint8_t*) WebPDecodeYUVInto(
 // Note: the naming describes the byte-ordering of packed samples in memory.
 // For instance, MODE_BGRA relates to samples ordered as B,G,R,A,B,G,R,A,...
 // Non-capital names (e.g.:MODE_Argb) relates to pre-multiplied RGB channels.
-// RGBA-4444 and RGB-565 colorspaces are represented by following byte-order:
-// RGBA-4444: [r3 r2 r1 r0 g3 g2 g1 g0], [b3 b2 b1 b0 a3 a2 a1 a0], ...
-// RGB-565: [r4 r3 r2 r1 r0 g5 g4 g3], [g2 g1 g0 b4 b3 b2 b1 b0], ...
-// In the case WEBP_SWAP_16BITS_CSP is defined, the bytes are swapped for
-// these two modes:
-// RGBA-4444: [b3 b2 b1 b0 a3 a2 a1 a0], [r3 r2 r1 r0 g3 g2 g1 g0], ...
-// RGB-565: [g2 g1 g0 b4 b3 b2 b1 b0], [r4 r3 r2 r1 r0 g5 g4 g3], ...
-
-typedef enum WEBP_CSP_MODE {
-  MODE_RGB = 0, MODE_RGBA = 1,
-  MODE_BGR = 2, MODE_BGRA = 3,
-  MODE_ARGB = 4, MODE_RGBA_4444 = 5,
-  MODE_RGB_565 = 6,
-  // RGB-premultiplied transparent modes (alpha value is preserved)
-  MODE_rgbA = 7,
-  MODE_bgrA = 8,
-  MODE_Argb = 9,
-  MODE_rgbA_4444 = 10,
-  // YUV modes must come after RGB ones.
-  MODE_YUV = 11, MODE_YUVA = 12,  // yuv 4:2:0
-  MODE_LAST = 13
-} WEBP_CSP_MODE;
+// RGB-565 and RGBA-4444 are also endian-agnostic and byte-oriented.
+typedef enum { MODE_RGB = 0, MODE_RGBA = 1,
+               MODE_BGR = 2, MODE_BGRA = 3,
+               MODE_ARGB = 4, MODE_RGBA_4444 = 5,
+               MODE_RGB_565 = 6,
+               // RGB-premultiplied transparent modes (alpha value is preserved)
+               MODE_rgbA = 7,
+               MODE_bgrA = 8,
+               MODE_Argb = 9,
+               MODE_rgbA_4444 = 10,
+               // YUV modes must come after RGB ones.
+               MODE_YUV = 11, MODE_YUVA = 12,  // yuv 4:2:0
+               MODE_LAST = 13
+             } WEBP_CSP_MODE;
 
 // Some useful macros:
 static WEBP_INLINE int WebPIsPremultipliedMode(WEBP_CSP_MODE mode) {
@@ -174,13 +152,13 @@ static WEBP_INLINE int WebPIsRGBMode(WEBP_CSP_MODE mode) {
 //------------------------------------------------------------------------------
 // WebPDecBuffer: Generic structure for describing the output sample buffer.
 
-struct WebPRGBABuffer {    // view as RGBA
+typedef struct {    // view as RGBA
   uint8_t* rgba;    // pointer to RGBA samples
   int stride;       // stride in bytes from one scanline to the next.
   size_t size;      // total size of the *rgba buffer.
-};
+} WebPRGBABuffer;
 
-struct WebPYUVABuffer {              // view as YUVA
+typedef struct {              // view as YUVA
   uint8_t* y, *u, *v, *a;     // pointer to luma, chroma U/V, alpha samples
   int y_stride;               // luma stride
   int u_stride, v_stride;     // chroma strides
@@ -188,10 +166,10 @@ struct WebPYUVABuffer {              // view as YUVA
   size_t y_size;              // luma plane size
   size_t u_size, v_size;      // chroma planes size
   size_t a_size;              // alpha-plane size
-};
+} WebPYUVABuffer;
 
 // Output buffer
-struct WebPDecBuffer {
+typedef struct {
   WEBP_CSP_MODE colorspace;  // Colorspace.
   int width, height;         // Dimensions.
   int is_external_memory;    // If true, 'internal_memory' pointer is not used.
@@ -204,7 +182,7 @@ struct WebPDecBuffer {
   uint8_t* private_memory;   // Internally allocated memory (only when
                              // is_external_memory is false). Should not be used
                              // externally, but accessed via the buffer union.
-};
+} WebPDecBuffer;
 
 // Internal, version-checked, entry point
 WEBP_EXTERN(int) WebPInitDecBufferInternal(WebPDecBuffer*, int);
@@ -222,7 +200,7 @@ WEBP_EXTERN(void) WebPFreeDecBuffer(WebPDecBuffer* buffer);
 //------------------------------------------------------------------------------
 // Enumeration of the status codes
 
-typedef enum VP8StatusCode {
+typedef enum {
   VP8_STATUS_OK = 0,
   VP8_STATUS_OUT_OF_MEMORY,
   VP8_STATUS_INVALID_PARAM,
@@ -259,17 +237,13 @@ typedef enum VP8StatusCode {
 //   }
 //   WebPIDelete(idec);
 
+typedef struct WebPIDecoder WebPIDecoder;
+
 // Creates a new incremental decoder with the supplied buffer parameter.
 // This output_buffer can be passed NULL, in which case a default output buffer
 // is used (with MODE_RGB). Otherwise, an internal reference to 'output_buffer'
 // is kept, which means that the lifespan of 'output_buffer' must be larger than
 // that of the returned WebPIDecoder object.
-// The supplied 'output_buffer' content MUST NOT be changed between calls to
-// WebPIAppend() or WebPIUpdate() unless 'output_buffer.is_external_memory' is
-// set to 1. In such a case, it is allowed to modify the pointers, size and
-// stride of output_buffer.u.RGBA or output_buffer.u.YUVA, provided they remain
-// within valid bounds.
-// All other fields of WebPDecBuffer MUST remain constant between calls.
 // Returns NULL if the allocation failed.
 WEBP_EXTERN(WebPIDecoder*) WebPINewDecoder(WebPDecBuffer* output_buffer);
 
@@ -277,27 +251,19 @@ WEBP_EXTERN(WebPIDecoder*) WebPINewDecoder(WebPDecBuffer* output_buffer);
 // will output the RGB/A samples specified by 'csp' into a preallocated
 // buffer 'output_buffer'. The size of this buffer is at least
 // 'output_buffer_size' and the stride (distance in bytes between two scanlines)
-// is specified by 'output_stride'.
-// Additionally, output_buffer can be passed NULL in which case the output
-// buffer will be allocated automatically when the decoding starts. The
-// colorspace 'csp' is taken into account for allocating this buffer. All other
-// parameters are ignored.
-// Returns NULL if the allocation failed, or if some parameters are invalid.
+// is specified by 'output_stride'. Returns NULL if the allocation failed.
 WEBP_EXTERN(WebPIDecoder*) WebPINewRGB(
     WEBP_CSP_MODE csp,
     uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
 
 // This function allocates and initializes an incremental-decoder object, which
-// will output the raw luma/chroma samples into a preallocated planes if
-// supplied. The luma plane is specified by its pointer 'luma', its size
-// 'luma_size' and its stride 'luma_stride'. Similarly, the chroma-u plane
-// is specified by the 'u', 'u_size' and 'u_stride' parameters, and the chroma-v
-// plane by 'v' and 'v_size'. And same for the alpha-plane. The 'a' pointer
-// can be pass NULL in case one is not interested in the transparency plane.
-// Conversely, 'luma' can be passed NULL if no preallocated planes are supplied.
-// In this case, the output buffer will be automatically allocated (using
-// MODE_YUVA) when decoding starts. All parameters are then ignored.
-// Returns NULL if the allocation failed or if a parameter is invalid.
+// will output the raw luma/chroma samples into preallocated planes. The luma
+// plane is specified by its pointer 'luma', its size 'luma_size' and its stride
+// 'luma_stride'. Similarly, the chroma-u plane is specified by the 'u',
+// 'u_size' and 'u_stride' parameters, and the chroma-v plane by 'v'
+// and 'v_size'. And same for the alpha-plane. The 'a' pointer can be passed
+// NULL in case one is not interested in the transparency plane.
+// Returns NULL if the allocation failed.
 WEBP_EXTERN(WebPIDecoder*) WebPINewYUVA(
     uint8_t* luma, size_t luma_size, int luma_stride,
     uint8_t* u, size_t u_size, int u_stride,
@@ -378,7 +344,7 @@ WEBP_EXTERN(const WebPDecBuffer*) WebPIDecodedArea(
      CHECK(WebPGetFeatures(data, data_size, &config.input) == VP8_STATUS_OK);
 
      // C) Adjust 'config', if needed
-     config.no_fancy_upsampling = 1;
+     config.options.no_fancy_upsampling = 1;
      config.output.colorspace = MODE_BGRA;
      // etc.
 
@@ -399,20 +365,19 @@ WEBP_EXTERN(const WebPDecBuffer*) WebPIDecodedArea(
 */
 
 // Features gathered from the bitstream
-struct WebPBitstreamFeatures {
-  int width;          // Width in pixels, as read from the bitstream.
-  int height;         // Height in pixels, as read from the bitstream.
-  int has_alpha;      // True if the bitstream contains an alpha channel.
-  int has_animation;  // True if the bitstream is an animation.
-  int format;         // 0 = undefined (/mixed), 1 = lossy, 2 = lossless
+typedef struct {
+  int width;        // Width in pixels, as read from the bitstream.
+  int height;       // Height in pixels, as read from the bitstream.
+  int has_alpha;    // True if the bitstream contains an alpha channel.
 
   // Unused for now:
+  int bitstream_version;        // should be 0 for now. TODO(later)
   int no_incremental_decoding;  // if true, using incremental decoding is not
                                 // recommended.
   int rotate;                   // TODO(later)
   int uv_sampling;              // should be 0 for now. TODO(later)
-  uint32_t pad[2];              // padding for later use
-};
+  uint32_t pad[3];              // padding for later use
+} WebPBitstreamFeatures;
 
 // Internal, version-checked, entry point
 WEBP_EXTERN(VP8StatusCode) WebPGetFeaturesInternal(
@@ -420,9 +385,8 @@ WEBP_EXTERN(VP8StatusCode) WebPGetFeaturesInternal(
 
 // Retrieve features from the bitstream. The *features structure is filled
 // with information gathered from the bitstream.
-// Returns VP8_STATUS_OK when the features are successfully retrieved. Returns
-// VP8_STATUS_NOT_ENOUGH_DATA when more data is needed to retrieve the
-// features from headers. Returns error in other cases.
+// Returns VP8_STATUS_OK when the features are successfully retrieved, and an
+// error status (a non-zero VP8StatusCode) otherwise, e.g. on version mismatch.
 static WEBP_INLINE VP8StatusCode WebPGetFeatures(
     const uint8_t* data, size_t data_size,
     WebPBitstreamFeatures* features) {
@@ -431,7 +395,7 @@ static WEBP_INLINE VP8StatusCode WebPGetFeatures(
 }
 
 // Decoding options
-struct WebPDecoderOptions {
+typedef struct {
   int bypass_filtering;               // if true, skip the in-loop filtering
   int no_fancy_upsampling;            // if true, use faster pointwise upsampler
   int use_cropping;                   // if true, cropping is applied _first_
@@ -441,20 +405,19 @@ struct WebPDecoderOptions {
   int use_scaling;                    // if true, scaling is applied _afterward_
   int scaled_width, scaled_height;    // final resolution
   int use_threads;                    // if true, use multi-threaded decoding
-  int dithering_strength;             // dithering strength (0=Off, 100=full)
 
   // Unused for now:
   int force_rotation;                 // forced rotation (to be applied _last_)
   int no_enhancement;                 // if true, discard enhancement layer
-  uint32_t pad[5];                    // padding for later use
-};
+  uint32_t pad[6];                    // padding for later use
+} WebPDecoderOptions;
 
 // Main object storing the configuration for advanced decoding.
-struct WebPDecoderConfig {
+typedef struct {
   WebPBitstreamFeatures input;  // Immutable bitstream features (optional)
   WebPDecBuffer output;         // Output buffer (can point to external mem)
   WebPDecoderOptions options;   // Decoding options
-};
+} WebPDecoderConfig;
 
 // Internal, version-checked, entry point
 WEBP_EXTERN(int) WebPInitDecoderConfigInternal(WebPDecoderConfig*, int);
@@ -484,7 +447,7 @@ WEBP_EXTERN(WebPIDecoder*) WebPIDecode(const uint8_t* data, size_t data_size,
 WEBP_EXTERN(VP8StatusCode) WebPDecode(const uint8_t* data, size_t data_size,
                                       WebPDecoderConfig* config);
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/demux.h b/drivers/webp/demux.h
deleted file mode 100644
index 2da3239dd9..0000000000
--- a/drivers/webp/demux.h
+++ /dev/null
@@ -1,224 +0,0 @@
-// Copyright 2012 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Demux API.
-// Enables extraction of image and extended format data from WebP files.
-
-// Code Example: Demuxing WebP data to extract all the frames, ICC profile
-// and EXIF/XMP metadata.
-/*
-  WebPDemuxer* demux = WebPDemux(&webp_data);
-
-  uint32_t width = WebPDemuxGetI(demux, WEBP_FF_CANVAS_WIDTH);
-  uint32_t height = WebPDemuxGetI(demux, WEBP_FF_CANVAS_HEIGHT);
-  // ... (Get information about the features present in the WebP file).
-  uint32_t flags = WebPDemuxGetI(demux, WEBP_FF_FORMAT_FLAGS);
-
-  // ... (Iterate over all frames).
-  WebPIterator iter;
-  if (WebPDemuxGetFrame(demux, 1, &iter)) {
-    do {
-      // ... (Consume 'iter'; e.g. Decode 'iter.fragment' with WebPDecode(),
-      // ... and get other frame properties like width, height, offsets etc.
-      // ... see 'struct WebPIterator' below for more info).
-    } while (WebPDemuxNextFrame(&iter));
-    WebPDemuxReleaseIterator(&iter);
-  }
-
-  // ... (Extract metadata).
-  WebPChunkIterator chunk_iter;
-  if (flags & ICCP_FLAG) WebPDemuxGetChunk(demux, "ICCP", 1, &chunk_iter);
-  // ... (Consume the ICC profile in 'chunk_iter.chunk').
-  WebPDemuxReleaseChunkIterator(&chunk_iter);
-  if (flags & EXIF_FLAG) WebPDemuxGetChunk(demux, "EXIF", 1, &chunk_iter);
-  // ... (Consume the EXIF metadata in 'chunk_iter.chunk').
-  WebPDemuxReleaseChunkIterator(&chunk_iter);
-  if (flags & XMP_FLAG) WebPDemuxGetChunk(demux, "XMP ", 1, &chunk_iter);
-  // ... (Consume the XMP metadata in 'chunk_iter.chunk').
-  WebPDemuxReleaseChunkIterator(&chunk_iter);
-  WebPDemuxDelete(demux);
-*/
-
-#ifndef WEBP_WEBP_DEMUX_H_
-#define WEBP_WEBP_DEMUX_H_
-
-#include "./mux_types.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define WEBP_DEMUX_ABI_VERSION 0x0101    // MAJOR(8b) + MINOR(8b)
-
-// Note: forward declaring enumerations is not allowed in (strict) C and C++,
-// the types are left here for reference.
-// typedef enum WebPDemuxState WebPDemuxState;
-// typedef enum WebPFormatFeature WebPFormatFeature;
-typedef struct WebPDemuxer WebPDemuxer;
-typedef struct WebPIterator WebPIterator;
-typedef struct WebPChunkIterator WebPChunkIterator;
-
-//------------------------------------------------------------------------------
-
-// Returns the version number of the demux library, packed in hexadecimal using
-// 8bits for each of major/minor/revision. E.g: v2.5.7 is 0x020507.
-WEBP_EXTERN(int) WebPGetDemuxVersion(void);
-
-//------------------------------------------------------------------------------
-// Life of a Demux object
-
-typedef enum WebPDemuxState {
-  WEBP_DEMUX_PARSE_ERROR    = -1,  // An error occurred while parsing.
-  WEBP_DEMUX_PARSING_HEADER =  0,  // Not enough data to parse full header.
-  WEBP_DEMUX_PARSED_HEADER  =  1,  // Header parsing complete,
-                                   // data may be available.
-  WEBP_DEMUX_DONE           =  2   // Entire file has been parsed.
-} WebPDemuxState;
-
-// Internal, version-checked, entry point
-WEBP_EXTERN(WebPDemuxer*) WebPDemuxInternal(
-    const WebPData*, int, WebPDemuxState*, int);
-
-// Parses the full WebP file given by 'data'.
-// Returns a WebPDemuxer object on successful parse, NULL otherwise.
-static WEBP_INLINE WebPDemuxer* WebPDemux(const WebPData* data) {
-  return WebPDemuxInternal(data, 0, NULL, WEBP_DEMUX_ABI_VERSION);
-}
-
-// Parses the possibly incomplete WebP file given by 'data'.
-// If 'state' is non-NULL it will be set to indicate the status of the demuxer.
-// Returns NULL in case of error or if there isn't enough data to start parsing;
-// and a WebPDemuxer object on successful parse.
-// Note that WebPDemuxer keeps internal pointers to 'data' memory segment.
-// If this data is volatile, the demuxer object should be deleted (by calling
-// WebPDemuxDelete()) and WebPDemuxPartial() called again on the new data.
-// This is usually an inexpensive operation.
-static WEBP_INLINE WebPDemuxer* WebPDemuxPartial(
-    const WebPData* data, WebPDemuxState* state) {
-  return WebPDemuxInternal(data, 1, state, WEBP_DEMUX_ABI_VERSION);
-}
-
-// Frees memory associated with 'dmux'.
-WEBP_EXTERN(void) WebPDemuxDelete(WebPDemuxer* dmux);
-
-//------------------------------------------------------------------------------
-// Data/information extraction.
-
-typedef enum WebPFormatFeature {
-  WEBP_FF_FORMAT_FLAGS,  // Extended format flags present in the 'VP8X' chunk.
-  WEBP_FF_CANVAS_WIDTH,
-  WEBP_FF_CANVAS_HEIGHT,
-  WEBP_FF_LOOP_COUNT,
-  WEBP_FF_BACKGROUND_COLOR,
-  WEBP_FF_FRAME_COUNT    // Number of frames present in the demux object.
-                         // In case of a partial demux, this is the number of
-                         // frames seen so far, with the last frame possibly
-                         // being partial.
-} WebPFormatFeature;
-
-// Get the 'feature' value from the 'dmux'.
-// NOTE: values are only valid if WebPDemux() was used or WebPDemuxPartial()
-// returned a state > WEBP_DEMUX_PARSING_HEADER.
-WEBP_EXTERN(uint32_t) WebPDemuxGetI(
-    const WebPDemuxer* dmux, WebPFormatFeature feature);
-
-//------------------------------------------------------------------------------
-// Frame iteration.
-
-struct WebPIterator {
-  int frame_num;
-  int num_frames;          // equivalent to WEBP_FF_FRAME_COUNT.
-  int fragment_num;
-  int num_fragments;
-  int x_offset, y_offset;  // offset relative to the canvas.
-  int width, height;       // dimensions of this frame or fragment.
-  int duration;            // display duration in milliseconds.
-  WebPMuxAnimDispose dispose_method;  // dispose method for the frame.
-  int complete;   // true if 'fragment' contains a full frame. partial images
-                  // may still be decoded with the WebP incremental decoder.
-  WebPData fragment;  // The frame or fragment given by 'frame_num' and
-                      // 'fragment_num'.
-  int has_alpha;      // True if the frame or fragment contains transparency.
-  WebPMuxAnimBlend blend_method;  // Blend operation for the frame.
-
-  uint32_t pad[2];         // padding for later use.
-  void* private_;          // for internal use only.
-};
-
-// Retrieves frame 'frame_number' from 'dmux'.
-// 'iter->fragment' points to the first fragment on return from this function.
-// Individual fragments may be extracted using WebPDemuxSelectFragment().
-// Setting 'frame_number' equal to 0 will return the last frame of the image.
-// Returns false if 'dmux' is NULL or frame 'frame_number' is not present.
-// Call WebPDemuxReleaseIterator() when use of the iterator is complete.
-// NOTE: 'dmux' must persist for the lifetime of 'iter'.
-WEBP_EXTERN(int) WebPDemuxGetFrame(
-    const WebPDemuxer* dmux, int frame_number, WebPIterator* iter);
-
-// Sets 'iter->fragment' to point to the next ('iter->frame_num' + 1) or
-// previous ('iter->frame_num' - 1) frame. These functions do not loop.
-// Returns true on success, false otherwise.
-WEBP_EXTERN(int) WebPDemuxNextFrame(WebPIterator* iter);
-WEBP_EXTERN(int) WebPDemuxPrevFrame(WebPIterator* iter);
-
-// Sets 'iter->fragment' to reflect fragment number 'fragment_num'.
-// Returns true if fragment 'fragment_num' is present, false otherwise.
-WEBP_EXTERN(int) WebPDemuxSelectFragment(WebPIterator* iter, int fragment_num);
-
-// Releases any memory associated with 'iter'.
-// Must be called before any subsequent calls to WebPDemuxGetChunk() on the same
-// iter. Also, must be called before destroying the associated WebPDemuxer with
-// WebPDemuxDelete().
-WEBP_EXTERN(void) WebPDemuxReleaseIterator(WebPIterator* iter);
-
-//------------------------------------------------------------------------------
-// Chunk iteration.
-
-struct WebPChunkIterator {
-  // The current and total number of chunks with the fourcc given to
-  // WebPDemuxGetChunk().
-  int chunk_num;
-  int num_chunks;
-  WebPData chunk;    // The payload of the chunk.
-
-  uint32_t pad[6];   // padding for later use
-  void* private_;
-};
-
-// Retrieves the 'chunk_number' instance of the chunk with id 'fourcc' from
-// 'dmux'.
-// 'fourcc' is a character array containing the fourcc of the chunk to return,
-// e.g., "ICCP", "XMP ", "EXIF", etc.
-// Setting 'chunk_number' equal to 0 will return the last chunk in a set.
-// Returns true if the chunk is found, false otherwise. Image related chunk
-// payloads are accessed through WebPDemuxGetFrame() and related functions.
-// Call WebPDemuxReleaseChunkIterator() when use of the iterator is complete.
-// NOTE: 'dmux' must persist for the lifetime of the iterator.
-WEBP_EXTERN(int) WebPDemuxGetChunk(const WebPDemuxer* dmux,
-                                   const char fourcc[4], int chunk_number,
-                                   WebPChunkIterator* iter);
-
-// Sets 'iter->chunk' to point to the next ('iter->chunk_num' + 1) or previous
-// ('iter->chunk_num' - 1) chunk. These functions do not loop.
-// Returns true on success, false otherwise.
-WEBP_EXTERN(int) WebPDemuxNextChunk(WebPChunkIterator* iter);
-WEBP_EXTERN(int) WebPDemuxPrevChunk(WebPChunkIterator* iter);
-
-// Releases any memory associated with 'iter'.
-// Must be called before destroying the associated WebPDemuxer with
-// WebPDemuxDelete().
-WEBP_EXTERN(void) WebPDemuxReleaseChunkIterator(WebPChunkIterator* iter);
-
-//------------------------------------------------------------------------------
-
-#ifdef __cplusplus
-}    // extern "C"
-#endif
-
-#endif  /* WEBP_WEBP_DEMUX_H_ */
diff --git a/drivers/webp/dsp/cpu.c b/drivers/webp/dsp/cpu.c
index 7a1f417a55..0228734457 100644
--- a/drivers/webp/dsp/cpu.c
+++ b/drivers/webp/dsp/cpu.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // CPU detection
@@ -17,6 +15,10 @@
 #include <cpu-features.h>
 #endif
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // SSE2 detection.
 //
@@ -78,3 +80,6 @@ VP8CPUInfo VP8GetCPUInfo = armCPUInfo;
 VP8CPUInfo VP8GetCPUInfo = NULL;
 #endif
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/dsp/dec.c b/drivers/webp/dsp/dec.c
index 8b246fad0a..9ae7b6fa76 100644
--- a/drivers/webp/dsp/dec.c
+++ b/drivers/webp/dsp/dec.c
@@ -1,10 +1,8 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Speed-critical decoding functions.
@@ -14,6 +12,10 @@
 #include "./dsp.h"
 #include "../dec/vp8i.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // run-time tables (~4k)
 
@@ -57,14 +59,6 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
 #define STORE(x, y, v) \
   dst[x + y * BPS] = clip_8b(dst[x + y * BPS] + ((v) >> 3))
 
-#define STORE2(y, dc, d, c) do {    \
-  const int DC = (dc);              \
-  STORE(0, y, DC + (d));            \
-  STORE(1, y, DC + (c));            \
-  STORE(2, y, DC - (c));            \
-  STORE(3, y, DC - (d));            \
-} while (0)
-
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
 #define MUL(a, b) (((a) * (b)) >> 16)
@@ -107,21 +101,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
     dst += BPS;
   }
 }
-
-// Simplified transform when only in[0], in[1] and in[4] are non-zero
-static void TransformAC3(const int16_t* in, uint8_t* dst) {
-  const int a = in[0] + 4;
-  const int c4 = MUL(in[4], kC2);
-  const int d4 = MUL(in[4], kC1);
-  const int c1 = MUL(in[1], kC2);
-  const int d1 = MUL(in[1], kC1);
-  STORE2(0, a + d4, d1, c1);
-  STORE2(1, a + c4, d1, c1);
-  STORE2(2, a - c4, d1, c1);
-  STORE2(3, a - d4, d1, c1);
-}
 #undef MUL
-#undef STORE2
 
 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
   TransformOne(in, dst);
@@ -446,16 +426,11 @@ static void HE8uv(uint8_t *dst) {    // horizontal
 }
 
 // helper for chroma-DC predictions
-static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
+static WEBP_INLINE void Put8x8uv(uint64_t v, uint8_t* dst) {
   int j;
-#ifndef WEBP_REFERENCE_IMPLEMENTATION
-  const uint64_t v = (uint64_t)value * 0x0101010101010101ULL;
   for (j = 0; j < 8; ++j) {
     *(uint64_t*)(dst + j * BPS) = v;
   }
-#else
-  for (j = 0; j < 8; ++j) memset(dst + j * BPS, value, 8);
-#endif
 }
 
 static void DC8uv(uint8_t *dst) {     // DC
@@ -464,7 +439,7 @@ static void DC8uv(uint8_t *dst) {     // DC
   for (i = 0; i < 8; ++i) {
     dc0 += dst[i - BPS] + dst[-1 + i * BPS];
   }
-  Put8x8uv(dc0 >> 4, dst);
+  Put8x8uv((uint64_t)((dc0 >> 4) * 0x0101010101010101ULL), dst);
 }
 
 static void DC8uvNoLeft(uint8_t *dst) {   // DC with no left samples
@@ -473,7 +448,7 @@ static void DC8uvNoLeft(uint8_t *dst) {   // DC with no left samples
   for (i = 0; i < 8; ++i) {
     dc0 += dst[i - BPS];
   }
-  Put8x8uv(dc0 >> 3, dst);
+  Put8x8uv((uint64_t)((dc0 >> 3) * 0x0101010101010101ULL), dst);
 }
 
 static void DC8uvNoTop(uint8_t *dst) {  // DC with no top samples
@@ -482,11 +457,11 @@ static void DC8uvNoTop(uint8_t *dst) {  // DC with no top samples
   for (i = 0; i < 8; ++i) {
     dc0 += dst[-1 + i * BPS];
   }
-  Put8x8uv(dc0 >> 3, dst);
+  Put8x8uv((uint64_t)((dc0 >> 3) * 0x0101010101010101ULL), dst);
 }
 
 static void DC8uvNoTopLeft(uint8_t *dst) {    // DC with nothing
-  Put8x8uv(0x80, dst);
+  Put8x8uv(0x8080808080808080ULL, dst);
 }
 
 //------------------------------------------------------------------------------
@@ -697,7 +672,6 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
 //------------------------------------------------------------------------------
 
 VP8DecIdct2 VP8Transform;
-VP8DecIdct VP8TransformAC3;
 VP8DecIdct VP8TransformUV;
 VP8DecIdct VP8TransformDC;
 VP8DecIdct VP8TransformDCUV;
@@ -725,7 +699,6 @@ void VP8DspInit(void) {
   VP8TransformUV = TransformUV;
   VP8TransformDC = TransformDC;
   VP8TransformDCUV = TransformDCUV;
-  VP8TransformAC3 = TransformAC3;
 
   VP8VFilter16 = VFilter16;
   VP8HFilter16 = HFilter16;
@@ -754,3 +727,6 @@ void VP8DspInit(void) {
   }
 }
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/dsp/dec_neon.c b/drivers/webp/dsp/dec_neon.c
index 9c3d8cc016..ec824b790b 100644
--- a/drivers/webp/dsp/dec_neon.c
+++ b/drivers/webp/dsp/dec_neon.c
@@ -1,10 +1,8 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // ARM NEON version of dsp functions and loop filtering.
@@ -18,7 +16,11 @@
 
 #include "../dec/vp8i.h"
 
-#define QRegs "q0", "q1", "q2", "q3",                                          \
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",                  \
               "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
 
 #define FLIP_SIGN_BIT2(a, b, s)                                                \
@@ -77,7 +79,7 @@
   "vld4.8   {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \
   "vld4.8   {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n"
 
-#define STORE8x2(c1, c2, p, stride)                                            \
+#define STORE8x2(c1, c2, p,stride)                                             \
   "vst2.8   {" #c1"[0], " #c2"[0]}," #p "," #stride " \n"                      \
   "vst2.8   {" #c1"[1], " #c2"[1]}," #p "," #stride " \n"                      \
   "vst2.8   {" #c1"[2], " #c2"[2]}," #p "," #stride " \n"                      \
@@ -97,9 +99,9 @@ static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {
     "vld1.u8    {q1}, [%[p]], %[stride]        \n"  // p1
     "vld1.u8    {q2}, [%[p]], %[stride]        \n"  // p0
     "vld1.u8    {q3}, [%[p]], %[stride]        \n"  // q0
-    "vld1.u8    {q12}, [%[p]]                  \n"  // q1
+    "vld1.u8    {q4}, [%[p]]                   \n"  // q1
 
-    DO_FILTER2(q1, q2, q3, q12, %[thresh])
+    DO_FILTER2(q1, q2, q3, q4, %[thresh])
 
     "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
 
@@ -118,18 +120,18 @@ static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) {
     "add        r5, r4, %[stride]              \n"  // base2 = base1 + stride
 
     LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
-    LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
-    "vswp       d3, d24                        \n"  // p1:q1 p0:q3
-    "vswp       d5, d26                        \n"  // q0:q2 q1:q4
-    "vswp       q2, q12                        \n"  // p1:q1 p0:q2 q0:q3 q1:q4
+    LOAD8x4(d6, d7, d8, d9, [r4], [r5], r6)
+    "vswp       d3, d6                         \n"  // p1:q1 p0:q3
+    "vswp       d5, d8                         \n"  // q0:q2 q1:q4
+    "vswp       q2, q3                         \n"  // p1:q1 p0:q2 q0:q3 q1:q4
 
-    DO_FILTER2(q1, q2, q12, q13, %[thresh])
+    DO_FILTER2(q1, q2, q3, q4, %[thresh])
 
     "sub        %[p], %[p], #1                 \n"  // p - 1
 
-    "vswp        d5, d24                       \n"
+    "vswp        d5, d6                        \n"
     STORE8x2(d4, d5, [%[p]], %[stride])
-    STORE8x2(d24, d25, [%[p]], %[stride])
+    STORE8x2(d6, d7, [%[p]], %[stride])
 
     : [p] "+r"(p)
     : [stride] "r"(stride), [thresh] "r"(thresh)
@@ -153,10 +155,7 @@ static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {
   }
 }
 
-//-----------------------------------------------------------------------------
-// Inverse transforms (Paragraph 14.4)
-
-static void TransformOne(const int16_t* in, uint8_t* dst) {
+static void TransformOneNEON(const int16_t *in, uint8_t *dst) {
   const int kBPS = BPS;
   const int16_t constants[] = {20091, 17734, 0, 0};
   /* kC1, kC2. Padded because vld1.16 loads 8 bytes
@@ -305,129 +304,26 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
   );
 }
 
-static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
-  TransformOne(in, dst);
+static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) {
+  TransformOneNEON(in, dst);
   if (do_two) {
-    TransformOne(in + 16, dst + 4);
+    TransformOneNEON(in + 16, dst + 4);
   }
 }
 
-static void TransformDC(const int16_t* in, uint8_t* dst) {
-  const int DC = (in[0] + 4) >> 3;
-  const int kBPS = BPS;
-  __asm__ volatile (
-    "vdup.16         q1, %[DC]        \n"
-
-    "vld1.32         d0[0], [%[dst]], %[kBPS]    \n"
-    "vld1.32         d1[0], [%[dst]], %[kBPS]    \n"
-    "vld1.32         d0[1], [%[dst]], %[kBPS]    \n"
-    "vld1.32         d1[1], [%[dst]], %[kBPS]    \n"
-
-    "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"
-
-    // add DC and convert to s16.
-    "vaddw.u8        q2, q1, d0                  \n"
-    "vaddw.u8        q3, q1, d1                  \n"
-    // convert back to u8 with saturation
-    "vqmovun.s16     d0,  q2                     \n"
-    "vqmovun.s16     d1,  q3                     \n"
-
-    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
-    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
-    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
-    "vst1.32         d1[1], [%[dst]]             \n"
-    : [in] "+r"(in), [dst] "+r"(dst)  /* modified registers */
-    : [kBPS] "r"(kBPS),   /* constants */
-      [DC] "r"(DC)
-    : "memory", "q0", "q1", "q2", "q3"  /* clobbered */
-  );
-}
-
-static void TransformWHT(const int16_t* in, int16_t* out) {
-  const int kStep = 32;  // The store is only incrementing the pointer as if we
-                         // had stored a single byte.
-  __asm__ volatile (
-    // part 1
-    // load data into q0, q1
-    "vld1.16         {q0, q1}, [%[in]]           \n"
-
-    "vaddl.s16       q2, d0, d3                  \n"  // a0 = in[0] + in[12]
-    "vaddl.s16       q3, d1, d2                  \n"  // a1 = in[4] + in[8]
-    "vsubl.s16       q10, d1, d2                 \n"  // a2 = in[4] - in[8]
-    "vsubl.s16       q11, d0, d3                 \n"  // a3 = in[0] - in[12]
-
-    "vadd.s32        q0, q2, q3                  \n"  // tmp[0] = a0 + a1
-    "vsub.s32        q2, q2, q3                  \n"  // tmp[8] = a0 - a1
-    "vadd.s32        q1, q11, q10                \n"  // tmp[4] = a3 + a2
-    "vsub.s32        q3, q11, q10                \n"  // tmp[12] = a3 - a2
-
-    // Transpose
-    // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14]
-    // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15]
-    "vswp            d1, d4                      \n"  // vtrn.64 q0, q2
-    "vswp            d3, d6                      \n"  // vtrn.64 q1, q3
-    "vtrn.32         q0, q1                      \n"
-    "vtrn.32         q2, q3                      \n"
-
-    "vmov.s32        q10, #3                     \n"  // dc = 3
-    "vadd.s32        q0, q0, q10                 \n"  // dc = tmp[0] + 3
-    "vadd.s32        q12, q0, q3                 \n"  // a0 = dc + tmp[3]
-    "vadd.s32        q13, q1, q2                 \n"  // a1 = tmp[1] + tmp[2]
-    "vsub.s32        q8, q1, q2                  \n"  // a2 = tmp[1] - tmp[2]
-    "vsub.s32        q9, q0, q3                  \n"  // a3 = dc - tmp[3]
-
-    "vadd.s32        q0, q12, q13                \n"
-    "vshrn.s32       d0, q0, #3                  \n"  // (a0 + a1) >> 3
-    "vadd.s32        q1, q9, q8                  \n"
-    "vshrn.s32       d1, q1, #3                  \n"  // (a3 + a2) >> 3
-    "vsub.s32        q2, q12, q13                \n"
-    "vshrn.s32       d2, q2, #3                  \n"  // (a0 - a1) >> 3
-    "vsub.s32        q3, q9, q8                  \n"
-    "vshrn.s32       d3, q3, #3                  \n"  // (a3 - a2) >> 3
-
-    // set the results to output
-    "vst1.16         d0[0], [%[out]], %[kStep]   \n"
-    "vst1.16         d1[0], [%[out]], %[kStep]   \n"
-    "vst1.16         d2[0], [%[out]], %[kStep]   \n"
-    "vst1.16         d3[0], [%[out]], %[kStep]   \n"
-    "vst1.16         d0[1], [%[out]], %[kStep]   \n"
-    "vst1.16         d1[1], [%[out]], %[kStep]   \n"
-    "vst1.16         d2[1], [%[out]], %[kStep]   \n"
-    "vst1.16         d3[1], [%[out]], %[kStep]   \n"
-    "vst1.16         d0[2], [%[out]], %[kStep]   \n"
-    "vst1.16         d1[2], [%[out]], %[kStep]   \n"
-    "vst1.16         d2[2], [%[out]], %[kStep]   \n"
-    "vst1.16         d3[2], [%[out]], %[kStep]   \n"
-    "vst1.16         d0[3], [%[out]], %[kStep]   \n"
-    "vst1.16         d1[3], [%[out]], %[kStep]   \n"
-    "vst1.16         d2[3], [%[out]], %[kStep]   \n"
-    "vst1.16         d3[3], [%[out]], %[kStep]   \n"
-
-    : [out] "+r"(out)  // modified registers
-    : [in] "r"(in), [kStep] "r"(kStep)  // constants
-    : "memory", "q0", "q1", "q2", "q3",
-      "q8", "q9", "q10", "q11", "q12", "q13"  // clobbered
-  );
-}
-
-#endif   // WEBP_USE_NEON
-
-//------------------------------------------------------------------------------
-// Entry point
-
 extern void VP8DspInitNEON(void);
 
 void VP8DspInitNEON(void) {
-#if defined(WEBP_USE_NEON)
-  VP8Transform = TransformTwo;
-  VP8TransformAC3 = TransformOne;  // no special code here
-  VP8TransformDC = TransformDC;
-  VP8TransformWHT = TransformWHT;
+  VP8Transform = TransformTwoNEON;
 
   VP8SimpleVFilter16 = SimpleVFilter16NEON;
   VP8SimpleHFilter16 = SimpleHFilter16NEON;
   VP8SimpleVFilter16i = SimpleVFilter16iNEON;
   VP8SimpleHFilter16i = SimpleHFilter16iNEON;
-#endif   // WEBP_USE_NEON
 }
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif   // WEBP_USE_NEON
diff --git a/drivers/webp/dsp/dec_sse2.c b/drivers/webp/dsp/dec_sse2.c
index 150c559f13..472b68ecb8 100644
--- a/drivers/webp/dsp/dec_sse2.c
+++ b/drivers/webp/dsp/dec_sse2.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // SSE2 version of some decoding functions (idct, loop filtering).
@@ -16,13 +14,13 @@
 
 #if defined(WEBP_USE_SSE2)
 
-// The 3-coeff sparse transform in SSE2 is not really faster than the plain-C
-// one it seems => disable it by default. Uncomment the following to enable:
-// #define USE_TRANSFORM_AC3
-
 #include <emmintrin.h>
 #include "../dec/vp8i.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
 
@@ -196,21 +194,21 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
 
   // Add inverse transform to 'dst' and store.
   {
-    const __m128i zero = _mm_setzero_si128();
+    const __m128i zero = _mm_set1_epi16(0);
     // Load the reference(s).
     __m128i dst0, dst1, dst2, dst3;
     if (do_two) {
       // Load eight bytes/pixels per line.
-      dst0 = _mm_loadl_epi64((__m128i*)(dst + 0 * BPS));
-      dst1 = _mm_loadl_epi64((__m128i*)(dst + 1 * BPS));
-      dst2 = _mm_loadl_epi64((__m128i*)(dst + 2 * BPS));
-      dst3 = _mm_loadl_epi64((__m128i*)(dst + 3 * BPS));
+      dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]);
+      dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]);
+      dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]);
+      dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]);
     } else {
       // Load four bytes/pixels per line.
-      dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
-      dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
-      dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
-      dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
+      dst0 = _mm_cvtsi32_si128(*(int*)&dst[0 * BPS]);
+      dst1 = _mm_cvtsi32_si128(*(int*)&dst[1 * BPS]);
+      dst2 = _mm_cvtsi32_si128(*(int*)&dst[2 * BPS]);
+      dst3 = _mm_cvtsi32_si128(*(int*)&dst[3 * BPS]);
     }
     // Convert to 16b.
     dst0 = _mm_unpacklo_epi8(dst0, zero);
@@ -230,66 +228,20 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
     // Store the results.
     if (do_two) {
       // Store eight bytes/pixels per line.
-      _mm_storel_epi64((__m128i*)(dst + 0 * BPS), dst0);
-      _mm_storel_epi64((__m128i*)(dst + 1 * BPS), dst1);
-      _mm_storel_epi64((__m128i*)(dst + 2 * BPS), dst2);
-      _mm_storel_epi64((__m128i*)(dst + 3 * BPS), dst3);
+      _mm_storel_epi64((__m128i*)&dst[0 * BPS], dst0);
+      _mm_storel_epi64((__m128i*)&dst[1 * BPS], dst1);
+      _mm_storel_epi64((__m128i*)&dst[2 * BPS], dst2);
+      _mm_storel_epi64((__m128i*)&dst[3 * BPS], dst3);
     } else {
       // Store four bytes/pixels per line.
-      *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
-      *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
-      *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
-      *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
+      *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(dst0);
+      *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(dst1);
+      *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(dst2);
+      *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(dst3);
     }
   }
 }
 
-#if defined(USE_TRANSFORM_AC3)
-#define MUL(a, b) (((a) * (b)) >> 16)
-static void TransformAC3SSE2(const int16_t* in, uint8_t* dst) {
-  static const int kC1 = 20091 + (1 << 16);
-  static const int kC2 = 35468;
-  const __m128i A = _mm_set1_epi16(in[0] + 4);
-  const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2));
-  const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1));
-  const int c1 = MUL(in[1], kC2);
-  const int d1 = MUL(in[1], kC1);
-  const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1);
-  const __m128i B = _mm_adds_epi16(A, CD);
-  const __m128i m0 = _mm_adds_epi16(B, d4);
-  const __m128i m1 = _mm_adds_epi16(B, c4);
-  const __m128i m2 = _mm_subs_epi16(B, c4);
-  const __m128i m3 = _mm_subs_epi16(B, d4);
-  const __m128i zero = _mm_setzero_si128();
-  // Load the source pixels.
-  __m128i dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
-  __m128i dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
-  __m128i dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
-  __m128i dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
-  // Convert to 16b.
-  dst0 = _mm_unpacklo_epi8(dst0, zero);
-  dst1 = _mm_unpacklo_epi8(dst1, zero);
-  dst2 = _mm_unpacklo_epi8(dst2, zero);
-  dst3 = _mm_unpacklo_epi8(dst3, zero);
-  // Add the inverse transform.
-  dst0 = _mm_adds_epi16(dst0, _mm_srai_epi16(m0, 3));
-  dst1 = _mm_adds_epi16(dst1, _mm_srai_epi16(m1, 3));
-  dst2 = _mm_adds_epi16(dst2, _mm_srai_epi16(m2, 3));
-  dst3 = _mm_adds_epi16(dst3, _mm_srai_epi16(m3, 3));
-  // Unsigned saturate to 8b.
-  dst0 = _mm_packus_epi16(dst0, dst0);
-  dst1 = _mm_packus_epi16(dst1, dst1);
-  dst2 = _mm_packus_epi16(dst2, dst2);
-  dst3 = _mm_packus_epi16(dst3, dst3);
-  // Store the results.
-  *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
-  *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
-  *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
-  *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
-}
-#undef MUL
-#endif   // USE_TRANSFORM_AC3
-
 //------------------------------------------------------------------------------
 // Loop Filter (Paragraph 15)
 
@@ -326,14 +278,14 @@ static void TransformAC3SSE2(const int16_t* in, uint8_t* dst) {
 
 #define GET_NOTHEV(p1, p0, q0, q1, hev_thresh, not_hev) {                      \
   const __m128i zero = _mm_setzero_si128();                                    \
-  const __m128i t_1 = MM_ABS(p1, p0);                                          \
-  const __m128i t_2 = MM_ABS(q1, q0);                                          \
+  const __m128i t1 = MM_ABS(p1, p0);                                           \
+  const __m128i t2 = MM_ABS(q1, q0);                                           \
                                                                                \
   const __m128i h = _mm_set1_epi8(hev_thresh);                                 \
-  const __m128i t_3 = _mm_subs_epu8(t_1, h);  /* abs(p1 - p0) - hev_tresh */   \
-  const __m128i t_4 = _mm_subs_epu8(t_2, h);  /* abs(q1 - q0) - hev_tresh */   \
+  const __m128i t3 = _mm_subs_epu8(t1, h);  /* abs(p1 - p0) - hev_thresh */    \
+  const __m128i t4 = _mm_subs_epu8(t2, h);  /* abs(q1 - q0) - hev_thresh */    \
                                                                                \
-  not_hev = _mm_or_si128(t_3, t_4);                                            \
+  not_hev = _mm_or_si128(t3, t4);                                              \
   not_hev = _mm_cmpeq_epi8(not_hev, zero); /* not_hev <= t1 && not_hev <= t2 */\
 }
 
@@ -362,13 +314,13 @@ static void TransformAC3SSE2(const int16_t* in, uint8_t* dst) {
 
 // Updates values of 2 pixels at MB edge during complex filtering.
 // Update operations:
-// q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)]
+// q = q - a and p = p + a; where a = [(a_hi >> 7), (a_lo >> 7)]
 #define UPDATE_2PIXELS(pi, qi, a_lo, a_hi) {                                   \
   const __m128i a_lo7 = _mm_srai_epi16(a_lo, 7);                               \
   const __m128i a_hi7 = _mm_srai_epi16(a_hi, 7);                               \
-  const __m128i delta = _mm_packs_epi16(a_lo7, a_hi7);                         \
-  pi = _mm_adds_epi8(pi, delta);                                               \
-  qi = _mm_subs_epi8(qi, delta);                                               \
+  const __m128i a = _mm_packs_epi16(a_lo7, a_hi7);                             \
+  pi = _mm_adds_epi8(pi, a);                                                   \
+  qi = _mm_subs_epi8(qi, a);                                                   \
 }
 
 static void NeedsFilter(const __m128i* p1, const __m128i* p0, const __m128i* q0,
@@ -924,19 +876,10 @@ static void HFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
   Store16x4(u, v, stride, &p1, &p0, &q0, &q1);
 }
 
-#endif   // WEBP_USE_SSE2
-
-//------------------------------------------------------------------------------
-// Entry point
-
 extern void VP8DspInitSSE2(void);
 
 void VP8DspInitSSE2(void) {
-#if defined(WEBP_USE_SSE2)
   VP8Transform = TransformSSE2;
-#if defined(USE_TRANSFORM_AC3)
-  VP8TransformAC3 = TransformAC3SSE2;
-#endif
 
   VP8VFilter16 = VFilter16SSE2;
   VP8HFilter16 = HFilter16SSE2;
@@ -951,6 +894,10 @@ void VP8DspInitSSE2(void) {
   VP8SimpleHFilter16 = SimpleHFilter16SSE2;
   VP8SimpleVFilter16i = SimpleVFilter16iSSE2;
   VP8SimpleHFilter16i = SimpleHFilter16iSSE2;
-#endif   // WEBP_USE_SSE2
 }
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif   // WEBP_USE_SSE2
diff --git a/drivers/webp/dsp/dsp.h b/drivers/webp/dsp/dsp.h
index 3be783afe7..042c98aad2 100644
--- a/drivers/webp/dsp/dsp.h
+++ b/drivers/webp/dsp/dsp.h
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 //   Speed-critical functions.
@@ -16,15 +14,14 @@
 
 #include "../webp/types.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
 //------------------------------------------------------------------------------
 // CPU detection
 
-#if defined(_MSC_VER) && _MSC_VER > 1310 && \
-    (defined(_M_X64) || defined(_M_IX86))
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
 #define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
 #endif
 
@@ -36,7 +33,7 @@ extern "C" {
 #define WEBP_ANDROID_NEON  // Android targets that might support NEON
 #endif
 
-#if defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON)
+#if (defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON)) && !defined(PSP2_ENABLED)
 #define WEBP_USE_NEON
 #endif
 
@@ -52,6 +49,8 @@ extern VP8CPUInfo VP8GetCPUInfo;
 //------------------------------------------------------------------------------
 // Encoding
 
+int VP8GetAlpha(const int histo[]);
+
 // Transforms
 // VP8Idct: Does one of two inverse transforms. If do_two is set, the transforms
 //          will be done for (ref, in, dst) and (ref + 4, in + 16, dst + 4).
@@ -86,16 +85,10 @@ typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
                                 int n, const struct VP8Matrix* const mtx);
 extern VP8QuantizeBlock VP8EncQuantizeBlock;
 
-// specific to 2nd transform:
-typedef int (*VP8QuantizeBlockWHT)(int16_t in[16], int16_t out[16],
-                                   const struct VP8Matrix* const mtx);
-extern VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
-
-// Collect histogram for susceptibility calculation and accumulate in histo[].
-struct VP8Histogram;
-typedef void (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
-                          int start_block, int end_block,
-                          struct VP8Histogram* const histo);
+// Compute susceptibility based on DCT-coeff histograms:
+// the higher, the "easier" the macroblock is to compress.
+typedef int (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
+                         int start_block, int end_block);
 extern const int VP8DspScan[16 + 4 + 4];
 extern VP8CHisto VP8CollectHistogram;
 
@@ -108,11 +101,10 @@ typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst);
 // when doing two transforms, coeffs is actually int16_t[2][16].
 typedef void (*VP8DecIdct2)(const int16_t* coeffs, uint8_t* dst, int do_two);
 extern VP8DecIdct2 VP8Transform;
-extern VP8DecIdct VP8TransformAC3;
 extern VP8DecIdct VP8TransformUV;
 extern VP8DecIdct VP8TransformDC;
 extern VP8DecIdct VP8TransformDCUV;
-extern VP8WHT VP8TransformWHT;
+extern void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
 
 // *dst is the destination block, with stride BPS. Boundary samples are
 // assumed accessible when needed.
@@ -153,8 +145,6 @@ void VP8DspInit(void);
 
 #define FANCY_UPSAMPLING   // undefined to remove fancy upsampling support
 
-// Convert a pair of y/u/v lines together to the output rgb/a colorspace.
-// bottom_y can be NULL if only one line of output is needed (at top/bottom).
 typedef void (*WebPUpsampleLinePairFunc)(
     const uint8_t* top_y, const uint8_t* bottom_y,
     const uint8_t* top_u, const uint8_t* top_v,
@@ -169,9 +159,6 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
 // Initializes SSE2 version of the fancy upsamplers.
 void WebPInitUpsamplersSSE2(void);
 
-// NEON version
-void WebPInitUpsamplersNEON(void);
-
 #endif    // FANCY_UPSAMPLING
 
 // Point-sampling methods.
@@ -213,11 +200,10 @@ extern void (*WebPApplyAlphaMultiply4444)(
 void WebPInitPremultiply(void);
 
 void WebPInitPremultiplySSE2(void);   // should not be called directly.
-void WebPInitPremultiplyNEON(void);
 
 //------------------------------------------------------------------------------
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/dsp/enc.c b/drivers/webp/dsp/enc.c
index fcc6ec8ea2..02234564be 100644
--- a/drivers/webp/dsp/enc.c
+++ b/drivers/webp/dsp/enc.c
@@ -1,34 +1,47 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Speed-critical encoding functions.
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include <assert.h>
 #include <stdlib.h>  // for abs()
-
 #include "./dsp.h"
 #include "../enc/vp8enci.h"
 
-static WEBP_INLINE uint8_t clip_8b(int v) {
-  return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
-}
-
-static WEBP_INLINE int clip_max(int v, int max) {
-  return (v > max) ? max : v;
-}
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
 
 //------------------------------------------------------------------------------
 // Compute susceptibility based on DCT-coeff histograms:
 // the higher, the "easier" the macroblock is to compress.
 
+static int ClipAlpha(int alpha) {
+  return alpha < 0 ? 0 : alpha > 255 ? 255 : alpha;
+}
+
+int VP8GetAlpha(const int histo[MAX_COEFF_THRESH + 1]) {
+  int num = 0, den = 0, val = 0;
+  int k;
+  int alpha;
+  // note: changing this loop to avoid the numerous "k + 1" slows things down.
+  for (k = 0; k < MAX_COEFF_THRESH; ++k) {
+    if (histo[k + 1]) {
+      val += histo[k + 1];
+      num += val * (k + 1);
+      den += (k + 1) * (k + 1);
+    }
+  }
+  // we scale the value to a usable [0..255] range
+  alpha = den ? 10 * num / den - 5 : 0;
+  return ClipAlpha(alpha);
+}
+
 const int VP8DspScan[16 + 4 + 4] = {
   // Luma
   0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
@@ -40,23 +53,27 @@ const int VP8DspScan[16 + 4 + 4] = {
   8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
 };
 
-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
-                             int start_block, int end_block,
-                             VP8Histogram* const histo) {
-  int j;
+static int CollectHistogram(const uint8_t* ref, const uint8_t* pred,
+                            int start_block, int end_block) {
+  int histo[MAX_COEFF_THRESH + 1] = { 0 };
+  int16_t out[16];
+  int j, k;
   for (j = start_block; j < end_block; ++j) {
-    int k;
-    int16_t out[16];
-
     VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
 
-    // Convert coefficients to bin.
+    // Convert coefficients to bin (within out[]).
+    for (k = 0; k < 16; ++k) {
+      const int v = abs(out[k]) >> 2;
+      out[k] = (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v;
+    }
+
+    // Use bin to update histogram.
     for (k = 0; k < 16; ++k) {
-      const int v = abs(out[k]) >> 3;  // TODO(skal): add rounding?
-      const int clipped_value = clip_max(v, MAX_COEFF_THRESH);
-      histo->distribution[clipped_value]++;
+      histo[out[k]]++;
     }
   }
+
+  return VP8GetAlpha(histo);
 }
 
 //------------------------------------------------------------------------------
@@ -72,12 +89,15 @@ static void InitTables(void) {
   if (!tables_ok) {
     int i;
     for (i = -255; i <= 255 + 255; ++i) {
-      clip1[255 + i] = clip_8b(i);
+      clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : i;
     }
     tables_ok = 1;
   }
 }
 
+static WEBP_INLINE uint8_t clip_8b(int v) {
+  return (!(v & ~0xff)) ? v : v < 0 ? 0 : 255;
+}
 
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
@@ -134,25 +154,25 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
   int i;
   int tmp[16];
   for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
-    const int d0 = src[0] - ref[0];   // 9bit dynamic range ([-255,255])
+    const int d0 = src[0] - ref[0];
     const int d1 = src[1] - ref[1];
     const int d2 = src[2] - ref[2];
     const int d3 = src[3] - ref[3];
-    const int a0 = (d0 + d3);         // 10b                      [-510,510]
-    const int a1 = (d1 + d2);
-    const int a2 = (d1 - d2);
-    const int a3 = (d0 - d3);
-    tmp[0 + i * 4] = (a0 + a1) * 8;   // 14b                      [-8160,8160]
-    tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9;      // [-7536,7542]
-    tmp[2 + i * 4] = (a0 - a1) * 8;
-    tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  937) >> 9;
+    const int a0 = (d0 + d3) << 3;
+    const int a1 = (d1 + d2) << 3;
+    const int a2 = (d1 - d2) << 3;
+    const int a3 = (d0 - d3) << 3;
+    tmp[0 + i * 4] = (a0 + a1);
+    tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 14500) >> 12;
+    tmp[2 + i * 4] = (a0 - a1);
+    tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  7500) >> 12;
   }
   for (i = 0; i < 4; ++i) {
-    const int a0 = (tmp[0 + i] + tmp[12 + i]);  // 15b
+    const int a0 = (tmp[0 + i] + tmp[12 + i]);
     const int a1 = (tmp[4 + i] + tmp[ 8 + i]);
     const int a2 = (tmp[4 + i] - tmp[ 8 + i]);
     const int a3 = (tmp[0 + i] - tmp[12 + i]);
-    out[0 + i] = (a0 + a1 + 7) >> 4;            // 12b
+    out[0 + i] = (a0 + a1 + 7) >> 4;
     out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
     out[8 + i] = (a0 - a1 + 7) >> 4;
     out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
@@ -187,32 +207,31 @@ static void ITransformWHT(const int16_t* in, int16_t* out) {
 }
 
 static void FTransformWHT(const int16_t* in, int16_t* out) {
-  // input is 12b signed
-  int32_t tmp[16];
+  int tmp[16];
   int i;
   for (i = 0; i < 4; ++i, in += 64) {
-    const int a0 = (in[0 * 16] + in[2 * 16]);  // 13b
-    const int a1 = (in[1 * 16] + in[3 * 16]);
-    const int a2 = (in[1 * 16] - in[3 * 16]);
-    const int a3 = (in[0 * 16] - in[2 * 16]);
-    tmp[0 + i * 4] = a0 + a1;   // 14b
+    const int a0 = (in[0 * 16] + in[2 * 16]) << 2;
+    const int a1 = (in[1 * 16] + in[3 * 16]) << 2;
+    const int a2 = (in[1 * 16] - in[3 * 16]) << 2;
+    const int a3 = (in[0 * 16] - in[2 * 16]) << 2;
+    tmp[0 + i * 4] = (a0 + a1) + (a0 != 0);
     tmp[1 + i * 4] = a3 + a2;
     tmp[2 + i * 4] = a3 - a2;
     tmp[3 + i * 4] = a0 - a1;
   }
   for (i = 0; i < 4; ++i) {
-    const int a0 = (tmp[0 + i] + tmp[8 + i]);  // 15b
+    const int a0 = (tmp[0 + i] + tmp[8 + i]);
     const int a1 = (tmp[4 + i] + tmp[12+ i]);
     const int a2 = (tmp[4 + i] - tmp[12+ i]);
     const int a3 = (tmp[0 + i] - tmp[8 + i]);
-    const int b0 = a0 + a1;    // 16b
+    const int b0 = a0 + a1;
     const int b1 = a3 + a2;
     const int b2 = a3 - a2;
     const int b3 = a0 - a1;
-    out[ 0 + i] = b0 >> 1;     // 15b
-    out[ 4 + i] = b1 >> 1;
-    out[ 8 + i] = b2 >> 1;
-    out[12 + i] = b3 >> 1;
+    out[ 0 + i] = (b0 + (b0 > 0) + 3) >> 3;
+    out[ 4 + i] = (b1 + (b1 > 0) + 3) >> 3;
+    out[ 8 + i] = (b2 + (b2 > 0) + 3) >> 3;
+    out[12 + i] = (b3 + (b3 > 0) + 3) >> 3;
   }
 }
 
@@ -570,30 +589,30 @@ static int TTransform(const uint8_t* in, const uint16_t* w) {
   int i;
   // horizontal pass
   for (i = 0; i < 4; ++i, in += BPS) {
-    const int a0 = in[0] + in[2];
-    const int a1 = in[1] + in[3];
-    const int a2 = in[1] - in[3];
-    const int a3 = in[0] - in[2];
-    tmp[0 + i * 4] = a0 + a1;
+    const int a0 = (in[0] + in[2]) << 2;
+    const int a1 = (in[1] + in[3]) << 2;
+    const int a2 = (in[1] - in[3]) << 2;
+    const int a3 = (in[0] - in[2]) << 2;
+    tmp[0 + i * 4] = a0 + a1 + (a0 != 0);
     tmp[1 + i * 4] = a3 + a2;
     tmp[2 + i * 4] = a3 - a2;
     tmp[3 + i * 4] = a0 - a1;
   }
   // vertical pass
   for (i = 0; i < 4; ++i, ++w) {
-    const int a0 = tmp[0 + i] + tmp[8 + i];
-    const int a1 = tmp[4 + i] + tmp[12+ i];
-    const int a2 = tmp[4 + i] - tmp[12+ i];
-    const int a3 = tmp[0 + i] - tmp[8 + i];
+    const int a0 = (tmp[0 + i] + tmp[8 + i]);
+    const int a1 = (tmp[4 + i] + tmp[12+ i]);
+    const int a2 = (tmp[4 + i] - tmp[12+ i]);
+    const int a3 = (tmp[0 + i] - tmp[8 + i]);
     const int b0 = a0 + a1;
     const int b1 = a3 + a2;
     const int b2 = a3 - a2;
     const int b3 = a0 - a1;
-
-    sum += w[ 0] * abs(b0);
-    sum += w[ 4] * abs(b1);
-    sum += w[ 8] * abs(b2);
-    sum += w[12] * abs(b3);
+    // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3
+    sum += w[ 0] * ((abs(b0) + 3) >> 3);
+    sum += w[ 4] * ((abs(b1) + 3) >> 3);
+    sum += w[ 8] * ((abs(b2) + 3) >> 3);
+    sum += w[12] * ((abs(b3) + 3) >> 3);
   }
   return sum;
 }
@@ -602,7 +621,7 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
                     const uint16_t* const w) {
   const int sum1 = TTransform(a, w);
   const int sum2 = TTransform(b, w);
-  return abs(sum2 - sum1) >> 5;
+  return (abs(sum2 - sum1) + 8) >> 4;
 }
 
 static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
@@ -632,38 +651,13 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
   for (; n < 16; ++n) {
     const int j = kZigzag[n];
     const int sign = (in[j] < 0);
-    const int coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
+    int coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
+    if (coeff > 2047) coeff = 2047;
     if (coeff > mtx->zthresh_[j]) {
       const int Q = mtx->q_[j];
       const int iQ = mtx->iq_[j];
       const int B = mtx->bias_[j];
       out[n] = QUANTDIV(coeff, iQ, B);
-      if (out[n] > MAX_LEVEL) out[n] = MAX_LEVEL;
-      if (sign) out[n] = -out[n];
-      in[j] = out[n] * Q;
-      if (out[n]) last = n;
-    } else {
-      out[n] = 0;
-      in[j] = 0;
-    }
-  }
-  return (last >= 0);
-}
-
-static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
-                            const VP8Matrix* const mtx) {
-  int n, last = -1;
-  for (n = 0; n < 16; ++n) {
-    const int j = kZigzag[n];
-    const int sign = (in[j] < 0);
-    const int coeff = sign ? -in[j] : in[j];
-    assert(mtx->sharpen_[j] == 0);
-    if (coeff > mtx->zthresh_[j]) {
-      const int Q = mtx->q_[j];
-      const int iQ = mtx->iq_[j];
-      const int B = mtx->bias_[j];
-      out[n] = QUANTDIV(coeff, iQ, B);
-      if (out[n] > MAX_LEVEL) out[n] = MAX_LEVEL;
       if (sign) out[n] = -out[n];
       in[j] = out[n] * Q;
       if (out[n]) last = n;
@@ -709,11 +703,9 @@ VP8Metric VP8SSE4x4;
 VP8WMetric VP8TDisto4x4;
 VP8WMetric VP8TDisto16x16;
 VP8QuantizeBlock VP8EncQuantizeBlock;
-VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
 VP8BlockCopy VP8Copy4x4;
 
 extern void VP8EncDspInitSSE2(void);
-extern void VP8EncDspInitNEON(void);
 
 void VP8EncDspInit(void) {
   InitTables();
@@ -734,7 +726,6 @@ void VP8EncDspInit(void) {
   VP8TDisto4x4 = Disto4x4;
   VP8TDisto16x16 = Disto16x16;
   VP8EncQuantizeBlock = QuantizeBlock;
-  VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
   VP8Copy4x4 = Copy4x4;
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
@@ -743,11 +734,10 @@ void VP8EncDspInit(void) {
     if (VP8GetCPUInfo(kSSE2)) {
       VP8EncDspInitSSE2();
     }
-#elif defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      VP8EncDspInitNEON();
-    }
 #endif
   }
 }
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/dsp/enc_neon.c b/drivers/webp/dsp/enc_neon.c
deleted file mode 100644
index 52cca18682..0000000000
--- a/drivers/webp/dsp/enc_neon.c
+++ /dev/null
@@ -1,632 +0,0 @@
-// Copyright 2012 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// ARM NEON version of speed-critical encoding functions.
-//
-// adapted from libvpx (http://www.webmproject.org/code/)
-
-#include "./dsp.h"
-
-#if defined(WEBP_USE_NEON)
-
-#include "../enc/vp8enci.h"
-
-//------------------------------------------------------------------------------
-// Transforms (Paragraph 14.4)
-
-// Inverse transform.
-// This code is pretty much the same as TransformOneNEON in the decoder, except
-// for subtraction to *ref. See the comments there for algorithmic explanations.
-static void ITransformOne(const uint8_t* ref,
-                          const int16_t* in, uint8_t* dst) {
-  const int kBPS = BPS;
-  const int16_t kC1C2[] = { 20091, 17734, 0, 0 };  // kC1 / (kC2 >> 1) / 0 / 0
-
-  __asm__ volatile (
-    "vld1.16         {q1, q2}, [%[in]]           \n"
-    "vld1.16         {d0}, [%[kC1C2]]            \n"
-
-    // d2: in[0]
-    // d3: in[8]
-    // d4: in[4]
-    // d5: in[12]
-    "vswp            d3, d4                      \n"
-
-    // q8 = {in[4], in[12]} * kC1 * 2 >> 16
-    // q9 = {in[4], in[12]} * kC2 >> 16
-    "vqdmulh.s16     q8, q2, d0[0]               \n"
-    "vqdmulh.s16     q9, q2, d0[1]               \n"
-
-    // d22 = a = in[0] + in[8]
-    // d23 = b = in[0] - in[8]
-    "vqadd.s16       d22, d2, d3                 \n"
-    "vqsub.s16       d23, d2, d3                 \n"
-
-    //  q8 = in[4]/[12] * kC1 >> 16
-    "vshr.s16        q8, q8, #1                  \n"
-
-    // Add {in[4], in[12]} back after the multiplication.
-    "vqadd.s16       q8, q2, q8                  \n"
-
-    // d20 = c = in[4]*kC2 - in[12]*kC1
-    // d21 = d = in[4]*kC1 + in[12]*kC2
-    "vqsub.s16       d20, d18, d17               \n"
-    "vqadd.s16       d21, d19, d16               \n"
-
-    // d2 = tmp[0] = a + d
-    // d3 = tmp[1] = b + c
-    // d4 = tmp[2] = b - c
-    // d5 = tmp[3] = a - d
-    "vqadd.s16       d2, d22, d21                \n"
-    "vqadd.s16       d3, d23, d20                \n"
-    "vqsub.s16       d4, d23, d20                \n"
-    "vqsub.s16       d5, d22, d21                \n"
-
-    "vzip.16         q1, q2                      \n"
-    "vzip.16         q1, q2                      \n"
-
-    "vswp            d3, d4                      \n"
-
-    // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
-    // q9 = {tmp[4], tmp[12]} * kC2 >> 16
-    "vqdmulh.s16     q8, q2, d0[0]               \n"
-    "vqdmulh.s16     q9, q2, d0[1]               \n"
-
-    // d22 = a = tmp[0] + tmp[8]
-    // d23 = b = tmp[0] - tmp[8]
-    "vqadd.s16       d22, d2, d3                 \n"
-    "vqsub.s16       d23, d2, d3                 \n"
-
-    "vshr.s16        q8, q8, #1                  \n"
-    "vqadd.s16       q8, q2, q8                  \n"
-
-    // d20 = c = in[4]*kC2 - in[12]*kC1
-    // d21 = d = in[4]*kC1 + in[12]*kC2
-    "vqsub.s16       d20, d18, d17               \n"
-    "vqadd.s16       d21, d19, d16               \n"
-
-    // d2 = tmp[0] = a + d
-    // d3 = tmp[1] = b + c
-    // d4 = tmp[2] = b - c
-    // d5 = tmp[3] = a - d
-    "vqadd.s16       d2, d22, d21                \n"
-    "vqadd.s16       d3, d23, d20                \n"
-    "vqsub.s16       d4, d23, d20                \n"
-    "vqsub.s16       d5, d22, d21                \n"
-
-    "vld1.32         d6[0], [%[ref]], %[kBPS]    \n"
-    "vld1.32         d6[1], [%[ref]], %[kBPS]    \n"
-    "vld1.32         d7[0], [%[ref]], %[kBPS]    \n"
-    "vld1.32         d7[1], [%[ref]], %[kBPS]    \n"
-
-    "sub         %[ref], %[ref], %[kBPS], lsl #2 \n"
-
-    // (val) + 4 >> 3
-    "vrshr.s16       d2, d2, #3                  \n"
-    "vrshr.s16       d3, d3, #3                  \n"
-    "vrshr.s16       d4, d4, #3                  \n"
-    "vrshr.s16       d5, d5, #3                  \n"
-
-    "vzip.16         q1, q2                      \n"
-    "vzip.16         q1, q2                      \n"
-
-    // Must accumulate before saturating
-    "vmovl.u8        q8, d6                      \n"
-    "vmovl.u8        q9, d7                      \n"
-
-    "vqadd.s16       q1, q1, q8                  \n"
-    "vqadd.s16       q2, q2, q9                  \n"
-
-    "vqmovun.s16     d0, q1                      \n"
-    "vqmovun.s16     d1, q2                      \n"
-
-    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
-    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
-    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
-    "vst1.32         d1[1], [%[dst]]             \n"
-
-    : [in] "+r"(in), [dst] "+r"(dst)               // modified registers
-    : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref)  // constants
-    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  // clobbered
-  );
-}
-
-static void ITransform(const uint8_t* ref,
-                       const int16_t* in, uint8_t* dst, int do_two) {
-  ITransformOne(ref, in, dst);
-  if (do_two) {
-    ITransformOne(ref + 4, in + 16, dst + 4);
-  }
-}
-
-// Same code as dec_neon.c
-static void ITransformWHT(const int16_t* in, int16_t* out) {
-  const int kStep = 32;  // The store is only incrementing the pointer as if we
-                         // had stored a single byte.
-  __asm__ volatile (
-    // part 1
-    // load data into q0, q1
-    "vld1.16         {q0, q1}, [%[in]]           \n"
-
-    "vaddl.s16       q2, d0, d3                  \n" // a0 = in[0] + in[12]
-    "vaddl.s16       q3, d1, d2                  \n" // a1 = in[4] + in[8]
-    "vsubl.s16       q4, d1, d2                  \n" // a2 = in[4] - in[8]
-    "vsubl.s16       q5, d0, d3                  \n" // a3 = in[0] - in[12]
-
-    "vadd.s32        q0, q2, q3                  \n" // tmp[0] = a0 + a1
-    "vsub.s32        q2, q2, q3                  \n" // tmp[8] = a0 - a1
-    "vadd.s32        q1, q5, q4                  \n" // tmp[4] = a3 + a2
-    "vsub.s32        q3, q5, q4                  \n" // tmp[12] = a3 - a2
-
-    // Transpose
-    // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14]
-    // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15]
-    "vswp            d1, d4                      \n" // vtrn.64 q0, q2
-    "vswp            d3, d6                      \n" // vtrn.64 q1, q3
-    "vtrn.32         q0, q1                      \n"
-    "vtrn.32         q2, q3                      \n"
-
-    "vmov.s32        q4, #3                      \n" // dc = 3
-    "vadd.s32        q0, q0, q4                  \n" // dc = tmp[0] + 3
-    "vadd.s32        q6, q0, q3                  \n" // a0 = dc + tmp[3]
-    "vadd.s32        q7, q1, q2                  \n" // a1 = tmp[1] + tmp[2]
-    "vsub.s32        q8, q1, q2                  \n" // a2 = tmp[1] - tmp[2]
-    "vsub.s32        q9, q0, q3                  \n" // a3 = dc - tmp[3]
-
-    "vadd.s32        q0, q6, q7                  \n"
-    "vshrn.s32       d0, q0, #3                  \n" // (a0 + a1) >> 3
-    "vadd.s32        q1, q9, q8                  \n"
-    "vshrn.s32       d1, q1, #3                  \n" // (a3 + a2) >> 3
-    "vsub.s32        q2, q6, q7                  \n"
-    "vshrn.s32       d2, q2, #3                  \n" // (a0 - a1) >> 3
-    "vsub.s32        q3, q9, q8                  \n"
-    "vshrn.s32       d3, q3, #3                  \n" // (a3 - a2) >> 3
-
-    // set the results to output
-    "vst1.16         d0[0], [%[out]], %[kStep]      \n"
-    "vst1.16         d1[0], [%[out]], %[kStep]      \n"
-    "vst1.16         d2[0], [%[out]], %[kStep]      \n"
-    "vst1.16         d3[0], [%[out]], %[kStep]      \n"
-    "vst1.16         d0[1], [%[out]], %[kStep]      \n"
-    "vst1.16         d1[1], [%[out]], %[kStep]      \n"
-    "vst1.16         d2[1], [%[out]], %[kStep]      \n"
-    "vst1.16         d3[1], [%[out]], %[kStep]      \n"
-    "vst1.16         d0[2], [%[out]], %[kStep]      \n"
-    "vst1.16         d1[2], [%[out]], %[kStep]      \n"
-    "vst1.16         d2[2], [%[out]], %[kStep]      \n"
-    "vst1.16         d3[2], [%[out]], %[kStep]      \n"
-    "vst1.16         d0[3], [%[out]], %[kStep]      \n"
-    "vst1.16         d1[3], [%[out]], %[kStep]      \n"
-    "vst1.16         d2[3], [%[out]], %[kStep]      \n"
-    "vst1.16         d3[3], [%[out]], %[kStep]      \n"
-
-    : [out] "+r"(out)  // modified registers
-    : [in] "r"(in), [kStep] "r"(kStep)  // constants
-    : "memory", "q0", "q1", "q2", "q3", "q4",
-      "q5", "q6", "q7", "q8", "q9" // clobbered
-  );
-}
-
-// Forward transform.
-
-// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
-static const int16_t kCoeff16[] = {
-  5352,  5352,  5352, 5352, 2217,  2217,  2217, 2217
-};
-static const int32_t kCoeff32[] = {
-   1812,  1812,  1812,  1812,
-    937,   937,   937,   937,
-  12000, 12000, 12000, 12000,
-  51000, 51000, 51000, 51000
-};
-
-static void FTransform(const uint8_t* src, const uint8_t* ref,
-                       int16_t* out) {
-  const int kBPS = BPS;
-  const uint8_t* src_ptr = src;
-  const uint8_t* ref_ptr = ref;
-  const int16_t* coeff16 = kCoeff16;
-  const int32_t* coeff32 = kCoeff32;
-
-  __asm__ volatile (
-    // load src into q4, q5 in high half
-    "vld1.8 {d8},  [%[src_ptr]], %[kBPS]      \n"
-    "vld1.8 {d10}, [%[src_ptr]], %[kBPS]      \n"
-    "vld1.8 {d9},  [%[src_ptr]], %[kBPS]      \n"
-    "vld1.8 {d11}, [%[src_ptr]]               \n"
-
-    // load ref into q6, q7 in high half
-    "vld1.8 {d12}, [%[ref_ptr]], %[kBPS]      \n"
-    "vld1.8 {d14}, [%[ref_ptr]], %[kBPS]      \n"
-    "vld1.8 {d13}, [%[ref_ptr]], %[kBPS]      \n"
-    "vld1.8 {d15}, [%[ref_ptr]]               \n"
-
-    // Pack the high values in to q4 and q6
-    "vtrn.32     q4, q5                       \n"
-    "vtrn.32     q6, q7                       \n"
-
-    // d[0-3] = src - ref
-    "vsubl.u8    q0, d8, d12                  \n"
-    "vsubl.u8    q1, d9, d13                  \n"
-
-    // load coeff16 into q8(d16=5352, d17=2217)
-    "vld1.16     {q8}, [%[coeff16]]           \n"
-
-    // load coeff32 high half into q9 = 1812, q10 = 937
-    "vld1.32     {q9, q10}, [%[coeff32]]!     \n"
-
-    // load coeff32 low half into q11=12000, q12=51000
-    "vld1.32     {q11,q12}, [%[coeff32]]      \n"
-
-    // part 1
-    // Transpose. Register dN is the same as dN in C
-    "vtrn.32         d0, d2                   \n"
-    "vtrn.32         d1, d3                   \n"
-    "vtrn.16         d0, d1                   \n"
-    "vtrn.16         d2, d3                   \n"
-
-    "vadd.s16        d4, d0, d3               \n" // a0 = d0 + d3
-    "vadd.s16        d5, d1, d2               \n" // a1 = d1 + d2
-    "vsub.s16        d6, d1, d2               \n" // a2 = d1 - d2
-    "vsub.s16        d7, d0, d3               \n" // a3 = d0 - d3
-
-    "vadd.s16        d0, d4, d5               \n" // a0 + a1
-    "vshl.s16        d0, d0, #3               \n" // temp[0+i*4] = (a0+a1) << 3
-    "vsub.s16        d2, d4, d5               \n" // a0 - a1
-    "vshl.s16        d2, d2, #3               \n" // (temp[2+i*4] = (a0-a1) << 3
-
-    "vmlal.s16       q9, d7, d16              \n" // a3*5352 + 1812
-    "vmlal.s16       q10, d7, d17             \n" // a3*2217 + 937
-    "vmlal.s16       q9, d6, d17              \n" // a2*2217 + a3*5352 + 1812
-    "vmlsl.s16       q10, d6, d16             \n" // a3*2217 + 937 - a2*5352
-
-    // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
-    // temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
-    "vshrn.s32       d1, q9, #9               \n"
-    "vshrn.s32       d3, q10, #9              \n"
-
-    // part 2
-    // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
-    "vtrn.32         d0, d2                   \n"
-    "vtrn.32         d1, d3                   \n"
-    "vtrn.16         d0, d1                   \n"
-    "vtrn.16         d2, d3                   \n"
-
-    "vmov.s16        d26, #7                  \n"
-
-    "vadd.s16        d4, d0, d3               \n" // a1 = ip[0] + ip[12]
-    "vadd.s16        d5, d1, d2               \n" // b1 = ip[4] + ip[8]
-    "vsub.s16        d6, d1, d2               \n" // c1 = ip[4] - ip[8]
-    "vadd.s16        d4, d4, d26              \n" // a1 + 7
-    "vsub.s16        d7, d0, d3               \n" // d1 = ip[0] - ip[12]
-
-    "vadd.s16        d0, d4, d5               \n" // op[0] = a1 + b1 + 7
-    "vsub.s16        d2, d4, d5               \n" // op[8] = a1 - b1 + 7
-
-    "vmlal.s16       q11, d7, d16             \n" // d1*5352 + 12000
-    "vmlal.s16       q12, d7, d17             \n" // d1*2217 + 51000
-
-    "vceq.s16        d4, d7, #0               \n"
-
-    "vshr.s16        d0, d0, #4               \n"
-    "vshr.s16        d2, d2, #4               \n"
-
-    "vmlal.s16       q11, d6, d17             \n" // c1*2217 + d1*5352 + 12000
-    "vmlsl.s16       q12, d6, d16             \n" // d1*2217 - c1*5352 + 51000
-
-    "vmvn            d4, d4                   \n" // !(d1 == 0)
-    // op[4] = (c1*2217 + d1*5352 + 12000)>>16
-    "vshrn.s32       d1, q11, #16             \n"
-    // op[4] += (d1!=0)
-    "vsub.s16        d1, d1, d4               \n"
-    // op[12]= (d1*2217 - c1*5352 + 51000)>>16
-    "vshrn.s32       d3, q12, #16             \n"
-
-    // set result to out array
-    "vst1.16         {q0, q1}, [%[out]]   \n"
-    : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
-      [coeff32] "+r"(coeff32)          // modified registers
-    : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
-      [out] "r"(out)                   // constants
-    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
-      "q10", "q11", "q12", "q13"       // clobbered
-  );
-}
-
-static void FTransformWHT(const int16_t* in, int16_t* out) {
-  const int kStep = 32;
-  __asm__ volatile (
-    // d0 = in[0 * 16] , d1 = in[1 * 16]
-    // d2 = in[2 * 16] , d3 = in[3 * 16]
-    "vld1.16         d0[0], [%[in]], %[kStep]   \n"
-    "vld1.16         d1[0], [%[in]], %[kStep]   \n"
-    "vld1.16         d2[0], [%[in]], %[kStep]   \n"
-    "vld1.16         d3[0], [%[in]], %[kStep]   \n"
-    "vld1.16         d0[1], [%[in]], %[kStep]   \n"
-    "vld1.16         d1[1], [%[in]], %[kStep]   \n"
-    "vld1.16         d2[1], [%[in]], %[kStep]   \n"
-    "vld1.16         d3[1], [%[in]], %[kStep]   \n"
-    "vld1.16         d0[2], [%[in]], %[kStep]   \n"
-    "vld1.16         d1[2], [%[in]], %[kStep]   \n"
-    "vld1.16         d2[2], [%[in]], %[kStep]   \n"
-    "vld1.16         d3[2], [%[in]], %[kStep]   \n"
-    "vld1.16         d0[3], [%[in]], %[kStep]   \n"
-    "vld1.16         d1[3], [%[in]], %[kStep]   \n"
-    "vld1.16         d2[3], [%[in]], %[kStep]   \n"
-    "vld1.16         d3[3], [%[in]], %[kStep]   \n"
-
-    "vaddl.s16       q2, d0, d2                 \n" // a0=(in[0*16]+in[2*16])
-    "vaddl.s16       q3, d1, d3                 \n" // a1=(in[1*16]+in[3*16])
-    "vsubl.s16       q4, d1, d3                 \n" // a2=(in[1*16]-in[3*16])
-    "vsubl.s16       q5, d0, d2                 \n" // a3=(in[0*16]-in[2*16])
-
-    "vqadd.s32       q6, q2, q3                 \n" // a0 + a1
-    "vqadd.s32       q7, q5, q4                 \n" // a3 + a2
-    "vqsub.s32       q8, q5, q4                 \n" // a3 - a2
-    "vqsub.s32       q9, q2, q3                 \n" // a0 - a1
-
-    // Transpose
-    // q6 = tmp[0, 1,  2,  3] ; q7 = tmp[ 4,  5,  6,  7]
-    // q8 = tmp[8, 9, 10, 11] ; q9 = tmp[12, 13, 14, 15]
-    "vswp            d13, d16                   \n" // vtrn.64 q0, q2
-    "vswp            d15, d18                   \n" // vtrn.64 q1, q3
-    "vtrn.32         q6, q7                     \n"
-    "vtrn.32         q8, q9                     \n"
-
-    "vqadd.s32       q0, q6, q8                 \n" // a0 = tmp[0] + tmp[8]
-    "vqadd.s32       q1, q7, q9                 \n" // a1 = tmp[4] + tmp[12]
-    "vqsub.s32       q2, q7, q9                 \n" // a2 = tmp[4] - tmp[12]
-    "vqsub.s32       q3, q6, q8                 \n" // a3 = tmp[0] - tmp[8]
-
-    "vqadd.s32       q4, q0, q1                 \n" // b0 = a0 + a1
-    "vqadd.s32       q5, q3, q2                 \n" // b1 = a3 + a2
-    "vqsub.s32       q6, q3, q2                 \n" // b2 = a3 - a2
-    "vqsub.s32       q7, q0, q1                 \n" // b3 = a0 - a1
-
-    "vshrn.s32       d18, q4, #1                \n" // b0 >> 1
-    "vshrn.s32       d19, q5, #1                \n" // b1 >> 1
-    "vshrn.s32       d20, q6, #1                \n" // b2 >> 1
-    "vshrn.s32       d21, q7, #1                \n" // b3 >> 1
-
-    "vst1.16         {q9, q10}, [%[out]]        \n"
-
-    : [in] "+r"(in)
-    : [kStep] "r"(kStep), [out] "r"(out)
-    : "memory", "q0", "q1", "q2", "q3", "q4", "q5",
-      "q6", "q7", "q8", "q9", "q10"       // clobbered
-  ) ;
-}
-
-//------------------------------------------------------------------------------
-// Texture distortion
-//
-// We try to match the spectral content (weighted) between source and
-// reconstructed samples.
-
-// Hadamard transform
-// Returns the weighted sum of the absolute value of transformed coefficients.
-// This uses a TTransform helper function in C
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
-                    const uint16_t* const w) {
-  const int kBPS = BPS;
-  const uint8_t* A = a;
-  const uint8_t* B = b;
-  const uint16_t* W = w;
-  int sum;
-  __asm__ volatile (
-    "vld1.32         d0[0], [%[a]], %[kBPS]   \n"
-    "vld1.32         d0[1], [%[a]], %[kBPS]   \n"
-    "vld1.32         d2[0], [%[a]], %[kBPS]   \n"
-    "vld1.32         d2[1], [%[a]]            \n"
-
-    "vld1.32         d1[0], [%[b]], %[kBPS]   \n"
-    "vld1.32         d1[1], [%[b]], %[kBPS]   \n"
-    "vld1.32         d3[0], [%[b]], %[kBPS]   \n"
-    "vld1.32         d3[1], [%[b]]            \n"
-
-    // a d0/d2, b d1/d3
-    // d0/d1: 01 01 01 01
-    // d2/d3: 23 23 23 23
-    // But: it goes 01 45 23 67
-    // Notice the middle values are transposed
-    "vtrn.16         q0, q1                   \n"
-
-    // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
-    "vaddl.u8        q2, d0, d2               \n"
-    "vaddl.u8        q10, d1, d3              \n"
-    // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
-    "vsubl.u8        q3, d0, d2               \n"
-    "vsubl.u8        q11, d1, d3              \n"
-
-    // tmp[0] = a0 + a1
-    "vpaddl.s16      q0, q2                   \n"
-    "vpaddl.s16      q8, q10                  \n"
-
-    // tmp[1] = a3 + a2
-    "vpaddl.s16      q1, q3                   \n"
-    "vpaddl.s16      q9, q11                  \n"
-
-    // No pair subtract
-    // q2 = {a0, a3}
-    // q3 = {a1, a2}
-    "vtrn.16         q2, q3                   \n"
-    "vtrn.16         q10, q11                 \n"
-
-    // {tmp[3], tmp[2]} = {a0 - a1, a3 - a2}
-    "vsubl.s16       q12, d4, d6              \n"
-    "vsubl.s16       q13, d5, d7              \n"
-    "vsubl.s16       q14, d20, d22            \n"
-    "vsubl.s16       q15, d21, d23            \n"
-
-    // separate tmp[3] and tmp[2]
-    // q12 = tmp[3]
-    // q13 = tmp[2]
-    "vtrn.32         q12, q13                 \n"
-    "vtrn.32         q14, q15                 \n"
-
-    // Transpose tmp for a
-    "vswp            d1, d26                  \n" // vtrn.64
-    "vswp            d3, d24                  \n" // vtrn.64
-    "vtrn.32         q0, q1                   \n"
-    "vtrn.32         q13, q12                 \n"
-
-    // Transpose tmp for b
-    "vswp            d17, d30                 \n" // vtrn.64
-    "vswp            d19, d28                 \n" // vtrn.64
-    "vtrn.32         q8, q9                   \n"
-    "vtrn.32         q15, q14                 \n"
-
-    // The first Q register is a, the second b.
-    // q0/8 tmp[0-3]
-    // q13/15 tmp[4-7]
-    // q1/9 tmp[8-11]
-    // q12/14 tmp[12-15]
-
-    // These are still in 01 45 23 67 order. We fix it easily in the addition
-    // case but the subtraction propagates them.
-    "vswp            d3, d27                  \n"
-    "vswp            d19, d31                 \n"
-
-    // a0 = tmp[0] + tmp[8]
-    "vadd.s32        q2, q0, q1               \n"
-    "vadd.s32        q3, q8, q9               \n"
-
-    // a1 = tmp[4] + tmp[12]
-    "vadd.s32        q10, q13, q12            \n"
-    "vadd.s32        q11, q15, q14            \n"
-
-    // a2 = tmp[4] - tmp[12]
-    "vsub.s32        q13, q13, q12            \n"
-    "vsub.s32        q15, q15, q14            \n"
-
-    // a3 = tmp[0] - tmp[8]
-    "vsub.s32        q0, q0, q1               \n"
-    "vsub.s32        q8, q8, q9               \n"
-
-    // b0 = a0 + a1
-    "vadd.s32        q1, q2, q10              \n"
-    "vadd.s32        q9, q3, q11              \n"
-
-    // b1 = a3 + a2
-    "vadd.s32        q12, q0, q13             \n"
-    "vadd.s32        q14, q8, q15             \n"
-
-    // b2 = a3 - a2
-    "vsub.s32        q0, q0, q13              \n"
-    "vsub.s32        q8, q8, q15              \n"
-
-    // b3 = a0 - a1
-    "vsub.s32        q2, q2, q10              \n"
-    "vsub.s32        q3, q3, q11              \n"
-
-    "vld1.64         {q10, q11}, [%[w]]       \n"
-
-    // abs(b0)
-    "vabs.s32        q1, q1                   \n"
-    "vabs.s32        q9, q9                   \n"
-    // abs(b1)
-    "vabs.s32        q12, q12                 \n"
-    "vabs.s32        q14, q14                 \n"
-    // abs(b2)
-    "vabs.s32        q0, q0                   \n"
-    "vabs.s32        q8, q8                   \n"
-    // abs(b3)
-    "vabs.s32        q2, q2                   \n"
-    "vabs.s32        q3, q3                   \n"
-
-    // expand w before using.
-    "vmovl.u16       q13, d20                 \n"
-    "vmovl.u16       q15, d21                 \n"
-
-    // w[0] * abs(b0)
-    "vmul.u32        q1, q1, q13              \n"
-    "vmul.u32        q9, q9, q13              \n"
-
-    // w[4] * abs(b1)
-    "vmla.u32        q1, q12, q15             \n"
-    "vmla.u32        q9, q14, q15             \n"
-
-    // expand w before using.
-    "vmovl.u16       q13, d22                 \n"
-    "vmovl.u16       q15, d23                 \n"
-
-    // w[8] * abs(b1)
-    "vmla.u32        q1, q0, q13              \n"
-    "vmla.u32        q9, q8, q13              \n"
-
-    // w[12] * abs(b1)
-    "vmla.u32        q1, q2, q15              \n"
-    "vmla.u32        q9, q3, q15              \n"
-
-    // Sum the arrays
-    "vpaddl.u32      q1, q1                   \n"
-    "vpaddl.u32      q9, q9                   \n"
-    "vadd.u64        d2, d3                   \n"
-    "vadd.u64        d18, d19                 \n"
-
-    // Hadamard transform needs 4 bits of extra precision (2 bits in each
-    // direction) for dynamic raw. Weights w[] are 16bits at max, so the maximum
-    // precision for coeff is 8bit of input + 4bits of Hadamard transform +
-    // 16bits for w[] + 2 bits of abs() summation.
-    //
-    // This uses a maximum of 31 bits (signed). Discarding the top 32 bits is
-    // A-OK.
-
-    // sum2 - sum1
-    "vsub.u32        d0, d2, d18              \n"
-    // abs(sum2 - sum1)
-    "vabs.s32        d0, d0                   \n"
-    // abs(sum2 - sum1) >> 5
-    "vshr.u32        d0, #5                   \n"
-
-    // It would be better to move the value straight into r0 but I'm not
-    // entirely sure how this works with inline assembly.
-    "vmov.32         %[sum], d0[0]            \n"
-
-    : [sum] "=r"(sum), [a] "+r"(A), [b] "+r"(B), [w] "+r"(W)
-    : [kBPS] "r"(kBPS)
-    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
-      "q10", "q11", "q12", "q13", "q14", "q15"  // clobbered
-  ) ;
-
-  return sum;
-}
-
-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
-                      const uint16_t* const w) {
-  int D = 0;
-  int x, y;
-  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
-    for (x = 0; x < 16; x += 4) {
-      D += Disto4x4(a + x + y, b + x + y, w);
-    }
-  }
-  return D;
-}
-
-#endif   // WEBP_USE_NEON
-
-//------------------------------------------------------------------------------
-// Entry point
-
-extern void VP8EncDspInitNEON(void);
-
-void VP8EncDspInitNEON(void) {
-#if defined(WEBP_USE_NEON)
-  VP8ITransform = ITransform;
-  VP8FTransform = FTransform;
-
-  VP8ITransformWHT = ITransformWHT;
-  VP8FTransformWHT = FTransformWHT;
-
-  VP8TDisto4x4 = Disto4x4;
-  VP8TDisto16x16 = Disto16x16;
-#endif   // WEBP_USE_NEON
-}
-
diff --git a/drivers/webp/dsp/enc_sse2.c b/drivers/webp/dsp/enc_sse2.c
index 540a3cb2db..b046761dc1 100644
--- a/drivers/webp/dsp/enc_sse2.c
+++ b/drivers/webp/dsp/enc_sse2.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // SSE2 version of speed-critical encoding functions.
@@ -19,48 +17,21 @@
 
 #include "../enc/vp8enci.h"
 
-//------------------------------------------------------------------------------
-// Quite useful macro for debugging. Left here for convenience.
-
-#if 0
-#include <stdio.h>
-static void PrintReg(const __m128i r, const char* const name, int size) {
-  int n;
-  union {
-    __m128i r;
-    uint8_t i8[16];
-    uint16_t i16[8];
-    uint32_t i32[4];
-    uint64_t i64[2];
-  } tmp;
-  tmp.r = r;
-  printf("%s\t: ", name);
-  if (size == 8) {
-    for (n = 0; n < 16; ++n) printf("%.2x ", tmp.i8[n]);
-  } else if (size == 16) {
-    for (n = 0; n < 8; ++n) printf("%.4x ", tmp.i16[n]);
-  } else if (size == 32) {
-    for (n = 0; n < 4; ++n) printf("%.8x ", tmp.i32[n]);
-  } else {
-    for (n = 0; n < 2; ++n) printf("%.16lx ", tmp.i64[n]);
-  }
-  printf("\n");
-}
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
 #endif
 
 //------------------------------------------------------------------------------
 // Compute susceptibility based on DCT-coeff histograms:
 // the higher, the "easier" the macroblock is to compress.
 
-static void CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
-                                 int start_block, int end_block,
-                                 VP8Histogram* const histo) {
+static int CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
+                                int start_block, int end_block) {
+  int histo[MAX_COEFF_THRESH + 1] = { 0 };
+  int16_t out[16];
+  int j, k;
   const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
-  int j;
   for (j = start_block; j < end_block; ++j) {
-    int16_t out[16];
-    int k;
-
     VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
 
     // Convert coefficients to bin (within out[]).
@@ -76,9 +47,9 @@ static void CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
       const __m128i xor1 = _mm_xor_si128(out1, sign1);
       const __m128i abs0 = _mm_sub_epi16(xor0, sign0);
       const __m128i abs1 = _mm_sub_epi16(xor1, sign1);
-      // v = abs(out) >> 3
-      const __m128i v0 = _mm_srai_epi16(abs0, 3);
-      const __m128i v1 = _mm_srai_epi16(abs1, 3);
+      // v = abs(out) >> 2
+      const __m128i v0 = _mm_srai_epi16(abs0, 2);
+      const __m128i v1 = _mm_srai_epi16(abs1, 2);
       // bin = min(v, MAX_COEFF_THRESH)
       const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
       const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
@@ -87,11 +58,13 @@ static void CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
       _mm_storeu_si128((__m128i*)&out[8], bin1);
     }
 
-    // Convert coefficients to bin.
+    // Use bin to update histogram.
     for (k = 0; k < 16; ++k) {
-      histo->distribution[out[k]]++;
+      histo[out[k]]++;
     }
   }
+
+  return VP8GetAlpha(histo);
 }
 
 //------------------------------------------------------------------------------
@@ -270,7 +243,7 @@ static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
 
   // Add inverse transform to 'ref' and store.
   {
-    const __m128i zero = _mm_setzero_si128();
+    const __m128i zero = _mm_set1_epi16(0);
     // Load the reference(s).
     __m128i ref0, ref1, ref2, ref3;
     if (do_two) {
@@ -322,22 +295,16 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
                            int16_t* out) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i seven = _mm_set1_epi16(7);
-  const __m128i k937 = _mm_set1_epi32(937);
-  const __m128i k1812 = _mm_set1_epi32(1812);
+  const __m128i k7500 = _mm_set1_epi32(7500);
+  const __m128i k14500 = _mm_set1_epi32(14500);
   const __m128i k51000 = _mm_set1_epi32(51000);
   const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
   const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217,
                                            5352,  2217, 5352,  2217);
   const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
                                            2217, -5352, 2217, -5352);
-  const __m128i k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8);
-  const __m128i k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8);
-  const __m128i k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352,
-                                            2217, 5352, 2217, 5352);
-  const __m128i k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217,
-                                            -5352, 2217, -5352, 2217);
-  __m128i v01, v32;
 
+  __m128i v01, v32;
 
   // Difference between src and ref and initial transpose.
   {
@@ -359,52 +326,73 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
     const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
     const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
     const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
-    // Compute difference. -> 00 01 02 03 00 00 00 00
+    // Compute difference.
     const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
     const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
     const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
     const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);
 
-
-    // Unpack and shuffle
+    // Transpose.
     // 00 01 02 03   0 0 0 0
     // 10 11 12 13   0 0 0 0
     // 20 21 22 23   0 0 0 0
     // 30 31 32 33   0 0 0 0
-    const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1);
-    const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3);
-    // 00 01 10 11 02 03 12 13
-    // 20 21 30 31 22 23 32 33
-    const __m128i shuf01_p =
-        _mm_shufflehi_epi16(shuf01, _MM_SHUFFLE(2, 3, 0, 1));
-    const __m128i shuf23_p =
-        _mm_shufflehi_epi16(shuf23, _MM_SHUFFLE(2, 3, 0, 1));
-    // 00 01 10 11 03 02 13 12
-    // 20 21 30 31 23 22 33 32
-    const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p);
-    const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p);
-    // 00 01 10 11 20 21 30 31
-    // 03 02 13 12 23 22 33 32
-    const __m128i a01 = _mm_add_epi16(s01, s32);
-    const __m128i a32 = _mm_sub_epi16(s01, s32);
-    // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]
-    // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]
-
-    const __m128i tmp0 = _mm_madd_epi16(a01, k88p);  // [ (a0 + a1) << 3, ... ]
-    const __m128i tmp2 = _mm_madd_epi16(a01, k88m);  // [ (a0 - a1) << 3, ... ]
-    const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p);
-    const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m);
-    const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812);
-    const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937);
-    const __m128i tmp1   = _mm_srai_epi32(tmp1_2, 9);
-    const __m128i tmp3   = _mm_srai_epi32(tmp3_2, 9);
-    const __m128i s03 = _mm_packs_epi32(tmp0, tmp2);
-    const __m128i s12 = _mm_packs_epi32(tmp1, tmp3);
-    const __m128i s_lo = _mm_unpacklo_epi16(s03, s12);   // 0 1 0 1 0 1...
-    const __m128i s_hi = _mm_unpackhi_epi16(s03, s12);   // 2 3 2 3 2 3
-    const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi);
-    v01 = _mm_unpacklo_epi32(s_lo, s_hi);
-    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));  // 3 2 3 2 3 2..
+    const __m128i transpose0_0 = _mm_unpacklo_epi16(diff0, diff1);
+    const __m128i transpose0_1 = _mm_unpacklo_epi16(diff2, diff3);
+    // 00 10 01 11   02 12 03 13
+    // 20 30 21 31   22 32 23 33
+    const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
+    v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
+    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
+    // a02 a12 a22 a32   a03 a13 a23 a33
+    // a00 a10 a20 a30   a01 a11 a21 a31
+    // a03 a13 a23 a33   a02 a12 a22 a32
+  }
+
+  // First pass and subsequent transpose.
+  {
+    // Same operations are done on the (0,3) and (1,2) pairs.
+    // b0 = (a0 + a3) << 3
+    // b1 = (a1 + a2) << 3
+    // b3 = (a0 - a3) << 3
+    // b2 = (a1 - a2) << 3
+    const __m128i a01 = _mm_add_epi16(v01, v32);
+    const __m128i a32 = _mm_sub_epi16(v01, v32);
+    const __m128i b01 = _mm_slli_epi16(a01, 3);
+    const __m128i b32 = _mm_slli_epi16(a32, 3);
+    const __m128i b11 = _mm_unpackhi_epi64(b01, b01);
+    const __m128i b22 = _mm_unpackhi_epi64(b32, b32);
+
+    // e0 = b0 + b1
+    // e2 = b0 - b1
+    const __m128i e0 = _mm_add_epi16(b01, b11);
+    const __m128i e2 = _mm_sub_epi16(b01, b11);
+    const __m128i e02 = _mm_unpacklo_epi64(e0, e2);
+
+    // e1 = (b3 * 5352 + b2 * 2217 + 14500) >> 12
+    // e3 = (b3 * 2217 - b2 * 5352 +  7500) >> 12
+    const __m128i b23 = _mm_unpacklo_epi16(b22, b32);
+    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
+    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
+    const __m128i d1 = _mm_add_epi32(c1, k14500);
+    const __m128i d3 = _mm_add_epi32(c3, k7500);
+    const __m128i e1 = _mm_srai_epi32(d1, 12);
+    const __m128i e3 = _mm_srai_epi32(d3, 12);
+    const __m128i e13 = _mm_packs_epi32(e1, e3);
+
+    // Transpose.
+    // 00 01 02 03  20 21 22 23
+    // 10 11 12 13  30 31 32 33
+    const __m128i transpose0_0 = _mm_unpacklo_epi16(e02, e13);
+    const __m128i transpose0_1 = _mm_unpackhi_epi16(e02, e13);
+    // 00 10 01 11   02 12 03 13
+    // 20 30 21 31   22 32 23 33
+    const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
+    v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
+    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
+    // 02 12 22 32   03 13 23 33
+    // 00 10 20 30   01 11 21 31
+    // 03 13 23 33   02 12 22 32
   }
 
   // Second pass
@@ -418,12 +406,13 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
     const __m128i a32 = _mm_sub_epi16(v01, v32);
     const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
     const __m128i a22 = _mm_unpackhi_epi64(a32, a32);
-    const __m128i a01_plus_7 = _mm_add_epi16(a01, seven);
 
     // d0 = (a0 + a1 + 7) >> 4;
     // d2 = (a0 - a1 + 7) >> 4;
-    const __m128i c0 = _mm_add_epi16(a01_plus_7, a11);
-    const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11);
+    const __m128i b0 = _mm_add_epi16(a01, a11);
+    const __m128i b2 = _mm_sub_epi16(a01, a11);
+    const __m128i c0 = _mm_add_epi16(b0, seven);
+    const __m128i c2 = _mm_add_epi16(b2, seven);
     const __m128i d0 = _mm_srai_epi16(c0, 4);
     const __m128i d2 = _mm_srai_epi16(c2, 4);
 
@@ -441,7 +430,6 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
     // f1 = f1 + (a3 != 0);
     // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
     // desired (0, 1), we add one earlier through k12000_plus_one.
-    // -> f1 = f1 + 1 - (a3 == 0)
     const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));
 
     _mm_storel_epi64((__m128i*)&out[ 0], d0);
@@ -451,137 +439,13 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
   }
 }
 
-static void FTransformWHTSSE2(const int16_t* in, int16_t* out) {
-  int32_t tmp[16];
-  int i;
-  for (i = 0; i < 4; ++i, in += 64) {
-    const int a0 = (in[0 * 16] + in[2 * 16]);
-    const int a1 = (in[1 * 16] + in[3 * 16]);
-    const int a2 = (in[1 * 16] - in[3 * 16]);
-    const int a3 = (in[0 * 16] - in[2 * 16]);
-    tmp[0 + i * 4] = a0 + a1;
-    tmp[1 + i * 4] = a3 + a2;
-    tmp[2 + i * 4] = a3 - a2;
-    tmp[3 + i * 4] = a0 - a1;
-  }
-  {
-    const __m128i src0 = _mm_loadu_si128((__m128i*)&tmp[0]);
-    const __m128i src1 = _mm_loadu_si128((__m128i*)&tmp[4]);
-    const __m128i src2 = _mm_loadu_si128((__m128i*)&tmp[8]);
-    const __m128i src3 = _mm_loadu_si128((__m128i*)&tmp[12]);
-    const __m128i a0 = _mm_add_epi32(src0, src2);
-    const __m128i a1 = _mm_add_epi32(src1, src3);
-    const __m128i a2 = _mm_sub_epi32(src1, src3);
-    const __m128i a3 = _mm_sub_epi32(src0, src2);
-    const __m128i b0 = _mm_srai_epi32(_mm_add_epi32(a0, a1), 1);
-    const __m128i b1 = _mm_srai_epi32(_mm_add_epi32(a3, a2), 1);
-    const __m128i b2 = _mm_srai_epi32(_mm_sub_epi32(a3, a2), 1);
-    const __m128i b3 = _mm_srai_epi32(_mm_sub_epi32(a0, a1), 1);
-    const __m128i out0 = _mm_packs_epi32(b0, b1);
-    const __m128i out1 = _mm_packs_epi32(b2, b3);
-    _mm_storeu_si128((__m128i*)&out[0], out0);
-    _mm_storeu_si128((__m128i*)&out[8], out1);
-  }
-}
-
 //------------------------------------------------------------------------------
 // Metric
 
-static int SSE_Nx4SSE2(const uint8_t* a, const uint8_t* b,
-                       int num_quads, int do_16) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i sum1 = zero;
-  __m128i sum2 = zero;
-
-  while (num_quads-- > 0) {
-    // Note: for the !do_16 case, we read 16 pixels instead of 8 but that's ok,
-    // thanks to buffer over-allocation to that effect.
-    const __m128i a0 = _mm_loadu_si128((__m128i*)&a[BPS * 0]);
-    const __m128i a1 = _mm_loadu_si128((__m128i*)&a[BPS * 1]);
-    const __m128i a2 = _mm_loadu_si128((__m128i*)&a[BPS * 2]);
-    const __m128i a3 = _mm_loadu_si128((__m128i*)&a[BPS * 3]);
-    const __m128i b0 = _mm_loadu_si128((__m128i*)&b[BPS * 0]);
-    const __m128i b1 = _mm_loadu_si128((__m128i*)&b[BPS * 1]);
-    const __m128i b2 = _mm_loadu_si128((__m128i*)&b[BPS * 2]);
-    const __m128i b3 = _mm_loadu_si128((__m128i*)&b[BPS * 3]);
-
-    // compute clip0(a-b) and clip0(b-a)
-    const __m128i a0p = _mm_subs_epu8(a0, b0);
-    const __m128i a0m = _mm_subs_epu8(b0, a0);
-    const __m128i a1p = _mm_subs_epu8(a1, b1);
-    const __m128i a1m = _mm_subs_epu8(b1, a1);
-    const __m128i a2p = _mm_subs_epu8(a2, b2);
-    const __m128i a2m = _mm_subs_epu8(b2, a2);
-    const __m128i a3p = _mm_subs_epu8(a3, b3);
-    const __m128i a3m = _mm_subs_epu8(b3, a3);
-
-    // compute |a-b| with 8b arithmetic as clip0(a-b) | clip0(b-a)
-    const __m128i diff0 = _mm_or_si128(a0p, a0m);
-    const __m128i diff1 = _mm_or_si128(a1p, a1m);
-    const __m128i diff2 = _mm_or_si128(a2p, a2m);
-    const __m128i diff3 = _mm_or_si128(a3p, a3m);
-
-    // unpack (only four operations, instead of eight)
-    const __m128i low0 = _mm_unpacklo_epi8(diff0, zero);
-    const __m128i low1 = _mm_unpacklo_epi8(diff1, zero);
-    const __m128i low2 = _mm_unpacklo_epi8(diff2, zero);
-    const __m128i low3 = _mm_unpacklo_epi8(diff3, zero);
-
-    // multiply with self
-    const __m128i low_madd0 = _mm_madd_epi16(low0, low0);
-    const __m128i low_madd1 = _mm_madd_epi16(low1, low1);
-    const __m128i low_madd2 = _mm_madd_epi16(low2, low2);
-    const __m128i low_madd3 = _mm_madd_epi16(low3, low3);
-
-    // collect in a cascading way
-    const __m128i low_sum0 = _mm_add_epi32(low_madd0, low_madd1);
-    const __m128i low_sum1 = _mm_add_epi32(low_madd2, low_madd3);
-    sum1 = _mm_add_epi32(sum1, low_sum0);
-    sum2 = _mm_add_epi32(sum2, low_sum1);
-
-    if (do_16) {  // if necessary, process the higher 8 bytes similarly
-      const __m128i hi0 = _mm_unpackhi_epi8(diff0, zero);
-      const __m128i hi1 = _mm_unpackhi_epi8(diff1, zero);
-      const __m128i hi2 = _mm_unpackhi_epi8(diff2, zero);
-      const __m128i hi3 = _mm_unpackhi_epi8(diff3, zero);
-
-      const __m128i hi_madd0 = _mm_madd_epi16(hi0, hi0);
-      const __m128i hi_madd1 = _mm_madd_epi16(hi1, hi1);
-      const __m128i hi_madd2 = _mm_madd_epi16(hi2, hi2);
-      const __m128i hi_madd3 = _mm_madd_epi16(hi3, hi3);
-      const __m128i hi_sum0 = _mm_add_epi32(hi_madd0, hi_madd1);
-      const __m128i hi_sum1 = _mm_add_epi32(hi_madd2, hi_madd3);
-      sum1 = _mm_add_epi32(sum1, hi_sum0);
-      sum2 = _mm_add_epi32(sum2, hi_sum1);
-    }
-    a += 4 * BPS;
-    b += 4 * BPS;
-  }
-  {
-    int32_t tmp[4];
-    const __m128i sum = _mm_add_epi32(sum1, sum2);
-    _mm_storeu_si128((__m128i*)tmp, sum);
-    return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
-  }
-}
-
-static int SSE16x16SSE2(const uint8_t* a, const uint8_t* b) {
-  return SSE_Nx4SSE2(a, b, 4, 1);
-}
-
-static int SSE16x8SSE2(const uint8_t* a, const uint8_t* b) {
-  return SSE_Nx4SSE2(a, b, 2, 1);
-}
-
-static int SSE8x8SSE2(const uint8_t* a, const uint8_t* b) {
-  return SSE_Nx4SSE2(a, b, 2, 0);
-}
-
 static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
-  const __m128i zero = _mm_setzero_si128();
+  const __m128i zero = _mm_set1_epi16(0);
 
-  // Load values. Note that we read 8 pixels instead of 4,
-  // but the a/b buffers are over-allocated to that effect.
+  // Load values.
   const __m128i a0 = _mm_loadl_epi64((__m128i*)&a[BPS * 0]);
   const __m128i a1 = _mm_loadl_epi64((__m128i*)&a[BPS * 1]);
   const __m128i a2 = _mm_loadl_epi64((__m128i*)&a[BPS * 2]);
@@ -619,7 +483,6 @@ static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
   const __m128i sum0 = _mm_add_epi32(madd0, madd1);
   const __m128i sum1 = _mm_add_epi32(madd2, madd3);
   const __m128i sum2 = _mm_add_epi32(sum0, sum1);
-
   int32_t tmp[4];
   _mm_storeu_si128((__m128i*)tmp, sum2);
   return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
@@ -639,8 +502,10 @@ static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
   int32_t sum[4];
   __m128i tmp_0, tmp_1, tmp_2, tmp_3;
   const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i three = _mm_set1_epi16(3);
 
-  // Load, combine and transpose inputs.
+  // Load, combine and transpose inputs.
   {
     const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]);
     const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]);
@@ -685,14 +550,17 @@ static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
   // Horizontal pass and subsequent transpose.
   {
     // Calculate a and b (two 4x4 at once).
-    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
-    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
-    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
-    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
-    const __m128i b0 = _mm_add_epi16(a0, a1);
+    const __m128i a0 = _mm_slli_epi16(_mm_add_epi16(tmp_0, tmp_2), 2);
+    const __m128i a1 = _mm_slli_epi16(_mm_add_epi16(tmp_1, tmp_3), 2);
+    const __m128i a2 = _mm_slli_epi16(_mm_sub_epi16(tmp_1, tmp_3), 2);
+    const __m128i a3 = _mm_slli_epi16(_mm_sub_epi16(tmp_0, tmp_2), 2);
+    // b0_extra = (a0 != 0);
+    const __m128i b0_extra = _mm_andnot_si128(_mm_cmpeq_epi16 (a0, zero), one);
+    const __m128i b0_base = _mm_add_epi16(a0, a1);
     const __m128i b1 = _mm_add_epi16(a3, a2);
     const __m128i b2 = _mm_sub_epi16(a3, a2);
     const __m128i b3 = _mm_sub_epi16(a0, a1);
+    const __m128i b0 = _mm_add_epi16(b0_base, b0_extra);
     // a00 a01 a02 a03   b00 b01 b02 b03
     // a10 a11 a12 a13   b10 b11 b12 b13
     // a20 a21 a22 a23   b20 b21 b22 b23
@@ -767,6 +635,19 @@ static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
       B_b2 = _mm_sub_epi16(B_b2, sign_B_b2);
     }
 
+    // b = abs(b) + 3
+    A_b0 = _mm_add_epi16(A_b0, three);
+    A_b2 = _mm_add_epi16(A_b2, three);
+    B_b0 = _mm_add_epi16(B_b0, three);
+    B_b2 = _mm_add_epi16(B_b2, three);
+
+    // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3
+    // b = (abs(b) + 3) >> 3
+    A_b0 = _mm_srai_epi16(A_b0, 3);
+    A_b2 = _mm_srai_epi16(A_b2, 3);
+    B_b0 = _mm_srai_epi16(B_b0, 3);
+    B_b2 = _mm_srai_epi16(B_b2, 3);
+
     // weighted sums
     A_b0 = _mm_madd_epi16(A_b0, w_0);
     A_b2 = _mm_madd_epi16(A_b2, w_8);
@@ -785,7 +666,7 @@ static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
 static int Disto4x4SSE2(const uint8_t* const a, const uint8_t* const b,
                         const uint16_t* const w) {
   const int diff_sum = TTransformSSE2(a, b, w);
-  return abs(diff_sum) >> 5;
+  return (abs(diff_sum) + 8) >> 4;
 }
 
 static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b,
@@ -800,6 +681,7 @@ static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b,
   return D;
 }
 
+
 //------------------------------------------------------------------------------
 // Quantization
 //
@@ -807,8 +689,9 @@ static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b,
 // Simple quantization
 static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
                              int n, const VP8Matrix* const mtx) {
-  const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
-  const __m128i zero = _mm_setzero_si128();
+  const __m128i max_coeff_2047 = _mm_set1_epi16(2047);
+  const __m128i zero = _mm_set1_epi16(0);
+  __m128i sign0, sign8;
   __m128i coeff0, coeff8;
   __m128i out0, out8;
   __m128i packed_out;
@@ -826,10 +709,12 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
   const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
   const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
   const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
+  const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]);
+  const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]);
 
   // sign(in) = in >> 15  (0x0000 if positive, 0xffff if negative)
-  const __m128i sign0 = _mm_srai_epi16(in0, 15);
-  const __m128i sign8 = _mm_srai_epi16(in8, 15);
+  sign0 = _mm_srai_epi16(in0, 15);
+  sign8 = _mm_srai_epi16(in8, 15);
 
   // coeff = abs(in) = (in ^ sign) - sign
   coeff0 = _mm_xor_si128(in0, sign0);
@@ -841,6 +726,10 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
   coeff0 = _mm_add_epi16(coeff0, sharpen0);
   coeff8 = _mm_add_epi16(coeff8, sharpen8);
 
+  // if (coeff > 2047) coeff = 2047
+  coeff0 = _mm_min_epi16(coeff0, max_coeff_2047);
+  coeff8 = _mm_min_epi16(coeff8, max_coeff_2047);
+
   // out = (coeff * iQ + B) >> QFIX;
   {
     // doing calculations with 32b precision (QFIX=17)
@@ -868,14 +757,9 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
     out_04 = _mm_srai_epi32(out_04, QFIX);
     out_08 = _mm_srai_epi32(out_08, QFIX);
     out_12 = _mm_srai_epi32(out_12, QFIX);
-
     // pack result as 16b
     out0 = _mm_packs_epi32(out_00, out_04);
     out8 = _mm_packs_epi32(out_08, out_12);
-
-    // if (coeff > 2047) coeff = 2047
-    out0 = _mm_min_epi16(out0, max_coeff_2047);
-    out8 = _mm_min_epi16(out8, max_coeff_2047);
   }
 
   // get sign back (if (sign[j]) out_n = -out_n)
@@ -888,8 +772,17 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
   in0 = _mm_mullo_epi16(out0, q0);
   in8 = _mm_mullo_epi16(out8, q8);
 
-  _mm_storeu_si128((__m128i*)&in[0], in0);
-  _mm_storeu_si128((__m128i*)&in[8], in8);
+  // if (coeff <= mtx->zthresh_) {in=0; out=0;}
+  {
+    __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0);
+    __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8);
+    in0 = _mm_and_si128(in0, cmp0);
+    in8 = _mm_and_si128(in8, cmp8);
+    _mm_storeu_si128((__m128i*)&in[0], in0);
+    _mm_storeu_si128((__m128i*)&in[8], in8);
+    out0 = _mm_and_si128(out0, cmp0);
+    out8 = _mm_and_si128(out8, cmp8);
+  }
 
   // zigzag the output before storing it.
   //
@@ -926,32 +819,19 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
   }
 }
 
-static int QuantizeBlockWHTSSE2(int16_t in[16], int16_t out[16],
-                                const VP8Matrix* const mtx) {
-  return QuantizeBlockSSE2(in, out, 0, mtx);
-}
-
-#endif   // WEBP_USE_SSE2
-
-//------------------------------------------------------------------------------
-// Entry point
-
 extern void VP8EncDspInitSSE2(void);
-
 void VP8EncDspInitSSE2(void) {
-#if defined(WEBP_USE_SSE2)
   VP8CollectHistogram = CollectHistogramSSE2;
   VP8EncQuantizeBlock = QuantizeBlockSSE2;
-  VP8EncQuantizeBlockWHT = QuantizeBlockWHTSSE2;
   VP8ITransform = ITransformSSE2;
   VP8FTransform = FTransformSSE2;
-  VP8FTransformWHT = FTransformWHTSSE2;
-  VP8SSE16x16 = SSE16x16SSE2;
-  VP8SSE16x8 = SSE16x8SSE2;
-  VP8SSE8x8 = SSE8x8SSE2;
   VP8SSE4x4 = SSE4x4SSE2;
   VP8TDisto4x4 = Disto4x4SSE2;
   VP8TDisto16x16 = Disto16x16SSE2;
-#endif   // WEBP_USE_SSE2
 }
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif   // WEBP_USE_SSE2
diff --git a/drivers/webp/dsp/lossless.c b/drivers/webp/dsp/lossless.c
index bab76d22de..62a6b7b15a 100644
--- a/drivers/webp/dsp/lossless.c
+++ b/drivers/webp/dsp/lossless.c
@@ -1,10 +1,8 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Image transforms and color space conversion methods for lossless decoder.
@@ -13,24 +11,25 @@
 //          Jyrki Alakuijala (jyrki@google.com)
 //          Urvang Joshi (urvang@google.com)
 
-#include "./dsp.h"
-
-#if defined(WEBP_USE_SSE2)
-#include <emmintrin.h>
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
 #endif
 
 #include <math.h>
 #include <stdlib.h>
 #include "./lossless.h"
 #include "../dec/vp8li.h"
-#include "./yuv.h"
+#include "../dsp/yuv.h"
+#include "../dsp/dsp.h"
+#include "../enc/histogram.h"
 
 #define MAX_DIFF_COST (1e30f)
 
 // lookup table for small values of log2(int)
 #define APPROX_LOG_MAX  4096
 #define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
-const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
+#define LOG_LOOKUP_IDX_MAX 256
+static const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
   0.0000000000000000f, 0.0000000000000000f,
   1.0000000000000000f, 1.5849625007211560f,
   2.0000000000000000f, 2.3219280948873621f,
@@ -161,200 +160,16 @@ const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
   7.9886846867721654f, 7.9943534368588577f
 };
 
-const float kSLog2Table[LOG_LOOKUP_IDX_MAX] = {
-  0.00000000f,    0.00000000f,  2.00000000f,   4.75488750f,
-  8.00000000f,   11.60964047f,  15.50977500f,  19.65148445f,
-  24.00000000f,  28.52932501f,  33.21928095f,  38.05374781f,
-  43.01955001f,  48.10571634f,  53.30296891f,  58.60335893f,
-  64.00000000f,  69.48686830f,  75.05865003f,  80.71062276f,
-  86.43856190f,  92.23866588f,  98.10749561f,  104.04192499f,
-  110.03910002f, 116.09640474f, 122.21143267f, 128.38196256f,
-  134.60593782f, 140.88144886f, 147.20671787f, 153.58008562f,
-  160.00000000f, 166.46500594f, 172.97373660f, 179.52490559f,
-  186.11730005f, 192.74977453f, 199.42124551f, 206.13068654f,
-  212.87712380f, 219.65963219f, 226.47733176f, 233.32938445f,
-  240.21499122f, 247.13338933f, 254.08384998f, 261.06567603f,
-  268.07820003f, 275.12078236f, 282.19280949f, 289.29369244f,
-  296.42286534f, 303.57978409f, 310.76392512f, 317.97478424f,
-  325.21187564f, 332.47473081f, 339.76289772f, 347.07593991f,
-  354.41343574f, 361.77497759f, 369.16017124f, 376.56863518f,
-  384.00000000f, 391.45390785f, 398.93001188f, 406.42797576f,
-  413.94747321f, 421.48818752f, 429.04981119f, 436.63204548f,
-  444.23460010f, 451.85719280f, 459.49954906f, 467.16140179f,
-  474.84249102f, 482.54256363f, 490.26137307f, 497.99867911f,
-  505.75424759f, 513.52785023f, 521.31926438f, 529.12827280f,
-  536.95466351f, 544.79822957f, 552.65876890f, 560.53608414f,
-  568.42998244f, 576.34027536f, 584.26677867f, 592.20931226f,
-  600.16769996f, 608.14176943f, 616.13135206f, 624.13628279f,
-  632.15640007f, 640.19154569f, 648.24156472f, 656.30630539f,
-  664.38561898f, 672.47935976f, 680.58738488f, 688.70955430f,
-  696.84573069f, 704.99577935f, 713.15956818f, 721.33696754f,
-  729.52785023f, 737.73209140f, 745.94956849f, 754.18016116f,
-  762.42375127f, 770.68022275f, 778.94946161f, 787.23135586f,
-  795.52579543f, 803.83267219f, 812.15187982f, 820.48331383f,
-  828.82687147f, 837.18245171f, 845.54995518f, 853.92928416f,
-  862.32034249f, 870.72303558f, 879.13727036f, 887.56295522f,
-  896.00000000f, 904.44831595f, 912.90781569f, 921.37841320f,
-  929.86002376f, 938.35256392f, 946.85595152f, 955.37010560f,
-  963.89494641f, 972.43039537f, 980.97637504f, 989.53280911f,
-  998.09962237f, 1006.67674069f, 1015.26409097f, 1023.86160116f,
-  1032.46920021f, 1041.08681805f, 1049.71438560f, 1058.35183469f,
-  1066.99909811f, 1075.65610955f, 1084.32280357f, 1092.99911564f,
-  1101.68498204f, 1110.38033993f, 1119.08512727f, 1127.79928282f,
-  1136.52274614f, 1145.25545758f, 1153.99735821f, 1162.74838989f,
-  1171.50849518f, 1180.27761738f, 1189.05570047f, 1197.84268914f,
-  1206.63852876f, 1215.44316535f, 1224.25654560f, 1233.07861684f,
-  1241.90932703f, 1250.74862473f, 1259.59645914f, 1268.45278005f,
-  1277.31753781f, 1286.19068338f, 1295.07216828f, 1303.96194457f,
-  1312.85996488f, 1321.76618236f, 1330.68055071f, 1339.60302413f,
-  1348.53355734f, 1357.47210556f, 1366.41862452f, 1375.37307041f,
-  1384.33539991f, 1393.30557020f, 1402.28353887f, 1411.26926400f,
-  1420.26270412f, 1429.26381818f, 1438.27256558f, 1447.28890615f,
-  1456.31280014f, 1465.34420819f, 1474.38309138f, 1483.42941118f,
-  1492.48312945f, 1501.54420843f, 1510.61261078f, 1519.68829949f,
-  1528.77123795f, 1537.86138993f, 1546.95871952f, 1556.06319119f,
-  1565.17476976f, 1574.29342040f, 1583.41910860f, 1592.55180020f,
-  1601.69146137f, 1610.83805860f, 1619.99155871f, 1629.15192882f,
-  1638.31913637f, 1647.49314911f, 1656.67393509f, 1665.86146266f,
-  1675.05570047f, 1684.25661744f, 1693.46418280f, 1702.67836605f,
-  1711.89913698f, 1721.12646563f, 1730.36032233f, 1739.60067768f,
-  1748.84750254f, 1758.10076802f, 1767.36044551f, 1776.62650662f,
-  1785.89892323f, 1795.17766747f, 1804.46271172f, 1813.75402857f,
-  1823.05159087f, 1832.35537170f, 1841.66534438f, 1850.98148244f,
-  1860.30375965f, 1869.63214999f, 1878.96662767f, 1888.30716711f,
-  1897.65374295f, 1907.00633003f, 1916.36490342f, 1925.72943838f,
-  1935.09991037f, 1944.47629506f, 1953.85856831f, 1963.24670620f,
-  1972.64068498f, 1982.04048108f, 1991.44607117f, 2000.85743204f,
-  2010.27454072f, 2019.69737440f, 2029.12591044f, 2038.56012640f
-};
-
-const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX] = {
-  { 0, 0}, { 0, 0}, { 1, 0}, { 2, 0}, { 3, 0}, { 4, 1}, { 4, 1}, { 5, 1},
-  { 5, 1}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 7, 2}, { 7, 2}, { 7, 2},
-  { 7, 2}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3},
-  { 8, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3},
-  { 9, 3}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
-  {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
-  {10, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
-  {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
-  {11, 4}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
-  {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
-  {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
-  {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
-  {12, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
-  {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
-  {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
-  {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
-  {13, 5}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
-  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
-  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
-  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
-  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
-  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
-  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
-  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
-  {14, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
-  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
-  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
-  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
-  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
-  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
-  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
-  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
-  {15, 6}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-};
-
-const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
-   0,  0,  0,  0,  0,  0,  1,  0,  1,  0,  1,  2,  3,  0,  1,  2,  3,
-   0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
-   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-  64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
-  80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
-  96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
-  112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
-  127,
-   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-  64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
-  80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
-  96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
-  112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
-};
-
-float VP8LFastSLog2Slow(int v) {
-  assert(v >= LOG_LOOKUP_IDX_MAX);
-  if (v < APPROX_LOG_MAX) {
+float VP8LFastLog2(int v) {
+  if (v < LOG_LOOKUP_IDX_MAX) {
+    return kLog2Table[v];
+  } else if (v < APPROX_LOG_MAX) {
     int log_cnt = 0;
-    const float v_f = (float)v;
     while (v >= LOG_LOOKUP_IDX_MAX) {
       ++log_cnt;
       v = v >> 1;
     }
-    return v_f * (kLog2Table[v] + log_cnt);
-  } else {
-    return (float)(LOG_2_RECIPROCAL * v * log((double)v));
-  }
-}
-
-float VP8LFastLog2Slow(int v) {
-  assert(v >= LOG_LOOKUP_IDX_MAX);
-  if (v < APPROX_LOG_MAX) {
-    int log_cnt = 0;
-    while (v >= LOG_LOOKUP_IDX_MAX) {
-      ++log_cnt;
-      v = v >> 1;
-    }
-    return kLog2Table[v] + log_cnt;
+    return kLog2Table[v] + (float)log_cnt;
   } else {
     return (float)(LOG_2_RECIPROCAL * log((double)v));
   }
@@ -424,9 +239,9 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
 }
 
 static WEBP_INLINE int Sub3(int a, int b, int c) {
-  const int pb = b - c;
-  const int pa = a - c;
-  return abs(pb) - abs(pa);
+  const int pa = b - c;
+  const int pb = a - c;
+  return abs(pa) - abs(pb);
 }
 
 static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
@@ -435,6 +250,7 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
       Sub3((a >> 16) & 0xff, (b >> 16) & 0xff, (c >> 16) & 0xff) +
       Sub3((a >>  8) & 0xff, (b >>  8) & 0xff, (c >>  8) & 0xff) +
       Sub3((a      ) & 0xff, (b      ) & 0xff, (c      ) & 0xff);
+
   return (pa_minus_pb <= 0) ? a : b;
 }
 
@@ -489,19 +305,18 @@ static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
   return pred;
 }
 static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = VP8LSelect(top[0], left, top[-1]);
+  const uint32_t pred = Select(top[0], left, top[-1]);
   return pred;
 }
 static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = VP8LClampedAddSubtractFull(left, top[0], top[-1]);
+  const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
   return pred;
 }
 static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = VP8LClampedAddSubtractHalf(left, top[0], top[-1]);
+  const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
   return pred;
 }
 
-// TODO(vikasa): Export the predictor array, to allow SSE2 variants.
 typedef uint32_t (*PredictorFunc)(uint32_t left, const uint32_t* const top);
 static const PredictorFunc kPredictors[16] = {
   Predictor0, Predictor1, Predictor2, Predictor3,
@@ -525,36 +340,35 @@ static float PredictionCostSpatial(const int* counts,
   return (float)(-0.1 * bits);
 }
 
-// Compute the combined Shanon's entropy for distribution {X} and {X+Y}
-static float CombinedShannonEntropy(const int* const X,
-                                    const int* const Y, int n) {
+// Compute the Shannon entropy: Sum(p*log2(p))
+static float ShannonEntropy(const int* const array, int n) {
   int i;
-  double retval = 0.;
-  int sumX = 0, sumXY = 0;
+  float retval = 0.f;
+  int sum = 0;
   for (i = 0; i < n; ++i) {
-    const int x = X[i];
-    const int xy = X[i] + Y[i];
-    if (x != 0) {
-      sumX += x;
-      retval -= VP8LFastSLog2(x);
-    }
-    if (xy != 0) {
-      sumXY += xy;
-      retval -= VP8LFastSLog2(xy);
+    if (array[i] != 0) {
+      sum += array[i];
+      retval -= VP8LFastSLog2(array[i]);
     }
   }
-  retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
-  return (float)retval;
+  retval += VP8LFastSLog2(sum);
+  return retval;
 }
 
 static float PredictionCostSpatialHistogram(int accumulated[4][256],
                                             int tile[4][256]) {
   int i;
+  int k;
+  int combo[256];
   double retval = 0;
   for (i = 0; i < 4; ++i) {
-    const double kExpValue = 0.94;
-    retval += PredictionCostSpatial(tile[i], 1, kExpValue);
-    retval += CombinedShannonEntropy(tile[i], accumulated[i], 256);
+    const double exp_val = 0.94;
+    retval += PredictionCostSpatial(&tile[i][0], 1, exp_val);
+    retval += ShannonEntropy(&tile[i][0], 256);
+    for (k = 0; k < 256; ++k) {
+      combo[k] = accumulated[i][k] + tile[i][k];
+    }
+    retval += ShannonEntropy(&combo[0], 256);
   }
   return (float)retval;
 }
@@ -757,9 +571,9 @@ static void PredictorInverseTransform(const VP8LTransform* const transform,
   }
 }
 
-static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs) {
-  int i = 0;
-  for (; i < num_pixs; ++i) {
+void VP8LSubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs) {
+  int i;
+  for (i = 0; i < num_pixs; ++i) {
     const uint32_t argb = argb_data[i];
     const uint32_t green = (argb >> 8) & 0xff;
     const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
@@ -770,9 +584,13 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs) {
 
 // Add green to blue and red channels (i.e. perform the inverse transform of
 // 'subtract green').
-static void AddGreenToBlueAndRed(uint32_t* data, const uint32_t* data_end) {
+static void AddGreenToBlueAndRed(const VP8LTransform* const transform,
+                                 int y_start, int y_end, uint32_t* data) {
+  const int width = transform->xsize_;
+  const uint32_t* const data_end = data + (y_end - y_start) * width;
   while (data < data_end) {
     const uint32_t argb = *data;
+    // "(green << 16) | green" below is equivalent to "green * 0x00010001u".
     const uint32_t green = ((argb >> 8) & 0xff);
     uint32_t red_blue = (argb & 0x00ff00ffu);
     red_blue += (green << 16) | green;
@@ -837,25 +655,6 @@ static WEBP_INLINE uint32_t TransformColor(const Multipliers* const m,
   return (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
 }
 
-static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
-                                             uint32_t argb) {
-  const uint32_t green = argb >> 8;
-  uint32_t new_red = argb >> 16;
-  new_red -= ColorTransformDelta(green_to_red, green);
-  return (new_red & 0xff);
-}
-
-static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
-                                              uint8_t red_to_blue,
-                                              uint32_t argb) {
-  const uint32_t green = argb >> 8;
-  const uint32_t red = argb >> 16;
-  uint8_t new_blue = argb;
-  new_blue -= ColorTransformDelta(green_to_blue, green);
-  new_blue -= ColorTransformDelta(red_to_blue, red);
-  return (new_blue & 0xff);
-}
-
 static WEBP_INLINE int SkipRepeatedPixels(const uint32_t* const argb,
                                           int ix, int xsize) {
   const uint32_t v = argb[ix];
@@ -876,10 +675,14 @@ static WEBP_INLINE int SkipRepeatedPixels(const uint32_t* const argb,
 static float PredictionCostCrossColor(const int accumulated[256],
                                       const int counts[256]) {
   // Favor low entropy, locally and globally.
-  // Favor small absolute values for PredictionCostSpatial
-  static const double kExpValue = 2.4;
-  return CombinedShannonEntropy(counts, accumulated, 256) +
-         PredictionCostSpatial(counts, 3, kExpValue);
+  int i;
+  int combo[256];
+  for (i = 0; i < 256; ++i) {
+    combo[i] = accumulated[i] + counts[i];
+  }
+  return ShannonEntropy(combo, 256) +
+         ShannonEntropy(counts, 256) +
+         PredictionCostSpatial(counts, 3, 2.4);  // Favor small absolute values.
 }
 
 static Multipliers GetBestColorTransformForTile(
@@ -909,75 +712,85 @@ static Multipliers GetBestColorTransformForTile(
   if (all_y_max > ysize) {
     all_y_max = ysize;
   }
-
   for (green_to_red = -64; green_to_red <= 64; green_to_red += halfstep) {
     int histo[256] = { 0 };
     int all_y;
+    Multipliers tx;
+    MultipliersClear(&tx);
+    tx.green_to_red_ = green_to_red & 0xff;
 
     for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) {
+      uint32_t predict;
       int ix = all_y * xsize + tile_x_offset;
       int all_x;
       for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
         if (SkipRepeatedPixels(argb, ix, xsize)) {
           continue;
         }
-        ++histo[TransformColorRed(green_to_red, argb[ix])];  // red.
+        predict = TransformColor(&tx, argb[ix], 0);
+        ++histo[(predict >> 16) & 0xff];  // red.
       }
     }
     cur_diff = PredictionCostCrossColor(&accumulated_red_histo[0], &histo[0]);
-    if ((uint8_t)green_to_red == prevX.green_to_red_) {
+    if (tx.green_to_red_ == prevX.green_to_red_) {
       cur_diff -= 3;  // favor keeping the areas locally similar
     }
-    if ((uint8_t)green_to_red == prevY.green_to_red_) {
+    if (tx.green_to_red_ == prevY.green_to_red_) {
       cur_diff -= 3;  // favor keeping the areas locally similar
     }
-    if (green_to_red == 0) {
+    if (tx.green_to_red_ == 0) {
       cur_diff -= 3;
     }
     if (cur_diff < best_diff) {
       best_diff = cur_diff;
-      best_tx.green_to_red_ = green_to_red;
+      best_tx = tx;
     }
   }
   best_diff = MAX_DIFF_COST;
+  green_to_red = best_tx.green_to_red_;
   for (green_to_blue = -32; green_to_blue <= 32; green_to_blue += step) {
     for (red_to_blue = -32; red_to_blue <= 32; red_to_blue += step) {
       int all_y;
       int histo[256] = { 0 };
+      Multipliers tx;
+      tx.green_to_red_ = green_to_red;
+      tx.green_to_blue_ = green_to_blue;
+      tx.red_to_blue_ = red_to_blue;
       for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) {
+        uint32_t predict;
         int all_x;
         int ix = all_y * xsize + tile_x_offset;
         for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
           if (SkipRepeatedPixels(argb, ix, xsize)) {
             continue;
           }
-          ++histo[TransformColorBlue(green_to_blue, red_to_blue, argb[ix])];
+          predict = TransformColor(&tx, argb[ix], 0);
+          ++histo[predict & 0xff];  // blue.
         }
       }
       cur_diff =
-          PredictionCostCrossColor(&accumulated_blue_histo[0], &histo[0]);
-      if ((uint8_t)green_to_blue == prevX.green_to_blue_) {
+        PredictionCostCrossColor(&accumulated_blue_histo[0], &histo[0]);
+      if (tx.green_to_blue_ == prevX.green_to_blue_) {
         cur_diff -= 3;  // favor keeping the areas locally similar
       }
-      if ((uint8_t)green_to_blue == prevY.green_to_blue_) {
+      if (tx.green_to_blue_ == prevY.green_to_blue_) {
         cur_diff -= 3;  // favor keeping the areas locally similar
       }
-      if ((uint8_t)red_to_blue == prevX.red_to_blue_) {
+      if (tx.red_to_blue_ == prevX.red_to_blue_) {
         cur_diff -= 3;  // favor keeping the areas locally similar
       }
-      if ((uint8_t)red_to_blue == prevY.red_to_blue_) {
+      if (tx.red_to_blue_ == prevY.red_to_blue_) {
         cur_diff -= 3;  // favor keeping the areas locally similar
       }
-      if (green_to_blue == 0) {
+      if (tx.green_to_blue_ == 0) {
         cur_diff -= 3;
       }
-      if (red_to_blue == 0) {
+      if (tx.red_to_blue_ == 0) {
         cur_diff -= 3;
       }
       if (cur_diff < best_diff) {
         best_diff = cur_diff;
-        best_tx.green_to_blue_ = green_to_blue;
-        best_tx.red_to_blue_ = red_to_blue;
+        best_tx = tx;
       }
     }
   }
@@ -1107,79 +920,54 @@ static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
 }
 
 // Separate out pixels packed together using pixel-bundling.
-// We define two methods for ARGB data (uint32_t) and alpha-only data (uint8_t).
-#define COLOR_INDEX_INVERSE(FUNC_NAME, TYPE, GET_INDEX, GET_VALUE)             \
-void FUNC_NAME(const VP8LTransform* const transform,                           \
-               int y_start, int y_end, const TYPE* src, TYPE* dst) {           \
-  int y;                                                                       \
-  const int bits_per_pixel = 8 >> transform->bits_;                            \
-  const int width = transform->xsize_;                                         \
-  const uint32_t* const color_map = transform->data_;                          \
-  if (bits_per_pixel < 8) {                                                    \
-    const int pixels_per_byte = 1 << transform->bits_;                         \
-    const int count_mask = pixels_per_byte - 1;                                \
-    const uint32_t bit_mask = (1 << bits_per_pixel) - 1;                       \
-    for (y = y_start; y < y_end; ++y) {                                        \
-      uint32_t packed_pixels = 0;                                              \
-      int x;                                                                   \
-      for (x = 0; x < width; ++x) {                                            \
-        /* We need to load fresh 'packed_pixels' once every                */  \
-        /* 'pixels_per_byte' increments of x. Fortunately, pixels_per_byte */  \
-        /* is a power of 2, so can just use a mask for that, instead of    */  \
-        /* decrementing a counter.                                         */  \
-        if ((x & count_mask) == 0) packed_pixels = GET_INDEX(*src++);          \
-        *dst++ = GET_VALUE(color_map[packed_pixels & bit_mask]);               \
-        packed_pixels >>= bits_per_pixel;                                      \
-      }                                                                        \
-    }                                                                          \
-  } else {                                                                     \
-    for (y = y_start; y < y_end; ++y) {                                        \
-      int x;                                                                   \
-      for (x = 0; x < width; ++x) {                                            \
-        *dst++ = GET_VALUE(color_map[GET_INDEX(*src++)]);                      \
-      }                                                                        \
-    }                                                                          \
-  }                                                                            \
-}
-
-static WEBP_INLINE uint32_t GetARGBIndex(uint32_t idx) {
-  return (idx >> 8) & 0xff;
-}
-
-static WEBP_INLINE uint8_t GetAlphaIndex(uint8_t idx) {
-  return idx;
-}
-
-static WEBP_INLINE uint32_t GetARGBValue(uint32_t val) {
-  return val;
-}
-
-static WEBP_INLINE uint8_t GetAlphaValue(uint32_t val) {
-  return (val >> 8) & 0xff;
+static void ColorIndexInverseTransform(
+    const VP8LTransform* const transform,
+    int y_start, int y_end, const uint32_t* src, uint32_t* dst) {
+  int y;
+  const int bits_per_pixel = 8 >> transform->bits_;
+  const int width = transform->xsize_;
+  const uint32_t* const color_map = transform->data_;
+  if (bits_per_pixel < 8) {
+    const int pixels_per_byte = 1 << transform->bits_;
+    const int count_mask = pixels_per_byte - 1;
+    const uint32_t bit_mask = (1 << bits_per_pixel) - 1;
+    for (y = y_start; y < y_end; ++y) {
+      uint32_t packed_pixels = 0;
+      int x;
+      for (x = 0; x < width; ++x) {
+        // We need to load fresh 'packed_pixels' once every 'pixels_per_byte'
+        // increments of x. Fortunately, pixels_per_byte is a power of 2, so
+        // can just use a mask for that, instead of decrementing a counter.
+        if ((x & count_mask) == 0) packed_pixels = ((*src++) >> 8) & 0xff;
+        *dst++ = color_map[packed_pixels & bit_mask];
+        packed_pixels >>= bits_per_pixel;
+      }
+    }
+  } else {
+    for (y = y_start; y < y_end; ++y) {
+      int x;
+      for (x = 0; x < width; ++x) {
+        *dst++ = color_map[((*src++) >> 8) & 0xff];
+      }
+    }
+  }
 }
 
-static COLOR_INDEX_INVERSE(ColorIndexInverseTransform, uint32_t, GetARGBIndex,
-                           GetARGBValue)
-COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, uint8_t, GetAlphaIndex,
-                    GetAlphaValue)
-
-#undef COLOR_INDEX_INVERSE
-
 void VP8LInverseTransform(const VP8LTransform* const transform,
                           int row_start, int row_end,
                           const uint32_t* const in, uint32_t* const out) {
-  const int width = transform->xsize_;
   assert(row_start < row_end);
   assert(row_end <= transform->ysize_);
   switch (transform->type_) {
     case SUBTRACT_GREEN:
-      VP8LAddGreenToBlueAndRed(out, out + (row_end - row_start) * width);
+      AddGreenToBlueAndRed(transform, row_start, row_end, out);
       break;
     case PREDICTOR_TRANSFORM:
       PredictorInverseTransform(transform, row_start, row_end, out);
       if (row_end != transform->ysize_) {
         // The last predicted row in this iteration will be the top-pred row
         // for the first row in next iteration.
+        const int width = transform->xsize_;
         memcpy(out - width, out + (row_end - row_start - 1) * width,
                width * sizeof(*out));
       }
@@ -1194,7 +982,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
         // Also, note that this is the only transform that applies on
         // the effective width of VP8LSubSampleSize(xsize_, bits_). All other
         // transforms work on effective width of xsize_.
-        const int out_stride = (row_end - row_start) * width;
+        const int out_stride = (row_end - row_start) * transform->xsize_;
         const int in_stride = (row_end - row_start) *
             VP8LSubSampleSize(transform->xsize_, transform->bits_);
         uint32_t* const src = out + out_stride - in_stride;
@@ -1246,15 +1034,8 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
   const uint32_t* const src_end = src + num_pixels;
   while (src < src_end) {
     const uint32_t argb = *src++;
-    const uint8_t rg = ((argb >> 16) & 0xf0) | ((argb >> 12) & 0xf);
-    const uint8_t ba = ((argb >>  0) & 0xf0) | ((argb >> 28) & 0xf);
-#ifdef WEBP_SWAP_16BIT_CSP
-    *dst++ = ba;
-    *dst++ = rg;
-#else
-    *dst++ = rg;
-    *dst++ = ba;
-#endif
+    *dst++ = ((argb >> 16) & 0xf0) | ((argb >> 12) & 0xf);
+    *dst++ = ((argb >>  0) & 0xf0) | ((argb >> 28) & 0xf);
   }
 }
 
@@ -1263,15 +1044,8 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
   const uint32_t* const src_end = src + num_pixels;
   while (src < src_end) {
     const uint32_t argb = *src++;
-    const uint8_t rg = ((argb >> 16) & 0xf8) | ((argb >> 13) & 0x7);
-    const uint8_t gb = ((argb >>  5) & 0xe0) | ((argb >>  3) & 0x1f);
-#ifdef WEBP_SWAP_16BIT_CSP
-    *dst++ = gb;
-    *dst++ = rg;
-#else
-    *dst++ = rg;
-    *dst++ = gb;
-#endif
+    *dst++ = ((argb >> 16) & 0xf8) | ((argb >> 13) & 0x7);
+    *dst++ = ((argb >>  5) & 0xe0) | ((argb >>  3) & 0x1f);
   }
 }
 
@@ -1292,34 +1066,20 @@ static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
     const uint32_t* const src_end = src + num_pixels;
     while (src < src_end) {
       uint32_t argb = *src++;
-
-#if !defined(__BIG_ENDIAN__)
-#if !defined(WEBP_REFERENCE_IMPLEMENTATION)
-#if defined(__i386__) || defined(__x86_64__)
+#if !defined(__BIG_ENDIAN__) && (defined(__i386__) || defined(__x86_64__))
       __asm__ volatile("bswap %0" : "=r"(argb) : "0"(argb));
       *(uint32_t*)dst = argb;
-#elif defined(_MSC_VER)
+      dst += sizeof(argb);
+#elif !defined(__BIG_ENDIAN__) && defined(_MSC_VER)
       argb = _byteswap_ulong(argb);
       *(uint32_t*)dst = argb;
+      dst += sizeof(argb);
 #else
-      dst[0] = (argb >> 24) & 0xff;
-      dst[1] = (argb >> 16) & 0xff;
-      dst[2] = (argb >>  8) & 0xff;
-      dst[3] = (argb >>  0) & 0xff;
-#endif
-#else  // WEBP_REFERENCE_IMPLEMENTATION
-      dst[0] = (argb >> 24) & 0xff;
-      dst[1] = (argb >> 16) & 0xff;
-      dst[2] = (argb >>  8) & 0xff;
-      dst[3] = (argb >>  0) & 0xff;
-#endif
-#else  // __BIG_ENDIAN__
-      dst[0] = (argb >>  0) & 0xff;
-      dst[1] = (argb >>  8) & 0xff;
-      dst[2] = (argb >> 16) & 0xff;
-      dst[3] = (argb >> 24) & 0xff;
+      *dst++ = (argb >> 24) & 0xff;
+      *dst++ = (argb >> 16) & 0xff;
+      *dst++ = (argb >>  8) & 0xff;
+      *dst++ = (argb >>  0) & 0xff;
 #endif
-      dst += sizeof(argb);
     }
   } else {
     memcpy(dst, src, num_pixels * sizeof(*src));
@@ -1371,162 +1131,8 @@ void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
   }
 }
 
-// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
-void VP8LBundleColorMap(const uint8_t* const row, int width,
-                        int xbits, uint32_t* const dst) {
-  int x;
-  if (xbits > 0) {
-    const int bit_depth = 1 << (3 - xbits);
-    const int mask = (1 << xbits) - 1;
-    uint32_t code = 0xff000000;
-    for (x = 0; x < width; ++x) {
-      const int xsub = x & mask;
-      if (xsub == 0) {
-        code = 0xff000000;
-      }
-      code |= row[x] << (8 + bit_depth * xsub);
-      dst[x >> xbits] = code;
-    }
-  } else {
-    for (x = 0; x < width; ++x) dst[x] = 0xff000000 | (row[x] << 8);
-  }
-}
-
-//------------------------------------------------------------------------------
-
-// TODO(vikasa): Move the SSE2 functions to lossless_dsp.c (new file), once
-// color-space conversion methods (ConvertFromBGRA) are also updated for SSE2.
-#if defined(WEBP_USE_SSE2)
-static WEBP_INLINE uint32_t ClampedAddSubtractFullSSE2(uint32_t c0, uint32_t c1,
-                                                       uint32_t c2) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
-  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
-  const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
-  const __m128i V1 = _mm_add_epi16(C0, C1);
-  const __m128i V2 = _mm_sub_epi16(V1, C2);
-  const __m128i b = _mm_packus_epi16(V2, V2);
-  const uint32_t output = _mm_cvtsi128_si32(b);
-  return output;
-}
-
-static WEBP_INLINE uint32_t ClampedAddSubtractHalfSSE2(uint32_t c0, uint32_t c1,
-                                                       uint32_t c2) {
-  const uint32_t ave = Average2(c0, c1);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ave), zero);
-  const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
-  const __m128i A1 = _mm_sub_epi16(A0, B0);
-  const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
-  const __m128i A2 = _mm_sub_epi16(A1, BgtA);
-  const __m128i A3 = _mm_srai_epi16(A2, 1);
-  const __m128i A4 = _mm_add_epi16(A0, A3);
-  const __m128i A5 = _mm_packus_epi16(A4, A4);
-  const uint32_t output = _mm_cvtsi128_si32(A5);
-  return output;
-}
-
-static WEBP_INLINE uint32_t SelectSSE2(uint32_t a, uint32_t b, uint32_t c) {
-  int pa_minus_pb;
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i A0 = _mm_cvtsi32_si128(a);
-  const __m128i B0 = _mm_cvtsi32_si128(b);
-  const __m128i C0 = _mm_cvtsi32_si128(c);
-  const __m128i AC0 = _mm_subs_epu8(A0, C0);
-  const __m128i CA0 = _mm_subs_epu8(C0, A0);
-  const __m128i BC0 = _mm_subs_epu8(B0, C0);
-  const __m128i CB0 = _mm_subs_epu8(C0, B0);
-  const __m128i AC = _mm_or_si128(AC0, CA0);
-  const __m128i BC = _mm_or_si128(BC0, CB0);
-  const __m128i pa = _mm_unpacklo_epi8(AC, zero);  // |a - c|
-  const __m128i pb = _mm_unpacklo_epi8(BC, zero);  // |b - c|
-  const __m128i diff = _mm_sub_epi16(pb, pa);
-  {
-    int16_t out[8];
-    _mm_storeu_si128((__m128i*)out, diff);
-    pa_minus_pb = out[0] + out[1] + out[2] + out[3];
-  }
-  return (pa_minus_pb <= 0) ? a : b;
-}
-
-static void SubtractGreenFromBlueAndRedSSE2(uint32_t* argb_data, int num_pixs) {
-  int i = 0;
-  const __m128i mask = _mm_set1_epi32(0x0000ff00);
-  for (; i + 4 < num_pixs; i += 4) {
-    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
-    const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
-    const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
-    const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
-    const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
-    const __m128i out = _mm_sub_epi8(in, in_0g0g);
-    _mm_storeu_si128((__m128i*)&argb_data[i], out);
-  }
-  // fallthrough and finish off with plain-C
-  for (; i < num_pixs; ++i) {
-    const uint32_t argb = argb_data[i];
-    const uint32_t green = (argb >> 8) & 0xff;
-    const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
-    const uint32_t new_b = ((argb & 0xff) - green) & 0xff;
-    argb_data[i] = (argb & 0xff00ff00) | (new_r << 16) | new_b;
-  }
-}
-
-static void AddGreenToBlueAndRedSSE2(uint32_t* data, const uint32_t* data_end) {
-  const __m128i mask = _mm_set1_epi32(0x0000ff00);
-  for (; data + 4 < data_end; data += 4) {
-    const __m128i in = _mm_loadu_si128((__m128i*)data);
-    const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
-    const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
-    const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
-    const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
-    const __m128i out = _mm_add_epi8(in, in_0g0g);
-    _mm_storeu_si128((__m128i*)data, out);
-  }
-  // fallthrough and finish off with plain-C
-  while (data < data_end) {
-    const uint32_t argb = *data;
-    const uint32_t green = ((argb >> 8) & 0xff);
-    uint32_t red_blue = (argb & 0x00ff00ffu);
-    red_blue += (green << 16) | green;
-    red_blue &= 0x00ff00ffu;
-    *data++ = (argb & 0xff00ff00u) | red_blue;
-  }
-}
-
-extern void VP8LDspInitSSE2(void);
-
-void VP8LDspInitSSE2(void) {
-  VP8LClampedAddSubtractFull = ClampedAddSubtractFullSSE2;
-  VP8LClampedAddSubtractHalf = ClampedAddSubtractHalfSSE2;
-  VP8LSelect = SelectSSE2;
-  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRedSSE2;
-  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRedSSE2;
-}
-#endif
 //------------------------------------------------------------------------------
 
-VP8LPredClampedAddSubFunc VP8LClampedAddSubtractFull;
-VP8LPredClampedAddSubFunc VP8LClampedAddSubtractHalf;
-VP8LPredSelectFunc VP8LSelect;
-VP8LSubtractGreenFromBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
-VP8LAddGreenToBlueAndRedFunc VP8LAddGreenToBlueAndRed;
-
-void VP8LDspInit(void) {
-  VP8LClampedAddSubtractFull = ClampedAddSubtractFull;
-  VP8LClampedAddSubtractHalf = ClampedAddSubtractHalf;
-  VP8LSelect = Select;
-  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
-  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
-
-  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
-  if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
-    if (VP8GetCPUInfo(kSSE2)) {
-      VP8LDspInitSSE2();
-    }
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
 #endif
-  }
-}
-
-//------------------------------------------------------------------------------
-
diff --git a/drivers/webp/dsp/lossless.h b/drivers/webp/dsp/lossless.h
index 0f1d44200b..992516fcdf 100644
--- a/drivers/webp/dsp/lossless.h
+++ b/drivers/webp/dsp/lossless.h
@@ -1,10 +1,8 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Image transforms and color space conversion methods for lossless decoder.
@@ -18,31 +16,11 @@
 #include "../webp/types.h"
 #include "../webp/decode.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
 //------------------------------------------------------------------------------
-//
-
-typedef uint32_t (*VP8LPredClampedAddSubFunc)(uint32_t c0, uint32_t c1,
-                                              uint32_t c2);
-typedef uint32_t (*VP8LPredSelectFunc)(uint32_t c0, uint32_t c1, uint32_t c2);
-typedef void (*VP8LSubtractGreenFromBlueAndRedFunc)(uint32_t* argb_data,
-                                                    int num_pixs);
-typedef void (*VP8LAddGreenToBlueAndRedFunc)(uint32_t* data_start,
-                                             const uint32_t* data_end);
-
-extern VP8LPredClampedAddSubFunc VP8LClampedAddSubtractFull;
-extern VP8LPredClampedAddSubFunc VP8LClampedAddSubtractHalf;
-extern VP8LPredSelectFunc VP8LSelect;
-extern VP8LSubtractGreenFromBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
-extern VP8LAddGreenToBlueAndRedFunc VP8LAddGreenToBlueAndRed;
-
-// Must be called before calling any of the above methods.
-void VP8LDspInit(void);
-
-//------------------------------------------------------------------------------
 // Image transforms.
 
 struct VP8LTransform;  // Defined in dec/vp8li.h.
@@ -55,12 +33,8 @@ void VP8LInverseTransform(const struct VP8LTransform* const transform,
                           int row_start, int row_end,
                           const uint32_t* const in, uint32_t* const out);
 
-// Similar to the static method ColorIndexInverseTransform() that is part of
-// lossless.c, but used only for alpha decoding. It takes uint8_t (rather than
-// uint32_t) arguments for 'src' and 'dst'.
-void VP8LColorIndexInverseTransformAlpha(
-    const struct VP8LTransform* const transform, int y_start, int y_end,
-    const uint8_t* src, uint8_t* dst);
+// Subtracts green from blue and red channels.
+void VP8LSubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs);
 
 void VP8LResidualImage(int width, int height, int bits,
                        uint32_t* const argb, uint32_t* const argb_scratch,
@@ -85,119 +59,10 @@ static WEBP_INLINE uint32_t VP8LSubSampleSize(uint32_t size,
   return (size + (1 << sampling_bits) - 1) >> sampling_bits;
 }
 
-// Faster logarithm for integers. Small values use a look-up table.
-#define LOG_LOOKUP_IDX_MAX 256
-extern const float kLog2Table[LOG_LOOKUP_IDX_MAX];
-extern const float kSLog2Table[LOG_LOOKUP_IDX_MAX];
-float VP8LFastLog2Slow(int v);
-float VP8LFastSLog2Slow(int v);
-static WEBP_INLINE float VP8LFastLog2(int v) {
-  return (v < LOG_LOOKUP_IDX_MAX) ? kLog2Table[v] : VP8LFastLog2Slow(v);
-}
+// Faster logarithm for integers, with the property of log2(0) == 0.
+float VP8LFastLog2(int v);
 // Fast calculation of v * log2(v) for integer input.
-static WEBP_INLINE float VP8LFastSLog2(int v) {
-  return (v < LOG_LOOKUP_IDX_MAX) ? kSLog2Table[v] : VP8LFastSLog2Slow(v);
-}
-
-// -----------------------------------------------------------------------------
-// PrefixEncode()
-
-// use GNU builtins where available.
-#if defined(__GNUC__) && \
-    ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
-static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
-  return 31 ^ __builtin_clz(n);
-}
-#elif defined(_MSC_VER) && _MSC_VER > 1310 && \
-      (defined(_M_X64) || defined(_M_IX86))
-#include <intrin.h>
-#pragma intrinsic(_BitScanReverse)
-
-static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
-  unsigned long first_set_bit;
-  _BitScanReverse(&first_set_bit, n);
-  return first_set_bit;
-}
-#else
-// Returns (int)floor(log2(n)). n must be > 0.
-static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
-  int log = 0;
-  uint32_t value = n;
-  int i;
-
-  for (i = 4; i >= 0; --i) {
-    const int shift = (1 << i);
-    const uint32_t x = value >> shift;
-    if (x != 0) {
-      value = x;
-      log += shift;
-    }
-  }
-  return log;
-}
-#endif
-
-static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
-  const int log_floor = BitsLog2Floor(n);
-  if (n == (n & ~(n - 1)))  // zero or a power of two.
-    return log_floor;
-  else
-    return log_floor + 1;
-}
-
-// Splitting of distance and length codes into prefixes and
-// extra bits. The prefixes are encoded with an entropy code
-// while the extra bits are stored just as normal bits.
-static WEBP_INLINE void VP8LPrefixEncodeBitsNoLUT(int distance, int* const code,
-                                                  int* const extra_bits) {
-  const int highest_bit = BitsLog2Floor(--distance);
-  const int second_highest_bit = (distance >> (highest_bit - 1)) & 1;
-  *extra_bits = highest_bit - 1;
-  *code = 2 * highest_bit + second_highest_bit;
-}
-
-static WEBP_INLINE void VP8LPrefixEncodeNoLUT(int distance, int* const code,
-                                              int* const extra_bits,
-                                              int* const extra_bits_value) {
-  const int highest_bit = BitsLog2Floor(--distance);
-  const int second_highest_bit = (distance >> (highest_bit - 1)) & 1;
-  *extra_bits = highest_bit - 1;
-  *extra_bits_value = distance & ((1 << *extra_bits) - 1);
-  *code = 2 * highest_bit + second_highest_bit;
-}
-
-#define PREFIX_LOOKUP_IDX_MAX   512
-typedef struct {
-  int8_t code_;
-  int8_t extra_bits_;
-} VP8LPrefixCode;
-
-// These tables are derived using VP8LPrefixEncodeNoLUT.
-extern const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX];
-extern const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX];
-static WEBP_INLINE void VP8LPrefixEncodeBits(int distance, int* const code,
-                                             int* const extra_bits) {
-  if (distance < PREFIX_LOOKUP_IDX_MAX) {
-    const VP8LPrefixCode prefix_code = kPrefixEncodeCode[distance];
-    *code = prefix_code.code_;
-    *extra_bits = prefix_code.extra_bits_;
-  } else {
-    VP8LPrefixEncodeBitsNoLUT(distance, code, extra_bits);
-  }
-}
-
-static WEBP_INLINE void VP8LPrefixEncode(int distance, int* const code,
-                                         int* const extra_bits,
-                                         int* const extra_bits_value) {
-  if (distance < PREFIX_LOOKUP_IDX_MAX) {
-    const VP8LPrefixCode prefix_code = kPrefixEncodeCode[distance];
-    *code = prefix_code.code_;
-    *extra_bits = prefix_code.extra_bits_;
-    *extra_bits_value = kPrefixEncodeExtraBitsValue[distance];
-  } else {
-    VP8LPrefixEncodeNoLUT(distance, code, extra_bits, extra_bits_value);
-  }
-}
+static WEBP_INLINE float VP8LFastSLog2(int v) { return VP8LFastLog2(v) * v; }
 
 // In-place difference of each component with mod 256.
 static WEBP_INLINE uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
@@ -208,12 +73,9 @@ static WEBP_INLINE uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
   return (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
 }
 
-void VP8LBundleColorMap(const uint8_t* const row, int width,
-                        int xbits, uint32_t* const dst);
-
 //------------------------------------------------------------------------------
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/dsp/upsampling.c b/drivers/webp/dsp/upsampling.c
index 978e3ce250..4855eb1432 100644
--- a/drivers/webp/dsp/upsampling.c
+++ b/drivers/webp/dsp/upsampling.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // YUV to RGB upsampling functions.
@@ -14,7 +12,9 @@
 #include "./dsp.h"
 #include "./yuv.h"
 
-#include <assert.h>
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
 
 //------------------------------------------------------------------------------
 // Fancy upsampler
@@ -32,7 +32,7 @@ WebPUpsampleLinePairFunc WebPUpsamplers[MODE_LAST];
 //  ([3*a +   b + 9*c + 3*d      a + 3*b + 3*c + 9*d]   [8 8]) / 16
 
 // We process u and v together stashed into 32bit (16bit each).
-#define LOAD_UV(u, v) ((u) | ((v) << 16))
+#define LOAD_UV(u,v) ((u) | ((v) << 16))
 
 #define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                                  \
 static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
@@ -43,12 +43,11 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
   const int last_pixel_pair = (len - 1) >> 1;                                  \
   uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]);   /* top-left sample */        \
   uint32_t l_uv  = LOAD_UV(cur_u[0], cur_v[0]);   /* left-sample */            \
-  assert(top_y != NULL);                                                       \
-  {                                                                            \
+  if (top_y) {                                                                 \
     const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;                \
     FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst);                          \
   }                                                                            \
-  if (bottom_y != NULL) {                                                      \
+  if (bottom_y) {                                                              \
     const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;                \
     FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst);                    \
   }                                                                            \
@@ -59,7 +58,7 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
     const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u;               \
     const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3;                   \
     const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3;                    \
-    {                                                                          \
+    if (top_y) {                                                               \
       const uint32_t uv0 = (diag_12 + tl_uv) >> 1;                             \
       const uint32_t uv1 = (diag_03 + t_uv) >> 1;                              \
       FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                          \
@@ -67,7 +66,7 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
       FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16),                          \
            top_dst + (2 * x - 0) * XSTEP);                                     \
     }                                                                          \
-    if (bottom_y != NULL) {                                                    \
+    if (bottom_y) {                                                            \
       const uint32_t uv0 = (diag_03 + l_uv) >> 1;                              \
       const uint32_t uv1 = (diag_12 + uv) >> 1;                                \
       FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                       \
@@ -79,12 +78,12 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
     l_uv = uv;                                                                 \
   }                                                                            \
   if (!(len & 1)) {                                                            \
-    {                                                                          \
+    if (top_y) {                                                               \
       const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;              \
       FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16),                            \
            top_dst + (len - 1) * XSTEP);                                       \
     }                                                                          \
-    if (bottom_y != NULL) {                                                    \
+    if (bottom_y) {                                                            \
       const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;              \
       FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16),                         \
            bottom_dst + (len - 1) * XSTEP);                                    \
@@ -167,8 +166,7 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y,              \
                       uint8_t* top_dst, uint8_t* bot_dst, int len) {           \
   const int half_len = len >> 1;                                               \
   int x;                                                                       \
-  assert(top_dst != NULL);                                                     \
-  {                                                                            \
+  if (top_dst != NULL) {                                                       \
     for (x = 0; x < half_len; ++x) {                                           \
       FUNC(top_y[2 * x + 0], top_u[x], top_v[x], top_dst + 8 * x + 0);         \
       FUNC(top_y[2 * x + 1], top_u[x], top_v[x], top_dst + 8 * x + 4);         \
@@ -330,11 +328,6 @@ void WebPInitUpsamplers(void) {
       WebPInitUpsamplersSSE2();
     }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      WebPInitUpsamplersNEON();
-    }
-#endif
   }
 #endif  // FANCY_UPSAMPLING
 }
@@ -355,12 +348,10 @@ void WebPInitPremultiply(void) {
       WebPInitPremultiplySSE2();
     }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      WebPInitPremultiplyNEON();
-    }
-#endif
   }
 #endif  // FANCY_UPSAMPLING
 }
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/dsp/upsampling_neon.c b/drivers/webp/dsp/upsampling_neon.c
deleted file mode 100644
index 791222f81e..0000000000
--- a/drivers/webp/dsp/upsampling_neon.c
+++ /dev/null
@@ -1,265 +0,0 @@
-// Copyright 2011 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// NEON version of YUV to RGB upsampling functions.
-//
-// Author: mans@mansr.com (Mans Rullgard)
-// Based on SSE code by: somnath@google.com (Somnath Banerjee)
-
-#include "./dsp.h"
-
-#if defined(WEBP_USE_NEON)
-
-#include <assert.h>
-#include <arm_neon.h>
-#include <string.h>
-#include "./yuv.h"
-
-#ifdef FANCY_UPSAMPLING
-
-//-----------------------------------------------------------------------------
-// U/V upsampling
-
-// Loads 9 pixels each from rows r1 and r2 and generates 16 pixels.
-#define UPSAMPLE_16PIXELS(r1, r2, out) {                                \
-  uint8x8_t a = vld1_u8(r1);                                            \
-  uint8x8_t b = vld1_u8(r1 + 1);                                        \
-  uint8x8_t c = vld1_u8(r2);                                            \
-  uint8x8_t d = vld1_u8(r2 + 1);                                        \
-                                                                        \
-  uint16x8_t al = vshll_n_u8(a, 1);                                     \
-  uint16x8_t bl = vshll_n_u8(b, 1);                                     \
-  uint16x8_t cl = vshll_n_u8(c, 1);                                     \
-  uint16x8_t dl = vshll_n_u8(d, 1);                                     \
-                                                                        \
-  uint8x8_t diag1, diag2;                                               \
-  uint16x8_t sl;                                                        \
-                                                                        \
-  /* a + b + c + d */                                                   \
-  sl = vaddl_u8(a,  b);                                                 \
-  sl = vaddw_u8(sl, c);                                                 \
-  sl = vaddw_u8(sl, d);                                                 \
-                                                                        \
-  al = vaddq_u16(sl, al); /* 3a +  b +  c +  d */                       \
-  bl = vaddq_u16(sl, bl); /*  a + 3b +  c +  d */                       \
-                                                                        \
-  al = vaddq_u16(al, dl); /* 3a +  b +  c + 3d */                       \
-  bl = vaddq_u16(bl, cl); /*  a + 3b + 3c +  d */                       \
-                                                                        \
-  diag2 = vshrn_n_u16(al, 3);                                           \
-  diag1 = vshrn_n_u16(bl, 3);                                           \
-                                                                        \
-  a = vrhadd_u8(a, diag1);                                              \
-  b = vrhadd_u8(b, diag2);                                              \
-  c = vrhadd_u8(c, diag2);                                              \
-  d = vrhadd_u8(d, diag1);                                              \
-                                                                        \
-  {                                                                     \
-    const uint8x8x2_t a_b = {{ a, b }};                                 \
-    const uint8x8x2_t c_d = {{ c, d }};                                 \
-    vst2_u8(out,      a_b);                                             \
-    vst2_u8(out + 32, c_d);                                             \
-  }                                                                     \
-}
-
-// Turn the macro into a function for reducing code-size when non-critical
-static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
-                             uint8_t *out) {
-  UPSAMPLE_16PIXELS(r1, r2, out);
-}
-
-#define UPSAMPLE_LAST_BLOCK(tb, bb, num_pixels, out) {                  \
-  uint8_t r1[9], r2[9];                                                 \
-  memcpy(r1, (tb), (num_pixels));                                       \
-  memcpy(r2, (bb), (num_pixels));                                       \
-  /* replicate last byte */                                             \
-  memset(r1 + (num_pixels), r1[(num_pixels) - 1], 9 - (num_pixels));    \
-  memset(r2 + (num_pixels), r2[(num_pixels) - 1], 9 - (num_pixels));    \
-  Upsample16Pixels(r1, r2, out);                                        \
-}
-
-//-----------------------------------------------------------------------------
-// YUV->RGB conversion
-
-static const int16_t kCoeffs[4] = { kYScale, kVToR, kUToG, kVToG };
-
-#define v255 vmov_n_u8(255)
-
-#define STORE_Rgb(out, r, g, b) do {                                    \
-  const uint8x8x3_t r_g_b = {{ r, g, b }};                              \
-  vst3_u8(out, r_g_b);                                                  \
-} while (0)
-
-#define STORE_Bgr(out, r, g, b) do {                                    \
-  const uint8x8x3_t b_g_r = {{ b, g, r }};                              \
-  vst3_u8(out, b_g_r);                                                  \
-} while (0)
-
-#define STORE_Rgba(out, r, g, b) do {                                   \
-  const uint8x8x4_t r_g_b_v255 = {{ r, g, b, v255 }};                   \
-  vst4_u8(out, r_g_b_v255);                                             \
-} while (0)
-
-#define STORE_Bgra(out, r, g, b) do {                                   \
-  const uint8x8x4_t b_g_r_v255 = {{ b, g, r, v255 }};                   \
-  vst4_u8(out, b_g_r_v255);                                             \
-} while (0)
-
-#define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) {            \
-  int i;                                                                \
-  for (i = 0; i < N; i += 8) {                                          \
-    const int off = ((cur_x) + i) * XSTEP;                              \
-    uint8x8_t y  = vld1_u8((src_y) + (cur_x)  + i);                     \
-    uint8x8_t u  = vld1_u8((src_uv) + i);                               \
-    uint8x8_t v  = vld1_u8((src_uv) + i + 16);                          \
-    const int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16));       \
-    const int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128));      \
-    const int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128));      \
-    int32x4_t yl = vmull_lane_s16(vget_low_s16(yy),  cf16, 0);          \
-    int32x4_t yh = vmull_lane_s16(vget_high_s16(yy), cf16, 0);          \
-    const int32x4_t rl = vmlal_lane_s16(yl, vget_low_s16(vv),  cf16, 1);\
-    const int32x4_t rh = vmlal_lane_s16(yh, vget_high_s16(vv), cf16, 1);\
-    int32x4_t gl = vmlsl_lane_s16(yl, vget_low_s16(uu),  cf16, 2);      \
-    int32x4_t gh = vmlsl_lane_s16(yh, vget_high_s16(uu), cf16, 2);      \
-    const int32x4_t bl = vmovl_s16(vget_low_s16(uu));                   \
-    const int32x4_t bh = vmovl_s16(vget_high_s16(uu));                  \
-    gl = vmlsl_lane_s16(gl, vget_low_s16(vv),  cf16, 3);                \
-    gh = vmlsl_lane_s16(gh, vget_high_s16(vv), cf16, 3);                \
-    yl = vmlaq_lane_s32(yl, bl, cf32, 0);                               \
-    yh = vmlaq_lane_s32(yh, bh, cf32, 0);                               \
-    /* vrshrn_n_s32() already incorporates the rounding constant */     \
-    y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, YUV_FIX2),            \
-                                 vrshrn_n_s32(rh, YUV_FIX2)));          \
-    u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, YUV_FIX2),            \
-                                 vrshrn_n_s32(gh, YUV_FIX2)));          \
-    v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(yl, YUV_FIX2),            \
-                                 vrshrn_n_s32(yh, YUV_FIX2)));          \
-    STORE_ ## FMT(out + off, y, u, v);                                  \
-  }                                                                     \
-}
-
-#define CONVERT1(FUNC, XSTEP, N, src_y, src_uv, rgb, cur_x) {           \
-  int i;                                                                \
-  for (i = 0; i < N; i++) {                                             \
-    const int off = ((cur_x) + i) * XSTEP;                              \
-    const int y = src_y[(cur_x) + i];                                   \
-    const int u = (src_uv)[i];                                          \
-    const int v = (src_uv)[i + 16];                                     \
-    FUNC(y, u, v, rgb + off);                                           \
-  }                                                                     \
-}
-
-#define CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, uv,                  \
-                      top_dst, bottom_dst, cur_x, len) {                \
-  CONVERT8(FMT, XSTEP, len, top_y, uv, top_dst, cur_x)                  \
-  if (bottom_y != NULL) {                                               \
-    CONVERT8(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x)   \
-  }                                                                     \
-}
-
-#define CONVERT2RGB_1(FUNC, XSTEP, top_y, bottom_y, uv,                 \
-                      top_dst, bottom_dst, cur_x, len) {                \
-  CONVERT1(FUNC, XSTEP, len, top_y, uv, top_dst, cur_x);                \
-  if (bottom_y != NULL) {                                               \
-    CONVERT1(FUNC, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x); \
-  }                                                                     \
-}
-
-#define NEON_UPSAMPLE_FUNC(FUNC_NAME, FMT, XSTEP)                       \
-static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y,    \
-                      const uint8_t *top_u, const uint8_t *top_v,       \
-                      const uint8_t *cur_u, const uint8_t *cur_v,       \
-                      uint8_t *top_dst, uint8_t *bottom_dst, int len) { \
-  int block;                                                            \
-  /* 16 byte aligned array to cache reconstructed u and v */            \
-  uint8_t uv_buf[2 * 32 + 15];                                          \
-  uint8_t *const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);     \
-  const int uv_len = (len + 1) >> 1;                                    \
-  /* 9 pixels must be read-able for each block */                       \
-  const int num_blocks = (uv_len - 1) >> 3;                             \
-  const int leftover = uv_len - num_blocks * 8;                         \
-  const int last_pos = 1 + 16 * num_blocks;                             \
-                                                                        \
-  const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;                  \
-  const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;                  \
-                                                                        \
-  const int16x4_t cf16 = vld1_s16(kCoeffs);                             \
-  const int32x2_t cf32 = vmov_n_s32(kUToB);                             \
-  const uint8x8_t u16  = vmov_n_u8(16);                                 \
-  const uint8x8_t u128 = vmov_n_u8(128);                                \
-                                                                        \
-  /* Treat the first pixel in regular way */                            \
-  assert(top_y != NULL);                                                \
-  {                                                                     \
-    const int u0 = (top_u[0] + u_diag) >> 1;                            \
-    const int v0 = (top_v[0] + v_diag) >> 1;                            \
-    VP8YuvTo ## FMT(top_y[0], u0, v0, top_dst);                         \
-  }                                                                     \
-  if (bottom_y != NULL) {                                               \
-    const int u0 = (cur_u[0] + u_diag) >> 1;                            \
-    const int v0 = (cur_v[0] + v_diag) >> 1;                            \
-    VP8YuvTo ## FMT(bottom_y[0], u0, v0, bottom_dst);                   \
-  }                                                                     \
-                                                                        \
-  for (block = 0; block < num_blocks; ++block) {                        \
-    UPSAMPLE_16PIXELS(top_u, cur_u, r_uv);                              \
-    UPSAMPLE_16PIXELS(top_v, cur_v, r_uv + 16);                         \
-    CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, r_uv,                    \
-                  top_dst, bottom_dst, 16 * block + 1, 16);             \
-    top_u += 8;                                                         \
-    cur_u += 8;                                                         \
-    top_v += 8;                                                         \
-    cur_v += 8;                                                         \
-  }                                                                     \
-                                                                        \
-  UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv);                    \
-  UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 16);               \
-  CONVERT2RGB_1(VP8YuvTo ## FMT, XSTEP, top_y, bottom_y, r_uv,          \
-                top_dst, bottom_dst, last_pos, len - last_pos);         \
-}
-
-// NEON variants of the fancy upsampler.
-NEON_UPSAMPLE_FUNC(UpsampleRgbLinePairNEON,  Rgb,  3)
-NEON_UPSAMPLE_FUNC(UpsampleBgrLinePairNEON,  Bgr,  3)
-NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePairNEON, Rgba, 4)
-NEON_UPSAMPLE_FUNC(UpsampleBgraLinePairNEON, Bgra, 4)
-
-#endif  // FANCY_UPSAMPLING
-
-#endif   // WEBP_USE_NEON
-
-//------------------------------------------------------------------------------
-
-#ifdef FANCY_UPSAMPLING
-
-extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
-
-void WebPInitUpsamplersNEON(void) {
-#if defined(WEBP_USE_NEON)
-  WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePairNEON;
-  WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairNEON;
-  WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePairNEON;
-  WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePairNEON;
-#endif   // WEBP_USE_NEON
-}
-
-void WebPInitPremultiplyNEON(void) {
-#if defined(WEBP_USE_NEON)
-  WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePairNEON;
-  WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePairNEON;
-#endif   // WEBP_USE_NEON
-}
-
-#else
-
-// this empty function is to avoid an empty .o
-void WebPInitPremultiplyNEON(void) {}
-
-#endif  // FANCY_UPSAMPLING
-
diff --git a/drivers/webp/dsp/upsampling_sse2.c b/drivers/webp/dsp/upsampling_sse2.c
index 0db0798c6d..8cb275a02b 100644
--- a/drivers/webp/dsp/upsampling_sse2.c
+++ b/drivers/webp/dsp/upsampling_sse2.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // SSE2 version of YUV to RGB upsampling functions.
@@ -20,6 +18,10 @@
 #include <string.h>
 #include "./yuv.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 #ifdef FANCY_UPSAMPLING
 
 // We compute (9*a + 3*b + 3*c + d + 8) / 16 as follows
@@ -47,14 +49,14 @@
   (out) = _mm_sub_epi8(tmp0, tmp4);    /* (k + in + 1) / 2 - lsb_correction */ \
 } while (0)
 
-// pack and store two alternating pixel rows
+// pack and store two alternating pixel rows
 #define PACK_AND_STORE(a, b, da, db, out) do {                                 \
-  const __m128i t_a = _mm_avg_epu8(a, da);  /* (9a + 3b + 3c +  d + 8) / 16 */ \
-  const __m128i t_b = _mm_avg_epu8(b, db);  /* (3a + 9b +  c + 3d + 8) / 16 */ \
-  const __m128i t_1 = _mm_unpacklo_epi8(t_a, t_b);                             \
-  const __m128i t_2 = _mm_unpackhi_epi8(t_a, t_b);                             \
-  _mm_store_si128(((__m128i*)(out)) + 0, t_1);                                 \
-  _mm_store_si128(((__m128i*)(out)) + 1, t_2);                                 \
+  const __m128i ta = _mm_avg_epu8(a, da);  /* (9a + 3b + 3c +  d + 8) / 16 */  \
+  const __m128i tb = _mm_avg_epu8(b, db);  /* (3a + 9b +  c + 3d + 8) / 16 */  \
+  const __m128i t1 = _mm_unpacklo_epi8(ta, tb);                                \
+  const __m128i t2 = _mm_unpackhi_epi8(ta, tb);                                \
+  _mm_store_si128(((__m128i*)(out)) + 0, t1);                                  \
+  _mm_store_si128(((__m128i*)(out)) + 1, t2);                                  \
 } while (0)
 
 // Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.
@@ -83,8 +85,8 @@
   GET_M(ad, s, diag2);                  /* diag2 = (3a + b + c + 3d) / 8 */    \
                                                                                \
   /* pack the alternate pixels */                                              \
-  PACK_AND_STORE(a, b, diag1, diag2, out +      0);  /* store top */           \
-  PACK_AND_STORE(c, d, diag2, diag1, out + 2 * 32);  /* store bottom */        \
+  PACK_AND_STORE(a, b, diag1, diag2, &(out)[0 * 32]);                          \
+  PACK_AND_STORE(c, d, diag2, diag1, &(out)[2 * 32]);                          \
 }
 
 // Turn the macro into a function for reducing code-size when non-critical
@@ -104,68 +106,69 @@ static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],
   Upsample32Pixels(r1, r2, out);                                               \
 }
 
-#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y,                              \
+#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, uv,                          \
                     top_dst, bottom_dst, cur_x, num_pixels) {                  \
   int n;                                                                       \
-  for (n = 0; n < (num_pixels); ++n) {                                         \
-    FUNC(top_y[(cur_x) + n], r_u[n], r_v[n],                                   \
-         top_dst + ((cur_x) + n) * XSTEP);                                     \
+  if (top_y) {                                                                 \
+    for (n = 0; n < (num_pixels); ++n) {                                       \
+      FUNC(top_y[(cur_x) + n], (uv)[n], (uv)[32 + n],                          \
+           top_dst + ((cur_x) + n) * XSTEP);                                   \
+    }                                                                          \
   }                                                                            \
-  if (bottom_y != NULL) {                                                      \
+  if (bottom_y) {                                                              \
     for (n = 0; n < (num_pixels); ++n) {                                       \
-      FUNC(bottom_y[(cur_x) + n], r_u[64 + n], r_v[64 + n],                    \
+      FUNC(bottom_y[(cur_x) + n], (uv)[64 + n], (uv)[64 + 32 + n],             \
            bottom_dst + ((cur_x) + n) * XSTEP);                                \
     }                                                                          \
   }                                                                            \
 }
 
-#define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y,                           \
-                       top_dst, bottom_dst, cur_x) do {                        \
-  FUNC##32(top_y + (cur_x), r_u, r_v, top_dst + (cur_x) * XSTEP);              \
-  if (bottom_y != NULL) {                                                      \
-    FUNC##32(bottom_y + (cur_x), r_u + 64, r_v + 64,                           \
-             bottom_dst + (cur_x) * XSTEP);                                    \
-  }                                                                            \
-} while (0)
-
 #define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                             \
 static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
                       const uint8_t* top_u, const uint8_t* top_v,              \
                       const uint8_t* cur_u, const uint8_t* cur_v,              \
                       uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
-  int uv_pos, pos;                                                             \
-  /* 16byte-aligned array to cache reconstructed u and v */                    \
+  int b;                                                                       \
+  /* 16 byte aligned array to cache reconstructed u and v */                   \
   uint8_t uv_buf[4 * 32 + 15];                                                 \
-  uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);             \
-  uint8_t* const r_v = r_u + 32;                                               \
+  uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);            \
+  const int uv_len = (len + 1) >> 1;                                           \
+  /* 17 pixels must be readable for each block */                              \
+  const int num_blocks = (uv_len - 1) >> 4;                                    \
+  const int leftover = uv_len - num_blocks * 16;                               \
+  const int last_pos = 1 + 32 * num_blocks;                                    \
                                                                                \
-  assert(top_y != NULL);                                                       \
-  {   /* Treat the first pixel in regular way */                               \
-    const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;                       \
-    const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;                       \
-    const int u0_t = (top_u[0] + u_diag) >> 1;                                 \
-    const int v0_t = (top_v[0] + v_diag) >> 1;                                 \
-    FUNC(top_y[0], u0_t, v0_t, top_dst);                                       \
-    if (bottom_y != NULL) {                                                    \
-      const int u0_b = (cur_u[0] + u_diag) >> 1;                               \
-      const int v0_b = (cur_v[0] + v_diag) >> 1;                               \
-      FUNC(bottom_y[0], u0_b, v0_b, bottom_dst);                               \
-    }                                                                          \
+  const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;                         \
+  const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;                         \
+                                                                               \
+  assert(len > 0);                                                             \
+  /* Treat the first pixel in regular way */                                   \
+  if (top_y) {                                                                 \
+    const int u0 = (top_u[0] + u_diag) >> 1;                                   \
+    const int v0 = (top_v[0] + v_diag) >> 1;                                   \
+    FUNC(top_y[0], u0, v0, top_dst);                                           \
   }                                                                            \
-  /* For UPSAMPLE_32PIXELS, 17 u/v values must be read-able for each block */  \
-  for (pos = 1, uv_pos = 0; pos + 32 + 1 <= len; pos += 32, uv_pos += 16) {    \
-    UPSAMPLE_32PIXELS(top_u + uv_pos, cur_u + uv_pos, r_u);                    \
-    UPSAMPLE_32PIXELS(top_v + uv_pos, cur_v + uv_pos, r_v);                    \
-    CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, pos);    \
+  if (bottom_y) {                                                              \
+    const int u0 = (cur_u[0] + u_diag) >> 1;                                   \
+    const int v0 = (cur_v[0] + v_diag) >> 1;                                   \
+    FUNC(bottom_y[0], u0, v0, bottom_dst);                                     \
   }                                                                            \
-  if (len > 1) {                                                               \
-    const int left_over = ((len + 1) >> 1) - (pos >> 1);                       \
-    assert(left_over > 0);                                                     \
-    UPSAMPLE_LAST_BLOCK(top_u + uv_pos, cur_u + uv_pos, left_over, r_u);       \
-    UPSAMPLE_LAST_BLOCK(top_v + uv_pos, cur_v + uv_pos, left_over, r_v);       \
-    CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst,             \
-                pos, len - pos);                                               \
+                                                                               \
+  for (b = 0; b < num_blocks; ++b) {                                           \
+    UPSAMPLE_32PIXELS(top_u, cur_u, r_uv + 0 * 32);                            \
+    UPSAMPLE_32PIXELS(top_v, cur_v, r_uv + 1 * 32);                            \
+    CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst,       \
+                32 * b + 1, 32)                                                \
+    top_u += 16;                                                               \
+    cur_u += 16;                                                               \
+    top_v += 16;                                                               \
+    cur_v += 16;                                                               \
   }                                                                            \
+                                                                               \
+  UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv + 0 * 32);                  \
+  UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 1 * 32);                  \
+  CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst,         \
+              last_pos, len - last_pos);                                       \
 }
 
 // SSE2 variants of the fancy upsampler.
@@ -179,40 +182,28 @@ SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePairSSE2, VP8YuvToBgra, 4)
 #undef UPSAMPLE_32PIXELS
 #undef UPSAMPLE_LAST_BLOCK
 #undef CONVERT2RGB
-#undef CONVERT2RGB_32
 #undef SSE2_UPSAMPLE_FUNC
 
-#endif  // FANCY_UPSAMPLING
-
-#endif   // WEBP_USE_SSE2
-
 //------------------------------------------------------------------------------
 
-#ifdef FANCY_UPSAMPLING
-
 extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
 
 void WebPInitUpsamplersSSE2(void) {
-#if defined(WEBP_USE_SSE2)
-  VP8YUVInitSSE2();
   WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePairSSE2;
   WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairSSE2;
   WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePairSSE2;
   WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePairSSE2;
-#endif   // WEBP_USE_SSE2
 }
 
 void WebPInitPremultiplySSE2(void) {
-#if defined(WEBP_USE_SSE2)
   WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePairSSE2;
   WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePairSSE2;
-#endif   // WEBP_USE_SSE2
 }
 
-#else
-
-// this empty function is to avoid an empty .o
-void WebPInitPremultiplySSE2(void) {}
-
 #endif  // FANCY_UPSAMPLING
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif   // WEBP_USE_SSE2
diff --git a/drivers/webp/dsp/yuv.c b/drivers/webp/dsp/yuv.c
index 4f9cafc104..7f05f9a3aa 100644
--- a/drivers/webp/dsp/yuv.c
+++ b/drivers/webp/dsp/yuv.c
@@ -1,10 +1,8 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // YUV->RGB conversion function
@@ -13,8 +11,16 @@
 
 #include "./yuv.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
 
-#if defined(WEBP_YUV_USE_TABLE)
+enum { YUV_HALF = 1 << (YUV_FIX - 1) };
+
+int16_t VP8kVToR[256], VP8kUToB[256];
+int32_t VP8kVToG[256], VP8kUToG[256];
+uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
+uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
 
 static int done = 0;
 
@@ -22,17 +28,11 @@ static WEBP_INLINE uint8_t clip(int v, int max_value) {
   return v < 0 ? 0 : v > max_value ? max_value : v;
 }
 
-int16_t VP8kVToR[256], VP8kUToB[256];
-int32_t VP8kVToG[256], VP8kUToG[256];
-uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
-uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
-
 void VP8YUVInit(void) {
   int i;
   if (done) {
     return;
   }
-#ifndef USE_YUVj
   for (i = 0; i < 256; ++i) {
     VP8kVToR[i] = (89858 * (i - 128) + YUV_HALF) >> YUV_FIX;
     VP8kUToG[i] = -22014 * (i - 128) + YUV_HALF;
@@ -44,164 +44,9 @@ void VP8YUVInit(void) {
     VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255);
     VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15);
   }
-#else
-  for (i = 0; i < 256; ++i) {
-    VP8kVToR[i] = (91881 * (i - 128) + YUV_HALF) >> YUV_FIX;
-    VP8kUToG[i] = -22554 * (i - 128) + YUV_HALF;
-    VP8kVToG[i] = -46802 * (i - 128);
-    VP8kUToB[i] = (116130 * (i - 128) + YUV_HALF) >> YUV_FIX;
-  }
-  for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) {
-    const int k = i;
-    VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255);
-    VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15);
-  }
-#endif
-
   done = 1;
 }
 
-#else
-
-void VP8YUVInit(void) {}
-
-#endif  // WEBP_YUV_USE_TABLE
-
-//-----------------------------------------------------------------------------
-// SSE2 extras
-
-#if defined(WEBP_USE_SSE2)
-
-#ifdef FANCY_UPSAMPLING
-
-#include <emmintrin.h>
-#include <string.h>   // for memcpy
-
-typedef union {   // handy struct for converting SSE2 registers
-  int32_t i32[4];
-  uint8_t u8[16];
-  __m128i m;
-} VP8kCstSSE2;
-
-static int done_sse2 = 0;
-static VP8kCstSSE2 VP8kUtoRGBA[256], VP8kVtoRGBA[256], VP8kYtoRGBA[256];
-
-void VP8YUVInitSSE2(void) {
-  if (!done_sse2) {
-    int i;
-    for (i = 0; i < 256; ++i) {
-      VP8kYtoRGBA[i].i32[0] =
-        VP8kYtoRGBA[i].i32[1] =
-        VP8kYtoRGBA[i].i32[2] = (i - 16) * kYScale + YUV_HALF2;
-      VP8kYtoRGBA[i].i32[3] = 0xff << YUV_FIX2;
-
-      VP8kUtoRGBA[i].i32[0] = 0;
-      VP8kUtoRGBA[i].i32[1] = -kUToG * (i - 128);
-      VP8kUtoRGBA[i].i32[2] =  kUToB * (i - 128);
-      VP8kUtoRGBA[i].i32[3] = 0;
-
-      VP8kVtoRGBA[i].i32[0] =  kVToR * (i - 128);
-      VP8kVtoRGBA[i].i32[1] = -kVToG * (i - 128);
-      VP8kVtoRGBA[i].i32[2] = 0;
-      VP8kVtoRGBA[i].i32[3] = 0;
-    }
-    done_sse2 = 1;
-  }
-}
-
-static WEBP_INLINE __m128i VP8GetRGBA32b(int y, int u, int v) {
-  const __m128i u_part = _mm_loadu_si128(&VP8kUtoRGBA[u].m);
-  const __m128i v_part = _mm_loadu_si128(&VP8kVtoRGBA[v].m);
-  const __m128i y_part = _mm_loadu_si128(&VP8kYtoRGBA[y].m);
-  const __m128i uv_part = _mm_add_epi32(u_part, v_part);
-  const __m128i rgba1 = _mm_add_epi32(y_part, uv_part);
-  const __m128i rgba2 = _mm_srai_epi32(rgba1, YUV_FIX2);
-  return rgba2;
-}
-
-static WEBP_INLINE void VP8YuvToRgbSSE2(uint8_t y, uint8_t u, uint8_t v,
-                                        uint8_t* const rgb) {
-  const __m128i tmp0 = VP8GetRGBA32b(y, u, v);
-  const __m128i tmp1 = _mm_packs_epi32(tmp0, tmp0);
-  const __m128i tmp2 = _mm_packus_epi16(tmp1, tmp1);
-  // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp
-  _mm_storel_epi64((__m128i*)rgb, tmp2);
-}
-
-static WEBP_INLINE void VP8YuvToBgrSSE2(uint8_t y, uint8_t u, uint8_t v,
-                                        uint8_t* const bgr) {
-  const __m128i tmp0 = VP8GetRGBA32b(y, u, v);
-  const __m128i tmp1 = _mm_shuffle_epi32(tmp0, _MM_SHUFFLE(3, 0, 1, 2));
-  const __m128i tmp2 = _mm_packs_epi32(tmp1, tmp1);
-  const __m128i tmp3 = _mm_packus_epi16(tmp2, tmp2);
-  // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp
-  _mm_storel_epi64((__m128i*)bgr, tmp3);
-}
-
-void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                    uint8_t* dst) {
-  int n;
-  for (n = 0; n < 32; n += 4) {
-    const __m128i tmp0_1 = VP8GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]);
-    const __m128i tmp0_2 = VP8GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]);
-    const __m128i tmp0_3 = VP8GetRGBA32b(y[n + 2], u[n + 2], v[n + 2]);
-    const __m128i tmp0_4 = VP8GetRGBA32b(y[n + 3], u[n + 3], v[n + 3]);
-    const __m128i tmp1_1 = _mm_packs_epi32(tmp0_1, tmp0_2);
-    const __m128i tmp1_2 = _mm_packs_epi32(tmp0_3, tmp0_4);
-    const __m128i tmp2 = _mm_packus_epi16(tmp1_1, tmp1_2);
-    _mm_storeu_si128((__m128i*)dst, tmp2);
-    dst += 4 * 4;
-  }
-}
-
-void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                    uint8_t* dst) {
-  int n;
-  for (n = 0; n < 32; n += 2) {
-    const __m128i tmp0_1 = VP8GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]);
-    const __m128i tmp0_2 = VP8GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]);
-    const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2));
-    const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2));
-    const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2);
-    const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1);
-    _mm_storel_epi64((__m128i*)dst, tmp3);
-    dst += 4 * 2;
-  }
-}
-
-void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                   uint8_t* dst) {
-  int n;
-  uint8_t tmp0[2 * 3 + 5 + 15];
-  uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15);  // align
-  for (n = 0; n < 30; ++n) {   // we directly stomp the *dst memory
-    VP8YuvToRgbSSE2(y[n], u[n], v[n], dst + n * 3);
-  }
-  // Last two pixels are special: we write in a tmp buffer before sending
-  // to dst.
-  VP8YuvToRgbSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0);
-  VP8YuvToRgbSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3);
-  memcpy(dst + n * 3, tmp, 2 * 3);
-}
-
-void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                   uint8_t* dst) {
-  int n;
-  uint8_t tmp0[2 * 3 + 5 + 15];
-  uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15);  // align
-  for (n = 0; n < 30; ++n) {
-    VP8YuvToBgrSSE2(y[n], u[n], v[n], dst + n * 3);
-  }
-  VP8YuvToBgrSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0);
-  VP8YuvToBgrSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3);
-  memcpy(dst + n * 3, tmp, 2 * 3);
-}
-
-#else
-
-void VP8YUVInitSSE2(void) {}
-
-#endif  // FANCY_UPSAMPLING
-
-#endif  // WEBP_USE_SSE2
-
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/dsp/yuv.h b/drivers/webp/dsp/yuv.h
index dd778f9cbe..a569109c54 100644
--- a/drivers/webp/dsp/yuv.h
+++ b/drivers/webp/dsp/yuv.h
@@ -1,165 +1,36 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // inline YUV<->RGB conversion function
 //
-// The exact naming is Y'CbCr, following the ITU-R BT.601 standard.
-// More information at: http://en.wikipedia.org/wiki/YCbCr
-// Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
-// U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
-// V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
-// We use 16bit fixed point operations for RGB->YUV conversion (YUV_FIX).
-//
-// For the Y'CbCr to RGB conversion, the BT.601 specification reads:
-//   R = 1.164 * (Y-16) + 1.596 * (V-128)
-//   G = 1.164 * (Y-16) - 0.813 * (V-128) - 0.391 * (U-128)
-//   B = 1.164 * (Y-16)                   + 2.018 * (U-128)
-// where Y is in the [16,235] range, and U/V in the [16,240] range.
-// In the table-lookup version (WEBP_YUV_USE_TABLE), the common factor
-// "1.164 * (Y-16)" can be handled as an offset in the VP8kClip[] table.
-// So in this case the formulae should read:
-//   R = 1.164 * [Y + 1.371 * (V-128)                  ] - 18.624
-//   G = 1.164 * [Y - 0.698 * (V-128) - 0.336 * (U-128)] - 18.624
-//   B = 1.164 * [Y                   + 1.733 * (U-128)] - 18.624
-// once factorized.
-// For YUV->RGB conversion, only 14bit fixed precision is used (YUV_FIX2).
-// That's the maximum possible for a convenient ARM implementation.
-//
 // Author: Skal (pascal.massimino@gmail.com)
 
 #ifndef WEBP_DSP_YUV_H_
 #define WEBP_DSP_YUV_H_
 
-#include "./dsp.h"
 #include "../dec/decode_vp8.h"
 
-// Define the following to use the LUT-based code:
-// #define WEBP_YUV_USE_TABLE
-
-#if defined(WEBP_EXPERIMENTAL_FEATURES)
-// Do NOT activate this feature for real compression. This is only experimental!
-// This flag is for comparison purpose against JPEG's "YUVj" natural colorspace.
-// This colorspace is close to Rec.601's Y'CbCr model with the notable
-// difference of allowing larger range for luma/chroma.
-// See http://en.wikipedia.org/wiki/YCbCr#JPEG_conversion paragraph, and its
-// difference with http://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
-// #define USE_YUVj
-#endif
-
 //------------------------------------------------------------------------------
 // YUV -> RGB conversion
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-enum {
-  YUV_FIX = 16,                    // fixed-point precision for RGB->YUV
-  YUV_HALF = 1 << (YUV_FIX - 1),
-  YUV_MASK = (256 << YUV_FIX) - 1,
-  YUV_RANGE_MIN = -227,            // min value of r/g/b output
-  YUV_RANGE_MAX = 256 + 226,       // max value of r/g/b output
-
-  YUV_FIX2 = 14,                   // fixed-point precision for YUV->RGB
-  YUV_HALF2 = 1 << (YUV_FIX2 - 1),
-  YUV_MASK2 = (256 << YUV_FIX2) - 1
+enum { YUV_FIX = 16,                // fixed-point precision
+       YUV_RANGE_MIN = -227,        // min value of r/g/b output
+       YUV_RANGE_MAX = 256 + 226    // max value of r/g/b output
 };
-
-// These constants are 14b fixed-point version of ITU-R BT.601 constants.
-#define kYScale 19077    // 1.164 = 255 / 219
-#define kVToR   26149    // 1.596 = 255 / 112 * 0.701
-#define kUToG   6419     // 0.391 = 255 / 112 * 0.886 * 0.114 / 0.587
-#define kVToG   13320    // 0.813 = 255 / 112 * 0.701 * 0.299 / 0.587
-#define kUToB   33050    // 2.018 = 255 / 112 * 0.886
-#define kRCst (-kYScale * 16 - kVToR * 128 + YUV_HALF2)
-#define kGCst (-kYScale * 16 + kUToG * 128 + kVToG * 128 + YUV_HALF2)
-#define kBCst (-kYScale * 16 - kUToB * 128 + YUV_HALF2)
-
-//------------------------------------------------------------------------------
-
-#if !defined(WEBP_YUV_USE_TABLE)
-
-// slower on x86 by ~7-8%, but bit-exact with the SSE2 version
-
-static WEBP_INLINE int VP8Clip8(int v) {
-  return ((v & ~YUV_MASK2) == 0) ? (v >> YUV_FIX2) : (v < 0) ? 0 : 255;
-}
-
-static WEBP_INLINE int VP8YUVToR(int y, int v) {
-  return VP8Clip8(kYScale * y + kVToR * v + kRCst);
-}
-
-static WEBP_INLINE int VP8YUVToG(int y, int u, int v) {
-  return VP8Clip8(kYScale * y - kUToG * u - kVToG * v + kGCst);
-}
-
-static WEBP_INLINE int VP8YUVToB(int y, int u) {
-  return VP8Clip8(kYScale * y + kUToB * u + kBCst);
-}
-
-static WEBP_INLINE void VP8YuvToRgb(int y, int u, int v,
-                                    uint8_t* const rgb) {
-  rgb[0] = VP8YUVToR(y, v);
-  rgb[1] = VP8YUVToG(y, u, v);
-  rgb[2] = VP8YUVToB(y, u);
-}
-
-static WEBP_INLINE void VP8YuvToBgr(int y, int u, int v,
-                                    uint8_t* const bgr) {
-  bgr[0] = VP8YUVToB(y, u);
-  bgr[1] = VP8YUVToG(y, u, v);
-  bgr[2] = VP8YUVToR(y, v);
-}
-
-static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v,
-                                       uint8_t* const rgb) {
-  const int r = VP8YUVToR(y, v);      // 5 usable bits
-  const int g = VP8YUVToG(y, u, v);   // 6 usable bits
-  const int b = VP8YUVToB(y, u);      // 5 usable bits
-  const int rg = (r & 0xf8) | (g >> 5);
-  const int gb = ((g << 3) & 0xe0) | (b >> 3);
-#ifdef WEBP_SWAP_16BIT_CSP
-  rgb[0] = gb;
-  rgb[1] = rg;
-#else
-  rgb[0] = rg;
-  rgb[1] = gb;
-#endif
-}
-
-static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v,
-                                         uint8_t* const argb) {
-  const int r = VP8YUVToR(y, v);        // 4 usable bits
-  const int g = VP8YUVToG(y, u, v);     // 4 usable bits
-  const int b = VP8YUVToB(y, u);        // 4 usable bits
-  const int rg = (r & 0xf0) | (g >> 4);
-  const int ba = (b & 0xf0) | 0x0f;     // overwrite the lower 4 bits
-#ifdef WEBP_SWAP_16BIT_CSP
-  argb[0] = ba;
-  argb[1] = rg;
-#else
-  argb[0] = rg;
-  argb[1] = ba;
-#endif
-}
-
-#else
-
-// Table-based version, not totally equivalent to the SSE2 version.
-// Rounding diff is only +/-1 though.
-
 extern int16_t VP8kVToR[256], VP8kUToB[256];
 extern int32_t VP8kVToG[256], VP8kUToG[256];
 extern uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
 extern uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
 
-static WEBP_INLINE void VP8YuvToRgb(int y, int u, int v,
+static WEBP_INLINE void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
                                     uint8_t* const rgb) {
   const int r_off = VP8kVToR[v];
   const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
@@ -169,60 +40,42 @@ static WEBP_INLINE void VP8YuvToRgb(int y, int u, int v,
   rgb[2] = VP8kClip[y + b_off - YUV_RANGE_MIN];
 }
 
-static WEBP_INLINE void VP8YuvToBgr(int y, int u, int v,
-                                    uint8_t* const bgr) {
+static WEBP_INLINE void VP8YuvToRgb565(uint8_t y, uint8_t u, uint8_t v,
+                                       uint8_t* const rgb) {
   const int r_off = VP8kVToR[v];
   const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
   const int b_off = VP8kUToB[u];
-  bgr[0] = VP8kClip[y + b_off - YUV_RANGE_MIN];
-  bgr[1] = VP8kClip[y + g_off - YUV_RANGE_MIN];
-  bgr[2] = VP8kClip[y + r_off - YUV_RANGE_MIN];
+  rgb[0] = ((VP8kClip[y + r_off - YUV_RANGE_MIN] & 0xf8) |
+            (VP8kClip[y + g_off - YUV_RANGE_MIN] >> 5));
+  rgb[1] = (((VP8kClip[y + g_off - YUV_RANGE_MIN] << 3) & 0xe0) |
+            (VP8kClip[y + b_off - YUV_RANGE_MIN] >> 3));
 }
 
-static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v,
-                                       uint8_t* const rgb) {
-  const int r_off = VP8kVToR[v];
-  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
-  const int b_off = VP8kUToB[u];
-  const int rg = ((VP8kClip[y + r_off - YUV_RANGE_MIN] & 0xf8) |
-                  (VP8kClip[y + g_off - YUV_RANGE_MIN] >> 5));
-  const int gb = (((VP8kClip[y + g_off - YUV_RANGE_MIN] << 3) & 0xe0) |
-                   (VP8kClip[y + b_off - YUV_RANGE_MIN] >> 3));
-#ifdef WEBP_SWAP_16BIT_CSP
-  rgb[0] = gb;
-  rgb[1] = rg;
-#else
-  rgb[0] = rg;
-  rgb[1] = gb;
-#endif
+static WEBP_INLINE void VP8YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
+                                     uint8_t* const argb) {
+  argb[0] = 0xff;
+  VP8YuvToRgb(y, u, v, argb + 1);
 }
 
-static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v,
+static WEBP_INLINE void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v,
                                          uint8_t* const argb) {
   const int r_off = VP8kVToR[v];
   const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
   const int b_off = VP8kUToB[u];
-  const int rg = ((VP8kClip4Bits[y + r_off - YUV_RANGE_MIN] << 4) |
-                   VP8kClip4Bits[y + g_off - YUV_RANGE_MIN]);
-  const int ba = (VP8kClip4Bits[y + b_off - YUV_RANGE_MIN] << 4) | 0x0f;
-#ifdef WEBP_SWAP_16BIT_CSP
-  argb[0] = ba;
-  argb[1] = rg;
-#else
-  argb[0] = rg;
-  argb[1] = ba;
-#endif
+  // Alpha (last 4 bits of argb[1]) is always set to 0xf here.
+  argb[0] = ((VP8kClip4Bits[y + r_off - YUV_RANGE_MIN] << 4) |
+             VP8kClip4Bits[y + g_off - YUV_RANGE_MIN]);
+  argb[1] = 0x0f | (VP8kClip4Bits[y + b_off - YUV_RANGE_MIN] << 4);
 }
 
-#endif  // WEBP_YUV_USE_TABLE
-
-//-----------------------------------------------------------------------------
-// Alpha handling variants
-
-static WEBP_INLINE void VP8YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
-                                     uint8_t* const argb) {
-  argb[0] = 0xff;
-  VP8YuvToRgb(y, u, v, argb + 1);
+static WEBP_INLINE void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
+                                    uint8_t* const bgr) {
+  const int r_off = VP8kVToR[v];
+  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
+  const int b_off = VP8kUToB[u];
+  bgr[0] = VP8kClip[y + b_off - YUV_RANGE_MIN];
+  bgr[1] = VP8kClip[y + g_off - YUV_RANGE_MIN];
+  bgr[2] = VP8kClip[y + r_off - YUV_RANGE_MIN];
 }
 
 static WEBP_INLINE void VP8YuvToBgra(uint8_t y, uint8_t u, uint8_t v,
@@ -240,77 +93,35 @@ static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
 // Must be called before everything, to initialize the tables.
 void VP8YUVInit(void);
 
-//-----------------------------------------------------------------------------
-// SSE2 extra functions (mostly for upsampling_sse2.c)
-
-#if defined(WEBP_USE_SSE2)
-
-#if defined(FANCY_UPSAMPLING)
-// Process 32 pixels and store the result (24b or 32b per pixel) in *dst.
-void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                    uint8_t* dst);
-void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                   uint8_t* dst);
-void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                    uint8_t* dst);
-void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                   uint8_t* dst);
-#endif  // FANCY_UPSAMPLING
-
-// Must be called to initialize tables before using the functions.
-void VP8YUVInitSSE2(void);
-
-#endif    // WEBP_USE_SSE2
-
 //------------------------------------------------------------------------------
 // RGB -> YUV conversion
+// The exact naming is Y'CbCr, following the ITU-R BT.601 standard.
+// More information at: http://en.wikipedia.org/wiki/YCbCr
+// Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
+// U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
+// V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
+// We use 16bit fixed point operations.
 
-// Stub functions that can be called with various rounding values:
-static WEBP_INLINE int VP8ClipUV(int uv, int rounding) {
-  uv = (uv + rounding + (128 << (YUV_FIX + 2))) >> (YUV_FIX + 2);
-  return ((uv & ~0xff) == 0) ? uv : (uv < 0) ? 0 : 255;
+static WEBP_INLINE int VP8ClipUV(int v) {
+   v = (v + (257 << (YUV_FIX + 2 - 1))) >> (YUV_FIX + 2);
+   return ((v & ~0xff) == 0) ? v : (v < 0) ? 0 : 255;
 }
 
-#ifndef USE_YUVj
-
-static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
+static WEBP_INLINE int VP8RGBToY(int r, int g, int b) {
+  const int kRound = (1 << (YUV_FIX - 1)) + (16 << YUV_FIX);
   const int luma = 16839 * r + 33059 * g + 6420 * b;
-  return (luma + rounding + (16 << YUV_FIX)) >> YUV_FIX;  // no need to clip
+  return (luma + kRound) >> YUV_FIX;  // no need to clip
 }
 
-static WEBP_INLINE int VP8RGBToU(int r, int g, int b, int rounding) {
-  const int u = -9719 * r - 19081 * g + 28800 * b;
-  return VP8ClipUV(u, rounding);
+static WEBP_INLINE int VP8RGBToU(int r, int g, int b) {
+  return VP8ClipUV(-9719 * r - 19081 * g + 28800 * b);
 }
 
-static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
-  const int v = +28800 * r - 24116 * g - 4684 * b;
-  return VP8ClipUV(v, rounding);
-}
-
-#else
-
-// This JPEG-YUV colorspace, only for comparison!
-// These are also 16bit precision coefficients from Rec.601, but with full
-// [0..255] output range.
-static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
-  const int luma = 19595 * r + 38470 * g + 7471 * b;
-  return (luma + rounding) >> YUV_FIX;  // no need to clip
+static WEBP_INLINE int VP8RGBToV(int r, int g, int b) {
+  return VP8ClipUV(+28800 * r - 24116 * g - 4684 * b);
 }
 
-static WEBP_INLINE int VP8_RGB_TO_U(int r, int g, int b, int rounding) {
-  const int u = -11058 * r - 21710 * g + 32768 * b;
-  return VP8ClipUV(u, rounding);
-}
-
-static WEBP_INLINE int VP8_RGB_TO_V(int r, int g, int b, int rounding) {
-  const int v = 32768 * r - 27439 * g - 5329 * b;
-  return VP8ClipUV(v, rounding);
-}
-
-#endif    // USE_YUVj
-
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/enc/alpha.c b/drivers/webp/enc/alpha.c
index 21d4b5cbde..0e519b6c66 100644
--- a/drivers/webp/enc/alpha.c
+++ b/drivers/webp/enc/alpha.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Alpha-plane compression.
@@ -19,6 +17,10 @@
 #include "../utils/quant_levels.h"
 #include "../webp/format_constants.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 // -----------------------------------------------------------------------------
 // Encodes the given alpha data via specified compression method 'method'.
 // The pre-processing (quantization) is performed if 'quality' is less than 100.
@@ -67,7 +69,7 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
     const uint8_t* src = data;
     for (j = 0; j < picture.height; ++j) {
       for (i = 0; i < picture.width; ++i) {
-        dst[i] = src[i] << 8;  // we leave A/R/B channels zero'd.
+        dst[i] = (src[i] << 8) | 0xff000000u;
       }
       src += width;
       dst += picture.argb_stride;
@@ -77,19 +79,18 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
   WebPConfigInit(&config);
   config.lossless = 1;
   config.method = effort_level;  // impact is very small
-  // Set a low default quality for encoding alpha. Ensure that Alpha quality at
-  // lower methods (3 and below) is less than the threshold for triggering
-  // costly 'BackwardReferencesTraceBackwards'.
-  config.quality = 8.f * effort_level;
-  assert(config.quality >= 0 && config.quality <= 100.f);
+  // Set moderate default quality setting for alpha. Higher qualities (80 and
+  // above) could be very slow.
+  config.quality = 10.f + 15.f * effort_level;
+  if (config.quality > 100.f) config.quality = 100.f;
 
   ok = VP8LBitWriterInit(&tmp_bw, (width * height) >> 3);
   ok = ok && (VP8LEncodeStream(&config, &picture, &tmp_bw) == VP8_ENC_OK);
   WebPPictureFree(&picture);
   if (ok) {
-    const uint8_t* const buffer = VP8LBitWriterFinish(&tmp_bw);
-    const size_t buffer_size = VP8LBitWriterNumBytes(&tmp_bw);
-    VP8BitWriterAppend(bw, buffer, buffer_size);
+    const uint8_t* const data = VP8LBitWriterFinish(&tmp_bw);
+    const size_t data_size = VP8LBitWriterNumBytes(&tmp_bw);
+    VP8BitWriterAppend(bw, data, data_size);
   }
   VP8LBitWriterDestroy(&tmp_bw);
   return ok && !bw->error_;
@@ -97,19 +98,12 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
 
 // -----------------------------------------------------------------------------
 
-// Small struct to hold the result of a filter mode compression attempt.
-typedef struct {
-  size_t score;
-  VP8BitWriter bw;
-  WebPAuxStats stats;
-} FilterTrial;
-
-// This function always returns an initialized 'bw' object, even upon error.
 static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
                                int method, int filter, int reduce_levels,
                                int effort_level,  // in [0..6] range
                                uint8_t* const tmp_alpha,
-                               FilterTrial* result) {
+                               VP8BitWriter* const bw,
+                               WebPAuxStats* const stats) {
   int ok = 0;
   const uint8_t* alpha_src;
   WebPFilterFunc filter_func;
@@ -130,26 +124,24 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
   header = method | (filter << 2);
   if (reduce_levels) header |= ALPHA_PREPROCESSED_LEVELS << 4;
 
-  VP8BitWriterInit(&result->bw, expected_size);
-  VP8BitWriterAppend(&result->bw, &header, ALPHA_HEADER_LEN);
+  VP8BitWriterInit(bw, expected_size);
+  VP8BitWriterAppend(bw, &header, ALPHA_HEADER_LEN);
 
   filter_func = WebPFilters[filter];
-  if (filter_func != NULL) {
-    filter_func(data, width, height, width, tmp_alpha);
+  if (filter_func) {
+    filter_func(data, width, height, 1, width, tmp_alpha);
     alpha_src = tmp_alpha;
   }  else {
     alpha_src = data;
   }
 
   if (method == ALPHA_NO_COMPRESSION) {
-    ok = VP8BitWriterAppend(&result->bw, alpha_src, width * height);
-    ok = ok && !result->bw.error_;
+    ok = VP8BitWriterAppend(bw, alpha_src, width * height);
+    ok = ok && !bw->error_;
   } else {
-    ok = EncodeLossless(alpha_src, width, height, effort_level,
-                        &result->bw, &result->stats);
-    VP8BitWriterFinish(&result->bw);
+    ok = EncodeLossless(alpha_src, width, height, effort_level, bw, stats);
+    VP8BitWriterFinish(bw);
   }
-  result->score = VP8BitWriterSize(&result->bw);
   return ok;
 }
 
@@ -165,104 +157,6 @@ static void CopyPlane(const uint8_t* src, int src_stride,
   }
 }
 
-static int GetNumColors(const uint8_t* data, int width, int height,
-                        int stride) {
-  int j;
-  int colors = 0;
-  uint8_t color[256] = { 0 };
-
-  for (j = 0; j < height; ++j) {
-    int i;
-    const uint8_t* const p = data + j * stride;
-    for (i = 0; i < width; ++i) {
-      color[p[i]] = 1;
-    }
-  }
-  for (j = 0; j < 256; ++j) {
-    if (color[j] > 0) ++colors;
-  }
-  return colors;
-}
-
-#define FILTER_TRY_NONE (1 << WEBP_FILTER_NONE)
-#define FILTER_TRY_ALL ((1 << WEBP_FILTER_LAST) - 1)
-
-// Given the input 'filter' option, return an OR'd bit-set of filters to try.
-static uint32_t GetFilterMap(const uint8_t* alpha, int width, int height,
-                             int filter, int effort_level) {
-  uint32_t bit_map = 0U;
-  if (filter == WEBP_FILTER_FAST) {
-    // Quick estimate of the best candidate.
-    int try_filter_none = (effort_level > 3);
-    const int kMinColorsForFilterNone = 16;
-    const int kMaxColorsForFilterNone = 192;
-    const int num_colors = GetNumColors(alpha, width, height, width);
-    // For low number of colors, NONE yields better compression.
-    filter = (num_colors <= kMinColorsForFilterNone) ? WEBP_FILTER_NONE :
-             EstimateBestFilter(alpha, width, height, width);
-    bit_map |= 1 << filter;
-    // For large number of colors, try FILTER_NONE in addition to the best
-    // filter as well.
-    if (try_filter_none || num_colors > kMaxColorsForFilterNone) {
-      bit_map |= FILTER_TRY_NONE;
-    }
-  } else if (filter == WEBP_FILTER_NONE) {
-    bit_map = FILTER_TRY_NONE;
-  } else {  // WEBP_FILTER_BEST -> try all
-    bit_map = FILTER_TRY_ALL;
-  }
-  return bit_map;
-}
-
-static void InitFilterTrial(FilterTrial* const score) {
-  score->score = (size_t)~0U;
-  VP8BitWriterInit(&score->bw, 0);
-}
-
-static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
-                                 size_t data_size, int method, int filter,
-                                 int reduce_levels, int effort_level,
-                                 uint8_t** const output,
-                                 size_t* const output_size,
-                                 WebPAuxStats* const stats) {
-  int ok = 1;
-  FilterTrial best;
-  uint32_t try_map =
-      GetFilterMap(alpha, width, height, filter, effort_level);
-  InitFilterTrial(&best);
-  if (try_map != FILTER_TRY_NONE) {
-    uint8_t* filtered_alpha =  (uint8_t*)malloc(data_size);
-    if (filtered_alpha == NULL) return 0;
-
-    for (filter = WEBP_FILTER_NONE; ok && try_map; ++filter, try_map >>= 1) {
-      if (try_map & 1) {
-        FilterTrial trial;
-        ok = EncodeAlphaInternal(alpha, width, height, method, filter,
-                                 reduce_levels, effort_level, filtered_alpha,
-                                 &trial);
-        if (ok && trial.score < best.score) {
-          VP8BitWriterWipeOut(&best.bw);
-          best = trial;
-        } else {
-          VP8BitWriterWipeOut(&trial.bw);
-        }
-      }
-    }
-    free(filtered_alpha);
-  } else {
-    ok = EncodeAlphaInternal(alpha, width, height, method, WEBP_FILTER_NONE,
-                             reduce_levels, effort_level, NULL, &best);
-  }
-  if (ok) {
-    if (stats != NULL) *stats = best.stats;
-    *output_size = VP8BitWriterSize(&best.bw);
-    *output = VP8BitWriterBuf(&best.bw);
-  } else {
-    VP8BitWriterWipeOut(&best.bw);
-  }
-  return ok;
-}
-
 static int EncodeAlpha(VP8Encoder* const enc,
                        int quality, int method, int filter,
                        int effort_level,
@@ -293,11 +187,6 @@ static int EncodeAlpha(VP8Encoder* const enc,
     return 0;
   }
 
-  if (method == ALPHA_NO_COMPRESSION) {
-    // Don't filter, as filtering will make no impact on compressed size.
-    filter = WEBP_FILTER_NONE;
-  }
-
   quant_alpha = (uint8_t*)malloc(data_size);
   if (quant_alpha == NULL) {
     return 0;
@@ -316,95 +205,126 @@ static int EncodeAlpha(VP8Encoder* const enc,
   }
 
   if (ok) {
-    ok = ApplyFiltersAndEncode(quant_alpha, width, height, data_size, method,
-                               filter, reduce_levels, effort_level, output,
-                               output_size, pic->stats);
-    if (pic->stats != NULL) {  // need stats?
-      pic->stats->coded_size += (int)(*output_size);
-      enc->sse_[3] = sse;
+    VP8BitWriter bw;
+    int test_filter;
+    uint8_t* filtered_alpha = NULL;
+
+    // We always test WEBP_FILTER_NONE first.
+    ok = EncodeAlphaInternal(quant_alpha, width, height,
+                             method, WEBP_FILTER_NONE, reduce_levels,
+                             effort_level, NULL, &bw, pic->stats);
+    if (!ok) {
+      VP8BitWriterWipeOut(&bw);
+      goto End;
     }
-  }
 
+    if (filter == WEBP_FILTER_FAST) {  // Quick estimate of a second candidate?
+      filter = EstimateBestFilter(quant_alpha, width, height, width);
+    }
+    // Stop?
+    if (filter == WEBP_FILTER_NONE) {
+      goto Ok;
+    }
+
+    filtered_alpha = (uint8_t*)malloc(data_size);
+    ok = (filtered_alpha != NULL);
+    if (!ok) {
+      VP8BitWriterWipeOut(&bw);  // don't leak the WEBP_FILTER_NONE result
+      goto End;
+    }
+    // Try the other mode(s).
+    {
+      WebPAuxStats best_stats;
+      size_t best_score = VP8BitWriterSize(&bw);
+
+      memset(&best_stats, 0, sizeof(best_stats));  // prevent spurious warning
+      if (pic->stats != NULL) best_stats = *pic->stats;
+      for (test_filter = WEBP_FILTER_HORIZONTAL;
+           ok && (test_filter <= WEBP_FILTER_GRADIENT);
+           ++test_filter) {
+        VP8BitWriter tmp_bw;
+        if (filter != WEBP_FILTER_BEST && test_filter != filter) {
+          continue;
+        }
+        ok = EncodeAlphaInternal(quant_alpha, width, height,
+                                 method, test_filter, reduce_levels,
+                                 effort_level, filtered_alpha, &tmp_bw,
+                                 pic->stats);
+        if (ok) {
+          const size_t score = VP8BitWriterSize(&tmp_bw);
+          if (score < best_score) {
+            // swap bitwriter objects.
+            VP8BitWriter tmp = tmp_bw;
+            tmp_bw = bw;
+            bw = tmp;
+            best_score = score;
+            if (pic->stats != NULL) best_stats = *pic->stats;
+          }
+        } else {
+          VP8BitWriterWipeOut(&bw);
+        }
+        VP8BitWriterWipeOut(&tmp_bw);
+      }
+      if (pic->stats != NULL) *pic->stats = best_stats;
+    }
+ Ok:
+    if (ok) {
+      *output_size = VP8BitWriterSize(&bw);
+      *output = VP8BitWriterBuf(&bw);
+      if (pic->stats != NULL) {         // need stats?
+        pic->stats->coded_size += (int)(*output_size);
+        enc->sse_[3] = sse;
+      }
+    }
+    free(filtered_alpha);
+  }
+ End:
   free(quant_alpha);
   return ok;
 }
 
+
 //------------------------------------------------------------------------------
 // Main calls
 
-static int CompressAlphaJob(VP8Encoder* const enc, void* dummy) {
-  const WebPConfig* config = enc->config_;
-  uint8_t* alpha_data = NULL;
-  size_t alpha_size = 0;
-  const int effort_level = config->method;  // maps to [0..6]
-  const WEBP_FILTER_TYPE filter =
-      (config->alpha_filtering == 0) ? WEBP_FILTER_NONE :
-      (config->alpha_filtering == 1) ? WEBP_FILTER_FAST :
-                                       WEBP_FILTER_BEST;
-  if (!EncodeAlpha(enc, config->alpha_quality, config->alpha_compression,
-                   filter, effort_level, &alpha_data, &alpha_size)) {
-    return 0;
-  }
-  if (alpha_size != (uint32_t)alpha_size) {  // Sanity check.
-    free(alpha_data);
-    return 0;
-  }
-  enc->alpha_data_size_ = (uint32_t)alpha_size;
-  enc->alpha_data_ = alpha_data;
-  (void)dummy;
-  return 1;
-}
-
 void VP8EncInitAlpha(VP8Encoder* const enc) {
   enc->has_alpha_ = WebPPictureHasTransparency(enc->pic_);
   enc->alpha_data_ = NULL;
   enc->alpha_data_size_ = 0;
-  if (enc->thread_level_ > 0) {
-    WebPWorker* const worker = &enc->alpha_worker_;
-    WebPWorkerInit(worker);
-    worker->data1 = enc;
-    worker->data2 = NULL;
-    worker->hook = (WebPWorkerHook)CompressAlphaJob;
-  }
-}
-
-int VP8EncStartAlpha(VP8Encoder* const enc) {
-  if (enc->has_alpha_) {
-    if (enc->thread_level_ > 0) {
-      WebPWorker* const worker = &enc->alpha_worker_;
-      if (!WebPWorkerReset(worker)) {    // Makes sure worker is good to go.
-        return 0;
-      }
-      WebPWorkerLaunch(worker);
-      return 1;
-    } else {
-      return CompressAlphaJob(enc, NULL);   // just do the job right away
-    }
-  }
-  return 1;
 }
 
 int VP8EncFinishAlpha(VP8Encoder* const enc) {
   if (enc->has_alpha_) {
-    if (enc->thread_level_ > 0) {
-      WebPWorker* const worker = &enc->alpha_worker_;
-      if (!WebPWorkerSync(worker)) return 0;  // error
+    const WebPConfig* config = enc->config_;
+    uint8_t* tmp_data = NULL;
+    size_t tmp_size = 0;
+    const int effort_level = config->method;  // maps to [0..6]
+    const WEBP_FILTER_TYPE filter =
+        (config->alpha_filtering == 0) ? WEBP_FILTER_NONE :
+        (config->alpha_filtering == 1) ? WEBP_FILTER_FAST :
+                                         WEBP_FILTER_BEST;
+
+    if (!EncodeAlpha(enc, config->alpha_quality, config->alpha_compression,
+                     filter, effort_level, &tmp_data, &tmp_size)) {
+      return 0;
+    }
+    if (tmp_size != (uint32_t)tmp_size) {  // Sanity check.
+      free(tmp_data);
+      return 0;
     }
+    enc->alpha_data_size_ = (uint32_t)tmp_size;
+    enc->alpha_data_ = tmp_data;
   }
   return WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
 }
 
-int VP8EncDeleteAlpha(VP8Encoder* const enc) {
-  int ok = 1;
-  if (enc->thread_level_ > 0) {
-    WebPWorker* const worker = &enc->alpha_worker_;
-    ok = WebPWorkerSync(worker);  // finish anything left in flight
-    WebPWorkerEnd(worker);  // still need to end the worker, even if !ok
-  }
+void VP8EncDeleteAlpha(VP8Encoder* const enc) {
   free(enc->alpha_data_);
   enc->alpha_data_ = NULL;
   enc->alpha_data_size_ = 0;
   enc->has_alpha_ = 0;
-  return ok;
 }
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/enc/analysis.c b/drivers/webp/enc/analysis.c
index 7d4cfdc190..22cfb492e7 100644
--- a/drivers/webp/enc/analysis.c
+++ b/drivers/webp/enc/analysis.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Macroblock analysis
@@ -19,8 +17,16 @@
 #include "./cost.h"
 #include "../utils/utils.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 #define MAX_ITERS_K_MEANS  6
 
+static int ClipAlpha(int alpha) {
+  return (alpha > 255) ? 255 : (alpha < 0) ? 0 : alpha;  // clamp to [0..255]
+}
+
 //------------------------------------------------------------------------------
 // Smooth the segment map by replacing isolated block by the majority of its
 // neighbours.
@@ -51,7 +57,6 @@ static void SmoothSegmentMap(VP8Encoder* const enc) {
       for (n = 0; n < NUM_MB_SEGMENTS; ++n) {
         if (cnt[n] >= majority_cnt_3_x_3_grid) {
           majority_seg = n;
-          break;
         }
       }
       tmp[x + y * w] = majority_seg;
@@ -67,10 +72,50 @@ static void SmoothSegmentMap(VP8Encoder* const enc) {
 }
 
 //------------------------------------------------------------------------------
-// set segment susceptibility alpha_ / beta_
+// Finalize Segment probability based on the coding tree
+
+// Returns 255 * a / (a + b), rounded to nearest for non-negative counts.
+// With no samples at all (a + b == 0) the default probability 255 is used.
+static int GetProba(int a, int b) {
+  const int total = a + b;
+  if (total == 0) return 255;
+  return (255 * a + total / 2) / total;
+}
+
+static void SetSegmentProbas(VP8Encoder* const enc) {
+  int p[NUM_MB_SEGMENTS] = { 0 };  // p[s] = number of macroblocks in segment s
+  int n;
+  // Count macroblocks per segment, then derive stats, probas and map size.
+  for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
+    const VP8MBInfo* const mb = &enc->mb_info_[n];
+    p[mb->segment_]++;
+  }
+  if (enc->pic_->stats) {  // stats are optional: only fill them if present
+    for (n = 0; n < NUM_MB_SEGMENTS; ++n) {
+      enc->pic_->stats->segment_size[n] = p[n];
+    }
+  }
+  if (enc->segment_hdr_.num_segments_ > 1) {
+    uint8_t* const probas = enc->proba_.segments_;
+    probas[0] = GetProba(p[0] + p[1], p[2] + p[3]);  // {0,1} vs {2,3}
+    probas[1] = GetProba(p[0], p[1]);                // 0 vs 1
+    probas[2] = GetProba(p[2], p[3]);                // 2 vs 3
+    // update_map_ is set only if some proba differs from the default 255.
+    enc->segment_hdr_.update_map_ =
+        (probas[0] != 255) || (probas[1] != 255) || (probas[2] != 255);
+    enc->segment_hdr_.size_ =  // per segment: count * cost of its two bits
+      p[0] * (VP8BitCost(0, probas[0]) + VP8BitCost(0, probas[1])) +
+      p[1] * (VP8BitCost(0, probas[0]) + VP8BitCost(1, probas[1])) +
+      p[2] * (VP8BitCost(1, probas[0]) + VP8BitCost(0, probas[2])) +
+      p[3] * (VP8BitCost(1, probas[0]) + VP8BitCost(1, probas[2]));
+  } else {
+    enc->segment_hdr_.update_map_ = 0;
+    enc->segment_hdr_.size_ = 0;
+  }
+}
 
 static WEBP_INLINE int clip(int v, int m, int M) {
-  return (v < m) ? m : (v > M) ? M : v;
+  return v < m ? m : v > M ? M : v;
 }
 
 static void SetSegmentAlphas(VP8Encoder* const enc,
@@ -97,72 +142,28 @@ static void SetSegmentAlphas(VP8Encoder* const enc,
 }
 
 //------------------------------------------------------------------------------
-// Compute susceptibility based on DCT-coeff histograms:
-// the higher, the "easier" the macroblock is to compress.
-
-#define MAX_ALPHA 255                // 8b of precision for susceptibilities.
-#define ALPHA_SCALE (2 * MAX_ALPHA)  // scaling factor for alpha.
-#define DEFAULT_ALPHA (-1)
-#define IS_BETTER_ALPHA(alpha, best_alpha) ((alpha) > (best_alpha))
-
-static int FinalAlphaValue(int alpha) {
-  alpha = MAX_ALPHA - alpha;
-  return clip(alpha, 0, MAX_ALPHA);
-}
-
-static int GetAlpha(const VP8Histogram* const histo) {
-  int max_value = 0, last_non_zero = 1;
-  int k;
-  int alpha;
-  for (k = 0; k <= MAX_COEFF_THRESH; ++k) {
-    const int value = histo->distribution[k];
-    if (value > 0) {
-      if (value > max_value) max_value = value;
-      last_non_zero = k;
-    }
-  }
-  // 'alpha' will later be clipped to [0..MAX_ALPHA] range, clamping outer
-  // values which happen to be mostly noise. This leaves the maximum precision
-  // for handling the useful small values which contribute most.
-  alpha = (max_value > 1) ? ALPHA_SCALE * last_non_zero / max_value : 0;
-  return alpha;
-}
-
-static void MergeHistograms(const VP8Histogram* const in,
-                            VP8Histogram* const out) {
-  int i;
-  for (i = 0; i <= MAX_COEFF_THRESH; ++i) {
-    out->distribution[i] += in->distribution[i];
-  }
-}
-
-//------------------------------------------------------------------------------
 // Simplified k-Means, to assign Nb segments based on alpha-histogram
 
-static void AssignSegments(VP8Encoder* const enc,
-                           const int alphas[MAX_ALPHA + 1]) {
+static void AssignSegments(VP8Encoder* const enc, const int alphas[256]) {
   const int nb = enc->segment_hdr_.num_segments_;
   int centers[NUM_MB_SEGMENTS];
   int weighted_average = 0;
-  int map[MAX_ALPHA + 1];
+  int map[256];
   int a, n, k;
-  int min_a = 0, max_a = MAX_ALPHA, range_a;
+  int min_a = 0, max_a = 255, range_a;
   // 'int' type is ok for histo, and won't overflow
   int accum[NUM_MB_SEGMENTS], dist_accum[NUM_MB_SEGMENTS];
 
-  assert(nb >= 1);
-
   // bracket the input
-  for (n = 0; n <= MAX_ALPHA && alphas[n] == 0; ++n) {}
+  for (n = 0; n < 256 && alphas[n] == 0; ++n) {}
   min_a = n;
-  for (n = MAX_ALPHA; n > min_a && alphas[n] == 0; --n) {}
+  for (n = 255; n > min_a && alphas[n] == 0; --n) {}
   max_a = n;
   range_a = max_a - min_a;
 
   // Spread initial centers evenly
-  for (k = 0, n = 1; k < nb; ++k, n += 2) {
-    assert(n < 2 * nb);
-    centers[k] = min_a + (n * range_a) / (2 * nb);
+  for (n = 1, k = 0; n < 2 * nb; n += 2) {
+    centers[k++] = min_a + (n * range_a) / (2 * nb);
   }
 
   for (k = 0; k < MAX_ITERS_K_MEANS; ++k) {     // few iters are enough
@@ -177,7 +178,7 @@ static void AssignSegments(VP8Encoder* const enc,
     n = 0;    // track the nearest center for current 'a'
     for (a = min_a; a <= max_a; ++a) {
       if (alphas[a]) {
-        while (n + 1 < nb && abs(a - centers[n + 1]) < abs(a - centers[n])) {
+        while (n < nb - 1 && abs(a - centers[n + 1]) < abs(a - centers[n])) {
           n++;
         }
         map[a] = n;
@@ -209,7 +210,7 @@ static void AssignSegments(VP8Encoder* const enc,
     VP8MBInfo* const mb = &enc->mb_info_[n];
     const int alpha = mb->alpha_;
     mb->segment_ = map[alpha];
-    mb->alpha_ = centers[map[alpha]];  // for the record.
+    mb->alpha_ = centers[map[alpha]];     // just for the record.
   }
 
   if (nb > 1) {
@@ -217,6 +218,7 @@ static void AssignSegments(VP8Encoder* const enc,
     if (smooth) SmoothSegmentMap(enc);
   }
 
+  SetSegmentProbas(enc);                             // Assign final proba
   SetSegmentAlphas(enc, centers, weighted_average);  // pick some alphas.
 }
 
@@ -225,32 +227,24 @@ static void AssignSegments(VP8Encoder* const enc,
 // susceptibility and set best modes for this macroblock.
 // Segment assignment is done later.
 
-// Number of modes to inspect for alpha_ evaluation. For high-quality settings
-// (method >= FAST_ANALYSIS_METHOD) we don't need to test all the possible modes
-// during the analysis phase.
-#define FAST_ANALYSIS_METHOD 4  // method above which we do partial analysis
+// Number of modes to inspect for alpha_ evaluation. For high-quality settings,
+// we don't need to test all the possible modes during the analysis phase.
 #define MAX_INTRA16_MODE 2
 #define MAX_INTRA4_MODE  2
 #define MAX_UV_MODE      2
 
 static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
-  const int max_mode =
-      (it->enc_->method_ >= FAST_ANALYSIS_METHOD) ? MAX_INTRA16_MODE
-                                                  : NUM_PRED_MODES;
+  const int max_mode = (it->enc_->method_ >= 3) ? MAX_INTRA16_MODE : 4;
   int mode;
-  int best_alpha = DEFAULT_ALPHA;
+  int best_alpha = -1;
   int best_mode = 0;
 
   VP8MakeLuma16Preds(it);
   for (mode = 0; mode < max_mode; ++mode) {
-    VP8Histogram histo = { { 0 } };
-    int alpha;
-
-    VP8CollectHistogram(it->yuv_in_ + Y_OFF,
-                        it->yuv_p_ + VP8I16ModeOffsets[mode],
-                        0, 16, &histo);
-    alpha = GetAlpha(&histo);
-    if (IS_BETTER_ALPHA(alpha, best_alpha)) {
+    const int alpha = VP8CollectHistogram(it->yuv_in_ + Y_OFF,
+                                          it->yuv_p_ + VP8I16ModeOffsets[mode],
+                                          0, 16);
+    if (alpha > best_alpha) {
       best_alpha = alpha;
       best_mode = mode;
     }
@@ -262,63 +256,46 @@ static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
 static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
                                    int best_alpha) {
   uint8_t modes[16];
-  const int max_mode =
-      (it->enc_->method_ >= FAST_ANALYSIS_METHOD) ? MAX_INTRA4_MODE
-                                                  : NUM_BMODES;
-  int i4_alpha;
-  VP8Histogram total_histo = { { 0 } };
-  int cur_histo = 0;
-
+  const int max_mode = (it->enc_->method_ >= 3) ? MAX_INTRA4_MODE : NUM_BMODES;
+  int i4_alpha = 0;
   VP8IteratorStartI4(it);
   do {
     int mode;
-    int best_mode_alpha = DEFAULT_ALPHA;
-    VP8Histogram histos[2];
+    int best_mode_alpha = -1;
     const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
 
     VP8MakeIntra4Preds(it);
     for (mode = 0; mode < max_mode; ++mode) {
-      int alpha;
-
-      memset(&histos[cur_histo], 0, sizeof(histos[cur_histo]));
-      VP8CollectHistogram(src, it->yuv_p_ + VP8I4ModeOffsets[mode],
-                          0, 1, &histos[cur_histo]);
-      alpha = GetAlpha(&histos[cur_histo]);
-      if (IS_BETTER_ALPHA(alpha, best_mode_alpha)) {
+      const int alpha = VP8CollectHistogram(src,
+                                            it->yuv_p_ + VP8I4ModeOffsets[mode],
+                                            0, 1);
+      if (alpha > best_mode_alpha) {
         best_mode_alpha = alpha;
         modes[it->i4_] = mode;
-        cur_histo ^= 1;   // keep track of best histo so far.
       }
     }
-    // accumulate best histogram
-    MergeHistograms(&histos[cur_histo ^ 1], &total_histo);
+    i4_alpha += best_mode_alpha;
     // Note: we reuse the original samples for predictors
   } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF));
 
-  i4_alpha = GetAlpha(&total_histo);
-  if (IS_BETTER_ALPHA(i4_alpha, best_alpha)) {
+  if (i4_alpha > best_alpha) {
     VP8SetIntra4Mode(it, modes);
-    best_alpha = i4_alpha;
+    best_alpha = ClipAlpha(i4_alpha);
   }
   return best_alpha;
 }
 
 static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
-  int best_alpha = DEFAULT_ALPHA;
+  int best_alpha = -1;
   int best_mode = 0;
-  const int max_mode =
-      (it->enc_->method_ >= FAST_ANALYSIS_METHOD) ? MAX_UV_MODE
-                                                  : NUM_PRED_MODES;
+  const int max_mode = (it->enc_->method_ >= 3) ? MAX_UV_MODE : 4;
   int mode;
   VP8MakeChroma8Preds(it);
   for (mode = 0; mode < max_mode; ++mode) {
-    VP8Histogram histo = { { 0 } };
-    int alpha;
-    VP8CollectHistogram(it->yuv_in_ + U_OFF,
-                        it->yuv_p_ + VP8UVModeOffsets[mode],
-                        16, 16 + 4 + 4, &histo);
-    alpha = GetAlpha(&histo);
-    if (IS_BETTER_ALPHA(alpha, best_alpha)) {
+    const int alpha = VP8CollectHistogram(it->yuv_in_ + U_OFF,
+                                          it->yuv_p_ + VP8UVModeOffsets[mode],
+                                          16, 16 + 4 + 4);
+    if (alpha > best_alpha) {
       best_alpha = alpha;
       best_mode = mode;
     }
@@ -328,8 +305,7 @@ static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
 }
 
 static void MBAnalyze(VP8EncIterator* const it,
-                      int alphas[MAX_ALPHA + 1],
-                      int* const alpha, int* const uv_alpha) {
+                      int alphas[256], int* const uv_alpha) {
   const VP8Encoder* const enc = it->enc_;
   int best_alpha, best_uv_alpha;
 
@@ -338,7 +314,7 @@ static void MBAnalyze(VP8EncIterator* const it,
   VP8SetSegment(it, 0);      // default segment, spec-wise.
 
   best_alpha = MBAnalyzeBestIntra16Mode(it);
-  if (enc->method_ >= 5) {
+  if (enc->method_ != 3) {
     // We go and make a fast decision for intra4/intra16.
     // It's usually not a good and definitive pick, but helps seeding the stats
     // about level bit-cost.
@@ -348,22 +324,10 @@ static void MBAnalyze(VP8EncIterator* const it,
   best_uv_alpha = MBAnalyzeBestUVMode(it);
 
   // Final susceptibility mix
-  best_alpha = (3 * best_alpha + best_uv_alpha + 2) >> 2;
-  best_alpha = FinalAlphaValue(best_alpha);
+  best_alpha = (best_alpha + best_uv_alpha + 1) / 2;
   alphas[best_alpha]++;
-  it->mb_->alpha_ = best_alpha;   // for later remapping.
-
-  // Accumulate for later complexity analysis.
-  *alpha += best_alpha;   // mixed susceptibility (not just luma)
   *uv_alpha += best_uv_alpha;
-}
-
-static void DefaultMBInfo(VP8MBInfo* const mb) {
-  mb->type_ = 1;     // I16x16
-  mb->uv_mode_ = 0;
-  mb->skip_ = 0;     // not skipped
-  mb->segment_ = 0;  // default segment
-  mb->alpha_ = 0;
+  it->mb_->alpha_ = best_alpha;   // Informative only.
 }
 
 //------------------------------------------------------------------------------
@@ -376,122 +340,25 @@ static void DefaultMBInfo(VP8MBInfo* const mb) {
 // and decide intra4/intra16, but that's usually almost always a bad choice at
 // this stage.
 
-static void ResetAllMBInfo(VP8Encoder* const enc) {
-  int n;
-  for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
-    DefaultMBInfo(&enc->mb_info_[n]);
-  }
-  // Default susceptibilities.
-  enc->dqm_[0].alpha_ = 0;
-  enc->dqm_[0].beta_ = 0;
-  // Note: we can't compute this alpha_ / uv_alpha_ -> set to default value.
-  enc->alpha_ = 0;
-  enc->uv_alpha_ = 0;
-  WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
-}
-
-// struct used to collect job result
-typedef struct {
-  WebPWorker worker;
-  int alphas[MAX_ALPHA + 1];
-  int alpha, uv_alpha;
-  VP8EncIterator it;
-  int delta_progress;
-} SegmentJob;
-
-// main work call
-static int DoSegmentsJob(SegmentJob* const job, VP8EncIterator* const it) {
+int VP8EncAnalyze(VP8Encoder* const enc) {
   int ok = 1;
-  if (!VP8IteratorIsDone(it)) {
-    uint8_t tmp[32 + ALIGN_CST];
-    uint8_t* const scratch = (uint8_t*)DO_ALIGN(tmp);
-    do {
-      // Let's pretend we have perfect lossless reconstruction.
-      VP8IteratorImport(it, scratch);
-      MBAnalyze(it, job->alphas, &job->alpha, &job->uv_alpha);
-      ok = VP8IteratorProgress(it, job->delta_progress);
-    } while (ok && VP8IteratorNext(it));
-  }
-  return ok;
-}
-
-static void MergeJobs(const SegmentJob* const src, SegmentJob* const dst) {
-  int i;
-  for (i = 0; i <= MAX_ALPHA; ++i) dst->alphas[i] += src->alphas[i];
-  dst->alpha += src->alpha;
-  dst->uv_alpha += src->uv_alpha;
-}
+  int alphas[256] = { 0 };
+  VP8EncIterator it;
 
-// initialize the job struct with some TODOs
-static void InitSegmentJob(VP8Encoder* const enc, SegmentJob* const job,
-                           int start_row, int end_row) {
-  WebPWorkerInit(&job->worker);
-  job->worker.data1 = job;
-  job->worker.data2 = &job->it;
-  job->worker.hook = (WebPWorkerHook)DoSegmentsJob;
-  VP8IteratorInit(enc, &job->it);
-  VP8IteratorSetRow(&job->it, start_row);
-  VP8IteratorSetCountDown(&job->it, (end_row - start_row) * enc->mb_w_);
-  memset(job->alphas, 0, sizeof(job->alphas));
-  job->alpha = 0;
-  job->uv_alpha = 0;
-  // only one of both jobs can record the progress, since we don't
-  // expect the user's hook to be multi-thread safe
-  job->delta_progress = (start_row == 0) ? 20 : 0;
-}
+  VP8IteratorInit(enc, &it);
+  enc->uv_alpha_ = 0;
+  do {
+    VP8IteratorImport(&it);
+    MBAnalyze(&it, alphas, &enc->uv_alpha_);
+    ok = VP8IteratorProgress(&it, 20);
+    // Let's pretend we have perfect lossless reconstruction.
+  } while (ok && VP8IteratorNext(&it, it.yuv_in_));
+  enc->uv_alpha_ /= enc->mb_w_ * enc->mb_h_;
+  if (ok) AssignSegments(enc, alphas);
 
-// main entry point
-int VP8EncAnalyze(VP8Encoder* const enc) {
-  int ok = 1;
-  const int do_segments =
-      enc->config_->emulate_jpeg_size ||   // We need the complexity evaluation.
-      (enc->segment_hdr_.num_segments_ > 1) ||
-      (enc->method_ == 0);  // for method 0, we need preds_[] to be filled.
-  if (do_segments) {
-    const int last_row = enc->mb_h_;
-    // We give a little more than a half work to the main thread.
-    const int split_row = (9 * last_row + 15) >> 4;
-    const int total_mb = last_row * enc->mb_w_;
-#ifdef WEBP_USE_THREAD
-    const int kMinSplitRow = 2;  // minimal rows needed for mt to be worth it
-    const int do_mt = (enc->thread_level_ > 0) && (split_row >= kMinSplitRow);
-#else
-    const int do_mt = 0;
-#endif
-    SegmentJob main_job;
-    if (do_mt) {
-      SegmentJob side_job;
-      // Note the use of '&' instead of '&&' because we must call the functions
-      // no matter what.
-      InitSegmentJob(enc, &main_job, 0, split_row);
-      InitSegmentJob(enc, &side_job, split_row, last_row);
-      // we don't need to call Reset() on main_job.worker, since we're calling
-      // WebPWorkerExecute() on it
-      ok &= WebPWorkerReset(&side_job.worker);
-      // launch the two jobs in parallel
-      if (ok) {
-        WebPWorkerLaunch(&side_job.worker);
-        WebPWorkerExecute(&main_job.worker);
-        ok &= WebPWorkerSync(&side_job.worker);
-        ok &= WebPWorkerSync(&main_job.worker);
-      }
-      WebPWorkerEnd(&side_job.worker);
-      if (ok) MergeJobs(&side_job, &main_job);  // merge results together
-    } else {
-      // Even for single-thread case, we use the generic Worker tools.
-      InitSegmentJob(enc, &main_job, 0, last_row);
-      WebPWorkerExecute(&main_job.worker);
-      ok &= WebPWorkerSync(&main_job.worker);
-    }
-    WebPWorkerEnd(&main_job.worker);
-    if (ok) {
-      enc->alpha_ = main_job.alpha / total_mb;
-      enc->uv_alpha_ = main_job.uv_alpha / total_mb;
-      AssignSegments(enc, main_job.alphas);
-    }
-  } else {   // Use only one default segment.
-    ResetAllMBInfo(enc);
-  }
   return ok;
 }
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/enc/backward_references.c b/drivers/webp/enc/backward_references.c
index 77b4be7432..b8c8ece806 100644
--- a/drivers/webp/enc/backward_references.c
+++ b/drivers/webp/enc/backward_references.c
@@ -1,10 +1,8 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Author: Jyrki Alakuijala (jyrki@google.com)
@@ -143,95 +141,74 @@ static void HashChainInsert(HashChain* const p,
   p->hash_to_first_index_[hash_code] = pos;
 }
 
-static void GetParamsForHashChainFindCopy(int quality, int xsize,
-                                          int cache_bits, int* window_size,
-                                          int* iter_pos, int* iter_limit) {
-  const int iter_mult = (quality < 27) ? 1 : 1 + ((quality - 27) >> 4);
-  const int iter_neg = -iter_mult * (quality >> 1);
-  // Limit the backward-ref window size for lower qualities.
-  const int max_window_size = (quality > 50) ? WINDOW_SIZE
-                            : (quality > 25) ? (xsize << 8)
-                            : (xsize << 4);
-  assert(xsize > 0);
-  *window_size = (max_window_size > WINDOW_SIZE) ? WINDOW_SIZE
-               : max_window_size;
-  *iter_pos = 8 + (quality >> 3);
-  // For lower entropy images, the rigorous search loop in HashChainFindCopy
-  // can be relaxed.
-  *iter_limit = (cache_bits > 0) ? iter_neg : iter_neg / 2;
-}
-
 static int HashChainFindCopy(const HashChain* const p,
-                             int base_position, int xsize_signed,
-                             const uint32_t* const argb, int max_len,
-                             int window_size, int iter_pos, int iter_limit,
+                             int quality, int index, int xsize,
+                             const uint32_t* const argb, int maxlen,
                              int* const distance_ptr,
                              int* const length_ptr) {
-  const uint32_t* const argb_start = argb + base_position;
-  uint64_t best_val = 0;
-  uint32_t best_length = 1;
-  uint32_t best_distance = 0;
-  const uint32_t xsize = (uint32_t)xsize_signed;
-  const int min_pos =
-      (base_position > window_size) ? base_position - window_size : 0;
+  const uint64_t hash_code = GetPixPairHash64(&argb[index]);
+  int prev_length = 0;
+  int64_t best_val = 0;
+  int best_length = 0;
+  int best_distance = 0;
+  const uint32_t* const argb_start = argb + index;
+  const int iter_min_mult = (quality < 50) ? 2 : (quality < 75) ? 4 : 8;
+  const int iter_min = -quality * iter_min_mult;
+  int iter_cnt = 10 + (quality >> 1);
+  const int min_pos = (index > WINDOW_SIZE) ? index - WINDOW_SIZE : 0;
   int pos;
+
   assert(xsize > 0);
-  if (max_len > MAX_LENGTH) {
-    max_len = MAX_LENGTH;
-  }
-  for (pos = p->hash_to_first_index_[GetPixPairHash64(argb_start)];
+  for (pos = p->hash_to_first_index_[hash_code];
        pos >= min_pos;
        pos = p->chain_[pos]) {
-    uint64_t val;
-    uint32_t curr_length;
-    uint32_t distance;
-    const uint64_t* const ptr1 =
-        (const uint64_t*)(argb + pos + best_length - 1);
-    const uint64_t* const ptr2 =
-        (const uint64_t*)(argb_start + best_length - 1);
-
-    if (iter_pos < 0) {
-      if (iter_pos < iter_limit || best_val >= 0xff0000) {
+    int64_t val;
+    int curr_length;
+    if (iter_cnt < 0) {
+      if (iter_cnt < iter_min || best_val >= 0xff0000) {
         break;
       }
     }
-    --iter_pos;
-
-    // Before 'expensive' linear match, check if the two arrays match at the
-    // current best length index and also for the succeeding elements.
-    if (*ptr1 != *ptr2) continue;
-
-    curr_length = FindMatchLength(argb + pos, argb_start, max_len);
-    if (curr_length < best_length) continue;
-
-    distance = (uint32_t)(base_position - pos);
-    val = curr_length << 16;
+    --iter_cnt;
+    if (best_length != 0 &&
+        argb[pos + best_length - 1] != argb_start[best_length - 1]) {
+      continue;
+    }
+    curr_length = FindMatchLength(argb + pos, argb_start, maxlen);
+    if (curr_length < prev_length) {
+      continue;
+    }
+    val = 65536 * curr_length;
     // Favoring 2d locality here gives savings for certain images.
-    if (distance < 9 * xsize) {
-      const uint32_t y = distance / xsize;
-      uint32_t x = distance % xsize;
-      if (x > (xsize >> 1)) {
+    if (index - pos < 9 * xsize) {
+      const int y = (index - pos) / xsize;
+      int x = (index - pos) % xsize;
+      if (x > xsize / 2) {
         x = xsize - x;
       }
-      if (x <= 7) {
-        val += 9 * 9 + 9 * 9;
+      if (x <= 7 && x >= -8) {
         val -= y * y + x * x;
+      } else {
+        val -= 9 * 9 + 9 * 9;
       }
+    } else {
+      val -= 9 * 9 + 9 * 9;
     }
     if (best_val < val) {
+      prev_length = curr_length;
       best_val = val;
       best_length = curr_length;
-      best_distance = distance;
-      if (curr_length >= (uint32_t)max_len) {
+      best_distance = index - pos;
+      if (curr_length >= MAX_LENGTH) {
         break;
       }
-      if ((best_distance == 1 || distance == xsize) &&
+      if ((best_distance == 1 || best_distance == xsize) &&
           best_length >= 128) {
         break;
       }
     }
   }
-  *distance_ptr = (int)best_distance;
+  *distance_ptr = best_distance;
   *length_ptr = best_length;
   return (best_length >= MIN_LENGTH);
 }
@@ -280,9 +257,6 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
   const int pix_count = xsize * ysize;
   HashChain* const hash_chain = (HashChain*)malloc(sizeof(*hash_chain));
   VP8LColorCache hashers;
-  int window_size = WINDOW_SIZE;
-  int iter_pos = 1;
-  int iter_limit = -1;
 
   if (hash_chain == NULL) return 0;
   if (use_color_cache) {
@@ -293,16 +267,16 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
   if (!HashChainInit(hash_chain, pix_count)) goto Error;
 
   refs->size = 0;
-  GetParamsForHashChainFindCopy(quality, xsize, cache_bits,
-                                &window_size, &iter_pos, &iter_limit);
   for (i = 0; i < pix_count; ) {
     // Alternative#1: Code the pixels starting at 'i' using backward reference.
     int offset = 0;
     int len = 0;
     if (i < pix_count - 1) {  // FindCopy(i,..) reads pixels at [i] and [i + 1].
-      int max_len = pix_count - i;
-      HashChainFindCopy(hash_chain, i, xsize, argb, max_len,
-                        window_size, iter_pos, iter_limit,
+      int maxlen = pix_count - i;
+      if (maxlen > MAX_LENGTH) {
+        maxlen = MAX_LENGTH;
+      }
+      HashChainFindCopy(hash_chain, quality, i, xsize, argb, maxlen,
                         &offset, &len);
     }
     if (len >= MIN_LENGTH) {
@@ -313,10 +287,12 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
       int k;
       HashChainInsert(hash_chain, &argb[i], i);
       if (i < pix_count - 2) {  // FindCopy(i+1,..) reads [i + 1] and [i + 2].
-        int max_len = pix_count - (i + 1);
-        HashChainFindCopy(hash_chain, i + 1, xsize, argb, max_len,
-                          window_size, iter_pos, iter_limit,
-                          &offset2, &len2);
+        int maxlen = pix_count - (i + 1);
+        if (maxlen > MAX_LENGTH) {
+          maxlen = MAX_LENGTH;
+        }
+        HashChainFindCopy(hash_chain, quality,
+                          i + 1, xsize, argb, maxlen, &offset2, &len2);
         if (len2 > len + 1) {
           const uint32_t pixel = argb[i];
           // Alternative#2 is a better match. So push pixel at 'i' as literal.
@@ -324,10 +300,10 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
             const int ix = VP8LColorCacheGetIndex(&hashers, pixel);
             refs->refs[refs->size] = PixOrCopyCreateCacheIdx(ix);
           } else {
-            if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
             refs->refs[refs->size] = PixOrCopyCreateLiteral(pixel);
           }
           ++refs->size;
+          if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
           i++;  // Backward reference to be done for next pixel.
           len = len2;
           offset = offset2;
@@ -357,10 +333,10 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
         const int ix = VP8LColorCacheGetIndex(&hashers, pixel);
         refs->refs[refs->size] = PixOrCopyCreateCacheIdx(ix);
       } else {
-        if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
         refs->refs[refs->size] = PixOrCopyCreateLiteral(pixel);
       }
       ++refs->size;
+      if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
       if (i + 1 < pix_count) {
         HashChainInsert(hash_chain, &argb[i], i);
       }
@@ -386,8 +362,7 @@ typedef struct {
 
 static int BackwardReferencesTraceBackwards(
     int xsize, int ysize, int recursive_cost_model,
-    const uint32_t* const argb, int quality, int cache_bits,
-    VP8LBackwardRefs* const refs);
+    const uint32_t* const argb, int cache_bits, VP8LBackwardRefs* const refs);
 
 static void ConvertPopulationCountTableToBitEstimates(
     int num_symbols, const int population_counts[], double output[]) {
@@ -412,16 +387,17 @@ static void ConvertPopulationCountTableToBitEstimates(
 
 static int CostModelBuild(CostModel* const m, int xsize, int ysize,
                           int recursion_level, const uint32_t* const argb,
-                          int quality, int cache_bits) {
+                          int cache_bits) {
   int ok = 0;
   VP8LHistogram histo;
   VP8LBackwardRefs refs;
+  const int quality = 100;
 
   if (!VP8LBackwardRefsAlloc(&refs, xsize * ysize)) goto Error;
 
   if (recursion_level > 0) {
     if (!BackwardReferencesTraceBackwards(xsize, ysize, recursion_level - 1,
-                                          argb, quality, cache_bits, &refs)) {
+                                          argb, cache_bits, &refs)) {
       goto Error;
     }
   } else {
@@ -462,37 +438,34 @@ static WEBP_INLINE double GetCacheCost(const CostModel* const m, uint32_t idx) {
 
 static WEBP_INLINE double GetLengthCost(const CostModel* const m,
                                         uint32_t length) {
-  int code, extra_bits;
-  VP8LPrefixEncodeBits(length, &code, &extra_bits);
-  return m->literal_[VALUES_IN_BYTE + code] + extra_bits;
+  int code, extra_bits_count, extra_bits_value;
+  PrefixEncode(length, &code, &extra_bits_count, &extra_bits_value);
+  return m->literal_[VALUES_IN_BYTE + code] + extra_bits_count;
 }
 
 static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
                                           uint32_t distance) {
-  int code, extra_bits;
-  VP8LPrefixEncodeBits(distance, &code, &extra_bits);
-  return m->distance_[code] + extra_bits;
+  int code, extra_bits_count, extra_bits_value;
+  PrefixEncode(distance, &code, &extra_bits_count, &extra_bits_value);
+  return m->distance_[code] + extra_bits_count;
 }
 
 static int BackwardReferencesHashChainDistanceOnly(
     int xsize, int ysize, int recursive_cost_model, const uint32_t* const argb,
-    int quality, int cache_bits, uint32_t* const dist_array) {
+    int cache_bits, uint32_t* const dist_array) {
   int i;
   int ok = 0;
   int cc_init = 0;
+  const int quality = 100;
   const int pix_count = xsize * ysize;
   const int use_color_cache = (cache_bits > 0);
-  float* const cost =
-      (float*)WebPSafeMalloc((uint64_t)pix_count, sizeof(*cost));
+  double* const cost =
+      (double*)WebPSafeMalloc((uint64_t)pix_count, sizeof(*cost));
   CostModel* cost_model = (CostModel*)malloc(sizeof(*cost_model));
   HashChain* hash_chain = (HashChain*)malloc(sizeof(*hash_chain));
   VP8LColorCache hashers;
   const double mul0 = (recursive_cost_model != 0) ? 1.0 : 0.68;
   const double mul1 = (recursive_cost_model != 0) ? 1.0 : 0.82;
-  const int min_distance_code = 2;  // TODO(vikasa): tune as function of quality
-  int window_size = WINDOW_SIZE;
-  int iter_pos = 1;
-  int iter_limit = -1;
 
   if (cost == NULL || cost_model == NULL || hash_chain == NULL) goto Error;
 
@@ -504,17 +477,15 @@ static int BackwardReferencesHashChainDistanceOnly(
   }
 
   if (!CostModelBuild(cost_model, xsize, ysize, recursive_cost_model, argb,
-                      quality, cache_bits)) {
+                      cache_bits)) {
     goto Error;
   }
 
-  for (i = 0; i < pix_count; ++i) cost[i] = 1e38f;
+  for (i = 0; i < pix_count; ++i) cost[i] = 1e100;
 
   // We loop one pixel at a time, but store all currently best points to
   // non-processed locations from this point.
   dist_array[0] = 0;
-  GetParamsForHashChainFindCopy(quality, xsize, cache_bits,
-                                &window_size, &iter_pos, &iter_limit);
   for (i = 0; i < pix_count; ++i) {
     double prev_cost = 0.0;
     int shortmax;
@@ -525,9 +496,11 @@ static int BackwardReferencesHashChainDistanceOnly(
       int offset = 0;
       int len = 0;
       if (i < pix_count - 1) {  // FindCopy reads pixels at [i] and [i + 1].
-        int max_len = shortmax ? 2 : pix_count - i;
-        HashChainFindCopy(hash_chain, i, xsize, argb, max_len,
-                          window_size, iter_pos, iter_limit,
+        int maxlen = shortmax ? 2 : MAX_LENGTH;
+        if (maxlen > pix_count - i) {
+          maxlen = pix_count - i;
+        }
+        HashChainFindCopy(hash_chain, quality, i, xsize, argb, maxlen,
                           &offset, &len);
       }
       if (len >= MIN_LENGTH) {
@@ -536,15 +509,16 @@ static int BackwardReferencesHashChainDistanceOnly(
             prev_cost + GetDistanceCost(cost_model, code);
         int k;
         for (k = 1; k < len; ++k) {
-          const double cost_val = distance_cost + GetLengthCost(cost_model, k);
+          const double cost_val =
+              distance_cost + GetLengthCost(cost_model, k);
           if (cost[i + k] > cost_val) {
-            cost[i + k] = (float)cost_val;
+            cost[i + k] = cost_val;
             dist_array[i + k] = k + 1;
           }
         }
         // This if is for speedup only. It roughly doubles the speed, and
         // makes compression worse by .1 %.
-        if (len >= 128 && code <= min_distance_code) {
+        if (len >= 128 && code < 2) {
           // Long copy for short distances, let's skip the middle
           // lookups for better copies.
           // 1) insert the hashes.
@@ -555,10 +529,10 @@ static int BackwardReferencesHashChainDistanceOnly(
           }
           // 2) Add to the hash_chain (but cannot add the last pixel)
           {
-            const int last = (len + i < pix_count - 1) ? len + i
-                                                       : pix_count - 1;
-            for (k = i; k < last; ++k) {
-              HashChainInsert(hash_chain, &argb[k], k);
+            const int last = (len < pix_count - 1 - i) ? len
+                                                       : pix_count - 1 - i;
+            for (k = 0; k < last; ++k) {
+              HashChainInsert(hash_chain, &argb[i + k], i + k);
             }
           }
           // 3) jump.
@@ -577,13 +551,13 @@ static int BackwardReferencesHashChainDistanceOnly(
         const int ix = VP8LColorCacheGetIndex(&hashers, argb[i]);
         cost_val += GetCacheCost(cost_model, ix) * mul0;
       } else {
-        if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
         cost_val += GetLiteralCost(cost_model, argb[i]) * mul1;
       }
       if (cost[i] > cost_val) {
-        cost[i] = (float)cost_val;
+        cost[i] = cost_val;
         dist_array[i] = 1;  // only one is inserted.
       }
+      if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
     }
  next_symbol: ;
   }
@@ -598,30 +572,40 @@ Error:
   return ok;
 }
 
-// We pack the path at the end of *dist_array and return
-// a pointer to this part of the array. Example:
-// dist_array = [1x2xx3x2] => packed [1x2x1232], chosen_path = [1232]
-static void TraceBackwards(uint32_t* const dist_array,
-                           int dist_array_size,
-                           uint32_t** const chosen_path,
-                           int* const chosen_path_size) {
-  uint32_t* path = dist_array + dist_array_size;
-  uint32_t* cur = dist_array + dist_array_size - 1;
-  while (cur >= dist_array) {
-    const int k = *cur;
-    --path;
-    *path = k;
-    cur -= k;
-  }
-  *chosen_path = path;
-  *chosen_path_size = (int)(dist_array + dist_array_size - path);
+static int TraceBackwards(const uint32_t* const dist_array,
+                          int dist_array_size,
+                          uint32_t** const chosen_path,
+                          int* const chosen_path_size) {
+  int i;
+  // Count how many.
+  int count = 0;
+  for (i = dist_array_size - 1; i >= 0; ) {
+    int k = dist_array[i];
+    assert(k >= 1);
+    ++count;
+    i -= k;
+  }
+  // Allocate.
+  *chosen_path_size = count;
+  *chosen_path =
+      (uint32_t*)WebPSafeMalloc((uint64_t)count, sizeof(**chosen_path));
+  if (*chosen_path == NULL) return 0;
+
+  // Write in reverse order.
+  for (i = dist_array_size - 1; i >= 0; ) {
+    int k = dist_array[i];
+    assert(k >= 1);
+    (*chosen_path)[--count] = k;
+    i -= k;
+  }
+  return 1;
 }
 
 static int BackwardReferencesHashChainFollowChosenPath(
-    int xsize, int ysize, const uint32_t* const argb,
-    int quality, int cache_bits,
+    int xsize, int ysize, const uint32_t* const argb, int cache_bits,
     const uint32_t* const chosen_path, int chosen_path_size,
     VP8LBackwardRefs* const refs) {
+  const int quality = 100;
   const int pix_count = xsize * ysize;
   const int use_color_cache = (cache_bits > 0);
   int size = 0;
@@ -630,9 +614,6 @@ static int BackwardReferencesHashChainFollowChosenPath(
   int ix;
   int ok = 0;
   int cc_init = 0;
-  int window_size = WINDOW_SIZE;
-  int iter_pos = 1;
-  int iter_limit = -1;
   HashChain* hash_chain = (HashChain*)malloc(sizeof(*hash_chain));
   VP8LColorCache hashers;
 
@@ -645,17 +626,14 @@ static int BackwardReferencesHashChainFollowChosenPath(
   }
 
   refs->size = 0;
-  GetParamsForHashChainFindCopy(quality, xsize, cache_bits,
-                                &window_size, &iter_pos, &iter_limit);
   for (ix = 0; ix < chosen_path_size; ++ix, ++size) {
     int offset = 0;
     int len = 0;
-    int max_len = chosen_path[ix];
-    if (max_len != 1) {
-      HashChainFindCopy(hash_chain, i, xsize, argb, max_len,
-                        window_size, iter_pos, iter_limit,
-                        &offset, &len);
-      assert(len == max_len);
+    int maxlen = chosen_path[ix];
+    if (maxlen != 1) {
+      HashChainFindCopy(hash_chain, quality,
+                        i, xsize, argb, maxlen, &offset, &len);
+      assert(len == maxlen);
       refs->refs[size] = PixOrCopyCreateCopy(offset, len);
       if (use_color_cache) {
         for (k = 0; k < len; ++k) {
@@ -675,9 +653,9 @@ static int BackwardReferencesHashChainFollowChosenPath(
         const int idx = VP8LColorCacheGetIndex(&hashers, argb[i]);
         refs->refs[size] = PixOrCopyCreateCacheIdx(idx);
       } else {
-        if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
         refs->refs[size] = PixOrCopyCreateLiteral(argb[i]);
       }
+      if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
       if (i + 1 < pix_count) {
         HashChainInsert(hash_chain, &argb[i], i);
       }
@@ -697,7 +675,7 @@ Error:
 static int BackwardReferencesTraceBackwards(int xsize, int ysize,
                                             int recursive_cost_model,
                                             const uint32_t* const argb,
-                                            int quality, int cache_bits,
+                                            int cache_bits,
                                             VP8LBackwardRefs* const refs) {
   int ok = 0;
   const int dist_array_size = xsize * ysize;
@@ -709,18 +687,22 @@ static int BackwardReferencesTraceBackwards(int xsize, int ysize,
   if (dist_array == NULL) goto Error;
 
   if (!BackwardReferencesHashChainDistanceOnly(
-      xsize, ysize, recursive_cost_model, argb, quality, cache_bits,
-      dist_array)) {
+      xsize, ysize, recursive_cost_model, argb, cache_bits, dist_array)) {
+    goto Error;
+  }
+  if (!TraceBackwards(dist_array, dist_array_size,
+                      &chosen_path, &chosen_path_size)) {
     goto Error;
   }
-  TraceBackwards(dist_array, dist_array_size, &chosen_path, &chosen_path_size);
+  free(dist_array);   // no need to retain this memory any longer
+  dist_array = NULL;
   if (!BackwardReferencesHashChainFollowChosenPath(
-      xsize, ysize, argb, quality, cache_bits, chosen_path, chosen_path_size,
-      refs)) {
+      xsize, ysize, argb, cache_bits, chosen_path, chosen_path_size, refs)) {
     goto Error;
   }
   ok = 1;
  Error:
+  free(chosen_path);
   free(dist_array);
   return ok;
 }
@@ -780,20 +762,18 @@ int VP8LGetBackwardReferences(int width, int height,
 
   // Choose appropriate backward reference.
   if (lz77_is_useful) {
-    // TraceBackwards is costly. Don't execute it at lower quality.
-    const int try_lz77_trace_backwards = (quality >= 25);
+    // TraceBackwards is costly. Run it for higher qualities.
+    const int try_lz77_trace_backwards = (quality >= 75);
     *best = refs_lz77;   // default guess: lz77 is better
     VP8LClearBackwardRefs(&refs_rle);
     if (try_lz77_trace_backwards) {
-      // Set recursion level for large images using a color cache.
-      const int recursion_level =
-          (num_pix < 320 * 200) && (cache_bits > 0) ? 1 : 0;
+      const int recursion_level = (num_pix < 320 * 200) ? 1 : 0;
       VP8LBackwardRefs refs_trace;
       if (!VP8LBackwardRefsAlloc(&refs_trace, num_pix)) {
         goto End;
       }
-      if (BackwardReferencesTraceBackwards(width, height, recursion_level, argb,
-                                           quality, cache_bits, &refs_trace)) {
+      if (BackwardReferencesTraceBackwards(
+          width, height, recursion_level, argb, cache_bits, &refs_trace)) {
         VP8LClearBackwardRefs(&refs_lz77);
         *best = refs_trace;
       }
diff --git a/drivers/webp/enc/backward_references.h b/drivers/webp/enc/backward_references.h
index e1c75f04f9..91c03361ed 100644
--- a/drivers/webp/enc/backward_references.h
+++ b/drivers/webp/enc/backward_references.h
@@ -1,10 +1,8 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Author: Jyrki Alakuijala (jyrki@google.com)
@@ -18,7 +16,7 @@
 #include "../webp/types.h"
 #include "../webp/format_constants.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
@@ -31,6 +29,68 @@ extern "C" {
     (NUM_LITERAL_CODES + NUM_LENGTH_CODES + (1 << MAX_COLOR_CACHE_BITS))
 
 // -----------------------------------------------------------------------------
+// PrefixEncode()
+
+// BitsLog2Floor(n): index of the highest set bit of 'n', or -1 when n == 0.
+// Compiler intrinsics are used where available, else a portable search.
+#if defined(__GNUC__) && \
+    ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  // __builtin_clz() is undefined for 0, hence the explicit guard.
+  if (n == 0) return -1;
+  return 31 - __builtin_clz(n);
+}
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+#include <intrin.h>
+#pragma intrinsic(_BitScanReverse)
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  unsigned long top_bit;
+  // _BitScanReverse() returns zero when no bit is set.
+  return _BitScanReverse(&top_bit, n) ? top_bit : -1;
+}
+#else
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  int result = 0;
+  int shift;
+  if (n == 0) return -1;
+  // Binary search: try shifts of 16, 8, 4, 2 and 1 bits in turn.
+  for (shift = 16; shift > 0; shift >>= 1) {
+    if ((n >> shift) != 0) {
+      n >>= shift;
+      result += shift;
+    }
+  }
+  return result;
+}
+#endif
+
+// Returns ceil(log2(n)) for n >= 1, and -1 for n == 0.
+static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
+  // n & (n - 1) clears the lowest set bit, so it is zero exactly when n is
+  // zero or a power of two; only then is the floor already the ceiling.
+  const int is_exact = ((n & (n - 1)) == 0);
+  return BitsLog2Floor(n) + (is_exact ? 0 : 1);
+}
+
+// Splits a distance or length into a prefix code plus extra bits. The prefix
+// ('code') is meant to be entropy-coded, while 'extra_bits_value' is stored
+// verbatim using 'extra_bits_count' bits. A 'distance' of 1 or 2 maps to
+// code 0 or 1 respectively, with no extra bits.
+static WEBP_INLINE void PrefixEncode(int distance, int* const code,
+                                     int* const extra_bits_count,
+                                     int* const extra_bits_value) {
+  const int highest_bit = BitsLog2Floor(--distance);
+  // When the highest bit is missing or is bit 0, use a zero shift: a count
+  // derived from 'highest_bit - 1' would not be a valid shift of an 'int'.
+  const int shift = (highest_bit > 0) ? (highest_bit - 1) : 0;
+  const int second_highest_bit = (distance >> shift) & 1;
+  *extra_bits_count = shift;
+  *extra_bits_value = distance & ((1 << shift) - 1);
+  *code = (highest_bit > 0) ? (2 * highest_bit + second_highest_bit)
+                            : (highest_bit == 0) ? 1 : 0;
+}
+
+// -----------------------------------------------------------------------------
 // PixOrCopy
 
 enum Mode {
@@ -145,7 +205,7 @@ int VP8LCalculateEstimateForCacheSize(const uint32_t* const argb,
                                       int xsize, int ysize,
                                       int* const best_cache_bits);
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }
 #endif
 
diff --git a/drivers/webp/enc/config.c b/drivers/webp/enc/config.c
index af7f0b09e8..1a26113554 100644
--- a/drivers/webp/enc/config.c
+++ b/drivers/webp/enc/config.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Coding tools configuration
@@ -13,6 +11,10 @@
 
 #include "../webp/encode.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // WebPConfig
 //------------------------------------------------------------------------------
@@ -29,9 +31,9 @@ int WebPConfigInitInternal(WebPConfig* config,
   config->target_PSNR = 0.;
   config->method = 4;
   config->sns_strength = 50;
-  config->filter_strength = 60;   // mid-filtering
+  config->filter_strength = 20;   // default: light filtering
   config->filter_sharpness = 0;
-  config->filter_type = 1;        // default: strong (so U/V is filtered too)
+  config->filter_type = 0;        // default: simple
   config->partitions = 0;
   config->segments = 4;
   config->pass = 1;
@@ -44,9 +46,6 @@ int WebPConfigInitInternal(WebPConfig* config,
   config->alpha_quality = 100;
   config->lossless = 0;
   config->image_hint = WEBP_HINT_DEFAULT;
-  config->emulate_jpeg_size = 0;
-  config->thread_level = 0;
-  config->low_memory = 0;
 
   // TODO(skal): tune.
   switch (preset) {
@@ -54,13 +53,11 @@ int WebPConfigInitInternal(WebPConfig* config,
       config->sns_strength = 80;
       config->filter_sharpness = 4;
       config->filter_strength = 35;
-      config->preprocessing &= ~2;   // no dithering
       break;
     case WEBP_PRESET_PHOTO:
       config->sns_strength = 80;
       config->filter_sharpness = 3;
       config->filter_strength = 30;
-      config->preprocessing |= 2;
       break;
     case WEBP_PRESET_DRAWING:
       config->sns_strength = 25;
@@ -70,12 +67,10 @@ int WebPConfigInitInternal(WebPConfig* config,
     case WEBP_PRESET_ICON:
       config->sns_strength = 0;
       config->filter_strength = 0;   // disable filtering to retain sharpness
-      config->preprocessing &= ~2;   // no dithering
       break;
     case WEBP_PRESET_TEXT:
       config->sns_strength = 0;
       config->filter_strength = 0;   // disable filtering to retain sharpness
-      config->preprocessing &= ~2;   // no dithering
       config->segments = 2;
       break;
     case WEBP_PRESET_DEFAULT:
@@ -111,7 +106,7 @@ int WebPValidateConfig(const WebPConfig* config) {
     return 0;
   if (config->show_compressed < 0 || config->show_compressed > 1)
     return 0;
-  if (config->preprocessing < 0 || config->preprocessing > 3)
+  if (config->preprocessing < 0 || config->preprocessing > 1)
     return 0;
   if (config->partitions < 0 || config->partitions > 3)
     return 0;
@@ -127,14 +122,11 @@ int WebPValidateConfig(const WebPConfig* config) {
     return 0;
   if (config->image_hint >= WEBP_HINT_LAST)
     return 0;
-  if (config->emulate_jpeg_size < 0 || config->emulate_jpeg_size > 1)
-    return 0;
-  if (config->thread_level < 0 || config->thread_level > 1)
-    return 0;
-  if (config->low_memory < 0 || config->low_memory > 1)
-    return 0;
   return 1;
 }
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/enc/cost.c b/drivers/webp/enc/cost.c
index 09699f8044..92e0cc713c 100644
--- a/drivers/webp/enc/cost.c
+++ b/drivers/webp/enc/cost.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Cost tables for level and modes
@@ -13,6 +11,10 @@
 
 #include "./cost.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // Boolean-cost cost table
 
@@ -73,7 +75,7 @@ const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2] = {
 
 // fixed costs for coding levels, deduce from the coding tree.
 // This is only the part that doesn't depend on the probability state.
-const uint16_t VP8LevelFixedCosts[MAX_LEVEL + 1] = {
+const uint16_t VP8LevelFixedCosts[2048] = {
      0,  256,  256,  256,  256,  432,  618,  630,
    731,  640,  640,  828,  901,  948, 1021, 1101,
   1174, 1221, 1294, 1042, 1085, 1115, 1158, 1202,
@@ -357,7 +359,7 @@ void VP8CalculateLevelCosts(VP8Proba* const proba) {
 
   for (ctype = 0; ctype < NUM_TYPES; ++ctype) {
     for (band = 0; band < NUM_BANDS; ++band) {
-      for (ctx = 0; ctx < NUM_CTX; ++ctx) {
+      for(ctx = 0; ctx < NUM_CTX; ++ctx) {
         const uint8_t* const p = proba->coeffs_[ctype][band][ctx];
         uint16_t* const table = proba->level_cost_[ctype][band][ctx];
         const int cost_base = VP8BitCost(1, p[1]);
@@ -383,107 +385,110 @@ const uint16_t VP8FixedCostsUV[4] = { 302, 984, 439, 642 };
 // note: these values include the fixed VP8BitCost(1, 145) mode selection cost.
 const uint16_t VP8FixedCostsI16[4] = { 663, 919, 872, 919 };
 const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES] = {
-  { {   40, 1151, 1723, 1874, 2103, 2019, 1628, 1777, 2226, 2137 },
-    {  192,  469, 1296, 1308, 1849, 1794, 1781, 1703, 1713, 1522 },
-    {  142,  910,  762, 1684, 1849, 1576, 1460, 1305, 1801, 1657 },
-    {  559,  641, 1370,  421, 1182, 1569, 1612, 1725,  863, 1007 },
-    {  299, 1059, 1256, 1108,  636, 1068, 1581, 1883,  869, 1142 },
-    {  277, 1111,  707, 1362, 1089,  672, 1603, 1541, 1545, 1291 },
-    {  214,  781, 1609, 1303, 1632, 2229,  726, 1560, 1713,  918 },
-    {  152, 1037, 1046, 1759, 1983, 2174, 1358,  742, 1740, 1390 },
-    {  512, 1046, 1420,  753,  752, 1297, 1486, 1613,  460, 1207 },
-    {  424,  827, 1362,  719, 1462, 1202, 1199, 1476, 1199,  538 } },
-  { {  240,  402, 1134, 1491, 1659, 1505, 1517, 1555, 1979, 2099 },
-    {  467,  242,  960, 1232, 1714, 1620, 1834, 1570, 1676, 1391 },
-    {  500,  455,  463, 1507, 1699, 1282, 1564,  982, 2114, 2114 },
-    {  672,  643, 1372,  331, 1589, 1667, 1453, 1938,  996,  876 },
-    {  458,  783, 1037,  911,  738,  968, 1165, 1518,  859, 1033 },
-    {  504,  815,  504, 1139, 1219,  719, 1506, 1085, 1268, 1268 },
-    {  333,  630, 1445, 1239, 1883, 3672,  799, 1548, 1865,  598 },
-    {  399,  644,  746, 1342, 1856, 1350, 1493,  613, 1855, 1015 },
-    {  622,  749, 1205,  608, 1066, 1408, 1290, 1406,  546,  971 },
-    {  500,  753, 1041,  668, 1230, 1617, 1297, 1425, 1383,  523 } },
-  { {  394,  553,  523, 1502, 1536,  981, 1608, 1142, 1666, 2181 },
-    {  655,  430,  375, 1411, 1861, 1220, 1677, 1135, 1978, 1553 },
-    {  690,  640,  245, 1954, 2070, 1194, 1528,  982, 1972, 2232 },
-    {  559,  834,  741,  867, 1131,  980, 1225,  852, 1092,  784 },
-    {  690,  875,  516,  959,  673,  894, 1056, 1190, 1528, 1126 },
-    {  740,  951,  384, 1277, 1177,  492, 1579, 1155, 1846, 1513 },
-    {  323,  775, 1062, 1776, 3062, 1274,  813, 1188, 1372,  655 },
-    {  488,  971,  484, 1767, 1515, 1775, 1115,  503, 1539, 1461 },
-    {  740, 1006,  998,  709,  851, 1230, 1337,  788,  741,  721 },
-    {  522, 1073,  573, 1045, 1346,  887, 1046, 1146, 1203,  697 } },
-  { {  105,  864, 1442, 1009, 1934, 1840, 1519, 1920, 1673, 1579 },
-    {  534,  305, 1193,  683, 1388, 2164, 1802, 1894, 1264, 1170 },
-    {  305,  518,  877, 1108, 1426, 3215, 1425, 1064, 1320, 1242 },
-    {  683,  732, 1927,  257, 1493, 2048, 1858, 1552, 1055,  947 },
-    {  394,  814, 1024,  660,  959, 1556, 1282, 1289,  893, 1047 },
-    {  528,  615,  996,  940, 1201,  635, 1094, 2515,  803, 1358 },
-    {  347,  614, 1609, 1187, 3133, 1345, 1007, 1339, 1017,  667 },
-    {  218,  740,  878, 1605, 3650, 3650, 1345,  758, 1357, 1617 },
-    {  672,  750, 1541,  558, 1257, 1599, 1870, 2135,  402, 1087 },
-    {  592,  684, 1161,  430, 1092, 1497, 1475, 1489, 1095,  822 } },
-  { {  228, 1056, 1059, 1368,  752,  982, 1512, 1518,  987, 1782 },
-    {  494,  514,  818,  942,  965,  892, 1610, 1356, 1048, 1363 },
-    {  512,  648,  591, 1042,  761,  991, 1196, 1454, 1309, 1463 },
-    {  683,  749, 1043,  676,  841, 1396, 1133, 1138,  654,  939 },
-    {  622, 1101, 1126,  994,  361, 1077, 1203, 1318,  877, 1219 },
-    {  631, 1068,  857, 1650,  651,  477, 1650, 1419,  828, 1170 },
-    {  555,  727, 1068, 1335, 3127, 1339,  820, 1331, 1077,  429 },
-    {  504,  879,  624, 1398,  889,  889, 1392,  808,  891, 1406 },
-    {  683, 1602, 1289,  977,  578,  983, 1280, 1708,  406, 1122 },
-    {  399,  865, 1433, 1070, 1072,  764,  968, 1477, 1223,  678 } },
-  { {  333,  760,  935, 1638, 1010,  529, 1646, 1410, 1472, 2219 },
-    {  512,  494,  750, 1160, 1215,  610, 1870, 1868, 1628, 1169 },
-    {  572,  646,  492, 1934, 1208,  603, 1580, 1099, 1398, 1995 },
-    {  786,  789,  942,  581, 1018,  951, 1599, 1207,  731,  768 },
-    {  690, 1015,  672, 1078,  582,  504, 1693, 1438, 1108, 2897 },
-    {  768, 1267,  571, 2005, 1243,  244, 2881, 1380, 1786, 1453 },
-    {  452,  899, 1293,  903, 1311, 3100,  465, 1311, 1319,  813 },
-    {  394,  927,  942, 1103, 1358, 1104,  946,  593, 1363, 1109 },
-    {  559, 1005, 1007, 1016,  658, 1173, 1021, 1164,  623, 1028 },
-    {  564,  796,  632, 1005, 1014,  863, 2316, 1268,  938,  764 } },
-  { {  266,  606, 1098, 1228, 1497, 1243,  948, 1030, 1734, 1461 },
-    {  366,  585,  901, 1060, 1407, 1247,  876, 1134, 1620, 1054 },
-    {  452,  565,  542, 1729, 1479, 1479, 1016,  886, 2938, 1150 },
-    {  555, 1088, 1533,  950, 1354,  895,  834, 1019, 1021,  496 },
-    {  704,  815, 1193,  971,  973,  640, 1217, 2214,  832,  578 },
-    {  672, 1245,  579,  871,  875,  774,  872, 1273, 1027,  949 },
-    {  296, 1134, 2050, 1784, 1636, 3425,  442, 1550, 2076,  722 },
-    {  342,  982, 1259, 1846, 1848, 1848,  622,  568, 1847, 1052 },
-    {  555, 1064, 1304,  828,  746, 1343, 1075, 1329, 1078,  494 },
-    {  288, 1167, 1285, 1174, 1639, 1639,  833, 2254, 1304,  509 } },
-  { {  342,  719,  767, 1866, 1757, 1270, 1246,  550, 1746, 2151 },
-    {  483,  653,  694, 1509, 1459, 1410, 1218,  507, 1914, 1266 },
-    {  488,  757,  447, 2979, 1813, 1268, 1654,  539, 1849, 2109 },
-    {  522, 1097, 1085,  851, 1365, 1111,  851,  901,  961,  605 },
-    {  709,  716,  841,  728,  736,  945,  941,  862, 2845, 1057 },
-    {  512, 1323,  500, 1336, 1083,  681, 1342,  717, 1604, 1350 },
-    {  452, 1155, 1372, 1900, 1501, 3290,  311,  944, 1919,  922 },
-    {  403, 1520,  977, 2132, 1733, 3522, 1076,  276, 3335, 1547 },
-    {  559, 1374, 1101,  615,  673, 2462,  974,  795,  984,  984 },
-    {  547, 1122, 1062,  812, 1410,  951, 1140,  622, 1268,  651 } },
-  { {  165,  982, 1235,  938, 1334, 1366, 1659, 1578,  964, 1612 },
-    {  592,  422,  925,  847, 1139, 1112, 1387, 2036,  861, 1041 },
-    {  403,  837,  732,  770,  941, 1658, 1250,  809, 1407, 1407 },
-    {  896,  874, 1071,  381, 1568, 1722, 1437, 2192,  480, 1035 },
-    {  640, 1098, 1012, 1032,  684, 1382, 1581, 2106,  416,  865 },
-    {  559, 1005,  819,  914,  710,  770, 1418,  920,  838, 1435 },
-    {  415, 1258, 1245,  870, 1278, 3067,  770, 1021, 1287,  522 },
-    {  406,  990,  601, 1009, 1265, 1265, 1267,  759, 1017, 1277 },
-    {  968, 1182, 1329,  788, 1032, 1292, 1705, 1714,  203, 1403 },
-    {  732,  877, 1279,  471,  901, 1161, 1545, 1294,  755,  755 } },
-  { {  111,  931, 1378, 1185, 1933, 1648, 1148, 1714, 1873, 1307 },
-    {  406,  414, 1030, 1023, 1910, 1404, 1313, 1647, 1509,  793 },
-    {  342,  640,  575, 1088, 1241, 1349, 1161, 1350, 1756, 1502 },
-    {  559,  766, 1185,  357, 1682, 1428, 1329, 1897, 1219,  802 },
-    {  473,  909, 1164,  771,  719, 2508, 1427, 1432,  722,  782 },
-    {  342,  892,  785, 1145, 1150,  794, 1296, 1550,  973, 1057 },
-    {  208, 1036, 1326, 1343, 1606, 3395,  815, 1455, 1618,  712 },
-    {  228,  928,  890, 1046, 3499, 1711,  994,  829, 1720, 1318 },
-    {  768,  724, 1058,  636,  991, 1075, 1319, 1324,  616,  825 },
-    {  305, 1167, 1358,  899, 1587, 1587,  987, 1988, 1332,  501 } }
+  { {  251, 1362, 1934, 2085, 2314, 2230, 1839, 1988, 2437, 2348 },
+    {  403,  680, 1507, 1519, 2060, 2005, 1992, 1914, 1924, 1733 },
+    {  353, 1121,  973, 1895, 2060, 1787, 1671, 1516, 2012, 1868 },
+    {  770,  852, 1581,  632, 1393, 1780, 1823, 1936, 1074, 1218 },
+    {  510, 1270, 1467, 1319,  847, 1279, 1792, 2094, 1080, 1353 },
+    {  488, 1322,  918, 1573, 1300,  883, 1814, 1752, 1756, 1502 },
+    {  425,  992, 1820, 1514, 1843, 2440,  937, 1771, 1924, 1129 },
+    {  363, 1248, 1257, 1970, 2194, 2385, 1569,  953, 1951, 1601 },
+    {  723, 1257, 1631,  964,  963, 1508, 1697, 1824,  671, 1418 },
+    {  635, 1038, 1573,  930, 1673, 1413, 1410, 1687, 1410,  749 } },
+  { {  451,  613, 1345, 1702, 1870, 1716, 1728, 1766, 2190, 2310 },
+    {  678,  453, 1171, 1443, 1925, 1831, 2045, 1781, 1887, 1602 },
+    {  711,  666,  674, 1718, 1910, 1493, 1775, 1193, 2325, 2325 },
+    {  883,  854, 1583,  542, 1800, 1878, 1664, 2149, 1207, 1087 },
+    {  669,  994, 1248, 1122,  949, 1179, 1376, 1729, 1070, 1244 },
+    {  715, 1026,  715, 1350, 1430,  930, 1717, 1296, 1479, 1479 },
+    {  544,  841, 1656, 1450, 2094, 3883, 1010, 1759, 2076,  809 },
+    {  610,  855,  957, 1553, 2067, 1561, 1704,  824, 2066, 1226 },
+    {  833,  960, 1416,  819, 1277, 1619, 1501, 1617,  757, 1182 },
+    {  711,  964, 1252,  879, 1441, 1828, 1508, 1636, 1594,  734 } },
+  { {  605,  764,  734, 1713, 1747, 1192, 1819, 1353, 1877, 2392 },
+    {  866,  641,  586, 1622, 2072, 1431, 1888, 1346, 2189, 1764 },
+    {  901,  851,  456, 2165, 2281, 1405, 1739, 1193, 2183, 2443 },
+    {  770, 1045,  952, 1078, 1342, 1191, 1436, 1063, 1303,  995 },
+    {  901, 1086,  727, 1170,  884, 1105, 1267, 1401, 1739, 1337 },
+    {  951, 1162,  595, 1488, 1388,  703, 1790, 1366, 2057, 1724 },
+    {  534,  986, 1273, 1987, 3273, 1485, 1024, 1399, 1583,  866 },
+    {  699, 1182,  695, 1978, 1726, 1986, 1326,  714, 1750, 1672 },
+    {  951, 1217, 1209,  920, 1062, 1441, 1548,  999,  952,  932 },
+    {  733, 1284,  784, 1256, 1557, 1098, 1257, 1357, 1414,  908 } },
+  { {  316, 1075, 1653, 1220, 2145, 2051, 1730, 2131, 1884, 1790 },
+    {  745,  516, 1404,  894, 1599, 2375, 2013, 2105, 1475, 1381 },
+    {  516,  729, 1088, 1319, 1637, 3426, 1636, 1275, 1531, 1453 },
+    {  894,  943, 2138,  468, 1704, 2259, 2069, 1763, 1266, 1158 },
+    {  605, 1025, 1235,  871, 1170, 1767, 1493, 1500, 1104, 1258 },
+    {  739,  826, 1207, 1151, 1412,  846, 1305, 2726, 1014, 1569 },
+    {  558,  825, 1820, 1398, 3344, 1556, 1218, 1550, 1228,  878 },
+    {  429,  951, 1089, 1816, 3861, 3861, 1556,  969, 1568, 1828 },
+    {  883,  961, 1752,  769, 1468, 1810, 2081, 2346,  613, 1298 },
+    {  803,  895, 1372,  641, 1303, 1708, 1686, 1700, 1306, 1033 } },
+  { {  439, 1267, 1270, 1579,  963, 1193, 1723, 1729, 1198, 1993 },
+    {  705,  725, 1029, 1153, 1176, 1103, 1821, 1567, 1259, 1574 },
+    {  723,  859,  802, 1253,  972, 1202, 1407, 1665, 1520, 1674 },
+    {  894,  960, 1254,  887, 1052, 1607, 1344, 1349,  865, 1150 },
+    {  833, 1312, 1337, 1205,  572, 1288, 1414, 1529, 1088, 1430 },
+    {  842, 1279, 1068, 1861,  862,  688, 1861, 1630, 1039, 1381 },
+    {  766,  938, 1279, 1546, 3338, 1550, 1031, 1542, 1288,  640 },
+    {  715, 1090,  835, 1609, 1100, 1100, 1603, 1019, 1102, 1617 },
+    {  894, 1813, 1500, 1188,  789, 1194, 1491, 1919,  617, 1333 },
+    {  610, 1076, 1644, 1281, 1283,  975, 1179, 1688, 1434,  889 } },
+  { {  544,  971, 1146, 1849, 1221,  740, 1857, 1621, 1683, 2430 },
+    {  723,  705,  961, 1371, 1426,  821, 2081, 2079, 1839, 1380 },
+    {  783,  857,  703, 2145, 1419,  814, 1791, 1310, 1609, 2206 },
+    {  997, 1000, 1153,  792, 1229, 1162, 1810, 1418,  942,  979 },
+    {  901, 1226,  883, 1289,  793,  715, 1904, 1649, 1319, 3108 },
+    {  979, 1478,  782, 2216, 1454,  455, 3092, 1591, 1997, 1664 },
+    {  663, 1110, 1504, 1114, 1522, 3311,  676, 1522, 1530, 1024 },
+    {  605, 1138, 1153, 1314, 1569, 1315, 1157,  804, 1574, 1320 },
+    {  770, 1216, 1218, 1227,  869, 1384, 1232, 1375,  834, 1239 },
+    {  775, 1007,  843, 1216, 1225, 1074, 2527, 1479, 1149,  975 } },
+  { {  477,  817, 1309, 1439, 1708, 1454, 1159, 1241, 1945, 1672 },
+    {  577,  796, 1112, 1271, 1618, 1458, 1087, 1345, 1831, 1265 },
+    {  663,  776,  753, 1940, 1690, 1690, 1227, 1097, 3149, 1361 },
+    {  766, 1299, 1744, 1161, 1565, 1106, 1045, 1230, 1232,  707 },
+    {  915, 1026, 1404, 1182, 1184,  851, 1428, 2425, 1043,  789 },
+    {  883, 1456,  790, 1082, 1086,  985, 1083, 1484, 1238, 1160 },
+    {  507, 1345, 2261, 1995, 1847, 3636,  653, 1761, 2287,  933 },
+    {  553, 1193, 1470, 2057, 2059, 2059,  833,  779, 2058, 1263 },
+    {  766, 1275, 1515, 1039,  957, 1554, 1286, 1540, 1289,  705 },
+    {  499, 1378, 1496, 1385, 1850, 1850, 1044, 2465, 1515,  720 } },
+  { {  553,  930,  978, 2077, 1968, 1481, 1457,  761, 1957, 2362 },
+    {  694,  864,  905, 1720, 1670, 1621, 1429,  718, 2125, 1477 },
+    {  699,  968,  658, 3190, 2024, 1479, 1865,  750, 2060, 2320 },
+    {  733, 1308, 1296, 1062, 1576, 1322, 1062, 1112, 1172,  816 },
+    {  920,  927, 1052,  939,  947, 1156, 1152, 1073, 3056, 1268 },
+    {  723, 1534,  711, 1547, 1294,  892, 1553,  928, 1815, 1561 },
+    {  663, 1366, 1583, 2111, 1712, 3501,  522, 1155, 2130, 1133 },
+    {  614, 1731, 1188, 2343, 1944, 3733, 1287,  487, 3546, 1758 },
+    {  770, 1585, 1312,  826,  884, 2673, 1185, 1006, 1195, 1195 },
+    {  758, 1333, 1273, 1023, 1621, 1162, 1351,  833, 1479,  862 } },
+  { {  376, 1193, 1446, 1149, 1545, 1577, 1870, 1789, 1175, 1823 },
+    {  803,  633, 1136, 1058, 1350, 1323, 1598, 2247, 1072, 1252 },
+    {  614, 1048,  943,  981, 1152, 1869, 1461, 1020, 1618, 1618 },
+    { 1107, 1085, 1282,  592, 1779, 1933, 1648, 2403,  691, 1246 },
+    {  851, 1309, 1223, 1243,  895, 1593, 1792, 2317,  627, 1076 },
+    {  770, 1216, 1030, 1125,  921,  981, 1629, 1131, 1049, 1646 },
+    {  626, 1469, 1456, 1081, 1489, 3278,  981, 1232, 1498,  733 },
+    {  617, 1201,  812, 1220, 1476, 1476, 1478,  970, 1228, 1488 },
+    { 1179, 1393, 1540,  999, 1243, 1503, 1916, 1925,  414, 1614 },
+    {  943, 1088, 1490,  682, 1112, 1372, 1756, 1505,  966,  966 } },
+  { {  322, 1142, 1589, 1396, 2144, 1859, 1359, 1925, 2084, 1518 },
+    {  617,  625, 1241, 1234, 2121, 1615, 1524, 1858, 1720, 1004 },
+    {  553,  851,  786, 1299, 1452, 1560, 1372, 1561, 1967, 1713 },
+    {  770,  977, 1396,  568, 1893, 1639, 1540, 2108, 1430, 1013 },
+    {  684, 1120, 1375,  982,  930, 2719, 1638, 1643,  933,  993 },
+    {  553, 1103,  996, 1356, 1361, 1005, 1507, 1761, 1184, 1268 },
+    {  419, 1247, 1537, 1554, 1817, 3606, 1026, 1666, 1829,  923 },
+    {  439, 1139, 1101, 1257, 3710, 1922, 1205, 1040, 1931, 1529 },
+    {  979,  935, 1269,  847, 1202, 1286, 1530, 1535,  827, 1036 },
+    {  516, 1378, 1569, 1110, 1798, 1798, 1198, 2199, 1543,  712 } },
 };
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/enc/cost.h b/drivers/webp/enc/cost.h
index 3cbad1ae4c..09b75b699d 100644
--- a/drivers/webp/enc/cost.h
+++ b/drivers/webp/enc/cost.h
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Cost tables for level and modes.
@@ -16,12 +14,11 @@
 
 #include "./vp8enci.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-// approximate cost per level:
-extern const uint16_t VP8LevelFixedCosts[MAX_LEVEL + 1];
+extern const uint16_t VP8LevelFixedCosts[2048];   // approximate cost per level
 extern const uint16_t VP8EntropyCost[256];        // 8bit fixed-point log(p)
 
 // Cost of coding one event with probability 'proba'.
@@ -44,7 +41,7 @@ extern const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES];
 
 //------------------------------------------------------------------------------
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/enc/filter.c b/drivers/webp/enc/filter.c
index dd27804b55..7fb78a3949 100644
--- a/drivers/webp/enc/filter.c
+++ b/drivers/webp/enc/filter.c
@@ -1,67 +1,20 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Selecting filter level
 //
 // Author: somnath@google.com (Somnath Banerjee)
 
-#include <assert.h>
 #include "./vp8enci.h"
 
-// This table gives, for a given sharpness, the filtering strength to be
-// used (at least) in order to filter a given edge step delta.
-// This is constructed by brute force inspection: for all delta, we iterate
-// over all possible filtering strength / thresh until needs_filter() returns
-// true.
-#define MAX_DELTA_SIZE 64
-static const uint8_t kLevelsFromDelta[8][MAX_DELTA_SIZE] = {
-  { 0,   1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 },
-  { 0,  1,  2,  3,  5,  6,  7,  8,  9, 11, 12, 13, 14, 15, 17, 18,
-    20, 21, 23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42,
-    44, 45, 47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 63, 63,
-    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
-  {  0,  1,  2,  3,  5,  6,  7,  8,  9, 11, 12, 13, 14, 16, 17, 19,
-    20, 22, 23, 25, 26, 28, 29, 31, 32, 34, 35, 37, 38, 40, 41, 43,
-    44, 46, 47, 49, 50, 52, 53, 55, 56, 58, 59, 61, 62, 63, 63, 63,
-    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
-  {  0,  1,  2,  3,  5,  6,  7,  8,  9, 11, 12, 13, 15, 16, 18, 19,
-    21, 22, 24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 40, 42, 43,
-    45, 46, 48, 49, 51, 52, 54, 55, 57, 58, 60, 61, 63, 63, 63, 63,
-    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
-  {  0,  1,  2,  3,  5,  6,  7,  8,  9, 11, 12, 14, 15, 17, 18, 20,
-    21, 23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42, 44,
-    45, 47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 63, 63, 63,
-    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
-  {  0,  1,  2,  4,  5,  7,  8,  9, 11, 12, 13, 15, 16, 17, 19, 20,
-    22, 23, 25, 26, 28, 29, 31, 32, 34, 35, 37, 38, 40, 41, 43, 44,
-    46, 47, 49, 50, 52, 53, 55, 56, 58, 59, 61, 62, 63, 63, 63, 63,
-    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
-  {  0,  1,  2,  4,  5,  7,  8,  9, 11, 12, 13, 15, 16, 18, 19, 21,
-    22, 24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 40, 42, 43, 45,
-    46, 48, 49, 51, 52, 54, 55, 57, 58, 60, 61, 63, 63, 63, 63, 63,
-    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
-  {  0,  1,  2,  4,  5,  7,  8,  9, 11, 12, 14, 15, 17, 18, 20, 21,
-    23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42, 44, 45,
-    47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 63, 63, 63, 63,
-    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 }
-};
-
-int VP8FilterStrengthFromDelta(int sharpness, int delta) {
-  const int pos = (delta < MAX_DELTA_SIZE) ? delta : MAX_DELTA_SIZE - 1;
-  assert(sharpness >= 0 && sharpness <= 7);
-  return kLevelsFromDelta[sharpness][pos];
-}
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
 
-// -----------------------------------------------------------------------------
 // NOTE: clip1, tables and InitTables are repeated entries of dsp.c
 static uint8_t abs0[255 + 255 + 1];     // abs(i)
 static uint8_t abs1[255 + 255 + 1];     // abs(i)>>1
@@ -385,29 +338,28 @@ static double GetMBSSIM(const uint8_t* yuv1, const uint8_t* yuv2) {
 // loop filter strength
 
 void VP8InitFilter(VP8EncIterator* const it) {
-  if (it->lf_stats_ != NULL) {
-    int s, i;
-    InitTables();
-    for (s = 0; s < NUM_MB_SEGMENTS; s++) {
-      for (i = 0; i < MAX_LF_LEVELS; i++) {
-        (*it->lf_stats_)[s][i] = 0;
-      }
+  int s, i;
+  if (!it->lf_stats_) return;
+
+  InitTables();
+  for (s = 0; s < NUM_MB_SEGMENTS; s++) {
+    for (i = 0; i < MAX_LF_LEVELS; i++) {
+      (*it->lf_stats_)[s][i] = 0;
     }
   }
 }
 
 void VP8StoreFilterStats(VP8EncIterator* const it) {
   int d;
-  VP8Encoder* const enc = it->enc_;
   const int s = it->mb_->segment_;
-  const int level0 = enc->dqm_[s].fstrength_;  // TODO: ref_lf_delta[]
+  const int level0 = it->enc_->dqm_[s].fstrength_;  // TODO: ref_lf_delta[]
 
   // explore +/-quant range of values around level0
-  const int delta_min = -enc->dqm_[s].quant_;
-  const int delta_max = enc->dqm_[s].quant_;
+  const int delta_min = -it->enc_->dqm_[s].quant_;
+  const int delta_max = it->enc_->dqm_[s].quant_;
   const int step_size = (delta_max - delta_min >= 4) ? 4 : 1;
 
-  if (it->lf_stats_ == NULL) return;
+  if (!it->lf_stats_) return;
 
   // NOTE: Currently we are applying filter only across the sublock edges
   // There are two reasons for that.
@@ -431,41 +383,27 @@ void VP8StoreFilterStats(VP8EncIterator* const it) {
 }
 
 void VP8AdjustFilterStrength(VP8EncIterator* const it) {
+  int s;
   VP8Encoder* const enc = it->enc_;
-  if (it->lf_stats_ != NULL) {
-    int s;
-    for (s = 0; s < NUM_MB_SEGMENTS; s++) {
-      int i, best_level = 0;
-      // Improvement over filter level 0 should be at least 1e-5 (relatively)
-      double best_v = 1.00001 * (*it->lf_stats_)[s][0];
-      for (i = 1; i < MAX_LF_LEVELS; i++) {
-        const double v = (*it->lf_stats_)[s][i];
-        if (v > best_v) {
-          best_v = v;
-          best_level = i;
-        }
-      }
-      enc->dqm_[s].fstrength_ = best_level;
-    }
-  } else if (enc->config_->filter_strength > 0) {
-    int max_level = 0;
-    int s;
-    for (s = 0; s < NUM_MB_SEGMENTS; s++) {
-      VP8SegmentInfo* const dqm = &enc->dqm_[s];
-      // this '>> 3' accounts for some inverse WHT scaling
-      const int delta = (dqm->max_edge_ * dqm->y2_.q_[1]) >> 3;
-      const int level =
-          VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, delta);
-      if (level > dqm->fstrength_) {
-        dqm->fstrength_ = level;
-      }
-      if (max_level < dqm->fstrength_) {
-        max_level = dqm->fstrength_;
+
+  if (!it->lf_stats_) {
+    return;
+  }
+  for (s = 0; s < NUM_MB_SEGMENTS; s++) {
+    int i, best_level = 0;
+    // Improvement over filter level 0 should be at least 1e-5 (relatively)
+    double best_v = 1.00001 * (*it->lf_stats_)[s][0];
+    for (i = 1; i < MAX_LF_LEVELS; i++) {
+      const double v = (*it->lf_stats_)[s][i];
+      if (v > best_v) {
+        best_v = v;
+        best_level = i;
       }
     }
-    enc->filter_hdr_.level_ = max_level;
+    enc->dqm_[s].fstrength_ = best_level;
   }
 }
 
-// -----------------------------------------------------------------------------
-
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/enc/frame.c b/drivers/webp/enc/frame.c
index 2582244c6c..bdd360069b 100644
--- a/drivers/webp/enc/frame.c
+++ b/drivers/webp/enc/frame.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 //   frame coding and analysis
@@ -18,7 +16,10 @@
 
 #include "./vp8enci.h"
 #include "./cost.h"
-#include "../webp/format_constants.h"  // RIFF constants
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
 
 #define SEGMENT_VISU 0
 #define DEBUG_SEARCH 0    // useful to track search convergence
@@ -37,63 +38,6 @@ typedef struct {
 } VP8Residual;
 
 //------------------------------------------------------------------------------
-// multi-pass convergence
-
-#define HEADER_SIZE_ESTIMATE (RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE +  \
-                              VP8_FRAME_HEADER_SIZE)
-#define DQ_LIMIT 0.4  // convergence is considered reached if dq < DQ_LIMIT
-// we allow 2k of extra head-room in PARTITION0 limit.
-#define PARTITION0_SIZE_LIMIT ((VP8_MAX_PARTITION0_SIZE - 2048ULL) << 11)
-
-typedef struct {  // struct for organizing convergence in either size or PSNR
-  int is_first;
-  float dq;
-  float q, last_q;
-  double value, last_value;   // PSNR or size
-  double target;
-  int do_size_search;
-} PassStats;
-
-static int InitPassStats(const VP8Encoder* const enc, PassStats* const s) {
-  const uint64_t target_size = (uint64_t)enc->config_->target_size;
-  const int do_size_search = (target_size != 0);
-  const float target_PSNR = enc->config_->target_PSNR;
-
-  s->is_first = 1;
-  s->dq = 10.f;
-  s->q = s->last_q = enc->config_->quality;
-  s->target = do_size_search ? (double)target_size
-            : (target_PSNR > 0.) ? target_PSNR
-            : 40.;   // default, just in case
-  s->value = s->last_value = 0.;
-  s->do_size_search = do_size_search;
-  return do_size_search;
-}
-
-static float Clamp(float v, float min, float max) {
-  return (v < min) ? min : (v > max) ? max : v;
-}
-
-static float ComputeNextQ(PassStats* const s) {
-  float dq;
-  if (s->is_first) {
-    dq = (s->value > s->target) ? -s->dq : s->dq;
-    s->is_first = 0;
-  } else if (s->value != s->last_value) {
-    const double slope = (s->target - s->value) / (s->last_value - s->value);
-    dq = (float)(slope * (s->last_q - s->q));
-  } else {
-    dq = 0.;  // we're done?!
-  }
-  // Limit variable to avoid large swings.
-  s->dq = Clamp(dq, -30.f, 30.f);
-  s->last_q = s->q;
-  s->last_value = s->value;
-  s->q = Clamp(s->q + s->dq, 0.f, 100.f);
-  return s->q;
-}
-
-//------------------------------------------------------------------------------
 // Tables for level coding
 
 const uint8_t VP8EncBands[16 + 1] = {
@@ -101,10 +45,10 @@ const uint8_t VP8EncBands[16 + 1] = {
   0  // sentinel
 };
 
-const uint8_t VP8Cat3[] = { 173, 148, 140 };
-const uint8_t VP8Cat4[] = { 176, 155, 140, 135 };
-const uint8_t VP8Cat5[] = { 180, 157, 141, 134, 130 };
-const uint8_t VP8Cat6[] =
+static const uint8_t kCat3[] = { 173, 148, 140 };
+static const uint8_t kCat4[] = { 176, 155, 140, 135 };
+static const uint8_t kCat5[] = { 180, 157, 141, 134, 130 };
+static const uint8_t kCat6[] =
     { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129 };
 
 //------------------------------------------------------------------------------
@@ -169,15 +113,14 @@ static int Record(int bit, proba_t* const stats) {
 // Note: no need to record the fixed probas.
 static int RecordCoeffs(int ctx, const VP8Residual* const res) {
   int n = res->first;
-  // should be stats[VP8EncBands[n]], but it's equivalent for n=0 or 1
-  proba_t* s = res->stats[n][ctx];
+  proba_t* s = res->stats[VP8EncBands[n]][ctx];
   if (res->last  < 0) {
     Record(0, s + 0);
     return 0;
   }
   while (n <= res->last) {
     int v;
-    Record(1, s + 0);  // order of record doesn't matter
+    Record(1, s + 0);
     while ((v = res->coeffs[n++]) == 0) {
       Record(0, s + 1);
       s = res->stats[VP8EncBands[n]][0];
@@ -231,7 +174,8 @@ static int BranchCost(int nb, int total, int proba) {
   return nb * VP8BitCost(1, proba) + (total - nb) * VP8BitCost(0, proba);
 }
 
-static int FinalizeTokenProbas(VP8Proba* const proba) {
+static int FinalizeTokenProbas(VP8Encoder* const enc) {
+  VP8Proba* const proba = &enc->proba_;
   int has_changed = 0;
   int size = 0;
   int t, b, c, p;
@@ -268,47 +212,6 @@ static int FinalizeTokenProbas(VP8Proba* const proba) {
 }
 
 //------------------------------------------------------------------------------
-// Finalize Segment probability based on the coding tree
-
-static int GetProba(int a, int b) {
-  const int total = a + b;
-  return (total == 0) ? 255     // that's the default probability.
-                      : (255 * a + total / 2) / total;  // rounded proba
-}
-
-static void SetSegmentProbas(VP8Encoder* const enc) {
-  int p[NUM_MB_SEGMENTS] = { 0 };
-  int n;
-
-  for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
-    const VP8MBInfo* const mb = &enc->mb_info_[n];
-    p[mb->segment_]++;
-  }
-  if (enc->pic_->stats != NULL) {
-    for (n = 0; n < NUM_MB_SEGMENTS; ++n) {
-      enc->pic_->stats->segment_size[n] = p[n];
-    }
-  }
-  if (enc->segment_hdr_.num_segments_ > 1) {
-    uint8_t* const probas = enc->proba_.segments_;
-    probas[0] = GetProba(p[0] + p[1], p[2] + p[3]);
-    probas[1] = GetProba(p[0], p[1]);
-    probas[2] = GetProba(p[2], p[3]);
-
-    enc->segment_hdr_.update_map_ =
-        (probas[0] != 255) || (probas[1] != 255) || (probas[2] != 255);
-    enc->segment_hdr_.size_ =
-        p[0] * (VP8BitCost(0, probas[0]) + VP8BitCost(0, probas[1])) +
-        p[1] * (VP8BitCost(0, probas[0]) + VP8BitCost(1, probas[1])) +
-        p[2] * (VP8BitCost(1, probas[0]) + VP8BitCost(0, probas[2])) +
-        p[3] * (VP8BitCost(1, probas[0]) + VP8BitCost(1, probas[2]));
-  } else {
-    enc->segment_hdr_.update_map_ = 0;
-    enc->segment_hdr_.size_ = 0;
-  }
-}
-
-//------------------------------------------------------------------------------
 // helper functions for residuals struct VP8Residual.
 
 static void InitResidual(int first, int coeff_type,
@@ -336,38 +239,39 @@ static void SetResidualCoeffs(const int16_t* const coeffs,
 //------------------------------------------------------------------------------
 // Mode costs
 
-static int GetResidualCost(int ctx0, const VP8Residual* const res) {
+static int GetResidualCost(int ctx, const VP8Residual* const res) {
   int n = res->first;
-  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
-  int p0 = res->prob[n][ctx0][0];
-  const uint16_t* t = res->cost[n][ctx0];
+  int p0 = res->prob[VP8EncBands[n]][ctx][0];
+  const uint16_t* t = res->cost[VP8EncBands[n]][ctx];
   int cost;
 
   if (res->last < 0) {
     return VP8BitCost(0, p0);
   }
-  cost = VP8BitCost(1, p0);
-  for (; n < res->last; ++n) {
-    const int v = abs(res->coeffs[n]);
+  cost = 0;
+  while (n <= res->last) {
+    const int v = res->coeffs[n];
     const int b = VP8EncBands[n + 1];
-    const int ctx = (v >= 2) ? 2 : v;
-    cost += VP8LevelCost(t, v);
-    t = res->cost[b][ctx];
-    // the masking trick is faster than "if (v) cost += ..." with clang
-    cost += (v ? ~0U : 0) & VP8BitCost(1, res->prob[b][ctx][0]);
-  }
-  // Last coefficient is always non-zero
-  {
-    const int v = abs(res->coeffs[n]);
-    assert(v != 0);
-    cost += VP8LevelCost(t, v);
-    if (n < 15) {
-      const int b = VP8EncBands[n + 1];
-      const int ctx = (v == 1) ? 1 : 2;
-      const int last_p0 = res->prob[b][ctx][0];
-      cost += VP8BitCost(0, last_p0);
+    ++n;
+    if (v == 0) {
+      // short-case for VP8LevelCost(t, 0) (note: VP8LevelFixedCosts[0] == 0):
+      cost += t[0];
+      t = res->cost[b][0];
+      continue;
+    }
+    cost += VP8BitCost(1, p0);
+    if (2u >= (unsigned int)(v + 1)) {   // v = -1 or 1
+      // short-case for "VP8LevelCost(t, 1)" (256 is VP8LevelFixedCosts[1]):
+      cost += 256 + t[1];
+      p0 = res->prob[b][1][0];
+      t = res->cost[b][1];
+    } else {
+      cost += VP8LevelCost(t, abs(v));
+      p0 = res->prob[b][2][0];
+      t = res->cost[b][2];
     }
   }
+  if (n < 16) cost += VP8BitCost(0, p0);
   return cost;
 }
 
@@ -438,8 +342,7 @@ int VP8GetCostUV(VP8EncIterator* const it, const VP8ModeScore* const rd) {
 
 static int PutCoeffs(VP8BitWriter* const bw, int ctx, const VP8Residual* res) {
   int n = res->first;
-  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
-  const uint8_t* p = res->prob[n][ctx];
+  const uint8_t* p = res->prob[VP8EncBands[n]][ctx];
   if (!VP8PutBit(bw, res->last >= 0, p[0])) {
     return 0;
   }
@@ -468,30 +371,30 @@ static int PutCoeffs(VP8BitWriter* const bw, int ctx, const VP8Residual* res) {
       } else {
         int mask;
         const uint8_t* tab;
-        if (v < 3 + (8 << 1)) {          // VP8Cat3  (3b)
+        if (v < 3 + (8 << 1)) {          // kCat3  (3b)
           VP8PutBit(bw, 0, p[8]);
           VP8PutBit(bw, 0, p[9]);
           v -= 3 + (8 << 0);
           mask = 1 << 2;
-          tab = VP8Cat3;
-        } else if (v < 3 + (8 << 2)) {   // VP8Cat4  (4b)
+          tab = kCat3;
+        } else if (v < 3 + (8 << 2)) {   // kCat4  (4b)
           VP8PutBit(bw, 0, p[8]);
           VP8PutBit(bw, 1, p[9]);
           v -= 3 + (8 << 1);
           mask = 1 << 3;
-          tab = VP8Cat4;
-        } else if (v < 3 + (8 << 3)) {   // VP8Cat5  (5b)
+          tab = kCat4;
+        } else if (v < 3 + (8 << 3)) {   // kCat5  (5b)
           VP8PutBit(bw, 1, p[8]);
           VP8PutBit(bw, 0, p[10]);
           v -= 3 + (8 << 2);
           mask = 1 << 4;
-          tab = VP8Cat5;
-        } else {                         // VP8Cat6 (11b)
+          tab = kCat5;
+        } else {                         // kCat6 (11b)
           VP8PutBit(bw, 1, p[8]);
           VP8PutBit(bw, 1, p[10]);
           v -= 3 + (8 << 3);
           mask = 1 << 10;
-          tab = VP8Cat6;
+          tab = kCat6;
         }
         while (mask) {
           VP8PutBit(bw, !!(v & mask), *tab++);
@@ -508,7 +411,8 @@ static int PutCoeffs(VP8BitWriter* const bw, int ctx, const VP8Residual* res) {
   return 1;
 }
 
-static void CodeResiduals(VP8BitWriter* const bw, VP8EncIterator* const it,
+static void CodeResiduals(VP8BitWriter* const bw,
+                          VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
   int x, y, ch;
   VP8Residual res;
@@ -608,23 +512,146 @@ static void RecordResiduals(VP8EncIterator* const it,
 //------------------------------------------------------------------------------
 // Token buffer
 
-#if !defined(DISABLE_TOKEN_BUFFER)
+#ifdef USE_TOKEN_BUFFER
+
+void VP8TBufferInit(VP8TBuffer* const b) {
+  b->rows_ = NULL;
+  b->tokens_ = NULL;
+  b->last_ = &b->rows_;
+  b->left_ = 0;
+  b->error_ = 0;
+}
+
+int VP8TBufferNewPage(VP8TBuffer* const b) {
+  VP8Tokens* const page = b->error_ ? NULL : (VP8Tokens*)malloc(sizeof(*page));
+  if (page == NULL) {
+    b->error_ = 1;
+    return 0;
+  }
+  *b->last_ = page;
+  b->last_ = &page->next_;
+  b->left_ = MAX_NUM_TOKEN;
+  b->tokens_ = page->tokens_;
+  return 1;
+}
+
+void VP8TBufferClear(VP8TBuffer* const b) {
+  if (b != NULL) {
+    const VP8Tokens* p = b->rows_;
+    while (p != NULL) {
+      const VP8Tokens* const next = p->next_;
+      free((void*)p);
+      p = next;
+    }
+    VP8TBufferInit(b);
+  }
+}
+
+int VP8EmitTokens(const VP8TBuffer* const b, VP8BitWriter* const bw,
+                  const uint8_t* const probas) {
+  VP8Tokens* p = b->rows_;
+  if (b->error_) return 0;
+  while (p != NULL) {
+    const int N = (p->next_ == NULL) ? b->left_ : 0;
+    int n = MAX_NUM_TOKEN;
+    while (n-- > N) {
+      VP8PutBit(bw, (p->tokens_[n] >> 15) & 1, probas[p->tokens_[n] & 0x7fff]);
+    }
+    p = p->next_;
+  }
+  return 1;
+}
+
+#define TOKEN_ID(b, ctx, p) ((p) + NUM_PROBAS * ((ctx) + (b) * NUM_CTX))
+
+static int RecordCoeffTokens(int ctx, const VP8Residual* const res,
+                             VP8TBuffer* tokens) {
+  int n = res->first;
+  int b = VP8EncBands[n];
+  if (!VP8AddToken(tokens, res->last >= 0, TOKEN_ID(b, ctx, 0))) {
+    return 0;
+  }
+
+  while (n < 16) {
+    const int c = res->coeffs[n++];
+    const int sign = c < 0;
+    int v = sign ? -c : c;
+    const int base_id = TOKEN_ID(b, ctx, 0);
+    if (!VP8AddToken(tokens, v != 0, base_id + 1)) {
+      b = VP8EncBands[n];
+      ctx = 0;
+      continue;
+    }
+    if (!VP8AddToken(tokens, v > 1, base_id + 2)) {
+      b = VP8EncBands[n];
+      ctx = 1;
+    } else {
+      if (!VP8AddToken(tokens, v > 4, base_id + 3)) {
+        if (VP8AddToken(tokens, v != 2, base_id + 4))
+          VP8AddToken(tokens, v == 4, base_id + 5);
+      } else if (!VP8AddToken(tokens, v > 10, base_id + 6)) {
+        if (!VP8AddToken(tokens, v > 6, base_id + 7)) {
+//          VP8AddToken(tokens, v == 6, 159);
+        } else {
+//          VP8AddToken(tokens, v >= 9, 165);
+//          VP8AddToken(tokens, !(v & 1), 145);
+        }
+      } else {
+        int mask;
+        const uint8_t* tab;
+        if (v < 3 + (8 << 1)) {          // kCat3  (3b)
+          VP8AddToken(tokens, 0, base_id + 8);
+          VP8AddToken(tokens, 0, base_id + 9);
+          v -= 3 + (8 << 0);
+          mask = 1 << 2;
+          tab = kCat3;
+        } else if (v < 3 + (8 << 2)) {   // kCat4  (4b)
+          VP8AddToken(tokens, 0, base_id + 8);
+          VP8AddToken(tokens, 1, base_id + 9);
+          v -= 3 + (8 << 1);
+          mask = 1 << 3;
+          tab = kCat4;
+        } else if (v < 3 + (8 << 3)) {   // kCat5  (5b)
+          VP8AddToken(tokens, 1, base_id + 8);
+          VP8AddToken(tokens, 0, base_id + 10);
+          v -= 3 + (8 << 2);
+          mask = 1 << 4;
+          tab = kCat5;
+        } else {                         // kCat6 (11b)
+          VP8AddToken(tokens, 1, base_id + 8);
+          VP8AddToken(tokens, 1, base_id + 10);
+          v -= 3 + (8 << 3);
+          mask = 1 << 10;
+          tab = kCat6;
+        }
+        while (mask) {
+          // VP8AddToken(tokens, !!(v & mask), *tab++);
+          mask >>= 1;
+        }
+      }
+      ctx = 2;
+    }
+    b = VP8EncBands[n];
+    // VP8PutBitUniform(bw, sign);
+    if (n == 16 || !VP8AddToken(tokens, n <= res->last, TOKEN_ID(b, ctx, 0))) {
+      return 1;   // EOB
+    }
+  }
+  return 1;
+}
 
-static void RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
-                         VP8TBuffer* const tokens) {
+static void RecordTokens(VP8EncIterator* const it,
+                         const VP8ModeScore* const rd, VP8TBuffer tokens[2]) {
   int x, y, ch;
   VP8Residual res;
   VP8Encoder* const enc = it->enc_;
 
   VP8IteratorNzToBytes(it);
   if (it->mb_->type_ == 1) {   // i16x16
-    const int ctx = it->top_nz_[8] + it->left_nz_[8];
     InitResidual(0, 1, enc, &res);
     SetResidualCoeffs(rd->y_dc_levels, &res);
-    it->top_nz_[8] = it->left_nz_[8] =
-        VP8RecordCoeffTokens(ctx, 1,
-                             res.first, res.last, res.coeffs, tokens);
-    RecordCoeffs(ctx, &res);
+// TODO(skal): FIX ->    it->top_nz_[8] = it->left_nz_[8] =
+      RecordCoeffTokens(it->top_nz_[8] + it->left_nz_[8], &res, &tokens[0]);
     InitResidual(1, 0, enc, &res);
   } else {
     InitResidual(0, 3, enc, &res);
@@ -636,9 +663,7 @@ static void RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
       const int ctx = it->top_nz_[x] + it->left_nz_[y];
       SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
       it->top_nz_[x] = it->left_nz_[y] =
-          VP8RecordCoeffTokens(ctx, res.coeff_type,
-                               res.first, res.last, res.coeffs, tokens);
-      RecordCoeffs(ctx, &res);
+          RecordCoeffTokens(ctx, &res, &tokens[0]);
     }
   }
 
@@ -650,16 +675,13 @@ static void RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
         const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
         SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
         it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] =
-            VP8RecordCoeffTokens(ctx, 2,
-                                 res.first, res.last, res.coeffs, tokens);
-        RecordCoeffs(ctx, &res);
+            RecordCoeffTokens(ctx, &res, &tokens[1]);
       }
     }
   }
-  VP8IteratorBytesToNz(it);
 }
 
-#endif    // !DISABLE_TOKEN_BUFFER
+#endif    // USE_TOKEN_BUFFER
 
 //------------------------------------------------------------------------------
 // ExtraInfo map / Debug function
@@ -675,10 +697,7 @@ static void SetBlock(uint8_t* p, int value, int size) {
 #endif
 
 static void ResetSSE(VP8Encoder* const enc) {
-  enc->sse_[0] = 0;
-  enc->sse_[1] = 0;
-  enc->sse_[2] = 0;
-  // Note: enc->sse_[3] is managed by alpha.c
+  memset(enc->sse_, 0, sizeof(enc->sse_));
   enc->sse_count_ = 0;
 }
 
@@ -717,7 +736,6 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
         const int b = (int)((it->luma_bits_ + it->uv_bits_ + 7) >> 3);
         *info = (b > 255) ? 255 : b; break;
       }
-      case 7: *info = mb->alpha_; break;
       default: *info = 0; break;
     };
   }
@@ -728,149 +746,62 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
 #endif
 }
 
-static double GetPSNR(uint64_t mse, uint64_t size) {
-  return (mse > 0 && size > 0) ? 10. * log10(255. * 255. * size / mse) : 99;
-}
-
 //------------------------------------------------------------------------------
-//  StatLoop(): only collect statistics (number of skips, token usage, ...).
-//  This is used for deciding optimal probabilities. It also modifies the
-//  quantizer value if some target (size, PSNR) was specified.
+// Main loops
+//
+//  VP8EncLoop(): does the final bitstream coding.
 
-static void SetLoopParams(VP8Encoder* const enc, float q) {
-  // Make sure the quality parameter is inside valid bounds
-  q = Clamp(q, 0.f, 100.f);
+static void ResetAfterSkip(VP8EncIterator* const it) {
+  if (it->mb_->type_ == 1) {
+    *it->nz_ = 0;  // reset all predictors
+    it->left_nz_[8] = 0;
+  } else {
+    *it->nz_ &= (1 << 24);  // preserve the dc_nz bit
+  }
+}
 
-  VP8SetSegmentParams(enc, q);      // setup segment quantizations and filters
-  SetSegmentProbas(enc);            // compute segment probabilities
+int VP8EncLoop(VP8Encoder* const enc) {
+  int i, s, p;
+  int ok = 1;
+  VP8EncIterator it;
+  VP8ModeScore info;
+  const int dont_use_skip = !enc->proba_.use_skip_proba_;
+  const int rd_opt = enc->rd_opt_level_;
+  const int kAverageBytesPerMB = 5;     // TODO: have a kTable[quality/10]
+  const int bytes_per_parts =
+    enc->mb_w_ * enc->mb_h_ * kAverageBytesPerMB / enc->num_parts_;
+
+  // Initialize the bit-writers
+  for (p = 0; p < enc->num_parts_; ++p) {
+    VP8BitWriterInit(enc->parts_ + p, bytes_per_parts);
+  }
 
   ResetStats(enc);
   ResetSSE(enc);
-}
-
-static uint64_t OneStatPass(VP8Encoder* const enc, VP8RDLevel rd_opt,
-                            int nb_mbs, int percent_delta,
-                            PassStats* const s) {
-  VP8EncIterator it;
-  uint64_t size = 0;
-  uint64_t size_p0 = 0;
-  uint64_t distortion = 0;
-  const uint64_t pixel_count = nb_mbs * 384;
 
   VP8IteratorInit(enc, &it);
-  SetLoopParams(enc, s->q);
+  VP8InitFilter(&it);
   do {
-    VP8ModeScore info;
-    VP8IteratorImport(&it, NULL);
-    if (VP8Decimate(&it, &info, rd_opt)) {
-      // Just record the number of skips and act like skip_proba is not used.
-      enc->proba_.nb_skip_++;
+    VP8IteratorImport(&it);
+    // Warning! order is important: first call VP8Decimate() and
+    // *then* decide how to code the skip decision if there's one.
+    if (!VP8Decimate(&it, &info, rd_opt) || dont_use_skip) {
+      CodeResiduals(it.bw_, &it, &info);
+    } else {   // reset predictors after a skip
+      ResetAfterSkip(&it);
     }
-    RecordResiduals(&it, &info);
-    size += info.R + info.H;
-    size_p0 += info.H;
-    distortion += info.D;
-    if (percent_delta && !VP8IteratorProgress(&it, percent_delta))
-      return 0;
-    VP8IteratorSaveBoundary(&it);
-  } while (VP8IteratorNext(&it) && --nb_mbs > 0);
-
-  size_p0 += enc->segment_hdr_.size_;
-  if (s->do_size_search) {
-    size += FinalizeSkipProba(enc);
-    size += FinalizeTokenProbas(&enc->proba_);
-    size = ((size + size_p0 + 1024) >> 11) + HEADER_SIZE_ESTIMATE;
-    s->value = (double)size;
-  } else {
-    s->value = GetPSNR(distortion, pixel_count);
-  }
-  return size_p0;
-}
-
-static int StatLoop(VP8Encoder* const enc) {
-  const int method = enc->method_;
-  const int do_search = enc->do_search_;
-  const int fast_probe = ((method == 0 || method == 3) && !do_search);
-  int num_pass_left = enc->config_->pass;
-  const int task_percent = 20;
-  const int percent_per_pass =
-      (task_percent + num_pass_left / 2) / num_pass_left;
-  const int final_percent = enc->percent_ + task_percent;
-  const VP8RDLevel rd_opt =
-      (method >= 3 || do_search) ? RD_OPT_BASIC : RD_OPT_NONE;
-  int nb_mbs = enc->mb_w_ * enc->mb_h_;
-  PassStats stats;
-
-  InitPassStats(enc, &stats);
-  ResetTokenStats(enc);
-
-  // Fast mode: quick analysis pass over few mbs. Better than nothing.
-  if (fast_probe) {
-    if (method == 3) {  // we need more stats for method 3 to be reliable.
-      nb_mbs = (nb_mbs > 200) ? nb_mbs >> 1 : 100;
-    } else {
-      nb_mbs = (nb_mbs > 200) ? nb_mbs >> 2 : 50;
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+    if (enc->use_layer_) {
+      VP8EncCodeLayerBlock(&it);
     }
-  }
-
-  while (num_pass_left-- > 0) {
-    const int is_last_pass = (fabs(stats.dq) <= DQ_LIMIT) ||
-                             (num_pass_left == 0) ||
-                             (enc->max_i4_header_bits_ == 0);
-    const uint64_t size_p0 =
-        OneStatPass(enc, rd_opt, nb_mbs, percent_per_pass, &stats);
-    if (size_p0 == 0) return 0;
-#if (DEBUG_SEARCH > 0)
-    printf("#%d value:%.1lf -> %.1lf   q:%.2f -> %.2f\n",
-           num_pass_left, stats.last_value, stats.value, stats.last_q, stats.q);
 #endif
-    if (enc->max_i4_header_bits_ > 0 && size_p0 > PARTITION0_SIZE_LIMIT) {
-      ++num_pass_left;
-      enc->max_i4_header_bits_ >>= 1;  // strengthen header bit limitation...
-      continue;                        // ...and start over
-    }
-    if (is_last_pass) {
-      break;
-    }
-    // If no target size: just do several pass without changing 'q'
-    if (do_search) {
-      ComputeNextQ(&stats);
-      if (fabs(stats.dq) <= DQ_LIMIT) break;
-    }
-  }
-  if (!do_search || !stats.do_size_search) {
-    // Need to finalize probas now, since it wasn't done during the search.
-    FinalizeSkipProba(enc);
-    FinalizeTokenProbas(&enc->proba_);
-  }
-  VP8CalculateLevelCosts(&enc->proba_);  // finalize costs
-  return WebPReportProgress(enc->pic_, final_percent, &enc->percent_);
-}
-
-//------------------------------------------------------------------------------
-// Main loops
-//
-
-static const int kAverageBytesPerMB[8] = { 50, 24, 16, 9, 7, 5, 3, 2 };
-
-static int PreLoopInitialize(VP8Encoder* const enc) {
-  int p;
-  int ok = 1;
-  const int average_bytes_per_MB = kAverageBytesPerMB[enc->base_quant_ >> 4];
-  const int bytes_per_parts =
-      enc->mb_w_ * enc->mb_h_ * average_bytes_per_MB / enc->num_parts_;
-  // Initialize the bit-writers
-  for (p = 0; ok && p < enc->num_parts_; ++p) {
-    ok = VP8BitWriterInit(enc->parts_ + p, bytes_per_parts);
-  }
-  if (!ok) VP8EncFreeBitWriters(enc);  // malloc error occurred
-  return ok;
-}
+    StoreSideInfo(&it);
+    VP8StoreFilterStats(&it);
+    VP8IteratorExport(&it);
+    ok = VP8IteratorProgress(&it, 20);
+  } while (ok && VP8IteratorNext(&it, it.yuv_out_));
 
-static int PostLoopFinalize(VP8EncIterator* const it, int ok) {
-  VP8Encoder* const enc = it->enc_;
   if (ok) {      // Finalize the partitions, check for extra errors.
-    int p;
     for (p = 0; p < enc->num_parts_; ++p) {
       VP8BitWriterFinish(enc->parts_ + p);
       ok &= !enc->parts_[p].error_;
@@ -878,191 +809,131 @@ static int PostLoopFinalize(VP8EncIterator* const it, int ok) {
   }
 
   if (ok) {      // All good. Finish up.
-    if (enc->pic_->stats != NULL) {  // finalize byte counters...
-      int i, s;
+    if (enc->pic_->stats) {           // finalize byte counters...
       for (i = 0; i <= 2; ++i) {
         for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
-          enc->residual_bytes_[i][s] = (int)((it->bit_count_[s][i] + 7) >> 3);
+          enc->residual_bytes_[i][s] = (int)((it.bit_count_[s][i] + 7) >> 3);
         }
       }
     }
-    VP8AdjustFilterStrength(it);     // ...and store filter stats.
+    VP8AdjustFilterStrength(&it);     // ...and store filter stats.
   } else {
     // Something bad happened -> need to do some memory cleanup.
     VP8EncFreeBitWriters(enc);
   }
+
   return ok;
 }
 
 //------------------------------------------------------------------------------
-//  VP8EncLoop(): does the final bitstream coding.
+//  VP8StatLoop(): only collect statistics (number of skips, token usage, ...)
+//                 This is used for deciding optimal probabilities. It also
+//                 modifies the quantizer value if some target (size, PSNR)
+//                 was specified.
 
-static void ResetAfterSkip(VP8EncIterator* const it) {
-  if (it->mb_->type_ == 1) {
-    *it->nz_ = 0;  // reset all predictors
-    it->left_nz_[8] = 0;
-  } else {
-    *it->nz_ &= (1 << 24);  // preserve the dc_nz bit
-  }
-}
+#define kHeaderSizeEstimate (15 + 20 + 10)      // TODO: fix better
 
-int VP8EncLoop(VP8Encoder* const enc) {
+static int OneStatPass(VP8Encoder* const enc, float q, int rd_opt, int nb_mbs,
+                       float* const PSNR, int percent_delta) {
   VP8EncIterator it;
-  int ok = PreLoopInitialize(enc);
-  if (!ok) return 0;
+  uint64_t size = 0;
+  uint64_t distortion = 0;
+  const uint64_t pixel_count = nb_mbs * 384;
 
-  StatLoop(enc);  // stats-collection loop
+  // Make sure the quality parameter is inside valid bounds
+  if (q < 0.) {
+    q = 0;
+  } else if (q > 100.) {
+    q = 100;
+  }
+
+  VP8SetSegmentParams(enc, q);      // setup segment quantizations and filters
+
+  ResetStats(enc);
+  ResetTokenStats(enc);
 
   VP8IteratorInit(enc, &it);
-  VP8InitFilter(&it);
   do {
     VP8ModeScore info;
-    const int dont_use_skip = !enc->proba_.use_skip_proba_;
-    const VP8RDLevel rd_opt = enc->rd_opt_level_;
-
-    VP8IteratorImport(&it, NULL);
-    // Warning! order is important: first call VP8Decimate() and
-    // *then* decide how to code the skip decision if there's one.
-    if (!VP8Decimate(&it, &info, rd_opt) || dont_use_skip) {
-      CodeResiduals(it.bw_, &it, &info);
-    } else {   // reset predictors after a skip
-      ResetAfterSkip(&it);
-    }
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-    if (enc->use_layer_) {
-      VP8EncCodeLayerBlock(&it);
+    VP8IteratorImport(&it);
+    if (VP8Decimate(&it, &info, rd_opt)) {
+      // Just record the number of skips and act like skip_proba is not used.
+      enc->proba_.nb_skip_++;
     }
-#endif
-    StoreSideInfo(&it);
-    VP8StoreFilterStats(&it);
-    VP8IteratorExport(&it);
-    ok = VP8IteratorProgress(&it, 20);
-    VP8IteratorSaveBoundary(&it);
-  } while (ok && VP8IteratorNext(&it));
+    RecordResiduals(&it, &info);
+    size += info.R;
+    distortion += info.D;
+    if (percent_delta && !VP8IteratorProgress(&it, percent_delta))
+      return 0;
+  } while (VP8IteratorNext(&it, it.yuv_out_) && --nb_mbs > 0);
+  size += FinalizeSkipProba(enc);
+  size += FinalizeTokenProbas(enc);
+  size += enc->segment_hdr_.size_;
+  size = ((size + 1024) >> 11) + kHeaderSizeEstimate;
 
-  return PostLoopFinalize(&it, ok);
+  if (PSNR) {
+    *PSNR = (float)(10.* log10(255. * 255. * pixel_count / distortion));
+  }
+  return (int)size;
 }
 
-//------------------------------------------------------------------------------
-// Single pass using Token Buffer.
-
-#if !defined(DISABLE_TOKEN_BUFFER)
+// successive refinement increments.
+static const int dqs[] = { 20, 15, 10, 8, 6, 4, 2, 1, 0 };
 
-#define MIN_COUNT 96  // minimum number of macroblocks before updating stats
+int VP8StatLoop(VP8Encoder* const enc) {
+  const int do_search =
+    (enc->config_->target_size > 0 || enc->config_->target_PSNR > 0);
+  const int fast_probe = (enc->method_ < 2 && !do_search);
+  float q = enc->config_->quality;
+  const int max_passes = enc->config_->pass;
+  const int task_percent = 20;
+  const int percent_per_pass = (task_percent + max_passes / 2) / max_passes;
+  const int final_percent = enc->percent_ + task_percent;
+  int pass;
+  int nb_mbs;
 
-int VP8EncTokenLoop(VP8Encoder* const enc) {
-  // Roughly refresh the proba eight times per pass
-  int max_count = (enc->mb_w_ * enc->mb_h_) >> 3;
-  int num_pass_left = enc->config_->pass;
-  const int do_search = enc->do_search_;
-  VP8EncIterator it;
-  VP8Proba* const proba = &enc->proba_;
-  const VP8RDLevel rd_opt = enc->rd_opt_level_;
-  const uint64_t pixel_count = enc->mb_w_ * enc->mb_h_ * 384;
-  PassStats stats;
-  int ok;
-
-  InitPassStats(enc, &stats);
-  ok = PreLoopInitialize(enc);
-  if (!ok) return 0;
-
-  if (max_count < MIN_COUNT) max_count = MIN_COUNT;
-
-  assert(enc->num_parts_ == 1);
-  assert(enc->use_tokens_);
-  assert(proba->use_skip_proba_ == 0);
-  assert(rd_opt >= RD_OPT_BASIC);   // otherwise, token-buffer won't be useful
-  assert(num_pass_left > 0);
-
-  while (ok && num_pass_left-- > 0) {
-    const int is_last_pass = (fabs(stats.dq) <= DQ_LIMIT) ||
-                             (num_pass_left == 0) ||
-                             (enc->max_i4_header_bits_ == 0);
-    uint64_t size_p0 = 0;
-    uint64_t distortion = 0;
-    int cnt = max_count;
-    VP8IteratorInit(enc, &it);
-    SetLoopParams(enc, stats.q);
-    if (is_last_pass) {
-      ResetTokenStats(enc);
-      VP8InitFilter(&it);  // don't collect stats until last pass (too costly)
-    }
-    VP8TBufferClear(&enc->tokens_);
-    do {
-      VP8ModeScore info;
-      VP8IteratorImport(&it, NULL);
-      if (--cnt < 0) {
-        FinalizeTokenProbas(proba);
-        VP8CalculateLevelCosts(proba);  // refresh cost tables for rd-opt
-        cnt = max_count;
-      }
-      VP8Decimate(&it, &info, rd_opt);
-      RecordTokens(&it, &info, &enc->tokens_);
-      size_p0 += info.H;
-      distortion += info.D;
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-      if (enc->use_layer_) {
-        VP8EncCodeLayerBlock(&it);
-      }
-#endif
-      if (is_last_pass) {
-        StoreSideInfo(&it);
-        VP8StoreFilterStats(&it);
-        VP8IteratorExport(&it);
-        ok = VP8IteratorProgress(&it, 20);
+  // Fast mode: quick analysis pass over few mbs. Better than nothing.
+  nb_mbs = enc->mb_w_ * enc->mb_h_;
+  if (fast_probe && nb_mbs > 100) nb_mbs = 100;
+
+  // No target size or PSNR: just do several passes without changing 'q'.
+  if (!do_search) {
+    for (pass = 0; pass < max_passes; ++pass) {
+      const int rd_opt = (enc->method_ > 2);
+      if (!OneStatPass(enc, q, rd_opt, nb_mbs, NULL, percent_per_pass)) {
+        return 0;
       }
-      VP8IteratorSaveBoundary(&it);
-    } while (ok && VP8IteratorNext(&it));
-    if (!ok) break;
-
-    size_p0 += enc->segment_hdr_.size_;
-    if (stats.do_size_search) {
-      uint64_t size = FinalizeTokenProbas(&enc->proba_);
-      size += VP8EstimateTokenSize(&enc->tokens_,
-                                   (const uint8_t*)proba->coeffs_);
-      size = (size + size_p0 + 1024) >> 11;  // -> size in bytes
-      size += HEADER_SIZE_ESTIMATE;
-      stats.value = (double)size;
-    } else {  // compute and store PSNR
-      stats.value = GetPSNR(distortion, pixel_count);
     }
-
-#if (DEBUG_SEARCH > 0)
-    printf("#%2d metric:%.1lf -> %.1lf   last_q=%.2lf q=%.2lf dq=%.2lf\n",
-           num_pass_left, stats.last_value, stats.value,
-           stats.last_q, stats.q, stats.dq);
+  } else {
+    // dichotomic search on 'q' for a size (or PSNR) close to the target
+    for (pass = 0; pass < max_passes && (dqs[pass] > 0); ++pass) {
+      const int rd_opt = 1;
+      float PSNR;
+      int criterion;
+      const int size = OneStatPass(enc, q, rd_opt, nb_mbs, &PSNR,
+                                   percent_per_pass);
+#if DEBUG_SEARCH
+      printf("#%d size=%d PSNR=%.2f q=%.2f\n", pass, size, PSNR, q);
 #endif
-    if (size_p0 > PARTITION0_SIZE_LIMIT) {
-      ++num_pass_left;
-      enc->max_i4_header_bits_ >>= 1;  // strengthen header bit limitation...
-      continue;                        // ...and start over
-    }
-    if (is_last_pass) {
-      break;   // done
-    }
-    if (do_search) {
-      ComputeNextQ(&stats);  // Adjust q
-    }
-  }
-  if (ok) {
-    if (!stats.do_size_search) {
-      FinalizeTokenProbas(&enc->proba_);
+      if (!size) return 0;
+      if (enc->config_->target_PSNR > 0) {
+        criterion = (PSNR < enc->config_->target_PSNR);
+      } else {
+        criterion = (size < enc->config_->target_size);
+      }
+      // dichotomize
+      if (criterion) {
+        q += dqs[pass];
+      } else {
+        q -= dqs[pass];
+      }
     }
-    ok = VP8EmitTokens(&enc->tokens_, enc->parts_ + 0,
-                       (const uint8_t*)proba->coeffs_, 1);
   }
-  ok = ok && WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
-  return PostLoopFinalize(&it, ok);
-}
-
-#else
-
-int VP8EncTokenLoop(VP8Encoder* const enc) {
-  (void)enc;
-  return 0;   // we shouldn't be here.
+  return WebPReportProgress(enc->pic_, final_percent, &enc->percent_);
 }
 
-#endif    // DISABLE_TOKEN_BUFFER
-
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/enc/histogram.c b/drivers/webp/enc/histogram.c
index abd253bd7c..ca838e064d 100644
--- a/drivers/webp/enc/histogram.c
+++ b/drivers/webp/enc/histogram.c
@@ -1,10 +1,8 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Author: Jyrki Alakuijala (jyrki@google.com)
@@ -57,9 +55,9 @@ VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
   int i;
   VP8LHistogramSet* set;
   VP8LHistogram* bulk;
-  const uint64_t total_size = sizeof(*set)
-                            + (uint64_t)size * sizeof(*set->histograms)
-                            + (uint64_t)size * sizeof(**set->histograms);
+  const uint64_t total_size = (uint64_t)sizeof(*set)
+                            + size * sizeof(*set->histograms)
+                            + size * sizeof(**set->histograms);
   uint8_t* memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory));
   if (memory == NULL) return NULL;
 
@@ -90,14 +88,18 @@ void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
     int literal_ix = 256 + NUM_LENGTH_CODES + PixOrCopyCacheIdx(v);
     ++histo->literal_[literal_ix];
   } else {
-    int code, extra_bits;
-    VP8LPrefixEncodeBits(PixOrCopyLength(v), &code, &extra_bits);
+    int code, extra_bits_count, extra_bits_value;
+    PrefixEncode(PixOrCopyLength(v),
+                 &code, &extra_bits_count, &extra_bits_value);
     ++histo->literal_[256 + code];
-    VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code, &extra_bits);
+    PrefixEncode(PixOrCopyDistance(v),
+                 &code, &extra_bits_count, &extra_bits_value);
     ++histo->distance_[code];
   }
 }
 
+
+
 static double BitsEntropy(const int* const array, int n) {
   double retval = 0.;
   int sum = 0;
@@ -147,6 +149,25 @@ static double BitsEntropy(const int* const array, int n) {
   }
 }
 
+double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p) {
+  double retval = BitsEntropy(&p->literal_[0], VP8LHistogramNumCodes(p))
+                + BitsEntropy(&p->red_[0], 256)
+                + BitsEntropy(&p->blue_[0], 256)
+                + BitsEntropy(&p->alpha_[0], 256)
+                + BitsEntropy(&p->distance_[0], NUM_DISTANCE_CODES);
+  // Extra bits: every use of prefix code (i + 2) is charged (i >> 1) bits.
+  int i;
+  for (i = 2; i < NUM_LENGTH_CODES - 2; ++i) {  // length codes
+    retval +=
+        (i >> 1) * p->literal_[256 + i + 2];
+  }
+  for (i = 2; i < NUM_DISTANCE_CODES - 2; ++i) {  // distance codes
+    retval += (i >> 1) * p->distance_[i + 2];
+  }
+  return retval;  // entropy of the five histograms + extra-bits cost
+}
+
+
 // Returns the cost encode the rle-encoded entropy code.
 // The constants in this function are experimental.
 static double HuffmanCost(const int* const population, int length) {
@@ -186,150 +207,19 @@ static double HuffmanCost(const int* const population, int length) {
   return retval;
 }
 
-static double PopulationCost(const int* const population, int length) {
-  return BitsEntropy(population, length) + HuffmanCost(population, length);
-}
-
-static double ExtraCost(const int* const population, int length) {
-  int i;
-  double cost = 0.;
-  for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2];
-  return cost;
+// Bits needed to store the five entropy codes (HuffmanCost() of each array).
+static double HistogramEstimateBitsHeader(const VP8LHistogram* const p) {
+  return HuffmanCost(&p->alpha_[0], 256) +
+         HuffmanCost(&p->red_[0], 256) +
+         HuffmanCost(&p->literal_[0], VP8LHistogramNumCodes(p)) +
+         HuffmanCost(&p->blue_[0], 256) +
+         HuffmanCost(&p->distance_[0], NUM_DISTANCE_CODES);
 }
 
-// Estimates the Entropy + Huffman + other block overhead size cost.
 double VP8LHistogramEstimateBits(const VP8LHistogram* const p) {
-  return PopulationCost(p->literal_, VP8LHistogramNumCodes(p))
-       + PopulationCost(p->red_, 256)
-       + PopulationCost(p->blue_, 256)
-       + PopulationCost(p->alpha_, 256)
-       + PopulationCost(p->distance_, NUM_DISTANCE_CODES)
-       + ExtraCost(p->literal_ + 256, NUM_LENGTH_CODES)
-       + ExtraCost(p->distance_, NUM_DISTANCE_CODES);
-}
-
-double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p) {
-  return BitsEntropy(p->literal_, VP8LHistogramNumCodes(p))
-       + BitsEntropy(p->red_, 256)
-       + BitsEntropy(p->blue_, 256)
-       + BitsEntropy(p->alpha_, 256)
-       + BitsEntropy(p->distance_, NUM_DISTANCE_CODES)
-       + ExtraCost(p->literal_ + 256, NUM_LENGTH_CODES)
-       + ExtraCost(p->distance_, NUM_DISTANCE_CODES);
-}
-
-// -----------------------------------------------------------------------------
-// Various histogram combine/cost-eval functions
-
-// Adds 'in' histogram to 'out'
-static void HistogramAdd(const VP8LHistogram* const in,
-                         VP8LHistogram* const out) {
-  int i;
-  for (i = 0; i < PIX_OR_COPY_CODES_MAX; ++i) {
-    out->literal_[i] += in->literal_[i];
-  }
-  for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
-    out->distance_[i] += in->distance_[i];
-  }
-  for (i = 0; i < 256; ++i) {
-    out->red_[i] += in->red_[i];
-    out->blue_[i] += in->blue_[i];
-    out->alpha_[i] += in->alpha_[i];
-  }
-}
-
-// Performs out = a + b, computing the cost C(a+b) - C(a) - C(b) while comparing
-// to the threshold value 'cost_threshold'. The score returned is
-//  Score = C(a+b) - C(a) - C(b), where C(a) + C(b) is known and fixed.
-// Since the previous score passed is 'cost_threshold', we only need to compare
-// the partial cost against 'cost_threshold + C(a) + C(b)' to possibly bail-out
-// early.
-static double HistogramAddEval(const VP8LHistogram* const a,
-                               const VP8LHistogram* const b,
-                               VP8LHistogram* const out,
-                               double cost_threshold) {
-  double cost = 0;
-  const double sum_cost = a->bit_cost_ + b->bit_cost_;
-  int i;
-
-  cost_threshold += sum_cost;
-
-  // palette_code_bits_ is part of the cost evaluation for literal_.
-  // TODO(skal): remove/simplify this palette_code_bits_?
-  out->palette_code_bits_ =
-      (a->palette_code_bits_ > b->palette_code_bits_) ? a->palette_code_bits_ :
-                                                        b->palette_code_bits_;
-  for (i = 0; i < PIX_OR_COPY_CODES_MAX; ++i) {
-    out->literal_[i] = a->literal_[i] + b->literal_[i];
-  }
-  cost += PopulationCost(out->literal_, VP8LHistogramNumCodes(out));
-  cost += ExtraCost(out->literal_ + 256, NUM_LENGTH_CODES);
-  if (cost > cost_threshold) return cost;
-
-  for (i = 0; i < 256; ++i) out->red_[i] = a->red_[i] + b->red_[i];
-  cost += PopulationCost(out->red_, 256);
-  if (cost > cost_threshold) return cost;
-
-  for (i = 0; i < 256; ++i) out->blue_[i] = a->blue_[i] + b->blue_[i];
-  cost += PopulationCost(out->blue_, 256);
-  if (cost > cost_threshold) return cost;
-
-  for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
-    out->distance_[i] = a->distance_[i] + b->distance_[i];
-  }
-  cost += PopulationCost(out->distance_, NUM_DISTANCE_CODES);
-  cost += ExtraCost(out->distance_, NUM_DISTANCE_CODES);
-  if (cost > cost_threshold) return cost;
-
-  for (i = 0; i < 256; ++i) out->alpha_[i] = a->alpha_[i] + b->alpha_[i];
-  cost += PopulationCost(out->alpha_, 256);
-
-  out->bit_cost_ = cost;
-  return cost - sum_cost;
+  return HistogramEstimateBitsHeader(p) + VP8LHistogramEstimateBitsBulk(p);
 }
 
-// Same as HistogramAddEval(), except that the resulting histogram
-// is not stored. Only the cost C(a+b) - C(a) is evaluated. We omit
-// the term C(b) which is constant over all the evaluations.
-static double HistogramAddThresh(const VP8LHistogram* const a,
-                                 const VP8LHistogram* const b,
-                                 double cost_threshold) {
-  int tmp[PIX_OR_COPY_CODES_MAX];  // <= max storage we'll need
-  int i;
-  double cost = -a->bit_cost_;
-
-  for (i = 0; i < PIX_OR_COPY_CODES_MAX; ++i) {
-    tmp[i] = a->literal_[i] + b->literal_[i];
-  }
-  // note that the tests are ordered so that the usually largest
-  // cost shares come first.
-  cost += PopulationCost(tmp, VP8LHistogramNumCodes(a));
-  cost += ExtraCost(tmp + 256, NUM_LENGTH_CODES);
-  if (cost > cost_threshold) return cost;
-
-  for (i = 0; i < 256; ++i) tmp[i] = a->red_[i] + b->red_[i];
-  cost += PopulationCost(tmp, 256);
-  if (cost > cost_threshold) return cost;
-
-  for (i = 0; i < 256; ++i) tmp[i] = a->blue_[i] + b->blue_[i];
-  cost += PopulationCost(tmp, 256);
-  if (cost > cost_threshold) return cost;
-
-  for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
-    tmp[i] = a->distance_[i] + b->distance_[i];
-  }
-  cost += PopulationCost(tmp, NUM_DISTANCE_CODES);
-  cost += ExtraCost(tmp, NUM_DISTANCE_CODES);
-  if (cost > cost_threshold) return cost;
-
-  for (i = 0; i < 256; ++i) tmp[i] = a->alpha_[i] + b->alpha_[i];
-  cost += PopulationCost(tmp, 256);
-
-  return cost;
-}
-
-// -----------------------------------------------------------------------------
-
 static void HistogramBuildImage(int xsize, int histo_bits,
                                 const VP8LBackwardRefs* const backward_refs,
                                 VP8LHistogramSet* const image) {
@@ -359,15 +249,14 @@ static uint32_t MyRand(uint32_t *seed) {
 }
 
 static int HistogramCombine(const VP8LHistogramSet* const in,
-                            VP8LHistogramSet* const out, int iter_mult,
-                            int num_pairs, int num_tries_no_success) {
+                            VP8LHistogramSet* const out, int num_pairs) {
   int ok = 0;
   int i, iter;
   uint32_t seed = 0;
   int tries_with_no_success = 0;
-  int out_size = in->size;
-  const int outer_iters = in->size * iter_mult;
   const int min_cluster_size = 2;
+  int out_size = in->size;
+  const int outer_iters = in->size * 3;
   VP8LHistogram* const histos = (VP8LHistogram*)malloc(2 * sizeof(*histos));
   VP8LHistogram* cur_combo = histos + 0;    // trial merged histogram
   VP8LHistogram* best_combo = histos + 1;   // best merged histogram so far
@@ -382,26 +271,29 @@ static int HistogramCombine(const VP8LHistogramSet* const in,
 
   // Collapse similar histograms in 'out'.
   for (iter = 0; iter < outer_iters && out_size >= min_cluster_size; ++iter) {
+    // We pick the best pair to be combined out of 'num_pairs' random pairs.
     double best_cost_diff = 0.;
-    int best_idx1 = -1, best_idx2 = 1;
+    int best_idx1 = 0, best_idx2 = 1;
     int j;
-    const int num_tries = (num_pairs < out_size) ? num_pairs : out_size;
     seed += iter;
-    for (j = 0; j < num_tries; ++j) {
+    for (j = 0; j < num_pairs; ++j) {
       double curr_cost_diff;
       // Choose two histograms at random and try to combine them.
       const uint32_t idx1 = MyRand(&seed) % out_size;
-      const uint32_t tmp = (j & 7) + 1;
+      const uint32_t tmp = ((j & 7) + 1) % (out_size - 1);
       const uint32_t diff = (tmp < 3) ? tmp : MyRand(&seed) % (out_size - 1);
       const uint32_t idx2 = (idx1 + diff + 1) % out_size;
       if (idx1 == idx2) {
         continue;
       }
+      *cur_combo = *out->histograms[idx1];
+      VP8LHistogramAdd(cur_combo, out->histograms[idx2]);
+      cur_combo->bit_cost_ = VP8LHistogramEstimateBits(cur_combo);
       // Calculate cost reduction on combining.
-      curr_cost_diff = HistogramAddEval(out->histograms[idx1],
-                                        out->histograms[idx2],
-                                        cur_combo, best_cost_diff);
-      if (curr_cost_diff < best_cost_diff) {    // found a better pair?
+      curr_cost_diff = cur_combo->bit_cost_
+                     - out->histograms[idx1]->bit_cost_
+                     - out->histograms[idx2]->bit_cost_;
+      if (best_cost_diff > curr_cost_diff) {    // found a better pair?
         {     // swap cur/best combo histograms
           VP8LHistogram* const tmp_histo = cur_combo;
           cur_combo = best_combo;
@@ -413,7 +305,7 @@ static int HistogramCombine(const VP8LHistogramSet* const in,
       }
     }
 
-    if (best_idx1 >= 0) {
+    if (best_cost_diff < 0.0) {
       *out->histograms[best_idx1] = *best_combo;
       // swap best_idx2 slot with last one (which is now unused)
       --out_size;
@@ -423,7 +315,7 @@ static int HistogramCombine(const VP8LHistogramSet* const in,
       }
       tries_with_no_success = 0;
     }
-    if (++tries_with_no_success >= num_tries_no_success) {
+    if (++tries_with_no_success >= 50) {
       break;
     }
   }
@@ -438,11 +330,20 @@ static int HistogramCombine(const VP8LHistogramSet* const in,
 // -----------------------------------------------------------------------------
 // Histogram refinement
 
-// What is the bit cost of moving square_histogram from cur_symbol to candidate.
+// What is the bit cost of moving square_histogram from
+// cur_symbol to candidate_symbol.
+// TODO(skal): we don't really need to copy the histogram and Add(). Instead
+// we just need VP8LDualHistogramEstimateBits(A, B) estimation function.
 static double HistogramDistance(const VP8LHistogram* const square_histogram,
-                                const VP8LHistogram* const candidate,
-                                double cost_threshold) {
-  return HistogramAddThresh(candidate, square_histogram, cost_threshold);
+                                const VP8LHistogram* const candidate) {
+  const double previous_bit_cost = candidate->bit_cost_;
+  double new_bit_cost;
+  VP8LHistogram modified_histo;
+  modified_histo = *candidate;
+  VP8LHistogramAdd(&modified_histo, square_histogram);
+  new_bit_cost = VP8LHistogramEstimateBits(&modified_histo);
+
+  return new_bit_cost - previous_bit_cost;
 }
 
 // Find the best 'out' histogram for each of the 'in' histograms.
@@ -453,12 +354,11 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
   int i;
   for (i = 0; i < in->size; ++i) {
     int best_out = 0;
-    double best_bits =
-        HistogramDistance(in->histograms[i], out->histograms[0], 1.e38);
+    double best_bits = HistogramDistance(in->histograms[i], out->histograms[0]);
     int k;
     for (k = 1; k < out->size; ++k) {
       const double cur_bits =
-          HistogramDistance(in->histograms[i], out->histograms[k], best_bits);
+          HistogramDistance(in->histograms[i], out->histograms[k]);
       if (cur_bits < best_bits) {
         best_bits = cur_bits;
         best_out = k;
@@ -472,7 +372,7 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
     HistogramClear(out->histograms[i]);
   }
   for (i = 0; i < in->size; ++i) {
-    HistogramAdd(in->histograms[i], out->histograms[symbols[i]]);
+    VP8LHistogramAdd(out->histograms[symbols[i]], in->histograms[i]);
   }
 }
 
@@ -484,13 +384,8 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
   int ok = 0;
   const int histo_xsize = histo_bits ? VP8LSubSampleSize(xsize, histo_bits) : 1;
   const int histo_ysize = histo_bits ? VP8LSubSampleSize(ysize, histo_bits) : 1;
+  const int num_histo_pairs = 10 + quality / 2;  // For HistogramCombine().
   const int histo_image_raw_size = histo_xsize * histo_ysize;
-
-  // Heuristic params for HistogramCombine().
-  const int num_tries_no_success = 8 + (quality >> 1);
-  const int iter_mult = (quality < 27) ? 1 : 1 + ((quality - 27) >> 4);
-  const int num_pairs = (quality < 25) ? 10 : (5 * quality) >> 3;
-
   VP8LHistogramSet* const image_out =
       VP8LAllocateHistogramSet(histo_image_raw_size, cache_bits);
   if (image_out == NULL) return 0;
@@ -498,8 +393,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
   // Build histogram image.
   HistogramBuildImage(xsize, histo_bits, refs, image_out);
   // Collapse similar histograms.
-  if (!HistogramCombine(image_out, image_in, iter_mult, num_pairs,
-                        num_tries_no_success)) {
+  if (!HistogramCombine(image_out, image_in, num_histo_pairs)) {
     goto Error;
   }
   // Find the optimal map from original histograms to the final ones.
diff --git a/drivers/webp/enc/histogram.h b/drivers/webp/enc/histogram.h
index 4d346a857b..ec573c5c85 100644
--- a/drivers/webp/enc/histogram.h
+++ b/drivers/webp/enc/histogram.h
@@ -1,10 +1,8 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Author: Jyrki Alakuijala (jyrki@google.com)
@@ -24,7 +22,7 @@
 #include "../webp/format_constants.h"
 #include "../webp/types.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
@@ -82,6 +80,22 @@ double VP8LHistogramEstimateBits(const VP8LHistogram* const p);
 // represent the entropy code itself.
 double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p);
 
+static WEBP_INLINE void VP8LHistogramAdd(VP8LHistogram* const p,
+                                         const VP8LHistogram* const a) {
+  int i;  // Adds every count of 'a' into 'p' (p += a); 'a' is only read.
+  for (i = 0; i < PIX_OR_COPY_CODES_MAX; ++i) {
+    p->literal_[i] += a->literal_[i];
+  }
+  for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
+    p->distance_[i] += a->distance_[i];
+  }
+  for (i = 0; i < 256; ++i) {  // red/blue/alpha walked together, 256 entries
+    p->red_[i] += a->red_[i];
+    p->blue_[i] += a->blue_[i];
+    p->alpha_[i] += a->alpha_[i];
+  }
+}  // Other fields of 'p' (e.g. bit_cost_) are not updated here.
+
 static WEBP_INLINE int VP8LHistogramNumCodes(const VP8LHistogram* const p) {
   return 256 + NUM_LENGTH_CODES +
       ((p->palette_code_bits_ > 0) ? (1 << p->palette_code_bits_) : 0);
@@ -94,7 +108,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
                              VP8LHistogramSet* const image_in,
                              uint16_t* const histogram_symbols);
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }
 #endif
 
diff --git a/drivers/webp/enc/iterator.c b/drivers/webp/enc/iterator.c
index e42ad001ac..86e473bcf0 100644
--- a/drivers/webp/enc/iterator.c
+++ b/drivers/webp/enc/iterator.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // VP8Iterator: block iterator
@@ -15,16 +13,21 @@
 
 #include "./vp8enci.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // VP8Iterator
 //------------------------------------------------------------------------------
 
 static void InitLeft(VP8EncIterator* const it) {
-  it->y_left_[-1] = it->u_left_[-1] = it->v_left_[-1] =
+  const VP8Encoder* const enc = it->enc_;
+  enc->y_left_[-1] = enc->u_left_[-1] = enc->v_left_[-1] =
       (it->y_ > 0) ? 129 : 127;
-  memset(it->y_left_, 129, 16);
-  memset(it->u_left_, 129, 8);
-  memset(it->v_left_, 129, 8);
+  memset(enc->y_left_, 129, 16);
+  memset(enc->u_left_, 129, 8);
+  memset(enc->v_left_, 129, 8);
   it->left_nz_[8] = 0;
 }
 
@@ -35,60 +38,43 @@ static void InitTop(VP8EncIterator* const it) {
   memset(enc->nz_, 0, enc->mb_w_ * sizeof(*enc->nz_));
 }
 
-void VP8IteratorSetRow(VP8EncIterator* const it, int y) {
+void VP8IteratorReset(VP8EncIterator* const it) {
   VP8Encoder* const enc = it->enc_;
   it->x_ = 0;
-  it->y_ = y;
-  it->bw_ = &enc->parts_[y & (enc->num_parts_ - 1)];
-  it->preds_ = enc->preds_ + y * 4 * enc->preds_w_;
+  it->y_ = 0;
+  it->y_offset_ = 0;
+  it->uv_offset_ = 0;
+  it->mb_ = enc->mb_info_;
+  it->preds_ = enc->preds_;
   it->nz_ = enc->nz_;
-  it->mb_ = enc->mb_info_ + y * enc->mb_w_;
-  it->y_top_ = enc->y_top_;
-  it->uv_top_ = enc->uv_top_;
-  InitLeft(it);
-}
-
-void VP8IteratorReset(VP8EncIterator* const it) {
-  VP8Encoder* const enc = it->enc_;
-  VP8IteratorSetRow(it, 0);
-  VP8IteratorSetCountDown(it, enc->mb_w_ * enc->mb_h_);  // default
+  it->bw_ = &enc->parts_[0];
+  it->done_ = enc->mb_w_* enc->mb_h_;
   InitTop(it);
   InitLeft(it);
   memset(it->bit_count_, 0, sizeof(it->bit_count_));
   it->do_trellis_ = 0;
 }
 
-void VP8IteratorSetCountDown(VP8EncIterator* const it, int count_down) {
-  it->count_down_ = it->count_down0_ = count_down;
-}
-
-int VP8IteratorIsDone(const VP8EncIterator* const it) {
-  return (it->count_down_ <= 0);
-}
-
 void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) {
   it->enc_ = enc;
   it->y_stride_  = enc->pic_->y_stride;
   it->uv_stride_ = enc->pic_->uv_stride;
-  it->yuv_in_   = (uint8_t*)DO_ALIGN(it->yuv_mem_);
-  it->yuv_out_  = it->yuv_in_ + YUV_SIZE;
-  it->yuv_out2_ = it->yuv_out_ + YUV_SIZE;
-  it->yuv_p_    = it->yuv_out2_ + YUV_SIZE;
+  // TODO(later): for multithreading, these should be owned by 'it'.
+  it->yuv_in_   = enc->yuv_in_;
+  it->yuv_out_  = enc->yuv_out_;
+  it->yuv_out2_ = enc->yuv_out2_;
+  it->yuv_p_    = enc->yuv_p_;
   it->lf_stats_ = enc->lf_stats_;
   it->percent0_ = enc->percent_;
-  it->y_left_ = (uint8_t*)DO_ALIGN(it->yuv_left_mem_ + 1);
-  it->u_left_ = it->y_left_ + 16 + 16;
-  it->v_left_ = it->u_left_ + 16;
   VP8IteratorReset(it);
 }
 
 int VP8IteratorProgress(const VP8EncIterator* const it, int delta) {
   VP8Encoder* const enc = it->enc_;
-  if (delta && enc->pic_->progress_hook != NULL) {
-    const int done = it->count_down0_ - it->count_down_;
-    const int percent = (it->count_down0_ <= 0)
+  if (delta && enc->pic_->progress_hook) {
+    const int percent = (enc->mb_h_ <= 1)
                       ? it->percent0_
-                      : it->percent0_ + delta * done / it->count_down0_;
+                      : it->percent0_ + delta * it->y_ / (enc->mb_h_ - 1);
     return WebPReportProgress(enc->pic_, percent, &enc->percent_);
   }
   return 1;
@@ -98,8 +84,6 @@ int VP8IteratorProgress(const VP8EncIterator* const it, int delta) {
 // Import the source samples into the cache. Takes care of replicating
 // boundary pixels if necessary.
 
-static WEBP_INLINE int MinSize(int a, int b) { return (a < b) ? a : b; }
-
 static void ImportBlock(const uint8_t* src, int src_stride,
                         uint8_t* dst, int w, int h, int size) {
   int i;
@@ -117,55 +101,30 @@ static void ImportBlock(const uint8_t* src, int src_stride,
   }
 }
 
-static void ImportLine(const uint8_t* src, int src_stride,
-                       uint8_t* dst, int len, int total_len) {
-  int i;
-  for (i = 0; i < len; ++i, src += src_stride) dst[i] = *src;
-  for (; i < total_len; ++i) dst[i] = dst[len - 1];
-}
-
-void VP8IteratorImport(VP8EncIterator* const it, uint8_t* tmp_32) {
+void VP8IteratorImport(const VP8EncIterator* const it) {
   const VP8Encoder* const enc = it->enc_;
   const int x = it->x_, y = it->y_;
   const WebPPicture* const pic = enc->pic_;
-  const uint8_t* const ysrc = pic->y + (y * pic->y_stride  + x) * 16;
+  const uint8_t* const ysrc = pic->y + (y * pic->y_stride + x) * 16;
   const uint8_t* const usrc = pic->u + (y * pic->uv_stride + x) * 8;
   const uint8_t* const vsrc = pic->v + (y * pic->uv_stride + x) * 8;
-  const int w = MinSize(pic->width - x * 16, 16);
-  const int h = MinSize(pic->height - y * 16, 16);
-  const int uv_w = (w + 1) >> 1;
-  const int uv_h = (h + 1) >> 1;
-
-  ImportBlock(ysrc, pic->y_stride,  it->yuv_in_ + Y_OFF, w, h, 16);
-  ImportBlock(usrc, pic->uv_stride, it->yuv_in_ + U_OFF, uv_w, uv_h, 8);
-  ImportBlock(vsrc, pic->uv_stride, it->yuv_in_ + V_OFF, uv_w, uv_h, 8);
-
-  if (tmp_32 == NULL) return;
-
-  // Import source (uncompressed) samples into boundary.
-  if (x == 0) {
-    InitLeft(it);
-  } else {
-    if (y == 0) {
-      it->y_left_[-1] = it->u_left_[-1] = it->v_left_[-1] = 127;
-    } else {
-      it->y_left_[-1] = ysrc[- 1 - pic->y_stride];
-      it->u_left_[-1] = usrc[- 1 - pic->uv_stride];
-      it->v_left_[-1] = vsrc[- 1 - pic->uv_stride];
-    }
-    ImportLine(ysrc - 1, pic->y_stride,  it->y_left_, h,   16);
-    ImportLine(usrc - 1, pic->uv_stride, it->u_left_, uv_h, 8);
-    ImportLine(vsrc - 1, pic->uv_stride, it->v_left_, uv_h, 8);
-  }
-
-  it->y_top_  = tmp_32 + 0;
-  it->uv_top_ = tmp_32 + 16;
-  if (y == 0) {
-    memset(tmp_32, 127, 32 * sizeof(*tmp_32));
-  } else {
-    ImportLine(ysrc - pic->y_stride,  1, tmp_32,          w,   16);
-    ImportLine(usrc - pic->uv_stride, 1, tmp_32 + 16,     uv_w, 8);
-    ImportLine(vsrc - pic->uv_stride, 1, tmp_32 + 16 + 8, uv_w, 8);
+  uint8_t* const ydst = it->yuv_in_ + Y_OFF;
+  uint8_t* const udst = it->yuv_in_ + U_OFF;
+  uint8_t* const vdst = it->yuv_in_ + V_OFF;
+  int w = (pic->width - x * 16);
+  int h = (pic->height - y * 16);
+
+  if (w > 16) w = 16;
+  if (h > 16) h = 16;
+
+  // Luma plane
+  ImportBlock(ysrc, pic->y_stride, ydst, w, h, 16);
+
+  {   // U/V planes
+    const int uv_w = (w + 1) >> 1;
+    const int uv_h = (h + 1) >> 1;
+    ImportBlock(usrc, pic->uv_stride, udst, uv_w, uv_h, 8);
+    ImportBlock(vsrc, pic->uv_stride, vdst, uv_w, uv_h, 8);
   }
 }
 
@@ -281,44 +240,48 @@ void VP8IteratorBytesToNz(VP8EncIterator* const it) {
 #undef BIT
 
 //------------------------------------------------------------------------------
-// Advance to the next position, doing the bookkeeping.
+// Advance to the next position, doing the bookkeeping.
 
-void VP8IteratorSaveBoundary(VP8EncIterator* const it) {
+int VP8IteratorNext(VP8EncIterator* const it,
+                    const uint8_t* const block_to_save) {
   VP8Encoder* const enc = it->enc_;
-  const int x = it->x_, y = it->y_;
-  const uint8_t* const ysrc = it->yuv_out_ + Y_OFF;
-  const uint8_t* const uvsrc = it->yuv_out_ + U_OFF;
-  if (x < enc->mb_w_ - 1) {   // left
-    int i;
-    for (i = 0; i < 16; ++i) {
-      it->y_left_[i] = ysrc[15 + i * BPS];
+  if (block_to_save) {
+    const int x = it->x_, y = it->y_;
+    const uint8_t* const ysrc = block_to_save + Y_OFF;
+    const uint8_t* const usrc = block_to_save + U_OFF;
+    if (x < enc->mb_w_ - 1) {   // left
+      int i;
+      for (i = 0; i < 16; ++i) {
+        enc->y_left_[i] = ysrc[15 + i * BPS];
+      }
+      for (i = 0; i < 8; ++i) {
+        enc->u_left_[i] = usrc[7 + i * BPS];
+        enc->v_left_[i] = usrc[15 + i * BPS];
+      }
+      // top-left (before 'top'!)
+      enc->y_left_[-1] = enc->y_top_[x * 16 + 15];
+      enc->u_left_[-1] = enc->uv_top_[x * 16 + 0 + 7];
+      enc->v_left_[-1] = enc->uv_top_[x * 16 + 8 + 7];
     }
-    for (i = 0; i < 8; ++i) {
-      it->u_left_[i] = uvsrc[7 + i * BPS];
-      it->v_left_[i] = uvsrc[15 + i * BPS];
+    if (y < enc->mb_h_ - 1) {  // top
+      memcpy(enc->y_top_ + x * 16, ysrc + 15 * BPS, 16);
+      memcpy(enc->uv_top_ + x * 16, usrc + 7 * BPS, 8 + 8);
     }
-    // top-left (before 'top'!)
-    it->y_left_[-1] = it->y_top_[15];
-    it->u_left_[-1] = it->uv_top_[0 + 7];
-    it->v_left_[-1] = it->uv_top_[8 + 7];
   }
-  if (y < enc->mb_h_ - 1) {  // top
-    memcpy(it->y_top_, ysrc + 15 * BPS, 16);
-    memcpy(it->uv_top_, uvsrc + 7 * BPS, 8 + 8);
-  }
-}
 
-int VP8IteratorNext(VP8EncIterator* const it) {
+  it->mb_++;
   it->preds_ += 4;
-  it->mb_ += 1;
-  it->nz_ += 1;
-  it->y_top_ += 16;
-  it->uv_top_ += 16;
-  it->x_ += 1;
-  if (it->x_ == it->enc_->mb_w_) {
-    VP8IteratorSetRow(it, ++it->y_);
+  it->nz_++;
+  it->x_++;
+  if (it->x_ == enc->mb_w_) {
+    it->x_ = 0;
+    it->y_++;
+    it->bw_ = &enc->parts_[it->y_ & (enc->num_parts_ - 1)];
+    it->preds_ = enc->preds_ + it->y_ * 4 * enc->preds_w_;
+    it->nz_ = enc->nz_;
+    InitLeft(it);
   }
-  return (0 < --it->count_down_);
+  return (0 < --it->done_);
 }
 
 //------------------------------------------------------------------------------
@@ -405,15 +368,15 @@ void VP8IteratorStartI4(VP8EncIterator* const it) {
 
   // Import the boundary samples
   for (i = 0; i < 17; ++i) {    // left
-    it->i4_boundary_[i] = it->y_left_[15 - i];
+    it->i4_boundary_[i] = enc->y_left_[15 - i];
   }
   for (i = 0; i < 16; ++i) {    // top
-    it->i4_boundary_[17 + i] = it->y_top_[i];
+    it->i4_boundary_[17 + i] = enc->y_top_[it->x_ * 16 + i];
   }
   // top-right samples have a special case on the far right of the picture
   if (it->x_ < enc->mb_w_ - 1) {
     for (i = 16; i < 16 + 4; ++i) {
-      it->i4_boundary_[17 + i] = it->y_top_[i];
+      it->i4_boundary_[17 + i] = enc->y_top_[it->x_ * 16 + i];
     }
   } else {    // else, replicate the last valid pixel four times
     for (i = 16; i < 16 + 4; ++i) {
@@ -454,3 +417,6 @@ int VP8IteratorRotateI4(VP8EncIterator* const it,
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/enc/layer.c b/drivers/webp/enc/layer.c
index 2402362359..423127df63 100644
--- a/drivers/webp/enc/layer.c
+++ b/drivers/webp/enc/layer.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Enhancement layer (for YUV444/422)
@@ -15,6 +13,10 @@
 
 #include "./vp8enci.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 
 void VP8EncInitLayer(VP8Encoder* const enc) {
@@ -42,3 +44,6 @@ void VP8EncDeleteLayer(VP8Encoder* enc) {
   free(enc->layer_data_);
 }
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/enc/picture.c b/drivers/webp/enc/picture.c
index 011690d065..44eed06083 100644
--- a/drivers/webp/enc/picture.c
+++ b/drivers/webp/enc/picture.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // WebPPicture utils: colorspace conversion, crop, ...
@@ -16,15 +14,14 @@
 #include <math.h>
 
 #include "./vp8enci.h"
-#include "../utils/alpha_processing.h"
-#include "../utils/random.h"
 #include "../utils/rescaler.h"
 #include "../utils/utils.h"
 #include "../dsp/dsp.h"
 #include "../dsp/yuv.h"
 
-// Uncomment to disable gamma-compression during RGB->U/V averaging
-#define USE_GAMMA_COMPRESSION
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
 
 #define HALVE(x) (((x) + 1) >> 1)
 #define IS_YUV_CSP(csp, YUV_CSP) (((csp) & WEBP_CSP_UV_MASK) == (YUV_CSP))
@@ -35,10 +32,6 @@ static const union {
 } test_endian = { 0xff000000u };
 #define ALPHA_IS_LAST (test_endian.bytes[3] == 0xff)
 
-static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) {
-  return (0xff000000u | (r << 16) | (g << 8) | b);
-}
-
 //------------------------------------------------------------------------------
 // WebPPicture
 //------------------------------------------------------------------------------
@@ -123,7 +116,6 @@ int WebPPictureAlloc(WebPPicture* picture) {
         picture->v0 = mem;
         mem += uv0_size;
       }
-      (void)mem;  // makes the static analyzer happy
     } else {
       void* memory;
       const uint64_t argb_size = (uint64_t)width * height;
@@ -298,11 +290,8 @@ int WebPPictureView(const WebPPicture* src,
     dst->y = src->y + top * src->y_stride + left;
     dst->u = src->u + (top >> 1) * src->uv_stride + (left >> 1);
     dst->v = src->v + (top >> 1) * src->uv_stride + (left >> 1);
-    dst->y_stride = src->y_stride;
-    dst->uv_stride = src->uv_stride;
     if (src->a != NULL) {
       dst->a = src->a + top * src->a_stride + left;
-      dst->a_stride = src->a_stride;
     }
 #ifdef WEBP_EXPERIMENTAL_FEATURES
     if (src->u0 != NULL) {
@@ -310,12 +299,10 @@ int WebPPictureView(const WebPPicture* src,
           IS_YUV_CSP(dst->colorspace, WEBP_YUV422) ? (left >> 1) : left;
       dst->u0 = src->u0 + top * src->uv0_stride + left_pos;
       dst->v0 = src->v0 + top * src->uv0_stride + left_pos;
-      dst->uv0_stride = src->uv0_stride;
     }
 #endif
   } else {
     dst->argb = src->argb + top * src->argb_stride + left;
-    dst->argb_stride = src->argb_stride;
   }
   return 1;
 }
@@ -401,28 +388,6 @@ static void RescalePlane(const uint8_t* src,
   }
 }
 
-static void AlphaMultiplyARGB(WebPPicture* const pic, int inverse) {
-  uint32_t* ptr = pic->argb;
-  int y;
-  for (y = 0; y < pic->height; ++y) {
-    WebPMultARGBRow(ptr, pic->width, inverse);
-    ptr += pic->argb_stride;
-  }
-}
-
-static void AlphaMultiplyY(WebPPicture* const pic, int inverse) {
-  const uint8_t* ptr_a = pic->a;
-  if (ptr_a != NULL) {
-    uint8_t* ptr_y = pic->y;
-    int y;
-    for (y = 0; y < pic->height; ++y) {
-      WebPMultRow(ptr_y, ptr_a, pic->width, inverse);
-      ptr_y += pic->y_stride;
-      ptr_a += pic->a_stride;
-    }
-  }
-}
-
 int WebPPictureRescale(WebPPicture* pic, int width, int height) {
   WebPPicture tmp;
   int prev_width, prev_height;
@@ -453,19 +418,9 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
       WebPPictureFree(&tmp);
       return 0;
     }
-    // If present, we need to rescale alpha first (for AlphaMultiplyY).
-    if (pic->a != NULL) {
-      RescalePlane(pic->a, prev_width, prev_height, pic->a_stride,
-                   tmp.a, width, height, tmp.a_stride, work, 1);
-    }
 
-    // We take transparency into account on the luma plane only. That's not
-    // totally exact blending, but still is a good approximation.
-    AlphaMultiplyY(pic, 0);
     RescalePlane(pic->y, prev_width, prev_height, pic->y_stride,
                  tmp.y, width, height, tmp.y_stride, work, 1);
-    AlphaMultiplyY(&tmp, 1);
-
     RescalePlane(pic->u,
                  HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
                  tmp.u,
@@ -475,6 +430,10 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
                  tmp.v,
                  HALVE(width), HALVE(height), tmp.uv_stride, work, 1);
 
+    if (tmp.a != NULL) {
+      RescalePlane(pic->a, prev_width, prev_height, pic->a_stride,
+                   tmp.a, width, height, tmp.a_stride, work, 1);
+    }
 #ifdef WEBP_EXPERIMENTAL_FEATURES
     if (tmp.u0 != NULL) {
       const int s = IS_YUV_CSP(tmp.colorspace, WEBP_YUV422) ? 2 : 1;
@@ -492,16 +451,13 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
       WebPPictureFree(&tmp);
       return 0;
     }
-    // In order to correctly interpolate colors, we need to apply the alpha
-    // weighting first (black-matting), scale the RGB values, and remove
-    // the premultiplication afterward (while preserving the alpha channel).
-    AlphaMultiplyARGB(pic, 0);
+
     RescalePlane((const uint8_t*)pic->argb, prev_width, prev_height,
                  pic->argb_stride * 4,
                  (uint8_t*)tmp.argb, width, height,
                  tmp.argb_stride * 4,
                  work, 4);
-    AlphaMultiplyARGB(&tmp, 1);
+
   }
   WebPPictureFree(pic);
   free(work);
@@ -590,101 +546,20 @@ int WebPPictureHasTransparency(const WebPPicture* picture) {
 //------------------------------------------------------------------------------
 // RGB -> YUV conversion
 
-static int RGBToY(int r, int g, int b, VP8Random* const rg) {
-  return VP8RGBToY(r, g, b, VP8RandomBits(rg, YUV_FIX));
-}
-
-static int RGBToU(int r, int g, int b, VP8Random* const rg) {
-  return VP8RGBToU(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
-}
-
-static int RGBToV(int r, int g, int b, VP8Random* const rg) {
-  return VP8RGBToV(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
-}
-
-//------------------------------------------------------------------------------
-
-#if defined(USE_GAMMA_COMPRESSION)
-
-// gamma-compensates loss of resolution during chroma subsampling
-#define kGamma 0.80
-#define kGammaFix 12     // fixed-point precision for linear values
-#define kGammaScale ((1 << kGammaFix) - 1)
-#define kGammaTabFix 7   // fixed-point fractional bits precision
-#define kGammaTabScale (1 << kGammaTabFix)
-#define kGammaTabRounder (kGammaTabScale >> 1)
-#define kGammaTabSize (1 << (kGammaFix - kGammaTabFix))
-
-static int kLinearToGammaTab[kGammaTabSize + 1];
-static uint16_t kGammaToLinearTab[256];
-static int kGammaTablesOk = 0;
-
-static void InitGammaTables(void) {
-  if (!kGammaTablesOk) {
-    int v;
-    const double scale = 1. / kGammaScale;
-    for (v = 0; v <= 255; ++v) {
-      kGammaToLinearTab[v] =
-          (uint16_t)(pow(v / 255., kGamma) * kGammaScale + .5);
-    }
-    for (v = 0; v <= kGammaTabSize; ++v) {
-      const double x = scale * (v << kGammaTabFix);
-      kLinearToGammaTab[v] = (int)(pow(x, 1. / kGamma) * 255. + .5);
-    }
-    kGammaTablesOk = 1;
-  }
-}
-
-static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) {
-  return kGammaToLinearTab[v];
-}
-
-// Convert a linear value 'v' to YUV_FIX+2 fixed-point precision
-// U/V value, suitable for RGBToU/V calls.
-static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
-  const int v = base_value << shift;              // final uplifted value
-  const int tab_pos = v >> (kGammaTabFix + 2);    // integer part
-  const int x = v & ((kGammaTabScale << 2) - 1);  // fractional part
-  const int v0 = kLinearToGammaTab[tab_pos];
-  const int v1 = kLinearToGammaTab[tab_pos + 1];
-  const int y = v1 * x + v0 * ((kGammaTabScale << 2) - x);   // interpolate
-  return (y + kGammaTabRounder) >> kGammaTabFix;             // descale
-}
-
-#else
-
-static void InitGammaTables(void) {}
-static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) { return v; }
-static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
-  (void)shift;
-  return v;
-}
-
-#endif    // USE_GAMMA_COMPRESSION
-
-//------------------------------------------------------------------------------
-
-#define SUM4(ptr) LinearToGamma(                         \
-    GammaToLinear((ptr)[0]) +                            \
-    GammaToLinear((ptr)[step]) +                         \
-    GammaToLinear((ptr)[rgb_stride]) +                   \
-    GammaToLinear((ptr)[rgb_stride + step]), 0)          \
-
-#define SUM2H(ptr) \
-    LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[step]), 1)
-#define SUM2V(ptr) \
-    LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[rgb_stride]), 1)
-#define SUM1(ptr)  \
-    LinearToGamma(GammaToLinear((ptr)[0]), 2)
-
+// TODO: we can do better than simply 2x2 averaging on U/V samples.
+#define SUM4(ptr) ((ptr)[0] + (ptr)[step] + \
+                   (ptr)[rgb_stride] + (ptr)[rgb_stride + step])
+#define SUM2H(ptr) (2 * (ptr)[0] + 2 * (ptr)[step])
+#define SUM2V(ptr) (2 * (ptr)[0] + 2 * (ptr)[rgb_stride])
+#define SUM1(ptr)  (4 * (ptr)[0])
 #define RGB_TO_UV(x, y, SUM) {                           \
   const int src = (2 * (step * (x) + (y) * rgb_stride)); \
   const int dst = (x) + (y) * picture->uv_stride;        \
   const int r = SUM(r_ptr + src);                        \
   const int g = SUM(g_ptr + src);                        \
   const int b = SUM(b_ptr + src);                        \
-  picture->u[dst] = RGBToU(r, g, b, &rg);                \
-  picture->v[dst] = RGBToV(r, g, b, &rg);                \
+  picture->u[dst] = VP8RGBToU(r, g, b);                  \
+  picture->v[dst] = VP8RGBToV(r, g, b);                  \
 }
 
 #define RGB_TO_UV0(x_in, x_out, y, SUM) {                \
@@ -693,8 +568,8 @@ static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
   const int r = SUM(r_ptr + src);                        \
   const int g = SUM(g_ptr + src);                        \
   const int b = SUM(b_ptr + src);                        \
-  picture->u0[dst] = RGBToU(r, g, b, &rg);               \
-  picture->v0[dst] = RGBToV(r, g, b, &rg);               \
+  picture->u0[dst] = VP8RGBToU(r, g, b);                 \
+  picture->v0[dst] = VP8RGBToV(r, g, b);                 \
 }
 
 static void MakeGray(WebPPicture* const picture) {
@@ -713,14 +588,12 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
                               const uint8_t* const a_ptr,
                               int step,         // bytes per pixel
                               int rgb_stride,   // bytes per scanline
-                              float dithering,
                               WebPPicture* const picture) {
   const WebPEncCSP uv_csp = picture->colorspace & WEBP_CSP_UV_MASK;
   int x, y;
   const int width = picture->width;
   const int height = picture->height;
   const int has_alpha = CheckNonOpaque(a_ptr, width, height, step, rgb_stride);
-  VP8Random rg;
 
   picture->colorspace = uv_csp;
   picture->use_argb = 0;
@@ -729,15 +602,12 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
   }
   if (!WebPPictureAlloc(picture)) return 0;
 
-  VP8InitRandom(&rg, dithering);
-  InitGammaTables();
-
   // Import luma plane
   for (y = 0; y < height; ++y) {
     for (x = 0; x < width; ++x) {
       const int offset = step * x + y * rgb_stride;
       picture->y[x + y * picture->y_stride] =
-          RGBToY(r_ptr[offset], g_ptr[offset], b_ptr[offset], &rg);
+          VP8RGBToY(r_ptr[offset], g_ptr[offset], b_ptr[offset]);
     }
   }
 
@@ -785,7 +655,6 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
 
   if (has_alpha) {
     assert(step >= 4);
-    assert(picture->a != NULL);
     for (y = 0; y < height; ++y) {
       for (x = 0; x < width; ++x) {
         picture->a[x + y * picture->a_stride] =
@@ -808,7 +677,7 @@ static int Import(WebPPicture* const picture,
 
   if (!picture->use_argb) {
     return ImportYUVAFromRGBA(r_ptr, g_ptr, b_ptr, a_ptr, step, rgb_stride,
-                              0.f /* no dithering */, picture);
+                              picture);
   }
   if (import_alpha) {
     picture->colorspace |= WEBP_CSP_ALPHA_BIT;
@@ -823,7 +692,10 @@ static int Import(WebPPicture* const picture,
       for (x = 0; x < width; ++x) {
         const int offset = step * x + y * rgb_stride;
         const uint32_t argb =
-            MakeARGB32(r_ptr[offset], g_ptr[offset], b_ptr[offset]);
+            0xff000000u |
+            (r_ptr[offset] << 16) |
+            (g_ptr[offset] <<  8) |
+            (b_ptr[offset]);
         picture->argb[x + y * picture->argb_stride] = argb;
       }
     }
@@ -833,7 +705,7 @@ static int Import(WebPPicture* const picture,
     for (y = 0; y < height; ++y) {
       for (x = 0; x < width; ++x) {
         const int offset = step * x + y * rgb_stride;
-        const uint32_t argb = ((uint32_t)a_ptr[offset] << 24) |
+        const uint32_t argb = (a_ptr[offset] << 24) |
                               (r_ptr[offset] << 16) |
                               (g_ptr[offset] <<  8) |
                               (b_ptr[offset]);
@@ -884,7 +756,8 @@ int WebPPictureImportBGRX(WebPPicture* picture,
 
 int WebPPictureYUVAToARGB(WebPPicture* picture) {
   if (picture == NULL) return 0;
-  if (picture->y == NULL || picture->u == NULL || picture->v == NULL) {
+  if (picture->memory_ == NULL || picture->y == NULL ||
+      picture->u == NULL || picture->v == NULL) {
     return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
   }
   if ((picture->colorspace & WEBP_CSP_ALPHA_BIT) && picture->a == NULL) {
@@ -907,7 +780,7 @@ int WebPPictureYUVAToARGB(WebPPicture* picture) {
     WebPUpsampleLinePairFunc upsample = WebPGetLinePairConverter(ALPHA_IS_LAST);
 
     // First row, with replicated top samples.
-    upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, width);
+    upsample(NULL, cur_y, cur_u, cur_v, cur_u, cur_v, NULL, dst, width);
     cur_y += picture->y_stride;
     dst += argb_stride;
     // Center rows.
@@ -928,11 +801,11 @@ int WebPPictureYUVAToARGB(WebPPicture* picture) {
     // Insert alpha values if needed, in replacement for the default 0xff ones.
     if (picture->colorspace & WEBP_CSP_ALPHA_BIT) {
       for (y = 0; y < height; ++y) {
-        uint32_t* const argb_dst = picture->argb + y * picture->argb_stride;
+        uint32_t* const dst = picture->argb + y * picture->argb_stride;
         const uint8_t* const src = picture->a + y * picture->a_stride;
         int x;
         for (x = 0; x < width; ++x) {
-          argb_dst[x] = (argb_dst[x] & 0x00ffffffu) | ((uint32_t)src[x] << 24);
+          dst[x] = (dst[x] & 0x00ffffffu) | (src[x] << 24);
         }
       }
     }
@@ -940,8 +813,7 @@ int WebPPictureYUVAToARGB(WebPPicture* picture) {
   return 1;
 }
 
-int WebPPictureARGBToYUVADithered(WebPPicture* picture, WebPEncCSP colorspace,
-                                  float dithering) {
+int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {
   if (picture == NULL) return 0;
   if (picture->argb == NULL) {
     return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
@@ -957,8 +829,7 @@ int WebPPictureARGBToYUVADithered(WebPPicture* picture, WebPEncCSP colorspace,
     PictureResetARGB(&tmp);  // reset ARGB buffer so that it's not free()'d.
     tmp.use_argb = 0;
     tmp.colorspace = colorspace & WEBP_CSP_UV_MASK;
-    if (!ImportYUVAFromRGBA(r, g, b, a, 4, 4 * picture->argb_stride, dithering,
-                            &tmp)) {
+    if (!ImportYUVAFromRGBA(r, g, b, a, 4, 4 * picture->argb_stride, &tmp)) {
       return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
     }
     // Copy back the YUV specs into 'picture'.
@@ -970,10 +841,6 @@ int WebPPictureARGBToYUVADithered(WebPPicture* picture, WebPEncCSP colorspace,
   return 1;
 }
 
-int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {
-  return WebPPictureARGBToYUVADithered(picture, colorspace, 0.f);
-}
-
 //------------------------------------------------------------------------------
 // Helper: clean up fully transparent area to help compressibility.
 
@@ -1039,220 +906,67 @@ void WebPCleanupTransparentArea(WebPPicture* pic) {
 #undef SIZE
 #undef SIZE2
 
-//------------------------------------------------------------------------------
-// Blend color and remove transparency info
-
-#define BLEND(V0, V1, ALPHA) \
-    ((((V0) * (255 - (ALPHA)) + (V1) * (ALPHA)) * 0x101) >> 16)
-#define BLEND_10BIT(V0, V1, ALPHA) \
-    ((((V0) * (1020 - (ALPHA)) + (V1) * (ALPHA)) * 0x101) >> 18)
-
-void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
-  const int red = (background_rgb >> 16) & 0xff;
-  const int green = (background_rgb >> 8) & 0xff;
-  const int blue = (background_rgb >> 0) & 0xff;
-  VP8Random rg;
-  int x, y;
-  if (pic == NULL) return;
-  VP8InitRandom(&rg, 0.f);
-  if (!pic->use_argb) {
-    const int uv_width = (pic->width >> 1);  // omit last pixel during u/v loop
-    const int Y0 = RGBToY(red, green, blue, &rg);
-    // VP8RGBToU/V expects the u/v values summed over four pixels
-    const int U0 = RGBToU(4 * red, 4 * green, 4 * blue, &rg);
-    const int V0 = RGBToV(4 * red, 4 * green, 4 * blue, &rg);
-    const int has_alpha = pic->colorspace & WEBP_CSP_ALPHA_BIT;
-    if (!has_alpha || pic->a == NULL) return;    // nothing to do
-    for (y = 0; y < pic->height; ++y) {
-      // Luma blending
-      uint8_t* const y_ptr = pic->y + y * pic->y_stride;
-      uint8_t* const a_ptr = pic->a + y * pic->a_stride;
-      for (x = 0; x < pic->width; ++x) {
-        const int alpha = a_ptr[x];
-        if (alpha < 0xff) {
-          y_ptr[x] = BLEND(Y0, y_ptr[x], a_ptr[x]);
-        }
-      }
-      // Chroma blending every even line
-      if ((y & 1) == 0) {
-        uint8_t* const u = pic->u + (y >> 1) * pic->uv_stride;
-        uint8_t* const v = pic->v + (y >> 1) * pic->uv_stride;
-        uint8_t* const a_ptr2 =
-            (y + 1 == pic->height) ? a_ptr : a_ptr + pic->a_stride;
-        for (x = 0; x < uv_width; ++x) {
-          // Average four alpha values into a single blending weight.
-          // TODO(skal): might lead to visible contouring. Can we do better?
-          const int alpha =
-              a_ptr[2 * x + 0] + a_ptr[2 * x + 1] +
-              a_ptr2[2 * x + 0] + a_ptr2[2 * x + 1];
-          u[x] = BLEND_10BIT(U0, u[x], alpha);
-          v[x] = BLEND_10BIT(V0, v[x], alpha);
-        }
-        if (pic->width & 1) {   // rightmost pixel
-          const int alpha = 2 * (a_ptr[2 * x + 0] + a_ptr2[2 * x + 0]);
-          u[x] = BLEND_10BIT(U0, u[x], alpha);
-          v[x] = BLEND_10BIT(V0, v[x], alpha);
-        }
-      }
-      memset(a_ptr, 0xff, pic->width);
-    }
-  } else {
-    uint32_t* argb = pic->argb;
-    const uint32_t background = MakeARGB32(red, green, blue);
-    for (y = 0; y < pic->height; ++y) {
-      for (x = 0; x < pic->width; ++x) {
-        const int alpha = (argb[x] >> 24) & 0xff;
-        if (alpha != 0xff) {
-          if (alpha > 0) {
-            int r = (argb[x] >> 16) & 0xff;
-            int g = (argb[x] >>  8) & 0xff;
-            int b = (argb[x] >>  0) & 0xff;
-            r = BLEND(red, r, alpha);
-            g = BLEND(green, g, alpha);
-            b = BLEND(blue, b, alpha);
-            argb[x] = MakeARGB32(r, g, b);
-          } else {
-            argb[x] = background;
-          }
-        }
-      }
-      argb += pic->argb_stride;
-    }
-  }
-}
-
-#undef BLEND
-#undef BLEND_10BIT
-
-//------------------------------------------------------------------------------
-// local-min distortion
-//
-// For every pixel in the *reference* picture, we search for the local best
-// match in the compressed image. This is not a symmetrical measure.
-
-// search radius. Shouldn't be too large.
-#define RADIUS 2
-
-static float AccumulateLSIM(const uint8_t* src, int src_stride,
-                            const uint8_t* ref, int ref_stride,
-                            int w, int h) {
-  int x, y;
-  double total_sse = 0.;
-  for (y = 0; y < h; ++y) {
-    const int y_0 = (y - RADIUS < 0) ? 0 : y - RADIUS;
-    const int y_1 = (y + RADIUS + 1 >= h) ? h : y + RADIUS + 1;
-    for (x = 0; x < w; ++x) {
-      const int x_0 = (x - RADIUS < 0) ? 0 : x - RADIUS;
-      const int x_1 = (x + RADIUS + 1 >= w) ? w : x + RADIUS + 1;
-      double best_sse = 255. * 255.;
-      const double value = (double)ref[y * ref_stride + x];
-      int i, j;
-      for (j = y_0; j < y_1; ++j) {
-        const uint8_t* s = src + j * src_stride;
-        for (i = x_0; i < x_1; ++i) {
-          const double sse = (double)(s[i] - value) * (s[i] - value);
-          if (sse < best_sse) best_sse = sse;
-        }
-      }
-      total_sse += best_sse;
-    }
-  }
-  return (float)total_sse;
-}
-#undef RADIUS
 
 //------------------------------------------------------------------------------
 // Distortion
 
 // Max value returned in case of exact similarity.
 static const double kMinDistortion_dB = 99.;
-static float GetPSNR(const double v) {
-  return (float)((v > 0.) ? -4.3429448 * log(v / (255 * 255.))
-                          : kMinDistortion_dB);
-}
 
-int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
+int WebPPictureDistortion(const WebPPicture* pic1, const WebPPicture* pic2,
                           int type, float result[5]) {
+  int c;
   DistoStats stats[5];
   int has_alpha;
-  int uv_w, uv_h;
 
-  if (src == NULL || ref == NULL ||
-      src->width != ref->width || src->height != ref->height ||
-      src->y == NULL || ref->y == NULL ||
-      src->u == NULL || ref->u == NULL ||
-      src->v == NULL || ref->v == NULL ||
+  if (pic1 == NULL || pic2 == NULL ||
+      pic1->width != pic2->width || pic1->height != pic2->height ||
+      pic1->y == NULL || pic2->y == NULL ||
+      pic1->u == NULL || pic2->u == NULL ||
+      pic1->v == NULL || pic2->v == NULL ||
       result == NULL) {
     return 0;
   }
   // TODO(skal): provide distortion for ARGB too.
-  if (src->use_argb == 1 || src->use_argb != ref->use_argb) {
+  if (pic1->use_argb == 1 || pic1->use_argb != pic2->use_argb) {
     return 0;
   }
 
-  has_alpha = !!(src->colorspace & WEBP_CSP_ALPHA_BIT);
-  if (has_alpha != !!(ref->colorspace & WEBP_CSP_ALPHA_BIT) ||
-      (has_alpha && (src->a == NULL || ref->a == NULL))) {
+  has_alpha = !!(pic1->colorspace & WEBP_CSP_ALPHA_BIT);
+  if (has_alpha != !!(pic2->colorspace & WEBP_CSP_ALPHA_BIT) ||
+      (has_alpha && (pic1->a == NULL || pic2->a == NULL))) {
     return 0;
   }
 
   memset(stats, 0, sizeof(stats));
-
-  uv_w = HALVE(src->width);
-  uv_h = HALVE(src->height);
-  if (type >= 2) {
-    float sse[4];
-    sse[0] = AccumulateLSIM(src->y, src->y_stride,
-                            ref->y, ref->y_stride, src->width, src->height);
-    sse[1] = AccumulateLSIM(src->u, src->uv_stride,
-                            ref->u, ref->uv_stride, uv_w, uv_h);
-    sse[2] = AccumulateLSIM(src->v, src->uv_stride,
-                            ref->v, ref->uv_stride, uv_w, uv_h);
-    sse[3] = has_alpha ? AccumulateLSIM(src->a, src->a_stride,
-                                        ref->a, ref->a_stride,
-                                        src->width, src->height)
-                       : 0.f;
-    result[0] = GetPSNR(sse[0] / (src->width * src->height));
-    result[1] = GetPSNR(sse[1] / (uv_w * uv_h));
-    result[2] = GetPSNR(sse[2] / (uv_w * uv_h));
-    result[3] = GetPSNR(sse[3] / (src->width * src->height));
-    {
-      double total_sse = sse[0] + sse[1] + sse[2];
-      int total_pixels = src->width * src->height + 2 * uv_w * uv_h;
-      if (has_alpha) {
-        total_pixels += src->width * src->height;
-        total_sse += sse[3];
-      }
-      result[4] = GetPSNR(total_sse / total_pixels);
-    }
-  } else {
-    int c;
-    VP8SSIMAccumulatePlane(src->y, src->y_stride,
-                           ref->y, ref->y_stride,
-                           src->width, src->height, &stats[0]);
-    VP8SSIMAccumulatePlane(src->u, src->uv_stride,
-                           ref->u, ref->uv_stride,
-                           uv_w, uv_h, &stats[1]);
-    VP8SSIMAccumulatePlane(src->v, src->uv_stride,
-                           ref->v, ref->uv_stride,
-                           uv_w, uv_h, &stats[2]);
-    if (has_alpha) {
-      VP8SSIMAccumulatePlane(src->a, src->a_stride,
-                             ref->a, ref->a_stride,
-                             src->width, src->height, &stats[3]);
-    }
-    for (c = 0; c <= 4; ++c) {
-      if (type == 1) {
-        const double v = VP8SSIMGet(&stats[c]);
-        result[c] = (float)((v < 1.) ? -10.0 * log10(1. - v)
-                                     : kMinDistortion_dB);
-      } else {
-        const double v = VP8SSIMGetSquaredError(&stats[c]);
-        result[c] = GetPSNR(v);
-      }
-      // Accumulate forward
-      if (c < 4) VP8SSIMAddStats(&stats[c], &stats[4]);
+  VP8SSIMAccumulatePlane(pic1->y, pic1->y_stride,
+                         pic2->y, pic2->y_stride,
+                         pic1->width, pic1->height, &stats[0]);
+  VP8SSIMAccumulatePlane(pic1->u, pic1->uv_stride,
+                         pic2->u, pic2->uv_stride,
+                         (pic1->width + 1) >> 1, (pic1->height + 1) >> 1,
+                         &stats[1]);
+  VP8SSIMAccumulatePlane(pic1->v, pic1->uv_stride,
+                         pic2->v, pic2->uv_stride,
+                         (pic1->width + 1) >> 1, (pic1->height + 1) >> 1,
+                         &stats[2]);
+  if (has_alpha) {
+    VP8SSIMAccumulatePlane(pic1->a, pic1->a_stride,
+                           pic2->a, pic2->a_stride,
+                           pic1->width, pic1->height, &stats[3]);
+  }
+  for (c = 0; c <= 4; ++c) {
+    if (type == 1) {
+      const double v = VP8SSIMGet(&stats[c]);
+      result[c] = (float)((v < 1.) ? -10.0 * log10(1. - v)
+                                   : kMinDistortion_dB);
+    } else {
+      const double v = VP8SSIMGetSquaredError(&stats[c]);
+      result[c] = (float)((v > 0.) ? -4.3429448 * log(v / (255 * 255.))
+                                   : kMinDistortion_dB);
     }
+    // Accumulate forward
+    if (c < 4) VP8SSIMAddStats(&stats[c], &stats[4]);
   }
   return 1;
 }
@@ -1300,10 +1014,10 @@ size_t NAME(const uint8_t* in, int w, int h, int bps, float q,          \
   return Encode(in, w, h, bps, IMPORTER, q, 0, out);                    \
 }
 
-ENCODE_FUNC(WebPEncodeRGB, WebPPictureImportRGB)
-ENCODE_FUNC(WebPEncodeBGR, WebPPictureImportBGR)
-ENCODE_FUNC(WebPEncodeRGBA, WebPPictureImportRGBA)
-ENCODE_FUNC(WebPEncodeBGRA, WebPPictureImportBGRA)
+ENCODE_FUNC(WebPEncodeRGB, WebPPictureImportRGB);
+ENCODE_FUNC(WebPEncodeBGR, WebPPictureImportBGR);
+ENCODE_FUNC(WebPEncodeRGBA, WebPPictureImportRGBA);
+ENCODE_FUNC(WebPEncodeBGRA, WebPPictureImportBGRA);
 
 #undef ENCODE_FUNC
 
@@ -1313,12 +1027,15 @@ size_t NAME(const uint8_t* in, int w, int h, int bps, uint8_t** out) {       \
   return Encode(in, w, h, bps, IMPORTER, LOSSLESS_DEFAULT_QUALITY, 1, out);  \
 }
 
-LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGB, WebPPictureImportRGB)
-LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGR, WebPPictureImportBGR)
-LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGBA, WebPPictureImportRGBA)
-LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGRA, WebPPictureImportBGRA)
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGB, WebPPictureImportRGB);
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGR, WebPPictureImportBGR);
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGBA, WebPPictureImportRGBA);
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGRA, WebPPictureImportBGRA);
 
 #undef LOSSLESS_ENCODE_FUNC
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/enc/quant.c b/drivers/webp/enc/quant.c
index e1d202b5a3..ea153849c8 100644
--- a/drivers/webp/enc/quant.c
+++ b/drivers/webp/enc/quant.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 //   Quantization
@@ -13,7 +11,6 @@
 
 #include <assert.h>
 #include <math.h>
-#include <stdlib.h>  // for abs()
 
 #include "./vp8enci.h"
 #include "./cost.h"
@@ -25,78 +22,16 @@
 
 #define MID_ALPHA 64      // neutral value for susceptibility
 #define MIN_ALPHA 30      // lowest usable value for susceptibility
-#define MAX_ALPHA 100     // higher meaningful value for susceptibility
+#define MAX_ALPHA 100     // higher meaningful value for susceptibility
 
 #define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
                           // power-law modulation. Must be strictly less than 1.
 
-#define I4_PENALTY 4000   // Rate-penalty for quick i4/i16 decision
-
-// number of non-zero coeffs below which we consider the block very flat
-// (and apply a penalty to complex predictions)
-#define FLATNESS_LIMIT_I16 10      // I16 mode
-#define FLATNESS_LIMIT_I4  3       // I4 mode
-#define FLATNESS_LIMIT_UV  2       // UV mode
-#define FLATNESS_PENALTY   140     // roughly ~1bit per block
-
 #define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
 
-// #define DEBUG_BLOCK
-
-//------------------------------------------------------------------------------
-
-#if defined(DEBUG_BLOCK)
-
-#include <stdio.h>
-#include <stdlib.h>
-
-static void PrintBlockInfo(const VP8EncIterator* const it,
-                           const VP8ModeScore* const rd) {
-  int i, j;
-  const int is_i16 = (it->mb_->type_ == 1);
-  printf("SOURCE / OUTPUT / ABS DELTA\n");
-  for (j = 0; j < 24; ++j) {
-    if (j == 16) printf("\n");   // newline before the U/V block
-    for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_in_[i + j * BPS]);
-    printf("     ");
-    for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_out_[i + j * BPS]);
-    printf("     ");
-    for (i = 0; i < 16; ++i) {
-      printf("%1d ", abs(it->yuv_out_[i + j * BPS] - it->yuv_in_[i + j * BPS]));
-    }
-    printf("\n");
-  }
-  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
-    (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
-    (int)rd->score);
-  if (is_i16) {
-    printf("Mode: %d\n", rd->mode_i16);
-    printf("y_dc_levels:");
-    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
-    printf("\n");
-  } else {
-    printf("Modes[16]: ");
-    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
-    printf("\n");
-  }
-  printf("y_ac_levels:\n");
-  for (j = 0; j < 16; ++j) {
-    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
-      printf("%4d ", rd->y_ac_levels[j][i]);
-    }
-    printf("\n");
-  }
-  printf("\n");
-  printf("uv_levels (mode=%d):\n", rd->mode_uv);
-  for (j = 0; j < 8; ++j) {
-    for (i = 0; i < 16; ++i) {
-      printf("%4d ", rd->uv_levels[j][i]);
-    }
-    printf("\n");
-  }
-}
-
-#endif   // DEBUG_BLOCK
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
 
 //------------------------------------------------------------------------------
 
@@ -165,13 +100,31 @@ static const uint16_t kAcTable2[128] = {
   385, 393, 401, 409, 416, 424, 432, 440
 };
 
-static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
-  { 96, 110 }, { 96, 108 }, { 110, 115 }
+static const uint16_t kCoeffThresh[16] = {
+  0,  10, 20, 30,
+  10, 20, 30, 30,
+  20, 30, 30, 30,
+  30, 30, 30, 30
+};
+
+// TODO(skal): tune more. Coeff thresholding?
+static const uint8_t kBiasMatrices[3][16] = {  // [3] = [luma-ac,luma-dc,chroma]
+  { 96, 96, 96, 96,
+    96, 96, 96, 96,
+    96, 96, 96, 96,
+    96, 96, 96, 96 },
+  { 96, 96, 96, 96,
+    96, 96, 96, 96,
+    96, 96, 96, 96,
+    96, 96, 96, 96 },
+  { 96, 96, 96, 96,
+    96, 96, 96, 96,
+    96, 96, 96, 96,
+    96, 96, 96, 96 }
 };
 
-// Sharpening by (slightly) raising the hi-frequency coeffs.
+// Sharpening by (slightly) raising the hi-frequency coeffs (only for trellis).
 // Hack-ish but helpful for mid-bitrate range. Use with care.
-#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
 static const uint8_t kFreqSharpening[16] = {
   0,  30, 60, 90,
   30, 60, 90, 90,
@@ -184,30 +137,20 @@ static const uint8_t kFreqSharpening[16] = {
 
 // Returns the average quantizer
 static int ExpandMatrix(VP8Matrix* const m, int type) {
-  int i, sum;
-  for (i = 0; i < 2; ++i) {
-    const int is_ac_coeff = (i > 0);
-    const int bias = kBiasMatrices[type][is_ac_coeff];
-    m->iq_[i] = (1 << QFIX) / m->q_[i];
-    m->bias_[i] = BIAS(bias);
-    // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
-    //   * zero if coeff <= zthresh
-    //   * non-zero if coeff > zthresh
-    m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
-  }
+  int i;
+  int sum = 0;
   for (i = 2; i < 16; ++i) {
     m->q_[i] = m->q_[1];
-    m->iq_[i] = m->iq_[1];
-    m->bias_[i] = m->bias_[1];
-    m->zthresh_[i] = m->zthresh_[1];
   }
-  for (sum = 0, i = 0; i < 16; ++i) {
-    if (type == 0) {  // we only use sharpening for AC luma coeffs
-      m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
-    } else {
-      m->sharpen_[i] = 0;
-    }
-    sum += m->q_[i];
+  for (i = 0; i < 16; ++i) {
+    const int j = kZigzag[i];
+    const int bias = kBiasMatrices[type][j];
+    m->iq_[j] = (1 << QFIX) / m->q_[j];
+    m->bias_[j] = BIAS(bias);
+    // TODO(skal): tune kCoeffThresh[]
+    m->zthresh_[j] = ((256 /*+ kCoeffThresh[j]*/ - bias) * m->q_[j] + 127) >> 8;
+    m->sharpen_[j] = (kFreqSharpening[j] * m->q_[j]) >> 11;
+    sum += m->q_[j];
   }
   return (sum + 8) >> 4;
 }
@@ -235,17 +178,17 @@ static void SetupMatrices(VP8Encoder* enc) {
     q16 = ExpandMatrix(&m->y2_, 1);
     quv = ExpandMatrix(&m->uv_, 2);
 
-    m->lambda_i4_          = (3 * q4 * q4) >> 7;
-    m->lambda_i16_         = (3 * q16 * q16);
-    m->lambda_uv_          = (3 * quv * quv) >> 6;
-    m->lambda_mode_        = (1 * q4 * q4) >> 7;
-    m->lambda_trellis_i4_  = (7 * q4 * q4) >> 3;
-    m->lambda_trellis_i16_ = (q16 * q16) >> 2;
-    m->lambda_trellis_uv_  = (quv *quv) << 1;
-    m->tlambda_            = (tlambda_scale * q4) >> 5;
-
-    m->min_disto_ = 10 * m->y1_.q_[0];   // quantization-aware min disto
-    m->max_edge_  = 0;
+    // TODO: Switch to kLambda*[] tables?
+    {
+      m->lambda_i4_  = (3 * q4 * q4) >> 7;
+      m->lambda_i16_ = (3 * q16 * q16);
+      m->lambda_uv_  = (3 * quv * quv) >> 6;
+      m->lambda_mode_    = (1 * q4 * q4) >> 7;
+      m->lambda_trellis_i4_  = (7 * q4 * q4) >> 3;
+      m->lambda_trellis_i16_ = (q16 * q16) >> 2;
+      m->lambda_trellis_uv_  = (quv *quv) << 1;
+      m->tlambda_            = (tlambda_scale * q4) >> 5;
+    }
   }
 }
 
@@ -254,21 +197,16 @@ static void SetupMatrices(VP8Encoder* enc) {
 
 // Very small filter-strength values have close to no visual effect. So we can
 // save a little decoding-CPU by turning filtering off for these.
-#define FSTRENGTH_CUTOFF 2
+#define FSTRENGTH_CUTOFF 3
 
 static void SetupFilterStrength(VP8Encoder* const enc) {
   int i;
-  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
-  const int level0 = 5 * enc->config_->filter_strength;
+  const int level0 = enc->config_->filter_strength;
   for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
-    VP8SegmentInfo* const m = &enc->dqm_[i];
-    // We focus on the quantization of AC coeffs.
-    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
-    const int base_strength =
-        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
-    // Segments with lower complexity ('beta') will be less filtered.
-    const int f = base_strength * level0 / (256 + m->beta_);
-    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
+    // Segments with lower quantizer will be less filtered. TODO: tune (wrt SNS)
+    const int level = level0 * 256 * enc->dqm_[i].quant_ / 128;
+    const int f = level / (256 + enc->dqm_[i].beta_);
+    enc->dqm_[i].fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
   }
   // We record the initial strength (mainly for the case of 1-segment only).
   enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
@@ -286,90 +224,28 @@ static void SetupFilterStrength(VP8Encoder* const enc) {
 // We want to emulate jpeg-like behaviour where the expected "good" quality
 // is around q=75. Internally, our "good" middle is around c=50. So we
 // map accordingly using linear piece-wise function
-static double QualityToCompression(double c) {
-  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
-  // The file size roughly scales as pow(quantizer, 3.). Actually, the
-  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
-  // in the mid-quant range. So we scale the compressibility inversely to
-  // this power-law: quant ~= compression ^ 1/3. This law holds well for
-  // low quant. Finer modeling for high-quant would make use of kAcTable[]
-  // more explicitly.
-  const double v = pow(linear_c, 1 / 3.);
-  return v;
-}
-
-static double QualityToJPEGCompression(double c, double alpha) {
-  // We map the complexity 'alpha' and quality setting 'c' to a compression
-  // exponent empirically matched to the compression curve of libjpeg6b.
-  // On average, the WebP output size will be roughly similar to that of a
-  // JPEG file compressed with same quality factor.
-  const double amin = 0.30;
-  const double amax = 0.85;
-  const double exp_min = 0.4;
-  const double exp_max = 0.9;
-  const double slope = (exp_min - exp_max) / (amax - amin);
-  // Linearly interpolate 'expn' from exp_min to exp_max
-  // in the [amin, amax] range.
-  const double expn = (alpha > amax) ? exp_min
-                    : (alpha < amin) ? exp_max
-                    : exp_max + slope * (alpha - amin);
-  const double v = pow(c, expn);
-  return v;
-}
-
-static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
-                                 const VP8SegmentInfo* const S2) {
-  return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
-}
-
-static void SimplifySegments(VP8Encoder* const enc) {
-  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
-  const int num_segments = enc->segment_hdr_.num_segments_;
-  int num_final_segments = 1;
-  int s1, s2;
-  for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
-    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
-    int found = 0;
-    // check if we already have similar segment
-    for (s2 = 0; s2 < num_final_segments; ++s2) {
-      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
-      if (SegmentsAreEquivalent(S1, S2)) {
-        found = 1;
-        break;
-      }
-    }
-    map[s1] = s2;
-    if (!found) {
-      if (num_final_segments != s1) {
-        enc->dqm_[num_final_segments] = enc->dqm_[s1];
-      }
-      ++num_final_segments;
-    }
-  }
-  if (num_final_segments < num_segments) {  // Remap
-    int i = enc->mb_w_ * enc->mb_h_;
-    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
-    enc->segment_hdr_.num_segments_ = num_final_segments;
-    // Replicate the trailing segment infos (it's mostly cosmetics)
-    for (i = num_final_segments; i < num_segments; ++i) {
-      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
-    }
-  }
+static double QualityToCompression(double q) {
+  const double c = q / 100.;
+  return (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
 }
 
 void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
   int i;
   int dq_uv_ac, dq_uv_dc;
-  const int num_segments = enc->segment_hdr_.num_segments_;
+  const int num_segments = enc->config_->segments;
   const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
-  const double Q = quality / 100.;
-  const double c_base = enc->config_->emulate_jpeg_size ?
-      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
-      QualityToCompression(Q);
+  const double c_base = QualityToCompression(quality);
   for (i = 0; i < num_segments; ++i) {
-    // We modulate the base coefficient to accommodate for the quantization
-    // susceptibility and allow denser segments to be quantized more.
-    const double expn = 1. - amp * enc->dqm_[i].alpha_;
+    // The file size roughly scales as pow(quantizer, 3.). Actually, the
+    // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
+    // in the mid-quant range. So we scale the compressibility inversely to
+    // this power-law: quant ~= compression ^ 1/3. This law holds well for
+    // low quant. Finer modelling for high-quant would make use of kAcTable[]
+    // more explicitly.
+    // Additionally, we modulate the base exponent 1/3 to accommodate for the
+    // quantization susceptibility and allow denser segments to be quantized
+    // more.
+    const double expn = (1. - amp * enc->dqm_[i].alpha_) / 3.;
     const double c = pow(c_base, expn);
     const int q = (int)(127. * (1. - c));
     assert(expn > 0.);
@@ -405,11 +281,9 @@ void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
   enc->dq_uv_dc_ = dq_uv_dc;
   enc->dq_uv_ac_ = dq_uv_ac;
 
-  SetupFilterStrength(enc);   // initialize segments' filtering, eventually
-
-  if (num_segments > 1) SimplifySegments(enc);
+  SetupMatrices(enc);
 
-  SetupMatrices(enc);         // finalize quantization matrices
+  SetupFilterStrength(enc);   // initialize segments' filtering, eventually
 }
 
 //------------------------------------------------------------------------------
@@ -425,14 +299,16 @@ const int VP8I4ModeOffsets[NUM_BMODES] = {
 };
 
 void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
-  const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
-  const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
+  const VP8Encoder* const enc = it->enc_;
+  const uint8_t* const left = it->x_ ? enc->y_left_ : NULL;
+  const uint8_t* const top = it->y_ ? enc->y_top_ + it->x_ * 16 : NULL;
   VP8EncPredLuma16(it->yuv_p_, left, top);
 }
 
 void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
-  const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
-  const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
+  const VP8Encoder* const enc = it->enc_;
+  const uint8_t* const left = it->x_ ? enc->u_left_ : NULL;
+  const uint8_t* const top = it->y_ ? enc->uv_top_ + it->x_ * 16 : NULL;
   VP8EncPredChroma8(it->yuv_p_, left, top);
 }
 
@@ -488,7 +364,6 @@ static void InitScore(VP8ModeScore* const rd) {
   rd->D  = 0;
   rd->SD = 0;
   rd->R  = 0;
-  rd->H  = 0;
   rd->nz = 0;
   rd->score = MAX_COST;
 }
@@ -497,7 +372,6 @@ static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
   dst->D  = src->D;
   dst->SD = src->SD;
   dst->R  = src->R;
-  dst->H  = src->H;
   dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
   dst->score = src->score;
 }
@@ -506,7 +380,6 @@ static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
   dst->D  += src->D;
   dst->SD += src->SD;
   dst->R  += src->R;
-  dst->H  += src->H;
   dst->nz |= src->nz;     // here, new nz bits are accumulated.
   dst->score += src->score;
 }
@@ -535,7 +408,7 @@ typedef struct {
 
 static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
   // TODO: incorporate the "* 256" in the tables?
-  rd->score = (rd->R + rd->H) * lambda + 256 * (rd->D + rd->SD);
+  rd->score = rd->R * lambda + 256 * (rd->D + rd->SD);
 }
 
 static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
@@ -598,10 +471,11 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
     // note: it's important to take sign of the _original_ coeff,
     // so we don't have to consider level < 0 afterward.
     const int sign = (in[j] < 0);
-    const int coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
-    int level0 = QUANTDIV(coeff0, iQ, B);
-    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
+    int coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
+    int level0;
+    if (coeff0 > 2047) coeff0 = 2047;
 
+    level0 = QUANTDIV(coeff0, iQ, B);
     // test all alternate level values around level0.
     for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
       Node* const cur = &NODE(n, m);
@@ -613,7 +487,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
       cur->sign = sign;
       cur->level = level;
       cur->ctx = (level == 0) ? 0 : (level == 1) ? 1 : 2;
-      if (level > MAX_LEVEL || level < 0) {   // node is dead?
+      if (level >= 2048 || level < 0) {   // node is dead?
         cur->cost = MAX_COST;
         continue;
       }
@@ -706,10 +580,10 @@ static int ReconstructIntra16(VP8EncIterator* const it,
                               VP8ModeScore* const rd,
                               uint8_t* const yuv_out,
                               int mode) {
-  VP8Encoder* const enc = it->enc_;
+  const VP8Encoder* const enc = it->enc_;
   const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
   const uint8_t* const src = it->yuv_in_ + Y_OFF;
-  VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
   int nz = 0;
   int n;
   int16_t tmp[16][16], dc_tmp[16];
@@ -718,7 +592,7 @@ static int ReconstructIntra16(VP8EncIterator* const it,
     VP8FTransform(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
   }
   VP8FTransformWHT(tmp[0], dc_tmp);
-  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;
+  nz |= VP8EncQuantizeBlock(dc_tmp, rd->y_dc_levels, 0, &dqm->y2_) << 24;
 
   if (DO_TRELLIS_I16 && it->do_trellis_) {
     int x, y;
@@ -813,18 +687,7 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
 
 //------------------------------------------------------------------------------
 // RD-opt decision. Reconstruct each modes, evalue distortion and bit-cost.
-// Pick the mode is lower RD-cost = Rate + lambda * Distortion.
-
-static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
-  // We look at the first three AC coefficients to determine what is the average
-  // delta between each sub-4x4 block.
-  const int v0 = abs(DCs[1]);
-  const int v1 = abs(DCs[4]);
-  const int v2 = abs(DCs[5]);
-  int max_v = (v0 > v1) ? v1 : v0;
-  max_v = (v2 > max_v) ? v2 : max_v;
-  if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
-}
+// Pick the mode with the lower RD-cost = Rate + lambda * Distortion.
 
 static void SwapPtr(uint8_t** a, uint8_t** b) {
   uint8_t* const tmp = *a;
@@ -836,23 +699,9 @@ static void SwapOut(VP8EncIterator* const it) {
   SwapPtr(&it->yuv_out_, &it->yuv_out2_);
 }
 
-static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
-  score_t score = 0;
-  while (num_blocks-- > 0) {      // TODO(skal): refine positional scoring?
-    int i;
-    for (i = 1; i < 16; ++i) {    // omit DC, we're only interested in AC
-      score += (levels[i] != 0);
-      if (score > thresh) return 0;
-    }
-    levels += 16;
-  }
-  return 1;
-}
-
 static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
-  const int kNumBlocks = 16;
-  VP8Encoder* const enc = it->enc_;
-  VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+  const VP8Encoder* const enc = it->enc_;
+  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
   const int lambda = dqm->lambda_i16_;
   const int tlambda = dqm->tlambda_;
   const uint8_t* const src = it->yuv_in_ + Y_OFF;
@@ -860,7 +709,7 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
   int mode;
 
   rd->mode_i16 = -1;
-  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
+  for (mode = 0; mode < 4; ++mode) {
     uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF;  // scratch buffer
     int nz;
 
@@ -871,13 +720,8 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
     rd16.D = VP8SSE16x16(src, tmp_dst);
     rd16.SD = tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY))
             : 0;
-    rd16.H = VP8FixedCostsI16[mode];
     rd16.R = VP8GetCostLuma16(it, &rd16);
-    if (mode > 0 &&
-        IsFlat(rd16.y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
-      // penalty to avoid flat area to be mispredicted by complex mode
-      rd16.R += FLATNESS_PENALTY * kNumBlocks;
-    }
+    rd16.R += VP8FixedCostsI16[mode];
 
     // Since we always examine Intra16 first, we can overwrite *rd directly.
     SetRDScore(lambda, &rd16);
@@ -892,13 +736,6 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
   }
   SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
   VP8SetIntra16Mode(it, rd->mode_i16);
-
-  // we have a blocky macroblock (only DCs are non-zero) with fairly high
-  // distortion, record max delta so we can later adjust the minimal filtering
-  // strength needed to smooth these blocks out.
-  if ((rd->nz & 0xffff) == 0 && rd->D > dqm->min_disto_) {
-    StoreMaxDelta(dqm, rd->y_dc_levels);
-  }
 }
 
 //------------------------------------------------------------------------------
@@ -928,11 +765,9 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
   }
 
   InitScore(&rd_best);
-  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
-  SetRDScore(dqm->lambda_mode_, &rd_best);
+  rd_best.score = 211;  // '211' is the value of VP8BitCost(0, 145)
   VP8IteratorStartI4(it);
   do {
-    const int kNumBlocks = 1;
     VP8ModeScore rd_i4;
     int mode;
     int best_mode = -1;
@@ -956,11 +791,8 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
       rd_tmp.SD =
           tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                   : 0;
-      rd_tmp.H = mode_costs[mode];
       rd_tmp.R = VP8GetCostLuma4(it, tmp_levels);
-      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
-        rd_tmp.R += FLATNESS_PENALTY * kNumBlocks;
-      }
+      rd_tmp.R += mode_costs[mode];
 
       SetRDScore(lambda, &rd_tmp);
       if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
@@ -972,17 +804,14 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
     }
     SetRDScore(dqm->lambda_mode_, &rd_i4);
     AddScore(&rd_best, &rd_i4);
-    if (rd_best.score >= rd->score) {
-      return 0;
-    }
-    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
-    if (total_header_bits > enc->max_i4_header_bits_) {
+    total_header_bits += mode_costs[best_mode];
+    if (rd_best.score >= rd->score ||
+        total_header_bits > enc->max_i4_header_bits_) {
       return 0;
     }
     // Copy selected samples if not in the right place already.
-    if (best_block != best_blocks + VP8Scan[it->i4_]) {
+    if (best_block != best_blocks + VP8Scan[it->i4_])
       VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
-    }
     rd->modes_i4[it->i4_] = best_mode;
     it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
   } while (VP8IteratorRotateI4(it, best_blocks));
@@ -998,7 +827,6 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
 //------------------------------------------------------------------------------
 
 static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
-  const int kNumBlocks = 8;
   const VP8Encoder* const enc = it->enc_;
   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
   const int lambda = dqm->lambda_uv_;
@@ -1010,7 +838,7 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
 
   rd->mode_uv = -1;
   InitScore(&rd_best);
-  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
+  for (mode = 0; mode < 4; ++mode) {
     VP8ModeScore rd_uv;
 
     // Reconstruct
@@ -1019,11 +847,8 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
     // Compute RD-score
     rd_uv.D  = VP8SSE16x8(src, tmp_dst);
     rd_uv.SD = 0;    // TODO: should we call TDisto? it tends to flatten areas.
-    rd_uv.H  = VP8FixedCostsUV[mode];
     rd_uv.R  = VP8GetCostUV(it, &rd_uv);
-    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
-      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
-    }
+    rd_uv.R += VP8FixedCostsUV[mode];
 
     SetRDScore(lambda, &rd_uv);
     if (mode == 0 || rd_uv.score < rd_best.score) {
@@ -1042,10 +867,10 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
 
 static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
   const VP8Encoder* const enc = it->enc_;
-  const int is_i16 = (it->mb_->type_ == 1);
+  const int i16 = (it->mb_->type_ == 1);
   int nz = 0;
 
-  if (is_i16) {
+  if (i16) {
     nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF, it->preds_[0]);
   } else {
     VP8IteratorStartI4(it);
@@ -1064,66 +889,11 @@ static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
   rd->nz = nz;
 }
 
-// Refine intra16/intra4 sub-modes based on distortion only (not rate).
-static void DistoRefine(VP8EncIterator* const it, int try_both_i4_i16) {
-  const int is_i16 = (it->mb_->type_ == 1);
-  score_t best_score = MAX_COST;
-
-  if (try_both_i4_i16 || is_i16) {
-    int mode;
-    int best_mode = -1;
-    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
-      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
-      const uint8_t* const src = it->yuv_in_ + Y_OFF;
-      const score_t score = VP8SSE16x16(src, ref);
-      if (score < best_score) {
-        best_mode = mode;
-        best_score = score;
-      }
-    }
-    VP8SetIntra16Mode(it, best_mode);
-  }
-  if (try_both_i4_i16 || !is_i16) {
-    uint8_t modes_i4[16];
-    // We don't evaluate the rate here, but just account for it through a
-    // constant penalty (i4 mode usually needs more bits compared to i16).
-    score_t score_i4 = (score_t)I4_PENALTY;
-
-    VP8IteratorStartI4(it);
-    do {
-      int mode;
-      int best_sub_mode = -1;
-      score_t best_sub_score = MAX_COST;
-      const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
-
-      // TODO(skal): we don't really need the prediction pixels here,
-      // but just the distortion against 'src'.
-      VP8MakeIntra4Preds(it);
-      for (mode = 0; mode < NUM_BMODES; ++mode) {
-        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
-        const score_t score = VP8SSE4x4(src, ref);
-        if (score < best_sub_score) {
-          best_sub_mode = mode;
-          best_sub_score = score;
-        }
-      }
-      modes_i4[it->i4_] = best_sub_mode;
-      score_i4 += best_sub_score;
-      if (score_i4 >= best_score) break;
-    } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF));
-    if (score_i4 < best_score) {
-      VP8SetIntra4Mode(it, modes_i4);
-    }
-  }
-}
-
 //------------------------------------------------------------------------------
 // Entry point
 
-int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
-                VP8RDLevel rd_opt) {
+int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd, int rd_opt) {
   int is_skipped;
-  const int method = it->enc_->method_;
 
   InitScore(rd);
 
@@ -1132,21 +902,22 @@ int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
   VP8MakeLuma16Preds(it);
   VP8MakeChroma8Preds(it);
 
-  if (rd_opt > RD_OPT_NONE) {
-    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
+  // for rd_opt = 2, we perform trellis-quant on the final decision only.
+  // for rd_opt > 2, we use it for every scoring (=much slower).
+  if (rd_opt > 0) {
+    it->do_trellis_ = (rd_opt > 2);
     PickBestIntra16(it, rd);
-    if (method >= 2) {
+    if (it->enc_->method_ >= 2) {
       PickBestIntra4(it, rd);
     }
     PickBestUV(it, rd);
-    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
+    if (rd_opt == 2) {
       it->do_trellis_ = 1;
       SimpleQuantize(it, rd);
     }
   } else {
-    // For method == 2, pick the best intra4/intra16 based on SSE (~tad slower).
-    // For method <= 1, we refine intra4 or intra16 (but don't re-examine mode).
-    DistoRefine(it, (method >= 2));
+    // TODO: for method_ == 2, pick the best intra4/intra16 based on SSE
+    it->do_trellis_ = (it->enc_->method_ == 2);
     SimpleQuantize(it, rd);
   }
   is_skipped = (rd->nz == 0);
@@ -1154,3 +925,6 @@ int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
   return is_skipped;
 }
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/enc/syntax.c b/drivers/webp/enc/syntax.c
index 08cfe79ece..7c8c7b1a84 100644
--- a/drivers/webp/enc/syntax.c
+++ b/drivers/webp/enc/syntax.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Header syntax writing
@@ -13,20 +11,35 @@
 
 #include <assert.h>
 
-#include "../utils/utils.h"
-#include "../webp/format_constants.h"  // RIFF constants
-#include "../webp/mux_types.h"         // ALPHA_FLAG
+#include "../webp/format_constants.h"
 #include "./vp8enci.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // Helper functions
 
+// TODO(later): Move to webp/format_constants.h?
+static void PutLE24(uint8_t* const data, uint32_t val) {
+  data[0] = (val >>  0) & 0xff;
+  data[1] = (val >>  8) & 0xff;
+  data[2] = (val >> 16) & 0xff;
+}
+
+static void PutLE32(uint8_t* const data, uint32_t val) {
+  PutLE24(data, val);
+  data[3] = (val >> 24) & 0xff;
+}
+
 static int IsVP8XNeeded(const VP8Encoder* const enc) {
   return !!enc->has_alpha_;  // Currently the only case when VP8X is needed.
                              // This could change in the future.
 }
 
 static int PutPaddingByte(const WebPPicture* const pic) {
+
   const uint8_t pad_byte[1] = { 0 };
   return !!pic->writer(pad_byte, 1, pic);
 }
@@ -60,14 +73,14 @@ static WebPEncodingError PutVP8XHeader(const VP8Encoder* const enc) {
   assert(pic->width <= MAX_CANVAS_SIZE && pic->height <= MAX_CANVAS_SIZE);
 
   if (enc->has_alpha_) {
-    flags |= ALPHA_FLAG;
+    flags |= ALPHA_FLAG_BIT;
   }
 
   PutLE32(vp8x + TAG_SIZE,              VP8X_CHUNK_SIZE);
   PutLE32(vp8x + CHUNK_HEADER_SIZE,     flags);
   PutLE24(vp8x + CHUNK_HEADER_SIZE + 4, pic->width - 1);
   PutLE24(vp8x + CHUNK_HEADER_SIZE + 7, pic->height - 1);
-  if (!pic->writer(vp8x, sizeof(vp8x), pic)) {
+  if(!pic->writer(vp8x, sizeof(vp8x), pic)) {
     return VP8_ENC_ERROR_BAD_WRITE;
   }
   return VP8_ENC_OK;
@@ -314,9 +327,7 @@ static size_t GeneratePartition0(VP8Encoder* const enc) {
 
   PutSegmentHeader(bw, enc);
   PutFilterHeader(bw, &enc->filter_hdr_);
-  VP8PutValue(bw, enc->num_parts_ == 8 ? 3 :
-                  enc->num_parts_ == 4 ? 2 :
-                  enc->num_parts_ == 2 ? 1 : 0, 2);
+  VP8PutValue(bw, enc->config_->partitions, 2);
   PutQuant(bw, enc);
   VP8PutBitUniform(bw, 0);   // no proba update
   VP8WriteProbas(bw, &enc->proba_);
@@ -421,3 +432,6 @@ int VP8EncWrite(VP8Encoder* const enc) {
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/enc/token.c b/drivers/webp/enc/token.c
deleted file mode 100644
index e696642f16..0000000000
--- a/drivers/webp/enc/token.c
+++ /dev/null
@@ -1,273 +0,0 @@
-// Copyright 2011 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Paginated token buffer
-//
-//  A 'token' is a bit value associated with a probability, either fixed
-// or a later-to-be-determined after statistics have been collected.
-// For dynamic probability, we just record the slot id (idx) for the probability
-// value in the final probability array (uint8_t* probas in VP8EmitTokens).
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#include <assert.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "./cost.h"
-#include "./vp8enci.h"
-
-#if !defined(DISABLE_TOKEN_BUFFER)
-
-// we use pages to reduce the number of memcpy()
-#define MAX_NUM_TOKEN 8192          // max number of token per page
-#define FIXED_PROBA_BIT (1u << 14)
-
-struct VP8Tokens {
-  uint16_t tokens_[MAX_NUM_TOKEN];  // bit#15: bit
-                                    // bit #14: constant proba or idx
-                                    // bits 0..13: slot or constant proba
-  VP8Tokens* next_;
-};
-
-//------------------------------------------------------------------------------
-
-void VP8TBufferInit(VP8TBuffer* const b) {
-  b->tokens_ = NULL;
-  b->pages_ = NULL;
-  b->last_page_ = &b->pages_;
-  b->left_ = 0;
-  b->error_ = 0;
-}
-
-void VP8TBufferClear(VP8TBuffer* const b) {
-  if (b != NULL) {
-    const VP8Tokens* p = b->pages_;
-    while (p != NULL) {
-      const VP8Tokens* const next = p->next_;
-      free((void*)p);
-      p = next;
-    }
-    VP8TBufferInit(b);
-  }
-}
-
-static int TBufferNewPage(VP8TBuffer* const b) {
-  VP8Tokens* const page = b->error_ ? NULL : (VP8Tokens*)malloc(sizeof(*page));
-  if (page == NULL) {
-    b->error_ = 1;
-    return 0;
-  }
-  *b->last_page_ = page;
-  b->last_page_ = &page->next_;
-  b->left_ = MAX_NUM_TOKEN;
-  b->tokens_ = page->tokens_;
-  page->next_ = NULL;
-  return 1;
-}
-
-//------------------------------------------------------------------------------
-
-#define TOKEN_ID(t, b, ctx, p) \
-    ((p) + NUM_PROBAS * ((ctx) + NUM_CTX * ((b) + NUM_BANDS * (t))))
-
-static WEBP_INLINE int AddToken(VP8TBuffer* const b,
-                                int bit, uint32_t proba_idx) {
-  assert(proba_idx < FIXED_PROBA_BIT);
-  assert(bit == 0 || bit == 1);
-  if (b->left_ > 0 || TBufferNewPage(b)) {
-    const int slot = --b->left_;
-    b->tokens_[slot] = (bit << 15) | proba_idx;
-  }
-  return bit;
-}
-
-static WEBP_INLINE void AddConstantToken(VP8TBuffer* const b,
-                                         int bit, int proba) {
-  assert(proba < 256);
-  assert(bit == 0 || bit == 1);
-  if (b->left_ > 0 || TBufferNewPage(b)) {
-    const int slot = --b->left_;
-    b->tokens_[slot] = (bit << 15) | FIXED_PROBA_BIT | proba;
-  }
-}
-
-int VP8RecordCoeffTokens(int ctx, int coeff_type, int first, int last,
-                         const int16_t* const coeffs,
-                         VP8TBuffer* const tokens) {
-  int n = first;
-  uint32_t base_id = TOKEN_ID(coeff_type, n, ctx, 0);
-  if (!AddToken(tokens, last >= 0, base_id + 0)) {
-    return 0;
-  }
-
-  while (n < 16) {
-    const int c = coeffs[n++];
-    const int sign = c < 0;
-    int v = sign ? -c : c;
-    if (!AddToken(tokens, v != 0, base_id + 1)) {
-      ctx = 0;
-      base_id = TOKEN_ID(coeff_type, VP8EncBands[n], ctx, 0);
-      continue;
-    }
-    if (!AddToken(tokens, v > 1, base_id + 2)) {
-      ctx = 1;
-    } else {
-      if (!AddToken(tokens, v > 4, base_id + 3)) {
-        if (AddToken(tokens, v != 2, base_id + 4))
-          AddToken(tokens, v == 4, base_id + 5);
-      } else if (!AddToken(tokens, v > 10, base_id + 6)) {
-        if (!AddToken(tokens, v > 6, base_id + 7)) {
-          AddConstantToken(tokens, v == 6, 159);
-        } else {
-          AddConstantToken(tokens, v >= 9, 165);
-          AddConstantToken(tokens, !(v & 1), 145);
-        }
-      } else {
-        int mask;
-        const uint8_t* tab;
-        if (v < 3 + (8 << 1)) {          // VP8Cat3  (3b)
-          AddToken(tokens, 0, base_id + 8);
-          AddToken(tokens, 0, base_id + 9);
-          v -= 3 + (8 << 0);
-          mask = 1 << 2;
-          tab = VP8Cat3;
-        } else if (v < 3 + (8 << 2)) {   // VP8Cat4  (4b)
-          AddToken(tokens, 0, base_id + 8);
-          AddToken(tokens, 1, base_id + 9);
-          v -= 3 + (8 << 1);
-          mask = 1 << 3;
-          tab = VP8Cat4;
-        } else if (v < 3 + (8 << 3)) {   // VP8Cat5  (5b)
-          AddToken(tokens, 1, base_id + 8);
-          AddToken(tokens, 0, base_id + 10);
-          v -= 3 + (8 << 2);
-          mask = 1 << 4;
-          tab = VP8Cat5;
-        } else {                         // VP8Cat6 (11b)
-          AddToken(tokens, 1, base_id + 8);
-          AddToken(tokens, 1, base_id + 10);
-          v -= 3 + (8 << 3);
-          mask = 1 << 10;
-          tab = VP8Cat6;
-        }
-        while (mask) {
-          AddConstantToken(tokens, !!(v & mask), *tab++);
-          mask >>= 1;
-        }
-      }
-      ctx = 2;
-    }
-    AddConstantToken(tokens, sign, 128);
-    base_id = TOKEN_ID(coeff_type, VP8EncBands[n], ctx, 0);
-    if (n == 16 || !AddToken(tokens, n <= last, base_id + 0)) {
-      return 1;   // EOB
-    }
-  }
-  return 1;
-}
-
-#undef TOKEN_ID
-
-//------------------------------------------------------------------------------
-// This function works, but isn't currently used. Saved for later.
-
-#if 0
-
-static void Record(int bit, proba_t* const stats) {
-  proba_t p = *stats;
-  if (p >= 0xffff0000u) {               // an overflow is inbound.
-    p = ((p + 1u) >> 1) & 0x7fff7fffu;  // -> divide the stats by 2.
-  }
-  // record bit count (lower 16 bits) and increment total count (upper 16 bits).
-  p += 0x00010000u + bit;
-  *stats = p;
-}
-
-void VP8TokenToStats(const VP8TBuffer* const b, proba_t* const stats) {
-  const VP8Tokens* p = b->pages_;
-  while (p != NULL) {
-    const int N = (p->next_ == NULL) ? b->left_ : 0;
-    int n = MAX_NUM_TOKEN;
-    while (n-- > N) {
-      const uint16_t token = p->tokens_[n];
-      if (!(token & FIXED_PROBA_BIT)) {
-        Record((token >> 15) & 1, stats + (token & 0x3fffu));
-      }
-    }
-    p = p->next_;
-  }
-}
-
-#endif   // 0
-
-//------------------------------------------------------------------------------
-// Final coding pass, with known probabilities
-
-int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
-                  const uint8_t* const probas, int final_pass) {
-  const VP8Tokens* p = b->pages_;
-  (void)final_pass;
-  if (b->error_) return 0;
-  while (p != NULL) {
-    const VP8Tokens* const next = p->next_;
-    const int N = (next == NULL) ? b->left_ : 0;
-    int n = MAX_NUM_TOKEN;
-    while (n-- > N) {
-      const uint16_t token = p->tokens_[n];
-      const int bit = (token >> 15) & 1;
-      if (token & FIXED_PROBA_BIT) {
-        VP8PutBit(bw, bit, token & 0xffu);  // constant proba
-      } else {
-        VP8PutBit(bw, bit, probas[token & 0x3fffu]);
-      }
-    }
-    if (final_pass) free((void*)p);
-    p = next;
-  }
-  if (final_pass) b->pages_ = NULL;
-  return 1;
-}
-
-// Size estimation
-size_t VP8EstimateTokenSize(VP8TBuffer* const b, const uint8_t* const probas) {
-  size_t size = 0;
-  const VP8Tokens* p = b->pages_;
-  if (b->error_) return 0;
-  while (p != NULL) {
-    const VP8Tokens* const next = p->next_;
-    const int N = (next == NULL) ? b->left_ : 0;
-    int n = MAX_NUM_TOKEN;
-    while (n-- > N) {
-      const uint16_t token = p->tokens_[n];
-      const int bit = token & (1 << 15);
-      if (token & FIXED_PROBA_BIT) {
-        size += VP8BitCost(bit, token & 0xffu);
-      } else {
-        size += VP8BitCost(bit, probas[token & 0x3fffu]);
-      }
-    }
-    p = next;
-  }
-  return size;
-}
-
-//------------------------------------------------------------------------------
-
-#else     // DISABLE_TOKEN_BUFFER
-
-void VP8TBufferInit(VP8TBuffer* const b) {
-  (void)b;
-}
-void VP8TBufferClear(VP8TBuffer* const b) {
-  (void)b;
-}
-
-#endif    // !DISABLE_TOKEN_BUFFER
-
diff --git a/drivers/webp/enc/tree.c b/drivers/webp/enc/tree.c
index e5d05e5221..8b25e5e488 100644
--- a/drivers/webp/enc/tree.c
+++ b/drivers/webp/enc/tree.c
@@ -1,24 +1,27 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
-// Coding of token probabilities, intra modes and segments.
+// Token probabilities
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include "./vp8enci.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // Default probabilities
 
 // Paragraph 13.5
 const uint8_t
   VP8CoeffsProba0[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
+  // generated using vp8_default_coef_probs() in entropy.c:129
   { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
       { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
       { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
@@ -315,7 +318,7 @@ void VP8CodeIntraModes(VP8Encoder* const enc) {
   VP8EncIterator it;
   VP8IteratorInit(enc, &it);
   do {
-    const VP8MBInfo* const mb = it.mb_;
+    const VP8MBInfo* mb = it.mb_;
     const uint8_t* preds = it.preds_;
     if (enc->segment_hdr_.update_map_) {
       PutSegment(bw, mb->segment_, enc->proba_.segments_);
@@ -340,7 +343,7 @@ void VP8CodeIntraModes(VP8Encoder* const enc) {
       }
     }
     PutUVMode(bw, mb->uv_mode_);
-  } while (VP8IteratorNext(&it));
+  } while (VP8IteratorNext(&it, 0));
 }
 
 //------------------------------------------------------------------------------
@@ -502,3 +505,6 @@ void VP8WriteProbas(VP8BitWriter* const bw, const VP8Proba* const probas) {
   }
 }
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/enc/vp8enci.h b/drivers/webp/enc/vp8enci.h
index 71adf6c38a..a77778c0d8 100644
--- a/drivers/webp/enc/vp8enci.h
+++ b/drivers/webp/enc/vp8enci.h
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 //   WebP encoder: internal header.
@@ -18,9 +16,8 @@
 #include "../webp/encode.h"
 #include "../dsp/dsp.h"
 #include "../utils/bit_writer.h"
-#include "../utils/thread.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
@@ -29,9 +26,12 @@ extern "C" {
 
 // version numbers
 #define ENC_MAJ_VERSION 0
-#define ENC_MIN_VERSION 4
+#define ENC_MIN_VERSION 2
 #define ENC_REV_VERSION 0
 
+// size of histogram used by CollectHistogram.
+#define MAX_COEFF_THRESH   64
+
 // intra prediction modes
 enum { B_DC_PRED = 0,   // 4x4 modes
        B_TM_PRED = 1,
@@ -47,8 +47,7 @@ enum { B_DC_PRED = 0,   // 4x4 modes
 
        // Luma16 or UV modes
        DC_PRED = B_DC_PRED, V_PRED = B_VE_PRED,
-       H_PRED = B_HE_PRED, TM_PRED = B_TM_PRED,
-       NUM_PRED_MODES = 4
+       H_PRED = B_HE_PRED, TM_PRED = B_TM_PRED
      };
 
 enum { NUM_MB_SEGMENTS = 4,
@@ -57,24 +56,16 @@ enum { NUM_MB_SEGMENTS = 4,
        NUM_BANDS = 8,
        NUM_CTX = 3,
        NUM_PROBAS = 11,
-       MAX_LF_LEVELS = 64,       // Maximum loop filter level
-       MAX_VARIABLE_LEVEL = 67,  // last (inclusive) level with variable cost
-       MAX_LEVEL = 2047          // max level (note: max codable is 2047 + 67)
+       MAX_LF_LEVELS = 64,      // Maximum loop filter level
+       MAX_VARIABLE_LEVEL = 67  // last (inclusive) level with variable cost
      };
 
-typedef enum {   // Rate-distortion optimization levels
-  RD_OPT_NONE        = 0,  // no rd-opt
-  RD_OPT_BASIC       = 1,  // basic scoring (no trellis)
-  RD_OPT_TRELLIS     = 2,  // perform trellis-quant on the final decision only
-  RD_OPT_TRELLIS_ALL = 3   // trellis-quant for every scoring (much slower)
-} VP8RDLevel;
-
 // YUV-cache parameters. Cache is 16-pixels wide.
 // The original or reconstructed samples can be accessed using VP8Scan[]
 // The predicted blocks can be accessed using offsets to yuv_p_ and
 // the arrays VP8*ModeOffsets[];
 //         +----+      YUV Samples area. See VP8Scan[] for accessing the blocks.
-//  Y_OFF  |YYYY| <- original samples  ('yuv_in_')
+//  Y_OFF  |YYYY| <- original samples  (enc->yuv_in_)
 //         |YYYY|
 //         |YYYY|
 //         |YYYY|
@@ -169,17 +160,7 @@ typedef int64_t score_t;     // type used for scores, rate, distortion
 static WEBP_INLINE int QUANTDIV(int n, int iQ, int B) {
   return (n * iQ + B) >> QFIX;
 }
-
-// size of histogram used by CollectHistogram.
-#define MAX_COEFF_THRESH   31
-typedef struct VP8Histogram VP8Histogram;
-struct VP8Histogram {
-  // TODO(skal): we only need to store the max_value and last_non_zero actually.
-  int distribution[MAX_COEFF_THRESH + 1];
-};
-
-// Uncomment the following to remove token-buffer code:
-// #define DISABLE_TOKEN_BUFFER
+extern const uint8_t VP8Zigzag[16];
 
 //------------------------------------------------------------------------------
 // Headers
@@ -248,19 +229,16 @@ typedef struct {
   int beta_;       // filter-susceptibility, range [0,255].
   int quant_;      // final segment quantizer.
   int fstrength_;  // final in-loop filtering strength
-  int max_edge_;   // max edge delta (for filtering strength)
-  int min_disto_;  // minimum distortion required to trigger filtering record
   // reactivities
   int lambda_i16_, lambda_i4_, lambda_uv_;
   int lambda_mode_, lambda_trellis_, tlambda_;
   int lambda_trellis_i16_, lambda_trellis_i4_, lambda_trellis_uv_;
 } VP8SegmentInfo;
 
-// Handy transient struct to accumulate score and info during RD-optimization
+// Handy transient struct to accumulate score and info during RD-optimization
 // and mode evaluation.
 typedef struct {
-  score_t D, SD;              // Distortion, spectral distortion
-  score_t H, R, score;        // header bits, rate, score.
+  score_t D, SD, R, score;    // Distortion, spectral distortion, rate, score.
   int16_t y_dc_levels[16];    // Quantized levels for luma-DC, luma-AC, chroma.
   int16_t y_ac_levels[16][16];
   int16_t uv_levels[4 + 4][16];
@@ -274,11 +252,12 @@ typedef struct {
 // right neighbouring data (samples, predictions, contexts, ...)
 typedef struct {
   int x_, y_;                      // current macroblock
+  int y_offset_, uv_offset_;       // offset to the luma / chroma planes
   int y_stride_, uv_stride_;       // respective strides
-  uint8_t*      yuv_in_;           // input samples
-  uint8_t*      yuv_out_;          // output samples
-  uint8_t*      yuv_out2_;         // secondary buffer swapped with yuv_out_.
-  uint8_t*      yuv_p_;            // scratch buffer for prediction
+  uint8_t*      yuv_in_;           // borrowed from enc_ (for now)
+  uint8_t*      yuv_out_;          // ''
+  uint8_t*      yuv_out2_;         // ''
+  uint8_t*      yuv_p_;            // ''
   VP8Encoder*   enc_;              // back-pointer
   VP8MBInfo*    mb_;               // current macroblock
   VP8BitWriter* bw_;               // current bit-writer
@@ -294,43 +273,24 @@ typedef struct {
   uint64_t      uv_bits_;          // macroblock bit-cost for chroma
   LFStats*      lf_stats_;         // filter stats (borrowed from enc_)
   int           do_trellis_;       // if true, perform extra level optimisation
-  int           count_down_;       // number of mb still to be processed
-  int           count_down0_;      // starting counter value (for progress)
+  int           done_;             // true when scan is finished
   int           percent0_;         // saved initial progress percent
-
-  uint8_t* y_left_;    // left luma samples (addressable from index -1 to 15).
-  uint8_t* u_left_;    // left u samples (addressable from index -1 to 7)
-  uint8_t* v_left_;    // left v samples (addressable from index -1 to 7)
-
-  uint8_t* y_top_;     // top luma samples at position 'x_'
-  uint8_t* uv_top_;    // top u/v samples at position 'x_', packed as 16 bytes
-
-  // memory for storing y/u/v_left_ and yuv_in_/out_*
-  uint8_t yuv_left_mem_[17 + 16 + 16 + 8 + ALIGN_CST];     // memory for *_left_
-  uint8_t yuv_mem_[3 * YUV_SIZE + PRED_SIZE + ALIGN_CST];  // memory for yuv_*
 } VP8EncIterator;
 
   // in iterator.c
-// must be called first
+// must be called first.
 void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it);
-// restart a scan
+// restart a scan.
 void VP8IteratorReset(VP8EncIterator* const it);
-// reset iterator position to row 'y'
-void VP8IteratorSetRow(VP8EncIterator* const it, int y);
-// set count down (=number of iterations to go)
-void VP8IteratorSetCountDown(VP8EncIterator* const it, int count_down);
-// return true if iteration is finished
-int VP8IteratorIsDone(const VP8EncIterator* const it);
-// Import uncompressed samples from source.
-// If tmp_32 is not NULL, import boundary samples too.
-// tmp_32 is a 32-bytes scratch buffer that must be aligned in memory.
-void VP8IteratorImport(VP8EncIterator* const it, uint8_t* tmp_32);
+// import samples from source
+void VP8IteratorImport(const VP8EncIterator* const it);
 // export decimated samples
 void VP8IteratorExport(const VP8EncIterator* const it);
-// go to next macroblock. Returns false if not finished.
-int VP8IteratorNext(VP8EncIterator* const it);
-// save the yuv_out_ boundary values to top_/left_ arrays for next iterations.
-void VP8IteratorSaveBoundary(VP8EncIterator* const it);
+// go to next macroblock. Returns !done_. If block_to_save is non-null, will
+// save the boundary values to top_/left_ arrays. block_to_save can be
+// it->yuv_out_ or it->yuv_in_.
+int VP8IteratorNext(VP8EncIterator* const it,
+                    const uint8_t* const block_to_save);
 // Report progression based on macroblock rows. Return 0 for user-abort request.
 int VP8IteratorProgress(const VP8EncIterator* const it,
                         int final_delta_percent);
@@ -354,40 +314,44 @@ void VP8SetSegment(const VP8EncIterator* const it, int segment);
 //------------------------------------------------------------------------------
 // Paginated token buffer
 
-typedef struct VP8Tokens VP8Tokens;  // struct details in token.c
-
-typedef struct {
-#if !defined(DISABLE_TOKEN_BUFFER)
-  VP8Tokens* pages_;        // first page
-  VP8Tokens** last_page_;   // last page
-  uint16_t* tokens_;        // set to (*last_page_)->tokens_
-  int left_;          // how many free tokens left before the page is full.
-#endif
-  int error_;         // true in case of malloc error
-} VP8TBuffer;
-
-void VP8TBufferInit(VP8TBuffer* const b);    // initialize an empty buffer
-void VP8TBufferClear(VP8TBuffer* const b);   // de-allocate pages memory
+// WIP: #define USE_TOKEN_BUFFER
 
-#if !defined(DISABLE_TOKEN_BUFFER)
+#ifdef USE_TOKEN_BUFFER
 
-// Finalizes bitstream when probabilities are known.
-// Deletes the allocated token memory if final_pass is true.
-int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
-                  const uint8_t* const probas, int final_pass);
+#define MAX_NUM_TOKEN 2048
 
-// record the coding of coefficients without knowing the probabilities yet
-int VP8RecordCoeffTokens(int ctx, int coeff_type, int first, int last,
-                         const int16_t* const coeffs,
-                         VP8TBuffer* const tokens);
+typedef struct VP8Tokens VP8Tokens;
+struct VP8Tokens {
+  uint16_t tokens_[MAX_NUM_TOKEN];  // bit#15: bit, bits 0..14: slot
+  int left_;
+  VP8Tokens* next_;
+};
 
-// Estimate the final coded size given a set of 'probas'.
-size_t VP8EstimateTokenSize(VP8TBuffer* const b, const uint8_t* const probas);
+typedef struct {
+  VP8Tokens* rows_;
+  uint16_t* tokens_;    // set to (*last_)->tokens_
+  VP8Tokens** last_;
+  int left_;
+  int error_;  // true in case of malloc error
+} VP8TBuffer;
 
-// unused for now
-void VP8TokenToStats(const VP8TBuffer* const b, proba_t* const stats);
+void VP8TBufferInit(VP8TBuffer* const b);    // initialize an empty buffer
+int VP8TBufferNewPage(VP8TBuffer* const b);  // allocate a new page
+void VP8TBufferClear(VP8TBuffer* const b);   // de-allocate memory
+
+int VP8EmitTokens(const VP8TBuffer* const b, VP8BitWriter* const bw,
+                  const uint8_t* const probas);
+
+static WEBP_INLINE int VP8AddToken(VP8TBuffer* const b,
+                                   int bit, int proba_idx) {
+  if (b->left_ > 0 || VP8TBufferNewPage(b)) {
+    const int slot = --b->left_;
+    b->tokens_[slot] = (bit << 15) | proba_idx;
+  }
+  return bit;
+}
 
-#endif  // !DISABLE_TOKEN_BUFFER
+#endif  // USE_TOKEN_BUFFER
 
 //------------------------------------------------------------------------------
 // VP8Encoder
@@ -412,7 +376,6 @@ struct VP8Encoder {
   // per-partition boolean decoders.
   VP8BitWriter bw_;                         // part0
   VP8BitWriter parts_[MAX_NUM_PARTITIONS];  // token partitions
-  VP8TBuffer tokens_;                       // token buffer
 
   int percent_;                             // for progress
 
@@ -420,7 +383,6 @@ struct VP8Encoder {
   int has_alpha_;
   uint8_t* alpha_data_;       // non-NULL if transparency is present
   uint32_t alpha_data_size_;
-  WebPWorker alpha_worker_;
 
   // enhancement layer
   int use_layer_;
@@ -432,7 +394,6 @@ struct VP8Encoder {
   VP8SegmentInfo dqm_[NUM_MB_SEGMENTS];
   int base_quant_;                 // nominal quantizer value. Only used
                                    // for relative coding of segments' quant.
-  int alpha_;                      // global susceptibility (<=> complexity)
   int uv_alpha_;                   // U/V quantization susceptibility
   // global offset of quantizers, shared by all segments
   int dq_y1_dc_;
@@ -448,20 +409,25 @@ struct VP8Encoder {
   int      block_count_[3];
 
   // quality/speed settings
-  int method_;               // 0=fastest, 6=best/slowest.
-  VP8RDLevel rd_opt_level_;  // Deduced from method_.
-  int max_i4_header_bits_;   // partition #0 safeness factor
-  int thread_level_;         // derived from config->thread_level
-  int do_search_;            // derived from config->target_XXX
-  int use_tokens_;           // if true, use token buffer
+  int method_;              // 0=fastest, 6=best/slowest.
+  int rd_opt_level_;        // Deduced from method_.
+  int max_i4_header_bits_;  // partition #0 safeness factor
 
   // Memory
   VP8MBInfo* mb_info_;   // contextual macroblock infos (mb_w_ + 1)
   uint8_t*   preds_;     // predictions modes: (4*mb_w+1) * (4*mb_h+1)
   uint32_t*  nz_;        // non-zero bit context: mb_w+1
+  uint8_t*   yuv_in_;    // input samples
+  uint8_t*   yuv_out_;   // output samples
+  uint8_t*   yuv_out2_;  // secondary scratch out-buffer. swapped with yuv_out_.
+  uint8_t*   yuv_p_;     // scratch buffer for prediction
   uint8_t   *y_top_;     // top luma samples.
   uint8_t   *uv_top_;    // top u/v samples.
-                         // U and V are packed into 16 bytes (8 U + 8 V)
+                         // U and V are packed into 16 pixels (8 U + 8 V)
+  uint8_t   *y_left_;    // left luma samples (addressable from index -1 to 15).
+  uint8_t   *u_left_;    // left u samples (addressable from index -1 to 7)
+  uint8_t   *v_left_;    // left v samples (addressable from index -1 to 7)
+
   LFStats   *lf_stats_;  // autofilter stats (if NULL, autofilter is off)
 };
 
@@ -489,11 +455,6 @@ void VP8EncFreeBitWriters(VP8Encoder* const enc);
 
   // in frame.c
 extern const uint8_t VP8EncBands[16 + 1];
-extern const uint8_t VP8Cat3[];
-extern const uint8_t VP8Cat4[];
-extern const uint8_t VP8Cat5[];
-extern const uint8_t VP8Cat6[];
-
 // Form all the four Intra16x16 predictions in the yuv_p_ cache
 void VP8MakeLuma16Preds(const VP8EncIterator* const it);
 // Form all the four Chroma8x8 predictions in the yuv_p_ cache
@@ -505,9 +466,9 @@ void VP8MakeIntra4Preds(const VP8EncIterator* const it);
 int VP8GetCostLuma16(VP8EncIterator* const it, const VP8ModeScore* const rd);
 int VP8GetCostLuma4(VP8EncIterator* const it, const int16_t levels[16]);
 int VP8GetCostUV(VP8EncIterator* const it, const VP8ModeScore* const rd);
-// Main coding calls
+// Main stat / coding passes
 int VP8EncLoop(VP8Encoder* const enc);
-int VP8EncTokenLoop(VP8Encoder* const enc);
+int VP8StatLoop(VP8Encoder* const enc);
 
   // in webpenc.c
 // Assign an error code to a picture. Return false for convenience.
@@ -524,14 +485,12 @@ int VP8EncAnalyze(VP8Encoder* const enc);
 // Sets up segment's quantization values, base_quant_ and filter strengths.
 void VP8SetSegmentParams(VP8Encoder* const enc, float quality);
 // Pick best modes and fills the levels. Returns true if skipped.
-int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
-                VP8RDLevel rd_opt);
+int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd, int rd_opt);
 
   // in alpha.c
 void VP8EncInitAlpha(VP8Encoder* const enc);    // initialize alpha compression
-int VP8EncStartAlpha(VP8Encoder* const enc);    // start alpha coding process
 int VP8EncFinishAlpha(VP8Encoder* const enc);   // finalize compressed data
-int VP8EncDeleteAlpha(VP8Encoder* const enc);   // delete compressed data
+void VP8EncDeleteAlpha(VP8Encoder* const enc);  // delete compressed data
 
   // in layer.c
 void VP8EncInitLayer(VP8Encoder* const enc);     // init everything
@@ -557,13 +516,9 @@ void VP8InitFilter(VP8EncIterator* const it);
 void VP8StoreFilterStats(VP8EncIterator* const it);
 void VP8AdjustFilterStrength(VP8EncIterator* const it);
 
-// returns the approximate filtering strength needed to smooth a edge
-// step of 'delta', given a sharpness parameter 'sharpness'.
-int VP8FilterStrengthFromDelta(int sharpness, int delta);
-
 //------------------------------------------------------------------------------
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/enc/vp8l.c b/drivers/webp/enc/vp8l.c
index 15726318e2..9c202f8d36 100644
--- a/drivers/webp/enc/vp8l.c
+++ b/drivers/webp/enc/vp8l.c
@@ -1,10 +1,8 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // main entry for the lossless encoder.
@@ -25,6 +23,10 @@
 #include "../utils/utils.h"
 #include "../webp/format_constants.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 #define PALETTE_KEY_RIGHT_SHIFT   22  // Key for 1K buffer.
 #define MAX_HUFF_IMAGE_SIZE       (16 * 1024 * 1024)
 #define MAX_COLORS_FOR_GRAPH      64
@@ -35,8 +37,7 @@
 static int CompareColors(const void* p1, const void* p2) {
   const uint32_t a = *(const uint32_t*)p1;
   const uint32_t b = *(const uint32_t*)p2;
-  assert(a != b);
-  return (a < b) ? -1 : 1;
+  return (a < b) ? -1 : (a > b) ? 1 : 0;
 }
 
 // If number of colors in the image is less than or equal to MAX_PALETTE_SIZE,
@@ -84,7 +85,7 @@ static int AnalyzeAndCreatePalette(const WebPPicture* const pic,
     argb += pic->argb_stride;
   }
 
-  // TODO(skal): could we reuse in_use[] to speed up EncodePalette()?
+  // TODO(skal): could we reuse in_use[] to speed up ApplyPalette()?
   num_colors = 0;
   for (i = 0; i < (int)(sizeof(in_use) / sizeof(in_use[0])); ++i) {
     if (in_use[i]) {
@@ -164,6 +165,9 @@ static int VP8LEncAnalyze(VP8LEncoder* const enc, WebPImageHint image_hint) {
       }
       if (pred_entropy < 0.95 * non_pred_entropy) {
         enc->use_predict_ = 1;
+        // TODO(vikasa): Observed some correlation of cross_color transform with
+        // predict. Need to investigate this further and add separate heuristic
+        // for setting use_cross_color flag.
         enc->use_cross_color_ = 1;
       }
     }
@@ -216,7 +220,7 @@ static int GetHuffBitLengthsAndCodes(
   }
 
   // Create Huffman trees.
-  for (i = 0; ok && (i < histogram_image_size); ++i) {
+  for (i = 0; i < histogram_image_size; ++i) {
     HuffmanTreeCode* const codes = &huffman_codes[5 * i];
     VP8LHistogram* const histo = histogram_image->histograms[i];
     ok = ok && VP8LCreateHuffmanTree(histo->literal_, 15, codes + 0);
@@ -227,11 +231,7 @@ static int GetHuffBitLengthsAndCodes(
   }
 
  End:
-  if (!ok) {
-    free(mem_buf);
-    // If one VP8LCreateHuffmanTree() above fails, we need to clean up behind.
-    memset(huffman_codes, 0, 5 * histogram_image_size * sizeof(*huffman_codes));
-  }
+  if (!ok) free(mem_buf);
   return ok;
 }
 
@@ -406,10 +406,9 @@ static int StoreHuffmanCode(VP8LBitWriter* const bw,
 }
 
 static void WriteHuffmanCode(VP8LBitWriter* const bw,
-                             const HuffmanTreeCode* const code,
-                             int code_index) {
-  const int depth = code->code_lengths[code_index];
-  const int symbol = code->codes[code_index];
+                             const HuffmanTreeCode* const code, int index) {
+  const int depth = code->code_lengths[index];
+  const int symbol = code->codes[index];
   VP8LWriteBits(bw, depth, symbol);
 }
 
@@ -444,12 +443,12 @@ static void StoreImageToBitMask(
       int bits, n_bits;
       int code, distance;
 
-      VP8LPrefixEncode(v->len, &code, &n_bits, &bits);
+      PrefixEncode(v->len, &code, &n_bits, &bits);
       WriteHuffmanCode(bw, codes, 256 + code);
       VP8LWriteBits(bw, n_bits, bits);
 
       distance = PixOrCopyDistance(v);
-      VP8LPrefixEncode(distance, &code, &n_bits, &bits);
+      PrefixEncode(distance, &code, &n_bits, &bits);
       WriteHuffmanCode(bw, codes + 4, code);
       VP8LWriteBits(bw, n_bits, bits);
     }
@@ -530,12 +529,7 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
                                 sizeof(*histogram_symbols));
   assert(histogram_bits >= MIN_HUFFMAN_BITS);
   assert(histogram_bits <= MAX_HUFFMAN_BITS);
-
-  if (histogram_image == NULL || histogram_symbols == NULL) {
-    free(histogram_image);
-    free(histogram_symbols);
-    return 0;
-  }
+  if (histogram_image == NULL || histogram_symbols == NULL) goto Error;
 
   // Calculate backward references from ARGB image.
   if (!VP8LGetBackwardReferences(width, height, argb, quality, cache_bits,
@@ -558,9 +552,6 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
       !GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) {
     goto Error;
   }
-  // Free combined histograms.
-  free(histogram_image);
-  histogram_image = NULL;
 
   // Color Cache parameters.
   VP8LWriteBits(bw, 1, use_color_cache);
@@ -580,10 +571,10 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
       uint32_t i;
       if (histogram_argb == NULL) goto Error;
       for (i = 0; i < histogram_image_xysize; ++i) {
-        const int symbol_index = histogram_symbols[i] & 0xffff;
-        histogram_argb[i] = 0xff000000 | (symbol_index << 8);
-        if (symbol_index >= max_index) {
-          max_index = symbol_index + 1;
+        const int index = histogram_symbols[i] & 0xffff;
+        histogram_argb[i] = 0xff000000 | (index << 8);
+        if (index >= max_index) {
+          max_index = index + 1;
         }
       }
       histogram_image_size = max_index;
@@ -607,6 +598,9 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
       ClearHuffmanTreeIfOnlyOneSymbol(codes);
     }
   }
+  // Free combined histograms.
+  free(histogram_image);
+  histogram_image = NULL;
 
   // Store actual literals.
   StoreImageToBitMask(bw, width, histogram_bits, &refs,
@@ -614,7 +608,7 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
   ok = 1;
 
  Error:
-  free(histogram_image);
+  if (!ok) free(histogram_image);
 
   VP8LClearBackwardRefs(&refs);
   if (huffman_codes != NULL) {
@@ -695,7 +689,7 @@ static int ApplyCrossColorFilter(const VP8LEncoder* const enc,
   const int ccolor_transform_bits = enc->transform_bits_;
   const int transform_width = VP8LSubSampleSize(width, ccolor_transform_bits);
   const int transform_height = VP8LSubSampleSize(height, ccolor_transform_bits);
-  const int step = (quality < 25) ? 32 : (quality > 50) ? 8 : 16;
+  const int step = (quality == 0) ? 32 : 8;
 
   VP8LColorSpaceTransform(width, height, ccolor_transform_bits, step,
                           enc->argb_, enc->transform_data_);
@@ -712,6 +706,13 @@ static int ApplyCrossColorFilter(const VP8LEncoder* const enc,
 
 // -----------------------------------------------------------------------------
 
+static void PutLE32(uint8_t* const data, uint32_t val) {
+  data[0] = (val >>  0) & 0xff;
+  data[1] = (val >>  8) & 0xff;
+  data[2] = (val >> 16) & 0xff;
+  data[3] = (val >> 24) & 0xff;
+}
+
 static WebPEncodingError WriteRiffHeader(const WebPPicture* const pic,
                                          size_t riff_size, size_t vp8l_size) {
   uint8_t riff[RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE + VP8L_SIGNATURE_SIZE] = {
@@ -806,94 +807,61 @@ static WebPEncodingError AllocateTransformBuffer(VP8LEncoder* const enc,
   return err;
 }
 
-static void ApplyPalette(uint32_t* src, uint32_t* dst,
-                         uint32_t src_stride, uint32_t dst_stride,
-                         const uint32_t* palette, int palette_size,
-                         int width, int height, int xbits, uint8_t* row) {
-  int i, x, y;
-  int use_LUT = 1;
-  for (i = 0; i < palette_size; ++i) {
-    if ((palette[i] & 0xffff00ffu) != 0) {
-      use_LUT = 0;
-      break;
-    }
-  }
+// Bundles multiple (2, 4 or 8) pixels into a single pixel.
+// 'xs' is the row stride (in pixels) of 'bundled_argb'; nothing is returned.
+static void BundleColorMap(const WebPPicture* const pic,
+                           int xbits, uint32_t* bundled_argb, int xs) {
+  int y;
+  const int bit_depth = 1 << (3 - xbits);
+  uint32_t code = 0;
+  const uint32_t* argb = pic->argb;
+  const int width = pic->width;
+  const int height = pic->height;
 
-  if (use_LUT) {
-    uint8_t inv_palette[MAX_PALETTE_SIZE] = { 0 };
-    for (i = 0; i < palette_size; ++i) {
-      const int color = (palette[i] >> 8) & 0xff;
-      inv_palette[color] = i;
-    }
-    for (y = 0; y < height; ++y) {
-      for (x = 0; x < width; ++x) {
-        const int color = (src[x] >> 8) & 0xff;
-        row[x] = inv_palette[color];
-      }
-      VP8LBundleColorMap(row, width, xbits, dst);
-      src += src_stride;
-      dst += dst_stride;
-    }
-  } else {
-    // Use 1 pixel cache for ARGB pixels.
-    uint32_t last_pix = palette[0];
-    int last_idx = 0;
-    for (y = 0; y < height; ++y) {
-      for (x = 0; x < width; ++x) {
-        const uint32_t pix = src[x];
-        if (pix != last_pix) {
-          for (i = 0; i < palette_size; ++i) {
-            if (pix == palette[i]) {
-              last_idx = i;
-              last_pix = pix;
-              break;
-            }
-          }
-        }
-        row[x] = last_idx;
+  for (y = 0; y < height; ++y) {
+    int x;
+    for (x = 0; x < width; ++x) {
+      const int mask = (1 << xbits) - 1;
+      const int xsub = x & mask;
+      if (xsub == 0) {
+        code = 0;
       }
-      VP8LBundleColorMap(row, width, xbits, dst);
-      src += src_stride;
-      dst += dst_stride;
+      // TODO(vikasa): simplify the bundling logic.
+      code |= (argb[x] & 0xff00) << (bit_depth * xsub);
+      bundled_argb[y * xs + (x >> xbits)] = 0xff000000 | code;
     }
+    argb += pic->argb_stride;
   }
 }
 
 // Note: Expects "enc->palette_" to be set properly.
 // Also, "enc->palette_" will be modified after this call and should not be used
 // later.
-static WebPEncodingError EncodePalette(VP8LBitWriter* const bw,
-                                       VP8LEncoder* const enc, int quality) {
+static WebPEncodingError ApplyPalette(VP8LBitWriter* const bw,
+                                      VP8LEncoder* const enc, int quality) {
   WebPEncodingError err = VP8_ENC_OK;
-  int i;
+  int i, x, y;
   const WebPPicture* const pic = enc->pic_;
-  uint32_t* src = pic->argb;
-  uint32_t* dst;
+  uint32_t* argb = pic->argb;
   const int width = pic->width;
   const int height = pic->height;
   uint32_t* const palette = enc->palette_;
   const int palette_size = enc->palette_size_;
-  uint8_t* row = NULL;
-  int xbits;
 
   // Replace each input pixel by corresponding palette index.
-  // This is done line by line.
-  if (palette_size <= 4) {
-    xbits = (palette_size <= 2) ? 3 : 2;
-  } else {
-    xbits = (palette_size <= 16) ? 1 : 0;
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < width; ++x) {
+      const uint32_t pix = argb[x];
+      for (i = 0; i < palette_size; ++i) {
+        if (pix == palette[i]) {
+          argb[x] = 0xff000000u | (i << 8);
+          break;
+        }
+      }
+    }
+    argb += pic->argb_stride;
   }
 
-  err = AllocateTransformBuffer(enc, VP8LSubSampleSize(width, xbits), height);
-  if (err != VP8_ENC_OK) goto Error;
-  dst = enc->argb_;
-
-  row = (uint8_t*)WebPSafeMalloc((uint64_t)width, sizeof(*row));
-  if (row == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
-
-  ApplyPalette(src, dst, pic->argb_stride, enc->current_width_,
-               palette, palette_size, width, height, xbits, row);
-
   // Save palette to bitstream.
   VP8LWriteBits(bw, 1, TRANSFORM_PRESENT);
   VP8LWriteBits(bw, 2, COLOR_INDEXING_TRANSFORM);
@@ -907,21 +875,36 @@ static WebPEncodingError EncodePalette(VP8LBitWriter* const bw,
     goto Error;
   }
 
+  if (palette_size <= 16) {
+    // Image can be packed (multiple pixels per uint32_t).
+    int xbits = 1;
+    if (palette_size <= 2) {
+      xbits = 3;
+    } else if (palette_size <= 4) {
+      xbits = 2;
+    }
+    err = AllocateTransformBuffer(enc, VP8LSubSampleSize(width, xbits), height);
+    if (err != VP8_ENC_OK) goto Error;
+    BundleColorMap(pic, xbits, enc->argb_, enc->current_width_);
+  }
+
  Error:
-  free(row);
   return err;
 }
 
 // -----------------------------------------------------------------------------
 
-static int GetHistoBits(int method, int use_palette, int width, int height) {
-  const uint64_t hist_size = sizeof(VP8LHistogram);
+static int GetHistoBits(const WebPConfig* const config,
+                        const WebPPicture* const pic) {
+  const int width = pic->width;
+  const int height = pic->height;
+  const size_t hist_size = sizeof(VP8LHistogram);
   // Make tile size a function of encoding method (Range: 0 to 6).
-  int histo_bits = (use_palette ? 9 : 7) - method;
+  int histo_bits = 7 - config->method;
   while (1) {
-    const uint64_t huff_image_size = VP8LSubSampleSize(width, histo_bits) *
-                                     VP8LSubSampleSize(height, histo_bits) *
-                                     hist_size;
+    const size_t huff_image_size = VP8LSubSampleSize(width, histo_bits) *
+                                   VP8LSubSampleSize(height, histo_bits) *
+                                   hist_size;
     if (huff_image_size <= MAX_HUFF_IMAGE_SIZE) break;
     ++histo_bits;
   }
@@ -929,14 +912,13 @@ static int GetHistoBits(int method, int use_palette, int width, int height) {
          (histo_bits > MAX_HUFFMAN_BITS) ? MAX_HUFFMAN_BITS : histo_bits;
 }
 
-static void FinishEncParams(VP8LEncoder* const enc) {
+static void InitEncParams(VP8LEncoder* const enc) {
   const WebPConfig* const config = enc->config_;
-  const WebPPicture* const pic = enc->pic_;
+  const WebPPicture* const picture = enc->pic_;
   const int method = config->method;
   const float quality = config->quality;
-  const int use_palette = enc->use_palette_;
   enc->transform_bits_ = (method < 4) ? 5 : (method > 4) ? 3 : 4;
-  enc->histo_bits_ = GetHistoBits(method, use_palette, pic->width, pic->height);
+  enc->histo_bits_ = GetHistoBits(config, picture);
   enc->cache_bits_ = (quality <= 25.f) ? 0 : 7;
 }
 
@@ -952,9 +934,6 @@ static VP8LEncoder* VP8LEncoderNew(const WebPConfig* const config,
   }
   enc->config_ = config;
   enc->pic_ = picture;
-
-  VP8LDspInit();
-
   return enc;
 }
 
@@ -981,6 +960,8 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
     goto Error;
   }
 
+  InitEncParams(enc);
+
   // ---------------------------------------------------------------------------
   // Analyze image (entropy, num_palettes etc)
 
@@ -989,10 +970,8 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
     goto Error;
   }
 
-  FinishEncParams(enc);
-
   if (enc->use_palette_) {
-    err = EncodePalette(bw, enc, quality);
+    err = ApplyPalette(bw, enc, quality);
     if (err != VP8_ENC_OK) goto Error;
     // Color cache is disabled for palette.
     enc->cache_bits_ = 0;
@@ -1166,3 +1145,6 @@ int VP8LEncodeImage(const WebPConfig* const config,
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/enc/vp8li.h b/drivers/webp/enc/vp8li.h
index 96d6faed64..eae90dd61f 100644
--- a/drivers/webp/enc/vp8li.h
+++ b/drivers/webp/enc/vp8li.h
@@ -1,10 +1,8 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Lossless encoder: internal header.
@@ -19,7 +17,7 @@
 #include "../webp/encode.h"
 #include "../webp/format_constants.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
@@ -63,7 +61,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
 
 //------------------------------------------------------------------------------
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/enc/webpenc.c b/drivers/webp/enc/webpenc.c
index 207cce6beb..3c275589fc 100644
--- a/drivers/webp/enc/webpenc.c
+++ b/drivers/webp/enc/webpenc.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // WebP encoder: main entry point
@@ -22,6 +20,10 @@
 
 // #define PRINT_MEMORY_INFO
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 #ifdef PRINT_MEMORY_INFO
 #include <stdio.h>
 #endif
@@ -91,53 +93,34 @@ static void ResetBoundaryPredictions(VP8Encoder* const enc) {
   enc->nz_[-1] = 0;   // constant
 }
 
-// Mapping from config->method_ to coding tools used.
-//-------------------+---+---+---+---+---+---+---+
-//   Method          | 0 | 1 | 2 | 3 |(4)| 5 | 6 |
-//-------------------+---+---+---+---+---+---+---+
-// fast probe        | x |   |   | x |   |   |   |
-//-------------------+---+---+---+---+---+---+---+
-// dynamic proba     | ~ | x | x | x | x | x | x |
-//-------------------+---+---+---+---+---+---+---+
-// fast mode analysis|   |   |   |   | x | x | x |
-//-------------------+---+---+---+---+---+---+---+
-// basic rd-opt      |   |   |   | x | x | x | x |
-//-------------------+---+---+---+---+---+---+---+
-// disto-score i4/16 |   |   | x |   |   |   |   |
-//-------------------+---+---+---+---+---+---+---+
-// rd-opt i4/16      |   |   | ~ | x | x | x | x |
-//-------------------+---+---+---+---+---+---+---+
-// token buffer (opt)|   |   |   | x | x | x | x |
-//-------------------+---+---+---+---+---+---+---+
-// Trellis           |   |   |   |   |   | x |Ful|
-//-------------------+---+---+---+---+---+---+---+
-// full-SNS          |   |   |   |   | x | x | x |
-//-------------------+---+---+---+---+---+---+---+
+// Map configured quality level to coding tools used.
+//-------------+---+---+---+---+---+---+
+//   Quality   | 0 | 1 | 2 | 3 | 4 | 5 |
+//-------------+---+---+---+---+---+---+
+// dynamic prob| ~ | x | x | x | x | x |
+//-------------+---+---+---+---+---+---+
+// rd-opt modes|   |   | x | x | x | x |
+//-------------+---+---+---+---+---+---+
+// fast i4/i16 | x | x |   |   |   |   |
+//-------------+---+---+---+---+---+---+
+// rd-opt i4/16|   |   | x | x | x | x |
+//-------------+---+---+---+---+---+---+
+// Trellis     |   | x |   |   | x | x |
+//-------------+---+---+---+---+---+---+
+// full-SNS    |   |   |   |   |   | x |
+//-------------+---+---+---+---+---+---+
 
 static void MapConfigToTools(VP8Encoder* const enc) {
-  const WebPConfig* const config = enc->config_;
-  const int method = config->method;
-  const int limit = 100 - config->partition_limit;
+  const int method = enc->config_->method;
+  const int limit = 100 - enc->config_->partition_limit;
   enc->method_ = method;
-  enc->rd_opt_level_ = (method >= 6) ? RD_OPT_TRELLIS_ALL
-                     : (method >= 5) ? RD_OPT_TRELLIS
-                     : (method >= 3) ? RD_OPT_BASIC
-                     : RD_OPT_NONE;
+  enc->rd_opt_level_ = (method >= 6) ? 3
+                     : (method >= 5) ? 2
+                     : (method >= 3) ? 1
+                     : 0;
   enc->max_i4_header_bits_ =
       256 * 16 * 16 *                 // upper bound: up to 16bit per 4x4 block
       (limit * limit) / (100 * 100);  // ... modulated with a quadratic curve.
-
-  enc->thread_level_ = config->thread_level;
-
-  enc->do_search_ = (config->target_size > 0 || config->target_PSNR > 0);
-  if (!config->low_memory) {
-#if !defined(DISABLE_TOKEN_BUFFER)
-    enc->use_tokens_ = (enc->rd_opt_level_ >= RD_OPT_BASIC);  // need rd stats
-#endif
-    if (enc->use_tokens_) {
-      enc->num_parts_ = 1;   // doesn't work with multi-partition
-    }
-  }
 }
 
 // Memory scaling with dimensions:
@@ -153,7 +136,7 @@ static void MapConfigToTools(VP8Encoder* const enc) {
 //             non-zero: 196
 //             lf-stats: 2048
 //                total: 68635
-// Transient object sizes:
+// Transient object sizes:
 //       VP8EncIterator: 352
 //         VP8ModeScore: 912
 //       VP8SegmentInfo: 532
@@ -171,16 +154,20 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
   const int preds_h = 4 * mb_h + 1;
   const size_t preds_size = preds_w * preds_h * sizeof(uint8_t);
   const int top_stride = mb_w * 16;
-  const size_t nz_size = (mb_w + 1) * sizeof(uint32_t) + ALIGN_CST;
+  const size_t nz_size = (mb_w + 1) * sizeof(uint32_t);
+  const size_t cache_size = (3 * YUV_SIZE + PRED_SIZE) * sizeof(uint8_t);
   const size_t info_size = mb_w * mb_h * sizeof(VP8MBInfo);
-  const size_t samples_size = 2 * top_stride * sizeof(uint8_t)  // top-luma/u/v
-                            + ALIGN_CST;                        // align all
+  const size_t samples_size = (2 * top_stride +         // top-luma/u/v
+                               16 + 16 + 16 + 8 + 1 +   // left y/u/v
+                               2 * ALIGN_CST)           // align all
+                               * sizeof(uint8_t);
   const size_t lf_stats_size =
       config->autofilter ? sizeof(LFStats) + ALIGN_CST : 0;
   VP8Encoder* enc;
   uint8_t* mem;
   const uint64_t size = (uint64_t)sizeof(VP8Encoder)   // main struct
                       + ALIGN_CST                      // cache alignment
+                      + cache_size                     // working caches
                       + info_size                      // modes info
                       + preds_size                     // prediction modes
                       + samples_size                   // top/left samples
@@ -191,15 +178,16 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
   printf("===================================\n");
   printf("Memory used:\n"
          "             encoder: %ld\n"
+         "         block cache: %ld\n"
          "                info: %ld\n"
          "               preds: %ld\n"
          "         top samples: %ld\n"
          "            non-zero: %ld\n"
          "            lf-stats: %ld\n"
          "               total: %ld\n",
-         sizeof(VP8Encoder) + ALIGN_CST, info_size,
+         sizeof(VP8Encoder) + ALIGN_CST, cache_size, info_size,
          preds_size, samples_size, nz_size, lf_stats_size, size);
-  printf("Transient object sizes:\n"
+  printf("Transcient object sizes:\n"
          "      VP8EncIterator: %ld\n"
          "        VP8ModeScore: %ld\n"
          "      VP8SegmentInfo: %ld\n"
@@ -224,11 +212,19 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
   enc->mb_w_ = mb_w;
   enc->mb_h_ = mb_h;
   enc->preds_w_ = preds_w;
+  enc->yuv_in_ = (uint8_t*)mem;
+  mem += YUV_SIZE;
+  enc->yuv_out_ = (uint8_t*)mem;
+  mem += YUV_SIZE;
+  enc->yuv_out2_ = (uint8_t*)mem;
+  mem += YUV_SIZE;
+  enc->yuv_p_ = (uint8_t*)mem;
+  mem += PRED_SIZE;
   enc->mb_info_ = (VP8MBInfo*)mem;
   mem += info_size;
   enc->preds_ = ((uint8_t*)mem) + 1 + enc->preds_w_;
   mem += preds_w * preds_h * sizeof(uint8_t);
-  enc->nz_ = 1 + (uint32_t*)DO_ALIGN(mem);
+  enc->nz_ = 1 + (uint32_t*)mem;
   mem += nz_size;
   enc->lf_stats_ = lf_stats_size ? (LFStats*)DO_ALIGN(mem) : NULL;
   mem += lf_stats_size;
@@ -238,7 +234,13 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
   enc->y_top_ = (uint8_t*)mem;
   enc->uv_top_ = enc->y_top_ + top_stride;
   mem += 2 * top_stride;
-  assert(mem <= (uint8_t*)enc + size);
+  mem = (uint8_t*)DO_ALIGN(mem + 1);
+  enc->y_left_ = (uint8_t*)mem;
+  mem += 16 + 16;
+  enc->u_left_ = (uint8_t*)mem;
+  mem += 16;
+  enc->v_left_ = (uint8_t*)mem;
+  mem += 8;
 
   enc->config_ = config;
   enc->profile_ = use_filter ? ((config->filter_type == 1) ? 0 : 1) : 2;
@@ -257,27 +259,23 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
   VP8EncInitLayer(enc);
 #endif
 
-  VP8TBufferInit(&enc->tokens_);
   return enc;
 }
 
-static int DeleteVP8Encoder(VP8Encoder* enc) {
-  int ok = 1;
+static void DeleteVP8Encoder(VP8Encoder* enc) {
   if (enc != NULL) {
-    ok = VP8EncDeleteAlpha(enc);
+    VP8EncDeleteAlpha(enc);
 #ifdef WEBP_EXPERIMENTAL_FEATURES
     VP8EncDeleteLayer(enc);
 #endif
-    VP8TBufferClear(&enc->tokens_);
     free(enc);
   }
-  return ok;
 }
 
 //------------------------------------------------------------------------------
 
 static double GetPSNR(uint64_t err, uint64_t size) {
-  return (err > 0 && size > 0) ? 10. * log10(255. * 255. * size / err) : 99.;
+  return err ? 10. * log10(255. * 255. * size / err) : 99.;
 }
 
 static void FinalizePSNR(const VP8Encoder* const enc) {
@@ -334,7 +332,7 @@ int WebPReportProgress(const WebPPicture* const pic,
 //------------------------------------------------------------------------------
 
 int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
-  int ok = 0;
+  int ok;
 
   if (pic == NULL)
     return 0;
@@ -353,48 +351,32 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
   if (!config->lossless) {
     VP8Encoder* enc = NULL;
     if (pic->y == NULL || pic->u == NULL || pic->v == NULL) {
-      // Make sure we have YUVA samples.
-      float dithering = 0.f;
-      if (config->preprocessing & 2) {
-        const float x = config->quality / 100.f;
-        const float x2 = x * x;
-        // slowly decreasing from max dithering at low quality (q->0)
-        // to 0.5 dithering amplitude at high quality (q->100)
-        dithering = 1.0f + (0.5f - 1.0f) * x2 * x2;
-      }
-      if (!WebPPictureARGBToYUVADithered(pic, WEBP_YUV420, dithering)) {
-        return 0;
+      if (pic->argb != NULL) {
+        if (!WebPPictureARGBToYUVA(pic, WEBP_YUV420)) return 0;
+      } else {
+        return WebPEncodingSetError(pic, VP8_ENC_ERROR_NULL_PARAMETER);
       }
     }
 
     enc = InitVP8Encoder(config, pic);
     if (enc == NULL) return 0;  // pic->error is already set.
     // Note: each of the tasks below account for 20% in the progress report.
-    ok = VP8EncAnalyze(enc);
-
-    // Analysis is done, proceed to actual coding.
-    ok = ok && VP8EncStartAlpha(enc);   // possibly done in parallel
-    if (!enc->use_tokens_) {
-      ok = ok && VP8EncLoop(enc);
-    } else {
-      ok = ok && VP8EncTokenLoop(enc);
-    }
-    ok = ok && VP8EncFinishAlpha(enc);
+    ok = VP8EncAnalyze(enc)
+      && VP8StatLoop(enc)
+      && VP8EncLoop(enc)
+      && VP8EncFinishAlpha(enc)
 #ifdef WEBP_EXPERIMENTAL_FEATURES
-    ok = ok && VP8EncFinishLayer(enc);
+      && VP8EncFinishLayer(enc)
 #endif
-
-    ok = ok && VP8EncWrite(enc);
+      && VP8EncWrite(enc);
     StoreStats(enc);
     if (!ok) {
       VP8EncFreeBitWriters(enc);
     }
-    ok &= DeleteVP8Encoder(enc);  // must always be called, even if !ok
+    DeleteVP8Encoder(enc);
   } else {
-    // Make sure we have ARGB samples.
-    if (pic->argb == NULL && !WebPPictureYUVAToARGB(pic)) {
-      return 0;
-    }
+    if (pic->argb == NULL)
+      return WebPEncodingSetError(pic, VP8_ENC_ERROR_NULL_PARAMETER);
 
     ok = VP8LEncodeImage(config, pic);  // Sets pic->error in case of problem.
   }
@@ -402,3 +384,6 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
   return ok;
 }
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/encode.h b/drivers/webp/encode.h
index 7a428b4e6e..2e37cfabe7 100644
--- a/drivers/webp/encode.h
+++ b/drivers/webp/encode.h
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 //   WebP encoder: main interface
@@ -16,22 +14,11 @@
 
 #include "./types.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-#define WEBP_ENCODER_ABI_VERSION 0x0202    // MAJOR(8b) + MINOR(8b)
-
-// Note: forward declaring enumerations is not allowed in (strict) C and C++,
-// the types are left here for reference.
-// typedef enum WebPImageHint WebPImageHint;
-// typedef enum WebPEncCSP WebPEncCSP;
-// typedef enum WebPPreset WebPPreset;
-// typedef enum WebPEncodingError WebPEncodingError;
-typedef struct WebPConfig WebPConfig;
-typedef struct WebPPicture WebPPicture;   // main structure for I/O
-typedef struct WebPAuxStats WebPAuxStats;
-typedef struct WebPMemoryWriter WebPMemoryWriter;
+#define WEBP_ENCODER_ABI_VERSION 0x0200    // MAJOR(8b) + MINOR(8b)
 
 // Return the encoder's version number, packed in hexadecimal using 8bits for
 // each of major/minor/revision. E.g: v2.5.7 is 0x020507.
@@ -79,7 +66,7 @@ WEBP_EXTERN(size_t) WebPEncodeLosslessBGRA(const uint8_t* bgra,
 // Coding parameters
 
 // Image characteristics hint for the underlying encoder.
-typedef enum WebPImageHint {
+typedef enum {
   WEBP_HINT_DEFAULT = 0,  // default preset.
   WEBP_HINT_PICTURE,      // digital picture, like portrait, inner shot
   WEBP_HINT_PHOTO,        // outdoor photograph, with natural lighting
@@ -87,8 +74,7 @@ typedef enum WebPImageHint {
   WEBP_HINT_LAST
 } WebPImageHint;
 
-// Compression parameters.
-struct WebPConfig {
+typedef struct {
   int lossless;           // Lossless encoding (0=lossy(default), 1=lossless).
   float quality;          // between 0 (smallest file) and 100 (biggest)
   int method;             // quality/speed trade-off (0=fast, 6=slower-better)
@@ -117,26 +103,19 @@ struct WebPConfig {
 
   int show_compressed;    // if true, export the compressed picture back.
                           // In-loop filtering is not applied.
-  int preprocessing;      // preprocessing filter:
-                          // 0=none, 1=segment-smooth, 2=pseudo-random dithering
+  int preprocessing;      // preprocessing filter (0=none, 1=segment-smooth)
   int partitions;         // log2(number of token partitions) in [0..3]. Default
                           // is set to 0 for easier progressive decoding.
   int partition_limit;    // quality degradation allowed to fit the 512k limit
                           // on prediction modes coding (0: no degradation,
                           // 100: maximum possible degradation).
-  int emulate_jpeg_size;  // If true, compression parameters will be remapped
-                          // to better match the expected output size from
-                          // JPEG compression. Generally, the output size will
-                          // be similar but the degradation will be lower.
-  int thread_level;       // If non-zero, try and use multi-threaded encoding.
-  int low_memory;         // If set, reduce memory usage (but increase CPU use).
-
-  uint32_t pad[5];        // padding for later use
-};
+
+  uint32_t pad[8];        // padding for later use
+} WebPConfig;
 
 // Enumerate some predefined settings for WebPConfig, depending on the type
 // of source picture. These presets are used when calling WebPConfigPreset().
-typedef enum WebPPreset {
+typedef enum {
   WEBP_PRESET_DEFAULT = 0,  // default preset.
   WEBP_PRESET_PICTURE,      // digital picture, like portrait, inner shot
   WEBP_PRESET_PHOTO,        // outdoor photograph, with natural lighting
@@ -173,9 +152,11 @@ WEBP_EXTERN(int) WebPValidateConfig(const WebPConfig* config);
 
 //------------------------------------------------------------------------------
 // Input / Output
-// Structure for storing auxiliary statistics (mostly for lossy encoding).
 
-struct WebPAuxStats {
+typedef struct WebPPicture WebPPicture;   // main structure for I/O
+
+// Structure for storing auxiliary statistics (mostly for lossy encoding).
+typedef struct {
   int coded_size;         // final size
 
   float PSNR[5];          // peak-signal-to-noise ratio for Y/U/V/All/Alpha
@@ -201,7 +182,7 @@ struct WebPAuxStats {
   int lossless_size;           // final lossless size
 
   uint32_t pad[4];        // padding for later use
-};
+} WebPAuxStats;
 
 // Signature for output function. Should return true if writing was successful.
 // data/data_size is the segment of data to write, and 'picture' is for
@@ -211,19 +192,18 @@ typedef int (*WebPWriterFunction)(const uint8_t* data, size_t data_size,
 
 // WebPMemoryWrite: a special WebPWriterFunction that writes to memory using
 // the following WebPMemoryWriter object (to be set as a custom_ptr).
-struct WebPMemoryWriter {
+typedef struct {
   uint8_t* mem;       // final buffer (of size 'max_size', larger than 'size').
   size_t   size;      // final size
   size_t   max_size;  // total capacity
   uint32_t pad[1];    // padding for later use
-};
+} WebPMemoryWriter;
 
 // The following must be called first before any use.
 WEBP_EXTERN(void) WebPMemoryWriterInit(WebPMemoryWriter* writer);
 
 // The custom writer to be used with WebPMemoryWriter as custom_ptr. Upon
 // completion, writer.mem and writer.size will hold the coded data.
-// writer.mem must be freed using the call 'free(writer.mem)'.
 WEBP_EXTERN(int) WebPMemoryWrite(const uint8_t* data, size_t data_size,
                                  const WebPPicture* picture);
 
@@ -232,8 +212,7 @@ WEBP_EXTERN(int) WebPMemoryWrite(const uint8_t* data, size_t data_size,
 // everything is OK.
 typedef int (*WebPProgressHook)(int percent, const WebPPicture* picture);
 
-// Color spaces.
-typedef enum WebPEncCSP {
+typedef enum {
   // chroma sampling
   WEBP_YUV420 = 0,   // 4:2:0
   WEBP_YUV422 = 1,   // 4:2:2
@@ -249,7 +228,7 @@ typedef enum WebPEncCSP {
 } WebPEncCSP;
 
 // Encoding error conditions.
-typedef enum WebPEncodingError {
+typedef enum {
   VP8_ENC_OK = 0,
   VP8_ENC_ERROR_OUT_OF_MEMORY,            // memory error allocating objects
   VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY,  // memory error while flushing bits
@@ -269,6 +248,7 @@ typedef enum WebPEncodingError {
 
 // Main exchange structure (input samples, output bytes, statistics)
 struct WebPPicture {
+
   //   INPUT
   //////////////
   // Main flag for encoder selecting between ARGB or YUV input.
@@ -363,19 +343,18 @@ WEBP_EXTERN(int) WebPPictureAlloc(WebPPicture* picture);
 // preserved.
 WEBP_EXTERN(void) WebPPictureFree(WebPPicture* picture);
 
-// Copy the pixels of *src into *dst, using WebPPictureAlloc. Upon return, *dst
-// will fully own the copied pixels (this is not a view). The 'dst' picture need
-// not be initialized as its content is overwritten.
+// Copy the pixels of *src into *dst, using WebPPictureAlloc. Upon return,
+// *dst will fully own the copied pixels (this is not a view).
 // Returns false in case of memory allocation error.
 WEBP_EXTERN(int) WebPPictureCopy(const WebPPicture* src, WebPPicture* dst);
 
-// Compute PSNR, SSIM or LSIM distortion metric between two pictures.
+// Compute PSNR or SSIM distortion between two pictures.
 // Result is in dB, stores in result[] in the Y/U/V/Alpha/All order.
-// Returns false in case of error (src and ref don't have same dimension, ...)
+// Returns false in case of error (pic1 and pic2 don't have same dimension, ...)
 // Warning: this function is rather CPU-intensive.
 WEBP_EXTERN(int) WebPPictureDistortion(
-    const WebPPicture* src, const WebPPicture* ref,
-    int metric_type,           // 0 = PSNR, 1 = SSIM, 2 = LSIM
+    const WebPPicture* pic1, const WebPPicture* pic2,
+    int metric_type,           // 0 = PSNR, 1 = SSIM
     float result[5]);
 
 // self-crops a picture to the rectangle defined by top/left/width/height.
@@ -396,9 +375,7 @@ WEBP_EXTERN(int) WebPPictureCrop(WebPPicture* picture,
 // the top and left coordinates will be snapped to even values.
 // Picture 'src' must out-live 'dst' picture. Self-extraction of view is allowed
 // ('src' equal to 'dst') as a mean of fast-cropping (but note that doing so,
-// the original dimension will be lost). Picture 'dst' need not be initialized
-// with WebPPictureInit() if it is different from 'src', since its content will
-// be overwritten.
+// the original dimension will be lost).
 // Returns false in case of memory allocation error or invalid parameters.
 WEBP_EXTERN(int) WebPPictureView(const WebPPicture* src,
                                  int left, int top, int width, int height,
@@ -444,13 +421,6 @@ WEBP_EXTERN(int) WebPPictureImportBGRX(
 WEBP_EXTERN(int) WebPPictureARGBToYUVA(WebPPicture* picture,
                                        WebPEncCSP colorspace);
 
-// Same as WebPPictureARGBToYUVA(), but the conversion is done using
-// pseudo-random dithering with a strength 'dithering' between
-// 0.0 (no dithering) and 1.0 (maximum dithering). This is useful
-// for photographic picture.
-WEBP_EXTERN(int) WebPPictureARGBToYUVADithered(
-    WebPPicture* picture, WebPEncCSP colorspace, float dithering);
-
 // Converts picture->yuv to picture->argb and sets picture->use_argb to true.
 // The input format must be YUV_420 or YUV_420A.
 // Note that the use of this method is discouraged if one has access to the
@@ -469,11 +439,6 @@ WEBP_EXTERN(void) WebPCleanupTransparentArea(WebPPicture* picture);
 // alpha plane can be ignored altogether e.g.).
 WEBP_EXTERN(int) WebPPictureHasTransparency(const WebPPicture* picture);
 
-// Remove the transparency information (if present) by blending the color with
-// the background color 'background_rgb' (specified as 24bit RGB triplet).
-// After this call, all alpha values are reset to 0xff.
-WEBP_EXTERN(void) WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb);
-
 //------------------------------------------------------------------------------
 // Main call
 
@@ -491,7 +456,7 @@ WEBP_EXTERN(int) WebPEncode(const WebPConfig* config, WebPPicture* picture);
 
 //------------------------------------------------------------------------------
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/format_constants.h b/drivers/webp/format_constants.h
index 4c04b50c6a..7ce498f672 100644
--- a/drivers/webp/format_constants.h
+++ b/drivers/webp/format_constants.h
@@ -1,10 +1,8 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 //  Internal header for constants related to WebP file format.
@@ -14,9 +12,6 @@
 #ifndef WEBP_WEBP_FORMAT_CONSTANTS_H_
 #define WEBP_WEBP_FORMAT_CONSTANTS_H_
 
-// Create fourcc of the chunk from the chunk tag characters.
-#define MKFOURCC(a, b, c, d) ((uint32_t)(a) | (b) << 8 | (c) << 16 | (d) << 24)
-
 // VP8 related constants.
 #define VP8_SIGNATURE 0x9d012a              // Signature in VP8 data.
 #define VP8_MAX_PARTITION0_SIZE (1 << 19)   // max size of mode partition
@@ -70,16 +65,23 @@ typedef enum {
 #define CHUNK_SIZE_BYTES   4     // Size needed to store chunk's size.
 #define CHUNK_HEADER_SIZE  8     // Size of a chunk header.
 #define RIFF_HEADER_SIZE   12    // Size of the RIFF header ("RIFFnnnnWEBP").
-#define ANMF_CHUNK_SIZE    16    // Size of an ANMF chunk.
-#define ANIM_CHUNK_SIZE    6     // Size of an ANIM chunk.
-#define FRGM_CHUNK_SIZE    6     // Size of a FRGM chunk.
+#define FRAME_CHUNK_SIZE   15    // Size of a FRM chunk.
+#define LOOP_CHUNK_SIZE    2     // Size of a LOOP chunk.
+#define TILE_CHUNK_SIZE    6     // Size of a TILE chunk.
 #define VP8X_CHUNK_SIZE    10    // Size of a VP8X chunk.
 
-#define MAX_CANVAS_SIZE     (1 << 24)     // 24-bit max for VP8X width/height.
-#define MAX_IMAGE_AREA      (1ULL << 32)  // 32-bit max for width x height.
-#define MAX_LOOP_COUNT      (1 << 16)     // maximum value for loop-count
-#define MAX_DURATION        (1 << 24)     // maximum duration
-#define MAX_POSITION_OFFSET (1 << 24)     // maximum frame/fragment x/y offset
+#define TILING_FLAG_BIT    0x01  // Set if tiles are possibly used.
+#define ANIMATION_FLAG_BIT 0x02  // Set if some animation is expected
+#define ICC_FLAG_BIT       0x04  // Whether ICC is present or not.
+#define METADATA_FLAG_BIT  0x08  // Set if some META chunk is possibly present.
+#define ALPHA_FLAG_BIT     0x10  // Should be same as the ALPHA_FLAG in mux.h
+#define ROTATION_FLAG_BITS 0xe0  // all 3 bits for rotation + symmetry
+
+#define MAX_CANVAS_SIZE     (1 << 24)    // 24-bit max for VP8X width/height.
+#define MAX_IMAGE_AREA      (1ULL << 32) // 32-bit max for width x height.
+#define MAX_LOOP_COUNT      (1 << 16)    // maximum value for loop-count
+#define MAX_DURATION        (1 << 24)    // maximum duration
+#define MAX_POSITION_OFFSET (1 << 24)    // maximum frame/tile x/y offset
 
 // Maximum chunk payload is such that adding the header and padding won't
 // overflow a uint32_t.
diff --git a/drivers/webp/mux.h b/drivers/webp/mux.h
index eb57f51c64..5139af80fa 100644
--- a/drivers/webp/mux.h
+++ b/drivers/webp/mux.h
@@ -1,72 +1,60 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
-//  RIFF container manipulation for WebP images.
+//  RIFF container manipulation for WEBP images.
 //
 // Authors: Urvang (urvang@google.com)
 //          Vikas (vikasa@google.com)
 
 // This API allows manipulation of WebP container images containing features
-// like color profile, metadata, animation and fragmented images.
+// like Color profile, XMP metadata, Animation and Tiling.
+//
+// Code Example#1: Creating a MUX with image data, color profile and XMP
+// metadata.
+//
+//   int copy_data = 0;
+//   WebPMux* mux = WebPMuxNew();
+//   // ... (Prepare image data).
+//   WebPMuxSetImage(mux, &image, copy_data);
+//   // ... (Prepare ICCP color profile data).
+//   WebPMuxSetColorProfile(mux, &icc_profile, copy_data);
+//   // ... (Prepare XMP metadata).
+//   WebPMuxSetMetadata(mux, &xmp, copy_data);
+//   // Get data from mux in WebP RIFF format.
+//   WebPMuxAssemble(mux, &output_data);
+//   WebPMuxDelete(mux);
+//   // ... (Consume output_data; e.g. write output_data.bytes_ to file).
+//   WebPDataClear(&output_data);
 //
-// Code Example#1: Create a WebPMux object with image data, color profile and
-// XMP metadata.
-/*
-  int copy_data = 0;
-  WebPMux* mux = WebPMuxNew();
-  // ... (Prepare image data).
-  WebPMuxSetImage(mux, &image, copy_data);
-  // ... (Prepare ICCP color profile data).
-  WebPMuxSetChunk(mux, "ICCP", &icc_profile, copy_data);
-  // ... (Prepare XMP metadata).
-  WebPMuxSetChunk(mux, "XMP ", &xmp, copy_data);
-  // Get data from mux in WebP RIFF format.
-  WebPMuxAssemble(mux, &output_data);
-  WebPMuxDelete(mux);
-  // ... (Consume output_data; e.g. write output_data.bytes to file).
-  WebPDataClear(&output_data);
-*/
-
 // Code Example#2: Get image and color profile data from a WebP file.
-/*
-  int copy_data = 0;
-  // ... (Read data from file).
-  WebPMux* mux = WebPMuxCreate(&data, copy_data);
-  WebPMuxGetFrame(mux, 1, &image);
-  // ... (Consume image; e.g. call WebPDecode() to decode the data).
-  WebPMuxGetChunk(mux, "ICCP", &icc_profile);
-  // ... (Consume icc_data).
-  WebPMuxDelete(mux);
-  free(data);
-*/
+//
+//   int copy_data = 0;
+//   // ... (Read data from file).
+//   WebPMux* mux = WebPMuxCreate(&data, copy_data);
+//   WebPMuxGetImage(mux, &image);
+//   // ... (Consume image; e.g. call WebPDecode() to decode the data).
+//   WebPMuxGetColorProfile(mux, &icc_profile);
+//   // ... (Consume icc_profile).
+//   WebPMuxDelete(mux);
+//   free(data);
 
 #ifndef WEBP_WEBP_MUX_H_
 #define WEBP_WEBP_MUX_H_
 
-#include "./mux_types.h"
+#include "./types.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-#define WEBP_MUX_ABI_VERSION 0x0101        // MAJOR(8b) + MINOR(8b)
-
-// Note: forward declaring enumerations is not allowed in (strict) C and C++,
-// the types are left here for reference.
-// typedef enum WebPMuxError WebPMuxError;
-// typedef enum WebPChunkId WebPChunkId;
-typedef struct WebPMux WebPMux;   // main opaque object.
-typedef struct WebPMuxFrameInfo WebPMuxFrameInfo;
-typedef struct WebPMuxAnimParams WebPMuxAnimParams;
+#define WEBP_MUX_ABI_VERSION 0x0100        // MAJOR(8b) + MINOR(8b)
 
 // Error codes
-typedef enum WebPMuxError {
+typedef enum {
   WEBP_MUX_OK                 =  1,
   WEBP_MUX_NOT_FOUND          =  0,
   WEBP_MUX_INVALID_ARGUMENT   = -1,
@@ -75,26 +63,51 @@ typedef enum WebPMuxError {
   WEBP_MUX_NOT_ENOUGH_DATA    = -4
 } WebPMuxError;
 
+// Flag values for different features used in VP8X chunk.
+typedef enum {
+  TILE_FLAG       = 0x00000001,
+  ANIMATION_FLAG  = 0x00000002,
+  ICCP_FLAG       = 0x00000004,
+  META_FLAG       = 0x00000008,
+  ALPHA_FLAG      = 0x00000010
+} WebPFeatureFlags;
+
 // IDs for different types of chunks.
-typedef enum WebPChunkId {
+typedef enum {
   WEBP_CHUNK_VP8X,     // VP8X
   WEBP_CHUNK_ICCP,     // ICCP
-  WEBP_CHUNK_ANIM,     // ANIM
-  WEBP_CHUNK_ANMF,     // ANMF
-  WEBP_CHUNK_FRGM,     // FRGM
+  WEBP_CHUNK_LOOP,     // LOOP
+  WEBP_CHUNK_FRAME,    // FRM
+  WEBP_CHUNK_TILE,     // TILE
   WEBP_CHUNK_ALPHA,    // ALPH
   WEBP_CHUNK_IMAGE,    // VP8/VP8L
-  WEBP_CHUNK_EXIF,     // EXIF
-  WEBP_CHUNK_XMP,      // XMP
+  WEBP_CHUNK_META,     // META
   WEBP_CHUNK_UNKNOWN,  // Other chunks.
   WEBP_CHUNK_NIL
 } WebPChunkId;
 
+typedef struct WebPMux WebPMux;   // main opaque object.
+
+// Data type used to describe 'raw' data, e.g., chunk data
+// (ICC profile, metadata) and WebP compressed image data.
+typedef struct {
+  const uint8_t* bytes_;
+  size_t size_;
+} WebPData;
+
 //------------------------------------------------------------------------------
+// Manipulation of a WebPData object.
+
+// Initializes the contents of the 'webp_data' object with default values.
+WEBP_EXTERN(void) WebPDataInit(WebPData* webp_data);
+
+// Clears the contents of the 'webp_data' object by calling free(). Does not
+// deallocate the object itself.
+WEBP_EXTERN(void) WebPDataClear(WebPData* webp_data);
 
-// Returns the version number of the mux library, packed in hexadecimal using
-// 8bits for each of major/minor/revision. E.g: v2.5.7 is 0x020507.
-WEBP_EXTERN(int) WebPGetMuxVersion(void);
+// Allocates necessary storage for 'dst' and copies the contents of 'src'.
+// Returns true on success.
+WEBP_EXTERN(int) WebPDataCopy(const WebPData* src, WebPData* dst);
 
 //------------------------------------------------------------------------------
 // Life of a Mux object
@@ -123,8 +136,8 @@ WEBP_EXTERN(WebPMux*) WebPMuxCreateInternal(const WebPData*, int, int);
 // Creates a mux object from raw data given in WebP RIFF format.
 // Parameters:
 //   bitstream - (in) the bitstream data in WebP RIFF format
-//   copy_data - (in) value 1 indicates given data WILL be copied to the mux
-//               object and value 0 indicates data will NOT be copied.
+//   copy_data - (in) value 1 indicates given data WILL be copied to the mux,
+//               and value 0 indicates data will NOT be copied.
 // Returns:
 //   A pointer to the mux object created from given data - on success.
 //   NULL - In case of invalid data or memory error.
@@ -134,219 +147,295 @@ static WEBP_INLINE WebPMux* WebPMuxCreate(const WebPData* bitstream,
 }
 
 //------------------------------------------------------------------------------
-// Non-image chunks.
+// Single Image.
 
-// Note: Only non-image related chunks should be managed through chunk APIs.
-// (Image related chunks are: "ANMF", "FRGM", "VP8 ", "VP8L" and "ALPH").
-// To add, get and delete images, use WebPMuxSetImage(), WebPMuxPushFrame(),
-// WebPMuxGetFrame() and WebPMuxDeleteFrame().
+// Sets the image in the mux object. Any existing images (including frame/tile)
+// will be removed.
+// Parameters:
+//   mux - (in/out) object in which the image is to be set
+//   bitstream - (in) can either be a raw VP8/VP8L bitstream or a single-image
+//               WebP file (non-animated and non-tiled)
+//   copy_data - (in) value 1 indicates given data WILL be copied to the mux,
+//               and value 0 indicates data will NOT be copied.
+// Returns:
+//   WEBP_MUX_INVALID_ARGUMENT - if mux is NULL or bitstream is NULL.
+//   WEBP_MUX_MEMORY_ERROR - on memory allocation error.
+//   WEBP_MUX_OK - on success.
+WEBP_EXTERN(WebPMuxError) WebPMuxSetImage(WebPMux* mux,
+                                          const WebPData* bitstream,
+                                          int copy_data);
 
-// Adds a chunk with id 'fourcc' and data 'chunk_data' in the mux object.
-// Any existing chunk(s) with the same id will be removed.
+// Gets image data from the mux object.
+// The content of 'bitstream' is allocated using malloc(), and NOT
+// owned by the 'mux' object. It MUST be deallocated by the caller by calling
+// WebPDataClear().
 // Parameters:
-//   mux - (in/out) object to which the chunk is to be added
-//   fourcc - (in) a character array containing the fourcc of the given chunk;
-//                 e.g., "ICCP", "XMP ", "EXIF" etc.
-//   chunk_data - (in) the chunk data to be added
-//   copy_data - (in) value 1 indicates given data WILL be copied to the mux
-//               object and value 0 indicates data will NOT be copied.
+//   mux - (in) object from which the image is to be fetched
+//   bitstream - (out) the image data
 // Returns:
-//   WEBP_MUX_INVALID_ARGUMENT - if mux, fourcc or chunk_data is NULL
-//                               or if fourcc corresponds to an image chunk.
+//   WEBP_MUX_INVALID_ARGUMENT - if either mux or bitstream is NULL
+//                               OR mux contains animation/tiling.
+//   WEBP_MUX_NOT_FOUND - if image is not present in mux object.
+//   WEBP_MUX_OK - on success.
+WEBP_EXTERN(WebPMuxError) WebPMuxGetImage(const WebPMux* mux,
+                                          WebPData* bitstream);
+
+// Deletes the image in the mux object.
+// Parameters:
+//   mux - (in/out) object from which the image is to be deleted
+// Returns:
+//   WEBP_MUX_INVALID_ARGUMENT - if mux is NULL
+//                               OR if mux contains animation/tiling.
+//   WEBP_MUX_NOT_FOUND - if image is not present in mux object.
+//   WEBP_MUX_OK - on success.
+WEBP_EXTERN(WebPMuxError) WebPMuxDeleteImage(WebPMux* mux);
+
+//------------------------------------------------------------------------------
+// XMP Metadata.
+
+// Sets the XMP metadata in the mux object. Any existing metadata chunk(s) will
+// be removed.
+// Parameters:
+//   mux - (in/out) object to which the XMP metadata is to be added
+//   metadata - (in) the XMP metadata to be added
+//   copy_data - (in) value 1 indicates given data WILL be copied to the mux,
+//               and value 0 indicates data will NOT be copied.
+// Returns:
+//   WEBP_MUX_INVALID_ARGUMENT - if mux or metadata is NULL.
 //   WEBP_MUX_MEMORY_ERROR - on memory allocation error.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxSetChunk(
-    WebPMux* mux, const char fourcc[4], const WebPData* chunk_data,
-    int copy_data);
+WEBP_EXTERN(WebPMuxError) WebPMuxSetMetadata(WebPMux* mux,
+                                             const WebPData* metadata,
+                                             int copy_data);
 
-// Gets a reference to the data of the chunk with id 'fourcc' in the mux object.
+// Gets a reference to the XMP metadata in the mux object.
 // The caller should NOT free the returned data.
 // Parameters:
-//   mux - (in) object from which the chunk data is to be fetched
-//   fourcc - (in) a character array containing the fourcc of the chunk;
-//                 e.g., "ICCP", "XMP ", "EXIF" etc.
-//   chunk_data - (out) returned chunk data
+//   mux - (in) object from which the XMP metadata is to be fetched
+//   metadata - (out) XMP metadata
 // Returns:
-//   WEBP_MUX_INVALID_ARGUMENT - if mux, fourcc or chunk_data is NULL
-//                               or if fourcc corresponds to an image chunk.
-//   WEBP_MUX_NOT_FOUND - If mux does not contain a chunk with the given id.
+//   WEBP_MUX_INVALID_ARGUMENT - if either mux or metadata is NULL.
+//   WEBP_MUX_NOT_FOUND - if metadata is not present in mux object.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxGetChunk(
-    const WebPMux* mux, const char fourcc[4], WebPData* chunk_data);
+WEBP_EXTERN(WebPMuxError) WebPMuxGetMetadata(const WebPMux* mux,
+                                             WebPData* metadata);
 
-// Deletes the chunk with the given 'fourcc' from the mux object.
+// Deletes the XMP metadata in the mux object.
 // Parameters:
-//   mux - (in/out) object from which the chunk is to be deleted
-//   fourcc - (in) a character array containing the fourcc of the chunk;
-//                 e.g., "ICCP", "XMP ", "EXIF" etc.
+//   mux - (in/out) object from which XMP metadata is to be deleted
 // Returns:
-//   WEBP_MUX_INVALID_ARGUMENT - if mux or fourcc is NULL
-//                               or if fourcc corresponds to an image chunk.
-//   WEBP_MUX_NOT_FOUND - If mux does not contain a chunk with the given fourcc.
+//   WEBP_MUX_INVALID_ARGUMENT - if mux is NULL
+//   WEBP_MUX_NOT_FOUND - If mux does not contain metadata.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxDeleteChunk(
-    WebPMux* mux, const char fourcc[4]);
+WEBP_EXTERN(WebPMuxError) WebPMuxDeleteMetadata(WebPMux* mux);
 
 //------------------------------------------------------------------------------
-// Images.
-
-// Encapsulates data about a single frame/fragment.
-struct WebPMuxFrameInfo {
-  WebPData    bitstream;  // image data: can be a raw VP8/VP8L bitstream
-                          // or a single-image WebP file.
-  int         x_offset;   // x-offset of the frame.
-  int         y_offset;   // y-offset of the frame.
-  int         duration;   // duration of the frame (in milliseconds).
-
-  WebPChunkId id;         // frame type: should be one of WEBP_CHUNK_ANMF,
-                          // WEBP_CHUNK_FRGM or WEBP_CHUNK_IMAGE
-  WebPMuxAnimDispose dispose_method;  // Disposal method for the frame.
-  WebPMuxAnimBlend   blend_method;    // Blend operation for the frame.
-  uint32_t    pad[1];     // padding for later use
-};
-
-// Sets the (non-animated and non-fragmented) image in the mux object.
-// Note: Any existing images (including frames/fragments) will be removed.
+// ICC Color Profile.
+
+// Sets the color profile in the mux object. Any existing color profile chunk(s)
+// will be removed.
 // Parameters:
-//   mux - (in/out) object in which the image is to be set
-//   bitstream - (in) can be a raw VP8/VP8L bitstream or a single-image
-//               WebP file (non-animated and non-fragmented)
-//   copy_data - (in) value 1 indicates given data WILL be copied to the mux
-//               object and value 0 indicates data will NOT be copied.
+//   mux - (in/out) object to which the color profile is to be added
+//   color_profile - (in) the color profile data to be added
+//   copy_data - (in) value 1 indicates given data WILL be copied to the mux,
+//               and value 0 indicates data will NOT be copied.
 // Returns:
-//   WEBP_MUX_INVALID_ARGUMENT - if mux is NULL or bitstream is NULL.
-//   WEBP_MUX_MEMORY_ERROR - on memory allocation error.
+//   WEBP_MUX_INVALID_ARGUMENT - if mux or color_profile is NULL
+//   WEBP_MUX_MEMORY_ERROR - on memory allocation error
+//   WEBP_MUX_OK - on success
+WEBP_EXTERN(WebPMuxError) WebPMuxSetColorProfile(WebPMux* mux,
+                                                 const WebPData* color_profile,
+                                                 int copy_data);
+
+// Gets a reference to the color profile in the mux object.
+// The caller should NOT free the returned data.
+// Parameters:
+//   mux - (in) object from which the color profile data is to be fetched
+//   color_profile - (out) color profile data
+// Returns:
+//   WEBP_MUX_INVALID_ARGUMENT - if either mux or color_profile is NULL.
+//   WEBP_MUX_NOT_FOUND - if color profile is not present in mux object.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxSetImage(
-    WebPMux* mux, const WebPData* bitstream, int copy_data);
-
-// Adds a frame at the end of the mux object.
-// Notes: (1) frame.id should be one of WEBP_CHUNK_ANMF or WEBP_CHUNK_FRGM
-//        (2) For setting a non-animated non-fragmented image, use
-//            WebPMuxSetImage() instead.
-//        (3) Type of frame being pushed must be same as the frames in mux.
-//        (4) As WebP only supports even offsets, any odd offset will be snapped
-//            to an even location using: offset &= ~1
+WEBP_EXTERN(WebPMuxError) WebPMuxGetColorProfile(const WebPMux* mux,
+                                                 WebPData* color_profile);
+
+// Deletes the color profile in the mux object.
+// Parameters:
+//   mux - (in/out) object from which color profile is to be deleted
+// Returns:
+//   WEBP_MUX_INVALID_ARGUMENT - if mux is NULL
+//   WEBP_MUX_NOT_FOUND - If mux does not contain color profile.
+//   WEBP_MUX_OK - on success.
+WEBP_EXTERN(WebPMuxError) WebPMuxDeleteColorProfile(WebPMux* mux);
+
+//------------------------------------------------------------------------------
+// Animation.
+
+// Adds an animation frame at the end of the mux object.
+// Note: as WebP only supports even offsets, any odd offset will be snapped to
+// an even location using: offset &= ~1
 // Parameters:
-//   mux - (in/out) object to which the frame is to be added
-//   frame - (in) frame data.
-//   copy_data - (in) value 1 indicates given data WILL be copied to the mux
-//               object and value 0 indicates data will NOT be copied.
+//   mux - (in/out) object to which an animation frame is to be added
+//   bitstream - (in) the image data corresponding to the frame. It can either
+//               be a raw VP8/VP8L bitstream or a single-image WebP file
+//               (non-animated and non-tiled)
+//   x_offset - (in) x-offset of the frame to be added
+//   y_offset - (in) y-offset of the frame to be added
+//   duration - (in) duration of the frame to be added (in milliseconds)
+//   copy_data - (in) value 1 indicates given data WILL be copied to the mux,
+//               and value 0 indicates data will NOT be copied.
 // Returns:
-//   WEBP_MUX_INVALID_ARGUMENT - if mux or frame is NULL
-//                               or if content of 'frame' is invalid.
+//   WEBP_MUX_INVALID_ARGUMENT - if mux is NULL or bitstream is NULL
 //   WEBP_MUX_MEMORY_ERROR - on memory allocation error.
 //   WEBP_MUX_OK - on success.
 WEBP_EXTERN(WebPMuxError) WebPMuxPushFrame(
-    WebPMux* mux, const WebPMuxFrameInfo* frame, int copy_data);
-
-// Gets the nth frame from the mux object.
-// The content of 'frame->bitstream' is allocated using malloc(), and NOT
+    WebPMux* mux, const WebPData* bitstream,
+    int x_offset, int y_offset, int duration, int copy_data);
+
+// TODO(urvang): Create a struct as follows to reduce argument list size:
+// typedef struct {
+//  WebPData bitstream;
+//  int x_offset, y_offset;
+//  int duration;
+// } FrameInfo;
+
+// Gets the nth animation frame from the mux object.
+// The content of 'bitstream' is allocated using malloc(), and NOT
 // owned by the 'mux' object. It MUST be deallocated by the caller by calling
 // WebPDataClear().
 // nth=0 has a special meaning - last position.
 // Parameters:
 //   mux - (in) object from which the info is to be fetched
 //   nth - (in) index of the frame in the mux object
-//   frame - (out) data of the returned frame
+//   bitstream - (out) the image data
+//   x_offset - (out) x-offset of the returned frame
+//   y_offset - (out) y-offset of the returned frame
+//   duration - (out) duration of the returned frame (in milliseconds)
 // Returns:
-//   WEBP_MUX_INVALID_ARGUMENT - if mux or frame is NULL.
+//   WEBP_MUX_INVALID_ARGUMENT - if either mux, bitstream, x_offset,
+//                               y_offset, or duration is NULL
 //   WEBP_MUX_NOT_FOUND - if there are less than nth frames in the mux object.
 //   WEBP_MUX_BAD_DATA - if nth frame chunk in mux is invalid.
-//   WEBP_MUX_MEMORY_ERROR - on memory allocation error.
 //   WEBP_MUX_OK - on success.
 WEBP_EXTERN(WebPMuxError) WebPMuxGetFrame(
-    const WebPMux* mux, uint32_t nth, WebPMuxFrameInfo* frame);
+    const WebPMux* mux, uint32_t nth, WebPData* bitstream,
+    int* x_offset, int* y_offset, int* duration);
 
-// Deletes a frame from the mux object.
+// Deletes an animation frame from the mux object.
 // nth=0 has a special meaning - last position.
 // Parameters:
 //   mux - (in/out) object from which a frame is to be deleted
 //   nth - (in) The position from which the frame is to be deleted
 // Returns:
-//   WEBP_MUX_INVALID_ARGUMENT - if mux is NULL.
+//   WEBP_MUX_INVALID_ARGUMENT - if mux is NULL
 //   WEBP_MUX_NOT_FOUND - If there are less than nth frames in the mux object
 //                        before deletion.
 //   WEBP_MUX_OK - on success.
 WEBP_EXTERN(WebPMuxError) WebPMuxDeleteFrame(WebPMux* mux, uint32_t nth);
 
-//------------------------------------------------------------------------------
-// Animation.
-
-// Animation parameters.
-struct WebPMuxAnimParams {
-  uint32_t bgcolor;  // Background color of the canvas stored (in MSB order) as:
-                     // Bits 00 to 07: Alpha.
-                     // Bits 08 to 15: Red.
-                     // Bits 16 to 23: Green.
-                     // Bits 24 to 31: Blue.
-  int loop_count;    // Number of times to repeat the animation [0 = infinite].
-};
-
-// Sets the animation parameters in the mux object. Any existing ANIM chunks
-// will be removed.
+// Sets the animation loop count in the mux object. Any existing loop count
+// value(s) will be removed.
 // Parameters:
-//   mux - (in/out) object in which ANIM chunk is to be set/added
-//   params - (in) animation parameters.
+//   mux - (in/out) object in which loop chunk is to be set/added
+//   loop_count - (in) animation loop count value.
+//                Note that loop_count of zero denotes infinite loop.
 // Returns:
-//   WEBP_MUX_INVALID_ARGUMENT - if mux or params is NULL.
+//   WEBP_MUX_INVALID_ARGUMENT - if mux is NULL
 //   WEBP_MUX_MEMORY_ERROR - on memory allocation error.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxSetAnimationParams(
-    WebPMux* mux, const WebPMuxAnimParams* params);
+WEBP_EXTERN(WebPMuxError) WebPMuxSetLoopCount(WebPMux* mux, int loop_count);
 
-// Gets the animation parameters from the mux object.
+// Gets the animation loop count from the mux object.
 // Parameters:
-//   mux - (in) object from which the animation parameters to be fetched
-//   params - (out) animation parameters extracted from the ANIM chunk
+//   mux - (in) object from which the loop count is to be fetched
+//   loop_count - (out) the loop_count value present in the LOOP chunk
 // Returns:
-//   WEBP_MUX_INVALID_ARGUMENT - if mux or params is NULL.
-//   WEBP_MUX_NOT_FOUND - if ANIM chunk is not present in mux object.
+//   WEBP_MUX_INVALID_ARGUMENT - if either of mux or loop_count is NULL
+//   WEBP_MUX_NOT_FOUND - if loop chunk is not present in mux object.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxGetAnimationParams(
-    const WebPMux* mux, WebPMuxAnimParams* params);
+WEBP_EXTERN(WebPMuxError) WebPMuxGetLoopCount(const WebPMux* mux,
+                                              int* loop_count);
 
 //------------------------------------------------------------------------------
-// Misc Utilities.
+// Tiling.
 
-// Gets the canvas size from the mux object.
-// Note: This method assumes that the VP8X chunk, if present, is up-to-date.
-// That is, the mux object hasn't been modified since the last call to
-// WebPMuxAssemble() or WebPMuxCreate().
+// Adds a tile at the end of the mux object.
+// Note: as WebP only supports even offsets, any odd offset will be snapped to
+// an even location using: offset &= ~1
 // Parameters:
-//   mux - (in) object from which the canvas size is to be fetched
-//   width - (out) canvas width
-//   height - (out) canvas height
+//   mux - (in/out) object to which a tile is to be added.
+//   bitstream - (in) the image data corresponding to the tile. It can either
+//               be a raw VP8/VP8L bitstream or a single-image WebP file
+//               (non-animated and non-tiled)
+//   x_offset - (in) x-offset of the tile to be added
+//   y_offset - (in) y-offset of the tile to be added
+//   copy_data - (in) value 1 indicates given data WILL be copied to the mux,
+//               and value 0 indicates data will NOT be copied.
 // Returns:
-//   WEBP_MUX_INVALID_ARGUMENT - if mux, width or height is NULL.
-//   WEBP_MUX_BAD_DATA - if VP8X/VP8/VP8L chunk or canvas size is invalid.
+//   WEBP_MUX_INVALID_ARGUMENT - if mux is NULL or bitstream is NULL
+//   WEBP_MUX_MEMORY_ERROR - on memory allocation error.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxGetCanvasSize(const WebPMux* mux,
-                                               int* width, int* height);
+WEBP_EXTERN(WebPMuxError) WebPMuxPushTile(
+    WebPMux* mux, const WebPData* bitstream,
+    int x_offset, int y_offset, int copy_data);
+
+// Gets the nth tile from the mux object.
+// The content of 'bitstream' is allocated using malloc(), and NOT
+// owned by the 'mux' object. It MUST be deallocated by the caller by calling
+// WebPDataClear().
+// nth=0 has a special meaning - last position.
+// Parameters:
+//   mux - (in) object from which the info is to be fetched
+//   nth - (in) index of the tile in the mux object
+//   bitstream - (out) the image data
+//   x_offset - (out) x-offset of the returned tile
+//   y_offset - (out) y-offset of the returned tile
+// Returns:
+//   WEBP_MUX_INVALID_ARGUMENT - if either mux, bitstream, x_offset or
+//                               y_offset is NULL
+//   WEBP_MUX_NOT_FOUND - if there are fewer than nth tiles in the mux object.
+//   WEBP_MUX_BAD_DATA - if nth tile chunk in mux is invalid.
+//   WEBP_MUX_OK - on success.
+WEBP_EXTERN(WebPMuxError) WebPMuxGetTile(
+    const WebPMux* mux, uint32_t nth, WebPData* bitstream,
+    int* x_offset, int* y_offset);
+
+// Deletes a tile from the mux object.
+// nth=0 has a special meaning - last position
+// Parameters:
+//   mux - (in/out) object from which a tile is to be deleted
+//   nth - (in) The position from which the tile is to be deleted
+// Returns:
+//   WEBP_MUX_INVALID_ARGUMENT - if mux is NULL
+//   WEBP_MUX_NOT_FOUND - if there are fewer than nth tiles in the mux object
+//                        before deletion.
+//   WEBP_MUX_OK - on success.
+WEBP_EXTERN(WebPMuxError) WebPMuxDeleteTile(WebPMux* mux, uint32_t nth);
+
+//------------------------------------------------------------------------------
+// Misc Utilities.
 
 // Gets the feature flags from the mux object.
-// Note: This method assumes that the VP8X chunk, if present, is up-to-date.
-// That is, the mux object hasn't been modified since the last call to
-// WebPMuxAssemble() or WebPMuxCreate().
 // Parameters:
 //   mux - (in) object from which the features are to be fetched
 //   flags - (out) the flags specifying which features are present in the
 //           mux object. This will be an OR of various flag values.
 //           Enum 'WebPFeatureFlags' can be used to test individual flag values.
 // Returns:
-//   WEBP_MUX_INVALID_ARGUMENT - if mux or flags is NULL.
-//   WEBP_MUX_BAD_DATA - if VP8X/VP8/VP8L chunk or canvas size is invalid.
+//   WEBP_MUX_INVALID_ARGUMENT - if mux or flags is NULL
+//   WEBP_MUX_NOT_FOUND - if VP8X chunk is not present in mux object.
+//   WEBP_MUX_BAD_DATA - if VP8X chunk in mux is invalid.
 //   WEBP_MUX_OK - on success.
 WEBP_EXTERN(WebPMuxError) WebPMuxGetFeatures(const WebPMux* mux,
                                              uint32_t* flags);
 
-// Gets number of chunks with the given 'id' in the mux object.
+// Gets the number of chunks with the given 'id' in the mux object.
 // Parameters:
 //   mux - (in) object from which the info is to be fetched
 //   id - (in) chunk id specifying the type of chunk
 //   num_elements - (out) number of chunks with the given chunk id
 // Returns:
-//   WEBP_MUX_INVALID_ARGUMENT - if mux, or num_elements is NULL.
+//   WEBP_MUX_INVALID_ARGUMENT - if either mux, or num_elements is NULL
 //   WEBP_MUX_OK - on success.
 WEBP_EXTERN(WebPMuxError) WebPMuxNumChunks(const WebPMux* mux,
                                            WebPChunkId id, int* num_elements);
@@ -362,15 +451,153 @@ WEBP_EXTERN(WebPMuxError) WebPMuxNumChunks(const WebPMux* mux,
 //   assembled_data - (out) assembled WebP data
 // Returns:
 //   WEBP_MUX_BAD_DATA - if mux object is invalid.
-//   WEBP_MUX_INVALID_ARGUMENT - if mux or assembled_data is NULL.
+//   WEBP_MUX_INVALID_ARGUMENT - if either mux or assembled_data is
+//                               NULL.
 //   WEBP_MUX_MEMORY_ERROR - on memory allocation error.
-//   WEBP_MUX_OK - on success.
+//   WEBP_MUX_OK - on success
 WEBP_EXTERN(WebPMuxError) WebPMuxAssemble(WebPMux* mux,
                                           WebPData* assembled_data);
 
 //------------------------------------------------------------------------------
+// Demux API.
+// Enables extraction of image and extended format data from WebP files.
+
+#define WEBP_DEMUX_ABI_VERSION 0x0100    // MAJOR(8b) + MINOR(8b)
+
+typedef struct WebPDemuxer WebPDemuxer;
+
+typedef enum {
+  WEBP_DEMUX_PARSING_HEADER,  // Not enough data to parse full header.
+  WEBP_DEMUX_PARSED_HEADER,   // Header parsing complete, data may be available.
+  WEBP_DEMUX_DONE             // Entire file has been parsed.
+} WebPDemuxState;
+
+//------------------------------------------------------------------------------
+// Life of a Demux object
+
+// Internal, version-checked, entry point
+WEBP_EXTERN(WebPDemuxer*) WebPDemuxInternal(
+    const WebPData*, int, WebPDemuxState*, int);
+
+// Parses the WebP file given by 'data'.
+// A complete WebP file must be present in 'data' for the function to succeed.
+// Returns a WebPDemuxer object on successful parse, NULL otherwise.
+static WEBP_INLINE WebPDemuxer* WebPDemux(const WebPData* data) {
+  return WebPDemuxInternal(data, 0, NULL, WEBP_DEMUX_ABI_VERSION);
+}
+
+// Parses the WebP file given by 'data'.
+// If 'state' is non-NULL it will be set to indicate the status of the demuxer.
+// Returns a WebPDemuxer object on successful parse, NULL otherwise.
+static WEBP_INLINE WebPDemuxer* WebPDemuxPartial(
+    const WebPData* data, WebPDemuxState* state) {
+  return WebPDemuxInternal(data, 1, state, WEBP_DEMUX_ABI_VERSION);
+}
+
+// Frees memory associated with 'dmux'.
+WEBP_EXTERN(void) WebPDemuxDelete(WebPDemuxer* dmux);
+
+//------------------------------------------------------------------------------
+// Data/information extraction.
+
+typedef enum {
+  WEBP_FF_FORMAT_FLAGS,  // Extended format flags present in the 'VP8X' chunk.
+  WEBP_FF_CANVAS_WIDTH,
+  WEBP_FF_CANVAS_HEIGHT,
+  WEBP_FF_LOOP_COUNT
+} WebPFormatFeature;
+
+// Get the 'feature' value from the 'dmux'.
+// NOTE: values are only valid if WebPDemux() was used or WebPDemuxPartial()
+// returned a state > WEBP_DEMUX_PARSING_HEADER.
+WEBP_EXTERN(uint32_t) WebPDemuxGetI(
+    const WebPDemuxer* dmux, WebPFormatFeature feature);
+
+//------------------------------------------------------------------------------
+// Frame iteration.
+
+typedef struct {
+  int frame_num_;
+  int num_frames_;
+  int tile_num_;
+  int num_tiles_;
+  int x_offset_, y_offset_;  // offset relative to the canvas.
+  int width_, height_;       // dimensions of this frame or tile.
+  int duration_;   // display duration in milliseconds.
+  int complete_;   // true if 'tile_' contains a full frame. partial images may
+                   // still be decoded with the WebP incremental decoder.
+  WebPData tile_;  // The frame or tile given by 'frame_num_' and 'tile_num_'.
+
+  uint32_t pad[4];           // padding for later use
+  void* private_;
+} WebPIterator;
+
+// Retrieves frame 'frame_number' from 'dmux'.
+// 'iter->tile_' points to the first tile on return from this function.
+// Individual tiles may be extracted using WebPDemuxSelectTile().
+// Setting 'frame_number' equal to 0 will return the last frame of the image.
+// Returns false if 'dmux' is NULL or frame 'frame_number' is not present.
+// Call WebPDemuxReleaseIterator() when use of the iterator is complete.
+// NOTE: 'dmux' must persist for the lifetime of 'iter'.
+WEBP_EXTERN(int) WebPDemuxGetFrame(
+    const WebPDemuxer* dmux, int frame_number, WebPIterator* iter);
+
+// Sets 'iter->tile_' to point to the next ('iter->frame_num_' + 1) or previous
+// ('iter->frame_num_' - 1) frame. These functions do not loop.
+// Returns true on success, false otherwise.
+WEBP_EXTERN(int) WebPDemuxNextFrame(WebPIterator* iter);
+WEBP_EXTERN(int) WebPDemuxPrevFrame(WebPIterator* iter);
+
+// Sets 'iter->tile_' to reflect tile number 'tile_number'.
+// Returns true if tile 'tile_number' is present, false otherwise.
+WEBP_EXTERN(int) WebPDemuxSelectTile(WebPIterator* iter, int tile_number);
+
+// Releases any memory associated with 'iter'.
+// Must be called before destroying the associated WebPDemuxer with
+// WebPDemuxDelete().
+WEBP_EXTERN(void) WebPDemuxReleaseIterator(WebPIterator* iter);
+
+//------------------------------------------------------------------------------
+// Chunk iteration.
+
+typedef struct {
+  // The current and total number of chunks with the fourcc given to
+  // WebPDemuxGetChunk().
+  int chunk_num_;
+  int num_chunks_;
+  WebPData chunk_;    // The payload of the chunk.
+
+  uint32_t pad[6];    // padding for later use
+  void* private_;
+} WebPChunkIterator;
+
+// Retrieves the 'chunk_number' instance of the chunk with id 'fourcc' from
+// 'dmux'.
+// 'fourcc' is a character array containing the fourcc of the chunk to return,
+// e.g., "ICCP", "META", "EXIF", etc.
+// Setting 'chunk_number' equal to 0 will return the last chunk in a set.
+// Returns true if the chunk is found, false otherwise. Image related chunk
+// payloads are accessed through WebPDemuxGetFrame() and related functions.
+// Call WebPDemuxReleaseChunkIterator() when use of the iterator is complete.
+// NOTE: 'dmux' must persist for the lifetime of the iterator.
+WEBP_EXTERN(int) WebPDemuxGetChunk(const WebPDemuxer* dmux,
+                                   const char fourcc[4], int chunk_number,
+                                   WebPChunkIterator* iter);
+
+// Sets 'iter->chunk_' to point to the next ('iter->chunk_num_' + 1) or previous
+// ('iter->chunk_num_' - 1) chunk. These functions do not loop.
+// Returns true on success, false otherwise.
+WEBP_EXTERN(int) WebPDemuxNextChunk(WebPChunkIterator* iter);
+WEBP_EXTERN(int) WebPDemuxPrevChunk(WebPChunkIterator* iter);
+
+// Releases any memory associated with 'iter'.
+// Must be called before destroying the associated WebPDemuxer with
+// WebPDemuxDelete().
+WEBP_EXTERN(void) WebPDemuxReleaseChunkIterator(WebPChunkIterator* iter);
+
+//------------------------------------------------------------------------------
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/demux/demux.c b/drivers/webp/mux/demux.c
index f66ac6d82b..4519f7d55b 100644
--- a/drivers/webp/demux/demux.c
+++ b/drivers/webp/mux/demux.c
@@ -1,31 +1,26 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 //  WebP container demux.
 //
 
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
+#include "../webp/mux.h"
 
-#include <assert.h>
 #include <stdlib.h>
 #include <string.h>
 
-#include "../utils/utils.h"
-#include "../webp/decode.h"     // WebPGetFeatures
-#include "../webp/demux.h"
+#include "../webp/decode.h"  // WebPGetInfo
 #include "../webp/format_constants.h"
 
-#define DMUX_MAJ_VERSION 0
-#define DMUX_MIN_VERSION 2
-#define DMUX_REV_VERSION 0
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#define MKFOURCC(a, b, c, d) ((uint32_t)(a) | (b) << 8 | (c) << 16 | (d) << 24)
 
 typedef struct {
   size_t start_;        // start location of the data
@@ -43,12 +38,9 @@ typedef struct {
 typedef struct Frame {
   int x_offset_, y_offset_;
   int width_, height_;
-  int has_alpha_;
   int duration_;
-  WebPMuxAnimDispose dispose_method_;
-  WebPMuxAnimBlend blend_method_;
-  int is_fragment_;  // this is a frame fragment (and not a full frame).
-  int frame_num_;  // the referent frame number for use in assembling fragments.
+  int is_tile_;    // this is an image fragment from a 'TILE'.
+  int frame_num_;  // the referent frame number for use in assembling tiles.
   int complete_;   // img_components_ contains a full image.
   ChunkData img_components_[2];  // 0=VP8{,L} 1=ALPH
   struct Frame* next_;
@@ -66,12 +58,9 @@ struct WebPDemuxer {
   uint32_t feature_flags_;
   int canvas_width_, canvas_height_;
   int loop_count_;
-  uint32_t bgcolor_;
   int num_frames_;
   Frame* frames_;
-  Frame** frames_tail_;
   Chunk* chunks_;  // non-image chunks
-  Chunk** chunks_tail_;
 };
 
 typedef enum {
@@ -98,12 +87,6 @@ static const ChunkParser kMasterChunks[] = {
   { { '0', '0', '0', '0' }, NULL,             NULL },
 };
 
-//------------------------------------------------------------------------------
-
-int WebPGetDemuxVersion(void) {
-  return (DMUX_MAJ_VERSION << 16) | (DMUX_MIN_VERSION << 8) | DMUX_REV_VERSION;
-}
-
 // -----------------------------------------------------------------------------
 // MemBuffer
 
@@ -144,30 +127,43 @@ static WEBP_INLINE const uint8_t* GetBuffer(MemBuffer* const mem) {
   return mem->buf_ + mem->start_;
 }
 
-// Read from 'mem' and skip the read bytes.
-static WEBP_INLINE uint8_t ReadByte(MemBuffer* const mem) {
+static WEBP_INLINE uint8_t GetByte(MemBuffer* const mem) {
   const uint8_t byte = mem->buf_[mem->start_];
   Skip(mem, 1);
   return byte;
 }
 
-static WEBP_INLINE int ReadLE16s(MemBuffer* const mem) {
+// Read 16, 24 or 32 bits stored in little-endian order.
+static WEBP_INLINE int ReadLE16s(const uint8_t* const data) {
+  return (int)(data[0] << 0) | (data[1] << 8);
+}
+
+static WEBP_INLINE int ReadLE24s(const uint8_t* const data) {
+  return ReadLE16s(data) | (data[2] << 16);
+}
+
+static WEBP_INLINE uint32_t ReadLE32(const uint8_t* const data) {
+  return (uint32_t)ReadLE24s(data) | ((uint32_t)data[3] << 24);  // widen before shifting: int << 24 overflows for bytes >= 0x80
+}
+
+// In addition to reading, skip the read bytes.
+static WEBP_INLINE int GetLE16s(MemBuffer* const mem) {
   const uint8_t* const data = mem->buf_ + mem->start_;
-  const int val = GetLE16(data);
+  const int val = ReadLE16s(data);
   Skip(mem, 2);
   return val;
 }
 
-static WEBP_INLINE int ReadLE24s(MemBuffer* const mem) {
+static WEBP_INLINE int GetLE24s(MemBuffer* const mem) {
   const uint8_t* const data = mem->buf_ + mem->start_;
-  const int val = GetLE24(data);
+  const int val = ReadLE24s(data);
   Skip(mem, 3);
   return val;
 }
 
-static WEBP_INLINE uint32_t ReadLE32(MemBuffer* const mem) {
+static WEBP_INLINE uint32_t GetLE32(MemBuffer* const mem) {
   const uint8_t* const data = mem->buf_ + mem->start_;
-  const uint32_t val = GetLE32(data);
+  const uint32_t val = ReadLE32(data);
   Skip(mem, 4);
   return val;
 }
@@ -176,37 +172,41 @@ static WEBP_INLINE uint32_t ReadLE32(MemBuffer* const mem) {
 // Secondary chunk parsing
 
 static void AddChunk(WebPDemuxer* const dmux, Chunk* const chunk) {
-  *dmux->chunks_tail_ = chunk;
+  Chunk** c = &dmux->chunks_;
+  while (*c != NULL) c = &(*c)->next_;
+  *c = chunk;
   chunk->next_ = NULL;
-  dmux->chunks_tail_ = &chunk->next_;
 }
 
 // Add a frame to the end of the list, ensuring the last frame is complete.
 // Returns true on success, false otherwise.
 static int AddFrame(WebPDemuxer* const dmux, Frame* const frame) {
-  const Frame* const last_frame = *dmux->frames_tail_;
+  const Frame* last_frame = NULL;
+  Frame** f = &dmux->frames_;
+  while (*f != NULL) {
+    last_frame = *f;
+    f = &(*f)->next_;
+  }
   if (last_frame != NULL && !last_frame->complete_) return 0;
-
-  *dmux->frames_tail_ = frame;
+  *f = frame;
   frame->next_ = NULL;
-  dmux->frames_tail_ = &frame->next_;
   return 1;
 }
 
 // Store image bearing chunks to 'frame'.
-static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
-                              MemBuffer* const mem, Frame* const frame) {
+static ParseStatus StoreFrame(int frame_num, MemBuffer* const mem,
+                              Frame* const frame) {
   int alpha_chunks = 0;
   int image_chunks = 0;
-  int done = (MemDataSize(mem) < min_size);
+  int done = (MemDataSize(mem) < CHUNK_HEADER_SIZE);
   ParseStatus status = PARSE_OK;
 
   if (done) return PARSE_NEED_MORE_DATA;
 
   do {
     const size_t chunk_start_offset = mem->start_;
-    const uint32_t fourcc = ReadLE32(mem);
-    const uint32_t payload_size = ReadLE32(mem);
+    const uint32_t fourcc = GetLE32(mem);
+    const uint32_t payload_size = GetLE32(mem);
     const uint32_t payload_size_padded = payload_size + (payload_size & 1);
     const size_t payload_available = (payload_size_padded > MemDataSize(mem))
                                    ? MemDataSize(mem) : payload_size_padded;
@@ -222,37 +222,29 @@ static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
           ++alpha_chunks;
           frame->img_components_[1].offset_ = chunk_start_offset;
           frame->img_components_[1].size_ = chunk_size;
-          frame->has_alpha_ = 1;
           frame->frame_num_ = frame_num;
           Skip(mem, payload_available);
         } else {
           goto Done;
         }
         break;
-      case MKFOURCC('V', 'P', '8', 'L'):
-        if (alpha_chunks > 0) return PARSE_ERROR;  // VP8L has its own alpha
-        // fall through
       case MKFOURCC('V', 'P', '8', ' '):
+      case MKFOURCC('V', 'P', '8', 'L'):
         if (image_chunks == 0) {
-          // Extract the bitstream features, tolerating failures when the data
-          // is incomplete.
-          WebPBitstreamFeatures features;
-          const VP8StatusCode vp8_status =
-              WebPGetFeatures(mem->buf_ + chunk_start_offset, chunk_size,
-                              &features);
-          if (status == PARSE_NEED_MORE_DATA &&
-              vp8_status == VP8_STATUS_NOT_ENOUGH_DATA) {
-            return PARSE_NEED_MORE_DATA;
-          } else if (vp8_status != VP8_STATUS_OK) {
-            // We have enough data, and yet WebPGetFeatures() failed.
-            return PARSE_ERROR;
-          }
+          int width = 0, height = 0;
           ++image_chunks;
           frame->img_components_[0].offset_ = chunk_start_offset;
           frame->img_components_[0].size_ = chunk_size;
-          frame->width_ = features.width;
-          frame->height_ = features.height;
-          frame->has_alpha_ |= features.has_alpha;
+          // Extract the width and height from the bitstream, tolerating
+          // failures when the data is incomplete.
+          if (!WebPGetInfo(mem->buf_ + frame->img_components_[0].offset_,
+                           frame->img_components_[0].size_, &width, &height) &&
+              status != PARSE_NEED_MORE_DATA) {
+            return PARSE_ERROR;
+          }
+
+          frame->width_ = width;
+          frame->height_ = height;
           frame->frame_num_ = frame_num;
           frame->complete_ = (status == PARSE_OK);
           Skip(mem, payload_available);
@@ -283,48 +275,43 @@ static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
 // Returns PARSE_OK on success with *frame pointing to the new Frame.
 // Returns PARSE_NEED_MORE_DATA with insufficient data, PARSE_ERROR otherwise.
 static ParseStatus NewFrame(const MemBuffer* const mem,
-                            uint32_t min_size, uint32_t actual_size,
-                            Frame** frame) {
+                            uint32_t min_size, uint32_t expected_size,
+                            uint32_t actual_size, Frame** frame) {
   if (SizeIsInvalid(mem, min_size)) return PARSE_ERROR;
-  if (actual_size < min_size) return PARSE_ERROR;
+  if (actual_size < expected_size) return PARSE_ERROR;
   if (MemDataSize(mem) < min_size)  return PARSE_NEED_MORE_DATA;
 
   *frame = (Frame*)calloc(1, sizeof(**frame));
   return (*frame == NULL) ? PARSE_ERROR : PARSE_OK;
 }
 
-// Parse a 'ANMF' chunk and any image bearing chunks that immediately follow.
+// Parse a 'FRM ' chunk and any image bearing chunks that immediately follow.
 // 'frame_chunk_size' is the previously validated, padded chunk size.
-static ParseStatus ParseAnimationFrame(
+static ParseStatus ParseFrame(
     WebPDemuxer* const dmux, uint32_t frame_chunk_size) {
-  const int is_animation = !!(dmux->feature_flags_ & ANIMATION_FLAG);
-  const uint32_t anmf_payload_size = frame_chunk_size - ANMF_CHUNK_SIZE;
+  const int has_frames = !!(dmux->feature_flags_ & ANIMATION_FLAG);
+  const uint32_t min_size = frame_chunk_size + CHUNK_HEADER_SIZE;
   int added_frame = 0;
-  int bits;
   MemBuffer* const mem = &dmux->mem_;
   Frame* frame;
   ParseStatus status =
-      NewFrame(mem, ANMF_CHUNK_SIZE, frame_chunk_size, &frame);
+      NewFrame(mem, min_size, FRAME_CHUNK_SIZE, frame_chunk_size, &frame);
   if (status != PARSE_OK) return status;
 
-  frame->x_offset_       = 2 * ReadLE24s(mem);
-  frame->y_offset_       = 2 * ReadLE24s(mem);
-  frame->width_          = 1 + ReadLE24s(mem);
-  frame->height_         = 1 + ReadLE24s(mem);
-  frame->duration_       = ReadLE24s(mem);
-  bits = ReadByte(mem);
-  frame->dispose_method_ =
-      (bits & 1) ? WEBP_MUX_DISPOSE_BACKGROUND : WEBP_MUX_DISPOSE_NONE;
-  frame->blend_method_ = (bits & 2) ? WEBP_MUX_NO_BLEND : WEBP_MUX_BLEND;
+  frame->x_offset_ = 2 * GetLE24s(mem);
+  frame->y_offset_ = 2 * GetLE24s(mem);
+  frame->width_    = 1 + GetLE24s(mem);
+  frame->height_   = 1 + GetLE24s(mem);
+  frame->duration_ = 1 + GetLE24s(mem);
+  Skip(mem, frame_chunk_size - FRAME_CHUNK_SIZE);  // skip any trailing data.
   if (frame->width_ * (uint64_t)frame->height_ >= MAX_IMAGE_AREA) {
-    free(frame);
     return PARSE_ERROR;
   }
 
-  // Store a frame only if the animation flag is set there is some data for
-  // this frame is available.
-  status = StoreFrame(dmux->num_frames_ + 1, anmf_payload_size, mem, frame);
-  if (status != PARSE_ERROR && is_animation && frame->frame_num_ > 0) {
+  // Store a (potentially partial) frame only if the animation flag is set
+  // and there is some data in 'frame'.
+  status = StoreFrame(dmux->num_frames_ + 1, mem, frame);
+  if (status != PARSE_ERROR && has_frames && frame->frame_num_ > 0) {
     added_frame = AddFrame(dmux, frame);
     if (added_frame) {
       ++dmux->num_frames_;
@@ -337,43 +324,38 @@ static ParseStatus ParseAnimationFrame(
   return status;
 }
 
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-// Parse a 'FRGM' chunk and any image bearing chunks that immediately follow.
-// 'fragment_chunk_size' is the previously validated, padded chunk size.
-static ParseStatus ParseFragment(WebPDemuxer* const dmux,
-                                 uint32_t fragment_chunk_size) {
-  const int frame_num = 1;  // All fragments belong to the 1st (and only) frame.
-  const int is_fragmented = !!(dmux->feature_flags_ & FRAGMENTS_FLAG);
-  const uint32_t frgm_payload_size = fragment_chunk_size - FRGM_CHUNK_SIZE;
-  int added_fragment = 0;
+// Parse a 'TILE' chunk and any image bearing chunks that immediately follow.
+// 'tile_chunk_size' is the previously validated, padded chunk size.
+static ParseStatus ParseTile(WebPDemuxer* const dmux,
+                             uint32_t tile_chunk_size) {
+  const int has_tiles = !!(dmux->feature_flags_ & TILE_FLAG);
+  const uint32_t min_size = tile_chunk_size + CHUNK_HEADER_SIZE;
+  int added_tile = 0;
   MemBuffer* const mem = &dmux->mem_;
   Frame* frame;
   ParseStatus status =
-      NewFrame(mem, FRGM_CHUNK_SIZE, fragment_chunk_size, &frame);
+      NewFrame(mem, min_size, TILE_CHUNK_SIZE, tile_chunk_size, &frame);
   if (status != PARSE_OK) return status;
 
-  frame->is_fragment_ = 1;
-  frame->x_offset_ = 2 * ReadLE24s(mem);
-  frame->y_offset_ = 2 * ReadLE24s(mem);
-
-  // Store a fragment only if the 'fragments' flag is set and there is some
-  // data available.
-  status = StoreFrame(frame_num, frgm_payload_size, mem, frame);
-  if (status != PARSE_ERROR && is_fragmented && frame->frame_num_ > 0) {
-    added_fragment = AddFrame(dmux, frame);
-    if (!added_fragment) {
-      status = PARSE_ERROR;
-    } else {
-      dmux->num_frames_ = 1;
-    }
+  frame->is_tile_  = 1;
+  frame->x_offset_ = 2 * GetLE24s(mem);
+  frame->y_offset_ = 2 * GetLE24s(mem);
+  Skip(mem, tile_chunk_size - TILE_CHUNK_SIZE);  // skip any trailing data.
+
+  // Store a (potentially partial) tile only if the tile flag is set
+  // and the tile contains some data.
+  status = StoreFrame(dmux->num_frames_, mem, frame);
+  if (status != PARSE_ERROR && has_tiles && frame->frame_num_ > 0) {
+    // Note num_frames_ is incremented only when all tiles have been consumed.
+    added_tile = AddFrame(dmux, frame);
+    if (!added_tile) status = PARSE_ERROR;
   }
 
-  if (!added_fragment) free(frame);
+  if (!added_tile) free(frame);
   return status;
 }
-#endif  // WEBP_EXPERIMENTAL_FEATURES
 
-// General chunk storage, starting with the header at 'start_offset', allowing
+// General chunk storage starting with the header at 'start_offset' allowing
 // the user to request the payload via a fourcc string. 'size' includes the
 // header and the unpadded payload size.
 // Returns true on success, false otherwise.
@@ -391,20 +373,20 @@ static int StoreChunk(WebPDemuxer* const dmux,
 // -----------------------------------------------------------------------------
 // Primary chunk parsing
 
-static ParseStatus ReadHeader(MemBuffer* const mem) {
+static int ReadHeader(MemBuffer* const mem) {
   const size_t min_size = RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE;
   uint32_t riff_size;
 
   // Basic file level validation.
-  if (MemDataSize(mem) < min_size) return PARSE_NEED_MORE_DATA;
+  if (MemDataSize(mem) < min_size) return 0;
   if (memcmp(GetBuffer(mem), "RIFF", CHUNK_SIZE_BYTES) ||
       memcmp(GetBuffer(mem) + CHUNK_HEADER_SIZE, "WEBP", CHUNK_SIZE_BYTES)) {
-    return PARSE_ERROR;
+    return 0;
   }
 
-  riff_size = GetLE32(GetBuffer(mem) + TAG_SIZE);
-  if (riff_size < CHUNK_HEADER_SIZE) return PARSE_ERROR;
-  if (riff_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
+  riff_size = ReadLE32(GetBuffer(mem) + TAG_SIZE);
+  if (riff_size < CHUNK_HEADER_SIZE) return 0;
+  if (riff_size > MAX_CHUNK_PAYLOAD) return 0;
 
   // There's no point in reading past the end of the RIFF chunk
   mem->riff_end_ = riff_size + CHUNK_HEADER_SIZE;
@@ -413,7 +395,7 @@ static ParseStatus ReadHeader(MemBuffer* const mem) {
   }
 
   Skip(mem, RIFF_HEADER_SIZE);
-  return PARSE_OK;
+  return 1;
 }
 
 static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
@@ -421,7 +403,6 @@ static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
   MemBuffer* const mem = &dmux->mem_;
   Frame* frame;
   ParseStatus status;
-  int image_added = 0;
 
   if (dmux->frames_ != NULL) return PARSE_ERROR;
   if (SizeIsInvalid(mem, min_size)) return PARSE_ERROR;
@@ -430,49 +411,65 @@ static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
   frame = (Frame*)calloc(1, sizeof(*frame));
   if (frame == NULL) return PARSE_ERROR;
 
-  // For the single image case we allow parsing of a partial frame, but we need
-  // at least CHUNK_HEADER_SIZE for parsing.
-  status = StoreFrame(1, CHUNK_HEADER_SIZE, &dmux->mem_, frame);
+  status = StoreFrame(1, &dmux->mem_, frame);
   if (status != PARSE_ERROR) {
     const int has_alpha = !!(dmux->feature_flags_ & ALPHA_FLAG);
     // Clear any alpha when the alpha flag is missing.
     if (!has_alpha && frame->img_components_[1].size_ > 0) {
       frame->img_components_[1].offset_ = 0;
       frame->img_components_[1].size_ = 0;
-      frame->has_alpha_ = 0;
     }
 
     // Use the frame width/height as the canvas values for non-vp8x files.
-    // Also, set ALPHA_FLAG if this is a lossless image with alpha.
     if (!dmux->is_ext_format_ && frame->width_ > 0 && frame->height_ > 0) {
       dmux->state_ = WEBP_DEMUX_PARSED_HEADER;
       dmux->canvas_width_ = frame->width_;
       dmux->canvas_height_ = frame->height_;
-      dmux->feature_flags_ |= frame->has_alpha_ ? ALPHA_FLAG : 0;
-    }
-    if (!AddFrame(dmux, frame)) {
-      status = PARSE_ERROR;  // last frame was left incomplete
-    } else {
-      image_added = 1;
-      dmux->num_frames_ = 1;
     }
+    AddFrame(dmux, frame);
+    dmux->num_frames_ = 1;
+  } else {
+    free(frame);
   }
 
-  if (!image_added) free(frame);
   return status;
 }
 
-static ParseStatus ParseVP8XChunks(WebPDemuxer* const dmux) {
-  const int is_animation = !!(dmux->feature_flags_ & ANIMATION_FLAG);
+static ParseStatus ParseVP8X(WebPDemuxer* const dmux) {
   MemBuffer* const mem = &dmux->mem_;
-  int anim_chunks = 0;
+  int loop_chunks = 0;
+  uint32_t vp8x_size;
   ParseStatus status = PARSE_OK;
 
+  if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
+
+  dmux->is_ext_format_ = 1;
+  Skip(mem, TAG_SIZE);  // VP8X
+  vp8x_size = GetLE32(mem);
+  if (vp8x_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
+  if (vp8x_size < VP8X_CHUNK_SIZE) return PARSE_ERROR;
+  vp8x_size += vp8x_size & 1;
+  if (SizeIsInvalid(mem, vp8x_size)) return PARSE_ERROR;
+  if (MemDataSize(mem) < vp8x_size) return PARSE_NEED_MORE_DATA;
+
+  dmux->feature_flags_ = GetByte(mem);
+  Skip(mem, 3);  // Reserved.
+  dmux->canvas_width_  = 1 + GetLE24s(mem);
+  dmux->canvas_height_ = 1 + GetLE24s(mem);
+  if (dmux->canvas_width_ * (uint64_t)dmux->canvas_height_ >= MAX_IMAGE_AREA) {
+    return PARSE_ERROR;  // image final dimension is too large
+  }
+  Skip(mem, vp8x_size - VP8X_CHUNK_SIZE);  // skip any trailing data.
+  dmux->state_ = WEBP_DEMUX_PARSED_HEADER;
+
+  if (SizeIsInvalid(mem, CHUNK_HEADER_SIZE)) return PARSE_ERROR;
+  if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
+
   do {
     int store_chunk = 1;
     const size_t chunk_start_offset = mem->start_;
-    const uint32_t fourcc = ReadLE32(mem);
-    const uint32_t chunk_size = ReadLE32(mem);
+    const uint32_t fourcc = GetLE32(mem);
+    const uint32_t chunk_size = GetLE32(mem);
     const uint32_t chunk_size_padded = chunk_size + (chunk_size & 1);
 
     if (chunk_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
@@ -485,50 +482,40 @@ static ParseStatus ParseVP8XChunks(WebPDemuxer* const dmux) {
       case MKFOURCC('A', 'L', 'P', 'H'):
       case MKFOURCC('V', 'P', '8', ' '):
       case MKFOURCC('V', 'P', '8', 'L'): {
-        // check that this isn't an animation (all frames should be in an ANMF).
-        if (anim_chunks > 0 || is_animation) return PARSE_ERROR;
-
         Rewind(mem, CHUNK_HEADER_SIZE);
         status = ParseSingleImage(dmux);
         break;
       }
-      case MKFOURCC('A', 'N', 'I', 'M'): {
-        if (chunk_size_padded < ANIM_CHUNK_SIZE) return PARSE_ERROR;
+      case MKFOURCC('L', 'O', 'O', 'P'): {
+        if (chunk_size_padded < LOOP_CHUNK_SIZE) return PARSE_ERROR;
 
         if (MemDataSize(mem) < chunk_size_padded) {
           status = PARSE_NEED_MORE_DATA;
-        } else if (anim_chunks == 0) {
-          ++anim_chunks;
-          dmux->bgcolor_ = ReadLE32(mem);
-          dmux->loop_count_ = ReadLE16s(mem);
-          Skip(mem, chunk_size_padded - ANIM_CHUNK_SIZE);
+        } else if (loop_chunks == 0) {
+          ++loop_chunks;
+          dmux->loop_count_ = GetLE16s(mem);
+          Skip(mem, chunk_size_padded - LOOP_CHUNK_SIZE);
         } else {
           store_chunk = 0;
           goto Skip;
         }
         break;
       }
-      case MKFOURCC('A', 'N', 'M', 'F'): {
-        if (anim_chunks == 0) return PARSE_ERROR;  // 'ANIM' precedes frames.
-        status = ParseAnimationFrame(dmux, chunk_size_padded);
+      case MKFOURCC('F', 'R', 'M', ' '): {
+        status = ParseFrame(dmux, chunk_size_padded);
         break;
       }
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-      case MKFOURCC('F', 'R', 'G', 'M'): {
-        status = ParseFragment(dmux, chunk_size_padded);
+      case MKFOURCC('T', 'I', 'L', 'E'): {
+        if (dmux->num_frames_ == 0) dmux->num_frames_ = 1;
+        status = ParseTile(dmux, chunk_size_padded);
         break;
       }
-#endif
       case MKFOURCC('I', 'C', 'C', 'P'): {
         store_chunk = !!(dmux->feature_flags_ & ICCP_FLAG);
         goto Skip;
       }
-      case MKFOURCC('E', 'X', 'I', 'F'): {
-        store_chunk = !!(dmux->feature_flags_ & EXIF_FLAG);
-        goto Skip;
-      }
-      case MKFOURCC('X', 'M', 'P', ' '): {
-        store_chunk = !!(dmux->feature_flags_ & XMP_FLAG);
+      case MKFOURCC('M', 'E', 'T', 'A'): {
+        store_chunk = !!(dmux->feature_flags_ & META_FLAG);
         goto Skip;
       }
  Skip:
@@ -559,37 +546,6 @@ static ParseStatus ParseVP8XChunks(WebPDemuxer* const dmux) {
   return status;
 }
 
-static ParseStatus ParseVP8X(WebPDemuxer* const dmux) {
-  MemBuffer* const mem = &dmux->mem_;
-  uint32_t vp8x_size;
-
-  if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
-
-  dmux->is_ext_format_ = 1;
-  Skip(mem, TAG_SIZE);  // VP8X
-  vp8x_size = ReadLE32(mem);
-  if (vp8x_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
-  if (vp8x_size < VP8X_CHUNK_SIZE) return PARSE_ERROR;
-  vp8x_size += vp8x_size & 1;
-  if (SizeIsInvalid(mem, vp8x_size)) return PARSE_ERROR;
-  if (MemDataSize(mem) < vp8x_size) return PARSE_NEED_MORE_DATA;
-
-  dmux->feature_flags_ = ReadByte(mem);
-  Skip(mem, 3);  // Reserved.
-  dmux->canvas_width_  = 1 + ReadLE24s(mem);
-  dmux->canvas_height_ = 1 + ReadLE24s(mem);
-  if (dmux->canvas_width_ * (uint64_t)dmux->canvas_height_ >= MAX_IMAGE_AREA) {
-    return PARSE_ERROR;  // image final dimension is too large
-  }
-  Skip(mem, vp8x_size - VP8X_CHUNK_SIZE);  // skip any trailing data.
-  dmux->state_ = WEBP_DEMUX_PARSED_HEADER;
-
-  if (SizeIsInvalid(mem, CHUNK_HEADER_SIZE)) return PARSE_ERROR;
-  if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
-
-  return ParseVP8XChunks(dmux);
-}
-
 // -----------------------------------------------------------------------------
 // Format validation
 
@@ -604,55 +560,30 @@ static int IsValidSimpleFormat(const WebPDemuxer* const dmux) {
   return 1;
 }
 
-// If 'exact' is true, check that the image resolution matches the canvas.
-// If 'exact' is false, check that the x/y offsets do not exceed the canvas.
-// TODO(jzern): this is insufficient in the fragmented image case if the
-// expectation is that the fragments completely cover the canvas.
-static int CheckFrameBounds(const Frame* const frame, int exact,
-                            int canvas_width, int canvas_height) {
-  if (exact) {
-    if (frame->x_offset_ != 0 || frame->y_offset_ != 0) {
-      return 0;
-    }
-    if (frame->width_ != canvas_width || frame->height_ != canvas_height) {
-      return 0;
-    }
-  } else {
-    if (frame->x_offset_ < 0 || frame->y_offset_ < 0) return 0;
-    if (frame->width_ + frame->x_offset_ > canvas_width) return 0;
-    if (frame->height_ + frame->y_offset_ > canvas_height) return 0;
-  }
-  return 1;
-}
-
 static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
-  const int is_animation = !!(dmux->feature_flags_ & ANIMATION_FLAG);
-  const int is_fragmented = !!(dmux->feature_flags_ & FRAGMENTS_FLAG);
-  const Frame* f = dmux->frames_;
+  const int has_tiles = !!(dmux->feature_flags_ & TILE_FLAG);
+  const int has_frames = !!(dmux->feature_flags_ & ANIMATION_FLAG);
+  const Frame* f;
 
   if (dmux->state_ == WEBP_DEMUX_PARSING_HEADER) return 1;
 
   if (dmux->canvas_width_ <= 0 || dmux->canvas_height_ <= 0) return 0;
   if (dmux->loop_count_ < 0) return 0;
   if (dmux->state_ == WEBP_DEMUX_DONE && dmux->frames_ == NULL) return 0;
-#ifndef WEBP_EXPERIMENTAL_FEATURES
-  if (is_fragmented) return 0;
-#endif
 
-  while (f != NULL) {
+  for (f = dmux->frames_; f != NULL; f = f->next_) {
     const int cur_frame_set = f->frame_num_;
-    int frame_count = 0, fragment_count = 0;
+    int frame_count = 0, tile_count = 0;
 
-    // Check frame properties and if the image is composed of fragments that
-    // each fragment came from a fragment.
+    // Check frame properties and if the image is composed of tiles that each
+    // fragment came from a 'TILE'.
     for (; f != NULL && f->frame_num_ == cur_frame_set; f = f->next_) {
       const ChunkData* const image = f->img_components_;
       const ChunkData* const alpha = f->img_components_ + 1;
 
-      if (is_fragmented && !f->is_fragment_) return 0;
-      if (!is_fragmented && f->is_fragment_) return 0;
-      if (!is_animation && f->frame_num_ > 1) return 0;
-
+      if (!has_tiles && f->is_tile_) return 0;
+      if (!has_frames && f->frame_num_ > 1) return 0;
+      if (f->x_offset_ < 0 || f->y_offset_ < 0) return 0;
       if (f->complete_) {
         if (alpha->size_ == 0 && image->size_ == 0) return 0;
         // Ensure alpha precedes image bitstream.
@@ -662,9 +593,6 @@ static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
 
         if (f->width_ <= 0 || f->height_ <= 0) return 0;
       } else {
-        // There shouldn't be a partial frame in a complete file.
-        if (dmux->state_ == WEBP_DEMUX_DONE) return 0;
-
         // Ensure alpha precedes image bitstream.
         if (alpha->size_ > 0 && image->size_ > 0 &&
             alpha->offset_ > image->offset_) {
@@ -674,17 +602,12 @@ static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
         if (f->next_ != NULL) return 0;
       }
 
-      if (f->width_ > 0 && f->height_ > 0 &&
-          !CheckFrameBounds(f, !(is_animation || is_fragmented),
-                            dmux->canvas_width_, dmux->canvas_height_)) {
-        return 0;
-      }
-
-      fragment_count += f->is_fragment_;
+      tile_count += f->is_tile_;
       ++frame_count;
     }
-    if (!is_fragmented && frame_count > 1) return 0;
-    if (fragment_count > 0 && frame_count != fragment_count) return 0;
+    if (!has_tiles && frame_count > 1) return 0;
+    if (tile_count > 0 && frame_count != tile_count) return 0;
+    if (f == NULL) break;
   }
   return 1;
 }
@@ -695,11 +618,8 @@ static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
 static void InitDemux(WebPDemuxer* const dmux, const MemBuffer* const mem) {
   dmux->state_ = WEBP_DEMUX_PARSING_HEADER;
   dmux->loop_count_ = 1;
-  dmux->bgcolor_ = 0xFFFFFFFF;  // White background by default.
   dmux->canvas_width_ = -1;
   dmux->canvas_height_ = -1;
-  dmux->frames_tail_ = &dmux->frames_;
-  dmux->chunks_tail_ = &dmux->chunks_;
   dmux->mem_ = *mem;
 }
 
@@ -711,20 +631,11 @@ WebPDemuxer* WebPDemuxInternal(const WebPData* data, int allow_partial,
   MemBuffer mem;
   WebPDemuxer* dmux;
 
-  if (state != NULL) *state = WEBP_DEMUX_PARSE_ERROR;
-
   if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DEMUX_ABI_VERSION)) return NULL;
-  if (data == NULL || data->bytes == NULL || data->size == 0) return NULL;
-
-  if (!InitMemBuffer(&mem, data->bytes, data->size)) return NULL;
-  status = ReadHeader(&mem);
-  if (status != PARSE_OK) {
-    if (state != NULL) {
-      *state = (status == PARSE_NEED_MORE_DATA) ? WEBP_DEMUX_PARSING_HEADER
-                                                : WEBP_DEMUX_PARSE_ERROR;
-    }
-    return NULL;
-  }
+  if (data == NULL || data->bytes_ == NULL || data->size_ == 0) return NULL;
+
+  if (!InitMemBuffer(&mem, data->bytes_, data->size_)) return NULL;
+  if (!ReadHeader(&mem)) return NULL;
 
   partial = (mem.buf_size_ < mem.riff_end_);
   if (!allow_partial && partial) return NULL;
@@ -733,18 +644,15 @@ WebPDemuxer* WebPDemuxInternal(const WebPData* data, int allow_partial,
   if (dmux == NULL) return NULL;
   InitDemux(dmux, &mem);
 
-  status = PARSE_ERROR;
   for (parser = kMasterChunks; parser->parse != NULL; ++parser) {
     if (!memcmp(parser->id, GetBuffer(&dmux->mem_), TAG_SIZE)) {
       status = parser->parse(dmux);
       if (status == PARSE_OK) dmux->state_ = WEBP_DEMUX_DONE;
-      if (status == PARSE_NEED_MORE_DATA && !partial) status = PARSE_ERROR;
       if (status != PARSE_ERROR && !parser->valid(dmux)) status = PARSE_ERROR;
-      if (status == PARSE_ERROR) dmux->state_ = WEBP_DEMUX_PARSE_ERROR;
       break;
     }
   }
-  if (state != NULL) *state = dmux->state_;
+  if (state) *state = dmux->state_;
 
   if (status == PARSE_ERROR) {
     WebPDemuxDelete(dmux);
@@ -777,12 +685,10 @@ uint32_t WebPDemuxGetI(const WebPDemuxer* dmux, WebPFormatFeature feature) {
   if (dmux == NULL) return 0;
 
   switch (feature) {
-    case WEBP_FF_FORMAT_FLAGS:     return dmux->feature_flags_;
-    case WEBP_FF_CANVAS_WIDTH:     return (uint32_t)dmux->canvas_width_;
-    case WEBP_FF_CANVAS_HEIGHT:    return (uint32_t)dmux->canvas_height_;
-    case WEBP_FF_LOOP_COUNT:       return (uint32_t)dmux->loop_count_;
-    case WEBP_FF_BACKGROUND_COLOR: return dmux->bgcolor_;
-    case WEBP_FF_FRAME_COUNT:      return (uint32_t)dmux->num_frames_;
+    case WEBP_FF_FORMAT_FLAGS:  return dmux->feature_flags_;
+    case WEBP_FF_CANVAS_WIDTH:  return (uint32_t)dmux->canvas_width_;
+    case WEBP_FF_CANVAS_HEIGHT: return (uint32_t)dmux->canvas_height_;
+    case WEBP_FF_LOOP_COUNT:    return (uint32_t)dmux->loop_count_;
   }
   return 0;
 }
@@ -790,8 +696,7 @@ uint32_t WebPDemuxGetI(const WebPDemuxer* dmux, WebPFormatFeature feature) {
 // -----------------------------------------------------------------------------
 // Frame iteration
 
-// Find the first 'frame_num' frame. There may be multiple such frames in a
-// fragmented frame.
+// Find the first 'frame_num' frame. There may be multiple in a tiled frame.
 static const Frame* GetFrame(const WebPDemuxer* const dmux, int frame_num) {
   const Frame* f;
   for (f = dmux->frames_; f != NULL; f = f->next_) {
@@ -800,19 +705,19 @@ static const Frame* GetFrame(const WebPDemuxer* const dmux, int frame_num) {
   return f;
 }
 
-// Returns fragment 'fragment_num' and the total count.
-static const Frame* GetFragment(
-    const Frame* const frame_set, int fragment_num, int* const count) {
+// Returns tile 'tile_num' and the total count.
+static const Frame* GetTile(
+    const Frame* const frame_set, int tile_num, int* const count) {
   const int this_frame = frame_set->frame_num_;
   const Frame* f = frame_set;
-  const Frame* fragment = NULL;
+  const Frame* tile = NULL;
   int total;
 
   for (total = 0; f != NULL && f->frame_num_ == this_frame; f = f->next_) {
-    if (++total == fragment_num) fragment = f;
+    if (++total == tile_num) tile = f;
   }
   *count = total;
-  return fragment;
+  return tile;
 }
 
 static const uint8_t* GetFramePayload(const uint8_t* const mem_buf,
@@ -842,33 +747,27 @@ static const uint8_t* GetFramePayload(const uint8_t* const mem_buf,
 // Create a whole 'frame' from VP8 (+ alpha) or lossless.
 static int SynthesizeFrame(const WebPDemuxer* const dmux,
                            const Frame* const first_frame,
-                           int fragment_num, WebPIterator* const iter) {
+                           int tile_num, WebPIterator* const iter) {
   const uint8_t* const mem_buf = dmux->mem_.buf_;
-  int num_fragments;
+  int num_tiles;
   size_t payload_size = 0;
-  const Frame* const fragment =
-      GetFragment(first_frame, fragment_num, &num_fragments);
-  const uint8_t* const payload =
-      GetFramePayload(mem_buf, fragment, &payload_size);
+  const Frame* const tile = GetTile(first_frame, tile_num, &num_tiles);
+  const uint8_t* const payload = GetFramePayload(mem_buf, tile, &payload_size);
   if (payload == NULL) return 0;
-  assert(first_frame != NULL);
-
-  iter->frame_num      = first_frame->frame_num_;
-  iter->num_frames     = dmux->num_frames_;
-  iter->fragment_num   = fragment_num;
-  iter->num_fragments  = num_fragments;
-  iter->x_offset       = fragment->x_offset_;
-  iter->y_offset       = fragment->y_offset_;
-  iter->width          = fragment->width_;
-  iter->height         = fragment->height_;
-  iter->has_alpha      = fragment->has_alpha_;
-  iter->duration       = fragment->duration_;
-  iter->dispose_method = fragment->dispose_method_;
-  iter->blend_method   = fragment->blend_method_;
-  iter->complete       = fragment->complete_;
-  iter->fragment.bytes = payload;
-  iter->fragment.size  = payload_size;
-  // TODO(jzern): adjust offsets for 'FRGM's embedded in 'ANMF's
+
+  iter->frame_num_   = first_frame->frame_num_;
+  iter->num_frames_  = dmux->num_frames_;
+  iter->tile_num_    = tile_num;
+  iter->num_tiles_   = num_tiles;
+  iter->x_offset_    = tile->x_offset_;
+  iter->y_offset_    = tile->y_offset_;
+  iter->width_       = tile->width_;
+  iter->height_      = tile->height_;
+  iter->duration_    = tile->duration_;
+  iter->complete_    = tile->complete_;
+  iter->tile_.bytes_ = payload;
+  iter->tile_.size_  = payload_size;
+  // TODO(jzern): adjust offsets for 'TILE's embedded in 'FRM 's
   return 1;
 }
 
@@ -880,8 +779,6 @@ static int SetFrame(int frame_num, WebPIterator* const iter) {
   if (frame_num == 0) frame_num = dmux->num_frames_;
 
   frame = GetFrame(dmux, frame_num);
-  if (frame == NULL) return 0;
-
   return SynthesizeFrame(dmux, frame, 1, iter);
 }
 
@@ -895,22 +792,22 @@ int WebPDemuxGetFrame(const WebPDemuxer* dmux, int frame, WebPIterator* iter) {
 
 int WebPDemuxNextFrame(WebPIterator* iter) {
   if (iter == NULL) return 0;
-  return SetFrame(iter->frame_num + 1, iter);
+  return SetFrame(iter->frame_num_ + 1, iter);
 }
 
 int WebPDemuxPrevFrame(WebPIterator* iter) {
   if (iter == NULL) return 0;
-  if (iter->frame_num <= 1) return 0;
-  return SetFrame(iter->frame_num - 1, iter);
+  if (iter->frame_num_ <= 1) return 0;
+  return SetFrame(iter->frame_num_ - 1, iter);
 }
 
-int WebPDemuxSelectFragment(WebPIterator* iter, int fragment_num) {
-  if (iter != NULL && iter->private_ != NULL && fragment_num > 0) {
+int WebPDemuxSelectTile(WebPIterator* iter, int tile) {
+  if (iter != NULL && iter->private_ != NULL && tile > 0) {
     const WebPDemuxer* const dmux = (WebPDemuxer*)iter->private_;
-    const Frame* const frame = GetFrame(dmux, iter->frame_num);
+    const Frame* const frame = GetFrame(dmux, iter->frame_num_);
     if (frame == NULL) return 0;
 
-    return SynthesizeFrame(dmux, frame, fragment_num, iter);
+    return SynthesizeFrame(dmux, frame, tile, iter);
   }
   return 0;
 }
@@ -959,10 +856,10 @@ static int SetChunk(const char fourcc[4], int chunk_num,
   if (chunk_num <= count) {
     const uint8_t* const mem_buf = dmux->mem_.buf_;
     const Chunk* const chunk = GetChunk(dmux, fourcc, chunk_num);
-    iter->chunk.bytes = mem_buf + chunk->data_.offset_ + CHUNK_HEADER_SIZE;
-    iter->chunk.size  = chunk->data_.size_ - CHUNK_HEADER_SIZE;
-    iter->num_chunks  = count;
-    iter->chunk_num   = chunk_num;
+    iter->chunk_.bytes_ = mem_buf + chunk->data_.offset_ + CHUNK_HEADER_SIZE;
+    iter->chunk_.size_  = chunk->data_.size_ - CHUNK_HEADER_SIZE;
+    iter->num_chunks_   = count;
+    iter->chunk_num_    = chunk_num;
     return 1;
   }
   return 0;
@@ -981,17 +878,17 @@ int WebPDemuxGetChunk(const WebPDemuxer* dmux,
 int WebPDemuxNextChunk(WebPChunkIterator* iter) {
   if (iter != NULL) {
     const char* const fourcc =
-        (const char*)iter->chunk.bytes - CHUNK_HEADER_SIZE;
-    return SetChunk(fourcc, iter->chunk_num + 1, iter);
+        (const char*)iter->chunk_.bytes_ - CHUNK_HEADER_SIZE;
+    return SetChunk(fourcc, iter->chunk_num_ + 1, iter);
   }
   return 0;
 }
 
 int WebPDemuxPrevChunk(WebPChunkIterator* iter) {
-  if (iter != NULL && iter->chunk_num > 1) {
+  if (iter != NULL && iter->chunk_num_ > 1) {
     const char* const fourcc =
-        (const char*)iter->chunk.bytes - CHUNK_HEADER_SIZE;
-    return SetChunk(fourcc, iter->chunk_num - 1, iter);
+        (const char*)iter->chunk_.bytes_ - CHUNK_HEADER_SIZE;
+    return SetChunk(fourcc, iter->chunk_num_ - 1, iter);
   }
   return 0;
 }
@@ -1000,3 +897,6 @@ void WebPDemuxReleaseChunkIterator(WebPChunkIterator* iter) {
   (void)iter;
 }
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}  // extern "C"
+#endif
diff --git a/drivers/webp/mux/muxedit.c b/drivers/webp/mux/muxedit.c
index 25770b3546..08629d4ae2 100644
--- a/drivers/webp/mux/muxedit.c
+++ b/drivers/webp/mux/muxedit.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Set and delete APIs for mux.
@@ -14,7 +12,10 @@
 
 #include <assert.h>
 #include "./muxi.h"
-#include "../utils/utils.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
 
 //------------------------------------------------------------------------------
 // Life of a mux object.
@@ -35,22 +36,20 @@ WebPMux* WebPNewInternal(int version) {
   }
 }
 
-// Delete all images in 'wpi_list'.
-static void DeleteAllImages(WebPMuxImage** const wpi_list) {
-  while (*wpi_list != NULL) {
-    *wpi_list = MuxImageDelete(*wpi_list);
+static void DeleteAllChunks(WebPChunk** const chunk_list) {
+  while (*chunk_list) {
+    *chunk_list = ChunkDelete(*chunk_list);
   }
 }
 
 static void MuxRelease(WebPMux* const mux) {
   if (mux == NULL) return;
-  DeleteAllImages(&mux->images_);
-  ChunkListDelete(&mux->vp8x_);
-  ChunkListDelete(&mux->iccp_);
-  ChunkListDelete(&mux->anim_);
-  ChunkListDelete(&mux->exif_);
-  ChunkListDelete(&mux->xmp_);
-  ChunkListDelete(&mux->unknown_);
+  MuxImageDeleteAll(&mux->images_);
+  DeleteAllChunks(&mux->vp8x_);
+  DeleteAllChunks(&mux->iccp_);
+  DeleteAllChunks(&mux->loop_);
+  DeleteAllChunks(&mux->meta_);
+  DeleteAllChunks(&mux->unknown_);
 }
 
 void WebPMuxDelete(WebPMux* mux) {
@@ -65,60 +64,81 @@ void WebPMuxDelete(WebPMux* mux) {
 // Handy MACRO, makes MuxSet() very symmetric to MuxGet().
 #define SWITCH_ID_LIST(INDEX, LIST)                                            \
   if (idx == (INDEX)) {                                                        \
-    err = ChunkAssignData(&chunk, data, copy_data, tag);                       \
+    err = ChunkAssignData(&chunk, data, copy_data, kChunks[(INDEX)].tag);      \
     if (err == WEBP_MUX_OK) {                                                  \
       err = ChunkSetNth(&chunk, (LIST), nth);                                  \
     }                                                                          \
     return err;                                                                \
   }
 
-static WebPMuxError MuxSet(WebPMux* const mux, uint32_t tag, uint32_t nth,
+static WebPMuxError MuxSet(WebPMux* const mux, CHUNK_INDEX idx, uint32_t nth,
                            const WebPData* const data, int copy_data) {
   WebPChunk chunk;
   WebPMuxError err = WEBP_MUX_NOT_FOUND;
-  const CHUNK_INDEX idx = ChunkGetIndexFromTag(tag);
   assert(mux != NULL);
   assert(!IsWPI(kChunks[idx].id));
 
   ChunkInit(&chunk);
-  SWITCH_ID_LIST(IDX_VP8X,    &mux->vp8x_);
-  SWITCH_ID_LIST(IDX_ICCP,    &mux->iccp_);
-  SWITCH_ID_LIST(IDX_ANIM,    &mux->anim_);
-  SWITCH_ID_LIST(IDX_EXIF,    &mux->exif_);
-  SWITCH_ID_LIST(IDX_XMP,     &mux->xmp_);
-  SWITCH_ID_LIST(IDX_UNKNOWN, &mux->unknown_);
+  SWITCH_ID_LIST(IDX_VP8X, &mux->vp8x_);
+  SWITCH_ID_LIST(IDX_ICCP, &mux->iccp_);
+  SWITCH_ID_LIST(IDX_LOOP, &mux->loop_);
+  SWITCH_ID_LIST(IDX_META, &mux->meta_);
+  if (idx == IDX_UNKNOWN && data->size_ > TAG_SIZE) {
+    // For raw-data unknown chunk, the first four bytes should be the tag to be
+    // used for the chunk.
+    const WebPData tmp = { data->bytes_ + TAG_SIZE, data->size_ - TAG_SIZE };
+    err = ChunkAssignData(&chunk, &tmp, copy_data, GetLE32(data->bytes_ + 0));
+    if (err == WEBP_MUX_OK)
+      err = ChunkSetNth(&chunk, &mux->unknown_, nth);
+  }
   return err;
 }
 #undef SWITCH_ID_LIST
 
-// Create data for frame/fragment given image data, offsets and duration.
-static WebPMuxError CreateFrameFragmentData(
-    int width, int height, const WebPMuxFrameInfo* const info, int is_frame,
-    WebPData* const frame_frgm) {
-  uint8_t* frame_frgm_bytes;
-  const size_t frame_frgm_size = kChunks[is_frame ? IDX_ANMF : IDX_FRGM].size;
+static WebPMuxError MuxAddChunk(WebPMux* const mux, uint32_t nth, uint32_t tag,
+                                const uint8_t* data, size_t size,
+                                int copy_data) {
+  const CHUNK_INDEX idx = ChunkGetIndexFromTag(tag);
+  const WebPData chunk_data = { data, size };
+  assert(mux != NULL);
+  assert(size <= MAX_CHUNK_PAYLOAD);
+  assert(idx != IDX_NIL);
+  return MuxSet(mux, idx, nth, &chunk_data, copy_data);
+}
 
-  assert(width > 0 && height > 0 && info->duration >= 0);
-  assert(info->dispose_method == (info->dispose_method & 1));
+// Create data for frame/tile given image data, offsets and duration.
+static WebPMuxError CreateFrameTileData(const WebPData* const image,
+                                        int x_offset, int y_offset,
+                                        int duration, int is_lossless,
+                                        int is_frame,
+                                        WebPData* const frame_tile) {
+  int width;
+  int height;
+  uint8_t* frame_tile_bytes;
+  const size_t frame_tile_size = kChunks[is_frame ? IDX_FRAME : IDX_TILE].size;
+
+  const int ok = is_lossless ?
+      VP8LGetInfo(image->bytes_, image->size_, &width, &height, NULL) :
+      VP8GetInfo(image->bytes_, image->size_, image->size_, &width, &height);
+  if (!ok) return WEBP_MUX_INVALID_ARGUMENT;
+
+  assert(width > 0 && height > 0 && duration > 0);
   // Note: assertion on upper bounds is done in PutLE24().
 
-  frame_frgm_bytes = (uint8_t*)malloc(frame_frgm_size);
-  if (frame_frgm_bytes == NULL) return WEBP_MUX_MEMORY_ERROR;
+  frame_tile_bytes = (uint8_t*)malloc(frame_tile_size);
+  if (frame_tile_bytes == NULL) return WEBP_MUX_MEMORY_ERROR;
 
-  PutLE24(frame_frgm_bytes + 0, info->x_offset / 2);
-  PutLE24(frame_frgm_bytes + 3, info->y_offset / 2);
+  PutLE24(frame_tile_bytes + 0, x_offset / 2);
+  PutLE24(frame_tile_bytes + 3, y_offset / 2);
 
   if (is_frame) {
-    PutLE24(frame_frgm_bytes + 6, width - 1);
-    PutLE24(frame_frgm_bytes + 9, height - 1);
-    PutLE24(frame_frgm_bytes + 12, info->duration);
-    frame_frgm_bytes[15] =
-        (info->blend_method == WEBP_MUX_NO_BLEND ? 2 : 0) |
-        (info->dispose_method == WEBP_MUX_DISPOSE_BACKGROUND ? 1 : 0);
+    PutLE24(frame_tile_bytes + 6, width - 1);
+    PutLE24(frame_tile_bytes + 9, height - 1);
+    PutLE24(frame_tile_bytes + 12, duration - 1);
   }
 
-  frame_frgm->bytes = frame_frgm_bytes;
-  frame_frgm->size = frame_frgm_size;
+  frame_tile->bytes_ = frame_tile_bytes;
+  frame_tile->size_ = frame_tile_size;
   return WEBP_MUX_OK;
 }
 
@@ -129,8 +149,8 @@ static WebPMuxError GetImageData(const WebPData* const bitstream,
                                  WebPData* const image, WebPData* const alpha,
                                  int* const is_lossless) {
   WebPDataInit(alpha);  // Default: no alpha.
-  if (bitstream->size < TAG_SIZE ||
-      memcmp(bitstream->bytes, "RIFF", TAG_SIZE)) {
+  if (bitstream->size_ < TAG_SIZE ||
+      memcmp(bitstream->bytes_, "RIFF", TAG_SIZE)) {
     // It is NOT webp file data. Return input data as is.
     *image = *bitstream;
   } else {
@@ -146,7 +166,7 @@ static WebPMuxError GetImageData(const WebPData* const bitstream,
     }
     WebPMuxDelete(mux);
   }
-  *is_lossless = VP8LCheckSignature(image->bytes, image->size);
+  *is_lossless = VP8LCheckSignature(image->bytes_, image->size_);
   return WEBP_MUX_OK;
 }
 
@@ -165,168 +185,204 @@ static WebPMuxError DeleteChunks(WebPChunk** chunk_list, uint32_t tag) {
   return err;
 }
 
-static WebPMuxError MuxDeleteAllNamedData(WebPMux* const mux, uint32_t tag) {
-  const WebPChunkId id = ChunkGetIdFromTag(tag);
-  assert(mux != NULL);
+static WebPMuxError MuxDeleteAllNamedData(WebPMux* const mux, CHUNK_INDEX idx) {
+  const WebPChunkId id = kChunks[idx].id;
+  WebPChunk** chunk_list;
+
+  if (mux == NULL) return WEBP_MUX_INVALID_ARGUMENT;
   if (IsWPI(id)) return WEBP_MUX_INVALID_ARGUMENT;
-  return DeleteChunks(MuxGetChunkListFromId(mux, id), tag);
+
+  chunk_list = MuxGetChunkListFromId(mux, id);
+  if (chunk_list == NULL) return WEBP_MUX_INVALID_ARGUMENT;
+
+  return DeleteChunks(chunk_list, kChunks[idx].tag);
+}
+
+static WebPMuxError DeleteLoopCount(WebPMux* const mux) {
+  return MuxDeleteAllNamedData(mux, IDX_LOOP);
 }
 
 //------------------------------------------------------------------------------
 // Set API(s).
 
-WebPMuxError WebPMuxSetChunk(WebPMux* mux, const char fourcc[4],
-                             const WebPData* chunk_data, int copy_data) {
-  uint32_t tag;
+WebPMuxError WebPMuxSetImage(WebPMux* mux,
+                             const WebPData* bitstream, int copy_data) {
   WebPMuxError err;
-  if (mux == NULL || fourcc == NULL || chunk_data == NULL ||
-      chunk_data->bytes == NULL || chunk_data->size > MAX_CHUNK_PAYLOAD) {
+  WebPChunk chunk;
+  WebPMuxImage wpi;
+  WebPData image;
+  WebPData alpha;
+  int is_lossless;
+  int image_tag;
+
+  if (mux == NULL || bitstream == NULL || bitstream->bytes_ == NULL ||
+      bitstream->size_ > MAX_CHUNK_PAYLOAD) {
     return WEBP_MUX_INVALID_ARGUMENT;
   }
-  tag = ChunkGetTagFromFourCC(fourcc);
 
-  // Delete existing chunk(s) with the same 'fourcc'.
-  err = MuxDeleteAllNamedData(mux, tag);
-  if (err != WEBP_MUX_OK && err != WEBP_MUX_NOT_FOUND) return err;
+  // If given data is for a whole webp file,
+  // extract only the VP8/VP8L data from it.
+  err = GetImageData(bitstream, &image, &alpha, &is_lossless);
+  if (err != WEBP_MUX_OK) return err;
+  image_tag = is_lossless ? kChunks[IDX_VP8L].tag : kChunks[IDX_VP8].tag;
 
-  // Add the given chunk.
-  return MuxSet(mux, tag, 1, chunk_data, copy_data);
-}
+  // Delete the existing images.
+  MuxImageDeleteAll(&mux->images_);
 
-// Creates a chunk from given 'data' and sets it as 1st chunk in 'chunk_list'.
-static WebPMuxError AddDataToChunkList(
-    const WebPData* const data, int copy_data, uint32_t tag,
-    WebPChunk** chunk_list) {
-  WebPChunk chunk;
-  WebPMuxError err;
+  MuxImageInit(&wpi);
+
+  if (alpha.bytes_ != NULL) {  // Add alpha chunk.
+    ChunkInit(&chunk);
+    err = ChunkAssignData(&chunk, &alpha, copy_data, kChunks[IDX_ALPHA].tag);
+    if (err != WEBP_MUX_OK) goto Err;
+    err = ChunkSetNth(&chunk, &wpi.alpha_, 1);
+    if (err != WEBP_MUX_OK) goto Err;
+  }
+
+  // Add image chunk.
   ChunkInit(&chunk);
-  err = ChunkAssignData(&chunk, data, copy_data, tag);
+  err = ChunkAssignData(&chunk, &image, copy_data, image_tag);
   if (err != WEBP_MUX_OK) goto Err;
-  err = ChunkSetNth(&chunk, chunk_list, 1);
+  err = ChunkSetNth(&chunk, &wpi.img_, 1);
   if (err != WEBP_MUX_OK) goto Err;
+
+  // Add this image to mux.
+  err = MuxImagePush(&wpi, &mux->images_);
+  if (err != WEBP_MUX_OK) goto Err;
+
+  // All OK.
   return WEBP_MUX_OK;
+
  Err:
+  // Something bad happened.
   ChunkRelease(&chunk);
+  MuxImageRelease(&wpi);
   return err;
 }
 
-// Extracts image & alpha data from the given bitstream and then sets wpi.alpha_
-// and wpi.img_ appropriately.
-static WebPMuxError SetAlphaAndImageChunks(
-    const WebPData* const bitstream, int copy_data, WebPMuxImage* const wpi) {
-  int is_lossless = 0;
-  WebPData image, alpha;
-  WebPMuxError err = GetImageData(bitstream, &image, &alpha, &is_lossless);
-  const int image_tag =
-      is_lossless ? kChunks[IDX_VP8L].tag : kChunks[IDX_VP8].tag;
-  if (err != WEBP_MUX_OK) return err;
-  if (alpha.bytes != NULL) {
-    err = AddDataToChunkList(&alpha, copy_data, kChunks[IDX_ALPHA].tag,
-                             &wpi->alpha_);
-    if (err != WEBP_MUX_OK) return err;
+WebPMuxError WebPMuxSetMetadata(WebPMux* mux, const WebPData* metadata,
+                                int copy_data) {
+  WebPMuxError err;
+
+  if (mux == NULL || metadata == NULL || metadata->bytes_ == NULL ||
+      metadata->size_ > MAX_CHUNK_PAYLOAD) {
+    return WEBP_MUX_INVALID_ARGUMENT;
   }
-  err = AddDataToChunkList(&image, copy_data, image_tag, &wpi->img_);
-  if (err != WEBP_MUX_OK) return err;
-  return MuxImageFinalize(wpi) ? WEBP_MUX_OK : WEBP_MUX_INVALID_ARGUMENT;
+
+  // Delete the existing metadata chunk(s).
+  err = WebPMuxDeleteMetadata(mux);
+  if (err != WEBP_MUX_OK && err != WEBP_MUX_NOT_FOUND) return err;
+
+  // Add the given metadata chunk.
+  return MuxSet(mux, IDX_META, 1, metadata, copy_data);
 }
 
-WebPMuxError WebPMuxSetImage(WebPMux* mux, const WebPData* bitstream,
-                             int copy_data) {
-  WebPMuxImage wpi;
+WebPMuxError WebPMuxSetColorProfile(WebPMux* mux, const WebPData* color_profile,
+                                    int copy_data) {
   WebPMuxError err;
 
-  // Sanity checks.
-  if (mux == NULL || bitstream == NULL || bitstream->bytes == NULL ||
-      bitstream->size > MAX_CHUNK_PAYLOAD) {
+  if (mux == NULL || color_profile == NULL || color_profile->bytes_ == NULL ||
+      color_profile->size_ > MAX_CHUNK_PAYLOAD) {
     return WEBP_MUX_INVALID_ARGUMENT;
   }
 
-  if (mux->images_ != NULL) {
-    // Only one 'simple image' can be added in mux. So, remove present images.
-    DeleteAllImages(&mux->images_);
-  }
+  // Delete the existing ICCP chunk(s).
+  err = WebPMuxDeleteColorProfile(mux);
+  if (err != WEBP_MUX_OK && err != WEBP_MUX_NOT_FOUND) return err;
 
-  MuxImageInit(&wpi);
-  err = SetAlphaAndImageChunks(bitstream, copy_data, &wpi);
-  if (err != WEBP_MUX_OK) goto Err;
+  // Add the given ICCP chunk.
+  return MuxSet(mux, IDX_ICCP, 1, color_profile, copy_data);
+}
 
-  // Add this WebPMuxImage to mux.
-  err = MuxImagePush(&wpi, &mux->images_);
-  if (err != WEBP_MUX_OK) goto Err;
+WebPMuxError WebPMuxSetLoopCount(WebPMux* mux, int loop_count) {
+  WebPMuxError err;
+  uint8_t* data = NULL;
 
-  // All is well.
-  return WEBP_MUX_OK;
+  if (mux == NULL || loop_count < 0) return WEBP_MUX_INVALID_ARGUMENT;
+  if (loop_count >= MAX_LOOP_COUNT) return WEBP_MUX_INVALID_ARGUMENT;
 
- Err:  // Something bad happened.
-  MuxImageRelease(&wpi);
+  // Delete the existing LOOP chunk(s).
+  err = DeleteLoopCount(mux);
+  if (err != WEBP_MUX_OK && err != WEBP_MUX_NOT_FOUND) return err;
+
+  // Add the given loop count.
+  data = (uint8_t*)malloc(kChunks[IDX_LOOP].size);
+  if (data == NULL) return WEBP_MUX_MEMORY_ERROR;
+
+  PutLE16(data, loop_count);
+  err = MuxAddChunk(mux, 1, kChunks[IDX_LOOP].tag, data,
+                    kChunks[IDX_LOOP].size, 1);
+  free(data);
   return err;
 }
 
-WebPMuxError WebPMuxPushFrame(WebPMux* mux, const WebPMuxFrameInfo* frame,
-                              int copy_data) {
+static WebPMuxError MuxPushFrameTileInternal(
+    WebPMux* const mux, const WebPData* const bitstream, int x_offset,
+    int y_offset, int duration, int copy_data, uint32_t tag) {
+  WebPChunk chunk;
+  WebPData image;
+  WebPData alpha;
   WebPMuxImage wpi;
   WebPMuxError err;
-  int is_frame;
-  const WebPData* const bitstream = &frame->bitstream;
+  WebPData frame_tile;
+  const int is_frame = (tag == kChunks[IDX_FRAME].tag) ? 1 : 0;
+  int is_lossless;
+  int image_tag;
 
   // Sanity checks.
-  if (mux == NULL || frame == NULL) return WEBP_MUX_INVALID_ARGUMENT;
-
-  is_frame = (frame->id == WEBP_CHUNK_ANMF);
-  if (!(is_frame || (frame->id == WEBP_CHUNK_FRGM))) {
+  if (mux == NULL || bitstream == NULL || bitstream->bytes_ == NULL ||
+      bitstream->size_ > MAX_CHUNK_PAYLOAD) {
     return WEBP_MUX_INVALID_ARGUMENT;
   }
-#ifndef WEBP_EXPERIMENTAL_FEATURES
-  if (frame->id == WEBP_CHUNK_FRGM) {     // disabled for now.
+  if (x_offset < 0 || x_offset >= MAX_POSITION_OFFSET ||
+      y_offset < 0 || y_offset >= MAX_POSITION_OFFSET ||
+      duration <= 0 || duration > MAX_DURATION) {
     return WEBP_MUX_INVALID_ARGUMENT;
   }
-#endif
 
-  if (bitstream->bytes == NULL || bitstream->size > MAX_CHUNK_PAYLOAD) {
-    return WEBP_MUX_INVALID_ARGUMENT;
-  }
+  // Snap offsets to even positions.
+  x_offset &= ~1;
+  y_offset &= ~1;
 
-  if (mux->images_ != NULL) {
-    const WebPMuxImage* const image = mux->images_;
-    const uint32_t image_id = (image->header_ != NULL) ?
-        ChunkGetIdFromTag(image->header_->tag_) : WEBP_CHUNK_IMAGE;
-    if (image_id != frame->id) {
-      return WEBP_MUX_INVALID_ARGUMENT;  // Conflicting frame types.
-    }
-  }
+  // If given data is for a whole webp file,
+  // extract only the VP8/VP8L data from it.
+  err = GetImageData(bitstream, &image, &alpha, &is_lossless);
+  if (err != WEBP_MUX_OK) return err;
+  image_tag = is_lossless ? kChunks[IDX_VP8L].tag : kChunks[IDX_VP8].tag;
 
+  WebPDataInit(&frame_tile);
+  ChunkInit(&chunk);
   MuxImageInit(&wpi);
-  err = SetAlphaAndImageChunks(bitstream, copy_data, &wpi);
-  if (err != WEBP_MUX_OK) goto Err;
-  assert(wpi.img_ != NULL);  // As SetAlphaAndImageChunks() was successful.
-
-  {
-    WebPData frame_frgm;
-    const uint32_t tag = kChunks[is_frame ? IDX_ANMF : IDX_FRGM].tag;
-    WebPMuxFrameInfo tmp = *frame;
-    tmp.x_offset &= ~1;  // Snap offsets to even.
-    tmp.y_offset &= ~1;
-    if (!is_frame) {  // Reset unused values.
-      tmp.duration = 1;
-      tmp.dispose_method = WEBP_MUX_DISPOSE_NONE;
-      tmp.blend_method = WEBP_MUX_BLEND;
-    }
-    if (tmp.x_offset < 0 || tmp.x_offset >= MAX_POSITION_OFFSET ||
-        tmp.y_offset < 0 || tmp.y_offset >= MAX_POSITION_OFFSET ||
-        (tmp.duration < 0 || tmp.duration >= MAX_DURATION) ||
-        tmp.dispose_method != (tmp.dispose_method & 1)) {
-      err = WEBP_MUX_INVALID_ARGUMENT;
-      goto Err;
-    }
-    err = CreateFrameFragmentData(wpi.width_, wpi.height_, &tmp, is_frame,
-                                  &frame_frgm);
+
+  if (alpha.bytes_ != NULL) {
+    // Add alpha chunk.
+    err = ChunkAssignData(&chunk, &alpha, copy_data, kChunks[IDX_ALPHA].tag);
     if (err != WEBP_MUX_OK) goto Err;
-    // Add frame/fragment chunk (with copy_data = 1).
-    err = AddDataToChunkList(&frame_frgm, 1, tag, &wpi.header_);
-    WebPDataClear(&frame_frgm);  // frame_frgm owned by wpi.header_ now.
+    err = ChunkSetNth(&chunk, &wpi.alpha_, 1);
     if (err != WEBP_MUX_OK) goto Err;
+    ChunkInit(&chunk);  // chunk owned by wpi.alpha_ now.
   }
 
+  // Add image chunk.
+  err = ChunkAssignData(&chunk, &image, copy_data, image_tag);
+  if (err != WEBP_MUX_OK) goto Err;
+  err = ChunkSetNth(&chunk, &wpi.img_, 1);
+  if (err != WEBP_MUX_OK) goto Err;
+  ChunkInit(&chunk);  // chunk owned by wpi.img_ now.
+
+  // Create frame/tile data.
+  err = CreateFrameTileData(&image, x_offset, y_offset, duration, is_lossless,
+                            is_frame, &frame_tile);
+  if (err != WEBP_MUX_OK) goto Err;
+
+  // Add frame/tile chunk (with copy_data = 1).
+  err = ChunkAssignData(&chunk, &frame_tile, 1, tag);
+  if (err != WEBP_MUX_OK) goto Err;
+  WebPDataClear(&frame_tile);
+  err = ChunkSetNth(&chunk, &wpi.header_, 1);
+  if (err != WEBP_MUX_OK) goto Err;
+  ChunkInit(&chunk);  // chunk owned by wpi.header_ now.
+
   // Add this WebPMuxImage to mux.
   err = MuxImagePush(&wpi, &mux->images_);
   if (err != WEBP_MUX_OK) goto Err;
@@ -335,82 +391,123 @@ WebPMuxError WebPMuxPushFrame(WebPMux* mux, const WebPMuxFrameInfo* frame,
   return WEBP_MUX_OK;
 
  Err:  // Something bad happened.
+  WebPDataClear(&frame_tile);
+  ChunkRelease(&chunk);
   MuxImageRelease(&wpi);
   return err;
 }
 
-WebPMuxError WebPMuxSetAnimationParams(WebPMux* mux,
-                                       const WebPMuxAnimParams* params) {
+WebPMuxError WebPMuxPushFrame(WebPMux* mux, const WebPData* bitstream,
+                              int x_offset, int y_offset,
+                              int duration, int copy_data) {
+  return MuxPushFrameTileInternal(mux, bitstream, x_offset, y_offset,
+                                  duration, copy_data, kChunks[IDX_FRAME].tag);
+}
+
+WebPMuxError WebPMuxPushTile(WebPMux* mux, const WebPData* bitstream,
+                             int x_offset, int y_offset,
+                             int copy_data) {
+  return MuxPushFrameTileInternal(mux, bitstream, x_offset, y_offset,
+                                  1 /* unused duration */, copy_data,
+                                  kChunks[IDX_TILE].tag);
+}
+
+//------------------------------------------------------------------------------
+// Delete API(s).
+
+WebPMuxError WebPMuxDeleteImage(WebPMux* mux) {
   WebPMuxError err;
-  uint8_t data[ANIM_CHUNK_SIZE];
-  const WebPData anim = { data, ANIM_CHUNK_SIZE };
 
-  if (mux == NULL || params == NULL) return WEBP_MUX_INVALID_ARGUMENT;
-  if (params->loop_count < 0 || params->loop_count >= MAX_LOOP_COUNT) {
-    return WEBP_MUX_INVALID_ARGUMENT;
-  }
+  if (mux == NULL) return WEBP_MUX_INVALID_ARGUMENT;
 
-  // Delete any existing ANIM chunk(s).
-  err = MuxDeleteAllNamedData(mux, kChunks[IDX_ANIM].tag);
-  if (err != WEBP_MUX_OK && err != WEBP_MUX_NOT_FOUND) return err;
+  err = MuxValidateForImage(mux);
+  if (err != WEBP_MUX_OK) return err;
 
-  // Set the animation parameters.
-  PutLE32(data, params->bgcolor);
-  PutLE16(data + 4, params->loop_count);
-  return MuxSet(mux, kChunks[IDX_ANIM].tag, 1, &anim, 1);
+  // All well, delete image.
+  MuxImageDeleteAll(&mux->images_);
+  return WEBP_MUX_OK;
 }
 
-//------------------------------------------------------------------------------
-// Delete API(s).
+WebPMuxError WebPMuxDeleteMetadata(WebPMux* mux) {
+  return MuxDeleteAllNamedData(mux, IDX_META);
+}
 
-WebPMuxError WebPMuxDeleteChunk(WebPMux* mux, const char fourcc[4]) {
-  if (mux == NULL || fourcc == NULL) return WEBP_MUX_INVALID_ARGUMENT;
-  return MuxDeleteAllNamedData(mux, ChunkGetTagFromFourCC(fourcc));
+WebPMuxError WebPMuxDeleteColorProfile(WebPMux* mux) {
+  return MuxDeleteAllNamedData(mux, IDX_ICCP);
 }
 
-WebPMuxError WebPMuxDeleteFrame(WebPMux* mux, uint32_t nth) {
+static WebPMuxError DeleteFrameTileInternal(WebPMux* const mux, uint32_t nth,
+                                            CHUNK_INDEX idx) {
+  const WebPChunkId id = kChunks[idx].id;
   if (mux == NULL) return WEBP_MUX_INVALID_ARGUMENT;
-  return MuxImageDeleteNth(&mux->images_, nth);
+
+  assert(idx == IDX_FRAME || idx == IDX_TILE);
+  return MuxImageDeleteNth(&mux->images_, nth, id);
+}
+
+WebPMuxError WebPMuxDeleteFrame(WebPMux* mux, uint32_t nth) {
+  return DeleteFrameTileInternal(mux, nth, IDX_FRAME);
+}
+
+WebPMuxError WebPMuxDeleteTile(WebPMux* mux, uint32_t nth) {
+  return DeleteFrameTileInternal(mux, nth, IDX_TILE);
 }
 
 //------------------------------------------------------------------------------
 // Assembly of the WebP RIFF file.
 
-static WebPMuxError GetFrameFragmentInfo(
-    const WebPChunk* const frame_frgm_chunk,
-    int* const x_offset, int* const y_offset, int* const duration) {
-  const uint32_t tag = frame_frgm_chunk->tag_;
-  const int is_frame = (tag == kChunks[IDX_ANMF].tag);
-  const WebPData* const data = &frame_frgm_chunk->data_;
+static WebPMuxError GetFrameTileInfo(const WebPChunk* const frame_tile_chunk,
+                                     int* const x_offset, int* const y_offset,
+                                     int* const duration) {
+  const uint32_t tag = frame_tile_chunk->tag_;
+  const int is_frame = (tag == kChunks[IDX_FRAME].tag);
+  const WebPData* const data = &frame_tile_chunk->data_;
   const size_t expected_data_size =
-      is_frame ? ANMF_CHUNK_SIZE : FRGM_CHUNK_SIZE;
-  assert(frame_frgm_chunk != NULL);
-  assert(tag == kChunks[IDX_ANMF].tag || tag ==  kChunks[IDX_FRGM].tag);
-  if (data->size != expected_data_size) return WEBP_MUX_INVALID_ARGUMENT;
-
-  *x_offset = 2 * GetLE24(data->bytes + 0);
-  *y_offset = 2 * GetLE24(data->bytes + 3);
-  if (is_frame) *duration = GetLE24(data->bytes + 12);
+      is_frame ? FRAME_CHUNK_SIZE : TILE_CHUNK_SIZE;
+  assert(frame_tile_chunk != NULL);
+  assert(tag == kChunks[IDX_FRAME].tag || tag ==  kChunks[IDX_TILE].tag);
+  if (data->size_ != expected_data_size) return WEBP_MUX_INVALID_ARGUMENT;
+
+  *x_offset = 2 * GetLE24(data->bytes_ + 0);
+  *y_offset = 2 * GetLE24(data->bytes_ + 3);
+  if (is_frame) *duration = 1 + GetLE24(data->bytes_ + 12);
   return WEBP_MUX_OK;
 }
 
+WebPMuxError MuxGetImageWidthHeight(const WebPChunk* const image_chunk,
+                                    int* const width, int* const height) {
+  uint32_t tag;
+  const WebPData* data;
+  int w, h;
+  int ok;
+  // Validate the pointer before its first dereference (not after).
+  assert(image_chunk != NULL);
+  tag = image_chunk->tag_;
+  data = &image_chunk->data_;
+  assert(tag == kChunks[IDX_VP8].tag || tag == kChunks[IDX_VP8L].tag);
+  ok = (tag == kChunks[IDX_VP8].tag) ?
+      VP8GetInfo(data->bytes_, data->size_, data->size_, &w, &h) :
+      VP8LGetInfo(data->bytes_, data->size_, &w, &h, NULL);
+  if (!ok) return WEBP_MUX_BAD_DATA;  // 'width'/'height' stay untouched.
+  *width = w;
+  *height = h;
+  return WEBP_MUX_OK;
+}
+
 static WebPMuxError GetImageInfo(const WebPMuxImage* const wpi,
                                  int* const x_offset, int* const y_offset,
                                  int* const duration,
                                  int* const width, int* const height) {
-  const WebPChunk* const frame_frgm_chunk = wpi->header_;
-  WebPMuxError err;
-  assert(wpi != NULL);
-  assert(frame_frgm_chunk != NULL);
+  const WebPChunk* const image_chunk = wpi->img_;
+  const WebPChunk* const frame_tile_chunk = wpi->header_;
 
-  // Get offsets and duration from ANMF/FRGM chunk.
-  err = GetFrameFragmentInfo(frame_frgm_chunk, x_offset, y_offset, duration);
+  // Get offsets and duration from FRM/TILE chunk.
+  const WebPMuxError err =
+      GetFrameTileInfo(frame_tile_chunk, x_offset, y_offset, duration);
   if (err != WEBP_MUX_OK) return err;
 
   // Get width and height from VP8/VP8L chunk.
-  if (width != NULL) *width = wpi->width_;
-  if (height != NULL) *height = wpi->height_;
-  return WEBP_MUX_OK;
+  return MuxGetImageWidthHeight(image_chunk, width, height);
 }
 
 static WebPMuxError GetImageCanvasWidthHeight(
@@ -424,15 +521,13 @@ static WebPMuxError GetImageCanvasWidthHeight(
   assert(wpi != NULL);
   assert(wpi->img_ != NULL);
 
-  if (wpi->next_ != NULL) {
+  if (wpi->next_) {
     int max_x = 0;
     int max_y = 0;
     int64_t image_area = 0;
-    // if we have a chain of wpi's, header_ is necessarily set
-    assert(wpi->header_ != NULL);
-    // Aggregate the bounding box for animation frames & fragmented images.
+    // Aggregate the bounding box for animation frames & tiled images.
     for (; wpi != NULL; wpi = wpi->next_) {
-      int x_offset = 0, y_offset = 0, duration = 0, w = 0, h = 0;
+      int x_offset = 0, y_offset = 0, duration = 0, w = 0, h = 0;
       const WebPMuxError err = GetImageInfo(wpi, &x_offset, &y_offset,
                                             &duration, &w, &h);
       const int max_x_pos = x_offset + w;
@@ -447,19 +542,23 @@ static WebPMuxError GetImageCanvasWidthHeight(
     }
     *width = max_x;
     *height = max_y;
-    // Crude check to validate that there are no image overlaps/holes for
-    // fragmented images. Check that the aggregated image area for individual
-    // fragments exactly matches the image area of the constructed canvas.
-    // However, the area-match is necessary but not sufficient condition.
-    if ((flags & FRAGMENTS_FLAG) && (image_area != (max_x * max_y))) {
+    // Crude check to validate that there are no image overlaps/holes for tile
+    // images. Check that the aggregated image area for individual tiles exactly
+    // matches the image area of the constructed canvas. However, the area-match
+    // is a necessary but not sufficient condition.
+    if ((flags & TILE_FLAG) && (image_area != (max_x * max_y))) {
       *width = 0;
       *height = 0;
       return WEBP_MUX_INVALID_ARGUMENT;
     }
   } else {
-    // For a single image, canvas dimensions are same as image dimensions.
-    *width = wpi->width_;
-    *height = wpi->height_;
+    // For a single image, extract the width & height from VP8/VP8L image-data.
+    int w, h;
+    const WebPChunk* const image_chunk = wpi->img_;
+    const WebPMuxError err = MuxGetImageWidthHeight(image_chunk, &w, &h);
+    if (err != WEBP_MUX_OK) return err;
+    *width = w;
+    *height = h;
   }
   return WEBP_MUX_OK;
 }
@@ -475,40 +574,40 @@ static WebPMuxError CreateVP8XChunk(WebPMux* const mux) {
   int width = 0;
   int height = 0;
   uint8_t data[VP8X_CHUNK_SIZE];
-  const WebPData vp8x = { data, VP8X_CHUNK_SIZE };
+  const size_t data_size = VP8X_CHUNK_SIZE;
   const WebPMuxImage* images = NULL;
 
   assert(mux != NULL);
   images = mux->images_;  // First image.
   if (images == NULL || images->img_ == NULL ||
-      images->img_->data_.bytes == NULL) {
+      images->img_->data_.bytes_ == NULL) {
     return WEBP_MUX_INVALID_ARGUMENT;
   }
 
   // If VP8X chunk(s) is(are) already present, remove them (and later add new
   // VP8X chunk with updated flags).
-  err = MuxDeleteAllNamedData(mux, kChunks[IDX_VP8X].tag);
+  err = MuxDeleteAllNamedData(mux, IDX_VP8X);
   if (err != WEBP_MUX_OK && err != WEBP_MUX_NOT_FOUND) return err;
 
   // Set flags.
-  if (mux->iccp_ != NULL && mux->iccp_->data_.bytes != NULL) {
+  if (mux->iccp_ != NULL && mux->iccp_->data_.bytes_ != NULL) {
     flags |= ICCP_FLAG;
   }
-  if (mux->exif_ != NULL && mux->exif_->data_.bytes != NULL) {
-    flags |= EXIF_FLAG;
-  }
-  if (mux->xmp_ != NULL && mux->xmp_->data_.bytes != NULL) {
-    flags |= XMP_FLAG;
+
+  if (mux->meta_ != NULL && mux->meta_->data_.bytes_ != NULL) {
+    flags |= META_FLAG;
   }
+
   if (images->header_ != NULL) {
-    if (images->header_->tag_ == kChunks[IDX_FRGM].tag) {
-      // This is a fragmented image.
-      flags |= FRAGMENTS_FLAG;
-    } else if (images->header_->tag_ == kChunks[IDX_ANMF].tag) {
+    if (images->header_->tag_ == kChunks[IDX_TILE].tag) {
+      // This is a tiled image.
+      flags |= TILE_FLAG;
+    } else if (images->header_->tag_ == kChunks[IDX_FRAME].tag) {
       // This is an image with animation.
       flags |= ANIMATION_FLAG;
     }
   }
+
   if (MuxImageCount(images, WEBP_CHUNK_ALPHA) > 0) {
     flags |= ALPHA_FLAG;  // Some images have an alpha channel.
   }
@@ -528,8 +627,9 @@ static WebPMuxError CreateVP8XChunk(WebPMux* const mux) {
     return WEBP_MUX_INVALID_ARGUMENT;
   }
 
-  if (MuxHasAlpha(images)) {
-    // This means some frames explicitly/implicitly contain alpha.
+  if (MuxHasLosslessImages(images)) {
+    // We have a file with a VP8X chunk having some lossless images.
+    // As lossless images implicitly contain alpha, force ALPHA_FLAG to be true.
     // Note: This 'flags' update must NOT be done for a lossless image
     // without a VP8X chunk!
     flags |= ALPHA_FLAG;
@@ -539,85 +639,43 @@ static WebPMuxError CreateVP8XChunk(WebPMux* const mux) {
   PutLE24(data + 4, width - 1);   // canvas width.
   PutLE24(data + 7, height - 1);  // canvas height.
 
-  return MuxSet(mux, kChunks[IDX_VP8X].tag, 1, &vp8x, 1);
-}
-
-// Cleans up 'mux' by removing any unnecessary chunks.
-static WebPMuxError MuxCleanup(WebPMux* const mux) {
-  int num_frames;
-  int num_fragments;
-  int num_anim_chunks;
-
-  // If we have an image with single fragment or frame, convert it to a
-  // non-animated non-fragmented image (to avoid writing FRGM/ANMF chunk
-  // unnecessarily).
-  WebPMuxError err = WebPMuxNumChunks(mux, kChunks[IDX_ANMF].id, &num_frames);
-  if (err != WEBP_MUX_OK) return err;
-  err = WebPMuxNumChunks(mux, kChunks[IDX_FRGM].id, &num_fragments);
-  if (err != WEBP_MUX_OK) return err;
-  if (num_frames == 1 || num_fragments == 1) {
-    WebPMuxImage* frame_frag;
-    err = MuxImageGetNth((const WebPMuxImage**)&mux->images_, 1, &frame_frag);
-    assert(err == WEBP_MUX_OK);  // We know that one frame/fragment does exist.
-    if (frame_frag->header_ != NULL) {
-      assert(frame_frag->header_->tag_ == kChunks[IDX_ANMF].tag ||
-             frame_frag->header_->tag_ == kChunks[IDX_FRGM].tag);
-      ChunkDelete(frame_frag->header_);  // Removes ANMF/FRGM chunk.
-      frame_frag->header_ = NULL;
-    }
-    num_frames = 0;
-    num_fragments = 0;
-  }
-  // Remove ANIM chunk if this is a non-animated image.
-  err = WebPMuxNumChunks(mux, kChunks[IDX_ANIM].id, &num_anim_chunks);
-  if (err != WEBP_MUX_OK) return err;
-  if (num_anim_chunks >= 1 && num_frames == 0) {
-    err = MuxDeleteAllNamedData(mux, kChunks[IDX_ANIM].tag);
-    if (err != WEBP_MUX_OK) return err;
-  }
-  return WEBP_MUX_OK;
-}
-
-// Total size of a list of images.
-static size_t ImageListDiskSize(const WebPMuxImage* wpi_list) {
-  size_t size = 0;
-  while (wpi_list != NULL) {
-    size += MuxImageDiskSize(wpi_list);
-    wpi_list = wpi_list->next_;
-  }
-  return size;
-}
-
-// Write out the given list of images into 'dst'.
-static uint8_t* ImageListEmit(const WebPMuxImage* wpi_list, uint8_t* dst) {
-  while (wpi_list != NULL) {
-    dst = MuxImageEmit(wpi_list, dst);
-    wpi_list = wpi_list->next_;
-  }
-  return dst;
+  err = MuxAddChunk(mux, 1, kChunks[IDX_VP8X].tag, data, data_size, 1);
+  return err;
 }
 
 WebPMuxError WebPMuxAssemble(WebPMux* mux, WebPData* assembled_data) {
   size_t size = 0;
   uint8_t* data = NULL;
   uint8_t* dst = NULL;
+  int num_frames;
+  int num_loop_chunks;
   WebPMuxError err;
 
   if (mux == NULL || assembled_data == NULL) {
     return WEBP_MUX_INVALID_ARGUMENT;
   }
 
-  // Finalize mux.
-  err = MuxCleanup(mux);
+  // Remove LOOP chunk if unnecessary.
+  err = WebPMuxNumChunks(mux, kChunks[IDX_LOOP].id, &num_loop_chunks);
   if (err != WEBP_MUX_OK) return err;
+  if (num_loop_chunks >= 1) {
+    err = WebPMuxNumChunks(mux, kChunks[IDX_FRAME].id, &num_frames);
+    if (err != WEBP_MUX_OK) return err;
+    if (num_frames == 0) {
+      err = DeleteLoopCount(mux);
+      if (err != WEBP_MUX_OK) return err;
+    }
+  }
+
+  // Create VP8X chunk.
   err = CreateVP8XChunk(mux);
   if (err != WEBP_MUX_OK) return err;
 
   // Allocate data.
-  size = ChunkListDiskSize(mux->vp8x_) + ChunkListDiskSize(mux->iccp_)
-       + ChunkListDiskSize(mux->anim_) + ImageListDiskSize(mux->images_)
-       + ChunkListDiskSize(mux->exif_) + ChunkListDiskSize(mux->xmp_)
-       + ChunkListDiskSize(mux->unknown_) + RIFF_HEADER_SIZE;
+  size = ChunksListDiskSize(mux->vp8x_) + ChunksListDiskSize(mux->iccp_)
+       + ChunksListDiskSize(mux->loop_) + MuxImageListDiskSize(mux->images_)
+       + ChunksListDiskSize(mux->meta_) + ChunksListDiskSize(mux->unknown_)
+       + RIFF_HEADER_SIZE;
 
   data = (uint8_t*)malloc(size);
   if (data == NULL) return WEBP_MUX_MEMORY_ERROR;
@@ -626,10 +684,9 @@ WebPMuxError WebPMuxAssemble(WebPMux* mux, WebPData* assembled_data) {
   dst = MuxEmitRiffHeader(data, size);
   dst = ChunkListEmit(mux->vp8x_, dst);
   dst = ChunkListEmit(mux->iccp_, dst);
-  dst = ChunkListEmit(mux->anim_, dst);
-  dst = ImageListEmit(mux->images_, dst);
-  dst = ChunkListEmit(mux->exif_, dst);
-  dst = ChunkListEmit(mux->xmp_, dst);
+  dst = ChunkListEmit(mux->loop_, dst);
+  dst = MuxImageListEmit(mux->images_, dst);
+  dst = ChunkListEmit(mux->meta_, dst);
   dst = ChunkListEmit(mux->unknown_, dst);
   assert(dst == data + size);
 
@@ -641,12 +698,15 @@ WebPMuxError WebPMuxAssemble(WebPMux* mux, WebPData* assembled_data) {
     size = 0;
   }
 
-  // Finalize data.
-  assembled_data->bytes = data;
-  assembled_data->size = size;
+  // Finalize.
+  assembled_data->bytes_ = data;
+  assembled_data->size_ = size;
 
   return err;
 }
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/mux/muxi.h b/drivers/webp/mux/muxi.h
index 277d5fba1b..edd8c368cd 100644
--- a/drivers/webp/mux/muxi.h
+++ b/drivers/webp/mux/muxi.h
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Internal header for mux library.
@@ -17,41 +15,34 @@
 #include <stdlib.h>
 #include "../dec/vp8i.h"
 #include "../dec/vp8li.h"
+#include "../webp/format_constants.h"
 #include "../webp/mux.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
 //------------------------------------------------------------------------------
 // Defines and constants.
 
-#define MUX_MAJ_VERSION 0
-#define MUX_MIN_VERSION 2
-#define MUX_REV_VERSION 0
-
 // Chunk object.
 typedef struct WebPChunk WebPChunk;
 struct WebPChunk {
   uint32_t        tag_;
   int             owner_;  // True if *data_ memory is owned internally.
-                           // VP8X, ANIM, and other internally created chunks
-                           // like ANMF/FRGM are always owned.
+                           // VP8X, Loop, and other internally created chunks
+                           // like frame/tile are always owned.
   WebPData        data_;
   WebPChunk*      next_;
 };
 
-// MuxImage object. Store a full WebP image (including ANMF/FRGM chunk, ALPH
+// MuxImage object. Store a full webp image (including frame/tile chunk, alpha
 // chunk and VP8/VP8L chunk),
 typedef struct WebPMuxImage WebPMuxImage;
 struct WebPMuxImage {
-  WebPChunk*  header_;      // Corresponds to WEBP_CHUNK_ANMF/WEBP_CHUNK_FRGM.
+  WebPChunk*  header_;      // Corresponds to WEBP_CHUNK_FRAME/WEBP_CHUNK_TILE.
   WebPChunk*  alpha_;       // Corresponds to WEBP_CHUNK_ALPHA.
   WebPChunk*  img_;         // Corresponds to WEBP_CHUNK_IMAGE.
-  WebPChunk*  unknown_;     // Corresponds to WEBP_CHUNK_UNKNOWN.
-  int         width_;
-  int         height_;
-  int         has_alpha_;   // Through ALPH chunk or as part of VP8L.
   int         is_partial_;  // True if only some of the chunks are filled.
   WebPMuxImage* next_;
 };
@@ -60,9 +51,8 @@ struct WebPMuxImage {
 struct WebPMux {
   WebPMuxImage*   images_;
   WebPChunk*      iccp_;
-  WebPChunk*      exif_;
-  WebPChunk*      xmp_;
-  WebPChunk*      anim_;
+  WebPChunk*      meta_;
+  WebPChunk*      loop_;
   WebPChunk*      vp8x_;
 
   WebPChunk*  unknown_;
@@ -75,14 +65,13 @@ struct WebPMux {
 typedef enum {
   IDX_VP8X = 0,
   IDX_ICCP,
-  IDX_ANIM,
-  IDX_ANMF,
-  IDX_FRGM,
+  IDX_LOOP,
+  IDX_FRAME,
+  IDX_TILE,
   IDX_ALPHA,
   IDX_VP8,
   IDX_VP8L,
-  IDX_EXIF,
-  IDX_XMP,
+  IDX_META,
   IDX_UNKNOWN,
 
   IDX_NIL,
@@ -91,6 +80,8 @@ typedef enum {
 
 #define NIL_TAG 0x00000000u  // To signal void chunk.
 
+#define MKFOURCC(a, b, c, d) ((uint32_t)(a) | (b) << 8 | (c) << 16 | (d) << 24)
+
 typedef struct {
   uint32_t      tag;
   WebPChunkId   id;
@@ -100,23 +91,55 @@ typedef struct {
 extern const ChunkInfo kChunks[IDX_LAST_CHUNK];
 
 //------------------------------------------------------------------------------
+// Helper functions.
+
+// Read 16, 24 or 32 bits stored in little-endian order.
+static WEBP_INLINE int GetLE16(const uint8_t* const data) {
+  return (int)(data[0] << 0) | (data[1] << 8);
+}
+
+static WEBP_INLINE int GetLE24(const uint8_t* const data) {
+  return GetLE16(data) | (data[2] << 16);
+}
+
+static WEBP_INLINE uint32_t GetLE32(const uint8_t* const data) {
+  return (uint32_t)GetLE16(data) | ((uint32_t)GetLE16(data + 2) << 16);
+}
+
+// Store 16, 24 or 32 bits in little-endian order.
+static WEBP_INLINE void PutLE16(uint8_t* const data, int val) {
+  assert(val < (1 << 16));
+  data[0] = (val >> 0);
+  data[1] = (val >> 8);
+}
+
+static WEBP_INLINE void PutLE24(uint8_t* const data, int val) {
+  assert(val < (1 << 24));
+  PutLE16(data, val & 0xffff);
+  data[2] = (val >> 16);
+}
+
+static WEBP_INLINE void PutLE32(uint8_t* const data, uint32_t val) {
+  PutLE16(data, (int)(val & 0xffff));
+  PutLE16(data + 2, (int)(val >> 16));
+}
+
+static WEBP_INLINE size_t SizeWithPadding(size_t chunk_size) {
+  return CHUNK_HEADER_SIZE + ((chunk_size + 1) & ~(size_t)1);
+}
+
+//------------------------------------------------------------------------------
 // Chunk object management.
 
 // Initialize.
 void ChunkInit(WebPChunk* const chunk);
 
-// Get chunk index from chunk tag. Returns IDX_UNKNOWN if not found.
+// Get chunk index from chunk tag. Returns IDX_NIL if not found.
 CHUNK_INDEX ChunkGetIndexFromTag(uint32_t tag);
 
-// Get chunk id from chunk tag. Returns WEBP_CHUNK_UNKNOWN if not found.
+// Get chunk id from chunk tag. Returns WEBP_CHUNK_NIL if not found.
 WebPChunkId ChunkGetIdFromTag(uint32_t tag);
 
-// Convert a fourcc string to a tag.
-uint32_t ChunkGetTagFromFourCC(const char fourcc[4]);
-
-// Get chunk index from fourcc. Returns IDX_UNKNOWN if given fourcc is unknown.
-CHUNK_INDEX ChunkGetIndexFromFourCC(const char fourcc[4]);
-
 // Search for nth chunk with given 'tag' in the chunk list.
 // nth = 0 means "last of the list".
 WebPChunk* ChunkSearchList(WebPChunk* first, uint32_t nth, uint32_t tag);
@@ -127,8 +150,7 @@ WebPMuxError ChunkAssignData(WebPChunk* chunk, const WebPData* const data,
 
 // Sets 'chunk' at nth position in the 'chunk_list'.
 // nth = 0 has the special meaning "last of the list".
-// On success ownership is transferred from 'chunk' to the 'chunk_list'.
-WebPMuxError ChunkSetNth(WebPChunk* chunk, WebPChunk** chunk_list,
+WebPMuxError ChunkSetNth(const WebPChunk* chunk, WebPChunk** chunk_list,
                          uint32_t nth);
 
 // Releases chunk and returns chunk->next_.
@@ -137,27 +159,23 @@ WebPChunk* ChunkRelease(WebPChunk* const chunk);
 // Deletes given chunk & returns chunk->next_.
 WebPChunk* ChunkDelete(WebPChunk* const chunk);
 
-// Deletes all chunks in the given chunk list.
-void ChunkListDelete(WebPChunk** const chunk_list);
-
-// Returns size of the chunk including chunk header and padding byte (if any).
-static WEBP_INLINE size_t SizeWithPadding(size_t chunk_size) {
-  return CHUNK_HEADER_SIZE + ((chunk_size + 1) & ~1U);
-}
-
 // Size of a chunk including header and padding.
 static WEBP_INLINE size_t ChunkDiskSize(const WebPChunk* chunk) {
-  const size_t data_size = chunk->data_.size;
+  const size_t data_size = chunk->data_.size_;
   assert(data_size < MAX_CHUNK_PAYLOAD);
   return SizeWithPadding(data_size);
 }
 
 // Total size of a list of chunks.
-size_t ChunkListDiskSize(const WebPChunk* chunk_list);
+size_t ChunksListDiskSize(const WebPChunk* chunk_list);
 
 // Write out the given list of chunks into 'dst'.
 uint8_t* ChunkListEmit(const WebPChunk* chunk_list, uint8_t* dst);
 
+// Get the width & height of image stored in 'image_chunk'.
+WebPMuxError MuxGetImageWidthHeight(const WebPChunk* const image_chunk,
+                                    int* const width, int* const height);
+
 //------------------------------------------------------------------------------
 // MuxImage object management.
 
@@ -171,59 +189,82 @@ WebPMuxImage* MuxImageRelease(WebPMuxImage* const wpi);
 // 'wpi' can be NULL.
 WebPMuxImage* MuxImageDelete(WebPMuxImage* const wpi);
 
+// Delete all images in 'wpi_list'.
+void MuxImageDeleteAll(WebPMuxImage** const wpi_list);
+
 // Count number of images matching the given tag id in the 'wpi_list'.
-// If id == WEBP_CHUNK_NIL, all images will be matched.
 int MuxImageCount(const WebPMuxImage* wpi_list, WebPChunkId id);
 
-// Update width/height/has_alpha info from chunks within wpi.
-// Also remove ALPH chunk if not needed.
-int MuxImageFinalize(WebPMuxImage* const wpi);
-
 // Check if given ID corresponds to an image related chunk.
 static WEBP_INLINE int IsWPI(WebPChunkId id) {
   switch (id) {
-    case WEBP_CHUNK_ANMF:
-    case WEBP_CHUNK_FRGM:
+    case WEBP_CHUNK_FRAME:
+    case WEBP_CHUNK_TILE:
     case WEBP_CHUNK_ALPHA:
     case WEBP_CHUNK_IMAGE:  return 1;
     default:        return 0;
   }
 }
 
+// Get a reference to the appropriate chunk list within an image given chunk id.
+static WEBP_INLINE WebPChunk** MuxImageGetListFromId(
+    const WebPMuxImage* const wpi, WebPChunkId id) {
+  assert(wpi != NULL);
+  switch (id) {
+    case WEBP_CHUNK_FRAME:
+    case WEBP_CHUNK_TILE:  return (WebPChunk**)&wpi->header_;
+    case WEBP_CHUNK_ALPHA: return (WebPChunk**)&wpi->alpha_;
+    case WEBP_CHUNK_IMAGE: return (WebPChunk**)&wpi->img_;
+    default: return NULL;
+  }
+}
+
 // Pushes 'wpi' at the end of 'wpi_list'.
 WebPMuxError MuxImagePush(const WebPMuxImage* wpi, WebPMuxImage** wpi_list);
 
-// Delete nth image in the image list.
-WebPMuxError MuxImageDeleteNth(WebPMuxImage** wpi_list, uint32_t nth);
+// Delete nth image in the image list with given tag id.
+WebPMuxError MuxImageDeleteNth(WebPMuxImage** wpi_list, uint32_t nth,
+                               WebPChunkId id);
 
-// Get nth image in the image list.
+// Get nth image in the image list with given tag id.
 WebPMuxError MuxImageGetNth(const WebPMuxImage** wpi_list, uint32_t nth,
-                            WebPMuxImage** wpi);
+                            WebPChunkId id, WebPMuxImage** wpi);
 
 // Total size of the given image.
 size_t MuxImageDiskSize(const WebPMuxImage* const wpi);
 
+// Total size of a list of images.
+size_t MuxImageListDiskSize(const WebPMuxImage* wpi_list);
+
 // Write out the given image into 'dst'.
 uint8_t* MuxImageEmit(const WebPMuxImage* const wpi, uint8_t* dst);
 
+// Write out the given list of images into 'dst'.
+uint8_t* MuxImageListEmit(const WebPMuxImage* wpi_list, uint8_t* dst);
+
 //------------------------------------------------------------------------------
 // Helper methods for mux.
 
-// Checks if the given image list contains at least one image with alpha.
-int MuxHasAlpha(const WebPMuxImage* images);
+// Checks if the given image list contains at least one lossless image.
+int MuxHasLosslessImages(const WebPMuxImage* images);
 
 // Write out RIFF header into 'data', given total data size 'size'.
 uint8_t* MuxEmitRiffHeader(uint8_t* const data, size_t size);
 
 // Returns the list where chunk with given ID is to be inserted in mux.
+// Return value is NULL if this chunk should be inserted in mux->images_ list
+// or if 'id' is not known.
 WebPChunk** MuxGetChunkListFromId(const WebPMux* mux, WebPChunkId id);
 
+// Validates that the given mux has a single image.
+WebPMuxError MuxValidateForImage(const WebPMux* const mux);
+
 // Validates the given mux object.
 WebPMuxError MuxValidate(const WebPMux* const mux);
 
 //------------------------------------------------------------------------------
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/mux/muxinternal.c b/drivers/webp/mux/muxinternal.c
index 3f992ce130..6c3c4fe60a 100644
--- a/drivers/webp/mux/muxinternal.c
+++ b/drivers/webp/mux/muxinternal.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Internal objects and utils for mux.
@@ -14,33 +12,29 @@
 
 #include <assert.h>
 #include "./muxi.h"
-#include "../utils/utils.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
 
 #define UNDEFINED_CHUNK_SIZE (-1)
 
 const ChunkInfo kChunks[] = {
   { MKFOURCC('V', 'P', '8', 'X'),  WEBP_CHUNK_VP8X,    VP8X_CHUNK_SIZE },
   { MKFOURCC('I', 'C', 'C', 'P'),  WEBP_CHUNK_ICCP,    UNDEFINED_CHUNK_SIZE },
-  { MKFOURCC('A', 'N', 'I', 'M'),  WEBP_CHUNK_ANIM,    ANIM_CHUNK_SIZE },
-  { MKFOURCC('A', 'N', 'M', 'F'),  WEBP_CHUNK_ANMF,    ANMF_CHUNK_SIZE },
-  { MKFOURCC('F', 'R', 'G', 'M'),  WEBP_CHUNK_FRGM,    FRGM_CHUNK_SIZE },
+  { MKFOURCC('L', 'O', 'O', 'P'),  WEBP_CHUNK_LOOP,    LOOP_CHUNK_SIZE },
+  { MKFOURCC('F', 'R', 'M', ' '),  WEBP_CHUNK_FRAME,   FRAME_CHUNK_SIZE },
+  { MKFOURCC('T', 'I', 'L', 'E'),  WEBP_CHUNK_TILE,    TILE_CHUNK_SIZE },
   { MKFOURCC('A', 'L', 'P', 'H'),  WEBP_CHUNK_ALPHA,   UNDEFINED_CHUNK_SIZE },
   { MKFOURCC('V', 'P', '8', ' '),  WEBP_CHUNK_IMAGE,   UNDEFINED_CHUNK_SIZE },
   { MKFOURCC('V', 'P', '8', 'L'),  WEBP_CHUNK_IMAGE,   UNDEFINED_CHUNK_SIZE },
-  { MKFOURCC('E', 'X', 'I', 'F'),  WEBP_CHUNK_EXIF,    UNDEFINED_CHUNK_SIZE },
-  { MKFOURCC('X', 'M', 'P', ' '),  WEBP_CHUNK_XMP,     UNDEFINED_CHUNK_SIZE },
-  { NIL_TAG,                       WEBP_CHUNK_UNKNOWN, UNDEFINED_CHUNK_SIZE },
+  { MKFOURCC('M', 'E', 'T', 'A'),  WEBP_CHUNK_META,    UNDEFINED_CHUNK_SIZE },
+  { MKFOURCC('U', 'N', 'K', 'N'),  WEBP_CHUNK_UNKNOWN, UNDEFINED_CHUNK_SIZE },
 
-  { NIL_TAG,                       WEBP_CHUNK_NIL,     UNDEFINED_CHUNK_SIZE }
+  { NIL_TAG,                    WEBP_CHUNK_NIL,     UNDEFINED_CHUNK_SIZE }
 };
 
 //------------------------------------------------------------------------------
-
-int WebPGetMuxVersion(void) {
-  return (MUX_MAJ_VERSION << 16) | (MUX_MIN_VERSION << 8) | MUX_REV_VERSION;
-}
-
-//------------------------------------------------------------------------------
 // Life of a chunk object.
 
 void ChunkInit(WebPChunk* const chunk) {
@@ -66,9 +60,9 @@ WebPChunk* ChunkRelease(WebPChunk* const chunk) {
 CHUNK_INDEX ChunkGetIndexFromTag(uint32_t tag) {
   int i;
   for (i = 0; kChunks[i].tag != NIL_TAG; ++i) {
-    if (tag == kChunks[i].tag) return (CHUNK_INDEX)i;
+    if (tag == kChunks[i].tag) return (CHUNK_INDEX)i;  // Cast needed in C++.
   }
-  return IDX_UNKNOWN;
+  return IDX_NIL;
 }
 
 WebPChunkId ChunkGetIdFromTag(uint32_t tag) {
@@ -76,16 +70,7 @@ WebPChunkId ChunkGetIdFromTag(uint32_t tag) {
   for (i = 0; kChunks[i].tag != NIL_TAG; ++i) {
     if (tag == kChunks[i].tag) return kChunks[i].id;
   }
-  return WEBP_CHUNK_UNKNOWN;
-}
-
-uint32_t ChunkGetTagFromFourCC(const char fourcc[4]) {
-  return MKFOURCC(fourcc[0], fourcc[1], fourcc[2], fourcc[3]);
-}
-
-CHUNK_INDEX ChunkGetIndexFromFourCC(const char fourcc[4]) {
-  const uint32_t tag = ChunkGetTagFromFourCC(fourcc);
-  return ChunkGetIndexFromTag(tag);
+  return WEBP_CHUNK_NIL;
 }
 
 //------------------------------------------------------------------------------
@@ -93,7 +78,7 @@ CHUNK_INDEX ChunkGetIndexFromFourCC(const char fourcc[4]) {
 
 // Returns next chunk in the chunk list with the given tag.
 static WebPChunk* ChunkSearchNextInList(WebPChunk* chunk, uint32_t tag) {
-  while (chunk != NULL && chunk->tag_ != tag) {
+  while (chunk && chunk->tag_ != tag) {
     chunk = chunk->next_;
   }
   return chunk;
@@ -102,7 +87,7 @@ static WebPChunk* ChunkSearchNextInList(WebPChunk* chunk, uint32_t tag) {
 WebPChunk* ChunkSearchList(WebPChunk* first, uint32_t nth, uint32_t tag) {
   uint32_t iter = nth;
   first = ChunkSearchNextInList(first, tag);
-  if (first == NULL) return NULL;
+  if (!first) return NULL;
 
   while (--iter != 0) {
     WebPChunk* next_chunk = ChunkSearchNextInList(first->next_, tag);
@@ -114,14 +99,14 @@ WebPChunk* ChunkSearchList(WebPChunk* first, uint32_t nth, uint32_t tag) {
 
 // Outputs a pointer to 'prev_chunk->next_',
 //   where 'prev_chunk' is the pointer to the chunk at position (nth - 1).
-// Returns true if nth chunk was found.
+// Returns 1 if nth chunk was found, 0 otherwise.
 static int ChunkSearchListToSet(WebPChunk** chunk_list, uint32_t nth,
                                 WebPChunk*** const location) {
   uint32_t count = 0;
-  assert(chunk_list != NULL);
+  assert(chunk_list);
   *location = chunk_list;
 
-  while (*chunk_list != NULL) {
+  while (*chunk_list) {
     WebPChunk* const cur_chunk = *chunk_list;
     ++count;
     if (count == nth) return 1;  // Found.
@@ -139,25 +124,34 @@ static int ChunkSearchListToSet(WebPChunk** chunk_list, uint32_t nth,
 WebPMuxError ChunkAssignData(WebPChunk* chunk, const WebPData* const data,
                              int copy_data, uint32_t tag) {
   // For internally allocated chunks, always copy data & make it owner of data.
-  if (tag == kChunks[IDX_VP8X].tag || tag == kChunks[IDX_ANIM].tag) {
+  if (tag == kChunks[IDX_VP8X].tag || tag == kChunks[IDX_LOOP].tag) {
     copy_data = 1;
   }
 
   ChunkRelease(chunk);
 
   if (data != NULL) {
-    if (copy_data) {        // Copy data.
-      if (!WebPDataCopy(data, &chunk->data_)) return WEBP_MUX_MEMORY_ERROR;
-      chunk->owner_ = 1;    // Chunk is owner of data.
-    } else {                // Don't copy data.
+    if (copy_data) {
+      // Copy data. Empty payloads skip malloc(0), which may return NULL.
+      chunk->data_.bytes_ = NULL;
+      chunk->data_.size_ = 0;
+      if (data->bytes_ != NULL && data->size_ != 0) {
+        uint8_t* const copy = (uint8_t*)malloc(data->size_);
+        if (copy == NULL) return WEBP_MUX_MEMORY_ERROR;
+        memcpy(copy, data->bytes_, data->size_);
+        chunk->data_.bytes_ = copy;
+        chunk->data_.size_ = data->size_;
+      }
+      chunk->owner_ = 1;  // Chunk is owner of data.
+    } else {  // Don't copy data: the chunk only references caller memory.
       chunk->data_ = *data;
     }
   }
   chunk->tag_ = tag;
   return WEBP_MUX_OK;
 }
 
-WebPMuxError ChunkSetNth(WebPChunk* chunk, WebPChunk** chunk_list,
+WebPMuxError ChunkSetNth(const WebPChunk* chunk, WebPChunk** chunk_list,
                          uint32_t nth) {
   WebPChunk* new_chunk;
 
@@ -168,7 +162,6 @@ WebPMuxError ChunkSetNth(WebPChunk* chunk, WebPChunk** chunk_list,
   new_chunk = (WebPChunk*)malloc(sizeof(*new_chunk));
   if (new_chunk == NULL) return WEBP_MUX_MEMORY_ERROR;
   *new_chunk = *chunk;
-  chunk->owner_ = 0;
   new_chunk->next_ = *chunk_list;
   *chunk_list = new_chunk;
   return WEBP_MUX_OK;
@@ -183,43 +176,66 @@ WebPChunk* ChunkDelete(WebPChunk* const chunk) {
   return next;
 }
 
-void ChunkListDelete(WebPChunk** const chunk_list) {
-  while (*chunk_list != NULL) {
-    *chunk_list = ChunkDelete(*chunk_list);
-  }
-}
-
 //------------------------------------------------------------------------------
 // Chunk serialization methods.
 
+size_t ChunksListDiskSize(const WebPChunk* chunk_list) {
+  size_t total = 0;  // Sum of ChunkDiskSize() over the whole list.
+  const WebPChunk* cur;
+  for (cur = chunk_list; cur != NULL; cur = cur->next_) {
+    total += ChunkDiskSize(cur);
+  }
+  return total;
+}
+
 static uint8_t* ChunkEmit(const WebPChunk* const chunk, uint8_t* dst) {
-  const size_t chunk_size = chunk->data_.size;
+  const size_t chunk_size = chunk->data_.size_;
   assert(chunk);
   assert(chunk->tag_ != NIL_TAG);
   PutLE32(dst + 0, chunk->tag_);
   PutLE32(dst + TAG_SIZE, (uint32_t)chunk_size);
   assert(chunk_size == (uint32_t)chunk_size);
-  memcpy(dst + CHUNK_HEADER_SIZE, chunk->data_.bytes, chunk_size);
+  memcpy(dst + CHUNK_HEADER_SIZE, chunk->data_.bytes_, chunk_size);
   if (chunk_size & 1)
     dst[CHUNK_HEADER_SIZE + chunk_size] = 0;  // Add padding.
   return dst + ChunkDiskSize(chunk);
 }
 
 uint8_t* ChunkListEmit(const WebPChunk* chunk_list, uint8_t* dst) {
-  while (chunk_list != NULL) {
+  while (chunk_list) {
     dst = ChunkEmit(chunk_list, dst);
     chunk_list = chunk_list->next_;
   }
   return dst;
 }
 
-size_t ChunkListDiskSize(const WebPChunk* chunk_list) {
-  size_t size = 0;
-  while (chunk_list != NULL) {
-    size += ChunkDiskSize(chunk_list);
-    chunk_list = chunk_list->next_;
+//------------------------------------------------------------------------------
+// Manipulation of a WebPData object.
+
+void WebPDataInit(WebPData* webp_data) {
+  if (webp_data != NULL) {
+    memset(webp_data, 0, sizeof(*webp_data));
   }
-  return size;
+}
+
+void WebPDataClear(WebPData* webp_data) {
+  if (webp_data != NULL) {
+    free((void*)webp_data->bytes_);
+    WebPDataInit(webp_data);
+  }
+}
+
+int WebPDataCopy(const WebPData* src, WebPData* dst) {
+  if (src == NULL || dst == NULL) return 0;
+
+  WebPDataInit(dst);
+  if (src->bytes_ != NULL && src->size_ != 0) {
+    dst->bytes_ = (uint8_t*)malloc(src->size_);
+    if (dst->bytes_ == NULL) return 0;
+    memcpy((void*)dst->bytes_, src->bytes_, src->size_);
+    dst->size_ = src->size_;
+  }
+  return 1;
 }
 
 //------------------------------------------------------------------------------
@@ -236,7 +252,6 @@ WebPMuxImage* MuxImageRelease(WebPMuxImage* const wpi) {
   ChunkDelete(wpi->header_);
   ChunkDelete(wpi->alpha_);
   ChunkDelete(wpi->img_);
-  ChunkListDelete(&wpi->unknown_);
 
   next = wpi->next_;
   MuxImageInit(wpi);
@@ -246,31 +261,14 @@ WebPMuxImage* MuxImageRelease(WebPMuxImage* const wpi) {
 //------------------------------------------------------------------------------
 // MuxImage search methods.
 
-// Get a reference to appropriate chunk list within an image given chunk tag.
-static WebPChunk** GetChunkListFromId(const WebPMuxImage* const wpi,
-                                      WebPChunkId id) {
-  assert(wpi != NULL);
-  switch (id) {
-    case WEBP_CHUNK_ANMF:
-    case WEBP_CHUNK_FRGM:  return (WebPChunk**)&wpi->header_;
-    case WEBP_CHUNK_ALPHA: return (WebPChunk**)&wpi->alpha_;
-    case WEBP_CHUNK_IMAGE: return (WebPChunk**)&wpi->img_;
-    default: return NULL;
-  }
-}
-
 int MuxImageCount(const WebPMuxImage* wpi_list, WebPChunkId id) {
   int count = 0;
   const WebPMuxImage* current;
   for (current = wpi_list; current != NULL; current = current->next_) {
-    if (id == WEBP_CHUNK_NIL) {
-      ++count;  // Special case: count all images.
-    } else {
-      const WebPChunk* const wpi_chunk = *GetChunkListFromId(current, id);
-      if (wpi_chunk != NULL) {
-        const WebPChunkId wpi_chunk_id = ChunkGetIdFromTag(wpi_chunk->tag_);
-        if (wpi_chunk_id == id) ++count;  // Count images with a matching 'id'.
-      }
+    const WebPChunk* const wpi_chunk = *MuxImageGetListFromId(current, id);
+    if (wpi_chunk != NULL) {
+      const WebPChunkId wpi_chunk_id = ChunkGetIdFromTag(wpi_chunk->tag_);
+      if (wpi_chunk_id == id) ++count;
     }
   }
   return count;
@@ -278,22 +276,34 @@ int MuxImageCount(const WebPMuxImage* wpi_list, WebPChunkId id) {
 
 // Outputs a pointer to 'prev_wpi->next_',
 //   where 'prev_wpi' is the pointer to the image at position (nth - 1).
-// Returns true if nth image was found.
+// Returns 1 if nth image with given id was found, 0 otherwise.
 static int SearchImageToGetOrDelete(WebPMuxImage** wpi_list, uint32_t nth,
+                                    WebPChunkId id,
                                     WebPMuxImage*** const location) {
   uint32_t count = 0;
   assert(wpi_list);
   *location = wpi_list;
 
+  // Search makes sense only for the following.
+  assert(id == WEBP_CHUNK_FRAME || id == WEBP_CHUNK_TILE ||
+         id == WEBP_CHUNK_IMAGE);
+  assert(id != WEBP_CHUNK_IMAGE || nth == 1);
+
   if (nth == 0) {
-    nth = MuxImageCount(*wpi_list, WEBP_CHUNK_NIL);
+    nth = MuxImageCount(*wpi_list, id);
     if (nth == 0) return 0;  // Not found.
   }
 
-  while (*wpi_list != NULL) {
+  while (*wpi_list) {
     WebPMuxImage* const cur_wpi = *wpi_list;
-    ++count;
-    if (count == nth) return 1;  // Found.
+    const WebPChunk* const wpi_chunk = *MuxImageGetListFromId(cur_wpi, id);
+    if (wpi_chunk != NULL) {
+      const WebPChunkId wpi_chunk_id = ChunkGetIdFromTag(wpi_chunk->tag_);
+      if (wpi_chunk_id == id) {
+        ++count;
+        if (count == nth) return 1;  // Found.
+      }
+    }
     wpi_list = &cur_wpi->next_;
     *location = wpi_list;
   }
@@ -335,9 +345,16 @@ WebPMuxImage* MuxImageDelete(WebPMuxImage* const wpi) {
   return next;
 }
 
-WebPMuxError MuxImageDeleteNth(WebPMuxImage** wpi_list, uint32_t nth) {
+void MuxImageDeleteAll(WebPMuxImage** const wpi_list) {
+  WebPMuxImage* head = *wpi_list;
+  while (head != NULL) head = MuxImageDelete(head);  // Yields next image.
+  *wpi_list = NULL;  // Every image has been deleted; the list is empty.
+}
+
+WebPMuxError MuxImageDeleteNth(WebPMuxImage** wpi_list, uint32_t nth,
+                               WebPChunkId id) {
   assert(wpi_list);
-  if (!SearchImageToGetOrDelete(wpi_list, nth, &wpi_list)) {
+  if (!SearchImageToGetOrDelete(wpi_list, nth, id, &wpi_list)) {
     return WEBP_MUX_NOT_FOUND;
   }
   *wpi_list = MuxImageDelete(*wpi_list);
@@ -348,10 +365,10 @@ WebPMuxError MuxImageDeleteNth(WebPMuxImage** wpi_list, uint32_t nth) {
 // MuxImage reader methods.
 
 WebPMuxError MuxImageGetNth(const WebPMuxImage** wpi_list, uint32_t nth,
-                            WebPMuxImage** wpi) {
+                            WebPChunkId id, WebPMuxImage** wpi) {
   assert(wpi_list);
   assert(wpi);
-  if (!SearchImageToGetOrDelete((WebPMuxImage**)wpi_list, nth,
+  if (!SearchImageToGetOrDelete((WebPMuxImage**)wpi_list, nth, id,
                                 (WebPMuxImage***)&wpi_list)) {
     return WEBP_MUX_NOT_FOUND;
   }
@@ -368,48 +385,47 @@ size_t MuxImageDiskSize(const WebPMuxImage* const wpi) {
   if (wpi->header_ != NULL) size += ChunkDiskSize(wpi->header_);
   if (wpi->alpha_ != NULL) size += ChunkDiskSize(wpi->alpha_);
   if (wpi->img_ != NULL) size += ChunkDiskSize(wpi->img_);
-  if (wpi->unknown_ != NULL) size += ChunkListDiskSize(wpi->unknown_);
   return size;
 }
 
-// Special case as ANMF/FRGM chunk encapsulates other image chunks.
-static uint8_t* ChunkEmitSpecial(const WebPChunk* const header,
-                                 size_t total_size, uint8_t* dst) {
-  const size_t header_size = header->data_.size;
-  const size_t offset_to_next = total_size - CHUNK_HEADER_SIZE;
-  assert(header->tag_ == kChunks[IDX_ANMF].tag ||
-         header->tag_ == kChunks[IDX_FRGM].tag);
-  PutLE32(dst + 0, header->tag_);
-  PutLE32(dst + TAG_SIZE, (uint32_t)offset_to_next);
-  assert(header_size == (uint32_t)header_size);
-  memcpy(dst + CHUNK_HEADER_SIZE, header->data_.bytes, header_size);
-  if (header_size & 1) {
-    dst[CHUNK_HEADER_SIZE + header_size] = 0;  // Add padding.
+size_t MuxImageListDiskSize(const WebPMuxImage* wpi_list) {
+  size_t total = 0;  // Sum of MuxImageDiskSize() over the whole list.
+  const WebPMuxImage* wpi;
+  for (wpi = wpi_list; wpi != NULL; wpi = wpi->next_) {
+    total += MuxImageDiskSize(wpi);
   }
-  return dst + ChunkDiskSize(header);
+  return total;
 }
 
 uint8_t* MuxImageEmit(const WebPMuxImage* const wpi, uint8_t* dst) {
   // Ordering of chunks to be emitted is strictly as follows:
-  // 1. ANMF/FRGM chunk (if present).
-  // 2. ALPH chunk (if present).
+  // 1. Frame/Tile chunk (if present).
+  // 2. Alpha chunk (if present).
   // 3. VP8/VP8L chunk.
   assert(wpi);
-  if (wpi->header_ != NULL) {
-    dst = ChunkEmitSpecial(wpi->header_, MuxImageDiskSize(wpi), dst);
-  }
+  if (wpi->header_ != NULL) dst = ChunkEmit(wpi->header_, dst);
   if (wpi->alpha_ != NULL) dst = ChunkEmit(wpi->alpha_, dst);
   if (wpi->img_ != NULL) dst = ChunkEmit(wpi->img_, dst);
-  if (wpi->unknown_ != NULL) dst = ChunkListEmit(wpi->unknown_, dst);
+  return dst;
+}
+
+uint8_t* MuxImageListEmit(const WebPMuxImage* wpi_list, uint8_t* dst) {
+  const WebPMuxImage* wpi;  // Images are written out in list order.
+  for (wpi = wpi_list; wpi != NULL; wpi = wpi->next_) {
+    dst = MuxImageEmit(wpi, dst);
+  }
   return dst;
 }
 
 //------------------------------------------------------------------------------
 // Helper methods for mux.
 
-int MuxHasAlpha(const WebPMuxImage* images) {
+int MuxHasLosslessImages(const WebPMuxImage* images) {
   while (images != NULL) {
-    if (images->has_alpha_) return 1;
+    assert(images->img_ != NULL);
+    if (images->img_->tag_ == kChunks[IDX_VP8L].tag) {
+      return 1;
+    }
     images = images->next_;
   }
   return 0;
@@ -425,13 +441,30 @@ uint8_t* MuxEmitRiffHeader(uint8_t* const data, size_t size) {
 
 WebPChunk** MuxGetChunkListFromId(const WebPMux* mux, WebPChunkId id) {
   assert(mux != NULL);
-  switch (id) {
+  switch(id) {
     case WEBP_CHUNK_VP8X:    return (WebPChunk**)&mux->vp8x_;
     case WEBP_CHUNK_ICCP:    return (WebPChunk**)&mux->iccp_;
-    case WEBP_CHUNK_ANIM:    return (WebPChunk**)&mux->anim_;
-    case WEBP_CHUNK_EXIF:    return (WebPChunk**)&mux->exif_;
-    case WEBP_CHUNK_XMP:     return (WebPChunk**)&mux->xmp_;
-    default:                 return (WebPChunk**)&mux->unknown_;
+    case WEBP_CHUNK_LOOP:    return (WebPChunk**)&mux->loop_;
+    case WEBP_CHUNK_META:    return (WebPChunk**)&mux->meta_;
+    case WEBP_CHUNK_UNKNOWN: return (WebPChunk**)&mux->unknown_;
+    default: return NULL;
+  }
+}
+
+WebPMuxError MuxValidateForImage(const WebPMux* const mux) {
+  const WebPMuxImage* const images = mux->images_;
+  const int image_count = MuxImageCount(images, WEBP_CHUNK_IMAGE);
+  const int frame_count = MuxImageCount(images, WEBP_CHUNK_FRAME);
+  const int tile_count = MuxImageCount(images, WEBP_CHUNK_TILE);
+
+  switch (image_count) {
+    case 0:   // No images in mux.
+      return WEBP_MUX_NOT_FOUND;
+    case 1:   // Valid only as a plain single image (no frames, no tiles).
+      if (frame_count == 0 && tile_count == 0) return WEBP_MUX_OK;
+      /* fallthrough */
+    default:  // Frame/Tile case OR an invalid mux.
+      return WEBP_MUX_INVALID_ARGUMENT;
   }
 }
 
@@ -447,7 +480,7 @@ static int IsNotCompatible(int feature, int num_items) {
 // On success returns WEBP_MUX_OK and stores the chunk count in *num.
 static WebPMuxError ValidateChunk(const WebPMux* const mux, CHUNK_INDEX idx,
                                   WebPFeatureFlags feature,
-                                  uint32_t vp8x_flags,
+                                  WebPFeatureFlags vp8x_flags,
                                   int max, int* num) {
   const WebPMuxError err =
       WebPMuxNumChunks(mux, kChunks[idx].id, num);
@@ -461,11 +494,10 @@ static WebPMuxError ValidateChunk(const WebPMux* const mux, CHUNK_INDEX idx,
 
 WebPMuxError MuxValidate(const WebPMux* const mux) {
   int num_iccp;
-  int num_exif;
-  int num_xmp;
-  int num_anim;
+  int num_meta;
+  int num_loop_chunks;
   int num_frames;
-  int num_fragments;
+  int num_tiles;
   int num_vp8x;
   int num_images;
   int num_alpha;
@@ -485,33 +517,29 @@ WebPMuxError MuxValidate(const WebPMux* const mux) {
   err = ValidateChunk(mux, IDX_ICCP, ICCP_FLAG, flags, 1, &num_iccp);
   if (err != WEBP_MUX_OK) return err;
 
-  // At most one EXIF metadata.
-  err = ValidateChunk(mux, IDX_EXIF, EXIF_FLAG, flags, 1, &num_exif);
-  if (err != WEBP_MUX_OK) return err;
-
   // At most one XMP metadata.
-  err = ValidateChunk(mux, IDX_XMP, XMP_FLAG, flags, 1, &num_xmp);
+  err = ValidateChunk(mux, IDX_META, META_FLAG, flags, 1, &num_meta);
   if (err != WEBP_MUX_OK) return err;
 
-  // Animation: ANIMATION_FLAG, ANIM chunk and ANMF chunk(s) are consistent.
-  // At most one ANIM chunk.
-  err = ValidateChunk(mux, IDX_ANIM, NO_FLAG, flags, 1, &num_anim);
+  // Animation: ANIMATION_FLAG, loop chunk and frame chunk(s) are consistent.
+  // At most one loop chunk.
+  err = ValidateChunk(mux, IDX_LOOP, NO_FLAG, flags, 1, &num_loop_chunks);
   if (err != WEBP_MUX_OK) return err;
-  err = ValidateChunk(mux, IDX_ANMF, NO_FLAG, flags, -1, &num_frames);
+  err = ValidateChunk(mux, IDX_FRAME, NO_FLAG, flags, -1, &num_frames);
   if (err != WEBP_MUX_OK) return err;
 
   {
     const int has_animation = !!(flags & ANIMATION_FLAG);
-    if (has_animation && (num_anim == 0 || num_frames == 0)) {
+    if (has_animation && (num_loop_chunks == 0 || num_frames == 0)) {
       return WEBP_MUX_INVALID_ARGUMENT;
     }
-    if (!has_animation && (num_anim == 1 || num_frames > 0)) {
+    if (!has_animation && (num_loop_chunks == 1 || num_frames > 0)) {
       return WEBP_MUX_INVALID_ARGUMENT;
     }
   }
 
-  // Fragmentation: FRAGMENTS_FLAG and FRGM chunk(s) are consistent.
-  err = ValidateChunk(mux, IDX_FRGM, FRAGMENTS_FLAG, flags, -1, &num_fragments);
+  // Tiling: TILE_FLAG and tile chunk(s) are consistent.
+  err = ValidateChunk(mux, IDX_TILE, TILE_FLAG, flags, -1, &num_tiles);
   if (err != WEBP_MUX_OK) return err;
 
   // Verify either VP8X chunk is present OR there is only one elem in
@@ -523,22 +551,16 @@ WebPMuxError MuxValidate(const WebPMux* const mux) {
   if (num_vp8x == 0 && num_images != 1) return WEBP_MUX_INVALID_ARGUMENT;
 
   // ALPHA_FLAG & alpha chunk(s) are consistent.
-  if (MuxHasAlpha(mux->images_)) {
-    if (num_vp8x > 0) {
-      // VP8X chunk is present, so it should contain ALPHA_FLAG.
-      if (!(flags & ALPHA_FLAG)) return WEBP_MUX_INVALID_ARGUMENT;
-    } else {
-      // VP8X chunk is not present, so ALPH chunks should NOT be present either.
-      err = WebPMuxNumChunks(mux, WEBP_CHUNK_ALPHA, &num_alpha);
-      if (err != WEBP_MUX_OK) return err;
-      if (num_alpha > 0) return WEBP_MUX_INVALID_ARGUMENT;
-    }
-  } else {  // Mux doesn't need alpha. So, ALPHA_FLAG should NOT be present.
-    if (flags & ALPHA_FLAG) return WEBP_MUX_INVALID_ARGUMENT;
+  if (num_vp8x > 0 && MuxHasLosslessImages(mux->images_)) {
+    // Special case: we have a VP8X chunk as well as some lossless images.
+    if (!(flags & ALPHA_FLAG)) return WEBP_MUX_INVALID_ARGUMENT;
+  } else {
+    err = ValidateChunk(mux, IDX_ALPHA, ALPHA_FLAG, flags, -1, &num_alpha);
+    if (err != WEBP_MUX_OK) return err;
   }
 
-  // num_fragments & num_images are consistent.
-  if (num_fragments > 0 && num_images != num_fragments) {
+  // num_tiles & num_images are consistent.
+  if (num_tiles > 0 && num_images != num_tiles) {
     return WEBP_MUX_INVALID_ARGUMENT;
   }
 
@@ -549,3 +571,6 @@ WebPMuxError MuxValidate(const WebPMux* const mux) {
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/mux/muxread.c b/drivers/webp/mux/muxread.c
index 6003a25b71..21c3cfbaeb 100644
--- a/drivers/webp/mux/muxread.c
+++ b/drivers/webp/mux/muxread.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Read APIs for mux.
@@ -14,7 +12,10 @@
 
 #include <assert.h>
 #include "./muxi.h"
-#include "../utils/utils.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
 
 //------------------------------------------------------------------------------
 // Helper method(s).
@@ -40,9 +41,8 @@ static WebPMuxError MuxGet(const WebPMux* const mux, CHUNK_INDEX idx,
 
   SWITCH_ID_LIST(IDX_VP8X, mux->vp8x_);
   SWITCH_ID_LIST(IDX_ICCP, mux->iccp_);
-  SWITCH_ID_LIST(IDX_ANIM, mux->anim_);
-  SWITCH_ID_LIST(IDX_EXIF, mux->exif_);
-  SWITCH_ID_LIST(IDX_XMP, mux->xmp_);
+  SWITCH_ID_LIST(IDX_LOOP, mux->loop_);
+  SWITCH_ID_LIST(IDX_META, mux->meta_);
   SWITCH_ID_LIST(IDX_UNKNOWN, mux->unknown_);
   return WEBP_MUX_NOT_FOUND;
 }
@@ -50,9 +50,10 @@ static WebPMuxError MuxGet(const WebPMux* const mux, CHUNK_INDEX idx,
 
 // Fill the chunk with the given data (includes chunk header bytes), after some
 // verifications.
-static WebPMuxError ChunkVerifyAndAssign(WebPChunk* chunk,
-                                         const uint8_t* data, size_t data_size,
-                                         size_t riff_size, int copy_data) {
+static WebPMuxError ChunkVerifyAndAssignData(WebPChunk* chunk,
+                                             const uint8_t* data,
+                                             size_t data_size, size_t riff_size,
+                                             int copy_data) {
   uint32_t chunk_size;
   WebPData chunk_data;
 
@@ -67,103 +68,11 @@ static WebPMuxError ChunkVerifyAndAssign(WebPChunk* chunk,
   }
 
   // Data assignment.
-  chunk_data.bytes = data + CHUNK_HEADER_SIZE;
-  chunk_data.size = chunk_size;
+  chunk_data.bytes_ = data + CHUNK_HEADER_SIZE;
+  chunk_data.size_ = chunk_size;
   return ChunkAssignData(chunk, &chunk_data, copy_data, GetLE32(data + 0));
 }
 
-int MuxImageFinalize(WebPMuxImage* const wpi) {
-  const WebPChunk* const img = wpi->img_;
-  const WebPData* const image = &img->data_;
-  const int is_lossless = (img->tag_ == kChunks[IDX_VP8L].tag);
-  int w, h;
-  int vp8l_has_alpha = 0;
-  const int ok = is_lossless ?
-      VP8LGetInfo(image->bytes, image->size, &w, &h, &vp8l_has_alpha) :
-      VP8GetInfo(image->bytes, image->size, image->size, &w, &h);
-  assert(img != NULL);
-  if (ok) {
-    // Ignore ALPH chunk accompanying VP8L.
-    if (is_lossless && (wpi->alpha_ != NULL)) {
-      ChunkDelete(wpi->alpha_);
-      wpi->alpha_ = NULL;
-    }
-    wpi->width_ = w;
-    wpi->height_ = h;
-    wpi->has_alpha_ = vp8l_has_alpha || (wpi->alpha_ != NULL);
-  }
-  return ok;
-}
-
-static int MuxImageParse(const WebPChunk* const chunk, int copy_data,
-                         WebPMuxImage* const wpi) {
-  const uint8_t* bytes = chunk->data_.bytes;
-  size_t size = chunk->data_.size;
-  const uint8_t* const last = bytes + size;
-  WebPChunk subchunk;
-  size_t subchunk_size;
-  ChunkInit(&subchunk);
-
-  assert(chunk->tag_ == kChunks[IDX_ANMF].tag ||
-         chunk->tag_ == kChunks[IDX_FRGM].tag);
-  assert(!wpi->is_partial_);
-
-  // ANMF/FRGM.
-  {
-    const size_t hdr_size = (chunk->tag_ == kChunks[IDX_ANMF].tag) ?
-        ANMF_CHUNK_SIZE : FRGM_CHUNK_SIZE;
-    const WebPData temp = { bytes, hdr_size };
-    // Each of ANMF and FRGM chunk contain a header at the beginning. So, its
-    // size should at least be 'hdr_size'.
-    if (size < hdr_size) goto Fail;
-    ChunkAssignData(&subchunk, &temp, copy_data, chunk->tag_);
-  }
-  ChunkSetNth(&subchunk, &wpi->header_, 1);
-  wpi->is_partial_ = 1;  // Waiting for ALPH and/or VP8/VP8L chunks.
-
-  // Rest of the chunks.
-  subchunk_size = ChunkDiskSize(&subchunk) - CHUNK_HEADER_SIZE;
-  bytes += subchunk_size;
-  size -= subchunk_size;
-
-  while (bytes != last) {
-    ChunkInit(&subchunk);
-    if (ChunkVerifyAndAssign(&subchunk, bytes, size, size,
-                             copy_data) != WEBP_MUX_OK) {
-      goto Fail;
-    }
-    switch (ChunkGetIdFromTag(subchunk.tag_)) {
-      case WEBP_CHUNK_ALPHA:
-        if (wpi->alpha_ != NULL) goto Fail;  // Consecutive ALPH chunks.
-        if (ChunkSetNth(&subchunk, &wpi->alpha_, 1) != WEBP_MUX_OK) goto Fail;
-        wpi->is_partial_ = 1;  // Waiting for a VP8 chunk.
-        break;
-      case WEBP_CHUNK_IMAGE:
-        if (ChunkSetNth(&subchunk, &wpi->img_, 1) != WEBP_MUX_OK) goto Fail;
-        if (!MuxImageFinalize(wpi)) goto Fail;
-        wpi->is_partial_ = 0;  // wpi is completely filled.
-        break;
-      case WEBP_CHUNK_UNKNOWN:
-        if (wpi->is_partial_) goto Fail;  // Encountered an unknown chunk
-                                          // before some image chunks.
-        if (ChunkSetNth(&subchunk, &wpi->unknown_, 0) != WEBP_MUX_OK) goto Fail;
-        break;
-      default:
-        goto Fail;
-        break;
-    }
-    subchunk_size = ChunkDiskSize(&subchunk);
-    bytes += subchunk_size;
-    size -= subchunk_size;
-  }
-  if (wpi->is_partial_) goto Fail;
-  return 1;
-
- Fail:
-  ChunkRelease(&subchunk);
-  return 0;
-}
-
 //------------------------------------------------------------------------------
 // Create a mux object from WebP-RIFF data.
 
@@ -185,8 +94,8 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data,
   }
   if (bitstream == NULL) return NULL;
 
-  data = bitstream->bytes;
-  size = bitstream->size;
+  data = bitstream->bytes_;
+  size = bitstream->size_;
 
   if (data == NULL) return NULL;
   if (size < RIFF_HEADER_SIZE) return NULL;
@@ -226,48 +135,42 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data,
 
   // Loop over chunks.
   while (data != end) {
-    size_t data_size;
     WebPChunkId id;
-    WebPChunk** chunk_list;
-    if (ChunkVerifyAndAssign(&chunk, data, size, riff_size,
-                             copy_data) != WEBP_MUX_OK) {
-      goto Err;
-    }
-    data_size = ChunkDiskSize(&chunk);
+    WebPMuxError err;
+
+    err = ChunkVerifyAndAssignData(&chunk, data, size, riff_size, copy_data);
+    if (err != WEBP_MUX_OK) goto Err;
+
     id = ChunkGetIdFromTag(chunk.tag_);
-    switch (id) {
-      case WEBP_CHUNK_ALPHA:
-        if (wpi->alpha_ != NULL) goto Err;  // Consecutive ALPH chunks.
-        if (ChunkSetNth(&chunk, &wpi->alpha_, 1) != WEBP_MUX_OK) goto Err;
-        wpi->is_partial_ = 1;  // Waiting for a VP8 chunk.
-        break;
-      case WEBP_CHUNK_IMAGE:
-        if (ChunkSetNth(&chunk, &wpi->img_, 1) != WEBP_MUX_OK) goto Err;
-        if (!MuxImageFinalize(wpi)) goto Err;
+
+    if (IsWPI(id)) {  // An image chunk (frame/tile/alpha/vp8).
+      WebPChunk** wpi_chunk_ptr =
+          MuxImageGetListFromId(wpi, id);  // Image chunk to set.
+      assert(wpi_chunk_ptr != NULL);
+      if (*wpi_chunk_ptr != NULL) goto Err;  // Consecutive alpha chunks or
+                                             // consecutive frame/tile chunks.
+      if (ChunkSetNth(&chunk, wpi_chunk_ptr, 1) != WEBP_MUX_OK) goto Err;
+      if (id == WEBP_CHUNK_IMAGE) {
         wpi->is_partial_ = 0;  // wpi is completely filled.
- PushImage:
         // Add this to mux->images_ list.
         if (MuxImagePush(wpi, &mux->images_) != WEBP_MUX_OK) goto Err;
         MuxImageInit(wpi);  // Reset for reading next image.
-        break;
-      case WEBP_CHUNK_ANMF:
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-      case WEBP_CHUNK_FRGM:
-#endif
-        if (wpi->is_partial_) goto Err;  // Previous wpi is still incomplete.
-        if (!MuxImageParse(&chunk, copy_data, wpi)) goto Err;
-        ChunkRelease(&chunk);
-        goto PushImage;
-        break;
-      default:  // A non-image chunk.
-        if (wpi->is_partial_) goto Err;  // Encountered a non-image chunk before
-                                         // getting all chunks of an image.
-        chunk_list = MuxGetChunkListFromId(mux, id);  // List to add this chunk.
-        if (ChunkSetNth(&chunk, chunk_list, 0) != WEBP_MUX_OK) goto Err;
-        break;
+      } else {
+        wpi->is_partial_ = 1;  // wpi is only partially filled.
+      }
+    } else {  // A non-image chunk.
+      WebPChunk** chunk_list;
+      if (wpi->is_partial_) goto Err;  // Encountered a non-image chunk before
+                                       // getting all chunks of an image.
+      chunk_list = MuxGetChunkListFromId(mux, id);  // List to add this chunk.
+      if (chunk_list == NULL) chunk_list = &mux->unknown_;
+      if (ChunkSetNth(&chunk, chunk_list, 0) != WEBP_MUX_OK) goto Err;
+    }
+    {
+      const size_t data_size = ChunkDiskSize(&chunk);
+      data += data_size;
+      size -= data_size;
     }
-    data += data_size;
-    size -= data_size;
     ChunkInit(&chunk);
   }
 
@@ -287,66 +190,29 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data,
 //------------------------------------------------------------------------------
 // Get API(s).
 
-// Validates that the given mux has a single image.
-static WebPMuxError ValidateForSingleImage(const WebPMux* const mux) {
-  const int num_images = MuxImageCount(mux->images_, WEBP_CHUNK_IMAGE);
-  const int num_frames = MuxImageCount(mux->images_, WEBP_CHUNK_ANMF);
-  const int num_fragments = MuxImageCount(mux->images_, WEBP_CHUNK_FRGM);
-
-  if (num_images == 0) {
-    // No images in mux.
-    return WEBP_MUX_NOT_FOUND;
-  } else if (num_images == 1 && num_frames == 0 && num_fragments == 0) {
-    // Valid case (single image).
-    return WEBP_MUX_OK;
-  } else {
-    // Frame/Fragment case OR an invalid mux.
-    return WEBP_MUX_INVALID_ARGUMENT;
-  }
-}
-
-// Get the canvas width, height and flags after validating that VP8X/VP8/VP8L
-// chunk and canvas size are valid.
-static WebPMuxError MuxGetCanvasInfo(const WebPMux* const mux,
-                                     int* width, int* height, uint32_t* flags) {
-  int w, h;
-  uint32_t f = 0;
+WebPMuxError WebPMuxGetFeatures(const WebPMux* mux, uint32_t* flags) {
   WebPData data;
-  assert(mux != NULL);
+  WebPMuxError err;
+
+  if (mux == NULL || flags == NULL) return WEBP_MUX_INVALID_ARGUMENT;
+  *flags = 0;
 
   // Check if VP8X chunk is present.
-  if (MuxGet(mux, IDX_VP8X, 1, &data) == WEBP_MUX_OK) {
-    if (data.size < VP8X_CHUNK_SIZE) return WEBP_MUX_BAD_DATA;
-    f = GetLE32(data.bytes + 0);
-    w = GetLE24(data.bytes + 4) + 1;
-    h = GetLE24(data.bytes + 7) + 1;
-  } else {  // Single image case.
-    const WebPMuxImage* const wpi = mux->images_;
-    WebPMuxError err = ValidateForSingleImage(mux);
-    if (err != WEBP_MUX_OK) return err;
-    assert(wpi != NULL);
-    w = wpi->width_;
-    h = wpi->height_;
-    if (wpi->has_alpha_) f |= ALPHA_FLAG;
+  err = MuxGet(mux, IDX_VP8X, 1, &data);
+  if (err == WEBP_MUX_NOT_FOUND) {
+    // Check if VP8/VP8L chunk is present.
+    err = WebPMuxGetImage(mux, &data);
+    WebPDataClear(&data);
+    return err;
+  } else if (err != WEBP_MUX_OK) {
+    return err;
   }
-  if (w * (uint64_t)h >= MAX_IMAGE_AREA) return WEBP_MUX_BAD_DATA;
 
-  if (width != NULL) *width = w;
-  if (height != NULL) *height = h;
-  if (flags != NULL) *flags = f;
-  return WEBP_MUX_OK;
-}
-
-WebPMuxError WebPMuxGetCanvasSize(const WebPMux* mux, int* width, int* height) {
-  if (mux == NULL || width == NULL || height == NULL) {
-    return WEBP_MUX_INVALID_ARGUMENT;
-  }
-  return MuxGetCanvasInfo(mux, width, height, NULL);
-}
+  if (data.size_ < CHUNK_SIZE_BYTES) return WEBP_MUX_BAD_DATA;
 
-WebPMuxError WebPMuxGetFeatures(const WebPMux* mux, uint32_t* flags) {
-  if (mux == NULL || flags == NULL) return WEBP_MUX_INVALID_ARGUMENT;
-  return MuxGetCanvasInfo(mux, NULL, NULL, flags);
+  // All OK. Fill up flags.
+  *flags = GetLE32(data.bytes_);
+  return WEBP_MUX_OK;
 }
 
 static uint8_t* EmitVP8XChunk(uint8_t* const dst, int width,
@@ -364,7 +230,7 @@ static uint8_t* EmitVP8XChunk(uint8_t* const dst, int width,
 }
 
 // Assemble a single image WebP bitstream from 'wpi'.
-static WebPMuxError SynthesizeBitstream(const WebPMuxImage* const wpi,
+static WebPMuxError SynthesizeBitstream(WebPMuxImage* const wpi,
                                         WebPData* const bitstream) {
   uint8_t* dst;
 
@@ -372,7 +238,7 @@ static WebPMuxError SynthesizeBitstream(const WebPMuxImage* const wpi,
   const int need_vp8x = (wpi->alpha_ != NULL);
   const size_t vp8x_size = need_vp8x ? CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE : 0;
   const size_t alpha_size = need_vp8x ? ChunkDiskSize(wpi->alpha_) : 0;
-  // Note: No need to output ANMF/FRGM chunk for a single image.
+  // Note: No need to output FRM/TILE chunk for a single image.
   const size_t size = RIFF_HEADER_SIZE + vp8x_size + alpha_size +
                       ChunkDiskSize(wpi->img_);
   uint8_t* const data = (uint8_t*)malloc(size);
@@ -382,7 +248,15 @@ static WebPMuxError SynthesizeBitstream(const WebPMuxImage* const wpi,
   dst = MuxEmitRiffHeader(data, size);
 
   if (need_vp8x) {
-    dst = EmitVP8XChunk(dst, wpi->width_, wpi->height_, ALPHA_FLAG);  // VP8X.
+    int w, h;
+    WebPMuxError err;
+    assert(wpi->img_ != NULL);
+    err = MuxGetImageWidthHeight(wpi->img_, &w, &h);
+    if (err != WEBP_MUX_OK) {
+      free(data);
+      return err;
+    }
+    dst = EmitVP8XChunk(dst, w, h, ALPHA_FLAG);  // VP8X.
     dst = ChunkListEmit(wpi->alpha_, dst);       // ALPH.
   }
 
@@ -391,117 +265,107 @@ static WebPMuxError SynthesizeBitstream(const WebPMuxImage* const wpi,
   assert(dst == data + size);
 
   // Output.
-  bitstream->bytes = data;
-  bitstream->size = size;
+  bitstream->bytes_ = data;
+  bitstream->size_ = size;
   return WEBP_MUX_OK;
 }
 
-WebPMuxError WebPMuxGetChunk(const WebPMux* mux, const char fourcc[4],
-                             WebPData* chunk_data) {
-  CHUNK_INDEX idx;
-  if (mux == NULL || fourcc == NULL || chunk_data == NULL) {
-    return WEBP_MUX_INVALID_ARGUMENT;
-  }
-  idx = ChunkGetIndexFromFourCC(fourcc);
-  if (IsWPI(kChunks[idx].id)) {     // An image chunk.
+WebPMuxError WebPMuxGetImage(const WebPMux* mux, WebPData* bitstream) {
+  WebPMuxError err;
+  WebPMuxImage* wpi = NULL;
+
+  if (mux == NULL || bitstream == NULL) {
     return WEBP_MUX_INVALID_ARGUMENT;
-  } else if (idx != IDX_UNKNOWN) {  // A known chunk type.
-    return MuxGet(mux, idx, 1, chunk_data);
-  } else {                          // An unknown chunk type.
-    const WebPChunk* const chunk =
-        ChunkSearchList(mux->unknown_, 1, ChunkGetTagFromFourCC(fourcc));
-    if (chunk == NULL) return WEBP_MUX_NOT_FOUND;
-    *chunk_data = chunk->data_;
-    return WEBP_MUX_OK;
   }
+
+  err = MuxValidateForImage(mux);
+  if (err != WEBP_MUX_OK) return err;
+
+  // All well. Get the image.
+  err = MuxImageGetNth((const WebPMuxImage**)&mux->images_, 1, WEBP_CHUNK_IMAGE,
+                       &wpi);
+  assert(err == WEBP_MUX_OK);  // Already tested above.
+
+  return SynthesizeBitstream(wpi, bitstream);
 }
 
-static WebPMuxError MuxGetImageInternal(const WebPMuxImage* const wpi,
-                                        WebPMuxFrameInfo* const info) {
-  // Set some defaults for unrelated fields.
-  info->x_offset = 0;
-  info->y_offset = 0;
-  info->duration = 1;
-  info->dispose_method = WEBP_MUX_DISPOSE_NONE;
-  info->blend_method = WEBP_MUX_BLEND;
-  // Extract data for related fields.
-  info->id = ChunkGetIdFromTag(wpi->img_->tag_);
-  return SynthesizeBitstream(wpi, &info->bitstream);
+WebPMuxError WebPMuxGetMetadata(const WebPMux* mux, WebPData* metadata) {
+  if (mux == NULL || metadata == NULL) return WEBP_MUX_INVALID_ARGUMENT;
+  return MuxGet(mux, IDX_META, 1, metadata);
 }
 
-static WebPMuxError MuxGetFrameFragmentInternal(const WebPMuxImage* const wpi,
-                                                WebPMuxFrameInfo* const frame) {
-  const int is_frame = (wpi->header_->tag_ == kChunks[IDX_ANMF].tag);
-  const CHUNK_INDEX idx = is_frame ? IDX_ANMF : IDX_FRGM;
-  const WebPData* frame_frgm_data;
-#ifndef WEBP_EXPERIMENTAL_FEATURES
-  if (!is_frame) return WEBP_MUX_INVALID_ARGUMENT;
-#endif
-  assert(wpi->header_ != NULL);  // Already checked by WebPMuxGetFrame().
-  // Get frame/fragment chunk.
-  frame_frgm_data = &wpi->header_->data_;
-  if (frame_frgm_data->size < kChunks[idx].size) return WEBP_MUX_BAD_DATA;
-  // Extract info.
-  frame->x_offset = 2 * GetLE24(frame_frgm_data->bytes + 0);
-  frame->y_offset = 2 * GetLE24(frame_frgm_data->bytes + 3);
-  if (is_frame) {
-    const uint8_t bits = frame_frgm_data->bytes[15];
-    frame->duration = GetLE24(frame_frgm_data->bytes + 12);
-    frame->dispose_method =
-        (bits & 1) ? WEBP_MUX_DISPOSE_BACKGROUND : WEBP_MUX_DISPOSE_NONE;
-    frame->blend_method = (bits & 2) ? WEBP_MUX_NO_BLEND : WEBP_MUX_BLEND;
-  } else {  // Defaults for unused values.
-    frame->duration = 1;
-    frame->dispose_method = WEBP_MUX_DISPOSE_NONE;
-    frame->blend_method = WEBP_MUX_BLEND;
-  }
-  frame->id = ChunkGetIdFromTag(wpi->header_->tag_);
-  return SynthesizeBitstream(wpi, &frame->bitstream);
+WebPMuxError WebPMuxGetColorProfile(const WebPMux* mux,
+                                    WebPData* color_profile) {
+  if (mux == NULL || color_profile == NULL) return WEBP_MUX_INVALID_ARGUMENT;
+  return MuxGet(mux, IDX_ICCP, 1, color_profile);
 }
 
-WebPMuxError WebPMuxGetFrame(
-    const WebPMux* mux, uint32_t nth, WebPMuxFrameInfo* frame) {
+WebPMuxError WebPMuxGetLoopCount(const WebPMux* mux, int* loop_count) {
+  WebPData image;
+  WebPMuxError err;
+
+  if (mux == NULL || loop_count == NULL) return WEBP_MUX_INVALID_ARGUMENT;
+
+  err = MuxGet(mux, IDX_LOOP, 1, &image);
+  if (err != WEBP_MUX_OK) return err;
+  if (image.size_ < kChunks[WEBP_CHUNK_LOOP].size) return WEBP_MUX_BAD_DATA;
+  *loop_count = GetLE16(image.bytes_);
+
+  return WEBP_MUX_OK;
+}
+
+static WebPMuxError MuxGetFrameTileInternal(
+    const WebPMux* const mux, uint32_t nth, WebPData* const bitstream,
+    int* const x_offset, int* const y_offset, int* const duration,
+    uint32_t tag) {
+  const WebPData* frame_tile_data;
   WebPMuxError err;
   WebPMuxImage* wpi;
 
-  // Sanity checks.
-  if (mux == NULL || frame == NULL) {
+  const int is_frame = (tag == kChunks[WEBP_CHUNK_FRAME].tag) ? 1 : 0;
+  const CHUNK_INDEX idx = is_frame ? IDX_FRAME : IDX_TILE;
+  const WebPChunkId id = kChunks[idx].id;
+
+  if (mux == NULL || bitstream == NULL ||
+      x_offset == NULL || y_offset == NULL || (is_frame && duration == NULL)) {
     return WEBP_MUX_INVALID_ARGUMENT;
   }
 
   // Get the nth WebPMuxImage.
-  err = MuxImageGetNth((const WebPMuxImage**)&mux->images_, nth, &wpi);
+  err = MuxImageGetNth((const WebPMuxImage**)&mux->images_, nth, id, &wpi);
   if (err != WEBP_MUX_OK) return err;
 
-  // Get frame info.
-  if (wpi->header_ == NULL) {
-    return MuxGetImageInternal(wpi, frame);
-  } else {
-    return MuxGetFrameFragmentInternal(wpi, frame);
-  }
-}
+  // Get frame chunk.
+  assert(wpi->header_ != NULL);  // As MuxImageGetNth() already checked header_.
+  frame_tile_data = &wpi->header_->data_;
 
-WebPMuxError WebPMuxGetAnimationParams(const WebPMux* mux,
-                                       WebPMuxAnimParams* params) {
-  WebPData anim;
-  WebPMuxError err;
+  if (frame_tile_data->size_ < kChunks[idx].size) return WEBP_MUX_BAD_DATA;
+  *x_offset = 2 * GetLE24(frame_tile_data->bytes_ + 0);
+  *y_offset = 2 * GetLE24(frame_tile_data->bytes_ + 3);
+  if (is_frame) *duration = 1 + GetLE24(frame_tile_data->bytes_ + 12);
 
-  if (mux == NULL || params == NULL) return WEBP_MUX_INVALID_ARGUMENT;
+  return SynthesizeBitstream(wpi, bitstream);
+}
 
-  err = MuxGet(mux, IDX_ANIM, 1, &anim);
-  if (err != WEBP_MUX_OK) return err;
-  if (anim.size < kChunks[WEBP_CHUNK_ANIM].size) return WEBP_MUX_BAD_DATA;
-  params->bgcolor = GetLE32(anim.bytes);
-  params->loop_count = GetLE16(anim.bytes + 4);
+WebPMuxError WebPMuxGetFrame(const WebPMux* mux, uint32_t nth,
+                             WebPData* bitstream,
+                             int* x_offset, int* y_offset, int* duration) {
+  return MuxGetFrameTileInternal(mux, nth, bitstream, x_offset, y_offset,
+                                 duration, kChunks[IDX_FRAME].tag);
+}
 
-  return WEBP_MUX_OK;
+WebPMuxError WebPMuxGetTile(const WebPMux* mux, uint32_t nth,
+                            WebPData* bitstream,
+                            int* x_offset, int* y_offset) {
+  return MuxGetFrameTileInternal(mux, nth, bitstream, x_offset, y_offset, NULL,
+                                 kChunks[IDX_TILE].tag);
 }
 
 // Get chunk index from chunk id. Returns IDX_NIL if not found.
 static CHUNK_INDEX ChunkGetIndexFromId(WebPChunkId id) {
   int i;
   for (i = 0; kChunks[i].id != WEBP_CHUNK_NIL; ++i) {
-    if (id == kChunks[i].id) return (CHUNK_INDEX)i;
+    if (id == kChunks[i].id) return i;
   }
   return IDX_NIL;
 }
@@ -529,8 +393,12 @@ WebPMuxError WebPMuxNumChunks(const WebPMux* mux,
     *num_elements = MuxImageCount(mux->images_, id);
   } else {
     WebPChunk* const* chunk_list = MuxGetChunkListFromId(mux, id);
-    const CHUNK_INDEX idx = ChunkGetIndexFromId(id);
-    *num_elements = CountChunks(*chunk_list, kChunks[idx].tag);
+    if (chunk_list == NULL) {
+      *num_elements = 0;
+    } else {
+      const CHUNK_INDEX idx = ChunkGetIndexFromId(id);
+      *num_elements = CountChunks(*chunk_list, kChunks[idx].tag);
+    }
   }
 
   return WEBP_MUX_OK;
@@ -538,3 +406,6 @@ WebPMuxError WebPMuxNumChunks(const WebPMux* mux,
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/mux_types.h b/drivers/webp/mux_types.h
deleted file mode 100644
index c94043a3c0..0000000000
--- a/drivers/webp/mux_types.h
+++ /dev/null
@@ -1,97 +0,0 @@
-// Copyright 2012 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Data-types common to the mux and demux libraries.
-//
-// Author: Urvang (urvang@google.com)
-
-#ifndef WEBP_WEBP_MUX_TYPES_H_
-#define WEBP_WEBP_MUX_TYPES_H_
-
-#include <stdlib.h>  // free()
-#include <string.h>  // memset()
-#include "./types.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Note: forward declaring enumerations is not allowed in (strict) C and C++,
-// the types are left here for reference.
-// typedef enum WebPFeatureFlags WebPFeatureFlags;
-// typedef enum WebPMuxAnimDispose WebPMuxAnimDispose;
-// typedef enum WebPMuxAnimBlend WebPMuxAnimBlend;
-typedef struct WebPData WebPData;
-
-// VP8X Feature Flags.
-typedef enum WebPFeatureFlags {
-  FRAGMENTS_FLAG  = 0x00000001,
-  ANIMATION_FLAG  = 0x00000002,
-  XMP_FLAG        = 0x00000004,
-  EXIF_FLAG       = 0x00000008,
-  ALPHA_FLAG      = 0x00000010,
-  ICCP_FLAG       = 0x00000020
-} WebPFeatureFlags;
-
-// Dispose method (animation only). Indicates how the area used by the current
-// frame is to be treated before rendering the next frame on the canvas.
-typedef enum WebPMuxAnimDispose {
-  WEBP_MUX_DISPOSE_NONE,       // Do not dispose.
-  WEBP_MUX_DISPOSE_BACKGROUND  // Dispose to background color.
-} WebPMuxAnimDispose;
-
-// Blend operation (animation only). Indicates how transparent pixels of the
-// current frame are blended with those of the previous canvas.
-typedef enum WebPMuxAnimBlend {
-  WEBP_MUX_BLEND,              // Blend.
-  WEBP_MUX_NO_BLEND            // Do not blend.
-} WebPMuxAnimBlend;
-
-// Data type used to describe 'raw' data, e.g., chunk data
-// (ICC profile, metadata) and WebP compressed image data.
-struct WebPData {
-  const uint8_t* bytes;
-  size_t size;
-};
-
-// Initializes the contents of the 'webp_data' object with default values.
-static WEBP_INLINE void WebPDataInit(WebPData* webp_data) {
-  if (webp_data != NULL) {
-    memset(webp_data, 0, sizeof(*webp_data));
-  }
-}
-
-// Clears the contents of the 'webp_data' object by calling free(). Does not
-// deallocate the object itself.
-static WEBP_INLINE void WebPDataClear(WebPData* webp_data) {
-  if (webp_data != NULL) {
-    free((void*)webp_data->bytes);
-    WebPDataInit(webp_data);
-  }
-}
-
-// Allocates necessary storage for 'dst' and copies the contents of 'src'.
-// Returns true on success.
-static WEBP_INLINE int WebPDataCopy(const WebPData* src, WebPData* dst) {
-  if (src == NULL || dst == NULL) return 0;
-  WebPDataInit(dst);
-  if (src->bytes != NULL && src->size != 0) {
-    dst->bytes = (uint8_t*)malloc(src->size);
-    if (dst->bytes == NULL) return 0;
-    memcpy((void*)dst->bytes, src->bytes, src->size);
-    dst->size = src->size;
-  }
-  return 1;
-}
-
-#ifdef __cplusplus
-}    // extern "C"
-#endif
-
-#endif  /* WEBP_WEBP_MUX_TYPES_H_ */
diff --git a/drivers/webp/types.h b/drivers/webp/types.h
index 568d1f263f..3e27190bef 100644
--- a/drivers/webp/types.h
+++ b/drivers/webp/types.h
@@ -1,10 +1,8 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 //  Common types
diff --git a/drivers/webp/utils/alpha_processing.c b/drivers/webp/utils/alpha_processing.c
deleted file mode 100644
index 7362ff94a5..0000000000
--- a/drivers/webp/utils/alpha_processing.c
+++ /dev/null
@@ -1,196 +0,0 @@
-// Copyright 2013 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Utilities for processing transparent channel.
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#include <assert.h>
-#include "./alpha_processing.h"
-
-// Tables can be faster on some platform but incur some extra binary size (~2k).
-// #define USE_TABLES_FOR_ALPHA_MULT
-
-// -----------------------------------------------------------------------------
-
-#define MFIX 24    // 24bit fixed-point arithmetic
-#define HALF ((1u << MFIX) >> 1)
-#define KINV_255 ((1u << MFIX) / 255u)
-
-static uint32_t Mult(uint8_t x, uint32_t mult) {
-  const uint32_t v = (x * mult + HALF) >> MFIX;
-  assert(v <= 255);  // <- 24bit precision is enough to ensure that.
-  return v;
-}
-
-#ifdef USE_TABLES_FOR_ALPHA_MULT
-
-static const uint32_t kMultTables[2][256] = {
-  {    // (255u << MFIX) / alpha
-    0x00000000, 0xff000000, 0x7f800000, 0x55000000, 0x3fc00000, 0x33000000,
-    0x2a800000, 0x246db6db, 0x1fe00000, 0x1c555555, 0x19800000, 0x172e8ba2,
-    0x15400000, 0x139d89d8, 0x1236db6d, 0x11000000, 0x0ff00000, 0x0f000000,
-    0x0e2aaaaa, 0x0d6bca1a, 0x0cc00000, 0x0c249249, 0x0b9745d1, 0x0b1642c8,
-    0x0aa00000, 0x0a333333, 0x09cec4ec, 0x0971c71c, 0x091b6db6, 0x08cb08d3,
-    0x08800000, 0x0839ce73, 0x07f80000, 0x07ba2e8b, 0x07800000, 0x07492492,
-    0x07155555, 0x06e45306, 0x06b5e50d, 0x0689d89d, 0x06600000, 0x063831f3,
-    0x06124924, 0x05ee23b8, 0x05cba2e8, 0x05aaaaaa, 0x058b2164, 0x056cefa8,
-    0x05500000, 0x05343eb1, 0x05199999, 0x05000000, 0x04e76276, 0x04cfb2b7,
-    0x04b8e38e, 0x04a2e8ba, 0x048db6db, 0x0479435e, 0x04658469, 0x045270d0,
-    0x04400000, 0x042e29f7, 0x041ce739, 0x040c30c3, 0x03fc0000, 0x03ec4ec4,
-    0x03dd1745, 0x03ce540f, 0x03c00000, 0x03b21642, 0x03a49249, 0x03976fc6,
-    0x038aaaaa, 0x037e3f1f, 0x03722983, 0x03666666, 0x035af286, 0x034fcace,
-    0x0344ec4e, 0x033a5440, 0x03300000, 0x0325ed09, 0x031c18f9, 0x0312818a,
-    0x03092492, 0x03000000, 0x02f711dc, 0x02ee5846, 0x02e5d174, 0x02dd7baf,
-    0x02d55555, 0x02cd5cd5, 0x02c590b2, 0x02bdef7b, 0x02b677d4, 0x02af286b,
-    0x02a80000, 0x02a0fd5c, 0x029a1f58, 0x029364d9, 0x028ccccc, 0x0286562d,
-    0x02800000, 0x0279c952, 0x0273b13b, 0x026db6db, 0x0267d95b, 0x026217ec,
-    0x025c71c7, 0x0256e62a, 0x0251745d, 0x024c1bac, 0x0246db6d, 0x0241b2f9,
-    0x023ca1af, 0x0237a6f4, 0x0232c234, 0x022df2df, 0x02293868, 0x02249249,
-    0x02200000, 0x021b810e, 0x021714fb, 0x0212bb51, 0x020e739c, 0x020a3d70,
-    0x02061861, 0x02020408, 0x01fe0000, 0x01fa0be8, 0x01f62762, 0x01f25213,
-    0x01ee8ba2, 0x01ead3ba, 0x01e72a07, 0x01e38e38, 0x01e00000, 0x01dc7f10,
-    0x01d90b21, 0x01d5a3e9, 0x01d24924, 0x01cefa8d, 0x01cbb7e3, 0x01c880e5,
-    0x01c55555, 0x01c234f7, 0x01bf1f8f, 0x01bc14e5, 0x01b914c1, 0x01b61eed,
-    0x01b33333, 0x01b05160, 0x01ad7943, 0x01aaaaaa, 0x01a7e567, 0x01a5294a,
-    0x01a27627, 0x019fcbd2, 0x019d2a20, 0x019a90e7, 0x01980000, 0x01957741,
-    0x0192f684, 0x01907da4, 0x018e0c7c, 0x018ba2e8, 0x018940c5, 0x0186e5f0,
-    0x01849249, 0x018245ae, 0x01800000, 0x017dc11f, 0x017b88ee, 0x0179574e,
-    0x01772c23, 0x01750750, 0x0172e8ba, 0x0170d045, 0x016ebdd7, 0x016cb157,
-    0x016aaaaa, 0x0168a9b9, 0x0166ae6a, 0x0164b8a7, 0x0162c859, 0x0160dd67,
-    0x015ef7bd, 0x015d1745, 0x015b3bea, 0x01596596, 0x01579435, 0x0155c7b4,
-    0x01540000, 0x01523d03, 0x01507eae, 0x014ec4ec, 0x014d0fac, 0x014b5edc,
-    0x0149b26c, 0x01480a4a, 0x01466666, 0x0144c6af, 0x01432b16, 0x0141938b,
-    0x01400000, 0x013e7063, 0x013ce4a9, 0x013b5cc0, 0x0139d89d, 0x01385830,
-    0x0136db6d, 0x01356246, 0x0133ecad, 0x01327a97, 0x01310bf6, 0x012fa0be,
-    0x012e38e3, 0x012cd459, 0x012b7315, 0x012a150a, 0x0128ba2e, 0x01276276,
-    0x01260dd6, 0x0124bc44, 0x01236db6, 0x01222222, 0x0120d97c, 0x011f93bc,
-    0x011e50d7, 0x011d10c4, 0x011bd37a, 0x011a98ef, 0x0119611a, 0x01182bf2,
-    0x0116f96f, 0x0115c988, 0x01149c34, 0x0113716a, 0x01124924, 0x01112358,
-    0x01100000, 0x010edf12, 0x010dc087, 0x010ca458, 0x010b8a7d, 0x010a72f0,
-    0x01095da8, 0x01084a9f, 0x010739ce, 0x01062b2e, 0x01051eb8, 0x01041465,
-    0x01030c30, 0x01020612, 0x01010204, 0x01000000 },
-  {   // alpha * KINV_255
-    0x00000000, 0x00010101, 0x00020202, 0x00030303, 0x00040404, 0x00050505,
-    0x00060606, 0x00070707, 0x00080808, 0x00090909, 0x000a0a0a, 0x000b0b0b,
-    0x000c0c0c, 0x000d0d0d, 0x000e0e0e, 0x000f0f0f, 0x00101010, 0x00111111,
-    0x00121212, 0x00131313, 0x00141414, 0x00151515, 0x00161616, 0x00171717,
-    0x00181818, 0x00191919, 0x001a1a1a, 0x001b1b1b, 0x001c1c1c, 0x001d1d1d,
-    0x001e1e1e, 0x001f1f1f, 0x00202020, 0x00212121, 0x00222222, 0x00232323,
-    0x00242424, 0x00252525, 0x00262626, 0x00272727, 0x00282828, 0x00292929,
-    0x002a2a2a, 0x002b2b2b, 0x002c2c2c, 0x002d2d2d, 0x002e2e2e, 0x002f2f2f,
-    0x00303030, 0x00313131, 0x00323232, 0x00333333, 0x00343434, 0x00353535,
-    0x00363636, 0x00373737, 0x00383838, 0x00393939, 0x003a3a3a, 0x003b3b3b,
-    0x003c3c3c, 0x003d3d3d, 0x003e3e3e, 0x003f3f3f, 0x00404040, 0x00414141,
-    0x00424242, 0x00434343, 0x00444444, 0x00454545, 0x00464646, 0x00474747,
-    0x00484848, 0x00494949, 0x004a4a4a, 0x004b4b4b, 0x004c4c4c, 0x004d4d4d,
-    0x004e4e4e, 0x004f4f4f, 0x00505050, 0x00515151, 0x00525252, 0x00535353,
-    0x00545454, 0x00555555, 0x00565656, 0x00575757, 0x00585858, 0x00595959,
-    0x005a5a5a, 0x005b5b5b, 0x005c5c5c, 0x005d5d5d, 0x005e5e5e, 0x005f5f5f,
-    0x00606060, 0x00616161, 0x00626262, 0x00636363, 0x00646464, 0x00656565,
-    0x00666666, 0x00676767, 0x00686868, 0x00696969, 0x006a6a6a, 0x006b6b6b,
-    0x006c6c6c, 0x006d6d6d, 0x006e6e6e, 0x006f6f6f, 0x00707070, 0x00717171,
-    0x00727272, 0x00737373, 0x00747474, 0x00757575, 0x00767676, 0x00777777,
-    0x00787878, 0x00797979, 0x007a7a7a, 0x007b7b7b, 0x007c7c7c, 0x007d7d7d,
-    0x007e7e7e, 0x007f7f7f, 0x00808080, 0x00818181, 0x00828282, 0x00838383,
-    0x00848484, 0x00858585, 0x00868686, 0x00878787, 0x00888888, 0x00898989,
-    0x008a8a8a, 0x008b8b8b, 0x008c8c8c, 0x008d8d8d, 0x008e8e8e, 0x008f8f8f,
-    0x00909090, 0x00919191, 0x00929292, 0x00939393, 0x00949494, 0x00959595,
-    0x00969696, 0x00979797, 0x00989898, 0x00999999, 0x009a9a9a, 0x009b9b9b,
-    0x009c9c9c, 0x009d9d9d, 0x009e9e9e, 0x009f9f9f, 0x00a0a0a0, 0x00a1a1a1,
-    0x00a2a2a2, 0x00a3a3a3, 0x00a4a4a4, 0x00a5a5a5, 0x00a6a6a6, 0x00a7a7a7,
-    0x00a8a8a8, 0x00a9a9a9, 0x00aaaaaa, 0x00ababab, 0x00acacac, 0x00adadad,
-    0x00aeaeae, 0x00afafaf, 0x00b0b0b0, 0x00b1b1b1, 0x00b2b2b2, 0x00b3b3b3,
-    0x00b4b4b4, 0x00b5b5b5, 0x00b6b6b6, 0x00b7b7b7, 0x00b8b8b8, 0x00b9b9b9,
-    0x00bababa, 0x00bbbbbb, 0x00bcbcbc, 0x00bdbdbd, 0x00bebebe, 0x00bfbfbf,
-    0x00c0c0c0, 0x00c1c1c1, 0x00c2c2c2, 0x00c3c3c3, 0x00c4c4c4, 0x00c5c5c5,
-    0x00c6c6c6, 0x00c7c7c7, 0x00c8c8c8, 0x00c9c9c9, 0x00cacaca, 0x00cbcbcb,
-    0x00cccccc, 0x00cdcdcd, 0x00cecece, 0x00cfcfcf, 0x00d0d0d0, 0x00d1d1d1,
-    0x00d2d2d2, 0x00d3d3d3, 0x00d4d4d4, 0x00d5d5d5, 0x00d6d6d6, 0x00d7d7d7,
-    0x00d8d8d8, 0x00d9d9d9, 0x00dadada, 0x00dbdbdb, 0x00dcdcdc, 0x00dddddd,
-    0x00dedede, 0x00dfdfdf, 0x00e0e0e0, 0x00e1e1e1, 0x00e2e2e2, 0x00e3e3e3,
-    0x00e4e4e4, 0x00e5e5e5, 0x00e6e6e6, 0x00e7e7e7, 0x00e8e8e8, 0x00e9e9e9,
-    0x00eaeaea, 0x00ebebeb, 0x00ececec, 0x00ededed, 0x00eeeeee, 0x00efefef,
-    0x00f0f0f0, 0x00f1f1f1, 0x00f2f2f2, 0x00f3f3f3, 0x00f4f4f4, 0x00f5f5f5,
-    0x00f6f6f6, 0x00f7f7f7, 0x00f8f8f8, 0x00f9f9f9, 0x00fafafa, 0x00fbfbfb,
-    0x00fcfcfc, 0x00fdfdfd, 0x00fefefe, 0x00ffffff }
-};
-
-static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
-  return kMultTables[!inverse][a];
-}
-
-#else
-
-static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
-  return inverse ? (255u << MFIX) / a : a * KINV_255;
-}
-
-#endif    // USE_TABLES_FOR_ALPHA_MULT
-
-void WebPMultARGBRow(uint32_t* const ptr, int width, int inverse) {
-  int x;
-  for (x = 0; x < width; ++x) {
-    const uint32_t argb = ptr[x];
-    if (argb < 0xff000000u) {      // alpha < 255
-      if (argb <= 0x00ffffffu) {   // alpha == 0
-        ptr[x] = 0;
-      } else {
-        const uint32_t alpha = (argb >> 24) & 0xff;
-        const uint32_t scale = GetScale(alpha, inverse);
-        uint32_t out = argb & 0xff000000u;
-        out |= Mult(argb >>  0, scale) <<  0;
-        out |= Mult(argb >>  8, scale) <<  8;
-        out |= Mult(argb >> 16, scale) << 16;
-        ptr[x] = out;
-      }
-    }
-  }
-}
-
-void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
-                      int inverse) {
-  int n;
-  for (n = 0; n < num_rows; ++n) {
-    WebPMultARGBRow((uint32_t*)ptr, width, inverse);
-    ptr += stride;
-  }
-}
-
-void WebPMultRow(uint8_t* const ptr, const uint8_t* const alpha,
-                 int width, int inverse) {
-  int x;
-  for (x = 0; x < width; ++x) {
-    const uint32_t a = alpha[x];
-    if (a != 255) {
-      if (a == 0) {
-        ptr[x] = 0;
-      } else {
-        const uint32_t scale = GetScale(a, inverse);
-        ptr[x] = Mult(ptr[x], scale);
-      }
-    }
-  }
-}
-
-void WebPMultRows(uint8_t* ptr, int stride,
-                  const uint8_t* alpha, int alpha_stride,
-                  int width, int num_rows, int inverse) {
-  int n;
-  for (n = 0; n < num_rows; ++n) {
-    WebPMultRow(ptr, alpha, width, inverse);
-    ptr += stride;
-    alpha += alpha_stride;
-  }
-}
-
-#undef KINV_255
-#undef HALF
-#undef MFIX
-
diff --git a/drivers/webp/utils/alpha_processing.h b/drivers/webp/utils/alpha_processing.h
deleted file mode 100644
index 80e1ae45dd..0000000000
--- a/drivers/webp/utils/alpha_processing.h
+++ /dev/null
@@ -1,46 +0,0 @@
-// Copyright 2013 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Utilities for processing transparent channel.
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#ifndef WEBP_UTILS_ALPHA_PROCESSING_H_
-#define WEBP_UTILS_ALPHA_PROCESSING_H_
-
-#include "../webp/types.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Pre-Multiply operation transforms x into x * A / 255  (where x=Y,R,G or B).
-// Un-Multiply operation transforms x into x * 255 / A.
-
-// Pre-Multiply or Un-Multiply (if 'inverse' is true) argb values in a row.
-void WebPMultARGBRow(uint32_t* const ptr, int width, int inverse);
-
-// Same a WebPMultARGBRow(), but for several rows.
-void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
-                      int inverse);
-
-// Same for a row of single values, with side alpha values.
-void WebPMultRow(uint8_t* const ptr, const uint8_t* const alpha,
-                 int width, int inverse);
-
-// Same a WebPMultRow(), but for several 'num_rows' rows.
-void WebPMultRows(uint8_t* ptr, int stride,
-                  const uint8_t* alpha, int alpha_stride,
-                  int width, int num_rows, int inverse);
-
-#ifdef __cplusplus
-}    // extern "C"
-#endif
-
-#endif    // WEBP_UTILS_ALPHA_PROCESSING_H_
diff --git a/drivers/webp/utils/bit_reader.c b/drivers/webp/utils/bit_reader.c
index bfa4d7d2e2..1afb1db890 100644
--- a/drivers/webp/utils/bit_reader.c
+++ b/drivers/webp/utils/bit_reader.c
@@ -1,10 +1,8 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Boolean decoder
@@ -13,12 +11,12 @@
 
 #include "./bit_reader.h"
 
-#ifndef USE_RIGHT_JUSTIFY
-#define MK(X) (((range_t)(X) << (BITS)) | (MASK))
-#else
-#define MK(X) ((range_t)(X))
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
 #endif
 
+#define MK(X) (((bit_t)(X) << (BITS)) | (MASK))
+
 //------------------------------------------------------------------------------
 // VP8BitReader
 
@@ -31,7 +29,7 @@ void VP8InitBitReader(VP8BitReader* const br,
   br->buf_     = start;
   br->buf_end_ = end;
   br->value_   = 0;
-  br->bits_    = -8;   // to load the very first 8bits
+  br->missing_ = 8;   // to load the very first 8bits
   br->eof_     = 0;
 }
 
@@ -48,7 +46,7 @@ const uint8_t kVP8Log2Range[128] = {
 };
 
 // range = (range << kVP8Log2Range[range]) + trailing 1's
-const range_t kVP8NewRange[128] = {
+const bit_t kVP8NewRange[128] = {
   MK(127), MK(127), MK(191), MK(127), MK(159), MK(191), MK(223), MK(127),
   MK(143), MK(159), MK(175), MK(191), MK(207), MK(223), MK(239), MK(127),
   MK(135), MK(143), MK(151), MK(159), MK(167), MK(175), MK(183), MK(191),
@@ -73,19 +71,9 @@ void VP8LoadFinalBytes(VP8BitReader* const br) {
   assert(br != NULL && br->buf_ != NULL);
   // Only read 8bits at a time
   if (br->buf_ < br->buf_end_) {
-#ifndef USE_RIGHT_JUSTIFY
-    br->value_ |= (bit_t)(*br->buf_++) << ((BITS) - 8 - br->bits_);
-#else
-    br->value_ = (bit_t)(*br->buf_++) | (br->value_ << 8);
-#endif
-    br->bits_ += 8;
-  } else if (!br->eof_) {
-#ifdef USE_RIGHT_JUSTIFY
-    // These are not strictly needed, but it makes the behaviour
-    // consistent for both USE_RIGHT_JUSTIFY and !USE_RIGHT_JUSTIFY.
-    br->value_ <<= 8;
-    br->bits_ += 8;
-#endif
+    br->value_ |= (bit_t)(*br->buf_++) << ((BITS) - 8 + br->missing_);
+    br->missing_ -= 8;
+  } else {
     br->eof_ = 1;
   }
 }
@@ -111,10 +99,6 @@ int32_t VP8GetSignedValue(VP8BitReader* const br, int bits) {
 
 #define MAX_NUM_BIT_READ 25
 
-#define LBITS 64      // Number of bits prefetched.
-#define WBITS 32      // Minimum number of bytes needed after VP8LFillBitWindow.
-#define LOG8_WBITS 4  // Number of bytes needed to store WBITS bits.
-
 static const uint32_t kBitMask[MAX_NUM_BIT_READ] = {
   0, 1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767,
   65535, 131071, 262143, 524287, 1048575, 2097151, 4194303, 8388607, 16777215
@@ -136,7 +120,7 @@ void VP8LInitBitReader(VP8LBitReader* const br,
   br->eos_ = 0;
   br->error_ = 0;
   for (i = 0; i < sizeof(br->val_) && i < br->len_; ++i) {
-    br->val_ |= ((vp8l_val_t)br->buf_[br->pos_]) << (8 * i);
+    br->val_ |= ((uint64_t)br->buf_[br->pos_]) << (8 * i);
     ++br->pos_;
   }
 }
@@ -151,57 +135,95 @@ void VP8LBitReaderSetBuffer(VP8LBitReader* const br,
   br->len_ = len;
 }
 
-// If not at EOS, reload up to LBITS byte-by-byte
 static void ShiftBytes(VP8LBitReader* const br) {
   while (br->bit_pos_ >= 8 && br->pos_ < br->len_) {
     br->val_ >>= 8;
-    br->val_ |= ((vp8l_val_t)br->buf_[br->pos_]) << (LBITS - 8);
+    br->val_ |= ((uint64_t)br->buf_[br->pos_]) << 56;
     ++br->pos_;
     br->bit_pos_ -= 8;
   }
 }
 
 void VP8LFillBitWindow(VP8LBitReader* const br) {
-  if (br->bit_pos_ >= WBITS) {
-#if (defined(__x86_64__) || defined(_M_X64))
-    if (br->pos_ + sizeof(br->val_) < br->len_) {
-      br->val_ >>= WBITS;
-      br->bit_pos_ -= WBITS;
+  if (br->bit_pos_ >= 32) {
+#if defined(__x86_64__) || defined(_M_X64)
+    if (br->pos_ + 8 < br->len_) {
+      br->val_ >>= 32;
       // The expression below needs a little-endian arch to work correctly.
       // This gives a large speedup for decoding speed.
-      br->val_ |= *(const vp8l_val_t*)(br->buf_ + br->pos_) << (LBITS - WBITS);
-      br->pos_ += LOG8_WBITS;
-      return;
+      br->val_ |= *(const uint64_t *)(br->buf_ + br->pos_) << 32;
+      br->pos_ += 4;
+      br->bit_pos_ -= 32;
+    } else {
+      // Slow path.
+      ShiftBytes(br);
     }
+#else
+    // Always the slow path.
+    ShiftBytes(br);
 #endif
-    ShiftBytes(br);       // Slow path.
-    if (br->pos_ == br->len_ && br->bit_pos_ >= LBITS) {
+  }
+  if (br->pos_ == br->len_ && br->bit_pos_ == 64) {
+    br->eos_ = 1;
+  }
+}
+
+uint32_t VP8LReadOneBit(VP8LBitReader* const br) {
+  const uint32_t val = (br->val_ >> br->bit_pos_) & 1;
+  // Flag an error at end_of_stream.
+  if (!br->eos_) {
+    ++br->bit_pos_;
+    if (br->bit_pos_ >= 32) {
+      ShiftBytes(br);
+    }
+    // After this last bit is read, check if eos needs to be flagged.
+    if (br->pos_ == br->len_ && br->bit_pos_ == 64) {
       br->eos_ = 1;
     }
+  } else {
+    br->error_ = 1;
   }
+  return val;
 }
 
 uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits) {
+  uint32_t val = 0;
   assert(n_bits >= 0);
   // Flag an error if end_of_stream or n_bits is more than allowed limit.
   if (!br->eos_ && n_bits < MAX_NUM_BIT_READ) {
-    const uint32_t val =
-        (uint32_t)(br->val_ >> br->bit_pos_) & kBitMask[n_bits];
-    const int new_bits = br->bit_pos_ + n_bits;
-    br->bit_pos_ = new_bits;
     // If this read is going to cross the read buffer, set the eos flag.
     if (br->pos_ == br->len_) {
-      if (new_bits >= LBITS) {
+      if ((br->bit_pos_ + n_bits) >= 64) {
         br->eos_ = 1;
+        if ((br->bit_pos_ + n_bits) > 64) return val;
+      }
+    }
+    val = (br->val_ >> br->bit_pos_) & kBitMask[n_bits];
+    br->bit_pos_ += n_bits;
+    if (br->bit_pos_ >= 40) {
+      if (br->pos_ + 5 < br->len_) {
+        br->val_ >>= 40;
+        br->val_ |=
+            (((uint64_t)br->buf_[br->pos_ + 0]) << 24) |
+            (((uint64_t)br->buf_[br->pos_ + 1]) << 32) |
+            (((uint64_t)br->buf_[br->pos_ + 2]) << 40) |
+            (((uint64_t)br->buf_[br->pos_ + 3]) << 48) |
+            (((uint64_t)br->buf_[br->pos_ + 4]) << 56);
+        br->pos_ += 5;
+        br->bit_pos_ -= 40;
+      }
+      if (br->bit_pos_ >= 8) {
+        ShiftBytes(br);
       }
     }
-    ShiftBytes(br);
-    return val;
   } else {
     br->error_ = 1;
-    return 0;
   }
+  return val;
 }
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/utils/bit_reader.h b/drivers/webp/utils/bit_reader.h
index 98df98a767..36fc13e2da 100644
--- a/drivers/webp/utils/bit_reader.h
+++ b/drivers/webp/utils/bit_reader.h
@@ -1,10 +1,8 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Boolean decoder
@@ -19,86 +17,18 @@
 #ifdef _MSC_VER
 #include <stdlib.h>  // _byteswap_ulong
 #endif
+#include <string.h>  // For memcpy
 #include "../webp/types.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-// The Boolean decoder needs to maintain infinite precision on the value_ field.
-// However, since range_ is only 8bit, we only need an active window of 8 bits
-// for value_. Left bits (MSB) gets zeroed and shifted away when value_ falls
-// below 128, range_ is updated, and fresh bits read from the bitstream are
-// brought in as LSB.
-// To avoid reading the fresh bits one by one (slow), we cache a few of them
-// ahead (actually, we cache BITS of them ahead. See below). There's two
-// strategies regarding how to shift these looked-ahead fresh bits into the
-// 8bit window of value_: either we shift them in, while keeping the position of
-// the window fixed. Or we slide the window to the right while keeping the cache
-// bits at a fixed, right-justified, position.
-//
-//  Example, for BITS=16: here is the content of value_ for both strategies:
-//
-//          !USE_RIGHT_JUSTIFY            ||        USE_RIGHT_JUSTIFY
-//                                        ||
-//   <- 8b -><- 8b -><- BITS bits  ->     ||  <- 8b+3b -><- 8b -><- 13 bits ->
-//   [unused][value_][cached bits][0]     ||  [unused...][value_][cached bits]
-//  [........00vvvvvvBBBBBBBBBBBBB000]LSB || [...........00vvvvvvBBBBBBBBBBBBB]
-//                                        ||
-// After calling VP8Shift(), where we need to shift away two zeros:
-//  [........vvvvvvvvBBBBBBBBBBB00000]LSB || [.............vvvvvvvvBBBBBBBBBBB]
-//                                        ||
-// Just before we need to call VP8LoadNewBytes(), the situation is:
-//  [........vvvvvv000000000000000000]LSB || [..........................vvvvvv]
-//                                        ||
-// And just after calling VP8LoadNewBytes():
-//  [........vvvvvvvvBBBBBBBBBBBBBBBB]LSB || [........vvvvvvvvBBBBBBBBBBBBBBBB]
-//
-// -> we're back to eight active 'value_' bits (marked 'v') and BITS cached
-// bits (marked 'B')
-//
-// The right-justify strategy tends to use less shifts and is often faster.
-
-//------------------------------------------------------------------------------
-// BITS can be any multiple of 8 from 8 to 56 (inclusive).
-// Pick values that fit natural register size.
-
-#if !defined(WEBP_REFERENCE_IMPLEMENTATION)
-
-#define USE_RIGHT_JUSTIFY
-
-#if defined(__i386__) || defined(_M_IX86)      // x86 32bit
-#define BITS 16
-#elif defined(__x86_64__) || defined(_M_X64)   // x86 64bit
-#define BITS 56
-#elif defined(__arm__) || defined(_M_ARM)      // ARM
-#define BITS 24
-#else                      // reasonable default
-#define BITS 24
-#endif
-
-#else     // reference choices
-
-#define USE_RIGHT_JUSTIFY
-#define BITS 8
-
-#endif
-
-//------------------------------------------------------------------------------
-// Derived types and constants
-
-// bit_t = natural register type
-// lbit_t = natural type for memory I/O
-
-#if (BITS > 32)
-typedef uint64_t bit_t;
-typedef uint64_t lbit_t;
-#elif (BITS == 32)
-typedef uint64_t bit_t;
-typedef uint32_t lbit_t;
-#elif (BITS == 24)
-typedef uint32_t bit_t;
-typedef uint32_t lbit_t;
+#define BITS 32     // can be 32, 16 or 8
+#define MASK ((((bit_t)1) << (BITS)) - 1)
+#if (BITS == 32)
+typedef uint64_t bit_t;   // natural register type
+typedef uint32_t lbit_t;  // natural type for memory I/O
 #elif (BITS == 16)
 typedef uint32_t bit_t;
 typedef uint16_t lbit_t;
@@ -107,15 +37,8 @@ typedef uint32_t bit_t;
 typedef uint8_t lbit_t;
 #endif
 
-#ifndef USE_RIGHT_JUSTIFY
-typedef bit_t range_t;     // type for storing range_
-#define MASK ((((bit_t)1) << (BITS)) - 1)
-#else
-typedef uint32_t range_t;  // range_ only uses 8bits here. No need for bit_t.
-#endif
-
 //------------------------------------------------------------------------------
-// Bitreader
+// Bitreader and code-tree reader
 
 typedef struct VP8BitReader VP8BitReader;
 struct VP8BitReader {
@@ -124,9 +47,9 @@ struct VP8BitReader {
   int eof_;                   // true if input is exhausted
 
   // boolean decoder
-  range_t range_;            // current range minus 1. In [127, 254] interval.
-  bit_t value_;              // current value
-  int bits_;                 // number of valid bits left
+  bit_t range_;            // current range minus 1. In [127, 254] interval.
+  bit_t value_;            // current value
+  int missing_;            // number of missing bits in value_ (8bit)
 };
 
 // Initialize the bit reader and the boolean decoder.
@@ -144,160 +67,98 @@ int32_t VP8GetSignedValue(VP8BitReader* const br, int num_bits);
 
 // Read a bit with proba 'prob'. Speed-critical function!
 extern const uint8_t kVP8Log2Range[128];
-extern const range_t kVP8NewRange[128];
+extern const bit_t kVP8NewRange[128];
 
 void VP8LoadFinalBytes(VP8BitReader* const br);    // special case for the tail
 
 static WEBP_INLINE void VP8LoadNewBytes(VP8BitReader* const br) {
-  assert(br != NULL && br->buf_ != NULL);
+  assert(br && br->buf_);
   // Read 'BITS' bits at a time if possible.
   if (br->buf_ + sizeof(lbit_t) <= br->buf_end_) {
     // convert memory type to register type (with some zero'ing!)
     bit_t bits;
-    const lbit_t in_bits = *(const lbit_t*)br->buf_;
+    lbit_t in_bits = *(lbit_t*)br->buf_;
     br->buf_ += (BITS) >> 3;
 #if !defined(__BIG_ENDIAN__)
-#if (BITS > 32)
-// gcc 4.3 has builtin functions for swap32/swap64
-#if defined(__GNUC__) && \
-           (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
-    bits = (bit_t)__builtin_bswap64(in_bits);
-#elif defined(_MSC_VER)
-    bits = (bit_t)_byteswap_uint64(in_bits);
-#elif defined(__x86_64__)
-    __asm__ volatile("bswapq %0" : "=r"(bits) : "0"(in_bits));
-#else  // generic code for swapping 64-bit values (suggested by bdb@)
-    bits = (bit_t)in_bits;
-    bits = ((bits & 0xffffffff00000000ull) >> 32) |
-           ((bits & 0x00000000ffffffffull) << 32);
-    bits = ((bits & 0xffff0000ffff0000ull) >> 16) |
-           ((bits & 0x0000ffff0000ffffull) << 16);
-    bits = ((bits & 0xff00ff00ff00ff00ull) >> 8) |
-           ((bits & 0x00ff00ff00ff00ffull) << 8);
-#endif
-    bits >>= 64 - BITS;
-#elif (BITS >= 24)
+#if (BITS == 32)
 #if defined(__i386__) || defined(__x86_64__)
-    {
-      lbit_t swapped_in_bits;
-      __asm__ volatile("bswap %k0" : "=r"(swapped_in_bits) : "0"(in_bits));
-      bits = (bit_t)swapped_in_bits;   // 24b/32b -> 32b/64b zero-extension
-    }
+    __asm__ volatile("bswap %k0" : "=r"(in_bits) : "0"(in_bits));
+    bits = (bit_t)in_bits;   // 32b -> 64b zero-extension
 #elif defined(_MSC_VER)
-    bits = (bit_t)_byteswap_ulong(in_bits);
+    bits = _byteswap_ulong(in_bits);
 #else
     bits = (bit_t)(in_bits >> 24) | ((in_bits >> 8) & 0xff00)
          | ((in_bits << 8) & 0xff0000)  | (in_bits << 24);
 #endif  // x86
-    bits >>= (32 - BITS);
 #elif (BITS == 16)
     // gcc will recognize a 'rorw $8, ...' here:
     bits = (bit_t)(in_bits >> 8) | ((in_bits & 0xff) << 8);
-#else   // BITS == 8
-    bits = (bit_t)in_bits;
 #endif
-#else    // BIG_ENDIAN
+#else    // BIG_ENDIAN
     bits = (bit_t)in_bits;
-    if (BITS != 8 * sizeof(bit_t)) bits >>= (8 * sizeof(bit_t) - BITS);
-#endif
-#ifndef USE_RIGHT_JUSTIFY
-    br->value_ |= bits << (-br->bits_);
-#else
-    br->value_ = bits | (br->value_ << (BITS));
 #endif
-    br->bits_ += (BITS);
+    br->value_ |= bits << br->missing_;
+    br->missing_ -= (BITS);
   } else {
     VP8LoadFinalBytes(br);    // no need to be inlined
   }
 }
 
-static WEBP_INLINE int VP8BitUpdate(VP8BitReader* const br, range_t split) {
-  if (br->bits_ < 0) {  // Make sure we have a least BITS bits in 'value_'
+static WEBP_INLINE int VP8BitUpdate(VP8BitReader* const br, bit_t split) {
+  const bit_t value_split = split | (MASK);
+  if (br->missing_ > 0) {  // Make sure we have at least BITS bits in 'value_'
     VP8LoadNewBytes(br);
   }
-#ifndef USE_RIGHT_JUSTIFY
-  split |= (MASK);
-  if (br->value_ > split) {
-    br->range_ -= split + 1;
-    br->value_ -= split + 1;
+  if (br->value_ > value_split) {
+    br->range_ -= value_split + 1;
+    br->value_ -= value_split + 1;
     return 1;
   } else {
-    br->range_ = split;
+    br->range_ = value_split;
     return 0;
   }
-#else
-  {
-    const int pos = br->bits_;
-    const range_t value = (range_t)(br->value_ >> pos);
-    if (value > split) {
-      br->range_ -= split + 1;
-      br->value_ -= (bit_t)(split + 1) << pos;
-      return 1;
-    } else {
-      br->range_ = split;
-      return 0;
-    }
-  }
-#endif
 }
 
 static WEBP_INLINE void VP8Shift(VP8BitReader* const br) {
-#ifndef USE_RIGHT_JUSTIFY
   // range_ is in [0..127] interval here.
-  const bit_t idx = br->range_ >> (BITS);
+  const int idx = br->range_ >> (BITS);
   const int shift = kVP8Log2Range[idx];
   br->range_ = kVP8NewRange[idx];
   br->value_ <<= shift;
-  br->bits_ -= shift;
-#else
-  const int shift = kVP8Log2Range[br->range_];
-  assert(br->range_ < (range_t)128);
-  br->range_ = kVP8NewRange[br->range_];
-  br->bits_ -= shift;
-#endif
+  br->missing_ += shift;
 }
 
 static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob) {
-#ifndef USE_RIGHT_JUSTIFY
   // It's important to avoid generating a 64bit x 64bit multiply here.
   // We just need an 8b x 8b after all.
-  const range_t split =
-      (range_t)((uint32_t)(br->range_ >> (BITS)) * prob) << ((BITS) - 8);
+  const bit_t split =
+      (bit_t)((uint32_t)(br->range_ >> (BITS)) * prob) << ((BITS) - 8);
   const int bit = VP8BitUpdate(br, split);
-  if (br->range_ <= (((range_t)0x7e << (BITS)) | (MASK))) {
+  if (br->range_ <= (((bit_t)0x7e << (BITS)) | (MASK))) {
     VP8Shift(br);
   }
   return bit;
-#else
-  const range_t split = (br->range_ * prob) >> 8;
-  const int bit = VP8BitUpdate(br, split);
-  if (br->range_ <= (range_t)0x7e) {
-    VP8Shift(br);
-  }
-  return bit;
-#endif
 }
 
 static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v) {
-  const range_t split = (br->range_ >> 1);
+  const bit_t split = (br->range_ >> 1);
   const int bit = VP8BitUpdate(br, split);
   VP8Shift(br);
   return bit ? -v : v;
 }
 
-// -----------------------------------------------------------------------------
-// Bitreader for lossless format
 
-typedef uint64_t vp8l_val_t;  // right now, this bit-reader can only use 64bit.
+// -----------------------------------------------------------------------------
+// Bitreader
 
 typedef struct {
-  vp8l_val_t     val_;        // pre-fetched bits
-  const uint8_t* buf_;        // input byte buffer
-  size_t         len_;        // buffer length
-  size_t         pos_;        // byte position in buf_
-  int            bit_pos_;    // current bit-reading position in val_
-  int            eos_;        // bitstream is finished
-  int            error_;      // an error occurred (buffer overflow attempt...)
+  uint64_t       val_;
+  const uint8_t* buf_;
+  size_t         len_;
+  size_t         pos_;
+  int            bit_pos_;
+  int            eos_;
+  int            error_;
 } VP8LBitReader;
 
 void VP8LInitBitReader(VP8LBitReader* const br,
@@ -313,21 +174,23 @@ void VP8LBitReaderSetBuffer(VP8LBitReader* const br,
 // Flags eos if this read attempt is going to cross the read buffer.
 uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits);
 
-// Return the prefetched bits, so they can be looked up.
-static WEBP_INLINE uint32_t VP8LPrefetchBits(VP8LBitReader* const br) {
-  return (uint32_t)(br->val_ >> br->bit_pos_);
-}
-
-// For jumping over a number of bits in the bit stream when accessed with
-// VP8LPrefetchBits and VP8LFillBitWindow.
-static WEBP_INLINE void VP8LSetBitPos(VP8LBitReader* const br, int val) {
-  br->bit_pos_ = val;
+// Reads one bit from the read buffer. Flags an error if already at eos.
+// Flags eos after reading the last bit from the buffer.
+uint32_t VP8LReadOneBit(VP8LBitReader* const br);
+
+// VP8LReadOneBitUnsafe is faster than VP8LReadOneBit, but it can be called at
+// most 32 times after the last VP8LFillBitWindow. Any subsequent calls
+// (without VP8LFillBitWindow) will return invalid data.
+static WEBP_INLINE uint32_t VP8LReadOneBitUnsafe(VP8LBitReader* const br) {
+  const uint32_t val = (br->val_ >> br->bit_pos_) & 1;
+  ++br->bit_pos_;
+  return val;
 }
 
-// Advances the read buffer by 4 bytes to make room for reading next 32 bits.
+// Advances the Read buffer by 4 bytes to make room for reading next 32 bits.
 void VP8LFillBitWindow(VP8LBitReader* const br);
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/utils/bit_writer.c b/drivers/webp/utils/bit_writer.c
index 29810a1749..671159cacd 100644
--- a/drivers/webp/utils/bit_writer.c
+++ b/drivers/webp/utils/bit_writer.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Bit writing and boolean coder
@@ -17,6 +15,10 @@
 #include <stdlib.h>
 #include "./bit_writer.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // VP8BitWriter
 
@@ -39,10 +41,7 @@ static int BitWriterResize(VP8BitWriter* const bw, size_t extra_size) {
     bw->error_ = 1;
     return 0;
   }
-  if (bw->pos_ > 0) {
-    assert(bw->buf_ != NULL);
-    memcpy(new_buf, bw->buf_, bw->pos_);
-  }
+  memcpy(new_buf, bw->buf_, bw->pos_);
   free(bw->buf_);
   bw->buf_ = new_buf;
   bw->max_pos_ = new_size;
@@ -252,7 +251,7 @@ void VP8LWriteBits(VP8LBitWriter* const bw, int n_bits, uint32_t bits) {
     uint8_t* p = &bw->buf_[bw->bit_pos_ >> 3];
     const int bits_reserved_in_first_byte = bw->bit_pos_ & 7;
     const int bits_left_to_write = n_bits - 8 + bits_reserved_in_first_byte;
-    // implicit & 0xff is assumed for uint8_t arithmetic
+    // implicit & 0xff is assumed for uint8_t arithmetic
     *p++ |= bits << bits_reserved_in_first_byte;
     bits >>= 8 - bits_reserved_in_first_byte;
     if (bits_left_to_write >= 1) {
@@ -280,3 +279,6 @@ void VP8LWriteBits(VP8LBitWriter* const bw, int n_bits, uint32_t bits) {
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/utils/bit_writer.h b/drivers/webp/utils/bit_writer.h
index 89a9ead488..f7ca08497f 100644
--- a/drivers/webp/utils/bit_writer.h
+++ b/drivers/webp/utils/bit_writer.h
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Bit writing and boolean coder
@@ -16,7 +14,7 @@
 
 #include "../webp/types.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
@@ -118,7 +116,7 @@ void VP8LWriteBits(VP8LBitWriter* const bw, int n_bits, uint32_t bits);
 
 //------------------------------------------------------------------------------
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/utils/color_cache.c b/drivers/webp/utils/color_cache.c
index 66a44647fd..560f81db10 100644
--- a/drivers/webp/utils/color_cache.c
+++ b/drivers/webp/utils/color_cache.c
@@ -1,10 +1,8 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Color Cache for WebP Lossless
@@ -16,6 +14,10 @@
 #include "./color_cache.h"
 #include "../utils/utils.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // VP8LColorCache.
 
@@ -37,3 +39,6 @@ void VP8LColorCacheClear(VP8LColorCache* const cc) {
   }
 }
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}
+#endif
diff --git a/drivers/webp/utils/color_cache.h b/drivers/webp/utils/color_cache.h
index 0f824ed457..13be629f36 100644
--- a/drivers/webp/utils/color_cache.h
+++ b/drivers/webp/utils/color_cache.h
@@ -1,10 +1,8 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Color Cache for WebP Lossless
@@ -17,7 +15,7 @@
 
 #include "../webp/types.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
@@ -63,7 +61,7 @@ void VP8LColorCacheClear(VP8LColorCache* const color_cache);
 
 //------------------------------------------------------------------------------
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }
 #endif
 
diff --git a/drivers/webp/utils/filters.c b/drivers/webp/utils/filters.c
index 2d15bd0e4a..08f52a3d20 100644
--- a/drivers/webp/utils/filters.c
+++ b/drivers/webp/utils/filters.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Spatial prediction using various filters
@@ -16,17 +14,20 @@
 #include <stdlib.h>
 #include <string.h>
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // Helpful macro.
 
-# define SANITY_CHECK(in, out)                                                 \
-  assert(in != NULL);                                                          \
-  assert(out != NULL);                                                         \
-  assert(width > 0);                                                           \
-  assert(height > 0);                                                          \
-  assert(stride >= width);                                                     \
-  assert(row >= 0 && num_rows > 0 && row + num_rows <= height);                \
-  (void)height;  // Silence unused warning.
+# define SANITY_CHECK(in, out)                              \
+  assert(in != NULL);                                       \
+  assert(out != NULL);                                      \
+  assert(width > 0);                                        \
+  assert(height > 0);                                       \
+  assert(bpp > 0);                                          \
+  assert(stride >= width * bpp);
 
 static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred,
                                     uint8_t* dst, int length, int inverse) {
@@ -42,33 +43,20 @@ static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred,
 // Horizontal filter.
 
 static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
-                                           int width, int height, int stride,
-                                           int row, int num_rows,
-                                           int inverse, uint8_t* out) {
-  const uint8_t* preds;
-  const size_t start_offset = row * stride;
-  const int last_row = row + num_rows;
+    int width, int height, int bpp, int stride, int inverse, uint8_t* out) {
+  int h;
+  const uint8_t* preds = (inverse ? out : in);
   SANITY_CHECK(in, out);
-  in += start_offset;
-  out += start_offset;
-  preds = inverse ? out : in;
-
-  if (row == 0) {
-    // Leftmost pixel is the same as input for topmost scanline.
-    out[0] = in[0];
-    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
-    row = 1;
-    preds += stride;
-    in += stride;
-    out += stride;
-  }
 
   // Filter line-by-line.
-  while (row < last_row) {
-    // Leftmost pixel is predicted from above.
-    PredictLine(in, preds - stride, out, 1, inverse);
-    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
-    ++row;
+  for (h = 0; h < height; ++h) {
+    // Leftmost pixel is predicted from above (except for topmost scanline).
+    if (h == 0) {
+      memcpy((void*)out, (const void*)in, bpp);
+    } else {
+      PredictLine(in, preds - stride, out, bpp, inverse);
+    }
+    PredictLine(in + bpp, preds, out + bpp, bpp * (width - 1), inverse);
     preds += stride;
     in += stride;
     out += stride;
@@ -76,61 +64,46 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
 }
 
 static void HorizontalFilter(const uint8_t* data, int width, int height,
-                             int stride, uint8_t* filtered_data) {
-  DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data);
+                             int bpp, int stride, uint8_t* filtered_data) {
+  DoHorizontalFilter(data, width, height, bpp, stride, 0, filtered_data);
 }
 
-static void HorizontalUnfilter(int width, int height, int stride, int row,
-                               int num_rows, uint8_t* data) {
-  DoHorizontalFilter(data, width, height, stride, row, num_rows, 1, data);
+static void HorizontalUnfilter(const uint8_t* data, int width, int height,
+                               int bpp, int stride, uint8_t* recon_data) {
+  DoHorizontalFilter(data, width, height, bpp, stride, 1, recon_data);
 }
 
 //------------------------------------------------------------------------------
 // Vertical filter.
 
 static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
-                                         int width, int height, int stride,
-                                         int row, int num_rows,
-                                         int inverse, uint8_t* out) {
-  const uint8_t* preds;
-  const size_t start_offset = row * stride;
-  const int last_row = row + num_rows;
+    int width, int height, int bpp, int stride, int inverse, uint8_t* out) {
+  int h;
+  const uint8_t* preds = (inverse ? out : in);
   SANITY_CHECK(in, out);
-  in += start_offset;
-  out += start_offset;
-  preds = inverse ? out : in;
-
-  if (row == 0) {
-    // Very first top-left pixel is copied.
-    out[0] = in[0];
-    // Rest of top scan-line is left-predicted.
-    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
-    row = 1;
-    in += stride;
-    out += stride;
-  } else {
-    // We are starting from in-between. Make sure 'preds' points to prev row.
-    preds -= stride;
-  }
+
+  // Very first top-left pixel is copied.
+  memcpy((void*)out, (const void*)in, bpp);
+  // Rest of top scan-line is left-predicted.
+  PredictLine(in + bpp, preds, out + bpp, bpp * (width - 1), inverse);
 
   // Filter line-by-line.
-  while (row < last_row) {
-    PredictLine(in, preds, out, width, inverse);
-    ++row;
-    preds += stride;
+  for (h = 1; h < height; ++h) {
     in += stride;
     out += stride;
+    PredictLine(in, preds, out, bpp * width, inverse);
+    preds += stride;
   }
 }
 
 static void VerticalFilter(const uint8_t* data, int width, int height,
-                           int stride, uint8_t* filtered_data) {
-  DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data);
+                           int bpp, int stride, uint8_t* filtered_data) {
+  DoVerticalFilter(data, width, height, bpp, stride, 0, filtered_data);
 }
 
-static void VerticalUnfilter(int width, int height, int stride, int row,
-                             int num_rows, uint8_t* data) {
-  DoVerticalFilter(data, width, height, stride, row, num_rows, 1, data);
+static void VerticalUnfilter(const uint8_t* data, int width, int height,
+                             int bpp, int stride, uint8_t* recon_data) {
+  DoVerticalFilter(data, width, height, bpp, stride, 1, recon_data);
 }
 
 //------------------------------------------------------------------------------
@@ -138,63 +111,52 @@ static void VerticalUnfilter(int width, int height, int stride, int row,
 
 static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
   const int g = a + b - c;
-  return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255;  // clip to 8bit
+  return (g < 0) ? 0 : (g > 255) ? 255 : g;
 }
 
-static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
-                                         int width, int height, int stride,
-                                         int row, int num_rows,
-                                         int inverse, uint8_t* out) {
-  const uint8_t* preds;
-  const size_t start_offset = row * stride;
-  const int last_row = row + num_rows;
+static WEBP_INLINE
+void DoGradientFilter(const uint8_t* in, int width, int height,
+                      int bpp, int stride, int inverse, uint8_t* out) {
+  const uint8_t* preds = (inverse ? out : in);
+  int h;
   SANITY_CHECK(in, out);
-  in += start_offset;
-  out += start_offset;
-  preds = inverse ? out : in;
 
   // left prediction for top scan-line
-  if (row == 0) {
-    out[0] = in[0];
-    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
-    row = 1;
-    preds += stride;
-    in += stride;
-    out += stride;
-  }
+  memcpy((void*)out, (const void*)in, bpp);
+  PredictLine(in + bpp, preds, out + bpp, bpp * (width - 1), inverse);
 
   // Filter line-by-line.
-  while (row < last_row) {
+  for (h = 1; h < height; ++h) {
     int w;
+    preds += stride;
+    in += stride;
+    out += stride;
     // leftmost pixel: predict from above.
-    PredictLine(in, preds - stride, out, 1, inverse);
-    for (w = 1; w < width; ++w) {
-      const int pred = GradientPredictor(preds[w - 1],
+    PredictLine(in, preds - stride, out, bpp, inverse);
+    for (w = bpp; w < width * bpp; ++w) {
+      const int pred = GradientPredictor(preds[w - bpp],
                                          preds[w - stride],
-                                         preds[w - stride - 1]);
+                                         preds[w - stride - bpp]);
       out[w] = in[w] + (inverse ? pred : -pred);
     }
-    ++row;
-    preds += stride;
-    in += stride;
-    out += stride;
   }
 }
 
 static void GradientFilter(const uint8_t* data, int width, int height,
-                           int stride, uint8_t* filtered_data) {
-  DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data);
+                           int bpp, int stride, uint8_t* filtered_data) {
+  DoGradientFilter(data, width, height, bpp, stride, 0, filtered_data);
 }
 
-static void GradientUnfilter(int width, int height, int stride, int row,
-                             int num_rows, uint8_t* data) {
-  DoGradientFilter(data, width, height, stride, row, num_rows, 1, data);
+static void GradientUnfilter(const uint8_t* data, int width, int height,
+                             int bpp, int stride, uint8_t* recon_data) {
+  DoGradientFilter(data, width, height, bpp, stride, 1, recon_data);
 }
 
 #undef SANITY_CHECK
 
 // -----------------------------------------------------------------------------
-// Quick estimate of a potentially interesting filter mode to try.
+// Quick estimate of a potentially interesting filter mode to try, in addition
+// to the default NONE.
 
 #define SMAX 16
 #define SDIFF(a, b) (abs((a) - (b)) >> 4)   // Scoring diff, in [0..SMAX)
@@ -204,7 +166,6 @@ WEBP_FILTER_TYPE EstimateBestFilter(const uint8_t* data,
   int i, j;
   int bins[WEBP_FILTER_LAST][SMAX];
   memset(bins, 0, sizeof(bins));
-
   // We only sample every other pixels. That's enough.
   for (j = 2; j < height - 1; j += 2) {
     const uint8_t* const p = data + j * stride;
@@ -224,8 +185,7 @@ WEBP_FILTER_TYPE EstimateBestFilter(const uint8_t* data,
     }
   }
   {
-    int filter;
-    WEBP_FILTER_TYPE best_filter = WEBP_FILTER_NONE;
+    WEBP_FILTER_TYPE filter, best_filter = WEBP_FILTER_NONE;
     int best_score = 0x7fffffff;
     for (filter = WEBP_FILTER_NONE; filter < WEBP_FILTER_LAST; ++filter) {
       int score = 0;
@@ -236,7 +196,7 @@ WEBP_FILTER_TYPE EstimateBestFilter(const uint8_t* data,
       }
       if (score < best_score) {
         best_score = score;
-        best_filter = (WEBP_FILTER_TYPE)filter;
+        best_filter = filter;
       }
     }
     return best_filter;
@@ -255,7 +215,7 @@ const WebPFilterFunc WebPFilters[WEBP_FILTER_LAST] = {
   GradientFilter     // WEBP_FILTER_GRADIENT
 };
 
-const WebPUnfilterFunc WebPUnfilters[WEBP_FILTER_LAST] = {
+const WebPFilterFunc WebPUnfilters[WEBP_FILTER_LAST] = {
   NULL,                // WEBP_FILTER_NONE
   HorizontalUnfilter,  // WEBP_FILTER_HORIZONTAL
   VerticalUnfilter,    // WEBP_FILTER_VERTICAL
@@ -264,3 +224,6 @@ const WebPUnfilterFunc WebPUnfilters[WEBP_FILTER_LAST] = {
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/utils/filters.h b/drivers/webp/utils/filters.h
index dde39cb5c4..c5cdbd6deb 100644
--- a/drivers/webp/utils/filters.h
+++ b/drivers/webp/utils/filters.h
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Spatial prediction using various filters
@@ -16,7 +14,7 @@
 
 #include "../webp/types.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
@@ -32,27 +30,24 @@ typedef enum {
 } WEBP_FILTER_TYPE;
 
 typedef void (*WebPFilterFunc)(const uint8_t* in, int width, int height,
-                               int stride, uint8_t* out);
-typedef void (*WebPUnfilterFunc)(int width, int height, int stride,
-                                 int row, int num_rows, uint8_t* data);
+                               int bpp, int stride, uint8_t* out);
 
 // Filter the given data using the given predictor.
 // 'in' corresponds to a 2-dimensional pixel array of size (stride * height)
 // in raster order.
+// 'bpp' is number of bytes per pixel, and
 // 'stride' is number of bytes per scan line (with possible padding).
 // 'out' should be pre-allocated.
 extern const WebPFilterFunc WebPFilters[WEBP_FILTER_LAST];
 
-// In-place reconstruct the original data from the given filtered data.
-// The reconstruction will be done for 'num_rows' rows starting from 'row'
-// (assuming rows upto 'row - 1' are already reconstructed).
-extern const WebPUnfilterFunc WebPUnfilters[WEBP_FILTER_LAST];
+// Reconstruct the original data from the given filtered data.
+extern const WebPFilterFunc WebPUnfilters[WEBP_FILTER_LAST];
 
 // Fast estimate of a potentially good filter.
-WEBP_FILTER_TYPE EstimateBestFilter(const uint8_t* data,
-                                    int width, int height, int stride);
+extern WEBP_FILTER_TYPE EstimateBestFilter(const uint8_t* data,
+                                           int width, int height, int stride);
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/utils/huffman.c b/drivers/webp/utils/huffman.c
index 8c5739f633..41529cc9da 100644
--- a/drivers/webp/utils/huffman.c
+++ b/drivers/webp/utils/huffman.c
@@ -1,10 +1,8 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Utilities for building and looking up Huffman trees.
@@ -13,14 +11,13 @@
 
 #include <assert.h>
 #include <stdlib.h>
-#include <string.h>
 #include "./huffman.h"
 #include "../utils/utils.h"
 #include "../webp/format_constants.h"
 
-// Uncomment the following to use look-up table for ReverseBits()
-// (might be faster on some platform)
-// #define USE_LUT_REVERSE_BITS
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
 
 #define NON_EXISTENT_SYMBOL (-1)
 
@@ -53,14 +50,11 @@ static int TreeInit(HuffmanTree* const tree, int num_leaves) {
   // Note that a Huffman tree is a full binary tree; and in a full binary tree
   // with L leaves, the total number of nodes N = 2 * L - 1.
   tree->max_nodes_ = 2 * num_leaves - 1;
-  assert(tree->max_nodes_ < (1 << 16));   // limit for the lut_jump_ table
   tree->root_ = (HuffmanTreeNode*)WebPSafeMalloc((uint64_t)tree->max_nodes_,
                                                  sizeof(*tree->root_));
   if (tree->root_ == NULL) return 0;
   TreeNodeInit(tree->root_);  // Initialize root.
   tree->num_nodes_ = 1;
-  memset(tree->lut_bits_, 255, sizeof(tree->lut_bits_));
-  memset(tree->lut_jump_, 0, sizeof(tree->lut_jump_));
   return 1;
 }
 
@@ -121,54 +115,10 @@ int HuffmanCodeLengthsToCodes(const int* const code_lengths,
   return 1;
 }
 
-#ifndef USE_LUT_REVERSE_BITS
-
-static int ReverseBitsShort(int bits, int num_bits) {
-  int retval = 0;
-  int i;
-  assert(num_bits <= 8);   // Not a hard requirement, just for coherency.
-  for (i = 0; i < num_bits; ++i) {
-    retval <<= 1;
-    retval |= bits & 1;
-    bits >>= 1;
-  }
-  return retval;
-}
-
-#else
-
-static const uint8_t kReversedBits[16] = {  // Pre-reversed 4-bit values.
-  0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
-  0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf
-};
-
-static int ReverseBitsShort(int bits, int num_bits) {
-  const uint8_t v = (kReversedBits[bits & 0xf] << 4) | kReversedBits[bits >> 4];
-  assert(num_bits <= 8);
-  return v >> (8 - num_bits);
-}
-
-#endif
-
 static int TreeAddSymbol(HuffmanTree* const tree,
                          int symbol, int code, int code_length) {
-  int step = HUFF_LUT_BITS;
-  int base_code;
   HuffmanTreeNode* node = tree->root_;
   const HuffmanTreeNode* const max_node = tree->root_ + tree->max_nodes_;
-  assert(symbol == (int16_t)symbol);
-  if (code_length <= HUFF_LUT_BITS) {
-    int i;
-    base_code = ReverseBitsShort(code, code_length);
-    for (i = 0; i < (1 << (HUFF_LUT_BITS - code_length)); ++i) {
-      const int idx = base_code | (i << code_length);
-      tree->lut_symbol_[idx] = (int16_t)symbol;
-      tree->lut_bits_[idx] = code_length;
-    }
-  } else {
-    base_code = ReverseBitsShort((code >> (code_length - HUFF_LUT_BITS)),
-                                 HUFF_LUT_BITS);
-  }
   while (code_length-- > 0) {
     if (node >= max_node) {
       return 0;
@@ -176,17 +126,14 @@ static int TreeAddSymbol(HuffmanTree* const tree,
     if (NodeIsEmpty(node)) {
       if (IsFull(tree)) return 0;    // error: too many symbols.
       AssignChildren(tree, node);
-    } else if (!HuffmanTreeNodeIsNotLeaf(node)) {
+    } else if (HuffmanTreeNodeIsLeaf(node)) {
       return 0;  // leaf is already occupied.
     }
     node += node->children_ + ((code >> code_length) & 1);
-    if (--step == 0) {
-      tree->lut_jump_[base_code] = (int16_t)(node - tree->root_);
-    }
   }
   if (NodeIsEmpty(node)) {
     node->children_ = 0;      // turn newly created node into a leaf.
-  } else if (HuffmanTreeNodeIsNotLeaf(node)) {
+  } else if (!HuffmanTreeNodeIsLeaf(node)) {
     return 0;   // trying to assign a symbol to already used code.
   }
   node->symbol_ = symbol;  // Add symbol in this node.
@@ -286,3 +233,6 @@ int HuffmanTreeBuildExplicit(HuffmanTree* const tree,
   return ok;
 }
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/utils/huffman.h b/drivers/webp/utils/huffman.h
index e8afd27f24..70220a67fb 100644
--- a/drivers/webp/utils/huffman.h
+++ b/drivers/webp/utils/huffman.h
@@ -1,10 +1,8 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Utilities for building and looking up Huffman trees.
@@ -17,7 +15,7 @@
 #include <assert.h>
 #include "../webp/types.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
@@ -28,24 +26,17 @@ typedef struct {
 } HuffmanTreeNode;
 
 // Huffman Tree.
-#define HUFF_LUT_BITS 7
-#define HUFF_LUT (1U << HUFF_LUT_BITS)
 typedef struct HuffmanTree HuffmanTree;
 struct HuffmanTree {
-  // Fast lookup for short bit lengths.
-  uint8_t lut_bits_[HUFF_LUT];
-  int16_t lut_symbol_[HUFF_LUT];
-  int16_t lut_jump_[HUFF_LUT];
-  // Complete tree for lookups.
   HuffmanTreeNode* root_;   // all the nodes, starting at root.
   int max_nodes_;           // max number of nodes
   int num_nodes_;           // number of currently occupied nodes
 };
 
-// Returns true if the given node is not a leaf of the Huffman tree.
-static WEBP_INLINE int HuffmanTreeNodeIsNotLeaf(
+// Returns true if the given node is a leaf of the Huffman tree.
+static WEBP_INLINE int HuffmanTreeNodeIsLeaf(
     const HuffmanTreeNode* const node) {
-  return node->children_;
+  return (node->children_ == 0);
 }
 
 // Go down one level. Most critical function. 'right_child' must be 0 or 1.
@@ -80,7 +71,7 @@ int HuffmanCodeLengthsToCodes(const int* const code_lengths,
                               int code_lengths_size, int* const huff_codes);
 
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/utils/huffman_encode.c b/drivers/webp/utils/huffman_encode.c
index 9c5986738d..8ccd291d22 100644
--- a/drivers/webp/utils/huffman_encode.c
+++ b/drivers/webp/utils/huffman_encode.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Author: Jyrki Alakuijala (jyrki@google.com)
@@ -27,7 +25,7 @@ static int ValuesShouldBeCollapsedToStrideAverage(int a, int b) {
 }
 
 // Change the population counts in a way that the consequent
-// Huffman tree compression, especially its RLE-part, give smaller output.
+// Huffman tree compression, especially its RLE-part, gives smaller output.
 static int OptimizeHuffmanForRle(int length, int* const counts) {
   uint8_t* good_for_rle;
   // 1) Let's make the Huffman code more compatible with rle encoding.
@@ -140,8 +138,13 @@ static int CompareHuffmanTrees(const void* ptr1, const void* ptr2) {
   } else if (t1->total_count_ < t2->total_count_) {
     return 1;
   } else {
-    assert(t1->value_ != t2->value_);
-    return (t1->value_ < t2->value_) ? -1 : 1;
+    if (t1->value_ < t2->value_) {
+      return -1;
+    }
+    if (t1->value_ > t2->value_) {
+      return 1;
+    }
+    return 0;
   }
 }
 
@@ -190,10 +193,6 @@ static int GenerateOptimalTree(const int* const histogram, int histogram_size,
     }
   }
 
-  if (tree_size_orig == 0) {   // pretty optimal already!
-    return 1;
-  }
-
   // 3 * tree_size is enough to cover all the nodes representing a
   // population and all the inserted nodes combining two existing nodes.
   // The tree pool needs 2 * (tree_size_orig - 1) entities, and the
@@ -235,7 +234,7 @@ static int GenerateOptimalTree(const int* const histogram, int histogram_size,
         tree_pool[tree_pool_size++] = tree[tree_size - 1];
         tree_pool[tree_pool_size++] = tree[tree_size - 2];
         count = tree_pool[tree_pool_size - 1].total_count_ +
-                tree_pool[tree_pool_size - 2].total_count_;
+            tree_pool[tree_pool_size - 2].total_count_;
         tree_size -= 2;
         {
           // Search for the insertion point.
diff --git a/drivers/webp/utils/huffman_encode.h b/drivers/webp/utils/huffman_encode.h
index ee51c68c9f..cc3b38d330 100644
--- a/drivers/webp/utils/huffman_encode.h
+++ b/drivers/webp/utils/huffman_encode.h
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Author: Jyrki Alakuijala (jyrki@google.com)
@@ -16,7 +14,7 @@
 
 #include "../webp/types.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
@@ -42,7 +40,7 @@ int VP8LCreateCompressedHuffmanTree(const HuffmanTreeCode* const tree,
 int VP8LCreateHuffmanTree(int* const histogram, int tree_depth_limit,
                           HuffmanTreeCode* const tree);
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }
 #endif
 
diff --git a/drivers/webp/utils/quant_levels.c b/drivers/webp/utils/quant_levels.c
index d7c8aab922..f6884392aa 100644
--- a/drivers/webp/utils/quant_levels.c
+++ b/drivers/webp/utils/quant_levels.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Quantize levels for specified number of quantization-levels ([2, 256]).
@@ -16,6 +14,10 @@
 
 #include "./quant_levels.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 #define NUM_SYMBOLS     256
 
 #define MAX_ITER  6             // Maximum number of convergence steps.
@@ -138,3 +140,15 @@ int QuantizeLevels(uint8_t* const data, int width, int height,
   return 1;
 }
 
+int DequantizeLevels(uint8_t* const data, int width, int height) {
+  if (data == NULL || width <= 0 || height <= 0) return 0;
+  // TODO(skal): implement gradient smoothing.
+  (void)data;
+  (void)width;
+  (void)height;
+  return 1;
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/utils/quant_levels.h b/drivers/webp/utils/quant_levels.h
index 1cb5a32cae..89ccafe40d 100644
--- a/drivers/webp/utils/quant_levels.h
+++ b/drivers/webp/utils/quant_levels.h
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Alpha plane quantization utility
@@ -18,7 +16,7 @@
 
 #include "../webp/types.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
@@ -29,7 +27,12 @@ extern "C" {
 int QuantizeLevels(uint8_t* const data, int width, int height, int num_levels,
                    uint64_t* const sse);
 
-#ifdef __cplusplus
+// Apply post-processing to input 'data' of size 'width'x'height' assuming
+// that the source was quantized to a reduced number of levels.
+// Returns false in case of error (data is NULL, invalid parameters, ...).
+int DequantizeLevels(uint8_t* const data, int width, int height);
+
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/utils/quant_levels_dec.c b/drivers/webp/utils/quant_levels_dec.c
deleted file mode 100644
index 8489705a2d..0000000000
--- a/drivers/webp/utils/quant_levels_dec.c
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright 2013 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// TODO(skal): implement gradient smoothing.
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#include "./quant_levels_dec.h"
-
-int DequantizeLevels(uint8_t* const data, int width, int height,
-                     int row, int num_rows) {
-  if (data == NULL || width <= 0 || height <= 0 || row < 0 || num_rows < 0 ||
-      row + num_rows > height) {
-    return 0;
-  }
-  return 1;
-}
-
diff --git a/drivers/webp/utils/quant_levels_dec.h b/drivers/webp/utils/quant_levels_dec.h
deleted file mode 100644
index 0288383aeb..0000000000
--- a/drivers/webp/utils/quant_levels_dec.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright 2013 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Alpha plane de-quantization utility
-//
-// Author:  Vikas Arora (vikasa@google.com)
-
-#ifndef WEBP_UTILS_QUANT_LEVELS_DEC_H_
-#define WEBP_UTILS_QUANT_LEVELS_DEC_H_
-
-#include "../webp/types.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Apply post-processing to input 'data' of size 'width'x'height' assuming that
-// the source was quantized to a reduced number of levels. The post-processing
-// will be applied to 'num_rows' rows of 'data' starting from 'row'.
-// Returns false in case of error (data is NULL, invalid parameters, ...).
-int DequantizeLevels(uint8_t* const data, int width, int height,
-                     int row, int num_rows);
-
-#ifdef __cplusplus
-}    // extern "C"
-#endif
-
-#endif  /* WEBP_UTILS_QUANT_LEVELS_DEC_H_ */
diff --git a/drivers/webp/utils/random.c b/drivers/webp/utils/random.c
deleted file mode 100644
index 24e96ad648..0000000000
--- a/drivers/webp/utils/random.c
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright 2013 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Pseudo-random utilities
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#include <string.h>
-#include "./random.h"
-
-//------------------------------------------------------------------------------
-
-// 31b-range values
-static const uint32_t kRandomTable[VP8_RANDOM_TABLE_SIZE] = {
-  0x0de15230, 0x03b31886, 0x775faccb, 0x1c88626a, 0x68385c55, 0x14b3b828,
-  0x4a85fef8, 0x49ddb84b, 0x64fcf397, 0x5c550289, 0x4a290000, 0x0d7ec1da,
-  0x5940b7ab, 0x5492577d, 0x4e19ca72, 0x38d38c69, 0x0c01ee65, 0x32a1755f,
-  0x5437f652, 0x5abb2c32, 0x0faa57b1, 0x73f533e7, 0x685feeda, 0x7563cce2,
-  0x6e990e83, 0x4730a7ed, 0x4fc0d9c6, 0x496b153c, 0x4f1403fa, 0x541afb0c,
-  0x73990b32, 0x26d7cb1c, 0x6fcc3706, 0x2cbb77d8, 0x75762f2a, 0x6425ccdd,
-  0x24b35461, 0x0a7d8715, 0x220414a8, 0x141ebf67, 0x56b41583, 0x73e502e3,
-  0x44cab16f, 0x28264d42, 0x73baaefb, 0x0a50ebed, 0x1d6ab6fb, 0x0d3ad40b,
-  0x35db3b68, 0x2b081e83, 0x77ce6b95, 0x5181e5f0, 0x78853bbc, 0x009f9494,
-  0x27e5ed3c
-};
-
-void VP8InitRandom(VP8Random* const rg, float dithering) {
-  memcpy(rg->tab_, kRandomTable, sizeof(rg->tab_));
-  rg->index1_ = 0;
-  rg->index2_ = 31;
-  rg->amp_ = (dithering < 0.0) ? 0
-           : (dithering > 1.0) ? (1 << VP8_RANDOM_DITHER_FIX)
-           : (uint32_t)((1 << VP8_RANDOM_DITHER_FIX) * dithering);
-}
-
-//------------------------------------------------------------------------------
-
diff --git a/drivers/webp/utils/random.h b/drivers/webp/utils/random.h
deleted file mode 100644
index 08a83e9674..0000000000
--- a/drivers/webp/utils/random.h
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright 2013 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Pseudo-random utilities
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#ifndef WEBP_UTILS_RANDOM_H_
-#define WEBP_UTILS_RANDOM_H_
-
-#include <assert.h>
-#include "../webp/types.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define VP8_RANDOM_DITHER_FIX 8   // fixed-point precision for dithering
-#define VP8_RANDOM_TABLE_SIZE 55
-
-typedef struct {
-  int index1_, index2_;
-  uint32_t tab_[VP8_RANDOM_TABLE_SIZE];
-  int amp_;
-} VP8Random;
-
-// Initializes random generator with an amplitude 'dithering' in range [0..1].
-void VP8InitRandom(VP8Random* const rg, float dithering);
-
-// Returns a centered pseudo-random number with 'num_bits' amplitude.
-// (uses D.Knuth's Difference-based random generator).
-// 'amp' is in VP8_RANDOM_DITHER_FIX fixed-point precision.
-static WEBP_INLINE int VP8RandomBits2(VP8Random* const rg, int num_bits,
-                                      int amp) {
-  int diff;
-  assert(num_bits + VP8_RANDOM_DITHER_FIX <= 31);
-  diff = rg->tab_[rg->index1_] - rg->tab_[rg->index2_];
-  if (diff < 0) diff += (1u << 31);
-  rg->tab_[rg->index1_] = diff;
-  if (++rg->index1_ == VP8_RANDOM_TABLE_SIZE) rg->index1_ = 0;
-  if (++rg->index2_ == VP8_RANDOM_TABLE_SIZE) rg->index2_ = 0;
-  diff = (diff << 1) >> (32 - num_bits);         // sign-extend, 0-center
-  diff = (diff * amp) >> VP8_RANDOM_DITHER_FIX;  // restrict range
-  diff += 1 << (num_bits - 1);                   // shift back to 0.5-center
-  return diff;
-}
-
-static WEBP_INLINE int VP8RandomBits(VP8Random* const rg, int num_bits) {
-  return VP8RandomBits2(rg, num_bits, rg->amp_);
-}
-
-#ifdef __cplusplus
-}    // extern "C"
-#endif
-
-#endif  /* WEBP_UTILS_RANDOM_H_ */
diff --git a/drivers/webp/utils/rescaler.c b/drivers/webp/utils/rescaler.c
index 7061246024..9825dcbc5f 100644
--- a/drivers/webp/utils/rescaler.c
+++ b/drivers/webp/utils/rescaler.c
@@ -1,10 +1,8 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Rescaling functions
@@ -17,8 +15,12 @@
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 #define RFIX 30
-#define MULT_FIX(x, y) (((int64_t)(x) * (y) + (1 << (RFIX - 1))) >> RFIX)
+#define MULT_FIX(x,y) (((int64_t)(x) * (y) + (1 << (RFIX - 1))) >> RFIX)
 
 void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height,
                       uint8_t* const dst, int dst_width, int dst_height,
@@ -119,11 +121,6 @@ uint8_t* WebPRescalerExportRow(WebPRescaler* const wrk) {
 //------------------------------------------------------------------------------
 // all-in-one calls
 
-int WebPRescaleNeededLines(const WebPRescaler* const wrk, int max_num_lines) {
-  const int num_lines = (wrk->y_accum + wrk->y_sub - 1) / wrk->y_sub;
-  return (num_lines > max_num_lines) ? max_num_lines : num_lines;
-}
-
 int WebPRescalerImport(WebPRescaler* const wrk, int num_lines,
                        const uint8_t* src, int src_stride) {
   int total_imported = 0;
@@ -150,3 +147,6 @@ int WebPRescalerExport(WebPRescaler* const rescaler) {
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/utils/rescaler.h b/drivers/webp/utils/rescaler.h
index 68e49cee55..ef93d465f0 100644
--- a/drivers/webp/utils/rescaler.h
+++ b/drivers/webp/utils/rescaler.h
@@ -1,10 +1,8 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Rescaling functions
@@ -14,7 +12,7 @@
 #ifndef WEBP_UTILS_RESCALER_H_
 #define WEBP_UTILS_RESCALER_H_
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
@@ -38,8 +36,7 @@ typedef struct {
 } WebPRescaler;
 
 // Initialize a rescaler given scratch area 'work' and dimensions of src & dst.
-void WebPRescalerInit(WebPRescaler* const rescaler,
-                      int src_width, int src_height,
+void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height,
                       uint8_t* const dst,
                       int dst_width, int dst_height, int dst_stride,
                       int num_channels,
@@ -47,11 +44,6 @@ void WebPRescalerInit(WebPRescaler* const rescaler,
                       int y_add, int y_sub,
                       int32_t* const work);
 
-// Returns the number of input lines needed next to produce one output line,
-// considering that the maximum available input lines are 'max_num_lines'.
-int WebPRescaleNeededLines(const WebPRescaler* const rescaler,
-                           int max_num_lines);
-
 // Import a row of data and save its contribution in the rescaler.
 // 'channel' denotes the channel number to be imported.
 void WebPRescalerImportRow(WebPRescaler* const rescaler,
@@ -70,14 +62,14 @@ int WebPRescalerHasPendingOutput(const WebPRescaler* const rescaler) {
 
 // Export one row from rescaler. Returns the pointer where output was written,
 // or NULL if no row was pending.
-uint8_t* WebPRescalerExportRow(WebPRescaler* const rescaler);
+uint8_t* WebPRescalerExportRow(WebPRescaler* const wrk);
 
 // Export as many rows as possible. Return the numbers of rows written.
-int WebPRescalerExport(WebPRescaler* const rescaler);
+int WebPRescalerExport(WebPRescaler* const wrk);
 
 //------------------------------------------------------------------------------
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/utils/thread.c b/drivers/webp/utils/thread.c
index a9e3fae8de..ce89cf9dc7 100644
--- a/drivers/webp/utils/thread.c
+++ b/drivers/webp/utils/thread.c
@@ -1,20 +1,26 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Multi-threaded worker
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
 #include <assert.h>
 #include <string.h>   // for memset()
 #include "./thread.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 #ifdef WEBP_USE_THREAD
 
 #if defined(_WIN32)
@@ -122,14 +128,14 @@ static int pthread_cond_wait(pthread_cond_t* const condition,
   return !ok;
 }
 
-#else  // !_WIN32
+#else  // !_WIN32
 # define THREADFN void*
 # define THREAD_RETURN(val) val
-#endif  // _WIN32
+#endif
 
 //------------------------------------------------------------------------------
 
-static THREADFN ThreadLoop(void* ptr) {
+static THREADFN WebPWorkerThreadLoop(void *ptr) {    // thread loop
   WebPWorker* const worker = (WebPWorker*)ptr;
   int done = 0;
   while (!done) {
@@ -138,7 +144,9 @@ static THREADFN ThreadLoop(void* ptr) {
       pthread_cond_wait(&worker->condition_, &worker->mutex_);
     }
     if (worker->status_ == WORK) {
-      WebPWorkerExecute(worker);
+      if (worker->hook) {
+        worker->had_error |= !worker->hook(worker->data1, worker->data2);
+      }
       worker->status_ = OK;
     } else if (worker->status_ == NOT_OK) {   // finish the worker
       done = 1;
@@ -151,8 +159,8 @@ static THREADFN ThreadLoop(void* ptr) {
 }
 
 // main thread state control
-static void ChangeState(WebPWorker* const worker,
-                        WebPWorkerStatus new_status) {
+static void WebPWorkerChangeState(WebPWorker* const worker,
+                                  WebPWorkerStatus new_status) {
   // no-op when attempting to change state on a thread that didn't come up
   if (worker->status_ < OK) return;
 
@@ -169,7 +177,7 @@ static void ChangeState(WebPWorker* const worker,
   pthread_mutex_unlock(&worker->mutex_);
 }
 
-#endif  // WEBP_USE_THREAD
+#endif
 
 //------------------------------------------------------------------------------
 
@@ -180,7 +188,7 @@ void WebPWorkerInit(WebPWorker* const worker) {
 
 int WebPWorkerSync(WebPWorker* const worker) {
 #ifdef WEBP_USE_THREAD
-  ChangeState(worker, OK);
+  WebPWorkerChangeState(worker, OK);
 #endif
   assert(worker->status_ <= OK);
   return !worker->had_error;
@@ -196,7 +204,7 @@ int WebPWorkerReset(WebPWorker* const worker) {
       return 0;
     }
     pthread_mutex_lock(&worker->mutex_);
-    ok = !pthread_create(&worker->thread_, NULL, ThreadLoop, worker);
+    ok = !pthread_create(&worker->thread_, NULL, WebPWorkerThreadLoop, worker);
     if (ok) worker->status_ = OK;
     pthread_mutex_unlock(&worker->mutex_);
 #else
@@ -209,24 +217,19 @@ int WebPWorkerReset(WebPWorker* const worker) {
   return ok;
 }
 
-void WebPWorkerExecute(WebPWorker* const worker) {
-  if (worker->hook != NULL) {
-    worker->had_error |= !worker->hook(worker->data1, worker->data2);
-  }
-}
-
 void WebPWorkerLaunch(WebPWorker* const worker) {
 #ifdef WEBP_USE_THREAD
-  ChangeState(worker, WORK);
+  WebPWorkerChangeState(worker, WORK);
 #else
-  WebPWorkerExecute(worker);
+  if (worker->hook)
+    worker->had_error |= !worker->hook(worker->data1, worker->data2);
 #endif
 }
 
 void WebPWorkerEnd(WebPWorker* const worker) {
   if (worker->status_ >= OK) {
 #ifdef WEBP_USE_THREAD
-    ChangeState(worker, NOT_OK);
+    WebPWorkerChangeState(worker, NOT_OK);
     pthread_join(worker->thread_, NULL);
     pthread_mutex_destroy(&worker->mutex_);
     pthread_cond_destroy(&worker->condition_);
@@ -239,3 +242,6 @@ void WebPWorkerEnd(WebPWorker* const worker) {
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/utils/thread.h b/drivers/webp/utils/thread.h
index aef33bdaf2..3191890b76 100644
--- a/drivers/webp/utils/thread.h
+++ b/drivers/webp/utils/thread.h
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Multi-threaded worker
@@ -14,15 +12,11 @@
 #ifndef WEBP_UTILS_THREAD_H_
 #define WEBP_UTILS_THREAD_H_
 
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-#ifdef WEBP_USE_THREAD
+#if WEBP_USE_THREAD
 
 #if defined(_WIN32)
 
@@ -55,7 +49,7 @@ typedef int (*WebPWorkerHook)(void*, void*);
 
 // Synchronize object used to launch job in the worker thread
 typedef struct {
-#ifdef WEBP_USE_THREAD
+#if WEBP_USE_THREAD
   pthread_mutex_t mutex_;
   pthread_cond_t  condition_;
   pthread_t       thread_;
@@ -69,28 +63,23 @@ typedef struct {
 
 // Must be called first, before any other method.
 void WebPWorkerInit(WebPWorker* const worker);
-// Must be called to initialize the object and spawn the thread. Re-entrant.
+// Must be called to initialize the object and spawn the thread. Re-entrant.
 // Will potentially launch the thread. Returns false in case of error.
 int WebPWorkerReset(WebPWorker* const worker);
-// Makes sure the previous work is finished. Returns true if worker->had_error
-// was not set and no error condition was triggered by the working thread.
+// Make sure the previous work is finished. Returns true if worker->had_error
+// was not set and no error condition was triggered by the working thread.
 int WebPWorkerSync(WebPWorker* const worker);
-// Triggers the thread to call hook() with data1 and data2 argument. These
+// Trigger the thread to call hook() with data1 and data2 arguments. These
 // hook/data1/data2 can be changed at any time before calling this function,
 // but not be changed afterward until the next call to WebPWorkerSync().
 void WebPWorkerLaunch(WebPWorker* const worker);
-// This function is similar to WebPWorkerLaunch() except that it calls the
-// hook directly instead of using a thread. Convenient to bypass the thread
-// mechanism while still using the WebPWorker structs. WebPWorkerSync() must
-// still be called afterward (for error reporting).
-void WebPWorkerExecute(WebPWorker* const worker);
 // Kill the thread and terminate the object. To use the object again, one
 // must call WebPWorkerReset() again.
 void WebPWorkerEnd(WebPWorker* const worker);
 
 //------------------------------------------------------------------------------
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/utils/utils.c b/drivers/webp/utils/utils.c
index 5592538988..673b7e284c 100644
--- a/drivers/webp/utils/utils.c
+++ b/drivers/webp/utils/utils.c
@@ -1,10 +1,8 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Misc. common utility functions
@@ -14,11 +12,14 @@
 #include <stdlib.h>
 #include "./utils.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // Checked memory allocation
 
-// Returns 0 in case of overflow of nmemb * size.
-static int CheckSizeArgumentsOverflow(uint64_t nmemb, size_t size) {
+static int CheckSizeArguments(uint64_t nmemb, size_t size) {
   const uint64_t total_size = nmemb * size;
   if (nmemb == 0) return 1;
   if ((uint64_t)size > WEBP_MAX_ALLOCABLE_MEMORY / nmemb) return 0;
@@ -27,16 +28,17 @@ static int CheckSizeArgumentsOverflow(uint64_t nmemb, size_t size) {
 }
 
 void* WebPSafeMalloc(uint64_t nmemb, size_t size) {
-  if (!CheckSizeArgumentsOverflow(nmemb, size)) return NULL;
-  assert(nmemb * size > 0);
+  if (!CheckSizeArguments(nmemb, size)) return NULL;
   return malloc((size_t)(nmemb * size));
 }
 
 void* WebPSafeCalloc(uint64_t nmemb, size_t size) {
-  if (!CheckSizeArgumentsOverflow(nmemb, size)) return NULL;
-  assert(nmemb * size > 0);
+  if (!CheckSizeArguments(nmemb, size)) return NULL;
   return calloc((size_t)nmemb, size);
 }
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/utils/utils.h b/drivers/webp/utils/utils.h
index 8bdf0f03db..a034762556 100644
--- a/drivers/webp/utils/utils.h
+++ b/drivers/webp/utils/utils.h
@@ -1,25 +1,20 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Misc. common utility functions
 //
-// Authors: Skal (pascal.massimino@gmail.com)
-//          Urvang (urvang@google.com)
+// Author: Skal (pascal.massimino@gmail.com)
 
 #ifndef WEBP_UTILS_UTILS_H_
 #define WEBP_UTILS_UTILS_H_
 
-#include <assert.h>
-
 #include "../webp/types.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
@@ -41,42 +36,8 @@ void* WebPSafeMalloc(uint64_t nmemb, size_t size);
 void* WebPSafeCalloc(uint64_t nmemb, size_t size);
 
 //------------------------------------------------------------------------------
-// Reading/writing data.
-
-// Read 16, 24 or 32 bits stored in little-endian order.
-static WEBP_INLINE int GetLE16(const uint8_t* const data) {
-  return (int)(data[0] << 0) | (data[1] << 8);
-}
-
-static WEBP_INLINE int GetLE24(const uint8_t* const data) {
-  return GetLE16(data) | (data[2] << 16);
-}
-
-static WEBP_INLINE uint32_t GetLE32(const uint8_t* const data) {
-  return (uint32_t)GetLE16(data) | (GetLE16(data + 2) << 16);
-}
-
-// Store 16, 24 or 32 bits in little-endian order.
-static WEBP_INLINE void PutLE16(uint8_t* const data, int val) {
-  assert(val < (1 << 16));
-  data[0] = (val >> 0);
-  data[1] = (val >> 8);
-}
-
-static WEBP_INLINE void PutLE24(uint8_t* const data, int val) {
-  assert(val < (1 << 24));
-  PutLE16(data, val & 0xffff);
-  data[2] = (val >> 16);
-}
-
-static WEBP_INLINE void PutLE32(uint8_t* const data, uint32_t val) {
-  PutLE16(data, (int)(val & 0xffff));
-  PutLE16(data + 2, (int)(val >> 16));
-}
-
-//------------------------------------------------------------------------------
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
diff --git a/logo.png b/logo.png
deleted file mode 100644
index affc21cf84..0000000000
--- a/logo.png
+++ /dev/null