diff options
Diffstat (limited to 'drivers/webp/dec')
-rw-r--r-- | drivers/webp/dec/alpha.c | 205 | ||||
-rw-r--r-- | drivers/webp/dec/buffer.c | 99 | ||||
-rw-r--r-- | drivers/webp/dec/decode_vp8.h | 17 | ||||
-rw-r--r-- | drivers/webp/dec/frame.c | 665 | ||||
-rw-r--r-- | drivers/webp/dec/idec.c | 307 | ||||
-rw-r--r-- | drivers/webp/dec/io.c | 269 | ||||
-rw-r--r-- | drivers/webp/dec/layer.c | 35 | ||||
-rw-r--r-- | drivers/webp/dec/quant.c | 17 | ||||
-rw-r--r-- | drivers/webp/dec/tree.c | 182 | ||||
-rw-r--r-- | drivers/webp/dec/vp8.c | 493 | ||||
-rw-r--r-- | drivers/webp/dec/vp8i.h | 218 | ||||
-rw-r--r-- | drivers/webp/dec/vp8l.c | 1095 | ||||
-rw-r--r-- | drivers/webp/dec/vp8li.h | 53 | ||||
-rw-r--r-- | drivers/webp/dec/webp.c | 220 | ||||
-rw-r--r-- | drivers/webp/dec/webpi.h | 35 |
15 files changed, 2224 insertions, 1686 deletions
diff --git a/drivers/webp/dec/alpha.c b/drivers/webp/dec/alpha.c index d1095fa555..52216fc4d6 100644 --- a/drivers/webp/dec/alpha.c +++ b/drivers/webp/dec/alpha.c @@ -1,8 +1,10 @@ // Copyright 2011 Google Inc. All Rights Reserved. // -// This code is licensed under the same terms as WebM: -// Software License Agreement: http://www.webmproject.org/license/software/ -// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. // ----------------------------------------------------------------------------- // // Alpha-plane decompression. @@ -10,131 +12,156 @@ // Author: Skal (pascal.massimino@gmail.com) #include <stdlib.h> +#include "./alphai.h" #include "./vp8i.h" #include "./vp8li.h" -#include "../utils/filters.h" -#include "../utils/quant_levels.h" -#include "../format_constants.h" - -#if defined(__cplusplus) || defined(c_plusplus) -extern "C" { -#endif - -// TODO(skal): move to dsp/ ? -static void CopyPlane(const uint8_t* src, int src_stride, - uint8_t* dst, int dst_stride, int width, int height) { - while (height-- > 0) { - memcpy(dst, src, width); - src += src_stride; - dst += dst_stride; +#include "../dsp/dsp.h" +#include "../utils/quant_levels_dec.h" +#include "../utils/utils.h" +#include "../webp/format_constants.h" + +//------------------------------------------------------------------------------ +// ALPHDecoder object. + +ALPHDecoder* ALPHNew(void) { + ALPHDecoder* const dec = (ALPHDecoder*)WebPSafeCalloc(1ULL, sizeof(*dec)); + return dec; +} + +void ALPHDelete(ALPHDecoder* const dec) { + if (dec != NULL) { + VP8LDelete(dec->vp8l_dec_); + dec->vp8l_dec_ = NULL; + WebPSafeFree(dec); } } //------------------------------------------------------------------------------ -// Decodes the compressed data 'data' of size 'data_size' into the 'output'. -// The 'output' buffer should be pre-allocated and must be of the same -// dimension 'height'x'stride', as that of the image. -// -// Returns 1 on successfully decoding the compressed alpha and -// 0 if either: -// error in bit-stream header (invalid compression mode or filter), or -// error returned by appropriate compression method. - -static int DecodeAlpha(const uint8_t* data, size_t data_size, - int width, int height, int stride, uint8_t* output) { - uint8_t* decoded_data = NULL; - const size_t decoded_size = height * width; - uint8_t* unfiltered_data = NULL; - WEBP_FILTER_TYPE filter; - int pre_processing; - int rsrv; +// Decoding. + +// Initialize alpha decoding by parsing the alpha header and decoding the image +// header for alpha data stored using lossless compression. +// Returns false in case of error in alpha header (data too short, invalid +// compression method or filter, error in lossless header data etc). 
+static int ALPHInit(ALPHDecoder* const dec, const uint8_t* data, + size_t data_size, int width, int height, uint8_t* output) { int ok = 0; - int method; + const uint8_t* const alpha_data = data + ALPHA_HEADER_LEN; + const size_t alpha_data_size = data_size - ALPHA_HEADER_LEN; + int rsrv; - assert(width > 0 && height > 0 && stride >= width); + assert(width > 0 && height > 0); assert(data != NULL && output != NULL); + dec->width_ = width; + dec->height_ = height; + if (data_size <= ALPHA_HEADER_LEN) { return 0; } - method = (data[0] >> 0) & 0x03; - filter = (data[0] >> 2) & 0x03; - pre_processing = (data[0] >> 4) & 0x03; + dec->method_ = (data[0] >> 0) & 0x03; + dec->filter_ = (data[0] >> 2) & 0x03; + dec->pre_processing_ = (data[0] >> 4) & 0x03; rsrv = (data[0] >> 6) & 0x03; - if (method < ALPHA_NO_COMPRESSION || - method > ALPHA_LOSSLESS_COMPRESSION || - filter >= WEBP_FILTER_LAST || - pre_processing > ALPHA_PREPROCESSED_LEVELS || + if (dec->method_ < ALPHA_NO_COMPRESSION || + dec->method_ > ALPHA_LOSSLESS_COMPRESSION || + dec->filter_ >= WEBP_FILTER_LAST || + dec->pre_processing_ > ALPHA_PREPROCESSED_LEVELS || rsrv != 0) { return 0; } - if (method == ALPHA_NO_COMPRESSION) { - ok = (data_size >= decoded_size); - decoded_data = (uint8_t*)data + ALPHA_HEADER_LEN; + if (dec->method_ == ALPHA_NO_COMPRESSION) { + const size_t alpha_decoded_size = dec->width_ * dec->height_; + ok = (alpha_data_size >= alpha_decoded_size); } else { - decoded_data = (uint8_t*)malloc(decoded_size); - if (decoded_data == NULL) return 0; - ok = VP8LDecodeAlphaImageStream(width, height, - data + ALPHA_HEADER_LEN, - data_size - ALPHA_HEADER_LEN, - decoded_data); + assert(dec->method_ == ALPHA_LOSSLESS_COMPRESSION); + ok = VP8LDecodeAlphaHeader(dec, alpha_data, alpha_data_size, output); } + VP8FiltersInit(); + return ok; +} - if (ok) { - WebPFilterFunc unfilter_func = WebPUnfilters[filter]; - if (unfilter_func != NULL) { - unfiltered_data = (uint8_t*)malloc(decoded_size); - if (unfiltered_data == NULL) { - ok = 0; - goto Error; - } - // TODO(vikas): Implement on-the-fly decoding & filter mechanism to decode - // and apply filter per image-row. - unfilter_func(decoded_data, width, height, 1, width, unfiltered_data); - // Construct raw_data (height x stride) from alpha data (height x width). - CopyPlane(unfiltered_data, width, output, stride, width, height); - free(unfiltered_data); - } else { - // Construct raw_data (height x stride) from alpha data (height x width). - CopyPlane(decoded_data, width, output, stride, width, height); - } - if (pre_processing == ALPHA_PREPROCESSED_LEVELS) { - ok = DequantizeLevels(decoded_data, width, height); +// Decodes, unfilters and dequantizes *at least* 'num_rows' rows of alpha +// starting from row number 'row'. It assumes that rows up to (row - 1) have +// already been decoded. +// Returns false in case of bitstream error. 
+static int ALPHDecode(VP8Decoder* const dec, int row, int num_rows) { + ALPHDecoder* const alph_dec = dec->alph_dec_; + const int width = alph_dec->width_; + const int height = alph_dec->height_; + WebPUnfilterFunc unfilter_func = WebPUnfilters[alph_dec->filter_]; + uint8_t* const output = dec->alpha_plane_; + if (alph_dec->method_ == ALPHA_NO_COMPRESSION) { + const size_t offset = row * width; + const size_t num_pixels = num_rows * width; + assert(dec->alpha_data_size_ >= ALPHA_HEADER_LEN + offset + num_pixels); + memcpy(dec->alpha_plane_ + offset, + dec->alpha_data_ + ALPHA_HEADER_LEN + offset, num_pixels); + } else { // alph_dec->method_ == ALPHA_LOSSLESS_COMPRESSION + assert(alph_dec->vp8l_dec_ != NULL); + if (!VP8LDecodeAlphaImageStream(alph_dec, row + num_rows)) { + return 0; } } - Error: - if (method != ALPHA_NO_COMPRESSION) { - free(decoded_data); + if (unfilter_func != NULL) { + unfilter_func(width, height, width, row, num_rows, output); } - return ok; + + if (row + num_rows == dec->pic_hdr_.height_) { + dec->is_alpha_decoded_ = 1; + } + return 1; } //------------------------------------------------------------------------------ +// Main entry point. const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec, int row, int num_rows) { - const int stride = dec->pic_hdr_.width_; + const int width = dec->pic_hdr_.width_; + const int height = dec->pic_hdr_.height_; - if (row < 0 || num_rows < 0 || row + num_rows > dec->pic_hdr_.height_) { + if (row < 0 || num_rows <= 0 || row + num_rows > height) { return NULL; // sanity check. } if (row == 0) { - // Decode everything during the first call. - if (!DecodeAlpha(dec->alpha_data_, (size_t)dec->alpha_data_size_, - dec->pic_hdr_.width_, dec->pic_hdr_.height_, stride, - dec->alpha_plane_)) { - return NULL; // Error. + // Initialize decoding. + assert(dec->alpha_plane_ != NULL); + dec->alph_dec_ = ALPHNew(); + if (dec->alph_dec_ == NULL) return NULL; + if (!ALPHInit(dec->alph_dec_, dec->alpha_data_, dec->alpha_data_size_, + width, height, dec->alpha_plane_)) { + ALPHDelete(dec->alph_dec_); + dec->alph_dec_ = NULL; + return NULL; + } + // if we allowed use of alpha dithering, check whether it's needed at all + if (dec->alph_dec_->pre_processing_ != ALPHA_PREPROCESSED_LEVELS) { + dec->alpha_dithering_ = 0; // disable dithering + } else { + num_rows = height; // decode everything in one pass } } + if (!dec->is_alpha_decoded_) { + int ok = 0; + assert(dec->alph_dec_ != NULL); + ok = ALPHDecode(dec, row, num_rows); + if (ok && dec->alpha_dithering_ > 0) { + ok = WebPDequantizeLevels(dec->alpha_plane_, width, height, + dec->alpha_dithering_); + } + if (!ok || dec->is_alpha_decoded_) { + ALPHDelete(dec->alph_dec_); + dec->alph_dec_ = NULL; + } + if (!ok) return NULL; // Error. + } + // Return a pointer to the current decoded row. - return dec->alpha_plane_ + row * stride; + return dec->alpha_plane_ + row * width; } - -#if defined(__cplusplus) || defined(c_plusplus) -} // extern "C" -#endif diff --git a/drivers/webp/dec/buffer.c b/drivers/webp/dec/buffer.c index c159f6f248..9ed2b3fe1a 100644 --- a/drivers/webp/dec/buffer.c +++ b/drivers/webp/dec/buffer.c @@ -1,8 +1,10 @@ // Copyright 2011 Google Inc. All Rights Reserved. 
// -// This code is licensed under the same terms as WebM: -// Software License Agreement: http://www.webmproject.org/license/software/ -// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. // ----------------------------------------------------------------------------- // // Everything about WebPDecBuffer @@ -15,10 +17,6 @@ #include "./webpi.h" #include "../utils/utils.h" -#if defined(__cplusplus) || defined(c_plusplus) -extern "C" { -#endif - //------------------------------------------------------------------------------ // WebPDecBuffer @@ -35,6 +33,11 @@ static int IsValidColorspace(int webp_csp_mode) { return (webp_csp_mode >= MODE_RGB && webp_csp_mode < MODE_LAST); } +// strictly speaking, the very last (or first, if flipped) row +// doesn't require padding. +#define MIN_BUFFER_SIZE(WIDTH, HEIGHT, STRIDE) \ + (uint64_t)(STRIDE) * ((HEIGHT) - 1) + (WIDTH) + static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) { int ok = 1; const WEBP_CSP_MODE mode = buffer->colorspace; @@ -44,33 +47,41 @@ static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) { ok = 0; } else if (!WebPIsRGBMode(mode)) { // YUV checks const WebPYUVABuffer* const buf = &buffer->u.YUVA; - const uint64_t y_size = (uint64_t)buf->y_stride * height; - const uint64_t u_size = (uint64_t)buf->u_stride * ((height + 1) / 2); - const uint64_t v_size = (uint64_t)buf->v_stride * ((height + 1) / 2); - const uint64_t a_size = (uint64_t)buf->a_stride * height; + const int uv_width = (width + 1) / 2; + const int uv_height = (height + 1) / 2; + const int y_stride = abs(buf->y_stride); + const int u_stride = abs(buf->u_stride); + const int v_stride = abs(buf->v_stride); + const int a_stride = abs(buf->a_stride); + const uint64_t y_size = MIN_BUFFER_SIZE(width, height, y_stride); + const uint64_t u_size = MIN_BUFFER_SIZE(uv_width, uv_height, u_stride); + const uint64_t v_size = MIN_BUFFER_SIZE(uv_width, uv_height, v_stride); + const uint64_t a_size = MIN_BUFFER_SIZE(width, height, a_stride); ok &= (y_size <= buf->y_size); ok &= (u_size <= buf->u_size); ok &= (v_size <= buf->v_size); - ok &= (buf->y_stride >= width); - ok &= (buf->u_stride >= (width + 1) / 2); - ok &= (buf->v_stride >= (width + 1) / 2); + ok &= (y_stride >= width); + ok &= (u_stride >= uv_width); + ok &= (v_stride >= uv_width); ok &= (buf->y != NULL); ok &= (buf->u != NULL); ok &= (buf->v != NULL); if (mode == MODE_YUVA) { - ok &= (buf->a_stride >= width); + ok &= (a_stride >= width); ok &= (a_size <= buf->a_size); ok &= (buf->a != NULL); } } else { // RGB checks const WebPRGBABuffer* const buf = &buffer->u.RGBA; - const uint64_t size = (uint64_t)buf->stride * height; + const int stride = abs(buf->stride); + const uint64_t size = MIN_BUFFER_SIZE(width, height, stride); ok &= (size <= buf->size); - ok &= (buf->stride >= width * kModeBpp[mode]); + ok &= (stride >= width * kModeBpp[mode]); ok &= (buf->rgba != NULL); } return ok ? 
VP8_STATUS_OK : VP8_STATUS_INVALID_PARAM; } +#undef MIN_BUFFER_SIZE static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) { const int w = buffer->width; @@ -133,9 +144,35 @@ static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) { return CheckDecBuffer(buffer); } +VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer) { + if (buffer == NULL) { + return VP8_STATUS_INVALID_PARAM; + } + if (WebPIsRGBMode(buffer->colorspace)) { + WebPRGBABuffer* const buf = &buffer->u.RGBA; + buf->rgba += (buffer->height - 1) * buf->stride; + buf->stride = -buf->stride; + } else { + WebPYUVABuffer* const buf = &buffer->u.YUVA; + const int H = buffer->height; + buf->y += (H - 1) * buf->y_stride; + buf->y_stride = -buf->y_stride; + buf->u += ((H - 1) >> 1) * buf->u_stride; + buf->u_stride = -buf->u_stride; + buf->v += ((H - 1) >> 1) * buf->v_stride; + buf->v_stride = -buf->v_stride; + if (buf->a != NULL) { + buf->a += (H - 1) * buf->a_stride; + buf->a_stride = -buf->a_stride; + } + } + return VP8_STATUS_OK; +} + VP8StatusCode WebPAllocateDecBuffer(int w, int h, const WebPDecoderOptions* const options, WebPDecBuffer* const out) { + VP8StatusCode status; if (out == NULL || w <= 0 || h <= 0) { return VP8_STATUS_INVALID_PARAM; } @@ -152,18 +189,28 @@ VP8StatusCode WebPAllocateDecBuffer(int w, int h, h = ch; } if (options->use_scaling) { - if (options->scaled_width <= 0 || options->scaled_height <= 0) { + int scaled_width = options->scaled_width; + int scaled_height = options->scaled_height; + if (!WebPRescalerGetScaledDimensions( + w, h, &scaled_width, &scaled_height)) { return VP8_STATUS_INVALID_PARAM; } - w = options->scaled_width; - h = options->scaled_height; + w = scaled_width; + h = scaled_height; } } out->width = w; out->height = h; - // Then, allocate buffer for real - return AllocateBuffer(out); + // Then, allocate buffer for real. + status = AllocateBuffer(out); + if (status != VP8_STATUS_OK) return status; + + // Use the stride trick if vertical flip is needed. + if (options != NULL && options->flip) { + status = WebPFlipBuffer(out); + } + return status; } //------------------------------------------------------------------------------ @@ -180,8 +227,9 @@ int WebPInitDecBufferInternal(WebPDecBuffer* buffer, int version) { void WebPFreeDecBuffer(WebPDecBuffer* buffer) { if (buffer != NULL) { - if (!buffer->is_external_memory) - free(buffer->private_memory); + if (!buffer->is_external_memory) { + WebPSafeFree(buffer->private_memory); + } buffer->private_memory = NULL; } } @@ -210,6 +258,3 @@ void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst) { //------------------------------------------------------------------------------ -#if defined(__cplusplus) || defined(c_plusplus) -} // extern "C" -#endif diff --git a/drivers/webp/dec/decode_vp8.h b/drivers/webp/dec/decode_vp8.h index c26a9fc891..b9337bbec0 100644 --- a/drivers/webp/dec/decode_vp8.h +++ b/drivers/webp/dec/decode_vp8.h @@ -1,8 +1,10 @@ // Copyright 2010 Google Inc. All Rights Reserved. // -// This code is licensed under the same terms as WebM: -// Software License Agreement: http://www.webmproject.org/license/software/ -// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. 
All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. // ----------------------------------------------------------------------------- // // Low-level API for VP8 decoder @@ -12,9 +14,9 @@ #ifndef WEBP_WEBP_DECODE_VP8_H_ #define WEBP_WEBP_DECODE_VP8_H_ -#include "../decode.h" +#include "../webp/decode.h" -#if defined(__cplusplus) || defined(c_plusplus) +#ifdef __cplusplus extern "C" { #endif @@ -130,7 +132,8 @@ static WEBP_INLINE int VP8InitIo(VP8Io* const io) { return VP8InitIoInternal(io, WEBP_DECODER_ABI_VERSION); } -// Start decoding a new picture. Returns true if ok. +// Decode the VP8 frame header. Returns true if ok. +// Note: 'io->data' must be pointing to the start of the VP8 frame header. int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io); // Decode a picture. Will call VP8GetHeaders() if it wasn't done already. @@ -175,7 +178,7 @@ WEBP_EXTERN(int) VP8LGetInfo( const uint8_t* data, size_t data_size, // data available so far int* const width, int* const height, int* const has_alpha); -#if defined(__cplusplus) || defined(c_plusplus) +#ifdef __cplusplus } // extern "C" #endif diff --git a/drivers/webp/dec/frame.c b/drivers/webp/dec/frame.c index 9c91a48e17..b882133eab 100644 --- a/drivers/webp/dec/frame.c +++ b/drivers/webp/dec/frame.c @@ -1,8 +1,10 @@ // Copyright 2010 Google Inc. All Rights Reserved. // -// This code is licensed under the same terms as WebM: -// Software License Agreement: http://www.webmproject.org/license/software/ -// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. // ----------------------------------------------------------------------------- // // Frame-reconstruction function. Memory allocation. @@ -13,11 +15,180 @@ #include "./vp8i.h" #include "../utils/utils.h" -#if defined(__cplusplus) || defined(c_plusplus) -extern "C" { -#endif +//------------------------------------------------------------------------------ +// Main reconstruction function. + +static const int kScan[16] = { + 0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS, + 0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS, + 0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS, + 0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS +}; + +static int CheckMode(int mb_x, int mb_y, int mode) { + if (mode == B_DC_PRED) { + if (mb_x == 0) { + return (mb_y == 0) ? B_DC_PRED_NOTOPLEFT : B_DC_PRED_NOLEFT; + } else { + return (mb_y == 0) ? B_DC_PRED_NOTOP : B_DC_PRED; + } + } + return mode; +} -#define ALIGN_MASK (32 - 1) +static void Copy32b(uint8_t* const dst, const uint8_t* const src) { + memcpy(dst, src, 4); +} + +static WEBP_INLINE void DoTransform(uint32_t bits, const int16_t* const src, + uint8_t* const dst) { + switch (bits >> 30) { + case 3: + VP8Transform(src, dst, 0); + break; + case 2: + VP8TransformAC3(src, dst); + break; + case 1: + VP8TransformDC(src, dst); + break; + default: + break; + } +} + +static void DoUVTransform(uint32_t bits, const int16_t* const src, + uint8_t* const dst) { + if (bits & 0xff) { // any non-zero coeff at all? + if (bits & 0xaa) { // any non-zero AC coefficient? 
+ VP8TransformUV(src, dst); // note we don't use the AC3 variant for U/V + } else { + VP8TransformDCUV(src, dst); + } + } +} + +static void ReconstructRow(const VP8Decoder* const dec, + const VP8ThreadContext* ctx) { + int j; + int mb_x; + const int mb_y = ctx->mb_y_; + const int cache_id = ctx->id_; + uint8_t* const y_dst = dec->yuv_b_ + Y_OFF; + uint8_t* const u_dst = dec->yuv_b_ + U_OFF; + uint8_t* const v_dst = dec->yuv_b_ + V_OFF; + + // Initialize left-most block. + for (j = 0; j < 16; ++j) { + y_dst[j * BPS - 1] = 129; + } + for (j = 0; j < 8; ++j) { + u_dst[j * BPS - 1] = 129; + v_dst[j * BPS - 1] = 129; + } + + // Init top-left sample on left column too. + if (mb_y > 0) { + y_dst[-1 - BPS] = u_dst[-1 - BPS] = v_dst[-1 - BPS] = 129; + } else { + // we only need to do this init once at block (0,0). + // Afterward, it remains valid for the whole topmost row. + memset(y_dst - BPS - 1, 127, 16 + 4 + 1); + memset(u_dst - BPS - 1, 127, 8 + 1); + memset(v_dst - BPS - 1, 127, 8 + 1); + } + + // Reconstruct one row. + for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) { + const VP8MBData* const block = ctx->mb_data_ + mb_x; + + // Rotate in the left samples from previously decoded block. We move four + // pixels at a time for alignment reason, and because of in-loop filter. + if (mb_x > 0) { + for (j = -1; j < 16; ++j) { + Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]); + } + for (j = -1; j < 8; ++j) { + Copy32b(&u_dst[j * BPS - 4], &u_dst[j * BPS + 4]); + Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]); + } + } + { + // bring top samples into the cache + VP8TopSamples* const top_yuv = dec->yuv_t_ + mb_x; + const int16_t* const coeffs = block->coeffs_; + uint32_t bits = block->non_zero_y_; + int n; + + if (mb_y > 0) { + memcpy(y_dst - BPS, top_yuv[0].y, 16); + memcpy(u_dst - BPS, top_yuv[0].u, 8); + memcpy(v_dst - BPS, top_yuv[0].v, 8); + } + + // predict and add residuals + if (block->is_i4x4_) { // 4x4 + uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16); + + if (mb_y > 0) { + if (mb_x >= dec->mb_w_ - 1) { // on rightmost border + memset(top_right, top_yuv[0].y[15], sizeof(*top_right)); + } else { + memcpy(top_right, top_yuv[1].y, sizeof(*top_right)); + } + } + // replicate the top-right pixels below + top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0]; + + // predict and add residuals for all 4x4 blocks in turn. + for (n = 0; n < 16; ++n, bits <<= 2) { + uint8_t* const dst = y_dst + kScan[n]; + VP8PredLuma4[block->imodes_[n]](dst); + DoTransform(bits, coeffs + n * 16, dst); + } + } else { // 16x16 + const int pred_func = CheckMode(mb_x, mb_y, block->imodes_[0]); + VP8PredLuma16[pred_func](y_dst); + if (bits != 0) { + for (n = 0; n < 16; ++n, bits <<= 2) { + DoTransform(bits, coeffs + n * 16, y_dst + kScan[n]); + } + } + } + { + // Chroma + const uint32_t bits_uv = block->non_zero_uv_; + const int pred_func = CheckMode(mb_x, mb_y, block->uvmode_); + VP8PredChroma8[pred_func](u_dst); + VP8PredChroma8[pred_func](v_dst); + DoUVTransform(bits_uv >> 0, coeffs + 16 * 16, u_dst); + DoUVTransform(bits_uv >> 8, coeffs + 20 * 16, v_dst); + } + + // stash away top samples for next block + if (mb_y < dec->mb_h_ - 1) { + memcpy(top_yuv[0].y, y_dst + 15 * BPS, 16); + memcpy(top_yuv[0].u, u_dst + 7 * BPS, 8); + memcpy(top_yuv[0].v, v_dst + 7 * BPS, 8); + } + } + // Transfer reconstructed samples from yuv_b_ cache to final destination. 
+ { + const int y_offset = cache_id * 16 * dec->cache_y_stride_; + const int uv_offset = cache_id * 8 * dec->cache_uv_stride_; + uint8_t* const y_out = dec->cache_y_ + mb_x * 16 + y_offset; + uint8_t* const u_out = dec->cache_u_ + mb_x * 8 + uv_offset; + uint8_t* const v_out = dec->cache_v_ + mb_x * 8 + uv_offset; + for (j = 0; j < 16; ++j) { + memcpy(y_out + j * dec->cache_y_stride_, y_dst + j * BPS, 16); + } + for (j = 0; j < 8; ++j) { + memcpy(u_out + j * dec->cache_uv_stride_, u_dst + j * BPS, 8); + memcpy(v_out + j * dec->cache_uv_stride_, v_dst + j * BPS, 8); + } + } + } +} //------------------------------------------------------------------------------ // Filtering @@ -29,25 +200,18 @@ extern "C" { // U/V, so it's 8 samples total (because of the 2x upsampling). static const uint8_t kFilterExtraRows[3] = { 0, 2, 8 }; -static WEBP_INLINE int hev_thresh_from_level(int level, int keyframe) { - if (keyframe) { - return (level >= 40) ? 2 : (level >= 15) ? 1 : 0; - } else { - return (level >= 40) ? 3 : (level >= 20) ? 2 : (level >= 15) ? 1 : 0; - } -} - static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) { const VP8ThreadContext* const ctx = &dec->thread_ctx_; + const int cache_id = ctx->id_; const int y_bps = dec->cache_y_stride_; - VP8FInfo* const f_info = ctx->f_info_ + mb_x; - uint8_t* const y_dst = dec->cache_y_ + ctx->id_ * 16 * y_bps + mb_x * 16; - const int level = f_info->f_level_; + const VP8FInfo* const f_info = ctx->f_info_ + mb_x; + uint8_t* const y_dst = dec->cache_y_ + cache_id * 16 * y_bps + mb_x * 16; const int ilevel = f_info->f_ilevel_; - const int limit = 2 * level + ilevel; - if (level == 0) { + const int limit = f_info->f_limit_; + if (limit == 0) { return; } + assert(limit >= 3); if (dec->filter_type_ == 1) { // simple if (mb_x > 0) { VP8SimpleHFilter16(y_dst, y_bps, limit + 4); @@ -63,10 +227,9 @@ static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) { } } else { // complex const int uv_bps = dec->cache_uv_stride_; - uint8_t* const u_dst = dec->cache_u_ + ctx->id_ * 8 * uv_bps + mb_x * 8; - uint8_t* const v_dst = dec->cache_v_ + ctx->id_ * 8 * uv_bps + mb_x * 8; - const int hev_thresh = - hev_thresh_from_level(level, dec->frm_hdr_.key_frame_); + uint8_t* const u_dst = dec->cache_u_ + cache_id * 8 * uv_bps + mb_x * 8; + uint8_t* const v_dst = dec->cache_v_ + cache_id * 8 * uv_bps + mb_x * 8; + const int hev_thresh = f_info->hev_thresh_; if (mb_x > 0) { VP8HFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh); VP8HFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh); @@ -97,53 +260,138 @@ static void FilterRow(const VP8Decoder* const dec) { } //------------------------------------------------------------------------------ +// Precompute the filtering strength for each segment and each i4x4/i16x16 mode. -void VP8StoreBlock(VP8Decoder* const dec) { +static void PrecomputeFilterStrengths(VP8Decoder* const dec) { if (dec->filter_type_ > 0) { - VP8FInfo* const info = dec->f_info_ + dec->mb_x_; - const int skip = dec->mb_info_[dec->mb_x_].skip_; - int level = dec->filter_levels_[dec->segment_]; - if (dec->filter_hdr_.use_lf_delta_) { - // TODO(skal): only CURRENT is handled for now. 
- level += dec->filter_hdr_.ref_lf_delta_[0]; - if (dec->is_i4x4_) { - level += dec->filter_hdr_.mode_lf_delta_[0]; + int s; + const VP8FilterHeader* const hdr = &dec->filter_hdr_; + for (s = 0; s < NUM_MB_SEGMENTS; ++s) { + int i4x4; + // First, compute the initial level + int base_level; + if (dec->segment_hdr_.use_segment_) { + base_level = dec->segment_hdr_.filter_strength_[s]; + if (!dec->segment_hdr_.absolute_delta_) { + base_level += hdr->level_; + } + } else { + base_level = hdr->level_; + } + for (i4x4 = 0; i4x4 <= 1; ++i4x4) { + VP8FInfo* const info = &dec->fstrengths_[s][i4x4]; + int level = base_level; + if (hdr->use_lf_delta_) { + level += hdr->ref_lf_delta_[0]; + if (i4x4) { + level += hdr->mode_lf_delta_[0]; + } + } + level = (level < 0) ? 0 : (level > 63) ? 63 : level; + if (level > 0) { + int ilevel = level; + if (hdr->sharpness_ > 0) { + if (hdr->sharpness_ > 4) { + ilevel >>= 2; + } else { + ilevel >>= 1; + } + if (ilevel > 9 - hdr->sharpness_) { + ilevel = 9 - hdr->sharpness_; + } + } + if (ilevel < 1) ilevel = 1; + info->f_ilevel_ = ilevel; + info->f_limit_ = 2 * level + ilevel; + info->hev_thresh_ = (level >= 40) ? 2 : (level >= 15) ? 1 : 0; + } else { + info->f_limit_ = 0; // no filtering + } + info->f_inner_ = i4x4; } } - level = (level < 0) ? 0 : (level > 63) ? 63 : level; - info->f_level_ = level; + } +} - if (dec->filter_hdr_.sharpness_ > 0) { - if (dec->filter_hdr_.sharpness_ > 4) { - level >>= 2; - } else { - level >>= 1; +//------------------------------------------------------------------------------ +// Dithering + +#define DITHER_AMP_TAB_SIZE 12 +static const int kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = { + // roughly, it's dqm->uv_mat_[1] + 8, 7, 6, 4, 4, 2, 2, 2, 1, 1, 1, 1 +}; + +void VP8InitDithering(const WebPDecoderOptions* const options, + VP8Decoder* const dec) { + assert(dec != NULL); + if (options != NULL) { + const int d = options->dithering_strength; + const int max_amp = (1 << VP8_RANDOM_DITHER_FIX) - 1; + const int f = (d < 0) ? 0 : (d > 100) ? max_amp : (d * max_amp / 100); + if (f > 0) { + int s; + int all_amp = 0; + for (s = 0; s < NUM_MB_SEGMENTS; ++s) { + VP8QuantMatrix* const dqm = &dec->dqm_[s]; + if (dqm->uv_quant_ < DITHER_AMP_TAB_SIZE) { + // TODO(skal): should we specially dither more for uv_quant_ < 0? + const int idx = (dqm->uv_quant_ < 0) ? 0 : dqm->uv_quant_; + dqm->dither_ = (f * kQuantToDitherAmp[idx]) >> 3; + } + all_amp |= dqm->dither_; } - if (level > 9 - dec->filter_hdr_.sharpness_) { - level = 9 - dec->filter_hdr_.sharpness_; + if (all_amp != 0) { + VP8InitRandom(&dec->dithering_rg_, 1.0f); + dec->dither_ = 1; } } + // potentially allow alpha dithering + dec->alpha_dithering_ = options->alpha_dithering_strength; + if (dec->alpha_dithering_ > 100) { + dec->alpha_dithering_ = 100; + } else if (dec->alpha_dithering_ < 0) { + dec->alpha_dithering_ = 0; + } + } +} - info->f_ilevel_ = (level < 1) ? 
1 : level; - info->f_inner_ = (!skip || dec->is_i4x4_); +// minimal amp that will provide a non-zero dithering effect +#define MIN_DITHER_AMP 4 +#define DITHER_DESCALE 4 +#define DITHER_DESCALE_ROUNDER (1 << (DITHER_DESCALE - 1)) +#define DITHER_AMP_BITS 8 +#define DITHER_AMP_CENTER (1 << DITHER_AMP_BITS) + +static void Dither8x8(VP8Random* const rg, uint8_t* dst, int bps, int amp) { + int i, j; + for (j = 0; j < 8; ++j) { + for (i = 0; i < 8; ++i) { + // TODO: could be made faster with SSE2 + const int bits = + VP8RandomBits2(rg, DITHER_AMP_BITS + 1, amp) - DITHER_AMP_CENTER; + // Convert to range: [-2,2] for dither=50, [-4,4] for dither=100 + const int delta = (bits + DITHER_DESCALE_ROUNDER) >> DITHER_DESCALE; + const int v = (int)dst[i] + delta; + dst[i] = (v < 0) ? 0 : (v > 255) ? 255u : (uint8_t)v; + } + dst += bps; } - { - // Transfer samples to row cache - int y; - const int y_offset = dec->cache_id_ * 16 * dec->cache_y_stride_; - const int uv_offset = dec->cache_id_ * 8 * dec->cache_uv_stride_; - uint8_t* const ydst = dec->cache_y_ + dec->mb_x_ * 16 + y_offset; - uint8_t* const udst = dec->cache_u_ + dec->mb_x_ * 8 + uv_offset; - uint8_t* const vdst = dec->cache_v_ + dec->mb_x_ * 8 + uv_offset; - for (y = 0; y < 16; ++y) { - memcpy(ydst + y * dec->cache_y_stride_, - dec->yuv_b_ + Y_OFF + y * BPS, 16); - } - for (y = 0; y < 8; ++y) { - memcpy(udst + y * dec->cache_uv_stride_, - dec->yuv_b_ + U_OFF + y * BPS, 8); - memcpy(vdst + y * dec->cache_uv_stride_, - dec->yuv_b_ + V_OFF + y * BPS, 8); +} + +static void DitherRow(VP8Decoder* const dec) { + int mb_x; + assert(dec->dither_); + for (mb_x = dec->tl_mb_x_; mb_x < dec->br_mb_x_; ++mb_x) { + const VP8ThreadContext* const ctx = &dec->thread_ctx_; + const VP8MBData* const data = ctx->mb_data_ + mb_x; + const int cache_id = ctx->id_; + const int uv_bps = dec->cache_uv_stride_; + if (data->dither_ >= MIN_DITHER_AMP) { + uint8_t* const u_dst = dec->cache_u_ + cache_id * 8 * uv_bps + mb_x * 8; + uint8_t* const v_dst = dec->cache_v_ + cache_id * 8 * uv_bps + mb_x * 8; + Dither8x8(&dec->dithering_rg_, u_dst, uv_bps, data->dither_); + Dither8x8(&dec->dithering_rg_, v_dst, uv_bps, data->dither_); } } } @@ -165,25 +413,35 @@ void VP8StoreBlock(VP8Decoder* const dec) { static int FinishRow(VP8Decoder* const dec, VP8Io* const io) { int ok = 1; const VP8ThreadContext* const ctx = &dec->thread_ctx_; + const int cache_id = ctx->id_; const int extra_y_rows = kFilterExtraRows[dec->filter_type_]; const int ysize = extra_y_rows * dec->cache_y_stride_; const int uvsize = (extra_y_rows / 2) * dec->cache_uv_stride_; - const int y_offset = ctx->id_ * 16 * dec->cache_y_stride_; - const int uv_offset = ctx->id_ * 8 * dec->cache_uv_stride_; + const int y_offset = cache_id * 16 * dec->cache_y_stride_; + const int uv_offset = cache_id * 8 * dec->cache_uv_stride_; uint8_t* const ydst = dec->cache_y_ - ysize + y_offset; uint8_t* const udst = dec->cache_u_ - uvsize + uv_offset; uint8_t* const vdst = dec->cache_v_ - uvsize + uv_offset; - const int first_row = (ctx->mb_y_ == 0); - const int last_row = (ctx->mb_y_ >= dec->br_mb_y_ - 1); - int y_start = MACROBLOCK_VPOS(ctx->mb_y_); - int y_end = MACROBLOCK_VPOS(ctx->mb_y_ + 1); + const int mb_y = ctx->mb_y_; + const int is_first_row = (mb_y == 0); + const int is_last_row = (mb_y >= dec->br_mb_y_ - 1); + + if (dec->mt_method_ == 2) { + ReconstructRow(dec, ctx); + } if (ctx->filter_row_) { FilterRow(dec); } - if (io->put) { - if (!first_row) { + if (dec->dither_) { + DitherRow(dec); + } + + if (io->put != NULL) { + int 
y_start = MACROBLOCK_VPOS(mb_y); + int y_end = MACROBLOCK_VPOS(mb_y + 1); + if (!is_first_row) { y_start -= extra_y_rows; io->y = ydst; io->u = udst; @@ -194,7 +452,7 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) { io->v = dec->cache_v_ + uv_offset; } - if (!last_row) { + if (!is_last_row) { y_end -= extra_y_rows; } if (y_end > io->crop_bottom) { @@ -202,11 +460,8 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) { } io->a = NULL; if (dec->alpha_data_ != NULL && y_start < y_end) { - // TODO(skal): several things to correct here: - // * testing presence of alpha with dec->alpha_data_ is not a good idea - // * we're actually decompressing the full plane only once. It should be - // more obvious from signature. - // * we could free alpha_data_ right after this call, but we don't own. + // TODO(skal): testing presence of alpha with dec->alpha_data_ is not a + // good idea. io->a = VP8DecompressAlphaRows(dec, y_start, y_end - y_start); if (io->a == NULL) { return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR, @@ -238,8 +493,8 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) { } } // rotate top samples if needed - if (ctx->id_ + 1 == dec->num_caches_) { - if (!last_row) { + if (cache_id + 1 == dec->num_caches_) { + if (!is_last_row) { memcpy(dec->cache_y_ - ysize, ydst + 16 * dec->cache_y_stride_, ysize); memcpy(dec->cache_u_ - uvsize, udst + 8 * dec->cache_uv_stride_, uvsize); memcpy(dec->cache_v_ - uvsize, vdst + 8 * dec->cache_uv_stride_, uvsize); @@ -256,27 +511,40 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) { int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) { int ok = 1; VP8ThreadContext* const ctx = &dec->thread_ctx_; - if (!dec->use_threads_) { + const int filter_row = + (dec->filter_type_ > 0) && + (dec->mb_y_ >= dec->tl_mb_y_) && (dec->mb_y_ <= dec->br_mb_y_); + if (dec->mt_method_ == 0) { // ctx->id_ and ctx->f_info_ are already set ctx->mb_y_ = dec->mb_y_; - ctx->filter_row_ = dec->filter_row_; + ctx->filter_row_ = filter_row; + ReconstructRow(dec, ctx); ok = FinishRow(dec, io); } else { WebPWorker* const worker = &dec->worker_; // Finish previous job *before* updating context - ok &= WebPWorkerSync(worker); + ok &= WebPGetWorkerInterface()->Sync(worker); assert(worker->status_ == OK); if (ok) { // spawn a new deblocking/output job ctx->io_ = *io; ctx->id_ = dec->cache_id_; ctx->mb_y_ = dec->mb_y_; - ctx->filter_row_ = dec->filter_row_; - if (ctx->filter_row_) { // just swap filter info + ctx->filter_row_ = filter_row; + if (dec->mt_method_ == 2) { // swap macroblock data + VP8MBData* const tmp = ctx->mb_data_; + ctx->mb_data_ = dec->mb_data_; + dec->mb_data_ = tmp; + } else { + // perform reconstruction directly in main thread + ReconstructRow(dec, ctx); + } + if (filter_row) { // swap filter info VP8FInfo* const tmp = ctx->f_info_; ctx->f_info_ = dec->f_info_; dec->f_info_ = tmp; } - WebPWorkerLaunch(worker); + // (reconstruct)+filter in parallel + WebPGetWorkerInterface()->Launch(worker); if (++dec->cache_id_ == dec->num_caches_) { dec->cache_id_ = 0; } @@ -290,8 +558,8 @@ int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) { VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) { // Call setup() first. This may trigger additional decoding features on 'io'. - // Note: Afterward, we must call teardown() not matter what. - if (io->setup && !io->setup(io)) { + // Note: Afterward, we must call teardown() no matter what. 
+ if (io->setup != NULL && !io->setup(io)) { VP8SetError(dec, VP8_STATUS_USER_ABORT, "Frame setup failed"); return dec->status_; } @@ -304,7 +572,7 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) { // Define the area where we can skip in-loop filtering, in case of cropping. // - // 'Simple' filter reads two luma samples outside of the macroblock and + // 'Simple' filter reads two luma samples outside of the macroblock // and filters one. It doesn't filter the chroma samples. Hence, we can // avoid doing the in-loop filtering before crop_top/crop_left position. // For the 'Complex' filter, 3 samples are read and up to 3 are filtered. @@ -339,16 +607,17 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) { dec->br_mb_y_ = dec->mb_h_; } } + PrecomputeFilterStrengths(dec); return VP8_STATUS_OK; } int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) { int ok = 1; - if (dec->use_threads_) { - ok = WebPWorkerSync(&dec->worker_); + if (dec->mt_method_ > 0) { + ok = WebPGetWorkerInterface()->Sync(&dec->worker_); } - if (io->teardown) { + if (io->teardown != NULL) { io->teardown(io); } return ok; @@ -384,9 +653,9 @@ int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) { // Initialize multi/single-thread worker static int InitThreadContext(VP8Decoder* const dec) { dec->cache_id_ = 0; - if (dec->use_threads_) { + if (dec->mt_method_ > 0) { WebPWorker* const worker = &dec->worker_; - if (!WebPWorkerReset(worker)) { + if (!WebPGetWorkerInterface()->Reset(worker)) { return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY, "thread initialization failed."); } @@ -401,6 +670,28 @@ static int InitThreadContext(VP8Decoder* const dec) { return 1; } +int VP8GetThreadMethod(const WebPDecoderOptions* const options, + const WebPHeaderStructure* const headers, + int width, int height) { + if (options == NULL || options->use_threads == 0) { + return 0; + } + (void)headers; + (void)width; + (void)height; + assert(headers == NULL || !headers->is_lossless); +#if defined(WEBP_USE_THREAD) + if (width < MIN_WIDTH_FOR_THREADS) return 0; + // TODO(skal): tune the heuristic further +#if 0 + if (height < 2 * width) return 2; +#endif + return 2; +#else // !WEBP_USE_THREAD + return 0; +#endif +} + #undef MT_CACHE_LINES #undef ST_CACHE_LINES @@ -412,14 +703,15 @@ static int AllocateMemory(VP8Decoder* const dec) { const int mb_w = dec->mb_w_; // Note: we use 'size_t' when there's no overflow risk, uint64_t otherwise. const size_t intra_pred_mode_size = 4 * mb_w * sizeof(uint8_t); - const size_t top_size = (16 + 8 + 8) * mb_w; + const size_t top_size = sizeof(VP8TopSamples) * mb_w; const size_t mb_info_size = (mb_w + 1) * sizeof(VP8MB); const size_t f_info_size = (dec->filter_type_ > 0) ? - mb_w * (dec->use_threads_ ? 2 : 1) * sizeof(VP8FInfo) + mb_w * (dec->mt_method_ > 0 ? 2 : 1) * sizeof(VP8FInfo) : 0; const size_t yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_); - const size_t coeffs_size = 384 * sizeof(*dec->coeffs_); + const size_t mb_data_size = + (dec->mt_method_ == 2 ? 
2 : 1) * mb_w * sizeof(*dec->mb_data_); const size_t cache_height = (16 * num_caches + kFilterExtraRows[dec->filter_type_]) * 3 / 2; const size_t cache_size = top_size * cache_height; @@ -428,13 +720,13 @@ static int AllocateMemory(VP8Decoder* const dec) { (uint64_t)dec->pic_hdr_.width_ * dec->pic_hdr_.height_ : 0ULL; const uint64_t needed = (uint64_t)intra_pred_mode_size + top_size + mb_info_size + f_info_size - + yuv_size + coeffs_size - + cache_size + alpha_size + ALIGN_MASK; + + yuv_size + mb_data_size + + cache_size + alpha_size + WEBP_ALIGN_CST; uint8_t* mem; if (needed != (size_t)needed) return 0; // check for overflow if (needed > dec->mem_size_) { - free(dec->mem_); + WebPSafeFree(dec->mem_); dec->mem_size_ = 0; dec->mem_ = WebPSafeMalloc(needed, sizeof(uint8_t)); if (dec->mem_ == NULL) { @@ -449,12 +741,8 @@ static int AllocateMemory(VP8Decoder* const dec) { dec->intra_t_ = (uint8_t*)mem; mem += intra_pred_mode_size; - dec->y_t_ = (uint8_t*)mem; - mem += 16 * mb_w; - dec->u_t_ = (uint8_t*)mem; - mem += 8 * mb_w; - dec->v_t_ = (uint8_t*)mem; - mem += 8 * mb_w; + dec->yuv_t_ = (VP8TopSamples*)mem; + mem += top_size; dec->mb_info_ = ((VP8MB*)mem) + 1; mem += mb_info_size; @@ -463,20 +751,24 @@ static int AllocateMemory(VP8Decoder* const dec) { mem += f_info_size; dec->thread_ctx_.id_ = 0; dec->thread_ctx_.f_info_ = dec->f_info_; - if (dec->use_threads_) { + if (dec->mt_method_ > 0) { // secondary cache line. The deblocking process need to make use of the // filtering strength from previous macroblock row, while the new ones // are being decoded in parallel. We'll just swap the pointers. dec->thread_ctx_.f_info_ += mb_w; } - mem = (uint8_t*)((uintptr_t)(mem + ALIGN_MASK) & ~ALIGN_MASK); - assert((yuv_size & ALIGN_MASK) == 0); + mem = (uint8_t*)WEBP_ALIGN(mem); + assert((yuv_size & WEBP_ALIGN_CST) == 0); dec->yuv_b_ = (uint8_t*)mem; mem += yuv_size; - dec->coeffs_ = (int16_t*)mem; - mem += coeffs_size; + dec->mb_data_ = (VP8MBData*)mem; + dec->thread_ctx_.mb_data_ = (VP8MBData*)mem; + if (dec->mt_method_ == 2) { + dec->thread_ctx_.mb_data_ += mb_w; + } + mem += mb_data_size; dec->cache_y_stride_ = 16 * mb_w; dec->cache_uv_stride_ = 8 * mb_w; @@ -496,9 +788,11 @@ static int AllocateMemory(VP8Decoder* const dec) { // alpha plane dec->alpha_plane_ = alpha_size ? (uint8_t*)mem : NULL; mem += alpha_size; + assert(mem <= (uint8_t*)dec->mem_ + dec->mem_size_); - // note: left-info is initialized once for all. + // note: left/top-info is initialized once for all. memset(dec->mb_info_ - 1, 0, mb_info_size); + VP8InitScanline(dec); // initialize left too. // initialize top memset(dec->intra_t_, B_DC_PRED, intra_pred_mode_size); @@ -517,7 +811,7 @@ static void InitIo(VP8Decoder* const dec, VP8Io* io) { io->a = NULL; } -int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) { +int VP8InitFrame(VP8Decoder* const dec, VP8Io* const io) { if (!InitThreadContext(dec)) return 0; // call first. Sets dec->num_caches_. if (!AllocateMemory(dec)) return 0; InitIo(dec, io); @@ -526,154 +820,3 @@ int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) { } //------------------------------------------------------------------------------ -// Main reconstruction function. 
- -static const int kScan[16] = { - 0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS, - 0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS, - 0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS, - 0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS -}; - -static WEBP_INLINE int CheckMode(VP8Decoder* const dec, int mode) { - if (mode == B_DC_PRED) { - if (dec->mb_x_ == 0) { - return (dec->mb_y_ == 0) ? B_DC_PRED_NOTOPLEFT : B_DC_PRED_NOLEFT; - } else { - return (dec->mb_y_ == 0) ? B_DC_PRED_NOTOP : B_DC_PRED; - } - } - return mode; -} - -static WEBP_INLINE void Copy32b(uint8_t* dst, uint8_t* src) { - *(uint32_t*)dst = *(uint32_t*)src; -} - -void VP8ReconstructBlock(VP8Decoder* const dec) { - uint8_t* const y_dst = dec->yuv_b_ + Y_OFF; - uint8_t* const u_dst = dec->yuv_b_ + U_OFF; - uint8_t* const v_dst = dec->yuv_b_ + V_OFF; - - // Rotate in the left samples from previously decoded block. We move four - // pixels at a time for alignment reason, and because of in-loop filter. - if (dec->mb_x_ > 0) { - int j; - for (j = -1; j < 16; ++j) { - Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]); - } - for (j = -1; j < 8; ++j) { - Copy32b(&u_dst[j * BPS - 4], &u_dst[j * BPS + 4]); - Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]); - } - } else { - int j; - for (j = 0; j < 16; ++j) { - y_dst[j * BPS - 1] = 129; - } - for (j = 0; j < 8; ++j) { - u_dst[j * BPS - 1] = 129; - v_dst[j * BPS - 1] = 129; - } - // Init top-left sample on left column too - if (dec->mb_y_ > 0) { - y_dst[-1 - BPS] = u_dst[-1 - BPS] = v_dst[-1 - BPS] = 129; - } - } - { - // bring top samples into the cache - uint8_t* const top_y = dec->y_t_ + dec->mb_x_ * 16; - uint8_t* const top_u = dec->u_t_ + dec->mb_x_ * 8; - uint8_t* const top_v = dec->v_t_ + dec->mb_x_ * 8; - const int16_t* coeffs = dec->coeffs_; - int n; - - if (dec->mb_y_ > 0) { - memcpy(y_dst - BPS, top_y, 16); - memcpy(u_dst - BPS, top_u, 8); - memcpy(v_dst - BPS, top_v, 8); - } else if (dec->mb_x_ == 0) { - // we only need to do this init once at block (0,0). - // Afterward, it remains valid for the whole topmost row. - memset(y_dst - BPS - 1, 127, 16 + 4 + 1); - memset(u_dst - BPS - 1, 127, 8 + 1); - memset(v_dst - BPS - 1, 127, 8 + 1); - } - - // predict and add residuals - - if (dec->is_i4x4_) { // 4x4 - uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16); - - if (dec->mb_y_ > 0) { - if (dec->mb_x_ >= dec->mb_w_ - 1) { // on rightmost border - top_right[0] = top_y[15] * 0x01010101u; - } else { - memcpy(top_right, top_y + 16, sizeof(*top_right)); - } - } - // replicate the top-right pixels below - top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0]; - - // predict and add residues for all 4x4 blocks in turn. 
- for (n = 0; n < 16; n++) { - uint8_t* const dst = y_dst + kScan[n]; - VP8PredLuma4[dec->imodes_[n]](dst); - if (dec->non_zero_ac_ & (1 << n)) { - VP8Transform(coeffs + n * 16, dst, 0); - } else if (dec->non_zero_ & (1 << n)) { // only DC is present - VP8TransformDC(coeffs + n * 16, dst); - } - } - } else { // 16x16 - const int pred_func = CheckMode(dec, dec->imodes_[0]); - VP8PredLuma16[pred_func](y_dst); - if (dec->non_zero_) { - for (n = 0; n < 16; n++) { - uint8_t* const dst = y_dst + kScan[n]; - if (dec->non_zero_ac_ & (1 << n)) { - VP8Transform(coeffs + n * 16, dst, 0); - } else if (dec->non_zero_ & (1 << n)) { // only DC is present - VP8TransformDC(coeffs + n * 16, dst); - } - } - } - } - { - // Chroma - const int pred_func = CheckMode(dec, dec->uvmode_); - VP8PredChroma8[pred_func](u_dst); - VP8PredChroma8[pred_func](v_dst); - - if (dec->non_zero_ & 0x0f0000) { // chroma-U - const int16_t* const u_coeffs = dec->coeffs_ + 16 * 16; - if (dec->non_zero_ac_ & 0x0f0000) { - VP8TransformUV(u_coeffs, u_dst); - } else { - VP8TransformDCUV(u_coeffs, u_dst); - } - } - if (dec->non_zero_ & 0xf00000) { // chroma-V - const int16_t* const v_coeffs = dec->coeffs_ + 20 * 16; - if (dec->non_zero_ac_ & 0xf00000) { - VP8TransformUV(v_coeffs, v_dst); - } else { - VP8TransformDCUV(v_coeffs, v_dst); - } - } - - // stash away top samples for next block - if (dec->mb_y_ < dec->mb_h_ - 1) { - memcpy(top_y, y_dst + 15 * BPS, 16); - memcpy(top_u, u_dst + 7 * BPS, 8); - memcpy(top_v, v_dst + 7 * BPS, 8); - } - } - } -} - -//------------------------------------------------------------------------------ - -#if defined(__cplusplus) || defined(c_plusplus) -} // extern "C" -#endif diff --git a/drivers/webp/dec/idec.c b/drivers/webp/dec/idec.c index 7df790ced8..abafb9f3d1 100644 --- a/drivers/webp/dec/idec.c +++ b/drivers/webp/dec/idec.c @@ -1,8 +1,10 @@ // Copyright 2011 Google Inc. All Rights Reserved. // -// This code is licensed under the same terms as WebM: -// Software License Agreement: http://www.webmproject.org/license/software/ -// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. // ----------------------------------------------------------------------------- // // Incremental decoding @@ -13,14 +15,11 @@ #include <string.h> #include <stdlib.h> +#include "./alphai.h" #include "./webpi.h" #include "./vp8i.h" #include "../utils/utils.h" -#if defined(__cplusplus) || defined(c_plusplus) -extern "C" { -#endif - // In append mode, buffer allocations increase as multiples of this value. // Needs to be a power of 2. #define CHUNK_SIZE 4096 @@ -29,11 +28,13 @@ extern "C" { //------------------------------------------------------------------------------ // Data structures for memory and states -// Decoding states. State normally flows like HEADER->PARTS0->DATA->DONE. +// Decoding states. State normally flows as: +// WEBP_HEADER->VP8_HEADER->VP8_PARTS0->VP8_DATA->DONE for a lossy image, and +// WEBP_HEADER->VP8L_HEADER->VP8L_DATA->DONE for a lossless image. // If there is any error the decoder goes into state ERROR. typedef enum { - STATE_PRE_VP8, // All data before that of the first VP8 chunk. - STATE_VP8_FRAME_HEADER, // For VP8 Frame header (within VP8 chunk). 
+ STATE_WEBP_HEADER, // All the data before that of the VP8/VP8L chunk. + STATE_VP8_HEADER, // The VP8 Frame header (within the VP8 chunk). STATE_VP8_PARTS0, STATE_VP8_DATA, STATE_VP8L_HEADER, @@ -71,32 +72,41 @@ struct WebPIDecoder { MemBuffer mem_; // input memory buffer. WebPDecBuffer output_; // output buffer (when no external one is supplied) size_t chunk_size_; // Compressed VP8/VP8L size extracted from Header. + + int last_mb_y_; // last row reached for intra-mode decoding }; // MB context to restore in case VP8DecodeMB() fails typedef struct { VP8MB left_; VP8MB info_; - uint8_t intra_t_[4]; - uint8_t intra_l_[4]; - VP8BitReader br_; VP8BitReader token_br_; } MBContext; //------------------------------------------------------------------------------ // MemBuffer: incoming data handling -static void RemapBitReader(VP8BitReader* const br, ptrdiff_t offset) { - if (br->buf_ != NULL) { - br->buf_ += offset; - br->buf_end_ += offset; - } -} - static WEBP_INLINE size_t MemDataSize(const MemBuffer* mem) { return (mem->end_ - mem->start_); } +// Check if we need to preserve the compressed alpha data, as it may not have +// been decoded yet. +static int NeedCompressedAlpha(const WebPIDecoder* const idec) { + if (idec->state_ == STATE_WEBP_HEADER) { + // We haven't parsed the headers yet, so we don't know whether the image is + // lossy or lossless. This also means that we haven't parsed the ALPH chunk. + return 0; + } + if (idec->is_lossless_) { + return 0; // ALPH chunk is not present for lossless images. + } else { + const VP8Decoder* const dec = (VP8Decoder*)idec->dec_; + assert(dec != NULL); // Must be true as idec->state_ != STATE_WEBP_HEADER. + return (dec->alpha_data_ != NULL) && !dec->is_alpha_decoded_; + } +} + static void DoRemap(WebPIDecoder* const idec, ptrdiff_t offset) { MemBuffer* const mem = &idec->mem_; const uint8_t* const new_base = mem->buf_ + mem->start_; @@ -112,16 +122,36 @@ static void DoRemap(WebPIDecoder* const idec, ptrdiff_t offset) { if (offset != 0) { int p; for (p = 0; p <= last_part; ++p) { - RemapBitReader(dec->parts_ + p, offset); + VP8RemapBitReader(dec->parts_ + p, offset); } // Remap partition #0 data pointer to new offset, but only in MAP // mode (in APPEND mode, partition #0 is copied into a fixed memory). if (mem->mode_ == MEM_MODE_MAP) { - RemapBitReader(&dec->br_, offset); + VP8RemapBitReader(&dec->br_, offset); + } + } + { + const uint8_t* const last_start = dec->parts_[last_part].buf_; + assert(last_part >= 0); + VP8BitReaderSetBuffer(&dec->parts_[last_part], last_start, + mem->buf_ + mem->end_ - last_start); + } + if (NeedCompressedAlpha(idec)) { + ALPHDecoder* const alph_dec = dec->alph_dec_; + dec->alpha_data_ += offset; + if (alph_dec != NULL) { + if (alph_dec->method_ == ALPHA_LOSSLESS_COMPRESSION) { + VP8LDecoder* const alph_vp8l_dec = alph_dec->vp8l_dec_; + assert(alph_vp8l_dec != NULL); + assert(dec->alpha_data_size_ >= ALPHA_HEADER_LEN); + VP8LBitReaderSetBuffer(&alph_vp8l_dec->br_, + dec->alpha_data_ + ALPHA_HEADER_LEN, + dec->alpha_data_size_ - ALPHA_HEADER_LEN); + } else { // alph_dec->method_ == ALPHA_NO_COMPRESSION + // Nothing special to do in this case. 
+ } } } - assert(last_part >= 0); - dec->parts_[last_part].buf_end_ = mem->buf_ + mem->end_; } else { // Resize lossless bitreader VP8LDecoder* const dec = (VP8LDecoder*)idec->dec_; VP8LBitReaderSetBuffer(&dec->br_, new_base, MemDataSize(mem)); @@ -133,8 +163,12 @@ static void DoRemap(WebPIDecoder* const idec, ptrdiff_t offset) { // size if required and also updates VP8BitReader's if new memory is allocated. static int AppendToMemBuffer(WebPIDecoder* const idec, const uint8_t* const data, size_t data_size) { + VP8Decoder* const dec = (VP8Decoder*)idec->dec_; MemBuffer* const mem = &idec->mem_; - const uint8_t* const old_base = mem->buf_ + mem->start_; + const int need_compressed_alpha = NeedCompressedAlpha(idec); + const uint8_t* const old_start = mem->buf_ + mem->start_; + const uint8_t* const old_base = + need_compressed_alpha ? dec->alpha_data_ : old_start; assert(mem->mode_ == MEM_MODE_APPEND); if (data_size > MAX_CHUNK_PAYLOAD) { // security safeguard: trying to allocate more than what the format @@ -143,17 +177,18 @@ static int AppendToMemBuffer(WebPIDecoder* const idec, } if (mem->end_ + data_size > mem->buf_size_) { // Need some free memory - const size_t current_size = MemDataSize(mem); + const size_t new_mem_start = old_start - old_base; + const size_t current_size = MemDataSize(mem) + new_mem_start; const uint64_t new_size = (uint64_t)current_size + data_size; const uint64_t extra_size = (new_size + CHUNK_SIZE - 1) & ~(CHUNK_SIZE - 1); uint8_t* const new_buf = (uint8_t*)WebPSafeMalloc(extra_size, sizeof(*new_buf)); if (new_buf == NULL) return 0; memcpy(new_buf, old_base, current_size); - free(mem->buf_); + WebPSafeFree(mem->buf_); mem->buf_ = new_buf; mem->buf_size_ = (size_t)extra_size; - mem->start_ = 0; + mem->start_ = new_mem_start; mem->end_ = current_size; } @@ -161,14 +196,15 @@ static int AppendToMemBuffer(WebPIDecoder* const idec, mem->end_ += data_size; assert(mem->end_ <= mem->buf_size_); - DoRemap(idec, mem->buf_ + mem->start_ - old_base); + DoRemap(idec, mem->buf_ + mem->start_ - old_start); return 1; } static int RemapMemBuffer(WebPIDecoder* const idec, const uint8_t* const data, size_t data_size) { MemBuffer* const mem = &idec->mem_; - const uint8_t* const old_base = mem->buf_ + mem->start_; + const uint8_t* const old_buf = mem->buf_; + const uint8_t* const old_start = old_buf + mem->start_; assert(mem->mode_ == MEM_MODE_MAP); if (data_size < mem->buf_size_) return 0; // can't remap to a shorter buffer! @@ -176,7 +212,7 @@ static int RemapMemBuffer(WebPIDecoder* const idec, mem->buf_ = (uint8_t*)data; mem->end_ = mem->buf_size_ = data_size; - DoRemap(idec, mem->buf_ + mem->start_ - old_base); + DoRemap(idec, mem->buf_ + mem->start_ - old_start); return 1; } @@ -191,8 +227,8 @@ static void InitMemBuffer(MemBuffer* const mem) { static void ClearMemBuffer(MemBuffer* const mem) { assert(mem); if (mem->mode_ == MEM_MODE_APPEND) { - free(mem->buf_); - free((void*)mem->part0_buf_); + WebPSafeFree(mem->buf_); + WebPSafeFree((void*)mem->part0_buf_); } } @@ -206,35 +242,34 @@ static int CheckMemBufferMode(MemBuffer* const mem, MemBufferMode expected) { return 1; } +// To be called last. 
+static VP8StatusCode FinishDecoding(WebPIDecoder* const idec) { + const WebPDecoderOptions* const options = idec->params_.options; + WebPDecBuffer* const output = idec->params_.output; + + idec->state_ = STATE_DONE; + if (options != NULL && options->flip) { + return WebPFlipBuffer(output); + } else { + return VP8_STATUS_OK; + } +} + //------------------------------------------------------------------------------ // Macroblock-decoding contexts static void SaveContext(const VP8Decoder* dec, const VP8BitReader* token_br, MBContext* const context) { - const VP8BitReader* const br = &dec->br_; - const VP8MB* const left = dec->mb_info_ - 1; - const VP8MB* const info = dec->mb_info_ + dec->mb_x_; - - context->left_ = *left; - context->info_ = *info; - context->br_ = *br; + context->left_ = dec->mb_info_[-1]; + context->info_ = dec->mb_info_[dec->mb_x_]; context->token_br_ = *token_br; - memcpy(context->intra_t_, dec->intra_t_ + 4 * dec->mb_x_, 4); - memcpy(context->intra_l_, dec->intra_l_, 4); } static void RestoreContext(const MBContext* context, VP8Decoder* const dec, VP8BitReader* const token_br) { - VP8BitReader* const br = &dec->br_; - VP8MB* const left = dec->mb_info_ - 1; - VP8MB* const info = dec->mb_info_ + dec->mb_x_; - - *left = context->left_; - *info = context->info_; - *br = context->br_; + dec->mb_info_[-1] = context->left_; + dec->mb_info_[dec->mb_x_] = context->info_; *token_br = context->token_br_; - memcpy(dec->intra_t_ + 4 * dec->mb_x_, context->intra_t_, 4); - memcpy(dec->intra_l_, context->intra_l_, 4); } //------------------------------------------------------------------------------ @@ -242,7 +277,7 @@ static void RestoreContext(const MBContext* context, VP8Decoder* const dec, static VP8StatusCode IDecError(WebPIDecoder* const idec, VP8StatusCode error) { if (idec->state_ == STATE_VP8_DATA) { VP8Io* const io = &idec->io_; - if (io->teardown) { + if (io->teardown != NULL) { io->teardown(io); } } @@ -270,6 +305,7 @@ static VP8StatusCode DecodeWebPHeaders(WebPIDecoder* const idec) { headers.data = data; headers.data_size = curr_size; + headers.have_all_data = 0; status = WebPParseHeaders(&headers); if (status == VP8_STATUS_NOT_ENOUGH_DATA) { return VP8_STATUS_SUSPENDED; // We haven't found a VP8 chunk yet. @@ -285,15 +321,9 @@ static VP8StatusCode DecodeWebPHeaders(WebPIDecoder* const idec) { return VP8_STATUS_OUT_OF_MEMORY; } idec->dec_ = dec; -#ifdef WEBP_USE_THREAD - dec->use_threads_ = (idec->params_.options != NULL) && - (idec->params_.options->use_threads > 0); -#else - dec->use_threads_ = 0; -#endif dec->alpha_data_ = headers.alpha_data; dec->alpha_data_size_ = headers.alpha_data_size; - ChangeState(idec, STATE_VP8_FRAME_HEADER, headers.offset); + ChangeState(idec, STATE_VP8_HEADER, headers.offset); } else { VP8LDecoder* const dec = VP8LNew(); if (dec == NULL) { @@ -308,13 +338,14 @@ static VP8StatusCode DecodeWebPHeaders(WebPIDecoder* const idec) { static VP8StatusCode DecodeVP8FrameHeader(WebPIDecoder* const idec) { const uint8_t* data = idec->mem_.buf_ + idec->mem_.start_; const size_t curr_size = MemDataSize(&idec->mem_); + int width, height; uint32_t bits; if (curr_size < VP8_FRAME_HEADER_SIZE) { // Not enough data bytes to extract VP8 Frame Header. 
return VP8_STATUS_SUSPENDED; } - if (!VP8GetInfo(data, curr_size, idec->chunk_size_, NULL, NULL)) { + if (!VP8GetInfo(data, curr_size, idec->chunk_size_, &width, &height)) { return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR); } @@ -328,30 +359,32 @@ static VP8StatusCode DecodeVP8FrameHeader(WebPIDecoder* const idec) { } // Partition #0 -static int CopyParts0Data(WebPIDecoder* const idec) { +static VP8StatusCode CopyParts0Data(WebPIDecoder* const idec) { VP8Decoder* const dec = (VP8Decoder*)idec->dec_; VP8BitReader* const br = &dec->br_; - const size_t psize = br->buf_end_ - br->buf_; + const size_t part_size = br->buf_end_ - br->buf_; MemBuffer* const mem = &idec->mem_; assert(!idec->is_lossless_); assert(mem->part0_buf_ == NULL); - assert(psize > 0); - assert(psize <= mem->part0_size_); // Format limit: no need for runtime check + // the following is a format limitation, no need for runtime check: + assert(part_size <= mem->part0_size_); + if (part_size == 0) { // can't have zero-size partition #0 + return VP8_STATUS_BITSTREAM_ERROR; + } if (mem->mode_ == MEM_MODE_APPEND) { // We copy and grab ownership of the partition #0 data. - uint8_t* const part0_buf = (uint8_t*)malloc(psize); + uint8_t* const part0_buf = (uint8_t*)WebPSafeMalloc(1ULL, part_size); if (part0_buf == NULL) { - return 0; + return VP8_STATUS_OUT_OF_MEMORY; } - memcpy(part0_buf, br->buf_, psize); + memcpy(part0_buf, br->buf_, part_size); mem->part0_buf_ = part0_buf; - br->buf_ = part0_buf; - br->buf_end_ = part0_buf + psize; + VP8BitReaderSetBuffer(br, part0_buf, part_size); } else { // Else: just keep pointers to the partition #0's data in dec_->br_. } - mem->start_ += psize; - return 1; + mem->start_ += part_size; + return VP8_STATUS_OK; } static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) { @@ -381,9 +414,14 @@ static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) { if (dec->status_ != VP8_STATUS_OK) { return IDecError(idec, dec->status_); } + // This change must be done before calling VP8InitFrame() + dec->mt_method_ = VP8GetThreadMethod(params->options, NULL, + io->width, io->height); + VP8InitDithering(params->options, dec); - if (!CopyParts0Data(idec)) { - return IDecError(idec, VP8_STATUS_OUT_OF_MEMORY); + dec->status_ = CopyParts0Data(idec); + if (dec->status_ != VP8_STATUS_OK) { + return IDecError(idec, dec->status_); } // Finish setting up the decoding parameters. Will call io->setup(). @@ -407,50 +445,52 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) { VP8Io* const io = &idec->io_; assert(dec->ready_); - for (; dec->mb_y_ < dec->mb_h_; ++dec->mb_y_) { - VP8BitReader* token_br = &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)]; - if (dec->mb_x_ == 0) { - VP8InitScanline(dec); + if (idec->last_mb_y_ != dec->mb_y_) { + if (!VP8ParseIntraModeRow(&dec->br_, dec)) { + // note: normally, error shouldn't occur since we already have the whole + // partition0 available here in DecodeRemaining(). Reaching EOF while + // reading intra modes really means a BITSTREAM_ERROR. 
+ return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR); + } + idec->last_mb_y_ = dec->mb_y_; } - for (; dec->mb_x_ < dec->mb_w_; dec->mb_x_++) { + for (; dec->mb_x_ < dec->mb_w_; ++dec->mb_x_) { + VP8BitReader* const token_br = + &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)]; MBContext context; SaveContext(dec, token_br, &context); - if (!VP8DecodeMB(dec, token_br)) { - RestoreContext(&context, dec, token_br); // We shouldn't fail when MAX_MB data was available if (dec->num_parts_ == 1 && MemDataSize(&idec->mem_) > MAX_MB_SIZE) { return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR); } + RestoreContext(&context, dec, token_br); return VP8_STATUS_SUSPENDED; } - VP8ReconstructBlock(dec); - // Store data and save block's filtering params - VP8StoreBlock(dec); - // Release buffer only if there is only one partition if (dec->num_parts_ == 1) { idec->mem_.start_ = token_br->buf_ - idec->mem_.buf_; assert(idec->mem_.start_ <= idec->mem_.end_); } } + VP8InitScanline(dec); // Prepare for next scanline + + // Reconstruct, filter and emit the row. if (!VP8ProcessRow(dec, io)) { return IDecError(idec, VP8_STATUS_USER_ABORT); } - dec->mb_x_ = 0; } // Synchronize the thread and check for errors. if (!VP8ExitCritical(dec, io)) { return IDecError(idec, VP8_STATUS_USER_ABORT); } dec->ready_ = 0; - idec->state_ = STATE_DONE; - - return VP8_STATUS_OK; + return FinishDecoding(idec); } -static int ErrorStatusLossless(WebPIDecoder* const idec, VP8StatusCode status) { +static VP8StatusCode ErrorStatusLossless(WebPIDecoder* const idec, + VP8StatusCode status) { if (status == VP8_STATUS_SUSPENDED || status == VP8_STATUS_NOT_ENOUGH_DATA) { return VP8_STATUS_SUSPENDED; } @@ -467,9 +507,15 @@ static VP8StatusCode DecodeVP8LHeader(WebPIDecoder* const idec) { // Wait until there's enough data for decoding header. if (curr_size < (idec->chunk_size_ >> 3)) { - return VP8_STATUS_SUSPENDED; + dec->status_ = VP8_STATUS_SUSPENDED; + return ErrorStatusLossless(idec, dec->status_); } + if (!VP8LDecodeHeader(dec, io)) { + if (dec->status_ == VP8_STATUS_BITSTREAM_ERROR && + curr_size < idec->chunk_size_) { + dec->status_ = VP8_STATUS_SUSPENDED; + } return ErrorStatusLossless(idec, dec->status_); } // Allocate/verify output buffer now. @@ -488,33 +534,29 @@ static VP8StatusCode DecodeVP8LData(WebPIDecoder* const idec) { const size_t curr_size = MemDataSize(&idec->mem_); assert(idec->is_lossless_); - // At present Lossless decoder can't decode image incrementally. So wait till - // all the image data is aggregated before image can be decoded. - if (curr_size < idec->chunk_size_) { - return VP8_STATUS_SUSPENDED; - } + // Switch to incremental decoding if we don't have all the bytes available. + dec->incremental_ = (curr_size < idec->chunk_size_); if (!VP8LDecodeImage(dec)) { return ErrorStatusLossless(idec, dec->status_); } - - idec->state_ = STATE_DONE; - - return VP8_STATUS_OK; + assert(dec->status_ == VP8_STATUS_OK || dec->status_ == VP8_STATUS_SUSPENDED); + return (dec->status_ == VP8_STATUS_SUSPENDED) ? dec->status_ + : FinishDecoding(idec); } // Main decoding loop static VP8StatusCode IDecode(WebPIDecoder* idec) { VP8StatusCode status = VP8_STATUS_SUSPENDED; - if (idec->state_ == STATE_PRE_VP8) { + if (idec->state_ == STATE_WEBP_HEADER) { status = DecodeWebPHeaders(idec); } else { if (idec->dec_ == NULL) { return VP8_STATUS_SUSPENDED; // can't continue if we have no decoder. 
} } - if (idec->state_ == STATE_VP8_FRAME_HEADER) { + if (idec->state_ == STATE_VP8_HEADER) { status = DecodeVP8FrameHeader(idec); } if (idec->state_ == STATE_VP8_PARTS0) { @@ -536,20 +578,23 @@ static VP8StatusCode IDecode(WebPIDecoder* idec) { // Public functions WebPIDecoder* WebPINewDecoder(WebPDecBuffer* output_buffer) { - WebPIDecoder* idec = (WebPIDecoder*)calloc(1, sizeof(*idec)); + WebPIDecoder* idec = (WebPIDecoder*)WebPSafeCalloc(1ULL, sizeof(*idec)); if (idec == NULL) { return NULL; } - idec->state_ = STATE_PRE_VP8; + idec->state_ = STATE_WEBP_HEADER; idec->chunk_size_ = 0; + idec->last_mb_y_ = -1; + InitMemBuffer(&idec->mem_); WebPInitDecBuffer(&idec->output_); VP8InitIo(&idec->io_); WebPResetDecParams(&idec->params_); - idec->params_.output = output_buffer ? output_buffer : &idec->output_; + idec->params_.output = (output_buffer != NULL) ? output_buffer + : &idec->output_; WebPInitCustomIo(&idec->params_, &idec->io_); // Plug the I/O functions. return idec; @@ -581,14 +626,18 @@ void WebPIDelete(WebPIDecoder* idec) { if (idec == NULL) return; if (idec->dec_ != NULL) { if (!idec->is_lossless_) { - VP8Delete(idec->dec_); + if (idec->state_ == STATE_VP8_DATA) { + // Synchronize the thread, clean-up and check for errors. + VP8ExitCritical((VP8Decoder*)idec->dec_, &idec->io_); + } + VP8Delete((VP8Decoder*)idec->dec_); } else { - VP8LDelete(idec->dec_); + VP8LDelete((VP8LDecoder*)idec->dec_); } } ClearMemBuffer(&idec->mem_); WebPFreeDecBuffer(&idec->output_); - free(idec); + WebPSafeFree(idec); } //------------------------------------------------------------------------------ @@ -596,12 +645,22 @@ void WebPIDelete(WebPIDecoder* idec) { WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer, size_t output_buffer_size, int output_stride) { + const int is_external_memory = (output_buffer != NULL); WebPIDecoder* idec; + if (mode >= MODE_YUV) return NULL; + if (!is_external_memory) { // Overwrite parameters to sane values. + output_buffer_size = 0; + output_stride = 0; + } else { // A buffer was passed. Validate the other params. + if (output_stride == 0 || output_buffer_size == 0) { + return NULL; // invalid parameter. + } + } idec = WebPINewDecoder(NULL); if (idec == NULL) return NULL; idec->output_.colorspace = mode; - idec->output_.is_external_memory = 1; + idec->output_.is_external_memory = is_external_memory; idec->output_.u.RGBA.rgba = output_buffer; idec->output_.u.RGBA.stride = output_stride; idec->output_.u.RGBA.size = output_buffer_size; @@ -612,10 +671,30 @@ WebPIDecoder* WebPINewYUVA(uint8_t* luma, size_t luma_size, int luma_stride, uint8_t* u, size_t u_size, int u_stride, uint8_t* v, size_t v_size, int v_stride, uint8_t* a, size_t a_size, int a_stride) { - WebPIDecoder* const idec = WebPINewDecoder(NULL); + const int is_external_memory = (luma != NULL); + WebPIDecoder* idec; + WEBP_CSP_MODE colorspace; + + if (!is_external_memory) { // Overwrite parameters to sane values. + luma_size = u_size = v_size = a_size = 0; + luma_stride = u_stride = v_stride = a_stride = 0; + u = v = a = NULL; + colorspace = MODE_YUVA; + } else { // A luma buffer was passed. Validate the other parameters. + if (u == NULL || v == NULL) return NULL; + if (luma_size == 0 || u_size == 0 || v_size == 0) return NULL; + if (luma_stride == 0 || u_stride == 0 || v_stride == 0) return NULL; + if (a != NULL) { + if (a_size == 0 || a_stride == 0) return NULL; + } + colorspace = (a == NULL) ? 
MODE_YUV : MODE_YUVA; + } + + idec = WebPINewDecoder(NULL); if (idec == NULL) return NULL; - idec->output_.colorspace = (a == NULL) ? MODE_YUV : MODE_YUVA; - idec->output_.is_external_memory = 1; + + idec->output_.colorspace = colorspace; + idec->output_.is_external_memory = is_external_memory; idec->output_.u.YUVA.y = luma; idec->output_.u.YUVA.y_stride = luma_stride; idec->output_.u.YUVA.y_size = luma_size; @@ -768,7 +847,7 @@ int WebPISetIOHooks(WebPIDecoder* const idec, VP8IoSetupHook setup, VP8IoTeardownHook teardown, void* user_data) { - if (idec == NULL || idec->state_ > STATE_PRE_VP8) { + if (idec == NULL || idec->state_ > STATE_WEBP_HEADER) { return 0; } @@ -779,7 +858,3 @@ int WebPISetIOHooks(WebPIDecoder* const idec, return 1; } - -#if defined(__cplusplus) || defined(c_plusplus) -} // extern "C" -#endif diff --git a/drivers/webp/dec/io.c b/drivers/webp/dec/io.c index 594804c2e6..13e469ab73 100644 --- a/drivers/webp/dec/io.c +++ b/drivers/webp/dec/io.c @@ -1,8 +1,10 @@ // Copyright 2011 Google Inc. All Rights Reserved. // -// This code is licensed under the same terms as WebM: -// Software License Agreement: http://www.webmproject.org/license/software/ -// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. // ----------------------------------------------------------------------------- // // functions for sample output. @@ -15,10 +17,7 @@ #include "./webpi.h" #include "../dsp/dsp.h" #include "../dsp/yuv.h" - -#if defined(__cplusplus) || defined(c_plusplus) -extern "C" { -#endif +#include "../utils/utils.h" //------------------------------------------------------------------------------ // Main YUV<->RGB conversion functions @@ -46,57 +45,17 @@ static int EmitYUV(const VP8Io* const io, WebPDecParams* const p) { // Point-sampling U/V sampler. static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) { - WebPDecBuffer* output = p->output; - const WebPRGBABuffer* const buf = &output->u.RGBA; - uint8_t* dst = buf->rgba + io->mb_y * buf->stride; - const uint8_t* y_src = io->y; - const uint8_t* u_src = io->u; - const uint8_t* v_src = io->v; - const WebPSampleLinePairFunc sample = WebPSamplers[output->colorspace]; - const int mb_w = io->mb_w; - const int last = io->mb_h - 1; - int j; - for (j = 0; j < last; j += 2) { - sample(y_src, y_src + io->y_stride, u_src, v_src, - dst, dst + buf->stride, mb_w); - y_src += 2 * io->y_stride; - u_src += io->uv_stride; - v_src += io->uv_stride; - dst += 2 * buf->stride; - } - if (j == last) { // Just do the last line twice - sample(y_src, y_src, u_src, v_src, dst, dst, mb_w); - } + WebPDecBuffer* const output = p->output; + WebPRGBABuffer* const buf = &output->u.RGBA; + uint8_t* const dst = buf->rgba + io->mb_y * buf->stride; + WebPSamplerProcessPlane(io->y, io->y_stride, + io->u, io->v, io->uv_stride, + dst, buf->stride, io->mb_w, io->mb_h, + WebPSamplers[output->colorspace]); return io->mb_h; } //------------------------------------------------------------------------------ -// YUV444 -> RGB conversion - -#if 0 // TODO(skal): this is for future rescaling. 
-static int EmitRGB(const VP8Io* const io, WebPDecParams* const p) { - WebPDecBuffer* output = p->output; - const WebPRGBABuffer* const buf = &output->u.RGBA; - uint8_t* dst = buf->rgba + io->mb_y * buf->stride; - const uint8_t* y_src = io->y; - const uint8_t* u_src = io->u; - const uint8_t* v_src = io->v; - const WebPYUV444Converter convert = WebPYUV444Converters[output->colorspace]; - const int mb_w = io->mb_w; - const int last = io->mb_h; - int j; - for (j = 0; j < last; ++j) { - convert(y_src, u_src, v_src, dst, mb_w); - y_src += io->y_stride; - u_src += io->uv_stride; - v_src += io->uv_stride; - dst += buf->stride; - } - return io->mb_h; -} -#endif - -//------------------------------------------------------------------------------ // Fancy upsampling #ifdef FANCY_UPSAMPLING @@ -117,7 +76,7 @@ static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) { if (y == 0) { // First line is special cased. We mirror the u/v samples at boundary. - upsample(NULL, cur_y, cur_u, cur_v, cur_u, cur_v, NULL, dst, mb_w); + upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, mb_w); } else { // We can finish the left-over line from previous call. upsample(p->tmp_y, cur_y, top_u, top_v, cur_u, cur_v, @@ -160,14 +119,16 @@ static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) { //------------------------------------------------------------------------------ -static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p) { +static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p, + int expected_num_lines_out) { const uint8_t* alpha = io->a; const WebPYUVABuffer* const buf = &p->output->u.YUVA; const int mb_w = io->mb_w; const int mb_h = io->mb_h; uint8_t* dst = buf->a + io->mb_y * buf->a_stride; int j; - + (void)expected_num_lines_out; + assert(expected_num_lines_out == mb_h); if (alpha != NULL) { for (j = 0; j < mb_h; ++j) { memcpy(dst, alpha, mb_w * sizeof(*dst)); @@ -210,7 +171,8 @@ static int GetAlphaSourceRow(const VP8Io* const io, return start_y; } -static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) { +static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p, + int expected_num_lines_out) { const uint8_t* alpha = io->a; if (alpha != NULL) { const int mb_w = io->mb_w; @@ -221,21 +183,13 @@ static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) { int num_rows; const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows); uint8_t* const base_rgba = buf->rgba + start_y * buf->stride; - uint8_t* dst = base_rgba + (alpha_first ? 0 : 3); - uint32_t alpha_mask = 0xff; - int i, j; - - for (j = 0; j < num_rows; ++j) { - for (i = 0; i < mb_w; ++i) { - const uint32_t alpha_value = alpha[i]; - dst[4 * i] = alpha_value; - alpha_mask &= alpha_value; - } - alpha += io->width; - dst += buf->stride; - } - // alpha_mask is < 0xff if there's non-trivial alpha to premultiply with. - if (alpha_mask != 0xff && WebPIsPremultipliedMode(colorspace)) { + uint8_t* const dst = base_rgba + (alpha_first ? 0 : 3); + const int has_alpha = WebPDispatchAlpha(alpha, io->width, mb_w, + num_rows, dst, buf->stride); + (void)expected_num_lines_out; + assert(expected_num_lines_out == num_rows); + // has_alpha is true if there's non-trivial alpha to premultiply with. 
+ if (has_alpha && WebPIsPremultipliedMode(colorspace)) { WebPApplyAlphaMultiply(base_rgba, alpha_first, mb_w, num_rows, buf->stride); } @@ -243,7 +197,8 @@ static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) { return 0; } -static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p) { +static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p, + int expected_num_lines_out) { const uint8_t* alpha = io->a; if (alpha != NULL) { const int mb_w = io->mb_w; @@ -252,10 +207,13 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p) { int num_rows; const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows); uint8_t* const base_rgba = buf->rgba + start_y * buf->stride; +#ifdef WEBP_SWAP_16BIT_CSP + uint8_t* alpha_dst = base_rgba; +#else uint8_t* alpha_dst = base_rgba + 1; +#endif uint32_t alpha_mask = 0x0f; int i, j; - for (j = 0; j < num_rows; ++j) { for (i = 0; i < mb_w; ++i) { // Fill in the alpha value (converted to 4 bits). @@ -266,6 +224,8 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p) { alpha += io->width; alpha_dst += buf->stride; } + (void)expected_num_lines_out; + assert(expected_num_lines_out == num_rows); if (alpha_mask != 0x0f && WebPIsPremultipliedMode(colorspace)) { WebPApplyAlphaMultiply4444(base_rgba, mb_w, num_rows, buf->stride); } @@ -291,15 +251,35 @@ static int Rescale(const uint8_t* src, int src_stride, static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) { const int mb_h = io->mb_h; const int uv_mb_h = (mb_h + 1) >> 1; - const int num_lines_out = Rescale(io->y, io->y_stride, mb_h, &p->scaler_y); + WebPRescaler* const scaler = &p->scaler_y; + int num_lines_out = 0; + if (WebPIsAlphaMode(p->output->colorspace) && io->a != NULL) { + // Before rescaling, we premultiply the luma directly into the io->y + // internal buffer. This is OK since these samples are not used for + // intra-prediction (the top samples are saved in cache_y_/u_/v_). + // But we need to cast the const away, though. 
+ WebPMultRows((uint8_t*)io->y, io->y_stride, + io->a, io->width, io->mb_w, mb_h, 0); + } + num_lines_out = Rescale(io->y, io->y_stride, mb_h, scaler); Rescale(io->u, io->uv_stride, uv_mb_h, &p->scaler_u); Rescale(io->v, io->uv_stride, uv_mb_h, &p->scaler_v); return num_lines_out; } -static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p) { +static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p, + int expected_num_lines_out) { if (io->a != NULL) { - Rescale(io->a, io->width, io->mb_h, &p->scaler_a); + const WebPYUVABuffer* const buf = &p->output->u.YUVA; + uint8_t* dst_y = buf->y + p->last_y * buf->y_stride; + const uint8_t* src_a = buf->a + p->last_y * buf->a_stride; + const int num_lines_out = Rescale(io->a, io->width, io->mb_h, &p->scaler_a); + (void)expected_num_lines_out; + assert(expected_num_lines_out == num_lines_out); + if (num_lines_out > 0) { // unmultiply the Y + WebPMultRows(dst_y, buf->y_stride, src_a, buf->a_stride, + p->scaler_a.dst_width, num_lines_out, 1); + } } return 0; } @@ -316,39 +296,34 @@ static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) { const size_t work_size = 2 * out_width; // scratch memory for luma rescaler const size_t uv_work_size = 2 * uv_out_width; // and for each u/v ones size_t tmp_size; - int32_t* work; + rescaler_t* work; - tmp_size = work_size + 2 * uv_work_size; + tmp_size = (work_size + 2 * uv_work_size) * sizeof(*work); if (has_alpha) { - tmp_size += work_size; + tmp_size += work_size * sizeof(*work); } - p->memory = calloc(1, tmp_size * sizeof(*work)); + p->memory = WebPSafeMalloc(1ULL, tmp_size); if (p->memory == NULL) { return 0; // memory error } - work = (int32_t*)p->memory; + work = (rescaler_t*)p->memory; WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h, buf->y, out_width, out_height, buf->y_stride, 1, - io->mb_w, out_width, io->mb_h, out_height, work); WebPRescalerInit(&p->scaler_u, uv_in_width, uv_in_height, buf->u, uv_out_width, uv_out_height, buf->u_stride, 1, - uv_in_width, uv_out_width, - uv_in_height, uv_out_height, work + work_size); WebPRescalerInit(&p->scaler_v, uv_in_width, uv_in_height, buf->v, uv_out_width, uv_out_height, buf->v_stride, 1, - uv_in_width, uv_out_width, - uv_in_height, uv_out_height, work + work_size + uv_work_size); p->emit = EmitRescaledYUV; if (has_alpha) { WebPRescalerInit(&p->scaler_a, io->mb_w, io->mb_h, buf->a, out_width, out_height, buf->a_stride, 1, - io->mb_w, out_width, io->mb_h, out_height, work + work_size + 2 * uv_work_size); p->emit_alpha = EmitRescaledAlphaYUV; + WebPInitAlphaProcessing(); } return 1; } @@ -360,13 +335,13 @@ static int ExportRGB(WebPDecParams* const p, int y_pos) { const WebPYUV444Converter convert = WebPYUV444Converters[p->output->colorspace]; const WebPRGBABuffer* const buf = &p->output->u.RGBA; - uint8_t* dst = buf->rgba + (p->last_y + y_pos) * buf->stride; + uint8_t* dst = buf->rgba + y_pos * buf->stride; int num_lines_out = 0; // For RGB rescaling, because of the YUV420, current scan position // U/V can be +1/-1 line from the Y one. Hence the double test. 
while (WebPRescalerHasPendingOutput(&p->scaler_y) && WebPRescalerHasPendingOutput(&p->scaler_u)) { - assert(p->last_y + y_pos + num_lines_out < p->output->height); + assert(y_pos + num_lines_out < p->output->height); assert(p->scaler_u.y_accum == p->scaler_v.y_accum); WebPRescalerExportRow(&p->scaler_y); WebPRescalerExportRow(&p->scaler_u); @@ -388,65 +363,69 @@ static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) { const int y_lines_in = WebPRescalerImport(&p->scaler_y, mb_h - j, io->y + j * io->y_stride, io->y_stride); - const int u_lines_in = - WebPRescalerImport(&p->scaler_u, uv_mb_h - uv_j, - io->u + uv_j * io->uv_stride, io->uv_stride); - const int v_lines_in = - WebPRescalerImport(&p->scaler_v, uv_mb_h - uv_j, - io->v + uv_j * io->uv_stride, io->uv_stride); - (void)v_lines_in; // remove a gcc warning - assert(u_lines_in == v_lines_in); j += y_lines_in; - uv_j += u_lines_in; - num_lines_out += ExportRGB(p, num_lines_out); + if (WebPRescaleNeededLines(&p->scaler_u, uv_mb_h - uv_j)) { + const int u_lines_in = + WebPRescalerImport(&p->scaler_u, uv_mb_h - uv_j, + io->u + uv_j * io->uv_stride, io->uv_stride); + const int v_lines_in = + WebPRescalerImport(&p->scaler_v, uv_mb_h - uv_j, + io->v + uv_j * io->uv_stride, io->uv_stride); + (void)v_lines_in; // remove a gcc warning + assert(u_lines_in == v_lines_in); + uv_j += u_lines_in; + } + num_lines_out += ExportRGB(p, p->last_y + num_lines_out); } return num_lines_out; } -static int ExportAlpha(WebPDecParams* const p, int y_pos) { +static int ExportAlpha(WebPDecParams* const p, int y_pos, int max_lines_out) { const WebPRGBABuffer* const buf = &p->output->u.RGBA; - uint8_t* const base_rgba = buf->rgba + (p->last_y + y_pos) * buf->stride; + uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride; const WEBP_CSP_MODE colorspace = p->output->colorspace; const int alpha_first = (colorspace == MODE_ARGB || colorspace == MODE_Argb); uint8_t* dst = base_rgba + (alpha_first ? 
0 : 3); int num_lines_out = 0; const int is_premult_alpha = WebPIsPremultipliedMode(colorspace); - uint32_t alpha_mask = 0xff; + uint32_t non_opaque = 0; const int width = p->scaler_a.dst_width; - while (WebPRescalerHasPendingOutput(&p->scaler_a)) { - int i; - assert(p->last_y + y_pos + num_lines_out < p->output->height); + while (WebPRescalerHasPendingOutput(&p->scaler_a) && + num_lines_out < max_lines_out) { + assert(y_pos + num_lines_out < p->output->height); WebPRescalerExportRow(&p->scaler_a); - for (i = 0; i < width; ++i) { - const uint32_t alpha_value = p->scaler_a.dst[i]; - dst[4 * i] = alpha_value; - alpha_mask &= alpha_value; - } + non_opaque |= WebPDispatchAlpha(p->scaler_a.dst, 0, width, 1, dst, 0); dst += buf->stride; ++num_lines_out; } - if (is_premult_alpha && alpha_mask != 0xff) { + if (is_premult_alpha && non_opaque) { WebPApplyAlphaMultiply(base_rgba, alpha_first, width, num_lines_out, buf->stride); } return num_lines_out; } -static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos) { +static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos, + int max_lines_out) { const WebPRGBABuffer* const buf = &p->output->u.RGBA; - uint8_t* const base_rgba = buf->rgba + (p->last_y + y_pos) * buf->stride; + uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride; +#ifdef WEBP_SWAP_16BIT_CSP + uint8_t* alpha_dst = base_rgba; +#else uint8_t* alpha_dst = base_rgba + 1; +#endif int num_lines_out = 0; const WEBP_CSP_MODE colorspace = p->output->colorspace; const int width = p->scaler_a.dst_width; const int is_premult_alpha = WebPIsPremultipliedMode(colorspace); uint32_t alpha_mask = 0x0f; - while (WebPRescalerHasPendingOutput(&p->scaler_a)) { + while (WebPRescalerHasPendingOutput(&p->scaler_a) && + num_lines_out < max_lines_out) { int i; - assert(p->last_y + y_pos + num_lines_out < p->output->height); + assert(y_pos + num_lines_out < p->output->height); WebPRescalerExportRow(&p->scaler_a); for (i = 0; i < width; ++i) { // Fill in the alpha value (converted to 4 bits). 
@@ -463,15 +442,17 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos) { return num_lines_out; } -static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p) { +static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p, + int expected_num_out_lines) { if (io->a != NULL) { WebPRescaler* const scaler = &p->scaler_a; - int j = 0; - int pos = 0; - while (j < io->mb_h) { - j += WebPRescalerImport(scaler, io->mb_h - j, - io->a + j * io->width, io->width); - pos += p->emit_alpha_row(p, pos); + int lines_left = expected_num_out_lines; + const int y_end = p->last_y + lines_left; + while (lines_left > 0) { + const int row_offset = scaler->src_y - io->mb_y; + WebPRescalerImport(scaler, io->mb_h + io->mb_y - scaler->src_y, + io->a + row_offset * io->width, io->width); + lines_left -= p->emit_alpha_row(p, y_end - lines_left, lines_left); } } return 0; @@ -484,9 +465,9 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) { const int uv_in_width = (io->mb_w + 1) >> 1; const int uv_in_height = (io->mb_h + 1) >> 1; const size_t work_size = 2 * out_width; // scratch memory for one rescaler - int32_t* work; // rescalers work area + rescaler_t* work; // rescalers work area uint8_t* tmp; // tmp storage for scaled YUV444 samples before RGB conversion - size_t tmp_size1, tmp_size2; + size_t tmp_size1, tmp_size2, total_size; tmp_size1 = 3 * work_size; tmp_size2 = 3 * out_width; @@ -494,30 +475,28 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) { tmp_size1 += work_size; tmp_size2 += out_width; } - p->memory = calloc(1, tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp)); + total_size = tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp); + p->memory = WebPSafeMalloc(1ULL, total_size); if (p->memory == NULL) { return 0; // memory error } - work = (int32_t*)p->memory; + work = (rescaler_t*)p->memory; tmp = (uint8_t*)(work + tmp_size1); WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h, tmp + 0 * out_width, out_width, out_height, 0, 1, - io->mb_w, out_width, io->mb_h, out_height, work + 0 * work_size); WebPRescalerInit(&p->scaler_u, uv_in_width, uv_in_height, tmp + 1 * out_width, out_width, out_height, 0, 1, - io->mb_w, 2 * out_width, io->mb_h, 2 * out_height, work + 1 * work_size); WebPRescalerInit(&p->scaler_v, uv_in_width, uv_in_height, tmp + 2 * out_width, out_width, out_height, 0, 1, - io->mb_w, 2 * out_width, io->mb_h, 2 * out_height, work + 2 * work_size); p->emit = EmitRescaledRGB; + WebPInitYUV444Converters(); if (has_alpha) { WebPRescalerInit(&p->scaler_a, io->mb_w, io->mb_h, tmp + 3 * out_width, out_width, out_height, 0, 1, - io->mb_w, out_width, io->mb_h, out_height, work + 3 * work_size); p->emit_alpha = EmitRescaledAlphaRGB; if (p->output->colorspace == MODE_RGBA_4444 || @@ -526,6 +505,7 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) { } else { p->emit_alpha_row = ExportAlpha; } + WebPInitAlphaProcessing(); } return 1; } @@ -546,7 +526,9 @@ static int CustomSetup(VP8Io* io) { if (!WebPIoInitFromOptions(p->options, io, is_alpha ? MODE_YUV : MODE_YUVA)) { return 0; } - + if (is_alpha && WebPIsPremultipliedMode(colorspace)) { + WebPInitUpsamplers(); + } if (io->use_scaling) { const int ok = is_rgb ? 
InitRGBRescaler(io, p) : InitYUVRescaler(io, p); if (!ok) { @@ -554,11 +536,12 @@ static int CustomSetup(VP8Io* io) { } } else { if (is_rgb) { + WebPInitSamplers(); p->emit = EmitSampledRGB; // default -#ifdef FANCY_UPSAMPLING if (io->fancy_upsampling) { +#ifdef FANCY_UPSAMPLING const int uv_width = (io->mb_w + 1) >> 1; - p->memory = malloc(io->mb_w + 2 * uv_width); + p->memory = WebPSafeMalloc(1ULL, (size_t)(io->mb_w + 2 * uv_width)); if (p->memory == NULL) { return 0; // memory error. } @@ -567,18 +550,20 @@ static int CustomSetup(VP8Io* io) { p->tmp_v = p->tmp_u + uv_width; p->emit = EmitFancyRGB; WebPInitUpsamplers(); - } #endif + } } else { p->emit = EmitYUV; } if (is_alpha) { // need transparency output - if (WebPIsPremultipliedMode(colorspace)) WebPInitPremultiply(); p->emit_alpha = (colorspace == MODE_RGBA_4444 || colorspace == MODE_rgbA_4444) ? EmitAlphaRGBA4444 : is_rgb ? EmitAlphaRGB : EmitAlphaYUV; + if (is_rgb) { + WebPInitAlphaProcessing(); + } } } @@ -601,8 +586,8 @@ static int CustomPut(const VP8Io* io) { return 0; } num_lines_out = p->emit(io, p); - if (p->emit_alpha) { - p->emit_alpha(io, p); + if (p->emit_alpha != NULL) { + p->emit_alpha(io, p, num_lines_out); } p->last_y += num_lines_out; return 1; @@ -612,7 +597,7 @@ static int CustomPut(const VP8Io* io) { static void CustomTeardown(const VP8Io* io) { WebPDecParams* const p = (WebPDecParams*)io->opaque; - free(p->memory); + WebPSafeFree(p->memory); p->memory = NULL; } @@ -627,7 +612,3 @@ void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io) { } //------------------------------------------------------------------------------ - -#if defined(__cplusplus) || defined(c_plusplus) -} // extern "C" -#endif diff --git a/drivers/webp/dec/layer.c b/drivers/webp/dec/layer.c deleted file mode 100644 index a3a5bdcfe8..0000000000 --- a/drivers/webp/dec/layer.c +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2011 Google Inc. All Rights Reserved. -// -// This code is licensed under the same terms as WebM: -// Software License Agreement: http://www.webmproject.org/license/software/ -// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ -// ----------------------------------------------------------------------------- -// -// Enhancement layer (for YUV444/422) -// -// Author: Skal (pascal.massimino@gmail.com) - -#include <assert.h> -#include <stdlib.h> - -#include "./vp8i.h" - -#if defined(__cplusplus) || defined(c_plusplus) -extern "C" { -#endif - -//------------------------------------------------------------------------------ - -int VP8DecodeLayer(VP8Decoder* const dec) { - assert(dec); - assert(dec->layer_data_size_ > 0); - (void)dec; - - // TODO: handle enhancement layer here. - - return 1; -} - -#if defined(__cplusplus) || defined(c_plusplus) -} // extern "C" -#endif diff --git a/drivers/webp/dec/quant.c b/drivers/webp/dec/quant.c index d54097af0d..5b648f942c 100644 --- a/drivers/webp/dec/quant.c +++ b/drivers/webp/dec/quant.c @@ -1,8 +1,10 @@ // Copyright 2010 Google Inc. All Rights Reserved. // -// This code is licensed under the same terms as WebM: -// Software License Agreement: http://www.webmproject.org/license/software/ -// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. 
All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. // ----------------------------------------------------------------------------- // // Quantizer initialization @@ -11,10 +13,6 @@ #include "./vp8i.h" -#if defined(__cplusplus) || defined(c_plusplus) -extern "C" { -#endif - static WEBP_INLINE int clip(int v, int M) { return v < 0 ? 0 : v > M ? M : v; } @@ -102,12 +100,11 @@ void VP8ParseQuant(VP8Decoder* const dec) { m->uv_mat_[0] = kDcTable[clip(q + dquv_dc, 117)]; m->uv_mat_[1] = kAcTable[clip(q + dquv_ac, 127)]; + + m->uv_quant_ = q + dquv_ac; // for dithering strength evaluation } } } //------------------------------------------------------------------------------ -#if defined(__cplusplus) || defined(c_plusplus) -} // extern "C" -#endif diff --git a/drivers/webp/dec/tree.c b/drivers/webp/dec/tree.c index 82484e4c55..c2007ea733 100644 --- a/drivers/webp/dec/tree.c +++ b/drivers/webp/dec/tree.c @@ -1,22 +1,21 @@ // Copyright 2010 Google Inc. All Rights Reserved. // -// This code is licensed under the same terms as WebM: -// Software License Agreement: http://www.webmproject.org/license/software/ -// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. // ----------------------------------------------------------------------------- // // Coding trees and probas // // Author: Skal (pascal.massimino@gmail.com) -#include "vp8i.h" +#include "./vp8i.h" +#include "../utils/bit_reader_inl.h" #define USE_GENERIC_TREE -#if defined(__cplusplus) || defined(c_plusplus) -extern "C" { -#endif - #ifdef USE_GENERIC_TREE static const int8_t kYModesIntra4[18] = { -B_DC_PRED, 1, @@ -31,61 +30,12 @@ static const int8_t kYModesIntra4[18] = { }; #endif -#ifndef ONLY_KEYFRAME_CODE - -// inter prediction modes -enum { - LEFT4 = 0, ABOVE4 = 1, ZERO4 = 2, NEW4 = 3, - NEARESTMV, NEARMV, ZEROMV, NEWMV, SPLITMV }; - -static const int8_t kYModesInter[8] = { - -DC_PRED, 1, - 2, 3, - -V_PRED, -H_PRED, - -TM_PRED, -B_PRED -}; - -static const int8_t kMBSplit[6] = { - -3, 1, - -2, 2, - -0, -1 -}; - -static const int8_t kMVRef[8] = { - -ZEROMV, 1, - -NEARESTMV, 2, - -NEARMV, 3, - -NEWMV, -SPLITMV -}; - -static const int8_t kMVRef4[6] = { - -LEFT4, 1, - -ABOVE4, 2, - -ZERO4, -NEW4 -}; -#endif - //------------------------------------------------------------------------------ // Default probabilities -// Inter -#ifndef ONLY_KEYFRAME_CODE -static const uint8_t kYModeProbaInter0[4] = { 112, 86, 140, 37 }; -static const uint8_t kUVModeProbaInter0[3] = { 162, 101, 204 }; -static const uint8_t kMVProba0[2][NUM_MV_PROBAS] = { - { 162, 128, 225, 146, 172, 147, 214, 39, - 156, 128, 129, 132, 75, 145, 178, 206, - 239, 254, 254 }, - { 164, 128, 204, 170, 119, 235, 140, 230, - 228, 128, 130, 130, 74, 148, 180, 203, - 236, 254, 254 } -}; -#endif - // Paragraph 13.5 static const uint8_t CoeffsProba0[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = { - // genereated using vp8_default_coef_probs() in entropy.c:129 { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } @@ -326,28 +276,38 @@ static const uint8_t 
kBModesProba[NUM_BMODES][NUM_BMODES][NUM_BMODES - 1] = { void VP8ResetProba(VP8Proba* const proba) { memset(proba->segments_, 255u, sizeof(proba->segments_)); - memcpy(proba->coeffs_, CoeffsProba0, sizeof(CoeffsProba0)); -#ifndef ONLY_KEYFRAME_CODE - memcpy(proba->mv_, kMVProba0, sizeof(kMVProba0)); - memcpy(proba->ymode_, kYModeProbaInter0, sizeof(kYModeProbaInter0)); - memcpy(proba->uvmode_, kUVModeProbaInter0, sizeof(kUVModeProbaInter0)); -#endif + // proba->bands_[][] is initialized later } -void VP8ParseIntraMode(VP8BitReader* const br, VP8Decoder* const dec) { - uint8_t* const top = dec->intra_t_ + 4 * dec->mb_x_; +static void ParseIntraMode(VP8BitReader* const br, + VP8Decoder* const dec, int mb_x) { + uint8_t* const top = dec->intra_t_ + 4 * mb_x; uint8_t* const left = dec->intra_l_; - // Hardcoded 16x16 intra-mode decision tree. - dec->is_i4x4_ = !VP8GetBit(br, 145); // decide for B_PRED first - if (!dec->is_i4x4_) { + VP8MBData* const block = dec->mb_data_ + mb_x; + + // Note: we don't save segment map (yet), as we don't expect + // to decode more than 1 keyframe. + if (dec->segment_hdr_.update_map_) { + // Hardcoded tree parsing + block->segment_ = !VP8GetBit(br, dec->proba_.segments_[0]) + ? VP8GetBit(br, dec->proba_.segments_[1]) + : 2 + VP8GetBit(br, dec->proba_.segments_[2]); + } else { + block->segment_ = 0; // default for intra + } + if (dec->use_skip_proba_) block->skip_ = VP8GetBit(br, dec->skip_p_); + + block->is_i4x4_ = !VP8GetBit(br, 145); // decide for B_PRED first + if (!block->is_i4x4_) { + // Hardcoded 16x16 intra-mode decision tree. const int ymode = VP8GetBit(br, 156) ? (VP8GetBit(br, 128) ? TM_PRED : H_PRED) : (VP8GetBit(br, 163) ? V_PRED : DC_PRED); - dec->imodes_[0] = ymode; - memset(top, ymode, 4 * sizeof(top[0])); - memset(left, ymode, 4 * sizeof(left[0])); + block->imodes_[0] = ymode; + memset(top, ymode, 4 * sizeof(*top)); + memset(left, ymode, 4 * sizeof(*left)); } else { - uint8_t* modes = dec->imodes_; + uint8_t* modes = block->imodes_; int y; for (y = 0; y < 4; ++y) { int ymode = left[y]; @@ -356,10 +316,10 @@ void VP8ParseIntraMode(VP8BitReader* const br, VP8Decoder* const dec) { const uint8_t* const prob = kBModesProba[top[x]][ymode]; #ifdef USE_GENERIC_TREE // Generic tree-parsing - int i = 0; - do { + int i = kYModesIntra4[VP8GetBit(br, prob[0])]; + while (i > 0) { i = kYModesIntra4[2 * i + VP8GetBit(br, prob[i])]; - } while (i > 0); + } ymode = -i; #else // Hardcoded tree parsing @@ -374,15 +334,24 @@ void VP8ParseIntraMode(VP8BitReader* const br, VP8Decoder* const dec) { (!VP8GetBit(br, prob[8]) ? B_HD_PRED : B_HU_PRED))); #endif // USE_GENERIC_TREE top[x] = ymode; - *modes++ = ymode; } + memcpy(modes, top, 4 * sizeof(*top)); + modes += 4; left[y] = ymode; } } // Hardcoded UVMode decision tree - dec->uvmode_ = !VP8GetBit(br, 142) ? DC_PRED - : !VP8GetBit(br, 114) ? V_PRED - : VP8GetBit(br, 183) ? TM_PRED : H_PRED; + block->uvmode_ = !VP8GetBit(br, 142) ? DC_PRED + : !VP8GetBit(br, 114) ? V_PRED + : VP8GetBit(br, 183) ? 
TM_PRED : H_PRED; +} + +int VP8ParseIntraModeRow(VP8BitReader* const br, VP8Decoder* const dec) { + int mb_x; + for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) { + ParseIntraMode(br, dec, mb_x); + } + return !dec->br_.eof_; } //------------------------------------------------------------------------------ @@ -524,18 +493,13 @@ static const uint8_t } }; -#ifndef ONLY_KEYFRAME_CODE -static const uint8_t MVUpdateProba[2][NUM_MV_PROBAS] = { - { 237, 246, 253, 253, 254, 254, 254, 254, - 254, 254, 254, 254, 254, 254, 250, 250, - 252, 254, 254 }, - { 231, 243, 245, 253, 254, 254, 254, 254, - 254, 254, 254, 254, 254, 254, 251, 251, - 254, 254, 254 } +// Paragraph 9.9 + +static const int kBands[16 + 1] = { + 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, + 0 // extra entry as sentinel }; -#endif -// Paragraph 9.9 void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) { VP8Proba* const proba = &dec->proba_; int t, b, c, p; @@ -543,47 +507,19 @@ void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) { for (b = 0; b < NUM_BANDS; ++b) { for (c = 0; c < NUM_CTX; ++c) { for (p = 0; p < NUM_PROBAS; ++p) { - if (VP8GetBit(br, CoeffsUpdateProba[t][b][c][p])) { - proba->coeffs_[t][b][c][p] = VP8GetValue(br, 8); - } + const int v = VP8GetBit(br, CoeffsUpdateProba[t][b][c][p]) ? + VP8GetValue(br, 8) : CoeffsProba0[t][b][c][p]; + proba->bands_[t][b].probas_[c][p] = v; } } } + for (b = 0; b < 16 + 1; ++b) { + proba->bands_ptr_[t][b] = &proba->bands_[t][kBands[b]]; + } } dec->use_skip_proba_ = VP8Get(br); if (dec->use_skip_proba_) { dec->skip_p_ = VP8GetValue(br, 8); } -#ifndef ONLY_KEYFRAME_CODE - if (!dec->frm_hdr_.key_frame_) { - int i; - dec->intra_p_ = VP8GetValue(br, 8); - dec->last_p_ = VP8GetValue(br, 8); - dec->golden_p_ = VP8GetValue(br, 8); - if (VP8Get(br)) { // update y-mode - for (i = 0; i < 4; ++i) { - proba->ymode_[i] = VP8GetValue(br, 8); - } - } - if (VP8Get(br)) { // update uv-mode - for (i = 0; i < 3; ++i) { - proba->uvmode_[i] = VP8GetValue(br, 8); - } - } - // update MV - for (i = 0; i < 2; ++i) { - int k; - for (k = 0; k < NUM_MV_PROBAS; ++k) { - if (VP8GetBit(br, MVUpdateProba[i][k])) { - const int v = VP8GetValue(br, 7); - proba->mv_[i][k] = v ? v << 1 : 1; - } - } - } - } -#endif } -#if defined(__cplusplus) || defined(c_plusplus) -} // extern "C" -#endif diff --git a/drivers/webp/dec/vp8.c b/drivers/webp/dec/vp8.c index b0ccfa2a06..d89eb1c59e 100644 --- a/drivers/webp/dec/vp8.c +++ b/drivers/webp/dec/vp8.c @@ -1,8 +1,10 @@ // Copyright 2010 Google Inc. All Rights Reserved. // -// This code is licensed under the same terms as WebM: -// Software License Agreement: http://www.webmproject.org/license/software/ -// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. 
// ----------------------------------------------------------------------------- // // main entry for the decoder @@ -11,14 +13,12 @@ #include <stdlib.h> +#include "./alphai.h" #include "./vp8i.h" #include "./vp8li.h" #include "./webpi.h" -#include "../utils/bit_reader.h" - -#if defined(__cplusplus) || defined(c_plusplus) -extern "C" { -#endif +#include "../utils/bit_reader_inl.h" +#include "../utils/utils.h" //------------------------------------------------------------------------------ @@ -45,10 +45,10 @@ int VP8InitIoInternal(VP8Io* const io, int version) { } VP8Decoder* VP8New(void) { - VP8Decoder* const dec = (VP8Decoder*)calloc(1, sizeof(*dec)); + VP8Decoder* const dec = (VP8Decoder*)WebPSafeCalloc(1ULL, sizeof(*dec)); if (dec != NULL) { SetOk(dec); - WebPWorkerInit(&dec->worker_); + WebPGetWorkerInterface()->Init(&dec->worker_); dec->ready_ = 0; dec->num_parts_ = 1; } @@ -69,16 +69,13 @@ const char* VP8StatusMessage(VP8Decoder* const dec) { void VP8Delete(VP8Decoder* const dec) { if (dec != NULL) { VP8Clear(dec); - free(dec); + WebPSafeFree(dec); } } int VP8SetError(VP8Decoder* const dec, VP8StatusCode error, const char* const msg) { - // TODO This check would be unnecessary if alpha decompression was separated - // from VP8ProcessRow/FinishRow. This avoids setting 'dec->status_' to - // something other than VP8_STATUS_BITSTREAM_ERROR on alpha decompression - // failure. + // The oldest error reported takes precedence over the new one. if (dec->status_ == VP8_STATUS_OK) { dec->status_ = error; dec->error_msg_ = msg; @@ -121,6 +118,9 @@ int VP8GetInfo(const uint8_t* data, size_t data_size, size_t chunk_size, if (((bits >> 5)) >= chunk_size) { // partition_length return 0; // inconsistent size information. } + if (w == 0 || h == 0) { + return 0; // We don't support both width and height to be zero. + } if (width) { *width = w; @@ -190,25 +190,27 @@ static VP8StatusCode ParsePartitions(VP8Decoder* const dec, const uint8_t* sz = buf; const uint8_t* buf_end = buf + size; const uint8_t* part_start; - int last_part; - int p; + size_t size_left = size; + size_t last_part; + size_t p; dec->num_parts_ = 1 << VP8GetValue(br, 2); last_part = dec->num_parts_ - 1; - part_start = buf + last_part * 3; - if (buf_end < part_start) { + if (size < 3 * last_part) { // we can't even read the sizes with sz[]! That's a failure. return VP8_STATUS_NOT_ENOUGH_DATA; } + part_start = buf + last_part * 3; + size_left -= last_part * 3; for (p = 0; p < last_part; ++p) { - const uint32_t psize = sz[0] | (sz[1] << 8) | (sz[2] << 16); - const uint8_t* part_end = part_start + psize; - if (part_end > buf_end) part_end = buf_end; - VP8InitBitReader(dec->parts_ + p, part_start, part_end); - part_start = part_end; + size_t psize = sz[0] | (sz[1] << 8) | (sz[2] << 16); + if (psize > size_left) psize = size_left; + VP8InitBitReader(dec->parts_ + p, part_start, psize); + part_start += psize; + size_left -= psize; sz += 3; } - VP8InitBitReader(dec->parts_ + last_part, part_start, buf_end); + VP8InitBitReader(dec->parts_ + last_part, part_start, size_left); return (part_start < buf_end) ? VP8_STATUS_OK : VP8_STATUS_SUSPENDED; // Init is ok, but there's not enough data } @@ -236,20 +238,6 @@ static int ParseFilterHeader(VP8BitReader* br, VP8Decoder* const dec) { } } dec->filter_type_ = (hdr->level_ == 0) ? 0 : hdr->simple_ ? 
1 : 2; - if (dec->filter_type_ > 0) { // precompute filter levels per segment - if (dec->segment_hdr_.use_segment_) { - int s; - for (s = 0; s < NUM_MB_SEGMENTS; ++s) { - int strength = dec->segment_hdr_.filter_strength_[s]; - if (!dec->segment_hdr_.absolute_delta_) { - strength += hdr->level_; - } - dec->filter_levels_[s] = strength; - } - } else { - dec->filter_levels_[0] = hdr->level_; - } - } return !br->eof_; } @@ -261,7 +249,6 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) { VP8PictureHeader* pic_hdr; VP8BitReader* br; VP8StatusCode status; - WebPHeaderStructure headers; if (dec == NULL) { return 0; @@ -271,33 +258,8 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) { return VP8SetError(dec, VP8_STATUS_INVALID_PARAM, "null VP8Io passed to VP8GetHeaders()"); } - - // Process Pre-VP8 chunks. - headers.data = io->data; - headers.data_size = io->data_size; - status = WebPParseHeaders(&headers); - if (status != VP8_STATUS_OK) { - return VP8SetError(dec, status, "Incorrect/incomplete header."); - } - if (headers.is_lossless) { - return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR, - "Unexpected lossless format encountered."); - } - - if (dec->alpha_data_ == NULL) { - assert(dec->alpha_data_size_ == 0); - // We have NOT set alpha data yet. Set it now. - // (This is to ensure that dec->alpha_data_ is NOT reset to NULL if - // WebPParseHeaders() is called more than once, as in incremental decoding - // case.) - dec->alpha_data_ = headers.alpha_data; - dec->alpha_data_size_ = headers.alpha_data_size; - } - - // Process the VP8 frame header. - buf = headers.data + headers.offset; - buf_size = headers.data_size - headers.offset; - assert(headers.data_size >= headers.offset); // WebPParseHeaders' guarantee + buf = io->data; + buf_size = io->data_size; if (buf_size < 4) { return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA, "Truncated header."); @@ -355,7 +317,6 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) { VP8ResetProba(&dec->proba_); ResetSegmentHeader(&dec->segment_hdr_); - dec->segment_ = 0; // default for intra } // Check if we have all the partition #0 available, and initialize dec->br_ @@ -366,7 +327,7 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) { } br = &dec->br_; - VP8InitBitReader(br, buf, buf + frm_hdr->partition_length_); + VP8InitBitReader(br, buf, frm_hdr->partition_length_); buf += frm_hdr->partition_length_; buf_size -= frm_hdr->partition_length_; @@ -393,63 +354,14 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) { // Frame buffer marking if (!frm_hdr->key_frame_) { - // Paragraph 9.7 -#ifndef ONLY_KEYFRAME_CODE - dec->buffer_flags_ = VP8Get(br) << 0; // update golden - dec->buffer_flags_ |= VP8Get(br) << 1; // update alt ref - if (!(dec->buffer_flags_ & 1)) { - dec->buffer_flags_ |= VP8GetValue(br, 2) << 2; - } - if (!(dec->buffer_flags_ & 2)) { - dec->buffer_flags_ |= VP8GetValue(br, 2) << 4; - } - dec->buffer_flags_ |= VP8Get(br) << 6; // sign bias golden - dec->buffer_flags_ |= VP8Get(br) << 7; // sign bias alt ref -#else return VP8SetError(dec, VP8_STATUS_UNSUPPORTED_FEATURE, "Not a key frame."); -#endif - } else { - dec->buffer_flags_ = 0x003 | 0x100; } - // Paragraph 9.8 -#ifndef ONLY_KEYFRAME_CODE - dec->update_proba_ = VP8Get(br); - if (!dec->update_proba_) { // save for later restore - dec->proba_saved_ = dec->proba_; - } - dec->buffer_flags_ &= 1 << 8; - dec->buffer_flags_ |= - (frm_hdr->key_frame_ || VP8Get(br)) << 8; // refresh last frame -#else - VP8Get(br); // just ignore the value of update_proba_ 
-#endif + VP8Get(br); // ignore the value of update_proba_ VP8ParseProba(br, dec); -#ifdef WEBP_EXPERIMENTAL_FEATURES - // Extensions - if (dec->pic_hdr_.colorspace_) { - const size_t kTrailerSize = 8; - const uint8_t kTrailerMarker = 0x01; - const uint8_t* ext_buf = buf - kTrailerSize; - size_t size; - - if (frm_hdr->partition_length_ < kTrailerSize || - ext_buf[kTrailerSize - 1] != kTrailerMarker) { - return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR, - "RIFF: Inconsistent extra information."); - } - - // Layer - size = (ext_buf[0] << 0) | (ext_buf[1] << 8) | (ext_buf[2] << 16); - dec->layer_data_size_ = size; - dec->layer_data_ = NULL; // will be set later - dec->layer_colorspace_ = ext_buf[3]; - } -#endif - // sanitized state dec->ready_ = 1; return 1; @@ -458,11 +370,6 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) { //------------------------------------------------------------------------------ // Residual decoding (Paragraph 13.2 / 13.3) -static const uint8_t kBands[16 + 1] = { - 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, - 0 // extra entry as sentinel -}; - static const uint8_t kCat3[] = { 173, 148, 140, 0 }; static const uint8_t kCat4[] = { 176, 155, 140, 135, 0 }; static const uint8_t kCat5[] = { 180, 157, 141, 134, 130, 0 }; @@ -473,253 +380,226 @@ static const uint8_t kZigzag[16] = { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 }; -typedef const uint8_t (*ProbaArray)[NUM_CTX][NUM_PROBAS]; // for const-casting +// See section 13-2: http://tools.ietf.org/html/rfc6386#section-13.2 +static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) { + int v; + if (!VP8GetBit(br, p[3])) { + if (!VP8GetBit(br, p[4])) { + v = 2; + } else { + v = 3 + VP8GetBit(br, p[5]); + } + } else { + if (!VP8GetBit(br, p[6])) { + if (!VP8GetBit(br, p[7])) { + v = 5 + VP8GetBit(br, 159); + } else { + v = 7 + 2 * VP8GetBit(br, 165); + v += VP8GetBit(br, 145); + } + } else { + const uint8_t* tab; + const int bit1 = VP8GetBit(br, p[8]); + const int bit0 = VP8GetBit(br, p[9 + bit1]); + const int cat = 2 * bit1 + bit0; + v = 0; + for (tab = kCat3456[cat]; *tab; ++tab) { + v += v + VP8GetBit(br, *tab); + } + v += 3 + (8 << cat); + } + } + return v; +} // Returns the position of the last non-zero coeff plus one -// (and 0 if there's no coeff at all) -static int GetCoeffs(VP8BitReader* const br, ProbaArray prob, +static int GetCoeffs(VP8BitReader* const br, const VP8BandProbas* const prob[], int ctx, const quant_t dq, int n, int16_t* out) { - // n is either 0 or 1 here. kBands[n] is not necessary for extracting '*p'. - const uint8_t* p = prob[n][ctx]; - if (!VP8GetBit(br, p[0])) { // first EOB is more a 'CBP' bit. 
- return 0; - } - while (1) { - ++n; - if (!VP8GetBit(br, p[1])) { - p = prob[kBands[n]][0]; - } else { // non zero coeff - int v, j; + const uint8_t* p = prob[n]->probas_[ctx]; + for (; n < 16; ++n) { + if (!VP8GetBit(br, p[0])) { + return n; // previous coeff was last non-zero coeff + } + while (!VP8GetBit(br, p[1])) { // sequence of zero coeffs + p = prob[++n]->probas_[0]; + if (n == 16) return 16; + } + { // non zero coeff + const VP8ProbaArray* const p_ctx = &prob[n + 1]->probas_[0]; + int v; if (!VP8GetBit(br, p[2])) { - p = prob[kBands[n]][1]; v = 1; + p = p_ctx[1]; } else { - if (!VP8GetBit(br, p[3])) { - if (!VP8GetBit(br, p[4])) { - v = 2; - } else { - v = 3 + VP8GetBit(br, p[5]); - } - } else { - if (!VP8GetBit(br, p[6])) { - if (!VP8GetBit(br, p[7])) { - v = 5 + VP8GetBit(br, 159); - } else { - v = 7 + 2 * VP8GetBit(br, 165); - v += VP8GetBit(br, 145); - } - } else { - const uint8_t* tab; - const int bit1 = VP8GetBit(br, p[8]); - const int bit0 = VP8GetBit(br, p[9 + bit1]); - const int cat = 2 * bit1 + bit0; - v = 0; - for (tab = kCat3456[cat]; *tab; ++tab) { - v += v + VP8GetBit(br, *tab); - } - v += 3 + (8 << cat); - } - } - p = prob[kBands[n]][2]; + v = GetLargeValue(br, p); + p = p_ctx[2]; } - j = kZigzag[n - 1]; - out[j] = VP8GetSigned(br, v) * dq[j > 0]; - if (n == 16 || !VP8GetBit(br, p[0])) { // EOB - return n; - } - } - if (n == 16) { - return 16; + out[kZigzag[n]] = VP8GetSigned(br, v) * dq[n > 0]; } } + return 16; } -// Alias-safe way of converting 4bytes to 32bits. -typedef union { - uint8_t i8[4]; - uint32_t i32; -} PackedNz; - -// Table to unpack four bits into four bytes -static const PackedNz kUnpackTab[16] = { - {{0, 0, 0, 0}}, {{1, 0, 0, 0}}, {{0, 1, 0, 0}}, {{1, 1, 0, 0}}, - {{0, 0, 1, 0}}, {{1, 0, 1, 0}}, {{0, 1, 1, 0}}, {{1, 1, 1, 0}}, - {{0, 0, 0, 1}}, {{1, 0, 0, 1}}, {{0, 1, 0, 1}}, {{1, 1, 0, 1}}, - {{0, 0, 1, 1}}, {{1, 0, 1, 1}}, {{0, 1, 1, 1}}, {{1, 1, 1, 1}} }; - -// Macro to pack four LSB of four bytes into four bits. -#if defined(__PPC__) || defined(_M_PPC) || defined(_ARCH_PPC) || \ - defined(__BIG_ENDIAN__) -#define PACK_CST 0x08040201U -#else -#define PACK_CST 0x01020408U -#endif -#define PACK(X, S) ((((X).i32 * PACK_CST) & 0xff000000) >> (S)) - -static void ParseResiduals(VP8Decoder* const dec, - VP8MB* const mb, VP8BitReader* const token_br) { - int out_t_nz, out_l_nz, first; - ProbaArray ac_prob; - const VP8QuantMatrix* q = &dec->dqm_[dec->segment_]; - int16_t* dst = dec->coeffs_; +static WEBP_INLINE uint32_t NzCodeBits(uint32_t nz_coeffs, int nz, int dc_nz) { + nz_coeffs <<= 2; + nz_coeffs |= (nz > 3) ? 3 : (nz > 1) ? 
2 : dc_nz; + return nz_coeffs; +} + +static int ParseResiduals(VP8Decoder* const dec, + VP8MB* const mb, VP8BitReader* const token_br) { + const VP8BandProbas* (* const bands)[16 + 1] = dec->proba_.bands_ptr_; + const VP8BandProbas* const * ac_proba; + VP8MBData* const block = dec->mb_data_ + dec->mb_x_; + const VP8QuantMatrix* const q = &dec->dqm_[block->segment_]; + int16_t* dst = block->coeffs_; VP8MB* const left_mb = dec->mb_info_ - 1; - PackedNz nz_ac, nz_dc; - PackedNz tnz, lnz; - uint32_t non_zero_ac = 0; - uint32_t non_zero_dc = 0; + uint8_t tnz, lnz; + uint32_t non_zero_y = 0; + uint32_t non_zero_uv = 0; int x, y, ch; + uint32_t out_t_nz, out_l_nz; + int first; - nz_dc.i32 = nz_ac.i32 = 0; memset(dst, 0, 384 * sizeof(*dst)); - if (!dec->is_i4x4_) { // parse DC + if (!block->is_i4x4_) { // parse DC int16_t dc[16] = { 0 }; - const int ctx = mb->dc_nz_ + left_mb->dc_nz_; - mb->dc_nz_ = left_mb->dc_nz_ = - (GetCoeffs(token_br, (ProbaArray)dec->proba_.coeffs_[1], - ctx, q->y2_mat_, 0, dc) > 0); + const int ctx = mb->nz_dc_ + left_mb->nz_dc_; + const int nz = GetCoeffs(token_br, bands[1], ctx, q->y2_mat_, 0, dc); + mb->nz_dc_ = left_mb->nz_dc_ = (nz > 0); + if (nz > 1) { // more than just the DC -> perform the full transform + VP8TransformWHT(dc, dst); + } else { // only DC is non-zero -> inlined simplified transform + int i; + const int dc0 = (dc[0] + 3) >> 3; + for (i = 0; i < 16 * 16; i += 16) dst[i] = dc0; + } first = 1; - ac_prob = (ProbaArray)dec->proba_.coeffs_[0]; - VP8TransformWHT(dc, dst); + ac_proba = bands[0]; } else { first = 0; - ac_prob = (ProbaArray)dec->proba_.coeffs_[3]; + ac_proba = bands[3]; } - tnz = kUnpackTab[mb->nz_ & 0xf]; - lnz = kUnpackTab[left_mb->nz_ & 0xf]; + tnz = mb->nz_ & 0x0f; + lnz = left_mb->nz_ & 0x0f; for (y = 0; y < 4; ++y) { - int l = lnz.i8[y]; + int l = lnz & 1; + uint32_t nz_coeffs = 0; for (x = 0; x < 4; ++x) { - const int ctx = l + tnz.i8[x]; - const int nz = GetCoeffs(token_br, ac_prob, ctx, - q->y1_mat_, first, dst); - tnz.i8[x] = l = (nz > 0); - nz_dc.i8[x] = (dst[0] != 0); - nz_ac.i8[x] = (nz > 1); + const int ctx = l + (tnz & 1); + const int nz = GetCoeffs(token_br, ac_proba, ctx, q->y1_mat_, first, dst); + l = (nz > first); + tnz = (tnz >> 1) | (l << 7); + nz_coeffs = NzCodeBits(nz_coeffs, nz, dst[0] != 0); dst += 16; } - lnz.i8[y] = l; - non_zero_dc |= PACK(nz_dc, 24 - y * 4); - non_zero_ac |= PACK(nz_ac, 24 - y * 4); + tnz >>= 4; + lnz = (lnz >> 1) | (l << 7); + non_zero_y = (non_zero_y << 8) | nz_coeffs; } - out_t_nz = PACK(tnz, 24); - out_l_nz = PACK(lnz, 24); + out_t_nz = tnz; + out_l_nz = lnz >> 4; - tnz = kUnpackTab[mb->nz_ >> 4]; - lnz = kUnpackTab[left_mb->nz_ >> 4]; for (ch = 0; ch < 4; ch += 2) { + uint32_t nz_coeffs = 0; + tnz = mb->nz_ >> (4 + ch); + lnz = left_mb->nz_ >> (4 + ch); for (y = 0; y < 2; ++y) { - int l = lnz.i8[ch + y]; + int l = lnz & 1; for (x = 0; x < 2; ++x) { - const int ctx = l + tnz.i8[ch + x]; - const int nz = - GetCoeffs(token_br, (ProbaArray)dec->proba_.coeffs_[2], - ctx, q->uv_mat_, 0, dst); - tnz.i8[ch + x] = l = (nz > 0); - nz_dc.i8[y * 2 + x] = (dst[0] != 0); - nz_ac.i8[y * 2 + x] = (nz > 1); + const int ctx = l + (tnz & 1); + const int nz = GetCoeffs(token_br, bands[2], ctx, q->uv_mat_, 0, dst); + l = (nz > 0); + tnz = (tnz >> 1) | (l << 3); + nz_coeffs = NzCodeBits(nz_coeffs, nz, dst[0] != 0); dst += 16; } - lnz.i8[ch + y] = l; + tnz >>= 2; + lnz = (lnz >> 1) | (l << 5); } - non_zero_dc |= PACK(nz_dc, 8 - ch * 2); - non_zero_ac |= PACK(nz_ac, 8 - ch * 2); + // Note: we don't really need the 
per-4x4 details for U/V blocks. + non_zero_uv |= nz_coeffs << (4 * ch); + out_t_nz |= (tnz << 4) << ch; + out_l_nz |= (lnz & 0xf0) << ch; } - out_t_nz |= PACK(tnz, 20); - out_l_nz |= PACK(lnz, 20); mb->nz_ = out_t_nz; left_mb->nz_ = out_l_nz; - dec->non_zero_ac_ = non_zero_ac; - dec->non_zero_ = non_zero_ac | non_zero_dc; - mb->skip_ = !dec->non_zero_; + block->non_zero_y_ = non_zero_y; + block->non_zero_uv_ = non_zero_uv; + + // We look at the mode-code of each block and check if some blocks have less + // than three non-zero coeffs (code < 2). This is to avoid dithering flat and + // empty blocks. + block->dither_ = (non_zero_uv & 0xaaaa) ? 0 : q->dither_; + + return !(non_zero_y | non_zero_uv); // will be used for further optimization } -#undef PACK //------------------------------------------------------------------------------ // Main loop int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) { - VP8BitReader* const br = &dec->br_; VP8MB* const left = dec->mb_info_ - 1; - VP8MB* const info = dec->mb_info_ + dec->mb_x_; - - // Note: we don't save segment map (yet), as we don't expect - // to decode more than 1 keyframe. - if (dec->segment_hdr_.update_map_) { - // Hardcoded tree parsing - dec->segment_ = !VP8GetBit(br, dec->proba_.segments_[0]) ? - VP8GetBit(br, dec->proba_.segments_[1]) : - 2 + VP8GetBit(br, dec->proba_.segments_[2]); - } - info->skip_ = dec->use_skip_proba_ ? VP8GetBit(br, dec->skip_p_) : 0; + VP8MB* const mb = dec->mb_info_ + dec->mb_x_; + VP8MBData* const block = dec->mb_data_ + dec->mb_x_; + int skip = dec->use_skip_proba_ ? block->skip_ : 0; - VP8ParseIntraMode(br, dec); - if (br->eof_) { - return 0; - } - - if (!info->skip_) { - ParseResiduals(dec, info, token_br); + if (!skip) { + skip = ParseResiduals(dec, mb, token_br); } else { - left->nz_ = info->nz_ = 0; - if (!dec->is_i4x4_) { - left->dc_nz_ = info->dc_nz_ = 0; + left->nz_ = mb->nz_ = 0; + if (!block->is_i4x4_) { + left->nz_dc_ = mb->nz_dc_ = 0; } - dec->non_zero_ = 0; - dec->non_zero_ac_ = 0; + block->non_zero_y_ = 0; + block->non_zero_uv_ = 0; + block->dither_ = 0; + } + + if (dec->filter_type_ > 0) { // store filter info + VP8FInfo* const finfo = dec->f_info_ + dec->mb_x_; + *finfo = dec->fstrengths_[block->segment_][block->is_i4x4_]; + finfo->f_inner_ |= !skip; } - return (!token_br->eof_); + return !token_br->eof_; } void VP8InitScanline(VP8Decoder* const dec) { VP8MB* const left = dec->mb_info_ - 1; left->nz_ = 0; - left->dc_nz_ = 0; + left->nz_dc_ = 0; memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_)); - dec->filter_row_ = - (dec->filter_type_ > 0) && - (dec->mb_y_ >= dec->tl_mb_y_) && (dec->mb_y_ <= dec->br_mb_y_); + dec->mb_x_ = 0; } static int ParseFrame(VP8Decoder* const dec, VP8Io* io) { for (dec->mb_y_ = 0; dec->mb_y_ < dec->br_mb_y_; ++dec->mb_y_) { + // Parse bitstream for this row. VP8BitReader* const token_br = &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)]; - VP8InitScanline(dec); - for (dec->mb_x_ = 0; dec->mb_x_ < dec->mb_w_; dec->mb_x_++) { + if (!VP8ParseIntraModeRow(&dec->br_, dec)) { + return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA, + "Premature end-of-partition0 encountered."); + } + for (; dec->mb_x_ < dec->mb_w_; ++dec->mb_x_) { if (!VP8DecodeMB(dec, token_br)) { return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA, "Premature end-of-file encountered."); } - VP8ReconstructBlock(dec); - - // Store data and save block's filtering params - VP8StoreBlock(dec); } + VP8InitScanline(dec); // Prepare for next scanline + + // Reconstruct, filter and emit the row. 
if (!VP8ProcessRow(dec, io)) { return VP8SetError(dec, VP8_STATUS_USER_ABORT, "Output aborted."); } } - if (dec->use_threads_ && !WebPWorkerSync(&dec->worker_)) { - return 0; - } - - // Finish -#ifndef ONLY_KEYFRAME_CODE - if (!dec->update_proba_) { - dec->proba_ = dec->proba_saved_; + if (dec->mt_method_ > 0) { + if (!WebPGetWorkerInterface()->Sync(&dec->worker_)) return 0; } -#endif - -#ifdef WEBP_EXPERIMENTAL_FEATURES - if (dec->layer_data_size_ > 0) { - if (!VP8DecodeLayer(dec)) { - return 0; - } - } -#endif return 1; } @@ -768,12 +648,10 @@ void VP8Clear(VP8Decoder* const dec) { if (dec == NULL) { return; } - if (dec->use_threads_) { - WebPWorkerEnd(&dec->worker_); - } - if (dec->mem_) { - free(dec->mem_); - } + WebPGetWorkerInterface()->End(&dec->worker_); + ALPHDelete(dec->alph_dec_); + dec->alph_dec_ = NULL; + WebPSafeFree(dec->mem_); dec->mem_ = NULL; dec->mem_size_ = 0; memset(&dec->br_, 0, sizeof(dec->br_)); @@ -782,6 +660,3 @@ void VP8Clear(VP8Decoder* const dec) { //------------------------------------------------------------------------------ -#if defined(__cplusplus) || defined(c_plusplus) -} // extern "C" -#endif diff --git a/drivers/webp/dec/vp8i.h b/drivers/webp/dec/vp8i.h index 4382edfd8e..b5f2b23009 100644 --- a/drivers/webp/dec/vp8i.h +++ b/drivers/webp/dec/vp8i.h @@ -1,8 +1,10 @@ // Copyright 2010 Google Inc. All Rights Reserved. // -// This code is licensed under the same terms as WebM: -// Software License Agreement: http://www.webmproject.org/license/software/ -// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. // ----------------------------------------------------------------------------- // // VP8 decoder: internal header. @@ -13,12 +15,14 @@ #define WEBP_DEC_VP8I_H_ #include <string.h> // for memcpy() +#include "./common.h" #include "./vp8li.h" #include "../utils/bit_reader.h" +#include "../utils/random.h" #include "../utils/thread.h" #include "../dsp/dsp.h" -#if defined(__cplusplus) || defined(c_plusplus) +#ifdef __cplusplus extern "C" { #endif @@ -27,48 +31,10 @@ extern "C" { // version numbers #define DEC_MAJ_VERSION 0 -#define DEC_MIN_VERSION 2 -#define DEC_REV_VERSION 0 - -#define ONLY_KEYFRAME_CODE // to remove any code related to P-Frames - -// intra prediction modes -enum { B_DC_PRED = 0, // 4x4 modes - B_TM_PRED, - B_VE_PRED, - B_HE_PRED, - B_RD_PRED, - B_VR_PRED, - B_LD_PRED, - B_VL_PRED, - B_HD_PRED, - B_HU_PRED, - NUM_BMODES = B_HU_PRED + 1 - B_DC_PRED, // = 10 - - // Luma16 or UV modes - DC_PRED = B_DC_PRED, V_PRED = B_VE_PRED, - H_PRED = B_HE_PRED, TM_PRED = B_TM_PRED, - B_PRED = NUM_BMODES, // refined I4x4 mode - - // special modes - B_DC_PRED_NOTOP = 4, - B_DC_PRED_NOLEFT = 5, - B_DC_PRED_NOTOPLEFT = 6, - NUM_B_DC_MODES = 7 }; - -enum { MB_FEATURE_TREE_PROBS = 3, - NUM_MB_SEGMENTS = 4, - NUM_REF_LF_DELTAS = 4, - NUM_MODE_LF_DELTAS = 4, // I4x4, ZERO, *, SPLIT - MAX_NUM_PARTITIONS = 8, - // Probabilities - NUM_TYPES = 4, - NUM_BANDS = 8, - NUM_CTX = 3, - NUM_PROBAS = 11, - NUM_MV_PROBAS = 19 }; - -// YUV-cache parameters. +#define DEC_MIN_VERSION 4 +#define DEC_REV_VERSION 4 + +// YUV-cache parameters. Cache is 32-bytes wide (= one cacheline). 
// Constraints are: We need to store one 16x16 block of luma samples (y), // and two 8x8 chroma blocks (u/v). These are better be 16-bytes aligned, // in order to be SIMD-friendly. We also need to store the top, left and @@ -90,14 +56,15 @@ enum { MB_FEATURE_TREE_PROBS = 3, // 'y' = y-samples 'u' = u-samples 'v' = u-samples // '|' = left sample, '-' = top sample, '+' = top-left sample // 't' = extra top-right sample for 4x4 modes -// With this layout, BPS (=Bytes Per Scan-line) is one cacheline size. -#define BPS 32 // this is the common stride used by yuv[] #define YUV_SIZE (BPS * 17 + BPS * 9) #define Y_SIZE (BPS * 17) #define Y_OFF (BPS * 1 + 8) #define U_OFF (Y_OFF + BPS * 16 + BPS) #define V_OFF (U_OFF + 16) +// minimal width under which lossy multi-threading is always disabled +#define MIN_WIDTH_FOR_THREADS 512 + //------------------------------------------------------------------------------ // Headers @@ -126,15 +93,19 @@ typedef struct { int8_t filter_strength_[NUM_MB_SEGMENTS]; // filter strength for segments } VP8SegmentHeader; +// probas associated to one of the contexts +typedef uint8_t VP8ProbaArray[NUM_PROBAS]; + +typedef struct { // all the probas associated to one band + VP8ProbaArray probas_[NUM_CTX]; +} VP8BandProbas; + // Struct collecting all frame-persistent probabilities. typedef struct { uint8_t segments_[MB_FEATURE_TREE_PROBS]; // Type: 0:Intra16-AC 1:Intra16-DC 2:Chroma 3:Intra4 - uint8_t coeffs_[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS]; -#ifndef ONLY_KEYFRAME_CODE - uint8_t ymode_[4], uvmode_[3]; - uint8_t mv_[2][NUM_MV_PROBAS]; -#endif + VP8BandProbas bands_[NUM_TYPES][NUM_BANDS]; + const VP8BandProbas* bands_ptr_[NUM_TYPES][16 + 1]; } VP8Proba; // Filter parameters @@ -151,32 +122,61 @@ typedef struct { // Informations about the macroblocks. typedef struct { // filter specs - unsigned int f_level_:6; // filter strength: 0..63 - unsigned int f_ilevel_:6; // inner limit: 1..63 - unsigned int f_inner_:1; // do inner filtering? + uint8_t f_limit_; // filter limit in [3..189], or 0 if no filtering + uint8_t f_ilevel_; // inner limit in [1..63] + uint8_t f_inner_; // do inner filtering? + uint8_t hev_thresh_; // high edge variance threshold in [0..2] } VP8FInfo; -typedef struct { // used for syntax-parsing - unsigned int nz_; // non-zero AC/DC coeffs - unsigned int dc_nz_:1; // non-zero DC coeffs - unsigned int skip_:1; // block type +typedef struct { // Top/Left Contexts used for syntax-parsing + uint8_t nz_; // non-zero AC/DC coeffs (4bit for luma + 4bit for chroma) + uint8_t nz_dc_; // non-zero DC coeff (1bit) } VP8MB; // Dequantization matrices typedef int quant_t[2]; // [DC / AC]. Can be 'uint16_t[2]' too (~slower). typedef struct { quant_t y1_mat_, y2_mat_, uv_mat_; + + int uv_quant_; // U/V quantizer value + int dither_; // dithering amplitude (0 = off, max=255) } VP8QuantMatrix; +// Data needed to reconstruct a macroblock +typedef struct { + int16_t coeffs_[384]; // 384 coeffs = (16+4+4) * 4*4 + uint8_t is_i4x4_; // true if intra4x4 + uint8_t imodes_[16]; // one 16x16 mode (#0) or sixteen 4x4 modes + uint8_t uvmode_; // chroma prediction mode + // bit-wise info about the content of each sub-4x4 blocks (in decoding order). + // Each of the 4x4 blocks for y/u/v is associated with a 2b code according to: + // code=0 -> no coefficient + // code=1 -> only DC + // code=2 -> first three coefficients are non-zero + // code=3 -> more than three coefficients are non-zero + // This allows to call specialized transform functions. 
+ uint32_t non_zero_y_; + uint32_t non_zero_uv_; + uint8_t dither_; // local dithering strength (deduced from non_zero_*) + uint8_t skip_; + uint8_t segment_; +} VP8MBData; + // Persistent information needed by the parallel processing typedef struct { - int id_; // cache row to process (in [0..2]) - int mb_y_; // macroblock position of the row - int filter_row_; // true if row-filtering is needed - VP8FInfo* f_info_; // filter strengths - VP8Io io_; // copy of the VP8Io to pass to put() + int id_; // cache row to process (in [0..2]) + int mb_y_; // macroblock position of the row + int filter_row_; // true if row-filtering is needed + VP8FInfo* f_info_; // filter strengths (swapped with dec->f_info_) + VP8MBData* mb_data_; // reconstruction data (swapped with dec->mb_data_) + VP8Io io_; // copy of the VP8Io to pass to put() } VP8ThreadContext; +// Saved top samples, per macroblock. Fits into a cache-line. +typedef struct { + uint8_t y[16], u[8], v[8]; +} VP8TopSamples; + //------------------------------------------------------------------------------ // VP8Decoder: the main opaque structure handed over to user @@ -196,7 +196,8 @@ struct VP8Decoder { // Worker WebPWorker worker_; - int use_threads_; // use multi-thread + int mt_method_; // multi-thread method: 0=off, 1=[parse+recon][filter] + // 2=[parse][recon+filter] int cache_id_; // current cache row int num_caches_; // number of cached rows of 16 pixels (1, 2 or 3) VP8ThreadContext thread_ctx_; // Thread context @@ -213,12 +214,9 @@ struct VP8Decoder { // per-partition boolean decoders. VP8BitReader parts_[MAX_NUM_PARTITIONS]; - // buffer refresh flags - // bit 0: refresh Gold, bit 1: refresh Alt - // bit 2-3: copy to Gold, bit 4-5: copy to Alt - // bit 6: Gold sign bias, bit 7: Alt sign bias - // bit 8: refresh last frame - uint32_t buffer_flags_; + // Dithering strength, deduced from decoding options + int dither_; // whether to use dithering or not + VP8Random dithering_rg_; // random generator for dithering // dequantization (one set of DC/AC dequant factor per segment) VP8QuantMatrix dqm_[NUM_MB_SEGMENTS]; @@ -227,24 +225,18 @@ struct VP8Decoder { VP8Proba proba_; int use_skip_proba_; uint8_t skip_p_; -#ifndef ONLY_KEYFRAME_CODE - uint8_t intra_p_, last_p_, golden_p_; - VP8Proba proba_saved_; - int update_proba_; -#endif // Boundary data cache and persistent buffers. - uint8_t* intra_t_; // top intra modes values: 4 * mb_w_ - uint8_t intra_l_[4]; // left intra modes values - uint8_t* y_t_; // top luma samples: 16 * mb_w_ - uint8_t* u_t_, *v_t_; // top u/v samples: 8 * mb_w_ each + uint8_t* intra_t_; // top intra modes values: 4 * mb_w_ + uint8_t intra_l_[4]; // left intra modes values - VP8MB* mb_info_; // contextual macroblock info (mb_w_ + 1) - VP8FInfo* f_info_; // filter strength info - uint8_t* yuv_b_; // main block for Y/U/V (size = YUV_SIZE) - int16_t* coeffs_; // 384 coeffs = (16+8+8) * 4*4 + VP8TopSamples* yuv_t_; // top y/u/v samples - uint8_t* cache_y_; // macroblock row for storing unfiltered samples + VP8MB* mb_info_; // contextual macroblock info (mb_w_ + 1) + VP8FInfo* f_info_; // filter strength info + uint8_t* yuv_b_; // main block for Y/U/V (size = YUV_SIZE) + + uint8_t* cache_y_; // macroblock row for storing unfiltered samples uint8_t* cache_u_; uint8_t* cache_v_; int cache_y_stride_; @@ -256,31 +248,19 @@ struct VP8Decoder { // Per macroblock non-persistent infos. 
int mb_x_, mb_y_; // current position, in macroblock units - uint8_t is_i4x4_; // true if intra4x4 - uint8_t imodes_[16]; // one 16x16 mode (#0) or sixteen 4x4 modes - uint8_t uvmode_; // chroma prediction mode - uint8_t segment_; // block's segment - - // bit-wise info about the content of each sub-4x4 blocks: there are 16 bits - // for luma (bits #0->#15), then 4 bits for chroma-u (#16->#19) and 4 bits for - // chroma-v (#20->#23), each corresponding to one 4x4 block in decoding order. - // If the bit is set, the 4x4 block contains some non-zero coefficients. - uint32_t non_zero_; - uint32_t non_zero_ac_; + VP8MBData* mb_data_; // parsed reconstruction data // Filtering side-info - int filter_type_; // 0=off, 1=simple, 2=complex - int filter_row_; // per-row flag - uint8_t filter_levels_[NUM_MB_SEGMENTS]; // precalculated per-segment + int filter_type_; // 0=off, 1=simple, 2=complex + VP8FInfo fstrengths_[NUM_MB_SEGMENTS][2]; // precalculated per-segment/type - // extensions - const uint8_t* alpha_data_; // compressed alpha data (if present) + // Alpha + struct ALPHDecoder* alph_dec_; // alpha-plane decoder object + const uint8_t* alpha_data_; // compressed alpha data (if present) size_t alpha_data_size_; - uint8_t* alpha_plane_; // output. Persistent, contains the whole data. - - int layer_colorspace_; - const uint8_t* layer_data_; // compressed layer data (if present) - size_t layer_data_size_; + int is_alpha_decoded_; // true if alpha_data_ is decoded in alpha_plane_ + uint8_t* alpha_plane_; // output. Persistent, contains the whole data. + int alpha_dithering_; // derived from decoding options (0=off, 100=full). }; //------------------------------------------------------------------------------ @@ -293,15 +273,14 @@ int VP8SetError(VP8Decoder* const dec, // in tree.c void VP8ResetProba(VP8Proba* const proba); void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec); -void VP8ParseIntraMode(VP8BitReader* const br, VP8Decoder* const dec); +// parses one row of intra mode data in partition 0, returns !eof +int VP8ParseIntraModeRow(VP8BitReader* const br, VP8Decoder* const dec); // in quant.c void VP8ParseQuant(VP8Decoder* const dec); // in frame.c -int VP8InitFrame(VP8Decoder* const dec, VP8Io* io); -// Predict a block and add residual -void VP8ReconstructBlock(VP8Decoder* const dec); +int VP8InitFrame(VP8Decoder* const dec, VP8Io* const io); // Call io->setup() and finish setting up scan parameters. // After this call returns, one must always call VP8ExitCritical() with the // same parameters. Both functions should be used in pair. Returns VP8_STATUS_OK @@ -310,10 +289,16 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io); // Must always be called in pair with VP8EnterCritical(). // Returns false in case of error. int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io); -// Process the last decoded row (filtering + output) +// Return the multi-threading method to use (0=off), depending +// on options and bitstream size. Only for lossy decoding. +int VP8GetThreadMethod(const WebPDecoderOptions* const options, + const WebPHeaderStructure* const headers, + int width, int height); +// Initialize dithering post-process if needed. +void VP8InitDithering(const WebPDecoderOptions* const options, + VP8Decoder* const dec); +// Process the last decoded row (filtering + output). 
int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io); -// Store a block, along with filtering params -void VP8StoreBlock(VP8Decoder* const dec); // To be called at the start of a new scanline, to initialize predictors. void VP8InitScanline(VP8Decoder* const dec); // Decode one macroblock. Returns false if there is not enough data. @@ -323,12 +308,9 @@ int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br); const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec, int row, int num_rows); -// in layer.c -int VP8DecodeLayer(VP8Decoder* const dec); - //------------------------------------------------------------------------------ -#if defined(__cplusplus) || defined(c_plusplus) +#ifdef __cplusplus } // extern "C" #endif diff --git a/drivers/webp/dec/vp8l.c b/drivers/webp/dec/vp8l.c index 897e4395c7..19665a007d 100644 --- a/drivers/webp/dec/vp8l.c +++ b/drivers/webp/dec/vp8l.c @@ -1,8 +1,10 @@ // Copyright 2012 Google Inc. All Rights Reserved. // -// This code is licensed under the same terms as WebM: -// Software License Agreement: http://www.webmproject.org/license/software/ -// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. // ----------------------------------------------------------------------------- // // main entry for the decoder @@ -10,18 +12,17 @@ // Authors: Vikas Arora (vikaas.arora@gmail.com) // Jyrki Alakuijala (jyrki@google.com) -#include <stdio.h> #include <stdlib.h> + +#include "./alphai.h" #include "./vp8li.h" +#include "../dsp/dsp.h" #include "../dsp/lossless.h" #include "../dsp/yuv.h" +#include "../utils/endian_inl.h" #include "../utils/huffman.h" #include "../utils/utils.h" -#if defined(__cplusplus) || defined(c_plusplus) -extern "C" { -#endif - #define NUM_ARGB_CACHE_ROWS 16 static const int kCodeLengthLiterals = 16; @@ -50,6 +51,9 @@ static const uint16_t kAlphabetSize[HUFFMAN_CODES_PER_META_CODE] = { NUM_DISTANCE_CODES }; +static const uint8_t kLiteralMap[HUFFMAN_CODES_PER_META_CODE] = { + 0, 1, 1, 1, 0 +}; #define NUM_CODE_LENGTH_CODES 19 static const uint8_t kCodeLengthCodeOrder[NUM_CODE_LENGTH_CODES] = { @@ -57,19 +61,43 @@ static const uint8_t kCodeLengthCodeOrder[NUM_CODE_LENGTH_CODES] = { }; #define CODE_TO_PLANE_CODES 120 -static const uint8_t code_to_plane_lut[CODE_TO_PLANE_CODES] = { - 0x18, 0x07, 0x17, 0x19, 0x28, 0x06, 0x27, 0x29, 0x16, 0x1a, - 0x26, 0x2a, 0x38, 0x05, 0x37, 0x39, 0x15, 0x1b, 0x36, 0x3a, - 0x25, 0x2b, 0x48, 0x04, 0x47, 0x49, 0x14, 0x1c, 0x35, 0x3b, - 0x46, 0x4a, 0x24, 0x2c, 0x58, 0x45, 0x4b, 0x34, 0x3c, 0x03, - 0x57, 0x59, 0x13, 0x1d, 0x56, 0x5a, 0x23, 0x2d, 0x44, 0x4c, - 0x55, 0x5b, 0x33, 0x3d, 0x68, 0x02, 0x67, 0x69, 0x12, 0x1e, - 0x66, 0x6a, 0x22, 0x2e, 0x54, 0x5c, 0x43, 0x4d, 0x65, 0x6b, - 0x32, 0x3e, 0x78, 0x01, 0x77, 0x79, 0x53, 0x5d, 0x11, 0x1f, - 0x64, 0x6c, 0x42, 0x4e, 0x76, 0x7a, 0x21, 0x2f, 0x75, 0x7b, - 0x31, 0x3f, 0x63, 0x6d, 0x52, 0x5e, 0x00, 0x74, 0x7c, 0x41, - 0x4f, 0x10, 0x20, 0x62, 0x6e, 0x30, 0x73, 0x7d, 0x51, 0x5f, - 0x40, 0x72, 0x7e, 0x61, 0x6f, 0x50, 0x71, 0x7f, 0x60, 0x70 +static const uint8_t kCodeToPlane[CODE_TO_PLANE_CODES] = { + 0x18, 0x07, 0x17, 0x19, 0x28, 0x06, 0x27, 0x29, 0x16, 0x1a, + 0x26, 0x2a, 0x38, 0x05, 0x37, 0x39, 0x15, 0x1b, 0x36, 
0x3a, + 0x25, 0x2b, 0x48, 0x04, 0x47, 0x49, 0x14, 0x1c, 0x35, 0x3b, + 0x46, 0x4a, 0x24, 0x2c, 0x58, 0x45, 0x4b, 0x34, 0x3c, 0x03, + 0x57, 0x59, 0x13, 0x1d, 0x56, 0x5a, 0x23, 0x2d, 0x44, 0x4c, + 0x55, 0x5b, 0x33, 0x3d, 0x68, 0x02, 0x67, 0x69, 0x12, 0x1e, + 0x66, 0x6a, 0x22, 0x2e, 0x54, 0x5c, 0x43, 0x4d, 0x65, 0x6b, + 0x32, 0x3e, 0x78, 0x01, 0x77, 0x79, 0x53, 0x5d, 0x11, 0x1f, + 0x64, 0x6c, 0x42, 0x4e, 0x76, 0x7a, 0x21, 0x2f, 0x75, 0x7b, + 0x31, 0x3f, 0x63, 0x6d, 0x52, 0x5e, 0x00, 0x74, 0x7c, 0x41, + 0x4f, 0x10, 0x20, 0x62, 0x6e, 0x30, 0x73, 0x7d, 0x51, 0x5f, + 0x40, 0x72, 0x7e, 0x61, 0x6f, 0x50, 0x71, 0x7f, 0x60, 0x70 +}; + +// Memory needed for lookup tables of one Huffman tree group. Red, blue, alpha +// and distance alphabets are constant (256 for red, blue and alpha, 40 for +// distance) and lookup table sizes for them in worst case are 630 and 410 +// respectively. Size of green alphabet depends on color cache size and is equal +// to 256 (green component values) + 24 (length prefix values) +// + color_cache_size (between 0 and 2048). +// All values computed for 8-bit first level lookup with Mark Adler's tool: +// http://www.hdfgroup.org/ftp/lib-external/zlib/zlib-1.2.5/examples/enough.c +#define FIXED_TABLE_SIZE (630 * 3 + 410) +static const int kTableSize[12] = { + FIXED_TABLE_SIZE + 654, + FIXED_TABLE_SIZE + 656, + FIXED_TABLE_SIZE + 658, + FIXED_TABLE_SIZE + 662, + FIXED_TABLE_SIZE + 670, + FIXED_TABLE_SIZE + 686, + FIXED_TABLE_SIZE + 718, + FIXED_TABLE_SIZE + 782, + FIXED_TABLE_SIZE + 912, + FIXED_TABLE_SIZE + 1168, + FIXED_TABLE_SIZE + 1680, + FIXED_TABLE_SIZE + 2704 }; static int DecodeImageStream(int xsize, int ysize, @@ -80,27 +108,28 @@ static int DecodeImageStream(int xsize, int ysize, //------------------------------------------------------------------------------ int VP8LCheckSignature(const uint8_t* const data, size_t size) { - return (size >= 1) && (data[0] == VP8L_MAGIC_BYTE); + return (size >= VP8L_FRAME_HEADER_SIZE && + data[0] == VP8L_MAGIC_BYTE && + (data[4] >> 5) == 0); // version } static int ReadImageInfo(VP8LBitReader* const br, int* const width, int* const height, int* const has_alpha) { - const uint8_t signature = VP8LReadBits(br, 8); - if (!VP8LCheckSignature(&signature, 1)) { - return 0; - } + if (VP8LReadBits(br, 8) != VP8L_MAGIC_BYTE) return 0; *width = VP8LReadBits(br, VP8L_IMAGE_SIZE_BITS) + 1; *height = VP8LReadBits(br, VP8L_IMAGE_SIZE_BITS) + 1; *has_alpha = VP8LReadBits(br, 1); - VP8LReadBits(br, VP8L_VERSION_BITS); // Read/ignore the version number. - return 1; + if (VP8LReadBits(br, VP8L_VERSION_BITS) != 0) return 0; + return !br->eos_; } int VP8LGetInfo(const uint8_t* data, size_t data_size, int* const width, int* const height, int* const has_alpha) { if (data == NULL || data_size < VP8L_FRAME_HEADER_SIZE) { return 0; // not enough data + } else if (!VP8LCheckSignature(data, data_size)) { + return 0; // bad signature } else { int w, h, a; VP8LBitReader br; @@ -138,39 +167,80 @@ static WEBP_INLINE int PlaneCodeToDistance(int xsize, int plane_code) { if (plane_code > CODE_TO_PLANE_CODES) { return plane_code - CODE_TO_PLANE_CODES; } else { - const int dist_code = code_to_plane_lut[plane_code - 1]; + const int dist_code = kCodeToPlane[plane_code - 1]; const int yoffset = dist_code >> 4; const int xoffset = 8 - (dist_code & 0xf); const int dist = yoffset * xsize + xoffset; - return (dist >= 1) ? dist : 1; + return (dist >= 1) ? 
dist : 1; // dist<1 can happen if xsize is very small } } //------------------------------------------------------------------------------ // Decodes the next Huffman code from bit-stream. // FillBitWindow(br) needs to be called at minimum every second call -// to ReadSymbolUnsafe. -static int ReadSymbolUnsafe(const HuffmanTree* tree, VP8LBitReader* const br) { - const HuffmanTreeNode* node = tree->root_; - assert(node != NULL); - while (!HuffmanTreeNodeIsLeaf(node)) { - node = HuffmanTreeNextNode(node, VP8LReadOneBitUnsafe(br)); - } - return node->symbol_; +// to ReadSymbol, in order to pre-fetch enough bits. +static WEBP_INLINE int ReadSymbol(const HuffmanCode* table, + VP8LBitReader* const br) { + int nbits; + uint32_t val = VP8LPrefetchBits(br); + table += val & HUFFMAN_TABLE_MASK; + nbits = table->bits - HUFFMAN_TABLE_BITS; + if (nbits > 0) { + VP8LSetBitPos(br, br->bit_pos_ + HUFFMAN_TABLE_BITS); + val = VP8LPrefetchBits(br); + table += table->value; + table += val & ((1 << nbits) - 1); + } + VP8LSetBitPos(br, br->bit_pos_ + table->bits); + return table->value; } -static WEBP_INLINE int ReadSymbol(const HuffmanTree* tree, - VP8LBitReader* const br) { - const int read_safe = (br->pos_ + 8 > br->len_); - if (!read_safe) { - return ReadSymbolUnsafe(tree, br); +// Reads packed symbol depending on GREEN channel +#define BITS_SPECIAL_MARKER 0x100 // something large enough (and a bit-mask) +#define PACKED_NON_LITERAL_CODE 0 // must be < NUM_LITERAL_CODES +static WEBP_INLINE int ReadPackedSymbols(const HTreeGroup* group, + VP8LBitReader* const br, + uint32_t* const dst) { + const uint32_t val = VP8LPrefetchBits(br) & (HUFFMAN_PACKED_TABLE_SIZE - 1); + const HuffmanCode32 code = group->packed_table[val]; + assert(group->use_packed_table); + if (code.bits < BITS_SPECIAL_MARKER) { + VP8LSetBitPos(br, br->bit_pos_ + code.bits); + *dst = code.value; + return PACKED_NON_LITERAL_CODE; } else { - const HuffmanTreeNode* node = tree->root_; - assert(node != NULL); - while (!HuffmanTreeNodeIsLeaf(node)) { - node = HuffmanTreeNextNode(node, VP8LReadOneBit(br)); + VP8LSetBitPos(br, br->bit_pos_ + code.bits - BITS_SPECIAL_MARKER); + assert(code.value >= NUM_LITERAL_CODES); + return code.value; + } +} + +static int AccumulateHCode(HuffmanCode hcode, int shift, + HuffmanCode32* const huff) { + huff->bits += hcode.bits; + huff->value |= (uint32_t)hcode.value << shift; + assert(huff->bits <= HUFFMAN_TABLE_BITS); + return hcode.bits; +} + +static void BuildPackedTable(HTreeGroup* const htree_group) { + uint32_t code; + for (code = 0; code < HUFFMAN_PACKED_TABLE_SIZE; ++code) { + uint32_t bits = code; + HuffmanCode32* const huff = &htree_group->packed_table[bits]; + HuffmanCode hcode = htree_group->htrees[GREEN][bits]; + if (hcode.value >= NUM_LITERAL_CODES) { + huff->bits = hcode.bits + BITS_SPECIAL_MARKER; + huff->value = hcode.value; + } else { + huff->bits = 0; + huff->value = 0; + bits >>= AccumulateHCode(hcode, 8, huff); + bits >>= AccumulateHCode(htree_group->htrees[RED][bits], 16, huff); + bits >>= AccumulateHCode(htree_group->htrees[BLUE][bits], 0, huff); + bits >>= AccumulateHCode(htree_group->htrees[ALPHA][bits], 24, huff); + (void)bits; } - return node->symbol_; } } @@ -182,19 +252,18 @@ static int ReadHuffmanCodeLengths( int symbol; int max_symbol; int prev_code_len = DEFAULT_CODE_LENGTH; - HuffmanTree tree; + HuffmanCode table[1 << LENGTHS_TABLE_BITS]; - if (!HuffmanTreeBuildImplicit(&tree, code_length_code_lengths, - NUM_CODE_LENGTH_CODES)) { - dec->status_ = VP8_STATUS_BITSTREAM_ERROR; - return 
0; + if (!VP8LBuildHuffmanTable(table, LENGTHS_TABLE_BITS, + code_length_code_lengths, + NUM_CODE_LENGTH_CODES)) { + goto End; } if (VP8LReadBits(br, 1)) { // use length const int length_nbits = 2 + 2 * VP8LReadBits(br, 3); max_symbol = 2 + VP8LReadBits(br, length_nbits); if (max_symbol > num_symbols) { - dec->status_ = VP8_STATUS_BITSTREAM_ERROR; goto End; } } else { @@ -203,10 +272,13 @@ static int ReadHuffmanCodeLengths( symbol = 0; while (symbol < num_symbols) { + const HuffmanCode* p; int code_len; if (max_symbol-- == 0) break; VP8LFillBitWindow(br); - code_len = ReadSymbol(&tree, br); + p = &table[VP8LPrefetchBits(br) & LENGTHS_TABLE_MASK]; + VP8LSetBitPos(br, br->bit_pos_ + p->bits); + code_len = p->value; if (code_len < kCodeLengthLiterals) { code_lengths[symbol++] = code_len; if (code_len != 0) prev_code_len = code_len; @@ -217,7 +289,6 @@ static int ReadHuffmanCodeLengths( const int repeat_offset = kCodeLengthRepeatOffsets[slot]; int repeat = VP8LReadBits(br, extra_bits) + repeat_offset; if (symbol + repeat > num_symbols) { - dec->status_ = VP8_STATUS_BITSTREAM_ERROR; goto End; } else { const int length = use_prev ? prev_code_len : 0; @@ -228,36 +299,34 @@ static int ReadHuffmanCodeLengths( ok = 1; End: - HuffmanTreeRelease(&tree); + if (!ok) dec->status_ = VP8_STATUS_BITSTREAM_ERROR; return ok; } +// 'code_lengths' is pre-allocated temporary buffer, used for creating Huffman +// tree. static int ReadHuffmanCode(int alphabet_size, VP8LDecoder* const dec, - HuffmanTree* const tree) { + int* const code_lengths, HuffmanCode* const table) { int ok = 0; + int size = 0; VP8LBitReader* const br = &dec->br_; const int simple_code = VP8LReadBits(br, 1); + memset(code_lengths, 0, alphabet_size * sizeof(*code_lengths)); + if (simple_code) { // Read symbols, codes & code lengths directly. - int symbols[2]; - int codes[2]; - int code_lengths[2]; const int num_symbols = VP8LReadBits(br, 1) + 1; const int first_symbol_len_code = VP8LReadBits(br, 1); // The first code is either 1 bit or 8 bit code. - symbols[0] = VP8LReadBits(br, (first_symbol_len_code == 0) ? 1 : 8); - codes[0] = 0; - code_lengths[0] = num_symbols - 1; + int symbol = VP8LReadBits(br, (first_symbol_len_code == 0) ? 1 : 8); + code_lengths[symbol] = 1; // The second code (if present), is always 8 bit long. if (num_symbols == 2) { - symbols[1] = VP8LReadBits(br, 8); - codes[1] = 1; - code_lengths[1] = num_symbols - 1; + symbol = VP8LReadBits(br, 8); + code_lengths[symbol] = 1; } - ok = HuffmanTreeBuildExplicit(tree, code_lengths, codes, symbols, - alphabet_size, num_symbols); + ok = 1; } else { // Decode Huffman-coded code lengths. 
- int* code_lengths = NULL; int i; int code_length_code_lengths[NUM_CODE_LENGTH_CODES] = { 0 }; const int num_codes = VP8LReadBits(br, 4) + 4; @@ -266,42 +335,23 @@ static int ReadHuffmanCode(int alphabet_size, VP8LDecoder* const dec, return 0; } - code_lengths = - (int*)WebPSafeCalloc((uint64_t)alphabet_size, sizeof(*code_lengths)); - if (code_lengths == NULL) { - dec->status_ = VP8_STATUS_OUT_OF_MEMORY; - return 0; - } - for (i = 0; i < num_codes; ++i) { code_length_code_lengths[kCodeLengthCodeOrder[i]] = VP8LReadBits(br, 3); } ok = ReadHuffmanCodeLengths(dec, code_length_code_lengths, alphabet_size, code_lengths); - if (ok) { - ok = HuffmanTreeBuildImplicit(tree, code_lengths, alphabet_size); - } - free(code_lengths); } - ok = ok && !br->error_; - if (!ok) { + + ok = ok && !br->eos_; + if (ok) { + size = VP8LBuildHuffmanTable(table, HUFFMAN_TABLE_BITS, + code_lengths, alphabet_size); + } + if (!ok || size == 0) { dec->status_ = VP8_STATUS_BITSTREAM_ERROR; return 0; } - return 1; -} - -static void DeleteHtreeGroups(HTreeGroup* htree_groups, int num_htree_groups) { - if (htree_groups != NULL) { - int i, j; - for (i = 0; i < num_htree_groups; ++i) { - HuffmanTree* const htrees = htree_groups[i].htrees_; - for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) { - HuffmanTreeRelease(&htrees[j]); - } - } - free(htree_groups); - } + return size; } static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize, @@ -311,7 +361,12 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize, VP8LMetadata* const hdr = &dec->hdr_; uint32_t* huffman_image = NULL; HTreeGroup* htree_groups = NULL; + HuffmanCode* huffman_tables = NULL; + HuffmanCode* next = NULL; int num_htree_groups = 1; + int max_alphabet_size = 0; + int* code_lengths = NULL; + const int table_size = kTableSize[color_cache_bits]; if (allow_recursion && VP8LReadBits(br, 1)) { // use meta Huffman codes. @@ -321,51 +376,108 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize, const int huffman_pixs = huffman_xsize * huffman_ysize; if (!DecodeImageStream(huffman_xsize, huffman_ysize, 0, dec, &huffman_image)) { - dec->status_ = VP8_STATUS_BITSTREAM_ERROR; goto Error; } hdr->huffman_subsample_bits_ = huffman_precision; for (i = 0; i < huffman_pixs; ++i) { // The huffman data is stored in red and green bytes. - const int index = (huffman_image[i] >> 8) & 0xffff; - huffman_image[i] = index; - if (index >= num_htree_groups) { - num_htree_groups = index + 1; + const int group = (huffman_image[i] >> 8) & 0xffff; + huffman_image[i] = group; + if (group >= num_htree_groups) { + num_htree_groups = group + 1; } } } - if (br->error_) goto Error; + if (br->eos_) goto Error; + + // Find maximum alphabet size for the htree group. 
+ for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) { + int alphabet_size = kAlphabetSize[j]; + if (j == 0 && color_cache_bits > 0) { + alphabet_size += 1 << color_cache_bits; + } + if (max_alphabet_size < alphabet_size) { + max_alphabet_size = alphabet_size; + } + } + + huffman_tables = (HuffmanCode*)WebPSafeMalloc(num_htree_groups * table_size, + sizeof(*huffman_tables)); + htree_groups = VP8LHtreeGroupsNew(num_htree_groups); + code_lengths = (int*)WebPSafeCalloc((uint64_t)max_alphabet_size, + sizeof(*code_lengths)); - assert(num_htree_groups <= 0x10000); - htree_groups = - (HTreeGroup*)WebPSafeCalloc((uint64_t)num_htree_groups, - sizeof(*htree_groups)); - if (htree_groups == NULL) { + if (htree_groups == NULL || code_lengths == NULL || huffman_tables == NULL) { dec->status_ = VP8_STATUS_OUT_OF_MEMORY; goto Error; } + next = huffman_tables; for (i = 0; i < num_htree_groups; ++i) { - HuffmanTree* const htrees = htree_groups[i].htrees_; + HTreeGroup* const htree_group = &htree_groups[i]; + HuffmanCode** const htrees = htree_group->htrees; + int size; + int total_size = 0; + int is_trivial_literal = 1; + int max_bits = 0; for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) { int alphabet_size = kAlphabetSize[j]; + htrees[j] = next; if (j == 0 && color_cache_bits > 0) { alphabet_size += 1 << color_cache_bits; } - if (!ReadHuffmanCode(alphabet_size, dec, htrees + j)) goto Error; + size = ReadHuffmanCode(alphabet_size, dec, code_lengths, next); + if (is_trivial_literal && kLiteralMap[j] == 1) { + is_trivial_literal = (next->bits == 0); + } + total_size += next->bits; + next += size; + if (size == 0) { + goto Error; + } + if (j <= ALPHA) { + int local_max_bits = code_lengths[0]; + int k; + for (k = 1; k < alphabet_size; ++k) { + if (code_lengths[k] > local_max_bits) { + local_max_bits = code_lengths[k]; + } + } + max_bits += local_max_bits; + } } + htree_group->is_trivial_literal = is_trivial_literal; + htree_group->is_trivial_code = 0; + if (is_trivial_literal) { + const int red = htrees[RED][0].value; + const int blue = htrees[BLUE][0].value; + const int alpha = htrees[ALPHA][0].value; + htree_group->literal_arb = + ((uint32_t)alpha << 24) | (red << 16) | blue; + if (total_size == 0 && htrees[GREEN][0].value < NUM_LITERAL_CODES) { + htree_group->is_trivial_code = 1; + htree_group->literal_arb |= htrees[GREEN][0].value << 8; + } + } + htree_group->use_packed_table = !htree_group->is_trivial_code && + (max_bits < HUFFMAN_PACKED_BITS); + if (htree_group->use_packed_table) BuildPackedTable(htree_group); } + WebPSafeFree(code_lengths); // All OK. Finalize pointers and return. hdr->huffman_image_ = huffman_image; hdr->num_htree_groups_ = num_htree_groups; hdr->htree_groups_ = htree_groups; + hdr->huffman_tables_ = huffman_tables; return 1; Error: - free(huffman_image); - DeleteHtreeGroups(htree_groups, num_htree_groups); + WebPSafeFree(code_lengths); + WebPSafeFree(huffman_image); + WebPSafeFree(huffman_tables); + VP8LHtreeGroupsFree(htree_groups); return 0; } @@ -379,13 +491,13 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) { const int in_height = io->mb_h; const int out_height = io->scaled_height; const uint64_t work_size = 2 * num_channels * (uint64_t)out_width; - int32_t* work; // Rescaler work area. - const uint64_t scaled_data_size = num_channels * (uint64_t)out_width; + rescaler_t* work; // Rescaler work area. + const uint64_t scaled_data_size = (uint64_t)out_width; uint32_t* scaled_data; // Temporary storage for scaled BGRA data. 
const uint64_t memory_size = sizeof(*dec->rescaler) + work_size * sizeof(*work) + scaled_data_size * sizeof(*scaled_data); - uint8_t* memory = (uint8_t*)WebPSafeCalloc(memory_size, sizeof(*memory)); + uint8_t* memory = (uint8_t*)WebPSafeMalloc(memory_size, sizeof(*memory)); if (memory == NULL) { dec->status_ = VP8_STATUS_OUT_OF_MEMORY; return 0; @@ -395,13 +507,12 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) { dec->rescaler = (WebPRescaler*)memory; memory += sizeof(*dec->rescaler); - work = (int32_t*)memory; + work = (rescaler_t*)memory; memory += work_size * sizeof(*work); scaled_data = (uint32_t*)memory; WebPRescalerInit(dec->rescaler, in_width, in_height, (uint8_t*)scaled_data, - out_width, out_height, 0, num_channels, - in_width, out_width, in_height, out_height, work); + out_width, out_height, 0, num_channels, work); return 1; } @@ -411,12 +522,13 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) { // We have special "export" function since we need to convert from BGRA static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace, int rgba_stride, uint8_t* const rgba) { - const uint32_t* const src = (const uint32_t*)rescaler->dst; + uint32_t* const src = (uint32_t*)rescaler->dst; const int dst_width = rescaler->dst_width; int num_lines_out = 0; while (WebPRescalerHasPendingOutput(rescaler)) { uint8_t* const dst = rgba + num_lines_out * rgba_stride; WebPRescalerExportRow(rescaler); + WebPMultARGBRow(src, dst_width, 1); VP8LConvertFromBGRA(src, dst_width, colorspace, dst); ++num_lines_out; } @@ -424,18 +536,22 @@ static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace, } // Emit scaled rows. -static int EmitRescaledRows(const VP8LDecoder* const dec, - const uint32_t* const data, int in_stride, int mb_h, - uint8_t* const out, int out_stride) { +static int EmitRescaledRowsRGBA(const VP8LDecoder* const dec, + uint8_t* in, int in_stride, int mb_h, + uint8_t* const out, int out_stride) { const WEBP_CSP_MODE colorspace = dec->output_->colorspace; - const uint8_t* const in = (const uint8_t*)data; int num_lines_in = 0; int num_lines_out = 0; while (num_lines_in < mb_h) { - const uint8_t* const row_in = in + num_lines_in * in_stride; + uint8_t* const row_in = in + num_lines_in * in_stride; uint8_t* const row_out = out + num_lines_out * out_stride; - num_lines_in += WebPRescalerImport(dec->rescaler, mb_h - num_lines_in, - row_in, in_stride); + const int lines_left = mb_h - num_lines_in; + const int needed_lines = WebPRescaleNeededLines(dec->rescaler, lines_left); + assert(needed_lines > 0 && needed_lines <= lines_left); + WebPMultARGBRows(row_in, in_stride, + dec->rescaler->src_width, needed_lines, 0); + WebPRescalerImport(dec->rescaler, lines_left, row_in, in_stride); + num_lines_in += needed_lines; num_lines_out += Export(dec->rescaler, colorspace, out_stride, row_out); } return num_lines_out; @@ -443,11 +559,10 @@ static int EmitRescaledRows(const VP8LDecoder* const dec, // Emit rows without any scaling. 
static int EmitRows(WEBP_CSP_MODE colorspace, - const uint32_t* const data, int in_stride, + const uint8_t* row_in, int in_stride, int mb_w, int mb_h, uint8_t* const out, int out_stride) { int lines = mb_h; - const uint8_t* row_in = (const uint8_t*)data; uint8_t* row_out = out; while (lines-- > 0) { VP8LConvertFromBGRA((const uint32_t*)row_in, mb_w, colorspace, row_out); @@ -463,72 +578,37 @@ static int EmitRows(WEBP_CSP_MODE colorspace, static void ConvertToYUVA(const uint32_t* const src, int width, int y_pos, const WebPDecBuffer* const output) { const WebPYUVABuffer* const buf = &output->u.YUVA; + // first, the luma plane - { - int i; - uint8_t* const y = buf->y + y_pos * buf->y_stride; - for (i = 0; i < width; ++i) { - const uint32_t p = src[i]; - y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff); - } - } + WebPConvertARGBToY(src, buf->y + y_pos * buf->y_stride, width); // then U/V planes { uint8_t* const u = buf->u + (y_pos >> 1) * buf->u_stride; uint8_t* const v = buf->v + (y_pos >> 1) * buf->v_stride; - const int uv_width = width >> 1; - int i; - for (i = 0; i < uv_width; ++i) { - const uint32_t v0 = src[2 * i + 0]; - const uint32_t v1 = src[2 * i + 1]; - // VP8RGBToU/V expects four accumulated pixels. Hence we need to - // scale r/g/b value by a factor 2. We just shift v0/v1 one bit less. - const int r = ((v0 >> 15) & 0x1fe) + ((v1 >> 15) & 0x1fe); - const int g = ((v0 >> 7) & 0x1fe) + ((v1 >> 7) & 0x1fe); - const int b = ((v0 << 1) & 0x1fe) + ((v1 << 1) & 0x1fe); - if (!(y_pos & 1)) { // even lines: store values - u[i] = VP8RGBToU(r, g, b); - v[i] = VP8RGBToV(r, g, b); - } else { // odd lines: average with previous values - const int tmp_u = VP8RGBToU(r, g, b); - const int tmp_v = VP8RGBToV(r, g, b); - // Approximated average-of-four. But it's an acceptable diff. - u[i] = (u[i] + tmp_u + 1) >> 1; - v[i] = (v[i] + tmp_v + 1) >> 1; - } - } - if (width & 1) { // last pixel - const uint32_t v0 = src[2 * i + 0]; - const int r = (v0 >> 14) & 0x3fc; - const int g = (v0 >> 6) & 0x3fc; - const int b = (v0 << 2) & 0x3fc; - if (!(y_pos & 1)) { // even lines - u[i] = VP8RGBToU(r, g, b); - v[i] = VP8RGBToV(r, g, b); - } else { // odd lines (note: we could just skip this) - const int tmp_u = VP8RGBToU(r, g, b); - const int tmp_v = VP8RGBToV(r, g, b); - u[i] = (u[i] + tmp_u + 1) >> 1; - v[i] = (v[i] + tmp_v + 1) >> 1; - } - } + // even lines: store values + // odd lines: average with previous values + WebPConvertARGBToUV(src, u, v, width, !(y_pos & 1)); } // Lastly, store alpha if needed. 
if (buf->a != NULL) { - int i; uint8_t* const a = buf->a + y_pos * buf->a_stride; - for (i = 0; i < width; ++i) a[i] = (src[i] >> 24); +#if defined(WORDS_BIGENDIAN) + WebPExtractAlpha((uint8_t*)src + 0, 0, width, 1, a, 0); +#else + WebPExtractAlpha((uint8_t*)src + 3, 0, width, 1, a, 0); +#endif } } static int ExportYUVA(const VP8LDecoder* const dec, int y_pos) { WebPRescaler* const rescaler = dec->rescaler; - const uint32_t* const src = (const uint32_t*)rescaler->dst; + uint32_t* const src = (uint32_t*)rescaler->dst; const int dst_width = rescaler->dst_width; int num_lines_out = 0; while (WebPRescalerHasPendingOutput(rescaler)) { WebPRescalerExportRow(rescaler); + WebPMultARGBRow(src, dst_width, 1); ConvertToYUVA(src, dst_width, y_pos, dec->output_); ++y_pos; ++num_lines_out; @@ -537,28 +617,28 @@ static int ExportYUVA(const VP8LDecoder* const dec, int y_pos) { } static int EmitRescaledRowsYUVA(const VP8LDecoder* const dec, - const uint32_t* const data, - int in_stride, int mb_h) { - const uint8_t* const in = (const uint8_t*)data; + uint8_t* in, int in_stride, int mb_h) { int num_lines_in = 0; int y_pos = dec->last_out_row_; while (num_lines_in < mb_h) { - const uint8_t* const row_in = in + num_lines_in * in_stride; - num_lines_in += WebPRescalerImport(dec->rescaler, mb_h - num_lines_in, - row_in, in_stride); + const int lines_left = mb_h - num_lines_in; + const int needed_lines = WebPRescaleNeededLines(dec->rescaler, lines_left); + WebPMultARGBRows(in, in_stride, dec->rescaler->src_width, needed_lines, 0); + WebPRescalerImport(dec->rescaler, lines_left, in, in_stride); + num_lines_in += needed_lines; + in += needed_lines * in_stride; y_pos += ExportYUVA(dec, y_pos); } return y_pos; } static int EmitRowsYUVA(const VP8LDecoder* const dec, - const uint32_t* const data, int in_stride, + const uint8_t* in, int in_stride, int mb_w, int num_rows) { int y_pos = dec->last_out_row_; - const uint8_t* row_in = (const uint8_t*)data; while (num_rows-- > 0) { - ConvertToYUVA((const uint32_t*)row_in, mb_w, y_pos, dec->output_); - row_in += in_stride; + ConvertToYUVA((const uint32_t*)in, mb_w, y_pos, dec->output_); + in += in_stride; ++y_pos; } return y_pos; @@ -569,11 +649,11 @@ static int EmitRowsYUVA(const VP8LDecoder* const dec, // Sets io->mb_y, io->mb_h & io->mb_w according to start row, end row and // crop options. Also updates the input data pointer, so that it points to the -// start of the cropped window. -// Note that 'pixel_stride' is in units of 'uint32_t' (and not 'bytes). +// start of the cropped window. Note that pixels are in ARGB format even if +// 'in_data' is uint8_t*. // Returns true if the crop window is not empty. static int SetCropWindow(VP8Io* const io, int y_start, int y_end, - const uint32_t** const in_data, int pixel_stride) { + uint8_t** const in_data, int pixel_stride) { assert(y_start < y_end); assert(io->crop_left < io->crop_right); if (y_end > io->crop_bottom) { @@ -582,11 +662,11 @@ static int SetCropWindow(VP8Io* const io, int y_start, int y_end, if (y_start < io->crop_top) { const int delta = io->crop_top - y_start; y_start = io->crop_top; - *in_data += pixel_stride * delta; + *in_data += delta * pixel_stride; } if (y_start >= y_end) return 0; // Crop window is empty. - *in_data += io->crop_left; + *in_data += io->crop_left * sizeof(uint32_t); io->mb_y = y_start - io->crop_top; io->mb_w = io->crop_right - io->crop_left; @@ -634,10 +714,24 @@ static void ApplyInverseTransforms(VP8LDecoder* const dec, int num_rows, } } +// Special method for paletted alpha data. 
+static void ApplyInverseTransformsAlpha(VP8LDecoder* const dec, int num_rows, + const uint8_t* const rows) { + const int start_row = dec->last_row_; + const int end_row = start_row + num_rows; + const uint8_t* rows_in = rows; + uint8_t* rows_out = (uint8_t*)dec->io_->opaque + dec->io_->width * start_row; + VP8LTransform* const transform = &dec->transforms_[0]; + assert(dec->next_transform_ == 1); + assert(transform->type_ == COLOR_INDEXING_TRANSFORM); + VP8LColorIndexInverseTransformAlpha(transform, start_row, end_row, rows_in, + rows_out); +} + // Processes (transforms, scales & color-converts) the rows decoded after the // last call. static void ProcessRows(VP8LDecoder* const dec, int row) { - const uint32_t* const rows = dec->argb_ + dec->width_ * dec->last_row_; + const uint32_t* const rows = dec->pixels_ + dec->width_ * dec->last_row_; const int num_rows = row - dec->last_row_; if (num_rows <= 0) return; // Nothing to be done. @@ -646,18 +740,18 @@ static void ProcessRows(VP8LDecoder* const dec, int row) { // Emit output. { VP8Io* const io = dec->io_; - const uint32_t* rows_data = dec->argb_cache_; - if (!SetCropWindow(io, dec->last_row_, row, &rows_data, io->width)) { + uint8_t* rows_data = (uint8_t*)dec->argb_cache_; + const int in_stride = io->width * sizeof(uint32_t); // in unit of RGBA + if (!SetCropWindow(io, dec->last_row_, row, &rows_data, in_stride)) { // Nothing to output (this time). } else { const WebPDecBuffer* const output = dec->output_; - const int in_stride = io->width * sizeof(*rows_data); - if (output->colorspace < MODE_YUV) { // convert to RGBA + if (WebPIsRGBMode(output->colorspace)) { // convert to RGBA const WebPRGBABuffer* const buf = &output->u.RGBA; uint8_t* const rgba = buf->rgba + dec->last_out_row_ * buf->stride; const int num_rows_out = io->use_scaling ? - EmitRescaledRows(dec, rows_data, in_stride, io->mb_h, - rgba, buf->stride) : + EmitRescaledRowsRGBA(dec, rows_data, in_stride, io->mb_h, + rgba, buf->stride) : EmitRows(output->colorspace, rows_data, in_stride, io->mb_w, io->mb_h, rgba, buf->stride); // Update 'last_out_row_'. @@ -676,50 +770,317 @@ static void ProcessRows(VP8LDecoder* const dec, int row) { assert(dec->last_row_ <= dec->height_); } -static int DecodeImageData(VP8LDecoder* const dec, - uint32_t* const data, int width, int height, - ProcessRowsFunc process_func) { +// Row-processing for the special case when alpha data contains only one +// transform (color indexing), and trivial non-green literals. +static int Is8bOptimizable(const VP8LMetadata* const hdr) { + int i; + if (hdr->color_cache_size_ > 0) return 0; + // When the Huffman tree contains only one symbol, we can skip the + // call to ReadSymbol() for red/blue/alpha channels. 
+ for (i = 0; i < hdr->num_htree_groups_; ++i) { + HuffmanCode** const htrees = hdr->htree_groups_[i].htrees; + if (htrees[RED][0].bits > 0) return 0; + if (htrees[BLUE][0].bits > 0) return 0; + if (htrees[ALPHA][0].bits > 0) return 0; + } + return 1; +} + +static void ExtractPalettedAlphaRows(VP8LDecoder* const dec, int row) { + const int num_rows = row - dec->last_row_; + const uint8_t* const in = + (uint8_t*)dec->pixels_ + dec->width_ * dec->last_row_; + if (num_rows > 0) { + ApplyInverseTransformsAlpha(dec, num_rows, in); + } + dec->last_row_ = dec->last_out_row_ = row; +} + +//------------------------------------------------------------------------------ +// Helper functions for fast pattern copy (8b and 32b) + +// cyclic rotation of pattern word +static WEBP_INLINE uint32_t Rotate8b(uint32_t V) { +#if defined(WORDS_BIGENDIAN) + return ((V & 0xff000000u) >> 24) | (V << 8); +#else + return ((V & 0xffu) << 24) | (V >> 8); +#endif +} + +// copy 1, 2 or 4-bytes pattern +static WEBP_INLINE void CopySmallPattern8b(const uint8_t* src, uint8_t* dst, + int length, uint32_t pattern) { + int i; + // align 'dst' to 4-bytes boundary. Adjust the pattern along the way. + while ((uintptr_t)dst & 3) { + *dst++ = *src++; + pattern = Rotate8b(pattern); + --length; + } + // Copy the pattern 4 bytes at a time. + for (i = 0; i < (length >> 2); ++i) { + ((uint32_t*)dst)[i] = pattern; + } + // Finish with left-overs. 'pattern' is still correctly positioned, + // so no Rotate8b() call is needed. + for (i <<= 2; i < length; ++i) { + dst[i] = src[i]; + } +} + +static WEBP_INLINE void CopyBlock8b(uint8_t* const dst, int dist, int length) { + const uint8_t* src = dst - dist; + if (length >= 8) { + uint32_t pattern = 0; + switch (dist) { + case 1: + pattern = src[0]; +#if defined(__arm__) || defined(_M_ARM) // arm doesn't like multiply that much + pattern |= pattern << 8; + pattern |= pattern << 16; +#elif defined(WEBP_USE_MIPS_DSP_R2) + __asm__ volatile ("replv.qb %0, %0" : "+r"(pattern)); +#else + pattern = 0x01010101u * pattern; +#endif + break; + case 2: + memcpy(&pattern, src, sizeof(uint16_t)); +#if defined(__arm__) || defined(_M_ARM) + pattern |= pattern << 16; +#elif defined(WEBP_USE_MIPS_DSP_R2) + __asm__ volatile ("replv.ph %0, %0" : "+r"(pattern)); +#else + pattern = 0x00010001u * pattern; +#endif + break; + case 4: + memcpy(&pattern, src, sizeof(uint32_t)); + break; + default: + goto Copy; + break; + } + CopySmallPattern8b(src, dst, length, pattern); + return; + } + Copy: + if (dist >= length) { // no overlap -> use memcpy() + memcpy(dst, src, length * sizeof(*dst)); + } else { + int i; + for (i = 0; i < length; ++i) dst[i] = src[i]; + } +} + +// copy pattern of 1 or 2 uint32_t's +static WEBP_INLINE void CopySmallPattern32b(const uint32_t* src, + uint32_t* dst, + int length, uint64_t pattern) { + int i; + if ((uintptr_t)dst & 4) { // Align 'dst' to 8-bytes boundary. + *dst++ = *src++; + pattern = (pattern >> 32) | (pattern << 32); + --length; + } + assert(0 == ((uintptr_t)dst & 7)); + for (i = 0; i < (length >> 1); ++i) { + ((uint64_t*)dst)[i] = pattern; // Copy the pattern 8 bytes at a time. + } + if (length & 1) { // Finish with left-over. 
+ dst[i << 1] = src[i << 1]; + } +} + +static WEBP_INLINE void CopyBlock32b(uint32_t* const dst, + int dist, int length) { + const uint32_t* const src = dst - dist; + if (dist <= 2 && length >= 4 && ((uintptr_t)dst & 3) == 0) { + uint64_t pattern; + if (dist == 1) { + pattern = (uint64_t)src[0]; + pattern |= pattern << 32; + } else { + memcpy(&pattern, src, sizeof(pattern)); + } + CopySmallPattern32b(src, dst, length, pattern); + } else if (dist >= length) { // no overlap + memcpy(dst, src, length * sizeof(*dst)); + } else { + int i; + for (i = 0; i < length; ++i) dst[i] = src[i]; + } +} + +//------------------------------------------------------------------------------ + +static int DecodeAlphaData(VP8LDecoder* const dec, uint8_t* const data, + int width, int height, int last_row) { int ok = 1; - int col = 0, row = 0; + int row = dec->last_pixel_ / width; + int col = dec->last_pixel_ % width; VP8LBitReader* const br = &dec->br_; VP8LMetadata* const hdr = &dec->hdr_; - HTreeGroup* htree_group = hdr->htree_groups_; - uint32_t* src = data; - uint32_t* last_cached = data; - uint32_t* const src_end = data + width * height; + const HTreeGroup* htree_group = GetHtreeGroupForPos(hdr, col, row); + int pos = dec->last_pixel_; // current position + const int end = width * height; // End of data + const int last = width * last_row; // Last pixel to decode const int len_code_limit = NUM_LITERAL_CODES + NUM_LENGTH_CODES; - const int color_cache_limit = len_code_limit + hdr->color_cache_size_; - VP8LColorCache* const color_cache = - (hdr->color_cache_size_ > 0) ? &hdr->color_cache_ : NULL; const int mask = hdr->huffman_mask_; - assert(htree_group != NULL); + assert(pos < end); + assert(last_row <= height); + assert(Is8bOptimizable(hdr)); - while (!br->eos_ && src < src_end) { + while (!br->eos_ && pos < last) { int code; - // Only update when changing tile. Note we could use the following test: - // if "((((prev_col ^ col) | prev_row ^ row)) > mask)" -> tile changed - // but that's actually slower and requires storing the previous col/row + // Only update when changing tile. if ((col & mask) == 0) { htree_group = GetHtreeGroupForPos(hdr, col, row); } VP8LFillBitWindow(br); - code = ReadSymbol(&htree_group->htrees_[GREEN], br); - if (code < NUM_LITERAL_CODES) { // Literal. 
- int red, green, blue, alpha; - red = ReadSymbol(&htree_group->htrees_[RED], br); - green = code; + code = ReadSymbol(htree_group->htrees[GREEN], br); + if (code < NUM_LITERAL_CODES) { // Literal + data[pos] = code; + ++pos; + ++col; + if (col >= width) { + col = 0; + ++row; + if (row % NUM_ARGB_CACHE_ROWS == 0) { + ExtractPalettedAlphaRows(dec, row); + } + } + } else if (code < len_code_limit) { // Backward reference + int dist_code, dist; + const int length_sym = code - NUM_LITERAL_CODES; + const int length = GetCopyLength(length_sym, br); + const int dist_symbol = ReadSymbol(htree_group->htrees[DIST], br); VP8LFillBitWindow(br); - blue = ReadSymbol(&htree_group->htrees_[BLUE], br); - alpha = ReadSymbol(&htree_group->htrees_[ALPHA], br); - *src = (alpha << 24) + (red << 16) + (green << 8) + blue; - AdvanceByOne: + dist_code = GetCopyDistance(dist_symbol, br); + dist = PlaneCodeToDistance(width, dist_code); + if (pos >= dist && end - pos >= length) { + CopyBlock8b(data + pos, dist, length); + } else { + ok = 0; + goto End; + } + pos += length; + col += length; + while (col >= width) { + col -= width; + ++row; + if (row % NUM_ARGB_CACHE_ROWS == 0) { + ExtractPalettedAlphaRows(dec, row); + } + } + if (pos < last && (col & mask)) { + htree_group = GetHtreeGroupForPos(hdr, col, row); + } + } else { // Not reached + ok = 0; + goto End; + } + assert(br->eos_ == VP8LIsEndOfStream(br)); + } + // Process the remaining rows corresponding to last row-block. + ExtractPalettedAlphaRows(dec, row); + + End: + if (!ok || (br->eos_ && pos < end)) { + ok = 0; + dec->status_ = br->eos_ ? VP8_STATUS_SUSPENDED + : VP8_STATUS_BITSTREAM_ERROR; + } else { + dec->last_pixel_ = pos; + } + return ok; +} + +static void SaveState(VP8LDecoder* const dec, int last_pixel) { + assert(dec->incremental_); + dec->saved_br_ = dec->br_; + dec->saved_last_pixel_ = last_pixel; + if (dec->hdr_.color_cache_size_ > 0) { + VP8LColorCacheCopy(&dec->hdr_.color_cache_, &dec->hdr_.saved_color_cache_); + } +} + +static void RestoreState(VP8LDecoder* const dec) { + assert(dec->br_.eos_); + dec->status_ = VP8_STATUS_SUSPENDED; + dec->br_ = dec->saved_br_; + dec->last_pixel_ = dec->saved_last_pixel_; + if (dec->hdr_.color_cache_size_ > 0) { + VP8LColorCacheCopy(&dec->hdr_.saved_color_cache_, &dec->hdr_.color_cache_); + } +} + +#define SYNC_EVERY_N_ROWS 8 // minimum number of rows between check-points +static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data, + int width, int height, int last_row, + ProcessRowsFunc process_func) { + int row = dec->last_pixel_ / width; + int col = dec->last_pixel_ % width; + VP8LBitReader* const br = &dec->br_; + VP8LMetadata* const hdr = &dec->hdr_; + HTreeGroup* htree_group = GetHtreeGroupForPos(hdr, col, row); + uint32_t* src = data + dec->last_pixel_; + uint32_t* last_cached = src; + uint32_t* const src_end = data + width * height; // End of data + uint32_t* const src_last = data + width * last_row; // Last pixel to decode + const int len_code_limit = NUM_LITERAL_CODES + NUM_LENGTH_CODES; + const int color_cache_limit = len_code_limit + hdr->color_cache_size_; + int next_sync_row = dec->incremental_ ? row : 1 << 24; + VP8LColorCache* const color_cache = + (hdr->color_cache_size_ > 0) ? 
&hdr->color_cache_ : NULL; + const int mask = hdr->huffman_mask_; + assert(htree_group != NULL); + assert(src < src_end); + assert(src_last <= src_end); + + while (src < src_last) { + int code; + if (row >= next_sync_row) { + SaveState(dec, (int)(src - data)); + next_sync_row = row + SYNC_EVERY_N_ROWS; + } + // Only update when changing tile. Note we could use this test: + // if "((((prev_col ^ col) | prev_row ^ row)) > mask)" -> tile changed + // but that's actually slower and needs storing the previous col/row. + if ((col & mask) == 0) htree_group = GetHtreeGroupForPos(hdr, col, row); + if (htree_group->is_trivial_code) { + *src = htree_group->literal_arb; + goto AdvanceByOne; + } + VP8LFillBitWindow(br); + if (htree_group->use_packed_table) { + code = ReadPackedSymbols(htree_group, br, src); + if (code == PACKED_NON_LITERAL_CODE) goto AdvanceByOne; + } else { + code = ReadSymbol(htree_group->htrees[GREEN], br); + } + if (br->eos_) break; // early out + if (code < NUM_LITERAL_CODES) { // Literal + if (htree_group->is_trivial_literal) { + *src = htree_group->literal_arb | (code << 8); + } else { + int red, blue, alpha; + red = ReadSymbol(htree_group->htrees[RED], br); + VP8LFillBitWindow(br); + blue = ReadSymbol(htree_group->htrees[BLUE], br); + alpha = ReadSymbol(htree_group->htrees[ALPHA], br); + if (br->eos_) break; + *src = ((uint32_t)alpha << 24) | (red << 16) | (code << 8) | blue; + } + AdvanceByOne: ++src; ++col; if (col >= width) { col = 0; ++row; - if ((process_func != NULL) && (row % NUM_ARGB_CACHE_ROWS == 0)) { + if ((row % NUM_ARGB_CACHE_ROWS == 0) && (process_func != NULL)) { process_func(dec, row); } if (color_cache != NULL) { @@ -728,40 +1089,39 @@ static int DecodeImageData(VP8LDecoder* const dec, } } } - } else if (code < len_code_limit) { // Backward reference + } else if (code < len_code_limit) { // Backward reference int dist_code, dist; const int length_sym = code - NUM_LITERAL_CODES; const int length = GetCopyLength(length_sym, br); - const int dist_symbol = ReadSymbol(&htree_group->htrees_[DIST], br); + const int dist_symbol = ReadSymbol(htree_group->htrees[DIST], br); VP8LFillBitWindow(br); dist_code = GetCopyDistance(dist_symbol, br); dist = PlaneCodeToDistance(width, dist_code); - if (src - data < dist || src_end - src < length) { - ok = 0; - goto End; - } - { - int i; - for (i = 0; i < length; ++i) src[i] = src[i - dist]; - src += length; + if (br->eos_) break; + if (src - data < (ptrdiff_t)dist || src_end - src < (ptrdiff_t)length) { + goto Error; + } else { + CopyBlock32b(src, dist, length); } + src += length; col += length; while (col >= width) { col -= width; ++row; - if ((process_func != NULL) && (row % NUM_ARGB_CACHE_ROWS == 0)) { + if ((row % NUM_ARGB_CACHE_ROWS == 0) && (process_func != NULL)) { process_func(dec, row); } } - if (src < src_end) { - htree_group = GetHtreeGroupForPos(hdr, col, row); - if (color_cache != NULL) { - while (last_cached < src) { - VP8LColorCacheInsert(color_cache, *last_cached++); - } + // Because of the check done above (before 'src' was incremented by + // 'length'), the following holds true. + assert(src <= src_end); + if (col & mask) htree_group = GetHtreeGroupForPos(hdr, col, row); + if (color_cache != NULL) { + while (last_cached < src) { + VP8LColorCacheInsert(color_cache, *last_cached++); } } - } else if (code < color_cache_limit) { // Color cache. 
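
/*
 * Illustrative sketch (not part of the patch): how DecodeImageData() and
 * DecodeAlphaData() interpret a symbol read from the GREEN Huffman tree.
 * The constants mirror format_constants.h (NUM_LITERAL_CODES = 256,
 * NUM_LENGTH_CODES = 24 in the lossless format); 'ClassifySymbol' is a
 * hypothetical helper, not a libwebp function.
 */
enum SymbolKind { SYMBOL_LITERAL, SYMBOL_BACKREF, SYMBOL_CACHE, SYMBOL_INVALID };

static enum SymbolKind ClassifySymbol(int code, int color_cache_size) {
  const int kNumLiteralCodes = 256;                 /* green value or palette index */
  const int kLenCodeLimit = kNumLiteralCodes + 24;  /* + NUM_LENGTH_CODES */
  if (code < kNumLiteralCodes) return SYMBOL_LITERAL;
  if (code < kLenCodeLimit) return SYMBOL_BACKREF;  /* LZ77 length prefix follows */
  if (code < kLenCodeLimit + color_cache_size) return SYMBOL_CACHE;
  return SYMBOL_INVALID;                            /* the "Not reached" branch */
}
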
+ } else if (code < color_cache_limit) { // Color cache const int key = code - len_code_limit; assert(color_cache != NULL); while (last_cached < src) { @@ -769,33 +1129,38 @@ static int DecodeImageData(VP8LDecoder* const dec, } *src = VP8LColorCacheLookup(color_cache, key); goto AdvanceByOne; - } else { // Not reached. - ok = 0; - goto End; + } else { // Not reached + goto Error; } - ok = !br->error_; - if (!ok) goto End; + assert(br->eos_ == VP8LIsEndOfStream(br)); } - // Process the remaining rows corresponding to last row-block. - if (process_func != NULL) process_func(dec, row); - End: - if (br->error_ || !ok || (br->eos_ && src < src_end)) { - ok = 0; - dec->status_ = (!br->eos_) ? - VP8_STATUS_BITSTREAM_ERROR : VP8_STATUS_SUSPENDED; - } else if (src == src_end) { - dec->state_ = READ_DATA; + if (dec->incremental_ && br->eos_ && src < src_end) { + RestoreState(dec); + } else if (!br->eos_) { + // Process the remaining rows corresponding to last row-block. + if (process_func != NULL) { + process_func(dec, row); + } + dec->status_ = VP8_STATUS_OK; + dec->last_pixel_ = (int)(src - data); // end-of-scan marker + } else { + // if not incremental, and we are past the end of buffer (eos_=1), then this + // is a real bitstream error. + goto Error; } + return 1; - return ok; + Error: + dec->status_ = VP8_STATUS_BITSTREAM_ERROR; + return 0; } // ----------------------------------------------------------------------------- // VP8LTransform static void ClearTransform(VP8LTransform* const transform) { - free(transform->data_); + WebPSafeFree(transform->data_); transform->data_ = NULL; } @@ -819,7 +1184,7 @@ static int ExpandColorMap(int num_colors, VP8LTransform* const transform) { } for (; i < 4 * final_num_colors; ++i) new_data[i] = 0; // black tail. - free(transform->data_); + WebPSafeFree(transform->data_); transform->data_ = new_color_map; } return 1; @@ -882,16 +1247,18 @@ static int ReadTransform(int* const xsize, int const* ysize, // VP8LMetadata static void InitMetadata(VP8LMetadata* const hdr) { - assert(hdr); + assert(hdr != NULL); memset(hdr, 0, sizeof(*hdr)); } static void ClearMetadata(VP8LMetadata* const hdr) { - assert(hdr); + assert(hdr != NULL); - free(hdr->huffman_image_); - DeleteHtreeGroups(hdr->htree_groups_, hdr->num_htree_groups_); + WebPSafeFree(hdr->huffman_image_); + WebPSafeFree(hdr->huffman_tables_); + VP8LHtreeGroupsFree(hdr->htree_groups_); VP8LColorCacheClear(&hdr->color_cache_); + VP8LColorCacheClear(&hdr->saved_color_cache_); InitMetadata(hdr); } @@ -899,11 +1266,13 @@ static void ClearMetadata(VP8LMetadata* const hdr) { // VP8LDecoder VP8LDecoder* VP8LNew(void) { - VP8LDecoder* const dec = (VP8LDecoder*)calloc(1, sizeof(*dec)); + VP8LDecoder* const dec = (VP8LDecoder*)WebPSafeCalloc(1ULL, sizeof(*dec)); if (dec == NULL) return NULL; dec->status_ = VP8_STATUS_OK; - dec->action_ = READ_DIM; dec->state_ = READ_DIM; + + VP8LDspInit(); // Init critical function pointers. 
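
/*
 * Illustrative sketch (not part of the patch): the color cache consulted by
 * the color-cache branch above is a small direct-mapped table indexed by a
 * multiplicative hash of the ARGB value. The 0x1e35a7bd multiplier is the
 * one given in the WebP lossless bitstream specification (stated here as an
 * assumption); the names below are simplified stand-ins for VP8LColorCache.
 */
#include <stdint.h>

typedef struct { uint32_t* colors; int hash_bits; } SimpleColorCache;

static uint32_t CacheKey(uint32_t argb, int hash_bits) {
  return (0x1e35a7bdU * argb) >> (32 - hash_bits);
}
static void CacheInsert(SimpleColorCache* const cc, uint32_t argb) {
  cc->colors[CacheKey(argb, cc->hash_bits)] = argb;
}
static uint32_t CacheLookup(const SimpleColorCache* const cc, uint32_t key) {
  return cc->colors[key];  /* 'key' is (code - len_code_limit), as in the decoder */
}
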
+ return dec; } @@ -912,15 +1281,15 @@ void VP8LClear(VP8LDecoder* const dec) { if (dec == NULL) return; ClearMetadata(&dec->hdr_); - free(dec->argb_); - dec->argb_ = NULL; + WebPSafeFree(dec->pixels_); + dec->pixels_ = NULL; for (i = 0; i < dec->next_transform_; ++i) { ClearTransform(&dec->transforms_[i]); } dec->next_transform_ = 0; dec->transforms_seen_ = 0; - free(dec->rescaler_memory); + WebPSafeFree(dec->rescaler_memory); dec->rescaler_memory = NULL; dec->output_ = NULL; // leave no trace behind @@ -929,7 +1298,7 @@ void VP8LClear(VP8LDecoder* const dec) { void VP8LDelete(VP8LDecoder* const dec) { if (dec != NULL) { VP8LClear(dec); - free(dec); + WebPSafeFree(dec); } } @@ -1009,19 +1378,14 @@ static int DecodeImageStream(int xsize, int ysize, } // Use the Huffman trees to decode the LZ77 encoded data. - ok = DecodeImageData(dec, data, transform_xsize, transform_ysize, NULL); - ok = ok && !br->error_; + ok = DecodeImageData(dec, data, transform_xsize, transform_ysize, + transform_ysize, NULL); + ok = ok && !br->eos_; End: - if (!ok) { - free(data); + WebPSafeFree(data); ClearMetadata(hdr); - // If not enough data (br.eos_) resulted in BIT_STREAM_ERROR, update the - // status appropriately. - if (dec->status_ == VP8_STATUS_BITSTREAM_ERROR && dec->br_.eos_) { - dec->status_ = VP8_STATUS_SUSPENDED; - } } else { if (decoded_data != NULL) { *decoded_data = data; @@ -1031,41 +1395,52 @@ static int DecodeImageStream(int xsize, int ysize, assert(data == NULL); assert(is_level0); } + dec->last_pixel_ = 0; // Reset for future DECODE_DATA_FUNC() calls. if (!is_level0) ClearMetadata(hdr); // Clean up temporary data behind. } return ok; } //------------------------------------------------------------------------------ -// Allocate dec->argb_ and dec->argb_cache_ using dec->width_ and dec->height_ - -static int AllocateARGBBuffers(VP8LDecoder* const dec, int final_width) { +// Allocate internal buffers dec->pixels_ and dec->argb_cache_. +static int AllocateInternalBuffers32b(VP8LDecoder* const dec, int final_width) { const uint64_t num_pixels = (uint64_t)dec->width_ * dec->height_; // Scratch buffer corresponding to top-prediction row for transforming the - // first row in the row-blocks. - const uint64_t cache_top_pixels = final_width; - // Scratch buffer for temporary BGRA storage. + // first row in the row-blocks. Not needed for paletted alpha. + const uint64_t cache_top_pixels = (uint16_t)final_width; + // Scratch buffer for temporary BGRA storage. Not needed for paletted alpha. 
const uint64_t cache_pixels = (uint64_t)final_width * NUM_ARGB_CACHE_ROWS; const uint64_t total_num_pixels = num_pixels + cache_top_pixels + cache_pixels; assert(dec->width_ <= final_width); - dec->argb_ = (uint32_t*)WebPSafeMalloc(total_num_pixels, sizeof(*dec->argb_)); - if (dec->argb_ == NULL) { + dec->pixels_ = (uint32_t*)WebPSafeMalloc(total_num_pixels, sizeof(uint32_t)); + if (dec->pixels_ == NULL) { dec->argb_cache_ = NULL; // for sanity check dec->status_ = VP8_STATUS_OUT_OF_MEMORY; return 0; } - dec->argb_cache_ = dec->argb_ + num_pixels + cache_top_pixels; + dec->argb_cache_ = dec->pixels_ + num_pixels + cache_top_pixels; + return 1; +} + +static int AllocateInternalBuffers8b(VP8LDecoder* const dec) { + const uint64_t total_num_pixels = (uint64_t)dec->width_ * dec->height_; + dec->argb_cache_ = NULL; // for sanity check + dec->pixels_ = (uint32_t*)WebPSafeMalloc(total_num_pixels, sizeof(uint8_t)); + if (dec->pixels_ == NULL) { + dec->status_ = VP8_STATUS_OUT_OF_MEMORY; + return 0; + } return 1; } //------------------------------------------------------------------------------ -// Special row-processing that only stores the alpha data. +// Special row-processing that only stores the alpha data. static void ExtractAlphaRows(VP8LDecoder* const dec, int row) { const int num_rows = row - dec->last_row_; - const uint32_t* const in = dec->argb_ + dec->width_ * dec->last_row_; + const uint32_t* const in = dec->pixels_ + dec->width_ * dec->last_row_; if (num_rows <= 0) return; // Nothing to be done. ApplyInverseTransforms(dec, num_rows, in); @@ -1079,44 +1454,77 @@ static void ExtractAlphaRows(VP8LDecoder* const dec, int row) { int i; for (i = 0; i < cache_pixs; ++i) dst[i] = (src[i] >> 8) & 0xff; } - dec->last_row_ = dec->last_out_row_ = row; } -int VP8LDecodeAlphaImageStream(int width, int height, const uint8_t* const data, - size_t data_size, uint8_t* const output) { - VP8Io io; +int VP8LDecodeAlphaHeader(ALPHDecoder* const alph_dec, + const uint8_t* const data, size_t data_size, + uint8_t* const output) { int ok = 0; - VP8LDecoder* const dec = VP8LNew(); - if (dec == NULL) return 0; - - dec->width_ = width; - dec->height_ = height; - dec->io_ = &io; + VP8LDecoder* dec; + VP8Io* io; + assert(alph_dec != NULL); + alph_dec->vp8l_dec_ = VP8LNew(); + if (alph_dec->vp8l_dec_ == NULL) return 0; + dec = alph_dec->vp8l_dec_; + + dec->width_ = alph_dec->width_; + dec->height_ = alph_dec->height_; + dec->io_ = &alph_dec->io_; + io = dec->io_; - VP8InitIo(&io); - WebPInitCustomIo(NULL, &io); // Just a sanity Init. io won't be used. - io.opaque = output; - io.width = width; - io.height = height; + VP8InitIo(io); + WebPInitCustomIo(NULL, io); // Just a sanity Init. io won't be used. + io->opaque = output; + io->width = alph_dec->width_; + io->height = alph_dec->height_; dec->status_ = VP8_STATUS_OK; VP8LInitBitReader(&dec->br_, data, data_size); - dec->action_ = READ_HDR; - if (!DecodeImageStream(width, height, 1, dec, NULL)) goto Err; + if (!DecodeImageStream(alph_dec->width_, alph_dec->height_, 1, dec, NULL)) { + goto Err; + } - // Allocate output (note that dec->width_ may have changed here). - if (!AllocateARGBBuffers(dec, width)) goto Err; + // Special case: if alpha data uses only the color indexing transform and + // doesn't use color cache (a frequent case), we will use DecodeAlphaData() + // method that only needs allocation of 1 byte per pixel (alpha channel). 
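
/*
 * Illustrative sketch (not part of the patch): buffer sizes implied by the
 * two allocators above. Paletted alpha stores one byte per pixel and needs
 * no BGRA scratch rows; otherwise the decoder stores 4-byte BGRA pixels
 * plus one top-prediction row and NUM_ARGB_CACHE_ROWS scratch rows
 * (16 is assumed here for NUM_ARGB_CACHE_ROWS, purely for the arithmetic).
 */
#include <stdint.h>

static uint64_t InternalBufferBytes(int width, int height,
                                    int final_width, int use_8b_decode) {
  const uint64_t num_pixels = (uint64_t)width * height;
  if (use_8b_decode) {
    return num_pixels * sizeof(uint8_t);      /* ~ AllocateInternalBuffers8b() */
  } else {
    const int kCacheRows = 16;                /* assumption: NUM_ARGB_CACHE_ROWS */
    const uint64_t scratch = (uint64_t)final_width * (1 + kCacheRows);
    return (num_pixels + scratch) * sizeof(uint32_t);  /* ~ ...Buffers32b() */
  }
}
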
+ if (dec->next_transform_ == 1 && + dec->transforms_[0].type_ == COLOR_INDEXING_TRANSFORM && + Is8bOptimizable(&dec->hdr_)) { + alph_dec->use_8b_decode = 1; + ok = AllocateInternalBuffers8b(dec); + } else { + // Allocate internal buffers (note that dec->width_ may have changed here). + alph_dec->use_8b_decode = 0; + ok = AllocateInternalBuffers32b(dec, alph_dec->width_); + } - // Decode (with special row processing). - dec->action_ = READ_DATA; - ok = DecodeImageData(dec, dec->argb_, dec->width_, dec->height_, - ExtractAlphaRows); + if (!ok) goto Err; + + return 1; Err: - VP8LDelete(dec); - return ok; + VP8LDelete(alph_dec->vp8l_dec_); + alph_dec->vp8l_dec_ = NULL; + return 0; +} + +int VP8LDecodeAlphaImageStream(ALPHDecoder* const alph_dec, int last_row) { + VP8LDecoder* const dec = alph_dec->vp8l_dec_; + assert(dec != NULL); + assert(last_row <= dec->height_); + + if (dec->last_pixel_ == dec->width_ * dec->height_) { + return 1; // done + } + + // Decode (with special row processing). + return alph_dec->use_8b_decode ? + DecodeAlphaData(dec, (uint8_t*)dec->pixels_, dec->width_, dec->height_, + last_row) : + DecodeImageData(dec, dec->pixels_, dec->width_, dec->height_, + last_row, ExtractAlphaRows); } //------------------------------------------------------------------------------ @@ -1141,14 +1549,13 @@ int VP8LDecodeHeader(VP8LDecoder* const dec, VP8Io* const io) { io->width = width; io->height = height; - dec->action_ = READ_HDR; if (!DecodeImageStream(width, height, 1, dec, NULL)) goto Error; return 1; Error: - VP8LClear(dec); - assert(dec->status_ != VP8_STATUS_OK); - return 0; + VP8LClear(dec); + assert(dec->status_ != VP8_STATUS_OK); + return 0; } int VP8LDecodeImage(VP8LDecoder* const dec) { @@ -1158,33 +1565,57 @@ int VP8LDecodeImage(VP8LDecoder* const dec) { // Sanity checks. if (dec == NULL) return 0; + assert(dec->hdr_.huffman_tables_ != NULL); + assert(dec->hdr_.htree_groups_ != NULL); + assert(dec->hdr_.num_htree_groups_ > 0); + io = dec->io_; assert(io != NULL); params = (WebPDecParams*)io->opaque; assert(params != NULL); - dec->output_ = params->output; - assert(dec->output_ != NULL); // Initialization. - if (!WebPIoInitFromOptions(params->options, io, MODE_BGRA)) { - dec->status_ = VP8_STATUS_INVALID_PARAM; - goto Err; - } + if (dec->state_ != READ_DATA) { + dec->output_ = params->output; + assert(dec->output_ != NULL); + + if (!WebPIoInitFromOptions(params->options, io, MODE_BGRA)) { + dec->status_ = VP8_STATUS_INVALID_PARAM; + goto Err; + } - if (!AllocateARGBBuffers(dec, io->width)) goto Err; + if (!AllocateInternalBuffers32b(dec, io->width)) goto Err; - if (io->use_scaling && !AllocateAndInitRescaler(dec, io)) goto Err; + if (io->use_scaling && !AllocateAndInitRescaler(dec, io)) goto Err; + + if (io->use_scaling || WebPIsPremultipliedMode(dec->output_->colorspace)) { + // need the alpha-multiply functions for premultiplied output or rescaling + WebPInitAlphaProcessing(); + } + if (!WebPIsRGBMode(dec->output_->colorspace)) { + WebPInitConvertARGBToYUV(); + if (dec->output_->u.YUVA.a != NULL) WebPInitAlphaProcessing(); + } + if (dec->incremental_) { + if (dec->hdr_.color_cache_size_ > 0 && + dec->hdr_.saved_color_cache_.colors_ == NULL) { + if (!VP8LColorCacheInit(&dec->hdr_.saved_color_cache_, + dec->hdr_.color_cache_.hash_bits_)) { + dec->status_ = VP8_STATUS_OUT_OF_MEMORY; + goto Err; + } + } + } + dec->state_ = READ_DATA; + } // Decode. 
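
/*
 * Illustrative sketch (not part of the patch): the checkpoint/rollback
 * pattern behind incremental decoding. When incremental_ is set,
 * DecodeImageData() snapshots the bit reader and the last decoded pixel
 * every SYNC_EVERY_N_ROWS rows (SaveState); if the bit reader runs out of
 * data mid-image, RestoreState() rolls back to that snapshot and the call
 * reports VP8_STATUS_SUSPENDED so decoding can resume when more bytes
 * arrive. The real code also snapshots the color cache, omitted here;
 * the types below are simplified stand-ins for the decoder state.
 */
typedef struct { int bit_pos; } FakeBitReader;   /* stands in for VP8LBitReader */
typedef struct {
  FakeBitReader br, saved_br;
  int last_pixel, saved_last_pixel;
} FakeDecoder;

static void Checkpoint(FakeDecoder* const dec) {  /* ~ SaveState() */
  dec->saved_br = dec->br;
  dec->saved_last_pixel = dec->last_pixel;
}
static void Rollback(FakeDecoder* const dec) {    /* ~ RestoreState() */
  dec->br = dec->saved_br;
  dec->last_pixel = dec->saved_last_pixel;
}
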
- dec->action_ = READ_DATA; - if (!DecodeImageData(dec, dec->argb_, dec->width_, dec->height_, - ProcessRows)) { + if (!DecodeImageData(dec, dec->pixels_, dec->width_, dec->height_, + dec->height_, ProcessRows)) { goto Err; } - // Cleanup. params->last_y = dec->last_out_row_; - VP8LClear(dec); return 1; Err: @@ -1194,7 +1625,3 @@ int VP8LDecodeImage(VP8LDecoder* const dec) { } //------------------------------------------------------------------------------ - -#if defined(__cplusplus) || defined(c_plusplus) -} // extern "C" -#endif diff --git a/drivers/webp/dec/vp8li.h b/drivers/webp/dec/vp8li.h index 5f6cd6a01c..8886e47f62 100644 --- a/drivers/webp/dec/vp8li.h +++ b/drivers/webp/dec/vp8li.h @@ -1,8 +1,10 @@ // Copyright 2012 Google Inc. All Rights Reserved. // -// This code is licensed under the same terms as WebM: -// Software License Agreement: http://www.webmproject.org/license/software/ -// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. // ----------------------------------------------------------------------------- // // Lossless decoder: internal header. @@ -18,9 +20,8 @@ #include "../utils/bit_reader.h" #include "../utils/color_cache.h" #include "../utils/huffman.h" -#include "../format_constants.h" -#if defined(__cplusplus) || defined(c_plusplus) +#ifdef __cplusplus extern "C" { #endif @@ -40,12 +41,9 @@ struct VP8LTransform { }; typedef struct { - HuffmanTree htrees_[HUFFMAN_CODES_PER_META_CODE]; -} HTreeGroup; - -typedef struct { int color_cache_size_; VP8LColorCache color_cache_; + VP8LColorCache saved_color_cache_; // for incremental int huffman_mask_; int huffman_subsample_bits_; @@ -53,24 +51,32 @@ typedef struct { uint32_t *huffman_image_; int num_htree_groups_; HTreeGroup *htree_groups_; + HuffmanCode *huffman_tables_; } VP8LMetadata; -typedef struct { +typedef struct VP8LDecoder VP8LDecoder; +struct VP8LDecoder { VP8StatusCode status_; - VP8LDecodeState action_; VP8LDecodeState state_; VP8Io *io_; const WebPDecBuffer *output_; // shortcut to io->opaque->output - uint32_t *argb_; // Internal data: always in BGRA color mode. + uint32_t *pixels_; // Internal data: either uint8_t* for alpha + // or uint32_t* for BGRA. uint32_t *argb_cache_; // Scratch buffer for temporary BGRA storage. VP8LBitReader br_; + int incremental_; // if true, incremental decoding is expected + VP8LBitReader saved_br_; // note: could be local variables too + int saved_last_pixel_; int width_; int height_; int last_row_; // last input row decoded so far. + int last_pixel_; // last pixel decoded so far. However, it may + // not be transformed, scaled and + // color-converted yet. int last_out_row_; // last row output so far. VP8LMetadata hdr_; @@ -82,18 +88,27 @@ typedef struct { uint8_t *rescaler_memory; // Working memory for rescaling work. WebPRescaler *rescaler; // Common rescaler for all channels. -} VP8LDecoder; +}; //------------------------------------------------------------------------------ // internal functions. Not public. +struct ALPHDecoder; // Defined in dec/alphai.h. + // in vp8l.c -// Decodes a raw image stream (without header) and store the alpha data -// into *output, which must be of size width x height. 
Returns false in case -// of error. -int VP8LDecodeAlphaImageStream(int width, int height, const uint8_t* const data, - size_t data_size, uint8_t* const output); +// Decodes image header for alpha data stored using lossless compression. +// Returns false in case of error. +int VP8LDecodeAlphaHeader(struct ALPHDecoder* const alph_dec, + const uint8_t* const data, size_t data_size, + uint8_t* const output); + +// Decodes *at least* 'last_row' rows of alpha. If some of the initial rows are +// already decoded in previous call(s), it will resume decoding from where it +// was paused. +// Returns false in case of bitstream error. +int VP8LDecodeAlphaImageStream(struct ALPHDecoder* const alph_dec, + int last_row); // Allocates and initialize a new lossless decoder instance. VP8LDecoder* VP8LNew(void); @@ -114,7 +129,7 @@ void VP8LDelete(VP8LDecoder* const dec); //------------------------------------------------------------------------------ -#if defined(__cplusplus) || defined(c_plusplus) +#ifdef __cplusplus } // extern "C" #endif diff --git a/drivers/webp/dec/webp.c b/drivers/webp/dec/webp.c index f44bc2b8ae..952178fa89 100644 --- a/drivers/webp/dec/webp.c +++ b/drivers/webp/dec/webp.c @@ -1,8 +1,10 @@ // Copyright 2010 Google Inc. All Rights Reserved. // -// This code is licensed under the same terms as WebM: -// Software License Agreement: http://www.webmproject.org/license/software/ -// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. // ----------------------------------------------------------------------------- // // Main decoding functions for WEBP images. @@ -14,11 +16,8 @@ #include "./vp8i.h" #include "./vp8li.h" #include "./webpi.h" -#include "../format_constants.h" - -#if defined(__cplusplus) || defined(c_plusplus) -extern "C" { -#endif +#include "../utils/utils.h" +#include "../webp/mux_types.h" // ALPHA_FLAG //------------------------------------------------------------------------------ // RIFF layout is: @@ -40,27 +39,20 @@ extern "C" { // 20..23 VP8X flags bit-map corresponding to the chunk-types present. // 24..26 Width of the Canvas Image. // 27..29 Height of the Canvas Image. -// There can be extra chunks after the "VP8X" chunk (ICCP, TILE, FRM, VP8, -// META ...) +// There can be extra chunks after the "VP8X" chunk (ICCP, FRGM, ANMF, VP8, +// VP8L, XMP, EXIF ...) // All sizes are in little-endian order. // Note: chunk data size must be padded to multiple of 2 when written. -static WEBP_INLINE uint32_t get_le24(const uint8_t* const data) { - return data[0] | (data[1] << 8) | (data[2] << 16); -} - -static WEBP_INLINE uint32_t get_le32(const uint8_t* const data) { - return (uint32_t)get_le24(data) | (data[3] << 24); -} - // Validates the RIFF container (if detected) and skips over it. -// If a RIFF container is detected, -// Returns VP8_STATUS_BITSTREAM_ERROR for invalid header, and -// VP8_STATUS_OK otherwise. +// If a RIFF container is detected, returns: +// VP8_STATUS_BITSTREAM_ERROR for invalid header, +// VP8_STATUS_NOT_ENOUGH_DATA for truncated data if have_all_data is true, +// and VP8_STATUS_OK otherwise. // In case there are not enough bytes (partial RIFF container), return 0 for // *riff_size. 
Else return the RIFF size extracted from the header. static VP8StatusCode ParseRIFF(const uint8_t** const data, - size_t* const data_size, + size_t* const data_size, int have_all_data, size_t* const riff_size) { assert(data != NULL); assert(data_size != NULL); @@ -71,11 +63,17 @@ static VP8StatusCode ParseRIFF(const uint8_t** const data, if (memcmp(*data + 8, "WEBP", TAG_SIZE)) { return VP8_STATUS_BITSTREAM_ERROR; // Wrong image file signature. } else { - const uint32_t size = get_le32(*data + TAG_SIZE); + const uint32_t size = GetLE32(*data + TAG_SIZE); // Check that we have at least one chunk (i.e "WEBP" + "VP8?nnnn"). if (size < TAG_SIZE + CHUNK_HEADER_SIZE) { return VP8_STATUS_BITSTREAM_ERROR; } + if (size > MAX_CHUNK_PAYLOAD) { + return VP8_STATUS_BITSTREAM_ERROR; + } + if (have_all_data && (size > *data_size - CHUNK_HEADER_SIZE)) { + return VP8_STATUS_NOT_ENOUGH_DATA; // Truncated bitstream. + } // We have a RIFF container. Skip it. *riff_size = size; *data += RIFF_HEADER_SIZE; @@ -111,7 +109,7 @@ static VP8StatusCode ParseVP8X(const uint8_t** const data, if (!memcmp(*data, "VP8X", TAG_SIZE)) { int width, height; uint32_t flags; - const uint32_t chunk_size = get_le32(*data + TAG_SIZE); + const uint32_t chunk_size = GetLE32(*data + TAG_SIZE); if (chunk_size != VP8X_CHUNK_SIZE) { return VP8_STATUS_BITSTREAM_ERROR; // Wrong chunk size. } @@ -120,9 +118,9 @@ static VP8StatusCode ParseVP8X(const uint8_t** const data, if (*data_size < vp8x_size) { return VP8_STATUS_NOT_ENOUGH_DATA; // Insufficient data. } - flags = get_le32(*data + 8); - width = 1 + get_le24(*data + 12); - height = 1 + get_le24(*data + 15); + flags = GetLE32(*data + 8); + width = 1 + GetLE24(*data + 12); + height = 1 + GetLE24(*data + 15); if (width * (uint64_t)height >= MAX_IMAGE_AREA) { return VP8_STATUS_BITSTREAM_ERROR; // image is too large } @@ -176,7 +174,10 @@ static VP8StatusCode ParseOptionalChunks(const uint8_t** const data, return VP8_STATUS_NOT_ENOUGH_DATA; } - chunk_size = get_le32(buf + TAG_SIZE); + chunk_size = GetLE32(buf + TAG_SIZE); + if (chunk_size > MAX_CHUNK_PAYLOAD) { + return VP8_STATUS_BITSTREAM_ERROR; // Not a valid chunk size. + } // For odd-sized chunk-payload, there's one byte padding at the end. disk_chunk_size = (CHUNK_HEADER_SIZE + chunk_size + 1) & ~1; total_size += disk_chunk_size; @@ -186,6 +187,15 @@ static VP8StatusCode ParseOptionalChunks(const uint8_t** const data, return VP8_STATUS_BITSTREAM_ERROR; // Not a valid chunk size. } + // Start of a (possibly incomplete) VP8/VP8L chunk implies that we have + // parsed all the optional chunks. + // Note: This check must occur before the check 'buf_size < disk_chunk_size' + // below to allow incomplete VP8/VP8L chunks. + if (!memcmp(buf, "VP8 ", TAG_SIZE) || + !memcmp(buf, "VP8L", TAG_SIZE)) { + return VP8_STATUS_OK; + } + if (buf_size < disk_chunk_size) { // Insufficient data. return VP8_STATUS_NOT_ENOUGH_DATA; } @@ -193,9 +203,6 @@ static VP8StatusCode ParseOptionalChunks(const uint8_t** const data, if (!memcmp(buf, "ALPH", TAG_SIZE)) { // A valid ALPH header. *alpha_data = buf + CHUNK_HEADER_SIZE; *alpha_size = chunk_size; - } else if (!memcmp(buf, "VP8 ", TAG_SIZE) || - !memcmp(buf, "VP8L", TAG_SIZE)) { // A valid VP8/VP8L header. - return VP8_STATUS_OK; // Found. } // We have a full and valid chunk; skip it. @@ -213,9 +220,8 @@ static VP8StatusCode ParseOptionalChunks(const uint8_t** const data, // extracted from the VP8/VP8L chunk header. // The flag '*is_lossless' is set to 1 in case of VP8L chunk / raw VP8L data. 
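
/*
 * Illustrative sketch (not part of the patch): the GetLE24()/GetLE32()
 * calls that replace the removed local get_le24()/get_le32() simply
 * assemble little-endian values byte by byte. Per the RIFF layout comment
 * earlier in this file, a VP8X canvas size is stored minus one in 3-byte
 * fields at file offsets 24..26 (width) and 27..29 (height), with the
 * flags word at 20..23. The helper names here are hypothetical.
 */
#include <stdint.h>
#include <stddef.h>

static uint32_t LE24(const uint8_t* const p) {
  return p[0] | (p[1] << 8) | ((uint32_t)p[2] << 16);
}
static uint32_t LE32(const uint8_t* const p) {
  return LE24(p) | ((uint32_t)p[3] << 24);
}

/* Reads flags and canvas size from a complete "RIFF....WEBPVP8X...." header. */
static int ReadVP8XCanvasSize(const uint8_t* const data, size_t size,
                              uint32_t* const flags,
                              int* const width, int* const height) {
  if (size < 30) return 0;
  *flags  = LE32(data + 20);
  *width  = 1 + (int)LE24(data + 24);
  *height = 1 + (int)LE24(data + 27);
  return 1;
}
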
static VP8StatusCode ParseVP8Header(const uint8_t** const data_ptr, - size_t* const data_size, - size_t riff_size, - size_t* const chunk_size, + size_t* const data_size, int have_all_data, + size_t riff_size, size_t* const chunk_size, int* const is_lossless) { const uint8_t* const data = *data_ptr; const int is_vp8 = !memcmp(data, "VP8 ", TAG_SIZE); @@ -234,10 +240,13 @@ static VP8StatusCode ParseVP8Header(const uint8_t** const data_ptr, if (is_vp8 || is_vp8l) { // Bitstream contains VP8/VP8L header. - const uint32_t size = get_le32(data + TAG_SIZE); + const uint32_t size = GetLE32(data + TAG_SIZE); if ((riff_size >= minimal_size) && (size > riff_size - minimal_size)) { return VP8_STATUS_BITSTREAM_ERROR; // Inconsistent size information. } + if (have_all_data && (size > *data_size - CHUNK_HEADER_SIZE)) { + return VP8_STATUS_NOT_ENOUGH_DATA; // Truncated bitstream. + } // Skip over CHUNK_HEADER_SIZE bytes from VP8/VP8L Header. *chunk_size = size; *data_ptr += CHUNK_HEADER_SIZE; @@ -270,9 +279,19 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data, int* const width, int* const height, int* const has_alpha, + int* const has_animation, + int* const format, WebPHeaderStructure* const headers) { + int canvas_width = 0; + int canvas_height = 0; + int image_width = 0; + int image_height = 0; int found_riff = 0; int found_vp8x = 0; + int animation_present = 0; + int fragments_present = 0; + const int have_all_data = (headers != NULL) ? headers->have_all_data : 0; + VP8StatusCode status; WebPHeaderStructure hdrs; @@ -284,7 +303,7 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data, hdrs.data_size = data_size; // Skip over RIFF header. - status = ParseRIFF(&data, &data_size, &hdrs.riff_size); + status = ParseRIFF(&data, &data_size, have_all_data, &hdrs.riff_size); if (status != VP8_STATUS_OK) { return status; // Wrong RIFF header / insufficient data. } @@ -293,22 +312,35 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data, // Skip over VP8X. { uint32_t flags = 0; - status = ParseVP8X(&data, &data_size, &found_vp8x, width, height, &flags); + status = ParseVP8X(&data, &data_size, &found_vp8x, + &canvas_width, &canvas_height, &flags); if (status != VP8_STATUS_OK) { return status; // Wrong VP8X / insufficient data. } + animation_present = !!(flags & ANIMATION_FLAG); + fragments_present = !!(flags & FRAGMENTS_FLAG); if (!found_riff && found_vp8x) { // Note: This restriction may be removed in the future, if it becomes // necessary to send VP8X chunk to the decoder. return VP8_STATUS_BITSTREAM_ERROR; } - if (has_alpha != NULL) *has_alpha = !!(flags & ALPHA_FLAG_BIT); - if (found_vp8x && headers == NULL) { - return VP8_STATUS_OK; // Return features from VP8X header. + if (has_alpha != NULL) *has_alpha = !!(flags & ALPHA_FLAG); + if (has_animation != NULL) *has_animation = animation_present; + if (format != NULL) *format = 0; // default = undefined + + image_width = canvas_width; + image_height = canvas_height; + if (found_vp8x && (animation_present || fragments_present) && + headers == NULL) { + status = VP8_STATUS_OK; + goto ReturnWidthHeight; // Just return features from VP8X header. } } - if (data_size < TAG_SIZE) return VP8_STATUS_NOT_ENOUGH_DATA; + if (data_size < TAG_SIZE) { + status = VP8_STATUS_NOT_ENOUGH_DATA; + goto ReturnWidthHeight; + } // Skip over optional chunks if data started with "RIFF + VP8X" or "ALPH". 
if ((found_riff && found_vp8x) || @@ -316,43 +348,49 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data, status = ParseOptionalChunks(&data, &data_size, hdrs.riff_size, &hdrs.alpha_data, &hdrs.alpha_data_size); if (status != VP8_STATUS_OK) { - return status; // Found an invalid chunk size / insufficient data. + goto ReturnWidthHeight; // Invalid chunk size / insufficient data. } } // Skip over VP8/VP8L header. - status = ParseVP8Header(&data, &data_size, hdrs.riff_size, + status = ParseVP8Header(&data, &data_size, have_all_data, hdrs.riff_size, &hdrs.compressed_size, &hdrs.is_lossless); if (status != VP8_STATUS_OK) { - return status; // Wrong VP8/VP8L chunk-header / insufficient data. + goto ReturnWidthHeight; // Wrong VP8/VP8L chunk-header / insufficient data. } if (hdrs.compressed_size > MAX_CHUNK_PAYLOAD) { return VP8_STATUS_BITSTREAM_ERROR; } + if (format != NULL && !(animation_present || fragments_present)) { + *format = hdrs.is_lossless ? 2 : 1; + } + if (!hdrs.is_lossless) { if (data_size < VP8_FRAME_HEADER_SIZE) { - return VP8_STATUS_NOT_ENOUGH_DATA; + status = VP8_STATUS_NOT_ENOUGH_DATA; + goto ReturnWidthHeight; } // Validates raw VP8 data. - if (!VP8GetInfo(data, data_size, - (uint32_t)hdrs.compressed_size, width, height)) { + if (!VP8GetInfo(data, data_size, (uint32_t)hdrs.compressed_size, + &image_width, &image_height)) { return VP8_STATUS_BITSTREAM_ERROR; } } else { if (data_size < VP8L_FRAME_HEADER_SIZE) { - return VP8_STATUS_NOT_ENOUGH_DATA; + status = VP8_STATUS_NOT_ENOUGH_DATA; + goto ReturnWidthHeight; } // Validates raw VP8L data. - if (!VP8LGetInfo(data, data_size, width, height, has_alpha)) { + if (!VP8LGetInfo(data, data_size, &image_width, &image_height, has_alpha)) { return VP8_STATUS_BITSTREAM_ERROR; } } - - if (has_alpha != NULL) { - // If the data did not contain a VP8X/VP8L chunk the only definitive way - // to set this is by looking for alpha data (from an ALPH chunk). - *has_alpha |= (hdrs.alpha_data != NULL); + // Validates image size coherency. + if (found_vp8x) { + if (canvas_width != image_width || canvas_height != image_height) { + return VP8_STATUS_BITSTREAM_ERROR; + } } if (headers != NULL) { *headers = hdrs; @@ -360,21 +398,44 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data, assert((uint64_t)(data - headers->data) < MAX_CHUNK_PAYLOAD); assert(headers->offset == headers->data_size - data_size); } - return VP8_STATUS_OK; // Return features from VP8 header. + ReturnWidthHeight: + if (status == VP8_STATUS_OK || + (status == VP8_STATUS_NOT_ENOUGH_DATA && found_vp8x && headers == NULL)) { + if (has_alpha != NULL) { + // If the data did not contain a VP8X/VP8L chunk the only definitive way + // to set this is by looking for alpha data (from an ALPH chunk). + *has_alpha |= (hdrs.alpha_data != NULL); + } + if (width != NULL) *width = image_width; + if (height != NULL) *height = image_height; + return VP8_STATUS_OK; + } else { + return status; + } } VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers) { + VP8StatusCode status; + int has_animation = 0; assert(headers != NULL); // fill out headers, ignore width/height/has_alpha. - return ParseHeadersInternal(headers->data, headers->data_size, - NULL, NULL, NULL, headers); + status = ParseHeadersInternal(headers->data, headers->data_size, + NULL, NULL, NULL, &has_animation, + NULL, headers); + if (status == VP8_STATUS_OK || status == VP8_STATUS_NOT_ENOUGH_DATA) { + // TODO(jzern): full support of animation frames will require API additions. 
+ if (has_animation) { + status = VP8_STATUS_UNSUPPORTED_FEATURE; + } + } + return status; } //------------------------------------------------------------------------------ // WebPDecParams void WebPResetDecParams(WebPDecParams* const params) { - if (params) { + if (params != NULL) { memset(params, 0, sizeof(*params)); } } @@ -391,6 +452,7 @@ static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size, headers.data = data; headers.data_size = data_size; + headers.have_all_data = 1; status = WebPParseHeaders(&headers); // Process Pre-VP8 chunks. if (status != VP8_STATUS_OK) { return status; @@ -407,11 +469,6 @@ static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size, if (dec == NULL) { return VP8_STATUS_OUT_OF_MEMORY; } -#ifdef WEBP_USE_THREAD - dec->use_threads_ = params->options && (params->options->use_threads > 0); -#else - dec->use_threads_ = 0; -#endif dec->alpha_data_ = headers.alpha_data; dec->alpha_data_size_ = headers.alpha_data_size; @@ -423,6 +480,10 @@ static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size, status = WebPAllocateDecBuffer(io.width, io.height, params->options, params->output); if (status == VP8_STATUS_OK) { // Decode + // This change must be done before calling VP8Decode() + dec->mt_method_ = VP8GetThreadMethod(params->options, &headers, + io.width, io.height); + VP8InitDithering(params->options, dec); if (!VP8Decode(dec, &io)) { status = dec->status_; } @@ -452,6 +513,10 @@ static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size, if (status != VP8_STATUS_OK) { WebPFreeDecBuffer(params->output); } + + if (params->options != NULL && params->options->flip) { + status = WebPFlipBuffer(params->output); + } return status; } @@ -609,7 +674,6 @@ uint8_t* WebPDecodeYUV(const uint8_t* data, size_t data_size, static void DefaultFeatures(WebPBitstreamFeatures* const features) { assert(features != NULL); memset(features, 0, sizeof(*features)); - features->bitstream_version = 0; } static VP8StatusCode GetFeatures(const uint8_t* const data, size_t data_size, @@ -619,10 +683,11 @@ static VP8StatusCode GetFeatures(const uint8_t* const data, size_t data_size, } DefaultFeatures(features); - // Only parse enough of the data to retrieve width/height/has_alpha. + // Only parse enough of the data to retrieve the features. return ParseHeadersInternal(data, data_size, &features->width, &features->height, - &features->has_alpha, NULL); + &features->has_alpha, &features->has_animation, + &features->format, NULL); } //------------------------------------------------------------------------------ @@ -666,19 +731,13 @@ int WebPInitDecoderConfigInternal(WebPDecoderConfig* config, VP8StatusCode WebPGetFeaturesInternal(const uint8_t* data, size_t data_size, WebPBitstreamFeatures* features, int version) { - VP8StatusCode status; if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DECODER_ABI_VERSION)) { return VP8_STATUS_INVALID_PARAM; // version mismatch } if (features == NULL) { return VP8_STATUS_INVALID_PARAM; } - - status = GetFeatures(data, data_size, features); - if (status == VP8_STATUS_NOT_ENOUGH_DATA) { - return VP8_STATUS_BITSTREAM_ERROR; // Not-enough-data treated as error. 
- } - return status; + return GetFeatures(data, data_size, features); } VP8StatusCode WebPDecode(const uint8_t* data, size_t data_size, @@ -722,9 +781,9 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options, h = options->crop_height; x = options->crop_left; y = options->crop_top; - if (!WebPIsRGBMode(src_colorspace)) { // only snap for YUV420 or YUV422 + if (!WebPIsRGBMode(src_colorspace)) { // only snap for YUV420 x &= ~1; - y &= ~1; // TODO(later): only for YUV420, not YUV422. + y &= ~1; } if (x < 0 || y < 0 || w <= 0 || h <= 0 || x + w > W || y + h > H) { return 0; // out of frame boundary error @@ -740,11 +799,13 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options, // Scaling io->use_scaling = (options != NULL) && (options->use_scaling > 0); if (io->use_scaling) { - if (options->scaled_width <= 0 || options->scaled_height <= 0) { + int scaled_width = options->scaled_width; + int scaled_height = options->scaled_height; + if (!WebPRescalerGetScaledDimensions(w, h, &scaled_width, &scaled_height)) { return 0; } - io->scaled_width = options->scaled_width; - io->scaled_height = options->scaled_height; + io->scaled_width = scaled_width; + io->scaled_height = scaled_height; } // Filter @@ -766,6 +827,3 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options, //------------------------------------------------------------------------------ -#if defined(__cplusplus) || defined(c_plusplus) -} // extern "C" -#endif diff --git a/drivers/webp/dec/webpi.h b/drivers/webp/dec/webpi.h index 44e5744411..c75a2e4a5b 100644 --- a/drivers/webp/dec/webpi.h +++ b/drivers/webp/dec/webpi.h @@ -1,8 +1,10 @@ // Copyright 2011 Google Inc. All Rights Reserved. // -// This code is licensed under the same terms as WebM: -// Software License Agreement: http://www.webmproject.org/license/software/ -// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. // ----------------------------------------------------------------------------- // // Internal header: WebP decoding parameters and custom IO on buffer @@ -12,7 +14,7 @@ #ifndef WEBP_DEC_WEBPI_H_ #define WEBP_DEC_WEBPI_H_ -#if defined(__cplusplus) || defined(c_plusplus) +#ifdef __cplusplus extern "C" { #endif @@ -24,7 +26,10 @@ extern "C" { typedef struct WebPDecParams WebPDecParams; typedef int (*OutputFunc)(const VP8Io* const io, WebPDecParams* const p); -typedef int (*OutputRowFunc)(WebPDecParams* const p, int y_pos); +typedef int (*OutputAlphaFunc)(const VP8Io* const io, WebPDecParams* const p, + int expected_num_out_lines); +typedef int (*OutputRowFunc)(WebPDecParams* const p, int y_pos, + int max_out_lines); struct WebPDecParams { WebPDecBuffer* output; // output buffer. @@ -38,7 +43,7 @@ struct WebPDecParams { void* memory; // overall scratch memory for the output work. 
OutputFunc emit; // output RGB or YUV samples - OutputFunc emit_alpha; // output alpha channel + OutputAlphaFunc emit_alpha; // output alpha channel OutputRowFunc emit_alpha_row; // output one line of rescaled alpha values }; @@ -52,6 +57,7 @@ void WebPResetDecParams(WebPDecParams* const params); typedef struct { const uint8_t* data; // input buffer size_t data_size; // input buffer size + int have_all_data; // true if all data is known to be available size_t offset; // offset to main data chunk (VP8 or VP8L) const uint8_t* alpha_data; // points to alpha chunk (if present) size_t alpha_data_size; // alpha chunk size @@ -61,10 +67,10 @@ typedef struct { } WebPHeaderStructure; // Skips over all valid chunks prior to the first VP8/VP8L frame header. -// Returns VP8_STATUS_OK on success, -// VP8_STATUS_BITSTREAM_ERROR if an invalid header/chunk is found, and -// VP8_STATUS_NOT_ENOUGH_DATA if case of insufficient data. -// In 'headers', compressed_size, offset, alpha_data, alpha_size and lossless +// Returns: VP8_STATUS_OK, VP8_STATUS_BITSTREAM_ERROR (invalid header/chunk), +// VP8_STATUS_NOT_ENOUGH_DATA (partial input) or VP8_STATUS_UNSUPPORTED_FEATURE +// in the case of non-decodable features (animation for instance). +// In 'headers', compressed_size, offset, alpha_data, alpha_size, and lossless // fields are updated appropriately upon success. VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers); @@ -91,10 +97,15 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options, // dimension / etc.). If *options is not NULL, also verify that the options' // parameters are valid and apply them to the width/height dimensions of the // output buffer. This takes cropping / scaling / rotation into account. +// Also incorporates the options->flip flag to flip the buffer parameters if +// needed. VP8StatusCode WebPAllocateDecBuffer(int width, int height, const WebPDecoderOptions* const options, WebPDecBuffer* const buffer); +// Flip buffer vertically by negating the various strides. +VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer); + // Copy 'src' into 'dst' buffer, making sure 'dst' is not marked as owner of the // memory (still held by 'src'). void WebPCopyDecBuffer(const WebPDecBuffer* const src, @@ -103,11 +114,9 @@ void WebPCopyDecBuffer(const WebPDecBuffer* const src, // Copy and transfer ownership from src to dst (beware of parameter order!) void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst); - - //------------------------------------------------------------------------------ -#if defined(__cplusplus) || defined(c_plusplus) +#ifdef __cplusplus } // extern "C" #endif |
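
// Caller-side sketch (not part of the patch): how the features, options and
// buffer-freeing paths touched above are typically driven through the public
// decode.h API. This is a minimal illustration under the assumption that the
// usual libwebp entry points (WebPInitDecoderConfig, WebPGetFeatures,
// WebPDecode, WebPFreeDecBuffer) are available; error handling is reduced to
// early returns.
#include <stdio.h>
#include "webp/decode.h"

static int DecodeHalfSizeFlippedBGRA(const uint8_t* data, size_t data_size) {
  WebPDecoderConfig config;
  if (!WebPInitDecoderConfig(&config)) return 0;   // ABI version check
  if (WebPGetFeatures(data, data_size, &config.input) != VP8_STATUS_OK) return 0;
  printf("%d x %d, alpha=%d\n",
         config.input.width, config.input.height, config.input.has_alpha);
  config.options.flip = 1;                         // handled via WebPFlipBuffer()
  config.options.use_scaling = 1;                  // validated in WebPIoInitFromOptions()
  config.options.scaled_width = config.input.width / 2;
  config.options.scaled_height = config.input.height / 2;
  config.output.colorspace = MODE_BGRA;
  if (WebPDecode(data, data_size, &config) != VP8_STATUS_OK) return 0;
  // ... use config.output.u.RGBA here ...
  WebPFreeDecBuffer(&config.output);               // release decoder-owned memory
  return 1;
}
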