20 files changed, 1423 insertions, 1176 deletions
diff --git a/drivers/webp/utils/bit_reader.c b/drivers/webp/utils/bit_reader.c
index 1afb1db890..cd265321bb 100644
--- a/drivers/webp/utils/bit_reader.c
+++ b/drivers/webp/utils/bit_reader.c
@@ -1,36 +1,54 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
-// Boolean decoder
+// Boolean decoder non-inlined methods
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./bit_reader.h"
-
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
+#ifdef HAVE_CONFIG_H
+#include "../webp/config.h"
 #endif
 
-#define MK(X) (((bit_t)(X) << (BITS)) | (MASK))
+#include "./bit_reader_inl.h"
 
 //------------------------------------------------------------------------------
 // VP8BitReader
 
+void VP8BitReaderSetBuffer(VP8BitReader* const br,
+                           const uint8_t* const start,
+                           size_t size) {
+  br->buf_     = start;
+  br->buf_end_ = start + size;
+  br->buf_max_ =
+      (size >= sizeof(lbit_t)) ? start + size - sizeof(lbit_t) + 1
+                               : start;
+}
+
 void VP8InitBitReader(VP8BitReader* const br,
-                      const uint8_t* const start, const uint8_t* const end) {
+                      const uint8_t* const start, size_t size) {
   assert(br != NULL);
   assert(start != NULL);
-  assert(start <= end);
-  br->range_   = MK(255 - 1);
-  br->buf_     = start;
-  br->buf_end_ = end;
+  assert(size < (1u << 31));   // limit ensured by format and upstream checks
+  br->range_   = 255 - 1;
   br->value_   = 0;
-  br->missing_ = 8;   // to load the very first 8bits
+  br->bits_    = -8;   // to load the very first 8bits
   br->eof_     = 0;
+  VP8BitReaderSetBuffer(br, start, size);
+  VP8LoadNewBytes(br);
+}
+
+void VP8RemapBitReader(VP8BitReader* const br, ptrdiff_t offset) {
+  if (br->buf_ != NULL) {
+    br->buf_ += offset;
+    br->buf_end_ += offset;
+    br->buf_max_ += offset;
+  }
 }
 
 const uint8_t kVP8Log2Range[128] = {
@@ -45,36 +63,38 @@ const uint8_t kVP8Log2Range[128] = {
   0
 };
 
-// range = (range << kVP8Log2Range[range]) + trailing 1's
-const bit_t kVP8NewRange[128] = {
-  MK(127), MK(127), MK(191), MK(127), MK(159), MK(191), MK(223), MK(127),
-  MK(143), MK(159), MK(175), MK(191), MK(207), MK(223), MK(239), MK(127),
-  MK(135), MK(143), MK(151), MK(159), MK(167), MK(175), MK(183), MK(191),
-  MK(199), MK(207), MK(215), MK(223), MK(231), MK(239), MK(247), MK(127),
-  MK(131), MK(135), MK(139), MK(143), MK(147), MK(151), MK(155), MK(159),
-  MK(163), MK(167), MK(171), MK(175), MK(179), MK(183), MK(187), MK(191),
-  MK(195), MK(199), MK(203), MK(207), MK(211), MK(215), MK(219), MK(223),
-  MK(227), MK(231), MK(235), MK(239), MK(243), MK(247), MK(251), MK(127),
-  MK(129), MK(131), MK(133), MK(135), MK(137), MK(139), MK(141), MK(143),
-  MK(145), MK(147), MK(149), MK(151), MK(153), MK(155), MK(157), MK(159),
-  MK(161), MK(163), MK(165), MK(167), MK(169), MK(171), MK(173), MK(175),
-  MK(177), MK(179), MK(181), MK(183), MK(185), MK(187), MK(189), MK(191),
-  MK(193), MK(195), MK(197), MK(199), MK(201), MK(203), MK(205), MK(207),
-  MK(209), MK(211), MK(213), MK(215), MK(217), MK(219), MK(221), MK(223),
-  MK(225), MK(227), MK(229), MK(231), MK(233), MK(235), MK(237), MK(239),
-  MK(241), MK(243), MK(245), MK(247), MK(249), MK(251), MK(253), MK(127)
+// range = ((range - 1) << kVP8Log2Range[range]) + 1
+const uint8_t kVP8NewRange[128] = {
+  127, 127, 191, 127, 159, 191, 223, 127,
+  143, 159, 175, 191, 207, 223, 239, 127,
+  135, 143, 151, 159, 167, 175, 183, 191,
+  199, 207, 215, 223, 231, 239, 247, 127,
+  131, 135, 139, 143, 147, 151, 155, 159,
+  163, 167, 171, 175, 179, 183, 187, 191,
+  195, 199, 203, 207, 211, 215, 219, 223,
+  227, 231, 235, 239, 243, 247, 251, 127,
+  129, 131, 133, 135, 137, 139, 141, 143,
+  145, 147, 149, 151, 153, 155, 157, 159,
+  161, 163, 165, 167, 169, 171, 173, 175,
+  177, 179, 181, 183, 185, 187, 189, 191,
+  193, 195, 197, 199, 201, 203, 205, 207,
+  209, 211, 213, 215, 217, 219, 221, 223,
+  225, 227, 229, 231, 233, 235, 237, 239,
+  241, 243, 245, 247, 249, 251, 253, 127
 };
 
-#undef MK
-
 void VP8LoadFinalBytes(VP8BitReader* const br) {
   assert(br != NULL && br->buf_ != NULL);
   // Only read 8bits at a time
   if (br->buf_ < br->buf_end_) {
-    br->value_ |= (bit_t)(*br->buf_++) << ((BITS) - 8 + br->missing_);
-    br->missing_ -= 8;
-  } else {
+    br->bits_ += 8;
+    br->value_ = (bit_t)(*br->buf_++) | (br->value_ << 8);
+  } else if (!br->eof_) {
+    br->value_ <<= 8;
+    br->bits_ += 8;
     br->eof_ = 1;
+  } else {
+    br->bits_ = 0;  // This is to avoid undefined behaviour with shifts.
   }
 }
 
@@ -97,32 +117,47 @@ int32_t VP8GetSignedValue(VP8BitReader* const br, int bits) {
 //------------------------------------------------------------------------------
 // VP8LBitReader
 
-#define MAX_NUM_BIT_READ 25
+#define VP8L_LOG8_WBITS 4  // Number of bytes needed to store VP8L_WBITS bits.
 
-static const uint32_t kBitMask[MAX_NUM_BIT_READ] = {
-  0, 1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767,
-  65535, 131071, 262143, 524287, 1048575, 2097151, 4194303, 8388607, 16777215
+#if !defined(WEBP_FORCE_ALIGNED) && \
+    (defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \
+     defined(__i386__) || defined(_M_IX86) || \
+     defined(__x86_64__) || defined(_M_X64))
+#define VP8L_USE_UNALIGNED_LOAD
+#endif
+
+static const uint32_t kBitMask[VP8L_MAX_NUM_BIT_READ + 1] = {
+  0,
+  0x000001, 0x000003, 0x000007, 0x00000f,
+  0x00001f, 0x00003f, 0x00007f, 0x0000ff,
+  0x0001ff, 0x0003ff, 0x0007ff, 0x000fff,
+  0x001fff, 0x003fff, 0x007fff, 0x00ffff,
+  0x01ffff, 0x03ffff, 0x07ffff, 0x0fffff,
+  0x1fffff, 0x3fffff, 0x7fffff, 0xffffff
 };
 
-void VP8LInitBitReader(VP8LBitReader* const br,
-                       const uint8_t* const start,
+void VP8LInitBitReader(VP8LBitReader* const br, const uint8_t* const start,
                        size_t length) {
   size_t i;
+  vp8l_val_t value = 0;
   assert(br != NULL);
   assert(start != NULL);
   assert(length < 0xfffffff8u);   // can't happen with a RIFF chunk.
 
-  br->buf_ = start;
   br->len_ = length;
   br->val_ = 0;
-  br->pos_ = 0;
   br->bit_pos_ = 0;
   br->eos_ = 0;
-  br->error_ = 0;
-  for (i = 0; i < sizeof(br->val_) && i < br->len_; ++i) {
-    br->val_ |= ((uint64_t)br->buf_[br->pos_]) << (8 * i);
-    ++br->pos_;
+
+  if (length > sizeof(br->val_)) {
+    length = sizeof(br->val_);
+  }
+  for (i = 0; i < length; ++i) {
+    value |= (vp8l_val_t)start[i] << (8 * i);
   }
+  br->val_ = value;
+  br->pos_ = length;
+  br->buf_ = start;
 }
 
 void VP8LBitReaderSetBuffer(VP8LBitReader* const br,
@@ -130,100 +165,62 @@ void VP8LBitReaderSetBuffer(VP8LBitReader* const br,
   assert(br != NULL);
   assert(buf != NULL);
   assert(len < 0xfffffff8u);   // can't happen with a RIFF chunk.
-  br->eos_ = (br->pos_ >= len);
   br->buf_ = buf;
   br->len_ = len;
+  // pos_ > len_ should be considered a param error.
+  br->eos_ = (br->pos_ > br->len_) || VP8LIsEndOfStream(br);
+}
+
+static void VP8LSetEndOfStream(VP8LBitReader* const br) {
+  br->eos_ = 1;
+  br->bit_pos_ = 0;  // To avoid undefined behaviour with shifts.
 }
 
+// If not at EOS, reload up to VP8L_LBITS byte-by-byte
 static void ShiftBytes(VP8LBitReader* const br) {
   while (br->bit_pos_ >= 8 && br->pos_ < br->len_) {
     br->val_ >>= 8;
-    br->val_ |= ((uint64_t)br->buf_[br->pos_]) << 56;
+    br->val_ |= ((vp8l_val_t)br->buf_[br->pos_]) << (VP8L_LBITS - 8);
     ++br->pos_;
     br->bit_pos_ -= 8;
   }
-}
-
-void VP8LFillBitWindow(VP8LBitReader* const br) {
-  if (br->bit_pos_ >= 32) {
-#if defined(__x86_64__) || defined(_M_X64)
-    if (br->pos_ + 8 < br->len_) {
-      br->val_ >>= 32;
-      // The expression below needs a little-endian arch to work correctly.
-      // This gives a large speedup for decoding speed.
-      br->val_ |= *(const uint64_t *)(br->buf_ + br->pos_) << 32;
-      br->pos_ += 4;
-      br->bit_pos_ -= 32;
-    } else {
-      // Slow path.
-      ShiftBytes(br);
-    }
-#else
-    // Always the slow path.
-    ShiftBytes(br);
-#endif
-  }
-  if (br->pos_ == br->len_ && br->bit_pos_ == 64) {
-    br->eos_ = 1;
+  if (VP8LIsEndOfStream(br)) {
+    VP8LSetEndOfStream(br);
   }
 }
 
-uint32_t VP8LReadOneBit(VP8LBitReader* const br) {
-  const uint32_t val = (br->val_ >> br->bit_pos_) & 1;
-  // Flag an error at end_of_stream.
-  if (!br->eos_) {
-    ++br->bit_pos_;
-    if (br->bit_pos_ >= 32) {
-      ShiftBytes(br);
-    }
-    // After this last bit is read, check if eos needs to be flagged.
-    if (br->pos_ == br->len_ && br->bit_pos_ == 64) {
-      br->eos_ = 1;
-    }
-  } else {
-    br->error_ = 1;
+void VP8LDoFillBitWindow(VP8LBitReader* const br) {
+  assert(br->bit_pos_ >= VP8L_WBITS);
+  // TODO(jzern): given the fixed read size it may be possible to force
+  //              alignment in this block.
+#if defined(VP8L_USE_UNALIGNED_LOAD)
+  if (br->pos_ + sizeof(br->val_) < br->len_) {
+    br->val_ >>= VP8L_WBITS;
+    br->bit_pos_ -= VP8L_WBITS;
+    // The expression below needs a little-endian arch to work correctly.
+    // This gives a large speedup for decoding speed.
+    br->val_ |= (vp8l_val_t)*(const uint32_t*)(br->buf_ + br->pos_) <<
+                (VP8L_LBITS - VP8L_WBITS);
+    br->pos_ += VP8L_LOG8_WBITS;
+    return;
   }
-  return val;
+#endif
+  ShiftBytes(br);       // Slow path.
 }
 
 uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits) {
-  uint32_t val = 0;
   assert(n_bits >= 0);
   // Flag an error if end_of_stream or n_bits is more than allowed limit.
-  if (!br->eos_ && n_bits < MAX_NUM_BIT_READ) {
-    // If this read is going to cross the read buffer, set the eos flag.
-    if (br->pos_ == br->len_) {
-      if ((br->bit_pos_ + n_bits) >= 64) {
-        br->eos_ = 1;
-        if ((br->bit_pos_ + n_bits) > 64) return val;
-      }
-    }
-    val = (br->val_ >> br->bit_pos_) & kBitMask[n_bits];
-    br->bit_pos_ += n_bits;
-    if (br->bit_pos_ >= 40) {
-      if (br->pos_ + 5 < br->len_) {
-        br->val_ >>= 40;
-        br->val_ |=
-            (((uint64_t)br->buf_[br->pos_ + 0]) << 24) |
-            (((uint64_t)br->buf_[br->pos_ + 1]) << 32) |
-            (((uint64_t)br->buf_[br->pos_ + 2]) << 40) |
-            (((uint64_t)br->buf_[br->pos_ + 3]) << 48) |
-            (((uint64_t)br->buf_[br->pos_ + 4]) << 56);
-        br->pos_ += 5;
-        br->bit_pos_ -= 40;
-      }
-      if (br->bit_pos_ >= 8) {
-        ShiftBytes(br);
-      }
-    }
+  if (!br->eos_ && n_bits <= VP8L_MAX_NUM_BIT_READ) {
+    const uint32_t val = VP8LPrefetchBits(br) & kBitMask[n_bits];
+    const int new_bits = br->bit_pos_ + n_bits;
+    br->bit_pos_ = new_bits;
+    ShiftBytes(br);
+    return val;
   } else {
-    br->error_ = 1;
+    VP8LSetEndOfStream(br);
+    return 0;
   }
-  return val;
 }
 
 //------------------------------------------------------------------------------
-
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/drivers/webp/utils/bit_reader.h b/drivers/webp/utils/bit_reader.h
index 43cd948fd4..0fc62d33b7 100644
--- a/drivers/webp/utils/bit_reader.h
+++ b/drivers/webp/utils/bit_reader.h
@@ -1,9 +1,10 @@
-//
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Boolean decoder
@@ -18,44 +19,75 @@
 #ifdef _MSC_VER
 #include <stdlib.h>  // _byteswap_ulong
 #endif
-#include <string.h>  // For memcpy
-#include "../types.h"
+#include "../webp/types.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
-#define BITS 32     // can be 32, 16 or 8
-#define MASK ((((bit_t)1) << (BITS)) - 1)
-#if (BITS == 32)
-typedef uint64_t bit_t;   // natural register type
-typedef uint32_t lbit_t;  // natural type for memory I/O
-#elif (BITS == 16)
-typedef uint32_t bit_t;
-typedef uint16_t lbit_t;
+// The Boolean decoder needs to maintain infinite precision on the value_ field.
+// However, since range_ is only 8bit, we only need an active window of 8 bits
+// for value_. Left bits (MSB) gets zeroed and shifted away when value_ falls
+// below 128, range_ is updated, and fresh bits read from the bitstream are
+// brought in as LSB. To avoid reading the fresh bits one by one (slow), we
+// cache BITS of them ahead. The total of (BITS + 8) bits must fit into a
+// natural register (with type bit_t). To fetch BITS bits from bitstream we
+// use a type lbit_t.
+//
+// BITS can be any multiple of 8 from 8 to 56 (inclusive).
+// Pick values that fit natural register size.
+
+#if defined(__i386__) || defined(_M_IX86)      // x86 32bit
+#define BITS 24
+#elif defined(__x86_64__) || defined(_M_X64)   // x86 64bit
+#define BITS 56
+#elif defined(__arm__) || defined(_M_ARM)      // ARM
+#define BITS 24
+#elif defined(__mips__)                        // MIPS
+#define BITS 24
+#else                                          // reasonable default
+#define BITS 24  // TODO(skal): test aarch64 and find the proper BITS value.
+#endif
+
+//------------------------------------------------------------------------------
+// Derived types and constants:
+//   bit_t = natural register type for storing 'value_' (which is BITS+8 bits)
+//   range_t = register for 'range_' (which is 8bits only)
+
+#if (BITS > 24)
+typedef uint64_t bit_t;
 #else
 typedef uint32_t bit_t;
-typedef uint8_t lbit_t;
 #endif
 
+typedef uint32_t range_t;
+
 //------------------------------------------------------------------------------
-// Bitreader and code-tree reader
+// Bitreader
 
 typedef struct VP8BitReader VP8BitReader;
 struct VP8BitReader {
+  // boolean decoder  (keep the field ordering as is!)
+  bit_t value_;               // current value
+  range_t range_;             // current range minus 1. In [127, 254] interval.
+  int bits_;                  // number of valid bits left
+  // read buffer
   const uint8_t* buf_;        // next byte to be read
   const uint8_t* buf_end_;    // end of read buffer
+  const uint8_t* buf_max_;    // max packed-read position on buffer
   int eof_;                   // true if input is exhausted
-
-  // boolean decoder
-  bit_t range_;            // current range minus 1. In [127, 254] interval.
-  bit_t value_;            // current value
-  int missing_;            // number of missing bits in value_ (8bit)
 };
 
 // Initialize the bit reader and the boolean decoder.
 void VP8InitBitReader(VP8BitReader* const br,
-                      const uint8_t* const start, const uint8_t* const end);
+                      const uint8_t* const start, size_t size);
+// Sets the working read buffer.
+void VP8BitReaderSetBuffer(VP8BitReader* const br,
+                           const uint8_t* const start, size_t size);
+
+// Update internal pointers to displace the byte buffer by the
+// relative offset 'offset'.
+void VP8RemapBitReader(VP8BitReader* const br, ptrdiff_t offset);
 
 // return the next value made of 'num_bits' bits
 uint32_t VP8GetValue(VP8BitReader* const br, int num_bits);
@@ -66,100 +98,31 @@ static WEBP_INLINE uint32_t VP8Get(VP8BitReader* const br) {
 // return the next value with sign-extension.
 int32_t VP8GetSignedValue(VP8BitReader* const br, int num_bits);
 
-// Read a bit with proba 'prob'. Speed-critical function!
-extern const uint8_t kVP8Log2Range[128];
-extern const bit_t kVP8NewRange[128];
-
-void VP8LoadFinalBytes(VP8BitReader* const br);    // special case for the tail
-
-static WEBP_INLINE void VP8LoadNewBytes(VP8BitReader* const br) {
-  assert(br && br->buf_);
-  // Read 'BITS' bits at a time if possible.
-  if (br->buf_ + sizeof(lbit_t) <= br->buf_end_) {
-    // convert memory type to register type (with some zero'ing!)
-    bit_t bits;
-    lbit_t in_bits = *(lbit_t*)br->buf_;
-    br->buf_ += (BITS) >> 3;
-#if !defined(__BIG_ENDIAN__)
-#if (BITS == 32)
-#if defined(__i386__) || defined(__x86_64__)
-    __asm__ volatile("bswap %k0" : "=r"(in_bits) : "0"(in_bits));
-    bits = (bit_t)in_bits;   // 32b -> 64b zero-extension
-#elif defined(_MSC_VER)
-    bits = _byteswap_ulong(in_bits);
-#else
-    bits = (bit_t)(in_bits >> 24) | ((in_bits >> 8) & 0xff00)
-         | ((in_bits << 8) & 0xff0000)  | (in_bits << 24);
-#endif  // x86
-#elif (BITS == 16)
-    // gcc will recognize a 'rorw $8, ...' here:
-    bits = (bit_t)(in_bits >> 8) | ((in_bits & 0xff) << 8);
-#endif
-#else    // LITTLE_ENDIAN
-    bits = (bit_t)in_bits;
-#endif
-    br->value_ |= bits << br->missing_;
-    br->missing_ -= (BITS);
-  } else {
-    VP8LoadFinalBytes(br);    // no need to be inlined
-  }
-}
+// bit_reader_inl.h will implement the following methods:
+//   static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob)
+//   static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v)
+// and should be included by the .c files that actually need them.
+// This is to avoid recompiling the whole library whenever this file is touched,
+// and also allowing platform-specific ad-hoc hacks.
 
-static WEBP_INLINE int VP8BitUpdate(VP8BitReader* const br, bit_t split) {
-  const bit_t value_split = split | (MASK);
-  if (br->missing_ > 0) {  // Make sure we have a least BITS bits in 'value_'
-    VP8LoadNewBytes(br);
-  }
-  if (br->value_ > value_split) {
-    br->range_ -= value_split + 1;
-    br->value_ -= value_split + 1;
-    return 1;
-  } else {
-    br->range_ = value_split;
-    return 0;
-  }
-}
-
-static WEBP_INLINE void VP8Shift(VP8BitReader* const br) {
-  // range_ is in [0..127] interval here.
-  const int idx = br->range_ >> (BITS);
-  const int shift = kVP8Log2Range[idx];
-  br->range_ = kVP8NewRange[idx];
-  br->value_ <<= shift;
-  br->missing_ += shift;
-}
-
-static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob) {
-  // It's important to avoid generating a 64bit x 64bit multiply here.
-  // We just need an 8b x 8b after all.
-  const bit_t split =
-      (bit_t)((uint32_t)(br->range_ >> (BITS)) * prob) << ((BITS) - 8);
-  const int bit = VP8BitUpdate(br, split);
-  if (br->range_ <= (((bit_t)0x7e << (BITS)) | (MASK))) {
-    VP8Shift(br);
-  }
-  return bit;
-}
+// -----------------------------------------------------------------------------
+// Bitreader for lossless format
 
-static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v) {
-  const bit_t split = (br->range_ >> 1);
-  const int bit = VP8BitUpdate(br, split);
-  VP8Shift(br);
-  return bit ? -v : v;
-}
+// maximum number of bits (inclusive) the bit-reader can handle:
+#define VP8L_MAX_NUM_BIT_READ 24
 
+#define VP8L_LBITS 64  // Number of bits prefetched (= bit-size of vp8l_val_t).
+#define VP8L_WBITS 32  // Minimum number of bytes ready after VP8LFillBitWindow.
 
-// -----------------------------------------------------------------------------
-// Bitreader
+typedef uint64_t vp8l_val_t;  // right now, this bit-reader can only use 64bit.
 
 typedef struct {
-  uint64_t       val_;
-  const uint8_t* buf_;
-  size_t         len_;
-  size_t         pos_;
-  int            bit_pos_;
-  int            eos_;
-  int            error_;
+  vp8l_val_t     val_;        // pre-fetched bits
+  const uint8_t* buf_;        // input byte buffer
+  size_t         len_;        // buffer length
+  size_t         pos_;        // byte position in buf_
+  int            bit_pos_;    // current bit-reading position in val_
+  int            eos_;        // true if a bit was read past the end of buffer
 } VP8LBitReader;
 
 void VP8LInitBitReader(VP8LBitReader* const br,
@@ -170,28 +133,39 @@ void VP8LInitBitReader(VP8LBitReader* const br,
 void VP8LBitReaderSetBuffer(VP8LBitReader* const br,
                             const uint8_t* const buffer, size_t length);
 
-// Reads the specified number of bits from Read Buffer.
-// Flags an error in case end_of_stream or n_bits is more than allowed limit.
-// Flags eos if this read attempt is going to cross the read buffer.
+// Reads the specified number of bits from read buffer.
+// Flags an error in case end_of_stream or n_bits is more than the allowed limit
+// of VP8L_MAX_NUM_BIT_READ (inclusive).
+// Flags eos_ if this read attempt is going to cross the read buffer.
 uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits);
 
-// Reads one bit from Read Buffer. Flags an error in case end_of_stream.
-// Flags eos after reading last bit from the buffer.
-uint32_t VP8LReadOneBit(VP8LBitReader* const br);
-
-// VP8LReadOneBitUnsafe is faster than VP8LReadOneBit, but it can be called only
-// 32 times after the last VP8LFillBitWindow. Any subsequent calls
-// (without VP8LFillBitWindow) will return invalid data.
-static WEBP_INLINE uint32_t VP8LReadOneBitUnsafe(VP8LBitReader* const br) {
-  const uint32_t val = (br->val_ >> br->bit_pos_) & 1;
-  ++br->bit_pos_;
-  return val;
+// Return the prefetched bits, so they can be looked up.
+static WEBP_INLINE uint32_t VP8LPrefetchBits(VP8LBitReader* const br) {
+  return (uint32_t)(br->val_ >> (br->bit_pos_ & (VP8L_LBITS - 1)));
+}
+
+// Returns true if there was an attempt at reading bit past the end of
+// the buffer. Doesn't set br->eos_ flag.
+static WEBP_INLINE int VP8LIsEndOfStream(const VP8LBitReader* const br) {
+  assert(br->pos_ <= br->len_);
+  return br->eos_ || ((br->pos_ == br->len_) && (br->bit_pos_ > VP8L_LBITS));
 }
 
-// Advances the Read buffer by 4 bytes to make room for reading next 32 bits.
-void VP8LFillBitWindow(VP8LBitReader* const br);
+// For jumping over a number of bits in the bit stream when accessed with
+// VP8LPrefetchBits and VP8LFillBitWindow.
+static WEBP_INLINE void VP8LSetBitPos(VP8LBitReader* const br, int val) {
+  br->bit_pos_ = val;
+  br->eos_ = VP8LIsEndOfStream(br);
+}
+
+// Advances the read buffer by 4 bytes to make room for reading next 32 bits.
+// Speed critical, but infrequent part of the code can be non-inlined.
+extern void VP8LDoFillBitWindow(VP8LBitReader* const br);
+static WEBP_INLINE void VP8LFillBitWindow(VP8LBitReader* const br) {
+  if (br->bit_pos_ >= VP8L_WBITS) VP8LDoFillBitWindow(br);
+}
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/utils/bit_writer.c b/drivers/webp/utils/bit_writer.c
index 671159cacd..064428691b 100644
--- a/drivers/webp/utils/bit_writer.c
+++ b/drivers/webp/utils/bit_writer.c
@@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Bit writing and boolean coder
@@ -13,11 +15,10 @@
 #include <assert.h>
 #include <string.h>   // for memcpy()
 #include <stdlib.h>
-#include "./bit_writer.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+#include "./bit_writer.h"
+#include "./endian_inl.h"
+#include "./utils.h"
 
 //------------------------------------------------------------------------------
 // VP8BitWriter
@@ -36,19 +37,22 @@ static int BitWriterResize(VP8BitWriter* const bw, size_t extra_size) {
   new_size = 2 * bw->max_pos_;
   if (new_size < needed_size) new_size = needed_size;
   if (new_size < 1024) new_size = 1024;
-  new_buf = (uint8_t*)malloc(new_size);
+  new_buf = (uint8_t*)WebPSafeMalloc(1ULL, new_size);
   if (new_buf == NULL) {
     bw->error_ = 1;
     return 0;
   }
-  memcpy(new_buf, bw->buf_, bw->pos_);
-  free(bw->buf_);
+  if (bw->pos_ > 0) {
+    assert(bw->buf_ != NULL);
+    memcpy(new_buf, bw->buf_, bw->pos_);
+  }
+  WebPSafeFree(bw->buf_);
   bw->buf_ = new_buf;
   bw->max_pos_ = new_size;
   return 1;
 }
 
-static void kFlush(VP8BitWriter* const bw) {
+static void Flush(VP8BitWriter* const bw) {
   const int s = 8 + bw->nb_bits_;
   const int32_t bits = bw->value_ >> s;
   assert(bw->nb_bits_ >= 0);
@@ -114,7 +118,7 @@ int VP8PutBit(VP8BitWriter* const bw, int bit, int prob) {
     bw->range_ = kNewRange[bw->range_];
     bw->value_ <<= shift;
     bw->nb_bits_ += shift;
-    if (bw->nb_bits_ > 0) kFlush(bw);
+    if (bw->nb_bits_ > 0) Flush(bw);
   }
   return bit;
 }
@@ -131,24 +135,25 @@ int VP8PutBitUniform(VP8BitWriter* const bw, int bit) {
     bw->range_ = kNewRange[bw->range_];
     bw->value_ <<= 1;
     bw->nb_bits_ += 1;
-    if (bw->nb_bits_ > 0) kFlush(bw);
+    if (bw->nb_bits_ > 0) Flush(bw);
   }
   return bit;
 }
 
-void VP8PutValue(VP8BitWriter* const bw, int value, int nb_bits) {
-  int mask;
-  for (mask = 1 << (nb_bits - 1); mask; mask >>= 1)
+void VP8PutBits(VP8BitWriter* const bw, uint32_t value, int nb_bits) {
+  uint32_t mask;
+  assert(nb_bits > 0 && nb_bits < 32);
+  for (mask = 1u << (nb_bits - 1); mask; mask >>= 1)
     VP8PutBitUniform(bw, value & mask);
 }
 
-void VP8PutSignedValue(VP8BitWriter* const bw, int value, int nb_bits) {
+void VP8PutSignedBits(VP8BitWriter* const bw, int value, int nb_bits) {
   if (!VP8PutBitUniform(bw, value != 0))
     return;
   if (value < 0) {
-    VP8PutValue(bw, ((-value) << 1) | 1, nb_bits + 1);
+    VP8PutBits(bw, ((-value) << 1) | 1, nb_bits + 1);
   } else {
-    VP8PutValue(bw, value << 1, nb_bits + 1);
+    VP8PutBits(bw, value << 1, nb_bits + 1);
   }
 }
 
@@ -167,16 +172,16 @@ int VP8BitWriterInit(VP8BitWriter* const bw, size_t expected_size) {
 }
 
 uint8_t* VP8BitWriterFinish(VP8BitWriter* const bw) {
-  VP8PutValue(bw, 0, 9 - bw->nb_bits_);
+  VP8PutBits(bw, 0, 9 - bw->nb_bits_);
   bw->nb_bits_ = 0;   // pad with zeroes
-  kFlush(bw);
+  Flush(bw);
   return bw->buf_;
 }
 
 int VP8BitWriterAppend(VP8BitWriter* const bw,
                        const uint8_t* data, size_t size) {
-  assert(data);
-  if (bw->nb_bits_ != -8) return 0;   // kFlush() must have been called
+  assert(data != NULL);
+  if (bw->nb_bits_ != -8) return 0;   // Flush() must have been called
   if (!BitWriterResize(bw, size)) return 0;
   memcpy(bw->buf_ + bw->pos_, data, size);
   bw->pos_ += size;
@@ -184,8 +189,8 @@ int VP8BitWriterAppend(VP8BitWriter* const bw,
 }
 
 void VP8BitWriterWipeOut(VP8BitWriter* const bw) {
-  if (bw) {
-    free(bw->buf_);
+  if (bw != NULL) {
+    WebPSafeFree(bw->buf_);
     memset(bw, 0, sizeof(*bw));
   }
 }
@@ -193,32 +198,39 @@ void VP8BitWriterWipeOut(VP8BitWriter* const bw) {
 //------------------------------------------------------------------------------
 // VP8LBitWriter
 
+// This is the minimum amount of size the memory buffer is guaranteed to grow
+// when extra space is needed.
+#define MIN_EXTRA_SIZE  (32768ULL)
+
 // Returns 1 on success.
 static int VP8LBitWriterResize(VP8LBitWriter* const bw, size_t extra_size) {
   uint8_t* allocated_buf;
   size_t allocated_size;
-  const size_t current_size = VP8LBitWriterNumBytes(bw);
+  const size_t max_bytes = bw->end_ - bw->buf_;
+  const size_t current_size = bw->cur_ - bw->buf_;
   const uint64_t size_required_64b = (uint64_t)current_size + extra_size;
   const size_t size_required = (size_t)size_required_64b;
   if (size_required != size_required_64b) {
     bw->error_ = 1;
     return 0;
   }
-  if (bw->max_bytes_ > 0 && size_required <= bw->max_bytes_) return 1;
-  allocated_size = (3 * bw->max_bytes_) >> 1;
+  if (max_bytes > 0 && size_required <= max_bytes) return 1;
+  allocated_size = (3 * max_bytes) >> 1;
   if (allocated_size < size_required) allocated_size = size_required;
   // make allocated size multiple of 1k
   allocated_size = (((allocated_size >> 10) + 1) << 10);
-  allocated_buf = (uint8_t*)malloc(allocated_size);
+  allocated_buf = (uint8_t*)WebPSafeMalloc(1ULL, allocated_size);
   if (allocated_buf == NULL) {
     bw->error_ = 1;
     return 0;
   }
-  memcpy(allocated_buf, bw->buf_, current_size);
-  free(bw->buf_);
+  if (current_size > 0) {
+    memcpy(allocated_buf, bw->buf_, current_size);
+  }
+  WebPSafeFree(bw->buf_);
   bw->buf_ = allocated_buf;
-  bw->max_bytes_ = allocated_size;
-  memset(allocated_buf + current_size, 0, allocated_size - current_size);
+  bw->cur_ = bw->buf_ + current_size;
+  bw->end_ = bw->buf_ + allocated_size;
   return 1;
 }
 
@@ -227,58 +239,81 @@ int VP8LBitWriterInit(VP8LBitWriter* const bw, size_t expected_size) {
   return VP8LBitWriterResize(bw, expected_size);
 }
 
-void VP8LBitWriterDestroy(VP8LBitWriter* const bw) {
+void VP8LBitWriterWipeOut(VP8LBitWriter* const bw) {
   if (bw != NULL) {
-    free(bw->buf_);
+    WebPSafeFree(bw->buf_);
     memset(bw, 0, sizeof(*bw));
   }
 }
 
-void VP8LWriteBits(VP8LBitWriter* const bw, int n_bits, uint32_t bits) {
-  if (n_bits < 1) return;
-#if !defined(__BIG_ENDIAN__)
-  // Technically, this branch of the code can write up to 25 bits at a time,
-  // but in prefix encoding, the maximum number of bits written is 18 at a time.
-  {
-    uint8_t* const p = &bw->buf_[bw->bit_pos_ >> 3];
-    uint32_t v = *(const uint32_t*)p;
-    v |= bits << (bw->bit_pos_ & 7);
-    *(uint32_t*)p = v;
-    bw->bit_pos_ += n_bits;
+void VP8LPutBitsFlushBits(VP8LBitWriter* const bw) {
+  // If needed, make some room by flushing some bits out.
+  if (bw->cur_ + VP8L_WRITER_BYTES > bw->end_) {
+    const uint64_t extra_size = (bw->end_ - bw->buf_) + MIN_EXTRA_SIZE;
+    if (extra_size != (size_t)extra_size ||
+        !VP8LBitWriterResize(bw, (size_t)extra_size)) {
+      bw->cur_ = bw->buf_;
+      bw->error_ = 1;
+      return;
+    }
   }
-#else  // BIG_ENDIAN
-  {
-    uint8_t* p = &bw->buf_[bw->bit_pos_ >> 3];
-    const int bits_reserved_in_first_byte = bw->bit_pos_ & 7;
-    const int bits_left_to_write = n_bits - 8 + bits_reserved_in_first_byte;
-    // implicit & 0xff is assumed for uint8_t arithmetics
-    *p++ |= bits << bits_reserved_in_first_byte;
-    bits >>= 8 - bits_reserved_in_first_byte;
-    if (bits_left_to_write >= 1) {
-      *p++ = bits;
-      bits >>= 8;
-      if (bits_left_to_write >= 9) {
-        *p++ = bits;
-        bits >>= 8;
+  *(vp8l_wtype_t*)bw->cur_ = (vp8l_wtype_t)WSWAP((vp8l_wtype_t)bw->bits_);
+  bw->cur_ += VP8L_WRITER_BYTES;
+  bw->bits_ >>= VP8L_WRITER_BITS;
+  bw->used_ -= VP8L_WRITER_BITS;
+}
+
+void VP8LPutBitsInternal(VP8LBitWriter* const bw, uint32_t bits, int n_bits) {
+  assert(n_bits <= 32);
+  // That's the max we can handle:
+  assert(sizeof(vp8l_wtype_t) == 2);
+  if (n_bits > 0) {
+    vp8l_atype_t lbits = bw->bits_;
+    int used = bw->used_;
+    // Special case of overflow handling for 32bit accumulator (2-steps flush).
+#if VP8L_WRITER_BITS == 16
+    if (used + n_bits >= VP8L_WRITER_MAX_BITS) {
+      // Fill up all the VP8L_WRITER_MAX_BITS so it can be flushed out below.
+      const int shift = VP8L_WRITER_MAX_BITS - used;
+      lbits |= (vp8l_atype_t)bits << used;
+      used = VP8L_WRITER_MAX_BITS;
+      n_bits -= shift;
+      bits >>= shift;
+      assert(n_bits <= VP8L_WRITER_MAX_BITS);
+    }
+#endif
+    // If needed, make some room by flushing some bits out.
+    while (used >= VP8L_WRITER_BITS) {
+      if (bw->cur_ + VP8L_WRITER_BYTES > bw->end_) {
+        const uint64_t extra_size = (bw->end_ - bw->buf_) + MIN_EXTRA_SIZE;
+        if (extra_size != (size_t)extra_size ||
+            !VP8LBitWriterResize(bw, (size_t)extra_size)) {
+          bw->cur_ = bw->buf_;
+          bw->error_ = 1;
+          return;
+        }
       }
+      *(vp8l_wtype_t*)bw->cur_ = (vp8l_wtype_t)WSWAP((vp8l_wtype_t)lbits);
+      bw->cur_ += VP8L_WRITER_BYTES;
+      lbits >>= VP8L_WRITER_BITS;
+      used -= VP8L_WRITER_BITS;
     }
-    assert(n_bits <= 25);
-    *p = bits;
-    bw->bit_pos_ += n_bits;
+    bw->bits_ = lbits | ((vp8l_atype_t)bits << used);
+    bw->used_ = used + n_bits;
   }
-#endif
-  if ((bw->bit_pos_ >> 3) > (bw->max_bytes_ - 8)) {
-    const uint64_t extra_size = 32768ULL + bw->max_bytes_;
-    if (extra_size != (size_t)extra_size ||
-        !VP8LBitWriterResize(bw, (size_t)extra_size)) {
-      bw->bit_pos_ = 0;
-      bw->error_ = 1;
+}
+
+uint8_t* VP8LBitWriterFinish(VP8LBitWriter* const bw) {
+  // flush leftover bits
+  if (VP8LBitWriterResize(bw, (bw->used_ + 7) >> 3)) {
+    while (bw->used_ > 0) {
+      *bw->cur_++ = (uint8_t)bw->bits_;
+      bw->bits_ >>= 8;
+      bw->used_ -= 8;
     }
+    bw->used_ = 0;
   }
+  return bw->buf_;
 }
 
 //------------------------------------------------------------------------------
-
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/drivers/webp/utils/bit_writer.h b/drivers/webp/utils/bit_writer.h
index 57f39b11b1..ef360d1dc6 100644
--- a/drivers/webp/utils/bit_writer.h
+++ b/drivers/webp/utils/bit_writer.h
@@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Bit writing and boolean coder
@@ -12,9 +14,9 @@
 #ifndef WEBP_UTILS_BIT_WRITER_H_
 #define WEBP_UTILS_BIT_WRITER_H_
 
-#include "../types.h"
+#include "../webp/types.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -43,8 +45,8 @@ void VP8BitWriterWipeOut(VP8BitWriter* const bw);
 
 int VP8PutBit(VP8BitWriter* const bw, int bit, int prob);
 int VP8PutBitUniform(VP8BitWriter* const bw, int bit);
-void VP8PutValue(VP8BitWriter* const bw, int value, int nb_bits);
-void VP8PutSignedValue(VP8BitWriter* const bw, int value, int nb_bits);
+void VP8PutBits(VP8BitWriter* const bw, uint32_t value, int nb_bits);
+void VP8PutSignedBits(VP8BitWriter* const bw, int value, int nb_bits);
 
 // Appends some bytes to the internal buffer. Data is copied.
 int VP8BitWriterAppend(VP8BitWriter* const bw,
@@ -66,57 +68,77 @@ static WEBP_INLINE size_t VP8BitWriterSize(const VP8BitWriter* const bw) {
 
 //------------------------------------------------------------------------------
 // VP8LBitWriter
-// TODO(vikasa): VP8LBitWriter is copied as-is from lossless code. There's scope
-// of re-using VP8BitWriter. Will evaluate once basic lossless encoder is
-// implemented.
 
-typedef struct {
-  uint8_t* buf_;
-  size_t bit_pos_;
-  size_t max_bytes_;
+#if defined(__x86_64__) || defined(_M_X64)   // 64bit
+typedef uint64_t vp8l_atype_t;   // accumulator type
+typedef uint32_t vp8l_wtype_t;   // writing type
+#define WSWAP HToLE32
+#define VP8L_WRITER_BYTES    4   // sizeof(vp8l_wtype_t)
+#define VP8L_WRITER_BITS     32  // 8 * sizeof(vp8l_wtype_t)
+#define VP8L_WRITER_MAX_BITS 64  // 8 * sizeof(vp8l_atype_t)
+#else
+typedef uint32_t vp8l_atype_t;
+typedef uint16_t vp8l_wtype_t;
+#define WSWAP HToLE16
+#define VP8L_WRITER_BYTES    2
+#define VP8L_WRITER_BITS     16
+#define VP8L_WRITER_MAX_BITS 32
+#endif
 
-  // After all bits are written, the caller must observe the state of
-  // error_. A value of 1 indicates that a memory allocation failure
-  // has happened during bit writing. A value of 0 indicates successful
+typedef struct {
+  vp8l_atype_t bits_;   // bit accumulator
+  int          used_;   // number of bits used in accumulator
+  uint8_t*     buf_;    // start of buffer
+  uint8_t*     cur_;    // current write position
+  uint8_t*     end_;    // end of buffer
+
+  // After all bits are written (VP8LBitWriterFinish()), the caller must observe
+  // the state of error_. A value of 1 indicates that a memory allocation
+  // failure has happened during bit writing. A value of 0 indicates successful
   // writing of bits.
   int error_;
 } VP8LBitWriter;
 
 static WEBP_INLINE size_t VP8LBitWriterNumBytes(VP8LBitWriter* const bw) {
-  return (bw->bit_pos_ + 7) >> 3;
-}
-
-static WEBP_INLINE uint8_t* VP8LBitWriterFinish(VP8LBitWriter* const bw) {
-  return bw->buf_;
+  return (bw->cur_ - bw->buf_) + ((bw->used_ + 7) >> 3);
 }
 
-// Returns 0 in case of memory allocation error.
+// Returns false in case of memory allocation error.
 int VP8LBitWriterInit(VP8LBitWriter* const bw, size_t expected_size);
+// Finalize the bitstream coding. Returns a pointer to the internal buffer.
+uint8_t* VP8LBitWriterFinish(VP8LBitWriter* const bw);
+// Release any pending memory and zeroes the object.
+void VP8LBitWriterWipeOut(VP8LBitWriter* const bw);
 
-void VP8LBitWriterDestroy(VP8LBitWriter* const bw);
+// Internal function for VP8LPutBits flushing 32 bits from the written state.
+void VP8LPutBitsFlushBits(VP8LBitWriter* const bw);
 
-// This function writes bits into bytes in increasing addresses, and within
-// a byte least-significant-bit first.
-//
-// The function can write up to 16 bits in one go with WriteBits
-// Example: let's assume that 3 bits (Rs below) have been written already:
-//
-// BYTE-0     BYTE+1       BYTE+2
-//
-// 0000 0RRR    0000 0000    0000 0000
-//
-// Now, we could write 5 or less bits in MSB by just sifting by 3
-// and OR'ing to BYTE-0.
-//
-// For n bits, we take the last 5 bytes, OR that with high bits in BYTE-0,
-// and locate the rest in BYTE+1 and BYTE+2.
-//
+// PutBits internal function used in the 16 bit vp8l_wtype_t case.
+void VP8LPutBitsInternal(VP8LBitWriter* const bw, uint32_t bits, int n_bits);
+
+// This function writes bits into bytes in increasing addresses (little endian),
+// and within a byte least-significant-bit first.
+// This function can write up to 32 bits in one go, but VP8LBitReader can only
+// read 24 bits max (VP8L_MAX_NUM_BIT_READ).
 // VP8LBitWriter's error_ flag is set in case of  memory allocation error.
-void VP8LWriteBits(VP8LBitWriter* const bw, int n_bits, uint32_t bits);
+static WEBP_INLINE void VP8LPutBits(VP8LBitWriter* const bw,
+                                    uint32_t bits, int n_bits) {
+  if (sizeof(vp8l_wtype_t) == 4) {
+    if (n_bits > 0) {
+      if (bw->used_ >= 32) {
+        VP8LPutBitsFlushBits(bw);
+      }
+      bw->bits_ |= (vp8l_atype_t)bits << bw->used_;
+      bw->used_ += n_bits;
+    }
+  } else {
+    VP8LPutBitsInternal(bw, bits, n_bits);
+  }
+}
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/utils/color_cache.c b/drivers/webp/utils/color_cache.c
index 560f81db10..f9ff4b5451 100644
--- a/drivers/webp/utils/color_cache.c
+++ b/drivers/webp/utils/color_cache.c
@@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Color Cache for WebP Lossless
@@ -11,13 +13,10 @@
 
 #include <assert.h>
 #include <stdlib.h>
+#include <string.h>
 #include "./color_cache.h"
 #include "../utils/utils.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------
 // VP8LColorCache.
 
@@ -29,16 +28,22 @@ int VP8LColorCacheInit(VP8LColorCache* const cc, int hash_bits) {
                                           sizeof(*cc->colors_));
   if (cc->colors_ == NULL) return 0;
   cc->hash_shift_ = 32 - hash_bits;
+  cc->hash_bits_ = hash_bits;
   return 1;
 }
 
 void VP8LColorCacheClear(VP8LColorCache* const cc) {
   if (cc != NULL) {
-    free(cc->colors_);
+    WebPSafeFree(cc->colors_);
     cc->colors_ = NULL;
   }
 }
 
-#if defined(__cplusplus) || defined(c_plusplus)
+void VP8LColorCacheCopy(const VP8LColorCache* const src,
+                        VP8LColorCache* const dst) {
+  assert(src != NULL);
+  assert(dst != NULL);
+  assert(src->hash_bits_ == dst->hash_bits_);
+  memcpy(dst->colors_, src->colors_,
+         ((size_t)1u << dst->hash_bits_) * sizeof(*dst->colors_));
 }
-#endif
diff --git a/drivers/webp/utils/color_cache.h b/drivers/webp/utils/color_cache.h
index da5e260195..a9a9f64270 100644
--- a/drivers/webp/utils/color_cache.h
+++ b/drivers/webp/utils/color_cache.h
@@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Color Cache for WebP Lossless
@@ -13,26 +15,33 @@
 #ifndef WEBP_UTILS_COLOR_CACHE_H_
 #define WEBP_UTILS_COLOR_CACHE_H_
 
-#include "../types.h"
+#include "../webp/types.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
 // Main color cache struct.
 typedef struct {
   uint32_t *colors_;  // color entries
-  int hash_shift_;    // Hash shift: 32 - hash_bits.
+  int hash_shift_;    // Hash shift: 32 - hash_bits_.
+  int hash_bits_;
 } VP8LColorCache;
 
 static const uint32_t kHashMul = 0x1e35a7bd;
 
 static WEBP_INLINE uint32_t VP8LColorCacheLookup(
     const VP8LColorCache* const cc, uint32_t key) {
-  assert(key <= (~0U >> cc->hash_shift_));
+  assert((key >> cc->hash_bits_) == 0u);
   return cc->colors_[key];
 }
 
+static WEBP_INLINE void VP8LColorCacheSet(const VP8LColorCache* const cc,
+                                          uint32_t key, uint32_t argb) {
+  assert((key >> cc->hash_bits_) == 0u);
+  cc->colors_[key] = argb;
+}
+
 static WEBP_INLINE void VP8LColorCacheInsert(const VP8LColorCache* const cc,
                                              uint32_t argb) {
   const uint32_t key = (kHashMul * argb) >> cc->hash_shift_;
@@ -47,7 +56,7 @@ static WEBP_INLINE int VP8LColorCacheGetIndex(const VP8LColorCache* const cc,
 static WEBP_INLINE int VP8LColorCacheContains(const VP8LColorCache* const cc,
                                               uint32_t argb) {
   const uint32_t key = (kHashMul * argb) >> cc->hash_shift_;
-  return cc->colors_[key] == argb;
+  return (cc->colors_[key] == argb);
 }
 
 //------------------------------------------------------------------------------
@@ -56,12 +65,15 @@ static WEBP_INLINE int VP8LColorCacheContains(const VP8LColorCache* const cc,
 // Returns false in case of memory error.
 int VP8LColorCacheInit(VP8LColorCache* const color_cache, int hash_bits);
 
+void VP8LColorCacheCopy(const VP8LColorCache* const src,
+                        VP8LColorCache* const dst);
+
 // Delete the memory associated to color cache.
 void VP8LColorCacheClear(VP8LColorCache* const color_cache);
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }
 #endif
 
diff --git a/drivers/webp/utils/filters.c b/drivers/webp/utils/filters.c
index 08f52a3d20..15543b1271 100644
--- a/drivers/webp/utils/filters.c
+++ b/drivers/webp/utils/filters.c
@@ -1,171 +1,37 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
-// Spatial prediction using various filters
+// filter estimation
 //
 // Author: Urvang (urvang@google.com)
 
 #include "./filters.h"
-#include <assert.h>
 #include <stdlib.h>
 #include <string.h>
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
-//------------------------------------------------------------------------------
-// Helpful macro.
-
-# define SANITY_CHECK(in, out)                              \
-  assert(in != NULL);                                       \
-  assert(out != NULL);                                      \
-  assert(width > 0);                                        \
-  assert(height > 0);                                       \
-  assert(bpp > 0);                                          \
-  assert(stride >= width * bpp);
-
-static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred,
-                                    uint8_t* dst, int length, int inverse) {
-  int i;
-  if (inverse) {
-    for (i = 0; i < length; ++i) dst[i] = src[i] + pred[i];
-  } else {
-    for (i = 0; i < length; ++i) dst[i] = src[i] - pred[i];
-  }
-}
-
-//------------------------------------------------------------------------------
-// Horizontal filter.
-
-static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
-    int width, int height, int bpp, int stride, int inverse, uint8_t* out) {
-  int h;
-  const uint8_t* preds = (inverse ? out : in);
-  SANITY_CHECK(in, out);
-
-  // Filter line-by-line.
-  for (h = 0; h < height; ++h) {
-    // Leftmost pixel is predicted from above (except for topmost scanline).
-    if (h == 0) {
-      memcpy((void*)out, (const void*)in, bpp);
-    } else {
-      PredictLine(in, preds - stride, out, bpp, inverse);
-    }
-    PredictLine(in + bpp, preds, out + bpp, bpp * (width - 1), inverse);
-    preds += stride;
-    in += stride;
-    out += stride;
-  }
-}
-
-static void HorizontalFilter(const uint8_t* data, int width, int height,
-                             int bpp, int stride, uint8_t* filtered_data) {
-  DoHorizontalFilter(data, width, height, bpp, stride, 0, filtered_data);
-}
-
-static void HorizontalUnfilter(const uint8_t* data, int width, int height,
-                               int bpp, int stride, uint8_t* recon_data) {
-  DoHorizontalFilter(data, width, height, bpp, stride, 1, recon_data);
-}
-
-//------------------------------------------------------------------------------
-// Vertical filter.
-
-static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
-    int width, int height, int bpp, int stride, int inverse, uint8_t* out) {
-  int h;
-  const uint8_t* preds = (inverse ? out : in);
-  SANITY_CHECK(in, out);
-
-  // Very first top-left pixel is copied.
-  memcpy((void*)out, (const void*)in, bpp);
-  // Rest of top scan-line is left-predicted.
-  PredictLine(in + bpp, preds, out + bpp, bpp * (width - 1), inverse);
-
-  // Filter line-by-line.
-  for (h = 1; h < height; ++h) {
-    in += stride;
-    out += stride;
-    PredictLine(in, preds, out, bpp * width, inverse);
-    preds += stride;
-  }
-}
-
-static void VerticalFilter(const uint8_t* data, int width, int height,
-                           int bpp, int stride, uint8_t* filtered_data) {
-  DoVerticalFilter(data, width, height, bpp, stride, 0, filtered_data);
-}
-
-static void VerticalUnfilter(const uint8_t* data, int width, int height,
-                             int bpp, int stride, uint8_t* recon_data) {
-  DoVerticalFilter(data, width, height, bpp, stride, 1, recon_data);
-}
+// -----------------------------------------------------------------------------
+// Quick estimate of a potentially interesting filter mode to try.
 
-//------------------------------------------------------------------------------
-// Gradient filter.
+#define SMAX 16
+#define SDIFF(a, b) (abs((a) - (b)) >> 4)   // Scoring diff, in [0..SMAX)
 
 static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
   const int g = a + b - c;
-  return (g < 0) ? 0 : (g > 255) ? 255 : g;
-}
-
-static WEBP_INLINE
-void DoGradientFilter(const uint8_t* in, int width, int height,
-                      int bpp, int stride, int inverse, uint8_t* out) {
-  const uint8_t* preds = (inverse ? out : in);
-  int h;
-  SANITY_CHECK(in, out);
-
-  // left prediction for top scan-line
-  memcpy((void*)out, (const void*)in, bpp);
-  PredictLine(in + bpp, preds, out + bpp, bpp * (width - 1), inverse);
-
-  // Filter line-by-line.
-  for (h = 1; h < height; ++h) {
-    int w;
-    preds += stride;
-    in += stride;
-    out += stride;
-    // leftmost pixel: predict from above.
-    PredictLine(in, preds - stride, out, bpp, inverse);
-    for (w = bpp; w < width * bpp; ++w) {
-      const int pred = GradientPredictor(preds[w - bpp],
-                                         preds[w - stride],
-                                         preds[w - stride - bpp]);
-      out[w] = in[w] + (inverse ? pred : -pred);
-    }
-  }
+  return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255;  // clip to 8bit
 }
 
-static void GradientFilter(const uint8_t* data, int width, int height,
-                           int bpp, int stride, uint8_t* filtered_data) {
-  DoGradientFilter(data, width, height, bpp, stride, 0, filtered_data);
-}
-
-static void GradientUnfilter(const uint8_t* data, int width, int height,
-                             int bpp, int stride, uint8_t* recon_data) {
-  DoGradientFilter(data, width, height, bpp, stride, 1, recon_data);
-}
-
-#undef SANITY_CHECK
-
-// -----------------------------------------------------------------------------
-// Quick estimate of a potentially interesting filter mode to try, in addition
-// to the default NONE.
-
-#define SMAX 16
-#define SDIFF(a, b) (abs((a) - (b)) >> 4)   // Scoring diff, in [0..SMAX)
-
-WEBP_FILTER_TYPE EstimateBestFilter(const uint8_t* data,
-                                    int width, int height, int stride) {
+WEBP_FILTER_TYPE WebPEstimateBestFilter(const uint8_t* data,
+                                        int width, int height, int stride) {
   int i, j;
   int bins[WEBP_FILTER_LAST][SMAX];
   memset(bins, 0, sizeof(bins));
+
   // We only sample every other pixels. That's enough.
   for (j = 2; j < height - 1; j += 2) {
     const uint8_t* const p = data + j * stride;
@@ -185,7 +51,8 @@ WEBP_FILTER_TYPE EstimateBestFilter(const uint8_t* data,
     }
   }
   {
-    WEBP_FILTER_TYPE filter, best_filter = WEBP_FILTER_NONE;
+    int filter;
+    WEBP_FILTER_TYPE best_filter = WEBP_FILTER_NONE;
     int best_score = 0x7fffffff;
     for (filter = WEBP_FILTER_NONE; filter < WEBP_FILTER_LAST; ++filter) {
       int score = 0;
@@ -196,7 +63,7 @@ WEBP_FILTER_TYPE EstimateBestFilter(const uint8_t* data,
       }
       if (score < best_score) {
         best_score = score;
-        best_filter = filter;
+        best_filter = (WEBP_FILTER_TYPE)filter;
       }
     }
     return best_filter;
@@ -207,23 +74,3 @@ WEBP_FILTER_TYPE EstimateBestFilter(const uint8_t* data,
 #undef SDIFF
 
 //------------------------------------------------------------------------------
-
-const WebPFilterFunc WebPFilters[WEBP_FILTER_LAST] = {
-  NULL,              // WEBP_FILTER_NONE
-  HorizontalFilter,  // WEBP_FILTER_HORIZONTAL
-  VerticalFilter,    // WEBP_FILTER_VERTICAL
-  GradientFilter     // WEBP_FILTER_GRADIENT
-};
-
-const WebPFilterFunc WebPUnfilters[WEBP_FILTER_LAST] = {
-  NULL,                // WEBP_FILTER_NONE
-  HorizontalUnfilter,  // WEBP_FILTER_HORIZONTAL
-  VerticalUnfilter,    // WEBP_FILTER_VERTICAL
-  GradientUnfilter     // WEBP_FILTER_GRADIENT
-};
-
-//------------------------------------------------------------------------------
-
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/drivers/webp/utils/filters.h b/drivers/webp/utils/filters.h
index db886be29a..088b132fc5 100644
--- a/drivers/webp/utils/filters.h
+++ b/drivers/webp/utils/filters.h
@@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Spatial prediction using various filters
@@ -12,42 +14,18 @@
 #ifndef WEBP_UTILS_FILTERS_H_
 #define WEBP_UTILS_FILTERS_H_
 
-#include "../types.h"
+#include "../webp/types.h"
+#include "../dsp/dsp.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
-// Filters.
-typedef enum {
-  WEBP_FILTER_NONE = 0,
-  WEBP_FILTER_HORIZONTAL,
-  WEBP_FILTER_VERTICAL,
-  WEBP_FILTER_GRADIENT,
-  WEBP_FILTER_LAST = WEBP_FILTER_GRADIENT + 1,  // end marker
-  WEBP_FILTER_BEST,
-  WEBP_FILTER_FAST
-} WEBP_FILTER_TYPE;
-
-typedef void (*WebPFilterFunc)(const uint8_t* in, int width, int height,
-                               int bpp, int stride, uint8_t* out);
-
-// Filter the given data using the given predictor.
-// 'in' corresponds to a 2-dimensional pixel array of size (stride * height)
-// in raster order.
-// 'bpp' is number of bytes per pixel, and
-// 'stride' is number of bytes per scan line (with possible padding).
-// 'out' should be pre-allocated.
-extern const WebPFilterFunc WebPFilters[WEBP_FILTER_LAST];
-
-// Reconstruct the original data from the given filtered data.
-extern const WebPFilterFunc WebPUnfilters[WEBP_FILTER_LAST];
-
 // Fast estimate of a potentially good filter.
-extern WEBP_FILTER_TYPE EstimateBestFilter(const uint8_t* data,
-                                           int width, int height, int stride);
+WEBP_FILTER_TYPE WebPEstimateBestFilter(const uint8_t* data,
+                                        int width, int height, int stride);
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/utils/huffman.c b/drivers/webp/utils/huffman.c
index 1cc1cfd355..d57376aa6b 100644
--- a/drivers/webp/utils/huffman.c
+++ b/drivers/webp/utils/huffman.c
@@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Utilities for building and looking up Huffman trees.
@@ -11,228 +13,193 @@
 
 #include <assert.h>
 #include <stdlib.h>
+#include <string.h>
 #include "./huffman.h"
 #include "../utils/utils.h"
-#include "../format_constants.h"
+#include "../webp/format_constants.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+// Huffman data read via DecodeImageStream is represented in two (red and green)
+// bytes.
+#define MAX_HTREE_GROUPS    0x10000
 
-#define NON_EXISTENT_SYMBOL (-1)
-
-static void TreeNodeInit(HuffmanTreeNode* const node) {
-  node->children_ = -1;   // means: 'unassigned so far'
-}
-
-static int NodeIsEmpty(const HuffmanTreeNode* const node) {
-  return (node->children_ < 0);
+HTreeGroup* VP8LHtreeGroupsNew(int num_htree_groups) {
+  HTreeGroup* const htree_groups =
+      (HTreeGroup*)WebPSafeMalloc(num_htree_groups, sizeof(*htree_groups));
+  if (htree_groups == NULL) {
+    return NULL;
+  }
+  assert(num_htree_groups <= MAX_HTREE_GROUPS);
+  return htree_groups;
 }
 
-static int IsFull(const HuffmanTree* const tree) {
-  return (tree->num_nodes_ == tree->max_nodes_);
+void VP8LHtreeGroupsFree(HTreeGroup* const htree_groups) {
+  if (htree_groups != NULL) {
+    WebPSafeFree(htree_groups);
+  }
 }
 
-static void AssignChildren(HuffmanTree* const tree,
-                           HuffmanTreeNode* const node) {
-  HuffmanTreeNode* const children = tree->root_ + tree->num_nodes_;
-  node->children_ = (int)(children - node);
-  assert(children - node == (int)(children - node));
-  tree->num_nodes_ += 2;
-  TreeNodeInit(children + 0);
-  TreeNodeInit(children + 1);
+// Returns reverse(reverse(key, len) + 1, len), where reverse(key, len) is the
+// bit-wise reversal of the len least significant bits of key.
+static WEBP_INLINE uint32_t GetNextKey(uint32_t key, int len) {
+  uint32_t step = 1 << (len - 1);
+  while (key & step) {
+    step >>= 1;
+  }
+  return (key & (step - 1)) + step;
 }
 
-static int TreeInit(HuffmanTree* const tree, int num_leaves) {
-  assert(tree != NULL);
-  if (num_leaves == 0) return 0;
-  // We allocate maximum possible nodes in the tree at once.
-  // Note that a Huffman tree is a full binary tree; and in a full binary tree
-  // with L leaves, the total number of nodes N = 2 * L - 1.
-  tree->max_nodes_ = 2 * num_leaves - 1;
-  tree->root_ = (HuffmanTreeNode*)WebPSafeMalloc((uint64_t)tree->max_nodes_,
-                                                 sizeof(*tree->root_));
-  if (tree->root_ == NULL) return 0;
-  TreeNodeInit(tree->root_);  // Initialize root.
-  tree->num_nodes_ = 1;
-  return 1;
+// Stores code in table[0], table[step], table[2*step], ..., table[end].
+// Assumes that end is an integer multiple of step.
+static WEBP_INLINE void ReplicateValue(HuffmanCode* table,
+                                       int step, int end,
+                                       HuffmanCode code) {
+  assert(end % step == 0);
+  do {
+    end -= step;
+    table[end] = code;
+  } while (end > 0);
 }
 
-void HuffmanTreeRelease(HuffmanTree* const tree) {
-  if (tree != NULL) {
-    free(tree->root_);
-    tree->root_ = NULL;
-    tree->max_nodes_ = 0;
-    tree->num_nodes_ = 0;
+// Returns the table width of the next 2nd level table. count is the histogram
+// of bit lengths for the remaining symbols, len is the code length of the next
+// processed symbol
+static WEBP_INLINE int NextTableBitSize(const int* const count,
+                                        int len, int root_bits) {
+  int left = 1 << (len - root_bits);
+  while (len < MAX_ALLOWED_CODE_LENGTH) {
+    left -= count[len];
+    if (left <= 0) break;
+    ++len;
+    left <<= 1;
   }
+  return len - root_bits;
 }
 
-int HuffmanCodeLengthsToCodes(const int* const code_lengths,
-                              int code_lengths_size, int* const huff_codes) {
-  int symbol;
-  int code_len;
-  int code_length_hist[MAX_ALLOWED_CODE_LENGTH + 1] = { 0 };
-  int curr_code;
-  int next_codes[MAX_ALLOWED_CODE_LENGTH + 1] = { 0 };
-  int max_code_length = 0;
-
+int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
+                          const int code_lengths[], int code_lengths_size) {
+  HuffmanCode* table = root_table;  // next available space in table
+  int total_size = 1 << root_bits;  // total size root table + 2nd level table
+  int* sorted = NULL;               // symbols sorted by code length
+  int len;                          // current code length
+  int symbol;                       // symbol index in original or sorted table
+  // number of codes of each length:
+  int count[MAX_ALLOWED_CODE_LENGTH + 1] = { 0 };
+  // offsets in sorted table for each length:
+  int offset[MAX_ALLOWED_CODE_LENGTH + 1];
+
+  assert(code_lengths_size != 0);
   assert(code_lengths != NULL);
-  assert(code_lengths_size > 0);
-  assert(huff_codes != NULL);
+  assert(root_table != NULL);
+  assert(root_bits > 0);
 
-  // Calculate max code length.
+  // Build histogram of code lengths.
   for (symbol = 0; symbol < code_lengths_size; ++symbol) {
-    if (code_lengths[symbol] > max_code_length) {
-      max_code_length = code_lengths[symbol];
+    if (code_lengths[symbol] > MAX_ALLOWED_CODE_LENGTH) {
+      return 0;
     }
+    ++count[code_lengths[symbol]];
   }
-  if (max_code_length > MAX_ALLOWED_CODE_LENGTH) return 0;
 
-  // Calculate code length histogram.
-  for (symbol = 0; symbol < code_lengths_size; ++symbol) {
-    ++code_length_hist[code_lengths[symbol]];
-  }
-  code_length_hist[0] = 0;
-
-  // Calculate the initial values of 'next_codes' for each code length.
-  // next_codes[code_len] denotes the code to be assigned to the next symbol
-  // of code length 'code_len'.
-  curr_code = 0;
-  next_codes[0] = -1;  // Unused, as code length = 0 implies code doesn't exist.
-  for (code_len = 1; code_len <= max_code_length; ++code_len) {
-    curr_code = (curr_code + code_length_hist[code_len - 1]) << 1;
-    next_codes[code_len] = curr_code;
+  // Error, all code lengths are zeros.
+  if (count[0] == code_lengths_size) {
+    return 0;
   }
 
-  // Get symbols.
-  for (symbol = 0; symbol < code_lengths_size; ++symbol) {
-    if (code_lengths[symbol] > 0) {
-      huff_codes[symbol] = next_codes[code_lengths[symbol]]++;
-    } else {
-      huff_codes[symbol] = NON_EXISTENT_SYMBOL;
-    }
-  }
-  return 1;
-}
-
-static int TreeAddSymbol(HuffmanTree* const tree,
-                         int symbol, int code, int code_length) {
-  HuffmanTreeNode* node = tree->root_;
-  const HuffmanTreeNode* const max_node = tree->root_ + tree->max_nodes_;
-  while (code_length-- > 0) {
-    if (node >= max_node) {
+  // Generate offsets into sorted symbol table by code length.
+  offset[1] = 0;
+  for (len = 1; len < MAX_ALLOWED_CODE_LENGTH; ++len) {
+    if (count[len] > (1 << len)) {
       return 0;
     }
-    if (NodeIsEmpty(node)) {
-      if (IsFull(tree)) return 0;    // error: too many symbols.
-      AssignChildren(tree, node);
-    } else if (HuffmanTreeNodeIsLeaf(node)) {
-      return 0;  // leaf is already occupied.
-    }
-    node += node->children_ + ((code >> code_length) & 1);
-  }
-  if (NodeIsEmpty(node)) {
-    node->children_ = 0;      // turn newly created node into a leaf.
-  } else if (!HuffmanTreeNodeIsLeaf(node)) {
-    return 0;   // trying to assign a symbol to already used code.
+    offset[len + 1] = offset[len] + count[len];
   }
-  node->symbol_ = symbol;  // Add symbol in this node.
-  return 1;
-}
 
-int HuffmanTreeBuildImplicit(HuffmanTree* const tree,
-                             const int* const code_lengths,
-                             int code_lengths_size) {
-  int symbol;
-  int num_symbols = 0;
-  int root_symbol = 0;
-
-  assert(tree != NULL);
-  assert(code_lengths != NULL);
+  sorted = (int*)WebPSafeMalloc(code_lengths_size, sizeof(*sorted));
+  if (sorted == NULL) {
+    return 0;
+  }
 
-  // Find out number of symbols and the root symbol.
+  // Sort symbols by length, by symbol order within each length.
   for (symbol = 0; symbol < code_lengths_size; ++symbol) {
+    const int symbol_code_length = code_lengths[symbol];
     if (code_lengths[symbol] > 0) {
-      // Note: code length = 0 indicates non-existent symbol.
-      ++num_symbols;
-      root_symbol = symbol;
+      sorted[offset[symbol_code_length]++] = symbol;
     }
   }
 
-  // Initialize the tree. Will fail for num_symbols = 0
-  if (!TreeInit(tree, num_symbols)) return 0;
-
-  // Build tree.
-  if (num_symbols == 1) {  // Trivial case.
-    const int max_symbol = code_lengths_size;
-    if (root_symbol < 0 || root_symbol >= max_symbol) {
-      HuffmanTreeRelease(tree);
-      return 0;
-    }
-    return TreeAddSymbol(tree, root_symbol, 0, 0);
-  } else {  // Normal case.
-    int ok = 0;
-
-    // Get Huffman codes from the code lengths.
-    int* const codes =
-        (int*)WebPSafeMalloc((uint64_t)code_lengths_size, sizeof(*codes));
-    if (codes == NULL) goto End;
+  // Special case code with only one value.
+  if (offset[MAX_ALLOWED_CODE_LENGTH] == 1) {
+    HuffmanCode code;
+    code.bits = 0;
+    code.value = (uint16_t)sorted[0];
+    ReplicateValue(table, 1, total_size, code);
+    WebPSafeFree(sorted);
+    return total_size;
+  }
 
-    if (!HuffmanCodeLengthsToCodes(code_lengths, code_lengths_size, codes)) {
-      goto End;
+  {
+    int step;              // step size to replicate values in current table
+    uint32_t low = -1;     // low bits for current root entry
+    uint32_t mask = total_size - 1;    // mask for low bits
+    uint32_t key = 0;      // reversed prefix code
+    int num_nodes = 1;     // number of Huffman tree nodes
+    int num_open = 1;      // number of open branches in current tree level
+    int table_bits = root_bits;        // key length of current table
+    int table_size = 1 << table_bits;  // size of current table
+    symbol = 0;
+    // Fill in root table.
+    for (len = 1, step = 2; len <= root_bits; ++len, step <<= 1) {
+      num_open <<= 1;
+      num_nodes += num_open;
+      num_open -= count[len];
+      if (num_open < 0) {
+        WebPSafeFree(sorted);
+        return 0;
+      }
+      for (; count[len] > 0; --count[len]) {
+        HuffmanCode code;
+        code.bits = (uint8_t)len;
+        code.value = (uint16_t)sorted[symbol++];
+        ReplicateValue(&table[key], step, table_size, code);
+        key = GetNextKey(key, len);
+      }
     }
 
-    // Add symbols one-by-one.
-    for (symbol = 0; symbol < code_lengths_size; ++symbol) {
-      if (code_lengths[symbol] > 0) {
-        if (!TreeAddSymbol(tree, symbol, codes[symbol], code_lengths[symbol])) {
-          goto End;
+    // Fill in 2nd level tables and add pointers to root table.
+    for (len = root_bits + 1, step = 2; len <= MAX_ALLOWED_CODE_LENGTH;
+         ++len, step <<= 1) {
+      num_open <<= 1;
+      num_nodes += num_open;
+      num_open -= count[len];
+      if (num_open < 0) {
+        WebPSafeFree(sorted);
+        return 0;
+      }
+      for (; count[len] > 0; --count[len]) {
+        HuffmanCode code;
+        if ((key & mask) != low) {
+          table += table_size;
+          table_bits = NextTableBitSize(count, len, root_bits);
+          table_size = 1 << table_bits;
+          total_size += table_size;
+          low = key & mask;
+          root_table[low].bits = (uint8_t)(table_bits + root_bits);
+          root_table[low].value = (uint16_t)((table - root_table) - low);
         }
+        code.bits = (uint8_t)(len - root_bits);
+        code.value = (uint16_t)sorted[symbol++];
+        ReplicateValue(&table[key >> root_bits], step, table_size, code);
+        key = GetNextKey(key, len);
       }
     }
-    ok = 1;
- End:
-    free(codes);
-    ok = ok && IsFull(tree);
-    if (!ok) HuffmanTreeRelease(tree);
-    return ok;
-  }
-}
-
-int HuffmanTreeBuildExplicit(HuffmanTree* const tree,
-                             const int* const code_lengths,
-                             const int* const codes,
-                             const int* const symbols, int max_symbol,
-                             int num_symbols) {
-  int ok = 0;
-  int i;
-
-  assert(tree != NULL);
-  assert(code_lengths != NULL);
-  assert(codes != NULL);
-  assert(symbols != NULL);
-
-  // Initialize the tree. Will fail if num_symbols = 0.
-  if (!TreeInit(tree, num_symbols)) return 0;
 
-  // Add symbols one-by-one.
-  for (i = 0; i < num_symbols; ++i) {
-    if (codes[i] != NON_EXISTENT_SYMBOL) {
-      if (symbols[i] < 0 || symbols[i] >= max_symbol) {
-        goto End;
-      }
-      if (!TreeAddSymbol(tree, symbols[i], codes[i], code_lengths[i])) {
-        goto End;
-      }
+    // Check if tree is full.
+    if (num_nodes != 2 * offset[MAX_ALLOWED_CODE_LENGTH] - 1) {
+      WebPSafeFree(sorted);
+      return 0;
     }
   }
-  ok = 1;
- End:
-  ok = ok && IsFull(tree);
-  if (!ok) HuffmanTreeRelease(tree);
-  return ok;
-}
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
+  WebPSafeFree(sorted);
+  return total_size;
+}
diff --git a/drivers/webp/utils/huffman.h b/drivers/webp/utils/huffman.h
index f16447e649..c6dd6aaa45 100644
--- a/drivers/webp/utils/huffman.h
+++ b/drivers/webp/utils/huffman.h
@@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Utilities for building and looking up Huffman trees.
@@ -13,65 +15,73 @@
 #define WEBP_UTILS_HUFFMAN_H_
 
 #include <assert.h>
-#include "../types.h"
+#include "../webp/format_constants.h"
+#include "../webp/types.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
-// A node of a Huffman tree.
-typedef struct {
-  int symbol_;
-  int children_;  // delta offset to both children (contiguous) or 0 if leaf.
-} HuffmanTreeNode;
+#define HUFFMAN_TABLE_BITS      8
+#define HUFFMAN_TABLE_MASK      ((1 << HUFFMAN_TABLE_BITS) - 1)
+
+#define LENGTHS_TABLE_BITS      7
+#define LENGTHS_TABLE_MASK      ((1 << LENGTHS_TABLE_BITS) - 1)
 
-// Huffman Tree.
-typedef struct HuffmanTree HuffmanTree;
-struct HuffmanTree {
-  HuffmanTreeNode* root_;   // all the nodes, starting at root.
-  int max_nodes_;           // max number of nodes
-  int num_nodes_;           // number of currently occupied nodes
-};
 
-// Returns true if the given node is a leaf of the Huffman tree.
-static WEBP_INLINE int HuffmanTreeNodeIsLeaf(
-    const HuffmanTreeNode* const node) {
-  return (node->children_ == 0);
-}
+// Huffman lookup table entry
+typedef struct {
+  uint8_t bits;     // number of bits used for this symbol
+  uint16_t value;   // symbol value or table offset
+} HuffmanCode;
 
-// Go down one level. Most critical function. 'right_child' must be 0 or 1.
-static WEBP_INLINE const HuffmanTreeNode* HuffmanTreeNextNode(
-    const HuffmanTreeNode* node, int right_child) {
-  return node + node->children_ + right_child;
-}
+// long version for holding 32b values
+typedef struct {
+  int bits;         // number of bits used for this symbol,
+                    // or an impossible value if not a literal code.
+  uint32_t value;   // 32b packed ARGB value if literal,
+                    // or non-literal symbol otherwise
+} HuffmanCode32;
 
-// Releases the nodes of the Huffman tree.
-// Note: It does NOT free 'tree' itself.
-void HuffmanTreeRelease(HuffmanTree* const tree);
+#define HUFFMAN_PACKED_BITS 6
+#define HUFFMAN_PACKED_TABLE_SIZE (1u << HUFFMAN_PACKED_BITS)
 
-// Builds Huffman tree assuming code lengths are implicitly in symbol order.
-// Returns false in case of error (invalid tree or memory error).
-int HuffmanTreeBuildImplicit(HuffmanTree* const tree,
-                             const int* const code_lengths,
-                             int code_lengths_size);
+// Huffman table group.
+// Includes special handling for the following cases:
+//  - is_trivial_literal: one common literal base for RED/BLUE/ALPHA (not GREEN)
+//  - is_trivial_code: only 1 code (no bit is read from bitstream)
+//  - use_packed_table: few enough literal symbols, so all the bit codes
+//    can fit into a small look-up table packed_table[]
+// The common literal base, if applicable, is stored in 'literal_arb'.
+typedef struct HTreeGroup HTreeGroup;
+struct HTreeGroup {
+  HuffmanCode* htrees[HUFFMAN_CODES_PER_META_CODE];
+  int      is_trivial_literal;  // True, if huffman trees for Red, Blue & Alpha
+                                // Symbols are trivial (have a single code).
+  uint32_t literal_arb;         // If is_trivial_literal is true, this is the
+                                // ARGB value of the pixel, with Green channel
+                                // being set to zero.
+  int is_trivial_code;          // true if is_trivial_literal with only one code
+  int use_packed_table;         // use packed table below for short literal code
+  // table mapping input bits to a packed values, or escape case to literal code
+  HuffmanCode32 packed_table[HUFFMAN_PACKED_TABLE_SIZE];
+};
 
-// Build a Huffman tree with explicitly given lists of code lengths, codes
-// and symbols. Verifies that all symbols added are smaller than max_symbol.
-// Returns false in case of an invalid symbol, invalid tree or memory error.
-int HuffmanTreeBuildExplicit(HuffmanTree* const tree,
-                             const int* const code_lengths,
-                             const int* const codes,
-                             const int* const symbols, int max_symbol,
-                             int num_symbols);
+// Creates the instance of HTreeGroup with specified number of tree-groups.
+HTreeGroup* VP8LHtreeGroupsNew(int num_htree_groups);
 
-// Utility: converts Huffman code lengths to corresponding Huffman codes.
-// 'huff_codes' should be pre-allocated.
-// Returns false in case of error (memory allocation, invalid codes).
-int HuffmanCodeLengthsToCodes(const int* const code_lengths,
-                              int code_lengths_size, int* const huff_codes);
+// Releases the memory allocated for HTreeGroup.
+void VP8LHtreeGroupsFree(HTreeGroup* const htree_groups);
 
+// Builds Huffman lookup table assuming code lengths are in symbol order.
+// The 'code_lengths' is pre-allocated temporary memory buffer used for creating
+// the huffman table.
+// Returns built table size or 0 in case of error (invalid tree or
+// memory error).
+int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
+                          const int code_lengths[], int code_lengths_size);
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/utils/huffman_encode.c b/drivers/webp/utils/huffman_encode.c
index e172b10a85..6421c2beed 100644
--- a/drivers/webp/utils/huffman_encode.c
+++ b/drivers/webp/utils/huffman_encode.c
@@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Author: Jyrki Alakuijala (jyrki@google.com)
@@ -14,7 +16,7 @@
 #include <string.h>
 #include "./huffman_encode.h"
 #include "../utils/utils.h"
-#include "../format_constants.h"
+#include "../webp/format_constants.h"
 
 // -----------------------------------------------------------------------------
 // Util function to optimize the symbol map for RLE coding
@@ -25,14 +27,14 @@ static int ValuesShouldBeCollapsedToStrideAverage(int a, int b) {
 }
 
 // Change the population counts in a way that the consequent
-// Hufmann tree compression, especially its RLE-part, give smaller output.
-static int OptimizeHuffmanForRle(int length, int* const counts) {
-  uint8_t* good_for_rle;
+// Huffman tree compression, especially its RLE-part, give smaller output.
+static void OptimizeHuffmanForRle(int length, uint8_t* const good_for_rle,
+                                  uint32_t* const counts) {
   // 1) Let's make the Huffman code more compatible with rle encoding.
   int i;
   for (; length >= 0; --length) {
     if (length == 0) {
-      return 1;  // All zeros.
+      return;  // All zeros.
     }
     if (counts[length - 1] != 0) {
       // Now counts[0..length - 1] does not have trailing zeros.
@@ -41,15 +43,11 @@ static int OptimizeHuffmanForRle(int length, int* const counts) {
   }
   // 2) Let's mark all population counts that already can be encoded
   // with an rle code.
-  good_for_rle = (uint8_t*)calloc(length, 1);
-  if (good_for_rle == NULL) {
-    return 0;
-  }
   {
     // Let's not spoil any of the existing good rle codes.
     // Mark any seq of 0's that is longer as 5 as a good_for_rle.
     // Mark any seq of non-0's that is longer as 7 as a good_for_rle.
-    int symbol = counts[0];
+    uint32_t symbol = counts[0];
     int stride = 0;
     for (i = 0; i < length + 1; ++i) {
       if (i == length || counts[i] != symbol) {
@@ -71,17 +69,17 @@ static int OptimizeHuffmanForRle(int length, int* const counts) {
   }
   // 3) Let's replace those population counts that lead to more rle codes.
   {
-    int stride = 0;
-    int limit = counts[0];
-    int sum = 0;
+    uint32_t stride = 0;
+    uint32_t limit = counts[0];
+    uint32_t sum = 0;
     for (i = 0; i < length + 1; ++i) {
       if (i == length || good_for_rle[i] ||
           (i != 0 && good_for_rle[i - 1]) ||
           !ValuesShouldBeCollapsedToStrideAverage(counts[i], limit)) {
         if (stride >= 4 || (stride >= 3 && sum == 0)) {
-          int k;
+          uint32_t k;
           // The stride must end, collapse what we have, if we have enough (4).
-          int count = (sum + stride / 2) / stride;
+          uint32_t count = (sum + stride / 2) / stride;
           if (count < 1) {
             count = 1;
           }
@@ -117,17 +115,8 @@ static int OptimizeHuffmanForRle(int length, int* const counts) {
       }
     }
   }
-  free(good_for_rle);
-  return 1;
 }
 
-typedef struct {
-  int total_count_;
-  int value_;
-  int pool_index_left_;
-  int pool_index_right_;
-} HuffmanTree;
-
 // A comparer function for two Huffman trees: sorts first by 'total count'
 // (more comes first), and then by 'value' (more comes first).
 static int CompareHuffmanTrees(const void* ptr1, const void* ptr2) {
@@ -138,13 +127,8 @@ static int CompareHuffmanTrees(const void* ptr1, const void* ptr2) {
   } else if (t1->total_count_ < t2->total_count_) {
     return 1;
   } else {
-    if (t1->value_ < t2->value_) {
-      return -1;
-    }
-    if (t1->value_ > t2->value_) {
-      return 1;
-    }
-    return 0;
+    assert(t1->value_ != t2->value_);
+    return (t1->value_ < t2->value_) ? -1 : 1;
   }
 }
 
@@ -178,12 +162,12 @@ static void SetBitDepths(const HuffmanTree* const tree,
 // we are not planning to use this with extremely long blocks.
 //
 // See http://en.wikipedia.org/wiki/Huffman_coding
-static int GenerateOptimalTree(const int* const histogram, int histogram_size,
-                               int tree_depth_limit,
-                               uint8_t* const bit_depths) {
-  int count_min;
+static void GenerateOptimalTree(const uint32_t* const histogram,
+                                int histogram_size,
+                                HuffmanTree* tree, int tree_depth_limit,
+                                uint8_t* const bit_depths) {
+  uint32_t count_min;
   HuffmanTree* tree_pool;
-  HuffmanTree* tree;
   int tree_size_orig = 0;
   int i;
 
@@ -193,12 +177,10 @@ static int GenerateOptimalTree(const int* const histogram, int histogram_size,
     }
   }
 
-  // 3 * tree_size is enough to cover all the nodes representing a
-  // population and all the inserted nodes combining two existing nodes.
-  // The tree pool needs 2 * (tree_size_orig - 1) entities, and the
-  // tree needs exactly tree_size_orig entities.
-  tree = (HuffmanTree*)WebPSafeMalloc(3ULL * tree_size_orig, sizeof(*tree));
-  if (tree == NULL) return 0;
+  if (tree_size_orig == 0) {   // pretty optimal already!
+    return;
+  }
+
   tree_pool = tree + tree_size_orig;
 
   // For block sizes with less than 64k symbols we never need to do a
@@ -214,7 +196,7 @@ static int GenerateOptimalTree(const int* const histogram, int histogram_size,
     int j;
     for (j = 0; j < histogram_size; ++j) {
       if (histogram[j] != 0) {
-        const int count =
+        const uint32_t count =
             (histogram[j] < count_min) ? count_min : histogram[j];
         tree[idx].total_count_ = count;
         tree[idx].value_ = j;
@@ -230,11 +212,11 @@ static int GenerateOptimalTree(const int* const histogram, int histogram_size,
     if (tree_size > 1) {  // Normal case.
       int tree_pool_size = 0;
       while (tree_size > 1) {  // Finish when we have only one root.
-        int count;
+        uint32_t count;
         tree_pool[tree_pool_size++] = tree[tree_size - 1];
         tree_pool[tree_pool_size++] = tree[tree_size - 2];
         count = tree_pool[tree_pool_size - 1].total_count_ +
-            tree_pool[tree_pool_size - 2].total_count_;
+                tree_pool[tree_pool_size - 2].total_count_;
         tree_size -= 2;
         {
           // Search for the insertion point.
@@ -271,8 +253,6 @@ static int GenerateOptimalTree(const int* const histogram, int histogram_size,
       }
     }
   }
-  free(tree);
-  return 1;
 }
 
 // -----------------------------------------------------------------------------
@@ -423,17 +403,15 @@ static void ConvertBitDepthsToSymbols(HuffmanTreeCode* const tree) {
 // -----------------------------------------------------------------------------
 // Main entry point
 
-int VP8LCreateHuffmanTree(int* const histogram, int tree_depth_limit,
-                          HuffmanTreeCode* const tree) {
-  const int num_symbols = tree->num_symbols;
-  if (!OptimizeHuffmanForRle(num_symbols, histogram)) {
-    return 0;
-  }
-  if (!GenerateOptimalTree(histogram, num_symbols,
-                           tree_depth_limit, tree->code_lengths)) {
-    return 0;
-  }
+void VP8LCreateHuffmanTree(uint32_t* const histogram, int tree_depth_limit,
+                           uint8_t* const buf_rle,
+                           HuffmanTree* const huff_tree,
+                           HuffmanTreeCode* const huff_code) {
+  const int num_symbols = huff_code->num_symbols;
+  memset(buf_rle, 0, num_symbols * sizeof(*buf_rle));
+  OptimizeHuffmanForRle(num_symbols, buf_rle, histogram);
+  GenerateOptimalTree(histogram, num_symbols, huff_tree, tree_depth_limit,
+                      huff_code->code_lengths);
   // Create the actual bit codes for the bit lengths.
-  ConvertBitDepthsToSymbols(tree);
-  return 1;
+  ConvertBitDepthsToSymbols(huff_code);
 }
diff --git a/drivers/webp/utils/huffman_encode.h b/drivers/webp/utils/huffman_encode.h
index 7f4aedc102..a157165148 100644
--- a/drivers/webp/utils/huffman_encode.h
+++ b/drivers/webp/utils/huffman_encode.h
@@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Author: Jyrki Alakuijala (jyrki@google.com)
@@ -12,9 +14,9 @@
 #ifndef WEBP_UTILS_HUFFMAN_ENCODE_H_
 #define WEBP_UTILS_HUFFMAN_ENCODE_H_
 
-#include "../types.h"
+#include "../webp/types.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -31,16 +33,27 @@ typedef struct {
   uint16_t* codes;         // Symbol Codes.
 } HuffmanTreeCode;
 
+// Struct to represent the Huffman tree.
+typedef struct {
+  uint32_t total_count_;   // Symbol frequency.
+  int value_;              // Symbol value.
+  int pool_index_left_;    // Index for the left sub-tree.
+  int pool_index_right_;   // Index for the right sub-tree.
+} HuffmanTree;
+
 // Turn the Huffman tree into a token sequence.
 // Returns the number of tokens used.
 int VP8LCreateCompressedHuffmanTree(const HuffmanTreeCode* const tree,
                                     HuffmanTreeToken* tokens, int max_tokens);
 
 // Create an optimized tree, and tokenize it.
-int VP8LCreateHuffmanTree(int* const histogram, int tree_depth_limit,
-                          HuffmanTreeCode* const tree);
+// 'buf_rle' and 'huff_tree' are pre-allocated and the 'tree' is the constructed
+// huffman code tree.
+void VP8LCreateHuffmanTree(uint32_t* const histogram, int tree_depth_limit,
+                           uint8_t* const buf_rle, HuffmanTree* const huff_tree,
+                           HuffmanTreeCode* const tree);
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }
 #endif
 
diff --git a/drivers/webp/utils/quant_levels.c b/drivers/webp/utils/quant_levels.c
index f6884392aa..d7c8aab922 100644
--- a/drivers/webp/utils/quant_levels.c
+++ b/drivers/webp/utils/quant_levels.c
@@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Quantize levels for specified number of quantization-levels ([2, 256]).
@@ -14,10 +16,6 @@
 
 #include "./quant_levels.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 #define NUM_SYMBOLS     256
 
 #define MAX_ITER  6             // Maximum number of convergence steps.
@@ -140,15 +138,3 @@ int QuantizeLevels(uint8_t* const data, int width, int height,
   return 1;
 }
 
-int DequantizeLevels(uint8_t* const data, int width, int height) {
-  if (data == NULL || width <= 0 || height <= 0) return 0;
-  // TODO(skal): implement gradient smoothing.
-  (void)data;
-  (void)width;
-  (void)height;
-  return 1;
-}
-
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/drivers/webp/utils/quant_levels.h b/drivers/webp/utils/quant_levels.h
index 4f165fd230..1cb5a32cae 100644
--- a/drivers/webp/utils/quant_levels.h
+++ b/drivers/webp/utils/quant_levels.h
@@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Alpha plane quantization utility
@@ -14,9 +16,9 @@
 
 #include <stdlib.h>
 
-#include "../types.h"
+#include "../webp/types.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -27,12 +29,7 @@ extern "C" {
 int QuantizeLevels(uint8_t* const data, int width, int height, int num_levels,
                    uint64_t* const sse);
 
-// Apply post-processing to input 'data' of size 'width'x'height' assuming
-// that the source was quantized to a reduced number of levels.
-// Returns false in case of error (data is NULL, invalid parameters, ...).
-int DequantizeLevels(uint8_t* const data, int width, int height);
-
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/utils/rescaler.c b/drivers/webp/utils/rescaler.c
index 9825dcbc5f..00c9300bfb 100644
--- a/drivers/webp/utils/rescaler.c
+++ b/drivers/webp/utils/rescaler.c
@@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Rescaling functions
@@ -11,124 +13,116 @@
 
 #include <assert.h>
 #include <stdlib.h>
+#include <string.h>
+#include "../dsp/dsp.h"
 #include "./rescaler.h"
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
-#define RFIX 30
-#define MULT_FIX(x,y) (((int64_t)(x) * (y) + (1 << (RFIX - 1))) >> RFIX)
-
 void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height,
-                      uint8_t* const dst, int dst_width, int dst_height,
-                      int dst_stride, int num_channels, int x_add, int x_sub,
-                      int y_add, int y_sub, int32_t* const work) {
+                      uint8_t* const dst,
+                      int dst_width, int dst_height, int dst_stride,
+                      int num_channels, rescaler_t* const work) {
+  const int x_add = src_width, x_sub = dst_width;
+  const int y_add = src_height, y_sub = dst_height;
   wrk->x_expand = (src_width < dst_width);
+  wrk->y_expand = (src_height < dst_height);
   wrk->src_width = src_width;
   wrk->src_height = src_height;
   wrk->dst_width = dst_width;
   wrk->dst_height = dst_height;
+  wrk->src_y = 0;
+  wrk->dst_y = 0;
   wrk->dst = dst;
   wrk->dst_stride = dst_stride;
   wrk->num_channels = num_channels;
+
   // for 'x_expand', we use bilinear interpolation
-  wrk->x_add = wrk->x_expand ? (x_sub - 1) : x_add - x_sub;
+  wrk->x_add = wrk->x_expand ? (x_sub - 1) : x_add;
   wrk->x_sub = wrk->x_expand ? (x_add - 1) : x_sub;
-  wrk->y_accum = y_add;
-  wrk->y_add = y_add;
-  wrk->y_sub = y_sub;
-  wrk->fx_scale = (1 << RFIX) / x_sub;
-  wrk->fy_scale = (1 << RFIX) / y_sub;
-  wrk->fxy_scale = wrk->x_expand ?
-      ((int64_t)dst_height << RFIX) / (x_sub * src_height) :
-      ((int64_t)dst_height << RFIX) / (x_add * src_height);
+  if (!wrk->x_expand) {  // fx_scale is not used otherwise
+    wrk->fx_scale = WEBP_RESCALER_FRAC(1, wrk->x_sub);
+  }
+  // vertical scaling parameters
+  wrk->y_add = wrk->y_expand ? y_add - 1 : y_add;
+  wrk->y_sub = wrk->y_expand ? y_sub - 1 : y_sub;
+  wrk->y_accum = wrk->y_expand ? wrk->y_sub : wrk->y_add;
+  if (!wrk->y_expand) {
+    // this is WEBP_RESCALER_FRAC(dst_height, x_add * y_add) without the cast.
+    const uint64_t ratio =
+        (uint64_t)dst_height * WEBP_RESCALER_ONE / (wrk->x_add * wrk->y_add);
+    if (ratio != (uint32_t)ratio) {
+      // We can't represent the ratio with the current fixed-point precision.
+      // => We special-case fxy_scale = 0, in WebPRescalerExportRow().
+      wrk->fxy_scale = 0;
+    } else {
+      wrk->fxy_scale = (uint32_t)ratio;
+    }
+    wrk->fy_scale = WEBP_RESCALER_FRAC(1, wrk->y_sub);
+  } else {
+    wrk->fy_scale = WEBP_RESCALER_FRAC(1, wrk->x_add);
+    // wrk->fxy_scale is unused here.
+  }
   wrk->irow = work;
   wrk->frow = work + num_channels * dst_width;
-}
+  memset(work, 0, 2 * dst_width * num_channels * sizeof(*work));
 
-void WebPRescalerImportRow(WebPRescaler* const wrk,
-                           const uint8_t* const src, int channel) {
-  const int x_stride = wrk->num_channels;
-  const int x_out_max = wrk->dst_width * wrk->num_channels;
-  int x_in = channel;
-  int x_out;
-  int accum = 0;
-  if (!wrk->x_expand) {
-    int sum = 0;
-    for (x_out = channel; x_out < x_out_max; x_out += x_stride) {
-      accum += wrk->x_add;
-      for (; accum > 0; accum -= wrk->x_sub) {
-        sum += src[x_in];
-        x_in += x_stride;
-      }
-      {        // Emit next horizontal pixel.
-        const int32_t base = src[x_in];
-        const int32_t frac = base * (-accum);
-        x_in += x_stride;
-        wrk->frow[x_out] = (sum + base) * wrk->x_sub - frac;
-        // fresh fractional start for next pixel
-        sum = (int)MULT_FIX(frac, wrk->fx_scale);
-      }
-    }
-  } else {        // simple bilinear interpolation
-    int left = src[channel], right = src[channel];
-    for (x_out = channel; x_out < x_out_max; x_out += x_stride) {
-      if (accum < 0) {
-        left = right;
-        x_in += x_stride;
-        right = src[x_in];
-        accum += wrk->x_add;
-      }
-      wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
-      accum -= wrk->x_sub;
-    }
-  }
-  // Accumulate the new row's contribution
-  for (x_out = channel; x_out < x_out_max; x_out += x_stride) {
-    wrk->irow[x_out] += wrk->frow[x_out];
-  }
+  WebPRescalerDspInit();
 }
 
-uint8_t* WebPRescalerExportRow(WebPRescaler* const wrk) {
-  if (wrk->y_accum <= 0) {
-    int x_out;
-    uint8_t* const dst = wrk->dst;
-    int32_t* const irow = wrk->irow;
-    const int32_t* const frow = wrk->frow;
-    const int yscale = wrk->fy_scale * (-wrk->y_accum);
-    const int x_out_max = wrk->dst_width * wrk->num_channels;
+int WebPRescalerGetScaledDimensions(int src_width, int src_height,
+                                    int* const scaled_width,
+                                    int* const scaled_height) {
+  assert(scaled_width != NULL);
+  assert(scaled_height != NULL);
+  {
+    int width = *scaled_width;
+    int height = *scaled_height;
 
-    for (x_out = 0; x_out < x_out_max; ++x_out) {
-      const int frac = (int)MULT_FIX(frow[x_out], yscale);
-      const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
-      dst[x_out] = (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
-      irow[x_out] = frac;   // new fractional start
+    // if width is unspecified, scale original proportionally to height ratio.
+    if (width == 0) {
+      width = (src_width * height + src_height / 2) / src_height;
     }
-    wrk->y_accum += wrk->y_add;
-    wrk->dst += wrk->dst_stride;
-    return dst;
-  } else {
-    return NULL;
+    // if height is unspecified, scale original proportionally to width ratio.
+    if (height == 0) {
+      height = (src_height * width + src_width / 2) / src_width;
+    }
+    // Check if the overall dimensions still make sense.
+    if (width <= 0 || height <= 0) {
+      return 0;
+    }
+
+    *scaled_width = width;
+    *scaled_height = height;
+    return 1;
   }
 }
 
-#undef MULT_FIX
-#undef RFIX
-
 //------------------------------------------------------------------------------
 // all-in-one calls
 
+int WebPRescaleNeededLines(const WebPRescaler* const wrk, int max_num_lines) {
+  const int num_lines = (wrk->y_accum + wrk->y_sub - 1) / wrk->y_sub;
+  return (num_lines > max_num_lines) ? max_num_lines : num_lines;
+}
+
 int WebPRescalerImport(WebPRescaler* const wrk, int num_lines,
                        const uint8_t* src, int src_stride) {
   int total_imported = 0;
-  while (total_imported < num_lines && wrk->y_accum > 0) {
-    int channel;
-    for (channel = 0; channel < wrk->num_channels; ++channel) {
-      WebPRescalerImportRow(wrk, src, channel);
+  while (total_imported < num_lines && !WebPRescalerHasPendingOutput(wrk)) {
+    if (wrk->y_expand) {
+      rescaler_t* const tmp = wrk->irow;
+      wrk->irow = wrk->frow;
+      wrk->frow = tmp;
     }
+    WebPRescalerImportRow(wrk, src);
+    if (!wrk->y_expand) {     // Accumulate the contribution of the new row.
+      int x;
+      for (x = 0; x < wrk->num_channels * wrk->dst_width; ++x) {
+        wrk->irow[x] += wrk->frow[x];
+      }
+    }
+    ++wrk->src_y;
     src += src_stride;
     ++total_imported;
     wrk->y_accum -= wrk->y_sub;
@@ -146,7 +140,3 @@ int WebPRescalerExport(WebPRescaler* const rescaler) {
 }
 
 //------------------------------------------------------------------------------
-
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/drivers/webp/utils/rescaler.h b/drivers/webp/utils/rescaler.h
index 9c9133d19b..98b01a76d0 100644
--- a/drivers/webp/utils/rescaler.h
+++ b/drivers/webp/utils/rescaler.h
@@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Rescaling functions
@@ -12,64 +14,87 @@
 #ifndef WEBP_UTILS_RESCALER_H_
 #define WEBP_UTILS_RESCALER_H_
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
-#include "../types.h"
+#include "../webp/types.h"
+
+#define WEBP_RESCALER_RFIX 32   // fixed-point precision for multiplies
+#define WEBP_RESCALER_ONE (1ull << WEBP_RESCALER_RFIX)
+#define WEBP_RESCALER_FRAC(x, y) \
+    ((uint32_t)(((uint64_t)(x) << WEBP_RESCALER_RFIX) / (y)))
 
 // Structure used for on-the-fly rescaling
-typedef struct {
+typedef uint32_t rescaler_t;   // type for side-buffer
+typedef struct WebPRescaler WebPRescaler;
+struct WebPRescaler {
   int x_expand;               // true if we're expanding in the x direction
+  int y_expand;               // true if we're expanding in the y direction
   int num_channels;           // bytes to jump between pixels
-  int fy_scale, fx_scale;     // fixed-point scaling factor
-  int64_t fxy_scale;          // ''
-  // we need hpel-precise add/sub increments, for the downsampled U/V planes.
+  uint32_t fx_scale;          // fixed-point scaling factors
+  uint32_t fy_scale;          // ''
+  uint32_t fxy_scale;         // ''
   int y_accum;                // vertical accumulator
-  int y_add, y_sub;           // vertical increments (add ~= src, sub ~= dst)
-  int x_add, x_sub;           // horizontal increments (add ~= src, sub ~= dst)
+  int y_add, y_sub;           // vertical increments
+  int x_add, x_sub;           // horizontal increments
   int src_width, src_height;  // source dimensions
   int dst_width, dst_height;  // destination dimensions
+  int src_y, dst_y;           // row counters for input and output
   uint8_t* dst;
   int dst_stride;
-  int32_t* irow, *frow;       // work buffer
-} WebPRescaler;
+  rescaler_t* irow, *frow;    // work buffer
+};
 
 // Initialize a rescaler given scratch area 'work' and dimensions of src & dst.
-void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height,
+void WebPRescalerInit(WebPRescaler* const rescaler,
+                      int src_width, int src_height,
                       uint8_t* const dst,
                       int dst_width, int dst_height, int dst_stride,
                       int num_channels,
-                      int x_add, int x_sub,
-                      int y_add, int y_sub,
-                      int32_t* const work);
+                      rescaler_t* const work);
+
+// If either 'scaled_width' or 'scaled_height' (but not both) is 0 the value
+// will be calculated preserving the aspect ratio, otherwise the values are
+// left unmodified. Returns true on success, false if either value is 0 after
+// performing the scaling calculation.
+int WebPRescalerGetScaledDimensions(int src_width, int src_height,
+                                    int* const scaled_width,
+                                    int* const scaled_height);
 
-// Import a row of data and save its contribution in the rescaler.
-// 'channel' denotes the channel number to be imported.
-void WebPRescalerImportRow(WebPRescaler* const rescaler,
-                           const uint8_t* const src, int channel);
+// Returns the number of input lines needed next to produce one output line,
+// considering that the maximum available input lines are 'max_num_lines'.
+int WebPRescaleNeededLines(const WebPRescaler* const rescaler,
+                           int max_num_lines);
 
 // Import multiple rows over all channels, until at least one row is ready to
 // be exported. Returns the actual number of lines that were imported.
 int WebPRescalerImport(WebPRescaler* const rescaler, int num_rows,
                        const uint8_t* src, int src_stride);
 
-// Return true if there is pending output rows ready.
+// Export as many rows as possible. Return the numbers of rows written.
+int WebPRescalerExport(WebPRescaler* const rescaler);
+
+// Return true if input is finished
 static WEBP_INLINE
-int WebPRescalerHasPendingOutput(const WebPRescaler* const rescaler) {
-  return (rescaler->y_accum <= 0);
+int WebPRescalerInputDone(const WebPRescaler* const rescaler) {
+  return (rescaler->src_y >= rescaler->src_height);
+}
+// Return true if output is finished
+static WEBP_INLINE
+int WebPRescalerOutputDone(const WebPRescaler* const rescaler) {
+  return (rescaler->dst_y >= rescaler->dst_height);
 }
 
-// Export one row from rescaler. Returns the pointer where output was written,
-// or NULL if no row was pending.
-uint8_t* WebPRescalerExportRow(WebPRescaler* const wrk);
-
-// Export as many rows as possible. Return the numbers of rows written.
-int WebPRescalerExport(WebPRescaler* const wrk);
+// Return true if there are pending output rows ready.
+static WEBP_INLINE
+int WebPRescalerHasPendingOutput(const WebPRescaler* const rescaler) {
+  return !WebPRescalerOutputDone(rescaler) && (rescaler->y_accum <= 0);
+}
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/utils/thread.c b/drivers/webp/utils/thread.c
index ce89cf9dc7..93f7622797 100644
--- a/drivers/webp/utils/thread.c
+++ b/drivers/webp/utils/thread.c
@@ -1,27 +1,60 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Multi-threaded worker
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
 #include <assert.h>
 #include <string.h>   // for memset()
 #include "./thread.h"
+#include "./utils.h"
+
+#ifdef WEBP_USE_THREAD
+
+#if defined(_WIN32)
+
+#include <windows.h>
+typedef HANDLE pthread_t;
+typedef CRITICAL_SECTION pthread_mutex_t;
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
+#if _WIN32_WINNT >= 0x0600  // Windows Vista / Server 2008 or greater
+#define USE_WINDOWS_CONDITION_VARIABLE
+typedef CONDITION_VARIABLE pthread_cond_t;
+#else
+typedef struct {
+  HANDLE waiting_sem_;
+  HANDLE received_sem_;
+  HANDLE signal_event_;
+} pthread_cond_t;
+#endif  // _WIN32_WINNT >= 0x600
+
+#ifndef WINAPI_FAMILY_PARTITION
+#define WINAPI_PARTITION_DESKTOP 1
+#define WINAPI_FAMILY_PARTITION(x) x
 #endif
 
-#ifdef WEBP_USE_THREAD
+#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define USE_CREATE_THREAD
+#endif
+
+#else  // !_WIN32
+
+#include <pthread.h>
+
+#endif  // _WIN32
+
+struct WebPWorkerImpl {
+  pthread_mutex_t mutex_;
+  pthread_cond_t  condition_;
+  pthread_t       thread_;
+};
 
 #if defined(_WIN32)
 
@@ -34,15 +67,29 @@ extern "C" {
 #define THREADFN unsigned int __stdcall
 #define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val)
 
+#if _WIN32_WINNT >= 0x0501  // Windows XP or greater
+#define WaitForSingleObject(obj, timeout) \
+  WaitForSingleObjectEx(obj, timeout, FALSE /*bAlertable*/)
+#endif
+
 static int pthread_create(pthread_t* const thread, const void* attr,
                           unsigned int (__stdcall *start)(void*), void* arg) {
   (void)attr;
+#ifdef USE_CREATE_THREAD
+  *thread = CreateThread(NULL,   /* lpThreadAttributes */
+                         0,      /* dwStackSize */
+                         start,
+                         arg,
+                         0,      /* dwStackSize */
+                         NULL);  /* lpThreadId */
+#else
   *thread = (pthread_t)_beginthreadex(NULL,   /* void *security */
                                       0,      /* unsigned stack_size */
                                       start,
                                       arg,
                                       0,      /* unsigned initflag */
                                       NULL);  /* unsigned *thrdaddr */
+#endif
   if (*thread == NULL) return 1;
   SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL);
   return 0;
@@ -57,7 +104,11 @@ static int pthread_join(pthread_t thread, void** value_ptr) {
 // Mutex
 static int pthread_mutex_init(pthread_mutex_t* const mutex, void* mutexattr) {
   (void)mutexattr;
+#if _WIN32_WINNT >= 0x0600  // Windows Vista / Server 2008 or greater
+  InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/);
+#else
   InitializeCriticalSection(mutex);
+#endif
   return 0;
 }
 
@@ -79,14 +130,21 @@ static int pthread_mutex_destroy(pthread_mutex_t* const mutex) {
 // Condition
 static int pthread_cond_destroy(pthread_cond_t* const condition) {
   int ok = 1;
+#ifdef USE_WINDOWS_CONDITION_VARIABLE
+  (void)condition;
+#else
   ok &= (CloseHandle(condition->waiting_sem_) != 0);
   ok &= (CloseHandle(condition->received_sem_) != 0);
   ok &= (CloseHandle(condition->signal_event_) != 0);
+#endif
   return !ok;
 }
 
 static int pthread_cond_init(pthread_cond_t* const condition, void* cond_attr) {
   (void)cond_attr;
+#ifdef USE_WINDOWS_CONDITION_VARIABLE
+  InitializeConditionVariable(condition);
+#else
   condition->waiting_sem_ = CreateSemaphore(NULL, 0, 1, NULL);
   condition->received_sem_ = CreateSemaphore(NULL, 0, 1, NULL);
   condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL);
@@ -96,11 +154,15 @@ static int pthread_cond_init(pthread_cond_t* const condition, void* cond_attr) {
     pthread_cond_destroy(condition);
     return 1;
   }
+#endif
   return 0;
 }
 
 static int pthread_cond_signal(pthread_cond_t* const condition) {
   int ok = 1;
+#ifdef USE_WINDOWS_CONDITION_VARIABLE
+  WakeConditionVariable(condition);
+#else
   if (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
     // a thread is waiting in pthread_cond_wait: allow it to be notified
     ok = SetEvent(condition->signal_event_);
@@ -109,12 +171,16 @@ static int pthread_cond_signal(pthread_cond_t* const condition) {
     ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) !=
            WAIT_OBJECT_0);
   }
+#endif
   return !ok;
 }
 
 static int pthread_cond_wait(pthread_cond_t* const condition,
                              pthread_mutex_t* const mutex) {
   int ok;
+#ifdef USE_WINDOWS_CONDITION_VARIABLE
+  ok = SleepConditionVariableCS(condition, mutex, INFINITE);
+#else
   // note that there is a consumer available so the signal isn't dropped in
   // pthread_cond_signal
   if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL))
@@ -125,123 +191,168 @@ static int pthread_cond_wait(pthread_cond_t* const condition,
         WAIT_OBJECT_0);
   ok &= ReleaseSemaphore(condition->received_sem_, 1, NULL);
   pthread_mutex_lock(mutex);
+#endif
   return !ok;
 }
 
-#else  // _WIN32
+#else  // !_WIN32
 # define THREADFN void*
 # define THREAD_RETURN(val) val
-#endif
+#endif  // _WIN32
 
 //------------------------------------------------------------------------------
 
-static THREADFN WebPWorkerThreadLoop(void *ptr) {    // thread loop
+static void Execute(WebPWorker* const worker);  // Forward declaration.
+
+static THREADFN ThreadLoop(void* ptr) {
   WebPWorker* const worker = (WebPWorker*)ptr;
   int done = 0;
   while (!done) {
-    pthread_mutex_lock(&worker->mutex_);
+    pthread_mutex_lock(&worker->impl_->mutex_);
     while (worker->status_ == OK) {   // wait in idling mode
-      pthread_cond_wait(&worker->condition_, &worker->mutex_);
+      pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
     }
     if (worker->status_ == WORK) {
-      if (worker->hook) {
-        worker->had_error |= !worker->hook(worker->data1, worker->data2);
-      }
+      Execute(worker);
       worker->status_ = OK;
     } else if (worker->status_ == NOT_OK) {   // finish the worker
       done = 1;
     }
     // signal to the main thread that we're done (for Sync())
-    pthread_cond_signal(&worker->condition_);
-    pthread_mutex_unlock(&worker->mutex_);
+    pthread_cond_signal(&worker->impl_->condition_);
+    pthread_mutex_unlock(&worker->impl_->mutex_);
   }
   return THREAD_RETURN(NULL);    // Thread is finished
 }
 
 // main thread state control
-static void WebPWorkerChangeState(WebPWorker* const worker,
-                                  WebPWorkerStatus new_status) {
-  // no-op when attempting to change state on a thread that didn't come up
-  if (worker->status_ < OK) return;
-
-  pthread_mutex_lock(&worker->mutex_);
-  // wait for the worker to finish
-  while (worker->status_ != OK) {
-    pthread_cond_wait(&worker->condition_, &worker->mutex_);
-  }
-  // assign new status and release the working thread if needed
-  if (new_status != OK) {
-    worker->status_ = new_status;
-    pthread_cond_signal(&worker->condition_);
+static void ChangeState(WebPWorker* const worker,
+                        WebPWorkerStatus new_status) {
+  // No-op when attempting to change state on a thread that didn't come up.
+  // Checking status_ without acquiring the lock first would result in a data
+  // race.
+  if (worker->impl_ == NULL) return;
+
+  pthread_mutex_lock(&worker->impl_->mutex_);
+  if (worker->status_ >= OK) {
+    // wait for the worker to finish
+    while (worker->status_ != OK) {
+      pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
+    }
+    // assign new status and release the working thread if needed
+    if (new_status != OK) {
+      worker->status_ = new_status;
+      pthread_cond_signal(&worker->impl_->condition_);
+    }
   }
-  pthread_mutex_unlock(&worker->mutex_);
+  pthread_mutex_unlock(&worker->impl_->mutex_);
 }
 
-#endif
+#endif  // WEBP_USE_THREAD
 
 //------------------------------------------------------------------------------
 
-void WebPWorkerInit(WebPWorker* const worker) {
+static void Init(WebPWorker* const worker) {
   memset(worker, 0, sizeof(*worker));
   worker->status_ = NOT_OK;
 }
 
-int WebPWorkerSync(WebPWorker* const worker) {
+static int Sync(WebPWorker* const worker) {
 #ifdef WEBP_USE_THREAD
-  WebPWorkerChangeState(worker, OK);
+  ChangeState(worker, OK);
 #endif
   assert(worker->status_ <= OK);
   return !worker->had_error;
 }
 
-int WebPWorkerReset(WebPWorker* const worker) {
+static int Reset(WebPWorker* const worker) {
   int ok = 1;
   worker->had_error = 0;
   if (worker->status_ < OK) {
 #ifdef WEBP_USE_THREAD
-    if (pthread_mutex_init(&worker->mutex_, NULL) ||
-        pthread_cond_init(&worker->condition_, NULL)) {
+    worker->impl_ = (WebPWorkerImpl*)WebPSafeCalloc(1, sizeof(*worker->impl_));
+    if (worker->impl_ == NULL) {
       return 0;
     }
-    pthread_mutex_lock(&worker->mutex_);
-    ok = !pthread_create(&worker->thread_, NULL, WebPWorkerThreadLoop, worker);
+    if (pthread_mutex_init(&worker->impl_->mutex_, NULL)) {
+      goto Error;
+    }
+    if (pthread_cond_init(&worker->impl_->condition_, NULL)) {
+      pthread_mutex_destroy(&worker->impl_->mutex_);
+      goto Error;
+    }
+    pthread_mutex_lock(&worker->impl_->mutex_);
+    ok = !pthread_create(&worker->impl_->thread_, NULL, ThreadLoop, worker);
     if (ok) worker->status_ = OK;
-    pthread_mutex_unlock(&worker->mutex_);
+    pthread_mutex_unlock(&worker->impl_->mutex_);
+    if (!ok) {
+      pthread_mutex_destroy(&worker->impl_->mutex_);
+      pthread_cond_destroy(&worker->impl_->condition_);
+ Error:
+      WebPSafeFree(worker->impl_);
+      worker->impl_ = NULL;
+      return 0;
+    }
 #else
     worker->status_ = OK;
 #endif
   } else if (worker->status_ > OK) {
-    ok = WebPWorkerSync(worker);
+    ok = Sync(worker);
   }
   assert(!ok || (worker->status_ == OK));
   return ok;
 }
 
-void WebPWorkerLaunch(WebPWorker* const worker) {
+static void Execute(WebPWorker* const worker) {
+  if (worker->hook != NULL) {
+    worker->had_error |= !worker->hook(worker->data1, worker->data2);
+  }
+}
+
+static void Launch(WebPWorker* const worker) {
 #ifdef WEBP_USE_THREAD
-  WebPWorkerChangeState(worker, WORK);
+  ChangeState(worker, WORK);
 #else
-  if (worker->hook)
-    worker->had_error |= !worker->hook(worker->data1, worker->data2);
+  Execute(worker);
 #endif
 }
 
-void WebPWorkerEnd(WebPWorker* const worker) {
-  if (worker->status_ >= OK) {
+static void End(WebPWorker* const worker) {
 #ifdef WEBP_USE_THREAD
-    WebPWorkerChangeState(worker, NOT_OK);
-    pthread_join(worker->thread_, NULL);
-    pthread_mutex_destroy(&worker->mutex_);
-    pthread_cond_destroy(&worker->condition_);
+  if (worker->impl_ != NULL) {
+    ChangeState(worker, NOT_OK);
+    pthread_join(worker->impl_->thread_, NULL);
+    pthread_mutex_destroy(&worker->impl_->mutex_);
+    pthread_cond_destroy(&worker->impl_->condition_);
+    WebPSafeFree(worker->impl_);
+    worker->impl_ = NULL;
+  }
 #else
-    worker->status_ = NOT_OK;
+  worker->status_ = NOT_OK;
+  assert(worker->impl_ == NULL);
 #endif
-  }
   assert(worker->status_ == NOT_OK);
 }
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
+static WebPWorkerInterface g_worker_interface = {
+  Init, Reset, Sync, Launch, Execute, End
+};
+
+int WebPSetWorkerInterface(const WebPWorkerInterface* const winterface) {
+  if (winterface == NULL ||
+      winterface->Init == NULL || winterface->Reset == NULL ||
+      winterface->Sync == NULL || winterface->Launch == NULL ||
+      winterface->Execute == NULL || winterface->End == NULL) {
+    return 0;
+  }
+  g_worker_interface = *winterface;
+  return 1;
+}
+
+const WebPWorkerInterface* WebPGetWorkerInterface(void) {
+  return &g_worker_interface;
+}
+
+//------------------------------------------------------------------------------
diff --git a/drivers/webp/utils/thread.h b/drivers/webp/utils/thread.h
index 3191890b76..8408311855 100644
--- a/drivers/webp/utils/thread.h
+++ b/drivers/webp/utils/thread.h
@@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Multi-threaded worker
@@ -12,29 +14,15 @@
 #ifndef WEBP_UTILS_THREAD_H_
 #define WEBP_UTILS_THREAD_H_
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
+#ifdef HAVE_CONFIG_H
+#include "../webp/config.h"
 #endif
 
-#if WEBP_USE_THREAD
-
-#if defined(_WIN32)
-
-#include <windows.h>
-typedef HANDLE pthread_t;
-typedef CRITICAL_SECTION pthread_mutex_t;
-typedef struct {
-  HANDLE waiting_sem_;
-  HANDLE received_sem_;
-  HANDLE signal_event_;
-} pthread_cond_t;
+#include "../webp/types.h"
 
-#else
-
-#include <pthread.h>
-
-#endif    /* _WIN32 */
-#endif    /* WEBP_USE_THREAD */
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 // State of the worker thread object
 typedef enum {
@@ -47,13 +35,12 @@ typedef enum {
 // arguments (data1 and data2), and should return false in case of error.
 typedef int (*WebPWorkerHook)(void*, void*);
 
-// Synchronize object used to launch job in the worker thread
+// Platform-dependent implementation details for the worker.
+typedef struct WebPWorkerImpl WebPWorkerImpl;
+
+// Synchronization object used to launch job in the worker thread
 typedef struct {
-#if WEBP_USE_THREAD
-  pthread_mutex_t mutex_;
-  pthread_cond_t  condition_;
-  pthread_t       thread_;
-#endif
+  WebPWorkerImpl* impl_;
   WebPWorkerStatus status_;
   WebPWorkerHook hook;    // hook to call
   void* data1;            // first argument passed to 'hook'
@@ -61,25 +48,45 @@ typedef struct {
   int had_error;          // return value of the last call to 'hook'
 } WebPWorker;
 
-// Must be called first, before any other method.
-void WebPWorkerInit(WebPWorker* const worker);
-// Must be called initialize the object and spawn the thread. Re-entrant.
-// Will potentially launch the thread. Returns false in case of error.
-int WebPWorkerReset(WebPWorker* const worker);
-// Make sure the previous work is finished. Returns true if worker->had_error
-// was not set and not error condition was triggered by the working thread.
-int WebPWorkerSync(WebPWorker* const worker);
-// Trigger the thread to call hook() with data1 and data2 argument. These
-// hook/data1/data2 can be changed at any time before calling this function,
-// but not be changed afterward until the next call to WebPWorkerSync().
-void WebPWorkerLaunch(WebPWorker* const worker);
-// Kill the thread and terminate the object. To use the object again, one
-// must call WebPWorkerReset() again.
-void WebPWorkerEnd(WebPWorker* const worker);
+// The interface for all thread-worker related functions. All these functions
+// must be implemented.
+typedef struct {
+  // Must be called first, before any other method.
+  void (*Init)(WebPWorker* const worker);
+  // Must be called to initialize the object and spawn the thread. Re-entrant.
+  // Will potentially launch the thread. Returns false in case of error.
+  int (*Reset)(WebPWorker* const worker);
+  // Makes sure the previous work is finished. Returns true if worker->had_error
+  // was not set and no error condition was triggered by the working thread.
+  int (*Sync)(WebPWorker* const worker);
+  // Triggers the thread to call hook() with data1 and data2 arguments. These
+  // hook/data1/data2 values can be changed at any time before calling this
+  // function, but not be changed afterward until the next call to Sync().
+  void (*Launch)(WebPWorker* const worker);
+  // This function is similar to Launch() except that it calls the
+  // hook directly instead of using a thread. Convenient to bypass the thread
+  // mechanism while still using the WebPWorker structs. Sync() must
+  // still be called afterward (for error reporting).
+  void (*Execute)(WebPWorker* const worker);
+  // Kill the thread and terminate the object. To use the object again, one
+  // must call Reset() again.
+  void (*End)(WebPWorker* const worker);
+} WebPWorkerInterface;
+
+// Install a new set of threading functions, overriding the defaults. This
+// should be done before any workers are started, i.e., before any encoding or
+// decoding takes place. The contents of the interface struct are copied, it
+// is safe to free the corresponding memory after this call. This function is
+// not thread-safe. Return false in case of invalid pointer or methods.
+WEBP_EXTERN(int) WebPSetWorkerInterface(
+    const WebPWorkerInterface* const winterface);
+
+// Retrieve the currently set thread worker interface.
+WEBP_EXTERN(const WebPWorkerInterface*) WebPGetWorkerInterface(void);
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/utils/utils.c b/drivers/webp/utils/utils.c
index 673b7e284c..d8e30930af 100644
--- a/drivers/webp/utils/utils.c
+++ b/drivers/webp/utils/utils.c
@@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Misc. common utility functions
@@ -10,35 +12,228 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <stdlib.h>
+#include <string.h>  // for memcpy()
+#include "../webp/decode.h"
+#include "../webp/encode.h"
 #include "./utils.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+// If PRINT_MEM_INFO is defined, extra info (like total memory used, number of
+// alloc/free etc) is printed. For debugging/tuning purpose only (it's slow,
+// and not multi-thread safe!).
+// An interesting alternative is valgrind's 'massif' tool:
+//    http://valgrind.org/docs/manual/ms-manual.html
+// Here is an example command line:
+/*    valgrind --tool=massif --massif-out-file=massif.out \
+               --stacks=yes --alloc-fn=WebPSafeAlloc --alloc-fn=WebPSafeCalloc
+      ms_print massif.out
+*/
+// In addition:
+// * if PRINT_MEM_TRAFFIC is defined, all the details of the malloc/free cycles
+//   are printed.
+// * if MALLOC_FAIL_AT is defined, the global environment variable
+//   $MALLOC_FAIL_AT is used to simulate a memory error when calloc or malloc
+//   is called for the nth time. Example usage:
+//   export MALLOC_FAIL_AT=50 && ./examples/cwebp input.png
+// * if MALLOC_LIMIT is defined, the global environment variable $MALLOC_LIMIT
+//   sets the maximum amount of memory (in bytes) made available to libwebp.
+//   This can be used to emulate environment with very limited memory.
+//   Example: export MALLOC_LIMIT=64000000 && ./examples/dwebp picture.webp
+
+// #define PRINT_MEM_INFO
+// #define PRINT_MEM_TRAFFIC
+// #define MALLOC_FAIL_AT
+// #define MALLOC_LIMIT
 
 //------------------------------------------------------------------------------
 // Checked memory allocation
 
-static int CheckSizeArguments(uint64_t nmemb, size_t size) {
+#if defined(PRINT_MEM_INFO)
+
+#include <stdio.h>
+
+static int num_malloc_calls = 0;
+static int num_calloc_calls = 0;
+static int num_free_calls = 0;
+static int countdown_to_fail = 0;     // 0 = off
+
+typedef struct MemBlock MemBlock;
+struct MemBlock {
+  void* ptr_;
+  size_t size_;
+  MemBlock* next_;
+};
+
+static MemBlock* all_blocks = NULL;
+static size_t total_mem = 0;
+static size_t total_mem_allocated = 0;
+static size_t high_water_mark = 0;
+static size_t mem_limit = 0;
+
+static int exit_registered = 0;
+
+static void PrintMemInfo(void) {
+  fprintf(stderr, "\nMEMORY INFO:\n");
+  fprintf(stderr, "num calls to: malloc = %4d\n", num_malloc_calls);
+  fprintf(stderr, "              calloc = %4d\n", num_calloc_calls);
+  fprintf(stderr, "              free   = %4d\n", num_free_calls);
+  fprintf(stderr, "total_mem: %u\n", (uint32_t)total_mem);
+  fprintf(stderr, "total_mem allocated: %u\n", (uint32_t)total_mem_allocated);
+  fprintf(stderr, "high-water mark: %u\n", (uint32_t)high_water_mark);
+  while (all_blocks != NULL) {
+    MemBlock* b = all_blocks;
+    all_blocks = b->next_;
+    free(b);
+  }
+}
+
+static void Increment(int* const v) {
+  if (!exit_registered) {
+#if defined(MALLOC_FAIL_AT)
+    {
+      const char* const malloc_fail_at_str = getenv("MALLOC_FAIL_AT");
+      if (malloc_fail_at_str != NULL) {
+        countdown_to_fail = atoi(malloc_fail_at_str);
+      }
+    }
+#endif
+#if defined(MALLOC_LIMIT)
+    {
+      const char* const malloc_limit_str = getenv("MALLOC_LIMIT");
+      if (malloc_limit_str != NULL) {
+        mem_limit = atoi(malloc_limit_str);
+      }
+    }
+#endif
+    (void)countdown_to_fail;
+    (void)mem_limit;
+    atexit(PrintMemInfo);
+    exit_registered = 1;
+  }
+  ++*v;
+}
+
+static void AddMem(void* ptr, size_t size) {
+  if (ptr != NULL) {
+    MemBlock* const b = (MemBlock*)malloc(sizeof(*b));
+    if (b == NULL) abort();
+    b->next_ = all_blocks;
+    all_blocks = b;
+    b->ptr_ = ptr;
+    b->size_ = size;
+    total_mem += size;
+    total_mem_allocated += size;
+#if defined(PRINT_MEM_TRAFFIC)
+#if defined(MALLOC_FAIL_AT)
+    fprintf(stderr, "fail-count: %5d [mem=%u]\n",
+            num_malloc_calls + num_calloc_calls, (uint32_t)total_mem);
+#else
+    fprintf(stderr, "Mem: %u (+%u)\n", (uint32_t)total_mem, (uint32_t)size);
+#endif
+#endif
+    if (total_mem > high_water_mark) high_water_mark = total_mem;
+  }
+}
+
+static void SubMem(void* ptr) {
+  if (ptr != NULL) {
+    MemBlock** b = &all_blocks;
+    // Inefficient search, but that's just for debugging.
+    while (*b != NULL && (*b)->ptr_ != ptr) b = &(*b)->next_;
+    if (*b == NULL) {
+      fprintf(stderr, "Invalid pointer free! (%p)\n", ptr);
+      abort();
+    }
+    {
+      MemBlock* const block = *b;
+      *b = block->next_;
+      total_mem -= block->size_;
+#if defined(PRINT_MEM_TRAFFIC)
+      fprintf(stderr, "Mem: %u (-%u)\n",
+              (uint32_t)total_mem, (uint32_t)block->size_);
+#endif
+      free(block);
+    }
+  }
+}
+
+#else
+#define Increment(v) do {} while (0)
+#define AddMem(p, s) do {} while (0)
+#define SubMem(p)    do {} while (0)
+#endif
+
+// Returns 0 in case of overflow of nmemb * size.
+static int CheckSizeArgumentsOverflow(uint64_t nmemb, size_t size) {
   const uint64_t total_size = nmemb * size;
   if (nmemb == 0) return 1;
   if ((uint64_t)size > WEBP_MAX_ALLOCABLE_MEMORY / nmemb) return 0;
   if (total_size != (size_t)total_size) return 0;
+#if defined(PRINT_MEM_INFO) && defined(MALLOC_FAIL_AT)
+  if (countdown_to_fail > 0 && --countdown_to_fail == 0) {
+    return 0;    // fake fail!
+  }
+#endif
+#if defined(MALLOC_LIMIT)
+  if (mem_limit > 0 && total_mem + total_size >= mem_limit) {
+    return 0;   // fake fail!
+  }
+#endif
+
   return 1;
 }
 
 void* WebPSafeMalloc(uint64_t nmemb, size_t size) {
-  if (!CheckSizeArguments(nmemb, size)) return NULL;
-  return malloc((size_t)(nmemb * size));
+  void* ptr;
+  Increment(&num_malloc_calls);
+  if (!CheckSizeArgumentsOverflow(nmemb, size)) return NULL;
+  assert(nmemb * size > 0);
+  ptr = malloc((size_t)(nmemb * size));
+  AddMem(ptr, (size_t)(nmemb * size));
+  return ptr;
 }
 
 void* WebPSafeCalloc(uint64_t nmemb, size_t size) {
-  if (!CheckSizeArguments(nmemb, size)) return NULL;
-  return calloc((size_t)nmemb, size);
+  void* ptr;
+  Increment(&num_calloc_calls);
+  if (!CheckSizeArgumentsOverflow(nmemb, size)) return NULL;
+  assert(nmemb * size > 0);
+  ptr = calloc((size_t)nmemb, size);
+  AddMem(ptr, (size_t)(nmemb * size));
+  return ptr;
+}
+
+void WebPSafeFree(void* const ptr) {
+  if (ptr != NULL) {
+    Increment(&num_free_calls);
+    SubMem(ptr);
+  }
+  free(ptr);
+}
+
+// Public API function.
+void WebPFree(void* ptr) {
+  free(ptr);
 }
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
+void WebPCopyPlane(const uint8_t* src, int src_stride,
+                   uint8_t* dst, int dst_stride, int width, int height) {
+  assert(src != NULL && dst != NULL);
+  assert(src_stride >= width && dst_stride >= width);
+  while (height-- > 0) {
+    memcpy(dst, src, width);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void WebPCopyPixels(const WebPPicture* const src, WebPPicture* const dst) {
+  assert(src != NULL && dst != NULL);
+  assert(src->width == dst->width && src->height == dst->height);
+  assert(src->use_argb && dst->use_argb);
+  WebPCopyPlane((uint8_t*)src->argb, 4 * src->argb_stride, (uint8_t*)dst->argb,
+                4 * dst->argb_stride, 4 * src->width, src->height);
+}
+
+//------------------------------------------------------------------------------
diff --git a/drivers/webp/utils/utils.h b/drivers/webp/utils/utils.h
index 316ac90612..fcdb7e139b 100644
--- a/drivers/webp/utils/utils.h
+++ b/drivers/webp/utils/utils.h
@@ -1,20 +1,25 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Misc. common utility functions
 //
-// Author: Skal (pascal.massimino@gmail.com)
+// Authors: Skal (pascal.massimino@gmail.com)
+//          Urvang (urvang@google.com)
 
 #ifndef WEBP_UTILS_UTILS_H_
 #define WEBP_UTILS_UTILS_H_
 
-#include "../types.h"
+#include <assert.h>
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#include "../webp/types.h"
+
+#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -30,14 +35,107 @@ extern "C" {
 // somewhere (like: malloc(num_pixels * sizeof(*something))). That's why this
 // safe malloc() borrows the signature from calloc(), pointing at the dangerous
 // underlying multiply involved.
-void* WebPSafeMalloc(uint64_t nmemb, size_t size);
+WEBP_EXTERN(void*) WebPSafeMalloc(uint64_t nmemb, size_t size);
 // Note that WebPSafeCalloc() expects the second argument type to be 'size_t'
 // in order to favor the "calloc(num_foo, sizeof(foo))" pattern.
-void* WebPSafeCalloc(uint64_t nmemb, size_t size);
+WEBP_EXTERN(void*) WebPSafeCalloc(uint64_t nmemb, size_t size);
+
+// Companion deallocation function to the above allocations.
+WEBP_EXTERN(void) WebPSafeFree(void* const ptr);
+
+//------------------------------------------------------------------------------
+// Alignment
+
+#define WEBP_ALIGN_CST 31
+#define WEBP_ALIGN(PTR) ((uintptr_t)((PTR) + WEBP_ALIGN_CST) & ~WEBP_ALIGN_CST)
+
+//------------------------------------------------------------------------------
+// Reading/writing data.
+
+// Read 16, 24 or 32 bits stored in little-endian order.
+static WEBP_INLINE int GetLE16(const uint8_t* const data) {
+  return (int)(data[0] << 0) | (data[1] << 8);
+}
+
+static WEBP_INLINE int GetLE24(const uint8_t* const data) {
+  return GetLE16(data) | (data[2] << 16);
+}
+
+static WEBP_INLINE uint32_t GetLE32(const uint8_t* const data) {
+  return GetLE16(data) | ((uint32_t)GetLE16(data + 2) << 16);
+}
+
+// Store 16, 24 or 32 bits in little-endian order.
+static WEBP_INLINE void PutLE16(uint8_t* const data, int val) {
+  assert(val < (1 << 16));
+  data[0] = (val >> 0);
+  data[1] = (val >> 8);
+}
+
+static WEBP_INLINE void PutLE24(uint8_t* const data, int val) {
+  assert(val < (1 << 24));
+  PutLE16(data, val & 0xffff);
+  data[2] = (val >> 16);
+}
+
+static WEBP_INLINE void PutLE32(uint8_t* const data, uint32_t val) {
+  PutLE16(data, (int)(val & 0xffff));
+  PutLE16(data + 2, (int)(val >> 16));
+}
+
+// Returns (int)floor(log2(n)). n must be > 0.
+// use GNU builtins where available.
+#if defined(__GNUC__) && \
+    ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  return 31 ^ __builtin_clz(n);
+}
+#elif defined(_MSC_VER) && _MSC_VER > 1310 && \
+      (defined(_M_X64) || defined(_M_IX86))
+#include <intrin.h>
+#pragma intrinsic(_BitScanReverse)
+
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  unsigned long first_set_bit;
+  _BitScanReverse(&first_set_bit, n);
+  return first_set_bit;
+}
+#else
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  int log = 0;
+  uint32_t value = n;
+  int i;
+
+  for (i = 4; i >= 0; --i) {
+    const int shift = (1 << i);
+    const uint32_t x = value >> shift;
+    if (x != 0) {
+      value = x;
+      log += shift;
+    }
+  }
+  return log;
+}
+#endif
+
+//------------------------------------------------------------------------------
+// Pixel copying.
+
+struct WebPPicture;
+
+// Copy width x height pixels from 'src' to 'dst' honoring the strides.
+WEBP_EXTERN(void) WebPCopyPlane(const uint8_t* src, int src_stride,
+                                uint8_t* dst, int dst_stride,
+                                int width, int height);
+
+// Copy ARGB pixels from 'src' to 'dst' honoring strides. 'src' and 'dst' are
+// assumed to be already allocated and using ARGB data.
+WEBP_EXTERN(void) WebPCopyPixels(const struct WebPPicture* const src,
+                                 struct WebPPicture* const dst);
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif