25 files changed, 18701 insertions, 0 deletions
diff --git a/thirdparty/misc/aes256.cpp b/thirdparty/misc/aes256.cpp
new file mode 100644
index 0000000000..dc271928b4
--- /dev/null
+++ b/thirdparty/misc/aes256.cpp
@@ -0,0 +1,397 @@
+/*
+*   Byte-oriented AES-256 implementation.
+*   S-box lookups can be computed 'on the fly'; this copy defines BACK_TO_TABLES and uses tables.
+*
+*   Copyright (c) 2007-2011 Ilya O. Levin, http://www.literatecode.com
+*   Other contributors: Hal Finney
+*
+*   Permission to use, copy, modify, and distribute this software for any
+*   purpose with or without fee is hereby granted, provided that the above
+*   copyright notice and this permission notice appear in all copies.
+*
+*   THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+*   WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+*   MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+*   ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+*   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+*   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+*   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+#include "aes256.h"
+
+#define FD(x)  (((x) >> 1) ^ (((x) & 1) ? 0x8d : 0)) /* undoes rj_xtime(): steps a round constant back one round; x is evaluated twice */
+/* When defined, rj_sbox()/rj_sbox_inv() are table lookups instead of computed functions. */
+#define BACK_TO_TABLES
+/* Forward declarations of the file-local helpers defined further down. */
+static uint8_t rj_xtime(uint8_t);
+static void aes_subBytes(uint8_t *);
+static void aes_subBytes_inv(uint8_t *);
+static void aes_addRoundKey(uint8_t *, uint8_t *);
+static void aes_addRoundKey_cpy(uint8_t *, uint8_t *, uint8_t *);
+static void aes_shiftRows(uint8_t *);
+static void aes_shiftRows_inv(uint8_t *);
+static void aes_mixColumns(uint8_t *);
+static void aes_mixColumns_inv(uint8_t *);
+static void aes_expandEncKey(uint8_t *, uint8_t *);
+static void aes_expandDecKey(uint8_t *, uint8_t *);
+#ifndef BACK_TO_TABLES /* the computed-S-box helpers only exist in the tableless build */
+static uint8_t gf_alog(uint8_t);
+static uint8_t gf_log(uint8_t);
+static uint8_t gf_mulinv(uint8_t);
+static uint8_t rj_sbox(uint8_t);
+static uint8_t rj_sbox_inv(uint8_t);
+#endif
+
+#ifdef BACK_TO_TABLES
+
+static const uint8_t sbox[256] = { /* forward substitution table, indexed by the input byte; read through rj_sbox() */
+    0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
+    0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
+    0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
+    0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
+    0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc,
+    0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
+    0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a,
+    0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
+    0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
+    0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
+    0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b,
+    0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
+    0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85,
+    0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
+    0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
+    0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
+    0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17,
+    0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
+    0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88,
+    0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
+    0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
+    0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
+    0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9,
+    0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
+    0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6,
+    0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
+    0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
+    0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
+    0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94,
+    0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
+    0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68,
+    0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+};
+static const uint8_t sboxinv[256] = { /* inverse substitution table, indexed by the substituted byte; read through rj_sbox_inv() */
+    0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
+    0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
+    0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
+    0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
+    0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d,
+    0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
+    0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2,
+    0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
+    0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16,
+    0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
+    0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda,
+    0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
+    0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a,
+    0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
+    0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02,
+    0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
+    0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea,
+    0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
+    0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85,
+    0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
+    0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89,
+    0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
+    0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20,
+    0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
+    0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31,
+    0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
+    0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d,
+    0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
+    0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0,
+    0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
+    0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26,
+    0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+};
+
+#define rj_sbox(x)     sbox[(x)]    /* forward substitution as a plain table lookup */
+#define rj_sbox_inv(x) sboxinv[(x)] /* inverse substitution as a plain table lookup */
+
+#else /* tableless subroutines */
+
+/* gf_alog: return 3 raised to the power x in GF(2^8) (anti-logarithm, generator 3). */
+static uint8_t gf_alog(uint8_t x)
+{
+    uint8_t acc = 1, left = x;
+
+    while (left--) acc ^= rj_xtime(acc); /* acc ^ 2*acc == 3*acc, done x times */
+
+    return acc;
+} /* gf_alog */
+
+/* gf_log: return the exponent e in 1..255 with 3^e == x in GF(2^8); 0 when x is 0 or no power matches. */
+static uint8_t gf_log(uint8_t x)
+{
+    uint8_t power = 1, exponent;
+
+    if (x == 0) return 0; /* zero has no logarithm */
+    for (exponent = 1; exponent != 0; exponent++) /* stops once the 8-bit counter wraps */
+    {
+        power ^= rj_xtime(power); /* power is now 3^exponent */
+        if (power == x) return exponent;
+    }
+
+    return 0;
+} /* gf_log */
+
+
+/* gf_mulinv: multiplicative inverse of x in GF(2^8); 0 is mapped to 0. */
+static uint8_t gf_mulinv(uint8_t x)
+{
+    return (x == 0) ? 0 : gf_alog((uint8_t)(255 - gf_log(x))); /* 3^(255 - log x) */
+} /* gf_mulinv */
+
+/* rj_sbox: computed forward S-box - field inverse, XOR of it with its four left rotations, then XOR 0x63. */
+static uint8_t rj_sbox(uint8_t x)
+{
+    uint8_t rot, acc, step;
+
+    acc = rot = gf_mulinv(x);
+    for (step = 0; step < 4; step++)
+    {
+        rot = (uint8_t)(rot << 1) | (rot >> 7); /* rotate left by one bit */
+        acc ^= rot;
+    }
+    return (acc ^ 0x63);
+} /* rj_sbox */
+
+/* rj_sbox_inv: computed inverse S-box - rotations of (x ^ 0x63) are combined, then inverted in the field. */
+static uint8_t rj_sbox_inv(uint8_t x)
+{
+    uint8_t r1, r3, r6;
+
+    r1 = x ^ 0x63;                       /* remove the constant rj_sbox() adds last */
+    r1 = (uint8_t)(r1 << 1) | (r1 >> 7); /* rotated left by 1 */
+    r3 = (uint8_t)(r1 << 2) | (r1 >> 6); /* rotated left by 3 in total */
+    r6 = (uint8_t)(r3 << 3) | (r3 >> 5); /* rotated left by 6 in total */
+
+    /* The XOR of the three rotations is the value whose field inverse */
+    /* is the original S-box input. */
+    return gf_mulinv(r1 ^ r3 ^ r6);
+} /* rj_sbox_inv */
+
+#endif
+
+/* rj_xtime: multiply x by 2 in GF(2^8); a carry out of bit 7 is folded back in by XOR with 0x1b. */
+static uint8_t rj_xtime(uint8_t x)
+{
+    uint8_t fold = (x & 0x80) ? 0x1b : 0x00;
+    return (uint8_t)(x << 1) ^ fold;
+} /* rj_xtime */
+
+/* aes_subBytes: replace each of the 16 bytes of buf by its rj_sbox() substitution. */
+static void aes_subBytes(uint8_t *buf)
+{
+    uint8_t i = 16;
+
+    while (i--) buf[i] = rj_sbox(buf[i]); /* visits indices 15 down to 0 */
+} /* aes_subBytes */
+
+/* aes_subBytes_inv: replace each of the 16 bytes of buf by its rj_sbox_inv() substitution. */
+static void aes_subBytes_inv(uint8_t *buf)
+{
+    uint8_t i = 16;
+
+    while (i--) buf[i] = rj_sbox_inv(buf[i]); /* visits indices 15 down to 0 */
+} /* aes_subBytes_inv */
+
+/* aes_addRoundKey: XOR the 16 bytes at key into the 16 bytes at buf. */
+static void aes_addRoundKey(uint8_t *buf, uint8_t *key)
+{
+    uint8_t i = 16;
+
+    while (i--) buf[i] ^= key[i];
+} /* aes_addRoundKey */
+
+/* aes_addRoundKey_cpy: copy all 32 bytes of key into cpk and XOR the first 16 of them into buf. */
+static void aes_addRoundKey_cpy(uint8_t *buf, uint8_t *key, uint8_t *cpk)
+{
+    uint8_t i = 16;
+
+    while (i--)  buf[i] ^= (cpk[i] = key[i]), cpk[16 + i] = key[16 + i]; /* each pass handles byte i and byte 16+i */
+} /* aes_addRoundKey_cpy */
+
+
+/* aes_shiftRows: rotate row r of the state (bytes r, r+4, r+8, r+12) left by r positions; row 0 stays put. */
+static void aes_shiftRows(uint8_t *buf)
+{
+    uint8_t i, j; /* to make it potentially parallelable :) */
+
+    i = buf[1], buf[1] = buf[5], buf[5] = buf[9], buf[9] = buf[13], buf[13] = i; /* row 1: left by 1 */
+    i = buf[10], buf[10] = buf[2], buf[2] = i; /* row 2: first of two swaps */
+    j = buf[3], buf[3] = buf[15], buf[15] = buf[11], buf[11] = buf[7], buf[7] = j; /* row 3: left by 3 */
+    j = buf[14], buf[14] = buf[6], buf[6]  = j; /* row 2: second swap */
+
+} /* aes_shiftRows */
+
+/* aes_shiftRows_inv: undo aes_shiftRows - rotate row r (bytes r, r+4, r+8, r+12) right by r positions. */
+static void aes_shiftRows_inv(uint8_t *buf)
+{
+    uint8_t i, j; /* same as above :) */
+
+    i = buf[1], buf[1] = buf[13], buf[13] = buf[9], buf[9] = buf[5], buf[5] = i; /* row 1: right by 1 */
+    i = buf[2], buf[2] = buf[10], buf[10] = i; /* row 2: first of two swaps */
+    j = buf[3], buf[3] = buf[7], buf[7] = buf[11], buf[11] = buf[15], buf[15] = j; /* row 3: right by 3 */
+    j = buf[6], buf[6] = buf[14], buf[14] = j; /* row 2: second swap */
+
+} /* aes_shiftRows_inv */
+
+/* aes_mixColumns: mix each 4-byte column of buf in place; the first output byte is 2a ^ 3b ^ c ^ d in GF(2^8). */
+static void aes_mixColumns(uint8_t *buf)
+{
+    uint8_t i, a, b, c, d, e;
+
+    for (i = 0; i < 16; i += 4) /* one pass per column */
+    {
+        a = buf[i];
+        b = buf[i + 1];
+        c = buf[i + 2];
+        d = buf[i + 3];
+        e = a ^ b ^ c ^ d; /* sum of the column, shared by all four outputs */
+        buf[i] ^= e ^ rj_xtime(a ^ b);     /* a ^ e ^ 2(a^b) = 2a ^ 3b ^ c ^ d */
+        buf[i + 1] ^= e ^ rj_xtime(b ^ c); /* the other rows follow cyclically */
+        buf[i + 2] ^= e ^ rj_xtime(c ^ d);
+        buf[i + 3] ^= e ^ rj_xtime(d ^ a);
+    }
+} /* aes_mixColumns */
+
+/* aes_mixColumns_inv: undo aes_mixColumns in place; the first output byte is 14a ^ 11b ^ 13c ^ 9d in GF(2^8). */
+static void aes_mixColumns_inv(uint8_t *buf)
+{
+    uint8_t i, a, b, c, d, e, x, y, z;
+
+    for (i = 0; i < 16; i += 4) /* one pass per column */
+    {
+        a = buf[i];
+        b = buf[i + 1];
+        c = buf[i + 2];
+        d = buf[i + 3];
+        e = a ^ b ^ c ^ d;
+        z = rj_xtime(e);                       /* 2e */
+        x = e ^ rj_xtime(rj_xtime(z ^ a ^ c)); /* 9e ^ 4a ^ 4c, used by rows 0 and 2 */
+        y = e ^ rj_xtime(rj_xtime(z ^ b ^ d)); /* 9e ^ 4b ^ 4d, used by rows 1 and 3 */
+        buf[i] ^= x ^ rj_xtime(a ^ b);
+        buf[i + 1] ^= y ^ rj_xtime(b ^ c);
+        buf[i + 2] ^= x ^ rj_xtime(c ^ d);
+        buf[i + 3] ^= y ^ rj_xtime(d ^ a);
+    }
+} /* aes_mixColumns_inv */
+
+/* aes_expandEncKey: advance the 32-byte key window k to the next two round keys in place and step *rc forward. */
+static void aes_expandEncKey(uint8_t *k, uint8_t *rc)
+{
+    uint8_t i;
+    /* First word: XOR in the substituted, byte-rotated last word plus the round constant. */
+    k[0] ^= rj_sbox(k[29]) ^ (*rc);
+    k[1] ^= rj_sbox(k[30]);
+    k[2] ^= rj_sbox(k[31]);
+    k[3] ^= rj_sbox(k[28]);
+    *rc = rj_xtime( *rc);
+    /* Words 1..3: each is XORed with the word just before it. */
+    for(i = 4; i < 16; i += 4)  k[i] ^= k[i - 4],   k[i + 1] ^= k[i - 3],
+                                            k[i + 2] ^= k[i - 2], k[i + 3] ^= k[i - 1];
+    k[16] ^= rj_sbox(k[12]);
+    k[17] ^= rj_sbox(k[13]);
+    k[18] ^= rj_sbox(k[14]);
+    k[19] ^= rj_sbox(k[15]);
+    /* Words 5..7: each is XORed with the word just before it. */
+    for(i = 20; i < 32; i += 4) k[i] ^= k[i - 4],   k[i + 1] ^= k[i - 3],
+                                            k[i + 2] ^= k[i - 2], k[i + 3] ^= k[i - 1];
+
+} /* aes_expandEncKey */
+
+/* aes_expandDecKey: step the key window k back one expansion - the XORs of aes_expandEncKey in reverse order. */
+static void aes_expandDecKey(uint8_t *k, uint8_t *rc)
+{
+    uint8_t i;
+    /* Words 7..5 first, highest word first, so each still sees its unmodified predecessor. */
+    for(i = 28; i > 16; i -= 4) k[i + 0] ^= k[i - 4], k[i + 1] ^= k[i - 3],
+                                                k[i + 2] ^= k[i - 2], k[i + 3] ^= k[i - 1];
+
+    k[16] ^= rj_sbox(k[12]);
+    k[17] ^= rj_sbox(k[13]);
+    k[18] ^= rj_sbox(k[14]);
+    k[19] ^= rj_sbox(k[15]);
+    /* Then words 3..1, again highest word first. */
+    for(i = 12; i > 0; i -= 4)  k[i + 0] ^= k[i - 4], k[i + 1] ^= k[i - 3],
+                                                k[i + 2] ^= k[i - 2], k[i + 3] ^= k[i - 1];
+    /* The round constant is stepped back before it is applied to word 0. */
+    *rc = FD(*rc);
+    k[0] ^= rj_sbox(k[29]) ^ (*rc);
+    k[1] ^= rj_sbox(k[30]);
+    k[2] ^= rj_sbox(k[31]);
+    k[3] ^= rj_sbox(k[28]);
+} /* aes_expandDecKey */
+
+
+/* aes256_init: load the 32-byte key k into ctx; enckey keeps it as given, deckey receives it after seven expansions. */
+void aes256_init(aes256_context *ctx, uint8_t *k)
+{
+    uint8_t rcon = 1;
+    uint8_t i;
+
+    for (i = 0; i < sizeof(ctx->key); i++) ctx->enckey[i] = ctx->deckey[i] = k[i]; /* ctx->key itself is filled per block */
+    for (i = 8; --i;) aes_expandEncKey(ctx->deckey, &rcon); /* runs 7 times (i = 7..1) */
+} /* aes256_init */
+
+/* aes256_done: overwrite the three key buffers of ctx with zeros once the context is no longer needed. */
+void aes256_done(aes256_context *ctx)
+{
+    volatile uint8_t *key = ctx->key, *enc = ctx->enckey, *dec = ctx->deckey; /* volatile: the wipe must not be optimised away */
+    uint8_t i;
+    for (i = 0; i < sizeof(ctx->key); i++) /* all three arrays have the same size */
+        key[i] = 0, enc[i] = 0, dec[i] = 0;
+} /* aes256_done */
+
+/* aes256_encrypt_ecb: encrypt the 16-byte block buf in place; ctx->key is reloaded from enckey and expanded on the way. */
+void aes256_encrypt_ecb(aes256_context *ctx, uint8_t *buf)
+{
+    uint8_t round, rcon = 1;
+
+    aes_addRoundKey_cpy(buf, ctx->enckey, ctx->key);
+    for (round = 1; round < 14; ++round)
+    {
+        aes_subBytes(buf);
+        aes_shiftRows(buf);
+        aes_mixColumns(buf);
+        if ((round & 1) == 0) aes_expandEncKey(ctx->key, &rcon); /* even rounds need a fresh key window */
+        aes_addRoundKey(buf, (round & 1) ? &ctx->key[16] : ctx->key); /* odd rounds use the upper half */
+    }
+    aes_subBytes(buf); /* final round: no column mixing */
+    aes_shiftRows(buf);
+    aes_expandEncKey(ctx->key, &rcon);
+    aes_addRoundKey(buf, ctx->key);
+} /* aes256_encrypt_ecb */
+
+/* aes256_decrypt_ecb: decrypt the 16-byte block buf in place; ctx->key is reloaded from deckey and stepped backwards. */
+void aes256_decrypt_ecb(aes256_context *ctx, uint8_t *buf)
+{
+    uint8_t round, rcon = 0x80;
+
+    /* Undo the final encryption round, which had no column mixing. */
+    aes_addRoundKey_cpy(buf, ctx->deckey, ctx->key);
+    aes_shiftRows_inv(buf);
+    aes_subBytes_inv(buf);
+
+    /* Rounds 13 down to 1; odd rounds first step the key window back. */
+    for (round = 13; round > 0; --round)
+    {
+        if (round & 1) aes_expandDecKey(ctx->key, &rcon);
+        aes_addRoundKey(buf, (round & 1) ? &ctx->key[16] : ctx->key);
+        aes_mixColumns_inv(buf);
+        aes_shiftRows_inv(buf);
+        aes_subBytes_inv(buf);
+    }
+
+    /* What is left in the lower half of ctx->key is the initial round key. */
+    aes_addRoundKey( buf, ctx->key);
+} /* aes256_decrypt_ecb */
diff --git a/thirdparty/misc/aes256.h b/thirdparty/misc/aes256.h
new file mode 100644
index 0000000000..8fcc25a4de
--- /dev/null
+++ b/thirdparty/misc/aes256.h
@@ -0,0 +1,46 @@
+/*
+*   Byte-oriented AES-256 implementation.
+*   All lookup tables replaced with 'on the fly' calculations.
+*
+*   Copyright (c) 2007-2009 Ilya O. Levin, http://www.literatecode.com
+*   Other contributors: Hal Finney
+*
+*   Permission to use, copy, modify, and distribute this software for any
+*   purpose with or without fee is hereby granted, provided that the above
+*   copyright notice and this permission notice appear in all copies.
+*
+*   THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+*   WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+*   MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+*   ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+*   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+*   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+*   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#ifndef AES_256_H
+#define AES_256_H
+
+#include "typedefs.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+    typedef struct {
+        uint8_t key[32];    /* working key window, reloaded and modified by every encrypt/decrypt call */
+        uint8_t enckey[32]; /* the key as passed to aes256_init; encryption starts from it */
+        uint8_t deckey[32]; /* the key after seven expansion steps; decryption starts from it */
+    } aes256_context;
+    /* aes256_init reads 32 key bytes; aes256_done zeroes all three buffers. */
+    /* The two *_ecb functions transform one 16-byte block in place. */
+    void aes256_init(aes256_context *, uint8_t * /* key */);
+    void aes256_done(aes256_context *);
+    void aes256_encrypt_ecb(aes256_context *, uint8_t * /* plaintext */);
+    void aes256_decrypt_ecb(aes256_context *, uint8_t * /* ciphertext */);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/thirdparty/misc/base64.c b/thirdparty/misc/base64.c
new file mode 100644
index 0000000000..0929ae5db5
--- /dev/null
+++ b/thirdparty/misc/base64.c
@@ -0,0 +1,118 @@
+/*
+ * File: base64.c
+ * Description: Simple BASE64 conversion methods
+ * Author: Ari Edelkind
+ * License: Public Domain
+ * Website: http://episec.com/people/edelkind/c.html
+ */
+
+#include <string.h>
+
+char b64string[] = /* Base64 alphabet: the character at index v encodes the 6-bit value v */
+	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+/* base64_encode: write len bytes of from as Base64 text into to, NUL-terminate it and return the */
+/* number of characters written (terminator excluded); to needs 4 * ((len + 2) / 3) + 1 bytes.    */
+long base64_encode (char *to, char *from, unsigned int len)
+{
+	char *fromp = from;
+	char *top = to;
+	unsigned char cbyte;
+	unsigned char obyte;
+	char end[3];
+
+	for (; len >= 3; len -= 3) {
+		cbyte = *fromp++;
+		*top++ = b64string[(int)(cbyte >> 2)];
+		obyte = (cbyte << 4) & 0x30;		/* 0011 0000 */
+
+		cbyte = *fromp++;
+		obyte |= (cbyte >> 4);			/* 0000 1111 */
+		*top++ = b64string[(int)obyte];
+		obyte = (cbyte << 2) & 0x3C;		/* 0011 1100 */
+
+		cbyte = *fromp++;
+		obyte |= (cbyte >> 6);			/* 0000 0011 */
+		*top++ = b64string[(int)obyte];
+		*top++ = b64string[(int)(cbyte & 0x3F)];/* 0011 1111 */
+	}
+
+	if (len) {				/* 1 or 2 bytes left: emit a padded group */
+		end[0] = *fromp++;
+		if (--len) end[1] = *fromp++; else end[1] = 0;
+		end[2] = 0;
+
+		cbyte = end[0];
+		*top++ = b64string[(int)(cbyte >> 2)];
+		obyte = (cbyte << 4) & 0x30;		/* 0011 0000 */
+
+		cbyte = end[1];
+		obyte |= (cbyte >> 4);
+		*top++ = b64string[(int)obyte];
+		obyte = (cbyte << 2) & 0x3C;		/* 0011 1100 */
+
+		if (len) *top++ = b64string[(int)obyte];	/* len is 1 here when two bytes were left */
+		else *top++ = '=';
+		*top++ = '=';
+	}
+	*top = 0;
+	return top - to;
+}
+
+/* badchar(c,p): true when c is NOT in the Base64 alphabet. As a side effect */
+/* p is set to the location of c inside b64string, or to null if absent.     */
+#define badchar(c,p) (!(p = memchr(b64string, c, 64)))
+
+/* base64_decode: decode len Base64 characters of from into to and NUL-terminate; returns the decoded size, or -1 */
+/* on a character outside the alphabet or when len is not a multiple of 4. to needs 3 * (len / 4) + 1 bytes.      */
+long base64_decode (char *to, char *from, unsigned int len)
+{
+	char *fromp = from;
+	char *top = to;
+	char *p;
+	unsigned char cbyte;
+	unsigned char obyte;
+	int padding = 0;
+
+	for (; len >= 4; len -= 4) {
+		if ((cbyte = *fromp++) == '=') cbyte = 0;
+		else {
+			if (badchar(cbyte, p)) return -1;
+			cbyte = (p - b64string);
+		}
+		obyte = cbyte << 2;		/* 1111 1100 */
+
+		if ((cbyte = *fromp++) == '=') cbyte = 0;
+		else {
+			if (badchar(cbyte, p)) return -1;
+			cbyte = p - b64string;
+		}
+		obyte |= cbyte >> 4;		/* 0000 0011 */
+		*top++ = obyte;
+
+		obyte = cbyte << 4;		/* 1111 0000 */
+		if ((cbyte = *fromp++) == '=') { cbyte = 0; padding++; }
+		else {
+			padding = 0;
+			if (badchar (cbyte, p)) return -1;
+			cbyte = p - b64string;
+		}
+		obyte |= cbyte >> 2;		/* 0000 1111 */
+		*top++ = obyte;
+
+		obyte = cbyte << 6;		/* 1100 0000 */
+		if ((cbyte = *fromp++) == '=') { cbyte = 0; padding++; }
+		else {
+			padding = 0;
+			if (badchar (cbyte, p)) return -1;
+			cbyte = p - b64string;
+		}
+		obyte |= cbyte;			/* 0011 1111 */
+		*top++ = obyte;
+	}
+
+	*top = 0;
+	if (len) return -1;
+	return (top - to) - padding;	/* bytes produced by '=' padding are not counted */
+}
+
diff --git a/thirdparty/misc/base64.h b/thirdparty/misc/base64.h
new file mode 100644
index 0000000000..456ef1811b
--- /dev/null
+++ b/thirdparty/misc/base64.h
@@ -0,0 +1,19 @@
+/*
+ * File: base64.h
+ * Description: Simple BASE64 conversion methods
+ * Author: Ari Edelkind
+ * License: Public Domain
+ * Website: http://episec.com/people/edelkind/c.html
+ */
+
+#ifndef BASE64_H /* NOTE(review): uint32_t is not declared here; the includer has to provide it first */
+#define BASE64_H
+#ifdef __cplusplus /* the linkage block is C++ syntax, so plain C includers must not see it */
+extern "C" {
+#endif
+uint32_t base64_encode (char* to, char* from, uint32_t len); /* NOTE(review): base64.c defines both functions as returning long */
+uint32_t base64_decode (char* to, char* from, uint32_t len); /* NOTE(review): base64.c returns -1 on bad input; verify how callers test for it */
+#ifdef __cplusplus
+}
+#endif
+#endif /* BASE64_H */
diff --git a/thirdparty/misc/curl_hostcheck.c b/thirdparty/misc/curl_hostcheck.c
new file mode 100644
index 0000000000..feef232619
--- /dev/null
+++ b/thirdparty/misc/curl_hostcheck.c
@@ -0,0 +1,217 @@
+/***************************************************************************
+ *                                  _   _ ____  _
+ *  Project                     ___| | | |  _ \| |
+ *                             / __| | | | |_) | |
+ *                            | (__| |_| |  _ <| |___
+ *                             \___|\___/|_| \_\_____|
+ *
+ * Copyright (C) 1998 - 2012, Daniel Stenberg, <daniel@haxx.se>, et al.
+ *
+ * This software is licensed as described in the file COPYING, which
+ * you should have received as part of this distribution. The terms
+ * are also available at http://curl.haxx.se/docs/copyright.html.
+ *
+ * You may opt to use, copy, modify, merge, publish, distribute and/or sell
+ * copies of the Software, and permit persons to whom the Software is
+ * furnished to do so, under the terms of the COPYING file.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ***************************************************************************/
+
+/* This file is an amalgamation of hostcheck.c and most of rawstr.c
+   from cURL.  The contents of the COPYING file mentioned above are:
+
+COPYRIGHT AND PERMISSION NOTICE
+
+Copyright (c) 1996 - 2013, Daniel Stenberg, <daniel@haxx.se>.
+
+All rights reserved.
+
+Permission to use, copy, modify, and distribute this software for any purpose
+with or without fee is hereby granted, provided that the above copyright
+notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN
+NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+OR OTHER DEALINGS IN THE SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder shall not
+be used in advertising or otherwise to promote the sale, use or other dealings
+in this Software without prior written authorization of the copyright holder.
+*/
+
+#include "curl_hostcheck.h"
+#include <string.h>
+
+/* Portable, locale-independent toupper (remember EBCDIC): 'a'..'z' map to
+   'A'..'Z', any other value is returned as is. toupper() is locale-bound. */
+static char Curl_raw_toupper(char in)
+{
+  switch (in) {
+  case 'a':
+    return 'A';
+  case 'b':
+    return 'B';
+  case 'c':
+    return 'C';
+  case 'd':
+    return 'D';
+  case 'e':
+    return 'E';
+  case 'f':
+    return 'F';
+  case 'g':
+    return 'G';
+  case 'h':
+    return 'H';
+  case 'i':
+    return 'I';
+  case 'j':
+    return 'J';
+  case 'k':
+    return 'K';
+  case 'l':
+    return 'L';
+  case 'm':
+    return 'M';
+  case 'n':
+    return 'N';
+  case 'o':
+    return 'O';
+  case 'p':
+    return 'P';
+  case 'q':
+    return 'Q';
+  case 'r':
+    return 'R';
+  case 's':
+    return 'S';
+  case 't':
+    return 'T';
+  case 'u':
+    return 'U';
+  case 'v':
+    return 'V';
+  case 'w':
+    return 'W';
+  case 'x':
+    return 'X';
+  case 'y':
+    return 'Y';
+  case 'z':
+    return 'Z';
+  }
+  return in; /* not a lowercase letter: unchanged */
+}
+
+/*
+ * Curl_raw_equal() is for doing "raw" case insensitive strings. This is meant
+ * to be locale independent and only compare strings we know are safe for
+ * this.  See http://daniel.haxx.se/blog/2008/10/15/strcasecmp-in-turkish/ for
+ * some further explanation to why this function is necessary.
+ *
+ * The function is capable of comparing a-z case insensitively even for
+ * non-ascii.
+ */
+
+static int Curl_raw_equal(const char *first, const char *second)
+{
+  const char *lhs = first;
+  const char *rhs = second;
+
+  for(; *lhs && *rhs; lhs++, rhs++) {
+    if(Curl_raw_toupper(*lhs) != Curl_raw_toupper(*rhs))
+      return 0; /* differing characters before either string ended */
+  }
+
+  /* At least one string has ended here; they are equal only when the
+     other one has ended as well, which this last comparison decides. */
+  return (Curl_raw_toupper(*lhs) == Curl_raw_toupper(*rhs));
+}
+
+static int Curl_raw_nequal(const char *first, const char *second, size_t max)
+{
+  size_t remaining = max;
+
+  for(; *first && *second && remaining; remaining--, first++, second++) {
+    if(Curl_raw_toupper(*first) != Curl_raw_toupper(*second))
+      return 0; /* differing characters within the first max */
+  }
+
+  /* All max characters were compared and found equal. */
+  if(remaining == 0)
+    return 1;
+  /* A string ended early: equal only if the other ended at the same spot. */
+  return Curl_raw_toupper(*first) == Curl_raw_toupper(*second);
+}
+
+/*
+ * Match a hostname against a wildcard pattern.
+ * E.g.
+ *  "foo.host.com" matches "*.host.com".
+ *
+ * We use the matching rule described in RFC6125, section 6.4.3.
+ * http://tools.ietf.org/html/rfc6125#section-6.4.3
+ */
+
+static int hostmatch(const char *hostname, const char *pattern)
+{
+  const char *star = strchr(pattern, '*');
+  const char *pat_dot, *host_dot;
+  size_t prefixlen, suffixlen;
+  int literal_only;
+
+  /* No wildcard at all: plain case-insensitive comparison. */
+  if(star == NULL)
+    return Curl_raw_equal(pattern, hostname) ?
+      CURL_HOST_MATCH : CURL_HOST_NOMATCH;
+
+  /* The wildcard is honoured only if the pattern has at least two dots,
+     the '*' is not to the right of the first dot, and the pattern does
+     not start with "xn--"; otherwise fall back to a literal compare. */
+  pat_dot = strchr(pattern, '.');
+  literal_only = (pat_dot == NULL || strchr(pat_dot + 1, '.') == NULL ||
+                  star > pat_dot ||
+                  Curl_raw_nequal(pattern, "xn--", 4));
+  if(literal_only)
+    return Curl_raw_equal(pattern, hostname) ?
+      CURL_HOST_MATCH : CURL_HOST_NOMATCH;
+
+  /* Everything from the first dot onwards must be identical. */
+  host_dot = strchr(hostname, '.');
+  if(host_dot == NULL || !Curl_raw_equal(pat_dot, host_dot))
+    return CURL_HOST_NOMATCH;
+
+  /* The wildcard must stand for at least one character, so the
+     first label of the hostname may not be shorter than the pattern's. */
+  if(host_dot - hostname < pat_dot - pattern)
+    return CURL_HOST_NOMATCH;
+
+  /* Text before '*' must open the label, text after it must close it. */
+  prefixlen = star - pattern;
+  suffixlen = pat_dot - (star + 1);
+  if(!Curl_raw_nequal(pattern, hostname, prefixlen))
+    return CURL_HOST_NOMATCH;
+  return Curl_raw_nequal(star + 1, host_dot - suffixlen, suffixlen) ?
+    CURL_HOST_MATCH : CURL_HOST_NOMATCH;
+}
+
+int Tool_Curl_cert_hostcheck(const char *match_pattern, const char *hostname)
+{
+  int usable = match_pattern && *match_pattern && hostname && *hostname;
+
+  if(!usable)
+    return 0; /* sanity check: NULL or empty arguments never match */
+
+  /* trivial case: the two names are equal, ignoring case */
+  if(Curl_raw_equal(hostname, match_pattern))
+    return 1;
+
+  return hostmatch(hostname, match_pattern) == CURL_HOST_MATCH;
+}
diff --git a/thirdparty/misc/curl_hostcheck.h b/thirdparty/misc/curl_hostcheck.h
new file mode 100644
index 0000000000..1b7fbe81e3
--- /dev/null
+++ b/thirdparty/misc/curl_hostcheck.h
@@ -0,0 +1,39 @@
+#ifndef HEADER_TOOL_CURL_HOSTCHECK_H
+#define HEADER_TOOL_CURL_HOSTCHECK_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/***************************************************************************
+ *                                  _   _ ____  _
+ *  Project                     ___| | | |  _ \| |
+ *                             / __| | | | |_) | |
+ *                            | (__| |_| |  _ <| |___
+ *                             \___|\___/|_| \_\_____|
+ *
+ * Copyright (C) 1998 - 2012, Daniel Stenberg, <daniel@haxx.se>, et al.
+ *
+ * This software is licensed as described in the file COPYING, which
+ * you should have received as part of this distribution. The terms
+ * are also available at http://curl.haxx.se/docs/copyright.html.
+ *
+ * You may opt to use, copy, modify, merge, publish, distribute and/or sell
+ * copies of the Software, and permit persons to whom the Software is
+ * furnished to do so, under the terms of the COPYING file.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ***************************************************************************/
+
+#define CURL_HOST_NOMATCH 0 /* the hostname does not match the pattern */
+#define CURL_HOST_MATCH   1 /* the hostname matches the pattern */
+int Tool_Curl_cert_hostcheck(const char *match_pattern, const char *hostname); /* 1 on match; 0 on mismatch or a NULL/empty argument */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* HEADER_TOOL_CURL_HOSTCHECK_H */
+
diff --git a/thirdparty/misc/fastlz.c b/thirdparty/misc/fastlz.c
new file mode 100644
index 0000000000..508f6ea2ae
--- /dev/null
+++ b/thirdparty/misc/fastlz.c
@@ -0,0 +1,551 @@
+ /*
+  FastLZ - lightning-fast lossless compression library
+
+  Copyright (C) 2007 Ariya Hidayat (ariya@kde.org)
+  Copyright (C) 2006 Ariya Hidayat (ariya@kde.org)
+  Copyright (C) 2005 Ariya Hidayat (ariya@kde.org)
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#if !defined(FASTLZ_COMPRESSOR) && !defined(FASTLZ_DECOMPRESSOR)
+
+/*
+ * Always check for bound when decompressing.
+ * Generally it is best to leave it defined.
+ */
+#define FASTLZ_SAFE
+
+/*
+ * Give hints to the compiler for branch prediction optimization.
+ */
+#if defined(__GNUC__) && (__GNUC__ > 2)
+#define FASTLZ_EXPECT_CONDITIONAL(c)    (__builtin_expect((c), 1))
+#define FASTLZ_UNEXPECT_CONDITIONAL(c)  (__builtin_expect((c), 0))
+#else
+#define FASTLZ_EXPECT_CONDITIONAL(c)    (c)
+#define FASTLZ_UNEXPECT_CONDITIONAL(c)  (c)
+#endif
+
+/*
+ * Use inlined functions for supported systems.
+ */
+#if defined(__GNUC__) || defined(__DMC__) || defined(__POCC__) || defined(__WATCOMC__) || defined(__SUNPRO_C)
+#define FASTLZ_INLINE inline
+#elif defined(__BORLANDC__) || defined(_MSC_VER) || defined(__LCC__)
+#define FASTLZ_INLINE __inline
+#else 
+#define FASTLZ_INLINE
+#endif
+
+/*
+ * Prevent accessing more than 8-bit at once, except on x86 architectures.
+ */
+#if !defined(FASTLZ_STRICT_ALIGN)
+#define FASTLZ_STRICT_ALIGN
+#if defined(__i386__) || defined(__386)  /* GNU C, Sun Studio */
+#undef FASTLZ_STRICT_ALIGN
+#elif defined(__i486__) || defined(__i586__) || defined(__i686__) /* GNU C */
+#undef FASTLZ_STRICT_ALIGN
+#elif defined(_M_IX86) /* Intel, MSVC */
+#undef FASTLZ_STRICT_ALIGN
+#elif defined(__386)
+#undef FASTLZ_STRICT_ALIGN
+#elif defined(_X86_) /* MinGW */
+#undef FASTLZ_STRICT_ALIGN
+#elif defined(__I86__) /* Digital Mars */
+#undef FASTLZ_STRICT_ALIGN
+#endif
+#endif
+
+/*
+ * FIXME: use preprocessor magic to set this on different platforms!
+ */
+typedef unsigned char  flzuint8;
+typedef unsigned short flzuint16;
+typedef unsigned int   flzuint32;
+
+/* prototypes */
+int fastlz_compress(const void* input, int length, void* output);
+int fastlz_compress_level(int level, const void* input, int length, void* output);
+int fastlz_decompress(const void* input, int length, void* output, int maxout);
+
+#define MAX_COPY       32  /* a literal run is restarted once it reaches this many bytes */
+#define MAX_LEN       264  /* 256 + 8 */
+#define MAX_DISTANCE 8192  /* level 1 rejects matches at this distance or farther */
+/* FASTLZ_READU16 reads 16 bits at p; the bytewise form assembles them low byte first */
+#if !defined(FASTLZ_STRICT_ALIGN)
+#define FASTLZ_READU16(p) *((const flzuint16*)(p)) 
+#else
+#define FASTLZ_READU16(p) ((p)[0] | (p)[1]<<8)
+#endif
+/* hash table of 2^HASH_LOG slots; HASH_FUNCTION folds the bytes p[0..2] into a slot index stored in v */
+#define HASH_LOG  13
+#define HASH_SIZE (1<< HASH_LOG)
+#define HASH_MASK  (HASH_SIZE-1)
+#define HASH_FUNCTION(v,p) { v = FASTLZ_READU16(p); v ^= FASTLZ_READU16(p+1)^(v>>(16-HASH_LOG));v &= HASH_MASK; }
+
+#undef FASTLZ_LEVEL
+#define FASTLZ_LEVEL 1
+/* bind the generic names to fastlz1_* and include this file again: with them defined, the #else branch of the top-level #if supplies the function bodies */
+#undef FASTLZ_COMPRESSOR
+#undef FASTLZ_DECOMPRESSOR
+#define FASTLZ_COMPRESSOR fastlz1_compress
+#define FASTLZ_DECOMPRESSOR fastlz1_decompress
+static FASTLZ_INLINE int FASTLZ_COMPRESSOR(const void* input, int length, void* output);
+static FASTLZ_INLINE int FASTLZ_DECOMPRESSOR(const void* input, int length, void* output, int maxout);
+#include "fastlz.c"
+
+#undef FASTLZ_LEVEL
+#define FASTLZ_LEVEL 2
+/* level 2 uses its own distance limits, then instantiates fastlz2_* the same way */
+#undef MAX_DISTANCE
+#define MAX_DISTANCE 8191
+#define MAX_FARDISTANCE (65535+MAX_DISTANCE-1)
+
+#undef FASTLZ_COMPRESSOR
+#undef FASTLZ_DECOMPRESSOR
+#define FASTLZ_COMPRESSOR fastlz2_compress
+#define FASTLZ_DECOMPRESSOR fastlz2_decompress
+static FASTLZ_INLINE int FASTLZ_COMPRESSOR(const void* input, int length, void* output);
+static FASTLZ_INLINE int FASTLZ_DECOMPRESSOR(const void* input, int length, void* output, int maxout);
+#include "fastlz.c"
+
+int fastlz_compress(const void* input, int length, void* output)
+{
+  /* blocks of 64 KiB or more are handed to the level-2 coder;
+     anything shorter is compressed with level 1 */
+  if(length >= 65536)
+    return fastlz2_compress(input, length, output);
+
+  return fastlz1_compress(input, length, output);
+}
+
+int fastlz_decompress(const void* input, int length, void* output, int maxout)
+{
+  /* The top three bits of the first byte select the format (0 -> level 1,
+     1 -> level 2). An empty input has no such byte, so it is never read. */
+  int level = (length > 0) ? ((*(const flzuint8*)input) >> 5) + 1 : 0;
+
+  if(level == 1)
+    return fastlz1_decompress(input, length, output, maxout);
+  if(level == 2)
+    return fastlz2_decompress(input, length, output, maxout);
+  /* empty input or unknown level, trigger error */
+  return 0;
+}
+
+int fastlz_compress_level(int level, const void* input, int length, void* output)
+{
+  /* dispatch on the requested level; any other level yields 0 */
+  switch(level)
+  {
+    case 1:  return fastlz1_compress(input, length, output);
+    case 2:  return fastlz2_compress(input, length, output);
+    default: return 0;
+  }
+}
+
+#else /* !defined(FASTLZ_COMPRESSOR) && !defined(FASTLZ_DECOMPRESSOR) */
+
+static FASTLZ_INLINE int FASTLZ_COMPRESSOR(const void* input, int length, void* output)
+{ /* Compresses length bytes from input into output; returns the number of bytes written (0 when length is 0). */
+  const flzuint8* ip = (const flzuint8*) input;
+  const flzuint8* ip_bound = ip + length - 2;  /* the match-extension loops stop at this pointer */
+  const flzuint8* ip_limit = ip + length - 12; /* the main loop only looks for matches below this */
+  flzuint8* op = (flzuint8*) output;
+  /* htab remembers, per hash value, the most recent input position stored for it */
+  const flzuint8* htab[HASH_SIZE];
+  const flzuint8** hslot;
+  flzuint32 hval;
+  /* number of literal bytes in the run currently being written */
+  flzuint32 copy;
+
+  /* sanity check: inputs shorter than 4 bytes become a single literal run */
+  if(FASTLZ_UNEXPECT_CONDITIONAL(length < 4))
+  {
+    if(length)
+    {
+      /* create literal copy only */
+      *op++ = length-1;
+      ip_bound++;
+      while(ip <= ip_bound)
+        *op++ = *ip++;
+      return length+1;
+    }
+    else
+      return 0;
+  }
+
+  /* initializes hash table: every slot starts at the first input byte */
+  for (hslot = htab; hslot < htab + HASH_SIZE; hslot++)
+    *hslot = ip;
+
+  /* we start with literal copy: a placeholder count byte, then two raw bytes */
+  copy = 2;
+  *op++ = MAX_COPY-1;
+  *op++ = *ip++;
+  *op++ = *ip++;
+
+  /* main loop */
+  while(FASTLZ_EXPECT_CONDITIONAL(ip < ip_limit))
+  {
+    const flzuint8* ref;
+    flzuint32 distance;
+
+    /* minimum match length */
+    flzuint32 len = 3;
+
+    /* comparison starting-point */
+    const flzuint8* anchor = ip;
+
+    /* check for a run (level 2 only): ip[-1], ip[0], ip[1] and ip[2] are all equal */
+#if FASTLZ_LEVEL==2
+    if(ip[0] == ip[-1] && FASTLZ_READU16(ip-1)==FASTLZ_READU16(ip+1))
+    {
+      distance = 1;
+      ip += 3;
+      ref = anchor - 1 + 3;
+      goto match;
+    }
+#endif
+
+    /* find potential match */
+    HASH_FUNCTION(hval,ip);
+    hslot = htab + hval;
+    ref = htab[hval];
+
+    /* calculate distance to the match */
+    distance = anchor - ref;
+
+    /* update hash table */
+    *hslot = anchor;
+
+    /* is this a match? check the first 3 bytes */
+    if(distance==0 || 
+#if FASTLZ_LEVEL==1
+    (distance >= MAX_DISTANCE) ||
+#else
+    (distance >= MAX_FARDISTANCE) ||
+#endif
+    *ref++ != *ip++ || *ref++!=*ip++ || *ref++!=*ip++)
+      goto literal;
+
+#if FASTLZ_LEVEL==2
+    /* far, needs at least 5-byte match */
+    if(distance >= MAX_DISTANCE)
+    {
+      if(*ip++ != *ref++ || *ip++!= *ref++) 
+        goto literal;
+      len += 2;
+    }
+    /* the run check above jumps here with distance == 1 */
+    match:
+#endif
+
+    /* last matched byte */
+    ip = anchor + len;
+
+    /* distance is biased */
+    distance--;
+
+    if(!distance)
+    {
+      /* zero distance means a run */
+      flzuint8 x = ip[-1];
+      while(ip < ip_bound)
+        if(*ref++ != x) break; else ip++;
+    }
+    else
+    for(;;)
+    {
+      /* safe because the outer check against ip limit */
+      if(*ref++ != *ip++) break;
+      if(*ref++ != *ip++) break;
+      if(*ref++ != *ip++) break;
+      if(*ref++ != *ip++) break;
+      if(*ref++ != *ip++) break;
+      if(*ref++ != *ip++) break;
+      if(*ref++ != *ip++) break;
+      if(*ref++ != *ip++) break;
+      while(ip < ip_bound)
+        if(*ref++ != *ip++) break;
+      break;
+    }
+
+    /* if we have copied something, adjust the copy count */
+    if(copy)
+      /* copy is biased, '0' means 1 byte copy */
+      *(op-copy-1) = copy-1;
+    else
+      /* back, to overwrite the copy count */
+      op--;
+
+    /* reset literal counter */
+    copy = 0;
+
+    /* length is biased, '1' means a match of 3 bytes */
+    ip -= 3;
+    len = ip - anchor;
+
+    /* encode the match */
+#if FASTLZ_LEVEL==2
+    if(distance < MAX_DISTANCE)
+    {
+      if(len < 7)
+      {
+        *op++ = (len << 5) + (distance >> 8);
+        *op++ = (distance & 255);
+      }
+      else
+      {
+        *op++ = (7 << 5) + (distance >> 8);
+        for(len-=7; len >= 255; len-= 255)
+          *op++ = 255;
+        *op++ = len;
+        *op++ = (distance & 255);
+      }
+    }
+    else
+    {
+      /* far away, but not yet in the another galaxy... */
+      if(len < 7)
+      {
+        distance -= MAX_DISTANCE;
+        *op++ = (len << 5) + 31;
+        *op++ = 255;
+        *op++ = distance >> 8;
+        *op++ = distance & 255;
+      }
+      else
+      {
+        distance -= MAX_DISTANCE;
+        *op++ = (7 << 5) + 31;
+        for(len-=7; len >= 255; len-= 255)
+          *op++ = 255;
+        *op++ = len;
+        *op++ = 255;
+        *op++ = distance >> 8;
+        *op++ = distance & 255;
+      }
+    }
+#else
+    /* level 1: a match longer than MAX_LEN-2 is emitted as several maximal tokens */
+    if(FASTLZ_UNEXPECT_CONDITIONAL(len > MAX_LEN-2))
+      while(len > MAX_LEN-2)
+      {
+        *op++ = (7 << 5) + (distance >> 8);
+        *op++ = MAX_LEN - 2 - 7 -2; 
+        *op++ = (distance & 255);
+        len -= MAX_LEN-2;
+      }
+
+    if(len < 7)
+    {
+      *op++ = (len << 5) + (distance >> 8);
+      *op++ = (distance & 255);
+    }
+    else
+    {
+      *op++ = (7 << 5) + (distance >> 8);
+      *op++ = len - 7;
+      *op++ = (distance & 255);
+    }
+#endif
+
+    /* update the hash at match boundary */
+    HASH_FUNCTION(hval,ip);
+    htab[hval] = ip++;
+    HASH_FUNCTION(hval,ip);
+    htab[hval] = ip++;
+
+    /* assuming literal copy */
+    *op++ = MAX_COPY-1;
+
+    continue;
+
+    literal:
+      *op++ = *anchor++;
+      ip = anchor;
+      copy++;
+      if(FASTLZ_UNEXPECT_CONDITIONAL(copy == MAX_COPY))
+      {
+        copy = 0;
+        *op++ = MAX_COPY-1;
+      }
+  }
+
+  /* left-over as literal copy */
+  ip_bound++;
+  while(ip <= ip_bound)
+  {
+    *op++ = *ip++;
+    copy++;
+    if(copy == MAX_COPY)
+    {
+      copy = 0;
+      *op++ = MAX_COPY-1;
+    }
+  }
+
+  /* if we have copied something, adjust the copy length */
+  if(copy)
+    *(op-copy-1) = copy-1;
+  else
+    op--;
+
+#if FASTLZ_LEVEL==2
+  /* marker for fastlz2 */
+  *(flzuint8*)output |= (1 << 5);
+#endif
+
+  return op - (flzuint8*)output;
+}
+
+static FASTLZ_INLINE int FASTLZ_DECOMPRESSOR(const void* input, int length, void* output, int maxout)
+{ /* Decodes length bytes from input into output (capacity maxout); returns the bytes produced, or 0 when a FASTLZ_SAFE bound check fails. */
+  const flzuint8* ip = (const flzuint8*) input;
+  const flzuint8* ip_limit  = ip + length;
+  flzuint8* op = (flzuint8*) output;
+  flzuint8* op_limit = op + maxout;
+  flzuint32 ctrl = (*ip++) & 31; /* first control byte with its top three bits masked off (level 2 stores its marker there) */
+  int loop = 1;
+
+  do
+  {
+    const flzuint8* ref = op;
+    flzuint32 len = ctrl >> 5;
+    flzuint32 ofs = (ctrl & 31) << 8;
+
+    if(ctrl >= 32) /* match token; control bytes below 32 start a literal run */
+    {
+#if FASTLZ_LEVEL==2
+      flzuint8 code;
+#endif
+      len--;
+      ref -= ofs;
+      if (len == 7-1)
+#if FASTLZ_LEVEL==1
+        len += *ip++;
+      ref -= *ip++;
+#else
+        do
+        {
+          code = *ip++;
+          len += code;
+        } while (code==255);
+      code = *ip++;
+      ref -= code;
+
+      /* match from 16-bit distance */
+      if(FASTLZ_UNEXPECT_CONDITIONAL(code==255))
+      if(FASTLZ_EXPECT_CONDITIONAL(ofs==(31 << 8)))
+      {
+        ofs = (*ip++) << 8;
+        ofs += *ip++;
+        ref = op - ofs - MAX_DISTANCE;
+      }
+#endif
+      /* NOTE(review): the *ip++ reads of the match token above are not checked against ip_limit -- confirm truncated input cannot reach here */
+#ifdef FASTLZ_SAFE
+      if (FASTLZ_UNEXPECT_CONDITIONAL(op + len + 3 > op_limit))
+        return 0;
+
+      if (FASTLZ_UNEXPECT_CONDITIONAL(ref-1 < (flzuint8 *)output))
+        return 0;
+#endif
+      /* the next control byte is fetched before the match bytes are copied */
+      if(FASTLZ_EXPECT_CONDITIONAL(ip < ip_limit))
+        ctrl = *ip++;
+      else
+        loop = 0;
+
+      if(ref == op)
+      {
+        /* optimize copy for a run */
+        flzuint8 b = ref[-1];
+        *op++ = b;
+        *op++ = b;
+        *op++ = b;
+        for(; len; --len)
+          *op++ = b;
+      }
+      else
+      {
+#if !defined(FASTLZ_STRICT_ALIGN)
+        const flzuint16* p;
+        flzuint16* q;
+#endif
+        /* copy from reference */
+        ref--;
+        *op++ = *ref++;
+        *op++ = *ref++;
+        *op++ = *ref++;
+
+#if !defined(FASTLZ_STRICT_ALIGN)
+        /* copy a byte, so that now it's word aligned */
+        if(len & 1)
+        {
+          *op++ = *ref++;
+          len--;
+        }
+
+        /* copy 16-bit at once */
+        q = (flzuint16*) op;
+        op += len;
+        p = (const flzuint16*) ref;
+        for(len>>=1; len > 4; len-=4)
+        {
+          *q++ = *p++;
+          *q++ = *p++;
+          *q++ = *p++;
+          *q++ = *p++;
+        }
+        for(; len; --len)
+          *q++ = *p++;
+#else
+        for(; len; --len)
+          *op++ = *ref++;
+#endif
+      }
+    }
+    else
+    {
+      ctrl++; /* literal run: the control byte stores the byte count minus one */
+#ifdef FASTLZ_SAFE
+      if (FASTLZ_UNEXPECT_CONDITIONAL(op + ctrl > op_limit))
+        return 0;
+      if (FASTLZ_UNEXPECT_CONDITIONAL(ip + ctrl > ip_limit))
+        return 0;
+#endif
+
+      *op++ = *ip++; 
+      for(--ctrl; ctrl; ctrl--)
+        *op++ = *ip++;
+
+      loop = FASTLZ_EXPECT_CONDITIONAL(ip < ip_limit);
+      if(loop)
+        ctrl = *ip++;
+    }
+  }
+  while(FASTLZ_EXPECT_CONDITIONAL(loop));
+
+  return op - (flzuint8*)output;
+}
+
+#endif /* !defined(FASTLZ_COMPRESSOR) && !defined(FASTLZ_DECOMPRESSOR) */
diff --git a/thirdparty/misc/fastlz.h b/thirdparty/misc/fastlz.h
new file mode 100644
index 0000000000..e5ca8dfc02
--- /dev/null
+++ b/thirdparty/misc/fastlz.h
@@ -0,0 +1,100 @@
+/*
+  FastLZ - lightning-fast lossless compression library
+
+  Copyright (C) 2007 Ariya Hidayat (ariya@kde.org)
+  Copyright (C) 2006 Ariya Hidayat (ariya@kde.org)
+  Copyright (C) 2005 Ariya Hidayat (ariya@kde.org)
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#ifndef FASTLZ_H
+#define FASTLZ_H
+
+#define FASTLZ_VERSION 0x000100
+/* individual components; they must agree with FASTLZ_VERSION (presumably packed as 0xMMmmrr) and with the string below */
+#define FASTLZ_VERSION_MAJOR     0
+#define FASTLZ_VERSION_MINOR     1
+#define FASTLZ_VERSION_REVISION  0
+
+#define FASTLZ_VERSION_STRING "0.1.0"
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/**
+  Compress a block of data in the input buffer and returns the size of
+  compressed block. The size of input buffer is specified by length. The
+  minimum input buffer size is 16.
+
+  The output buffer must be at least 5% larger than the input buffer
+  and can not be smaller than 66 bytes.
+
+  If the input is not compressible, the return value might be larger than
+  length (input buffer size).
+
+  The input buffer and the output buffer can not overlap.
+*/
+
+int fastlz_compress(const void* input, int length, void* output);
+
+/**
+  Decompress a block of compressed data and returns the size of the
+  decompressed block. If error occurs, e.g. the compressed data is
+  corrupted or the output buffer is not large enough, then 0 (zero)
+  will be returned instead.
+
+  The input buffer and the output buffer can not overlap.
+
+  Decompression is memory safe and guaranteed not to write the output buffer
+  more than what is specified in maxout.
+ */
+
+int fastlz_decompress(const void* input, int length, void* output, int maxout);
+
+/**
+  Compress a block of data in the input buffer and returns the size of
+  compressed block. The size of input buffer is specified by length. The
+  minimum input buffer size is 16.
+
+  The output buffer must be at least 5% larger than the input buffer
+  and can not be smaller than 66 bytes.
+
+  If the input is not compressible, the return value might be larger than
+  length (input buffer size).
+
+  The input buffer and the output buffer can not overlap.
+
+  Compression level can be specified in parameter level. At the moment,
+  only level 1 and level 2 are supported.
+  Level 1 is the fastest compression and generally useful for short data.
+  Level 2 is slightly slower but it gives better compression ratio.
+
+  Note that the compressed data, regardless of the level, can always be
+  decompressed using the function fastlz_decompress above.
+*/
+
+int fastlz_compress_level(int level, const void* input, int length, void* output);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* FASTLZ_H */
diff --git a/thirdparty/misc/hq2x.cpp b/thirdparty/misc/hq2x.cpp
new file mode 100644
index 0000000000..7ebb505d64
--- /dev/null
+++ b/thirdparty/misc/hq2x.cpp
@@ -0,0 +1,2636 @@
+/*
+ * Copyright 2016 Bruno Ribeiro
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "hq2x.h"
+#include "math_funcs.h"
+
+
+static const uint32_t AMASK = 0xFF000000; /* alpha byte of the packed value built by ARGBtoAYUV */
+static const uint32_t YMASK = 0x00FF0000; /* Y byte */
+static const uint32_t UMASK = 0x0000FF00; /* U byte */
+static const uint32_t VMASK = 0x000000FF; /* V byte */
+
+_FORCE_INLINE_ static uint32_t ARGBtoAYUV(
+	uint32_t value )
+{
+	// Unpacks the four bytes of value (alpha in the top byte, then red, green,
+	// blue) and repacks them as alpha, Y, U, V. todo big endian check
+	const uint32_t alpha = value >> 24;
+	const uint32_t red   = (value >> 16) & 0xFF;
+	const uint32_t green = (value >> 8) & 0xFF;
+	const uint32_t blue  = value & 0xFF;
+
+	const uint32_t y = Math::fast_ftoi( 0.299 * red + 0.587 * green + 0.114 * blue);
+	const uint32_t u = Math::fast_ftoi(-0.169 * red - 0.331 * green +   0.5 * blue) + 128;
+	const uint32_t v = Math::fast_ftoi(   0.5 * red - 0.419 * green - 0.081 * blue) + 128;
+	return (alpha << 24) + (y << 16) + (u << 8) + v;
+}
+
+
+/*
+ * Use this function for sharper images (good for cartoon style, used by DOSBOX)
+ */
+
+_FORCE_INLINE_ static bool isDifferent(
+	uint32_t color1,
+	uint32_t color2,
+	uint32_t trY,
+	uint32_t trU,
+	uint32_t trV,
+	uint32_t trA )
+{
+	// Returns true when any AYUV channel of the two ARGB colors differs by more
+	// than its threshold. Channels are compared in place (still shifted), so
+	// trY, trU and trA must be pre-shifted by 16, 8 and 24 bits respectively.
+	// The difference is an explicit unsigned |a - b|: the signed idiom
+	// (x ^ (x >> 31)) - (x >> 31) does not negate a wrapped uint32_t value.
+	color1 = ARGBtoAYUV(color1);
+	color2 = ARGBtoAYUV(color2);
+	uint32_t a, b;
+
+	a = color1 & YMASK; b = color2 & YMASK;
+	if ((a > b ? a - b : b - a) > trY) return true;
+
+	a = color1 & UMASK; b = color2 & UMASK;
+	if ((a > b ? a - b : b - a) > trU) return true;
+
+	a = color1 & VMASK; b = color2 & VMASK;
+	if ((a > b ? a - b : b - a) > trV) return true;
+
+	a = color1 & AMASK; b = color2 & AMASK;
+	if ((a > b ? a - b : b - a) > trA) return true;
+
+	return false;
+
+}
+
+
+
+#define MASK_RB   0x00FF00FF
+#define MASK_G    0x0000FF00
+#define MASK_A    0xFF000000
+
+
+/**
+ * @brief Mixes two packed 32-bit colors channel by channel using weights W0 and W1 (every argument may be evaluated several times).
+ */
+#define HQX_MIX_2(C0,C1,W0,W1) \
+	((((((C0) & MASK_RB) * (W0) + ((C1) & MASK_RB) * (W1)) / ((W0) + (W1))) & MASK_RB) | \
+	(((((C0) & MASK_G)  * (W0) + ((C1) & MASK_G)  * (W1)) / ((W0) + (W1))) & MASK_G)  | \
+	(((((((C0) & MASK_A) >> 8)  * (W0) + (((C1) & MASK_A) >> 8) * (W1)) / ((W0) + (W1))) << 8) & MASK_A))
+
+/**
+ * @brief Mixes three packed 32-bit colors channel by channel using weights W0, W1 and W2 (every argument may be evaluated several times).
+ */
+#define HQX_MIX_3(C0,C1,C2,W0,W1,W2) \
+	((((((C0) & MASK_RB) * (W0) + ((C1) & MASK_RB) * (W1) + ((C2) & MASK_RB) * (W2)) / ((W0) + (W1) + (W2))) & MASK_RB) | \
+	(((((C0) & MASK_G)  * (W0) + ((C1) & MASK_G)  * (W1) + ((C2) & MASK_G)  * (W2)) / ((W0) + (W1) + (W2))) & MASK_G)  | \
+	(((((((C0) & MASK_A) >> 8) * (W0) + (((C1) & MASK_A) >> 8) * (W1) + (((C2) & MASK_A) >> 8) * (W2)) / ((W0) + (W1) + (W2))) << 8) & MASK_A))
+
+
+#define MIX_00_4				*output = w[4]; /* naming: MIX_<row><col>_<w[] indices>_<weights>; each macro stores into output pixel (row, col), here (0,0) */
+#define MIX_00_MIX_00_4_0_3_1	*output = HQX_MIX_2(w[4],w[0],3U,1U); /* name departs from the scheme: 3:1 mix of w[4] and w[0] */
+#define MIX_00_4_3_3_1			*output = HQX_MIX_2(w[4],w[3],3U,1U);
+#define MIX_00_4_1_3_1			*output = HQX_MIX_2(w[4],w[1],3U,1U);
+#define MIX_00_3_1_1_1			*output = HQX_MIX_2(w[3],w[1],1U,1U);
+#define MIX_00_4_3_1_2_1_1		*output = HQX_MIX_3(w[4],w[3],w[1],2U,1U,1U);
+#define MIX_00_4_3_1_2_7_7 		*output = HQX_MIX_3(w[4],w[3],w[1],2U,7U,7U);
+#define MIX_00_4_0_1_2_1_1		*output = HQX_MIX_3(w[4],w[0],w[1],2U,1U,1U);
+#define MIX_00_4_0_3_2_1_1		*output = HQX_MIX_3(w[4],w[0],w[3],2U,1U,1U);
+#define MIX_00_4_1_3_5_2_1		*output = HQX_MIX_3(w[4],w[1],w[3],5U,2U,1U);
+#define MIX_00_4_3_1_5_2_1		*output = HQX_MIX_3(w[4],w[3],w[1],5U,2U,1U);
+#define MIX_00_4_3_1_6_1_1		*output = HQX_MIX_3(w[4],w[3],w[1],6U,1U,1U);
+#define MIX_00_4_3_1_2_3_3		*output = HQX_MIX_3(w[4],w[3],w[1],2U,3U,3U);
+#define MIX_00_MIX_00_4_0_3_10	*output = HQX_MIX_3(w[4],w[3],w[1],14U,1U,1U); /* name departs from the scheme: 14:1:1 mix of w[4], w[3], w[1] */
+/* output pixel (row 0, col 1) */
+#define MIX_01_4			*(output + 1) = w[4];
+#define MIX_01_4_2_3_1		*(output + 1) = HQX_MIX_2(w[4],w[2],3U,1U);
+#define MIX_01_4_1_3_1		*(output + 1) = HQX_MIX_2(w[4],w[1],3U,1U);
+#define MIX_01_1_4_3_1		*(output + 1) = HQX_MIX_2(w[1],w[4],3U,1U);
+#define MIX_01_4_5_3_1		*(output + 1) = HQX_MIX_2(w[4],w[5],3U,1U);
+#define MIX_01_4_1_7_1		*(output + 1) = HQX_MIX_2(w[4],w[1],7U,1U);
+#define MIX_01_4_1_5_2_1_1	*(output + 1) = HQX_MIX_3(w[4],w[1],w[5],2U,1U,1U);
+#define MIX_01_4_2_5_2_1_1	*(output + 1) = HQX_MIX_3(w[4],w[2],w[5],2U,1U,1U);
+#define MIX_01_4_2_1_2_1_1	*(output + 1) = HQX_MIX_3(w[4],w[2],w[1],2U,1U,1U);
+#define MIX_01_4_5_1_5_2_1	*(output + 1) = HQX_MIX_3(w[4],w[5],w[1],5U,2U,1U);
+#define MIX_01_4_1_5_5_2_1	*(output + 1) = HQX_MIX_3(w[4],w[1],w[5],5U,2U,1U);
+#define MIX_01_4_1_5_6_1_1	*(output + 1) = HQX_MIX_3(w[4],w[1],w[5],6U,1U,1U);
+#define MIX_01_4_1_5_2_3_3	*(output + 1) = HQX_MIX_3(w[4],w[1],w[5],2U,3U,3U);
+#define MIX_01_4_2_3_10		*(output + 1) = HQX_MIX_3(w[4],w[1],w[5],14U,1U,1U); /* name departs from the scheme: 14:1:1 mix of w[4], w[1], w[5] */
+/* output pixel (row 0, col 2) */
+#define MIX_02_4			*(output + 2) = w[4];
+#define MIX_02_4_2_3_1		*(output + 2) = HQX_MIX_2(w[4],w[2],3U,1U);
+#define MIX_02_4_1_3_1		*(output + 2) = HQX_MIX_2(w[4],w[1],3U,1U);
+#define MIX_02_4_5_3_1  	*(output + 2) = HQX_MIX_2(w[4],w[5],3U,1U);
+#define MIX_02_4_1_5_2_1_1	*(output + 2) = HQX_MIX_3(w[4],w[1],w[5],2U,1U,1U);
+#define MIX_02_4_1_5_2_7_7	*(output + 2) = HQX_MIX_3(w[4],w[1],w[5],2U,7U,7U);
+#define MIX_02_1_5_1_1		*(output + 2) = HQX_MIX_2(w[1],w[5],1U,1U);
+/* output pixel (row 1, col 0); lineSize is the length of one output row */
+#define MIX_10_4			*(output + lineSize) = w[4];
+#define MIX_10_4_6_3_1		*(output + lineSize) = HQX_MIX_2(w[4],w[6],3U,1U);
+#define MIX_10_4_7_3_1		*(output + lineSize) = HQX_MIX_2(w[4],w[7],3U,1U);
+#define MIX_10_4_3_3_1		*(output + lineSize) = HQX_MIX_2(w[4],w[3],3U,1U);
+#define MIX_10_4_7_3_2_1_1	*(output + lineSize) = HQX_MIX_3(w[4],w[7],w[3],2U,1U,1U);
+#define MIX_10_4_6_3_2_1_1	*(output + lineSize) = HQX_MIX_3(w[4],w[6],w[3],2U,1U,1U);
+#define MIX_10_4_6_7_2_1_1	*(output + lineSize) = HQX_MIX_3(w[4],w[6],w[7],2U,1U,1U);
+#define MIX_10_4_3_7_5_2_1	*(output + lineSize) = HQX_MIX_3(w[4],w[3],w[7],5U,2U,1U);
+#define MIX_10_4_7_3_5_2_1	*(output + lineSize) = HQX_MIX_3(w[4],w[7],w[3],5U,2U,1U);
+#define MIX_10_4_7_3_6_1_1	*(output + lineSize) = HQX_MIX_3(w[4],w[7],w[3],6U,1U,1U);
+#define MIX_10_4_7_3_2_3_3	*(output + lineSize) = HQX_MIX_3(w[4],w[7],w[3],2U,3U,3U);
+#define MIX_10_4_6_3_10		*(output + lineSize) = HQX_MIX_3(w[4],w[7],w[3],14U,1U,1U); /* name departs from the scheme: 14:1:1 mix of w[4], w[7], w[3] */
+#define MIX_10_4_3_7_1  	*(output + lineSize) = HQX_MIX_2(w[4],w[3],7U,1U);
+#define MIX_10_3_4_3_1  	*(output + lineSize) = HQX_MIX_2(w[3],w[4],3U,1U);
+/* output pixel (row 1, col 1) */
+#define MIX_11_4			*(output + lineSize + 1) = w[4];
+#define MIX_11_4_8_3_1		*(output + lineSize + 1) = HQX_MIX_2(w[4],w[8],3U,1U);
+#define MIX_11_4_5_3_1		*(output + lineSize + 1) = HQX_MIX_2(w[4],w[5],3U,1U);
+#define MIX_11_4_7_3_1		*(output + lineSize + 1) = HQX_MIX_2(w[4],w[7],3U,1U);
+#define MIX_11_4_5_7_2_1_1	*(output + lineSize + 1) = HQX_MIX_3(w[4],w[5],w[7],2U,1U,1U);
+#define MIX_11_4_8_7_2_1_1	*(output + lineSize + 1) = HQX_MIX_3(w[4],w[8],w[7],2U,1U,1U);
+#define MIX_11_4_8_5_2_1_1	*(output + lineSize + 1) = HQX_MIX_3(w[4],w[8],w[5],2U,1U,1U);
+#define MIX_11_4_7_5_5_2_1	*(output + lineSize + 1) = HQX_MIX_3(w[4],w[7],w[5],5U,2U,1U);
+#define MIX_11_4_5_7_5_2_1	*(output + lineSize + 1) = HQX_MIX_3(w[4],w[5],w[7],5U,2U,1U);
+#define MIX_11_4_5_7_6_1_1	*(output + lineSize + 1) = HQX_MIX_3(w[4],w[5],w[7],6U,1U,1U);
+#define MIX_11_4_5_7_2_3_3	*(output + lineSize + 1) = HQX_MIX_3(w[4],w[5],w[7],2U,3U,3U);
+#define MIX_11_4_8_3_10		*(output + lineSize + 1) = HQX_MIX_3(w[4],w[5],w[7],14U,1U,1U); /* name departs from the scheme: 14:1:1 mix of w[4], w[5], w[7] */
+/* output pixel (row 1, col 2) */
+#define MIX_12_4			*(output + lineSize + 2) = w[4];
+#define MIX_12_4_5_3_1		*(output + lineSize + 2) = HQX_MIX_2(w[4],w[5],3U,1U);
+#define MIX_12_4_5_7_1		*(output + lineSize + 2) = HQX_MIX_2(w[4],w[5],7U,1U);
+#define MIX_12_5_4_3_1		*(output + lineSize + 2) = HQX_MIX_2(w[5],w[4],3U,1U);
+/* output pixel (row 2, col 0) */
+#define MIX_20_4			*(output + lineSize + lineSize) = w[4];
+#define MIX_20_4_6_3_1		*(output + lineSize + lineSize) = HQX_MIX_2(w[4],w[6],3U,1U);
+#define MIX_20_4_7_3_1		*(output + lineSize + lineSize) = HQX_MIX_2(w[4],w[7],3U,1U);
+#define MIX_20_4_3_3_1		*(output + lineSize + lineSize) = HQX_MIX_2(w[4],w[3],3U,1U);
+#define MIX_20_4_7_3_2_1_1	*(output + lineSize + lineSize) = HQX_MIX_3(w[4],w[7],w[3],2U,1U,1U);
+#define MIX_20_4_7_3_2_7_7	*(output + lineSize + lineSize) = HQX_MIX_3(w[4],w[7],w[3],2U,7U,7U);
+#define MIX_20_7_3_1_1		*(output + lineSize + lineSize) = HQX_MIX_2(w[7],w[3],1U,1U);
+/* output pixel (row 2, col 1) */
+#define MIX_21_4			*(output + lineSize + lineSize + 1) = w[4];
+#define MIX_21_4_7_3_1		*(output + lineSize + lineSize + 1) = HQX_MIX_2(w[4],w[7],3U,1U);
+#define MIX_21_4_7_7_1		*(output + lineSize + lineSize + 1) = HQX_MIX_2(w[4],w[7],7U,1U);
+#define MIX_21_7_4_3_1		*(output + lineSize + lineSize + 1) = HQX_MIX_2(w[7],w[4],3U,1U);
+/* output pixel (row 2, col 2) */
+#define MIX_22_4			*(output + lineSize + lineSize + 2) = w[4];
+#define MIX_22_4_8_3_1		*(output + lineSize + lineSize + 2) = HQX_MIX_2(w[4],w[8],3U,1U);
+#define MIX_22_4_7_3_1		*(output + lineSize + lineSize + 2) = HQX_MIX_2(w[4],w[7],3U,1U);
+#define MIX_22_4_5_3_1		*(output + lineSize + lineSize + 2) = HQX_MIX_2(w[4],w[5],3U,1U);
+#define MIX_22_4_5_7_2_1_1	*(output + lineSize + lineSize + 2) = HQX_MIX_3(w[4],w[5],w[7],2U,1U,1U);
+#define MIX_22_4_5_7_2_7_7	*(output + lineSize + lineSize + 2) = HQX_MIX_3(w[4],w[5],w[7],2U,7U,7U);
+#define MIX_22_5_7_1_1		*(output + lineSize + lineSize + 2) = HQX_MIX_2(w[5],w[7],1U,1U);
+
+
+
+uint32_t *hq2x_resize(
+	const uint32_t *image,
+	uint32_t width,
+	uint32_t height,
+	uint32_t *output,
+	uint32_t trY,
+	uint32_t trU,
+	uint32_t trV,
+	uint32_t trA,
+	bool wrapX,
+	bool wrapY )
+{
+	int lineSize = width * 2;
+
+	int previous, next;
+	uint32_t w[9];
+
+	trY <<= 16;
+	trU <<= 8;
+	trA <<= 24;
+
+	// iterates between the lines
+	for (uint32_t row = 0; row < height; row++)
+	{
+		/*
+		 * Note: this function uses a 3x3 sliding window over the original image.
+		 *
+		 *   +----+----+----+
+		 *   |    |    |    |
+		 *   | w0 | w1 | w2 |
+		 *   +----+----+----+
+		 *   |    |    |    |
+		 *   | w3 | w4 | w5 |
+		 *   +----+----+----+
+		 *   |    |    |    |
+		 *   | w6 | w7 | w8 |
+		 *   +----+----+----+
+		 */
+
+		// adjusts the previous and next line pointers
+		if (row > 0)
+			previous = -width;
+		else
+		{
+			if (wrapY)
+				previous = width * (height - 1);
+			else
+				previous = 0;
+		}
+		if (row < height - 1)
+			next = width;
+		else
+		{
+			if (wrapY)
+				next = -(width * (height - 1));
+			else
+				next = 0;
+		}
+
+		// iterates between the columns
+		for (uint32_t col = 0; col < width; col++)
+		{
+			w[1] = *(image + previous);
+			w[4] = *image;
+			w[7] = *(image + next);
+
+			if (col > 0)
+			{
+				w[0] = *(image + previous - 1);
+				w[3] = *(image - 1);
+				w[6] = *(image + next - 1);
+			}
+			else
+			{
+				if (wrapX)
+				{
+					w[0] = *(image + previous + width - 1);
+					w[3] = *(image + width - 1);
+					w[6] = *(image + next + width - 1);
+				}
+				else
+				{
+					w[0] = w[1];
+					w[3] = w[4];
+					w[6] = w[7];
+				}
+			}
+
+			if (col < width - 1)
+			{
+				w[2] = *(image + previous + 1);
+				w[5] = *(image + 1);
+				w[8] = *(image + next + 1);
+			}
+			else
+			{
+				if (wrapX)
+				{
+					w[2] = *(image + previous - width + 1);
+					w[5] = *(image - width + 1);
+					w[8] = *(image + next - width + 1);
+				}
+				else
+				{
+					w[2] = w[1];
+					w[5] = w[4];
+					w[8] = w[7];
+				}
+			}
+
+			int pattern = 0;
+
+			// computes the pattern to be used considering the neighbor pixels
+			for (int k = 0, flag = 1; k < 9; k++)
+			{
+				// ignores the central pixel
+				if (k == 4) continue;
+
+				if (w[k] != w[4])
+					if (isDifferent(w[4], w[k], trY, trU, trV, trA)) pattern |= flag;
+				flag <<= 1;
+			}
+
+			switch (pattern)
+			{
+				case 0:
+				case 1:
+				case 4:
+				case 32:
+				case 128:
+				case 5:
+				case 132:
+				case 160:
+				case 33:
+				case 129:
+				case 36:
+				case 133:
+				case 164:
+				case 161:
+				case 37:
+				case 165:
+					MIX_00_4_3_1_2_1_1
+					MIX_01_4_1_5_2_1_1
+					MIX_10_4_7_3_2_1_1
+					MIX_11_4_5_7_2_1_1
+					break;
+				case 2:
+				case 34:
+				case 130:
+				case 162:
+					MIX_00_4_0_3_2_1_1
+					MIX_01_4_2_5_2_1_1
+					MIX_10_4_7_3_2_1_1
+					MIX_11_4_5_7_2_1_1
+					break;
+				case 16:
+				case 17:
+				case 48:
+				case 49:
+					MIX_00_4_3_1_2_1_1
+					MIX_01_4_2_1_2_1_1
+					MIX_10_4_7_3_2_1_1
+					MIX_11_4_8_7_2_1_1
+					break;
+				case 64:
+				case 65:
+				case 68:
+				case 69:
+					MIX_00_4_3_1_2_1_1
+					MIX_01_4_1_5_2_1_1
+					MIX_10_4_6_3_2_1_1
+					MIX_11_4_8_5_2_1_1
+					break;
+				case 8:
+				case 12:
+				case 136:
+				case 140:
+					MIX_00_4_0_1_2_1_1
+					MIX_01_4_1_5_2_1_1
+					MIX_10_4_6_7_2_1_1
+					MIX_11_4_5_7_2_1_1
+					break;
+				case 3:
+				case 35:
+				case 131:
+				case 163:
+					MIX_00_4_3_3_1
+					MIX_01_4_2_5_2_1_1
+					MIX_10_4_7_3_2_1_1
+					MIX_11_4_5_7_2_1_1
+					break;
+				case 6:
+				case 38:
+				case 134:
+				case 166:
+					MIX_00_4_0_3_2_1_1
+					MIX_01_4_5_3_1
+					MIX_10_4_7_3_2_1_1
+					MIX_11_4_5_7_2_1_1
+					break;
+				case 20:
+				case 21:
+				case 52:
+				case 53:
+					MIX_00_4_3_1_2_1_1
+					MIX_01_4_1_3_1
+					MIX_10_4_7_3_2_1_1
+					MIX_11_4_8_7_2_1_1
+					break;
+				case 144:
+				case 145:
+				case 176:
+				case 177:
+					MIX_00_4_3_1_2_1_1
+					MIX_01_4_2_1_2_1_1
+					MIX_10_4_7_3_2_1_1
+					MIX_11_4_7_3_1
+					break;
+				case 192:
+				case 193:
+				case 196:
+				case 197:
+					MIX_00_4_3_1_2_1_1
+					MIX_01_4_1_5_2_1_1
+					MIX_10_4_6_3_2_1_1
+					MIX_11_4_5_3_1
+					break;
+				case 96:
+				case 97:
+				case 100:
+				case 101:
+					MIX_00_4_3_1_2_1_1
+					MIX_01_4_1_5_2_1_1
+					MIX_10_4_3_3_1
+					MIX_11_4_8_5_2_1_1
+					break;
+				case 40:
+				case 44:
+				case 168:
+				case 172:
+					MIX_00_4_0_1_2_1_1
+					MIX_01_4_1_5_2_1_1
+					MIX_10_4_7_3_1
+					MIX_11_4_5_7_2_1_1
+					break;
+				case 9:
+				case 13:
+				case 137:
+				case 141:
+					MIX_00_4_1_3_1
+					MIX_01_4_1_5_2_1_1
+					MIX_10_4_6_7_2_1_1
+					MIX_11_4_5_7_2_1_1
+					break;
+				case 18:
+				case 50:
+					MIX_00_4_0_3_2_1_1
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4_2_3_1
+					}
+					else
+					{
+						MIX_01_4_1_5_2_1_1
+					}
+					MIX_10_4_7_3_2_1_1
+					MIX_11_4_8_7_2_1_1
+					break;
+				case 80:
+				case 81:
+					MIX_00_4_3_1_2_1_1
+					MIX_01_4_2_1_2_1_1
+					MIX_10_4_6_3_2_1_1
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4_8_3_1
+					}
+					else
+					{
+						MIX_11_4_5_7_2_1_1
+					}
+					break;
+				case 72:
+				case 76:
+					MIX_00_4_0_1_2_1_1
+					MIX_01_4_1_5_2_1_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4_6_3_1
+					}
+					else
+					{
+						MIX_10_4_7_3_2_1_1
+					}
+					MIX_11_4_8_5_2_1_1
+					break;
+				case 10:
+				case 138:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_MIX_00_4_0_3_1
+					}
+					else
+					{
+						MIX_00_4_3_1_2_1_1
+					}
+					MIX_01_4_2_5_2_1_1
+					MIX_10_4_6_7_2_1_1
+					MIX_11_4_5_7_2_1_1
+					break;
+				case 66:
+					MIX_00_4_0_3_2_1_1
+					MIX_01_4_2_5_2_1_1
+					MIX_10_4_6_3_2_1_1
+					MIX_11_4_8_5_2_1_1
+					break;
+				case 24:
+					MIX_00_4_0_1_2_1_1
+					MIX_01_4_2_1_2_1_1
+					MIX_10_4_6_7_2_1_1
+					MIX_11_4_8_7_2_1_1
+					break;
+				case 7:
+				case 39:
+				case 135:
+					MIX_00_4_3_3_1
+					MIX_01_4_5_3_1
+					MIX_10_4_7_3_2_1_1
+					MIX_11_4_5_7_2_1_1
+					break;
+				case 148:
+				case 149:
+				case 180:
+					MIX_00_4_3_1_2_1_1
+					MIX_01_4_1_3_1
+					MIX_10_4_7_3_2_1_1
+					MIX_11_4_7_3_1
+					break;
+				case 224:
+				case 228:
+				case 225:
+					MIX_00_4_3_1_2_1_1
+					MIX_01_4_1_5_2_1_1
+					MIX_10_4_3_3_1
+					MIX_11_4_5_3_1
+					break;
+				case 41:
+				case 169:
+				case 45:
+					MIX_00_4_1_3_1
+					MIX_01_4_1_5_2_1_1
+					MIX_10_4_7_3_1
+					MIX_11_4_5_7_2_1_1
+					break;
+				case 22:
+				case 54:
+					MIX_00_4_0_3_2_1_1
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4
+					}
+					else
+					{
+						MIX_01_4_1_5_2_1_1
+					}
+					MIX_10_4_7_3_2_1_1
+					MIX_11_4_8_7_2_1_1
+					break;
+				case 208:
+				case 209:
+					MIX_00_4_3_1_2_1_1
+					MIX_01_4_2_1_2_1_1
+					MIX_10_4_6_3_2_1_1
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4
+					}
+					else
+					{
+						MIX_11_4_5_7_2_1_1
+					}
+					break;
+				case 104:
+				case 108:
+					MIX_00_4_0_1_2_1_1
+					MIX_01_4_1_5_2_1_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4
+					}
+					else
+					{
+						MIX_10_4_7_3_2_1_1
+					}
+					MIX_11_4_8_5_2_1_1
+					break;
+				case 11:
+				case 139:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_4
+					}
+					else
+					{
+						MIX_00_4_3_1_2_1_1
+					}
+					MIX_01_4_2_5_2_1_1
+					MIX_10_4_6_7_2_1_1
+					MIX_11_4_5_7_2_1_1
+					break;
+				case 19:
+				case 51:
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+					MIX_00_4_3_3_1
+					MIX_01_4_2_3_1
+					}
+					else
+					{
+					MIX_00_4_1_3_5_2_1
+					MIX_01_4_1_5_2_3_3
+					}
+					MIX_10_4_7_3_2_1_1
+					MIX_11_4_8_7_2_1_1
+					break;
+				case 146:
+				case 178:
+					MIX_00_4_0_3_2_1_1
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+					MIX_01_4_2_3_1
+					MIX_11_4_7_3_1
+					}
+					else
+					{
+					MIX_01_4_1_5_2_3_3
+					MIX_11_4_5_7_5_2_1
+					}
+					MIX_10_4_7_3_2_1_1
+					break;
+				case 84:
+				case 85:
+					MIX_00_4_3_1_2_1_1
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+					MIX_01_4_1_3_1
+					MIX_11_4_8_3_1
+					}
+					else
+					{
+					MIX_01_4_5_1_5_2_1
+					MIX_11_4_5_7_2_3_3
+					}
+					MIX_10_4_6_3_2_1_1
+					break;
+				case 112:
+				case 113:
+					MIX_00_4_3_1_2_1_1
+					MIX_01_4_2_1_2_1_1
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+					MIX_10_4_3_3_1
+					MIX_11_4_8_3_1
+					}
+					else
+					{
+					MIX_10_4_7_3_5_2_1
+					MIX_11_4_5_7_2_3_3
+					}
+					break;
+				case 200:
+				case 204:
+					MIX_00_4_0_1_2_1_1
+					MIX_01_4_1_5_2_1_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+					MIX_10_4_6_3_1
+					MIX_11_4_5_3_1
+					}
+					else
+					{
+					MIX_10_4_7_3_2_3_3
+					MIX_11_4_7_5_5_2_1
+					}
+					break;
+				case 73:
+				case 77:
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+					MIX_00_4_1_3_1
+					MIX_10_4_6_3_1
+					}
+					else
+					{
+					MIX_00_4_3_1_5_2_1
+					MIX_10_4_7_3_2_3_3
+					}
+					MIX_01_4_1_5_2_1_1
+					MIX_11_4_8_5_2_1_1
+					break;
+				case 42:
+				case 170:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+					MIX_00_MIX_00_4_0_3_1
+					MIX_10_4_7_3_1
+					}
+					else
+					{
+					MIX_00_4_3_1_2_3_3
+					MIX_10_4_3_7_5_2_1
+					}
+					MIX_01_4_2_5_2_1_1
+					MIX_11_4_5_7_2_1_1
+					break;
+				case 14:
+				case 142:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+					MIX_00_MIX_00_4_0_3_1
+					MIX_01_4_5_3_1
+					}
+					else
+					{
+					MIX_00_4_3_1_2_3_3
+					MIX_01_4_1_5_5_2_1
+					}
+					MIX_10_4_6_7_2_1_1
+					MIX_11_4_5_7_2_1_1
+					break;
+				case 67:
+					MIX_00_4_3_3_1
+					MIX_01_4_2_5_2_1_1
+					MIX_10_4_6_3_2_1_1
+					MIX_11_4_8_5_2_1_1
+					break;
+				case 70:
+					MIX_00_4_0_3_2_1_1
+					MIX_01_4_5_3_1
+					MIX_10_4_6_3_2_1_1
+					MIX_11_4_8_5_2_1_1
+					break;
+				case 28:
+					MIX_00_4_0_1_2_1_1
+					MIX_01_4_1_3_1
+					MIX_10_4_6_7_2_1_1
+					MIX_11_4_8_7_2_1_1
+					break;
+				case 152:
+					MIX_00_4_0_1_2_1_1
+					MIX_01_4_2_1_2_1_1
+					MIX_10_4_6_7_2_1_1
+					MIX_11_4_7_3_1
+					break;
+				case 194:
+					MIX_00_4_0_3_2_1_1
+					MIX_01_4_2_5_2_1_1
+					MIX_10_4_6_3_2_1_1
+					MIX_11_4_5_3_1
+					break;
+				case 98:
+					MIX_00_4_0_3_2_1_1
+					MIX_01_4_2_5_2_1_1
+					MIX_10_4_3_3_1
+					MIX_11_4_8_5_2_1_1
+					break;
+				case 56:
+					MIX_00_4_0_1_2_1_1
+					MIX_01_4_2_1_2_1_1
+					MIX_10_4_7_3_1
+					MIX_11_4_8_7_2_1_1
+					break;
+				case 25:
+					MIX_00_4_1_3_1
+					MIX_01_4_2_1_2_1_1
+					MIX_10_4_6_7_2_1_1
+					MIX_11_4_8_7_2_1_1
+					break;
+				case 26:
+				case 31:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_4
+					}
+					else
+					{
+						MIX_00_4_3_1_2_1_1
+					}
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4
+					}
+					else
+					{
+						MIX_01_4_1_5_2_1_1
+					}
+					MIX_10_4_6_7_2_1_1
+					MIX_11_4_8_7_2_1_1
+					break;
+				case 82:
+				case 214:
+					MIX_00_4_0_3_2_1_1
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4
+					}
+					else
+					{
+						MIX_01_4_1_5_2_1_1
+					}
+					MIX_10_4_6_3_2_1_1
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4
+					}
+					else
+					{
+						MIX_11_4_5_7_2_1_1
+					}
+					break;
+				case 88:
+				case 248:
+					MIX_00_4_0_1_2_1_1
+					MIX_01_4_2_1_2_1_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4
+					}
+					else
+					{
+						MIX_10_4_7_3_2_1_1
+					}
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4
+					}
+					else
+					{
+						MIX_11_4_5_7_2_1_1
+					}
+					break;
+				case 74:
+				case 107:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_4
+					}
+					else
+					{
+						MIX_00_4_3_1_2_1_1
+					}
+					MIX_01_4_2_5_2_1_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4
+					}
+					else
+					{
+						MIX_10_4_7_3_2_1_1
+					}
+					MIX_11_4_8_5_2_1_1
+					break;
+				case 27:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_4
+					}
+					else
+					{
+						MIX_00_4_3_1_2_1_1
+					}
+					MIX_01_4_2_3_1
+					MIX_10_4_6_7_2_1_1
+					MIX_11_4_8_7_2_1_1
+					break;
+				case 86:
+					MIX_00_4_0_3_2_1_1
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4
+					}
+					else
+					{
+						MIX_01_4_1_5_2_1_1
+					}
+					MIX_10_4_6_3_2_1_1
+					MIX_11_4_8_3_1
+					break;
+				case 216:
+					MIX_00_4_0_1_2_1_1
+					MIX_01_4_2_1_2_1_1
+					MIX_10_4_6_3_1
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4
+					}
+					else
+					{
+						MIX_11_4_5_7_2_1_1
+					}
+					break;
+				case 106:
+					MIX_00_MIX_00_4_0_3_1
+					MIX_01_4_2_5_2_1_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4
+					}
+					else
+					{
+						MIX_10_4_7_3_2_1_1
+					}
+					MIX_11_4_8_5_2_1_1
+					break;
+				case 30:
+					MIX_00_MIX_00_4_0_3_1
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4
+					}
+					else
+					{
+						MIX_01_4_1_5_2_1_1
+					}
+					MIX_10_4_6_7_2_1_1
+					MIX_11_4_8_7_2_1_1
+					break;
+				case 210:
+					MIX_00_4_0_3_2_1_1
+					MIX_01_4_2_3_1
+					MIX_10_4_6_3_2_1_1
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4
+					}
+					else
+					{
+						MIX_11_4_5_7_2_1_1
+					}
+					break;
+				case 120:
+					MIX_00_4_0_1_2_1_1
+					MIX_01_4_2_1_2_1_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4
+					}
+					else
+					{
+						MIX_10_4_7_3_2_1_1
+					}
+					MIX_11_4_8_3_1
+					break;
+				case 75:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_4
+					}
+					else
+					{
+						MIX_00_4_3_1_2_1_1
+					}
+					MIX_01_4_2_5_2_1_1
+					MIX_10_4_6_3_1
+					MIX_11_4_8_5_2_1_1
+					break;
+				case 29:
+					MIX_00_4_1_3_1
+					MIX_01_4_1_3_1
+					MIX_10_4_6_7_2_1_1
+					MIX_11_4_8_7_2_1_1
+					break;
+				case 198:
+					MIX_00_4_0_3_2_1_1
+					MIX_01_4_5_3_1
+					MIX_10_4_6_3_2_1_1
+					MIX_11_4_5_3_1
+					break;
+				case 184:
+					MIX_00_4_0_1_2_1_1
+					MIX_01_4_2_1_2_1_1
+					MIX_10_4_7_3_1
+					MIX_11_4_7_3_1
+					break;
+				case 99:
+					MIX_00_4_3_3_1
+					MIX_01_4_2_5_2_1_1
+					MIX_10_4_3_3_1
+					MIX_11_4_8_5_2_1_1
+					break;
+				case 57:
+					MIX_00_4_1_3_1
+					MIX_01_4_2_1_2_1_1
+					MIX_10_4_7_3_1
+					MIX_11_4_8_7_2_1_1
+					break;
+				case 71:
+					MIX_00_4_3_3_1
+					MIX_01_4_5_3_1
+					MIX_10_4_6_3_2_1_1
+					MIX_11_4_8_5_2_1_1
+					break;
+				case 156:
+					MIX_00_4_0_1_2_1_1
+					MIX_01_4_1_3_1
+					MIX_10_4_6_7_2_1_1
+					MIX_11_4_7_3_1
+					break;
+				case 226:
+					MIX_00_4_0_3_2_1_1
+					MIX_01_4_2_5_2_1_1
+					MIX_10_4_3_3_1
+					MIX_11_4_5_3_1
+					break;
+				case 60:
+					MIX_00_4_0_1_2_1_1
+					MIX_01_4_1_3_1
+					MIX_10_4_7_3_1
+					MIX_11_4_8_7_2_1_1
+					break;
+				case 195:
+					MIX_00_4_3_3_1
+					MIX_01_4_2_5_2_1_1
+					MIX_10_4_6_3_2_1_1
+					MIX_11_4_5_3_1
+					break;
+				case 102:
+					MIX_00_4_0_3_2_1_1
+					MIX_01_4_5_3_1
+					MIX_10_4_3_3_1
+					MIX_11_4_8_5_2_1_1
+					break;
+				case 153:
+					MIX_00_4_1_3_1
+					MIX_01_4_2_1_2_1_1
+					MIX_10_4_6_7_2_1_1
+					MIX_11_4_7_3_1
+					break;
+				case 58:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_MIX_00_4_0_3_1
+					}
+					else
+					{
+						MIX_00_4_3_1_6_1_1
+					}
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4_2_3_1
+					}
+					else
+					{
+						MIX_01_4_1_5_6_1_1
+					}
+					MIX_10_4_7_3_1
+					MIX_11_4_8_7_2_1_1
+					break;
+				case 83:
+					MIX_00_4_3_3_1
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4_2_3_1
+					}
+					else
+					{
+						MIX_01_4_1_5_6_1_1
+					}
+					MIX_10_4_6_3_2_1_1
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4_8_3_1
+					}
+					else
+					{
+						MIX_11_4_5_7_6_1_1
+					}
+					break;
+				case 92:
+					MIX_00_4_0_1_2_1_1
+					MIX_01_4_1_3_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4_6_3_1
+					}
+					else
+					{
+						MIX_10_4_7_3_6_1_1
+					}
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4_8_3_1
+					}
+					else
+					{
+						MIX_11_4_5_7_6_1_1
+					}
+					break;
+				case 202:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_MIX_00_4_0_3_1
+					}
+					else
+					{
+						MIX_00_4_3_1_6_1_1
+					}
+					MIX_01_4_2_5_2_1_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4_6_3_1
+					}
+					else
+					{
+						MIX_10_4_7_3_6_1_1
+					}
+					MIX_11_4_5_3_1
+					break;
+				case 78:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_MIX_00_4_0_3_1
+					}
+					else
+					{
+						MIX_00_4_3_1_6_1_1
+					}
+					MIX_01_4_5_3_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4_6_3_1
+					}
+					else
+					{
+						MIX_10_4_7_3_6_1_1
+					}
+					MIX_11_4_8_5_2_1_1
+					break;
+				case 154:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_MIX_00_4_0_3_1
+					}
+					else
+					{
+						MIX_00_4_3_1_6_1_1
+					}
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4_2_3_1
+					}
+					else
+					{
+						MIX_01_4_1_5_6_1_1
+					}
+					MIX_10_4_6_7_2_1_1
+					MIX_11_4_7_3_1
+					break;
+				case 114:
+					MIX_00_4_0_3_2_1_1
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4_2_3_1
+					}
+					else
+					{
+						MIX_01_4_1_5_6_1_1
+					}
+					MIX_10_4_3_3_1
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4_8_3_1
+					}
+					else
+					{
+						MIX_11_4_5_7_6_1_1
+					}
+					break;
+				case 89:
+					MIX_00_4_1_3_1
+					MIX_01_4_2_1_2_1_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4_6_3_1
+					}
+					else
+					{
+						MIX_10_4_7_3_6_1_1
+					}
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4_8_3_1
+					}
+					else
+					{
+						MIX_11_4_5_7_6_1_1
+					}
+					break;
+				case 90:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_MIX_00_4_0_3_1
+					}
+					else
+					{
+						MIX_00_4_3_1_6_1_1
+					}
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4_2_3_1
+					}
+					else
+					{
+						MIX_01_4_1_5_6_1_1
+					}
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4_6_3_1
+					}
+					else
+					{
+						MIX_10_4_7_3_6_1_1
+					}
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4_8_3_1
+					}
+					else
+					{
+						MIX_11_4_5_7_6_1_1
+					}
+					break;
+				case 55:
+				case 23:
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+					MIX_00_4_3_3_1
+					MIX_01_4
+					}
+					else
+					{
+					MIX_00_4_1_3_5_2_1
+					MIX_01_4_1_5_2_3_3
+					}
+					MIX_10_4_7_3_2_1_1
+					MIX_11_4_8_7_2_1_1
+					break;
+				case 182:
+				case 150:
+					MIX_00_4_0_3_2_1_1
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+					MIX_01_4
+					MIX_11_4_7_3_1
+					}
+					else
+					{
+					MIX_01_4_1_5_2_3_3
+					MIX_11_4_5_7_5_2_1
+					}
+					MIX_10_4_7_3_2_1_1
+					break;
+				case 213:
+				case 212:
+					MIX_00_4_3_1_2_1_1
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+					MIX_01_4_1_3_1
+					MIX_11_4
+					}
+					else
+					{
+					MIX_01_4_5_1_5_2_1
+					MIX_11_4_5_7_2_3_3
+					}
+					MIX_10_4_6_3_2_1_1
+					break;
+				case 241:
+				case 240:
+					MIX_00_4_3_1_2_1_1
+					MIX_01_4_2_1_2_1_1
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+					MIX_10_4_3_3_1
+					MIX_11_4
+					}
+					else
+					{
+					MIX_10_4_7_3_5_2_1
+					MIX_11_4_5_7_2_3_3
+					}
+					break;
+				case 236:
+				case 232:
+					MIX_00_4_0_1_2_1_1
+					MIX_01_4_1_5_2_1_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+					MIX_10_4
+					MIX_11_4_5_3_1
+					}
+					else
+					{
+					MIX_10_4_7_3_2_3_3
+					MIX_11_4_7_5_5_2_1
+					}
+					break;
+				case 109:
+				case 105:
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+					MIX_00_4_1_3_1
+					MIX_10_4
+					}
+					else
+					{
+					MIX_00_4_3_1_5_2_1
+					MIX_10_4_7_3_2_3_3
+					}
+					MIX_01_4_1_5_2_1_1
+					MIX_11_4_8_5_2_1_1
+					break;
+				case 171:
+				case 43:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+					MIX_00_4
+					MIX_10_4_7_3_1
+					}
+					else
+					{
+					MIX_00_4_3_1_2_3_3
+					MIX_10_4_3_7_5_2_1
+					}
+					MIX_01_4_2_5_2_1_1
+					MIX_11_4_5_7_2_1_1
+					break;
+				case 143:
+				case 15:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+					MIX_00_4
+					MIX_01_4_5_3_1
+					}
+					else
+					{
+					MIX_00_4_3_1_2_3_3
+					MIX_01_4_1_5_5_2_1
+					}
+					MIX_10_4_6_7_2_1_1
+					MIX_11_4_5_7_2_1_1
+					break;
+				case 124:
+					MIX_00_4_0_1_2_1_1
+					MIX_01_4_1_3_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4
+					}
+					else
+					{
+						MIX_10_4_7_3_2_1_1
+					}
+					MIX_11_4_8_3_1
+					break;
+				case 203:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_4
+					}
+					else
+					{
+						MIX_00_4_3_1_2_1_1
+					}
+					MIX_01_4_2_5_2_1_1
+					MIX_10_4_6_3_1
+					MIX_11_4_5_3_1
+					break;
+				case 62:
+					MIX_00_MIX_00_4_0_3_1
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4
+					}
+					else
+					{
+						MIX_01_4_1_5_2_1_1
+					}
+					MIX_10_4_7_3_1
+					MIX_11_4_8_7_2_1_1
+					break;
+				case 211:
+					MIX_00_4_3_3_1
+					MIX_01_4_2_3_1
+					MIX_10_4_6_3_2_1_1
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4
+					}
+					else
+					{
+						MIX_11_4_5_7_2_1_1
+					}
+					break;
+				case 118:
+					MIX_00_4_0_3_2_1_1
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4
+					}
+					else
+					{
+						MIX_01_4_1_5_2_1_1
+					}
+					MIX_10_4_3_3_1
+					MIX_11_4_8_3_1
+					break;
+				case 217:
+					MIX_00_4_1_3_1
+					MIX_01_4_2_1_2_1_1
+					MIX_10_4_6_3_1
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4
+					}
+					else
+					{
+						MIX_11_4_5_7_2_1_1
+					}
+					break;
+				case 110:
+					MIX_00_MIX_00_4_0_3_1
+					MIX_01_4_5_3_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4
+					}
+					else
+					{
+						MIX_10_4_7_3_2_1_1
+					}
+					MIX_11_4_8_5_2_1_1
+					break;
+				case 155:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_4
+					}
+					else
+					{
+						MIX_00_4_3_1_2_1_1
+					}
+					MIX_01_4_2_3_1
+					MIX_10_4_6_7_2_1_1
+					MIX_11_4_7_3_1
+					break;
+				case 188:
+					MIX_00_4_0_1_2_1_1
+					MIX_01_4_1_3_1
+					MIX_10_4_7_3_1
+					MIX_11_4_7_3_1
+					break;
+				case 185:
+					MIX_00_4_1_3_1
+					MIX_01_4_2_1_2_1_1
+					MIX_10_4_7_3_1
+					MIX_11_4_7_3_1
+					break;
+				case 61:
+					MIX_00_4_1_3_1
+					MIX_01_4_1_3_1
+					MIX_10_4_7_3_1
+					MIX_11_4_8_7_2_1_1
+					break;
+				case 157:
+					MIX_00_4_1_3_1
+					MIX_01_4_1_3_1
+					MIX_10_4_6_7_2_1_1
+					MIX_11_4_7_3_1
+					break;
+				case 103:
+					MIX_00_4_3_3_1
+					MIX_01_4_5_3_1
+					MIX_10_4_3_3_1
+					MIX_11_4_8_5_2_1_1
+					break;
+				case 227:
+					MIX_00_4_3_3_1
+					MIX_01_4_2_5_2_1_1
+					MIX_10_4_3_3_1
+					MIX_11_4_5_3_1
+					break;
+				case 230:
+					MIX_00_4_0_3_2_1_1
+					MIX_01_4_5_3_1
+					MIX_10_4_3_3_1
+					MIX_11_4_5_3_1
+					break;
+				case 199:
+					MIX_00_4_3_3_1
+					MIX_01_4_5_3_1
+					MIX_10_4_6_3_2_1_1
+					MIX_11_4_5_3_1
+					break;
+				case 220:
+					MIX_00_4_0_1_2_1_1
+					MIX_01_4_1_3_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4_6_3_1
+					}
+					else
+					{
+						MIX_10_4_7_3_6_1_1
+					}
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4
+					}
+					else
+					{
+						MIX_11_4_5_7_2_1_1
+					}
+					break;
+				case 158:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_MIX_00_4_0_3_1
+					}
+					else
+					{
+						MIX_00_4_3_1_6_1_1
+					}
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4
+					}
+					else
+					{
+						MIX_01_4_1_5_2_1_1
+					}
+					MIX_10_4_6_7_2_1_1
+					MIX_11_4_7_3_1
+					break;
+				case 234:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_MIX_00_4_0_3_1
+					}
+					else
+					{
+						MIX_00_4_3_1_6_1_1
+					}
+					MIX_01_4_2_5_2_1_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4
+					}
+					else
+					{
+						MIX_10_4_7_3_2_1_1
+					}
+					MIX_11_4_5_3_1
+					break;
+				case 242:
+					MIX_00_4_0_3_2_1_1
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4_2_3_1
+					}
+					else
+					{
+						MIX_01_4_1_5_6_1_1
+					}
+					MIX_10_4_3_3_1
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4
+					}
+					else
+					{
+						MIX_11_4_5_7_2_1_1
+					}
+					break;
+				case 59:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_4
+					}
+					else
+					{
+						MIX_00_4_3_1_2_1_1
+					}
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4_2_3_1
+					}
+					else
+					{
+						MIX_01_4_1_5_6_1_1
+					}
+					MIX_10_4_7_3_1
+					MIX_11_4_8_7_2_1_1
+					break;
+				case 121:
+					MIX_00_4_1_3_1
+					MIX_01_4_2_1_2_1_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4
+					}
+					else
+					{
+						MIX_10_4_7_3_2_1_1
+					}
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4_8_3_1
+					}
+					else
+					{
+						MIX_11_4_5_7_6_1_1
+					}
+					break;
+				case 87:
+					MIX_00_4_3_3_1
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4
+					}
+					else
+					{
+						MIX_01_4_1_5_2_1_1
+					}
+					MIX_10_4_6_3_2_1_1
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4_8_3_1
+					}
+					else
+					{
+						MIX_11_4_5_7_6_1_1
+					}
+					break;
+				case 79:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_4
+					}
+					else
+					{
+						MIX_00_4_3_1_2_1_1
+					}
+					MIX_01_4_5_3_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4_6_3_1
+					}
+					else
+					{
+						MIX_10_4_7_3_6_1_1
+					}
+					MIX_11_4_8_5_2_1_1
+					break;
+				case 122:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_MIX_00_4_0_3_1
+					}
+					else
+					{
+						MIX_00_4_3_1_6_1_1
+					}
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4_2_3_1
+					}
+					else
+					{
+						MIX_01_4_1_5_6_1_1
+					}
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4
+					}
+					else
+					{
+						MIX_10_4_7_3_2_1_1
+					}
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4_8_3_1
+					}
+					else
+					{
+						MIX_11_4_5_7_6_1_1
+					}
+					break;
+				case 94:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_MIX_00_4_0_3_1
+					}
+					else
+					{
+						MIX_00_4_3_1_6_1_1
+					}
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4
+					}
+					else
+					{
+						MIX_01_4_1_5_2_1_1
+					}
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4_6_3_1
+					}
+					else
+					{
+						MIX_10_4_7_3_6_1_1
+					}
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4_8_3_1
+					}
+					else
+					{
+						MIX_11_4_5_7_6_1_1
+					}
+					break;
+				case 218:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_MIX_00_4_0_3_1
+					}
+					else
+					{
+						MIX_00_4_3_1_6_1_1
+					}
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4_2_3_1
+					}
+					else
+					{
+						MIX_01_4_1_5_6_1_1
+					}
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4_6_3_1
+					}
+					else
+					{
+						MIX_10_4_7_3_6_1_1
+					}
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4
+					}
+					else
+					{
+						MIX_11_4_5_7_2_1_1
+					}
+					break;
+				case 91:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_4
+					}
+					else
+					{
+						MIX_00_4_3_1_2_1_1
+					}
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4_2_3_1
+					}
+					else
+					{
+						MIX_01_4_1_5_6_1_1
+					}
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4_6_3_1
+					}
+					else
+					{
+						MIX_10_4_7_3_6_1_1
+					}
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4_8_3_1
+					}
+					else
+					{
+						MIX_11_4_5_7_6_1_1
+					}
+					break;
+				case 229:
+					MIX_00_4_3_1_2_1_1
+					MIX_01_4_1_5_2_1_1
+					MIX_10_4_3_3_1
+					MIX_11_4_5_3_1
+					break;
+				case 167:
+					MIX_00_4_3_3_1
+					MIX_01_4_5_3_1
+					MIX_10_4_7_3_2_1_1
+					MIX_11_4_5_7_2_1_1
+					break;
+				case 173:
+					MIX_00_4_1_3_1
+					MIX_01_4_1_5_2_1_1
+					MIX_10_4_7_3_1
+					MIX_11_4_5_7_2_1_1
+					break;
+				case 181:
+					MIX_00_4_3_1_2_1_1
+					MIX_01_4_1_3_1
+					MIX_10_4_7_3_2_1_1
+					MIX_11_4_7_3_1
+					break;
+				case 186:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_MIX_00_4_0_3_1
+					}
+					else
+					{
+						MIX_00_4_3_1_6_1_1
+					}
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4_2_3_1
+					}
+					else
+					{
+						MIX_01_4_1_5_6_1_1
+					}
+					MIX_10_4_7_3_1
+					MIX_11_4_7_3_1
+					break;
+				case 115:
+					MIX_00_4_3_3_1
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4_2_3_1
+					}
+					else
+					{
+						MIX_01_4_1_5_6_1_1
+					}
+					MIX_10_4_3_3_1
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4_8_3_1
+					}
+					else
+					{
+						MIX_11_4_5_7_6_1_1
+					}
+					break;
+				case 93:
+					MIX_00_4_1_3_1
+					MIX_01_4_1_3_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4_6_3_1
+					}
+					else
+					{
+						MIX_10_4_7_3_6_1_1
+					}
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4_8_3_1
+					}
+					else
+					{
+						MIX_11_4_5_7_6_1_1
+					}
+					break;
+				case 206:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_MIX_00_4_0_3_1
+					}
+					else
+					{
+						MIX_00_4_3_1_6_1_1
+					}
+					MIX_01_4_5_3_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4_6_3_1
+					}
+					else
+					{
+						MIX_10_4_7_3_6_1_1
+					}
+					MIX_11_4_5_3_1
+					break;
+				case 205:
+				case 201:
+					MIX_00_4_1_3_1
+					MIX_01_4_1_5_2_1_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4_6_3_1
+					}
+					else
+					{
+						MIX_10_4_7_3_6_1_1
+					}
+					MIX_11_4_5_3_1
+					break;
+				case 174:
+				case 46:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_MIX_00_4_0_3_1
+					}
+					else
+					{
+						MIX_00_4_3_1_6_1_1
+					}
+					MIX_01_4_5_3_1
+					MIX_10_4_7_3_1
+					MIX_11_4_5_7_2_1_1
+					break;
+				case 179:
+				case 147:
+					MIX_00_4_3_3_1
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4_2_3_1
+					}
+					else
+					{
+						MIX_01_4_1_5_6_1_1
+					}
+					MIX_10_4_7_3_2_1_1
+					MIX_11_4_7_3_1
+					break;
+				case 117:
+				case 116:
+					MIX_00_4_3_1_2_1_1
+					MIX_01_4_1_3_1
+					MIX_10_4_3_3_1
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4_8_3_1
+					}
+					else
+					{
+						MIX_11_4_5_7_6_1_1
+					}
+					break;
+				case 189:
+					MIX_00_4_1_3_1
+					MIX_01_4_1_3_1
+					MIX_10_4_7_3_1
+					MIX_11_4_7_3_1
+					break;
+				case 231:
+					MIX_00_4_3_3_1
+					MIX_01_4_5_3_1
+					MIX_10_4_3_3_1
+					MIX_11_4_5_3_1
+					break;
+				case 126:
+					MIX_00_MIX_00_4_0_3_1
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4
+					}
+					else
+					{
+						MIX_01_4_1_5_2_1_1
+					}
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4
+					}
+					else
+					{
+						MIX_10_4_7_3_2_1_1
+					}
+					MIX_11_4_8_3_1
+					break;
+				case 219:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_4
+					}
+					else
+					{
+						MIX_00_4_3_1_2_1_1
+					}
+					MIX_01_4_2_3_1
+					MIX_10_4_6_3_1
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4
+					}
+					else
+					{
+						MIX_11_4_5_7_2_1_1
+					}
+					break;
+				case 125:
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+					MIX_00_4_1_3_1
+					MIX_10_4
+					}
+					else
+					{
+					MIX_00_4_3_1_5_2_1
+					MIX_10_4_7_3_2_3_3
+					}
+					MIX_01_4_1_3_1
+					MIX_11_4_8_3_1
+					break;
+				case 221:
+					MIX_00_4_1_3_1
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+					MIX_01_4_1_3_1
+					MIX_11_4
+					}
+					else
+					{
+					MIX_01_4_5_1_5_2_1
+					MIX_11_4_5_7_2_3_3
+					}
+					MIX_10_4_6_3_1
+					break;
+				case 207:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+					MIX_00_4
+					MIX_01_4_5_3_1
+					}
+					else
+					{
+					MIX_00_4_3_1_2_3_3
+					MIX_01_4_1_5_5_2_1
+					}
+					MIX_10_4_6_3_1
+					MIX_11_4_5_3_1
+					break;
+				case 238:
+					MIX_00_MIX_00_4_0_3_1
+					MIX_01_4_5_3_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+					MIX_10_4
+					MIX_11_4_5_3_1
+					}
+					else
+					{
+					MIX_10_4_7_3_2_3_3
+					MIX_11_4_7_5_5_2_1
+					}
+					break;
+				case 190:
+					MIX_00_MIX_00_4_0_3_1
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+					MIX_01_4
+					MIX_11_4_7_3_1
+					}
+					else
+					{
+					MIX_01_4_1_5_2_3_3
+					MIX_11_4_5_7_5_2_1
+					}
+					MIX_10_4_7_3_1
+					break;
+				case 187:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+					MIX_00_4
+					MIX_10_4_7_3_1
+					}
+					else
+					{
+					MIX_00_4_3_1_2_3_3
+					MIX_10_4_3_7_5_2_1
+					}
+					MIX_01_4_2_3_1
+					MIX_11_4_7_3_1
+					break;
+				case 243:
+					MIX_00_4_3_3_1
+					MIX_01_4_2_3_1
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+					MIX_10_4_3_3_1
+					MIX_11_4
+					}
+					else
+					{
+					MIX_10_4_7_3_5_2_1
+					MIX_11_4_5_7_2_3_3
+					}
+					break;
+				case 119:
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+					MIX_00_4_3_3_1
+					MIX_01_4
+					}
+					else
+					{
+					MIX_00_4_1_3_5_2_1
+					MIX_01_4_1_5_2_3_3
+					}
+					MIX_10_4_3_3_1
+					MIX_11_4_8_3_1
+					break;
+				case 237:
+				case 233:
+					MIX_00_4_1_3_1
+					MIX_01_4_1_5_2_1_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4
+					}
+					else
+					{
+						MIX_10_4_6_3_10
+					}
+					MIX_11_4_5_3_1
+					break;
+				case 175:
+				case 47:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_4
+					}
+					else
+					{
+						MIX_00_MIX_00_4_0_3_10
+					}
+					MIX_01_4_5_3_1
+					MIX_10_4_7_3_1
+					MIX_11_4_5_7_2_1_1
+					break;
+				case 183:
+				case 151:
+					MIX_00_4_3_3_1
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4
+					}
+					else
+					{
+						MIX_01_4_2_3_10
+					}
+					MIX_10_4_7_3_2_1_1
+					MIX_11_4_7_3_1
+					break;
+				case 245:
+				case 244:
+					MIX_00_4_3_1_2_1_1
+					MIX_01_4_1_3_1
+					MIX_10_4_3_3_1
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4
+					}
+					else
+					{
+						MIX_11_4_8_3_10
+					}
+					break;
+				case 250:
+					MIX_00_MIX_00_4_0_3_1
+					MIX_01_4_2_3_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4
+					}
+					else
+					{
+						MIX_10_4_7_3_2_1_1
+					}
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4
+					}
+					else
+					{
+						MIX_11_4_5_7_2_1_1
+					}
+					break;
+				case 123:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_4
+					}
+					else
+					{
+						MIX_00_4_3_1_2_1_1
+					}
+					MIX_01_4_2_3_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4
+					}
+					else
+					{
+						MIX_10_4_7_3_2_1_1
+					}
+					MIX_11_4_8_3_1
+					break;
+				case 95:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_4
+					}
+					else
+					{
+						MIX_00_4_3_1_2_1_1
+					}
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4
+					}
+					else
+					{
+						MIX_01_4_1_5_2_1_1
+					}
+					MIX_10_4_6_3_1
+					MIX_11_4_8_3_1
+					break;
+				case 222:
+					MIX_00_MIX_00_4_0_3_1
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4
+					}
+					else
+					{
+						MIX_01_4_1_5_2_1_1
+					}
+					MIX_10_4_6_3_1
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4
+					}
+					else
+					{
+						MIX_11_4_5_7_2_1_1
+					}
+					break;
+				case 252:
+					MIX_00_4_0_1_2_1_1
+					MIX_01_4_1_3_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4
+					}
+					else
+					{
+						MIX_10_4_7_3_2_1_1
+					}
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4
+					}
+					else
+					{
+						MIX_11_4_8_3_10
+					}
+					break;
+				case 249:
+					MIX_00_4_1_3_1
+					MIX_01_4_2_1_2_1_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4
+					}
+					else
+					{
+						MIX_10_4_6_3_10
+					}
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4
+					}
+					else
+					{
+						MIX_11_4_5_7_2_1_1
+					}
+					break;
+				case 235:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_4
+					}
+					else
+					{
+						MIX_00_4_3_1_2_1_1
+					}
+					MIX_01_4_2_5_2_1_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4
+					}
+					else
+					{
+						MIX_10_4_6_3_10
+					}
+					MIX_11_4_5_3_1
+					break;
+				case 111:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_4
+					}
+					else
+					{
+						MIX_00_MIX_00_4_0_3_10
+					}
+					MIX_01_4_5_3_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4
+					}
+					else
+					{
+						MIX_10_4_7_3_2_1_1
+					}
+					MIX_11_4_8_5_2_1_1
+					break;
+				case 63:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_4
+					}
+					else
+					{
+						MIX_00_MIX_00_4_0_3_10
+					}
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4
+					}
+					else
+					{
+						MIX_01_4_1_5_2_1_1
+					}
+					MIX_10_4_7_3_1
+					MIX_11_4_8_7_2_1_1
+					break;
+				case 159:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_4
+					}
+					else
+					{
+						MIX_00_4_3_1_2_1_1
+					}
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4
+					}
+					else
+					{
+						MIX_01_4_2_3_10
+					}
+					MIX_10_4_6_7_2_1_1
+					MIX_11_4_7_3_1
+					break;
+				case 215:
+					MIX_00_4_3_3_1
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4
+					}
+					else
+					{
+						MIX_01_4_2_3_10
+					}
+					MIX_10_4_6_3_2_1_1
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4
+					}
+					else
+					{
+						MIX_11_4_5_7_2_1_1
+					}
+					break;
+				case 246:
+					MIX_00_4_0_3_2_1_1
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4
+					}
+					else
+					{
+						MIX_01_4_1_5_2_1_1
+					}
+					MIX_10_4_3_3_1
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4
+					}
+					else
+					{
+						MIX_11_4_8_3_10
+					}
+					break;
+				case 254:
+					MIX_00_MIX_00_4_0_3_1
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4
+					}
+					else
+					{
+						MIX_01_4_1_5_2_1_1
+					}
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4
+					}
+					else
+					{
+						MIX_10_4_7_3_2_1_1
+					}
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4
+					}
+					else
+					{
+						MIX_11_4_8_3_10
+					}
+					break;
+				case 253:
+					MIX_00_4_1_3_1
+					MIX_01_4_1_3_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4
+					}
+					else
+					{
+						MIX_10_4_6_3_10
+					}
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4
+					}
+					else
+					{
+						MIX_11_4_8_3_10
+					}
+					break;
+				case 251:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_4
+					}
+					else
+					{
+						MIX_00_4_3_1_2_1_1
+					}
+					MIX_01_4_2_3_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4
+					}
+					else
+					{
+						MIX_10_4_6_3_10
+					}
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4
+					}
+					else
+					{
+						MIX_11_4_5_7_2_1_1
+					}
+					break;
+				case 239:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_4
+					}
+					else
+					{
+						MIX_00_MIX_00_4_0_3_10
+					}
+					MIX_01_4_5_3_1
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4
+					}
+					else
+					{
+						MIX_10_4_6_3_10
+					}
+					MIX_11_4_5_3_1
+					break;
+				case 127:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_4
+					}
+					else
+					{
+						MIX_00_MIX_00_4_0_3_10
+					}
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4
+					}
+					else
+					{
+						MIX_01_4_1_5_2_1_1
+					}
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+					{
+						MIX_10_4
+					}
+					else
+					{
+						MIX_10_4_7_3_2_1_1
+					}
+					MIX_11_4_8_3_1
+					break;
+				case 191:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_4
+					}
+					else
+					{
+						MIX_00_MIX_00_4_0_3_10
+					}
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4
+					}
+					else
+					{
+						MIX_01_4_2_3_10
+					}
+					MIX_10_4_7_3_1
+					MIX_11_4_7_3_1
+					break;
+				case 223:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+					{
+						MIX_00_4
+					}
+					else
+					{
+						MIX_00_4_3_1_2_1_1
+					}
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4
+					}
+					else
+					{
+						MIX_01_4_2_3_10
+					}
+					MIX_10_4_6_3_1
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4
+					}
+					else
+					{
+						MIX_11_4_5_7_2_1_1
+					}
+					break;
+				case 247:
+					MIX_00_4_3_3_1
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+					{
+						MIX_01_4
+					}
+					else
+					{
+						MIX_01_4_2_3_10
+					}
+					MIX_10_4_3_3_1
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+					{
+						MIX_11_4
+					}
+					else
+					{
+						MIX_11_4_8_3_10
+					}
+					break;
+				case 255:
+					if (isDifferent(w[3], w[1], trY, trU, trV, trA))
+						MIX_00_4
+					else
+						MIX_00_MIX_00_4_0_3_10
+
+					if (isDifferent(w[1], w[5], trY, trU, trV, trA))
+						MIX_01_4
+					else
+						MIX_01_4_2_3_10
+
+					if (isDifferent(w[7], w[3], trY, trU, trV, trA))
+						MIX_10_4
+					else
+						MIX_10_4_6_3_10
+
+					if (isDifferent(w[5], w[7], trY, trU, trV, trA))
+						MIX_11_4
+					else
+						MIX_11_4_8_3_10
+					break;
+			}
+			image++;
+			output += 2;
+		}
+		output += lineSize;
+	}
+
+	return output;
+}
diff --git a/thirdparty/misc/hq2x.h b/thirdparty/misc/hq2x.h
new file mode 100644
index 0000000000..8f119d2a01
--- /dev/null
+++ b/thirdparty/misc/hq2x.h
@@ -0,0 +1,19 @@
+#ifndef HQ2X_H
+#define HQ2X_H
+
+#include "typedefs.h"
+
+
+uint32_t *hq2x_resize(			// returns the output pointer after it has been advanced past the written data
+		const uint32_t *image,	// source pixels; the implementation steps through them one uint32_t at a time
+		uint32_t width,		// presumably the source width in pixels - TODO confirm against the definition
+		uint32_t height,	// presumably the source height in pixels - TODO confirm against the definition
+		uint32_t *output,	// destination; advanced by two entries for every source pixel processed
+		uint32_t trY = 0x30,	// trY, trU, trV and trA are forwarded unchanged to isDifferent()
+		uint32_t trU = 0x07,
+		uint32_t trV = 0x06,
+		uint32_t trA = 0x50,
+		bool wrapX = false,	// NOTE(review): wrapX/wrapY look like edge wrap-around switches - confirm
+		bool wrapY = false );
+
+#endif // HQ2X_H
diff --git a/thirdparty/misc/md5.cpp b/thirdparty/misc/md5.cpp
new file mode 100644
index 0000000000..1653ab0be5
--- /dev/null
+++ b/thirdparty/misc/md5.cpp
@@ -0,0 +1,267 @@
+/*
+ **********************************************************************
+ ** md5.c                                                            **
+ ** RSA Data Security, Inc. MD5 Message Digest Algorithm             **
+ ** Created: 2/17/90 RLR                                             **
+ ** Revised: 1/91 SRD,AJ,BSK,JT Reference C Version                  **
+ **********************************************************************
+ */
+
+/*
+ **********************************************************************
+ ** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. **
+ **                                                                  **
+ ** License to copy and use this software is granted provided that   **
+ ** it is identified as the "RSA Data Security, Inc. MD5 Message     **
+ ** Digest Algorithm" in all material mentioning or referencing this **
+ ** software or this function.                                       **
+ **                                                                  **
+ ** License is also granted to make and use derivative works         **
+ ** provided that such works are identified as "derived from the RSA **
+ ** Data Security, Inc. MD5 Message Digest Algorithm" in all         **
+ ** material mentioning or referencing the derived work.             **
+ **                                                                  **
+ ** RSA Data Security, Inc. makes no representations concerning      **
+ ** either the merchantability of this software or the suitability   **
+ ** of this software for any particular purpose.  It is provided "as **
+ ** is" without express or implied warranty of any kind.             **
+ **                                                                  **
+ ** These notices must be retained in any copies of any part of this **
+ ** documentation and/or software.                                   **
+ **********************************************************************
+ */
+
+/* -- include the following line if the md5.h header file is separate -- */
+#include "md5.h"
+
+/* forward declaration: Transform is defined after MD5Update/MD5Final, which call it */
+static void Transform (uint32_t *buf, uint32_t *in);
+
+/* Padding source handed to MD5Update by MD5Final: one 0x80 byte, then zero bytes */
+static unsigned char PADDING[64] = {
+  0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+/* F, G, H and I are the four basic MD5 round functions; every macro argument is parenthesized, including under ~ */
+#define F(x, y, z) (((x) & (y)) | ((~(x)) & (z)))
+#define G(x, y, z) (((x) & (z)) | ((y) & (~(z))))
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+#define I(x, y, z) ((y) ^ ((x) | (~(z))))
+
+/* ROTATE_LEFT rotates x left n bits */
+#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
+
+/* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4 */
+/* Rotation is separate from addition to prevent recomputation */
+#define FF(a, b, c, d, x, s, ac) \
+  {(a) += F ((b), (c), (d)) + (x) + (uint32_t)(ac); \
+   (a) = ROTATE_LEFT ((a), (s)); \
+   (a) += (b); \
+  }
+#define GG(a, b, c, d, x, s, ac) \
+  {(a) += G ((b), (c), (d)) + (x) + (uint32_t)(ac); \
+   (a) = ROTATE_LEFT ((a), (s)); \
+   (a) += (b); \
+  }
+#define HH(a, b, c, d, x, s, ac) \
+  {(a) += H ((b), (c), (d)) + (x) + (uint32_t)(ac); \
+   (a) = ROTATE_LEFT ((a), (s)); \
+   (a) += (b); \
+  }
+#define II(a, b, c, d, x, s, ac) \
+  {(a) += I ((b), (c), (d)) + (x) + (uint32_t)(ac); \
+   (a) = ROTATE_LEFT ((a), (s)); \
+   (a) += (b); \
+  }
+
+void MD5Init (MD5_CTX *mdContext)
+{
+  /* Nothing has been hashed yet: clear both words of the bit counter. */
+  mdContext->i[1] = (uint32_t)0;
+  mdContext->i[0] = (uint32_t)0;
+  /* Seed the four state words with the MD5 initialization constants. */
+  mdContext->buf[0] = (uint32_t)0x67452301;
+  mdContext->buf[1] = (uint32_t)0xefcdab89;
+  mdContext->buf[2] = (uint32_t)0x98badcfe;
+  mdContext->buf[3] = (uint32_t)0x10325476;
+}
+
+void MD5Update (MD5_CTX *mdContext,unsigned char *inBuf,unsigned int inLen) {
+  uint32_t words[16];
+  const uint32_t addedBits = (uint32_t)inLen << 3;
+  unsigned int w, remaining;
+
+  /* resume at the byte offset implied by the running bit count (mod 64) */
+  int fill = (int)((mdContext->i[0] >> 3) & 0x3F);
+
+  /* bit counter is two 32-bit words: carry into i[1] when i[0] wraps */
+  if ((mdContext->i[0] + addedBits) < mdContext->i[0])
+    mdContext->i[1]++;
+  mdContext->i[0] += addedBits;
+  mdContext->i[1] += ((uint32_t)inLen >> 29);
+
+  for (remaining = inLen; remaining != 0; remaining--) {
+    /* stage the next input byte */
+    mdContext->in[fill++] = *inBuf++;
+    if (fill != 0x40)
+      continue;
+
+    /* buffer full: pack the 64 bytes into 16 little-endian words and hash */
+    for (w = 0; w < 16; w++)
+      words[w] = (((uint32_t)mdContext->in[4*w+3]) << 24) |
+                 (((uint32_t)mdContext->in[4*w+2]) << 16) |
+                 (((uint32_t)mdContext->in[4*w+1]) << 8) |
+                 ((uint32_t)mdContext->in[4*w]);
+    Transform (mdContext->buf, words);
+    fill = 0;
+  }
+}
+
+void MD5Final (MD5_CTX *mdContext) {
+  uint32_t block[16];
+  unsigned int w, padLen;
+  int used;
+
+  /* capture the bit count now; the padding call below will change it */
+  block[14] = mdContext->i[0];
+  block[15] = mdContext->i[1];
+
+  /* bytes currently staged in the input buffer (bit count / 8, mod 64) */
+  used = (int)((mdContext->i[0] >> 3) & 0x3F);
+
+  /* pad so the staged data ends at byte 56 of a 64-byte block */
+  padLen = 120 - used;
+  if (used < 56)
+    padLen = 56 - used;
+  MD5Update (mdContext, PADDING, padLen);
+
+  /* words 0..13 come from the buffer, 14..15 keep the saved bit count */
+  for (w = 0; w < 14; w++)
+    block[w] = (((uint32_t)mdContext->in[4*w+3]) << 24) |
+               (((uint32_t)mdContext->in[4*w+2]) << 16) |
+               (((uint32_t)mdContext->in[4*w+1]) << 8) |
+               ((uint32_t)mdContext->in[4*w]);
+  Transform (mdContext->buf, block);
+
+  /* write each state word to the digest, least significant byte first */
+  for (w = 0; w < 4; w++) {
+    const uint32_t word = mdContext->buf[w];
+    unsigned char *out = mdContext->digest + 4*w;
+    out[0] = (unsigned char)(word & 0xFF);
+    out[1] = (unsigned char)((word >> 8) & 0xFF);
+    out[2] = (unsigned char)((word >> 16) & 0xFF);
+    out[3] = (unsigned char)((word >> 24) & 0xFF);
+  }
+}
+
+/* Basic MD5 step: runs 64 operations (4 rounds of 16) on a copy of the state in
+ * buf, using the 16 message words in in, then adds the result back into buf. */
+static void Transform (uint32_t *buf, uint32_t *in) {
+  uint32_t a = buf[0], b = buf[1], c = buf[2], d = buf[3];
+
+  /* Round 1: FF steps; S11..S14 are the rotate amounts */
+#define S11 7
+#define S12 12
+#define S13 17
+#define S14 22
+  FF ( a, b, c, d, in[ 0], S11, 3614090360); /* 1 */
+  FF ( d, a, b, c, in[ 1], S12, 3905402710); /* 2 */
+  FF ( c, d, a, b, in[ 2], S13,  606105819); /* 3 */
+  FF ( b, c, d, a, in[ 3], S14, 3250441966); /* 4 */
+  FF ( a, b, c, d, in[ 4], S11, 4118548399); /* 5 */
+  FF ( d, a, b, c, in[ 5], S12, 1200080426); /* 6 */
+  FF ( c, d, a, b, in[ 6], S13, 2821735955); /* 7 */
+  FF ( b, c, d, a, in[ 7], S14, 4249261313); /* 8 */
+  FF ( a, b, c, d, in[ 8], S11, 1770035416); /* 9 */
+  FF ( d, a, b, c, in[ 9], S12, 2336552879); /* 10 */
+  FF ( c, d, a, b, in[10], S13, 4294925233); /* 11 */
+  FF ( b, c, d, a, in[11], S14, 2304563134); /* 12 */
+  FF ( a, b, c, d, in[12], S11, 1804603682); /* 13 */
+  FF ( d, a, b, c, in[13], S12, 4254626195); /* 14 */
+  FF ( c, d, a, b, in[14], S13, 2792965006); /* 15 */
+  FF ( b, c, d, a, in[15], S14, 1236535329); /* 16 */
+
+  /* Round 2: GG steps; S21..S24 are the rotate amounts */
+#define S21 5
+#define S22 9
+#define S23 14
+#define S24 20
+  GG ( a, b, c, d, in[ 1], S21, 4129170786); /* 17 */
+  GG ( d, a, b, c, in[ 6], S22, 3225465664); /* 18 */
+  GG ( c, d, a, b, in[11], S23,  643717713); /* 19 */
+  GG ( b, c, d, a, in[ 0], S24, 3921069994); /* 20 */
+  GG ( a, b, c, d, in[ 5], S21, 3593408605); /* 21 */
+  GG ( d, a, b, c, in[10], S22,   38016083); /* 22 */
+  GG ( c, d, a, b, in[15], S23, 3634488961); /* 23 */
+  GG ( b, c, d, a, in[ 4], S24, 3889429448); /* 24 */
+  GG ( a, b, c, d, in[ 9], S21,  568446438); /* 25 */
+  GG ( d, a, b, c, in[14], S22, 3275163606); /* 26 */
+  GG ( c, d, a, b, in[ 3], S23, 4107603335); /* 27 */
+  GG ( b, c, d, a, in[ 8], S24, 1163531501); /* 28 */
+  GG ( a, b, c, d, in[13], S21, 2850285829); /* 29 */
+  GG ( d, a, b, c, in[ 2], S22, 4243563512); /* 30 */
+  GG ( c, d, a, b, in[ 7], S23, 1735328473); /* 31 */
+  GG ( b, c, d, a, in[12], S24, 2368359562); /* 32 */
+
+  /* Round 3: HH steps; S31..S34 are the rotate amounts */
+#define S31 4
+#define S32 11
+#define S33 16
+#define S34 23
+  HH ( a, b, c, d, in[ 5], S31, 4294588738); /* 33 */
+  HH ( d, a, b, c, in[ 8], S32, 2272392833); /* 34 */
+  HH ( c, d, a, b, in[11], S33, 1839030562); /* 35 */
+  HH ( b, c, d, a, in[14], S34, 4259657740); /* 36 */
+  HH ( a, b, c, d, in[ 1], S31, 2763975236); /* 37 */
+  HH ( d, a, b, c, in[ 4], S32, 1272893353); /* 38 */
+  HH ( c, d, a, b, in[ 7], S33, 4139469664); /* 39 */
+  HH ( b, c, d, a, in[10], S34, 3200236656); /* 40 */
+  HH ( a, b, c, d, in[13], S31,  681279174); /* 41 */
+  HH ( d, a, b, c, in[ 0], S32, 3936430074); /* 42 */
+  HH ( c, d, a, b, in[ 3], S33, 3572445317); /* 43 */
+  HH ( b, c, d, a, in[ 6], S34,   76029189); /* 44 */
+  HH ( a, b, c, d, in[ 9], S31, 3654602809); /* 45 */
+  HH ( d, a, b, c, in[12], S32, 3873151461); /* 46 */
+  HH ( c, d, a, b, in[15], S33,  530742520); /* 47 */
+  HH ( b, c, d, a, in[ 2], S34, 3299628645); /* 48 */
+
+  /* Round 4: II steps; S41..S44 are the rotate amounts */
+#define S41 6
+#define S42 10
+#define S43 15
+#define S44 21
+  II ( a, b, c, d, in[ 0], S41, 4096336452); /* 49 */
+  II ( d, a, b, c, in[ 7], S42, 1126891415); /* 50 */
+  II ( c, d, a, b, in[14], S43, 2878612391); /* 51 */
+  II ( b, c, d, a, in[ 5], S44, 4237533241); /* 52 */
+  II ( a, b, c, d, in[12], S41, 1700485571); /* 53 */
+  II ( d, a, b, c, in[ 3], S42, 2399980690); /* 54 */
+  II ( c, d, a, b, in[10], S43, 4293915773); /* 55 */
+  II ( b, c, d, a, in[ 1], S44, 2240044497); /* 56 */
+  II ( a, b, c, d, in[ 8], S41, 1873313359); /* 57 */
+  II ( d, a, b, c, in[15], S42, 4264355552); /* 58 */
+  II ( c, d, a, b, in[ 6], S43, 2734768916); /* 59 */
+  II ( b, c, d, a, in[13], S44, 1309151649); /* 60 */
+  II ( a, b, c, d, in[ 4], S41, 4149444226); /* 61 */
+  II ( d, a, b, c, in[11], S42, 3174756917); /* 62 */
+  II ( c, d, a, b, in[ 2], S43,  718787259); /* 63 */
+  II ( b, c, d, a, in[ 9], S44, 3951481745); /* 64 */
+
+  buf[0] += a;
+  buf[1] += b;
+  buf[2] += c;
+  buf[3] += d;
+}
+
+/*
+ **********************************************************************
+ ** End of md5.c                                                     **
+ ******************************* (cut) ********************************
+ */
diff --git a/thirdparty/misc/md5.h b/thirdparty/misc/md5.h
new file mode 100644
index 0000000000..e99d58b443
--- /dev/null
+++ b/thirdparty/misc/md5.h
@@ -0,0 +1,61 @@
+#ifndef MD5_H
+#define MD5_H
+
+/*
+ **********************************************************************
+ ** md5.h -- Header file for implementation of MD5                   **
+ ** RSA Data Security, Inc. MD5 Message Digest Algorithm             **
+ ** Created: 2/17/90 RLR                                             **
+ ** Revised: 12/27/90 SRD,AJ,BSK,JT Reference C version              **
+ ** Revised (for MD5): RLR 4/27/91                                   **
+ **   -- G modified to have y&~z instead of y&z                      **
+ **   -- FF, GG, HH modified to add in last register done            **
+ **   -- Access pattern: round 2 works mod 5, round 3 works mod 3    **
+ **   -- distinct additive constant for each step                    **
+ **   -- round 4 added, working mod 7                                **
+ **********************************************************************
+ */
+
+/*
+ **********************************************************************
+ ** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. **
+ **                                                                  **
+ ** License to copy and use this software is granted provided that   **
+ ** it is identified as the "RSA Data Security, Inc. MD5 Message     **
+ ** Digest Algorithm" in all material mentioning or referencing this **
+ ** software or this function.                                       **
+ **                                                                  **
+ ** License is also granted to make and use derivative works         **
+ ** provided that such works are identified as "derived from the RSA **
+ ** Data Security, Inc. MD5 Message Digest Algorithm" in all         **
+ ** material mentioning or referencing the derived work.             **
+ **                                                                  **
+ ** RSA Data Security, Inc. makes no representations concerning      **
+ ** either the merchantability of this software or the suitability   **
+ ** of this software for any particular purpose.  It is provided "as **
+ ** is" without express or implied warranty of any kind.             **
+ **                                                                  **
+ ** These notices must be retained in any copies of any part of this **
+ ** documentation and/or software.                                   **
+ **********************************************************************
+ */
+
+/* No local 32-bit typedef here: uint32_t is expected to come from typedefs.h */
+
+#include "typedefs.h"
+
+/* Data structure for MD5 (Message Digest) computation; set up with MD5Init */
+typedef struct {
+  uint32_t i[2];                   /* number of _bits_ handled mod 2^64; i[0] is the low word, i[1] the high word */
+  uint32_t buf[4];                                    /* four-word hash state: seeded by MD5Init, updated per 64-byte block */
+  unsigned char in[64];                              /* input buffer: bytes staged until a full 64-byte block is available */
+  unsigned char digest[16];     /* actual digest after MD5Final call (not written before that) */
+} MD5_CTX;
+
+void MD5Init (MD5_CTX *mdContext);	/* reset the bit count and load the initial state words */
+void MD5Update (MD5_CTX *mdContext,unsigned char *inBuf,unsigned int inLen);	/* feed inLen bytes from inBuf; may be called repeatedly */
+void MD5Final (MD5_CTX *mdContext);	/* pad, hash the last block and fill mdContext->digest */
+
+
+
+#endif // MD5_H
diff --git a/thirdparty/misc/mikktspace.c b/thirdparty/misc/mikktspace.c
new file mode 100644
index 0000000000..62aa2da251
--- /dev/null
+++ b/thirdparty/misc/mikktspace.c
@@ -0,0 +1,1890 @@
+/** \file mikktspace/mikktspace.c
+ *  \ingroup mikktspace
+ */
+/**
+ *  Copyright (C) 2011 by Morten S. Mikkelsen
+ *
+ *  This software is provided 'as-is', without any express or implied
+ *  warranty.  In no event will the authors be held liable for any damages
+ *  arising from the use of this software.
+ *
+ *  Permission is granted to anyone to use this software for any purpose,
+ *  including commercial applications, and to alter it and redistribute it
+ *  freely, subject to the following restrictions:
+ *
+ *  1. The origin of this software must not be misrepresented; you must not
+ *     claim that you wrote the original software. If you use this software
+ *     in a product, an acknowledgment in the product documentation would be
+ *     appreciated but is not required.
+ *  2. Altered source versions must be plainly marked as such, and must not be
+ *     misrepresented as being the original software.
+ *  3. This notice may not be removed or altered from any source distribution.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <math.h>
+#include <string.h>
+#include <float.h>
+#include <stdlib.h>
+
+#include "mikktspace.h"
+
+#define TFALSE		0	// tbool "false"; also the failure return value of the entry points below
+#define TTRUE		1	// tbool "true"
+
+#ifndef M_PI	// fallback for toolchains whose <math.h> does not provide M_PI
+#define M_PI	3.1415926535897932384626433832795
+#endif
+
+#define INTERNAL_RND_SORT_SEED		39871946	// NOTE(review): presumably seeds an internal sort; use site not shown here - confirm
+
+// internal 3-component float vector; the helper functions below take and return it by value
+typedef struct {
+	float x, y, z;
+} SVec3;
+
+static tbool			veq( const SVec3 v1, const SVec3 v2 )
+{	// exact component-wise equality, no tolerance
+	return !((v1.x != v2.x) || (v1.y != v2.y) || (v1.z != v2.z));
+}
+
+static SVec3		vadd( const SVec3 v1, const SVec3 v2 )
+{
+	// component-wise sum, accumulated into a copy of v1
+	SVec3 vSum = v1;
+
+	vSum.x += v2.x;
+	vSum.y += v2.y;
+	vSum.z += v2.z;
+	return vSum;
+}
+
+
+static SVec3		vsub( const SVec3 v1, const SVec3 v2 )
+{
+	// component-wise difference v1 - v2, computed in a copy of v1
+	SVec3 vDiff = v1;
+
+	vDiff.x -= v2.x;
+	vDiff.y -= v2.y;
+	vDiff.z -= v2.z;
+	return vDiff;
+}
+
+static SVec3		vscale(const float fS, const SVec3 v)
+{
+	// multiply every component of a copy of v by the scalar fS
+	SVec3 vScaled = v;
+
+	vScaled.x = fS * vScaled.x;
+	vScaled.y = fS * vScaled.y;
+	vScaled.z = fS * vScaled.z;
+	return vScaled;
+}
+
+static float			LengthSquared( const SVec3 v )
+{	// sum of the squared components; no square root is taken
+	return v.x*v.x + v.y*v.y + v.z*v.z;
+}
+
+static float			Length( const SVec3 v )
+{	// Euclidean length: square root of LengthSquared(v)
+	return sqrtf(LengthSquared(v));
+}
+
+static SVec3		Normalize( const SVec3 v )
+{	// scales v by 1/Length(v); divides by zero for a zero-length v, so AvgTSpace() checks VNotZero() first
+	return vscale(1 / Length(v), v);
+}
+
+static float		vdot( const SVec3 v1, const SVec3 v2)
+{	// dot product of v1 and v2
+	return v1.x*v2.x + v1.y*v2.y + v1.z*v2.z;
+}
+
+
+static tbool NotZero(const float fX)
+{
+	// true when |fX| is strictly greater than FLT_MIN; could possibly use FLT_EPSILON instead
+	return fabsf(fX) > FLT_MIN;
+}
+
+static tbool VNotZero(const SVec3 v)
+{
+	// non-zero as soon as any one component passes NotZero(); might become an epsilon based test
+	if (NotZero(v.x) || NotZero(v.y)) return TTRUE;
+	return NotZero(v.z);
+}
+
+
+
+typedef struct {	// NOTE(review): no use site is visible here; field meanings below are inferred from the names - confirm
+	int iNrFaces;		// presumably the number of entries in pTriMembers
+	int * pTriMembers;	// presumably the triangle indices belonging to this sub-group
+} SSubGroup;
+
+typedef struct {	// one group record; genTangSpace() allocates an array of these for Build4RuleGroups()
+	int iNrFaces;			// presumably the number of entries in pFaceIndices - TODO confirm
+	int * pFaceIndices;		// presumably points into the shared group-triangles buffer - TODO confirm
+	int iVertexRepresentitive;	// NOTE(review): looks like the vertex index this group is built around - confirm
+	tbool bOrientPreservering;	// NOTE(review): looks like the orientation flag shared by the group's faces - confirm
+} SGroup;
+
+// single-bit flags; MARK_DEGENERATE is OR-ed into STriInfo.iFlag by genTangSpace(), the others presumably likewise
+#define MARK_DEGENERATE				1
+#define QUAD_ONE_DEGEN_TRI			2
+#define GROUP_WITH_ANY				4
+#define ORIENT_PRESERVING			8
+
+
+
+typedef struct {	// per-triangle record; genTangSpace() allocates one per input triangle
+	int FaceNeighbors[3];		// presumably one neighbouring triangle index per edge - TODO confirm
+	SGroup * AssignedGroup[3];	// presumably one group pointer per triangle corner - TODO confirm
+	
+	// normalized first order face derivatives
+	SVec3 vOs, vOt;
+	float fMagS, fMagT;	// original magnitudes
+
+	// determines if the current and the next triangle are a quad.
+	int iOrgFaceNumber;
+	int iFlag, iTSpacesOffs;	// iFlag collects the MARK_DEGENERATE-style bit flags
+	unsigned char vert_num[4];
+} STriInfo;
+
+typedef struct {	// one tangent space record; genTangSpace() zero-fills these and then sets the defaults noted below
+	SVec3 vOs;	// initialised to (1,0,0)
+	float fMagS;	// initialised to 1.0f
+	SVec3 vOt;	// initialised to (0,1,0)
+	float fMagT;	// initialised to 1.0f
+	int iCounter;	// this is to average back into quads.
+	tbool bOrient;	// NOTE(review): presumably the orientation flag of this tangent space - confirm
+} STSpace;
+
+static int GenerateInitialVerticesIndexList(STriInfo pTriInfos[], int piTriList_out[], const SMikkTSpaceContext * pContext, const int iNrTrianglesIn);
+static void GenerateSharedVerticesIndexList(int piTriList_in_and_out[], const SMikkTSpaceContext * pContext, const int iNrTrianglesIn);
+static void InitTriInfo(STriInfo pTriInfos[], const int piTriListIn[], const SMikkTSpaceContext * pContext, const int iNrTrianglesIn);
+static int Build4RuleGroups(STriInfo pTriInfos[], SGroup pGroups[], int piGroupTrianglesBuffer[], const int piTriListIn[], const int iNrTrianglesIn);
+static tbool GenerateTSpaces(STSpace psTspace[], const STriInfo pTriInfos[], const SGroup pGroups[],
+                             const int iNrActiveGroups, const int piTriListIn[], const float fThresCos,
+                             const SMikkTSpaceContext * pContext);
+
+static int MakeIndex(const int iFace, const int iVert)
+{	// packs (face, corner) into one int: corner in the low two bits, face number above them
+	assert(iVert>=0 && iVert<4 && iFace>=0);
+	return (iVert&0x3) | (iFace<<2);
+}
+
+static void IndexToData(int * piFace, int * piVert, const int iIndexIn)
+{	// inverse of MakeIndex(): low two bits -> *piVert, remaining bits -> *piFace
+	*piVert = iIndexIn & 0x3;
+	*piFace = iIndexIn >> 2;
+}
+
+static STSpace AvgTSpace(const STSpace * pTS0, const STSpace * pTS1)
+{
+	STSpace avg;
+	// Identical inputs are passed through untouched: averaging and
+	// renormalizing them would introduce small rounding differences
+	// that later show up as tangent space splits.
+	const tbool bSameMag = pTS0->fMagS==pTS1->fMagS && pTS0->fMagT==pTS1->fMagT;
+
+	if (bSameMag && veq(pTS0->vOs,pTS1->vOs) && veq(pTS0->vOt, pTS1->vOt))
+	{
+		avg.fMagS = pTS0->fMagS;
+		avg.fMagT = pTS0->fMagT;
+		avg.vOs = pTS0->vOs;
+		avg.vOt = pTS0->vOt;
+		return avg;
+	}
+
+	// otherwise: mean of the magnitudes, normalized sum of the directions
+	avg.fMagS = 0.5f*(pTS0->fMagS+pTS1->fMagS);
+	avg.fMagT = 0.5f*(pTS0->fMagT+pTS1->fMagT);
+	avg.vOs = vadd(pTS0->vOs,pTS1->vOs);
+	avg.vOt = vadd(pTS0->vOt,pTS1->vOt);
+	// a sum that fails VNotZero() is left as is rather than normalized
+	if ( VNotZero(avg.vOs) ) avg.vOs = Normalize(avg.vOs);
+	if ( VNotZero(avg.vOt) ) avg.vOt = Normalize(avg.vOt);
+	return avg;
+}
+
+
+
+static SVec3 GetPosition(const SMikkTSpaceContext * pContext, const int index);
+static SVec3 GetNormal(const SMikkTSpaceContext * pContext, const int index);
+static SVec3 GetTexCoord(const SMikkTSpaceContext * pContext, const int index);
+
+
+// degen triangles
+static void DegenPrologue(STriInfo pTriInfos[], int piTriList_out[], const int iNrTrianglesIn, const int iTotTris);
+static void DegenEpilogue(STSpace psTspace[], STriInfo pTriInfos[], int piTriListIn[], const SMikkTSpaceContext * pContext, const int iNrTrianglesIn, const int iTotTris);
+
+
+tbool genTangSpaceDefault(const SMikkTSpaceContext * pContext)
+{	// convenience wrapper: genTangSpace() with an angular threshold of 180 degrees
+	return genTangSpace(pContext, 180.0f);
+}
+
+tbool genTangSpace(const SMikkTSpaceContext * pContext, const float fAngularThreshold)
+{
+	// count nr_triangles
+	int * piTriListIn = NULL, * piGroupTrianglesBuffer = NULL;
+	STriInfo * pTriInfos = NULL;
+	SGroup * pGroups = NULL;
+	STSpace * psTspace = NULL;
+	int iNrTrianglesIn = 0, f=0, t=0, i=0;
+	int iNrTSPaces = 0, iTotTris = 0, iDegenTriangles = 0, iNrMaxGroups = 0;
+	int iNrActiveGroups = 0, index = 0;
+	const int iNrFaces = pContext->m_pInterface->m_getNumFaces(pContext);
+	tbool bRes = TFALSE;
+	const float fThresCos = (float) cos((fAngularThreshold*(float)M_PI)/180.0f);
+
+	// verify all call-backs have been set
+	if ( pContext->m_pInterface->m_getNumFaces==NULL ||
+		pContext->m_pInterface->m_getNumVerticesOfFace==NULL ||
+		pContext->m_pInterface->m_getPosition==NULL ||
+		pContext->m_pInterface->m_getNormal==NULL ||
+		pContext->m_pInterface->m_getTexCoord==NULL )
+		return TFALSE;
+
+	// count triangles on supported faces
+	for (f=0; f<iNrFaces; f++)
+	{
+		const int verts = pContext->m_pInterface->m_getNumVerticesOfFace(pContext, f);
+		if (verts==3) ++iNrTrianglesIn;
+		else if (verts==4) iNrTrianglesIn += 2;
+	}
+	if (iNrTrianglesIn<=0) return TFALSE;
+
+	// allocate memory for an index list
+	piTriListIn = (int *) malloc(sizeof(int)*3*iNrTrianglesIn);
+	pTriInfos = (STriInfo *) malloc(sizeof(STriInfo)*iNrTrianglesIn);
+	if (piTriListIn==NULL || pTriInfos==NULL)
+	{
+		if (piTriListIn!=NULL) free(piTriListIn);
+		if (pTriInfos!=NULL) free(pTriInfos);
+		return TFALSE;
+	}
+
+	// make an initial triangle --> face index list
+	iNrTSPaces = GenerateInitialVerticesIndexList(pTriInfos, piTriListIn, pContext, iNrTrianglesIn);
+
+	// make a welded index list of identical positions and attributes (pos, norm, texc)
+	//printf("gen welded index list begin\n");
+	GenerateSharedVerticesIndexList(piTriListIn, pContext, iNrTrianglesIn);
+	//printf("gen welded index list end\n");
+
+	// Mark all degenerate triangles
+	iTotTris = iNrTrianglesIn;
+	iDegenTriangles = 0;
+	for (t=0; t<iTotTris; t++)
+	{
+		const int i0 = piTriListIn[t*3+0];
+		const int i1 = piTriListIn[t*3+1];
+		const int i2 = piTriListIn[t*3+2];
+		const SVec3 p0 = GetPosition(pContext, i0);
+		const SVec3 p1 = GetPosition(pContext, i1);
+		const SVec3 p2 = GetPosition(pContext, i2);
+		if (veq(p0,p1) || veq(p0,p2) || veq(p1,p2))	// degenerate
+		{
+			pTriInfos[t].iFlag |= MARK_DEGENERATE;
+			++iDegenTriangles;
+		}
+	}
+	iNrTrianglesIn = iTotTris - iDegenTriangles;
+
+	// mark all triangle pairs that belong to a quad with only one
+	// good triangle. These need special treatment in DegenEpilogue().
+	// Additionally, move all good triangles to the start of
+	// pTriInfos[] and piTriListIn[] without changing order and
+	// put the degenerate triangles last.
+	DegenPrologue(pTriInfos, piTriListIn, iNrTrianglesIn, iTotTris);
+
+	
+	// evaluate triangle level attributes and neighbor list
+	//printf("gen neighbors list begin\n");
+	InitTriInfo(pTriInfos, piTriListIn, pContext, iNrTrianglesIn);
+	//printf("gen neighbors list end\n");
+
+	
+	// based on the 4 rules, identify groups based on connectivity
+	iNrMaxGroups = iNrTrianglesIn*3;
+	pGroups = (SGroup *) malloc(sizeof(SGroup)*iNrMaxGroups);
+	piGroupTrianglesBuffer = (int *) malloc(sizeof(int)*iNrTrianglesIn*3);
+	if (pGroups==NULL || piGroupTrianglesBuffer==NULL)
+	{
+		if (pGroups!=NULL) free(pGroups);
+		if (piGroupTrianglesBuffer!=NULL) free(piGroupTrianglesBuffer);
+		free(piTriListIn);
+		free(pTriInfos);
+		return TFALSE;
+	}
+	//printf("gen 4rule groups begin\n");
+	iNrActiveGroups =
+		Build4RuleGroups(pTriInfos, pGroups, piGroupTrianglesBuffer, piTriListIn, iNrTrianglesIn);
+	//printf("gen 4rule groups end\n");
+
+	//
+
+	psTspace = (STSpace *) malloc(sizeof(STSpace)*iNrTSPaces);
+	if (psTspace==NULL)
+	{
+		free(piTriListIn);
+		free(pTriInfos);
+		free(pGroups);
+		free(piGroupTrianglesBuffer);
+		return TFALSE;
+	}
+	memset(psTspace, 0, sizeof(STSpace)*iNrTSPaces);
+	for (t=0; t<iNrTSPaces; t++)
+	{
+		psTspace[t].vOs.x=1.0f; psTspace[t].vOs.y=0.0f; psTspace[t].vOs.z=0.0f; psTspace[t].fMagS = 1.0f;
+		psTspace[t].vOt.x=0.0f; psTspace[t].vOt.y=1.0f; psTspace[t].vOt.z=0.0f; psTspace[t].fMagT = 1.0f;
+	}
+
+	// make tspaces, each group is split up into subgroups if necessary
+	// based on fAngularThreshold. Finally a tangent space is made for
+	// every resulting subgroup
+	//printf("gen tspaces begin\n");
+	bRes = GenerateTSpaces(psTspace, pTriInfos, pGroups, iNrActiveGroups, piTriListIn, fThresCos, pContext);
+	//printf("gen tspaces end\n");
+	
+	// clean up
+	free(pGroups);
+	free(piGroupTrianglesBuffer);
+
+	if (!bRes)	// if an allocation in GenerateTSpaces() failed
+	{
+		// clean up and return false
+		free(pTriInfos); free(piTriListIn); free(psTspace);
+		return TFALSE;
+	}
+
+
+	// degenerate quads with one good triangle will be fixed by copying a space from
+	// the good triangle to the coinciding vertex.
+	// all other degenerate triangles will just copy a space from any good triangle
+	// with the same welded index in piTriListIn[].
+	DegenEpilogue(psTspace, pTriInfos, piTriListIn, pContext, iNrTrianglesIn, iTotTris);
+
+	free(pTriInfos); free(piTriListIn);
+
+	index = 0;
+	for (f=0; f<iNrFaces; f++)
+	{
+		const int verts = pContext->m_pInterface->m_getNumVerticesOfFace(pContext, f);
+		if (verts!=3 && verts!=4) continue;
+		
+
+		// I've decided to let degenerate triangles and group-with-anythings
+		// vary between left/right hand coordinate systems at the vertices.
+		// All healthy triangles on the other hand are built to always be either or.
+
+		/*// force the coordinate system orientation to be uniform for every face.
+		// (this is already the case for good triangles but not for
+		// degenerate ones and those with bGroupWithAnything==true)
+		bool bOrient = psTspace[index].bOrient;
+		if (psTspace[index].iCounter == 0)	// tspace was not derived from a group
+		{
+			// look for a space created in GenerateTSpaces() by iCounter>0
+			bool bNotFound = true;
+			int i=1;
+			while (i<verts && bNotFound)
+			{
+				if (psTspace[index+i].iCounter > 0) bNotFound=false;
+				else ++i;
+			}
+			if (!bNotFound) bOrient = psTspace[index+i].bOrient;
+		}*/
+
+		// set data
+		for (i=0; i<verts; i++)
+		{
+			const STSpace * pTSpace = &psTspace[index];
+			float tang[] = {pTSpace->vOs.x, pTSpace->vOs.y, pTSpace->vOs.z};
+			float bitang[] = {pTSpace->vOt.x, pTSpace->vOt.y, pTSpace->vOt.z};
+			if (pContext->m_pInterface->m_setTSpace!=NULL)
+				pContext->m_pInterface->m_setTSpace(pContext, tang, bitang, pTSpace->fMagS, pTSpace->fMagT, pTSpace->bOrient, f, i);
+			if (pContext->m_pInterface->m_setTSpaceBasic!=NULL)
+				pContext->m_pInterface->m_setTSpaceBasic(pContext, tang, pTSpace->bOrient==TTRUE ? 1.0f : (-1.0f), f, i);
+
+			++index;
+		}
+	}
+
+	free(psTspace);
+
+	
+	return TTRUE;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Scratch record used while welding the vertices of one hash cell: a copy
+// of the vertex position together with the slot of the triangle index list
+// the position was fetched through.
+typedef struct {
+	float vert[3];	// position as (x, y, z)
+	int index;	// offset into piTriList_in_and_out[]
+} STmpVert;
+
+// number of cells in the one dimensional hash grid used for vertex welding
+static const int g_iCells = 2048;
+
+#ifdef _MSC_VER
+	#define NOINLINE __declspec(noinline)
+#else
+	#define NOINLINE __attribute__ ((noinline))
+#endif
+
+// Maps fVal to the index of the grid cell it falls into.
+//
+// fMin, fMax - extent of the grid along the hashed axis
+// fVal       - coordinate to classify
+//
+// Returns a cell index in [0; g_iCells-1]. Values outside [fMin; fMax] are
+// clamped to the first/last cell. A degenerate extent (fMin==fMax, which
+// makes the quotient NaN) maps to cell 0.
+//
+// it is IMPORTANT that this function is called to evaluate the hash since
+// inlining could potentially reorder instructions and generate different
+// results for the same effective input value fVal.
+static NOINLINE int FindGridCell(const float fMin, const float fMax, const float fVal)
+{
+	const float fIndex = g_iCells * ((fVal-fMin)/(fMax-fMin));
+
+	// Clamp while the value is still a float: converting NaN or a value
+	// outside the range of int to int is undefined behaviour. The negated
+	// comparison is deliberate, it is also true for NaN.
+	if (!(fIndex >= 0.0f)) return 0;
+	if (fIndex >= (float)g_iCells) return g_iCells - 1;
+
+	return (int)fIndex;
+}
+
+// forward declarations of the welding helpers defined further down
+static void MergeVertsFast(int piTriList_in_and_out[], STmpVert pTmpVert[], const SMikkTSpaceContext * pContext, const int iL_in, const int iR_in);
+static void MergeVertsSlow(int piTriList_in_and_out[], const SMikkTSpaceContext * pContext, const int pTable[], const int iEntries);
+static void GenerateSharedVerticesIndexListSlow(int piTriList_in_and_out[], const SMikkTSpaceContext * pContext, const int iNrTrianglesIn);
+
+// Rewrites the triangle index list so that vertices with identical position,
+// normal and texture coordinate share one index.
+//
+// piTriList_in_and_out - 3*iNrTrianglesIn vertex indices, welded in place
+// pContext             - mesh accessors used to fetch the vertex attributes
+// iNrTrianglesIn       - number of triangles in the list; must be at least 1
+//
+// The list entries are bucketed into g_iCells cells along the bounding box
+// axis with the largest extent, then every cell holding two or more entries
+// is welded on its own. If the hash tables cannot be allocated the whole job
+// is handed to GenerateSharedVerticesIndexListSlow(); if only the per-cell
+// scratch buffer is missing, MergeVertsSlow() is used for each cell instead
+// of MergeVertsFast().
+static void GenerateSharedVerticesIndexList(int piTriList_in_and_out[], const SMikkTSpaceContext * pContext, const int iNrTrianglesIn)
+{
+	int * piHashTable=NULL, * piHashCount=NULL, * piHashOffsets=NULL, * piHashCount2=NULL;
+	STmpVert * pTmpVert = NULL;
+	int i=0, iChannel=0, k=0, e=0;
+	int iMaxCount=0;
+	// Generate bounding box. It is seeded with the first entry of the list
+	// and not with raw index 0: faces that are neither triangles nor quads
+	// are left out of the list, so index 0 need not occur in it, and seeding
+	// with it also kept entry 0 out of the box.
+	SVec3 vMin = GetPosition(pContext, piTriList_in_and_out[0]), vMax = vMin, vDim;
+	float fMin, fMax;
+	for (i=1; i<(iNrTrianglesIn*3); i++)
+	{
+		const int index = piTriList_in_and_out[i];
+
+		const SVec3 vP = GetPosition(pContext, index);
+		if (vMin.x > vP.x) vMin.x = vP.x;
+		else if (vMax.x < vP.x) vMax.x = vP.x;
+		if (vMin.y > vP.y) vMin.y = vP.y;
+		else if (vMax.y < vP.y) vMax.y = vP.y;
+		if (vMin.z > vP.z) vMin.z = vP.z;
+		else if (vMax.z < vP.z) vMax.z = vP.z;
+	}
+
+	// hash along the axis with the largest extent (x unless y or z is larger)
+	vDim = vsub(vMax,vMin);
+	iChannel = 0;
+	fMin = vMin.x; fMax=vMax.x;
+	if (vDim.y>vDim.x && vDim.y>vDim.z)
+	{
+		iChannel=1;
+		fMin = vMin.y; fMax=vMax.y;
+	}
+	else if (vDim.z>vDim.x)
+	{
+		iChannel=2;
+		fMin = vMin.z; fMax=vMax.z;
+	}
+
+	// make allocations
+	piHashTable = (int *) malloc(sizeof(int)*iNrTrianglesIn*3);
+	piHashCount = (int *) malloc(sizeof(int)*g_iCells);
+	piHashOffsets = (int *) malloc(sizeof(int)*g_iCells);
+	piHashCount2 = (int *) malloc(sizeof(int)*g_iCells);
+
+	if (piHashTable==NULL || piHashCount==NULL || piHashOffsets==NULL || piHashCount2==NULL)
+	{
+		// free(NULL) is a no-op, so no per-pointer checks are needed
+		free(piHashTable);
+		free(piHashCount);
+		free(piHashOffsets);
+		free(piHashCount2);
+		GenerateSharedVerticesIndexListSlow(piTriList_in_and_out, pContext, iNrTrianglesIn);
+		return;
+	}
+	memset(piHashCount, 0, sizeof(int)*g_iCells);
+	memset(piHashCount2, 0, sizeof(int)*g_iCells);
+
+	// count amount of elements in each cell unit
+	for (i=0; i<(iNrTrianglesIn*3); i++)
+	{
+		const int index = piTriList_in_and_out[i];
+		const SVec3 vP = GetPosition(pContext, index);
+		const float fVal = iChannel==0 ? vP.x : (iChannel==1 ? vP.y : vP.z);
+		const int iCell = FindGridCell(fMin, fMax, fVal);
+		++piHashCount[iCell];
+	}
+
+	// evaluate start index of each cell.
+	piHashOffsets[0]=0;
+	for (k=1; k<g_iCells; k++)
+		piHashOffsets[k]=piHashOffsets[k-1]+piHashCount[k-1];
+
+	// insert vertices
+	for (i=0; i<(iNrTrianglesIn*3); i++)
+	{
+		const int index = piTriList_in_and_out[i];
+		const SVec3 vP = GetPosition(pContext, index);
+		const float fVal = iChannel==0 ? vP.x : (iChannel==1 ? vP.y : vP.z);
+		const int iCell = FindGridCell(fMin, fMax, fVal);
+		int * pTable = NULL;
+
+		assert(piHashCount2[iCell]<piHashCount[iCell]);
+		pTable = &piHashTable[piHashOffsets[iCell]];
+		pTable[piHashCount2[iCell]] = i;	// vertex i has been inserted.
+		++piHashCount2[iCell];
+	}
+	for (k=0; k<g_iCells; k++)
+		assert(piHashCount2[k] == piHashCount[k]);	// verify the count
+	free(piHashCount2);
+
+	// find maximum amount of entries in any hash entry
+	iMaxCount = piHashCount[0];
+	for (k=1; k<g_iCells; k++)
+		if (iMaxCount<piHashCount[k])
+			iMaxCount=piHashCount[k];
+	// scratch buffer large enough for the fullest cell; a failed allocation
+	// is tolerated and selects the slow per-cell merge below
+	pTmpVert = (STmpVert *) malloc(sizeof(STmpVert)*iMaxCount);
+
+	// complete the merge
+	for (k=0; k<g_iCells; k++)
+	{
+		// extract table of cell k and amount of entries in it
+		int * pTable = &piHashTable[piHashOffsets[k]];
+		const int iEntries = piHashCount[k];
+		if (iEntries < 2) continue;
+
+		if (pTmpVert!=NULL)
+		{
+			for (e=0; e<iEntries; e++)
+			{
+				const int iSlot = pTable[e];
+				const SVec3 vP = GetPosition(pContext, piTriList_in_and_out[iSlot]);
+				pTmpVert[e].vert[0] = vP.x; pTmpVert[e].vert[1] = vP.y;
+				pTmpVert[e].vert[2] = vP.z; pTmpVert[e].index = iSlot;
+			}
+			MergeVertsFast(piTriList_in_and_out, pTmpVert, pContext, 0, iEntries-1);
+		}
+		else
+			MergeVertsSlow(piTriList_in_and_out, pContext, pTable, iEntries);
+	}
+
+	free(pTmpVert);
+	free(piHashTable);
+	free(piHashCount);
+	free(piHashOffsets);
+}
+
+// Welds identical vertices among the records pTmpVert[iL_in..iR_in]
+// (both bounds inclusive).
+//
+// piTriList_in_and_out - triangle index list; welded entries are overwritten
+// pTmpVert             - scratch records (position + slot in the index
+//                        list); reordered in place while partitioning
+// pContext             - mesh accessors used to fetch vertex attributes
+// iL_in, iR_in         - first and last record of the range to process
+//
+// The range is split recursively at the midpoint of the bounding box axis
+// with the largest extent. Once that midpoint no longer lies strictly inside
+// the box, every record is compared against the records before it in the
+// range and merged with the first one whose position, normal and texture
+// coordinate all compare exactly equal.
+static void MergeVertsFast(int piTriList_in_and_out[], STmpVert pTmpVert[], const SMikkTSpaceContext * pContext, const int iL_in, const int iR_in)
+{
+	// make bbox
+	int c=0, l=0, channel=0;
+	float fvMin[3], fvMax[3];
+	float dx=0, dy=0, dz=0, fSep=0;
+	for (c=0; c<3; c++)
+	{	fvMin[c]=pTmpVert[iL_in].vert[c]; fvMax[c]=fvMin[c];	}
+	for (l=(iL_in+1); l<=iR_in; l++)
+		for (c=0; c<3; c++)
+			if (fvMin[c]>pTmpVert[l].vert[c]) fvMin[c]=pTmpVert[l].vert[c];
+			else if (fvMax[c]<pTmpVert[l].vert[c]) fvMax[c]=pTmpVert[l].vert[c];
+
+	dx = fvMax[0]-fvMin[0];
+	dy = fvMax[1]-fvMin[1];
+	dz = fvMax[2]-fvMin[2];
+
+	// split along the axis with the largest extent (x unless y or z is larger)
+	channel = 0;
+	if (dy>dx && dy>dz) channel=1;
+	else if (dz>dx) channel=2;
+
+	fSep = 0.5f*(fvMax[channel]+fvMin[channel]);
+
+	// terminate recursion when the separation/average value
+	// is no longer strictly between fMin and fMax values.
+	if (fSep>=fvMax[channel] || fSep<=fvMin[channel])
+	{
+		// complete the weld
+		for (l=iL_in; l<=iR_in; l++)
+		{
+			int i = pTmpVert[l].index;
+			const int index = piTriList_in_and_out[i];
+			const SVec3 vP = GetPosition(pContext, index);
+			const SVec3 vN = GetNormal(pContext, index);
+			const SVec3 vT = GetTexCoord(pContext, index);
+
+			// linear scan over the records that precede l in this range;
+			// i2rec tracks the slot of the record examined last, which on
+			// a match is the record to merge into
+			tbool bNotFound = TTRUE;
+			int l2=iL_in, i2rec=-1;
+			while (l2<l && bNotFound)
+			{
+				const int i2 = pTmpVert[l2].index;
+				const int index2 = piTriList_in_and_out[i2];
+				const SVec3 vP2 = GetPosition(pContext, index2);
+				const SVec3 vN2 = GetNormal(pContext, index2);
+				const SVec3 vT2 = GetTexCoord(pContext, index2);
+				i2rec=i2;
+
+				//if (vP==vP2 && vN==vN2 && vT==vT2)
+				if (vP.x==vP2.x && vP.y==vP2.y && vP.z==vP2.z &&
+					vN.x==vN2.x && vN.y==vN2.y && vN.z==vN2.z &&
+					vT.x==vT2.x && vT.y==vT2.y && vT.z==vT2.z)
+					bNotFound = TFALSE;
+				else
+					++l2;
+			}
+			
+			// merge if previously found
+			if (!bNotFound)
+				piTriList_in_and_out[i] = piTriList_in_and_out[i2rec];
+		}
+	}
+	else
+	{
+		int iL=iL_in, iR=iR_in;
+		assert((iR_in-iL_in)>0);	// at least 2 entries
+
+		// separate (by fSep) all points between iL_in and iR_in in pTmpVert[]
+		while (iL < iR)
+		{
+			tbool bReadyLeftSwap = TFALSE, bReadyRightSwap = TFALSE;
+			// advance iL to the first record that belongs on the right side
+			while ((!bReadyLeftSwap) && iL<iR)
+			{
+				assert(iL>=iL_in && iL<=iR_in);
+				bReadyLeftSwap = !(pTmpVert[iL].vert[channel]<fSep);
+				if (!bReadyLeftSwap) ++iL;
+			}
+			// retreat iR to the last record that belongs on the left side
+			while ((!bReadyRightSwap) && iL<iR)
+			{
+				assert(iR>=iL_in && iR<=iR_in);
+				bReadyRightSwap = pTmpVert[iR].vert[channel]<fSep;
+				if (!bReadyRightSwap) --iR;
+			}
+			assert( (iL<iR) || !(bReadyLeftSwap && bReadyRightSwap) );
+
+			if (bReadyLeftSwap && bReadyRightSwap)
+			{
+				const STmpVert sTmp = pTmpVert[iL];
+				assert(iL<iR);
+				pTmpVert[iL] = pTmpVert[iR];
+				pTmpVert[iR] = sTmp;
+				++iL; --iR;
+			}
+		}
+
+		assert(iL==(iR+1) || (iL==iR));
+		// the scans met on one record that has not been classified yet;
+		// move the boundary so it ends up on the side it belongs to
+		if (iL==iR)
+		{
+			const tbool bReadyRightSwap = pTmpVert[iR].vert[channel]<fSep;
+			if (bReadyRightSwap) ++iL;
+			else --iR;
+		}
+
+		// only need to weld when there is more than 1 instance of the (x,y,z)
+		if (iL_in < iR)
+			MergeVertsFast(piTriList_in_and_out, pTmpVert, pContext, iL_in, iR);	// weld all left of fSep
+		if (iL < iR_in)
+			MergeVertsFast(piTriList_in_and_out, pTmpVert, pContext, iL, iR_in);	// weld all right of (or equal to) fSep
+	}
+}
+
+// Quadratic fallback for welding the entries of one hash cell.
+//
+// piTriList_in_and_out - triangle index list; welded entries are overwritten
+// pContext             - mesh accessors used to fetch vertex attributes
+// pTable               - slots of the index list that belong to the cell
+// iEntries             - number of slots in pTable[]
+//
+// Each entry is compared with the entries listed before it and takes over
+// the index of the first one with equal position, normal and texture
+// coordinate. Entries without such a predecessor are left untouched.
+static void MergeVertsSlow(int piTriList_in_and_out[], const SMikkTSpaceContext * pContext, const int pTable[], const int iEntries)
+{
+	// this can be optimized further using a tree structure or more hashing.
+	int iCur=0, iPrev=0;
+	for (iCur=0; iCur<iEntries; iCur++)
+	{
+		const int iSlot = pTable[iCur];
+		const int iVert = piTriList_in_and_out[iSlot];
+		const SVec3 vPos = GetPosition(pContext, iVert);
+		const SVec3 vNrm = GetNormal(pContext, iVert);
+		const SVec3 vTex = GetTexCoord(pContext, iVert);
+
+		for (iPrev=0; iPrev<iCur; iPrev++)
+		{
+			const int iPrevSlot = pTable[iPrev];
+			const int iPrevVert = piTriList_in_and_out[iPrevSlot];
+			const SVec3 vPrevPos = GetPosition(pContext, iPrevVert);
+			const SVec3 vPrevNrm = GetNormal(pContext, iPrevVert);
+			const SVec3 vPrevTex = GetTexCoord(pContext, iPrevVert);
+
+			if (veq(vPos,vPrevPos) && veq(vNrm,vPrevNrm) && veq(vTex,vPrevTex))
+			{
+				// first match wins: adopt its index and stop searching
+				piTriList_in_and_out[iSlot] = piTriList_in_and_out[iPrevSlot];
+				break;
+			}
+		}
+	}
+}
+
+// Allocation-free fallback that welds the whole triangle index list by
+// brute force.
+//
+// piTriList_in_and_out - 3*iNrTrianglesIn vertex indices, welded in place
+// pContext             - mesh accessors used to fetch vertex attributes
+// iNrTrianglesIn       - number of triangles in the list
+//
+// Every entry is replaced by the index of the first entry (scanning the
+// list from its start up to and including the entry's own triangle) whose
+// position, normal and texture coordinate are equal to its own.
+static void GenerateSharedVerticesIndexListSlow(int piTriList_in_and_out[], const SMikkTSpaceContext * pContext, const int iNrTrianglesIn)
+{
+	int t=0, i=0;
+	for (t=0; t<iNrTrianglesIn; t++)
+	{
+		for (i=0; i<3; i++)
+		{
+			const int offs = t*3 + i;
+			const int index = piTriList_in_and_out[offs];
+
+			const SVec3 vP = GetPosition(pContext, index);
+			const SVec3 vN = GetNormal(pContext, index);
+			const SVec3 vT = GetTexCoord(pContext, index);
+
+			// Start out with the entry's own index so that the list is left
+			// intact should the search come up empty. That is not expected,
+			// since the scan reaches the entry itself.
+			tbool bFound = TFALSE;
+			int t2=0, index2rec=index;
+			while (!bFound && t2<=t)
+			{
+				int j=0;
+				while (!bFound && j<3)
+				{
+					const int index2 = piTriList_in_and_out[t2*3 + j];
+					const SVec3 vP2 = GetPosition(pContext, index2);
+					const SVec3 vN2 = GetNormal(pContext, index2);
+					const SVec3 vT2 = GetTexCoord(pContext, index2);
+
+					if (veq(vP,vP2) && veq(vN,vN2) && veq(vT,vT2))
+					{
+						bFound = TTRUE;
+						// remember the match; without this every entry of
+						// the list would be overwritten with -1
+						index2rec = index2;
+					}
+					else
+						++j;
+				}
+				if (!bFound) ++t2;
+			}
+
+			assert(bFound);
+
+			piTriList_in_and_out[offs] = index2rec;
+		}
+	}
+}
+
+// Builds the initial (unwelded) triangle list from the faces of the mesh.
+//
+// pTriInfos      - receives, per emitted triangle, the original face number,
+//                  the tspace offset of that face and the face corner
+//                  numbers the triangle uses; iFlag is cleared for the first
+//                  iNrTrianglesIn entries
+// piTriList_out  - receives three vertex indices per emitted triangle
+// pContext       - mesh accessors
+// iNrTrianglesIn - number of triangles the caller sized the arrays for
+//
+// Triangles are copied as they are, quads are cut into two triangles and
+// faces with any other vertex count are skipped.
+// Returns the total number of tspaces, i.e. the summed vertex count of all
+// faces that were not skipped.
+static int GenerateInitialVerticesIndexList(STriInfo pTriInfos[], int piTriList_out[], const SMikkTSpaceContext * pContext, const int iNrTrianglesIn)
+{
+	// face corner numbers of the two triangles of a quad, per cut diagonal
+	static const unsigned char s_aucSplit02[2][3] = { {0,1,2}, {0,2,3} };
+	static const unsigned char s_aucSplit13[2][3] = { {0,1,3}, {1,2,3} };
+
+	int iTSpacesOffs = 0, iDstTriIndex = 0;
+	int f = 0, t = 0, c = 0;
+
+	for (f=0; f<pContext->m_pInterface->m_getNumFaces(pContext); f++)
+	{
+		const int verts = pContext->m_pInterface->m_getNumVerticesOfFace(pContext, f);
+		STriInfo * pTriA = NULL;
+		int * piOutA = NULL;
+
+		// only triangles and quads take part
+		if (verts!=3 && verts!=4) continue;
+
+		pTriA = &pTriInfos[iDstTriIndex];
+		piOutA = &piTriList_out[iDstTriIndex*3];
+		pTriA->iOrgFaceNumber = f;
+		pTriA->iTSpacesOffs = iTSpacesOffs;
+
+		if (verts==3)
+		{
+			for (c=0; c<3; c++)
+			{
+				pTriA->vert_num[c] = (unsigned char) c;
+				piOutA[c] = MakeIndex(f, c);
+			}
+			iDstTriIndex += 1;
+		}
+		else
+		{
+			STriInfo * pTriB = &pTriInfos[iDstTriIndex+1];
+			int * piOutB = &piTriList_out[(iDstTriIndex+1)*3];
+			const unsigned char (* pucSplit)[3] = NULL;
+			int aiCorner[4];
+			SVec3 avTex[4];
+			float fTexDiag02, fTexDiag13;
+			tbool bQuadDiagIs_02;
+
+			// the second triangle shares face number and tspace offset
+			pTriB->iOrgFaceNumber = f;
+			pTriB->iTSpacesOffs = iTSpacesOffs;
+
+			// An order independent way to evaluate tspace on quads is
+			// needed, so the quad is cut along its shortest diagonal:
+			// measured in texture space first, in object space on a tie.
+			for (c=0; c<4; c++)
+				aiCorner[c] = MakeIndex(f, c);
+			for (c=0; c<4; c++)
+				avTex[c] = GetTexCoord(pContext, aiCorner[c]);
+			fTexDiag02 = LengthSquared(vsub(avTex[2],avTex[0]));
+			fTexDiag13 = LengthSquared(vsub(avTex[3],avTex[1]));
+
+			if (fTexDiag02<fTexDiag13)
+				bQuadDiagIs_02 = TTRUE;
+			else if (fTexDiag13<fTexDiag02)
+				bQuadDiagIs_02 = TFALSE;
+			else
+			{
+				SVec3 avPos[4];
+				float fPosDiag02, fPosDiag13;
+				for (c=0; c<4; c++)
+					avPos[c] = GetPosition(pContext, aiCorner[c]);
+				fPosDiag02 = LengthSquared(vsub(avPos[2],avPos[0]));
+				fPosDiag13 = LengthSquared(vsub(avPos[3],avPos[1]));
+
+				bQuadDiagIs_02 = fPosDiag13<fPosDiag02 ? TFALSE : TTRUE;
+			}
+
+			// emit both halves of the quad
+			pucSplit = bQuadDiagIs_02 ? s_aucSplit02 : s_aucSplit13;
+			for (c=0; c<3; c++)
+			{
+				pTriA->vert_num[c] = pucSplit[0][c];
+				piOutA[c] = aiCorner[pucSplit[0][c]];
+				pTriB->vert_num[c] = pucSplit[1][c];
+				piOutB[c] = aiCorner[pucSplit[1][c]];
+			}
+			iDstTriIndex += 2;
+		}
+
+		iTSpacesOffs += verts;
+		assert(iDstTriIndex<=iNrTrianglesIn);
+	}
+
+	for (t=0; t<iNrTrianglesIn; t++)
+		pTriInfos[t].iFlag = 0;
+
+	// return total amount of tspaces
+	return iTSpacesOffs;
+}
+
+// Fetches the position of the vertex denoted by the packed index `index`
+// through the m_getPosition callback and returns it as an SVec3.
+static SVec3 GetPosition(const SMikkTSpaceContext * pContext, const int index)
+{
+	float afPos[3];
+	int iFace, iVert;
+	SVec3 vPos;
+
+	// unpack into face number and vertex-of-face number
+	IndexToData(&iFace, &iVert, index);
+	pContext->m_pInterface->m_getPosition(pContext, afPos, iFace, iVert);
+
+	vPos.x = afPos[0];
+	vPos.y = afPos[1];
+	vPos.z = afPos[2];
+	return vPos;
+}
+
+// Fetches the normal of the vertex denoted by the packed index `index`
+// through the m_getNormal callback and returns it as an SVec3.
+static SVec3 GetNormal(const SMikkTSpaceContext * pContext, const int index)
+{
+	float afNormal[3];
+	int iFace, iVert;
+	SVec3 vNormal;
+
+	// unpack into face number and vertex-of-face number
+	IndexToData(&iFace, &iVert, index);
+	pContext->m_pInterface->m_getNormal(pContext, afNormal, iFace, iVert);
+
+	vNormal.x = afNormal[0];
+	vNormal.y = afNormal[1];
+	vNormal.z = afNormal[2];
+	return vNormal;
+}
+
+// Fetches the texture coordinate of the vertex denoted by the packed index
+// `index` through the m_getTexCoord callback. The two components go into
+// x and y of the result; z is always set to 1.
+static SVec3 GetTexCoord(const SMikkTSpaceContext * pContext, const int index)
+{
+	float afUV[2];
+	int iFace, iVert;
+	SVec3 vTex;
+
+	// unpack into face number and vertex-of-face number
+	IndexToData(&iFace, &iVert, index);
+	pContext->m_pInterface->m_getTexCoord(pContext, afUV, iFace, iVert);
+
+	vTex.x = afUV[0];
+	vTex.y = afUV[1];
+	vTex.z = 1.0f;
+	return vTex;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Edge record handed to BuildNeighborsFast(). The union lets the three
+// integers be addressed either by name or by position through array[].
+// NOTE(review): presumably i0/i1 are the two vertex indices of the edge and
+// f the triangle it belongs to - confirm against BuildNeighborsFast().
+// The unnamed struct member relies on C11 anonymous structs (or the
+// equivalent compiler extension).
+typedef union {
+	struct
+	{
+		int i0, i1, f;
+	};
+	int array[3];
+} SEdge;
+
+// forward declarations of the two neighbor matching implementations;
+// only the fast one needs a caller supplied edge buffer
+static void BuildNeighborsFast(STriInfo pTriInfos[], SEdge * pEdges, const int piTriListIn[], const int iNrTrianglesIn);
+static void BuildNeighborsSlow(STriInfo pTriInfos[], const int piTriListIn[], const int iNrTrianglesIn);
+
+// Returns twice the (unsigned) area that the triangle given by the three
+// vertex indices in indices[] covers in texture space.
+static float CalcTexArea(const SMikkTSpaceContext * pContext, const int indices[])
+{
+	const SVec3 vUV0 = GetTexCoord(pContext, indices[0]);
+	const SVec3 vUV1 = GetTexCoord(pContext, indices[1]);
+	const SVec3 vUV2 = GetTexCoord(pContext, indices[2]);
+
+	// the two edges leaving the first corner
+	const float fEdge1S = vUV1.x-vUV0.x;
+	const float fEdge1T = vUV1.y-vUV0.y;
+	const float fEdge2S = vUV2.x-vUV0.x;
+	const float fEdge2T = vUV2.y-vUV0.y;
+
+	// cross product of the edges; the sign depends on the winding
+	float fAreaSTx2 = fEdge1S*fEdge2T - fEdge1T*fEdge2S;
+
+	if (fAreaSTx2<0)
+		fAreaSTx2 = -fAreaSTx2;
+	return fAreaSTx2;
+}
+
+// Evaluates the triangle level attributes and the neighbor list.
+//
+// pTriInfos      - per-triangle info; neighbors, group pointers, first
+//                  order derivatives, magnitudes and flags are (re)written
+// piTriListIn    - welded triangle index list (3 entries per triangle)
+// pContext       - mesh accessors used to fetch vertex attributes
+// iNrTrianglesIn - number of triangles to process
+//
+// Steps: reset the per-triangle state, compute the derivatives and the
+// ORIENT_PRESERVING / GROUP_WITH_ANY flags, make both halves of a quad agree
+// on orientation, then match up shared edges.
+static void InitTriInfo(STriInfo pTriInfos[], const int piTriListIn[], const SMikkTSpaceContext * pContext, const int iNrTrianglesIn)
+{
+	SEdge * pEdges = NULL;
+	int f=0, i=0, t=0;
+
+	// Reset neighbor and derivative data. iFlag was cleared earlier by
+	// GenerateInitialVerticesIndexList(), so bits are only added here:
+	// every triangle is assumed bad until proven otherwise.
+	for (f=0; f<iNrTrianglesIn; f++)
+	{
+		STriInfo * pTri = &pTriInfos[f];
+		for (i=0; i<3; i++)
+		{
+			pTri->FaceNeighbors[i] = -1;
+			pTri->AssignedGroup[i] = NULL;
+		}
+
+		pTri->vOs.x=0.0f; pTri->vOs.y=0.0f; pTri->vOs.z=0.0f;
+		pTri->vOt.x=0.0f; pTri->vOt.y=0.0f; pTri->vOt.z=0.0f;
+		pTri->fMagS = 0;
+		pTri->fMagT = 0;
+
+		pTri->iFlag |= GROUP_WITH_ANY;
+	}
+
+	// evaluate first order derivatives
+	for (f=0; f<iNrTrianglesIn; f++)
+	{
+		STriInfo * pTri = &pTriInfos[f];
+		const int * piCorner = &piTriListIn[f*3];
+
+		const SVec3 v1 = GetPosition(pContext, piCorner[0]);
+		const SVec3 v2 = GetPosition(pContext, piCorner[1]);
+		const SVec3 v3 = GetPosition(pContext, piCorner[2]);
+		const SVec3 t1 = GetTexCoord(pContext, piCorner[0]);
+		const SVec3 t2 = GetTexCoord(pContext, piCorner[1]);
+		const SVec3 t3 = GetTexCoord(pContext, piCorner[2]);
+
+		// edges leaving the first corner, in texture and in object space
+		const float fEdge1S = t2.x-t1.x;
+		const float fEdge1T = t2.y-t1.y;
+		const float fEdge2S = t3.x-t1.x;
+		const float fEdge2T = t3.y-t1.y;
+		const SVec3 vEdge1 = vsub(v2,v1);
+		const SVec3 vEdge2 = vsub(v3,v1);
+
+		const float fSignedAreaSTx2 = fEdge1S*fEdge2T - fEdge1T*fEdge2S;
+		SVec3 vOs = vsub(vscale(fEdge2T,vEdge1), vscale(fEdge1T,vEdge2));	// eq 18
+		SVec3 vOt = vadd(vscale(-fEdge2S,vEdge1), vscale(fEdge1S,vEdge2));	// eq 19
+
+		if (fSignedAreaSTx2>0)
+			pTri->iFlag |= ORIENT_PRESERVING;
+
+		if ( NotZero(fSignedAreaSTx2) )
+		{
+			const float fAbsArea = fabsf(fSignedAreaSTx2);
+			const float fLenOs = Length(vOs);
+			const float fLenOt = Length(vOt);
+			// flip the derivatives of orientation reversing triangles
+			const float fS = (pTri->iFlag&ORIENT_PRESERVING)!=0 ? 1.0f : (-1.0f);
+			if ( NotZero(fLenOs) ) pTri->vOs = vscale(fS/fLenOs, vOs);
+			if ( NotZero(fLenOt) ) pTri->vOt = vscale(fS/fLenOt, vOt);
+
+			// the magnitudes use the lengths prior to normalization
+			pTri->fMagS = fLenOs / fAbsArea;
+			pTri->fMagT = fLenOt / fAbsArea;
+
+			// a good triangle no longer groups with just anything
+			if ( NotZero(pTri->fMagS) && NotZero(pTri->fMagT) )
+				pTri->iFlag &= (~GROUP_WITH_ANY);
+		}
+	}
+
+	// force otherwise healthy quads to a fixed orientation
+	while (t<(iNrTrianglesIn-1))
+	{
+		STriInfo * pTriA = &pTriInfos[t];
+		STriInfo * pTriB = &pTriInfos[t+1];
+
+		// consecutive triangles of different faces: not a quad, move on by one
+		if (pTriA->iOrgFaceNumber!=pTriB->iOrgFaceNumber)
+		{
+			++t;
+			continue;
+		}
+
+		// degenerate triangles should already have been moved out of the
+		// way by DegenPrologue(), but check both halves just in case
+		if ((pTriA->iFlag&MARK_DEGENERATE)==0 && (pTriB->iFlag&MARK_DEGENERATE)==0)
+		{
+			const tbool bOrientA = (pTriA->iFlag&ORIENT_PRESERVING)!=0 ? TTRUE : TFALSE;
+			const tbool bOrientB = (pTriB->iFlag&ORIENT_PRESERVING)!=0 ? TTRUE : TFALSE;
+
+			// if this happens the quad has extremely bad mapping!!
+			if (bOrientA!=bOrientB)
+			{
+				// the first triangle dictates the orientation when the
+				// second one is group-with-any or has no larger tex area
+				tbool bChooseOrientFirstTri = TFALSE;
+				if ((pTriB->iFlag&GROUP_WITH_ANY)!=0) bChooseOrientFirstTri = TTRUE;
+				else if ( CalcTexArea(pContext, &piTriListIn[t*3+0]) >= CalcTexArea(pContext, &piTriListIn[(t+1)*3+0]) )
+					bChooseOrientFirstTri = TTRUE;
+
+				// force match
+				{
+					const STriInfo * pSrc = bChooseOrientFirstTri ? pTriA : pTriB;
+					STriInfo * pDst = bChooseOrientFirstTri ? pTriB : pTriA;
+					pDst->iFlag &= (~ORIENT_PRESERVING);	// clear first
+					pDst->iFlag |= (pSrc->iFlag&ORIENT_PRESERVING);	// copy bit
+				}
+			}
+		}
+		t += 2;
+	}
+
+	// match up edge pairs; without the edge buffer only the slow path works
+	pEdges = (SEdge *) malloc(sizeof(SEdge)*iNrTrianglesIn*3);
+	if (pEdges!=NULL)
+	{
+		BuildNeighborsFast(pTriInfos, pEdges, piTriListIn, iNrTrianglesIn);
+		free(pEdges);
+	}
+	else
+		BuildNeighborsSlow(pTriInfos, piTriListIn, iNrTrianglesIn);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// forward declarations of the group building helpers defined further down
+static tbool AssignRecur(const int piTriListIn[], STriInfo psTriInfos[], const int iMyTriIndex, SGroup * pGroup);
+static void AddTriToGroup(SGroup * pGroup, const int iTriIndex);
+
+// Partitions the triangle corners into groups.
+//
+// pTriInfos              - per-triangle info; AssignedGroup[] is filled in
+// pGroups                - storage for up to iNrTrianglesIn*3 groups
+// piGroupTrianglesBuffer - backing store for the face lists of all groups
+// piTriListIn            - welded triangle index list
+// iNrTrianglesIn         - number of triangles
+//
+// Every corner of a triangle that is not flagged GROUP_WITH_ANY and has no
+// group yet starts a new group, which is then grown through AssignRecur()
+// starting at the two neighbors stored for that corner.
+// Returns the number of groups created.
+static int Build4RuleGroups(STriInfo pTriInfos[], SGroup pGroups[], int piGroupTrianglesBuffer[], const int piTriListIn[], const int iNrTrianglesIn)
+{
+	const int iNrMaxGroups = iNrTrianglesIn*3;
+	int iNrActiveGroups = 0;
+	int iOffset = 0, f=0, i=0, n=0;
+	(void)iNrMaxGroups;  /* quiet warnings in non debug mode */
+
+	for (f=0; f<iNrTrianglesIn; f++)
+	{
+		STriInfo * pTri = &pTriInfos[f];
+		for (i=0; i<3; i++)
+		{
+			SGroup * pNewGroup = NULL;
+			tbool bOrPre;
+			int aiNeighbor[2];
+
+			// nothing to do for group-with-any triangles and for corners
+			// that already belong to a group
+			if ((pTri->iFlag&GROUP_WITH_ANY)!=0 || pTri->AssignedGroup[i]!=NULL)
+				continue;
+
+			// open a new group around this corner's vertex
+			assert(iNrActiveGroups<iNrMaxGroups);
+			pNewGroup = &pGroups[iNrActiveGroups];
+			pTri->AssignedGroup[i] = pNewGroup;
+			pNewGroup->iVertexRepresentitive = piTriListIn[f*3+i];
+			pNewGroup->bOrientPreservering = (pTri->iFlag&ORIENT_PRESERVING)!=0;
+			pNewGroup->iNrFaces = 0;
+			pNewGroup->pFaceIndices = &piGroupTrianglesBuffer[iOffset];
+			++iNrActiveGroups;
+
+			AddTriToGroup(pNewGroup, f);
+			bOrPre = (pTri->iFlag&ORIENT_PRESERVING)!=0 ? TTRUE : TFALSE;
+
+			// grow the group, first through the neighbor stored for this
+			// corner and then through the one stored for the corner before
+			aiNeighbor[0] = pTri->FaceNeighbors[i];
+			aiNeighbor[1] = pTri->FaceNeighbors[i>0?(i-1):2];
+			for (n=0; n<2; n++)
+			{
+				if (aiNeighbor[n]>=0) // neighbor
+				{
+					const tbool bAnswer =
+						AssignRecur(piTriListIn, pTriInfos, aiNeighbor[n], pNewGroup);
+
+					// a refusal is only legal for a differing orientation
+					const tbool bOrPre2 = (pTriInfos[aiNeighbor[n]].iFlag&ORIENT_PRESERVING)!=0 ? TTRUE : TFALSE;
+					const tbool bDiff = bOrPre!=bOrPre2 ? TTRUE : TFALSE;
+					assert(bAnswer || bDiff);
+					(void)bAnswer, (void)bDiff;  /* quiet warnings in non debug mode */
+				}
+			}
+
+			// The next group's face list starts right behind this one.
+			// The groups are disjoint, so a triangle never belongs to
+			// more than 3 of them and the buffer cannot overflow unless
+			// something is completely broken.
+			iOffset += pNewGroup->iNrFaces;
+			assert(iOffset <= iNrMaxGroups);
+		}
+	}
+
+	return iNrActiveGroups;
+}
+
+// Appends triangle iTriIndex to the face list of pGroup and bumps the
+// group's face count. No bounds checking is done here.
+static void AddTriToGroup(SGroup * pGroup, const int iTriIndex)
+{
+	const int iSlot = pGroup->iNrFaces;
+
+	pGroup->pFaceIndices[iSlot] = iTriIndex;
+	pGroup->iNrFaces = iSlot + 1;
+}
+
+// Tries to add triangle iMyTriIndex to pGroup and, when that succeeds,
+// continues recursively with two of the triangle's neighbors.
+//
+// piTriListIn - welded triangle index list (3 entries per triangle)
+// psTriInfos  - per-triangle info; AssignedGroup[] and, for group-with-any
+//               triangles, the ORIENT_PRESERVING bit of iFlag are updated
+// iMyTriIndex - triangle to examine
+// pGroup      - group being grown around pGroup->iVertexRepresentitive
+//
+// Returns TTRUE when the triangle's corner at the group vertex belongs to
+// pGroup after the call (newly added or already a member). Returns TFALSE
+// when that corner is owned by another group or when the triangle's
+// orientation differs from the group's.
+static tbool AssignRecur(const int piTriListIn[], STriInfo psTriInfos[],
+				 const int iMyTriIndex, SGroup * pGroup)
+{
+	STriInfo * pMyTriInfo = &psTriInfos[iMyTriIndex];
+
+	// track down vertex
+	// (i becomes the corner of this triangle that uses the group's vertex)
+	const int iVertRep = pGroup->iVertexRepresentitive;
+	const int * pVerts = &piTriListIn[3*iMyTriIndex+0];
+	int i=-1;
+	if (pVerts[0]==iVertRep) i=0;
+	else if (pVerts[1]==iVertRep) i=1;
+	else if (pVerts[2]==iVertRep) i=2;
+	assert(i>=0 && i<3);
+
+	// early out
+	if (pMyTriInfo->AssignedGroup[i] == pGroup) return TTRUE;
+	else if (pMyTriInfo->AssignedGroup[i]!=NULL) return TFALSE;
+	if ((pMyTriInfo->iFlag&GROUP_WITH_ANY)!=0)
+	{
+		// first to group with a group-with-anything triangle
+		// determines it's orientation.
+		// This is the only existing order dependency in the code!!
+		if ( pMyTriInfo->AssignedGroup[0] == NULL &&
+			pMyTriInfo->AssignedGroup[1] == NULL &&
+			pMyTriInfo->AssignedGroup[2] == NULL )
+		{
+			pMyTriInfo->iFlag &= (~ORIENT_PRESERVING);
+			pMyTriInfo->iFlag |= (pGroup->bOrientPreservering ? ORIENT_PRESERVING : 0);
+		}
+	}
+	{
+		// triangles of opposite orientation never share a group
+		const tbool bOrient = (pMyTriInfo->iFlag&ORIENT_PRESERVING)!=0 ? TTRUE : TFALSE;
+		if (bOrient != pGroup->bOrientPreservering) return TFALSE;
+	}
+
+	AddTriToGroup(pGroup, iMyTriIndex);
+	pMyTriInfo->AssignedGroup[i] = pGroup;
+
+	{
+		// continue with the neighbor stored for corner i and the one stored
+		// for the corner before it; their results are deliberately ignored.
+		// NOTE(review): presumably these are the two edges that touch the
+		// group vertex - confirm against BuildNeighborsFast/Slow().
+		const int neigh_indexL = pMyTriInfo->FaceNeighbors[i];
+		const int neigh_indexR = pMyTriInfo->FaceNeighbors[i>0?(i-1):2];
+		if (neigh_indexL>=0)
+			AssignRecur(piTriListIn, psTriInfos, neigh_indexL, pGroup);
+		if (neigh_indexR>=0)
+			AssignRecur(piTriListIn, psTriInfos, neigh_indexR, pGroup);
+	}
+
+
+
+	return TTRUE;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// forward declarations of helpers that are needed before their definitions
+static tbool CompareSubGroups(const SSubGroup * pg1, const SSubGroup * pg2);
+static void QuickSort(int* pSortBuffer, int iLeft, int iRight, unsigned int uSeed);
+static STSpace EvalTspace(int face_indices[], const int iFaces, const int piTriListIn[], const STriInfo pTriInfos[], const SMikkTSpaceContext * pContext, const int iVertexRepresentitive);
+
+static tbool GenerateTSpaces(STSpace psTspace[], const STriInfo pTriInfos[], const SGroup pGroups[],
+                             const int iNrActiveGroups, const int piTriListIn[], const float fThresCos,
+                             const SMikkTSpaceContext * pContext)
+{
+	STSpace * pSubGroupTspace = NULL;
+	SSubGroup * pUniSubGroups = NULL;
+	int * pTmpMembers = NULL;
+	int iMaxNrFaces=0, iUniqueTspaces=0, g=0, i=0;
+	for (g=0; g<iNrActiveGroups; g++)
+		if (iMaxNrFaces < pGroups[g].iNrFaces)
+			iMaxNrFaces = pGroups[g].iNrFaces;
+
+	if (iMaxNrFaces == 0) return TTRUE;
+
+	// make initial allocations
+	pSubGroupTspace = (STSpace *) malloc(sizeof(STSpace)*iMaxNrFaces);
+	pUniSubGroups = (SSubGroup *) malloc(sizeof(SSubGroup)*iMaxNrFaces);
+	pTmpMembers = (int *) malloc(sizeof(int)*iMaxNrFaces);
+	if (pSubGroupTspace==NULL || pUniSubGroups==NULL || pTmpMembers==NULL)
+	{
+		if (pSubGroupTspace!=NULL) free(pSubGroupTspace);
+		if (pUniSubGroups!=NULL) free(pUniSubGroups);
+		if (pTmpMembers!=NULL) free(pTmpMembers);
+		return TFALSE;
+	}
+
+
+	iUniqueTspaces = 0;
+	for (g=0; g<iNrActiveGroups; g++)
+	{
+		const SGroup * pGroup = &pGroups[g];
+		int iUniqueSubGroups = 0, s=0;
+
+		for (i=0; i<pGroup->iNrFaces; i++)	// triangles
+		{
+			const int f = pGroup->pFaceIndices[i];	// triangle number
+			int index=-1, iVertIndex=-1, iOF_1=-1, iMembers=0, j=0, l=0;
+			SSubGroup tmp_group;
+			tbool bFound;
+			SVec3 n, vOs, vOt;
+			if (pTriInfos[f].AssignedGroup[0]==pGroup) index=0;
+			else if (pTriInfos[f].AssignedGroup[1]==pGroup) index=1;
+			else if (pTriInfos[f].AssignedGroup[2]==pGroup) index=2;
+			assert(index>=0 && index<3);
+
+			iVertIndex = piTriListIn[f*3+index];
+			assert(iVertIndex==pGroup->iVertexRepresentitive);
+
+			// is normalized already
+			n = GetNormal(pContext, iVertIndex);
+			
+			// project
+			vOs = vsub(pTriInfos[f].vOs, vscale(vdot(n,pTriInfos[f].vOs), n));
+			vOt = vsub(pTriInfos[f].vOt, vscale(vdot(n,pTriInfos[f].vOt), n));
+			if ( VNotZero(vOs) ) vOs = Normalize(vOs);
+			if ( VNotZero(vOt) ) vOt = Normalize(vOt);
+
+			// original face number
+			iOF_1 = pTriInfos[f].iOrgFaceNumber;
+			
+			iMembers = 0;
+			for (j=0; j<pGroup->iNrFaces; j++)
+			{
+				const int t = pGroup->pFaceIndices[j];	// triangle number
+				const int iOF_2 = pTriInfos[t].iOrgFaceNumber;
+
+				// project
+				SVec3 vOs2 = vsub(pTriInfos[t].vOs, vscale(vdot(n,pTriInfos[t].vOs), n));
+				SVec3 vOt2 = vsub(pTriInfos[t].vOt, vscale(vdot(n,pTriInfos[t].vOt), n));
+				if ( VNotZero(vOs2) ) vOs2 = Normalize(vOs2);
+				if ( VNotZero(vOt2) ) vOt2 = Normalize(vOt2);
+
+				{
+					const tbool bAny = ( (pTriInfos[f].iFlag | pTriInfos[t].iFlag) & GROUP_WITH_ANY )!=0 ? TTRUE : TFALSE;
+					// make sure triangles which belong to the same quad are joined.
+					const tbool bSameOrgFace = iOF_1==iOF_2 ? TTRUE : TFALSE;
+
+					const float fCosS = vdot(vOs,vOs2);
+					const float fCosT = vdot(vOt,vOt2);
+
+					assert(f!=t || bSameOrgFace);	// sanity check
+					if (bAny || bSameOrgFace || (fCosS>fThresCos && fCosT>fThresCos))
+						pTmpMembers[iMembers++] = t;
+				}
+			}
+
+			// sort pTmpMembers
+			tmp_group.iNrFaces = iMembers;
+			tmp_group.pTriMembers = pTmpMembers;
+			if (iMembers>1)
+			{
+				unsigned int uSeed = INTERNAL_RND_SORT_SEED;	// could replace with a random seed?
+				QuickSort(pTmpMembers, 0, iMembers-1, uSeed);
+			}
+
+			// look for an existing match
+			bFound = TFALSE;
+			l=0;
+			while (l<iUniqueSubGroups && !bFound)
+			{
+				bFound = CompareSubGroups(&tmp_group, &pUniSubGroups[l]);
+				if (!bFound) ++l;
+			}
+			
+			// assign tangent space index
+			assert(bFound || l==iUniqueSubGroups);
+			//piTempTangIndices[f*3+index] = iUniqueTspaces+l;
+
+			// if no match was found we allocate a new subgroup
+			if (!bFound)
+			{
+				// insert new subgroup
+				int * pIndices = (int *) malloc(sizeof(int)*iMembers);
+				if (pIndices==NULL)
+				{
+					// clean up and return false
+					int s=0;
+					for (s=0; s<iUniqueSubGroups; s++)
+						free(pUniSubGroups[s].pTriMembers);
+					free(pUniSubGroups);
+					free(pTmpMembers);
+					free(pSubGroupTspace);
+					return TFALSE;
+				}
+				pUniSubGroups[iUniqueSubGroups].iNrFaces = iMembers;
+				pUniSubGroups[iUniqueSubGroups].pTriMembers = pIndices;
+				memcpy(pIndices, tmp_group.pTriMembers, iMembers*sizeof(int));
+				pSubGroupTspace[iUniqueSubGroups] =
+					EvalTspace(tmp_group.pTriMembers, iMembers, piTriListIn, pTriInfos, pContext, pGroup->iVertexRepresentitive);
+				++iUniqueSubGroups;
+			}
+
+			// output tspace
+			{
+				const int iOffs = pTriInfos[f].iTSpacesOffs;
+				const int iVert = pTriInfos[f].vert_num[index];
+				STSpace * pTS_out = &psTspace[iOffs+iVert];
+				assert(pTS_out->iCounter<2);
+				assert(((pTriInfos[f].iFlag&ORIENT_PRESERVING)!=0) == pGroup->bOrientPreservering);
+				if (pTS_out->iCounter==1)
+				{
+					*pTS_out = AvgTSpace(pTS_out, &pSubGroupTspace[l]);
+					pTS_out->iCounter = 2;	// update counter
+					pTS_out->bOrient = pGroup->bOrientPreservering;
+				}
+				else
+				{
+					assert(pTS_out->iCounter==0);
+					*pTS_out = pSubGroupTspace[l];
+					pTS_out->iCounter = 1;	// update counter
+					pTS_out->bOrient = pGroup->bOrientPreservering;
+				}
+			}
+		}
+
+		// clean up and offset iUniqueTspaces
+		for (s=0; s<iUniqueSubGroups; s++)
+			free(pUniSubGroups[s].pTriMembers);
+		iUniqueTspaces += iUniqueSubGroups;
+	}
+
+	// clean up
+	free(pUniSubGroups);
+	free(pTmpMembers);
+	free(pSubGroupTspace);
+
+	return TTRUE;
+}
+// EvalTspace: angle-weighted average of the tangent frames of the triangles face_indices[0..iFaces-1] at the corner holding iVertexRepresentitive.
+static STSpace EvalTspace(int face_indices[], const int iFaces, const int piTriListIn[], const STriInfo pTriInfos[],
+                          const SMikkTSpaceContext * pContext, const int iVertexRepresentitive)
+{
+	STSpace res;
+	float fAngleSum = 0;	// running sum of the corner angles used as weights
+	int face=0;
+	res.vOs.x=0.0f; res.vOs.y=0.0f; res.vOs.z=0.0f;
+	res.vOt.x=0.0f; res.vOt.y=0.0f; res.vOt.z=0.0f;
+	res.fMagS = 0; res.fMagT = 0;
+	// only vOs, vOt, fMagS and fMagT of res are assigned in this function
+	for (face=0; face<iFaces; face++)
+	{
+		const int f = face_indices[face];
+
+		// only triangles without the GROUP_WITH_ANY flag get to add their contribution
+		if ( (pTriInfos[f].iFlag&GROUP_WITH_ANY)==0 )
+		{
+			SVec3 n, vOs, vOt, p0, p1, p2, v1, v2;
+			float fCos, fAngle, fMagS, fMagT;
+			int i=-1, index=-1, i0=-1, i1=-1, i2=-1;
+			if (piTriListIn[3*f+0]==iVertexRepresentitive) i=0;
+			else if (piTriListIn[3*f+1]==iVertexRepresentitive) i=1;
+			else if (piTriListIn[3*f+2]==iVertexRepresentitive) i=2;
+			assert(i>=0 && i<3);	// the representative vertex must be one of this triangle's corners
+
+			// project: remove the component along the vertex normal n from the face's vOs and vOt
+			index = piTriListIn[3*f+i];
+			n = GetNormal(pContext, index);
+			vOs = vsub(pTriInfos[f].vOs, vscale(vdot(n,pTriInfos[f].vOs), n));
+			vOt = vsub(pTriInfos[f].vOt, vscale(vdot(n,pTriInfos[f].vOt), n));
+			if ( VNotZero(vOs) ) vOs = Normalize(vOs);
+			if ( VNotZero(vOt) ) vOt = Normalize(vOt);
+			// i1 is the representative corner, i0 the corner before it and i2 the corner after it (cyclic)
+			i2 = piTriListIn[3*f + (i<2?(i+1):0)];
+			i1 = piTriListIn[3*f + i];
+			i0 = piTriListIn[3*f + (i>0?(i-1):2)];
+
+			p0 = GetPosition(pContext, i0);
+			p1 = GetPosition(pContext, i1);
+			p2 = GetPosition(pContext, i2);
+			v1 = vsub(p0,p1);
+			v2 = vsub(p2,p1);
+
+			// project both edge vectors the same way and normalize them when non-zero
+			v1 = vsub(v1, vscale(vdot(n,v1),n)); if ( VNotZero(v1) ) v1 = Normalize(v1);
+			v2 = vsub(v2, vscale(vdot(n,v2),n)); if ( VNotZero(v2) ) v2 = Normalize(v2);
+
+			// weight contribution by the angle
+			// between the two edge vectors
+			fCos = vdot(v1,v2); fCos=fCos>1?1:(fCos<(-1) ? (-1) : fCos);	// clamped to [-1,1] before acos
+			fAngle = (float) acos(fCos);
+			fMagS = pTriInfos[f].fMagS;
+			fMagT = pTriInfos[f].fMagT;
+
+			res.vOs=vadd(res.vOs, vscale(fAngle,vOs));
+			res.vOt=vadd(res.vOt,vscale(fAngle,vOt));
+			res.fMagS+=(fAngle*fMagS);
+			res.fMagT+=(fAngle*fMagT);
+			fAngleSum += fAngle;
+		}
+	}
+
+	// normalize the summed directions (when non-zero) and turn the magnitude sums into angle-weighted means (when any angle was accumulated)
+	if ( VNotZero(res.vOs) ) res.vOs = Normalize(res.vOs);
+	if ( VNotZero(res.vOt) ) res.vOt = Normalize(res.vOt);
+	if (fAngleSum>0)
+	{
+		res.fMagS /= fAngleSum;
+		res.fMagT /= fAngleSum;
+	}
+
+	return res;
+}
+
+static tbool CompareSubGroups(const SSubGroup * pg1, const SSubGroup * pg2)
+{
+	// Two subgroups are equal when they have the same face count and list the
+	// same triangle numbers position by position (the comparison is order-sensitive).
+	int k;
+	if (pg1->iNrFaces != pg2->iNrFaces)
+		return TFALSE;
+	for (k=0; k<pg1->iNrFaces; ++k)
+		if (pg1->pTriMembers[k] != pg2->pTriMembers[k])
+			return TFALSE;
+	return TTRUE;
+}
+
+static void QuickSort(int* pSortBuffer, int iLeft, int iRight, unsigned int uSeed)
+{
+	// Sorts pSortBuffer[iLeft..iRight] (inclusive bounds) into ascending order, in place.
+	// uSeed drives the pseudo-random pivot choice: same seed and input, same pivots.
+	int iL, iR, n, index, iMid, iTmp;
+	unsigned int t;
+
+	// Nothing to do for an empty or one-element range. This also keeps n below
+	// from being zero, which would make uSeed%n a division by zero.
+	if (iRight <= iLeft) return;
+
+	// Random: rotate uSeed left by its low five bits. The right-shift count is
+	// masked because a shift by 32 (when t==0) is undefined on a 32-bit unsigned.
+	t=uSeed&31;
+	t=(uSeed<<t)|(uSeed>>((32-t)&31));
+	uSeed=uSeed+t+3;
+	// Random end
+
+	iL=iLeft; iR=iRight;
+	n = (iR-iL)+1;
+	assert(n>0);
+	index = (int) (uSeed%n);
+	iMid=pSortBuffer[index + iL];	// pivot value
+	do
+	{
+		while (pSortBuffer[iL] < iMid) ++iL;
+		while (pSortBuffer[iR] > iMid) --iR;
+		if (iL <= iR)
+		{
+			iTmp = pSortBuffer[iL];
+			pSortBuffer[iL] = pSortBuffer[iR];
+			pSortBuffer[iR] = iTmp;
+			++iL; --iR;
+		}
+	}
+	while (iL <= iR);
+
+	if (iLeft < iR) QuickSort(pSortBuffer, iLeft, iR, uSeed);
+	if (iL < iRight) QuickSort(pSortBuffer, iL, iRight, uSeed);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////
+
+static void QuickSortEdges(SEdge * pSortBuffer, int iLeft, int iRight, const int channel, unsigned int uSeed);
+static void GetEdge(int * i0_out, int * i1_out, int * edgenum_out, const int indices[], const int i0_in, const int i1_in);
+// BuildNeighborsFast: fills pTriInfos[].FaceNeighbors by sorting the 3*iNrTrianglesIn edges written to pEdges and pairing adjacent entries that share an edge traversed in opposite directions.
+static void BuildNeighborsFast(STriInfo pTriInfos[], SEdge * pEdges, const int piTriListIn[], const int iNrTrianglesIn)
+{
+	// build array of edges
+	unsigned int uSeed = INTERNAL_RND_SORT_SEED;				// could replace with a random seed?
+	int iEntries=0, iCurStartIndex=-1, f=0, i=0;
+	for (f=0; f<iNrTrianglesIn; f++)
+		for (i=0; i<3; i++)
+		{
+			const int i0 = piTriListIn[f*3+i];
+			const int i1 = piTriListIn[f*3+(i<2?(i+1):0)];
+			pEdges[f*3+i].i0 = i0 < i1 ? i0 : i1;			// put minimum index in i0
+			pEdges[f*3+i].i1 = !(i0 < i1) ? i0 : i1;		// put maximum index in i1
+			pEdges[f*3+i].f = f;							// record face number
+		}
+
+	// sort over all edges by i0, this is the pricy one.
+	QuickSortEdges(pEdges, 0, iNrTrianglesIn*3-1, 0, uSeed);	// sort channel 0 which is i0
+
+	// sub sort over i1, should be fast.
+	// could replace this with a 64 bit int sort over (i0,i1)
+	// with i0 as msb in the quicksort call above.
+	iEntries = iNrTrianglesIn*3;
+	iCurStartIndex = 0;
+	for (i=1; i<iEntries; i++)
+	{
+		if (pEdges[iCurStartIndex].i0 != pEdges[i].i0)
+		{
+			const int iL = iCurStartIndex;
+			const int iR = i-1;
+			//const int iElems = i-iL;
+			iCurStartIndex = i;
+			QuickSortEdges(pEdges, iL, iR, 1, uSeed);	// sort channel 1 which is i1
+		}
+	}
+	// NOTE(review): a run is sorted only when a later entry starts a new run, so the last run (largest i0) is never sub-sorted on i1 here - confirm this is intended.
+	// sub sort over f, which should be fast.
+	// this step is to remain compliant with BuildNeighborsSlow() when
+	// more than 2 triangles use the same edge (such as a butterfly topology).
+	iCurStartIndex = 0;
+	for (i=1; i<iEntries; i++)
+	{
+		if (pEdges[iCurStartIndex].i0 != pEdges[i].i0 || pEdges[iCurStartIndex].i1 != pEdges[i].i1)
+		{
+			const int iL = iCurStartIndex;
+			const int iR = i-1;
+			//const int iElems = i-iL;
+			iCurStartIndex = i;
+			QuickSortEdges(pEdges, iL, iR, 2, uSeed);	// sort channel 2 which is f
+		}
+	}
+	// NOTE(review): as above, the last run of equal (i0,i1) is not sub-sorted on f by this loop - confirm this is intended.
+	// pair up, adjacent triangles
+	for (i=0; i<iEntries; i++)
+	{
+		const int i0=pEdges[i].i0;
+		const int i1=pEdges[i].i1;
+		const int f = pEdges[i].f;
+		tbool bUnassigned_A;
+
+		int i0_A, i1_A;
+		int edgenum_A, edgenum_B=0;	// 0,1 or 2
+		GetEdge(&i0_A, &i1_A, &edgenum_A, &piTriListIn[f*3], i0, i1);	// resolve index ordering and edge_num
+		bUnassigned_A = pTriInfos[f].FaceNeighbors[edgenum_A] == -1 ? TTRUE : TFALSE;
+
+		if (bUnassigned_A)
+		{
+			// get true index ordering
+			int j=i+1, t;
+			tbool bNotFound = TTRUE;
+			while (j<iEntries && i0==pEdges[j].i0 && i1==pEdges[j].i1 && bNotFound)	// only scans the entries directly following i that hold the same (i0,i1)
+			{
+				tbool bUnassigned_B;
+				int i0_B, i1_B;
+				t = pEdges[j].f;
+				// flip i0_B and i1_B
+				GetEdge(&i1_B, &i0_B, &edgenum_B, &piTriListIn[t*3], pEdges[j].i0, pEdges[j].i1);	// resolve index ordering and edge_num
+				//assert(!(i0_A==i1_B && i1_A==i0_B));
+				bUnassigned_B =  pTriInfos[t].FaceNeighbors[edgenum_B]==-1 ? TTRUE : TFALSE;
+				if (i0_A==i0_B && i1_A==i1_B && bUnassigned_B)
+					bNotFound = TFALSE;
+				else
+					++j;
+			}
+
+			if (!bNotFound)
+			{
+				int t = pEdges[j].f;
+				pTriInfos[f].FaceNeighbors[edgenum_A] = t;
+				//assert(pTriInfos[t].FaceNeighbors[edgenum_B]==-1);
+				pTriInfos[t].FaceNeighbors[edgenum_B] = f;
+			}
+		}
+	}
+}
+
+static void BuildNeighborsSlow(STriInfo pTriInfos[], const int piTriListIn[], const int iNrTrianglesIn)
+{
+	// Brute-force neighbour search: every edge whose FaceNeighbors entry is still -1
+	// is compared against all three edges of every other triangle, in triangle order.
+	int iTri, iEdge;
+
+	for (iTri=0; iTri<iNrTrianglesIn; ++iTri)
+	{
+		for (iEdge=0; iEdge<3; ++iEdge)
+		{
+			int iStart, iEnd, iOther, iOtherEdge, iHitTri=-1, iHitEdge=-1;
+			if (pTriInfos[iTri].FaceNeighbors[iEdge] != -1)
+				continue;	// this edge already has a neighbour
+
+			// end points of this edge, in this triangle's own order
+			iStart = piTriListIn[iTri*3+iEdge];
+			iEnd = piTriListIn[iTri*3+(iEdge<2?(iEdge+1):0)];
+
+			// take the first edge of another triangle that joins the same
+			// two vertices but runs in the reverse direction
+			for (iOther=0; iOther<iNrTrianglesIn && iHitTri<0; ++iOther)
+			{
+				if (iOther==iTri)
+					continue;
+				for (iOtherEdge=0; iOtherEdge<3; ++iOtherEdge)
+				{
+					const int iRevEnd = piTriListIn[iOther*3+iOtherEdge];
+					const int iRevStart = piTriListIn[iOther*3+(iOtherEdge<2?(iOtherEdge+1):0)];
+					if (iStart==iRevStart && iEnd==iRevEnd)
+					{
+						iHitTri = iOther;
+						iHitEdge = iOtherEdge;
+						break;
+					}
+				}
+			}
+
+			// link the two triangles to each other; whatever was stored on the
+			// other triangle's edge is overwritten, since it is not checked first
+			if (iHitTri>=0)
+			{
+				pTriInfos[iTri].FaceNeighbors[iEdge] = iHitTri;
+				//assert(pTriInfos[iHitTri].FaceNeighbors[iHitEdge]==-1);
+				pTriInfos[iHitTri].FaceNeighbors[iHitEdge] = iTri;
+			}
+		}
+	}
+}
+
+static void QuickSortEdges(SEdge * pSortBuffer, int iLeft, int iRight, const int channel, unsigned int uSeed)
+{
+	// Sorts pSortBuffer[iLeft..iRight] (inclusive bounds) in place, ascending by
+	// array[channel]; the callers in this file pass channel 0, 1 or 2.
+	// uSeed drives the pseudo-random pivot choice: same seed and input, same pivots.
+	unsigned int t;
+	int iL, iR, n, index, iMid;
+
+	// early out: fewer than two elements need no work, two need at most one swap
+	SEdge sTmp;
+	const int iElems = iRight-iLeft+1;
+	if (iElems<2) return;
+	else if (iElems==2)
+	{
+		if (pSortBuffer[iLeft].array[channel] > pSortBuffer[iRight].array[channel])
+		{
+			sTmp = pSortBuffer[iLeft];
+			pSortBuffer[iLeft] = pSortBuffer[iRight];
+			pSortBuffer[iRight] = sTmp;
+		}
+		return;
+	}
+
+	// Random: rotate uSeed left by its low five bits. The right-shift count is
+	// masked because a shift by 32 (when t==0) is undefined on a 32-bit unsigned.
+	t=uSeed&31;
+	t=(uSeed<<t)|(uSeed>>((32-t)&31));
+	uSeed=uSeed+t+3;
+	// Random end
+
+	iL=iLeft, iR=iRight;
+	n = (iR-iL)+1;
+	assert(n>=0);
+	index = (int) (uSeed%n);
+	iMid=pSortBuffer[index + iL].array[channel];	// pivot key
+
+	do
+	{
+		while (pSortBuffer[iL].array[channel] < iMid) ++iL;
+		while (pSortBuffer[iR].array[channel] > iMid) --iR;
+		if (iL <= iR)
+		{
+			sTmp = pSortBuffer[iL];
+			pSortBuffer[iL] = pSortBuffer[iR];
+			pSortBuffer[iR] = sTmp;
+			++iL; --iR;
+		}
+	}
+	while (iL <= iR);
+
+	if (iLeft < iR)
+		QuickSortEdges(pSortBuffer, iLeft, iR, channel, uSeed);
+	if (iL < iRight)
+		QuickSortEdges(pSortBuffer, iL, iRight, channel, uSeed);
+}
+
+// Resolve which edge (0, 1 or 2) of a triangle joins the vertex pair (i0_in, i1_in),
+// and report that edge's end points in the triangle's own order.
+static void GetEdge(int * i0_out, int * i1_out, int * edgenum_out, const int indices[], const int i0_in, const int i1_in)
+{
+	// indices[] holds the triangle's three vertex indices. Edge k runs from
+	// indices[k] to the next corner, wrapping around after the third:
+	//   edge 0 : indices[0] -> indices[1]
+	//   edge 1 : indices[1] -> indices[2]
+	//   edge 2 : indices[2] -> indices[0]
+	// The pair (i0_in, i1_in) may be given in either order.
+	const int bFirstOnEdge = (indices[0]==i0_in || indices[0]==i1_in) ? 1 : 0;
+	int iEdge;
+
+	if (!bFirstOnEdge)
+	{
+		// indices[0] is not an end point, so the edge is taken to be the second one
+		iEdge = 1;
+	}
+	else
+	{
+		// indices[0] is an end point: indices[1] decides between the first and third edge
+		const int bSecondOnEdge = (indices[1]==i0_in || indices[1]==i1_in) ? 1 : 0;
+		iEdge = bSecondOnEdge ? 0 : 2;
+	}
+
+	edgenum_out[0] = iEdge;
+	i0_out[0] = indices[iEdge];
+	i1_out[0] = indices[iEdge<2 ? (iEdge+1) : 0];
+}
+
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////// Degenerate triangles ////////////////////////////////////
+// DegenPrologue: flags quads with exactly one degenerate triangle (QUAD_ONE_DEGEN_TRI), then swaps degenerate triangles out of the first iNrTrianglesIn slots of piTriList_out/pTriInfos.
+static void DegenPrologue(STriInfo pTriInfos[], int piTriList_out[], const int iNrTrianglesIn, const int iTotTris)
+{
+	int iNextGoodTriangleSearchIndex=-1;
+	tbool bStillFindingGoodOnes;
+
+	// locate quads with only one good triangle
+	int t=0;
+	while (t<(iTotTris-1))
+	{
+		const int iFO_a = pTriInfos[t].iOrgFaceNumber;
+		const int iFO_b = pTriInfos[t+1].iOrgFaceNumber;
+		if (iFO_a==iFO_b)	// this is a quad
+		{
+			const tbool bIsDeg_a = (pTriInfos[t].iFlag&MARK_DEGENERATE)!=0 ? TTRUE : TFALSE;
+			const tbool bIsDeg_b = (pTriInfos[t+1].iFlag&MARK_DEGENERATE)!=0 ? TTRUE : TFALSE;
+			if ((bIsDeg_a^bIsDeg_b)!=0)	// exactly one of the two triangles is degenerate
+			{
+				pTriInfos[t].iFlag |= QUAD_ONE_DEGEN_TRI;
+				pTriInfos[t+1].iFlag |= QUAD_ONE_DEGEN_TRI;
+			}
+			t += 2;
+		}
+		else
+			++t;
+	}
+
+	// reorder list so all degen triangles are moved to the back
+	// without reordering the good triangles
+	iNextGoodTriangleSearchIndex = 1;
+	t=0;
+	bStillFindingGoodOnes = TTRUE;
+	while (t<iNrTrianglesIn && bStillFindingGoodOnes)
+	{
+		const tbool bIsGood = (pTriInfos[t].iFlag&MARK_DEGENERATE)==0 ? TTRUE : TFALSE;
+		if (bIsGood)
+		{
+			if (iNextGoodTriangleSearchIndex < (t+2))
+				iNextGoodTriangleSearchIndex = t+2;	// keep the search cursor at least two slots past t
+		}
+		else
+		{
+			int t0, t1;
+			// search for the first good triangle.
+			tbool bJustADegenerate = TTRUE;
+			while (bJustADegenerate && iNextGoodTriangleSearchIndex<iTotTris)
+			{
+				const tbool bIsGood = (pTriInfos[iNextGoodTriangleSearchIndex].iFlag&MARK_DEGENERATE)==0 ? TTRUE : TFALSE;
+				if (bIsGood) bJustADegenerate=TFALSE;
+				else ++iNextGoodTriangleSearchIndex;
+			}
+
+			t0 = t;
+			t1 = iNextGoodTriangleSearchIndex;
+			++iNextGoodTriangleSearchIndex;
+			assert(iNextGoodTriangleSearchIndex > (t+1));
+
+			// swap triangle t0 and t1 (both the three indices and the STriInfo record)
+			if (!bJustADegenerate)
+			{
+				int i=0;
+				for (i=0; i<3; i++)
+				{
+					const int index = piTriList_out[t0*3+i];
+					piTriList_out[t0*3+i] = piTriList_out[t1*3+i];
+					piTriList_out[t1*3+i] = index;
+				}
+				{
+					const STriInfo tri_info = pTriInfos[t0];
+					pTriInfos[t0] = pTriInfos[t1];
+					pTriInfos[t1] = tri_info;
+				}
+			}
+			else
+				bStillFindingGoodOnes = TFALSE;	// this is not supposed to happen
+		}
+
+		if (bStillFindingGoodOnes) ++t;
+	}
+
+	assert(bStillFindingGoodOnes);	// code will still work.
+	assert(iNrTrianglesIn == t);
+}
+// DegenEpilogue: copies tangent spaces to the corners of the degenerate triangles (iNrTrianglesIn..iTotTris-1) and to the missing corner of quads that have one degenerate triangle.
+static void DegenEpilogue(STSpace psTspace[], STriInfo pTriInfos[], int piTriListIn[], const SMikkTSpaceContext * pContext, const int iNrTrianglesIn, const int iTotTris)
+{
+	int t=0, i=0;
+	// deal with degenerate triangles
+	// punishment for degenerate triangles is O(N^2)
+	for (t=iNrTrianglesIn; t<iTotTris; t++)
+	{
+		// degenerate triangles on a quad with one good triangle are skipped
+		// here but processed in the next loop
+		const tbool bSkip = (pTriInfos[t].iFlag&QUAD_ONE_DEGEN_TRI)!=0 ? TTRUE : TFALSE;
+
+		if (!bSkip)
+		{
+			for (i=0; i<3; i++)
+			{
+				const int index1 = piTriListIn[t*3+i];
+				// search through the good triangles for the first corner using the same index
+				tbool bNotFound = TTRUE;
+				int j=0;
+				while (bNotFound && j<(3*iNrTrianglesIn))
+				{
+					const int index2 = piTriListIn[j];
+					if (index1==index2) bNotFound=TFALSE;
+					else ++j;
+				}
+
+				if (!bNotFound)
+				{
+					const int iTri = j/3;
+					const int iVert = j%3;
+					const int iSrcVert=pTriInfos[iTri].vert_num[iVert];
+					const int iSrcOffs=pTriInfos[iTri].iTSpacesOffs;
+					const int iDstVert=pTriInfos[t].vert_num[i];
+					const int iDstOffs=pTriInfos[t].iTSpacesOffs;
+					
+					// copy tspace
+					psTspace[iDstOffs+iDstVert] = psTspace[iSrcOffs+iSrcVert];
+				}
+			}
+		}
+	}
+
+	// deal with degenerate quads with one good triangle
+	for (t=0; t<iNrTrianglesIn; t++)
+	{
+		// this triangle belongs to a quad where the
+		// other triangle is degenerate
+		if ( (pTriInfos[t].iFlag&QUAD_ONE_DEGEN_TRI)!=0 )
+		{
+			SVec3 vDstP;
+			int iOrgF=-1, i=0;
+			tbool bNotFound;
+			unsigned char * pV = pTriInfos[t].vert_num;
+			int iFlag = (1<<pV[0]) | (1<<pV[1]) | (1<<pV[2]);	// bit k set <=> corner k is used by this triangle
+			int iMissingIndex = 0;
+			if ((iFlag&2)==0) iMissingIndex=1;
+			else if ((iFlag&4)==0) iMissingIndex=2;
+			else if ((iFlag&8)==0) iMissingIndex=3;
+			// find a corner of this triangle at the same position as the missing corner and reuse its tspace
+			iOrgF = pTriInfos[t].iOrgFaceNumber;
+			vDstP = GetPosition(pContext, MakeIndex(iOrgF, iMissingIndex));
+			bNotFound = TTRUE;
+			i=0;
+			while (bNotFound && i<3)
+			{
+				const int iVert = pV[i];
+				const SVec3 vSrcP = GetPosition(pContext, MakeIndex(iOrgF, iVert));
+				if (veq(vSrcP, vDstP)==TTRUE)
+				{
+					const int iOffs = pTriInfos[t].iTSpacesOffs;
+					psTspace[iOffs+iMissingIndex] = psTspace[iOffs+iVert];
+					bNotFound=TFALSE;
+				}
+				else
+					++i;
+			}
+			assert(!bNotFound);
+		}
+	}
+}
diff --git a/thirdparty/misc/mikktspace.h b/thirdparty/misc/mikktspace.h
new file mode 100644
index 0000000000..52c44a713c
--- /dev/null
+++ b/thirdparty/misc/mikktspace.h
@@ -0,0 +1,145 @@
+/** \file mikktspace/mikktspace.h
+ *  \ingroup mikktspace
+ */
+/**
+ *  Copyright (C) 2011 by Morten S. Mikkelsen
+ *
+ *  This software is provided 'as-is', without any express or implied
+ *  warranty.  In no event will the authors be held liable for any damages
+ *  arising from the use of this software.
+ *
+ *  Permission is granted to anyone to use this software for any purpose,
+ *  including commercial applications, and to alter it and redistribute it
+ *  freely, subject to the following restrictions:
+ *
+ *  1. The origin of this software must not be misrepresented; you must not
+ *     claim that you wrote the original software. If you use this software
+ *     in a product, an acknowledgment in the product documentation would be
+ *     appreciated but is not required.
+ *  2. Altered source versions must be plainly marked as such, and must not be
+ *     misrepresented as being the original software.
+ *  3. This notice may not be removed or altered from any source distribution.
+ */
+
+#ifndef __MIKKTSPACE_H__
+#define __MIKKTSPACE_H__
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Author: Morten S. Mikkelsen
+ * Version: 1.0
+ *
+ * The files mikktspace.h and mikktspace.c are designed to be
+ * stand-alone files and it is important that they are kept this way.
+ * Not having dependencies on structures/classes/libraries specific
+ * to the program, in which they are used, allows them to be copied
+ * and used as is into any tool, program or plugin.
+ * The code is designed to consistently generate the same
+ * tangent spaces, for a given mesh, in any tool in which it is used.
+ * This is done by performing an internal welding step and subsequently an order-independent evaluation
+ * of tangent space for meshes consisting of triangles and quads.
+ * This means faces can be received in any order and the same is true for
+ * the order of vertices of each face. The generated result will not be affected
+ * by such reordering. Additionally, whether degenerate (vertices or texture coordinates)
+ * primitives are present or not will not affect the generated results either.
+ * Once tangent space calculation is done the vertices of degenerate primitives will simply
+ * inherit tangent space from neighboring non degenerate primitives.
+ * The analysis behind this implementation can be found in my master's thesis
+ * which is available for download --> http://image.diku.dk/projects/media/morten.mikkelsen.08.pdf
+ * Note that though the tangent spaces at the vertices are generated in an order-independent way,
+ * by this implementation, the interpolated tangent space is still affected by which diagonal is
+ * chosen to split each quad. A sensible solution is to have your tools pipeline always
+ * split quads by the shortest diagonal. This choice is order-independent and works with mirroring.
+ * If these have the same length then compare the diagonals defined by the texture coordinates.
+ * XNormal which is a tool for baking normal maps allows you to write your own tangent space plugin
+ * and also quad triangulator plugin.
+ */
+
+
+typedef int tbool;
+typedef struct SMikkTSpaceContext SMikkTSpaceContext;
+// Callback table through which the tangent space generator reads the mesh and hands its results back to the application.
+typedef struct {
+	// Returns the number of faces (triangles/quads) on the mesh to be processed.
+	int (*m_getNumFaces)(const SMikkTSpaceContext * pContext);
+
+	// Returns the number of vertices on face number iFace
+	// iFace is a number in the range {0, 1, ..., getNumFaces()-1}
+	int (*m_getNumVerticesOfFace)(const SMikkTSpaceContext * pContext, const int iFace);
+
+	// Write the position/normal/texcoord of vertex number iVert of face iFace to the output array.
+	// iVert is in the range {0,1,2} for triangles and {0,1,2,3} for quads.
+	void (*m_getPosition)(const SMikkTSpaceContext * pContext, float fvPosOut[], const int iFace, const int iVert);
+	void (*m_getNormal)(const SMikkTSpaceContext * pContext, float fvNormOut[], const int iFace, const int iVert);
+	void (*m_getTexCoord)(const SMikkTSpaceContext * pContext, float fvTexcOut[], const int iFace, const int iVert);
+
+	// either (or both) of the two setTSpace callbacks can be set.
+	// The call-back m_setTSpaceBasic() is sufficient for basic normal mapping.
+
+	// This function is used to return the tangent and fSign to the application.
+	// fvTangent is a unit length vector.
+	// For normal maps it is sufficient to use the following simplified version of the bitangent which is generated at pixel/vertex level.
+	// bitangent = fSign * cross(vN, tangent);
+	// Note that the results are returned unindexed. It is possible to generate a new index list.
+	// But averaging/overwriting tangent spaces by using an already existing index list WILL produce INCORRECT results.
+	// DO NOT! use an already existing index list.
+	void (*m_setTSpaceBasic)(const SMikkTSpaceContext * pContext, const float fvTangent[], const float fSign, const int iFace, const int iVert);
+
+	// This function is used to return tangent space results to the application.
+	// fvTangent and fvBiTangent are unit length vectors and fMagS and fMagT are their
+	// true magnitudes which can be used for relief mapping effects.
+	// fvBiTangent is the "real" bitangent and thus may not be perpendicular to fvTangent.
+	// However, both are perpendicular to the vertex normal.
+	// For normal maps it is sufficient to use the following simplified version of the bitangent which is generated at pixel/vertex level.
+	// fSign = bIsOrientationPreserving ? 1.0f : (-1.0f);
+	// bitangent = fSign * cross(vN, tangent);
+	// Note that the results are returned unindexed. It is possible to generate a new index list.
+	// But averaging/overwriting tangent spaces by using an already existing index list WILL produce INCORRECT results.
+	// DO NOT! use an already existing index list.
+	void (*m_setTSpace)(const SMikkTSpaceContext * pContext, const float fvTangent[], const float fvBiTangent[], const float fMagS, const float fMagT,
+						const tbool bIsOrientationPreserving, const int iFace, const int iVert);
+} SMikkTSpaceInterface;
+
+struct SMikkTSpaceContext
+{
+	SMikkTSpaceInterface * m_pInterface;	// initialized with callback functions
+	void * m_pUserData;						// pointer to client side mesh data etc. (passed as the first parameter with every interface call)
+};
+
+// these are both thread safe!
+tbool genTangSpaceDefault(const SMikkTSpaceContext * pContext);	// Default (recommended) fAngularThreshold is 180 degrees (which means threshold disabled)
+tbool genTangSpace(const SMikkTSpaceContext * pContext, const float fAngularThreshold);	// fAngularThreshold is an angle in degrees
+
+
+// To avoid visual errors (distortions/unwanted hard edges in lighting), when using sampled normal maps, the
+// normal map sampler must use the exact inverse of the pixel shader transformation.
+// The most efficient transformation we can possibly do in the pixel shader is
+// achieved by using, directly, the "unnormalized" interpolated tangent, bitangent and vertex normal: vT, vB and vN.
+// pixel shader (fast transform out)
+// vNout = normalize( vNt.x * vT + vNt.y * vB + vNt.z * vN );
+// where vNt is the tangent space normal. The normal map sampler must likewise use the
+// interpolated and "unnormalized" tangent, bitangent and vertex normal to be compliant with the pixel shader.
+// sampler does (exact inverse of pixel shader):
+// float3 row0 = cross(vB, vN);
+// float3 row1 = cross(vN, vT);
+// float3 row2 = cross(vT, vB);
+// float fSign = dot(vT, row0)<0 ? -1 : 1;
+// vNt = normalize( fSign * float3(dot(vNout,row0), dot(vNout,row1), dot(vNout,row2)) );
+// where vNout is the sampled normal in some chosen 3D space.
+//
+// Should you choose to reconstruct the bitangent in the pixel shader instead
+// of the vertex shader, as explained earlier, then be sure to do this in the normal map sampler also.
+// Finally, beware of quad triangulations. If the normal map sampler doesn't use the same triangulation of
+// quads as your renderer then problems will occur since the interpolated tangent spaces will differ
+// even though the vertex level tangent spaces match. This can be solved either by triangulating before
+// sampling/exporting or by using the order-independent choice of diagonal for splitting quads suggested earlier.
+// However, this must be used both by the sampler and your tools/rendering pipeline.
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/thirdparty/misc/pcg.cpp b/thirdparty/misc/pcg.cpp
new file mode 100644
index 0000000000..eac3b36d36
--- /dev/null
+++ b/thirdparty/misc/pcg.cpp
@@ -0,0 +1,15 @@
+// *Really* minimal PCG32 code / (c) 2014 M.E. O'Neill / pcg-random.org
+// Licensed under Apache License 2.0 (NO WARRANTY, etc. see website)
+
+#include "pcg.h"
+
+uint32_t pcg32_random_r(pcg32_random_t* rng)
+{
+    // One generator step: the output is derived from the state as it was on entry.
+    const uint64_t prev = rng->state;
+    const uint32_t mixed = (uint32_t)(((prev >> 18u) ^ prev) >> 27u); // xorshift, truncated to 32 bits
+    const uint32_t spin = (uint32_t)(prev >> 59u);                     // top five bits pick the rotation
+    // Advance the internal state; the increment is forced odd before it is added.
+    rng->state = prev * 6364136223846793005ULL + (rng->inc | 1u);
+    return (mixed >> spin) | (mixed << ((0u - spin) & 31u));
+}
diff --git a/thirdparty/misc/pcg.h b/thirdparty/misc/pcg.h
new file mode 100644
index 0000000000..81f4c9770e
--- /dev/null
+++ b/thirdparty/misc/pcg.h
@@ -0,0 +1,14 @@
+// *Really* minimal PCG32 code / (c) 2014 M.E. O'Neill / pcg-random.org
+// Licensed under Apache License 2.0 (NO WARRANTY, etc. see website)
+
+#ifndef RANDOM_H
+#define RANDOM_H
+
+#include "typedefs.h"
+
+#define PCG_DEFAULT_INC_64 1442695040888963407ULL  // NOTE(review): presumably the default for pcg32_random_t::inc - not referenced in this header, confirm at call sites
+// Generator state: `state` is replaced on every pcg32_random_r() call; `inc` is the additive increment, OR'ed with 1 before it is used.
+typedef struct { uint64_t state;  uint64_t inc; } pcg32_random_t;
+uint32_t pcg32_random_r(pcg32_random_t* rng);  // returns the next 32-bit output and advances rng->state
+
+#endif // RANDOM_H
diff --git a/thirdparty/misc/sha256.c b/thirdparty/misc/sha256.c
new file mode 100644
index 0000000000..68a4339af9
--- /dev/null
+++ b/thirdparty/misc/sha256.c
@@ -0,0 +1,245 @@
+/*
+*   SHA-256 implementation.
+*
+*   Copyright (c) 2010 Ilya O. Levin, http://www.literatecode.com
+*
+*   Permission to use, copy, modify, and distribute this software for any
+*   purpose with or without fee is hereby granted, provided that the above
+*   copyright notice and this permission notice appear in all copies.
+*
+*   THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+*   WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+*   MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+*   ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+*   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+*   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+*   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+#define SWAP_BYTES
+// #define USE_STD_MEMCPY
+// #define SELF_TEST
+
+#ifdef USE_STD_MEMCPY
+#include <string.h>
+#endif
+#include "sha256.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RL(x,n)   (((x) << (n)) | ((x) >> (32 - (n))))  /* rotate left;  x must be 32-bit unsigned, 0 < n < 32 */
+#define RR(x,n)   (((x) >> (n)) | ((x) << (32 - (n))))  /* rotate right; x must be 32-bit unsigned, 0 < n < 32 */
+/* Mixing functions: _rtrf applies S0/S1 to the working words and G0/G1 to the message schedule words. */
+#define S0(x)  (RR((x), 2) ^ RR((x),13) ^ RR((x),22))
+#define S1(x)  (RR((x), 6) ^ RR((x),11) ^ RR((x),25))
+#define G0(x)  (RR((x), 7) ^ RR((x),18) ^ ((x) >> 3))
+#define G1(x)  (RR((x),17) ^ RR((x),19) ^ ((x) >> 10))
+
+#ifdef SWAP_BYTES
+#define BSWP(x,y)  _bswapw((uint32_t *)(x), (uint32_t)(y))
+#else
+#define BSWP(p,n)
+#endif
+#ifdef USE_STD_MEMCPY
+#define MEMCP(x,y,z) memcpy((x),(y),(z))
+#else
+#define MEMCP(x,y,z) _memcp((x),(y),(z))
+#endif
+
+#ifndef __cdecl
+#define __cdecl
+#endif
+/* Round constants K[0..63]; _rtrf adds K[i+j] in round i+j. */
+static const uint32_t K[64] = {
+     0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+     0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+     0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+     0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+     0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+     0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+     0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+     0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+     0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+     0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+     0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+     0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+     0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+     0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+     0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+     0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+/* -------------------------------------------------------------------------- */
+static void _bswapw(uint32_t *p, uint32_t i)
+{
+    /* Reverse the byte order of each of the first i words of p, in place, last word first. */
+    for (; i != 0; --i) p[i-1] = (RR(p[i-1],24) & 0x00ff00ff) | (RR(p[i-1],8) & 0xff00ff00);
+} /* _bswapw */
+
+/* -------------------------------------------------------------------------- */
+#ifndef USE_STD_MEMCPY
+void * __cdecl _memcp (void *d, const void *s, uint32_t sz)
+{
+    /* Stand-in for memcpy: copies sz bytes from s to d front to back and returns d. */
+    char *dst = (char *)d;
+    const char *src = (const char *)s;
+    while (sz--) *dst++ = *src++;
+    return(d);
+} /* _memcp */
+#endif
+
+/* -------------------------------------------------------------------------- */
+static void _rtrf(uint32_t *b, uint32_t *p, uint32_t i, uint32_t j)
+{
+    #define B(x, y) b[(x-y) & 7]
+    #define P(x, y) p[(x+y) & 15]
+    /* Round i+j. b[] = 8 working words addressed relative to i (no copying); when j != 0, p[i & 15] is first extended in place. */
+    B(7,i) += (j ? (p[i & 15] += G1(P(i,14)) + P(i,9) + G0(P(i,1))) : p[i & 15])
+              + K[i+j] + S1(B(4,i))
+              + (B(6,i) ^ (B(4,i) & (B(5,i) ^ B(6,i))));
+    B(3,i) += B(7,i);
+    B(7,i) += S0(B(0,i)) + ( (B(0,i) & B(1,i)) | (B(2,i) & (B(0,i) ^ B(1,i))) );
+    /* B and P are helpers for this function only; undefine them so they do not leak into the rest of the file. */
+    #undef P
+    #undef B
+} /* _rtrf */
+
+/* -------------------------------------------------------------------------- */
+static void _hash(sha256_context *ctx)
+{
+    /* Run all 64 rounds over the block held in ctx->buf and fold the
+     * result into the running hash words in ctx->hash. */
+    uint32_t work[8], *sched = ctx->buf, r, base, k;
+
+    /* Work on a private copy of the current hash words. */
+    for (k = 0; k < 8; k++)
+        work[k] = ctx->hash[k];
+
+    /* Four passes of sixteen rounds; base is the K[] offset, and a
+     * non-zero base makes _rtrf extend the schedule in place. */
+    for (base = 0; base < 64; base += 16)
+        for (r = 0; r < 16; r++)
+            _rtrf(work, sched, r, base);
+
+    /* Add the transformed working words back into the hash. */
+    for (k = 0; k < 8; k++)
+        ctx->hash[k] += work[k];
+
+} /* _hash */
+
+/* -------------------------------------------------------------------------- */
+void sha256_init(sha256_context ctx[1])
+{
+    /* Reset the byte counter and load the eight initial hash words; ctx->buf is left untouched. */
+    static const uint32_t iv[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
+                                    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
+    uint32_t k;
+    for (k = 0; k < 8; k++) ctx->hash[k] = iv[k];
+    ctx->len[1] = ctx->len[0] = 0;
+} /* sha256_init */
+
+/* -------------------------------------------------------------------------- */
+void sha256_hash(sha256_context *ctx, uint8_t *dat, uint32_t sz)
+{
+    /* Absorb sz bytes of dat: every completed 64-byte block is hashed, the remainder waits in ctx->buf. */
+    uint32_t i = ctx->len[0] & 63, l, j;      /* i: bytes already waiting in buf (0..63) */
+    uint8_t *fill = (uint8_t *)ctx->buf;      /* buf is uint32_t[16] but i is a BYTE offset, so index bytes */
+    if ((ctx->len[0] += sz) < sz)  ++(ctx->len[1]);  /* len[0] wrapped: carry into len[1] */
+    /* l = bytes needed to complete the current block (64 for every block after the first). */
+    for (j = 0, l = 64-i; sz >= l; j += l, sz -= l, l = 64, i = 0)
+    {
+        MEMCP(fill + i, &dat[j], l);
+        BSWP(ctx->buf, 16 );
+        _hash(ctx);
+    }
+    MEMCP(fill + i, &dat[j], sz);             /* keep the unhashed tail for the next call / sha256_done */
+} /* sha256_hash */
+
+/* -------------------------------------------------------------------------- */
+void sha256_done(sha256_context *ctx, uint8_t *buf)
+{
+    uint32_t i = (uint32_t)(ctx->len[0] & 63), j = ((~i) & 3) << 3;
+    /* Pad the last block, append the bit length, write the 32-byte digest to buf. i = pending bytes, j = bit offset of byte i in its word. */
+    BSWP(ctx->buf, (i + 3) >> 2);
+    /* Clear everything after the last data byte within its word, then put the 0x80 pad byte there. */
+    ctx->buf[i >> 2] &= 0xffffff80 << j;  /* add padding */
+    ctx->buf[i >> 2] |= 0x00000080 << j;
+    /* < 56 pending bytes: the length fits in this block. Otherwise clear word 15 unless it holds the pad, hash, and restart at word 0. */
+    if (i < 56) i = (i >> 2) + 1;
+       else ctx->buf[15] ^= (i < 60) ? ctx->buf[15] : 0, _hash(ctx), i = 0;
+    /* Zero the words between the padding and the two length words. */
+    while (i < 14) ctx->buf[i++] = 0;
+    /* Total length in bits (byte count << 3) as a 64-bit value: high word in buf[14], low word in buf[15]. */
+    ctx->buf[14] = (ctx->len[1] << 3)|(ctx->len[0] >> 29); /* add length */
+    ctx->buf[15] = ctx->len[0] << 3;
+
+    _hash(ctx);
+    /* Write each hash word most-significant byte first; the same loop also zeroes ctx->buf. */
+    for (i = 0; i < 32; i++)
+       ctx->buf[i % 16] = 0, /* may remove this line in case of a DIY cleanup */
+       buf[i] = (uint8_t)(ctx->hash[i >> 2] >> ((~i & 3) << 3));
+
+} /* sha256_done */
+
+
+#ifdef SELF_TEST
+#pragma warning (push, 0)
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#pragma warning(pop)
+/* Self-test vectors stored as pairs: an input string followed by its expected digest as grouped hex text. */
+char *buf[] = {
+    "",
+    "e3b0c442 98fc1c14 9afbf4c8 996fb924 27ae41e4 649b934c a495991b 7852b855",
+
+    "abc",
+    "ba7816bf 8f01cfea 414140de 5dae2223 b00361a3 96177a9c b410ff61 f20015ad",
+
+    "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq",
+    "248d6a61 d20638b8 e5c02693 0c3e6039 a33ce459 64ff2167 f6ecedd4 19db06c1",
+
+    "The quick brown fox jumps over the lazy dog",
+    "d7a8fbb3 07d78094 69ca9abc b0082e4f 8d5651e4 6d3cdb76 2d02d0bf 37c9e592",
+
+    "The quick brown fox jumps over the lazy cog", /* avalanche effect test */
+    "e4c4d8f3 bf76b692 de791a17 3e053211 50f7a345 b46484fe 427f6acc 7ecc81be",
+
+    "bhn5bjmoniertqea40wro2upyflkydsibsk8ylkmgbvwi420t44cq034eou1szc1k0mk46oeb7ktzmlxqkbte2sy",
+    "9085df2f 02e0cc45 5928d0f5 1b27b4bf 1d9cd260 a66ed1fd a11b0a3f f5756d99"
+};
+
+int main(int argc, char *argv[])
+{
+    sha256_context ctx;
+    uint8_t hv[32];
+    uint32_t i, j;
+    /* Hash each built-in vector and print it next to the expected digest; nothing is compared automatically. */
+    for (j = 0; j < (sizeof(buf)/sizeof(buf[0])); j += 2)
+    {
+        sha256_init(&ctx);
+        sha256_hash(&ctx, (uint8_t *)buf[j], (uint32_t)strlen(buf[j]));
+        sha256_done(&ctx, hv);
+        printf("input = %s\ndigest: %s\nresult: ", buf[j], buf[j+1]);
+        for (i = 0; i < 32; i++) printf("%02x%s", hv[i], ((i%4)==3)?" ":"");
+        printf("\n\n");
+    }
+    /* Then hash and print every command-line argument. */
+    for (j = 1; j < (uint32_t)argc; j++)
+    {
+        printf("argv[%d]: %s\nresult: ", (int)j, argv[j]);
+        sha256_init(&ctx);
+        sha256_hash(&ctx, (uint8_t *)argv[j], (uint32_t)strlen(argv[j]));
+        sha256_done(&ctx, hv);
+        for (i = 0; i < 32; i++) printf("%02x%s", hv[i], ((i%4)==3)?" ":"");
+        printf("\n\n");
+    }
+
+    return 0;
+} /* main */
+#endif
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/thirdparty/misc/sha256.h b/thirdparty/misc/sha256.h
new file mode 100644
index 0000000000..e19e56b4cc
--- /dev/null
+++ b/thirdparty/misc/sha256.h
@@ -0,0 +1,50 @@
+/*
+*   SHA-256 implementation.
+*
+*   Copyright (c) 2010 Ilya O. Levin, http://www.literatecode.com
+*
+*   Permission to use, copy, modify, and distribute this software for any
+*   purpose with or without fee is hereby granted, provided that the above
+*   copyright notice and this permission notice appear in all copies.
+*
+*   THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+*   WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+*   MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+*   ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+*   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+*   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+*   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+#ifdef _MSC_VER
+#ifndef uint8_t
+typedef unsigned __int8 uint8_t;
+#endif
+#ifndef uint32_t
+typedef unsigned __int32 uint32_t;
+#endif
+#ifndef uint64_t
+typedef __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+#endif
+#else
+#include <stdint.h>
+#endif
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+   typedef struct {
+       uint32_t buf[16];   /* current 64-byte input block, processed as 16 words */
+       uint32_t hash[8];   /* running hash words; sha256_done serializes them into the digest */
+       uint32_t len[2];    /* total bytes hashed: [0] low word, [1] high word */
+   } sha256_context;
+   /* Usage: sha256_init, then sha256_hash with the data, then sha256_done to obtain the 32-byte digest. */
+   void sha256_init(sha256_context *);
+   void sha256_hash(sha256_context *, uint8_t * /* data */, uint32_t /* len */);
+   void sha256_done(sha256_context *, uint8_t * /* hash */);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/thirdparty/misc/smaz.c b/thirdparty/misc/smaz.c
new file mode 100644
index 0000000000..555dfea844
--- /dev/null
+++ b/thirdparty/misc/smaz.c
@@ -0,0 +1,207 @@
+/*
+Copyright (c) 2006-2009, Salvatore Sanfilippo
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+    * Neither the name of Smaz nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <string.h>
+
+/* Compression codebook: 241 hash buckets, each a run of <length><string bytes><code byte> entries ended by a zero length */
+static const char *Smaz_cb[241] = {
+"\002s,\266", "\003had\232\002leW", "\003on \216", "", "\001yS",
+"\002ma\255\002li\227", "\003or \260", "", "\002ll\230\003s t\277",
+"\004fromg\002mel", "", "\003its\332", "\001z\333", "\003ingF", "\001>\336",
+"\001 \000\003   (\002nc\344", "\002nd=\003 on\312",
+"\002ne\213\003hat\276\003re q", "", "\002ngT\003herz\004have\306\003s o\225",
+"", "\003ionk\003s a\254\002ly\352", "\003hisL\003 inN\003 be\252", "",
+"\003 fo\325\003 of \003 ha\311", "", "\002of\005",
+"\003 co\241\002no\267\003 ma\370", "", "", "\003 cl\356\003enta\003 an7",
+"\002ns\300\001\"e", "\003n t\217\002ntP\003s, \205",
+"\002pe\320\003 we\351\002om\223", "\002on\037", "", "\002y G", "\003 wa\271",
+"\003 re\321\002or*", "", "\002=\"\251\002ot\337", "\003forD\002ou[",
+"\003 toR", "\003 th\r", "\003 it\366",
+"\003but\261\002ra\202\003 wi\363\002</\361", "\003 wh\237", "\002  4",
+"\003nd ?", "\002re!", "", "\003ng c", "",
+"\003ly \307\003ass\323\001a\004\002rir", "", "", "", "\002se_", "\003of \"",
+"\003div\364\002ros\003ere\240", "", "\002ta\310\001bZ\002si\324", "",
+"\003and\a\002rs\335", "\002rt\362", "\002teE", "\003ati\316", "\002so\263",
+"\002th\021", "\002tiJ\001c\034\003allp", "\003ate\345", "\002ss\246",
+"\002stM", "", "\002><\346", "\002to\024", "\003arew", "\001d\030",
+"\002tr\303", "", "\001\n1\003 a \222", "\003f tv\002veo", "\002un\340", "",
+"\003e o\242", "\002a \243\002wa\326\001e\002", "\002ur\226\003e a\274",
+"\002us\244\003\n\r\n\247", "\002ut\304\003e c\373", "\002we\221", "", "",
+"\002wh\302", "\001f,", "", "", "", "\003d t\206", "", "", "\003th \343",
+"\001g;", "", "", "\001\r9\003e s\265", "\003e t\234", "", "\003to Y",
+"\003e\r\n\236", "\002d \036\001h\022", "", "\001,Q", "\002 a\031", "\002 b^",
+"\002\r\n\025\002 cI", "\002 d\245", "\002 e\253", "\002 fh\001i\b\002e \v",
+"", "\002 hU\001-\314", "\002 i8", "", "", "\002 l\315", "\002 m{",
+"\002f :\002 n\354", "\002 o\035", "\002 p}\001.n\003\r\n\r\250", "",
+"\002 r\275", "\002 s>", "\002 t\016", "", "\002g \235\005which+\003whi\367",
+"\002 w5", "\001/\305", "\003as \214", "\003at \207", "", "\003who\331", "",
+"\001l\026\002h \212", "", "\002, $", "", "\004withV", "", "", "", "\001m-", "",
+"", "\002ac\357", "\002ad\350", "\003TheH", "", "", "\004this\233\001n\t",
+"", "\002. y", "", "\002alX\003e, \365", "\003tio\215\002be\\",
+"\002an\032\003ver\347", "", "\004that0\003tha\313\001o\006", "\003was2",
+"\002arO", "\002as.", "\002at'\003the\001\004they\200\005there\322\005theird",
+"\002ce\210", "\004were]", "", "\002ch\231\002l \264\001p<", "", "",
+"\003one\256", "", "\003he \023\002dej", "\003ter\270", "\002cou", "",
+"\002by\177\002di\201\002eax", "", "\002ec\327", "\002edB", "\002ee\353", "",
+"", "\001r\f\002n )", "", "", "", "\002el\262", "", "\003in i\002en3", "",
+"\002o `\001s\n", "", "\002er\033", "\003is t\002es6", "", "\002ge\371",
+"\004.com\375", "\002fo\334\003our\330", "\003ch \301\001t\003", "\002hab", "",
+"\003men\374", "", "\002he\020", "", "", "\001u&", "\002hif", "",
+"\003not\204\002ic\203", "\003ed @\002id\355", "", "", "\002ho\273",
+"\002r K\001vm", "", "", "", "\003t t\257\002il\360", "\002im\342",
+"\003en \317\002in\017", "\002io\220", "\002s \027\001wA", "", "\003er |",
+"\003es ~\002is%", "\002it/", "", "\002iv\272", "",
+"\002t #\ahttp://C\001x\372", "\002la\211", "\001<\341", "\003, a\224"
+};
+
+/* Reverse compression codebook, used for decompression: indexed by code byte (0..253), gives the string it expands to */
+static char *Smaz_rcb[254] = {
+" ", "the", "e", "t", "a", "of", "o", "and", "i", "n", "s", "e ", "r", " th",
+" t", "in", "he", "th", "h", "he ", "to", "\r\n", "l", "s ", "d", " a", "an",
+"er", "c", " o", "d ", "on", " of", "re", "of ", "t ", ", ", "is", "u", "at",
+"   ", "n ", "or", "which", "f", "m", "as", "it", "that", "\n", "was", "en",
+"  ", " w", "es", " an", " i", "\r", "f ", "g", "p", "nd", " s", "nd ", "ed ",
+"w", "ed", "http://", "for", "te", "ing", "y ", "The", " c", "ti", "r ", "his",
+"st", " in", "ar", "nt", ",", " to", "y", "ng", " h", "with", "le", "al", "to ",
+"b", "ou", "be", "were", " b", "se", "o ", "ent", "ha", "ng ", "their", "\"",
+"hi", "from", " f", "in ", "de", "ion", "me", "v", ".", "ve", "all", "re ",
+"ri", "ro", "is ", "co", "f t", "are", "ea", ". ", "her", " m", "er ", " p",
+"es ", "by", "they", "di", "ra", "ic", "not", "s, ", "d t", "at ", "ce", "la",
+"h ", "ne", "as ", "tio", "on ", "n t", "io", "we", " a ", "om", ", a", "s o",
+"ur", "li", "ll", "ch", "had", "this", "e t", "g ", "e\r\n", " wh", "ere",
+" co", "e o", "a ", "us", " d", "ss", "\n\r\n", "\r\n\r", "=\"", " be", " e",
+"s a", "ma", "one", "t t", "or ", "but", "el", "so", "l ", "e s", "s,", "no",
+"ter", " wa", "iv", "ho", "e a", " r", "hat", "s t", "ns", "ch ", "wh", "tr",
+"ut", "/", "have", "ly ", "ta", " ha", " on", "tha", "-", " l", "ati", "en ",
+"pe", " re", "there", "ass", "si", " fo", "wa", "ec", "our", "who", "its", "z",
+"fo", "rs", ">", "ot", "un", "<", "im", "th ", "nc", "ate", "><", "ver", "ad",
+" we", "ly", "ee", " n", "id", " cl", "ac", "il", "</", "rt", " wi", "div",
+"e, ", " it", "whi", " ma", "ge", "x", "e c", "men", ".com"
+};
+
+int smaz_compress(const char *in, int inlen, char *out, int outlen) {
+    /* Compress inlen bytes of in into out; returns bytes written, or outlen+1 if out is too small. */
+    unsigned int h1,h2,h3=0;
+    int verblen = 0, _outlen = outlen;
+    char verb[256], *_out = out;
+    while(inlen > 0) {
+        int j = 7, needed;
+        char *flush = NULL;
+        const char *slot;
+        /* Hash the first 1, 2 and 3 bytes as unsigned values (no shift of a negative char) */
+        h1 = h2 = (unsigned char)in[0]<<3;
+        if (inlen > 1) h2 += (unsigned char)in[1];
+        if (inlen > 2) h3 = h2^(unsigned char)in[2];
+        if (j > inlen) j = inlen;
+
+        /* Try to lookup substrings into the hash table, starting from the
+         * longer to the shorter substrings */
+        for (; j > 0; j--) {
+            switch(j) {
+            case 1: slot = Smaz_cb[h1%241]; break;
+            case 2: slot = Smaz_cb[h2%241]; break;
+            default: slot = Smaz_cb[h3%241]; break;
+            }
+            while(slot[0]) {
+                if (slot[0] == j && memcmp(slot+1,in,j) == 0) {
+                    /* Match found in the hash table,
+                     * prepare a verbatim bytes flush if needed */
+                    if (verblen) {
+                        needed = (verblen == 1) ? 2 : 2+verblen;
+                        flush = out;
+                        out += needed;
+                        outlen -= needed;
+                    }
+                    /* Emit the byte */
+                    if (outlen <= 0) return _outlen+1;
+                    out[0] = slot[slot[0]+1];
+                    out++;
+                    outlen--;
+                    inlen -= j;
+                    in += j;
+                    goto out;
+                } else {
+                    slot += slot[0]+2;
+                }
+            }
+        }
+        /* Match not found - add the byte to the verbatim buffer */
+        verb[verblen] = in[0];
+        verblen++;
+        inlen--;
+        in++;
+out:
+        /* Prepare a flush if we reached the flush length limit, and there
+         * is not already a pending flush operation. */
+        if (!flush && (verblen == 256 || (verblen > 0 && inlen == 0))) {
+            needed = (verblen == 1) ? 2 : 2+verblen;
+            flush = out;
+            out += needed;
+            outlen -= needed;
+            if (outlen < 0) return _outlen+1;
+        }
+        /* Perform a verbatim flush if needed */
+        if (flush) {
+            if (verblen == 1) {
+                flush[0] = (signed char)254;
+                flush[1] = verb[0];
+            } else {
+                flush[0] = (signed char)255;
+                flush[1] = (signed char)(verblen-1);
+                memcpy(flush+2,verb,verblen);
+            }
+            flush = NULL;
+            verblen = 0;
+        }
+    }
+    return out-_out;
+}
+
+int smaz_decompress(const char *in, int inlen, char *out, int outlen) {
+    /* Returns bytes written to out, or outlen+1 if out is too small or in ends inside a verbatim token. */
+    const unsigned char *c = (const unsigned char*) in;
+    char *_out = out;
+    int _outlen = outlen;
+    while(inlen > 0) {
+        if (*c == 254) {
+            /* Verbatim byte: marker followed by one literal byte */
+            if (inlen < 2 || outlen < 1) return _outlen+1;
+            *out++ = c[1];
+            outlen--;
+            c += 2;
+            inlen -= 2;
+        } else if (*c == 255) {
+            /* Verbatim string: marker, (length-1) byte, then that many literal bytes */
+            int len;
+            if (inlen < 2) return _outlen+1;
+            len = c[1]+1;
+            if (inlen < 2+len || outlen < len) return _outlen+1;
+            memcpy(out,c+2,len);
+            out += len;
+            outlen -= len;
+            c += 2+len;
+            inlen -= 2+len;
+        } else {
+            /* Codebook entry: *c is at most 253 here, within Smaz_rcb */
+            const char *s = Smaz_rcb[*c];
+            int len = strlen(s);
+            if (outlen < len) return _outlen+1;
+            memcpy(out,s,len);
+            out += len;
+            outlen -= len;
+            c++;
+            inlen--;
+        }
+    }
+    return out-_out;
+}
diff --git a/thirdparty/misc/smaz.h b/thirdparty/misc/smaz.h
new file mode 100644
index 0000000000..a9d8a337a7
--- /dev/null
+++ b/thirdparty/misc/smaz.h
@@ -0,0 +1,20 @@
+/*
+Copyright (c) 2006-2009, Salvatore Sanfilippo
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+    * Neither the name of Smaz nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _SMAZ_H
+#define _SMAZ_H
+/* Both return the number of bytes written to out, or outlen+1 when out is too small. */
+int smaz_compress(const char *in, int inlen, char *out, int outlen);
+int smaz_decompress(const char *in, int inlen, char *out, int outlen);
+
+#endif
diff --git a/thirdparty/misc/stb_truetype.h b/thirdparty/misc/stb_truetype.h
new file mode 100644
index 0000000000..016972785a
--- /dev/null
+++ b/thirdparty/misc/stb_truetype.h
@@ -0,0 +1,3267 @@
+// stb_truetype.h - v1.11 - public domain
+// authored from 2009-2015 by Sean Barrett / RAD Game Tools
+//
+//   This library processes TrueType files:
+//        parse files
+//        extract glyph metrics
+//        extract glyph shapes
+//        render glyphs to one-channel bitmaps with antialiasing (box filter)
+//
+//   Todo:
+//        non-MS cmaps
+//        crashproof on bad data
+//        hinting? (no longer patented)
+//        cleartype-style AA?
+//        optimize: use simple memory allocator for intermediates
+//        optimize: build edge-list directly from curves
+//        optimize: rasterize directly from curves?
+//
+// ADDITIONAL CONTRIBUTORS
+//
+//   Mikko Mononen: compound shape support, more cmap formats
+//   Tor Andersson: kerning, subpixel rendering
+//
+//   Misc other:
+//       Ryan Gordon
+//       Simon Glass
+//
+//   Bug/warning reports/fixes:
+//       "Zer" on mollyrocket (with fix)
+//       Cass Everitt
+//       stoiko (Haemimont Games)
+//       Brian Hook
+//       Walter van Niftrik
+//       David Gow
+//       David Given
+//       Ivan-Assen Ivanov
+//       Anthony Pesch
+//       Johan Duparc
+//       Hou Qiming
+//       Fabian "ryg" Giesen
+//       Martins Mozeiko
+//       Cap Petschulat
+//       Omar Cornut
+//       github:aloucks
+//       Peter LaValle
+//       Sergey Popov
+//       Giumo X. Clanjor
+//       Higor Euripedes
+//       Thomas Fields
+//       Derek Vinyard
+//
+// VERSION HISTORY
+//
+//   1.11 (2016-04-02) fix unused-variable warning
+//   1.10 (2016-04-02) user-defined fabs(); rare memory leak; remove duplicate typedef
+//   1.09 (2016-01-16) warning fix; avoid crash on outofmem; use allocation userdata properly
+//   1.08 (2015-09-13) document stbtt_Rasterize(); fixes for vertical & horizontal edges
+//   1.07 (2015-08-01) allow PackFontRanges to accept arrays of sparse codepoints;
+//                     variant PackFontRanges to pack and render in separate phases;
+//                     fix stbtt_GetFontOffsetForIndex (never worked for non-0 input?);
+//                     fixed an assert() bug in the new rasterizer
+//                     replace assert() with STBTT_assert() in new rasterizer
+//   1.06 (2015-07-14) performance improvements (~35% faster on x86 and x64 on test machine)
+//                     also more precise AA rasterizer, except if shapes overlap
+//                     remove need for STBTT_sort
+//   1.05 (2015-04-15) fix misplaced definitions for STBTT_STATIC
+//   1.04 (2015-04-15) typo in example
+//   1.03 (2015-04-12) STBTT_STATIC, fix memory leak in new packing, various fixes
+//
+//   Full history can be found at the end of this file.
+//
+// LICENSE
+//
+//   This software is dual-licensed to the public domain and under the following
+//   license: you are granted a perpetual, irrevocable license to copy, modify,
+//   publish, and distribute this file as you see fit.
+//
+// USAGE
+//
+//   Include this file in whatever places need to refer to it. In ONE C/C++
+//   file, write:
+//      #define STB_TRUETYPE_IMPLEMENTATION
+//   before the #include of this file. This expands out the actual
+//   implementation into that C/C++ file.
+//
+//   To make the implementation private to the file that generates the implementation,
+//      #define STBTT_STATIC
+//
+//   Simple 3D API (don't ship this, but it's fine for tools and quick start)
+//           stbtt_BakeFontBitmap()               -- bake a font to a bitmap for use as texture
+//           stbtt_GetBakedQuad()                 -- compute quad to draw for a given char
+//
+//   Improved 3D API (more shippable):
+//           #include "stb_rect_pack.h"           -- optional, but you really want it
+//           stbtt_PackBegin()
+//           stbtt_PackSetOversample()            -- for improved quality on small fonts
+//           stbtt_PackFontRanges()               -- pack and renders
+//           stbtt_PackEnd()
+//           stbtt_GetPackedQuad()
+//
+//   "Load" a font file from a memory buffer (you have to keep the buffer loaded)
+//           stbtt_InitFont()
+//           stbtt_GetFontOffsetForIndex()        -- use for TTC font collections
+//
+//   Render a unicode codepoint to a bitmap
+//           stbtt_GetCodepointBitmap()           -- allocates and returns a bitmap
+//           stbtt_MakeCodepointBitmap()          -- renders into bitmap you provide
+//           stbtt_GetCodepointBitmapBox()        -- how big the bitmap must be
+//
+//   Character advance/positioning
+//           stbtt_GetCodepointHMetrics()
+//           stbtt_GetFontVMetrics()
+//           stbtt_GetCodepointKernAdvance()
+//
+//   Starting with version 1.06, the rasterizer was replaced with a new,
+//   faster and generally-more-precise rasterizer. The new rasterizer more
+//   accurately measures pixel coverage for anti-aliasing, except in the case
+//   where multiple shapes overlap, in which case it overestimates the AA pixel
+//   coverage. Thus, anti-aliasing of intersecting shapes may look wrong. If
+//   this turns out to be a problem, you can re-enable the old rasterizer with
+//        #define STBTT_RASTERIZER_VERSION 1
+//   which will incur about a 15% speed hit.
+//
+// ADDITIONAL DOCUMENTATION
+//
+//   Immediately after this block comment are a series of sample programs.
+//
+//   After the sample programs is the "header file" section. This section
+//   includes documentation for each API function.
+//
+//   Some important concepts to understand to use this library:
+//
+//      Codepoint
+//         Characters are defined by unicode codepoints, e.g. 65 is
+//         uppercase A, 231 is lowercase c with a cedilla, 0x307e is
+//         the hiragana for "ma".
+//
+//      Glyph
+//         A visual character shape (every codepoint is rendered as
+//         some glyph)
+//
+//      Glyph index
+//         A font-specific integer ID representing a glyph
+//
+//      Baseline
+//         Glyph shapes are defined relative to a baseline, which is the
+//         bottom of uppercase characters. Characters extend both above
+//         and below the baseline.
+//
+//      Current Point
+//         As you draw text to the screen, you keep track of a "current point"
+//         which is the origin of each character. The current point's vertical
+//         position is the baseline. Even "baked fonts" use this model.
+//
+//      Vertical Font Metrics
+//         The vertical qualities of the font, used to vertically position
+//         and space the characters. See docs for stbtt_GetFontVMetrics.
+//
+//      Font Size in Pixels or Points
+//         The preferred interface for specifying font sizes in stb_truetype
+//         is to specify how tall the font's vertical extent should be in pixels.
+//         If that sounds good enough, skip the next paragraph.
+//
+//         Most font APIs instead use "points", which are a common typographic
+//         measurement for describing font size, defined as 72 points per inch.
+//         stb_truetype provides a point API for compatibility. However, true
+//         "per inch" conventions don't make much sense on computer displays
+//         since different monitors have different numbers of pixels per
+//         inch. For example, Windows traditionally uses a convention that
+//         there are 96 pixels per inch, thus making 'inch' measurements have
+//         nothing to do with inches, and thus effectively defining a point to
+//         be 1.333 pixels. Additionally, the TrueType font data provides
+//         an explicit scale factor to scale a given font's glyphs to points,
+//         but the author has observed that this scale factor is often wrong
+//         for non-commercial fonts, thus making fonts scaled in points
+//         according to the TrueType spec incoherently sized in practice.
+//
+// ADVANCED USAGE
+//
+//   Quality:
+//
+//    - Use the functions with Subpixel at the end to allow your characters
+//      to have subpixel positioning. Since the font is anti-aliased, not
+//      hinted, this is very important for quality. (This is not possible with
+//      baked fonts.)
+//
+//    - Kerning is now supported, and if you're supporting subpixel rendering
+//      then kerning is worth using to give your text a polished look.
+//
+//   Performance:
+//
+//    - Convert Unicode codepoints to glyph indexes and operate on the glyphs;
+//      if you don't do this, stb_truetype is forced to do the conversion on
+//      every call.
+//
+//    - There are a lot of memory allocations. We should modify it to take
+//      a temp buffer and allocate from the temp buffer (without freeing),
+//      should help performance a lot.
+//
+// NOTES
+//
+//   The system uses the raw data found in the .ttf file without changing it
+//   and without building auxiliary data structures. This is a bit inefficient
+//   on little-endian systems (the data is big-endian), but assuming you're
+//   caching the bitmaps or glyph shapes this shouldn't be a big deal.
+//
+//   It appears to be very hard to programmatically determine what font a
+//   given file is in a general way. I provide an API for this, but I don't
+//   recommend it.
+//
+//
+// SOURCE STATISTICS (based on v0.6c, 2050 LOC)
+//
+//   Documentation & header file        520 LOC  \___ 660 LOC documentation
+//   Sample code                        140 LOC  /
+//   Truetype parsing                   620 LOC  ---- 620 LOC TrueType
+//   Software rasterization             240 LOC  \                           .
+//   Curve tesselation                  120 LOC   \__ 550 LOC Bitmap creation
+//   Bitmap management                  100 LOC   /
+//   Baked bitmap interface              70 LOC  /
+//   Font name matching & access        150 LOC  ---- 150
+//   C runtime library abstraction       60 LOC  ----  60
+//
+//
+// PERFORMANCE MEASUREMENTS FOR 1.06:
+//
+//                      32-bit     64-bit
+//   Previous release:  8.83 s     7.68 s
+//   Pool allocations:  7.72 s     6.34 s
+//   Inline sort     :  6.54 s     5.65 s
+//   New rasterizer  :  5.63 s     5.00 s
+
+//////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////
+////
+////  SAMPLE PROGRAMS
+////
+//
+//  Incomplete text-in-3d-api example, which draws quads properly aligned to be lossless
+//
+#if 0
+#define STB_TRUETYPE_IMPLEMENTATION  // force following include to generate implementation
+#include "stb_truetype.h"
+
+unsigned char ttf_buffer[1<<20];
+unsigned char temp_bitmap[512*512];
+
+stbtt_bakedchar cdata[96]; // ASCII 32..126 is 95 glyphs
+GLuint ftex;
+
+void my_stbtt_initfont(void)
+{
+   fread(ttf_buffer, 1, 1<<20, fopen("c:/windows/fonts/times.ttf", "rb")); // NOTE: fopen/fread results are not checked
+   stbtt_BakeFontBitmap(ttf_buffer,0, 32.0, temp_bitmap,512,512, 32,96, cdata); // bakes 96 chars starting at 32; no guarantee this fits!
+   // can free ttf_buffer at this point
+   glGenTextures(1, &ftex);
+   glBindTexture(GL_TEXTURE_2D, ftex);
+   glTexImage2D(GL_TEXTURE_2D, 0, GL_ALPHA, 512,512, 0, GL_ALPHA, GL_UNSIGNED_BYTE, temp_bitmap);
+   // can free temp_bitmap at this point
+   glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+}
+
+void my_stbtt_print(float x, float y, char *text)
+{
+   // draws 'text' starting at (x,y) using the baked atlas; assume orthographic projection with units = screen pixels, origin at top left
+   glEnable(GL_TEXTURE_2D);
+   glBindTexture(GL_TEXTURE_2D, ftex);
+   glBegin(GL_QUADS);
+   while (*text) {
+      if (*text >= 32 && *text < 128) {
+         stbtt_aligned_quad q;
+         stbtt_GetBakedQuad(cdata, 512,512, *text-32, &x,&y,&q,1);//1=opengl & d3d10+,0=d3d9
+         glTexCoord2f(q.s0,q.t0); glVertex2f(q.x0,q.y0); // top-left: (x0,y0) pairs with (s0,t0) per stbtt_aligned_quad
+         glTexCoord2f(q.s1,q.t0); glVertex2f(q.x1,q.y0); // top-right
+         glTexCoord2f(q.s1,q.t1); glVertex2f(q.x1,q.y1); // bottom-right: (x1,y1) pairs with (s1,t1)
+         glTexCoord2f(q.s0,q.t1); glVertex2f(q.x0,q.y1); // bottom-left
+      }
+      ++text;
+   }
+   glEnd();
+}
+#endif
+//
+//
+//////////////////////////////////////////////////////////////////////////////
+//
+// Complete program (this compiles): get a single bitmap, print as ASCII art
+//
+#if 0
+#include <stdio.h>
+#define STB_TRUETYPE_IMPLEMENTATION  // force following include to generate implementation
+#include "stb_truetype.h"
+
+unsigned char ttf_buffer[1<<25]; // raw font file contents; unsigned to match the 'const unsigned char *' stbtt_* data parameters
+
+int main(int argc, char **argv)
+{
+   stbtt_fontinfo font;
+   unsigned char *bitmap;
+   int w,h,i,j,c = (argc > 1 ? atoi(argv[1]) : 'a'), s = (argc > 2 ? atoi(argv[2]) : 20); // c: codepoint to render, s: pixel height
+
+   fread(ttf_buffer, 1, 1<<25, fopen(argc > 3 ? argv[3] : "c:/windows/fonts/arialbd.ttf", "rb")); // NOTE: fopen/fread results are not checked
+
+   stbtt_InitFont(&font, ttf_buffer, stbtt_GetFontOffsetForIndex(ttf_buffer,0));
+   bitmap = stbtt_GetCodepointBitmap(&font, 0,stbtt_ScaleForPixelHeight(&font, s), c, &w, &h, 0,0);
+
+   for (j=0; j < h; ++j) {
+      for (i=0; i < w; ++i)
+         putchar(" .:ioVM@"[bitmap[j*w+i]>>5]); // map 0..255 coverage to one of 8 ASCII shades
+      putchar('\n');
+   }
+   return 0;
+}
+#endif
+//
+// Output:
+//
+//     .ii.
+//    @@@@@@.
+//   V@Mio@@o
+//   :i.  V@V
+//     :oM@@M
+//   :@@@MM@M
+//   @@o  o@M
+//  :@@.  M@M
+//   @@@o@@@@
+//   :M@@V:@@.
+//
+//////////////////////////////////////////////////////////////////////////////
+//
+// Complete program: print "Hello World!" banner, with bugs
+//
+#if 0
+unsigned char buffer[24<<20]; // raw font file contents; unsigned to match stbtt_InitFont's 'const unsigned char *data'
+unsigned char screen[20][79];
+
+int main(int arg, char **argv)
+{
+   stbtt_fontinfo font;
+   int i,j,ascent,baseline,ch=0;
+   float scale, xpos=2; // leave a little padding in case the character extends left
+   char *text = "Heljo World!"; // intentionally misspelled to show 'lj' brokenness
+
+   fread(buffer, 1, 1000000, fopen("c:/windows/fonts/arialbd.ttf", "rb")); // NOTE: fopen/fread results are not checked
+   stbtt_InitFont(&font, buffer, 0);
+
+   scale = stbtt_ScaleForPixelHeight(&font, 15);
+   stbtt_GetFontVMetrics(&font, &ascent,0,0);
+   baseline = (int) (ascent*scale); // scaled ascent, used below as the baseline row in screen[]
+
+   while (text[ch]) {
+      int advance,lsb,x0,y0,x1,y1;
+      float x_shift = xpos - (float) floor(xpos); // fractional part of the pen position
+      stbtt_GetCodepointHMetrics(&font, text[ch], &advance, &lsb);
+      stbtt_GetCodepointBitmapBoxSubpixel(&font, text[ch], scale,scale,x_shift,0, &x0,&y0,&x1,&y1);
+      stbtt_MakeCodepointBitmapSubpixel(&font, &screen[baseline + y0][(int) xpos + x0], x1-x0,y1-y0, 79, scale,scale,x_shift,0, text[ch]);
+      // note that this stomps the old data, so where character boxes overlap (e.g. 'lj') it's wrong
+      // because this API is really for baking character bitmaps into textures. if you want to render
+      // a sequence of characters, you really need to render each bitmap to a temp buffer, then
+      // "alpha blend" that into the working buffer
+      xpos += (advance * scale);
+      if (text[ch+1])
+         xpos += scale*stbtt_GetCodepointKernAdvance(&font, text[ch],text[ch+1]);
+      ++ch;
+   }
+
+   for (j=0; j < 20; ++j) {
+      for (i=0; i < 78; ++i) // prints 78 of the 79 columns of each row
+         putchar(" .:ioVM@"[screen[j][i]>>5]);
+      putchar('\n');
+   }
+
+   return 0;
+}
+#endif
+
+
+//////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////
+////
+////   INTEGRATION WITH YOUR CODEBASE
+////
+////   The following sections allow you to supply alternate definitions
+////   of C library functions used by stb_truetype.
+
+#ifdef STB_TRUETYPE_IMPLEMENTATION
+   // #define your own (u)stbtt_int8/16/32 before including to override this
+   #ifndef stbtt_uint8
+   typedef unsigned char   stbtt_uint8;
+   typedef signed   char   stbtt_int8;
+   typedef unsigned short  stbtt_uint16;
+   typedef signed   short  stbtt_int16;
+   typedef unsigned int    stbtt_uint32;
+   typedef signed   int    stbtt_int32;
+   #endif
+
+   typedef char stbtt__check_size32[sizeof(stbtt_int32)==4 ? 1 : -1];
+   typedef char stbtt__check_size16[sizeof(stbtt_int16)==2 ? 1 : -1];
+
+   // #define your own STBTT_ifloor/STBTT_iceil() to avoid math.h
+   #ifndef STBTT_ifloor
+   #include <math.h>
+   #define STBTT_ifloor(x)   ((int) floor(x))
+   #define STBTT_iceil(x)    ((int) ceil(x))
+   #endif
+
+   #ifndef STBTT_sqrt
+   #include <math.h>
+   #define STBTT_sqrt(x)      sqrt(x)
+   #endif
+
+   #ifndef STBTT_fabs
+   #include <math.h>
+   #define STBTT_fabs(x)      fabs(x)
+   #endif
+
+   // #define your own functions "STBTT_malloc" / "STBTT_free" to avoid malloc.h
+   #ifndef STBTT_malloc
+   #include <stdlib.h>
+   #define STBTT_malloc(x,u)  ((void)(u),malloc(x))
+   #define STBTT_free(x,u)    ((void)(u),free(x))
+   #endif
+
+   #ifndef STBTT_assert
+   #include <assert.h>
+   #define STBTT_assert(x)    assert(x)
+   #endif
+
+   #ifndef STBTT_strlen
+   #include <string.h>
+   #define STBTT_strlen(x)    strlen(x)
+   #endif
+
+   #ifndef STBTT_memcpy
+   #include <string.h>        // standard header declaring memcpy/memset (<memory.h> is a non-standard legacy header)
+   #define STBTT_memcpy       memcpy
+   #define STBTT_memset       memset
+   #endif
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+////
+////   INTERFACE
+////
+////
+
+#ifndef __STB_INCLUDE_STB_TRUETYPE_H__
+#define __STB_INCLUDE_STB_TRUETYPE_H__
+
+#ifdef STBTT_STATIC
+#define STBTT_DEF static
+#else
+#define STBTT_DEF extern
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// TEXTURE BAKING API
+//
+// If you use this API, you only have to call two functions ever.
+//
+
+typedef struct
+{
+   unsigned short x0,y0,x1,y1; // coordinates of bbox in bitmap
+   float xoff,yoff,xadvance;
+} stbtt_bakedchar;
+
+STBTT_DEF int stbtt_BakeFontBitmap(const unsigned char *data, int offset,  // font location (use offset=0 for plain .ttf)
+                                float pixel_height,                     // height of font in pixels
+                                unsigned char *pixels, int pw, int ph,  // bitmap to be filled in
+                                int first_char, int num_chars,          // characters to bake
+                                stbtt_bakedchar *chardata);             // you allocate this, it's num_chars long
+// if return is positive, the first unused row of the bitmap
+// if return is negative, returns the negative of the number of characters that fit
+// if return is 0, no characters fit and no rows were used
+// This uses a very crappy packing.
+
+typedef struct
+{
+   float x0,y0,s0,t0; // top-left
+   float x1,y1,s1,t1; // bottom-right
+} stbtt_aligned_quad;
+
+STBTT_DEF void stbtt_GetBakedQuad(stbtt_bakedchar *chardata, int pw, int ph,  // same data as above
+                               int char_index,             // character to display
+                               float *xpos, float *ypos,   // pointers to current position in screen pixel space
+                               stbtt_aligned_quad *q,      // output: quad to draw
+                               int opengl_fillrule);       // true if opengl fill rule; false if DX9 or earlier
+// Call GetBakedQuad with char_index = 'character - first_char', and it
+// creates the quad you need to draw and advances the current position.
+//
+// The coordinate system used assumes y increases downwards.
+//
+// Characters will extend both above and below the current position;
+// see discussion of "BASELINE" above.
+//
+// It's inefficient; you might want to c&p it and optimize it.
+
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// NEW TEXTURE BAKING API
+//
+// This provides options for packing multiple fonts into one atlas, not
+// perfectly but better than nothing.
+
+typedef struct
+{
+   unsigned short x0,y0,x1,y1; // coordinates of bbox in bitmap
+   float xoff,yoff,xadvance;
+   float xoff2,yoff2;
+} stbtt_packedchar;
+
+typedef struct stbtt_pack_context stbtt_pack_context;
+typedef struct stbtt_fontinfo stbtt_fontinfo;
+#ifndef STB_RECT_PACK_VERSION
+typedef struct stbrp_rect stbrp_rect;
+#endif
+
+STBTT_DEF int  stbtt_PackBegin(stbtt_pack_context *spc, unsigned char *pixels, int width, int height, int stride_in_bytes, int padding, void *alloc_context);
+// Initializes a packing context stored in the passed-in stbtt_pack_context.
+// Future calls using this context will pack characters into the bitmap passed
+// in here: a 1-channel bitmap that is width x height. stride_in_bytes is
+// the distance from one row to the next (or 0 to mean they are packed tightly
+// together). "padding" is the amount of padding to leave between each
+// character (normally you want '1' for bitmaps you'll use as textures with
+// bilinear filtering).
+//
+// Returns 0 on failure, 1 on success.
+
+STBTT_DEF void stbtt_PackEnd  (stbtt_pack_context *spc);
+// Cleans up the packing context and frees all memory.
+
+#define STBTT_POINT_SIZE(x)   (-(x))
+
+STBTT_DEF int  stbtt_PackFontRange(stbtt_pack_context *spc, unsigned char *fontdata, int font_index, float font_size,
+                                int first_unicode_char_in_range, int num_chars_in_range, stbtt_packedchar *chardata_for_range);
+// Creates character bitmaps from the font_index'th font found in fontdata (use
+// font_index=0 if you don't know what that is). It creates num_chars_in_range
+// bitmaps for characters with unicode values starting at first_unicode_char_in_range
+// and increasing. Data for how to render them is stored in chardata_for_range;
+// pass these to stbtt_GetPackedQuad to get back renderable quads.
+//
+// font_size is the full height of the character from ascender to descender,
+// as computed by stbtt_ScaleForPixelHeight. To use a point size as computed
+// by stbtt_ScaleForMappingEmToPixels, wrap the point size in STBTT_POINT_SIZE()
+// and pass that result as 'font_size':
+//       ...,                  20 , ... // font max minus min y is 20 pixels tall
+//       ..., STBTT_POINT_SIZE(20), ... // 'M' is 20 pixels tall
+
+typedef struct
+{
+   float font_size;
+   int first_unicode_codepoint_in_range;  // if non-zero, then the chars are continuous, and this is the first codepoint
+   int *array_of_unicode_codepoints;       // if non-zero, then this is an array of unicode codepoints
+   int num_chars;
+   stbtt_packedchar *chardata_for_range; // output
+   unsigned char h_oversample, v_oversample; // don't set these, they're used internally
+} stbtt_pack_range;
+
+STBTT_DEF int  stbtt_PackFontRanges(stbtt_pack_context *spc, unsigned char *fontdata, int font_index, stbtt_pack_range *ranges, int num_ranges);
+// Creates character bitmaps from multiple ranges of characters stored in
+// ranges. This will usually create a better-packed bitmap than multiple
+// calls to stbtt_PackFontRange. Note that you can call this multiple
+// times within a single PackBegin/PackEnd.
+
+STBTT_DEF void stbtt_PackSetOversampling(stbtt_pack_context *spc, unsigned int h_oversample, unsigned int v_oversample);
+// Oversampling a font increases the quality by allowing higher-quality subpixel
+// positioning, and is especially valuable at smaller text sizes.
+//
+// This function sets the amount of oversampling for all following calls to
+// stbtt_PackFontRange(s) or stbtt_PackFontRangesGatherRects for a given
+// pack context. The default (no oversampling) is achieved by h_oversample=1
+// and v_oversample=1. The total number of pixels required is
+// h_oversample*v_oversample larger than the default; for example, 2x2
+// oversampling requires 4x the storage of 1x1. For best results, render
+// oversampled textures with bilinear filtering. Look at the readme in
+// stb/tests/oversample for information about oversampled fonts
+//
+// To use with PackFontRangesGather etc., you must set it before the
+// call to PackFontRangesGatherRects.
+
+STBTT_DEF void stbtt_GetPackedQuad(stbtt_packedchar *chardata, int pw, int ph,  // same data as above
+                               int char_index,             // character to display
+                               float *xpos, float *ypos,   // pointers to current position in screen pixel space
+                               stbtt_aligned_quad *q,      // output: quad to draw
+                               int align_to_integer);
+
+STBTT_DEF int  stbtt_PackFontRangesGatherRects(stbtt_pack_context *spc, stbtt_fontinfo *info, stbtt_pack_range *ranges, int num_ranges, stbrp_rect *rects);
+STBTT_DEF void stbtt_PackFontRangesPackRects(stbtt_pack_context *spc, stbrp_rect *rects, int num_rects);
+STBTT_DEF int  stbtt_PackFontRangesRenderIntoRects(stbtt_pack_context *spc, stbtt_fontinfo *info, stbtt_pack_range *ranges, int num_ranges, stbrp_rect *rects);
+// Calling these functions in sequence is roughly equivalent to calling
+// stbtt_PackFontRanges(). If you want more control over the packing of multiple
+// fonts, or if you want to pack custom data into a font texture, take a look
+// at the source of stbtt_PackFontRanges() and create a custom version
+// using these functions, e.g. call GatherRects multiple times,
+// building up a single array of rects, then call PackRects once,
+// then call RenderIntoRects repeatedly. This may result in a
+// better packing than calling PackFontRanges multiple times
+// (or it may not).
+
+// this is an opaque structure that you shouldn't mess with which holds
+// all the context needed from PackBegin to PackEnd.
+struct stbtt_pack_context {
+   void *user_allocator_context;
+   void *pack_info;
+   int   width;
+   int   height;
+   int   stride_in_bytes;
+   int   padding;
+   unsigned int   h_oversample, v_oversample;
+   unsigned char *pixels;
+   void  *nodes;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// FONT LOADING
+//
+//
+
+STBTT_DEF int stbtt_GetFontOffsetForIndex(const unsigned char *data, int index);
+// Each .ttf/.ttc file may have more than one font. Each font has a sequential
+// index number starting from 0. Call this function to get the font offset for
+// a given index; it returns -1 if the index is out of range. A regular .ttf
+// file will only define one font and it will always be at offset 0, so it will
+// return '0' for index 0, and -1 for all other indices. You can just skip
+// this step if you know it's that kind of font.
+
+
+// The following structure is defined publicly so you can declare one on
+// the stack or as a global or etc, but you should treat it as opaque.
+struct stbtt_fontinfo
+{
+   void           * userdata;
+   unsigned char  * data;              // pointer to .ttf file
+   int              fontstart;         // offset of start of font
+
+   int numGlyphs;                     // number of glyphs, needed for range checking
+
+   int loca,head,glyf,hhea,hmtx,kern; // table locations as offset from start of .ttf
+   int index_map;                     // a cmap mapping for our chosen character encoding
+   int indexToLocFormat;              // format needed to map from glyph index to glyph
+};
+
+STBTT_DEF int stbtt_InitFont(stbtt_fontinfo *info, const unsigned char *data, int offset);
+// Given an offset into the file that defines a font, this function builds
+// the necessary cached info for the rest of the system. You must allocate
+// the stbtt_fontinfo yourself, and stbtt_InitFont will fill it out. You don't
+// need to do anything special to free it, because the contents are pure
+// value data with no additional data structures. Returns 0 on failure.
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// CHARACTER TO GLYPH-INDEX CONVERSION
+
+STBTT_DEF int stbtt_FindGlyphIndex(const stbtt_fontinfo *info, int unicode_codepoint);
+// If you're going to perform multiple operations on the same character
+// and you want a speed-up, call this function with the character you're
+// going to process, then use glyph-based functions instead of the
+// codepoint-based functions.
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// CHARACTER PROPERTIES
+//
+
+STBTT_DEF float stbtt_ScaleForPixelHeight(const stbtt_fontinfo *info, float pixels);
+// computes a scale factor to produce a font whose "height" is 'pixels' tall.
+// Height is measured as the distance from the highest ascender to the lowest
+// descender; in other words, it's equivalent to calling stbtt_GetFontVMetrics
+// and computing:
+//       scale = pixels / (ascent - descent)
+// so if you prefer to measure height by the ascent only, use a similar calculation.
+
+STBTT_DEF float stbtt_ScaleForMappingEmToPixels(const stbtt_fontinfo *info, float pixels);
+// computes a scale factor to produce a font whose EM size is mapped to
+// 'pixels' tall. This is probably what traditional APIs compute, but
+// I'm not positive.
+
+STBTT_DEF void stbtt_GetFontVMetrics(const stbtt_fontinfo *info, int *ascent, int *descent, int *lineGap);
+// ascent is the coordinate above the baseline the font extends; descent
+// is the coordinate below the baseline the font extends (i.e. it is typically negative)
+// lineGap is the spacing between one row's descent and the next row's ascent...
+// so you should advance the vertical position by "*ascent - *descent + *lineGap"
+//   these are expressed in unscaled coordinates, so you must multiply by
+//   the scale factor for a given size
+
+STBTT_DEF void stbtt_GetFontBoundingBox(const stbtt_fontinfo *info, int *x0, int *y0, int *x1, int *y1);
+// the bounding box around all possible characters
+
+STBTT_DEF void stbtt_GetCodepointHMetrics(const stbtt_fontinfo *info, int codepoint, int *advanceWidth, int *leftSideBearing);
+// leftSideBearing is the offset from the current horizontal position to the left edge of the character
+// advanceWidth is the offset from the current horizontal position to the next horizontal position
+//   these are expressed in unscaled coordinates
+
+STBTT_DEF int  stbtt_GetCodepointKernAdvance(const stbtt_fontinfo *info, int ch1, int ch2);
+// an additional amount to add to the 'advance' value between ch1 and ch2
+
+STBTT_DEF int stbtt_GetCodepointBox(const stbtt_fontinfo *info, int codepoint, int *x0, int *y0, int *x1, int *y1);
+// Gets the bounding box of the visible part of the glyph, in unscaled coordinates
+
+STBTT_DEF void stbtt_GetGlyphHMetrics(const stbtt_fontinfo *info, int glyph_index, int *advanceWidth, int *leftSideBearing);
+STBTT_DEF int  stbtt_GetGlyphKernAdvance(const stbtt_fontinfo *info, int glyph1, int glyph2);
+STBTT_DEF int  stbtt_GetGlyphBox(const stbtt_fontinfo *info, int glyph_index, int *x0, int *y0, int *x1, int *y1);
+// as above, but takes one or more glyph indices for greater efficiency
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// GLYPH SHAPES (you probably don't need these, but they have to go before
+// the bitmaps for C declaration-order reasons)
+//
+
+#ifndef STBTT_vmove // you can predefine these to use different values (but why?)
+   enum {
+      STBTT_vmove=1,
+      STBTT_vline,
+      STBTT_vcurve
+   };
+#endif
+
+#ifndef stbtt_vertex // you can predefine this to use different values
+                   // (we share this with other code at RAD)
+   #define stbtt_vertex_type short // can't use stbtt_int16 because that's not visible in the header file
+   typedef struct
+   {
+      stbtt_vertex_type x,y,cx,cy;
+      unsigned char type,padding;
+   } stbtt_vertex;
+#endif
+
+STBTT_DEF int stbtt_IsGlyphEmpty(const stbtt_fontinfo *info, int glyph_index);
+// returns non-zero if nothing is drawn for this glyph
+
+STBTT_DEF int stbtt_GetCodepointShape(const stbtt_fontinfo *info, int unicode_codepoint, stbtt_vertex **vertices);
+STBTT_DEF int stbtt_GetGlyphShape(const stbtt_fontinfo *info, int glyph_index, stbtt_vertex **vertices);
+// returns # of vertices and fills *vertices with the pointer to them
+//   these are expressed in "unscaled" coordinates
+//
+// The shape is a series of contours. Each one starts with
+// a STBTT_vmove, then consists of a series of mixed
+// STBTT_vline and STBTT_vcurve segments. A vline
+// draws a line from previous endpoint to its x,y; a vcurve
+// draws a quadratic bezier from previous endpoint to
+// its x,y, using cx,cy as the bezier control point.
+
+STBTT_DEF void stbtt_FreeShape(const stbtt_fontinfo *info, stbtt_vertex *vertices);
+// frees the data allocated above
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// BITMAP RENDERING
+//
+
+STBTT_DEF void stbtt_FreeBitmap(unsigned char *bitmap, void *userdata);
+// frees the bitmap allocated below
+
+STBTT_DEF unsigned char *stbtt_GetCodepointBitmap(const stbtt_fontinfo *info, float scale_x, float scale_y, int codepoint, int *width, int *height, int *xoff, int *yoff);
+// allocates a large-enough single-channel 8bpp bitmap and renders the
+// specified character/glyph at the specified scale into it, with
+// antialiasing. 0 is no coverage (transparent), 255 is fully covered (opaque).
+// *width & *height are filled out with the width & height of the bitmap,
+// which is stored left-to-right, top-to-bottom.
+//
+// xoff/yoff are the offset in pixel space from the glyph origin to the top-left of the bitmap
+
+STBTT_DEF unsigned char *stbtt_GetCodepointBitmapSubpixel(const stbtt_fontinfo *info, float scale_x, float scale_y, float shift_x, float shift_y, int codepoint, int *width, int *height, int *xoff, int *yoff);
+// the same as stbtt_GetCodepointBitmap, but you can specify a subpixel
+// shift for the character
+
+STBTT_DEF void stbtt_MakeCodepointBitmap(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, int codepoint);
+// the same as stbtt_GetCodepointBitmap, but you pass in storage for the bitmap
+// in the form of 'output', with row spacing of 'out_stride' bytes. the bitmap
+// is clipped to out_w/out_h bytes. Call stbtt_GetCodepointBitmapBox to get the
+// width and height and positioning info for it first.
+
+STBTT_DEF void stbtt_MakeCodepointBitmapSubpixel(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, float shift_x, float shift_y, int codepoint);
+// same as stbtt_MakeCodepointBitmap, but you can specify a subpixel
+// shift for the character
+
+STBTT_DEF void stbtt_GetCodepointBitmapBox(const stbtt_fontinfo *font, int codepoint, float scale_x, float scale_y, int *ix0, int *iy0, int *ix1, int *iy1);
+// get the bbox of the bitmap centered around the glyph origin; so the
+// bitmap width is ix1-ix0, height is iy1-iy0, and location to place
+// the bitmap top left is (leftSideBearing*scale,iy0).
+// (Note that the bitmap uses y-increases-down, but the shape uses
+// y-increases-up, so CodepointBitmapBox and CodepointBox are inverted.)
+
+STBTT_DEF void stbtt_GetCodepointBitmapBoxSubpixel(const stbtt_fontinfo *font, int codepoint, float scale_x, float scale_y, float shift_x, float shift_y, int *ix0, int *iy0, int *ix1, int *iy1);
+// same as stbtt_GetCodepointBitmapBox, but you can specify a subpixel
+// shift for the character
+
+// the following functions are equivalent to the above functions, but operate
+// on glyph indices instead of Unicode codepoints (for efficiency)
+STBTT_DEF unsigned char *stbtt_GetGlyphBitmap(const stbtt_fontinfo *info, float scale_x, float scale_y, int glyph, int *width, int *height, int *xoff, int *yoff);
+STBTT_DEF unsigned char *stbtt_GetGlyphBitmapSubpixel(const stbtt_fontinfo *info, float scale_x, float scale_y, float shift_x, float shift_y, int glyph, int *width, int *height, int *xoff, int *yoff);
+STBTT_DEF void stbtt_MakeGlyphBitmap(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, int glyph);
+STBTT_DEF void stbtt_MakeGlyphBitmapSubpixel(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, float shift_x, float shift_y, int glyph);
+STBTT_DEF void stbtt_GetGlyphBitmapBox(const stbtt_fontinfo *font, int glyph, float scale_x, float scale_y, int *ix0, int *iy0, int *ix1, int *iy1);
+STBTT_DEF void stbtt_GetGlyphBitmapBoxSubpixel(const stbtt_fontinfo *font, int glyph, float scale_x, float scale_y,float shift_x, float shift_y, int *ix0, int *iy0, int *ix1, int *iy1);
+
+
+// @TODO: don't expose this structure
+typedef struct
+{
+   int w,h,stride;
+   unsigned char *pixels;
+} stbtt__bitmap;
+
+// rasterize a shape with quadratic beziers into a bitmap
+STBTT_DEF void stbtt_Rasterize(stbtt__bitmap *result,        // 1-channel bitmap to draw into
+                               float flatness_in_pixels,     // allowable error of curve in pixels
+                               stbtt_vertex *vertices,       // array of vertices defining shape
+                               int num_verts,                // number of vertices in above array
+                               float scale_x, float scale_y, // scale applied to input vertices
+                               float shift_x, float shift_y, // translation applied to input vertices
+                               int x_off, int y_off,         // another translation applied to input
+                               int invert,                   // if non-zero, vertically flip shape
+                               void *userdata);              // context for STBTT_malloc
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Finding the right font...
+//
+// You should really just solve this offline, keep your own tables
+// of what font is what, and don't try to get it out of the .ttf file.
+// That's because getting it out of the .ttf file is really hard, because
+// the names in the file can appear in many possible encodings, in many
+// possible languages, and e.g. if you need a case-insensitive comparison,
+// the details of that depend on the encoding & language in a complex way
+// (actually underspecified in truetype, but also gigantic).
+//
+// But you can use the provided functions in two possible ways:
+//     stbtt_FindMatchingFont() will use *case-sensitive* comparisons on
+//             unicode-encoded names to try to find the font you want;
+//             you can run this before calling stbtt_InitFont()
+//
+//     stbtt_GetFontNameString() lets you get any of the various strings
+//             from the file yourself and do your own comparisons on them.
+//             You have to have called stbtt_InitFont() first.
+
+
+STBTT_DEF int stbtt_FindMatchingFont(const unsigned char *fontdata, const char *name, int flags);
+// returns the offset (not index) of the font that matches, or -1 if none
+//   if you use STBTT_MACSTYLE_DONTCARE, use a font name like "Arial Bold".
+//   if you use any other flag, use a font name like "Arial"; this checks
+//     the 'macStyle' header field; i don't know if fonts set this consistently
+#define STBTT_MACSTYLE_DONTCARE     0
+#define STBTT_MACSTYLE_BOLD         1
+#define STBTT_MACSTYLE_ITALIC       2
+#define STBTT_MACSTYLE_UNDERSCORE   4
+#define STBTT_MACSTYLE_NONE         8   // <= not same as 0, this makes us check the bitfield is 0
+
+STBTT_DEF int stbtt_CompareUTF8toUTF16_bigendian(const char *s1, int len1, const char *s2, int len2);
+// returns 1/0 whether the first string interpreted as utf8 is identical to
+// the second string interpreted as big-endian utf16... useful for strings from next func
+
+STBTT_DEF const char *stbtt_GetFontNameString(const stbtt_fontinfo *font, int *length, int platformID, int encodingID, int languageID, int nameID);
+// returns the string (which may be big-endian double byte, e.g. for unicode)
+// and puts the length in bytes in *length.
+//
+// some of the values for the IDs are below; for more see the truetype spec:
+//     http://developer.apple.com/textfonts/TTRefMan/RM06/Chap6name.html
+//     http://www.microsoft.com/typography/otspec/name.htm
+
+enum { // platformID
+   STBTT_PLATFORM_ID_UNICODE   =0,
+   STBTT_PLATFORM_ID_MAC       =1,
+   STBTT_PLATFORM_ID_ISO       =2,
+   STBTT_PLATFORM_ID_MICROSOFT =3
+};
+
+enum { // encodingID for STBTT_PLATFORM_ID_UNICODE
+   STBTT_UNICODE_EID_UNICODE_1_0    =0,
+   STBTT_UNICODE_EID_UNICODE_1_1    =1,
+   STBTT_UNICODE_EID_ISO_10646      =2,
+   STBTT_UNICODE_EID_UNICODE_2_0_BMP=3,
+   STBTT_UNICODE_EID_UNICODE_2_0_FULL=4
+};
+
+enum { // encodingID for STBTT_PLATFORM_ID_MICROSOFT
+   STBTT_MS_EID_SYMBOL        =0,
+   STBTT_MS_EID_UNICODE_BMP   =1,
+   STBTT_MS_EID_SHIFTJIS      =2,
+   STBTT_MS_EID_UNICODE_FULL  =10
+};
+
+enum { // encodingID for STBTT_PLATFORM_ID_MAC; same as Script Manager codes
+   STBTT_MAC_EID_ROMAN        =0,   STBTT_MAC_EID_ARABIC       =4,
+   STBTT_MAC_EID_JAPANESE     =1,   STBTT_MAC_EID_HEBREW       =5,
+   STBTT_MAC_EID_CHINESE_TRAD =2,   STBTT_MAC_EID_GREEK        =6,
+   STBTT_MAC_EID_KOREAN       =3,   STBTT_MAC_EID_RUSSIAN      =7
+};
+
+enum { // languageID for STBTT_PLATFORM_ID_MICROSOFT; same as LCID...
+       // problematic because there are e.g. 16 english LCIDs and 16 arabic LCIDs
+   STBTT_MS_LANG_ENGLISH     =0x0409,   STBTT_MS_LANG_ITALIAN     =0x0410,
+   STBTT_MS_LANG_CHINESE     =0x0804,   STBTT_MS_LANG_JAPANESE    =0x0411,
+   STBTT_MS_LANG_DUTCH       =0x0413,   STBTT_MS_LANG_KOREAN      =0x0412,
+   STBTT_MS_LANG_FRENCH      =0x040c,   STBTT_MS_LANG_RUSSIAN     =0x0419,
+   STBTT_MS_LANG_GERMAN      =0x0407,   STBTT_MS_LANG_SPANISH     =0x040a,
+   STBTT_MS_LANG_HEBREW      =0x040d,   STBTT_MS_LANG_SWEDISH     =0x041D
+};
+
+enum { // languageID for STBTT_PLATFORM_ID_MAC (two-column layout; values are not contiguous)
+   STBTT_MAC_LANG_ENGLISH      =0 ,   STBTT_MAC_LANG_JAPANESE     =11,
+   STBTT_MAC_LANG_ARABIC       =12,   STBTT_MAC_LANG_KOREAN       =23,
+   STBTT_MAC_LANG_DUTCH        =4 ,   STBTT_MAC_LANG_RUSSIAN      =32,
+   STBTT_MAC_LANG_FRENCH       =1 ,   STBTT_MAC_LANG_SPANISH      =6 ,
+   STBTT_MAC_LANG_GERMAN       =2 ,   STBTT_MAC_LANG_SWEDISH      =5 ,
+   STBTT_MAC_LANG_HEBREW       =10,   STBTT_MAC_LANG_CHINESE_SIMPLIFIED =33,
+   STBTT_MAC_LANG_ITALIAN      =3 ,   STBTT_MAC_LANG_CHINESE_TRAD =19
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __STB_INCLUDE_STB_TRUETYPE_H__
+
+///////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+////
+////   IMPLEMENTATION
+////
+////
+
+#ifdef STB_TRUETYPE_IMPLEMENTATION
+
+#ifndef STBTT_MAX_OVERSAMPLE // the includer may pre-define this; 8 is the default
+#define STBTT_MAX_OVERSAMPLE   8
+#endif
+
+#if STBTT_MAX_OVERSAMPLE > 255
+#error "STBTT_MAX_OVERSAMPLE cannot be > 255"
+#endif
+
+typedef int stbtt__test_oversample_pow2[(STBTT_MAX_OVERSAMPLE & (STBTT_MAX_OVERSAMPLE-1)) == 0 ? 1 : -1]; // compile-time check: the array size becomes -1 (an error) unless the value has at most one bit set
+
+#ifndef STBTT_RASTERIZER_VERSION // selects which stbtt__active_edge layout and stbtt__new_active are compiled; only 1 and 2 are recognized
+#define STBTT_RASTERIZER_VERSION 2
+#endif
+
+#ifdef _MSC_VER // both forms reference v without producing a value; the sizeof form does not evaluate it
+#define STBTT__NOTUSED(v)  (void)(v)
+#else
+#define STBTT__NOTUSED(v)  (void)sizeof(v)
+#endif
+
+//////////////////////////////////////////////////////////////////////////
+//
+// accessors to parse data from file
+//
+
+// on platforms that don't allow misaligned reads, if we want to allow
+// truetype fonts that aren't padded to alignment, define ALLOW_UNALIGNED_TRUETYPE
+
+#define ttBYTE(p)     (* (stbtt_uint8 *) (p)) // the byte at p, read as stbtt_uint8
+#define ttCHAR(p)     (* (stbtt_int8 *) (p)) // the byte at p, read as stbtt_int8
+#define ttFixed(p)    ttLONG(p) // read exactly like ttLONG; no scaling is applied here
+
+#if defined(STB_TRUETYPE_BIGENDIAN) && !defined(ALLOW_UNALIGNED_TRUETYPE)
+
+   // big-endian build without unaligned support requested: read each
+   // multi-byte field in place through a cast pointer
+   #define ttUSHORT(p)   (* (stbtt_uint16 *) (p))
+   #define ttSHORT(p)    (* (stbtt_int16 *) (p))
+   #define ttULONG(p)    (* (stbtt_uint32 *) (p))
+   #define ttLONG(p)     (* (stbtt_int32 *) (p))
+
+#else
+
+   // otherwise assemble each field one byte at a time, most significant byte
+   // first. In the 32-bit readers every byte is widened to stbtt_uint32 before
+   // it is shifted: p[0] promotes to (signed) int, and shifting a value >= 0x80
+   // left by 24 overflows a signed int, which is undefined behavior.
+   static stbtt_uint16 ttUSHORT(const stbtt_uint8 *p) { return (stbtt_uint16) (p[0]*256 + p[1]); }
+   static stbtt_int16 ttSHORT(const stbtt_uint8 *p)   { return (stbtt_int16) (p[0]*256 + p[1]); }
+   static stbtt_uint32 ttULONG(const stbtt_uint8 *p)  { return ((stbtt_uint32) p[0]<<24) + ((stbtt_uint32) p[1]<<16) + ((stbtt_uint32) p[2]<<8) + (stbtt_uint32) p[3]; }
+   // same bit pattern as ttULONG, reinterpreted as signed
+   static stbtt_int32 ttLONG(const stbtt_uint8 *p)    { return (stbtt_int32) ttULONG(p); }
+
+#endif
+
+#define stbtt_tag4(p,c0,c1,c2,c3) ((p)[0] == (c0) && (p)[1] == (c1) && (p)[2] == (c2) && (p)[3] == (c3)) // nonzero when the four bytes at p equal c0..c3, in order
+#define stbtt_tag(p,str)           stbtt_tag4(p,str[0],str[1],str[2],str[3]) // same test against the first four chars of str; str is not parenthesized, so pass a plain identifier or string literal
+
+// Returns 1 when the four bytes at `font` are one of the recognised single-font
+// version tags, 0 otherwise. Used by stbtt_GetFontOffsetForIndex to tell a
+// plain font file from a collection.
+static int stbtt__isfont(const stbtt_uint8 *font)
+{
+   // check the version number
+   if (stbtt_tag4(font, '1',0,0,0))  return 1; // TrueType 1
+   if (stbtt_tag(font, "typ1"))   return 1; // TrueType with type 1 font -- we don't support this!
+   if (stbtt_tag(font, "OTTO"))   return 1; // OpenType with CFF
+   if (stbtt_tag4(font, 0,1,0,0)) return 1; // OpenType 1.0
+   if (stbtt_tag(font, "true"))   return 1; // Apple's scaler type for TrueType fonts
+   return 0;
+}
+
+// Scans the table directory of the font that starts at `fontstart` and returns
+// the 32-bit value stored 8 bytes into the first 16-byte record whose leading
+// four bytes match `tag`. Returns 0 when no record matches.
+// @OPTIMIZE: binary search
+static stbtt_uint32 stbtt__find_table(stbtt_uint8 *data, stbtt_uint32 fontstart, const char *tag)
+{
+   stbtt_int32 table_count = ttUSHORT(data+fontstart+4);
+   stbtt_uint32 directory = fontstart + 12;
+   stbtt_int32 t = 0;
+
+   while (t < table_count) {
+      stbtt_uint32 entry = directory + 16*t;
+      stbtt_uint8 *record = data + entry;
+      if (stbtt_tag(record, tag))
+         return ttULONG(record+8);
+      ++t;
+   }
+   return 0;
+}
+
+// Returns the byte offset of font number `index` inside `font_collection`.
+// A plain (single) font only has index 0, at offset 0. For a 'ttcf' collection
+// of version 1 or 2 the offset is read from the collection's offset table.
+// Returns -1 when the index is out of range or the data is not recognised.
+STBTT_DEF int stbtt_GetFontOffsetForIndex(const unsigned char *font_collection, int index)
+{
+   // a negative index is never valid; rejecting it up front keeps the
+   // offset-table read below from reaching before the start of the table
+   if (index < 0)
+      return -1;
+
+   // if it's just a font, there's only one valid index
+   if (stbtt__isfont(font_collection))
+      return index == 0 ? 0 : -1;
+
+   // check if it's a TTC
+   if (stbtt_tag(font_collection, "ttcf")) {
+      // version 1 or 2?
+      if (ttULONG(font_collection+4) == 0x00010000 || ttULONG(font_collection+4) == 0x00020000) {
+         stbtt_int32 n = ttLONG(font_collection+8);
+         if (index >= n)
+            return -1;
+         return ttULONG(font_collection+12+index*4);
+      }
+   }
+   return -1;
+}
+
+// Prepares `info` for the font that begins at byte `fontstart` of `data2`:
+// records the offsets of the tables used later, the glyph count, the cmap
+// subtable to map codepoints with, and the loca index format.
+// Returns 1 on success, 0 when a required table or a usable cmap encoding
+// record is missing.
+STBTT_DEF int stbtt_InitFont(stbtt_fontinfo *info, const unsigned char *data2, int fontstart)
+{
+   stbtt_uint8 *data = (stbtt_uint8 *) data2;
+   stbtt_uint32 cmap, maxp;
+   stbtt_int32 rec, record_count;
+
+   info->data = data;
+   info->fontstart = fontstart;
+
+   // look up every table we use; all but kern must be present
+   cmap = stbtt__find_table(data, fontstart, "cmap");
+   info->loca = stbtt__find_table(data, fontstart, "loca");
+   info->head = stbtt__find_table(data, fontstart, "head");
+   info->glyf = stbtt__find_table(data, fontstart, "glyf");
+   info->hhea = stbtt__find_table(data, fontstart, "hhea");
+   info->hmtx = stbtt__find_table(data, fontstart, "hmtx");
+   info->kern = stbtt__find_table(data, fontstart, "kern");
+   if (!(cmap && info->loca && info->head && info->glyf && info->hhea && info->hmtx))
+      return 0;
+
+   // without a maxp table, fall back to 0xffff as the glyph count
+   maxp = stbtt__find_table(data, fontstart, "maxp");
+   info->numGlyphs = maxp ? ttUSHORT(data+maxp+4) : 0xffff;
+
+   // pick the cmap subtable once, up front, so per-glyph lookups don't have
+   // to search for it. Every record is visited, so the last acceptable record
+   // is the one that is kept.
+   info->index_map = 0;
+   record_count = ttUSHORT(data + cmap + 2);
+   for (rec = 0; rec < record_count; ++rec) {
+      stbtt_uint32 encoding_record = cmap + 4 + 8 * rec;
+      stbtt_uint16 platform = ttUSHORT(data+encoding_record);
+
+      if (platform == STBTT_PLATFORM_ID_MICROSOFT) {
+         // only the MS unicode encodings are understood
+         stbtt_uint16 encoding = ttUSHORT(data+encoding_record+2);
+         if (encoding == STBTT_MS_EID_UNICODE_BMP || encoding == STBTT_MS_EID_UNICODE_FULL)
+            info->index_map = cmap + ttULONG(data+encoding_record+4);
+      } else if (platform == STBTT_PLATFORM_ID_UNICODE) {
+         // Mac/iOS fonts have these; every encodingID is unicode, so it is not checked
+         info->index_map = cmap + ttULONG(data+encoding_record+4);
+      }
+   }
+   if (info->index_map == 0)
+      return 0;
+
+   info->indexToLocFormat = ttUSHORT(data+info->head + 50);
+   return 1;
+}
+
+// Maps `unicode_codepoint` to a glyph index using the cmap subtable chosen by
+// stbtt_InitFont (info->index_map). Handles subtable formats 0, 4, 6, 12 and
+// 13; format 2 and any other format trip STBTT_assert and yield 0.
+// Returns 0 when the codepoint is not covered by the subtable.
+STBTT_DEF int stbtt_FindGlyphIndex(const stbtt_fontinfo *info, int unicode_codepoint)
+{
+   stbtt_uint8 *data = info->data;
+   stbtt_uint32 index_map = info->index_map;
+
+   stbtt_uint16 format = ttUSHORT(data + index_map + 0);
+   if (format == 0) { // apple byte encoding
+      stbtt_int32 bytes = ttUSHORT(data + index_map + 2);
+      // the lower bound keeps a negative codepoint from indexing before the glyph array
+      if (unicode_codepoint >= 0 && unicode_codepoint < bytes-6)
+         return ttBYTE(data + index_map + 6 + unicode_codepoint);
+      return 0;
+   } else if (format == 6) {
+      stbtt_uint32 first = ttUSHORT(data + index_map + 6);
+      stbtt_uint32 count = ttUSHORT(data + index_map + 8);
+      if ((stbtt_uint32) unicode_codepoint >= first && (stbtt_uint32) unicode_codepoint < first+count)
+         return ttUSHORT(data + index_map + 10 + (unicode_codepoint - first)*2);
+      return 0;
+   } else if (format == 2) {
+      STBTT_assert(0); // @TODO: high-byte mapping for japanese/chinese/korean
+      return 0;
+   } else if (format == 4) { // standard mapping for windows fonts: binary search collection of ranges
+      stbtt_uint16 segcount = ttUSHORT(data+index_map+6) >> 1;
+      stbtt_uint16 searchRange = ttUSHORT(data+index_map+8) >> 1;
+      stbtt_uint16 entrySelector = ttUSHORT(data+index_map+10);
+      stbtt_uint16 rangeShift = ttUSHORT(data+index_map+12) >> 1;
+
+      // do a binary search of the segments
+      stbtt_uint32 endCount = index_map + 14;
+      stbtt_uint32 search = endCount;
+
+      if (unicode_codepoint > 0xffff)
+         return 0;
+
+      // they lie from endCount .. endCount + segCount
+      // but searchRange is the nearest power of two, so...
+      if (unicode_codepoint >= ttUSHORT(data + search + rangeShift*2))
+         search += rangeShift*2;
+
+      // now decrement to bias correctly to find smallest
+      search -= 2;
+      while (entrySelector) {
+         stbtt_uint16 end;
+         searchRange >>= 1;
+         end = ttUSHORT(data + search + searchRange*2);
+         if (unicode_codepoint > end)
+            search += searchRange*2;
+         --entrySelector;
+      }
+      search += 2;
+
+      {
+         stbtt_uint16 offset, start, last;
+         stbtt_uint16 item = (stbtt_uint16) ((search - endCount) >> 1);
+
+         // the search parameters come from the font file, so the segment we
+         // landed on is not guaranteed to contain the codepoint; verify both
+         // ends instead of asserting on untrusted data
+         start = ttUSHORT(data + index_map + 14 + segcount*2 + 2 + 2*item);
+         last = ttUSHORT(data + endCount + 2*item);
+         if (unicode_codepoint < start || unicode_codepoint > last)
+            return 0;
+
+         offset = ttUSHORT(data + index_map + 14 + segcount*6 + 2 + 2*item);
+         if (offset == 0) // no glyph array: glyph = codepoint + the segment's signed delta, truncated to 16 bits
+            return (stbtt_uint16) (unicode_codepoint + ttSHORT(data + index_map + 14 + segcount*4 + 2 + 2*item));
+
+         // otherwise `offset` is relative to the location of the offset entry itself
+         return ttUSHORT(data + offset + (unicode_codepoint-start)*2 + index_map + 14 + segcount*6 + 2 + 2*item);
+      }
+   } else if (format == 12 || format == 13) {
+      stbtt_uint32 ngroups = ttULONG(data+index_map+12);
+      stbtt_int32 low,high;
+      low = 0; high = (stbtt_int32)ngroups;
+      // Binary search the right group.
+      while (low < high) {
+         stbtt_int32 mid = low + ((high-low) >> 1); // rounds down, so low <= mid < high
+         stbtt_uint32 start_char = ttULONG(data+index_map+16+mid*12);
+         stbtt_uint32 end_char = ttULONG(data+index_map+16+mid*12+4);
+         if ((stbtt_uint32) unicode_codepoint < start_char)
+            high = mid;
+         else if ((stbtt_uint32) unicode_codepoint > end_char)
+            low = mid+1;
+         else {
+            stbtt_uint32 start_glyph = ttULONG(data+index_map+16+mid*12+8);
+            if (format == 12)
+               return start_glyph + unicode_codepoint-start_char;
+            else // format == 13
+               return start_glyph;
+         }
+      }
+      return 0; // not found
+   }
+   // @TODO
+   STBTT_assert(0);
+   return 0;
+}
+
+// Convenience wrapper: resolves `unicode_codepoint` to a glyph index and
+// forwards to stbtt_GetGlyphShape, returning its result unchanged.
+STBTT_DEF int stbtt_GetCodepointShape(const stbtt_fontinfo *info, int unicode_codepoint, stbtt_vertex **vertices)
+{
+   int glyph = stbtt_FindGlyphIndex(info, unicode_codepoint);
+   return stbtt_GetGlyphShape(info, glyph, vertices);
+}
+
+// Fills in one vertex: the kind tag, the end point (x,y) and the control point
+// (cx,cy). Coordinates are narrowed to stbtt_int16 on store.
+static void stbtt_setvertex(stbtt_vertex *v, stbtt_uint8 type, stbtt_int32 x, stbtt_int32 y, stbtt_int32 cx, stbtt_int32 cy)
+{
+   // control point
+   v->cx = (stbtt_int16) cx;
+   v->cy = (stbtt_int16) cy;
+   // end point
+   v->x = (stbtt_int16) x;
+   v->y = (stbtt_int16) y;
+   v->type = type;
+}
+
+// Returns the offset of glyph `glyph_index`'s data, computed as info->glyf
+// plus the glyph's loca entry, or -1 when the index is too large, the loca
+// format is unknown, or the glyph has zero length (this entry equals the next).
+static int stbtt__GetGlyfOffset(const stbtt_fontinfo *info, int glyph_index)
+{
+   stbtt_uint8 *entry;
+   int begin, end;
+
+   // glyph index out of range, or unknown index->glyph map format
+   if (glyph_index >= info->numGlyphs || info->indexToLocFormat >= 2)
+      return -1;
+
+   if (info->indexToLocFormat == 0) {
+      // short format: 16-bit entries holding half the byte offset
+      entry = info->data + info->loca + glyph_index * 2;
+      begin = info->glyf + ttUSHORT(entry) * 2;
+      end   = info->glyf + ttUSHORT(entry + 2) * 2;
+   } else {
+      // long format: 32-bit entries holding the byte offset
+      entry = info->data + info->loca + glyph_index * 4;
+      begin = info->glyf + ttULONG (entry);
+      end   = info->glyf + ttULONG (entry + 4);
+   }
+
+   if (begin == end)
+      return -1; // length is 0
+   return begin;
+}
+
+// Reads the four signed 16-bit box values stored at bytes 2..9 of the glyph's
+// data into whichever of x0,y0,x1,y1 are non-NULL. Returns 1 on success and 0
+// (leaving the outputs untouched) when stbtt__GetGlyfOffset reports no data.
+STBTT_DEF int stbtt_GetGlyphBox(const stbtt_fontinfo *info, int glyph_index, int *x0, int *y0, int *x1, int *y1)
+{
+   stbtt_uint8 *header;
+   int offset = stbtt__GetGlyfOffset(info, glyph_index);
+
+   if (offset < 0)
+      return 0;
+
+   header = info->data + offset;
+   if (x0) *x0 = ttSHORT(header + 2);
+   if (y0) *y0 = ttSHORT(header + 4);
+   if (x1) *x1 = ttSHORT(header + 6);
+   if (y1) *y1 = ttSHORT(header + 8);
+   return 1;
+}
+
+// Convenience wrapper: resolves `codepoint` to a glyph index and forwards to
+// stbtt_GetGlyphBox, returning its result unchanged.
+STBTT_DEF int stbtt_GetCodepointBox(const stbtt_fontinfo *info, int codepoint, int *x0, int *y0, int *x1, int *y1)
+{
+   int glyph = stbtt_FindGlyphIndex(info,codepoint);
+   return stbtt_GetGlyphBox(info, glyph, x0,y0,x1,y1);
+}
+
+// Returns nonzero when the glyph has no data at all (stbtt__GetGlyfOffset
+// fails) or when its contour count, the first 16-bit field of its data, is 0.
+STBTT_DEF int stbtt_IsGlyphEmpty(const stbtt_fontinfo *info, int glyph_index)
+{
+   int offset = stbtt__GetGlyfOffset(info, glyph_index);
+
+   if (offset < 0)
+      return 1;
+   return ttSHORT(info->data + offset) == 0;
+}
+
+// Appends the vertices that close the current contour back to its start point
+// (sx,sy) and returns the updated vertex count.
+//   was_off   - the last point seen was an off-curve control point (cx,cy)
+//   start_off - the contour began with an off-curve control point (scx,scy)
+static int stbtt__close_shape(stbtt_vertex *vertices, int num_vertices, int was_off, int start_off,
+    stbtt_int32 sx, stbtt_int32 sy, stbtt_int32 scx, stbtt_int32 scy, stbtt_int32 cx, stbtt_int32 cy)
+{
+   if (!start_off) {
+      // the contour started on-curve: one segment back to the start suffices
+      if (was_off)
+         stbtt_setvertex(&vertices[num_vertices], STBTT_vcurve,sx,sy,cx,cy);
+      else
+         stbtt_setvertex(&vertices[num_vertices], STBTT_vline,sx,sy,0,0);
+      return num_vertices + 1;
+   }
+
+   // the contour started off-curve: if we also ended off-curve, first emit a
+   // curve to the midpoint of the two control points
+   if (was_off) {
+      stbtt_setvertex(&vertices[num_vertices], STBTT_vcurve, (cx+scx)>>1, (cy+scy)>>1, cx,cy);
+      ++num_vertices;
+   }
+   // then curve to the start point using the saved starting control point
+   stbtt_setvertex(&vertices[num_vertices], STBTT_vcurve, sx,sy,scx,scy);
+   ++num_vertices;
+   return num_vertices;
+}
+
+STBTT_DEF int stbtt_GetGlyphShape(const stbtt_fontinfo *info, int glyph_index, stbtt_vertex **pvertices)
+{ /* Decodes the outline of glyph `glyph_index` into an array of stbtt_vertex
+   * obtained from STBTT_malloc (with info->userdata), stores the array in
+   * *pvertices and returns the number of vertices in it.
+   * Returns 0 with *pvertices == NULL when the glyph has no data, when an
+   * allocation fails, or when the contour count is 0.
+   * A positive contour count is decoded directly; a count of -1 is a compound
+   * glyph whose components are fetched by recursive calls, transformed and
+   * concatenated; any other negative count trips STBTT_assert.
+   */
+   stbtt_int16 numberOfContours;
+   stbtt_uint8 *endPtsOfContours;
+   stbtt_uint8 *data = info->data;
+   stbtt_vertex *vertices=0;
+   int num_vertices=0;
+   int g = stbtt__GetGlyfOffset(info, glyph_index); // offset of the glyph's data, or -1
+
+   *pvertices = NULL;
+
+   if (g < 0) return 0;
+
+   numberOfContours = ttSHORT(data + g);
+
+   if (numberOfContours > 0) {
+      stbtt_uint8 flags=0,flagcount;
+      stbtt_int32 ins, i,j=0,m,n, next_move, was_off=0, off, start_off=0;
+      stbtt_int32 x,y,cx,cy,sx,sy, scx,scy;
+      stbtt_uint8 *points;
+      endPtsOfContours = (data + g + 10);
+      ins = ttUSHORT(data + g + 10 + numberOfContours * 2); // number of bytes skipped between this field and the point data
+      points = data + g + 10 + numberOfContours * 2 + 2 + ins;
+
+      n = 1+ttUSHORT(endPtsOfContours + numberOfContours*2-2); // last contour's end index + 1 = total point count
+
+      m = n + 2*numberOfContours;  // a loose bound on how many vertices we might need
+      vertices = (stbtt_vertex *) STBTT_malloc(m * sizeof(vertices[0]), info->userdata);
+      if (vertices == 0)
+         return 0;
+
+      next_move = 0;
+      flagcount=0;
+
+      // in the first pass, we load the uninterpreted data into the array allocated
+      // above, shifted to the end of the array so we won't overwrite it when
+      // we create our final data starting from the front
+
+      off = m - n; // starting offset for uninterpreted data, regardless of how m ends up being calculated
+
+      // first load flags; a flag byte with bit 8 set is followed by a repeat count
+
+      for (i=0; i < n; ++i) {
+         if (flagcount == 0) {
+            flags = *points++;
+            if (flags & 8)
+               flagcount = *points++;
+         } else
+            --flagcount;
+         vertices[off+i].type = flags;
+      }
+
+      // now load x coordinates (each is a delta accumulated into x)
+      x=0;
+      for (i=0; i < n; ++i) {
+         flags = vertices[off+i].type;
+         if (flags & 2) {
+            stbtt_int16 dx = *points++;
+            x += (flags & 16) ? dx : -dx; // one-byte delta: bit 16 set means add it, clear means subtract it
+         } else {
+            if (!(flags & 16)) { // two-byte signed delta; with bit 16 set there is no data and x is unchanged
+               x = x + (stbtt_int16) (points[0]*256 + points[1]);
+               points += 2;
+            }
+         }
+         vertices[off+i].x = (stbtt_int16) x;
+      }
+
+      // now load y coordinates (same scheme as x, using bits 4 and 32)
+      y=0;
+      for (i=0; i < n; ++i) {
+         flags = vertices[off+i].type;
+         if (flags & 4) {
+            stbtt_int16 dy = *points++;
+            y += (flags & 32) ? dy : -dy; // one-byte delta: bit 32 set means add it, clear means subtract it
+         } else {
+            if (!(flags & 32)) {
+               y = y + (stbtt_int16) (points[0]*256 + points[1]);
+               points += 2;
+            }
+         }
+         vertices[off+i].y = (stbtt_int16) y;
+      }
+
+      // now convert them to our format, writing from the front of the array
+      num_vertices=0;
+      sx = sy = cx = cy = scx = scy = 0;
+      for (i=0; i < n; ++i) {
+         flags = vertices[off+i].type;
+         x     = (stbtt_int16) vertices[off+i].x;
+         y     = (stbtt_int16) vertices[off+i].y;
+
+         if (next_move == i) { // point i is the first point of a new contour
+            if (i != 0)
+               num_vertices = stbtt__close_shape(vertices, num_vertices, was_off, start_off, sx,sy,scx,scy,cx,cy);
+
+            // now start the new one
+            start_off = !(flags & 1); // bit 1 clear means the point is off-curve
+            if (start_off) {
+               // if we start off with an off-curve point, then we need to find a point on the curve
+               // where we can start, and we need to save some state for when we wrap around.
+               scx = x;
+               scy = y;
+               if (!(vertices[off+i+1].type & 1)) {
+                  // next point is also a curve point, so interpolate an on-point curve
+                  sx = (x + (stbtt_int32) vertices[off+i+1].x) >> 1;
+                  sy = (y + (stbtt_int32) vertices[off+i+1].y) >> 1;
+               } else {
+                  // otherwise just use the next point as our start point
+                  sx = (stbtt_int32) vertices[off+i+1].x;
+                  sy = (stbtt_int32) vertices[off+i+1].y;
+                  ++i; // we're using point i+1 as the starting point, so skip it
+               }
+            } else {
+               sx = x;
+               sy = y;
+            }
+            stbtt_setvertex(&vertices[num_vertices++], STBTT_vmove,sx,sy,0,0);
+            was_off = 0;
+            next_move = 1 + ttUSHORT(endPtsOfContours+j*2); // index of the first point of the following contour
+            ++j;
+         } else {
+            if (!(flags & 1)) { // if it's a curve
+               if (was_off) // two off-curve control points in a row means interpolate an on-curve midpoint
+                  stbtt_setvertex(&vertices[num_vertices++], STBTT_vcurve, (cx+x)>>1, (cy+y)>>1, cx, cy);
+               cx = x;
+               cy = y;
+               was_off = 1;
+            } else {
+               if (was_off)
+                  stbtt_setvertex(&vertices[num_vertices++], STBTT_vcurve, x,y, cx, cy);
+               else
+                  stbtt_setvertex(&vertices[num_vertices++], STBTT_vline, x,y,0,0);
+               was_off = 0;
+            }
+         }
+      }
+      num_vertices = stbtt__close_shape(vertices, num_vertices, was_off, start_off, sx,sy,scx,scy,cx,cy);
+   } else if (numberOfContours == -1) {
+      // Compound shapes: a list of component records starting 10 bytes into the glyph data.
+      int more = 1;
+      stbtt_uint8 *comp = data + g + 10;
+      num_vertices = 0;
+      vertices = 0;
+      while (more) {
+         stbtt_uint16 flags, gidx;
+         int comp_num_verts = 0, i;
+         stbtt_vertex *comp_verts = 0, *tmp = 0;
+         float mtx[6] = {1,0,0,1,0,0}, m, n; // starts as the identity 2x2 (mtx[0..3]) with zero translation (mtx[4..5])
+
+         flags = ttSHORT(comp); comp+=2;
+         gidx = ttSHORT(comp); comp+=2;
+
+         if (flags & 2) { // XY values
+            if (flags & 1) { // shorts
+               mtx[4] = ttSHORT(comp); comp+=2;
+               mtx[5] = ttSHORT(comp); comp+=2;
+            } else {
+               mtx[4] = ttCHAR(comp); comp+=1;
+               mtx[5] = ttCHAR(comp); comp+=1;
+            }
+         }
+         else {
+            // @TODO handle matching point
+            STBTT_assert(0);
+         }
+         if (flags & (1<<3)) { // WE_HAVE_A_SCALE
+            mtx[0] = mtx[3] = ttSHORT(comp)/16384.0f; comp+=2;
+            mtx[1] = mtx[2] = 0;
+         } else if (flags & (1<<6)) { // WE_HAVE_AN_X_AND_YSCALE
+            mtx[0] = ttSHORT(comp)/16384.0f; comp+=2;
+            mtx[1] = mtx[2] = 0;
+            mtx[3] = ttSHORT(comp)/16384.0f; comp+=2;
+         } else if (flags & (1<<7)) { // WE_HAVE_A_TWO_BY_TWO
+            mtx[0] = ttSHORT(comp)/16384.0f; comp+=2;
+            mtx[1] = ttSHORT(comp)/16384.0f; comp+=2;
+            mtx[2] = ttSHORT(comp)/16384.0f; comp+=2;
+            mtx[3] = ttSHORT(comp)/16384.0f; comp+=2;
+         }
+
+         // Find transformation scales (lengths of the two matrix columns).
+         m = (float) STBTT_sqrt(mtx[0]*mtx[0] + mtx[1]*mtx[1]);
+         n = (float) STBTT_sqrt(mtx[2]*mtx[2] + mtx[3]*mtx[3]);
+
+         // Get indexed glyph (recursive call; the result is owned here and freed below).
+         comp_num_verts = stbtt_GetGlyphShape(info, gidx, &comp_verts);
+         if (comp_num_verts > 0) {
+            // Transform vertices in place, both end points and control points.
+            for (i = 0; i < comp_num_verts; ++i) {
+               stbtt_vertex* v = &comp_verts[i];
+               stbtt_vertex_type x,y;
+               x=v->x; y=v->y;
+               v->x = (stbtt_vertex_type)(m * (mtx[0]*x + mtx[2]*y + mtx[4]));
+               v->y = (stbtt_vertex_type)(n * (mtx[1]*x + mtx[3]*y + mtx[5]));
+               x=v->cx; y=v->cy;
+               v->cx = (stbtt_vertex_type)(m * (mtx[0]*x + mtx[2]*y + mtx[4]));
+               v->cy = (stbtt_vertex_type)(n * (mtx[1]*x + mtx[3]*y + mtx[5]));
+            }
+            // Append vertices: allocate a larger array, copy both parts, free the old ones.
+            tmp = (stbtt_vertex*)STBTT_malloc((num_vertices+comp_num_verts)*sizeof(stbtt_vertex), info->userdata);
+            if (!tmp) {
+               if (vertices) STBTT_free(vertices, info->userdata);
+               if (comp_verts) STBTT_free(comp_verts, info->userdata);
+               return 0;
+            }
+            if (num_vertices > 0) STBTT_memcpy(tmp, vertices, num_vertices*sizeof(stbtt_vertex));
+            STBTT_memcpy(tmp+num_vertices, comp_verts, comp_num_verts*sizeof(stbtt_vertex));
+            if (vertices) STBTT_free(vertices, info->userdata);
+            vertices = tmp;
+            STBTT_free(comp_verts, info->userdata);
+            num_vertices += comp_num_verts;
+         }
+         // More components ? (bit 5 of the component flags)
+         more = flags & (1<<5);
+      }
+   } else if (numberOfContours < 0) {
+      // @TODO other compound variations?
+      STBTT_assert(0);
+   } else {
+      // numberOfContours == 0, do nothing
+   }
+
+   *pvertices = vertices;
+   return num_vertices;
+}
+
+// Reads the horizontal metrics of `glyph_index` from the hmtx table into
+// whichever of advanceWidth / leftSideBearing are non-NULL.
+// Glyphs below the long-metric count (hhea + 34) have a full 4-byte record;
+// later glyphs reuse the advance of the last full record and take their
+// bearing from the 2-byte array that follows the full records.
+STBTT_DEF void stbtt_GetGlyphHMetrics(const stbtt_fontinfo *info, int glyph_index, int *advanceWidth, int *leftSideBearing)
+{
+   stbtt_uint16 long_count = ttUSHORT(info->data+info->hhea + 34);
+
+   if (glyph_index < long_count) {
+      if (advanceWidth)
+         *advanceWidth = ttSHORT(info->data + info->hmtx + 4*glyph_index);
+      if (leftSideBearing)
+         *leftSideBearing = ttSHORT(info->data + info->hmtx + 4*glyph_index + 2);
+      return;
+   }
+
+   if (advanceWidth)
+      *advanceWidth = ttSHORT(info->data + info->hmtx + 4*(long_count-1));
+   if (leftSideBearing)
+      *leftSideBearing = ttSHORT(info->data + info->hmtx + 4*long_count + 2*(glyph_index - long_count));
+}
+
+// Looks up the kerning adjustment for the ordered pair (glyph1, glyph2) by
+// binary-searching the 6-byte pair records of the first kern subtable.
+// Returns 0 when there is no kern table, no subtable, the subtable's coverage
+// field is not exactly 1 (horizontal, format 0), or the pair is not listed.
+STBTT_DEF int  stbtt_GetGlyphKernAdvance(const stbtt_fontinfo *info, int glyph1, int glyph2)
+{
+   stbtt_uint8 *kern = info->data + info->kern;
+   stbtt_uint32 wanted, found;
+   int lo, hi, mid;
+
+   // we only look at the first table. it must be 'horizontal' and format 0.
+   if (!info->kern || ttUSHORT(kern+2) < 1 || ttUSHORT(kern+8) != 1)
+      return 0;
+
+   // records are ordered by the combined 32-bit key (left glyph in the high half)
+   wanted = glyph1 << 16 | glyph2;
+   for (lo = 0, hi = ttUSHORT(kern+10) - 1; lo <= hi; ) {
+      mid = (lo + hi) >> 1;
+      found = ttULONG(kern+18+(mid*6)); // note: unaligned read
+      if (wanted < found)
+         hi = mid - 1;
+      else if (wanted > found)
+         lo = mid + 1;
+      else
+         return ttSHORT(kern+22+(mid*6));
+   }
+   return 0;
+}
+
+// Codepoint flavour of stbtt_GetGlyphKernAdvance: maps both codepoints to
+// glyph indices first. Returns 0 straight away when the font has no kern
+// table, so the two codepoint lookups are skipped.
+STBTT_DEF int  stbtt_GetCodepointKernAdvance(const stbtt_fontinfo *info, int ch1, int ch2)
+{
+   if (info->kern) {
+      int left  = stbtt_FindGlyphIndex(info,ch1);
+      int right = stbtt_FindGlyphIndex(info,ch2);
+      return stbtt_GetGlyphKernAdvance(info, left, right);
+   }
+   return 0;
+}
+
+// Convenience wrapper: resolves `codepoint` to a glyph index and forwards to
+// stbtt_GetGlyphHMetrics.
+STBTT_DEF void stbtt_GetCodepointHMetrics(const stbtt_fontinfo *info, int codepoint, int *advanceWidth, int *leftSideBearing)
+{
+   int glyph = stbtt_FindGlyphIndex(info,codepoint);
+   stbtt_GetGlyphHMetrics(info, glyph, advanceWidth, leftSideBearing);
+}
+
+// Reads the three signed 16-bit vertical metrics stored at bytes 4, 6 and 8
+// of the hhea table into whichever of ascent / descent / lineGap are non-NULL.
+STBTT_DEF void stbtt_GetFontVMetrics(const stbtt_fontinfo *info, int *ascent, int *descent, int *lineGap)
+{
+   stbtt_uint8 *hhea = info->data+info->hhea;
+
+   if (ascent)
+      *ascent = ttSHORT(hhea + 4);
+   if (descent)
+      *descent = ttSHORT(hhea + 6);
+   if (lineGap)
+      *lineGap = ttSHORT(hhea + 8);
+}
+
+// Reads the font-wide bounding box, four signed 16-bit values stored at bytes
+// 36..43 of the head table, into whichever of x0,y0,x1,y1 are non-NULL.
+// NULL outputs are skipped, matching stbtt_GetGlyphBox and
+// stbtt_GetFontVMetrics, so a caller interested in one extent need not supply
+// dummies for the rest.
+STBTT_DEF void stbtt_GetFontBoundingBox(const stbtt_fontinfo *info, int *x0, int *y0, int *x1, int *y1)
+{
+   if (x0) *x0 = ttSHORT(info->data + info->head + 36);
+   if (y0) *y0 = ttSHORT(info->data + info->head + 38);
+   if (x1) *x1 = ttSHORT(info->data + info->head + 40);
+   if (y1) *y1 = ttSHORT(info->data + info->head + 42);
+}
+
+// Returns the factor that scales font units so that the distance between the
+// two hhea values at bytes 4 and 6 (the same fields stbtt_GetFontVMetrics
+// reports as ascent and descent) becomes `height`.
+STBTT_DEF float stbtt_ScaleForPixelHeight(const stbtt_fontinfo *info, float height)
+{
+   stbtt_uint8 *hhea = info->data + info->hhea;
+   int ascent  = ttSHORT(hhea + 4);
+   int descent = ttSHORT(hhea + 6);
+   int fheight = ascent - descent;
+   return (float) height / fheight;
+}
+
+// Returns the factor that scales font units so that the unsigned 16-bit
+// units-per-em value stored at byte 18 of the head table becomes `pixels`.
+STBTT_DEF float stbtt_ScaleForMappingEmToPixels(const stbtt_fontinfo *info, float pixels)
+{
+   stbtt_uint8 *head = info->data + info->head;
+   int unitsPerEm = ttUSHORT(head + 18);
+   return pixels / unitsPerEm;
+}
+
+// Releases a vertex array through STBTT_free, passing the font's userdata
+// along, which is also how stbtt_GetGlyphShape allocates its arrays.
+STBTT_DEF void stbtt_FreeShape(const stbtt_fontinfo *info, stbtt_vertex *v)
+{
+   stbtt_vertex *shape = v;
+   STBTT_free(shape, info->userdata);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// antialiasing software rasterizer
+//
+
+// Computes the integer pixel box that the scaled, shifted glyph touches and
+// stores it in whichever of ix0,iy0,ix1,iy1 are non-NULL. The y axis is
+// flipped (font y is negated), so the glyph's top (y1) produces iy0.
+// A glyph without a box (e.g. the space character) yields an all-zero box.
+STBTT_DEF void stbtt_GetGlyphBitmapBoxSubpixel(const stbtt_fontinfo *font, int glyph, float scale_x, float scale_y,float shift_x, float shift_y, int *ix0, int *iy0, int *ix1, int *iy1)
+{
+   int gx0=0,gy0=0,gx1,gy1; // =0 suppresses compiler warning
+
+   if (stbtt_GetGlyphBox(font, glyph, &gx0,&gy0,&gx1,&gy1)) {
+      // move to integral bboxes (treating pixels as little squares, what pixels get touched)?
+      if (ix0) *ix0 = STBTT_ifloor( gx0 * scale_x + shift_x);
+      if (iy0) *iy0 = STBTT_ifloor(-gy1 * scale_y + shift_y);
+      if (ix1) *ix1 = STBTT_iceil ( gx1 * scale_x + shift_x);
+      if (iy1) *iy1 = STBTT_iceil (-gy0 * scale_y + shift_y);
+      return;
+   }
+
+   // no box available: report an empty box
+   if (ix0) *ix0 = 0;
+   if (iy0) *iy0 = 0;
+   if (ix1) *ix1 = 0;
+   if (iy1) *iy1 = 0;
+}
+
+// Same as stbtt_GetGlyphBitmapBoxSubpixel with both sub-pixel shifts set to 0.
+STBTT_DEF void stbtt_GetGlyphBitmapBox(const stbtt_fontinfo *font, int glyph, float scale_x, float scale_y, int *ix0, int *iy0, int *ix1, int *iy1)
+{
+   const float no_shift = 0.0f;
+   stbtt_GetGlyphBitmapBoxSubpixel(font, glyph, scale_x, scale_y, no_shift, no_shift, ix0, iy0, ix1, iy1);
+}
+
+// Codepoint flavour of stbtt_GetGlyphBitmapBoxSubpixel: resolves `codepoint`
+// to a glyph index and forwards every other argument unchanged.
+STBTT_DEF void stbtt_GetCodepointBitmapBoxSubpixel(const stbtt_fontinfo *font, int codepoint, float scale_x, float scale_y, float shift_x, float shift_y, int *ix0, int *iy0, int *ix1, int *iy1)
+{
+   int glyph = stbtt_FindGlyphIndex(font,codepoint);
+   stbtt_GetGlyphBitmapBoxSubpixel(font, glyph, scale_x, scale_y,shift_x,shift_y, ix0,iy0,ix1,iy1);
+}
+
+// Same as stbtt_GetCodepointBitmapBoxSubpixel with both sub-pixel shifts set to 0.
+STBTT_DEF void stbtt_GetCodepointBitmapBox(const stbtt_fontinfo *font, int codepoint, float scale_x, float scale_y, int *ix0, int *iy0, int *ix1, int *iy1)
+{
+   const float no_shift = 0.0f;
+   stbtt_GetCodepointBitmapBoxSubpixel(font, codepoint, scale_x, scale_y, no_shift, no_shift, ix0,iy0,ix1,iy1);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+//  Rasterizer
+
+typedef struct stbtt__hheap_chunk // header placed at the start of each block that stbtt__hheap_alloc obtains from STBTT_malloc
+{
+   struct stbtt__hheap_chunk *next; // the chunk that was the head before this one; stbtt__hheap_cleanup follows this chain
+} stbtt__hheap_chunk;
+
+typedef struct stbtt__hheap // allocator state used by stbtt__hheap_alloc / _free / _cleanup
+{
+   struct stbtt__hheap_chunk *head; // most recently allocated chunk
+   void   *first_free; // head of the list of items returned via stbtt__hheap_free; each freed item stores the next pointer in its first bytes
+   int    num_remaining_in_head_chunk; // items of the head chunk not yet handed out
+} stbtt__hheap;
+
+// Returns storage for one item of `size` bytes, or NULL when a new chunk is
+// needed and STBTT_malloc fails.
+// Items released with stbtt__hheap_free are reused first; otherwise the next
+// unused item of the head chunk is handed out, allocating a fresh chunk (sized
+// for 2000, 800 or 100 items depending on `size`) when the head chunk is used
+// up. Chunk sizing and the free list assume every call on a given heap passes
+// the same `size`.
+static void *stbtt__hheap_alloc(stbtt__hheap *hh, size_t size, void *userdata)
+{
+   if (hh->first_free) {
+      // pop the most recently freed item
+      void *p = hh->first_free;
+      hh->first_free = * (void **) p;
+      return p;
+   } else {
+      if (hh->num_remaining_in_head_chunk == 0) {
+         int count = (size < 32 ? 2000 : size < 128 ? 800 : 100);
+         stbtt__hheap_chunk *c = (stbtt__hheap_chunk *) STBTT_malloc(sizeof(stbtt__hheap_chunk) + size * count, userdata);
+         if (c == NULL)
+            return NULL;
+         c->next = hh->head;
+         hh->head = c;
+         hh->num_remaining_in_head_chunk = count;
+      }
+      --hh->num_remaining_in_head_chunk;
+      // items live after the chunk header. Without the sizeof() term item 0
+      // would overlap the header's `next` link, and writing to that item would
+      // corrupt the chain that stbtt__hheap_cleanup walks.
+      return (char *) (hh->head) + sizeof(stbtt__hheap_chunk) + size * hh->num_remaining_in_head_chunk;
+   }
+}
+
+// Returns item `p` to the heap by pushing it onto the free list: the item's
+// first pointer-sized bytes are overwritten with the previous list head.
+static void stbtt__hheap_free(stbtt__hheap *hh, void *p)
+{
+   void **link = (void **) p;
+   *link = hh->first_free;
+   hh->first_free = p;
+}
+
+// Releases every chunk owned by the heap via STBTT_free. The heap's own
+// fields are left as they were.
+static void stbtt__hheap_cleanup(stbtt__hheap *hh, void *userdata)
+{
+   stbtt__hheap_chunk *chunk, *following;
+
+   for (chunk = hh->head; chunk; chunk = following) {
+      following = chunk->next; // fetch the link before the chunk is released
+      STBTT_free(chunk, userdata);
+   }
+}
+
+typedef struct stbtt__edge { // one straight edge handed to the rasterizer
+   float x0,y0, x1,y1; // end points; stbtt__new_active divides by (y1 - y0)
+   int invert; // nonzero selects winding direction +1 in stbtt__new_active, zero selects -1
+} stbtt__edge;
+
+
+typedef struct stbtt__active_edge // per-edge scan state built by stbtt__new_active; layout depends on STBTT_RASTERIZER_VERSION
+{
+   struct stbtt__active_edge *next; // next edge in the active list
+   #if STBTT_RASTERIZER_VERSION==1
+   int x,dx; // current x and x step, both in fixed point scaled by STBTT_FIX
+   float ey; // y at which the edge ends (copied from the edge's y1)
+   int direction; // winding contribution: +1 or -1
+   #elif STBTT_RASTERIZER_VERSION==2
+   float fx,fdx,fdy; // current x, dx/dy slope, and its reciprocal (0 when the slope is 0)
+   float direction; // winding contribution: +1.0f or -1.0f
+   float sy; // y at which the edge starts (copied from the edge's y0)
+   float ey; // y at which the edge ends (copied from the edge's y1)
+   #else
+   #error "Unrecognized value of STBTT_RASTERIZER_VERSION"
+   #endif
+} stbtt__active_edge;
+
+#if STBTT_RASTERIZER_VERSION == 1
+// fixed-point format used by the version 1 rasterizer: 10 fractional bits
+#define STBTT_FIXSHIFT   10
+#define STBTT_FIX        (1 << STBTT_FIXSHIFT)
+#define STBTT_FIXMASK    (STBTT_FIX-1)
+
+// Allocates an active edge for `e` from `hh` and initialises it so that its x
+// position corresponds to y == start_point, expressed relative to off_x and
+// in fixed point. Returns NULL (after asserting) when the allocation fails.
+static stbtt__active_edge *stbtt__new_active(stbtt__hheap *hh, stbtt__edge *e, int off_x, float start_point, void *userdata)
+{
+   stbtt__active_edge *z = (stbtt__active_edge *) stbtt__hheap_alloc(hh, sizeof(*z), userdata);
+   float slope = (e->x1 - e->x0) / (e->y1 - e->y0);
+   STBTT_assert(z != NULL);
+   if (!z) return z;
+
+   // round the step toward zero to avoid overshooting
+   if (slope < 0) {
+      z->dx = -STBTT_ifloor(STBTT_FIX * -slope);
+   } else {
+      z->dx = STBTT_ifloor(STBTT_FIX * slope);
+   }
+
+   // use z->dx (not the float slope) so later per-scanline steps offset by the same amount
+   z->x = STBTT_ifloor(STBTT_FIX * e->x0 + z->dx * (start_point - e->y0));
+   z->x -= off_x * STBTT_FIX;
+
+   z->direction = e->invert ? 1 : -1;
+   z->ey = e->y1;
+   z->next = 0;
+   return z;
+}
+#elif STBTT_RASTERIZER_VERSION == 2
+// Allocates an active edge for `e` from `hh` and initialises it so that its x
+// position corresponds to y == start_point, expressed relative to off_x.
+// Returns NULL (after asserting) when the allocation fails.
+static stbtt__active_edge *stbtt__new_active(stbtt__hheap *hh, stbtt__edge *e, int off_x, float start_point, void *userdata)
+{
+   stbtt__active_edge *z = (stbtt__active_edge *) stbtt__hheap_alloc(hh, sizeof(*z), userdata);
+   float slope = (e->x1 - e->x0) / (e->y1 - e->y0);
+   STBTT_assert(z != NULL);
+   //STBTT_assert(e->y0 <= start_point);
+   if (!z) return z;
+
+   z->next = 0;
+   z->sy = e->y0;
+   z->ey = e->y1;
+   z->direction = e->invert ? 1.0f : -1.0f;
+
+   z->fdx = slope;
+   // reciprocal slope, with 0 standing in for a vertical edge
+   if (slope != 0.0f)
+      z->fdy = (1.0f/slope);
+   else
+      z->fdy = 0.0f;
+
+   z->fx = e->x0 + slope * (start_point - e->y0);
+   z->fx -= off_x;
+   return z;
+}
+#else
+#error "Unrecognized value of STBTT_RASTERIZER_VERSION"
+#endif
+
+#if STBTT_RASTERIZER_VERSION == 1
+// adds coverage for one sub-scanline to scanline[0..len) from the active edge list `e`, using a non-zero winding rule; a fully covered pixel gains max_weight.
+// note: this routine clips fills that extend off the edges... ideally this wouldn't happen, but it could happen
+// if the truetype glyph bounding boxes are wrong, or if the user supplies a too-small bitmap
+static void stbtt__fill_active_edges(unsigned char *scanline, int len, stbtt__active_edge *e, int max_weight)
+{
+   // non-zero winding fill: w is the running winding count, x0 the fixed-point x where the current span began
+   int x0=0, w=0;
+
+   while (e) {
+      if (w == 0) {
+         // if we're currently at zero, we need to record the edge start point
+         x0 = e->x; w += e->direction;
+      } else {
+         int x1 = e->x; w += e->direction;
+         // if we went to zero, we need to draw
+         if (w == 0) {
+            int i = x0 >> STBTT_FIXSHIFT; // pixel column containing the span start
+            int j = x1 >> STBTT_FIXSHIFT; // pixel column containing the span end
+
+            if (i < len && j >= 0) { // skip spans lying entirely outside the scanline
+               if (i == j) {
+                  // x0,x1 are the same pixel, so compute combined coverage
+                  scanline[i] = scanline[i] + (stbtt_uint8) ((x1 - x0) * max_weight >> STBTT_FIXSHIFT);
+               } else {
+                  if (i >= 0) // add antialiasing for x0
+                     scanline[i] = scanline[i] + (stbtt_uint8) (((STBTT_FIX - (x0 & STBTT_FIXMASK)) * max_weight) >> STBTT_FIXSHIFT);
+                  else
+                     i = -1; // clip
+
+                  if (j < len) // add antialiasing for x1
+                     scanline[j] = scanline[j] + (stbtt_uint8) (((x1 & STBTT_FIXMASK) * max_weight) >> STBTT_FIXSHIFT);
+                  else
+                     j = len; // clip
+
+                  for (++i; i < j; ++i) // fill pixels between x0 and x1
+                     scanline[i] = scanline[i] + (stbtt_uint8) max_weight;
+               }
+            }
+         }
+      }
+
+      e = e->next;
+   }
+}
+
+// Supersampling rasterizer (STBTT_RASTERIZER_VERSION == 1).
+// Renders the edges in `e` into result->pixels, one output row at a time.
+//   result     - target bitmap (w, h, stride, pixels)
+//   e          - edges sorted by y0; e[n] is writable and is used as a sentinel
+//   n          - number of real edges
+//   vsubsample - number of sub-scanlines accumulated per output row
+//   off_x      - forwarded to stbtt__new_active for every inserted edge
+//   off_y      - y of the first output row (in pixel units)
+//   userdata   - passed through to the STBTT_malloc/STBTT_free hooks
+// If the temporary scanline buffer cannot be allocated the function returns
+// without touching result->pixels.
+static void stbtt__rasterize_sorted_edges(stbtt__bitmap *result, stbtt__edge *e, int n, int vsubsample, int off_x, int off_y, void *userdata)
+{
+   stbtt__hheap hh = { 0, 0, 0 };
+   stbtt__active_edge *active = NULL;
+   int y,j=0;
+   int max_weight = (255 / vsubsample);  // weight per vertical scanline
+   int s; // vertical subsample index
+   unsigned char scanline_data[512], *scanline;
+
+   // rows wider than the stack buffer need a heap buffer
+   if (result->w > 512) {
+      scanline = (unsigned char *) STBTT_malloc(result->w, userdata);
+      if (scanline == NULL)
+         return; // allocation failed; nothing else has been allocated yet
+   } else
+      scanline = scanline_data;
+
+   y = off_y * vsubsample;
+   // sentinel edge that starts below the last sub-scanline, so the insertion loop always stops
+   e[n].y0 = (off_y + result->h) * (float) vsubsample + 1;
+
+   while (j < result->h) {
+      STBTT_memset(scanline, 0, result->w);
+      for (s=0; s < vsubsample; ++s) {
+         // find center of pixel for this scanline
+         float scan_y = y + 0.5f;
+         stbtt__active_edge **step = &active;
+
+         // update all active edges;
+         // remove all active edges that terminate before the center of this scanline
+         while (*step) {
+            stbtt__active_edge * z = *step;
+            if (z->ey <= scan_y) {
+               *step = z->next; // delete from list
+               STBTT_assert(z->direction);
+               z->direction = 0;
+               stbtt__hheap_free(&hh, z);
+            } else {
+               z->x += z->dx; // advance to position for current scanline
+               step = &((*step)->next); // advance through list
+            }
+         }
+
+         // resort the list if needed (repeated adjacent-swap passes until no swap occurs)
+         for(;;) {
+            int changed=0;
+            step = &active;
+            while (*step && (*step)->next) {
+               if ((*step)->x > (*step)->next->x) {
+                  stbtt__active_edge *t = *step;
+                  stbtt__active_edge *q = t->next;
+
+                  t->next = q->next;
+                  q->next = t;
+                  *step = q;
+                  changed = 1;
+               }
+               step = &(*step)->next;
+            }
+            if (!changed) break;
+         }
+
+         // insert all edges that start before the center of this scanline -- omit ones that also end on this scanline
+         while (e->y0 <= scan_y) {
+            if (e->y1 > scan_y) {
+               stbtt__active_edge *z = stbtt__new_active(&hh, e, off_x, scan_y, userdata);
+               if (z != NULL) {
+                  // find insertion point
+                  if (active == NULL)
+                     active = z;
+                  else if (z->x < active->x) {
+                     // insert at front
+                     z->next = active;
+                     active = z;
+                  } else {
+                     // find thing to insert AFTER
+                     stbtt__active_edge *p = active;
+                     while (p->next && p->next->x < z->x)
+                        p = p->next;
+                     // at this point, p->next->x is NOT < z->x
+                     z->next = p->next;
+                     p->next = z;
+                  }
+               }
+            }
+            ++e;
+         }
+
+         // now process all active edges in XOR fashion
+         if (active)
+            stbtt__fill_active_edges(scanline, result->w, active, max_weight);
+
+         ++y;
+      }
+      // all sub-scanlines of this row are accumulated; store the finished row
+      STBTT_memcpy(result->pixels + j * result->stride, scanline, result->w);
+      ++j;
+   }
+
+   stbtt__hheap_cleanup(&hh, userdata);
+
+   if (scanline != scanline_data)
+      STBTT_free(scanline, userdata);
+}
+
+#elif STBTT_RASTERIZER_VERSION == 2
+
+// Adds to scanline[x] the signed coverage contributed by the segment
+// (x0,y0)-(x1,y1) of edge e, after trimming that segment to the edge's own
+// vertical extent [e->sy, e->ey].
+// The segment passed in here does not cross the vertical line at x or the
+// vertical line at x+1 (i.e. it has already been clipped to those).
+static void stbtt__handle_clipped_edge(float *scanline, int x, stbtt__active_edge *e, float x0, float y0, float x1, float y1)
+{
+   // a zero-height segment covers nothing
+   if (y0 == y1) return;
+   STBTT_assert(y0 < y1);
+   STBTT_assert(e->sy <= e->ey);
+
+   // segment lies entirely above or below the edge's extent
+   if (y0 > e->ey || y1 < e->sy) return;
+
+   // trim the top end to where the edge really starts
+   if (y0 < e->sy) {
+      x0 += (x1-x0) * (e->sy - y0) / (y1-y0);
+      y0 = e->sy;
+   }
+   // trim the bottom end to where the edge really stops
+   if (y1 > e->ey) {
+      x1 += (x1-x0) * (e->ey - y1) / (y1-y0);
+      y1 = e->ey;
+   }
+
+   // sanity checks: both end points must sit on the same side of, or inside, pixel column x
+   if (x0 == x)
+      STBTT_assert(x1 <= x+1);
+   else if (x0 == x+1)
+      STBTT_assert(x1 >= x);
+   else if (x0 <= x)
+      STBTT_assert(x1 <= x);
+   else if (x0 >= x+1)
+      STBTT_assert(x1 >= x+1);
+   else
+      STBTT_assert(x1 >= x && x1 <= x+1);
+
+   // segment is left of the pixel: the pixel is covered over the full height
+   if (x0 <= x && x1 <= x) {
+      scanline[x] += e->direction * (y1-y0);
+      return;
+   }
+
+   // segment is right of the pixel: no coverage
+   if (x0 >= x+1 && x1 >= x+1)
+      return;
+
+   // segment is inside the pixel
+   STBTT_assert(x0 >= x && x0 <= x+1 && x1 >= x && x1 <= x+1);
+   scanline[x] += e->direction * (y1-y0) * (1-((x0-x)+(x1-x))/2); // coverage = 1 - average x position
+}
+
+// Accumulates the signed coverage of every active edge for the pixel row whose
+// top is y_top (its bottom is y_top+1).
+//   scanline      - per-pixel partial coverage for this row
+//   scanline_fill - receives direction*height at the pixel where an edge leaves
+//                   the row ("everything right of this pixel is filled")
+//   len           - row width in pixels
+//   e             - head of the active edge list, walked via ->next
+//   y_top         - y coordinate of the top of the row
+// Edges that stay inside [0,len) take the fast path; edges that leave that
+// range fall back to the per-pixel brute-force path at the bottom.
+static void stbtt__fill_active_edges_new(float *scanline, float *scanline_fill, int len, stbtt__active_edge *e, float y_top)
+{
+   float y_bottom = y_top+1;
+
+   while (e) {
+      // brute force every pixel
+
+      // compute intersection points with top & bottom
+      STBTT_assert(e->ey >= y_top);
+
+      if (e->fdx == 0) {
+         // edge with no horizontal movement per scanline (vertical edge)
+         float x0 = e->fx;
+         if (x0 < len) {
+            if (x0 >= 0) {
+               stbtt__handle_clipped_edge(scanline,(int) x0,e, x0,y_top, x0,y_bottom);
+               // scanline_fill-1 with index (int)x0+1 addresses scanline_fill[(int) x0]
+               stbtt__handle_clipped_edge(scanline_fill-1,(int) x0+1,e, x0,y_top, x0,y_bottom);
+            } else {
+               // edge is left of the bitmap; scanline_fill-1 with index 0 addresses scanline_fill[-1]
+               stbtt__handle_clipped_edge(scanline_fill-1,0,e, x0,y_top, x0,y_bottom);
+            }
+         }
+      } else {
+         float x0 = e->fx;
+         float dx = e->fdx;
+         float xb = x0 + dx;
+         float x_top, x_bottom;
+         float sy0,sy1;
+         float dy = e->fdy;
+         STBTT_assert(e->sy <= y_bottom && e->ey >= y_top);
+
+         // compute endpoints of line segment clipped to this scanline (if the
+         // line segment starts or ends on this scanline). x0 is the intersection
+         // of the line with y_top, but that may be off the line segment.
+         if (e->sy > y_top) {
+            x_top = x0 + dx * (e->sy - y_top);
+            sy0 = e->sy;
+         } else {
+            x_top = x0;
+            sy0 = y_top;
+         }
+         if (e->ey < y_bottom) {
+            x_bottom = x0 + dx * (e->ey - y_top);
+            sy1 = e->ey;
+         } else {
+            x_bottom = xb;
+            sy1 = y_bottom;
+         }
+
+         if (x_top >= 0 && x_bottom >= 0 && x_top < len && x_bottom < len) {
+            // from here on, we don't have to range check x values
+
+            if ((int) x_top == (int) x_bottom) {
+               float height;
+               // simple case, only spans one pixel
+               int x = (int) x_top;
+               height = sy1 - sy0;
+               STBTT_assert(x >= 0 && x < len);
+               scanline[x] += e->direction * (1-((x_top - x) + (x_bottom-x))/2)  * height;
+               scanline_fill[x] += e->direction * height; // everything right of this pixel is filled
+            } else {
+               int x,x1,x2;
+               float y_crossing, step, sign, area;
+               // covers 2+ pixels
+               if (x_top > x_bottom) {
+                  // flip scanline vertically; signed area is the same
+                  float t;
+                  sy0 = y_bottom - (sy0 - y_top);
+                  sy1 = y_bottom - (sy1 - y_top);
+                  t = sy0, sy0 = sy1, sy1 = t;
+                  t = x_bottom, x_bottom = x_top, x_top = t;
+                  dx = -dx;
+                  dy = -dy;
+                  t = x0, x0 = xb, xb = t;
+               }
+
+               x1 = (int) x_top;
+               x2 = (int) x_bottom;
+               // compute the y at which the edge crosses the vertical line x = x1+1
+               y_crossing = (x1+1 - x0) * dy + y_top;
+
+               sign = e->direction;
+               // area of the rectangle covered from y0..y_crossing
+               area = sign * (y_crossing-sy0);
+               // area of the triangle (x_top,y0), (x+1,y0), (x+1,y_crossing)
+               scanline[x1] += area * (1-((x_top - x1)+(x1+1-x1))/2);
+
+               // each further pixel column adds `step` to the running area
+               step = sign * dy;
+               for (x = x1+1; x < x2; ++x) {
+                  scanline[x] += area + step/2;
+                  area += step;
+               }
+               y_crossing += dy * (x2 - (x1+1));
+
+               STBTT_assert(STBTT_fabs(area) <= 1.01f);
+
+               scanline[x2] += area + sign * (1-((x2-x2)+(x_bottom-x2))/2) * (sy1-y_crossing);
+
+               scanline_fill[x2] += sign * (sy1-sy0);
+            }
+         } else {
+            // if edge goes outside of box we're drawing, we require
+            // clipping logic. since this does not match the intended use
+            // of this library, we use a different, very slow brute
+            // force implementation
+            int x;
+            for (x=0; x < len; ++x) {
+               // cases:
+               //
+               // there can be up to two intersections with the pixel. any intersection
+               // with left or right edges can be handled by splitting into two (or three)
+               // regions. intersections with top & bottom do not necessitate case-wise logic.
+               //
+               // the old way of doing this found the intersections with the left & right edges,
+               // then used some simple logic to produce up to three segments in sorted order
+               // from top-to-bottom. however, this had a problem: if an x edge was epsilon
+               // across the x border, then the corresponding y position might not be distinct
+               // from the other y segment, and it might be ignored as an empty segment. to avoid
+               // that, we need to explicitly produce segments based on x positions.
+
+               // rename variables to clear pairs
+               float y0 = y_top;
+               float x1 = (float) (x);
+               float x2 = (float) (x+1);
+               float x3 = xb;
+               float y3 = y_bottom;
+               float y1,y2;
+
+               // x = e->x + e->dx * (y-y_top)
+               // (y-y_top) = (x - e->x) / e->dx
+               // y = (x - e->x) / e->dx + y_top
+               y1 = (x - x0) / dx + y_top;
+               y2 = (x+1 - x0) / dx + y_top;
+
+               if (x0 < x1 && x3 > x2) {         // three segments descending down-right
+                  stbtt__handle_clipped_edge(scanline,x,e, x0,y0, x1,y1);
+                  stbtt__handle_clipped_edge(scanline,x,e, x1,y1, x2,y2);
+                  stbtt__handle_clipped_edge(scanline,x,e, x2,y2, x3,y3);
+               } else if (x3 < x1 && x0 > x2) {  // three segments descending down-left
+                  stbtt__handle_clipped_edge(scanline,x,e, x0,y0, x2,y2);
+                  stbtt__handle_clipped_edge(scanline,x,e, x2,y2, x1,y1);
+                  stbtt__handle_clipped_edge(scanline,x,e, x1,y1, x3,y3);
+               } else if (x0 < x1 && x3 > x1) {  // two segments across x, down-right
+                  stbtt__handle_clipped_edge(scanline,x,e, x0,y0, x1,y1);
+                  stbtt__handle_clipped_edge(scanline,x,e, x1,y1, x3,y3);
+               } else if (x3 < x1 && x0 > x1) {  // two segments across x, down-left
+                  stbtt__handle_clipped_edge(scanline,x,e, x0,y0, x1,y1);
+                  stbtt__handle_clipped_edge(scanline,x,e, x1,y1, x3,y3);
+               } else if (x0 < x2 && x3 > x2) {  // two segments across x+1, down-right
+                  stbtt__handle_clipped_edge(scanline,x,e, x0,y0, x2,y2);
+                  stbtt__handle_clipped_edge(scanline,x,e, x2,y2, x3,y3);
+               } else if (x3 < x2 && x0 > x2) {  // two segments across x+1, down-left
+                  stbtt__handle_clipped_edge(scanline,x,e, x0,y0, x2,y2);
+                  stbtt__handle_clipped_edge(scanline,x,e, x2,y2, x3,y3);
+               } else {  // one segment
+                  stbtt__handle_clipped_edge(scanline,x,e, x0,y0, x3,y3);
+               }
+            }
+         }
+      }
+      e = e->next;
+   }
+}
+
+// directly AA rasterize edges w/o supersampling (STBTT_RASTERIZER_VERSION == 2)
+//   result     - target bitmap (w, h, stride, pixels)
+//   e          - edges sorted by y0; e[n] is writable and is used as a sentinel
+//   n          - number of real edges
+//   vsubsample - unused by this rasterizer
+//   off_x      - forwarded to stbtt__new_active for every inserted edge
+//   off_y      - y of the first output row
+//   userdata   - passed through to the STBTT_malloc/STBTT_free hooks
+// Two float rows are kept per output row: `scanline` (w entries of partial
+// coverage) and `scanline2` (w+1 entries of fill deltas that are prefix-summed
+// when the row is converted to bytes).
+// If the temporary row buffer cannot be allocated the function returns without
+// touching result->pixels.
+static void stbtt__rasterize_sorted_edges(stbtt__bitmap *result, stbtt__edge *e, int n, int vsubsample, int off_x, int off_y, void *userdata)
+{
+   stbtt__hheap hh = { 0, 0, 0 };
+   stbtt__active_edge *active = NULL;
+   int y,j=0, i;
+   float scanline_data[129], *scanline, *scanline2;
+
+   STBTT__NOTUSED(vsubsample);
+
+   // the stack buffer holds 2*64+1 floats; wider rows need a heap buffer
+   if (result->w > 64) {
+      scanline = (float *) STBTT_malloc((result->w*2+1) * sizeof(float), userdata);
+      if (scanline == NULL)
+         return; // allocation failed; nothing else has been allocated yet
+   } else
+      scanline = scanline_data;
+
+   scanline2 = scanline + result->w;
+
+   y = off_y;
+   // sentinel edge that starts below the last row, so the insertion loop always stops
+   e[n].y0 = (float) (off_y + result->h) + 1;
+
+   while (j < result->h) {
+      // top and bottom of the pixel row being rendered
+      float scan_y_top    = y + 0.0f;
+      float scan_y_bottom = y + 1.0f;
+      stbtt__active_edge **step = &active;
+
+      STBTT_memset(scanline , 0, result->w*sizeof(scanline[0]));
+      STBTT_memset(scanline2, 0, (result->w+1)*sizeof(scanline[0]));
+
+      // update all active edges;
+      // remove all active edges that terminate before the top of this scanline
+      while (*step) {
+         stbtt__active_edge * z = *step;
+         if (z->ey <= scan_y_top) {
+            *step = z->next; // delete from list
+            STBTT_assert(z->direction);
+            z->direction = 0;
+            stbtt__hheap_free(&hh, z);
+         } else {
+            step = &((*step)->next); // advance through list
+         }
+      }
+
+      // insert all edges that start before the bottom of this scanline
+      while (e->y0 <= scan_y_bottom) {
+         if (e->y0 != e->y1) {
+            stbtt__active_edge *z = stbtt__new_active(&hh, e, off_x, scan_y_top, userdata);
+            if (z != NULL) {
+               STBTT_assert(z->ey >= scan_y_top);
+               // insert at front
+               z->next = active;
+               active = z;
+            }
+         }
+         ++e;
+      }
+
+      // now process all active edges
+      if (active)
+         stbtt__fill_active_edges_new(scanline, scanline2+1, result->w, active, scan_y_top);
+
+      {
+         // convert the accumulated floats to bytes: running sum of the fill
+         // deltas plus the per-pixel partial coverage, scaled to 0..255
+         float sum = 0;
+         for (i=0; i < result->w; ++i) {
+            float k;
+            int m;
+            sum += scanline2[i];
+            k = scanline[i] + sum;
+            k = (float) STBTT_fabs(k)*255 + 0.5f;
+            m = (int) k;
+            if (m > 255) m = 255;
+            result->pixels[j*result->stride + i] = (unsigned char) m;
+         }
+      }
+      // advance all the edges
+      step = &active;
+      while (*step) {
+         stbtt__active_edge *z = *step;
+         z->fx += z->fdx; // advance to position for current scanline
+         step = &((*step)->next); // advance through list
+      }
+
+      ++y;
+      ++j;
+   }
+
+   stbtt__hheap_cleanup(&hh, userdata);
+
+   if (scanline != scanline_data)
+      STBTT_free(scanline, userdata);
+}
+#else
+#error "Unrecognized value of STBTT_RASTERIZER_VERSION"
+#endif
+
+// Ordering predicate for the edge sorts below: true when edge *a starts
+// strictly above edge *b (smaller y0). Both arguments are pointers.
+#define STBTT__COMPARE(a,b)  ((a)->y0 < (b)->y0)
+
+// Insertion sort of the n edges in p into the order defined by STBTT__COMPARE
+// (ascending y0). Elements that compare equal keep their relative order.
+static void stbtt__sort_edges_ins_sort(stbtt__edge *p, int n)
+{
+   int cur;
+   for (cur = 1; cur < n; ++cur) {
+      stbtt__edge key = p[cur];
+      int slot = cur;
+
+      // shift every larger predecessor one place to the right
+      for (; slot > 0 && STBTT__COMPARE(&key, &p[slot-1]); --slot)
+         p[slot] = p[slot-1];
+
+      // only write back if the element actually moved
+      if (slot != cur)
+         p[slot] = key;
+   }
+}
+
+// Partially sorts the n edges in p by STBTT__COMPARE using median-of-three
+// quicksort. Partitions of 12 or fewer elements are left unsorted; the caller
+// (stbtt__sort_edges) finishes them with an insertion sort pass.
+static void stbtt__sort_edges_quicksort(stbtt__edge *p, int n)
+{
+   /* threshold for transitioning to insertion sort */
+   while (n > 12) {
+      stbtt__edge t;
+      int c01,c12,c,m,i,j;
+
+      /* compute median of three */
+      m = n >> 1;
+      c01 = STBTT__COMPARE(&p[0],&p[m]);
+      c12 = STBTT__COMPARE(&p[m],&p[n-1]);
+      /* if 0 >= mid >= end, or 0 < mid < end, then use mid */
+      if (c01 != c12) {
+         /* otherwise, we'll need to swap something else to middle */
+         int z;
+         c = STBTT__COMPARE(&p[0],&p[n-1]);
+         /* 0>mid && mid<n:  0>n => n; 0<n => 0 */
+         /* 0<mid && mid>n:  0>n => 0; 0<n => n */
+         z = (c == c12) ? 0 : n-1;
+         t = p[z];
+         p[z] = p[m];
+         p[m] = t;
+      }
+      /* now p[m] is the median-of-three */
+      /* swap it to the beginning so it won't move around */
+      t = p[0];
+      p[0] = p[m];
+      p[m] = t;
+
+      /* partition loop */
+      i=1;
+      j=n-1;
+      for(;;) {
+         /* handling of equality is crucial here */
+         /* for sentinels & efficiency with duplicates */
+         for (;;++i) {
+            if (!STBTT__COMPARE(&p[i], &p[0])) break;
+         }
+         for (;;--j) {
+            if (!STBTT__COMPARE(&p[0], &p[j])) break;
+         }
+         /* make sure we haven't crossed */
+         if (i >= j) break;
+         t = p[i];
+         p[i] = p[j];
+         p[j] = t;
+
+         ++i;
+         --j;
+      }
+      /* recurse on smaller side, iterate on larger */
+      if (j < (n-i)) {
+         stbtt__sort_edges_quicksort(p,j);
+         p = p+i;
+         n = n-i;
+      } else {
+         stbtt__sort_edges_quicksort(p+i, n-i);
+         n = j;
+      }
+   }
+}
+
+// Sorts the n edges in p by STBTT__COMPARE (ascending y0): a quicksort pass
+// that leaves small partitions unsorted, followed by an insertion sort over
+// the whole array to finish the job.
+static void stbtt__sort_edges(stbtt__edge *p, int n)
+{
+   stbtt__sort_edges_quicksort(p, n);
+   stbtt__sort_edges_ins_sort(p, n);
+}
+
+// A 2D point of a flattened contour, as produced by stbtt_FlattenCurves and
+// consumed by stbtt__rasterize.
+typedef struct
+{
+   float x,y;
+} stbtt__point;
+
+// Turns flattened contours into a sorted edge list and rasterizes it.
+//   result            - target bitmap
+//   pts               - all contour points, contours stored back to back
+//   wcount            - number of points in each contour
+//   windings          - number of contours
+//   scale_x/scale_y   - scale applied to point coordinates
+//   shift_x/shift_y   - offset added after scaling
+//   off_x/off_y       - forwarded to stbtt__rasterize_sorted_edges
+//   invert            - non-zero negates the y scale
+//   userdata          - passed through to the STBTT_malloc/STBTT_free hooks
+// Returns silently without drawing if the edge array cannot be allocated.
+static void stbtt__rasterize(stbtt__bitmap *result, stbtt__point *pts, int *wcount, int windings, float scale_x, float scale_y, float shift_x, float shift_y, int off_x, int off_y, int invert, void *userdata)
+{
+   float y_scale_inv = invert ? -scale_y : scale_y;
+   stbtt__edge *e;
+   int n,i,j,k,m;
+#if STBTT_RASTERIZER_VERSION == 1
+   int vsubsample = result->h < 8 ? 15 : 5;
+#elif STBTT_RASTERIZER_VERSION == 2
+   int vsubsample = 1;
+#else
+   #error "Unrecognized value of STBTT_RASTERIZER_VERSION"
+#endif
+   // vsubsample should divide 255 evenly; otherwise we won't reach full opacity
+
+   // every contour point starts at most one edge, so the point total bounds the edge count
+   n = 0;
+   for (i=0; i < windings; ++i)
+      n += wcount[i];
+
+   // one extra slot is reserved for the sentinel written by the rasterizer
+   e = (stbtt__edge *) STBTT_malloc(sizeof(*e) * (n+1), userdata);
+   if (e == 0) return;
+
+   n = 0;
+   m = 0;
+   for (i=0; i < windings; ++i) {
+      stbtt__point *p = pts + m;
+      int count = wcount[i];
+      m += count;
+
+      // walk consecutive point pairs (j,k), starting with (last,first) to close the contour
+      for (j = count-1, k = 0; k < count; j = k++) {
+         stbtt__edge *edge;
+         int flip, a, b;
+
+         // horizontal edges never cross a scanline, so drop them
+         if (p[j].y == p[k].y)
+            continue;
+
+         // choose which end point becomes (x0,y0) and record that choice in `invert`
+         flip = invert ? (p[j].y > p[k].y) : (p[j].y < p[k].y);
+         a = flip ? j : k;
+         b = flip ? k : j;
+
+         edge = &e[n++];
+         edge->invert = flip;
+         edge->x0 = p[a].x * scale_x + shift_x;
+         edge->y0 = (p[a].y * y_scale_inv + shift_y) * vsubsample;
+         edge->x1 = p[b].x * scale_x + shift_x;
+         edge->y1 = (p[b].y * y_scale_inv + shift_y) * vsubsample;
+      }
+   }
+
+   // order the edges by their starting y so the rasterizer can consume them top to bottom
+   stbtt__sort_edges(e, n);
+
+   // sweep the scanlines and fill between the edges
+   stbtt__rasterize_sorted_edges(result, e, n, vsubsample, off_x, off_y, userdata);
+
+   STBTT_free(e, userdata);
+}
+
+// Stores (x,y) at points[n]. Does nothing when `points` is null, which lets
+// the counting pass of stbtt_FlattenCurves run before the array is allocated.
+static void stbtt__add_point(stbtt__point *points, int n, float x, float y)
+{
+   if (points) {
+      stbtt__point *dst = points + n;
+      dst->x = x;
+      dst->y = y;
+   }
+}
+
+// Recursively subdivides the quadratic curve (x0,y0)-(x1,y1)-(x2,y2) until
+// each piece is flat enough, appending the end point of every final piece via
+// stbtt__add_point and incrementing *num_points for each one.
+//   objspace_flatness_squared - squared distance allowed between the curve's
+//                               midpoint and the midpoint of its chord
+//   n                         - current recursion depth
+// Always returns 1. @TODO warped to compensate for non-linear stretching
+static int stbtt__tesselate_curve(stbtt__point *points, int *num_points, float x0, float y0, float x1, float y1, float x2, float y2, float objspace_flatness_squared, int n)
+{
+   // point on the curve at its parametric midpoint
+   float mx = (x0 + 2*x1 + x2)/4;
+   float my = (y0 + 2*y1 + y2)/4;
+   // offset of that point from the midpoint of the straight chord
+   float dx = (x0+x2)/2 - mx;
+   float dy = (y0+y2)/2 - my;
+   int needs_split;
+
+   if (n > 16) // 65536 segments on one curve better be enough!
+      return 1;
+
+   // half-pixel error allowed... need to be smaller if AA
+   needs_split = dx*dx+dy*dy > objspace_flatness_squared;
+   if (!needs_split) {
+      // flat enough: emit the end point of this piece
+      stbtt__add_point(points, *num_points,x2,y2);
+      *num_points = *num_points+1;
+      return 1;
+   }
+
+   // split at the midpoint and handle both halves
+   stbtt__tesselate_curve(points, num_points, x0,y0, (x0+x1)/2.0f,(y0+y1)/2.0f, mx,my, objspace_flatness_squared,n+1);
+   stbtt__tesselate_curve(points, num_points, mx,my, (x1+x2)/2.0f,(y1+y2)/2.0f, x2,y2, objspace_flatness_squared,n+1);
+   return 1;
+}
+
+// Converts a glyph's vertex list into straight-line contours.
+//   vertices/num_verts - input shape (move / line / curve vertices)
+//   objspace_flatness  - allowed curve error; squared and passed to stbtt__tesselate_curve
+//   contour_lengths    - out: STBTT_malloc'ed array with the point count of each
+//                        contour, or 0 when nothing is returned
+//   num_contours       - out: number of contours (0 when nothing is returned)
+//   userdata           - passed through to the STBTT_malloc/STBTT_free hooks
+// Returns a STBTT_malloc'ed array holding the points of all contours back to
+// back, or 0/NULL if the shape has no contours or an allocation fails. On
+// success the caller releases both arrays with STBTT_free.
+static stbtt__point *stbtt_FlattenCurves(stbtt_vertex *vertices, int num_verts, float objspace_flatness, int **contour_lengths, int *num_contours, void *userdata)
+{
+   stbtt__point *points=0;
+   int num_points=0;
+
+   float objspace_flatness_squared = objspace_flatness * objspace_flatness;
+   int i,n=0,start=0, pass;
+
+   // count how many "moves" there are to get the contour count
+   for (i=0; i < num_verts; ++i)
+      if (vertices[i].type == STBTT_vmove)
+         ++n;
+
+   *num_contours = n;
+   if (n == 0) {
+      // leave the out-parameter in a defined state on every exit path
+      *contour_lengths = 0;
+      return 0;
+   }
+
+   *contour_lengths = (int *) STBTT_malloc(sizeof(**contour_lengths) * n, userdata);
+
+   if (*contour_lengths == 0) {
+      *num_contours = 0;
+      return 0;
+   }
+
+   // make two passes through the points so we don't need to realloc:
+   // pass 0 only counts (points is null), pass 1 allocates and stores
+   for (pass=0; pass < 2; ++pass) {
+      float x=0,y=0;
+      if (pass == 1) {
+         points = (stbtt__point *) STBTT_malloc(num_points * sizeof(points[0]), userdata);
+         if (points == NULL) goto error;
+      }
+      num_points = 0;
+      n= -1;
+      for (i=0; i < num_verts; ++i) {
+         switch (vertices[i].type) {
+            case STBTT_vmove:
+               // start the next contour
+               if (n >= 0)
+                  (*contour_lengths)[n] = num_points - start;
+               ++n;
+               start = num_points;
+
+               x = vertices[i].x, y = vertices[i].y;
+               stbtt__add_point(points, num_points++, x,y);
+               break;
+            case STBTT_vline:
+               x = vertices[i].x, y = vertices[i].y;
+               stbtt__add_point(points, num_points++, x, y);
+               break;
+            case STBTT_vcurve:
+               // (x,y) still holds the previous vertex, i.e. the curve's start point
+               stbtt__tesselate_curve(points, &num_points, x,y,
+                                        vertices[i].cx, vertices[i].cy,
+                                        vertices[i].x,  vertices[i].y,
+                                        objspace_flatness_squared, 0);
+               x = vertices[i].x, y = vertices[i].y;
+               break;
+         }
+      }
+      // close out the length of the final contour
+      (*contour_lengths)[n] = num_points - start;
+   }
+
+   return points;
+error:
+   STBTT_free(points, userdata);
+   STBTT_free(*contour_lengths, userdata);
+   *contour_lengths = 0;
+   *num_contours = 0;
+   return NULL;
+}
+
+// Flattens the curves of a vertex shape and rasterizes the result into
+// `result`. The flatness passed to the flattener is flatness_in_pixels divided
+// by the smaller of scale_x and scale_y. Nothing is drawn when flattening
+// yields no points.
+STBTT_DEF void stbtt_Rasterize(stbtt__bitmap *result, float flatness_in_pixels, stbtt_vertex *vertices, int num_verts, float scale_x, float scale_y, float shift_x, float shift_y, int x_off, int y_off, int invert, void *userdata)
+{
+   int winding_count;
+   int *winding_lengths;
+   stbtt__point *windings;
+   float scale = scale_x > scale_y ? scale_y : scale_x;
+
+   windings = stbtt_FlattenCurves(vertices, num_verts, flatness_in_pixels / scale, &winding_lengths, &winding_count, userdata);
+   if (!windings)
+      return;
+
+   stbtt__rasterize(result, windings, winding_lengths, winding_count, scale_x, scale_y, shift_x, shift_y, x_off, y_off, invert, userdata);
+
+   // both arrays were allocated by stbtt_FlattenCurves
+   STBTT_free(winding_lengths, userdata);
+   STBTT_free(windings, userdata);
+}
+
+// Releases a bitmap with STBTT_free, passing `userdata` to the allocator hook.
+STBTT_DEF void stbtt_FreeBitmap(unsigned char *bitmap, void *userdata)
+{
+   STBTT_free(bitmap, userdata);
+}
+
+// Allocates and renders a bitmap for `glyph`.
+//   scale_x/scale_y - a zero scale takes the value of the other one; if both
+//                     are zero the function returns NULL
+//   shift_x/shift_y - subpixel shift forwarded to the box and raster calls
+//   width/height    - optional out: bitmap size in pixels
+//   xoff/yoff       - optional out: top-left corner of the glyph's bitmap box
+// Returns the STBTT_malloc'ed pixels (stride == width), or NULL when the box
+// is empty or the allocation fails. The out-parameters are still written in
+// those two cases.
+STBTT_DEF unsigned char *stbtt_GetGlyphBitmapSubpixel(const stbtt_fontinfo *info, float scale_x, float scale_y, float shift_x, float shift_y, int glyph, int *width, int *height, int *xoff, int *yoff)
+{
+   int ix0,iy0,ix1,iy1;
+   stbtt__bitmap gbm;
+   stbtt_vertex *vertices;
+   int num_verts = stbtt_GetGlyphShape(info, glyph, &vertices);
+
+   // no usable scale at all
+   if (scale_x == 0 && scale_y == 0) {
+      STBTT_free(vertices, info->userdata);
+      return NULL;
+   }
+   // a single missing scale is copied from the other axis
+   if (scale_x == 0)
+      scale_x = scale_y;
+   else if (scale_y == 0)
+      scale_y = scale_x;
+
+   stbtt_GetGlyphBitmapBoxSubpixel(info, glyph, scale_x, scale_y, shift_x, shift_y, &ix0,&iy0,&ix1,&iy1);
+
+   // size of the bitmap is the size of the box
+   gbm.w = (ix1 - ix0);
+   gbm.h = (iy1 - iy0);
+   gbm.pixels = NULL; // stays NULL if nothing gets allocated
+
+   if (width  != NULL) *width  = gbm.w;
+   if (height != NULL) *height = gbm.h;
+   if (xoff   != NULL) *xoff   = ix0;
+   if (yoff   != NULL) *yoff   = iy0;
+
+   if (gbm.w != 0 && gbm.h != 0)
+      gbm.pixels = (unsigned char *) STBTT_malloc(gbm.w * gbm.h, info->userdata);
+
+   if (gbm.pixels != NULL) {
+      gbm.stride = gbm.w;
+      stbtt_Rasterize(&gbm, 0.35f, vertices, num_verts, scale_x, scale_y, shift_x, shift_y, ix0, iy0, 1, info->userdata);
+   }
+
+   STBTT_free(vertices, info->userdata);
+   return gbm.pixels;
+}
+
+// Same as stbtt_GetGlyphBitmapSubpixel with a subpixel shift of (0,0).
+STBTT_DEF unsigned char *stbtt_GetGlyphBitmap(const stbtt_fontinfo *info, float scale_x, float scale_y, int glyph, int *width, int *height, int *xoff, int *yoff)
+{
+   return stbtt_GetGlyphBitmapSubpixel(info, scale_x, scale_y, 0.0f, 0.0f, glyph, width, height, xoff, yoff);
+}
+
+// Renders `glyph` into the caller-provided buffer `output`, which is treated
+// as out_w x out_h pixels with out_stride bytes per row. Nothing is drawn when
+// out_w or out_h is zero.
+STBTT_DEF void stbtt_MakeGlyphBitmapSubpixel(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, float shift_x, float shift_y, int glyph)
+{
+   stbtt__bitmap gbm;
+   stbtt_vertex *vertices;
+   int ix0,iy0;
+   int num_verts = stbtt_GetGlyphShape(info, glyph, &vertices);
+
+   // only the top-left corner of the box is needed; the size comes from the caller
+   stbtt_GetGlyphBitmapBoxSubpixel(info, glyph, scale_x, scale_y, shift_x, shift_y, &ix0,&iy0,0,0);
+
+   if (out_w != 0 && out_h != 0) {
+      gbm.pixels = output;
+      gbm.w = out_w;
+      gbm.h = out_h;
+      gbm.stride = out_stride;
+      stbtt_Rasterize(&gbm, 0.35f, vertices, num_verts, scale_x, scale_y, shift_x, shift_y, ix0,iy0, 1, info->userdata);
+   }
+
+   STBTT_free(vertices, info->userdata);
+}
+
+// Same as stbtt_MakeGlyphBitmapSubpixel with a subpixel shift of (0,0).
+STBTT_DEF void stbtt_MakeGlyphBitmap(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, int glyph)
+{
+   stbtt_MakeGlyphBitmapSubpixel(info, output, out_w, out_h, out_stride, scale_x, scale_y, 0.0f,0.0f, glyph);
+}
+
+// Maps `codepoint` to a glyph index with stbtt_FindGlyphIndex and returns the
+// result of stbtt_GetGlyphBitmapSubpixel for that glyph.
+STBTT_DEF unsigned char *stbtt_GetCodepointBitmapSubpixel(const stbtt_fontinfo *info, float scale_x, float scale_y, float shift_x, float shift_y, int codepoint, int *width, int *height, int *xoff, int *yoff)
+{
+   int glyph = stbtt_FindGlyphIndex(info, codepoint);
+   return stbtt_GetGlyphBitmapSubpixel(info, scale_x, scale_y, shift_x, shift_y, glyph, width, height, xoff, yoff);
+}
+
+// Maps `codepoint` to a glyph index with stbtt_FindGlyphIndex and renders that
+// glyph into `output` via stbtt_MakeGlyphBitmapSubpixel.
+STBTT_DEF void stbtt_MakeCodepointBitmapSubpixel(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, float shift_x, float shift_y, int codepoint)
+{
+   int glyph = stbtt_FindGlyphIndex(info, codepoint);
+   stbtt_MakeGlyphBitmapSubpixel(info, output, out_w, out_h, out_stride, scale_x, scale_y, shift_x, shift_y, glyph);
+}
+
+// Same as stbtt_GetCodepointBitmapSubpixel with a subpixel shift of (0,0).
+STBTT_DEF unsigned char *stbtt_GetCodepointBitmap(const stbtt_fontinfo *info, float scale_x, float scale_y, int codepoint, int *width, int *height, int *xoff, int *yoff)
+{
+   return stbtt_GetCodepointBitmapSubpixel(info, scale_x, scale_y, 0.0f,0.0f, codepoint, width,height,xoff,yoff);
+}
+
+// Same as stbtt_MakeCodepointBitmapSubpixel with a subpixel shift of (0,0).
+STBTT_DEF void stbtt_MakeCodepointBitmap(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, int codepoint)
+{
+   stbtt_MakeCodepointBitmapSubpixel(info, output, out_w, out_h, out_stride, scale_x, scale_y, 0.0f,0.0f, codepoint);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// bitmap baking
+//
+// This is SUPER-CRAPPY packing to keep source code small
+
+// Renders num_chars consecutive characters, starting at first_char, into the
+// pw x ph bitmap `pixels`, placing them left to right in rows with a one pixel
+// gap, and records each character's placement in chardata[].
+// Returns -1 if the font cannot be initialised, -i if character i no longer
+// fits vertically (so 0 when not even the first one fits), otherwise the first
+// unused row of the bitmap.
+STBTT_DEF int stbtt_BakeFontBitmap(const unsigned char *data, int offset,  // font location (use offset=0 for plain .ttf)
+                                float pixel_height,                     // height of font in pixels
+                                unsigned char *pixels, int pw, int ph,  // bitmap to be filled in
+                                int first_char, int num_chars,          // characters to bake
+                                stbtt_bakedchar *chardata)
+{
+   stbtt_fontinfo font;
+   float scale;
+   int pen_x = 1, pen_y = 1; // top-left corner of the next glyph cell
+   int next_row_y = 1;       // first row below everything placed so far
+   int i;
+
+   font.userdata = NULL;
+   if (!stbtt_InitFont(&font, data, offset))
+      return -1;
+   STBTT_memset(pixels, 0, pw*ph); // background of 0 around pixels
+
+   scale = stbtt_ScaleForPixelHeight(&font, pixel_height);
+
+   for (i=0; i < num_chars; ++i) {
+      stbtt_bakedchar *out;
+      int advance, lsb, x0,y0,x1,y1,gw,gh;
+      int g = stbtt_FindGlyphIndex(&font, first_char + i);
+
+      stbtt_GetGlyphHMetrics(&font, g, &advance, &lsb);
+      stbtt_GetGlyphBitmapBox(&font, g, scale,scale, &x0,&y0,&x1,&y1);
+      gw = x1-x0;
+      gh = y1-y0;
+
+      // wrap to the start of the next row when the glyph does not fit horizontally
+      if (pen_x + gw + 1 >= pw) {
+         pen_y = next_row_y;
+         pen_x = 1;
+      }
+      // check if it fits vertically AFTER potentially moving to next row
+      if (pen_y + gh + 1 >= ph)
+         return -i;
+      STBTT_assert(pen_x+gw < pw);
+      STBTT_assert(pen_y+gh < ph);
+
+      stbtt_MakeGlyphBitmap(&font, pixels+pen_x+pen_y*pw, gw,gh,pw, scale,scale, g);
+
+      out = &chardata[i];
+      out->x0 = (stbtt_int16) pen_x;
+      out->y0 = (stbtt_int16) pen_y;
+      out->x1 = (stbtt_int16) (pen_x + gw);
+      out->y1 = (stbtt_int16) (pen_y + gh);
+      out->xadvance = scale * advance;
+      out->xoff     = (float) x0;
+      out->yoff     = (float) y0;
+
+      pen_x = pen_x + gw + 1;
+      if (pen_y+gh+1 > next_row_y)
+         next_row_y = pen_y+gh+1;
+   }
+   return next_row_y;
+}
+
+// Fills `q` with the screen rectangle and texture coordinates for baked
+// character chardata[char_index] drawn at (*xpos,*ypos), then advances *xpos
+// by the character's xadvance. *ypos is only read.
+//   pw/ph           - size of the baked bitmap; texture coordinates are the
+//                     baked pixel positions divided by these
+//   opengl_fillrule - when zero, a bias of -0.5 is added to the quad corners
+STBTT_DEF void stbtt_GetBakedQuad(stbtt_bakedchar *chardata, int pw, int ph, int char_index, float *xpos, float *ypos, stbtt_aligned_quad *q, int opengl_fillrule)
+{
+   stbtt_bakedchar *b = &chardata[char_index];
+   float d3d_bias = opengl_fillrule ? 0 : -0.5f;
+   float ipw = 1.0f / pw, iph = 1.0f / ph;
+   // quad origin rounded to the nearest whole pixel
+   int round_x = STBTT_ifloor((*xpos + b->xoff) + 0.5f);
+   int round_y = STBTT_ifloor((*ypos + b->yoff) + 0.5f);
+
+   // horizontal extent: screen position and texture coordinate
+   q->x0 = round_x + d3d_bias;
+   q->x1 = round_x + b->x1 - b->x0 + d3d_bias;
+   q->s0 = b->x0 * ipw;
+   q->s1 = b->x1 * ipw;
+
+   // vertical extent: screen position and texture coordinate
+   q->y0 = round_y + d3d_bias;
+   q->y1 = round_y + b->y1 - b->y0 + d3d_bias;
+   q->t0 = b->y0 * iph;
+   q->t1 = b->y1 * iph;
+
+   *xpos += b->xadvance;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// rectangle packing replacement routines if you don't have stb_rect_pack.h
+//
+
+#ifndef STB_RECT_PACK_VERSION
+
+// Coordinate type used by the fallback packer's stbrp_rect below.
+typedef int stbrp_coord;
+
+////////////////////////////////////////////////////////////////////////////////////
+//                                                                                //
+//                                                                                //
+// COMPILER WARNING ?!?!?                                                         //
+//                                                                                //
+//                                                                                //
+// if you get a compile warning due to these symbols being defined more than      //
+// once, move #include "stb_rect_pack.h" before #include "stb_truetype.h"         //
+//                                                                                //
+////////////////////////////////////////////////////////////////////////////////////
+
+// State of the fallback row packer: the target size plus the current pen
+// position (x,y) and the bottom of the tallest rectangle placed so far.
+typedef struct
+{
+   int width,height;
+   int x,y,bottom_y;
+} stbrp_context;
+
+// Placeholder node type; the fallback packer in this file never reads it
+// (stbrp_init_target marks its node arguments as unused).
+typedef struct
+{
+   unsigned char x;
+} stbrp_node;
+
+// One rectangle to pack. stbrp_pack_rects reads w and h, and writes x, y and
+// was_packed (1 if the rectangle was placed, 0 otherwise).
+struct stbrp_rect
+{
+   stbrp_coord x,y;
+   int id,w,h,was_packed;
+};
+
+// Resets the fallback packer to an empty pw x ph target. `nodes` and
+// `num_nodes` are accepted but unused by this implementation.
+static void stbrp_init_target(stbrp_context *con, int pw, int ph, stbrp_node *nodes, int num_nodes)
+{
+   STBTT__NOTUSED(nodes);
+   STBTT__NOTUSED(num_nodes);
+
+   // start packing from the top-left corner
+   con->x = con->y = con->bottom_y = 0;
+   con->width  = pw;
+   con->height = ph;
+}
+
+// Place rects left to right in rows, starting a new row below everything
+// placed so far whenever a rect would cross the right edge. Packing stops at
+// the first rect that does not fit vertically; that rect and all later ones
+// are flagged as not packed.
+static void stbrp_pack_rects(stbrp_context *con, stbrp_rect *rects, int num_rects)
+{
+   int idx = 0;
+   int bottom;
+
+   while (idx < num_rects) {
+      stbrp_rect *r = &rects[idx];
+
+      // wrap to the start of the next row
+      if (con->x + r->w > con->width) {
+         con->x = 0;
+         con->y = con->bottom_y;
+      }
+
+      bottom = con->y + r->h;
+      if (bottom > con->height)
+         break;
+
+      r->x = con->x;
+      r->y = con->y;
+      r->was_packed = 1;
+
+      con->x += r->w;
+      if (bottom > con->bottom_y)
+         con->bottom_y = bottom;
+
+      ++idx;
+   }
+
+   // everything from the first failure onward is reported as unpacked
+   while (idx < num_rects)
+      rects[idx++].was_packed = 0;
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// bitmap baking
+//
+// This is SUPER-AWESOME (tm Ryan Gordon) packing using stb_rect_pack.h. If
+// stb_rect_pack.h isn't available, it uses the BakeFontBitmap strategy.
+
+// Initialize a packing context that renders into the pw x ph bitmap 'pixels'.
+// Allocates the packer state and node array through STBTT_malloc with
+// alloc_context, resets oversampling to 1x1 and, if pixels is non-NULL,
+// clears pw*ph bytes of it. A stride_in_bytes of 0 means "use pw".
+// Returns 1 on success, 0 if either allocation fails.
+STBTT_DEF int stbtt_PackBegin(stbtt_pack_context *spc, unsigned char *pixels, int pw, int ph, int stride_in_bytes, int padding, void *alloc_context)
+{
+   int            num_nodes = pw - padding;
+   stbrp_context *context   = (stbrp_context *) STBTT_malloc(sizeof(*context), alloc_context);
+   stbrp_node    *nodes     = (stbrp_node    *) STBTT_malloc(sizeof(*nodes) * num_nodes, alloc_context);
+
+   if (!context || !nodes) {
+      // release whichever of the two allocations did succeed
+      if (context) STBTT_free(context, alloc_context);
+      if (nodes)   STBTT_free(nodes, alloc_context);
+      return 0;
+   }
+
+   if (stride_in_bytes == 0)
+      stride_in_bytes = pw;
+
+   spc->user_allocator_context = alloc_context;
+   spc->pixels          = pixels;
+   spc->width           = pw;
+   spc->height          = ph;
+   spc->stride_in_bytes = stride_in_bytes;
+   spc->padding         = padding;
+   spc->pack_info       = context;
+   spc->nodes           = nodes;
+   spc->h_oversample    = 1;
+   spc->v_oversample    = 1;
+
+   // the packer works on the area that remains after reserving the padding
+   stbrp_init_target(context, pw-padding, ph-padding, nodes, num_nodes);
+
+   if (pixels)
+      STBTT_memset(pixels, 0, pw*ph); // background of 0 around pixels
+
+   return 1;
+}
+
+STBTT_DEF void stbtt_PackEnd  (stbtt_pack_context *spc)
+{ // free the node array and packer state held by the context, passing the allocator context stored in it
+   STBTT_free(spc->nodes    , spc->user_allocator_context);
+   STBTT_free(spc->pack_info, spc->user_allocator_context);
+}
+
+// Set the horizontal and vertical oversampling factors of the context.
+// Each factor must not exceed STBTT_MAX_OVERSAMPLE; this is asserted, and a
+// factor that is out of range leaves the corresponding setting unchanged.
+STBTT_DEF void stbtt_PackSetOversampling(stbtt_pack_context *spc, unsigned int h_oversample, unsigned int v_oversample)
+{
+   int h_in_range = h_oversample <= STBTT_MAX_OVERSAMPLE;
+   int v_in_range = v_oversample <= STBTT_MAX_OVERSAMPLE;
+
+   STBTT_assert(h_oversample <= STBTT_MAX_OVERSAMPLE);
+   STBTT_assert(v_oversample <= STBTT_MAX_OVERSAMPLE);
+
+   if (h_in_range)
+      spc->h_oversample = h_oversample;
+   if (v_in_range)
+      spc->v_oversample = v_oversample;
+}
+
+#define STBTT__OVER_MASK  (STBTT_MAX_OVERSAMPLE-1) // index mask for the prefilter ring buffers; assumes STBTT_MAX_OVERSAMPLE is a power of two -- TODO confirm
+
+static void stbtt__h_prefilter(unsigned char *pixels, int w, int h, int stride_in_bytes, unsigned int kernel_width)
+{ // in-place horizontal box filter: each pixel becomes the mean of the kernel_width source pixels ending at its column
+   unsigned char buffer[STBTT_MAX_OVERSAMPLE]; // ring buffer holding the source pixels currently inside the window
+   int safe_w = w - kernel_width; // last column at which a new source pixel is still read
+   int j;
+   STBTT_memset(buffer, 0, STBTT_MAX_OVERSAMPLE); // suppress bogus warning from VS2013 -analyze
+   for (j=0; j < h; ++j) {
+      int i;
+      unsigned int total; // running sum of the window
+      STBTT_memset(buffer, 0, kernel_width); // every row starts with an all-zero window
+
+      total = 0;
+
+      // make kernel_width a constant in common cases so compiler can optimize out the divide
+      switch (kernel_width) {
+         case 2:
+            for (i=0; i <= safe_w; ++i) {
+               total += pixels[i] - buffer[i & STBTT__OVER_MASK]; // add the incoming pixel, drop the one leaving the window
+               buffer[(i+kernel_width) & STBTT__OVER_MASK] = pixels[i]; // keep the source value before it is overwritten
+               pixels[i] = (unsigned char) (total / 2);
+            }
+            break;
+         case 3:
+            for (i=0; i <= safe_w; ++i) {
+               total += pixels[i] - buffer[i & STBTT__OVER_MASK];
+               buffer[(i+kernel_width) & STBTT__OVER_MASK] = pixels[i];
+               pixels[i] = (unsigned char) (total / 3);
+            }
+            break;
+         case 4:
+            for (i=0; i <= safe_w; ++i) {
+               total += pixels[i] - buffer[i & STBTT__OVER_MASK];
+               buffer[(i+kernel_width) & STBTT__OVER_MASK] = pixels[i];
+               pixels[i] = (unsigned char) (total / 4);
+            }
+            break;
+         case 5:
+            for (i=0; i <= safe_w; ++i) {
+               total += pixels[i] - buffer[i & STBTT__OVER_MASK];
+               buffer[(i+kernel_width) & STBTT__OVER_MASK] = pixels[i];
+               pixels[i] = (unsigned char) (total / 5);
+            }
+            break;
+         default:
+            for (i=0; i <= safe_w; ++i) {
+               total += pixels[i] - buffer[i & STBTT__OVER_MASK];
+               buffer[(i+kernel_width) & STBTT__OVER_MASK] = pixels[i];
+               pixels[i] = (unsigned char) (total / kernel_width);
+            }
+            break;
+      }
+
+      for (; i < w; ++i) { // tail columns: no new pixel enters the window, it only drains
+         STBTT_assert(pixels[i] == 0);
+         total -= buffer[i & STBTT__OVER_MASK];
+         pixels[i] = (unsigned char) (total / kernel_width);
+      }
+
+      pixels += stride_in_bytes; // next row
+   }
+}
+
+static void stbtt__v_prefilter(unsigned char *pixels, int w, int h, int stride_in_bytes, unsigned int kernel_width)
+{ // in-place vertical box filter: each pixel becomes the mean of the kernel_width source pixels ending at its row
+   unsigned char buffer[STBTT_MAX_OVERSAMPLE]; // ring buffer holding the source pixels currently inside the window
+   int safe_h = h - kernel_width; // last row at which a new source pixel is still read
+   int j;
+   STBTT_memset(buffer, 0, STBTT_MAX_OVERSAMPLE); // suppress bogus warning from VS2013 -analyze
+   for (j=0; j < w; ++j) {
+      int i;
+      unsigned int total; // running sum of the window
+      STBTT_memset(buffer, 0, kernel_width); // every column starts with an all-zero window
+
+      total = 0;
+
+      // make kernel_width a constant in common cases so compiler can optimize out the divide
+      switch (kernel_width) {
+         case 2:
+            for (i=0; i <= safe_h; ++i) {
+               total += pixels[i*stride_in_bytes] - buffer[i & STBTT__OVER_MASK]; // add the incoming pixel, drop the one leaving the window
+               buffer[(i+kernel_width) & STBTT__OVER_MASK] = pixels[i*stride_in_bytes]; // keep the source value before it is overwritten
+               pixels[i*stride_in_bytes] = (unsigned char) (total / 2);
+            }
+            break;
+         case 3:
+            for (i=0; i <= safe_h; ++i) {
+               total += pixels[i*stride_in_bytes] - buffer[i & STBTT__OVER_MASK];
+               buffer[(i+kernel_width) & STBTT__OVER_MASK] = pixels[i*stride_in_bytes];
+               pixels[i*stride_in_bytes] = (unsigned char) (total / 3);
+            }
+            break;
+         case 4:
+            for (i=0; i <= safe_h; ++i) {
+               total += pixels[i*stride_in_bytes] - buffer[i & STBTT__OVER_MASK];
+               buffer[(i+kernel_width) & STBTT__OVER_MASK] = pixels[i*stride_in_bytes];
+               pixels[i*stride_in_bytes] = (unsigned char) (total / 4);
+            }
+            break;
+         case 5:
+            for (i=0; i <= safe_h; ++i) {
+               total += pixels[i*stride_in_bytes] - buffer[i & STBTT__OVER_MASK];
+               buffer[(i+kernel_width) & STBTT__OVER_MASK] = pixels[i*stride_in_bytes];
+               pixels[i*stride_in_bytes] = (unsigned char) (total / 5);
+            }
+            break;
+         default:
+            for (i=0; i <= safe_h; ++i) {
+               total += pixels[i*stride_in_bytes] - buffer[i & STBTT__OVER_MASK];
+               buffer[(i+kernel_width) & STBTT__OVER_MASK] = pixels[i*stride_in_bytes];
+               pixels[i*stride_in_bytes] = (unsigned char) (total / kernel_width);
+            }
+            break;
+      }
+
+      for (; i < h; ++i) { // tail rows: no new pixel enters the window, it only drains
+         STBTT_assert(pixels[i*stride_in_bytes] == 0);
+         total -= buffer[i & STBTT__OVER_MASK];
+         pixels[i*stride_in_bytes] = (unsigned char) (total / kernel_width);
+      }
+
+      pixels += 1; // next column
+   }
+}
+
+// Sub-pixel offset that cancels the phase shift of the box prefilter.
+// A box filter of width "oversample" shifts the image by (oversample - 1)/2
+// pixels in oversampled space; the value returned here is that shift,
+// negated and expressed in output pixels. An oversample of 0 gives no shift.
+static float stbtt__oversample_shift(int oversample)
+{
+   float shift = 0.0f;
+
+   if (oversample != 0)
+      shift = (float)-(oversample - 1) / (2.0f * (float)oversample);
+
+   return shift;
+}
+
+// Compute the packing size of every character in the given ranges and store
+// it in rects[], in range order. Also copies the context's current
+// oversampling factors into each range. Returns the number of rects written.
+// The rects array must be big enough to accommodate all characters in the
+// given ranges.
+STBTT_DEF int stbtt_PackFontRangesGatherRects(stbtt_pack_context *spc, stbtt_fontinfo *info, stbtt_pack_range *ranges, int num_ranges, stbrp_rect *rects)
+{
+   int range_idx, char_idx;
+   int count = 0;
+
+   for (range_idx = 0; range_idx < num_ranges; ++range_idx) {
+      stbtt_pack_range *range = &ranges[range_idx];
+      float fh = range->font_size;
+      float scale;
+
+      // a positive size is a pixel height; otherwise its negation maps the em to pixels
+      if (fh > 0)
+         scale = stbtt_ScaleForPixelHeight(info, fh);
+      else
+         scale = stbtt_ScaleForMappingEmToPixels(info, -fh);
+
+      range->h_oversample = (unsigned char) spc->h_oversample;
+      range->v_oversample = (unsigned char) spc->v_oversample;
+
+      for (char_idx = 0; char_idx < range->num_chars; ++char_idx) {
+         stbrp_rect *r = &rects[count++];
+         int x0,y0,x1,y1;
+         int codepoint, glyph;
+
+         // codepoints come either from an explicit list or from a contiguous run
+         if (range->array_of_unicode_codepoints == NULL)
+            codepoint = range->first_unicode_codepoint_in_range + char_idx;
+         else
+            codepoint = range->array_of_unicode_codepoints[char_idx];
+
+         glyph = stbtt_FindGlyphIndex(info, codepoint);
+         stbtt_GetGlyphBitmapBoxSubpixel(info, glyph,
+                                         scale * spc->h_oversample,
+                                         scale * spc->v_oversample,
+                                         0,0,
+                                         &x0,&y0,&x1,&y1);
+
+         // glyph box plus padding plus (oversample-1) extra columns/rows
+         r->w = (stbrp_coord) (x1-x0 + spc->padding + spc->h_oversample-1);
+         r->h = (stbrp_coord) (y1-y0 + spc->padding + spc->v_oversample-1);
+      }
+   }
+
+   return count;
+}
+
+// rects array must be big enough to accommodate all characters in the given ranges
+STBTT_DEF int stbtt_PackFontRangesRenderIntoRects(stbtt_pack_context *spc, stbtt_fontinfo *info, stbtt_pack_range *ranges, int num_ranges, stbrp_rect *rects)
+{ // render every packed glyph into spc->pixels at its rect and fill in its stbtt_packedchar; returns 0 if any rect was not packed, else 1
+   int i,j,k, return_value = 1;
+
+   // save current values
+   int old_h_over = spc->h_oversample;
+   int old_v_over = spc->v_oversample;
+
+   k = 0;
+   for (i=0; i < num_ranges; ++i) {
+      float fh = ranges[i].font_size;
+      float scale = fh > 0 ? stbtt_ScaleForPixelHeight(info, fh) : stbtt_ScaleForMappingEmToPixels(info, -fh);
+      float recip_h,recip_v,sub_x,sub_y;
+      spc->h_oversample = ranges[i].h_oversample; // use the factors recorded in the range, not the context's current ones
+      spc->v_oversample = ranges[i].v_oversample;
+      recip_h = 1.0f / spc->h_oversample; // converts oversampled units back to output pixels
+      recip_v = 1.0f / spc->v_oversample;
+      sub_x = stbtt__oversample_shift(spc->h_oversample);
+      sub_y = stbtt__oversample_shift(spc->v_oversample);
+      for (j=0; j < ranges[i].num_chars; ++j) {
+         stbrp_rect *r = &rects[k];
+         if (r->was_packed) {
+            stbtt_packedchar *bc = &ranges[i].chardata_for_range[j];
+            int advance, lsb, x0,y0,x1,y1;
+            int codepoint = ranges[i].array_of_unicode_codepoints == NULL ? ranges[i].first_unicode_codepoint_in_range + j : ranges[i].array_of_unicode_codepoints[j];
+            int glyph = stbtt_FindGlyphIndex(info, codepoint);
+            stbrp_coord pad = (stbrp_coord) spc->padding;
+
+            // pad on left and top
+            r->x += pad;
+            r->y += pad;
+            r->w -= pad; // note: modifies the caller's rect, which from here on excludes the padding
+            r->h -= pad;
+            stbtt_GetGlyphHMetrics(info, glyph, &advance, &lsb);
+            stbtt_GetGlyphBitmapBox(info, glyph,
+                                    scale * spc->h_oversample,
+                                    scale * spc->v_oversample,
+                                    &x0,&y0,&x1,&y1);
+            stbtt_MakeGlyphBitmapSubpixel(info,
+                                          spc->pixels + r->x + r->y*spc->stride_in_bytes,
+                                          r->w - spc->h_oversample+1, // rendered size leaves out the (oversample-1) extra columns
+                                          r->h - spc->v_oversample+1, // and rows that stbtt_PackFontRangesGatherRects added
+                                          spc->stride_in_bytes,
+                                          scale * spc->h_oversample,
+                                          scale * spc->v_oversample,
+                                          0,0,
+                                          glyph);
+
+            if (spc->h_oversample > 1) // filter over the whole rect, including the extra columns
+               stbtt__h_prefilter(spc->pixels + r->x + r->y*spc->stride_in_bytes,
+                                  r->w, r->h, spc->stride_in_bytes,
+                                  spc->h_oversample);
+
+            if (spc->v_oversample > 1)
+               stbtt__v_prefilter(spc->pixels + r->x + r->y*spc->stride_in_bytes,
+                                  r->w, r->h, spc->stride_in_bytes,
+                                  spc->v_oversample);
+
+            bc->x0       = (stbtt_int16)  r->x; // x0,y0,x1,y1: the glyph's rectangle inside the bitmap
+            bc->y0       = (stbtt_int16)  r->y;
+            bc->x1       = (stbtt_int16) (r->x + r->w);
+            bc->y1       = (stbtt_int16) (r->y + r->h);
+            bc->xadvance =                scale * advance;
+            bc->xoff     =       (float)  x0 * recip_h + sub_x; // offsets are scaled back by the oversampling factor and phase-corrected
+            bc->yoff     =       (float)  y0 * recip_v + sub_y;
+            bc->xoff2    =                (x0 + r->w) * recip_h + sub_x;
+            bc->yoff2    =                (y0 + r->h) * recip_v + sub_y;
+         } else {
+            return_value = 0; // if any fail, report failure
+         }
+
+         ++k; // rects are consumed in the same order stbtt_PackFontRangesGatherRects produced them
+      }
+   }
+
+   // restore original values
+   spc->h_oversample = old_h_over;
+   spc->v_oversample = old_v_over;
+
+   return return_value;
+}
+
+// Assign positions to the given rects using the packer state stored in the
+// pack context.
+STBTT_DEF void stbtt_PackFontRangesPackRects(stbtt_pack_context *spc, stbrp_rect *rects, int num_rects)
+{
+   stbrp_context *packer = (stbrp_context *) spc->pack_info;
+
+   stbrp_pack_rects(packer, rects, num_rects);
+}
+
+// Pack and render all characters of the given ranges from font number
+// font_index in fontdata into the context's bitmap, filling in each range's
+// chardata_for_range. All chardata rectangles are zeroed first so that
+// characters that do not get packed are recognizable.
+// Returns 1 on success. Returns 0 if the font index does not exist, the font
+// cannot be initialized, the temporary rect array cannot be allocated, or any
+// character failed to pack.
+STBTT_DEF int stbtt_PackFontRanges(stbtt_pack_context *spc, unsigned char *fontdata, int font_index, stbtt_pack_range *ranges, int num_ranges)
+{
+   stbtt_fontinfo info;
+   int i,j,n, font_offset, return_value = 1;
+   stbrp_rect    *rects;
+
+   // flag all characters as NOT packed
+   for (i=0; i < num_ranges; ++i)
+      for (j=0; j < ranges[i].num_chars; ++j)
+         ranges[i].chardata_for_range[j].x0 =
+         ranges[i].chardata_for_range[j].y0 =
+         ranges[i].chardata_for_range[j].x1 =
+         ranges[i].chardata_for_range[j].y1 = 0;
+
+   // validate the font before doing any work: a negative offset means there
+   // is no font at font_index and must not be used to index into fontdata
+   font_offset = stbtt_GetFontOffsetForIndex(fontdata, font_index);
+   if (font_offset < 0)
+      return 0;
+
+   info.userdata = spc->user_allocator_context;
+   if (!stbtt_InitFont(&info, fontdata, font_offset))
+      return 0; // info is not usable, so nothing can be measured or rendered
+
+   n = 0;
+   for (i=0; i < num_ranges; ++i)
+      n += ranges[i].num_chars;
+
+   rects = (stbrp_rect *) STBTT_malloc(sizeof(*rects) * n, spc->user_allocator_context);
+   if (rects == NULL)
+      return 0;
+
+   // measure, place, then rasterize
+   n = stbtt_PackFontRangesGatherRects(spc, &info, ranges, num_ranges, rects);
+
+   stbtt_PackFontRangesPackRects(spc, rects, n);
+
+   return_value = stbtt_PackFontRangesRenderIntoRects(spc, &info, ranges, num_ranges, rects);
+
+   STBTT_free(rects, spc->user_allocator_context);
+   return return_value;
+}
+
+// Convenience wrapper: describes one contiguous run of codepoints as a single
+// stbtt_pack_range and forwards it to stbtt_PackFontRanges, returning its
+// result.
+STBTT_DEF int stbtt_PackFontRange(stbtt_pack_context *spc, unsigned char *fontdata, int font_index, float font_size,
+            int first_unicode_codepoint_in_range, int num_chars_in_range, stbtt_packedchar *chardata_for_range)
+{
+   stbtt_pack_range single;
+
+   single.font_size                        = font_size;
+   single.first_unicode_codepoint_in_range = first_unicode_codepoint_in_range;
+   single.array_of_unicode_codepoints      = NULL; // contiguous run, no explicit codepoint list
+   single.num_chars                        = num_chars_in_range;
+   single.chardata_for_range               = chardata_for_range;
+
+   return stbtt_PackFontRanges(spc, fontdata, font_index, &single, 1);
+}
+
+// Fill *q with the screen-space corners and texture coordinates of packed
+// character char_index for a pw x ph texture, with the pen at (*xpos,*ypos),
+// then advance *xpos by the character's xadvance. When align_to_integer is
+// nonzero the top-left corner is rounded to the nearest integer position and
+// the quad keeps its size.
+STBTT_DEF void stbtt_GetPackedQuad(stbtt_packedchar *chardata, int pw, int ph, int char_index, float *xpos, float *ypos, stbtt_aligned_quad *q, int align_to_integer)
+{
+   stbtt_packedchar *b = &chardata[char_index];
+   float ipw = 1.0f / pw;
+   float iph = 1.0f / ph;
+
+   if (!align_to_integer) {
+      q->x0 = *xpos + b->xoff;
+      q->y0 = *ypos + b->yoff;
+      q->x1 = *xpos + b->xoff2;
+      q->y1 = *ypos + b->yoff2;
+   } else {
+      // snap the top-left corner, then offset by the glyph extent
+      float x = (float) STBTT_ifloor((*xpos + b->xoff) + 0.5f);
+      float y = (float) STBTT_ifloor((*ypos + b->yoff) + 0.5f);
+      q->x0 = x;
+      q->y0 = y;
+      q->x1 = x + b->xoff2 - b->xoff;
+      q->y1 = y + b->yoff2 - b->yoff;
+   }
+
+   // bitmap pixel coordinates scaled to the 0..1 texture range
+   q->s0 = b->x0 * ipw;
+   q->t0 = b->y0 * iph;
+   q->s1 = b->x1 * ipw;
+   q->t1 = b->y1 * iph;
+
+   *xpos += b->xadvance;
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// font name matching -- recommended not to use this
+//
+
+// Check whether the UTF-8 string s1 (len1 bytes) starts with the big-endian
+// UTF-16 string s2 (len2 bytes). s2 is converted to UTF-8 one character at a
+// time and compared against s1 while converting.
+// Returns the number of bytes of s1 that were matched, or -1 if the strings
+// differ, s1 is too short, or s2 is malformed (negative or odd byte length,
+// unpaired or truncated surrogate).
+static stbtt_int32 stbtt__CompareUTF8toUTF16_bigendian_prefix(const stbtt_uint8 *s1, stbtt_int32 len1, const stbtt_uint8 *s2, stbtt_int32 len2)
+{
+   stbtt_int32 i=0;
+
+   // UTF-16 data is a whole number of 2-byte units; an odd length would make
+   // len2 step past zero and the loop read beyond the end of s2
+   if (len2 < 0 || (len2 & 1)) return -1;
+
+   // convert utf16 to utf8 and compare the results while converting
+   while (len2) {
+      stbtt_uint16 ch = s2[0]*256 + s2[1];
+      if (ch < 0x80) {
+         if (i >= len1) return -1;
+         if (s1[i++] != ch) return -1;
+      } else if (ch < 0x800) {
+         if (i+1 >= len1) return -1;
+         if (s1[i++] != 0xc0 + (ch >> 6)) return -1;
+         if (s1[i++] != 0x80 + (ch & 0x3f)) return -1;
+      } else if (ch >= 0xd800 && ch < 0xdc00) {
+         stbtt_uint32 c;
+         stbtt_uint16 ch2;
+         // a high surrogate needs a second code unit; do not read past s2
+         if (len2 < 4) return -1;
+         ch2 = s2[2]*256 + s2[3];
+         // ...and that unit has to be a low surrogate to form a valid pair
+         if (ch2 < 0xdc00 || ch2 >= 0xe000) return -1;
+         if (i+3 >= len1) return -1;
+         c = ((ch - 0xd800) << 10) + (ch2 - 0xdc00) + 0x10000;
+         if (s1[i++] != 0xf0 + (c >> 18)) return -1;
+         if (s1[i++] != 0x80 + ((c >> 12) & 0x3f)) return -1;
+         if (s1[i++] != 0x80 + ((c >>  6) & 0x3f)) return -1;
+         if (s1[i++] != 0x80 + ((c      ) & 0x3f)) return -1;
+         s2 += 2; // plus another 2 below
+         len2 -= 2;
+      } else if (ch >= 0xdc00 && ch < 0xe000) {
+         // low surrogate without a preceding high surrogate
+         return -1;
+      } else {
+         if (i+2 >= len1) return -1;
+         if (s1[i++] != 0xe0 + (ch >> 12)) return -1;
+         if (s1[i++] != 0x80 + ((ch >> 6) & 0x3f)) return -1;
+         if (s1[i++] != 0x80 + ((ch     ) & 0x3f)) return -1;
+      }
+      s2 += 2;
+      len2 -= 2;
+   }
+   return i;
+}
+
+// Returns nonzero when the prefix comparison of the UTF-8 string s1 against
+// the big-endian UTF-16 string s2 consumes exactly len1 bytes, i.e. the two
+// strings are equal.
+STBTT_DEF int stbtt_CompareUTF8toUTF16_bigendian(const char *s1, int len1, const char *s2, int len2)
+{
+   const stbtt_uint8 *utf8  = (const stbtt_uint8*) s1;
+   const stbtt_uint8 *utf16 = (const stbtt_uint8*) s2;
+   stbtt_int32 matched = stbtt__CompareUTF8toUTF16_bigendian_prefix(utf8, len1, utf16, len2);
+
+   return len1 == matched;
+}
+
+// Look up the string of the first "name" table record whose platform,
+// encoding, language and name IDs all equal the requested ones. On success
+// stores the string's byte length in *length and returns a pointer into the
+// font data; returns NULL if the font has no name table or no record matches.
+// The result is in whatever encoding you request... but note that 2-byte
+// encodings will be BIG-ENDIAN... use stbtt_CompareUTF8toUTF16_bigendian()
+// to compare.
+STBTT_DEF const char *stbtt_GetFontNameString(const stbtt_fontinfo *font, int *length, int platformID, int encodingID, int languageID, int nameID)
+{
+   stbtt_uint8 *fc = font->data;
+   stbtt_uint32 offset = font->fontstart;
+   stbtt_uint32 nm = stbtt__find_table(fc, offset, "name");
+   stbtt_int32 i, count, stringOffset;
+
+   if (nm == 0)
+      return NULL;
+
+   count        = ttUSHORT(fc+nm+2);
+   stringOffset = nm + ttUSHORT(fc+nm+4);
+
+   // name records are 12 bytes each and start 6 bytes into the table
+   for (i = 0; i < count; ++i) {
+      stbtt_uint32 loc = nm + 6 + 12 * i;
+
+      if (platformID != ttUSHORT(fc+loc+0)) continue;
+      if (encodingID != ttUSHORT(fc+loc+2)) continue;
+      if (languageID != ttUSHORT(fc+loc+4)) continue;
+      if (nameID     != ttUSHORT(fc+loc+6)) continue;
+
+      *length = ttUSHORT(fc+loc+8);
+      return (const char *) (fc+stringOffset+ttUSHORT(fc+loc+10));
+   }
+
+   return NULL;
+}
+
+static int stbtt__matchpair(stbtt_uint8 *fc, stbtt_uint32 nm, stbtt_uint8 *name, stbtt_int32 nlen, stbtt_int32 target_id, stbtt_int32 next_id)
+{ // returns 1 if the UTF-8 name equals a target_id name record, optionally followed by a space and the adjacent next_id record; else 0
+   stbtt_int32 i;
+   stbtt_int32 count = ttUSHORT(fc+nm+2); // number of name records
+   stbtt_int32 stringOffset = nm + ttUSHORT(fc+nm+4); // start of the string storage
+
+   for (i=0; i < count; ++i) {
+      stbtt_uint32 loc = nm + 6 + 12 * i; // records are 12 bytes each, starting 6 bytes into the table
+      stbtt_int32 id = ttUSHORT(fc+loc+6);
+      if (id == target_id) {
+         // find the encoding
+         stbtt_int32 platform = ttUSHORT(fc+loc+0), encoding = ttUSHORT(fc+loc+2), language = ttUSHORT(fc+loc+4);
+
+         // is this a Unicode encoding?
+         if (platform == 0 || (platform == 3 && encoding == 1) || (platform == 3 && encoding == 10)) {
+            stbtt_int32 slen = ttUSHORT(fc+loc+8);
+            stbtt_int32 off = ttUSHORT(fc+loc+10);
+
+            // check if there's a prefix match
+            stbtt_int32 matchlen = stbtt__CompareUTF8toUTF16_bigendian_prefix(name, nlen, fc+stringOffset+off,slen);
+            if (matchlen >= 0) {
+               // check for next_id immediately following, with same encoding & language
+               if (i+1 < count && ttUSHORT(fc+loc+12+6) == next_id && ttUSHORT(fc+loc+12) == platform && ttUSHORT(fc+loc+12+2) == encoding && ttUSHORT(fc+loc+12+4) == language) {
+                  slen = ttUSHORT(fc+loc+12+8);
+                  off = ttUSHORT(fc+loc+12+10);
+                  if (slen == 0) { // empty second string: the first one alone must be the whole name
+                     if (matchlen == nlen)
+                        return 1;
+                  } else if (matchlen < nlen && name[matchlen] == ' ') { // otherwise expect "<first> <second>"
+                     ++matchlen; // skip the separating space
+                     if (stbtt_CompareUTF8toUTF16_bigendian((char*) (name+matchlen), nlen-matchlen, (char*)(fc+stringOffset+off),slen))
+                        return 1;
+                  }
+               } else {
+                  // if nothing immediately following
+                  if (matchlen == nlen)
+                     return 1;
+               }
+            }
+         }
+
+         // @TODO handle other encodings
+      }
+   }
+   return 0;
+}
+
+// Decide whether the font located at 'offset' inside the data 'fc' matches
+// the NUL-terminated UTF-8 'name'. If 'flags' is nonzero, its low three bits
+// must equal the low three bits of the head table's macStyle field, and the
+// name is then compared against name IDs 16, 1 and 3 on their own. If 'flags'
+// is zero, the ID pairs 16/17 and 1/2 are tried, then ID 3 on its own.
+// Returns 1 on a match, 0 otherwise (including when a required table is
+// missing).
+static int stbtt__matches(stbtt_uint8 *fc, stbtt_uint32 offset, stbtt_uint8 *name, stbtt_int32 flags)
+{
+   stbtt_int32 nlen = (stbtt_int32) STBTT_strlen((char *) name);
+   stbtt_uint32 nm,hd;
+   if (!stbtt__isfont(fc+offset)) return 0;
+
+   // check italics/bold/underline flags in macStyle...
+   if (flags) {
+      hd = stbtt__find_table(fc, offset, "head");
+      // a missing head table yields 0; reading at fc+0+44 would compare
+      // unrelated bytes from the start of the data against the flags
+      if (!hd) return 0;
+      if ((ttUSHORT(fc+hd+44) & 7) != (flags & 7)) return 0;
+   }
+
+   nm = stbtt__find_table(fc, offset, "name");
+   if (!nm) return 0;
+
+   if (flags) {
+      // if we checked the macStyle flags, then just check the family and ignore the subfamily
+      if (stbtt__matchpair(fc, nm, name, nlen, 16, -1))  return 1;
+      if (stbtt__matchpair(fc, nm, name, nlen,  1, -1))  return 1;
+      if (stbtt__matchpair(fc, nm, name, nlen,  3, -1))  return 1;
+   } else {
+      if (stbtt__matchpair(fc, nm, name, nlen, 16, 17))  return 1;
+      if (stbtt__matchpair(fc, nm, name, nlen,  1,  2))  return 1;
+      if (stbtt__matchpair(fc, nm, name, nlen,  3, -1))  return 1;
+   }
+
+   return 0;
+}
+
+// Walk the fonts of a collection by index and return the offset of the first
+// one that stbtt__matches accepts for the given name and flags. The search
+// ends when stbtt_GetFontOffsetForIndex returns a negative offset, and that
+// negative value is returned.
+STBTT_DEF int stbtt_FindMatchingFont(const unsigned char *font_collection, const char *name_utf8, stbtt_int32 flags)
+{
+   stbtt_int32 index = 0;
+   stbtt_int32 off;
+
+   while ((off = stbtt_GetFontOffsetForIndex(font_collection, index)) >= 0) {
+      if (stbtt__matches((stbtt_uint8 *) font_collection, off, (stbtt_uint8*) name_utf8, flags))
+         return off;
+      ++index;
+   }
+
+   return off;
+}
+
+#endif // STB_TRUETYPE_IMPLEMENTATION
+
+
+// FULL VERSION HISTORY
+//
+//   1.11 (2016-04-02) fix unused-variable warning
+//   1.10 (2016-04-02) allow user-defined fabs() replacement
+//                     fix memory leak if fontsize=0.0
+//                     fix warning from duplicate typedef
+//   1.09 (2016-01-16) warning fix; avoid crash on outofmem; use alloc userdata for PackFontRanges
+//   1.08 (2015-09-13) document stbtt_Rasterize(); fixes for vertical & horizontal edges
+//   1.07 (2015-08-01) allow PackFontRanges to accept arrays of sparse codepoints;
+//                     allow PackFontRanges to pack and render in separate phases;
+//                     fix stbtt_GetFontOffsetForIndex (never worked for non-0 input?);
+//                     fixed an assert() bug in the new rasterizer
+//                     replace assert() with STBTT_assert() in new rasterizer
+//   1.06 (2015-07-14) performance improvements (~35% faster on x86 and x64 on test machine)
+//                     also more precise AA rasterizer, except if shapes overlap
+//                     remove need for STBTT_sort
+//   1.05 (2015-04-15) fix misplaced definitions for STBTT_STATIC
+//   1.04 (2015-04-15) typo in example
+//   1.03 (2015-04-12) STBTT_STATIC, fix memory leak in new packing, various fixes
+//   1.02 (2014-12-10) fix various warnings & compile issues w/ stb_rect_pack, C++
+//   1.01 (2014-12-08) fix subpixel position when oversampling to exactly match
+//                        non-oversampled; STBTT_POINT_SIZE for packed case only
+//   1.00 (2014-12-06) add new PackBegin etc. API, w/ support for oversampling
+//   0.99 (2014-09-18) fix multiple bugs with subpixel rendering (ryg)
+//   0.9  (2014-08-07) support certain mac/iOS fonts without an MS platformID
+//   0.8b (2014-07-07) fix a warning
+//   0.8  (2014-05-25) fix a few more warnings
+//   0.7  (2013-09-25) bugfix: subpixel glyph bug fixed in 0.5 had come back
+//   0.6c (2012-07-24) improve documentation
+//   0.6b (2012-07-20) fix a few more warnings
+//   0.6  (2012-07-17) fix warnings; added stbtt_ScaleForMappingEmToPixels,
+//                        stbtt_GetFontBoundingBox, stbtt_IsGlyphEmpty
+//   0.5  (2011-12-09) bugfixes:
+//                        subpixel glyph renderer computed wrong bounding box
+//                        first vertex of shape can be off-curve (FreeSans)
+//   0.4b (2011-12-03) fixed an error in the font baking example
+//   0.4  (2011-12-01) kerning, subpixel rendering (tor)
+//                    bugfixes for:
+//                        codepoint-to-glyph conversion using table fmt=12
+//                        codepoint-to-glyph conversion using table fmt=4
+//                        stbtt_GetBakedQuad with non-square texture (Zer)
+//                    updated Hello World! sample to use kerning and subpixel
+//                    fixed some warnings
+//   0.3  (2009-06-24) cmap fmt=12, compound shapes (MM)
+//                    userdata, malloc-from-userdata, non-zero fill (stb)
+//   0.2  (2009-03-11) Fix unsigned/signed char warnings
+//   0.1  (2009-03-09) First public release
+//
diff --git a/thirdparty/misc/stb_vorbis.c b/thirdparty/misc/stb_vorbis.c
new file mode 100644
index 0000000000..c4f24d5898
--- /dev/null
+++ b/thirdparty/misc/stb_vorbis.c
@@ -0,0 +1,5399 @@
+// Ogg Vorbis audio decoder - v1.09 - public domain
+// http://nothings.org/stb_vorbis/
+//
+// Original version written by Sean Barrett in 2007.
+//
+// Originally sponsored by RAD Game Tools. Seeking sponsored
+// by Phillip Bennefall, Marc Andersen, Aaron Baker, Elias Software,
+// Aras Pranckevicius, and Sean Barrett.
+//
+// LICENSE
+//
+//   This software is dual-licensed to the public domain and under the following
+//   license: you are granted a perpetual, irrevocable license to copy, modify,
+//   publish, and distribute this file as you see fit.
+//
+// No warranty for any purpose is expressed or implied by the author (nor
+// by RAD Game Tools). Report bugs and send enhancements to the author.
+//
+// Limitations:
+//
+//   - floor 0 not supported (used in old ogg vorbis files pre-2004)
+//   - lossless sample-truncation at beginning ignored
+//   - cannot concatenate multiple vorbis streams
+//   - sample positions are 32-bit, limiting seekable 192Khz
+//       files to around 6 hours (Ogg supports 64-bit)
+//
+// Feature contributors:
+//    Dougall Johnson (sample-exact seeking)
+//
+// Bugfix/warning contributors:
+//    Terje Mathisen     Niklas Frykholm     Andy Hill
+//    Casey Muratori     John Bolton         Gargaj
+//    Laurent Gomila     Marc LeBlanc        Ronny Chevalier
+//    Bernhard Wodo      Evan Balster        alxprd@github
+//    Tom Beaumont       Ingo Leitgeb        Nicolas Guillemot
+//    Phillip Bennefall  Rohit               Thiago Goulart
+//    manxorist@github   saga musix
+//
+// Partial history:
+//    1.09    - 2016/04/04 - back out 'truncation of last frame' fix from previous version
+//    1.08    - 2016/04/02 - warnings; setup memory leaks; truncation of last frame
+//    1.07    - 2016/01/16 - fixes for crashes on invalid files; warning fixes; const
+//    1.06    - 2015/08/31 - full, correct support for seeking API (Dougall Johnson)
+//                           some crash fixes when out of memory or with corrupt files
+//                           fix some inappropriately signed shifts
+//    1.05    - 2015/04/19 - don't define __forceinline if it's redundant
+//    1.04    - 2014/08/27 - fix missing const-correct case in API
+//    1.03    - 2014/08/07 - warning fixes
+//    1.02    - 2014/07/09 - declare qsort comparison as explicitly _cdecl in Windows
+//    1.01    - 2014/06/18 - fix stb_vorbis_get_samples_float (interleaved was correct)
+//    1.0     - 2014/05/26 - fix memory leaks; fix warnings; fix bugs in >2-channel;
+//                           (API change) report sample rate for decode-full-file funcs
+//
+// See end of file for full version history.
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+//  HEADER BEGINS HERE
+//
+
+#ifndef STB_VORBIS_INCLUDE_STB_VORBIS_H
+#define STB_VORBIS_INCLUDE_STB_VORBIS_H
+
+#if defined(STB_VORBIS_NO_CRT) && !defined(STB_VORBIS_NO_STDIO)
+#define STB_VORBIS_NO_STDIO 1
+#endif
+
+#ifndef STB_VORBIS_NO_STDIO
+#include <stdio.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+///////////   THREAD SAFETY
+
+// Individual stb_vorbis* handles are not thread-safe; you cannot decode from
+// them from multiple threads at the same time. However, you can have multiple
+// stb_vorbis* handles and decode from them independently in multiple threads.
+
+
+///////////   MEMORY ALLOCATION
+
+// normally stb_vorbis uses malloc() to allocate memory at startup,
+// and alloca() to allocate temporary memory during a frame on the
+// stack. (Memory consumption will depend on the amount of setup
+// data in the file and how you set the compile flags for speed
+// vs. size. In my test files the maximal-size usage is ~150KB.)
+//
+// You can modify the wrapper functions in the source (setup_malloc,
+// setup_temp_malloc, temp_malloc) to change this behavior, or you
+// can use a simpler allocation model: you pass in a buffer from
+// which stb_vorbis will allocate _all_ its memory (including the
+// temp memory). "open" may fail with a VORBIS_outofmem if you
+// do not pass in enough data; there is no way to determine how
+// much you do need except to succeed (at which point you can
+// query get_info to find the exact amount required. yes I know
+// this is lame).
+//
+// If you pass in a non-NULL buffer of the type below, allocation
+// will occur from it as described above. Otherwise just pass NULL
+// to use malloc()/alloca()
+
+typedef struct  // optional caller-supplied memory arena; see MEMORY ALLOCATION notes above
+{
+   char *alloc_buffer;                 // buffer that stb_vorbis allocates _all_ its memory from when this struct is passed non-NULL
+   int   alloc_buffer_length_in_bytes; // size of alloc_buffer, in bytes
+} stb_vorbis_alloc;
+
+
+///////////   FUNCTIONS USEABLE WITH ALL INPUT MODES
+
+typedef struct stb_vorbis stb_vorbis;
+
+typedef struct  // snapshot returned by value from stb_vorbis_get_info()
+{
+   unsigned int sample_rate;                 // NOTE(review): presumably samples per second -- confirm in the decoder
+   int channels;                             // the frame APIs document *channels as always equal to this
+
+   unsigned int setup_memory_required;       // memory figures: per the allocation notes above, query these after a
+   unsigned int setup_temp_memory_required;  //   successful open to learn how large an alloc_buffer is needed
+   unsigned int temp_memory_required;
+
+   int max_frame_size;                       // NOTE(review): presumably the "maximum buffer size" the get_frame_short docs refer to -- confirm
+} stb_vorbis_info;
+
+// get general information about the file
+extern stb_vorbis_info stb_vorbis_get_info(stb_vorbis *f);
+
+// get the last error detected (clears it, too)
+extern int stb_vorbis_get_error(stb_vorbis *f);
+
+// close an ogg vorbis file and free all memory in use
+extern void stb_vorbis_close(stb_vorbis *f);
+
+// this function returns the offset (in samples) from the beginning of the
+// file that will be returned by the next decode, if it is known, or -1
+// otherwise. after a flush_pushdata() call, this may take a while before
+// it becomes valid again.
+// NOT WORKING YET after a seek with PULLDATA API
+extern int stb_vorbis_get_sample_offset(stb_vorbis *f);
+
+// returns the current seek point within the file, or offset from the beginning
+// of the memory buffer. In pushdata mode it returns 0.
+extern unsigned int stb_vorbis_get_file_offset(stb_vorbis *f);
+
+///////////   PUSHDATA API
+
+#ifndef STB_VORBIS_NO_PUSHDATA_API
+
+// this API allows you to get blocks of data from any source and hand
+// them to stb_vorbis. you have to buffer them; stb_vorbis will tell
+// you how much it used, and you have to give it the rest next time;
+// and stb_vorbis may not have enough data to work with and you will
+// need to give it the same data again PLUS more. Note that the Vorbis
+// specification does not bound the size of an individual frame.
+
+extern stb_vorbis *stb_vorbis_open_pushdata(
+         const unsigned char * datablock, int datablock_length_in_bytes,
+         int *datablock_memory_consumed_in_bytes,
+         int *error,
+         const stb_vorbis_alloc *alloc_buffer);
+// create a vorbis decoder by passing in the initial data block containing
+//    the ogg&vorbis headers (you don't need to parse them, just provide
+//    the first N bytes of the file--you're told if it's not enough, see below)
+// on success, returns an stb_vorbis *, does not set error, returns the amount of
+//    data parsed/consumed on this call in *datablock_memory_consumed_in_bytes;
+// on failure, returns NULL and sets *error, does not change *datablock_memory_consumed_in_bytes
+// if returns NULL and *error is VORBIS_need_more_data, then the input block was
+//       incomplete and you need to pass in a larger block from the start of the file
+
+extern int stb_vorbis_decode_frame_pushdata(
+         stb_vorbis *f,
+         const unsigned char *datablock, int datablock_length_in_bytes,
+         int *channels,             // place to write number of float * buffers
+         float ***output,           // place to write float ** array of float * buffers
+         int *samples               // place to write number of output samples
+     );
+// decode a frame of audio sample data if possible from the passed-in data block
+//
+// return value: number of bytes we used from datablock
+//
+// possible cases:
+//     0 bytes used, 0 samples output (need more data)
+//     N bytes used, 0 samples output (resynching the stream, keep going)
+//     N bytes used, M samples output (one frame of data)
+// note that after opening a file, you will ALWAYS get one N-bytes,0-sample
+// frame, because Vorbis always "discards" the first frame.
+//
+// Note that on resynch, stb_vorbis will rarely consume all of the buffer,
+// instead only datablock_length_in_bytes-3 or less. This is because it wants
+// to avoid missing parts of a page header if they cross a datablock boundary,
+// without writing state-machiney code to record a partial detection.
+//
+// The number of channels returned is stored in *channels (which can be
+// NULL--it is always the same as the number of channels reported by
+// get_info). *output will contain an array of float* buffers, one per
+// channel. In other words, (*output)[0][0] contains the first sample from
+// the first channel, and (*output)[1][0] contains the first sample from
+// the second channel.
+
+extern void stb_vorbis_flush_pushdata(stb_vorbis *f);
+// inform stb_vorbis that your next datablock will not be contiguous with
+// previous ones (e.g. you've seeked in the data); future attempts to decode
+// frames will cause stb_vorbis to resynchronize (as noted above), and
+// once it sees a valid Ogg page (typically 4-8KB, as large as 64KB), it
+// will begin decoding the _next_ frame.
+//
+// if you want to seek using pushdata, you need to seek in your file, then
+// call stb_vorbis_flush_pushdata(), then start calling decoding, then once
+// decoding is returning you data, call stb_vorbis_get_sample_offset, and
+// if you don't like the result, seek your file again and repeat.
+#endif
+
+
+//////////   PULLING INPUT API
+
+#ifndef STB_VORBIS_NO_PULLDATA_API
+// This API assumes stb_vorbis is allowed to pull data from a source--
+// either a block of memory containing the _entire_ vorbis stream, or a
+// FILE * that you or it create, or possibly some other reading mechanism
+// if you go modify the source to replace the FILE * case with some kind
+// of callback to your code. (But if you don't support seeking, you may
+// just want to go ahead and use pushdata.)
+
+#if !defined(STB_VORBIS_NO_STDIO) && !defined(STB_VORBIS_NO_INTEGER_CONVERSION)
+extern int stb_vorbis_decode_filename(const char *filename, int *channels, int *sample_rate, short **output);
+#endif
+#if !defined(STB_VORBIS_NO_INTEGER_CONVERSION)
+extern int stb_vorbis_decode_memory(const unsigned char *mem, int len, int *channels, int *sample_rate, short **output);
+#endif
+// decode an entire file and output the data interleaved into a malloc()ed
+// buffer stored in *output. The return value is the number of samples
+// decoded, or -1 if the file could not be opened or was not an ogg vorbis file.
+// When you're done with it, just free() the pointer returned in *output.
+
+extern stb_vorbis * stb_vorbis_open_memory(const unsigned char *data, int len,
+                                  int *error, const stb_vorbis_alloc *alloc_buffer);
+// create an ogg vorbis decoder from an ogg vorbis stream in memory (note
+// this must be the entire stream!). on failure, returns NULL and sets *error
+
+#ifndef STB_VORBIS_NO_STDIO
+extern stb_vorbis * stb_vorbis_open_filename(const char *filename,
+                                  int *error, const stb_vorbis_alloc *alloc_buffer);
+// create an ogg vorbis decoder from a filename via fopen(). on failure,
+// returns NULL and sets *error (possibly to VORBIS_file_open_failure).
+
+extern stb_vorbis * stb_vorbis_open_file(FILE *f, int close_handle_on_close,
+                                  int *error, const stb_vorbis_alloc *alloc_buffer);
+// create an ogg vorbis decoder from an open FILE *, looking for a stream at
+// the _current_ seek point (ftell). on failure, returns NULL and sets *error.
+// note that stb_vorbis must "own" this stream; if you seek it in between
+// calls to stb_vorbis, it will become confused. Moreover, if you attempt to
+// perform stb_vorbis_seek_*() operations on this file, it will assume it
+// owns the _entire_ rest of the file after the start point. Use the next
+// function, stb_vorbis_open_file_section(), to limit it.
+
+extern stb_vorbis * stb_vorbis_open_file_section(FILE *f, int close_handle_on_close,
+                int *error, const stb_vorbis_alloc *alloc_buffer, unsigned int len);
+// create an ogg vorbis decoder from an open FILE *, looking for a stream at
+// the _current_ seek point (ftell); the stream will be of length 'len' bytes.
+// on failure, returns NULL and sets *error. note that stb_vorbis must "own"
+// this stream; if you seek it in between calls to stb_vorbis, it will become
+// confused.
+#endif
+
+extern int stb_vorbis_seek_frame(stb_vorbis *f, unsigned int sample_number);
+extern int stb_vorbis_seek(stb_vorbis *f, unsigned int sample_number);
+// these functions seek in the Vorbis file to (approximately) 'sample_number'.
+// after calling seek_frame(), the next call to get_frame_*() will include
+// the specified sample. after calling stb_vorbis_seek(), the next call to
+// stb_vorbis_get_samples_* will start with the specified sample. If you
+// do not need to seek to EXACTLY the target sample when using get_samples_*,
+// you can also use seek_frame().
+
+extern void stb_vorbis_seek_start(stb_vorbis *f);
+// this function is equivalent to stb_vorbis_seek(f,0)
+
+extern unsigned int stb_vorbis_stream_length_in_samples(stb_vorbis *f);
+extern float        stb_vorbis_stream_length_in_seconds(stb_vorbis *f);
+// these functions return the total length of the vorbis stream
+
+extern int stb_vorbis_get_frame_float(stb_vorbis *f, int *channels, float ***output);
+// decode the next frame and return the number of samples. the number of
+// channels returned is stored in *channels (which can be NULL--it is always
+// the same as the number of channels reported by get_info). *output will
+// contain an array of float* buffers, one per channel. These outputs will
+// be overwritten on the next call to stb_vorbis_get_frame_*.
+//
+// You generally should not intermix calls to stb_vorbis_get_frame_*()
+// and stb_vorbis_get_samples_*(), since the latter calls the former.
+
+#ifndef STB_VORBIS_NO_INTEGER_CONVERSION
+extern int stb_vorbis_get_frame_short_interleaved(stb_vorbis *f, int num_c, short *buffer, int num_shorts);
+extern int stb_vorbis_get_frame_short            (stb_vorbis *f, int num_c, short **buffer, int num_samples);
+#endif
+// decode the next frame and return the number of *samples* per channel.
+// Note that for interleaved data, you pass in the number of shorts (the
+// size of your array), but the return value is the number of samples per
+// channel, not the total number of samples.
+//
+// The data is coerced to the number of channels you request according to the
+// channel coercion rules (see below). You must pass in the size of your
+// buffer(s) so that stb_vorbis will not overwrite the end of the buffer.
+// The maximum buffer size needed can be gotten from get_info(); however,
+// the Vorbis I specification implies an absolute maximum of 4096 samples
+// per channel.
+
+// Channel coercion rules:
+//    Let M be the number of channels requested, and N the number of channels present,
+//    and Cn be the nth channel; let stereo L be the sum of all L and center channels,
+//    and stereo R be the sum of all R and center channels (channel assignment from the
+//    vorbis spec).
+//        M    N       output
+//        1    k      sum(Ck) for all k
+//        2    *      stereo L, stereo R
+//        k    l      k > l, the first l channels, then 0s
+//        k    l      k <= l, the first k channels
+//    Note that this is not _good_ surround etc. mixing at all! It's just so
+//    you get something useful.
+
+extern int stb_vorbis_get_samples_float_interleaved(stb_vorbis *f, int channels, float *buffer, int num_floats);
+extern int stb_vorbis_get_samples_float(stb_vorbis *f, int channels, float **buffer, int num_samples);
+// gets num_samples samples, not necessarily on a frame boundary--this requires
+// buffering so you have to supply the buffers. DOES NOT APPLY THE COERCION RULES.
+// Returns the number of samples stored per channel; it may be less than requested
+// at the end of the file. If there are no more samples in the file, returns 0.
+
+#ifndef STB_VORBIS_NO_INTEGER_CONVERSION
+extern int stb_vorbis_get_samples_short_interleaved(stb_vorbis *f, int channels, short *buffer, int num_shorts);
+extern int stb_vorbis_get_samples_short(stb_vorbis *f, int channels, short **buffer, int num_samples);
+#endif
+// gets num_samples samples, not necessarily on a frame boundary--this requires
+// buffering so you have to supply the buffers. Applies the coercion rules above
+// to produce 'channels' channels. Returns the number of samples stored per channel;
+// it may be less than requested at the end of the file. If there are no more
+// samples in the file, returns 0.
+
+#endif
+
+////////   ERROR CODES
+
+enum STBVorbisError   // codes kept in stb_vorbis.error and reported through the *error out-parameters of the open functions
+{
+   VORBIS__no_error,                    // first enumerator, so its value is 0
+
+   VORBIS_need_more_data=1,             // not a real error
+
+   VORBIS_invalid_api_mixing,           // can't mix API modes
+   VORBIS_outofmem,                     // not enough memory
+   VORBIS_feature_not_supported,        // uses floor 0
+   VORBIS_too_many_channels,            // STB_VORBIS_MAX_CHANNELS is too small
+   VORBIS_file_open_failure,            // fopen() failed
+   VORBIS_seek_without_length,          // can't seek in unknown-length file
+
+   VORBIS_unexpected_eof=10,            // file is truncated?
+   VORBIS_seek_invalid,                 // seek past EOF
+
+   // decoding errors (corrupt/invalid stream) -- you probably
+   // don't care about the exact details of these
+
+   // vorbis errors:
+   VORBIS_invalid_setup=20,             // numbering restarts at 20 for the vorbis group
+   VORBIS_invalid_stream,
+
+   // ogg errors:
+   VORBIS_missing_capture_pattern=30,   // numbering restarts at 30 for the ogg group
+   VORBIS_invalid_stream_structure_version,
+   VORBIS_continued_packet_flag_invalid,
+   VORBIS_incorrect_stream_serial_number,
+   VORBIS_invalid_first_page,
+   VORBIS_bad_packet_type,
+   VORBIS_cant_find_last_page,
+   VORBIS_seek_failed
+};
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // STB_VORBIS_INCLUDE_STB_VORBIS_H
+//
+//  HEADER ENDS HERE
+//
+//////////////////////////////////////////////////////////////////////////////
+
+#ifndef STB_VORBIS_HEADER_ONLY
+
+// global configuration settings (e.g. set these in the project/makefile),
+// or just set them in this file at the top (although ideally the first few
+// should be visible when the header file is compiled too, although it's not
+// crucial)
+
+// STB_VORBIS_NO_PUSHDATA_API
+//     does not compile the code for the various stb_vorbis_*_pushdata()
+//     functions
+// #define STB_VORBIS_NO_PUSHDATA_API
+
+// STB_VORBIS_NO_PULLDATA_API
+//     does not compile the code for the non-pushdata APIs
+// #define STB_VORBIS_NO_PULLDATA_API
+
+// STB_VORBIS_NO_STDIO
+//     does not compile the code for the APIs that use FILE *s internally
+//     or externally (implied by STB_VORBIS_NO_PULLDATA_API)
+// #define STB_VORBIS_NO_STDIO
+
+// STB_VORBIS_NO_INTEGER_CONVERSION
+//     does not compile the code for converting audio sample data from
+//     float to integer (implied by STB_VORBIS_NO_PULLDATA_API)
+// #define STB_VORBIS_NO_INTEGER_CONVERSION
+
+// STB_VORBIS_NO_FAST_SCALED_FLOAT
+//      does not use a fast float-to-int trick to accelerate float-to-int on
+//      most platforms which requires endianness be defined correctly.
+//#define STB_VORBIS_NO_FAST_SCALED_FLOAT
+
+
+// STB_VORBIS_MAX_CHANNELS [number]
+//     globally define this to the maximum number of channels you need.
+//     The spec does not put a restriction on channels except that
+//     the count is stored in a byte, so 255 is the hard limit.
+//     Reducing this saves about 16 bytes per value, so using 16 saves
+//     (255-16)*16 or around 4KB. Plus any other memory usage
+//     I forgot to account for. Can probably go as low as 8 (7.1 audio),
+//     6 (5.1 audio), or 2 (stereo only).
+#ifndef STB_VORBIS_MAX_CHANNELS
+#define STB_VORBIS_MAX_CHANNELS    16  // enough for anyone?
+#endif
+
+// STB_VORBIS_PUSHDATA_CRC_COUNT [number]
+//     after a flush_pushdata(), stb_vorbis begins scanning for the
+//     next valid page, without backtracking. when it finds something
+//     that looks like a page, it streams through it and verifies its
+//     CRC32. Should that validation fail, it keeps scanning. But it's
+//     possible that _while_ streaming through to check the CRC32 of
+//     one candidate page, it sees another candidate page. This #define
+//     determines how many "overlapping" candidate pages it can search
+//     at once. Note that "real" pages are typically ~4KB to ~8KB, whereas
+//     garbage pages could be as big as 64KB, but probably average ~16KB.
+//     So don't hose ourselves by scanning an apparent 64KB page and
+//     missing a ton of real ones in the interim; so minimum of 2
+#ifndef STB_VORBIS_PUSHDATA_CRC_COUNT
+#define STB_VORBIS_PUSHDATA_CRC_COUNT  4
+#endif
+
+// STB_VORBIS_FAST_HUFFMAN_LENGTH [number]
+//     sets the log size of the huffman-acceleration table.  Maximum
+//     supported value is 24. with larger numbers, more decodings are O(1),
+//     but the table is larger so it misses the cache more often, so you'll have
+//     to probe (and try multiple ogg vorbis files) to find the sweet spot.
+#ifndef STB_VORBIS_FAST_HUFFMAN_LENGTH
+#define STB_VORBIS_FAST_HUFFMAN_LENGTH   10
+#endif
+
+// STB_VORBIS_FAST_BINARY_LENGTH [number]
+//     sets the log size of the binary-search acceleration table. this
+//     is used in similar fashion to the fast-huffman size to set initial
+//     parameters for the binary search
+
+// STB_VORBIS_FAST_HUFFMAN_INT
+//     The fast huffman tables are much more efficient if they can be
+//     stored as 16-bit results instead of 32-bit results. This restricts
+//     the codebooks to having only 65535 possible outcomes, though.
+//     (At least, accelerated by the huffman table.)
+#ifndef STB_VORBIS_FAST_HUFFMAN_INT
+#define STB_VORBIS_FAST_HUFFMAN_SHORT
+#endif
+
+// STB_VORBIS_NO_HUFFMAN_BINARY_SEARCH
+//     If the 'fast huffman' search doesn't succeed, then stb_vorbis falls
+//     back on binary searching for the correct one. This requires storing
+//     extra tables with the huffman codes in sorted order. Defining this
+//     symbol trades off space for speed by forcing a linear search in the
+//     non-fast case, except for "sparse" codebooks.
+// #define STB_VORBIS_NO_HUFFMAN_BINARY_SEARCH
+
+// STB_VORBIS_DIVIDES_IN_RESIDUE
+//     stb_vorbis precomputes the result of the scalar residue decoding
+//     that would otherwise require a divide per chunk. you can trade off
+//     space for time by defining this symbol.
+// #define STB_VORBIS_DIVIDES_IN_RESIDUE
+
+// STB_VORBIS_DIVIDES_IN_CODEBOOK
+//     vorbis VQ codebooks can be encoded two ways: with every case explicitly
+//     stored, or with all elements being chosen from a small range of values,
+//     and all values possible in all elements. By default, stb_vorbis expands
+//     this latter kind out to look like the former kind for ease of decoding,
+//     because otherwise an integer divide-per-vector-element is required to
+//     unpack the index. If you define STB_VORBIS_DIVIDES_IN_CODEBOOK, you can
+//     trade off storage for speed.
+//#define STB_VORBIS_DIVIDES_IN_CODEBOOK
+
+#ifdef STB_VORBIS_CODEBOOK_SHORTS
+#error "STB_VORBIS_CODEBOOK_SHORTS is no longer supported as it produced incorrect results for some input formats"
+#endif
+
+// STB_VORBIS_DIVIDE_TABLE
+//     this replaces small integer divides in the floor decode loop with
+//     table lookups. made less than 1% difference, so disabled by default.
+
+// STB_VORBIS_NO_INLINE_DECODE
+//     disables the inlining of the scalar codebook fast-huffman decode.
+//     might save a little codespace; useful for debugging
+// #define STB_VORBIS_NO_INLINE_DECODE
+
+// STB_VORBIS_NO_DEFER_FLOOR
+//     Normally we only decode the floor without synthesizing the actual
+//     full curve. We can instead synthesize the curve immediately. This
+//     requires more memory and is very likely slower, so I don't think
+//     you'd ever want to do it except for debugging.
+// #define STB_VORBIS_NO_DEFER_FLOOR
+
+
+
+
+//////////////////////////////////////////////////////////////////////////////
+
+#ifdef STB_VORBIS_NO_PULLDATA_API
+   #define STB_VORBIS_NO_INTEGER_CONVERSION
+   #define STB_VORBIS_NO_STDIO
+#endif
+
+#if defined(STB_VORBIS_NO_CRT) && !defined(STB_VORBIS_NO_STDIO)
+   #define STB_VORBIS_NO_STDIO 1
+#endif
+
+#ifndef STB_VORBIS_NO_INTEGER_CONVERSION
+#ifndef STB_VORBIS_NO_FAST_SCALED_FLOAT
+
+   // only need endianness for fast-float-to-int, which we don't
+   // use for pushdata
+
+   #ifndef STB_VORBIS_BIG_ENDIAN
+     #define STB_VORBIS_ENDIAN  0
+   #else
+     #define STB_VORBIS_ENDIAN  1
+   #endif
+
+#endif
+#endif
+
+
+#ifndef STB_VORBIS_NO_STDIO
+#include <stdio.h>
+#endif
+
+#ifndef STB_VORBIS_NO_CRT
+   #include <stdlib.h>
+   #include <string.h>
+   #include <assert.h>
+   #include <math.h>
+
+   // find definition of alloca if it's not in stdlib.h:
+   #ifdef _MSC_VER
+      #include <malloc.h>
+   #endif
+   #if defined(__linux__) || defined(__linux) || defined(__EMSCRIPTEN__)
+      #include <alloca.h>
+   #endif
+#else // STB_VORBIS_NO_CRT: no C runtime headers, so stub out the heap API; the allocators expand to 0
+   #define NULL 0
+   #define malloc(s)   0
+   #define free(s)     ((void) 0)
+   #define realloc(p,s)  0   // two parameters, matching the real realloc(ptr, size), so two-argument call sites still preprocess
+#endif // STB_VORBIS_NO_CRT
+
+#include <limits.h>
+
+#ifdef __MINGW32__
+   // eff you mingw:
+   //     "fixed":
+   //         http://sourceforge.net/p/mingw-w64/mailman/message/32882927/
+   //     "no that broke the build, reverted, who cares about C":
+   //         http://sourceforge.net/p/mingw-w64/mailman/message/32890381/
+   #ifdef __forceinline
+   #undef __forceinline
+   #endif
+   #define __forceinline
+#elif !defined(_MSC_VER)
+   #if __GNUC__
+      #define __forceinline inline
+   #else
+      #define __forceinline
+   #endif
+#endif
+
+#if STB_VORBIS_MAX_CHANNELS > 256
+#error "Value of STB_VORBIS_MAX_CHANNELS outside of allowed range"
+#endif
+
+#if STB_VORBIS_FAST_HUFFMAN_LENGTH > 24
+#error "Value of STB_VORBIS_FAST_HUFFMAN_LENGTH outside of allowed range"
+#endif
+
+
+#if 0
+#include <crtdbg.h>
+#define CHECK(f)   _CrtIsValidHeapPointer(f->channel_buffers[1])
+#else
+#define CHECK(f)   ((void) 0)
+#endif
+
+#define MAX_BLOCKSIZE_LOG  13   // from specification
+#define MAX_BLOCKSIZE      (1 << MAX_BLOCKSIZE_LOG)
+
+
+typedef unsigned char  uint8;
+typedef   signed char   int8;
+typedef unsigned short uint16;
+typedef   signed short  int16;
+typedef unsigned int   uint32;
+typedef   signed int    int32;
+
+#ifndef TRUE
+#define TRUE 1
+#define FALSE 0
+#endif
+
+typedef float codetype;
+
+// @NOTE
+//
+// Some arrays below are tagged "//varies", which means it's actually
+// a variable-sized piece of data, but rather than malloc I assume it's
+// small enough it's better to just allocate it all together with the
+// main thing
+//
+// Most of the variables are specified with the smallest size I could pack
+// them into. It might give better performance to make them all full-sized
+// integers. It should be safe to freely rearrange the structures or change
+// the sizes larger--nothing relies on silently truncating etc., nor the
+// order of variables.
+
+#define FAST_HUFFMAN_TABLE_SIZE   (1 << STB_VORBIS_FAST_HUFFMAN_LENGTH)
+#define FAST_HUFFMAN_TABLE_MASK   (FAST_HUFFMAN_TABLE_SIZE - 1)
+
+typedef struct   // per-codebook data; stb_vorbis.codebooks points at these
+{
+   int dimensions, entries;
+   uint8 *codeword_lengths;
+   float  minimum_value;
+   float  delta_value;
+   uint8  value_bits;
+   uint8  lookup_type;
+   uint8  sequence_p;
+   uint8  sparse;
+   uint32 lookup_values;
+   codetype *multiplicands;   // codetype is typedef'd to float above
+   uint32 *codewords;
+   #ifdef STB_VORBIS_FAST_HUFFMAN_SHORT
+    int16  fast_huffman[FAST_HUFFMAN_TABLE_SIZE];   // 16-bit entries: the default, since _SHORT is defined unless STB_VORBIS_FAST_HUFFMAN_INT is
+   #else
+    int32  fast_huffman[FAST_HUFFMAN_TABLE_SIZE];   // 32-bit entries; either way the table has 1 << STB_VORBIS_FAST_HUFFMAN_LENGTH slots
+   #endif
+   uint32 *sorted_codewords;   // NOTE(review): presumably the sorted tables used by the binary-search fallback
+   int    *sorted_values;      //   described under STB_VORBIS_NO_HUFFMAN_BINARY_SEARCH -- confirm
+   int     sorted_entries;
+} Codebook;
+
+typedef struct   // the floor0 member of the Floor union; note the file header lists floor 0 as not supported
+{
+   uint8 order;
+   uint16 rate;
+   uint16 bark_map_size;
+   uint8 amplitude_bits;
+   uint8 amplitude_offset;
+   uint8 number_of_books;
+   uint8 book_list[16]; // varies
+} Floor0;
+
+typedef struct   // the floor1 member of the Floor union; "varies" arrays are fixed-size stand-ins (see @NOTE above)
+{
+   uint8 partitions;
+   uint8 partition_class_list[32]; // varies
+   uint8 class_dimensions[16]; // varies
+   uint8 class_subclasses[16]; // varies
+   uint8 class_masterbooks[16]; // varies
+   int16 subclass_books[16][8]; // varies
+   uint16 Xlist[31*8+2]; // varies
+   uint8 sorted_order[31*8+2];   // same capacity as Xlist
+   uint8 neighbors[31*8+2][2];   // same capacity as Xlist, two entries per slot
+   uint8 floor1_multiplier;
+   uint8 rangebits;
+   int values;
+} Floor1;
+
+typedef union   // storage for one floor configuration of either kind; stb_vorbis.floor_config points at these
+{
+   Floor0 floor0;
+   Floor1 floor1;   // NOTE(review): presumably stb_vorbis.floor_types[] says which member is live -- confirm
+} Floor;
+
+typedef struct   // per-residue settings; stb_vorbis.residue_config points at these
+{
+   uint32 begin, end;
+   uint32 part_size;
+   uint8 classifications;
+   uint8 classbook;
+   uint8 **classdata;
+   int16 (*residue_books)[8];   // pointer to rows of 8 int16 each
+} Residue;
+
+typedef struct   // element type of Mapping.chan
+{
+   uint8 magnitude;
+   uint8 angle;
+   uint8 mux;
+} MappingChannel;
+
+typedef struct   // per-mapping settings; stb_vorbis.mapping points at these
+{
+   uint16 coupling_steps;
+   MappingChannel *chan;   // separately allocated, unlike the inline "varies" arrays below
+   uint8  submaps;
+   uint8  submap_floor[15]; // varies
+   uint8  submap_residue[15]; // varies
+} Mapping;
+
+typedef struct   // element type of stb_vorbis.mode_config[]
+{
+   uint8 blockflag;
+   uint8 mapping;   // NOTE(review): presumably an index into stb_vorbis.mapping -- confirm
+   uint16 windowtype;
+   uint16 transformtype;
+} Mode;
+
+typedef struct   // one candidate page being CRC-checked in push mode; element type of stb_vorbis.scan[]
+{
+   uint32  goal_crc;    // expected crc if match
+   int     bytes_left;  // bytes left in packet
+   uint32  crc_so_far;  // running crc
+   int     bytes_done;  // bytes processed in _current_ chunk
+   uint32  sample_loc;  // granule pos encoded in page
+} CRCscan;
+
+typedef struct   // type of stb_vorbis.p_first and stb_vorbis.p_last
+{
+   uint32 page_start, page_end;
+   uint32 last_decoded_sample;
+} ProbedPage;
+
+struct stb_vorbis   // decoder state behind the opaque public stb_vorbis handle; aliased as `vorb` below
+{
+  // user-accessible info
+   unsigned int sample_rate;
+   int channels;
+
+   unsigned int setup_memory_required;        // running total: setup_malloc() adds each request rounded up to a multiple of 4
+   unsigned int temp_memory_required;
+   unsigned int setup_temp_memory_required;
+
+  // input config
+#ifndef STB_VORBIS_NO_STDIO
+   FILE *f;
+   uint32 f_start;
+   int close_on_free;
+#endif
+
+   uint8 *stream;
+   uint8 *stream_start;
+   uint8 *stream_end;
+
+   uint32 stream_len;
+
+   uint8  push_mode;                          // what IS_PUSH_MODE(f) reads when both APIs are compiled in
+
+   uint32 first_audio_page_offset;
+
+   ProbedPage p_first, p_last;
+
+  // memory management
+   stb_vorbis_alloc alloc;                    // a non-NULL alloc.alloc_buffer routes temp_alloc()/setup_malloc() to that buffer
+   int setup_offset;                          // setup_malloc() hands out alloc.alloc_buffer + setup_offset
+   int temp_offset;                           // read by temp_alloc_save(), written by temp_alloc_restore()
+
+  // run-time results
+   int eof;
+   enum STBVorbisError error;                 // written by error()
+
+  // user-useful data
+
+  // header info
+   int blocksize[2];
+   int blocksize_0, blocksize_1;
+   int codebook_count;
+   Codebook *codebooks;
+   int floor_count;
+   uint16 floor_types[64]; // varies
+   Floor *floor_config;
+   int residue_count;
+   uint16 residue_types[64]; // varies
+   Residue *residue_config;
+   int mapping_count;
+   Mapping *mapping;
+   int mode_count;
+   Mode mode_config[64];  // varies
+
+   uint32 total_samples;
+
+  // decode buffer
+   float *channel_buffers[STB_VORBIS_MAX_CHANNELS];
+   float *outputs        [STB_VORBIS_MAX_CHANNELS];
+
+   float *previous_window[STB_VORBIS_MAX_CHANNELS];
+   int previous_length;
+
+   #ifndef STB_VORBIS_NO_DEFER_FLOOR
+   int16 *finalY[STB_VORBIS_MAX_CHANNELS];          // default build: floor decode is deferred (see STB_VORBIS_NO_DEFER_FLOOR)
+   #else
+   float *floor_buffers[STB_VORBIS_MAX_CHANNELS];   // STB_VORBIS_NO_DEFER_FLOOR build only
+   #endif
+
+   uint32 current_loc; // sample location of next frame to decode
+   int    current_loc_valid;
+
+  // per-blocksize precomputed data
+   
+   // twiddle factors
+   float *A[2],*B[2],*C[2];
+   float *window[2];
+   uint16 *bit_reverse[2];
+
+  // current page/packet/segment streaming info
+   uint32 serial; // stream serial number for verification
+   int last_page;
+   int segment_count;
+   uint8 segments[255];
+   uint8 page_flag;
+   uint8 bytes_in_seg;
+   uint8 first_decode;
+   int next_seg;
+   int last_seg;  // flag that we're on the last segment
+   int last_seg_which; // what was the segment number of the last seg?
+   uint32 acc;
+   int valid_bits;
+   int packet_bytes;
+   int end_seg_with_known_loc;
+   uint32 known_loc_for_packet;
+   int discard_samples_deferred;
+   uint32 samples_output;
+
+  // push mode scanning
+   int page_crc_tests; // only in push_mode: number of tests active; -1 if not searching
+#ifndef STB_VORBIS_NO_PUSHDATA_API
+   CRCscan scan[STB_VORBIS_PUSHDATA_CRC_COUNT];   // one slot per overlapping candidate page (see STB_VORBIS_PUSHDATA_CRC_COUNT)
+#endif
+
+  // sample-access
+   int channel_buffer_start;
+   int channel_buffer_end;
+};
+
+#if defined(STB_VORBIS_NO_PUSHDATA_API)
+   #define IS_PUSH_MODE(f)   FALSE // push API compiled out: never push mode
+#elif defined(STB_VORBIS_NO_PULLDATA_API)
+   #define IS_PUSH_MODE(f)   TRUE // pull API compiled out: always push mode
+#else
+   #define IS_PUSH_MODE(f)   ((f)->push_mode) // both APIs present: consult the per-decoder flag
+#endif
+
+typedef struct stb_vorbis vorb; // short alias used by the internal functions below
+
+static int error(vorb *f, enum STBVorbisError e)
+{
+   f->error = e;
+   if (!f->eof && e != VORBIS_need_more_data) {
+      f->error=e; // breakpoint for debugging
+   }
+   return 0;
+}
+
+
+// these macros are used for allocating temporary memory
+// while decoding. if you can afford the stack space, use
+// alloca(); otherwise, provide a temp buffer and they will
+// allocate out of that buffer.
+
+#define array_size_required(count,size)  ((count)*(sizeof(void *)+(size))) // bytes for 'count' pointers plus 'count' blocks of 'size' bytes
+
+#define temp_alloc(f,size)              (f->alloc.alloc_buffer ? setup_temp_malloc(f,size) : alloca(size))
+#ifdef dealloca
+#define temp_free(f,p)                  (f->alloc.alloc_buffer ? 0 : dealloca(p)) // release the pointer that was passed in (was 'size', which is not a parameter of this macro)
+#else
+#define temp_free(f,p)                  0
+#endif
+#define temp_alloc_save(f)              ((f)->temp_offset) // remember the current temp offset
+#define temp_alloc_restore(f,p)         ((f)->temp_offset = (p)) // roll the temp offset back to a saved value
+
+#define temp_block_array(f,count,size)  make_block_array(temp_alloc(f,array_size_required(count,size)), count, size)
+
+// carve 'mem' into a table of 'count' pointers followed by 'count' subblocks of 'size' bytes each
+static void *make_block_array(void *mem, int count, int size)
+{
+   void **table = (void **) mem;
+   char *cursor = (char *) (table + count); // subblock storage begins right after the pointer table
+   int idx = 0;
+   while (idx < count) {
+      table[idx++] = cursor;
+      cursor += size;
+   }
+   return table;
+}
+
+static void *setup_malloc(vorb *f, int sz) // allocate sz bytes of setup memory; returns NULL on failure
+{
+   sz = (sz+3) & ~3; // round the request up to a multiple of 4
+   f->setup_memory_required += sz; // running total is updated whether or not the allocation succeeds
+   if (f->alloc.alloc_buffer) {
+      void *p = (char *) f->alloc.alloc_buffer + f->setup_offset;
+      if (f->setup_offset + sz > f->temp_offset) return NULL; // would run past temp_offset, the other end of the user buffer
+      f->setup_offset += sz;
+      return p;
+   }
+   return sz ? malloc(sz) : NULL; // no user buffer: use the heap; a zero-size request yields NULL
+}
+
+static void setup_free(vorb *f, void *p) // release memory obtained from setup_malloc
+{
+   // with a user-supplied buffer the setup memory is a stack, so there is nothing to release
+   if (!f->alloc.alloc_buffer) free(p);
+}
+
+static void *setup_temp_malloc(vorb *f, int sz) // allocate sz bytes of temporary memory; NULL if the user buffer has no room
+{
+   sz = (sz+3) & ~3; // round up to a multiple of 4
+   if (f->alloc.alloc_buffer) {
+      if (f->temp_offset - sz < f->setup_offset) return NULL; // would collide with the setup allocations below setup_offset
+      f->temp_offset -= sz; // temp allocations are taken downward from temp_offset
+      return (char *) f->alloc.alloc_buffer + f->temp_offset;
+   }
+   return malloc(sz); // no user buffer: fall back to the heap
+}
+
+static void setup_temp_free(vorb *f, void *p, int sz) // undo a setup_temp_malloc of sz bytes
+{
+   if (!f->alloc.alloc_buffer) {
+      free(p); // heap-backed allocation
+      return;
+   }
+   f->temp_offset += (sz+3)&~3; // user buffer: hand back the 4-byte-rounded size
+}
+
+#define CRC32_POLY    0x04c11db7   // from spec
+
+static uint32 crc_table[256]; // filled in by crc32_init(); read by crc32_update()
+static void crc32_init(void) // build the 256-entry CRC lookup table
+{
+   int i,j;
+   uint32 s;
+   for(i=0; i < 256; i++) {
+      for (s=(uint32) i << 24, j=0; j < 8; ++j) // start with byte i in the top 8 bits, then run 8 shift steps
+         s = (s << 1) ^ (s >= (1U<<31) ? CRC32_POLY : 0); // shift left; xor in the polynomial when the top bit was set
+      crc_table[i] = s;
+   }
+}
+
+static __forceinline uint32 crc32_update(uint32 crc, uint8 byte) // fold one input byte into a running CRC
+{
+   return crc_table[(crc >> 24) ^ byte] ^ (crc << 8); // table entry selected by (top CRC byte xor input byte)
+}
+
+
+// reverses the order of the 32 bits of n; used in setup, and for huffman that doesn't go fast path
+static unsigned int bit_reverse(unsigned int n)
+{
+  n = ((n >>  1) & 0x55555555) | ((n <<  1) & 0xAAAAAAAA); // swap adjacent bits
+  n = ((n >>  2) & 0x33333333) | ((n <<  2) & 0xCCCCCCCC); // swap bit pairs
+  n = ((n >>  4) & 0x0F0F0F0F) | ((n <<  4) & 0xF0F0F0F0); // swap nibbles
+  n = ((n >>  8) & 0x00FF00FF) | ((n <<  8) & 0xFF00FF00); // swap bytes
+  return (n << 16) | (n >> 16); // swap 16-bit halves
+}
+
+static float square(float x) // x multiplied by itself
+{
+   float product = x * x; return product;
+}
+
+// this is a weird definition of log2() for which log2(1) = 1, log2(2) = 2, log2(4) = 3
+// as required by the specification. fast(?) implementation from stb.h
+// @OPTIMIZE: called multiple times per-packet with "constants"; move to setup
+static int ilog(int32 n)
+{
+   static signed char log2_4[16] = { 0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4 };
+   if (n < 0) return 0; // negative n returns 0; tested first so it can never reach log2_4[] with a negative index
+   // after the guard above: 2 more compares if n < 16, 3 more otherwise
+   if (n < (1 << 14))
+        if (n < (1 <<  4))        return     0 + log2_4[n      ];
+        else if (n < (1 <<  9))      return  5 + log2_4[n >>  5];
+             else                     return 10 + log2_4[n >> 10];
+   else if (n < (1 << 24))
+             if (n < (1 << 19))      return 15 + log2_4[n >> 15];
+             else                     return 20 + log2_4[n >> 20];
+        else if (n < (1 << 29))      return 25 + log2_4[n >> 25];
+             else                     return 30 + log2_4[n >> 30];
+   // no "n < (1 << 31)" test: a non-negative int32 is always below it, and 1 << 31 itself overflows int
+}
+
+#ifndef M_PI
+  #define M_PI  3.14159265358979323846264f  // from CRC; only used when M_PI is not already defined (note the float suffix)
+#endif
+
+// code length assigned to a value with no huffman encoding
+#define NO_CODE   255
+
+/////////////////////// LEAF SETUP FUNCTIONS //////////////////////////
+//
+// these functions are only called at setup, and only a few times
+// per file
+
+static float float32_unpack(uint32 x) // decode the packed float layout: bit 31 sign, bits 21..30 exponent, bits 0..20 mantissa
+{
+   // from the specification
+   uint32 exp = (x >> 21) & 0x3ff;
+   uint32 mantissa = x & 0x1fffff;
+   double res = (double)mantissa;
+   if (x & 0x80000000) res = -res;
+   return (float) ldexp((float)res, exp-788);
+}
+
+
+// zlib & jpeg huffman tables can assume that the output symbols
+// are either arbitrarily arranged or have monotonically
+// increasing frequencies -- they rely on the lengths being sorted,
+// which makes for a very simple generation algorithm.
+// vorbis allows a huffman table with non-sorted lengths, which
+// requires a more sophisticated construction, since symbols in
+// order do not map to huffman codes "in order".
+static void add_entry(Codebook *c, uint32 huff_code, int symbol, int count, int len, uint32 *values)
+{
+   if (c->sparse) {
+      c->codewords       [count] = huff_code; // sparse: slots are packed in insertion order
+      c->codeword_lengths[count] = len;
+      values             [count] = symbol;
+   } else {
+      c->codewords      [symbol] = huff_code; // dense: slot index is the symbol itself
+   }
+}
+
+static int compute_codewords(Codebook *c, uint8 *len, int n, uint32 *values) // assign a huffman code to every used entry; FALSE when no free leaf exists for some entry
+{
+   int i,k,m=0; // m counts entries added so far (passed to add_entry as the sparse slot index)
+   uint32 available[32]; // available[d]: code (left-aligned in 32 bits) of the free leaf at depth d, 0 if none
+
+   memset(available, 0, sizeof(available));
+   // find the first entry
+   for (k=0; k < n; ++k) if (len[k] < NO_CODE) break;
+   if (k == n) { assert(c->sorted_entries == 0); return TRUE; } // no used entries at all
+   // add to the list
+   add_entry(c, 0, k, m++, len[k], values);
+   // add all available leaves
+   for (i=1; i <= len[k]; ++i) // NOTE(review): available[] has 32 slots, so this relies on len[k] < 32 -- presumably enforced by the caller; verify
+      available[i] = 1U << (32-i);
+   // note that the above code treats the first case specially,
+   // but it's really the same as the following code, so they
+   // could probably be combined (except the initial code is 0,
+   // and I use 0 in available[] to mean 'empty')
+   for (i=k+1; i < n; ++i) {
+      uint32 res;
+      int z = len[i], y;
+      if (z == NO_CODE) continue;
+      // find lowest available leaf (should always be earliest,
+      // which is what the specification calls for)
+      // note that this property, and the fact we can never have
+      // more than one free leaf at a given level, isn't totally
+      // trivial to prove, but it seems true and the assert never
+      // fires, so!
+      while (z > 0 && !available[z]) --z;
+      if (z == 0) { return FALSE; } // nothing free at or above the requested depth
+      res = available[z];
+      assert(z >= 0 && z < 32);
+      available[z] = 0;
+      add_entry(c, bit_reverse(res), i, m++, len[i], values); // the stored code is the bit-reversed form of res
+      // propagate availability up the tree
+      if (z != len[i]) {
+         assert(len[i] >= 0 && len[i] < 32);
+         for (y=len[i]; y > z; --y) {
+            assert(available[y] == 0);
+            available[y] = res + (1 << (32-y));
+         }
+      }
+   }
+   return TRUE;
+}
+
+// accelerated huffman table allows fast O(1) match of all symbols
+// of length <= STB_VORBIS_FAST_HUFFMAN_LENGTH
+static void compute_accelerated_huffman(Codebook *c)
+{
+   int i, len;
+   for (i=0; i < FAST_HUFFMAN_TABLE_SIZE; ++i)
+      c->fast_huffman[i] = -1; // -1: no short code maps to this index
+
+   len = c->sparse ? c->sorted_entries : c->entries;
+   #ifdef STB_VORBIS_FAST_HUFFMAN_SHORT
+   if (len > 32767) len = 32767; // largest possible value we can encode!
+   #endif
+   for (i=0; i < len; ++i) {
+      if (c->codeword_lengths[i] <= STB_VORBIS_FAST_HUFFMAN_LENGTH) {
+         uint32 z = c->sparse ? bit_reverse(c->sorted_codewords[i]) : c->codewords[i]; // sorted_codewords are stored bit-reversed, so undo that
+         // set table entries for all bit combinations in the higher bits
+         while (z < FAST_HUFFMAN_TABLE_SIZE) {
+             c->fast_huffman[z] = i;
+             z += 1 << c->codeword_lengths[i]; // next index that shares the same low 'length' bits
+         }
+      }
+   }
+}
+
+#ifdef _MSC_VER
+#define STBV_CDECL __cdecl
+#else
+#define STBV_CDECL
+#endif
+
+static int STBV_CDECL uint32_compare(const void *p, const void *q) // qsort comparator: ascending uint32
+{
+   uint32 lhs = * (uint32 *) p, rhs = * (uint32 *) q;
+   if (lhs < rhs) return -1;
+   return lhs > rhs; // 1 when greater, 0 when equal
+}
+
+static int include_in_sort(Codebook *c, uint8 len) // should an entry with this code length go into the sorted table?
+{
+   if (!c->sparse) // dense: only used entries too long for the fast table
+      return (len != NO_CODE && len > STB_VORBIS_FAST_HUFFMAN_LENGTH) ? TRUE : FALSE;
+   assert(len != NO_CODE);
+   return TRUE; // sparse: every entry is included
+}
+
+// if the fast table above doesn't work, we want to binary
+// search them... need to reverse the bits
+static void compute_sorted_huffman(Codebook *c, uint8 *lengths, uint32 *values)
+{
+   int i, len;
+   // build a list of all the entries
+   // OPTIMIZATION: don't include the short ones, since they'll be caught by FAST_HUFFMAN.
+   // this is kind of a frivolous optimization--I don't see any performance improvement,
+   // but it's like 4 extra lines of code, so.
+   if (!c->sparse) {
+      int k = 0;
+      for (i=0; i < c->entries; ++i)
+         if (include_in_sort(c, lengths[i])) 
+            c->sorted_codewords[k++] = bit_reverse(c->codewords[i]);
+      assert(k == c->sorted_entries);
+   } else {
+      for (i=0; i < c->sorted_entries; ++i)
+         c->sorted_codewords[i] = bit_reverse(c->codewords[i]);
+   }
+
+   qsort(c->sorted_codewords, c->sorted_entries, sizeof(c->sorted_codewords[0]), uint32_compare);
+   c->sorted_codewords[c->sorted_entries] = 0xffffffff; // sentinel one past the last sorted entry (the array needs sorted_entries+1 slots)
+
+   len = c->sparse ? c->sorted_entries : c->entries;
+   // now we need to indicate how they correspond; we could either
+   //   #1: sort a different data structure that says who they correspond to
+   //   #2: for each sorted entry, search the original list to find who corresponds
+   //   #3: for each original entry, find the sorted entry
+   // #1 requires extra storage, #2 is slow, #3 can use binary search!
+   for (i=0; i < len; ++i) {
+      int huff_len = c->sparse ? lengths[values[i]] : lengths[i];
+      if (include_in_sort(c,huff_len)) {
+         uint32 code = bit_reverse(c->codewords[i]);
+         int x=0, n=c->sorted_entries; // search window is [x, x+n)
+         while (n > 1) {
+            // invariant: sc[x] <= code < sc[x+n]
+            int m = x + (n >> 1);
+            if (c->sorted_codewords[m] <= code) {
+               x = m;
+               n -= (n>>1);
+            } else {
+               n >>= 1;
+            }
+         }
+         assert(c->sorted_codewords[x] == code);
+         if (c->sparse) {
+            c->sorted_values[x] = values[i];
+            c->codeword_lengths[x] = huff_len; // sparse: lengths end up indexed by sorted position
+         } else {
+            c->sorted_values[x] = i;
+         }
+      }
+   }
+}
+
+// only run while parsing the header (3 times); TRUE when data begins with the 6 bytes "vorbis"
+static int vorbis_validate(uint8 *data)
+{
+   static uint8 magic[6] = { 'v', 'o', 'r', 'b', 'i', 's' };
+   return !memcmp(magic, data, sizeof(magic));
+}
+
+// called from setup only, once per code book
+// (formula implied by specification)
+static int lookup1_values(int entries, int dim) // returns r with r^dim <= entries < (r+1)^dim, as checked by the asserts below
+{
+   int r = (int) floor(exp((float) log((float) entries) / dim)); // first estimate: floor(entries^(1/dim)) via exp/log
+   if ((int) floor(pow((float) r+1, dim)) <= entries)   // (int) cast for MinGW warning;
+      ++r;                                              // floor() to avoid _ftol() when non-CRT
+   assert(pow((float) r+1, dim) > entries);
+   assert((int) floor(pow((float) r, dim)) <= entries); // (int),floor() as above
+   return r;
+}
+
+// called twice per file
+static void compute_twiddle_factors(int n, float *A, float *B, float *C) // fills A and B with n/2 floats each and C with n/4 floats
+{
+   int n4 = n >> 2, n8 = n >> 3;
+   int k,k2; // k2 is always 2*k: each iteration writes one (cos, sin) pair
+
+   for (k=k2=0; k < n4; ++k,k2+=2) {
+      A[k2  ] = (float)  cos(4*k*M_PI/n);
+      A[k2+1] = (float) -sin(4*k*M_PI/n);
+      B[k2  ] = (float)  cos((k2+1)*M_PI/n/2) * 0.5f;
+      B[k2+1] = (float)  sin((k2+1)*M_PI/n/2) * 0.5f;
+   }
+   for (k=k2=0; k < n8; ++k,k2+=2) {
+      C[k2  ] = (float)  cos(2*(k2+1)*M_PI/n);
+      C[k2+1] = (float) -sin(2*(k2+1)*M_PI/n);
+   }
+}
+
+static void compute_window(int n, float *window) // fills window[0 .. n/2-1]
+{
+   int n2 = n >> 1, i;
+   for (i=0; i < n2; ++i)
+      window[i] = (float) sin(0.5 * M_PI * square((float) sin((i - 0 + 0.5) / n2 * 0.5 * M_PI))); // sin(pi/2 * sin^2((i+0.5)/n2 * pi/2))
+}
+
+static void compute_bitreverse(int n, uint16 *rev) // fills rev[0 .. n/8-1]
+{
+   int ld = ilog(n) - 1; // ilog is off-by-one from normal definitions
+   int i, n8 = n >> 3;
+   for (i=0; i < n8; ++i)
+      rev[i] = (bit_reverse(i) >> (32-ld+3)) << 2; // keep the top ld-3 bits of the 32-bit reversal, then multiply by 4
+}
+
+static int init_blocksize(vorb *f, int b, int n) // allocate and fill the per-blocksize tables in slot b for block size n; TRUE on success, 0 (via error) on out-of-memory
+{
+   int n2 = n >> 1, n4 = n >> 2, n8 = n >> 3;
+   f->A[b] = (float *) setup_malloc(f, sizeof(float) * n2);
+   f->B[b] = (float *) setup_malloc(f, sizeof(float) * n2);
+   f->C[b] = (float *) setup_malloc(f, sizeof(float) * n4);
+   if (!f->A[b] || !f->B[b] || !f->C[b]) return error(f, VORBIS_outofmem); // nothing is freed here on failure
+   compute_twiddle_factors(n, f->A[b], f->B[b], f->C[b]);
+   f->window[b] = (float *) setup_malloc(f, sizeof(float) * n2);
+   if (!f->window[b]) return error(f, VORBIS_outofmem);
+   compute_window(n, f->window[b]);
+   f->bit_reverse[b] = (uint16 *) setup_malloc(f, sizeof(uint16) * n8);
+   if (!f->bit_reverse[b]) return error(f, VORBIS_outofmem);
+   compute_bitreverse(n, f->bit_reverse[b]);
+   return TRUE;
+}
+
+static void neighbors(uint16 *x, int n, int *plow, int *phigh) // among x[0..n-1], find the indices of the nearest value below and the nearest value above x[n]
+{
+   int low = -1; // smaller than any uint16 value
+   int high = 65536; // larger than any uint16 value
+   int i;
+   for (i=0; i < n; ++i) { // *plow / *phigh are left untouched when no element qualifies
+      if (x[i] > low  && x[i] < x[n]) { *plow  = i; low = x[i]; }
+      if (x[i] < high && x[i] > x[n]) { *phigh = i; high = x[i]; }
+   }
+}
+
+// 'y' has been repurposed: it now holds the original index rather than a y value
+typedef struct
+{
+   uint16 x,y;
+} Point;
+
+static int STBV_CDECL point_compare(const void *p, const void *q) // qsort comparator: ascending by x only
+{
+   uint16 lhs = ((Point *) p)->x;
+   uint16 rhs = ((Point *) q)->x;
+   return (lhs > rhs) - (lhs < rhs); // -1, 0 or 1
+}
+
+//
+/////////////////////// END LEAF SETUP FUNCTIONS //////////////////////////
+
+
+#if defined(STB_VORBIS_NO_STDIO)
+   #define USE_MEMORY(z)    TRUE // stdio compiled out: input is always the memory stream
+#else
+   #define USE_MEMORY(z)    ((z)->stream) // a non-NULL stream pointer selects the memory path; otherwise the FILE is used
+#endif
+
+static uint8 get8(vorb *z) // read one input byte; at end of data sets z->eof and returns 0
+{
+   if (USE_MEMORY(z)) {
+      if (z->stream >= z->stream_end) { z->eof = TRUE; return 0; }
+      return *z->stream++;
+   }
+
+   #ifndef STB_VORBIS_NO_STDIO
+   {
+   int c = fgetc(z->f); // file-backed input
+   if (c == EOF) { z->eof = TRUE; return 0; }
+   return c;
+   }
+   #endif
+}
+
+static uint32 get32(vorb *f) // read four bytes and assemble them least-significant byte first
+{
+   // one get8() per statement keeps the read order well defined
+   uint32 b0 = get8(f);
+   uint32 b1 = get8(f);
+   uint32 b2 = get8(f);
+   uint32 b3 = get8(f);
+   return b0 + (b1 << 8) + (b2 << 16) + (b3 << 24);
+}
+
+static int getn(vorb *z, uint8 *data, int n) // read exactly n bytes into data; returns 1 on success, 0 (and sets z->eof) if fewer are available
+{
+   if (USE_MEMORY(z)) {
+      if (z->stream+n > z->stream_end) { z->eof = 1; return 0; } // nothing is copied on a short read
+      memcpy(data, z->stream, n);
+      z->stream += n;
+      return 1;
+   }
+
+   #ifndef STB_VORBIS_NO_STDIO   
+   if (fread(data, n, 1, z->f) == 1) // one item of n bytes: all or nothing
+      return 1;
+   else {
+      z->eof = 1;
+      return 0;
+   }
+   #endif
+}
+
+static void skip(vorb *z, int n) // advance the input position by n bytes
+{
+   if (USE_MEMORY(z)) {
+      z->stream += n; // not clamped: the pointer may move past stream_end
+      if (z->stream >= z->stream_end) z->eof = 1;
+      return;
+   }
+   #ifndef STB_VORBIS_NO_STDIO
+   {
+      long x = ftell(z->f);
+      fseek(z->f, x+n, SEEK_SET); // the results of ftell/fseek are not checked
+   }
+   #endif
+}
+
+static int set_file_offset(stb_vorbis *f, unsigned int loc) // seek to byte offset loc from the start of the stream; returns 1 on success, 0 otherwise
+{
+   #ifndef STB_VORBIS_NO_PUSHDATA_API
+   if (f->push_mode) return 0; // seeking is refused in push mode
+   #endif
+   f->eof = 0;
+   if (USE_MEMORY(f)) {
+      if (f->stream_start + loc >= f->stream_end || f->stream_start + loc < f->stream_start) { // at/past the end, or the pointer sum wrapped
+         f->stream = f->stream_end;
+         f->eof = 1;
+         return 0;
+      } else {
+         f->stream = f->stream_start + loc;
+         return 1;
+      }
+   }
+   #ifndef STB_VORBIS_NO_STDIO
+   if (loc + f->f_start < loc || loc >= 0x80000000) { // the sum wrapped, or loc does not fit in 31 bits
+      loc = 0x7fffffff;
+      f->eof = 1;
+   } else {
+      loc += f->f_start; // file positions are relative to f_start
+   }
+   if (!fseek(f->f, loc, SEEK_SET))
+      return 1;
+   f->eof = 1; // the seek failed
+   fseek(f->f, f->f_start, SEEK_END);
+   return 0;
+   #endif
+}
+
+
+static uint8 ogg_page_header[4] = { 0x4f, 0x67, 0x67, 0x53 };
+
+static int capture_pattern(vorb *f) // consume input bytes while they match 0x4f 0x67 0x67 0x53; TRUE only if all four matched
+{
+   int i;
+   static const uint8 magic[4] = { 0x4f, 0x67, 0x67, 0x53 };
+   for (i=0; i < 4; ++i) // reading stops at the first mismatching byte
+      if (get8(f) != magic[i]) return FALSE;
+   return TRUE;
+}
+
+#define PAGEFLAG_continued_packet   1
+#define PAGEFLAG_first_page         2
+#define PAGEFLAG_last_page          4
+
+static int start_page_no_capturepattern(vorb *f) // parse the page header fields that follow the 4-byte capture pattern; TRUE on success, 0 on error
+{
+   uint32 loc0,loc1,n;
+   // stream structure version
+   if (0 != get8(f)) return error(f, VORBIS_invalid_stream_structure_version);
+   // header flag
+   f->page_flag = get8(f);
+   // absolute granule position
+   loc0 = get32(f); 
+   loc1 = get32(f);
+   // @TODO: validate loc0,loc1 as valid positions?
+   // stream serial number -- vorbis doesn't interleave, so discard
+   get32(f);
+   //if (f->serial != get32(f)) return error(f, VORBIS_incorrect_stream_serial_number);
+   // page sequence number
+   n = get32(f);
+   f->last_page = n;
+   // CRC32
+   get32(f); // read and discarded: the checksum is not verified here
+   // page_segments
+   f->segment_count = get8(f);
+   if (!getn(f, f->segments, f->segment_count))
+      return error(f, VORBIS_unexpected_eof);
+   // assume we _don't_ know the sample position of any segments
+   f->end_seg_with_known_loc = -2;
+   if (loc0 != ~0U || loc1 != ~0U) { // an all-ones granule position is treated as "no position"
+      int i;
+      // determine which packet is the last one that will complete
+      for (i=f->segment_count-1; i >= 0; --i)
+         if (f->segments[i] < 255)
+            break;
+      // 'i' is now the index of the _last_ segment of a packet that ends
+      if (i >= 0) {
+         f->end_seg_with_known_loc = i;
+         f->known_loc_for_packet   = loc0; // only the low 32 bits of the position are kept
+      }
+   }
+   if (f->first_decode) {
+      int i,len;
+      ProbedPage p;
+      len = 0;
+      for (i=0; i < f->segment_count; ++i)
+         len += f->segments[i];
+      len += 27 + f->segment_count; // add the fixed 27 header bytes plus one lacing byte per segment
+      p.page_start = f->first_audio_page_offset;
+      p.page_end = p.page_start + len;
+      p.last_decoded_sample = loc0;
+      f->p_first = p;
+   }
+   f->next_seg = 0;
+   return TRUE;
+}
+
+static int start_page(vorb *f) // expect a capture pattern, then parse the rest of the page header
+{
+   if (capture_pattern(f)) return start_page_no_capturepattern(f);
+   return error(f, VORBIS_missing_capture_pattern);
+}
+
+static int start_packet(vorb *f) // prepare to read a new packet, loading the next page first when needed; TRUE on success
+{
+   while (f->next_seg == -1) { // -1: the current page has no segments left
+      if (!start_page(f)) return FALSE;
+      if (f->page_flag & PAGEFLAG_continued_packet) // a new packet must not begin on a page flagged as a continuation
+         return error(f, VORBIS_continued_packet_flag_invalid);
+   }
+   f->last_seg = FALSE;
+   f->valid_bits = 0;
+   f->packet_bytes = 0;
+   f->bytes_in_seg = 0;
+   // f->next_seg is now valid
+   return TRUE;
+}
+
+static int maybe_start_packet(vorb *f) // like start_packet, except that end of input exactly at a page boundary returns FALSE without setting an error
+{
+   if (f->next_seg == -1) {
+      int x = get8(f);
+      if (f->eof) return FALSE; // EOF at page boundary is not an error!
+      if (0x4f != x      ) return error(f, VORBIS_missing_capture_pattern);
+      if (0x67 != get8(f)) return error(f, VORBIS_missing_capture_pattern);
+      if (0x67 != get8(f)) return error(f, VORBIS_missing_capture_pattern);
+      if (0x53 != get8(f)) return error(f, VORBIS_missing_capture_pattern);
+      if (!start_page_no_capturepattern(f)) return FALSE;
+      if (f->page_flag & PAGEFLAG_continued_packet) {
+         // set up enough state that we can read this packet if we want,
+         // e.g. during recovery
+         f->last_seg = FALSE;
+         f->bytes_in_seg = 0;
+         return error(f, VORBIS_continued_packet_flag_invalid);
+      }
+   }
+   return start_packet(f); // resets the per-packet state
+}
+
+static int next_segment(vorb *f) // move to the next segment of the current packet; returns its length in bytes, or 0 when there is none
+{
+   int len;
+   if (f->last_seg) return 0; // the packet's final segment was already consumed
+   if (f->next_seg == -1) {
+      f->last_seg_which = f->segment_count-1; // in case start_page fails
+      if (!start_page(f)) { f->last_seg = 1; return 0; }
+      if (!(f->page_flag & PAGEFLAG_continued_packet)) return error(f, VORBIS_continued_packet_flag_invalid); // the packet is unfinished, so the new page must be a continuation
+   }
+   len = f->segments[f->next_seg++];
+   if (len < 255) { // a segment shorter than 255 bytes ends the packet
+      f->last_seg = TRUE;
+      f->last_seg_which = f->next_seg-1;
+   }
+   if (f->next_seg >= f->segment_count)
+      f->next_seg = -1; // page exhausted
+   assert(f->bytes_in_seg == 0);
+   f->bytes_in_seg = len;
+   return len; // note: a zero-length segment also returns 0
+}
+
+#define EOP    (-1) // returned by the packet readers when no packet data is left
+#define INVALID_BITS  (-1) // stored in valid_bits by get_bits() after it hits EOP
+
+static int get8_packet_raw(vorb *f) // next byte of the current packet, or EOP; does not touch the bit accumulator
+{
+   if (!f->bytes_in_seg) {  // CLANG!
+      if (f->last_seg) return EOP;
+      else if (!next_segment(f)) return EOP;
+   }
+   assert(f->bytes_in_seg > 0);
+   --f->bytes_in_seg;
+   ++f->packet_bytes;
+   return get8(f);
+}
+
+static int get8_packet(vorb *f) // next packet byte (or EOP); also discards any bits still buffered
+{
+   int byte_or_eop = get8_packet_raw(f);
+   f->valid_bits = 0; // cleared after the read, whatever it returned
+   return byte_or_eop;
+}
+
+static void flush_packet(vorb *f) // read and discard bytes until the current packet reports EOP
+{
+   for (;;) { if (get8_packet_raw(f) == EOP) break; }
+}
+
+// @OPTIMIZE: this is the secondary bit decoder, so it's probably not as important
+// as the huffman decoder?
+static uint32 get_bits(vorb *f, int n) // read n bits, least-significant first; returns 0 once the packet has run out
+{
+   uint32 z;
+
+   if (f->valid_bits < 0) return 0; // an earlier read already hit end of packet
+   if (f->valid_bits < n) {
+      if (n > 24) {
+         // the accumulator technique below would not work correctly in this case
+         z = get_bits(f, 24);
+         z += get_bits(f, n-24) << 24;
+         return z;
+      }
+      if (f->valid_bits == 0) f->acc = 0;
+      while (f->valid_bits < n) {
+         int z = get8_packet_raw(f); // shadows the outer z on purpose: holds one byte or EOP
+         if (z == EOP) {
+            f->valid_bits = INVALID_BITS;
+            return 0;
+         }
+         f->acc += z << f->valid_bits; // append the new byte above the bits already buffered
+         f->valid_bits += 8;
+      }
+   }
+   if (f->valid_bits < 0) return 0;
+   z = f->acc & ((1 << n)-1); // take the low n bits
+   f->acc >>= n;
+   f->valid_bits -= n;
+   return z;
+}
+
+// @OPTIMIZE: primary accumulator for huffman
+// expand the buffer to as many bits as possible without reading off end of packet
+// it might be nice to allow f->valid_bits and f->acc to be stored in registers,
+// e.g. cache them locally and decode locally
+static __forceinline void prep_huffman(vorb *f)
+{
+   if (f->valid_bits <= 24) { // room for at least one more byte in the 32-bit accumulator
+      if (f->valid_bits == 0) f->acc = 0;
+      do {
+         int z;
+         if (f->last_seg && !f->bytes_in_seg) return; // the packet is exhausted
+         z = get8_packet_raw(f);
+         if (z == EOP) return;
+         f->acc += (unsigned) z << f->valid_bits;
+         f->valid_bits += 8;
+      } while (f->valid_bits <= 24);
+   }
+}
+
+enum // packet type codes; judging by the names these are the three header packets (identification, comment, setup) -- confirm against the users of these constants
+{
+   VORBIS_packet_id = 1,
+   VORBIS_packet_comment = 3,
+   VORBIS_packet_setup = 5
+};
+
+static int codebook_decode_scalar_raw(vorb *f, Codebook *c) // slow-path decode of one codeword: binary search or linear scan; returns an index, or -1 on failure
+{
+   int i;
+   prep_huffman(f);
+
+   if (c->codewords == NULL && c->sorted_codewords == NULL)
+      return -1; // the codebook has no code tables at all
+
+   // cases to use binary search: sorted_codewords && !c->codewords
+   //                             sorted_codewords && c->entries > 8
+   if (c->entries > 8 ? c->sorted_codewords!=NULL : !c->codewords) {
+      // binary search
+      uint32 code = bit_reverse(f->acc); // sorted_codewords hold bit-reversed codes, so reverse the accumulator to compare
+      int x=0, n=c->sorted_entries, len;
+
+      while (n > 1) {
+         // invariant: sc[x] <= code < sc[x+n]
+         int m = x + (n >> 1);
+         if (c->sorted_codewords[m] <= code) {
+            x = m;
+            n -= (n>>1);
+         } else {
+            n >>= 1;
+         }
+      }
+      // x is now the sorted index
+      if (!c->sparse) x = c->sorted_values[x];
+      // x is now sorted index if sparse, or symbol otherwise
+      len = c->codeword_lengths[x];
+      if (f->valid_bits >= len) {
+         f->acc >>= len;
+         f->valid_bits -= len;
+         return x;
+      }
+
+      f->valid_bits = 0; // not enough bits buffered for this code: drop them and fail
+      return -1;
+   }
+
+   // if small, linear search
+   assert(!c->sparse);
+   for (i=0; i < c->entries; ++i) {
+      if (c->codeword_lengths[i] == NO_CODE) continue;
+      if (c->codewords[i] == (f->acc & ((1 << c->codeword_lengths[i])-1))) { // compare against the low 'length' bits of the accumulator
+         if (f->valid_bits >= c->codeword_lengths[i]) {
+            f->acc >>= c->codeword_lengths[i];
+            f->valid_bits -= c->codeword_lengths[i];
+            return i;
+         }
+         f->valid_bits = 0;
+         return -1;
+      }
+   }
+
+   error(f, VORBIS_invalid_stream); // no codeword matched
+   f->valid_bits = 0;
+   return -1;
+}
+
+#ifndef STB_VORBIS_NO_INLINE_DECODE // default: the fast-table lookup is expanded inline at each use
+
+#define DECODE_RAW(var, f,c)                                  \
+   if (f->valid_bits < STB_VORBIS_FAST_HUFFMAN_LENGTH)        \
+      prep_huffman(f);                                        \
+   var = f->acc & FAST_HUFFMAN_TABLE_MASK;                    \
+   var = c->fast_huffman[var];                                \
+   if (var >= 0) {                                            \
+      int n = c->codeword_lengths[var];                       \
+      f->acc >>= n;                                           \
+      f->valid_bits -= n;                                     \
+      if (f->valid_bits < 0) { f->valid_bits = 0; var = -1; } \
+   } else {                                                   \
+      var = codebook_decode_scalar_raw(f,c);                  \
+   }
+
+#else // same logic as the macro above, as a function
+
+static int codebook_decode_scalar(vorb *f, Codebook *c) // decode one codeword; returns an index, or -1 on failure
+{
+   int i;
+   if (f->valid_bits < STB_VORBIS_FAST_HUFFMAN_LENGTH)
+      prep_huffman(f);
+   // fast huffman table lookup
+   i = f->acc & FAST_HUFFMAN_TABLE_MASK;
+   i = c->fast_huffman[i];
+   if (i >= 0) {
+      f->acc >>= c->codeword_lengths[i];
+      f->valid_bits -= c->codeword_lengths[i];
+      if (f->valid_bits < 0) { f->valid_bits = 0; return -1; } // the code needed more bits than were buffered
+      return i;
+   }
+   return codebook_decode_scalar_raw(f,c); // not in the fast table: take the slow path
+}
+
+#define DECODE_RAW(var,f,c)    var = codebook_decode_scalar(f,c);
+
+#endif
+
+#define DECODE(var,f,c)                                       \
+   DECODE_RAW(var,f,c)                                        \
+   if (c->sparse) var = c->sorted_values[var];
+
+#ifndef STB_VORBIS_DIVIDES_IN_CODEBOOK
+  #define DECODE_VQ(var,f,c)   DECODE_RAW(var,f,c) // VQ decode uses the raw index (no sorted_values mapping)
+#else
+  #define DECODE_VQ(var,f,c)   DECODE(var,f,c) // VQ decode maps sparse indices through sorted_values
+#endif
+
+
+
+
+
+
+// CODEBOOK_ELEMENT_FAST is an optimization for the CODEBOOK_FLOATS case
+// where we avoid one addition
+#define CODEBOOK_ELEMENT(c,off)          (c->multiplicands[off]) // as defined here, identical to the _FAST variant
+#define CODEBOOK_ELEMENT_FAST(c,off)     (c->multiplicands[off])
+#define CODEBOOK_ELEMENT_BASE(c)         (0) // constant zero base added to decoded values
+
+static int codebook_decode_start(vorb *f, Codebook *c) // decode one VQ codeword index; returns a negative value on end of packet or error
+{
+   int z = -1;
+
+   // type 0 is only legal in a scalar context
+   if (c->lookup_type == 0)
+      error(f, VORBIS_invalid_stream);
+   else {
+      DECODE_VQ(z,f,c);
+      if (c->sparse) assert(z < c->sorted_entries);
+      if (z < 0) {  // check for EOP
+         if (!f->bytes_in_seg)
+            if (f->last_seg)
+               return z; // clean end of packet: no error is recorded
+         error(f, VORBIS_invalid_stream);
+      }
+   }
+   return z;
+}
+
+static int codebook_decode(vorb *f, Codebook *c, float *output, int len) // decode one vector and add up to len of its elements into output[]; FALSE if no codeword could be decoded
+{
+   int i,z = codebook_decode_start(f,c);
+   if (z < 0) return FALSE;
+   if (len > c->dimensions) len = c->dimensions; // never emit more than one vector's worth
+
+#ifdef STB_VORBIS_DIVIDES_IN_CODEBOOK
+   if (c->lookup_type == 1) {
+      float last = CODEBOOK_ELEMENT_BASE(c);
+      int div = 1;
+      for (i=0; i < len; ++i) {
+         int off = (z / div) % c->lookup_values; // extract digit i of z in base lookup_values
+         float val = CODEBOOK_ELEMENT_FAST(c,off) + last;
+         output[i] += val;
+         if (c->sequence_p) last = val + c->minimum_value;
+         div *= c->lookup_values;
+      }
+      return TRUE;
+   }
+#endif
+
+   z *= c->dimensions; // index of the vector's first element in multiplicands[]
+   if (c->sequence_p) {
+      float last = CODEBOOK_ELEMENT_BASE(c);
+      for (i=0; i < len; ++i) {
+         float val = CODEBOOK_ELEMENT_FAST(c,z+i) + last;
+         output[i] += val;
+         last = val + c->minimum_value; // sequence mode: each element builds on the previous one
+      }
+   } else {
+      float last = CODEBOOK_ELEMENT_BASE(c);
+      for (i=0; i < len; ++i) {
+         output[i] += CODEBOOK_ELEMENT_FAST(c,z+i) + last;
+      }
+   }
+
+   return TRUE;
+}
+
+static int codebook_decode_step(vorb *f, Codebook *c, float *output, int len, int step) // like codebook_decode, but element i is added to output[i*step]
+{
+   int i,z = codebook_decode_start(f,c);
+   float last = CODEBOOK_ELEMENT_BASE(c);
+   if (z < 0) return FALSE;
+   if (len > c->dimensions) len = c->dimensions;
+
+#ifdef STB_VORBIS_DIVIDES_IN_CODEBOOK
+   if (c->lookup_type == 1) {
+      int div = 1;
+      for (i=0; i < len; ++i) {
+         int off = (z / div) % c->lookup_values; // extract digit i of z in base lookup_values
+         float val = CODEBOOK_ELEMENT_FAST(c,off) + last;
+         output[i*step] += val;
+         if (c->sequence_p) last = val;
+         div *= c->lookup_values;
+      }
+      return TRUE;
+   }
+#endif
+
+   z *= c->dimensions; // index of the vector's first element in multiplicands[]
+   for (i=0; i < len; ++i) {
+      float val = CODEBOOK_ELEMENT_FAST(c,z+i) + last;
+      output[i*step] += val;
+      if (c->sequence_p) last = val; // unlike codebook_decode, minimum_value is not added here
+   }
+
+   return TRUE;
+}
+
+static int codebook_decode_deinterleave_repeat(vorb *f, Codebook *c, float **outputs, int ch, int *c_inter_p, int *p_inter_p, int len, int total_decode)
+{
+   int c_inter = *c_inter_p;
+   int p_inter = *p_inter_p;
+   int i,z, effective = c->dimensions;
+
+   // type 0 is only legal in a scalar context
+   if (c->lookup_type == 0)   return error(f, VORBIS_invalid_stream);
+
+   while (total_decode > 0) {
+      float last = CODEBOOK_ELEMENT_BASE(c);
+      DECODE_VQ(z,f,c);
+      #ifndef STB_VORBIS_DIVIDES_IN_CODEBOOK
+      assert(!c->sparse || z < c->sorted_entries);
+      #endif
+      if (z < 0) {
+         if (!f->bytes_in_seg)
+            if (f->last_seg) return FALSE;
+         return error(f, VORBIS_invalid_stream);
+      }
+
+      // if this will take us off the end of the buffers, stop short!
+      // we check by computing the length of the virtual interleaved
+      // buffer (len*ch), our current offset within it (p_inter*ch)+(c_inter),
+      // and the length we'll be using (effective)
+      if (c_inter + p_inter*ch + effective > len * ch) {
+         effective = len*ch - (p_inter*ch - c_inter);
+      }
+
+   #ifdef STB_VORBIS_DIVIDES_IN_CODEBOOK
+      if (c->lookup_type == 1) {
+         int div = 1;
+         for (i=0; i < effective; ++i) {
+            int off = (z / div) % c->lookup_values;
+            float val = CODEBOOK_ELEMENT_FAST(c,off) + last;
+            if (outputs[c_inter])
+               outputs[c_inter][p_inter] += val;
+            if (++c_inter == ch) { c_inter = 0; ++p_inter; }
+            if (c->sequence_p) last = val;
+            div *= c->lookup_values;
+         }
+      } else
+   #endif
+      {
+         z *= c->dimensions;
+         if (c->sequence_p) {
+            for (i=0; i < effective; ++i) {
+               float val = CODEBOOK_ELEMENT_FAST(c,z+i) + last;
+               if (outputs[c_inter])
+                  outputs[c_inter][p_inter] += val;
+               if (++c_inter == ch) { c_inter = 0; ++p_inter; }
+               last = val;
+            }
+         } else {
+            for (i=0; i < effective; ++i) {
+               float val = CODEBOOK_ELEMENT_FAST(c,z+i) + last;
+               if (outputs[c_inter])
+                  outputs[c_inter][p_inter] += val;
+               if (++c_inter == ch) { c_inter = 0; ++p_inter; }
+            }
+         }
+      }
+
+      total_decode -= effective;
+   }
+   *c_inter_p = c_inter;
+   *p_inter_p = p_inter;
+   return TRUE;
+}
+
+static int predict_point(int x, int x0, int x1, int y0, int y1) // integer interpolation of y at x along the line (x0,y0)-(x1,y1), with the offset truncated toward y0
+{
+   int rise = y1 - y0;
+   int run = x1 - x0;
+   // @OPTIMIZE: force int division to round in the right direction... is this necessary on x86?
+   int magnitude = (abs(rise) * (x - x0)) / run;
+   if (rise < 0) return y0 - magnitude;
+   return y0 + magnitude;
+}
+
+// the following table is block-copied from the specification.
+// It has 256 entries rising monotonically from about 1.06e-07 to 1.0;
+// draw_line indexes it with the integer y value of each rendered point.
+static float inverse_db_table[256] =
+{
+  1.0649863e-07f, 1.1341951e-07f, 1.2079015e-07f, 1.2863978e-07f, 
+  1.3699951e-07f, 1.4590251e-07f, 1.5538408e-07f, 1.6548181e-07f, 
+  1.7623575e-07f, 1.8768855e-07f, 1.9988561e-07f, 2.1287530e-07f, 
+  2.2670913e-07f, 2.4144197e-07f, 2.5713223e-07f, 2.7384213e-07f, 
+  2.9163793e-07f, 3.1059021e-07f, 3.3077411e-07f, 3.5226968e-07f, 
+  3.7516214e-07f, 3.9954229e-07f, 4.2550680e-07f, 4.5315863e-07f, 
+  4.8260743e-07f, 5.1396998e-07f, 5.4737065e-07f, 5.8294187e-07f, 
+  6.2082472e-07f, 6.6116941e-07f, 7.0413592e-07f, 7.4989464e-07f, 
+  7.9862701e-07f, 8.5052630e-07f, 9.0579828e-07f, 9.6466216e-07f, 
+  1.0273513e-06f, 1.0941144e-06f, 1.1652161e-06f, 1.2409384e-06f, 
+  1.3215816e-06f, 1.4074654e-06f, 1.4989305e-06f, 1.5963394e-06f, 
+  1.7000785e-06f, 1.8105592e-06f, 1.9282195e-06f, 2.0535261e-06f, 
+  2.1869758e-06f, 2.3290978e-06f, 2.4804557e-06f, 2.6416497e-06f, 
+  2.8133190e-06f, 2.9961443e-06f, 3.1908506e-06f, 3.3982101e-06f, 
+  3.6190449e-06f, 3.8542308e-06f, 4.1047004e-06f, 4.3714470e-06f, 
+  4.6555282e-06f, 4.9580707e-06f, 5.2802740e-06f, 5.6234160e-06f, 
+  5.9888572e-06f, 6.3780469e-06f, 6.7925283e-06f, 7.2339451e-06f, 
+  7.7040476e-06f, 8.2047000e-06f, 8.7378876e-06f, 9.3057248e-06f, 
+  9.9104632e-06f, 1.0554501e-05f, 1.1240392e-05f, 1.1970856e-05f, 
+  1.2748789e-05f, 1.3577278e-05f, 1.4459606e-05f, 1.5399272e-05f, 
+  1.6400004e-05f, 1.7465768e-05f, 1.8600792e-05f, 1.9809576e-05f, 
+  2.1096914e-05f, 2.2467911e-05f, 2.3928002e-05f, 2.5482978e-05f, 
+  2.7139006e-05f, 2.8902651e-05f, 3.0780908e-05f, 3.2781225e-05f, 
+  3.4911534e-05f, 3.7180282e-05f, 3.9596466e-05f, 4.2169667e-05f, 
+  4.4910090e-05f, 4.7828601e-05f, 5.0936773e-05f, 5.4246931e-05f, 
+  5.7772202e-05f, 6.1526565e-05f, 6.5524908e-05f, 6.9783085e-05f, 
+  7.4317983e-05f, 7.9147585e-05f, 8.4291040e-05f, 8.9768747e-05f, 
+  9.5602426e-05f, 0.00010181521f, 0.00010843174f, 0.00011547824f, 
+  0.00012298267f, 0.00013097477f, 0.00013948625f, 0.00014855085f, 
+  0.00015820453f, 0.00016848555f, 0.00017943469f, 0.00019109536f, 
+  0.00020351382f, 0.00021673929f, 0.00023082423f, 0.00024582449f, 
+  0.00026179955f, 0.00027881276f, 0.00029693158f, 0.00031622787f, 
+  0.00033677814f, 0.00035866388f, 0.00038197188f, 0.00040679456f, 
+  0.00043323036f, 0.00046138411f, 0.00049136745f, 0.00052329927f, 
+  0.00055730621f, 0.00059352311f, 0.00063209358f, 0.00067317058f, 
+  0.00071691700f, 0.00076350630f, 0.00081312324f, 0.00086596457f, 
+  0.00092223983f, 0.00098217216f, 0.0010459992f,  0.0011139742f, 
+  0.0011863665f,  0.0012634633f,  0.0013455702f,  0.0014330129f, 
+  0.0015261382f,  0.0016253153f,  0.0017309374f,  0.0018434235f, 
+  0.0019632195f,  0.0020908006f,  0.0022266726f,  0.0023713743f, 
+  0.0025254795f,  0.0026895994f,  0.0028643847f,  0.0030505286f, 
+  0.0032487691f,  0.0034598925f,  0.0036847358f,  0.0039241906f, 
+  0.0041792066f,  0.0044507950f,  0.0047400328f,  0.0050480668f, 
+  0.0053761186f,  0.0057254891f,  0.0060975636f,  0.0064938176f, 
+  0.0069158225f,  0.0073652516f,  0.0078438871f,  0.0083536271f, 
+  0.0088964928f,  0.009474637f,   0.010090352f,   0.010746080f, 
+  0.011444421f,   0.012188144f,   0.012980198f,   0.013823725f, 
+  0.014722068f,   0.015678791f,   0.016697687f,   0.017782797f, 
+  0.018938423f,   0.020169149f,   0.021479854f,   0.022875735f, 
+  0.024362330f,   0.025945531f,   0.027631618f,   0.029427276f, 
+  0.031339626f,   0.033376252f,   0.035545228f,   0.037855157f, 
+  0.040315199f,   0.042935108f,   0.045725273f,   0.048696758f, 
+  0.051861348f,   0.055231591f,   0.058820850f,   0.062643361f, 
+  0.066714279f,   0.071049749f,   0.075666962f,   0.080584227f, 
+  0.085821044f,   0.091398179f,   0.097337747f,   0.10366330f, 
+  0.11039993f,    0.11757434f,    0.12521498f,    0.13335215f, 
+  0.14201813f,    0.15124727f,    0.16107617f,    0.17154380f, 
+  0.18269168f,    0.19456402f,    0.20720788f,    0.22067342f, 
+  0.23501402f,    0.25028656f,    0.26655159f,    0.28387361f, 
+  0.30232132f,    0.32196786f,    0.34289114f,    0.36517414f, 
+  0.38890521f,    0.41417847f,    0.44109412f,    0.46975890f, 
+  0.50028648f,    0.53279791f,    0.56742212f,    0.60429640f, 
+  0.64356699f,    0.68538959f,    0.72993007f,    0.77736504f, 
+  0.82788260f,    0.88168307f,    0.9389798f,     1.0f
+};
+
+
+// @OPTIMIZE: if you want to replace this bresenham line-drawing routine,
+// note that you must produce bit-identical output to decode correctly;
+// this specific sequence of operations is specified in the spec (it's
+// drawing integer-quantized frequency-space lines that the encoder
+// expects to be exactly the same)
+//     ... also, isn't the whole point of Bresenham's algorithm to NOT
+// have to divide in the setup? sigh.
+//
+// LINE_OP(a,b) is how draw_line applies a table value b to an output
+// sample a: by default it multiplies a by b in place; when
+// STB_VORBIS_NO_DEFER_FLOOR is defined it overwrites a with b instead.
+#ifndef STB_VORBIS_NO_DEFER_FLOOR
+#define LINE_OP(a,b)   a *= b
+#else
+#define LINE_OP(a,b)   a = b
+#endif
+
+// Optional lookup table that draw_line reads as [ady][adx] instead of
+// performing an integer division, when ady < DIVTAB_NUMER and
+// adx < DIVTAB_DENOM. The table is only declared here, not filled in.
+#ifdef STB_VORBIS_DIVIDE_TABLE
+#define DIVTAB_NUMER   32
+#define DIVTAB_DENOM   64
+int8 integer_divide_table[DIVTAB_NUMER][DIVTAB_DENOM]; // 2KB
+#endif
+
+// Renders the integer line from (x0,y0) towards (x1,y1) into output[],
+// applying LINE_OP(output[x], inverse_db_table[y]) for every x in
+// [x0, min(x1,n)). The endpoint x1 itself is not written.
+//
+//   output   destination samples, indexed by x
+//   x0,y0    start point; y is used as an index into inverse_db_table
+//   x1,y1    end point (exclusive in x)
+//   n        upper bound on x; x1 is clamped to it
+//
+// The stepping sequence (base step plus error accumulator) must stay
+// exactly as written so the output matches what the encoder expects.
+//
+// Robustness: y is masked to 0..255 before indexing the 256-entry table so
+// that out-of-range endpoints from a malformed stream cannot read outside
+// it; for in-range y the mask is a no-op. A zero-length span (x0 == x1)
+// returns immediately instead of dividing by zero; nothing would be drawn
+// for it anyway.
+static __forceinline void draw_line(float *output, int x0, int y0, int x1, int y1, int n)
+{
+   int dy = y1 - y0;
+   int adx = x1 - x0;
+   int ady = abs(dy);
+   int base;
+   int x=x0,y=y0;
+   int err = 0;
+   int sy;
+
+   // empty span: no samples to write, and adx is a divisor below
+   if (adx == 0)
+      return;
+
+#ifdef STB_VORBIS_DIVIDE_TABLE
+   if (adx < DIVTAB_DENOM && ady < DIVTAB_NUMER) {
+      if (dy < 0) {
+         base = -integer_divide_table[ady][adx];
+         sy = base-1;
+      } else {
+         base =  integer_divide_table[ady][adx];
+         sy = base+1;
+      }
+   } else {
+      base = dy / adx;
+      if (dy < 0)
+         sy = base - 1;
+      else
+         sy = base+1;
+   }
+#else
+   base = dy / adx;
+   if (dy < 0)
+      sy = base - 1;
+   else
+      sy = base+1;
+#endif
+   // ady becomes the remainder part of the slope; base is the whole part
+   ady -= abs(base) * adx;
+   if (x1 > n) x1 = n;
+   if (x < x1) {
+      LINE_OP(output[x], inverse_db_table[y&255]);
+      for (++x; x < x1; ++x) {
+         err += ady;
+         if (err >= adx) {
+            err -= adx;
+            y += sy;
+         } else
+            y += base;
+         LINE_OP(output[x], inverse_db_table[y&255]);
+      }
+   }
+}
+
+// Decodes one partition of residue data for a single channel into
+// target[], starting at index offset.
+//   rtype == 0: makes n / book->dimensions calls to codebook_decode_step,
+//               one per starting position offset+k, passing that count as
+//               the step argument.
+//   otherwise:  calls codebook_decode repeatedly, advancing offset by
+//               book->dimensions each time until n values are covered.
+// Returns FALSE as soon as a codebook call fails, TRUE otherwise.
+static int residue_decode(vorb *f, Codebook *book, float *target, int offset, int n, int rtype)
+{
+   if (rtype != 0) {
+      int covered = 0;
+      while (covered < n) {
+         if (!codebook_decode(f, book, target+offset, n-covered))
+            return FALSE;
+         covered += book->dimensions;
+         offset += book->dimensions;
+      }
+      return TRUE;
+   } else {
+      int stride = n / book->dimensions;
+      int lane;
+      for (lane = 0; lane < stride; ++lane) {
+         if (!codebook_decode_step(f, book, target+offset+lane, n-offset-lane, stride))
+            return FALSE;
+      }
+      return TRUE;
+   }
+}
+
+// Decodes the residue vectors of residue configuration rn into
+// residue_buffers.
+//
+//   f                decoder state (codebooks, residue configs, temp allocator)
+//   residue_buffers  one float array per channel; each channel that is not
+//                    flagged in do_not_decode is first cleared for n floats
+//   ch               number of channels to process
+//   n                number of floats cleared per channel; also passed as the
+//                    length limit to codebook_decode_deinterleave_repeat
+//   rn               index into f->residue_config / f->residue_types
+//   do_not_decode    per-channel flags; a non-zero entry skips that channel
+//
+// Residue type 2 with more than one channel is decoded as a single
+// interleaved stream (c_inter = channel, p_inter = position); if every
+// channel is flagged do-not-decode nothing is read at all. Every other
+// case decodes each channel separately through residue_decode.
+// Decoding makes up to 8 passes over the partitions; classification data is
+// read during pass 0 and reused by the later passes. Any decode failure or
+// end-of-packet jumps to the common cleanup at 'done'.
+//
+// Inside the type-2 path ch is never 1 (the enclosing condition excludes
+// it), so only the ch == 2 fast path and the generic path exist there.
+static void decode_residue(vorb *f, float *residue_buffers[], int ch, int n, int rn, uint8 *do_not_decode)
+{
+   int i,j,pass;
+   Residue *r = f->residue_config + rn;
+   int rtype = f->residue_types[rn];
+   int c = r->classbook;
+   int classwords = f->codebooks[c].dimensions;
+   int n_read = r->end - r->begin;
+   int part_read = n_read / r->part_size;
+   int temp_alloc_point = temp_alloc_save(f);
+   #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
+   uint8 ***part_classdata = (uint8 ***) temp_block_array(f,f->channels, part_read * sizeof(**part_classdata));
+   #else
+   int **classifications = (int **) temp_block_array(f,f->channels, part_read * sizeof(**classifications));
+   #endif
+
+   CHECK(f);
+
+   for (i=0; i < ch; ++i)
+      if (!do_not_decode[i])
+         memset(residue_buffers[i], 0, sizeof(float) * n);
+
+   if (rtype == 2 && ch != 1) {
+      // skip everything if no channel wants to be decoded
+      for (j=0; j < ch; ++j)
+         if (!do_not_decode[j])
+            break;
+      if (j == ch)
+         goto done;
+
+      for (pass=0; pass < 8; ++pass) {
+         int pcount = 0, class_set = 0;
+         if (ch == 2) {
+            // two channels: channel/position split uses mask and shift
+            while (pcount < part_read) {
+               int z = r->begin + pcount*r->part_size;
+               int c_inter = (z & 1), p_inter = z>>1;
+               if (pass == 0) {
+                  Codebook *c = f->codebooks+r->classbook;
+                  int q;
+                  DECODE(q,f,c);
+                  if (q == EOP) goto done;
+                  #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
+                  part_classdata[0][class_set] = r->classdata[q];
+                  #else
+                  for (i=classwords-1; i >= 0; --i) {
+                     classifications[0][i+pcount] = q % r->classifications;
+                     q /= r->classifications;
+                  }
+                  #endif
+               }
+               for (i=0; i < classwords && pcount < part_read; ++i, ++pcount) {
+                  int z = r->begin + pcount*r->part_size;
+                  #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
+                  int c = part_classdata[0][class_set][i];
+                  #else
+                  int c = classifications[0][pcount];
+                  #endif
+                  int b = r->residue_books[c][pass];
+                  if (b >= 0) {
+                     Codebook *book = f->codebooks + b;
+                     if (!codebook_decode_deinterleave_repeat(f, book, residue_buffers, ch, &c_inter, &p_inter, n, r->part_size))
+                        goto done;
+                  } else {
+                     // no book for this class in this pass: skip the partition
+                     z += r->part_size;
+                     c_inter = z & 1;
+                     p_inter = z >> 1;
+                  }
+               }
+               #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
+               ++class_set;
+               #endif
+            }
+         } else {
+            // any other channel count: general divide/modulo split
+            while (pcount < part_read) {
+               int z = r->begin + pcount*r->part_size;
+               int c_inter = z % ch, p_inter = z/ch;
+               if (pass == 0) {
+                  Codebook *c = f->codebooks+r->classbook;
+                  int q;
+                  DECODE(q,f,c);
+                  if (q == EOP) goto done;
+                  #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
+                  part_classdata[0][class_set] = r->classdata[q];
+                  #else
+                  for (i=classwords-1; i >= 0; --i) {
+                     classifications[0][i+pcount] = q % r->classifications;
+                     q /= r->classifications;
+                  }
+                  #endif
+               }
+               for (i=0; i < classwords && pcount < part_read; ++i, ++pcount) {
+                  int z = r->begin + pcount*r->part_size;
+                  #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
+                  int c = part_classdata[0][class_set][i];
+                  #else
+                  int c = classifications[0][pcount];
+                  #endif
+                  int b = r->residue_books[c][pass];
+                  if (b >= 0) {
+                     Codebook *book = f->codebooks + b;
+                     if (!codebook_decode_deinterleave_repeat(f, book, residue_buffers, ch, &c_inter, &p_inter, n, r->part_size))
+                        goto done;
+                  } else {
+                     // no book for this class in this pass: skip the partition
+                     z += r->part_size;
+                     c_inter = z % ch;
+                     p_inter = z / ch;
+                  }
+               }
+               #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
+               ++class_set;
+               #endif
+            }
+         }
+      }
+      goto done;
+   }
+   CHECK(f);
+
+   // per-channel decoding (every case except multi-channel type 2)
+   for (pass=0; pass < 8; ++pass) {
+      int pcount = 0, class_set=0;
+      while (pcount < part_read) {
+         if (pass == 0) {
+            for (j=0; j < ch; ++j) {
+               if (!do_not_decode[j]) {
+                  Codebook *c = f->codebooks+r->classbook;
+                  int temp;
+                  DECODE(temp,f,c);
+                  if (temp == EOP) goto done;
+                  #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
+                  part_classdata[j][class_set] = r->classdata[temp];
+                  #else
+                  for (i=classwords-1; i >= 0; --i) {
+                     classifications[j][i+pcount] = temp % r->classifications;
+                     temp /= r->classifications;
+                  }
+                  #endif
+               }
+            }
+         }
+         for (i=0; i < classwords && pcount < part_read; ++i, ++pcount) {
+            for (j=0; j < ch; ++j) {
+               if (!do_not_decode[j]) {
+                  #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
+                  int c = part_classdata[j][class_set][i];
+                  #else
+                  int c = classifications[j][pcount];
+                  #endif
+                  int b = r->residue_books[c][pass];
+                  if (b >= 0) {
+                     float *target = residue_buffers[j];
+                     int offset = r->begin + pcount * r->part_size;
+                     int n = r->part_size;
+                     Codebook *book = f->codebooks + b;
+                     if (!residue_decode(f, book, target, offset, n, rtype))
+                        goto done;
+                  }
+               }
+            }
+         }
+         #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
+         ++class_set;
+         #endif
+      }
+   }
+  done:
+   CHECK(f);
+   #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
+   temp_free(f,part_classdata);
+   #else
+   temp_free(f,classifications);
+   #endif
+   temp_alloc_restore(f,temp_alloc_point);
+}
+
+
+// The three variants below are all compiled out (#if 0 / #elif 0); they are
+// kept as reference implementations for debugging the optimized transform.
+#if 0
+// slow way for debugging
+// Reads n/2 input floats from buffer and overwrites buffer with n output
+// floats, evaluating the cosine sum directly for every output sample.
+void inverse_mdct_slow(float *buffer, int n)
+{
+   int i,j;
+   int n2 = n >> 1;
+   float *x = (float *) malloc(sizeof(*x) * n2);
+   memcpy(x, buffer, sizeof(*x) * n2);
+   for (i=0; i < n; ++i) {
+      float acc = 0;
+      for (j=0; j < n2; ++j)
+         // formula from paper:
+         //acc += n/4.0f * x[j] * (float) cos(M_PI / 2 / n * (2 * i + 1 + n/2.0)*(2*j+1));
+         // formula from wikipedia
+         //acc += 2.0f / n2 * x[j] * (float) cos(M_PI/n2 * (i + 0.5 + n2/2)*(j + 0.5));
+         // these are equivalent, except the formula from the paper inverts the multiplier!
+         // however, what actually works is NO MULTIPLIER!?!
+         //acc += 64 * 2.0f / n2 * x[j] * (float) cos(M_PI/n2 * (i + 0.5 + n2/2)*(j + 0.5));
+         acc += x[j] * (float) cos(M_PI / 2 / n * (2 * i + 1 + n/2.0)*(2*j+1));
+      buffer[i] = acc;
+   }
+   free(x);
+}
+#elif 0
+// same as above, but just barely able to run in real time on modern machines
+// The cosine values are precomputed into mcos[] (4*n entries) and looked
+// up with a power-of-two mask instead of calling cos() in the inner loop.
+// NOTE(review): mcos has 16384 entries, so this only fits n <= 4096.
+void inverse_mdct_slow(float *buffer, int n, vorb *f, int blocktype)
+{
+   float mcos[16384];
+   int i,j;
+   int n2 = n >> 1, nmask = (n << 2) -1;
+   float *x = (float *) malloc(sizeof(*x) * n2);
+   memcpy(x, buffer, sizeof(*x) * n2);
+   for (i=0; i < 4*n; ++i)
+      mcos[i] = (float) cos(M_PI / 2 * i / n);
+
+   for (i=0; i < n; ++i) {
+      float acc = 0;
+      for (j=0; j < n2; ++j)
+         acc += x[j] * mcos[(2 * i + 1 + n2)*(2*j+1) & nmask];
+      buffer[i] = acc;
+   }
+   free(x);
+}
+#elif 0
+// transform to use a slow dct-iv; this is STILL basically trivial,
+// but only requires half as many ops
+// NOTE(review): x holds 2048 floats and mcos holds 16384 (filled for 8*n
+// entries), so dct_iv_slow only fits n <= 2048.
+void dct_iv_slow(float *buffer, int n)
+{
+   float mcos[16384];
+   float x[2048];
+   int i,j;
+   int n2 = n >> 1, nmask = (n << 3) - 1;
+   memcpy(x, buffer, sizeof(*x) * n);
+   for (i=0; i < 8*n; ++i)
+      mcos[i] = (float) cos(M_PI / 4 * i / n);
+   for (i=0; i < n; ++i) {
+      float acc = 0;
+      for (j=0; j < n; ++j)
+         acc += x[j] * mcos[((2 * i + 1)*(2*j+1)) & nmask];
+      buffer[i] = acc;
+   }
+}
+
+// Runs dct_iv_slow on the first n/2 floats of buffer, then unfolds the
+// result into the n output floats using the three copy loops below.
+void inverse_mdct_slow(float *buffer, int n, vorb *f, int blocktype)
+{
+   int i, n4 = n >> 2, n2 = n >> 1, n3_4 = n - n4;
+   float temp[4096];
+
+   memcpy(temp, buffer, n2 * sizeof(float));
+   dct_iv_slow(temp, n2);  // returns -c'-d, a-b'
+
+   for (i=0; i < n4  ; ++i) buffer[i] = temp[i+n4];            // a-b'
+   for (   ; i < n3_4; ++i) buffer[i] = -temp[n3_4 - i - 1];   // b-a', c+d'
+   for (   ; i < n   ; ++i) buffer[i] = -temp[i - n3_4];       // c'+d
+}
+#endif
+
+// LIBVORBIS_MDCT defaults to 0; when set to a non-zero value, inverse_mdct
+// is implemented by forwarding to the external mdct_* functions declared
+// below.
+#ifndef LIBVORBIS_MDCT
+#define LIBVORBIS_MDCT 0
+#endif
+
+#if LIBVORBIS_MDCT
+// directly call the vorbis MDCT using an interface documented
+// by Jeff Roberts... useful for performance comparison
+typedef struct 
+{
+  int n;
+  int log2n;
+  
+  float *trig;
+  int   *bitrev;
+
+  float scale;
+} mdct_lookup;
+
+extern void mdct_init(mdct_lookup *lookup, int n);
+extern void mdct_clear(mdct_lookup *l);
+extern void mdct_backward(mdct_lookup *init, float *in, float *out);
+
+// Two cached lookups, one per transform size seen so far.
+mdct_lookup M1,M2;
+
+// Picks (or lazily initialises) the cached lookup whose size matches n and
+// runs mdct_backward in place on buffer. f and blocktype are unused here.
+void inverse_mdct(float *buffer, int n, vorb *f, int blocktype)
+{
+   mdct_lookup *M;
+   if (M1.n == n) M = &M1;
+   else if (M2.n == n) M = &M2;
+   else if (M1.n == 0) { mdct_init(&M1, n); M = &M1; }
+   else { 
+      // a third distinct size was requested: trap into the debugger
+      // (inline x86 breakpoint) before M2 is re-initialised
+      if (M2.n) __asm int 3;
+      mdct_init(&M2, n);
+      M = &M2;
+   }
+
+   mdct_backward(M, buffer, buffer);
+}
+#endif
+
+
+// the following were split out into separate functions while optimizing;
+// they could be pushed back up but eh. __forceinline showed no change;
+// they're probably already being inlined.
+// In-place butterfly pass used for iteration 0 of IMDCT step 3.
+//   n      must be a multiple of 4; the loop runs n>>2 times
+//   e      base of the working array
+//   i_off  index of the highest element of the upper group (ee0)
+//   k_off  offset from ee0 to the matching element of the other group (ee2)
+//   A      twiddle table; one (A[0],A[1]) pair is used per butterfly and A
+//          advances by 8 floats after each one
+// Each loop iteration handles 8 consecutive floats of both groups, walking
+// downwards in memory: ee0 receives the sums, ee2 the differences rotated
+// by the current twiddle pair. The differences are captured before ee0 is
+// overwritten, so the statement order matters.
+static void imdct_step3_iter0_loop(int n, float *e, int i_off, int k_off, float *A)
+{
+   float *ee0 = e + i_off;
+   float *ee2 = ee0 + k_off;
+   int i;
+
+   assert((n & 3) == 0);
+   for (i=(n>>2); i > 0; --i) {
+      float k00_20, k01_21;
+      k00_20  = ee0[ 0] - ee2[ 0];
+      k01_21  = ee0[-1] - ee2[-1];
+      ee0[ 0] += ee2[ 0];//ee0[ 0] = ee0[ 0] + ee2[ 0];
+      ee0[-1] += ee2[-1];//ee0[-1] = ee0[-1] + ee2[-1];
+      ee2[ 0] = k00_20 * A[0] - k01_21 * A[1];
+      ee2[-1] = k01_21 * A[0] + k00_20 * A[1];
+      A += 8;
+
+      k00_20  = ee0[-2] - ee2[-2];
+      k01_21  = ee0[-3] - ee2[-3];
+      ee0[-2] += ee2[-2];//ee0[-2] = ee0[-2] + ee2[-2];
+      ee0[-3] += ee2[-3];//ee0[-3] = ee0[-3] + ee2[-3];
+      ee2[-2] = k00_20 * A[0] - k01_21 * A[1];
+      ee2[-3] = k01_21 * A[0] + k00_20 * A[1];
+      A += 8;
+
+      k00_20  = ee0[-4] - ee2[-4];
+      k01_21  = ee0[-5] - ee2[-5];
+      ee0[-4] += ee2[-4];//ee0[-4] = ee0[-4] + ee2[-4];
+      ee0[-5] += ee2[-5];//ee0[-5] = ee0[-5] + ee2[-5];
+      ee2[-4] = k00_20 * A[0] - k01_21 * A[1];
+      ee2[-5] = k01_21 * A[0] + k00_20 * A[1];
+      A += 8;
+
+      k00_20  = ee0[-6] - ee2[-6];
+      k01_21  = ee0[-7] - ee2[-7];
+      ee0[-6] += ee2[-6];//ee0[-6] = ee0[-6] + ee2[-6];
+      ee0[-7] += ee2[-7];//ee0[-7] = ee0[-7] + ee2[-7];
+      ee2[-6] = k00_20 * A[0] - k01_21 * A[1];
+      ee2[-7] = k01_21 * A[0] + k00_20 * A[1];
+      A += 8;
+      ee0 -= 8;
+      ee2 -= 8;
+   }
+}
+
+// In-place butterfly pass for IMDCT step 3 with a caller-chosen twiddle
+// stride.
+//   lim    the loop runs lim>>2 times
+//   e      base of the working array
+//   d0     index of the highest element of the upper group (e0)
+//   k_off  offset from e0 to the matching element of the other group (e2)
+//   A      twiddle table; one (A[0],A[1]) pair per butterfly
+//   k1     number of floats A advances after each butterfly
+// Each loop iteration handles 8 consecutive floats of both groups, walking
+// downwards: e0 receives the sums, e2 the twiddle-rotated differences.
+static void imdct_step3_inner_r_loop(int lim, float *e, int d0, int k_off, float *A, int k1)
+{
+   int i;
+   float k00_20, k01_21;
+
+   float *e0 = e + d0;
+   float *e2 = e0 + k_off;
+
+   for (i=lim >> 2; i > 0; --i) {
+      k00_20 = e0[-0] - e2[-0];
+      k01_21 = e0[-1] - e2[-1];
+      e0[-0] += e2[-0];//e0[-0] = e0[-0] + e2[-0];
+      e0[-1] += e2[-1];//e0[-1] = e0[-1] + e2[-1];
+      e2[-0] = (k00_20)*A[0] - (k01_21) * A[1];
+      e2[-1] = (k01_21)*A[0] + (k00_20) * A[1];
+
+      A += k1;
+
+      k00_20 = e0[-2] - e2[-2];
+      k01_21 = e0[-3] - e2[-3];
+      e0[-2] += e2[-2];//e0[-2] = e0[-2] + e2[-2];
+      e0[-3] += e2[-3];//e0[-3] = e0[-3] + e2[-3];
+      e2[-2] = (k00_20)*A[0] - (k01_21) * A[1];
+      e2[-3] = (k01_21)*A[0] + (k00_20) * A[1];
+
+      A += k1;
+
+      k00_20 = e0[-4] - e2[-4];
+      k01_21 = e0[-5] - e2[-5];
+      e0[-4] += e2[-4];//e0[-4] = e0[-4] + e2[-4];
+      e0[-5] += e2[-5];//e0[-5] = e0[-5] + e2[-5];
+      e2[-4] = (k00_20)*A[0] - (k01_21) * A[1];
+      e2[-5] = (k01_21)*A[0] + (k00_20) * A[1];
+
+      A += k1;
+
+      k00_20 = e0[-6] - e2[-6];
+      k01_21 = e0[-7] - e2[-7];
+      e0[-6] += e2[-6];//e0[-6] = e0[-6] + e2[-6];
+      e0[-7] += e2[-7];//e0[-7] = e0[-7] + e2[-7];
+      e2[-6] = (k00_20)*A[0] - (k01_21) * A[1];
+      e2[-7] = (k01_21)*A[0] + (k00_20) * A[1];
+
+      e0 -= 8;
+      e2 -= 8;
+
+      A += k1;
+   }
+}
+
+// In-place butterfly pass for IMDCT step 3 that reuses the same four
+// twiddle pairs on every iteration.
+//   n      number of loop iterations
+//   e      base of the working array
+//   i_off  index of the highest element of the upper group (ee0)
+//   k_off  offset from ee0 to the matching element of the other group (ee2)
+//   A      twiddle table; pairs are read once, at A[0], A[a_off],
+//          A[2*a_off] and A[3*a_off]
+//   a_off  spacing in floats between those four twiddle pairs
+//   k0     number of floats both pointers step back after each iteration
+// Each iteration performs four butterflies on 8 consecutive floats of both
+// groups: ee0 receives the sums, ee2 the twiddle-rotated differences.
+static void imdct_step3_inner_s_loop(int n, float *e, int i_off, int k_off, float *A, int a_off, int k0)
+{
+   int i;
+   float A0 = A[0];
+   float A1 = A[0+1];
+   float A2 = A[0+a_off];
+   float A3 = A[0+a_off+1];
+   float A4 = A[0+a_off*2+0];
+   float A5 = A[0+a_off*2+1];
+   float A6 = A[0+a_off*3+0];
+   float A7 = A[0+a_off*3+1];
+
+   float k00,k11;
+
+   float *ee0 = e  +i_off;
+   float *ee2 = ee0+k_off;
+
+   for (i=n; i > 0; --i) {
+      k00     = ee0[ 0] - ee2[ 0];
+      k11     = ee0[-1] - ee2[-1];
+      ee0[ 0] =  ee0[ 0] + ee2[ 0];
+      ee0[-1] =  ee0[-1] + ee2[-1];
+      ee2[ 0] = (k00) * A0 - (k11) * A1;
+      ee2[-1] = (k11) * A0 + (k00) * A1;
+
+      k00     = ee0[-2] - ee2[-2];
+      k11     = ee0[-3] - ee2[-3];
+      ee0[-2] =  ee0[-2] + ee2[-2];
+      ee0[-3] =  ee0[-3] + ee2[-3];
+      ee2[-2] = (k00) * A2 - (k11) * A3;
+      ee2[-3] = (k11) * A2 + (k00) * A3;
+
+      k00     = ee0[-4] - ee2[-4];
+      k11     = ee0[-5] - ee2[-5];
+      ee0[-4] =  ee0[-4] + ee2[-4];
+      ee0[-5] =  ee0[-5] + ee2[-5];
+      ee2[-4] = (k00) * A4 - (k11) * A5;
+      ee2[-5] = (k11) * A4 + (k00) * A5;
+
+      k00     = ee0[-6] - ee2[-6];
+      k11     = ee0[-7] - ee2[-7];
+      ee0[-6] =  ee0[-6] + ee2[-6];
+      ee0[-7] =  ee0[-7] + ee2[-7];
+      ee2[-6] = (k00) * A6 - (k11) * A7;
+      ee2[-7] = (k11) * A6 + (k00) * A7;
+
+      ee0 -= k0;
+      ee2 -= k0;
+   }
+}
+
+// Combines the eight floats z[0], z[-1], ... z[-7] in place using only
+// additions and subtractions (no twiddle multiplies). In the trailing
+// comments "zN" denotes the original value of z[-N]. Values are read into
+// temporaries before the slots they came from are overwritten, so the
+// statement order must be preserved.
+static __forceinline void iter_54(float *z)
+{
+   float k00,k11,k22,k33;
+   float y0,y1,y2,y3;
+
+   k00  = z[ 0] - z[-4];
+   y0   = z[ 0] + z[-4];
+   y2   = z[-2] + z[-6];
+   k22  = z[-2] - z[-6];
+
+   z[-0] = y0 + y2;      // z0 + z4 + z2 + z6
+   z[-2] = y0 - y2;      // z0 + z4 - z2 - z6
+
+   // done with y0,y2
+
+   k33  = z[-3] - z[-7];
+
+   z[-4] = k00 + k33;    // z0 - z4 + z3 - z7
+   z[-6] = k00 - k33;    // z0 - z4 - z3 + z7
+
+   // done with k33
+
+   k11  = z[-1] - z[-5];
+   y1   = z[-1] + z[-5];
+   y3   = z[-3] + z[-7];
+
+   z[-1] = y1 + y3;      // z1 + z5 + z3 + z7
+   z[-3] = y1 - y3;      // z1 + z5 - z3 - z7
+   z[-5] = k11 - k22;    // z1 - z5 - z2 + z6
+   z[-7] = k11 + k22;    // z1 - z5 + z2 - z6
+}
+
+// Fused final passes of IMDCT step 3, processed 16 floats at a time.
+//   n       number of 16-float groups to process
+//   e       base of the working array
+//   i_off   index of the highest element of the first group
+//   A       twiddle table; only A[base_n >> 3] is read
+//   base_n  value from which the twiddle index base_n >> 3 is derived
+// Walking downwards from e+i_off, each group first has its upper and lower
+// 8 floats combined (sums stay in the upper half; differences go to the
+// lower half, either stored directly or scaled by the single twiddle A2),
+// then iter_54 is applied to each 8-float half.
+static void imdct_step3_inner_s_loop_ld654(int n, float *e, int i_off, float *A, int base_n)
+{
+   int a_off = base_n >> 3;
+   float A2 = A[0+a_off];
+   float *z = e + i_off;
+   float *base = z - 16 * n;
+
+   while (z > base) {
+      float k00,k11;
+
+      k00   = z[-0] - z[-8];
+      k11   = z[-1] - z[-9];
+      z[-0] = z[-0] + z[-8];
+      z[-1] = z[-1] + z[-9];
+      z[-8] =  k00;
+      z[-9] =  k11 ;
+
+      k00    = z[ -2] - z[-10];
+      k11    = z[ -3] - z[-11];
+      z[ -2] = z[ -2] + z[-10];
+      z[ -3] = z[ -3] + z[-11];
+      z[-10] = (k00+k11) * A2;
+      z[-11] = (k11-k00) * A2;
+
+      k00    = z[-12] - z[ -4];  // reverse to avoid a unary negation
+      k11    = z[ -5] - z[-13];
+      z[ -4] = z[ -4] + z[-12];
+      z[ -5] = z[ -5] + z[-13];
+      z[-12] = k11;
+      z[-13] = k00;
+
+      k00    = z[-14] - z[ -6];  // reverse to avoid a unary negation
+      k11    = z[ -7] - z[-15];
+      z[ -6] = z[ -6] + z[-14];
+      z[ -7] = z[ -7] + z[-15];
+      z[-14] = (k00+k11) * A2;
+      z[-15] = (k00-k11) * A2;
+
+      iter_54(z);
+      iter_54(z-8);
+      z -= 16;
+   }
+}
+
+// Optimized inverse MDCT, operating in place on buffer.
+//   buffer     the first n/2 floats are read as input; the final step
+//              writes results across buffer[0..n-1]
+//   n          transform size
+//   f          decoder state supplying the temp allocator and the
+//              precomputed tables f->A, f->B, f->C and f->bit_reverse
+//   blocktype  selects which set of those tables is used
+// A scratch buffer of n/2 floats (buf2) is taken from the temp allocator
+// and released again before returning. The data ping-pongs between buffer
+// and buf2 from step to step, as noted in the comments below.
+static void inverse_mdct(float *buffer, int n, vorb *f, int blocktype)
+{
+   int n2 = n >> 1, n4 = n >> 2, n8 = n >> 3, l;
+   int ld;
+   // @OPTIMIZE: reduce register pressure by using fewer variables?
+   int save_point = temp_alloc_save(f);
+   float *buf2 = (float *) temp_alloc(f, n2 * sizeof(*buf2));
+   float *u=NULL,*v=NULL;
+   // twiddle factors
+   float *A = f->A[blocktype];
+
+   // IMDCT algorithm from "The use of multirate filter banks for coding of high quality digital audio"
+   // See notes about bugs in that paper in less-optimal implementation 'inverse_mdct_old' after this function.
+
+   // kernel from paper
+
+
+   // merged:
+   //   copy and reflect spectral data
+   //   step 0
+
+   // note that it turns out that the items added together during
+   // this step are, in fact, being added to themselves (as reflected
+   // by step 0). inexplicable inefficiency! this became obvious
+   // once I combined the passes.
+
+   // so there's a missing 'times 2' here (for adding X to itself).
+   // this propagates through linearly to the end, where the numbers
+   // are 1/2 too small, and need to be compensated for.
+
+   {
+      float *d,*e, *AA, *e_stop;
+      d = &buf2[n2-2];
+      AA = A;
+      e = &buffer[0];
+      e_stop = &buffer[n2];
+      while (e != e_stop) {
+         d[1] = (e[0] * AA[0] - e[2]*AA[1]);
+         d[0] = (e[0] * AA[1] + e[2]*AA[0]);
+         d -= 2;
+         AA += 2;
+         e += 4;
+      }
+
+      e = &buffer[n2-3];
+      while (d >= buf2) {
+         d[1] = (-e[2] * AA[0] - -e[0]*AA[1]);
+         d[0] = (-e[2] * AA[1] + -e[0]*AA[0]);
+         d -= 2;
+         AA += 2;
+         e -= 4;
+      }
+   }
+
+   // now we use symbolic names for these, so that we can
+   // possibly swap their meaning as we change which operations
+   // are in place
+
+   u = buffer;
+   v = buf2;
+
+   // step 2    (paper output is w, now u)
+   // this could be in place, but the data ends up in the wrong
+   // place... _somebody_'s got to swap it, so this is nominated
+   {
+      float *AA = &A[n2-8];
+      float *d0,*d1, *e0, *e1;
+
+      e0 = &v[n4];
+      e1 = &v[0];
+
+      d0 = &u[n4];
+      d1 = &u[0];
+
+      while (AA >= A) {
+         float v40_20, v41_21;
+
+         v41_21 = e0[1] - e1[1];
+         v40_20 = e0[0] - e1[0];
+         d0[1]  = e0[1] + e1[1];
+         d0[0]  = e0[0] + e1[0];
+         d1[1]  = v41_21*AA[4] - v40_20*AA[5];
+         d1[0]  = v40_20*AA[4] + v41_21*AA[5];
+
+         v41_21 = e0[3] - e1[3];
+         v40_20 = e0[2] - e1[2];
+         d0[3]  = e0[3] + e1[3];
+         d0[2]  = e0[2] + e1[2];
+         d1[3]  = v41_21*AA[0] - v40_20*AA[1];
+         d1[2]  = v40_20*AA[0] + v41_21*AA[1];
+
+         AA -= 8;
+
+         d0 += 4;
+         d1 += 4;
+         e0 += 4;
+         e1 += 4;
+      }
+   }
+
+   // step 3
+   ld = ilog(n) - 1; // ilog is off-by-one from normal definitions
+
+   // optimized step 3:
+
+   // the original step3 loop can be nested r inside s or s inside r;
+   // it's written originally as s inside r, but this is dumb when r
+   // iterates many times, and s few. So I have two copies of it and
+   // switch between them halfway.
+
+   // this is iteration 0 of step 3
+   imdct_step3_iter0_loop(n >> 4, u, n2-1-n4*0, -(n >> 3), A);
+   imdct_step3_iter0_loop(n >> 4, u, n2-1-n4*1, -(n >> 3), A);
+
+   // this is iteration 1 of step 3
+   imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*0, -(n >> 4), A, 16);
+   imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*1, -(n >> 4), A, 16);
+   imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*2, -(n >> 4), A, 16);
+   imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*3, -(n >> 4), A, 16);
+
+   // remaining iterations: the first loop uses the r-style helper, the
+   // second loop switches to the s-style helper
+   l=2;
+   for (; l < (ld-3)>>1; ++l) {
+      int k0 = n >> (l+2), k0_2 = k0>>1;
+      int lim = 1 << (l+1);
+      int i;
+      for (i=0; i < lim; ++i)
+         imdct_step3_inner_r_loop(n >> (l+4), u, n2-1 - k0*i, -k0_2, A, 1 << (l+3));
+   }
+
+   for (; l < ld-6; ++l) {
+      int k0 = n >> (l+2), k1 = 1 << (l+3), k0_2 = k0>>1;
+      int rlim = n >> (l+6), r;
+      int lim = 1 << (l+1);
+      int i_off;
+      float *A0 = A;
+      i_off = n2-1;
+      for (r=rlim; r > 0; --r) {
+         imdct_step3_inner_s_loop(lim, u, i_off, -k0_2, A0, k1, k0);
+         A0 += k1*4;
+         i_off -= 8;
+      }
+   }
+
+   // iterations with count:
+   //   ld-6,-5,-4 all interleaved together
+   //       the big win comes from getting rid of needless flops
+   //         due to the constants on pass 5 & 4 being all 1 and 0;
+   //       combining them to be simultaneous to improve cache made little difference
+   imdct_step3_inner_s_loop_ld654(n >> 5, u, n2-1, A, n);
+
+   // output is u
+
+   // step 4, 5, and 6
+   // cannot be in-place because of step 5
+   {
+      uint16 *bitrev = f->bit_reverse[blocktype];
+      // weirdly, I'd have thought reading sequentially and writing
+      // erratically would have been better than vice-versa, but in
+      // fact that's not what my testing showed. (That is, with
+      // j = bitreverse(i), do you read i and write j, or read j and write i.)
+
+      float *d0 = &v[n4-4];
+      float *d1 = &v[n2-4];
+      while (d0 >= v) {
+         int k4;
+
+         k4 = bitrev[0];
+         d1[3] = u[k4+0];
+         d1[2] = u[k4+1];
+         d0[3] = u[k4+2];
+         d0[2] = u[k4+3];
+
+         k4 = bitrev[1];
+         d1[1] = u[k4+0];
+         d1[0] = u[k4+1];
+         d0[1] = u[k4+2];
+         d0[0] = u[k4+3];
+         
+         d0 -= 4;
+         d1 -= 4;
+         bitrev += 2;
+      }
+   }
+   // (paper output is u, now v)
+
+
+   // data must be in buf2
+   assert(v == buf2);
+
+   // step 7   (paper output is v, now v)
+   // this is now in place
+   {
+      float *C = f->C[blocktype];
+      float *d, *e;
+
+      d = v;
+      e = v + n2 - 4;
+
+      // d walks up from the start and e walks down from the end until
+      // they meet
+      while (d < e) {
+         float a02,a11,b0,b1,b2,b3;
+
+         a02 = d[0] - e[2];
+         a11 = d[1] + e[3];
+
+         b0 = C[1]*a02 + C[0]*a11;
+         b1 = C[1]*a11 - C[0]*a02;
+
+         b2 = d[0] + e[ 2];
+         b3 = d[1] - e[ 3];
+
+         d[0] = b2 + b0;
+         d[1] = b3 + b1;
+         e[2] = b2 - b0;
+         e[3] = b1 - b3;
+
+         a02 = d[2] - e[0];
+         a11 = d[3] + e[1];
+
+         b0 = C[3]*a02 + C[2]*a11;
+         b1 = C[3]*a11 - C[2]*a02;
+
+         b2 = d[2] + e[ 0];
+         b3 = d[3] - e[ 1];
+
+         d[2] = b2 + b0;
+         d[3] = b3 + b1;
+         e[0] = b2 - b0;
+         e[1] = b1 - b3;
+
+         C += 4;
+         d += 4;
+         e -= 4;
+      }
+   }
+
+   // data must be in buf2
+
+
+   // step 8+decode   (paper output is X, now buffer)
+   // this generates pairs of data a la 8 and pushes them directly through
+   // the decode kernel (pushing rather than pulling) to avoid having
+   // to make another pass later
+
+   // this cannot POSSIBLY be in place, so we refer to the buffers directly
+
+   {
+      float *d0,*d1,*d2,*d3;
+
+      float *B = f->B[blocktype] + n2 - 8;
+      float *e = buf2 + n2 - 8;
+      // d0/d2 write upwards from buffer[0] and buffer[n2];
+      // d1/d3 write downwards from buffer[n2-4] and buffer[n-4]
+      d0 = &buffer[0];
+      d1 = &buffer[n2-4];
+      d2 = &buffer[n2];
+      d3 = &buffer[n-4];
+      while (e >= v) {
+         float p0,p1,p2,p3;
+
+         p3 =  e[6]*B[7] - e[7]*B[6];
+         p2 = -e[6]*B[6] - e[7]*B[7]; 
+
+         d0[0] =   p3;
+         d1[3] = - p3;
+         d2[0] =   p2;
+         d3[3] =   p2;
+
+         p1 =  e[4]*B[5] - e[5]*B[4];
+         p0 = -e[4]*B[4] - e[5]*B[5]; 
+
+         d0[1] =   p1;
+         d1[2] = - p1;
+         d2[1] =   p0;
+         d3[2] =   p0;
+
+         p3 =  e[2]*B[3] - e[3]*B[2];
+         p2 = -e[2]*B[2] - e[3]*B[3]; 
+
+         d0[2] =   p3;
+         d1[1] = - p3;
+         d2[2] =   p2;
+         d3[1] =   p2;
+
+         p1 =  e[0]*B[1] - e[1]*B[0];
+         p0 = -e[0]*B[0] - e[1]*B[1]; 
+
+         d0[3] =   p1;
+         d1[0] = - p1;
+         d2[3] =   p0;
+         d3[0] =   p0;
+
+         B -= 8;
+         e -= 8;
+         d0 += 4;
+         d2 += 4;
+         d1 -= 4;
+         d3 -= 4;
+      }
+   }
+
+   temp_free(f,buf2);
+   temp_alloc_restore(f,save_point);
+}
+
+#if 0
+// Reference (unoptimized) inverse MDCT, disabled by the surrounding #if 0: this is the original version of the above code, if you want to optimize it from scratch. Works in place: reads buffer[0..n/2-1] and overwrites buffer[0..n-1].
+void inverse_mdct_naive(float *buffer, int n)
+{
+   float s;
+   float A[1 << 12], B[1 << 12], C[1 << 11]; // twiddle tables; the setup loops below write A and B up to index n/2-1 and C up to n/4-1
+   int i,k,k2,k4, n2 = n >> 1, n4 = n >> 2, n8 = n >> 3, l;
+   int n3_4 = n - n4, ld;
+   // how can they claim this only uses N words?!
+   // oh, because they're only used sparsely, whoops
+   float u[1 << 13], X[1 << 13], v[1 << 13], w[1 << 13]; // fixed-size scratch; u, v and w are indexed up to n-1, so n must not exceed 1<<13
+   // set up twiddle factors
+
+   for (k=k2=0; k < n4; ++k,k2+=2) {
+      A[k2  ] = (float)  cos(4*k*M_PI/n);
+      A[k2+1] = (float) -sin(4*k*M_PI/n);
+      B[k2  ] = (float)  cos((k2+1)*M_PI/n/2);
+      B[k2+1] = (float)  sin((k2+1)*M_PI/n/2);
+   }
+   for (k=k2=0; k < n8; ++k,k2+=2) {
+      C[k2  ] = (float)  cos(2*(k2+1)*M_PI/n);
+      C[k2+1] = (float) -sin(2*(k2+1)*M_PI/n);
+   }
+
+   // IMDCT algorithm from "The use of multirate filter banks for coding of high quality digital audio"
+   // Note there are bugs in that pseudocode, presumably due to them attempting
+   // to rename the arrays nicely rather than representing the way their actual
+   // implementation bounces buffers back and forth. As a result, even in the
+   // "some formulars corrected" version, a direct implementation fails. These
+   // are noted below as "paper bug".
+
+   // copy and reflect spectral data
+   for (k=0; k < n2; ++k) u[k] = buffer[k]; // lower half of u: the n/2 input values as-is
+   for (   ; k < n ; ++k) u[k] = -buffer[n - k - 1]; // upper half of u: the same values reversed and negated
+   // kernel from paper
+   // step 1
+   for (k=k2=k4=0; k < n4; k+=1, k2+=2, k4+=4) {
+      v[n-k4-1] = (u[k4] - u[n-k4-1]) * A[k2]   - (u[k4+2] - u[n-k4-3])*A[k2+1];
+      v[n-k4-3] = (u[k4] - u[n-k4-1]) * A[k2+1] + (u[k4+2] - u[n-k4-3])*A[k2];
+   }
+   // step 2
+   for (k=k4=0; k < n8; k+=1, k4+=4) {
+      w[n2+3+k4] = v[n2+3+k4] + v[k4+3];
+      w[n2+1+k4] = v[n2+1+k4] + v[k4+1];
+      w[k4+3]    = (v[n2+3+k4] - v[k4+3])*A[n2-4-k4] - (v[n2+1+k4]-v[k4+1])*A[n2-3-k4];
+      w[k4+1]    = (v[n2+1+k4] - v[k4+1])*A[n2-4-k4] + (v[n2+3+k4]-v[k4+3])*A[n2-3-k4];
+   }
+   // step 3
+   ld = ilog(n) - 1; // ilog is off-by-one from normal definitions
+   for (l=0; l < ld-3; ++l) {
+      int k0 = n >> (l+2), k1 = 1 << (l+3);
+      int rlim = n >> (l+4), r4, r;
+      int s2lim = 1 << (l+2), s2;
+      for (r=r4=0; r < rlim; r4+=4,++r) {
+         for (s2=0; s2 < s2lim; s2+=2) {
+            u[n-1-k0*s2-r4] = w[n-1-k0*s2-r4] + w[n-1-k0*(s2+1)-r4];
+            u[n-3-k0*s2-r4] = w[n-3-k0*s2-r4] + w[n-3-k0*(s2+1)-r4];
+            u[n-1-k0*(s2+1)-r4] = (w[n-1-k0*s2-r4] - w[n-1-k0*(s2+1)-r4]) * A[r*k1]
+                                - (w[n-3-k0*s2-r4] - w[n-3-k0*(s2+1)-r4]) * A[r*k1+1];
+            u[n-3-k0*(s2+1)-r4] = (w[n-3-k0*s2-r4] - w[n-3-k0*(s2+1)-r4]) * A[r*k1]
+                                + (w[n-1-k0*s2-r4] - w[n-1-k0*(s2+1)-r4]) * A[r*k1+1];
+         }
+      }
+      if (l+1 < ld-3) {
+         // paper bug: ping-ponging of u&w here is omitted
+         memcpy(w, u, sizeof(u)); // copies the whole fixed-size array, not just the first n entries
+      }
+   }
+
+   // step 4
+   for (i=0; i < n8; ++i) {
+      int j = bit_reverse(i) >> (32-ld+3); // presumably bit_reverse() reverses all 32 bits, so the shift keeps the top ld-3 of them -- TODO confirm against bit_reverse()
+      assert(j < n8);
+      if (i == j) {
+         // paper bug: original code probably swapped in place; if copying,
+         //            need to directly copy in this case
+         int i8 = i << 3;
+         v[i8+1] = u[i8+1];
+         v[i8+3] = u[i8+3];
+         v[i8+5] = u[i8+5];
+         v[i8+7] = u[i8+7];
+      } else if (i < j) { // each (i,j) pair is exchanged once, when visited from its smaller index
+         int i8 = i << 3, j8 = j << 3;
+         v[j8+1] = u[i8+1], v[i8+1] = u[j8 + 1];
+         v[j8+3] = u[i8+3], v[i8+3] = u[j8 + 3];
+         v[j8+5] = u[i8+5], v[i8+5] = u[j8 + 5];
+         v[j8+7] = u[i8+7], v[i8+7] = u[j8 + 7];
+      }
+   }
+   // step 5
+   for (k=0; k < n2; ++k) {
+      w[k] = v[k*2+1]; // keeps only the odd-indexed entries of v
+   }
+   // step 6
+   for (k=k2=k4=0; k < n8; ++k, k2 += 2, k4 += 4) {
+      u[n-1-k2] = w[k4];
+      u[n-2-k2] = w[k4+1];
+      u[n3_4 - 1 - k2] = w[k4+2];
+      u[n3_4 - 2 - k2] = w[k4+3];
+   }
+   // step 7
+   for (k=k2=0; k < n8; ++k, k2 += 2) {
+      v[n2 + k2 ] = ( u[n2 + k2] + u[n-2-k2] + C[k2+1]*(u[n2+k2]-u[n-2-k2]) + C[k2]*(u[n2+k2+1]+u[n-2-k2+1]))/2;
+      v[n-2 - k2] = ( u[n2 + k2] + u[n-2-k2] - C[k2+1]*(u[n2+k2]-u[n-2-k2]) - C[k2]*(u[n2+k2+1]+u[n-2-k2+1]))/2;
+      v[n2+1+ k2] = ( u[n2+1+k2] - u[n-1-k2] + C[k2+1]*(u[n2+1+k2]+u[n-1-k2]) - C[k2]*(u[n2+k2]-u[n-2-k2]))/2;
+      v[n-1 - k2] = (-u[n2+1+k2] + u[n-1-k2] + C[k2+1]*(u[n2+1+k2]+u[n-1-k2]) - C[k2]*(u[n2+k2]-u[n-2-k2]))/2;
+   }
+   // step 8
+   for (k=k2=0; k < n4; ++k,k2 += 2) {
+      X[k]      = v[k2+n2]*B[k2  ] + v[k2+1+n2]*B[k2+1];
+      X[n2-1-k] = v[k2+n2]*B[k2+1] - v[k2+1+n2]*B[k2  ];
+   }
+
+   // decode kernel to output
+   // determined the following value experimentally
+   // (by first figuring out what made inverse_mdct_slow work); then matching that here
+   // (probably vorbis encoder premultiplies by n or n/2, to save it on the decoder?)
+   s = 0.5; // theoretically would be n4
+
+   // [[[ note! the s value of 0.5 is compensated for by the B[] in the current code,
+   //     so it needs to use the "old" B values to behave correctly, or else
+   //     set s to 1.0 ]]]
+   for (i=0; i < n4  ; ++i) buffer[i] = s * X[i+n4]; // buffer[0..n/4-1]    =  s * X[n/4..n/2-1]
+   for (   ; i < n3_4; ++i) buffer[i] = -s * X[n3_4 - i - 1]; // buffer[n/4..3n/4-1] = -s * X[n/2-1..0], i.e. X reversed
+   for (   ; i < n   ; ++i) buffer[i] = -s * X[i - n3_4]; // buffer[3n/4..n-1]   = -s * X[0..n/4-1]
+}
+#endif
+
+// Map a half-block length to its window table: window[0] when 2*len equals
+// blocksize_0, otherwise window[1] when it equals blocksize_1. Any other
+// length trips assert(0) and, with asserts disabled, returns NULL.
+static float *get_window(vorb *f, int len)
+{
+   const int blocksize = len << 1;
+
+   if (blocksize != f->blocksize_0 && blocksize != f->blocksize_1) {
+      assert(0);
+      return NULL;
+   }
+   return f->window[blocksize == f->blocksize_0 ? 0 : 1];
+}
+
+// YTYPE is the element type of the finalY array passed to do_floor():
+// int16 unless STB_VORBIS_NO_DEFER_FLOOR is defined, in which case plain int.
+#ifdef STB_VORBIS_NO_DEFER_FLOOR
+typedef int YTYPE;
+#else
+typedef int16 YTYPE;
+#endif
+
+// Run channel i's floor curve over target[0..n/2): visit the floor points in
+// g->sorted_order, call draw_line() between each pair of consecutive used
+// points, then apply the last point's height with LINE_OP() out to n/2.
+// A point counts as used when finalY[j] >= 0 (default build) or when
+// step2_flag[j] is set (STB_VORBIS_NO_DEFER_FLOOR build).
+// Returns TRUE, or the result of error(f, VORBIS_invalid_stream) when the
+// floor selected for this channel has type 0.
+static int do_floor(vorb *f, Mapping *map, int i, int n, float *target, YTYPE *finalY, uint8 *step2_flag)
+{
+   const int half = n >> 1;
+   const int floor_index = map->submap_floor[map->chan[i].mux];
+   Floor1 *g;
+   int q, x, prev_x, prev_y;
+
+   if (f->floor_types[floor_index] == 0)
+      return error(f, VORBIS_invalid_stream);
+
+   g = &f->floor_config[floor_index].floor1;
+   prev_x = 0;
+   prev_y = finalY[0] * g->floor1_multiplier;
+   for (q = 1; q < g->values; ++q) {
+      const int j = g->sorted_order[q];
+      int cur_x, cur_y;
+      // skip points that are not marked as used
+      #ifndef STB_VORBIS_NO_DEFER_FLOOR
+      if (finalY[j] < 0) continue;
+      #else
+      if (!step2_flag[j]) continue;
+      #endif
+      cur_y = finalY[j] * g->floor1_multiplier;
+      cur_x = g->Xlist[j];
+      if (prev_x != cur_x)
+         draw_line(target, prev_x,prev_y, cur_x,cur_y, half);
+      CHECK(f);
+      prev_x = cur_x;
+      prev_y = cur_y;
+   }
+   if (prev_x < half) {
+      // optimization of: draw_line(target, prev_x,prev_y, n,prev_y, half);
+      for (x = prev_x; x < half; ++x)
+         LINE_OP(target[x], inverse_db_table[prev_y]);
+      CHECK(f);
+   }
+   return TRUE;
+}
+
+// The meaning of "left" and "right"
+//
+// For a given frame:
+//     we compute samples from 0..n
+//     window_center is n/2
+//     we'll window and mix the samples from left_start to left_end with data from the previous frame
+//     all of the samples from left_end to right_start can be output without mixing; however,
+//        this interval is 0-length except when transitioning between short and long frames
+//     all of the samples from right_start to right_end need to be mixed with the next frame,
+//        which we don't have, so those get saved in a buffer
+//     frame N's right_end-right_start, the number of samples to mix with the next frame,
+//        has to be the same as frame N+1's left_end-left_start (which they are by
+//        construction)
+
+// Begin decoding the next packet: locate it, read its mode number and the
+// window flags, and derive the window boundaries.
+// Outputs: *mode receives the mode index; *p_left_start/*p_left_end and
+// *p_right_start/*p_right_end receive the left and right window regions.
+// A packet whose leading type bit is set is skipped in pull mode and reported
+// as VORBIS_bad_packet_type in push mode.
+// Returns TRUE on success; FALSE when f->eof is set, when no packet can be
+// started, or when the mode number reads as EOP or is >= f->mode_count.
+static int vorbis_decode_initial(vorb *f, int *p_left_start, int *p_left_end, int *p_right_start, int *p_right_end, int *mode)
+{
+   Mode *cfg;
+   int mode_index, blocksize, half, prev_flag, next_flag;
+   int short_left, short_right;
+   f->channel_buffer_start = f->channel_buffer_end = 0;
+
+   for (;;) {
+      if (f->eof) return FALSE;
+      if (!maybe_start_packet(f))
+         return FALSE;
+      // check packet type: only a clear type bit is decoded here
+      if (get_bits(f,1) == 0)
+         break;
+      if (IS_PUSH_MODE(f))
+         return error(f,VORBIS_bad_packet_type);
+      // drain the rest of this packet, then try the next one
+      while (EOP != get8_packet(f));
+   }
+
+   if (f->alloc.alloc_buffer)
+      assert(f->alloc.alloc_buffer_length_in_bytes == f->temp_offset);
+
+   mode_index = get_bits(f, ilog(f->mode_count-1));
+   if (mode_index == EOP || mode_index >= f->mode_count) return FALSE;
+   *mode = mode_index;
+   cfg = &f->mode_config[mode_index];
+   if (cfg->blockflag) {
+      // only blockflag modes carry the two extra window flag bits
+      blocksize = f->blocksize_1;
+      prev_flag = get_bits(f,1);
+      next_flag = get_bits(f,1);
+   } else {
+      blocksize = f->blocksize_0;
+      prev_flag = next_flag = 0;
+   }
+
+// WINDOWING
+
+   half = blocksize >> 1;
+   // a blockflag frame whose flag on that side is clear uses a window region
+   // blocksize_0/2 wide, centred on the quarter / three-quarter point of the frame
+   short_left  = cfg->blockflag && !prev_flag;
+   short_right = cfg->blockflag && !next_flag;
+
+   *p_left_start  = short_left  ? (blocksize - f->blocksize_0) >> 2 : 0;
+   *p_left_end    = short_left  ? (blocksize + f->blocksize_0) >> 2 : half;
+   *p_right_start = short_right ? (blocksize*3 - f->blocksize_0) >> 2 : half;
+   *p_right_end   = short_right ? (blocksize*3 + f->blocksize_0) >> 2 : blocksize;
+
+   return TRUE;
+}
+
+static int vorbis_decode_packet_rest(vorb *f, int *len, Mode *m, int left_start, int left_end, int right_start, int right_end, int *p_left)
+{
+   Mapping *map;
+   int i,j,k,n,n2;
+   int zero_channel[256]; // per channel: TRUE when no floor was decoded; cleared again below for live coupled pairs
+   int really_zero_channel[256]; // copy of zero_channel taken before the coupling pass re-enables channels
+   // Decodes the body of one packet for mode m into f->channel_buffers: floors, residue, inverse coupling, floor application, inverse MDCT, then sample-position bookkeeping. Sets *len, may move the left edge via *p_left, and returns TRUE or the result of error() for a type-0 floor. left_end is not referenced in this function.
+// WINDOWING
+
+   n = f->blocksize[m->blockflag];
+   map = &f->mapping[m->mapping];
+
+// FLOORS
+   n2 = n >> 1;
+
+   CHECK(f);
+
+   for (i=0; i < f->channels; ++i) {
+      int s = map->chan[i].mux, floor;
+      zero_channel[i] = FALSE;
+      floor = map->submap_floor[s];
+      if (f->floor_types[floor] == 0) {
+         return error(f, VORBIS_invalid_stream);
+      } else {
+         Floor1 *g = &f->floor_config[floor].floor1;
+         if (get_bits(f, 1)) { // set bit: floor values follow; clear bit: the channel is marked zero in the else branch
+            short *finalY;
+            uint8 step2_flag[256];
+            static int range_list[4] = { 256, 128, 86, 64 };
+            int range = range_list[g->floor1_multiplier-1]; // table is indexed by floor1_multiplier-1
+            int offset = 2; // finalY[0] and finalY[1] are read directly; codebook-decoded values start at index 2
+            finalY = f->finalY[i];
+            finalY[0] = get_bits(f, ilog(range)-1);
+            finalY[1] = get_bits(f, ilog(range)-1);
+            for (j=0; j < g->partitions; ++j) {
+               int pclass = g->partition_class_list[j];
+               int cdim = g->class_dimensions[pclass];
+               int cbits = g->class_subclasses[pclass];
+               int csub = (1 << cbits)-1;
+               int cval = 0;
+               if (cbits) {
+                  Codebook *c = f->codebooks + g->class_masterbooks[pclass];
+                  DECODE(cval,f,c);
+               }
+               for (k=0; k < cdim; ++k) {
+                  int book = g->subclass_books[pclass][cval & csub]; // low cbits of cval pick the subclass book for this value
+                  cval = cval >> cbits;
+                  if (book >= 0) {
+                     int temp;
+                     Codebook *c = f->codebooks + book;
+                     DECODE(temp,f,c);
+                     finalY[offset++] = temp;
+                  } else
+                     finalY[offset++] = 0;
+               }
+            }
+            if (f->valid_bits == INVALID_BITS) goto error; // behavior according to spec
+            step2_flag[0] = step2_flag[1] = 1; // the first two points are always flagged
+            for (j=2; j < g->values; ++j) {
+               int low, high, pred, highroom, lowroom, room, val;
+               low = g->neighbors[j][0];
+               high = g->neighbors[j][1];
+               //neighbors(g->Xlist, j, &low, &high);
+               pred = predict_point(g->Xlist[j], g->Xlist[low], g->Xlist[high], finalY[low], finalY[high]);
+               val = finalY[j];
+               highroom = range - pred;
+               lowroom = pred;
+               if (highroom < lowroom)
+                  room = highroom * 2;
+               else
+                  room = lowroom * 2;
+               if (val) { // nonzero: turn val into an offset from pred and flag this point plus both neighbors
+                  step2_flag[low] = step2_flag[high] = 1;
+                  step2_flag[j] = 1;
+                  if (val >= room)
+                     if (highroom > lowroom)
+                        finalY[j] = val - lowroom + pred;
+                     else
+                        finalY[j] = pred - val + highroom - 1;
+                  else
+                     if (val & 1)
+                        finalY[j] = pred - ((val+1)>>1);
+                     else
+                        finalY[j] = pred + (val>>1);
+               } else { // zero: take the predicted value and leave the point unflagged
+                  step2_flag[j] = 0;
+                  finalY[j] = pred;
+               }
+            }
+
+#ifdef STB_VORBIS_NO_DEFER_FLOOR
+            do_floor(f, map, i, n, f->floor_buffers[i], finalY, step2_flag);
+#else
+            // defer final floor computation until _after_ residue
+            for (j=0; j < g->values; ++j) {
+               if (!step2_flag[j])
+                  finalY[j] = -1; // -1 marks an unflagged point for the deferred do_floor() call below
+            }
+#endif
+         } else {
+           error: // also the target of the goto above (valid_bits == INVALID_BITS)
+            zero_channel[i] = TRUE;
+         }
+         // So we just defer everything else to later
+
+         // at this point we've decoded the floor into buffer
+      }
+   }
+   CHECK(f);
+   // at this point we've decoded all floors
+
+   if (f->alloc.alloc_buffer)
+      assert(f->alloc.alloc_buffer_length_in_bytes == f->temp_offset);
+
+   // re-enable coupled channels if necessary
+   memcpy(really_zero_channel, zero_channel, sizeof(really_zero_channel[0]) * f->channels);
+   for (i=0; i < map->coupling_steps; ++i)
+      if (!zero_channel[map->chan[i].magnitude] || !zero_channel[map->chan[i].angle]) { // if either channel of the pair is live, both get decoded
+         zero_channel[map->chan[i].magnitude] = zero_channel[map->chan[i].angle] = FALSE;
+      }
+
+   CHECK(f);
+// RESIDUE DECODE
+   for (i=0; i < map->submaps; ++i) {
+      float *residue_buffers[STB_VORBIS_MAX_CHANNELS];
+      int r;
+      uint8 do_not_decode[256];
+      int ch = 0; // number of channels assigned to submap i
+      for (j=0; j < f->channels; ++j) {
+         if (map->chan[j].mux == i) {
+            if (zero_channel[j]) {
+               do_not_decode[ch] = TRUE;
+               residue_buffers[ch] = NULL;
+            } else {
+               do_not_decode[ch] = FALSE;
+               residue_buffers[ch] = f->channel_buffers[j];
+            }
+            ++ch;
+         }
+      }
+      r = map->submap_residue[i];
+      decode_residue(f, residue_buffers, ch, n2, r, do_not_decode);
+   }
+
+   if (f->alloc.alloc_buffer)
+      assert(f->alloc.alloc_buffer_length_in_bytes == f->temp_offset);
+   CHECK(f);
+
+// INVERSE COUPLING
+   for (i = map->coupling_steps-1; i >= 0; --i) { // coupling steps are undone in reverse order
+      int n2 = n >> 1; // shadows the outer n2 (same value)
+      float *m = f->channel_buffers[map->chan[i].magnitude]; // NOTE: shadows the Mode *m parameter inside this loop body
+      float *a = f->channel_buffers[map->chan[i].angle    ];
+      for (j=0; j < n2; ++j) {
+         float a2,m2;
+         if (m[j] > 0)
+            if (a[j] > 0)
+               m2 = m[j], a2 = m[j] - a[j];
+            else
+               a2 = m[j], m2 = m[j] + a[j];
+         else
+            if (a[j] > 0)
+               m2 = m[j], a2 = m[j] + a[j];
+            else
+               a2 = m[j], m2 = m[j] - a[j];
+         m[j] = m2;
+         a[j] = a2;
+      }
+   }
+   CHECK(f);
+
+   // finish decoding the floors
+#ifndef STB_VORBIS_NO_DEFER_FLOOR
+   for (i=0; i < f->channels; ++i) {
+      if (really_zero_channel[i]) {
+         memset(f->channel_buffers[i], 0, sizeof(*f->channel_buffers[i]) * n2);
+      } else {
+         do_floor(f, map, i, n, f->channel_buffers[i], f->finalY[i], NULL); // return value is ignored here
+      }
+   }
+#else
+   for (i=0; i < f->channels; ++i) {
+      if (really_zero_channel[i]) {
+         memset(f->channel_buffers[i], 0, sizeof(*f->channel_buffers[i]) * n2);
+      } else {
+         for (j=0; j < n2; ++j)
+            f->channel_buffers[i][j] *= f->floor_buffers[i][j];
+      }
+   }
+#endif
+
+// INVERSE MDCT
+   CHECK(f);
+   for (i=0; i < f->channels; ++i)
+      inverse_mdct(f->channel_buffers[i], n, f, m->blockflag); // m is the Mode parameter again here
+   CHECK(f);
+
+   // this shouldn't be necessary, unless we exited on an error
+   // and want to flush to get to the next packet
+   flush_packet(f);
+
+   if (f->first_decode) {
+      // assume we start so first non-discarded sample is sample 0
+      // this isn't to spec, but spec would require us to read ahead
+      // and decode the size of all current frames--could be done,
+      // but presumably it's not a commonly used feature
+      f->current_loc = -n2; // start of first frame is positioned for discard
+      // we might have to discard samples "from" the next frame too,
+      // if we're lapping a large block then a small at the start?
+      f->discard_samples_deferred = n - right_end;
+      f->current_loc_valid = TRUE;
+      f->first_decode = FALSE;
+   } else if (f->discard_samples_deferred) {
+      if (f->discard_samples_deferred >= right_start - left_start) { // the whole returnable span of this frame is discarded
+         f->discard_samples_deferred -= (right_start - left_start);
+         left_start = right_start;
+         *p_left = left_start;
+      } else { // only part of the span is discarded: advance the left edge past it
+         left_start += f->discard_samples_deferred;
+         *p_left = left_start;
+         f->discard_samples_deferred = 0;
+      }
+   } else if (f->previous_length == 0 && f->current_loc_valid) {
+      // we're recovering from a seek... that means we're going to discard
+      // the samples from this packet even though we know our position from
+      // the last page header, so we need to update the position based on
+      // the discarded samples here
+      // but wait, the code below is going to add this in itself even
+      // on a discard, so we don't need to do it here...
+   }
+
+   // check if we have ogg information about the sample # for this packet
+   if (f->last_seg_which == f->end_seg_with_known_loc) {
+      // if we have a valid current loc, and this is final:
+      if (f->current_loc_valid && (f->page_flag & PAGEFLAG_last_page)) {
+         uint32 current_end = f->known_loc_for_packet - (n-right_end);
+         // then let's infer the size of the (probably) short final frame
+         if (current_end < f->current_loc + (right_end-left_start)) {
+            if (current_end < f->current_loc) {
+               // negative truncation, that's impossible!
+               *len = 0;
+            } else {
+               *len = current_end - f->current_loc;
+            }
+            *len += left_start;
+            if (*len > right_end) *len = right_end; // this should never happen
+            f->current_loc += *len;
+            return TRUE; // early exit: the final assert/CHECK and the *len = right_end assignment below are skipped
+         }
+      }
+      // otherwise, just set our sample loc
+      // guess that the ogg granule pos refers to the _middle_ of the
+      // last frame?
+      // set f->current_loc to the position of left_start
+      f->current_loc = f->known_loc_for_packet - (n2-left_start);
+      f->current_loc_valid = TRUE;
+   }
+   if (f->current_loc_valid)
+      f->current_loc += (right_start - left_start);
+
+   if (f->alloc.alloc_buffer)
+      assert(f->alloc.alloc_buffer_length_in_bytes == f->temp_offset);
+   *len = right_end;  // ignore samples after the window goes to 0
+   CHECK(f);
+
+   return TRUE;
+}
+
+// Decode one packet in two stages: vorbis_decode_initial() reads the packet
+// header and window boundaries (the window starts land in *p_left and
+// *p_right), then vorbis_decode_packet_rest() decodes the body for the
+// selected mode. Returns 0 if the first stage fails, otherwise whatever the
+// second stage returns.
+static int vorbis_decode_packet(vorb *f, int *len, int *p_left, int *p_right)
+{
+   int which_mode, left_stop, right_stop;
+
+   if (vorbis_decode_initial(f, p_left, &left_stop, p_right, &right_stop, &which_mode))
+      return vorbis_decode_packet_rest(f, len, &f->mode_config[which_mode], *p_left, left_stop, *p_right, right_stop, p_left);
+   return 0;
+}
+
+// Mix the frame just decoded into f->channel_buffers with the tail saved from
+// the previous frame, then save this frame's tail (from 'right' up to 'len')
+// in f->previous_window for the next call.
+// Returns right-left, the number of samples ready at channel_buffers[c][left..],
+// and adds that to f->samples_output. Returns 0 when there was no previous
+// frame, or when no window matches the saved length.
+static int vorbis_finish_frame(stb_vorbis *f, int len, int left, int right)
+{
+   int prev,i,j;
+   // we use right&left (the start of the right- and left-window sin()-regions)
+   // to determine how much to return, rather than inferring from the rules
+   // (same result, clearer code); 'left' indicates where our sin() window
+   // starts, therefore where the previous window's right edge starts, and
+   // therefore where to start mixing from the previous buffer. 'right'
+   // indicates where our sin() ending-window starts, therefore that's where
+   // we start saving, and where our returned-data ends.
+
+   // mixin from previous window
+   if (f->previous_length) {
+      int n = f->previous_length;
+      float *w = get_window(f, n);
+      // get_window() returns NULL for a length matching neither block size
+      // (its assert disappears under NDEBUG); give up on this frame rather
+      // than dereference a null window
+      if (w == NULL) return 0;
+      for (i=0; i < f->channels; ++i) {
+         for (j=0; j < n; ++j)
+            f->channel_buffers[i][left+j] =
+               f->channel_buffers[i][left+j]*w[    j] +
+               f->previous_window[i][     j]*w[n-1-j];
+      }
+   }
+
+   prev = f->previous_length;
+
+   // last half of this data becomes previous window
+   f->previous_length = len - right;
+
+   // @OPTIMIZE: could avoid this copy by double-buffering the
+   // output (flipping previous_window with channel_buffers), but
+   // then previous_window would have to be 2x as large, and
+   // channel_buffers couldn't be temp mem (although they're NOT
+   // currently temp mem, they could be (unless we want to level
+   // performance by spreading out the computation))
+   for (i=0; i < f->channels; ++i)
+      for (j=0; right+j < len; ++j)
+         f->previous_window[i][j] = f->channel_buffers[i][right+j];
+
+   if (!prev)
+      // there was no previous packet, so this data isn't valid...
+      // this isn't entirely true, only the would-have-overlapped data
+      // isn't valid, but this seems to be what the spec requires
+      return 0;
+
+   // truncate a short frame
+   if (len < right) right = len;
+
+   f->samples_output += right-left;
+
+   return right - left;
+}
+
+// Decode one packet and pass it through vorbis_finish_frame(); the sample
+// count that call returns is ignored. Does nothing further if no packet
+// could be decoded.
+static void vorbis_pump_first_frame(stb_vorbis *f)
+{
+   int frame_len, left_start, right_start;
+
+   if (!vorbis_decode_packet(f, &frame_len, &left_start, &right_start))
+      return;
+   vorbis_finish_frame(f, frame_len, left_start, right_start);
+}
+
+#ifndef STB_VORBIS_NO_PUSHDATA_API
+// Push-mode check that every byte of the next packet lies within
+// [f->stream, f->stream_end), following the packet across page boundaries.
+// Walks the remaining segments of the current page from f->next_seg, then,
+// while the packet continues, validates each following page header and walks
+// its segment table. When end_page is nonzero the packet must end in the last
+// segment of its page or run past it.
+// Returns TRUE if the packet is complete; otherwise the result of error()
+// with VORBIS_need_more_data (ran out of buffered bytes) or
+// VORBIS_invalid_stream (bad page header or flags).
+static int is_whole_packet_present(stb_vorbis *f, int end_page)
+{
+   // instead of coding this out explicitly, we could save the current read state,
+   // read the next packet with get8() until end-of-packet, check f->eof, then
+   // reset the state? but that would be slower, esp. since we'd have over 256 bytes
+   // of state to restore (primarily the page segment table)
+
+   uint8 *cursor = f->stream;
+   int seg = f->next_seg;
+   int at_packet_start = TRUE;
+
+   if (seg != -1) { // if we're not starting the packet with a 'continue on next page' flag
+      while (seg < f->segment_count) {
+         int seg_len = f->segments[seg];
+         cursor += seg_len;
+         if (seg_len < 255) // stop at first short segment
+            break;
+         ++seg;
+      }
+      // either this continues, or it ends it...
+      if (end_page && seg < f->segment_count-1)
+         return error(f, VORBIS_invalid_stream);
+      if (seg == f->segment_count)
+         seg = -1; // set 'crosses page' flag
+      if (cursor > f->stream_end)
+         return error(f, VORBIS_need_more_data);
+      at_packet_start = FALSE;
+   }
+
+   while (seg == -1) {
+      uint8 *lacing;
+      int count, continued;
+
+      // check that we have the page header ready
+      if (cursor + 26 >= f->stream_end)
+         return error(f, VORBIS_need_more_data);
+      // validate the page
+      if (memcmp(cursor, ogg_page_header, 4) != 0)
+         return error(f, VORBIS_invalid_stream);
+      if (cursor[4] != 0)
+         return error(f, VORBIS_invalid_stream);
+
+      continued = (cursor[5] & PAGEFLAG_continued_packet) != 0;
+      if (at_packet_start) {
+         // the first segment must NOT have 'continued_packet', later ones MUST;
+         // if no previous length, we're resynching, so we can come in on a
+         // continued-packet, which we'll just drop
+         if (f->previous_length && continued)
+            return error(f, VORBIS_invalid_stream);
+      } else if (!continued) {
+         return error(f, VORBIS_invalid_stream);
+      }
+
+      count  = cursor[26];      // segment counts
+      lacing = cursor + 27;     // points to segment table
+      cursor = lacing + count;  // advance past header
+      // make sure we've read the segment table
+      if (cursor > f->stream_end)
+         return error(f, VORBIS_need_more_data);
+      for (seg = 0; seg < count; ++seg) {
+         cursor += lacing[seg];
+         if (lacing[seg] < 255)
+            break;
+      }
+      if (end_page && seg < count-1)
+         return error(f, VORBIS_invalid_stream);
+      if (seg == count)
+         seg = -1; // set 'crosses page' flag
+      if (cursor > f->stream_end)
+         return error(f, VORBIS_need_more_data);
+      at_packet_start = FALSE;
+   }
+   return TRUE;
+}
+#endif // !STB_VORBIS_NO_PUSHDATA_API
+
+static int start_decoder(vorb *f)
+{
+   uint8 header[6], x,y;
+   int len,i,j,k, max_submaps = 0;
+   int longest_floorlist=0;
+
+   // first page, first packet
+
+   if (!start_page(f))                              return FALSE;
+   // validate page flag
+   if (!(f->page_flag & PAGEFLAG_first_page))       return error(f, VORBIS_invalid_first_page);
+   if (f->page_flag & PAGEFLAG_last_page)           return error(f, VORBIS_invalid_first_page);
+   if (f->page_flag & PAGEFLAG_continued_packet)    return error(f, VORBIS_invalid_first_page);
+   // check for expected packet length
+   if (f->segment_count != 1)                       return error(f, VORBIS_invalid_first_page);
+   if (f->segments[0] != 30)                        return error(f, VORBIS_invalid_first_page);
+   // read packet
+   // check packet header
+   if (get8(f) != VORBIS_packet_id)                 return error(f, VORBIS_invalid_first_page);
+   if (!getn(f, header, 6))                         return error(f, VORBIS_unexpected_eof);
+   if (!vorbis_validate(header))                    return error(f, VORBIS_invalid_first_page);
+   // vorbis_version
+   if (get32(f) != 0)                               return error(f, VORBIS_invalid_first_page);
+   f->channels = get8(f); if (!f->channels)         return error(f, VORBIS_invalid_first_page);
+   if (f->channels > STB_VORBIS_MAX_CHANNELS)       return error(f, VORBIS_too_many_channels);
+   f->sample_rate = get32(f); if (!f->sample_rate)  return error(f, VORBIS_invalid_first_page);
+   get32(f); // bitrate_maximum
+   get32(f); // bitrate_nominal
+   get32(f); // bitrate_minimum
+   x = get8(f);
+   {
+      int log0,log1;
+      log0 = x & 15;
+      log1 = x >> 4;
+      f->blocksize_0 = 1 << log0;
+      f->blocksize_1 = 1 << log1;
+      if (log0 < 6 || log0 > 13)                       return error(f, VORBIS_invalid_setup);
+      if (log1 < 6 || log1 > 13)                       return error(f, VORBIS_invalid_setup);
+      if (log0 > log1)                                 return error(f, VORBIS_invalid_setup);
+   }
+
+   // framing_flag
+   x = get8(f);
+   if (!(x & 1))                                    return error(f, VORBIS_invalid_first_page);
+
+   // second packet!
+   if (!start_page(f))                              return FALSE;
+
+   if (!start_packet(f))                            return FALSE;
+   do {
+      len = next_segment(f);
+      skip(f, len);
+      f->bytes_in_seg = 0;
+   } while (len);
+
+   // third packet!
+   if (!start_packet(f))                            return FALSE;
+
+   #ifndef STB_VORBIS_NO_PUSHDATA_API
+   if (IS_PUSH_MODE(f)) {
+      if (!is_whole_packet_present(f, TRUE)) {
+         // convert error in ogg header to write type
+         if (f->error == VORBIS_invalid_stream)
+            f->error = VORBIS_invalid_setup;
+         return FALSE;
+      }
+   }
+   #endif
+
+   crc32_init(); // always init it, to avoid multithread race conditions
+
+   if (get8_packet(f) != VORBIS_packet_setup)       return error(f, VORBIS_invalid_setup);
+   for (i=0; i < 6; ++i) header[i] = get8_packet(f);
+   if (!vorbis_validate(header))                    return error(f, VORBIS_invalid_setup);
+
+   // codebooks
+
+   f->codebook_count = get_bits(f,8) + 1;
+   f->codebooks = (Codebook *) setup_malloc(f, sizeof(*f->codebooks) * f->codebook_count);
+   if (f->codebooks == NULL)                        return error(f, VORBIS_outofmem);
+   memset(f->codebooks, 0, sizeof(*f->codebooks) * f->codebook_count);
+   for (i=0; i < f->codebook_count; ++i) {
+      uint32 *values;
+      int ordered, sorted_count;
+      int total=0;
+      uint8 *lengths;
+      Codebook *c = f->codebooks+i;
+      CHECK(f);
+      x = get_bits(f, 8); if (x != 0x42)            return error(f, VORBIS_invalid_setup);
+      x = get_bits(f, 8); if (x != 0x43)            return error(f, VORBIS_invalid_setup);
+      x = get_bits(f, 8); if (x != 0x56)            return error(f, VORBIS_invalid_setup);
+      x = get_bits(f, 8);
+      c->dimensions = (get_bits(f, 8)<<8) + x;
+      x = get_bits(f, 8);
+      y = get_bits(f, 8);
+      c->entries = (get_bits(f, 8)<<16) + (y<<8) + x;
+      ordered = get_bits(f,1);
+      c->sparse = ordered ? 0 : get_bits(f,1);
+
+      if (c->dimensions == 0 && c->entries != 0)    return error(f, VORBIS_invalid_setup);
+
+      if (c->sparse)
+         lengths = (uint8 *) setup_temp_malloc(f, c->entries);
+      else
+         lengths = c->codeword_lengths = (uint8 *) setup_malloc(f, c->entries);
+
+      if (!lengths) return error(f, VORBIS_outofmem);
+
+      if (ordered) {
+         int current_entry = 0;
+         int current_length = get_bits(f,5) + 1;
+         while (current_entry < c->entries) {
+            int limit = c->entries - current_entry;
+            int n = get_bits(f, ilog(limit));
+            if (current_entry + n > (int) c->entries) { return error(f, VORBIS_invalid_setup); }
+            memset(lengths + current_entry, current_length, n);
+            current_entry += n;
+            ++current_length;
+         }
+      } else {
+         for (j=0; j < c->entries; ++j) {
+            int present = c->sparse ? get_bits(f,1) : 1;
+            if (present) {
+               lengths[j] = get_bits(f, 5) + 1;
+               ++total;
+               if (lengths[j] == 32)
+                  return error(f, VORBIS_invalid_setup);
+            } else {
+               lengths[j] = NO_CODE;
+            }
+         }
+      }
+
+      if (c->sparse && total >= c->entries >> 2) {
+         // convert sparse items to non-sparse!
+         if (c->entries > (int) f->setup_temp_memory_required)
+            f->setup_temp_memory_required = c->entries;
+
+         c->codeword_lengths = (uint8 *) setup_malloc(f, c->entries);
+         if (c->codeword_lengths == NULL) return error(f, VORBIS_outofmem);
+         memcpy(c->codeword_lengths, lengths, c->entries);
+         setup_temp_free(f, lengths, c->entries); // note this is only safe if there have been no intervening temp mallocs!
+         lengths = c->codeword_lengths;
+         c->sparse = 0;
+      }
+
+      // compute the size of the sorted tables
+      if (c->sparse) {
+         sorted_count = total;
+      } else {
+         sorted_count = 0;
+         #ifndef STB_VORBIS_NO_HUFFMAN_BINARY_SEARCH
+         for (j=0; j < c->entries; ++j)
+            if (lengths[j] > STB_VORBIS_FAST_HUFFMAN_LENGTH && lengths[j] != NO_CODE)
+               ++sorted_count;
+         #endif
+      }
+
+      c->sorted_entries = sorted_count;
+      values = NULL;
+
+      CHECK(f);
+      if (!c->sparse) {
+         c->codewords = (uint32 *) setup_malloc(f, sizeof(c->codewords[0]) * c->entries);
+         if (!c->codewords)                  return error(f, VORBIS_outofmem);
+      } else {
+         unsigned int size;
+         if (c->sorted_entries) {
+            c->codeword_lengths = (uint8 *) setup_malloc(f, c->sorted_entries);
+            if (!c->codeword_lengths)           return error(f, VORBIS_outofmem);
+            c->codewords = (uint32 *) setup_temp_malloc(f, sizeof(*c->codewords) * c->sorted_entries);
+            if (!c->codewords)                  return error(f, VORBIS_outofmem);
+            values = (uint32 *) setup_temp_malloc(f, sizeof(*values) * c->sorted_entries);
+            if (!values)                        return error(f, VORBIS_outofmem);
+         }
+         size = c->entries + (sizeof(*c->codewords) + sizeof(*values)) * c->sorted_entries;
+         if (size > f->setup_temp_memory_required)
+            f->setup_temp_memory_required = size;
+      }
+
+      if (!compute_codewords(c, lengths, c->entries, values)) {
+         if (c->sparse) setup_temp_free(f, values, 0);
+         return error(f, VORBIS_invalid_setup);
+      }
+
+      if (c->sorted_entries) {
+         // allocate an extra slot for sentinels
+         c->sorted_codewords = (uint32 *) setup_malloc(f, sizeof(*c->sorted_codewords) * (c->sorted_entries+1));
+         if (c->sorted_codewords == NULL) return error(f, VORBIS_outofmem);
+         // allocate an extra slot at the front so that c->sorted_values[-1] is defined
+         // so that we can catch that case without an extra if
+         c->sorted_values    = ( int   *) setup_malloc(f, sizeof(*c->sorted_values   ) * (c->sorted_entries+1));
+         if (c->sorted_values == NULL) return error(f, VORBIS_outofmem);
+         ++c->sorted_values;
+         c->sorted_values[-1] = -1;
+         compute_sorted_huffman(c, lengths, values);
+      }
+
+      if (c->sparse) {
+         setup_temp_free(f, values, sizeof(*values)*c->sorted_entries);
+         setup_temp_free(f, c->codewords, sizeof(*c->codewords)*c->sorted_entries);
+         setup_temp_free(f, lengths, c->entries);
+         c->codewords = NULL;
+      }
+
+      compute_accelerated_huffman(c);
+
+      CHECK(f);
+      c->lookup_type = get_bits(f, 4);
+      if (c->lookup_type > 2) return error(f, VORBIS_invalid_setup);
+      if (c->lookup_type > 0) {
+         uint16 *mults;
+         c->minimum_value = float32_unpack(get_bits(f, 32));
+         c->delta_value = float32_unpack(get_bits(f, 32));
+         c->value_bits = get_bits(f, 4)+1;
+         c->sequence_p = get_bits(f,1);
+         if (c->lookup_type == 1) {
+            c->lookup_values = lookup1_values(c->entries, c->dimensions);
+         } else {
+            c->lookup_values = c->entries * c->dimensions;
+         }
+         if (c->lookup_values == 0) return error(f, VORBIS_invalid_setup);
+         mults = (uint16 *) setup_temp_malloc(f, sizeof(mults[0]) * c->lookup_values);
+         if (mults == NULL) return error(f, VORBIS_outofmem);
+         for (j=0; j < (int) c->lookup_values; ++j) {
+            int q = get_bits(f, c->value_bits);
+            if (q == EOP) { setup_temp_free(f,mults,sizeof(mults[0])*c->lookup_values); return error(f, VORBIS_invalid_setup); }
+            mults[j] = q;
+         }
+
+#ifndef STB_VORBIS_DIVIDES_IN_CODEBOOK
+         if (c->lookup_type == 1) {
+            int len, sparse = c->sparse;
+            float last=0;
+            // pre-expand the lookup1-style multiplicands, to avoid a divide in the inner loop
+            if (sparse) {
+               if (c->sorted_entries == 0) goto skip;
+               c->multiplicands = (codetype *) setup_malloc(f, sizeof(c->multiplicands[0]) * c->sorted_entries * c->dimensions);
+            } else
+               c->multiplicands = (codetype *) setup_malloc(f, sizeof(c->multiplicands[0]) * c->entries        * c->dimensions);
+            if (c->multiplicands == NULL) { setup_temp_free(f,mults,sizeof(mults[0])*c->lookup_values); return error(f, VORBIS_outofmem); }
+            len = sparse ? c->sorted_entries : c->entries;
+            for (j=0; j < len; ++j) {
+               unsigned int z = sparse ? c->sorted_values[j] : j;
+               unsigned int div=1;
+               for (k=0; k < c->dimensions; ++k) {
+                  int off = (z / div) % c->lookup_values;
+                  float val = mults[off];
+                  val = mults[off]*c->delta_value + c->minimum_value + last;
+                  c->multiplicands[j*c->dimensions + k] = val;
+                  if (c->sequence_p)
+                     last = val;
+                  if (k+1 < c->dimensions) {
+                     if (div > UINT_MAX / (unsigned int) c->lookup_values) {
+                        setup_temp_free(f, mults,sizeof(mults[0])*c->lookup_values);
+                        return error(f, VORBIS_invalid_setup);
+                     }
+                     div *= c->lookup_values;
+                  }
+               }
+            }
+            c->lookup_type = 2;
+         }
+         else
+#endif
+         {
+            float last=0;
+            CHECK(f);
+            c->multiplicands = (codetype *) setup_malloc(f, sizeof(c->multiplicands[0]) * c->lookup_values);
+            if (c->multiplicands == NULL) { setup_temp_free(f, mults,sizeof(mults[0])*c->lookup_values); return error(f, VORBIS_outofmem); }
+            for (j=0; j < (int) c->lookup_values; ++j) {
+               float val = mults[j] * c->delta_value + c->minimum_value + last;
+               c->multiplicands[j] = val;
+               if (c->sequence_p)
+                  last = val;
+            }
+         }
+#ifndef STB_VORBIS_DIVIDES_IN_CODEBOOK
+        skip:;
+#endif
+         setup_temp_free(f, mults, sizeof(mults[0])*c->lookup_values);
+
+         CHECK(f);
+      }
+      CHECK(f);
+   }
+
+   // time domain transfers (notused)
+
+   x = get_bits(f, 6) + 1;
+   for (i=0; i < x; ++i) {
+      uint32 z = get_bits(f, 16);
+      if (z != 0) return error(f, VORBIS_invalid_setup);
+   }
+
+   // Floors
+   f->floor_count = get_bits(f, 6)+1;
+   f->floor_config = (Floor *)  setup_malloc(f, f->floor_count * sizeof(*f->floor_config));
+   if (f->floor_config == NULL) return error(f, VORBIS_outofmem);
+   for (i=0; i < f->floor_count; ++i) {
+      f->floor_types[i] = get_bits(f, 16);
+      if (f->floor_types[i] > 1) return error(f, VORBIS_invalid_setup);
+      if (f->floor_types[i] == 0) {
+         Floor0 *g = &f->floor_config[i].floor0;
+         g->order = get_bits(f,8);
+         g->rate = get_bits(f,16);
+         g->bark_map_size = get_bits(f,16);
+         g->amplitude_bits = get_bits(f,6);
+         g->amplitude_offset = get_bits(f,8);
+         g->number_of_books = get_bits(f,4) + 1;
+         for (j=0; j < g->number_of_books; ++j)
+            g->book_list[j] = get_bits(f,8);
+         return error(f, VORBIS_feature_not_supported);
+      } else {
+         Point p[31*8+2];
+         Floor1 *g = &f->floor_config[i].floor1;
+         int max_class = -1; 
+         g->partitions = get_bits(f, 5);
+         for (j=0; j < g->partitions; ++j) {
+            g->partition_class_list[j] = get_bits(f, 4);
+            if (g->partition_class_list[j] > max_class)
+               max_class = g->partition_class_list[j];
+         }
+         for (j=0; j <= max_class; ++j) {
+            g->class_dimensions[j] = get_bits(f, 3)+1;
+            g->class_subclasses[j] = get_bits(f, 2);
+            if (g->class_subclasses[j]) {
+               g->class_masterbooks[j] = get_bits(f, 8);
+               if (g->class_masterbooks[j] >= f->codebook_count) return error(f, VORBIS_invalid_setup);
+            }
+            for (k=0; k < 1 << g->class_subclasses[j]; ++k) {
+               g->subclass_books[j][k] = get_bits(f,8)-1;
+               if (g->subclass_books[j][k] >= f->codebook_count) return error(f, VORBIS_invalid_setup);
+            }
+         }
+         g->floor1_multiplier = get_bits(f,2)+1;
+         g->rangebits = get_bits(f,4);
+         g->Xlist[0] = 0;
+         g->Xlist[1] = 1 << g->rangebits;
+         g->values = 2;
+         for (j=0; j < g->partitions; ++j) {
+            int c = g->partition_class_list[j];
+            for (k=0; k < g->class_dimensions[c]; ++k) {
+               g->Xlist[g->values] = get_bits(f, g->rangebits);
+               ++g->values;
+            }
+         }
+         // precompute the sorting
+         for (j=0; j < g->values; ++j) {
+            p[j].x = g->Xlist[j];
+            p[j].y = j;
+         }
+         qsort(p, g->values, sizeof(p[0]), point_compare);
+         for (j=0; j < g->values; ++j)
+            g->sorted_order[j] = (uint8) p[j].y;
+         // precompute the neighbors
+         for (j=2; j < g->values; ++j) {
+            int low,hi;
+            neighbors(g->Xlist, j, &low,&hi);
+            g->neighbors[j][0] = low;
+            g->neighbors[j][1] = hi;
+         }
+
+         if (g->values > longest_floorlist)
+            longest_floorlist = g->values;
+      }
+   }
+
+   // Residue
+   f->residue_count = get_bits(f, 6)+1;
+   f->residue_config = (Residue *) setup_malloc(f, f->residue_count * sizeof(f->residue_config[0]));
+   if (f->residue_config == NULL) return error(f, VORBIS_outofmem);
+   memset(f->residue_config, 0, f->residue_count * sizeof(f->residue_config[0]));
+   for (i=0; i < f->residue_count; ++i) {
+      uint8 residue_cascade[64];
+      Residue *r = f->residue_config+i;
+      f->residue_types[i] = get_bits(f, 16);
+      if (f->residue_types[i] > 2) return error(f, VORBIS_invalid_setup);
+      r->begin = get_bits(f, 24);
+      r->end = get_bits(f, 24);
+      if (r->end < r->begin) return error(f, VORBIS_invalid_setup);
+      r->part_size = get_bits(f,24)+1;
+      r->classifications = get_bits(f,6)+1;
+      r->classbook = get_bits(f,8);
+      if (r->classbook >= f->codebook_count) return error(f, VORBIS_invalid_setup);
+      for (j=0; j < r->classifications; ++j) {
+         uint8 high_bits=0;
+         uint8 low_bits=get_bits(f,3);
+         if (get_bits(f,1))
+            high_bits = get_bits(f,5);
+         residue_cascade[j] = high_bits*8 + low_bits;
+      }
+      r->residue_books = (short (*)[8]) setup_malloc(f, sizeof(r->residue_books[0]) * r->classifications);
+      if (r->residue_books == NULL) return error(f, VORBIS_outofmem);
+      for (j=0; j < r->classifications; ++j) {
+         for (k=0; k < 8; ++k) {
+            if (residue_cascade[j] & (1 << k)) {
+               r->residue_books[j][k] = get_bits(f, 8);
+               if (r->residue_books[j][k] >= f->codebook_count) return error(f, VORBIS_invalid_setup);
+            } else {
+               r->residue_books[j][k] = -1;
+            }
+         }
+      }
+      // precompute the classifications[] array to avoid inner-loop mod/divide
+      // call it 'classdata' since we already have r->classifications
+      r->classdata = (uint8 **) setup_malloc(f, sizeof(*r->classdata) * f->codebooks[r->classbook].entries);
+      if (!r->classdata) return error(f, VORBIS_outofmem);
+      memset(r->classdata, 0, sizeof(*r->classdata) * f->codebooks[r->classbook].entries);
+      for (j=0; j < f->codebooks[r->classbook].entries; ++j) {
+         int classwords = f->codebooks[r->classbook].dimensions;
+         int temp = j;
+         r->classdata[j] = (uint8 *) setup_malloc(f, sizeof(r->classdata[j][0]) * classwords);
+         if (r->classdata[j] == NULL) return error(f, VORBIS_outofmem);
+         for (k=classwords-1; k >= 0; --k) {
+            r->classdata[j][k] = temp % r->classifications;
+            temp /= r->classifications;
+         }
+      }
+   }
+
+   f->mapping_count = get_bits(f,6)+1;
+   f->mapping = (Mapping *) setup_malloc(f, f->mapping_count * sizeof(*f->mapping));
+   if (f->mapping == NULL) return error(f, VORBIS_outofmem);
+   memset(f->mapping, 0, f->mapping_count * sizeof(*f->mapping));
+   for (i=0; i < f->mapping_count; ++i) {
+      Mapping *m = f->mapping + i;      
+      int mapping_type = get_bits(f,16);
+      if (mapping_type != 0) return error(f, VORBIS_invalid_setup);
+      m->chan = (MappingChannel *) setup_malloc(f, f->channels * sizeof(*m->chan));
+      if (m->chan == NULL) return error(f, VORBIS_outofmem);
+      if (get_bits(f,1))
+         m->submaps = get_bits(f,4)+1;
+      else
+         m->submaps = 1;
+      if (m->submaps > max_submaps)
+         max_submaps = m->submaps;
+      if (get_bits(f,1)) {
+         m->coupling_steps = get_bits(f,8)+1;
+         for (k=0; k < m->coupling_steps; ++k) {
+            m->chan[k].magnitude = get_bits(f, ilog(f->channels-1));
+            m->chan[k].angle = get_bits(f, ilog(f->channels-1));
+            if (m->chan[k].magnitude >= f->channels)        return error(f, VORBIS_invalid_setup);
+            if (m->chan[k].angle     >= f->channels)        return error(f, VORBIS_invalid_setup);
+            if (m->chan[k].magnitude == m->chan[k].angle)   return error(f, VORBIS_invalid_setup);
+         }
+      } else
+         m->coupling_steps = 0;
+
+      // reserved field
+      if (get_bits(f,2)) return error(f, VORBIS_invalid_setup);
+      if (m->submaps > 1) {
+         for (j=0; j < f->channels; ++j) {
+            m->chan[j].mux = get_bits(f, 4);
+            if (m->chan[j].mux >= m->submaps)                return error(f, VORBIS_invalid_setup);
+         }
+      } else
+         // @SPECIFICATION: this case is missing from the spec
+         for (j=0; j < f->channels; ++j)
+            m->chan[j].mux = 0;
+
+      for (j=0; j < m->submaps; ++j) {
+         get_bits(f,8); // discard
+         m->submap_floor[j] = get_bits(f,8);
+         m->submap_residue[j] = get_bits(f,8);
+         if (m->submap_floor[j] >= f->floor_count)      return error(f, VORBIS_invalid_setup);
+         if (m->submap_residue[j] >= f->residue_count)  return error(f, VORBIS_invalid_setup);
+      }
+   }
+
+   // Modes
+   f->mode_count = get_bits(f, 6)+1;
+   for (i=0; i < f->mode_count; ++i) {
+      Mode *m = f->mode_config+i;
+      m->blockflag = get_bits(f,1);
+      m->windowtype = get_bits(f,16);
+      m->transformtype = get_bits(f,16);
+      m->mapping = get_bits(f,8);
+      if (m->windowtype != 0)                 return error(f, VORBIS_invalid_setup);
+      if (m->transformtype != 0)              return error(f, VORBIS_invalid_setup);
+      if (m->mapping >= f->mapping_count)     return error(f, VORBIS_invalid_setup);
+   }
+
+   flush_packet(f);
+
+   f->previous_length = 0;
+
+   for (i=0; i < f->channels; ++i) {
+      f->channel_buffers[i] = (float *) setup_malloc(f, sizeof(float) * f->blocksize_1);
+      f->previous_window[i] = (float *) setup_malloc(f, sizeof(float) * f->blocksize_1/2);
+      f->finalY[i]          = (int16 *) setup_malloc(f, sizeof(int16) * longest_floorlist);
+      if (f->channel_buffers[i] == NULL || f->previous_window[i] == NULL || f->finalY[i] == NULL) return error(f, VORBIS_outofmem);
+      #ifdef STB_VORBIS_NO_DEFER_FLOOR
+      f->floor_buffers[i]   = (float *) setup_malloc(f, sizeof(float) * f->blocksize_1/2);
+      if (f->floor_buffers[i] == NULL) return error(f, VORBIS_outofmem);
+      #endif
+   }
+
+   if (!init_blocksize(f, 0, f->blocksize_0)) return FALSE;
+   if (!init_blocksize(f, 1, f->blocksize_1)) return FALSE;
+   f->blocksize[0] = f->blocksize_0;
+   f->blocksize[1] = f->blocksize_1;
+
+#ifdef STB_VORBIS_DIVIDE_TABLE
+   if (integer_divide_table[1][1]==0)
+      for (i=0; i < DIVTAB_NUMER; ++i)
+         for (j=1; j < DIVTAB_DENOM; ++j)
+            integer_divide_table[i][j] = i / j;
+#endif
+
+   // compute how much temporary memory is needed
+
+   // 1.
+   {
+      uint32 imdct_mem = (f->blocksize_1 * sizeof(float) >> 1);
+      uint32 classify_mem;
+      int i,max_part_read=0;
+      for (i=0; i < f->residue_count; ++i) {
+         Residue *r = f->residue_config + i;
+         int n_read = r->end - r->begin;
+         int part_read = n_read / r->part_size;
+         if (part_read > max_part_read)
+            max_part_read = part_read;
+      }
+      #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
+      classify_mem = f->channels * (sizeof(void*) + max_part_read * sizeof(uint8 *));
+      #else
+      classify_mem = f->channels * (sizeof(void*) + max_part_read * sizeof(int *));
+      #endif
+
+      f->temp_memory_required = classify_mem;
+      if (imdct_mem > f->temp_memory_required)
+         f->temp_memory_required = imdct_mem;
+   }
+
+   f->first_decode = TRUE;
+
+   if (f->alloc.alloc_buffer) {
+      assert(f->temp_offset == f->alloc.alloc_buffer_length_in_bytes);
+      // check if there's enough temp memory so we don't error later
+      if (f->setup_offset + sizeof(*f) + f->temp_memory_required > (unsigned) f->temp_offset)
+         return error(f, VORBIS_outofmem);
+   }
+
+   f->first_audio_page_offset = stb_vorbis_get_file_offset(f);
+
+   return TRUE;
+}
+
+// Hands every setup-time allocation reachable from the decoder to
+// setup_free (the stb_vorbis struct itself is NOT released here), and
+// closes the FILE when close_on_free is set.
+static void vorbis_deinit(stb_vorbis *p)
+{
+   int n, e;
+
+   // Residues go first: the number of classdata rows is read back from the
+   // codebook each residue refers to, so the codebook array must still exist.
+   if (p->residue_config != NULL) {
+      for (n = 0; n < p->residue_count; ++n) {
+         Residue *res = &p->residue_config[n];
+         if (res->classdata != NULL) {
+            Codebook *book = &p->codebooks[res->classbook];
+            for (e = 0; e < book->entries; ++e)
+               setup_free(p, res->classdata[e]);
+            setup_free(p, res->classdata);
+         }
+         setup_free(p, res->residue_books);
+      }
+   }
+
+   // Per-codebook tables, then the codebook array itself.
+   if (p->codebooks != NULL) {
+      CHECK(p);
+      for (n = 0; n < p->codebook_count; ++n) {
+         Codebook *book = &p->codebooks[n];
+         setup_free(p, book->codeword_lengths);
+         setup_free(p, book->multiplicands);
+         setup_free(p, book->codewords);
+         setup_free(p, book->sorted_codewords);
+         // sorted_values was advanced by one slot after allocation, so the
+         // pointer to release is one element before the stored one
+         if (book->sorted_values != NULL)
+            setup_free(p, book->sorted_values - 1);
+         else
+            setup_free(p, NULL);
+      }
+      setup_free(p, p->codebooks);
+   }
+
+   setup_free(p, p->floor_config);
+   setup_free(p, p->residue_config);
+
+   // Each mapping owns a per-channel array.
+   if (p->mapping != NULL) {
+      for (n = 0; n < p->mapping_count; ++n)
+         setup_free(p, p->mapping[n].chan);
+      setup_free(p, p->mapping);
+   }
+   CHECK(p);
+
+   // Per-channel working buffers (never walk past the fixed-size arrays).
+   for (n = 0; n < STB_VORBIS_MAX_CHANNELS && n < p->channels; ++n) {
+      setup_free(p, p->channel_buffers[n]);
+      setup_free(p, p->previous_window[n]);
+      #ifdef STB_VORBIS_NO_DEFER_FLOOR
+      setup_free(p, p->floor_buffers[n]);
+      #endif
+      setup_free(p, p->finalY[n]);
+   }
+
+   // Tables kept once per block size (index 0 and 1).
+   for (n = 0; n < 2; ++n) {
+      setup_free(p, p->A[n]);
+      setup_free(p, p->B[n]);
+      setup_free(p, p->C[n]);
+      setup_free(p, p->window[n]);
+      setup_free(p, p->bit_reverse[n]);
+   }
+
+   #ifndef STB_VORBIS_NO_STDIO
+   if (p->close_on_free) fclose(p->f);
+   #endif
+}
+
+// Public teardown: runs vorbis_deinit on the decoder and then passes the
+// handle itself to setup_free. A NULL handle is accepted and ignored.
+void stb_vorbis_close(stb_vorbis *p)
+{
+   if (p != NULL) {
+      vorbis_deinit(p);
+      setup_free(p, p);
+   }
+}
+
+// Puts a decoder struct into its initial state.
+//   p: struct to initialise (fully overwritten)
+//   z: optional caller-supplied allocation buffer description; may be NULL
+static void vorbis_init(stb_vorbis *p, const stb_vorbis_alloc *z)
+{
+   // start from all-zero so every malloc'd pointer field begins as NULL
+   memset(p, 0, sizeof(*p));
+
+   if (z != NULL) {
+      p->alloc = *z;
+      // round the buffer length up to a multiple of 4; temp_offset starts
+      // at that rounded end-of-buffer value
+      p->alloc.alloc_buffer_length_in_bytes = (p->alloc.alloc_buffer_length_in_bytes+3) & ~3;
+      p->temp_offset = p->alloc.alloc_buffer_length_in_bytes;
+   }
+
+   p->page_crc_tests = -1;
+   p->error = VORBIS__no_error;
+   p->eof = 0;
+   p->codebooks = NULL;
+   p->stream = NULL;
+   #ifndef STB_VORBIS_NO_STDIO
+   p->f = NULL;
+   p->close_on_free = FALSE;
+   #endif
+}
+
+// Returns the decoder's current sample location, or -1 while that location
+// is flagged as not valid.
+int stb_vorbis_get_sample_offset(stb_vorbis *f)
+{
+   if (!f->current_loc_valid)
+      return -1;
+   return f->current_loc;
+}
+
+// Returns, by value, the stream format and memory figures currently
+// recorded in the decoder.
+stb_vorbis_info stb_vorbis_get_info(stb_vorbis *f)
+{
+   stb_vorbis_info info;
+
+   // stream format
+   info.sample_rate    = f->sample_rate;
+   info.channels       = f->channels;
+   info.max_frame_size = f->blocksize_1 >> 1;   // half of blocksize_1
+
+   // memory figures
+   info.temp_memory_required       = f->temp_memory_required;
+   info.setup_temp_memory_required = f->setup_temp_memory_required;
+   info.setup_memory_required      = f->setup_memory_required;
+
+   return info;
+}
+
+int stb_vorbis_get_error(stb_vorbis *f)
+{
+   int e = f->error;
+   f->error = VORBIS__no_error;
+   return e;
+}
+
+// Requests storage for one stb_vorbis struct from setup_malloc, using f as
+// the allocation context. The result is returned unchecked (may be NULL).
+static stb_vorbis * vorbis_alloc(stb_vorbis *f)
+{
+   return (stb_vorbis *) setup_malloc(f, sizeof(stb_vorbis));
+}
+
+#ifndef STB_VORBIS_NO_PUSHDATA_API
+
+// Resets the push-API decoding state. Setting page_crc_tests to 0 (i.e.
+// non-negative) makes the next stb_vorbis_decode_frame_pushdata call go
+// through the page search before decoding again.
+void stb_vorbis_flush_pushdata(stb_vorbis *f)
+{
+   // re-enter page-search mode
+   f->page_crc_tests = 0;
+
+   // forget everything about the previously decoded frame
+   f->previous_length          = 0;
+   f->discard_samples_deferred = 0;
+   f->first_decode             = FALSE;
+
+   // the sample position is no longer known
+   f->current_loc_valid = FALSE;
+   f->samples_output    = 0;
+
+   // drop any buffered output
+   f->channel_buffer_start = 0;
+   f->channel_buffer_end   = 0;
+}
+
+// Push-API resynchronisation. Looks through `data` for Ogg page capture
+// patterns, registers each candidate page in f->scan[], and advances the
+// running CRC of every registered candidate over the bytes in this chunk.
+//
+//   f        decoder in page-search mode (page_crc_tests >= 0)
+//   data     bytes supplied by the caller
+//   data_len number of valid bytes at `data`
+//
+// Returns the number of bytes of `data` that were consumed. When a
+// candidate's CRC matches, page-search mode is left (page_crc_tests = -1)
+// and current_loc / current_loc_valid are set from that page's header.
+static int vorbis_search_for_page_pushdata(vorb *f, uint8 *data, int data_len)
+{
+   int i,n;
+   // every candidate carried over from a previous call resumes at the
+   // start of this new chunk
+   for (i=0; i < f->page_crc_tests; ++i)
+      f->scan[i].bytes_done = 0;
+
+   // if we have room for more scans, search for them first, because
+   // they may cause us to stop early if their header is incomplete
+   if (f->page_crc_tests < STB_VORBIS_PUSHDATA_CRC_COUNT) {
+      if (data_len < 4) return 0;
+      data_len -= 3; // need to look for 4-byte sequence, so don't miss
+                     // one that straddles a boundary
+      for (i=0; i < data_len; ++i) {
+         if (data[i] == 0x4f) {
+            if (0==memcmp(data+i, ogg_page_header, 4)) {
+               int j,len;
+               uint32 crc;
+               // make sure we have the whole page header
+               if (i+26 >= data_len || i+27+data[i+26] >= data_len) {
+                  // only read up to this page start, so hopefully we'll
+                  // have the whole page header start next time
+                  data_len = i;
+                  break;
+               }
+               // ok, we have it all; compute the length of the page
+               len = 27 + data[i+26];
+               for (j=0; j < data[i+26]; ++j)
+                  len += data[i+27+j];
+               // scan everything up to the embedded crc (which we must 0)
+               crc = 0;
+               for (j=0; j < 22; ++j)
+                  crc = crc32_update(crc, data[i+j]);
+               // now process 4 0-bytes
+               for (   ; j < 26; ++j)
+                  crc = crc32_update(crc, 0);
+               // len is the total number of bytes we need to scan
+               n = f->page_crc_tests++;
+               f->scan[n].bytes_left = len-j;
+               f->scan[n].crc_so_far = crc;
+               // the top byte is shifted as uint32: shifting a promoted int
+               // into the sign bit is undefined when the byte is >= 0x80
+               f->scan[n].goal_crc = data[i+22] + (data[i+23] << 8) + (data[i+24]<<16) + ((uint32) data[i+25]<<24);
+               // if the last frame on a page is continued to the next, then
+               // we can't recover the sample_loc immediately
+               if (data[i+27+data[i+26]-1] == 255)
+                  f->scan[n].sample_loc = ~0;
+               else
+                  f->scan[n].sample_loc = data[i+6] + (data[i+7] << 8) + (data[i+ 8]<<16) + ((uint32) data[i+ 9]<<24);
+               f->scan[n].bytes_done = i+j;
+               if (f->page_crc_tests == STB_VORBIS_PUSHDATA_CRC_COUNT)
+                  break;
+               // keep going if we still have room for more
+            }
+         }
+      }
+   }
+
+   // advance every candidate's CRC over the part of this chunk it covers;
+   // i only moves forward when the current slot is kept
+   for (i=0; i < f->page_crc_tests;) {
+      uint32 crc;
+      int j;
+      int n = f->scan[i].bytes_done;
+      int m = f->scan[i].bytes_left;
+      if (m > data_len - n) m = data_len - n;
+      // m is the bytes to scan in the current chunk
+      crc = f->scan[i].crc_so_far;
+      for (j=0; j < m; ++j)
+         crc = crc32_update(crc, data[n+j]);
+      f->scan[i].bytes_left -= m;
+      f->scan[i].crc_so_far = crc;
+      if (f->scan[i].bytes_left == 0) {
+         // does it match?
+         if (f->scan[i].crc_so_far == f->scan[i].goal_crc) {
+            // Houston, we have page
+            data_len = n+m; // consumption amount is wherever that scan ended
+            f->page_crc_tests = -1; // drop out of page scan mode
+            f->previous_length = 0; // decode-but-don't-output one frame
+            f->next_seg = -1;       // start a new page
+            f->current_loc = f->scan[i].sample_loc; // set the current sample location
+                                    // to the amount we'd have decoded had we decoded this page
+            f->current_loc_valid = f->current_loc != ~0U;
+            return data_len;
+         }
+         // delete entry: overwrite this slot with the last one and re-test it
+         f->scan[i] = f->scan[--f->page_crc_tests];
+      } else {
+         ++i;
+      }
+   }
+
+   return data_len;
+}
+
+// Push-API frame decoder.
+//
+//   f         the decoder; must be in push mode
+//   data      the memory available for decoding
+//   data_len  number of bytes at `data`
+//   channels  optional; receives the number of float * buffers
+//   output    receives the float ** array of per-channel buffers (success only)
+//   samples   receives the number of output samples (0 when nothing was decoded)
+//
+// return value: number of bytes we used
+int stb_vorbis_decode_frame_pushdata(
+         stb_vorbis *f,
+         const uint8 *data, int data_len,
+         int *channels,
+         float ***output,
+         int *samples
+     )
+{
+   int ch;
+   int len,right,left;
+
+   if (!IS_PUSH_MODE(f)) return error(f, VORBIS_invalid_api_mixing);
+
+   // still in page-search mode: no decoding, just keep looking for a page
+   if (f->page_crc_tests >= 0) {
+      *samples = 0;
+      return vorbis_search_for_page_pushdata(f, (uint8 *) data, data_len);
+   }
+
+   f->error      = VORBIS__no_error;
+   f->stream     = (uint8 *) data;
+   f->stream_end = (uint8 *) data + data_len;
+
+   // nothing is consumed until the entire packet is in memory
+   if (!is_whole_packet_present(f, FALSE)) {
+      *samples = 0;
+      return 0;
+   }
+
+   if (vorbis_decode_packet(f, &len, &left, &right)) {
+      // success: finish the frame and expose each channel starting at `left`
+      len = vorbis_finish_frame(f, len, left, right);
+      for (ch=0; ch < f->channels; ++ch)
+         f->outputs[ch] = f->channel_buffers[ch] + left;
+
+      if (channels) *channels = f->channels;
+      *samples = len;
+      *output = f->outputs;
+      return (int) (f->stream - data);
+   }
+
+   {
+      // keep the error that made the decode fail; the recovery paths below
+      // overwrite f->error
+      enum STBVorbisError failure = f->error;
+
+      // a bad packet type is always skipped; an invalid continued-packet
+      // flag is skipped only while previous_length is 0 (we may be resynching)
+      int skip_packet =
+         failure == VORBIS_bad_packet_type ||
+         (failure == VORBIS_continued_packet_flag_invalid && f->previous_length == 0);
+
+      if (skip_packet) {
+         // flush and resynch: drain the rest of this packet
+         f->error = VORBIS__no_error;
+         while (get8_packet(f) != EOP)
+            if (f->eof) break;
+         *samples = 0;
+         return (int) (f->stream - data);
+      }
+
+      // any other parse error: continuing from here cannot work, so reset
+      // the push state, report the original error and consume one byte
+      stb_vorbis_flush_pushdata(f);
+      f->error = failure;
+      *samples = 0;
+      return 1;
+   }
+}
+
+// Creates a push-mode decoder from the start of an Ogg Vorbis stream.
+//
+//   data, data_len  the memory available for decoding
+//   data_used       receives the number of bytes consumed; only defined if
+//                   the result is not NULL
+//   error           receives 0 on success; VORBIS_need_more_data when the
+//                   decoder hit end-of-data during setup; VORBIS_outofmem
+//                   when the handle could not be allocated; otherwise the
+//                   decoder's error code
+//   alloc           optional allocation buffer description (may be NULL)
+//
+// Returns the new decoder, or NULL on failure. On every failure path the
+// setup allocations made so far are released via vorbis_deinit.
+stb_vorbis *stb_vorbis_open_pushdata(
+         const unsigned char *data, int data_len, // the memory available for decoding
+         int *data_used,              // only defined if result is not NULL
+         int *error, const stb_vorbis_alloc *alloc)
+{
+   stb_vorbis *f, p;
+   vorbis_init(&p, alloc);
+   p.stream     = (uint8 *) data;
+   p.stream_end = (uint8 *) data + data_len;
+   p.push_mode  = TRUE;
+   if (!start_decoder(&p)) {
+      if (p.eof)
+         *error = VORBIS_need_more_data;
+      else
+         *error = p.error;
+      // start_decoder may have allocated tables before failing; without
+      // this they would be leaked every time the caller retries with more data
+      vorbis_deinit(&p);
+      return NULL;
+   }
+   f = vorbis_alloc(&p);
+   if (f) {
+      // the heap copy takes over everything the stack decoder set up
+      *f = p;
+      *data_used = (int) (f->stream - data);
+      *error = 0;
+      return f;
+   } else {
+      vorbis_deinit(&p);
+      // report the failure instead of leaving *error untouched
+      *error = VORBIS_outofmem;
+      return NULL;
+   }
+}
+#endif // STB_VORBIS_NO_PUSHDATA_API
+
+// Reports the decoder's current read position.
+//   push mode      : always 0
+//   memory streams : distance of the read pointer from stream_start
+//   stdio streams  : ftell() position minus f_start
+unsigned int stb_vorbis_get_file_offset(stb_vorbis *f)
+{
+   #ifndef STB_VORBIS_NO_PUSHDATA_API
+   if (f->push_mode)
+      return 0;
+   #endif
+
+   if (USE_MEMORY(f)) {
+      return (unsigned int) (f->stream - f->stream_start);
+   }
+
+   #ifndef STB_VORBIS_NO_STDIO
+   return (unsigned int) (ftell(f->f) - f->f_start);
+   #endif
+}
+
+#ifndef STB_VORBIS_NO_PULLDATA_API
+//
+// DATA-PULLING API
+//
+
+// Pull-API page finder: reads forward from the current position until it
+// finds a byte sequence that matches ogg_page_header and whose page CRC
+// matches the checksum stored in the header.
+//
+//   end   optional; receives the file offset just past the found page
+//   last  optional; receives 1 if bit 0x04 of header byte 5 is set, else 0
+//
+// Returns 1 with the read position set to the first byte of the page, or 0
+// if end-of-stream is reached first.
+static uint32 vorbis_find_page(stb_vorbis *f, uint32 *end, uint32 *last)
+{
+   for(;;) {
+      int n;
+      if (f->eof) return 0;
+      n = get8(f);
+      if (n == 0x4f) { // page header candidate
+         // offset of the byte AFTER the 0x4f just read; used to resume the
+         // search if this candidate turns out not to be a page
+         unsigned int retry_loc = stb_vorbis_get_file_offset(f);
+         int i;
+         // check if we're off the end of a file_section stream
+         // NOTE(review): unsigned arithmetic; wraps if retry_loc < 25 -- confirm
+         // callers never search that close to offset 0
+         if (retry_loc - 25 > f->stream_len)
+            return 0;
+         // check the rest of the header
+         for (i=1; i < 4; ++i)
+            if (get8(f) != ogg_page_header[i])
+               break;
+         if (f->eof) return 0;
+         if (i == 4) {
+            uint8 header[27];
+            uint32 i, crc, goal, len;
+            for (i=0; i < 4; ++i)
+               header[i] = ogg_page_header[i];
+            for (; i < 27; ++i)
+               header[i] = get8(f);
+            if (f->eof) return 0;
+            // header byte 4 must be zero for the candidate to be accepted
+            if (header[4] != 0) goto invalid;
+            // stored checksum, little-endian in bytes 22..25; the top byte is
+            // shifted as uint32 because shifting a promoted int into the sign
+            // bit is undefined when the byte is >= 0x80
+            goal = header[22] + (header[23] << 8) + (header[24]<<16) + ((uint32) header[25]<<24);
+            // the CRC is computed with the checksum field itself zeroed
+            for (i=22; i < 26; ++i)
+               header[i] = 0;
+            crc = 0;
+            for (i=0; i < 27; ++i)
+               crc = crc32_update(crc, header[i]);
+            // header[26] lacing bytes follow; their sum is the body length
+            len = 0;
+            for (i=0; i < header[26]; ++i) {
+               int s = get8(f);
+               crc = crc32_update(crc, s);
+               len += s;
+            }
+            if (len && f->eof) return 0;
+            for (i=0; i < len; ++i)
+               crc = crc32_update(crc, get8(f));
+            // finished parsing probable page
+            if (crc == goal) {
+               // we could now check that it's either got the last
+               // page flag set, OR it's followed by the capture
+               // pattern, but I guess TECHNICALLY you could have
+               // a file with garbage between each ogg page and recover
+               // from it automatically? So even though that paranoia
+               // might decrease the chance of an invalid decode by
+               // another 2^32, not worth it since it would hose those
+               // invalid-but-useful files?
+               if (end)
+                  *end = stb_vorbis_get_file_offset(f);
+               if (last) {
+                  if (header[5] & 0x04)
+                     *last = 1;
+                  else
+                     *last = 0;
+               }
+               // rewind to the 0x4f byte, i.e. the first byte of the page
+               set_file_offset(f, retry_loc-1);
+               return 1;
+            }
+         }
+        invalid:
+         // not a valid page, so rewind and look for next one
+         set_file_offset(f, retry_loc);
+      }
+   }
+}
+
+
+#define SAMPLE_unknown  0xffffffff
+
+// seeking is implemented with a binary search, which narrows down the range to
+// 64K, before using a linear search (because finding the synchronization
+// pattern can be expensive, and the chance we'd find the end page again is
+// relatively high for small ranges)
+//
+// two initial interpolation-style probes are used at the start of the search
+// to try to bound either side of the binary search sensibly, while still
+// working in O(log n) time if they fail.
+
+// Reads the Ogg page header at the current position and records in *z the
+// page's start offset, its end offset and the 32-bit value stored in header
+// bytes 6..9 (kept as last_decoded_sample). The read position is restored
+// to the start of the page before returning.
+//
+// Returns 1 on success, 0 if the bytes at the current position do not begin
+// with "OggS" (in that case the read position is NOT restored).
+static int get_seek_page_info(stb_vorbis *f, ProbedPage *z)
+{
+   // zero-initialised: the result of getn is not checked below, so a short
+   // read must not leave indeterminate bytes to be inspected or summed
+   uint8 header[27] = {0}, lacing[255] = {0};
+   int i,len;
+
+   // record where the page starts
+   z->page_start = stb_vorbis_get_file_offset(f);
+
+   // parse the header
+   getn(f, header, 27);
+   if (header[0] != 'O' || header[1] != 'g' || header[2] != 'g' || header[3] != 'S')
+      return 0;
+   // header[26] is the number of lacing bytes that follow the header
+   getn(f, lacing, header[26]);
+
+   // determine the length of the payload
+   len = 0;
+   for (i=0; i < header[26]; ++i)
+      len += lacing[i];
+
+   // this implies where the page ends
+   z->page_end = z->page_start + 27 + header[26] + len;
+
+   // read the last-decoded sample out of the data; the top byte is shifted
+   // as uint32 because shifting a promoted int into the sign bit is
+   // undefined when the byte is >= 0x80
+   z->last_decoded_sample = header[6] + (header[7] << 8) + (header[8] << 16) + ((uint32) header[9] << 24);
+
+   // restore file state to where we were
+   set_file_offset(f, z->page_start);
+   return 1;
+}
+
+// rarely used function to seek back to the preceding page while finding the
+// start of a packet
+static int go_to_page_before(stb_vorbis *f, unsigned int limit_offset)
+{
+   unsigned int scan_from = f->first_audio_page_offset;
+   unsigned int page_end;
+
+   // start scanning 64K before the limit, but never before the first audio page
+   if (limit_offset >= 65536 && limit_offset-65536 >= f->first_audio_page_offset)
+      scan_from = limit_offset - 65536;
+   set_file_offset(f, scan_from);
+
+   // walk forward page by page until one ends at/after the limit while the
+   // offset left behind by vorbis_find_page is still before the limit
+   for (;;) {
+      if (!vorbis_find_page(f, &page_end, NULL))
+         return 0; // ran out of pages without reaching the limit
+      if (page_end >= limit_offset && stb_vorbis_get_file_offset(f) < limit_offset)
+         return 1; // file offset stays where vorbis_find_page put it
+      set_file_offset(f, page_end);
+   }
+}
+
+// implements the search logic for finding a page and starting decoding. if
+// the function succeeds, current_loc_valid will be true and current_loc will
+// be less than or equal to the provided sample number (the closer the
+// better).
+static int seek_to_sample_coarse(stb_vorbis *f, uint32 sample_number)
+{
+   ProbedPage left, right, mid;
+   int i, start_seg_with_known_loc, end_pos, page_start;
+   uint32 delta, stream_length, padding;
+   double offset, bytes_per_sample;
+   int probe = 0; // completed search iterations; 0 and 1 may use interpolation
+   // returns 1 on success, otherwise the value returned by error(f, ...)
+   // find the last page and validate the target sample
+   stream_length = stb_vorbis_stream_length_in_samples(f);
+   if (stream_length == 0)            return error(f, VORBIS_seek_without_length);
+   if (sample_number > stream_length) return error(f, VORBIS_seek_invalid);
+
+   // this is the maximum difference between the window-center (which is the
+   // actual granule position value), and the right-start (which the spec
+   // indicates should be the granule position (give or take one)).
+   padding = ((f->blocksize_1 - f->blocksize_0) >> 2);
+   if (sample_number < padding)
+      sample_number = 0;
+   else
+      sample_number -= padding;
+
+   left = f->p_first;
+   while (left.last_decoded_sample == ~0U) {
+      // (untested) the first page does not have a 'last_decoded_sample'
+      set_file_offset(f, left.page_end);
+      if (!get_seek_page_info(f, &left)) goto error;
+   }
+
+   right = f->p_last;
+   assert(right.last_decoded_sample != ~0U);
+
+   // starting from the start is handled differently
+   if (sample_number <= left.last_decoded_sample) {
+      stb_vorbis_seek_start(f);
+      return 1;
+   }
+   // narrow [left, right] until the two pages are adjacent in the file
+   while (left.page_end != right.page_start) {
+      assert(left.page_end < right.page_start);
+      // search range in bytes
+      delta = right.page_start - left.page_end;
+      if (delta <= 65536) {
+         // there's only 64K left to search - handle it linearly
+         set_file_offset(f, left.page_end);
+      } else {
+         if (probe < 2) {
+            if (probe == 0) {
+               // first probe (interpolate)
+               double data_bytes = right.page_end - left.page_start;
+               bytes_per_sample = data_bytes / right.last_decoded_sample;
+               offset = left.page_start + bytes_per_sample * (sample_number - left.last_decoded_sample);
+            } else {
+               // second probe (try to bound the other side)
+               double error = ((double) sample_number - mid.last_decoded_sample) * bytes_per_sample; // local; shadows error() in this scope
+               if (error >= 0 && error <  8000) error =  8000;
+               if (error <  0 && error > -8000) error = -8000;
+               offset += error * 2;
+            }
+
+            // ensure the offset is valid
+            if (offset < left.page_end)
+               offset = left.page_end;
+            if (offset > right.page_start - 65536)
+               offset = right.page_start - 65536;
+
+            set_file_offset(f, (unsigned int) offset);
+         } else {
+            // binary search for large ranges (offset by 32K to ensure
+            // we don't hit the right page)
+            set_file_offset(f, left.page_end + (delta / 2) - 32768);
+         }
+
+         if (!vorbis_find_page(f, NULL, NULL)) goto error;
+      }
+      // read page info, moving on past pages whose sample field is ~0U
+      for (;;) {
+         if (!get_seek_page_info(f, &mid)) goto error;
+         if (mid.last_decoded_sample != ~0U) break;
+         // (untested) no frames end on this page
+         set_file_offset(f, mid.page_end);
+         assert(mid.page_start < right.page_start);
+      }
+
+      // if we've just found the last page again then we're in a tricky file,
+      // and we're close enough.
+      if (mid.page_start == right.page_start)
+         break;
+
+      if (sample_number < mid.last_decoded_sample)
+         right = mid;
+      else
+         left = mid;
+
+      ++probe;
+   }
+
+   // seek back to start of the last packet
+   page_start = left.page_start;
+   set_file_offset(f, page_start);
+   if (!start_page(f)) return error(f, VORBIS_seek_failed);
+   end_pos = f->end_seg_with_known_loc;
+   assert(end_pos >= 0);
+
+   for (;;) {
+      for (i = end_pos; i > 0; --i) // walk back over segments whose lacing value is 255
+         if (f->segments[i-1] != 255)
+            break;
+
+      start_seg_with_known_loc = i;
+
+      if (start_seg_with_known_loc > 0 || !(f->page_flag & PAGEFLAG_continued_packet))
+         break;
+
+      // (untested) the final packet begins on an earlier page
+      if (!go_to_page_before(f, page_start))
+         goto error;
+
+      page_start = stb_vorbis_get_file_offset(f);
+      if (!start_page(f)) goto error;
+      end_pos = f->segment_count - 1;
+   }
+
+   // prepare to start decoding
+   f->current_loc_valid = FALSE; // NOTE(review): presumably set again by vorbis_pump_first_frame below - confirm
+   f->last_seg = FALSE;
+   f->valid_bits = 0;
+   f->packet_bytes = 0;
+   f->bytes_in_seg = 0;
+   f->previous_length = 0;
+   f->next_seg = start_seg_with_known_loc;
+   // skip the payload bytes of every segment before the starting one
+   for (i = 0; i < start_seg_with_known_loc; i++)
+      skip(f, f->segments[i]);
+
+   // start decoding (optimizable - this frame is generally discarded)
+   vorbis_pump_first_frame(f);
+   return 1;
+
+error:
+   // try to restore the file to a valid state
+   stb_vorbis_seek_start(f);
+   return error(f, VORBIS_seek_failed);
+}
+
+// the same as vorbis_decode_initial, but without advancing
+static int peek_decode_initial(vorb *f, int *p_left_start, int *p_left_end, int *p_right_start, int *p_right_end, int *mode)
+{
+   int header_bits, consumed;
+
+   if (!vorbis_decode_initial(f, p_left_start, p_left_end, p_right_start, p_right_end, mode))
+      return 0;
+
+   // work out how many whole bytes were consumed: one leading bit plus the
+   // mode number, plus two extra bits when the mode's blockflag is set
+   header_bits = 1 + ilog(f->mode_count-1) + (f->mode_config[*mode].blockflag ? 2 : 0);
+   consumed = (header_bits + 7) / 8;
+
+   // hand those bytes back to the segment/packet counters and rewind the stream
+   f->bytes_in_seg += consumed;
+   f->packet_bytes -= consumed;
+   skip(f, -consumed);
+
+   // step next_seg back by one; -1 maps to the page's final segment index
+   f->next_seg = (f->next_seg == -1) ? f->segment_count - 1 : f->next_seg - 1;
+
+   // reset valid_bits so no stale bit state carries over (assumed meaning - confirm)
+   f->valid_bits = 0;
+   return 1;
+}
+
+int stb_vorbis_seek_frame(stb_vorbis *f, unsigned int sample_number)
+{
+   uint32 longest_frame;
+
+   if (IS_PUSH_MODE(f)) return error(f, VORBIS_invalid_api_mixing);
+
+   // page-level search first; it must leave us at or before the target
+   if (!seek_to_sample_coarse(f, sample_number))
+      return 0;
+   assert(f->current_loc_valid);
+   assert(f->current_loc <= sample_number);
+
+   // then step packet by packet until the target falls inside the next frame
+   longest_frame = (f->blocksize_1*3 - f->blocksize_0) >> 2;
+   while (f->current_loc < sample_number) {
+      int l_start, l_end, r_start, r_end, mode_index, produced;
+      if (!peek_decode_initial(f, &l_start, &l_end, &r_start, &r_end, &mode_index))
+         return error(f, VORBIS_seek_failed);
+      // number of samples the upcoming frame would return
+      produced = r_start - l_start;
+      if (f->current_loc + produced > sample_number)
+         return 1; // the next frame will contain the sample
+
+      if (f->current_loc + produced + longest_frame > sample_number) {
+         // the frame after this one might hold the sample, so decode for real
+         vorbis_pump_first_frame(f);
+         continue;
+      }
+
+      // this frame is too early to be relevant: account for it and discard it
+      f->current_loc += produced;
+      f->previous_length = 0;
+      maybe_start_packet(f);
+      flush_packet(f);
+   }
+   assert(f->current_loc == sample_number); // next frame starts with the sample
+   return 1;
+}
+
+int stb_vorbis_seek(stb_vorbis *f, unsigned int sample_number)
+{
+   int channel_count;
+   uint32 frame_first;
+
+   if (!stb_vorbis_seek_frame(f, sample_number)) return 0;
+   if (sample_number == f->current_loc) return 1; // already exactly on target
+
+   // decode the frame holding the target, then skip the samples before it
+   frame_first = f->current_loc;
+   stb_vorbis_get_frame_float(f, &channel_count, NULL);
+   assert(sample_number > frame_first);
+   assert(f->channel_buffer_start + (int) (sample_number-frame_first) <= f->channel_buffer_end);
+   f->channel_buffer_start += (sample_number - frame_first);
+   return 1;
+}
+
+void stb_vorbis_seek_start(stb_vorbis *f)
+{
+   if (!IS_PUSH_MODE(f)) {
+      // rewind to the first audio page, reset state, then pump the first frame
+      set_file_offset(f, f->first_audio_page_offset);
+      f->previous_length = 0; f->first_decode = TRUE; f->next_seg = -1;
+      vorbis_pump_first_frame(f);
+   } else error(f, VORBIS_invalid_api_mixing); // void API: code cannot be returned
+}
+
+unsigned int stb_vorbis_stream_length_in_samples(stb_vorbis *f)
+{
+   unsigned int restore_offset, previous_safe;
+   unsigned int end, last_page_loc;
+   // the result is computed on first use and cached in f->total_samples
+   if (IS_PUSH_MODE(f)) return error(f, VORBIS_invalid_api_mixing);
+   if (!f->total_samples) {
+      unsigned int last;
+      uint32 lo,hi;
+      char header[6];
+
+      // first, store the current decode position so we can restore it
+      restore_offset = stb_vorbis_get_file_offset(f);
+
+      // now we want to seek back 64K from the end (the last page must
+      // be at most a little less than 64K, but let's allow a little slop)
+      if (f->stream_len >= 65536 && f->stream_len-65536 >= f->first_audio_page_offset)
+         previous_safe = f->stream_len - 65536;
+      else
+         previous_safe = f->first_audio_page_offset;
+
+      set_file_offset(f, previous_safe);
+      // previous_safe is now our candidate 'earliest known place that seeking
+      // to will lead to the final page'
+
+      if (!vorbis_find_page(f, &end, &last)) {
+         // if we can't find a page, we're hosed!
+         f->error = VORBIS_cant_find_last_page;
+         f->total_samples = 0xffffffff; // same value as SAMPLE_unknown
+         goto done;
+      }
+
+      // check if there are more pages
+      last_page_loc = stb_vorbis_get_file_offset(f);
+
+      // stop when the last_page flag is set, not when we reach eof;
+      // this allows us to stop short of a 'file_section' end without
+      // explicitly checking the length of the section
+      while (!last) {
+         set_file_offset(f, end);
+         if (!vorbis_find_page(f, &end, &last)) {
+            // the last page we found didn't have the 'last page' flag
+            // set. whoops!
+            break;
+         }
+         previous_safe = last_page_loc+1; // NOTE(review): previous_safe is not read again in this function
+         last_page_loc = stb_vorbis_get_file_offset(f);
+      }
+
+      set_file_offset(f, last_page_loc);
+
+      // parse the header
+      getn(f, (unsigned char *)header, 6); // consume the 6 bytes that precede the granule position
+      // extract the absolute granule position
+      lo = get32(f);
+      hi = get32(f);
+      if (lo == 0xffffffff && hi == 0xffffffff) {
+         f->error = VORBIS_cant_find_last_page;
+         f->total_samples = SAMPLE_unknown;
+         goto done;
+      }
+      if (hi)
+         lo = 0xfffffffe; // saturate
+      f->total_samples = lo;
+      // remember the final page so later seeks can use it as a right bound
+      f->p_last.page_start = last_page_loc;
+      f->p_last.page_end   = end;
+      f->p_last.last_decoded_sample = lo;
+
+     done:
+      set_file_offset(f, restore_offset);
+   }
+   return f->total_samples == SAMPLE_unknown ? 0 : f->total_samples; // unknown length is reported as 0
+}
+
+// Stream duration in seconds: total sample count divided by the sample rate.
+float stb_vorbis_stream_length_in_seconds(stb_vorbis *f)
+{
+   unsigned int total_samples = stb_vorbis_stream_length_in_samples(f);
+   return total_samples / (float) f->sample_rate;
+}
+
+int stb_vorbis_get_frame_float(stb_vorbis *f, int *channels, float ***output)
+{
+   int count, first, right_edge, ch;
+   if (IS_PUSH_MODE(f)) return error(f, VORBIS_invalid_api_mixing);
+
+   // decode one packet; on failure mark the channel buffers as empty
+   if (!vorbis_decode_packet(f, &count, &first, &right_edge)) {
+      f->channel_buffer_start = f->channel_buffer_end = 0;
+      return 0;
+   }
+   count = vorbis_finish_frame(f, count, first, right_edge);
+
+   // publish the window [first, first+count) of every channel buffer
+   for (ch=0; ch < f->channels; ++ch)
+      f->outputs[ch] = f->channel_buffers[ch] + first;
+   f->channel_buffer_start = first;
+   f->channel_buffer_end   = first + count;
+   if (channels) *channels = f->channels;
+   if (output)   *output = f->outputs;
+   return count;
+}
+
+#ifndef STB_VORBIS_NO_STDIO
+
+stb_vorbis * stb_vorbis_open_file_section(FILE *file, int close_on_free, int *error, const stb_vorbis_alloc *alloc, unsigned int length)
+{
+   stb_vorbis *decoder = NULL, temp;
+   // build the decoder in a stack temporary anchored at the file's current position
+   vorbis_init(&temp, alloc);
+   temp.f = file;
+   temp.f_start = (uint32) ftell(file);
+   temp.stream_len   = length;
+   temp.close_on_free = close_on_free;
+
+   // once start_decoder succeeds, copy the state into its final allocation
+   if (start_decoder(&temp) && (decoder = vorbis_alloc(&temp)) != NULL) {
+      *decoder = temp;
+      vorbis_pump_first_frame(decoder);
+      return decoder;
+   }
+   if (error) *error = temp.error;
+   vorbis_deinit(&temp);
+   return NULL;
+}
+
+stb_vorbis * stb_vorbis_open_file(FILE *file, int close_on_free, int *error, const stb_vorbis_alloc *alloc)
+{
+   // measure the bytes from the current position to end-of-file, then seek back
+   unsigned int origin = (unsigned int) ftell(file), remaining;
+   fseek(file, 0, SEEK_END);
+   remaining = (unsigned int) (ftell(file) - origin);
+   fseek(file, origin, SEEK_SET);
+   return stb_vorbis_open_file_section(file, close_on_free, error, alloc, remaining);
+}
+
+stb_vorbis * stb_vorbis_open_filename(const char *filename, int *error, const stb_vorbis_alloc *alloc)
+{
+   FILE *handle = fopen(filename, "rb");
+   // report a failed fopen through *error when the caller asked for it
+   if (handle == NULL && error) *error = VORBIS_file_open_failure;
+   // TRUE is passed as close_on_free for the handle opened here
+   return handle ? stb_vorbis_open_file(handle, TRUE, error, alloc) : NULL;
+}
+#endif // STB_VORBIS_NO_STDIO
+
+stb_vorbis * stb_vorbis_open_memory(const unsigned char *data, int len, int *error, const stb_vorbis_alloc *alloc)
+{
+   stb_vorbis *decoder = NULL, temp;
+   uint8 *base = (uint8 *) data;
+   // nothing to decode; note that *error is left untouched on this path
+   if (data == NULL) return NULL;
+   // describe the in-memory stream [base, base+len) in a stack temporary
+   vorbis_init(&temp, alloc);
+   temp.stream = base;
+   temp.stream_end = base + len;
+   temp.stream_start = base;
+   temp.stream_len = len;
+   temp.push_mode = FALSE;
+   if (start_decoder(&temp) && (decoder = vorbis_alloc(&temp)) != NULL) {
+      *decoder = temp;
+      vorbis_pump_first_frame(decoder);
+      return decoder;
+   }
+   if (error) *error = temp.error;
+   vorbis_deinit(&temp);
+   return NULL;
+}
+
+#ifndef STB_VORBIS_NO_INTEGER_CONVERSION
+#define PLAYBACK_MONO     1
+#define PLAYBACK_LEFT     2
+#define PLAYBACK_RIGHT    4
+// L/C/R: bit sets naming which of the mono/left/right mixes a channel feeds
+#define L  (PLAYBACK_LEFT  | PLAYBACK_MONO)
+#define C  (PLAYBACK_LEFT  | PLAYBACK_RIGHT | PLAYBACK_MONO)
+#define R  (PLAYBACK_RIGHT | PLAYBACK_MONO)
+// channel_position[n][j]: playback flags of channel j in an n-channel stream
+static int8 channel_position[7][6] =
+{
+   { 0 },                 // 0 channels: unused row
+   { C },                 // 1 channel
+   { L, R },              // 2 channels
+   { L, C, R },           // 3 channels
+   { L, R, L, R },        // 4 channels
+   { L, C, R, L, R },     // 5 channels
+   { L, C, R, L, R, C },  // 6 channels
+};
+
+
+#ifndef STB_VORBIS_NO_FAST_SCALED_FLOAT
+   typedef union { // lets the bits of a float be read back as an int
+      float f;
+      int i;
+   } float_conv;
+   typedef char stb_vorbis_float_size_test[sizeof(float)==4 && sizeof(int) == 4]; // array size is 1 only if float and int are both 4 bytes
+   #define FASTDEF(x) float_conv x
+   // add (1<<23) to convert to int, then divide by 2^SHIFT, then add 0.5/2^SHIFT to round
+   #define MAGIC(SHIFT) (1.5f * (1 << (23-SHIFT)) + 0.5f/(1 << SHIFT))
+   #define ADDEND(SHIFT) (((150-SHIFT) << 23) + (1 << 22))
+   #define FAST_SCALED_FLOAT_TO_INT(temp,x,s) (temp.f = (x) + MAGIC(s), temp.i - ADDEND(s))
+   #define check_endianness()  
+#else // portable fallback: multiply by 2^s and truncate with a plain cast
+   #define FAST_SCALED_FLOAT_TO_INT(temp,x,s) ((int) ((x) * (1 << (s))))
+   #define check_endianness()
+   #define FASTDEF(x)
+#endif // both variants of check_endianness() expand to nothing
+
+static void copy_samples(short *dest, float *src, int len)
+{
+   int n = 0;
+   check_endianness();
+   while (n < len) {
+      FASTDEF(conv);
+      int scaled = FAST_SCALED_FLOAT_TO_INT(conv, src[n],15);
+      if (scaled < -32768) scaled = -32768; // saturate to the int16 range
+      else if (scaled > 32767) scaled = 32767;
+      dest[n++] = scaled;
+   }
+}
+
+static void compute_samples(int mask, short *output, int num_c, float **data, int d_offset, int len)
+{
+   #define BUFFER_SIZE  32
+   float mix[BUFFER_SIZE];
+   int s,ch,base,chunk = BUFFER_SIZE;
+   check_endianness();
+   for (base = 0; base < len; base += BUFFER_SIZE) { // at most BUFFER_SIZE samples per pass
+      if (base + chunk > len) chunk = len - base;
+      memset(mix, 0, sizeof(mix));
+      // accumulate every source channel whose position matches the mask
+      for (ch=0; ch < num_c; ++ch) {
+         if (!(channel_position[num_c][ch] & mask)) continue;
+         for (s=0; s < chunk; ++s)
+            mix[s] += data[ch][d_offset+base+s];
+      }
+      for (s=0; s < chunk; ++s) {
+         FASTDEF(conv);
+         int scaled = FAST_SCALED_FLOAT_TO_INT(conv,mix[s],15);
+         if (scaled < -32768) scaled = -32768; // saturate to the int16 range
+         else if (scaled > 32767) scaled = 32767;
+         output[base+s] = scaled;
+      }
+   }
+}
+
+// Mix num_c planar float channels down to interleaved stereo 16-bit output.
+// Positions come from channel_position[num_c]: left-only and right-only
+// channels feed one side, channels flagged both left and right feed both.
+static void compute_stereo_samples(short *output, int num_c, float **data, int d_offset, int len)
+{
+   #define BUFFER_SIZE  32
+   float mix[BUFFER_SIZE];
+   int s,ch,src,chunk = BUFFER_SIZE >> 1;
+   check_endianness();
+
+   // src counts source frames; each frame yields two interleaved outputs,
+   // so one pass covers at most BUFFER_SIZE/2 frames
+   for (src = 0; src < len; src += BUFFER_SIZE >> 1) {
+      int dst = src << 1; // matching offset in the interleaved output
+      memset(mix, 0, sizeof(mix));
+      if (src + chunk > len) chunk = len - src;
+      for (ch=0; ch < num_c; ++ch) {
+         int side = channel_position[num_c][ch] & (PLAYBACK_LEFT | PLAYBACK_RIGHT);
+         if (side & PLAYBACK_LEFT) {  // even slots hold the left channel
+            for (s=0; s < chunk; ++s)
+               mix[s*2+0] += data[ch][d_offset+src+s];
+         }
+         if (side & PLAYBACK_RIGHT) { // odd slots hold the right channel
+            for (s=0; s < chunk; ++s)
+               mix[s*2+1] += data[ch][d_offset+src+s];
+         }
+      }
+
+      // convert the mixed chunk to saturated 16-bit samples
+      for (s=0; s < (chunk<<1); ++s) {
+         FASTDEF(conv);
+         int scaled = FAST_SCALED_FLOAT_TO_INT(conv,mix[s],15);
+         if (scaled < -32768) scaled = -32768;
+         else if (scaled > 32767) scaled = 32767;
+         output[dst+s] = scaled;
+      }
+   }
+}
+
+static void convert_samples_short(int buf_c, short **buffer, int b_offset, int data_c, float **data, int d_offset, int samples)
+{
+   static int channel_selector[3][2] = { {0}, {PLAYBACK_MONO}, {PLAYBACK_LEFT, PLAYBACK_RIGHT} };
+   int ch, shared;
+   if (buf_c != data_c && buf_c <= 2 && data_c <= 6) { // downmix to mono or left/right
+      for (ch=0; ch < buf_c; ++ch)
+         compute_samples(channel_selector[buf_c][ch], buffer[ch]+b_offset, data_c, data, d_offset, samples);
+      return;
+   }
+   // otherwise copy channels one-to-one and zero-fill any extra outputs
+   shared = (data_c < buf_c) ? data_c : buf_c;
+   for (ch=0; ch < buf_c; ++ch)
+      if (ch < shared) copy_samples(buffer[ch]+b_offset, data[ch]+d_offset, samples);
+      else             memset(buffer[ch]+b_offset, 0, sizeof(short) * samples);
+}
+
+int stb_vorbis_get_frame_short(stb_vorbis *f, int num_c, short **buffer, int num_samples)
+{
+   float **decoded;
+   int count = stb_vorbis_get_frame_float(f, NULL, &decoded);
+   if (count > num_samples) count = num_samples; // clamp to the caller's capacity
+   if (count == 0) return 0;
+   convert_samples_short(num_c, buffer, 0, f->channels, decoded, 0, count);
+   return count;
+}
+
+// Interleave 'len' frames of planar float data as buf_c-channel 16-bit output.
+static void convert_channels_short_interleaved(int buf_c, short *buffer, int data_c, float **data, int d_offset, int len)
+{
+   int i;
+   check_endianness();
+   if (buf_c != data_c && buf_c <= 2 && data_c <= 6) {
+      // downmix: one output is a mono mix; one stereo call fills both channels
+      if (buf_c == 1)      compute_samples(PLAYBACK_MONO, buffer, data_c, data, d_offset, len);
+      else if (buf_c == 2) compute_stereo_samples(buffer, data_c, data, d_offset, len);
+   } else {
+      int j, limit = buf_c < data_c ? buf_c : data_c;
+      for (j=0; j < len; ++j) {
+         for (i=0; i < limit; ++i) {
+            FASTDEF(temp);
+            float f = data[i][d_offset+j];
+            int v = FAST_SCALED_FLOAT_TO_INT(temp, f,15);
+            if ((unsigned int) (v + 32768) > 65535)
+               v = v < 0 ? -32768 : 32767; // saturate to the int16 range
+            *buffer++ = v;
+         }
+         for (   ; i < buf_c; ++i)
+            *buffer++ = 0; // no source channel for this output: emit silence
+      }
+   }
+}
+
+int stb_vorbis_get_frame_short_interleaved(stb_vorbis *f, int num_c, short *buffer, int num_shorts)
+{
+   float **decoded;
+   int frames;
+   // a single channel needs no interleaving, so reuse the planar path
+   if (num_c == 1) return stb_vorbis_get_frame_short(f,num_c,&buffer, num_shorts);
+   frames = stb_vorbis_get_frame_float(f, NULL, &decoded);
+   if (frames == 0) return 0;
+   if (frames*num_c > num_shorts) frames = num_shorts / num_c; // whole frames that fit
+   convert_channels_short_interleaved(num_c, buffer, f->channels, decoded, 0, frames);
+   return frames;
+}
+
+int stb_vorbis_get_samples_short_interleaved(stb_vorbis *f, int channels, short *buffer, int num_shorts)
+{
+   float **ignored;
+   int wanted = num_shorts / channels; // whole frames that fit in the buffer
+   int done = 0;
+   while (done < wanted) {
+      // drain what is left of the current frame, capped at what is still needed
+      int avail = f->channel_buffer_end - f->channel_buffer_start;
+      if (done+avail >= wanted) avail = wanted - done;
+      if (avail != 0)
+         convert_channels_short_interleaved(channels, buffer, f->channels, f->channel_buffers, f->channel_buffer_start, avail);
+      buffer += avail*channels;
+      done += avail;
+      f->channel_buffer_start += avail;
+      // decode another frame only if more is needed; stop when none is left
+      if (done == wanted || !stb_vorbis_get_frame_float(f, NULL, &ignored))
+         break;
+   }
+   return done;
+}
+
+int stb_vorbis_get_samples_short(stb_vorbis *f, int channels, short **buffer, int len)
+{
+   float **ignored;
+   int done = 0;
+   while (done < len) {
+      // take the rest of the current frame, capped at what is still needed
+      int avail = f->channel_buffer_end - f->channel_buffer_start;
+      if (done+avail >= len) avail = len - done;
+      if (avail != 0)
+         convert_samples_short(channels, buffer, done, f->channels, f->channel_buffers, f->channel_buffer_start, avail);
+      done += avail;
+      f->channel_buffer_start += avail;
+      // decode another frame only if more is needed; stop when none is left
+      if (done == len || !stb_vorbis_get_frame_float(f, NULL, &ignored))
+         break;
+   }
+   return done;
+}
+
+#ifndef STB_VORBIS_NO_STDIO
+int stb_vorbis_decode_filename(const char *filename, int *channels, int *sample_rate, short **output)
+{
+   int frames = 0, used = 0, capacity, headroom, open_error;
+   short *pcm, *grown;
+   stb_vorbis *v = stb_vorbis_open_filename(filename, &open_error, NULL);
+   if (v == NULL) return -1;
+   // report the stream format to the caller (sample_rate is optional)
+   *channels = v->channels;
+   if (sample_rate)
+      *sample_rate = v->sample_rate;
+   // the buffer grows whenever fewer than 'headroom' shorts remain free
+   headroom = v->channels * 4096;
+   capacity = headroom;
+   pcm = (short *) malloc(capacity * sizeof(*pcm));
+   if (pcm == NULL) {
+      stb_vorbis_close(v);
+      return -2;
+   }
+   for (;;) {
+      int got = stb_vorbis_get_frame_short_interleaved(v, v->channels, pcm+used, capacity-used);
+      if (got == 0) break;
+      frames += got;
+      used += got * v->channels;
+      if (used + headroom <= capacity) continue;
+      // double the buffer; on failure release everything and report -2
+      capacity *= 2;
+      grown = (short *) realloc(pcm, capacity * sizeof(*pcm));
+      if (grown == NULL) {
+         free(pcm);
+         stb_vorbis_close(v);
+         return -2;
+      }
+      pcm = grown;
+   }
+   *output = pcm;
+   stb_vorbis_close(v);
+   return frames;
+}
+#endif // NO_STDIO
+
+int stb_vorbis_decode_memory(const uint8 *mem, int len, int *channels, int *sample_rate, short **output)
+{
+   int frames = 0, used = 0, capacity, headroom, open_error;
+   short *pcm, *grown;
+   stb_vorbis *v = stb_vorbis_open_memory(mem, len, &open_error, NULL);
+   if (v == NULL) return -1;
+   // report the stream format to the caller (sample_rate is optional)
+   *channels = v->channels;
+   if (sample_rate)
+      *sample_rate = v->sample_rate;
+   // the buffer grows whenever fewer than 'headroom' shorts remain free
+   headroom = v->channels * 4096;
+   capacity = headroom;
+   pcm = (short *) malloc(capacity * sizeof(*pcm));
+   if (pcm == NULL) {
+      stb_vorbis_close(v);
+      return -2;
+   }
+   for (;;) {
+      int got = stb_vorbis_get_frame_short_interleaved(v, v->channels, pcm+used, capacity-used);
+      if (got == 0) break;
+      frames += got;
+      used += got * v->channels;
+      if (used + headroom <= capacity) continue;
+      // double the buffer; on failure release everything and report -2
+      capacity *= 2;
+      grown = (short *) realloc(pcm, capacity * sizeof(*pcm));
+      if (grown == NULL) {
+         free(pcm);
+         stb_vorbis_close(v);
+         return -2;
+      }
+      pcm = grown;
+   }
+   *output = pcm;
+   stb_vorbis_close(v);
+   return frames;
+}
+#endif // STB_VORBIS_NO_INTEGER_CONVERSION
+
+int stb_vorbis_get_samples_float_interleaved(stb_vorbis *f, int channels, float *buffer, int num_floats)
+{
+   float **ignored;
+   int wanted = num_floats / channels; // whole frames that fit in the buffer
+   int done = 0;
+   int shared = (f->channels > channels) ? channels : f->channels;
+
+   while (done < wanted) {
+      int ch, s;
+      // take the rest of the current frame, capped at what is still needed
+      int avail = f->channel_buffer_end - f->channel_buffer_start;
+      if (done+avail >= wanted) avail = wanted - done;
+      for (s=0; s < avail; ++s) {
+         // decoded channels first, then zeros for outputs with no source
+         for (ch=0; ch < channels; ++ch)
+            *buffer++ = (ch < shared) ? f->channel_buffers[ch][f->channel_buffer_start+s] : 0;
+      }
+      done += avail;
+      f->channel_buffer_start += avail;
+      // decode another frame only if more is needed; stop when none is left
+      if (done == wanted || !stb_vorbis_get_frame_float(f, NULL, &ignored))
+         break;
+   }
+
+   return done;
+}
+
+int stb_vorbis_get_samples_float(stb_vorbis *f, int channels, float **buffer, int num_samples)
+{
+   float **ignored;
+   int done = 0;
+   int shared = (f->channels > channels) ? channels : f->channels;
+
+   while (done < num_samples) {
+      int ch;
+      // take the rest of the current frame, capped at what is still needed
+      int avail = f->channel_buffer_end - f->channel_buffer_start;
+      if (done+avail >= num_samples) avail = num_samples - done;
+      for (ch=0; avail != 0 && ch < channels; ++ch) {
+         if (ch < shared) // copy a decoded channel
+            memcpy(buffer[ch]+done, f->channel_buffers[ch]+f->channel_buffer_start, sizeof(float)*avail);
+         else             // no source for this output: fill with zeros
+            memset(buffer[ch]+done, 0, sizeof(float) * avail);
+      }
+      done += avail;
+      f->channel_buffer_start += avail;
+      // decode another frame only if more is needed; stop when none is left
+      if (done == num_samples || !stb_vorbis_get_frame_float(f, NULL, &ignored))
+         break;
+   }
+   return done;
+}
+#endif // STB_VORBIS_NO_PULLDATA_API
+
+/* Version history
+    1.09    - 2016/04/04 - back out 'avoid discarding last frame' fix from previous version
+    1.08    - 2016/04/02 - fixed multiple warnings; fix setup memory leaks;
+                           avoid discarding last frame of audio data
+    1.07    - 2015/01/16 - fixed some warnings, fix mingw, const-correct API
+                           some more crash fixes when out of memory or with corrupt files 
+    1.06    - 2015/08/31 - full, correct support for seeking API (Dougall Johnson)
+                           some crash fixes when out of memory or with corrupt files
+    1.05    - 2015/04/19 - don't define __forceinline if it's redundant
+    1.04    - 2014/08/27 - fix missing const-correct case in API
+    1.03    - 2014/08/07 - Warning fixes
+    1.02    - 2014/07/09 - Declare qsort compare function _cdecl on windows
+    1.01    - 2014/06/18 - fix stb_vorbis_get_samples_float
+    1.0     - 2014/05/26 - fix memory leaks; fix warnings; fix bugs in multichannel
+                           (API change) report sample rate for decode-full-file funcs
+    0.99996 - bracket #include <malloc.h> for macintosh compilation by Laurent Gomila
+    0.99995 - use union instead of pointer-cast for fast-float-to-int to avoid alias-optimization problem
+    0.99994 - change fast-float-to-int to work in single-precision FPU mode, remove endian-dependence
+    0.99993 - remove assert that fired on legal files with empty tables
+    0.99992 - rewind-to-start
+    0.99991 - bugfix to stb_vorbis_get_samples_short by Bernhard Wodo
+    0.9999 - (should have been 0.99990) fix no-CRT support, compiling as C++
+    0.9998 - add a full-decode function with a memory source
+    0.9997 - fix a bug in the read-from-FILE case in 0.9996 addition
+    0.9996 - query length of vorbis stream in samples/seconds
+    0.9995 - bugfix to another optimization that only happened in certain files
+    0.9994 - bugfix to one of the optimizations that caused significant (but inaudible?) errors
+    0.9993 - performance improvements; runs in 99% to 104% of time of reference implementation
+    0.9992 - performance improvement of IMDCT; now performs close to reference implementation
+    0.9991 - performance improvement of IMDCT
+    0.999 - (should have been 0.9990) performance improvement of IMDCT
+    0.998 - no-CRT support from Casey Muratori
+    0.997 - bugfixes for bugs found by Terje Mathisen
+    0.996 - bugfix: fast-huffman decode initialized incorrectly for sparse codebooks; fixing gives 10% speedup - found by Terje Mathisen
+    0.995 - bugfix: fix to 'effective' overrun detection - found by Terje Mathisen
+    0.994 - bugfix: garbage decode on final VQ symbol of a non-multiple - found by Terje Mathisen
+    0.993 - bugfix: pushdata API required 1 extra byte for empty page (failed to consume final page if empty) - found by Terje Mathisen
+    0.992 - fixes for MinGW warning
+    0.991 - turn fast-float-conversion on by default
+    0.990 - fix push-mode seek recovery if you seek into the headers
+    0.98b - fix to bad release of 0.98
+    0.98 - fix push-mode seek recovery; robustify float-to-int and support non-fast mode
+    0.97 - builds under c++ (typecasting, don't use 'class' keyword)
+    0.96 - somehow MY 0.95 was right, but the web one was wrong, so here's my 0.95 rereleased as 0.96, fixes a typo in the clamping code
+    0.95 - clamping code for 16-bit functions
+    0.94 - not publicly released
+    0.93 - fixed all-zero-floor case (was decoding garbage)
+    0.92 - fixed a memory leak
+    0.91 - conditional compiles to omit parts of the API and the infrastructure to support them: STB_VORBIS_NO_PULLDATA_API, STB_VORBIS_NO_PUSHDATA_API, STB_VORBIS_NO_STDIO, STB_VORBIS_NO_INTEGER_CONVERSION
+    0.90 - first public release
+*/
+
+#endif // STB_VORBIS_HEADER_ONLY
diff --git a/thirdparty/misc/triangulator.cpp b/thirdparty/misc/triangulator.cpp
new file mode 100644
index 0000000000..75b2b064c4
--- /dev/null
+++ b/thirdparty/misc/triangulator.cpp
@@ -0,0 +1,1550 @@
+//Copyright (C) 2011 by Ivan Fratric
+//
+//Permission is hereby granted, free of charge, to any person obtaining a copy
+//of this software and associated documentation files (the "Software"), to deal
+//in the Software without restriction, including without limitation the rights
+//to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+//copies of the Software, and to permit persons to whom the Software is
+//furnished to do so, subject to the following conditions:
+//
+//The above copyright notice and this permission notice shall be included in
+//all copies or substantial portions of the Software.
+//
+//THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+//OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+//THE SOFTWARE.
+
+
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+#include "triangulator.h"
+
+
+// Vertex classifications assigned to each vertex by MonotonePartition()
+// before the sweep; the sweep dispatches on these values.
+#define TRIANGULATOR_VERTEXTYPE_REGULAR 0
+#define TRIANGULATOR_VERTEXTYPE_START 1
+#define TRIANGULATOR_VERTEXTYPE_END 2
+#define TRIANGULATOR_VERTEXTYPE_SPLIT 3
+#define TRIANGULATOR_VERTEXTYPE_MERGE 4
+
+// Constructs an empty polygon: no point storage, zero points, not a hole.
+TriangulatorPoly::TriangulatorPoly() {
+	points = NULL;
+	numpoints = 0;
+	hole = false;
+}
+
+// Releases the point array. delete[] on a null pointer is a no-op, so an
+// empty polygon needs no special handling.
+TriangulatorPoly::~TriangulatorPoly() {
+	delete [] points;
+}
+
+// Frees the point storage and returns the polygon to its empty,
+// non-hole state (same state as a default-constructed polygon).
+void TriangulatorPoly::Clear() {
+	delete [] points;
+	points = NULL;
+	numpoints = 0;
+	hole = false;
+}
+
+// Discards the current contents (Clear() also resets the hole flag) and
+// allocates storage for `numpoints` vertices. The new vertices are left for
+// the caller to fill in.
+void TriangulatorPoly::Init(long numpoints) {
+	Clear();
+	// The parameter shadows the member, so qualify the member explicitly.
+	TriangulatorPoly::numpoints = numpoints;
+	points = new Vector2[numpoints];
+}
+
+// Replaces the polygon's contents with the triangle (p1, p2, p3), stored in
+// that order.
+void TriangulatorPoly::Triangle(Vector2 &p1, Vector2 &p2, Vector2 &p3) {
+	Init(3);
+	Vector2 *corner = points;
+	corner[0] = p1;
+	corner[1] = p2;
+	corner[2] = p3;
+}
+
+// Copy constructor: deep-copies the hole flag and the point array of `src`.
+TriangulatorPoly::TriangulatorPoly(const TriangulatorPoly &src) {
+	hole = src.hole;
+	numpoints = src.numpoints;
+	points = new Vector2[numpoints];
+	// An empty source has a NULL point array. Passing NULL to memcpy is
+	// undefined behaviour even with a zero length, so only copy real data.
+	if(numpoints > 0) {
+		memcpy(points, src.points, numpoints*sizeof(Vector2));
+	}
+}
+
+// Copy assignment: replaces this polygon's contents with a deep copy of
+// `src` (hole flag and points). Returns *this.
+TriangulatorPoly& TriangulatorPoly::operator=(const TriangulatorPoly &src) {
+	// Self-assignment must be a no-op: Clear() below would free the very
+	// points we are about to copy and silently leave the polygon empty.
+	if(this == &src) return *this;
+
+	Clear();
+	hole = src.hole;
+	numpoints = src.numpoints;
+	points = new Vector2[numpoints];
+	// An empty source has a NULL point array. Passing NULL to memcpy is
+	// undefined behaviour even with a zero length, so only copy real data.
+	if(numpoints > 0) {
+		memcpy(points, src.points, numpoints*sizeof(Vector2));
+	}
+	return *this;
+}
+
+// Returns TRIANGULATOR_CCW when the shoelace sum over the polygon's edges is
+// positive, TRIANGULATOR_CW when it is negative, and 0 when it is exactly
+// zero (including the empty polygon).
+int TriangulatorPoly::GetOrientation() {
+	real_t shoelace = 0;
+	for(long cur=0; cur<numpoints; cur++) {
+		// Successor index, wrapping from the last vertex back to the first.
+		long nxt = cur+1;
+		if(nxt == numpoints) nxt = 0;
+		shoelace += points[cur].x * points[nxt].y - points[cur].y * points[nxt].x;
+	}
+	if(shoelace>0) return TRIANGULATOR_CCW;
+	if(shoelace<0) return TRIANGULATOR_CW;
+	return 0;
+}
+
+// Reverses the vertex order if the polygon's current orientation differs
+// from `orientation`. A polygon whose orientation is 0 (zero shoelace sum)
+// is left untouched.
+void TriangulatorPoly::SetOrientation(int orientation) {
+	const int current = GetOrientation();
+	if(current == 0) return;
+	if(current != orientation) Invert();
+}
+
+// Reverses the order of the polygon's vertices by building a reversed copy
+// and swapping it in for the old array.
+void TriangulatorPoly::Invert() {
+	Vector2 *reversed = new Vector2[numpoints];
+
+	long src = numpoints;
+	for(long dst=0; dst<numpoints; dst++) {
+		src--;
+		reversed[dst] = points[src];
+	}
+
+	delete [] points;
+	points = reversed;
+}
+
+// Returns p scaled to unit length, or the zero vector when p has zero length.
+Vector2 TriangulatorPartition::Normalize(const Vector2 &p) {
+	Vector2 unit;
+	const real_t len = sqrt(p.x*p.x + p.y*p.y);
+	if(len == 0) {
+		// Avoid dividing by zero for a degenerate vector.
+		unit.x = 0;
+		unit.y = 0;
+		return unit;
+	}
+	unit = p/len;
+	return unit;
+}
+
+// Returns the Euclidean distance between p1 and p2.
+real_t TriangulatorPartition::Distance(const Vector2 &p1, const Vector2 &p2) {
+	const real_t dx = p2.x - p1.x;
+	const real_t dy = p2.y - p1.y;
+	return sqrt(dx*dx + dy*dy);
+}
+
+//checks whether segment (p11,p12) intersects segment (p21,p22)
+//returns 0 when the segments share an endpoint, or when both endpoints of
+//one segment lie strictly on the same side of the line through the other;
+//returns 1 otherwise
+int TriangulatorPartition::Intersects(Vector2 &p11, Vector2 &p12, Vector2 &p21, Vector2 &p22) {
+	//segments that merely touch at a shared endpoint do not count
+	const bool sharedEndpoint =
+		((p11.x == p21.x)&&(p11.y == p21.y)) ||
+		((p11.x == p22.x)&&(p11.y == p22.y)) ||
+		((p12.x == p21.x)&&(p12.y == p21.y)) ||
+		((p12.x == p22.x)&&(p12.y == p22.y));
+	if(sharedEndpoint) return 0;
+
+	//vectors perpendicular to each segment
+	Vector2 normal1,normal2,offset;
+	normal1.x = p12.y-p11.y;
+	normal1.y = p11.x-p12.x;
+	normal2.x = p22.y-p21.y;
+	normal2.y = p21.x-p22.x;
+
+	//side of segment 1's line on which each endpoint of segment 2 lies
+	offset = p21-p11;
+	const real_t side21 = offset.x*normal1.x + offset.y*normal1.y;
+	offset = p22-p11;
+	const real_t side22 = offset.x*normal1.x + offset.y*normal1.y;
+
+	//side of segment 2's line on which each endpoint of segment 1 lies
+	offset = p11-p21;
+	const real_t side11 = offset.x*normal2.x + offset.y*normal2.y;
+	offset = p12-p21;
+	const real_t side12 = offset.x*normal2.x + offset.y*normal2.y;
+
+	//a strictly positive product means both endpoints are on the same side
+	if((side11*side12>0)||(side21*side22>0)) return 0;
+
+	return 1;
+}
+
+//removes holes from inpolys by merging them with non-holes
+//each iteration picks the hole vertex with the largest x, looks for a vertex
+//of a non-hole polygon strictly to its right that is visible from it, and
+//splices the hole into that polygon through a two-way bridge between the
+//two vertices; the loop ends when no polygon is marked as a hole any more
+//the resulting polygons are appended to outpolys
+//returns 1 on success, 0 when no visible vertex can be found for a hole
+int TriangulatorPartition::RemoveHoles(List<TriangulatorPoly> *inpolys, List<TriangulatorPoly> *outpolys) {
+	List<TriangulatorPoly> polys;
+	List<TriangulatorPoly>::Element *holeiter,*polyiter,*iter,*iter2;
+	long i,i2,holepointindex,polypointindex;
+	Vector2 holepoint,polypoint,bestpolypoint;
+	Vector2 linep1,linep2;
+	Vector2 v1,v2;
+	TriangulatorPoly newpoly;
+	bool hasholes;
+	bool pointvisible;
+	bool pointfound;
+
+	//check for trivial case (no holes)
+	hasholes = false;
+	for(iter = inpolys->front(); iter; iter=iter->next()) {
+		if(iter->get().IsHole()) {
+			hasholes = true;
+			break;
+		}
+	}
+	if(!hasholes) {
+		//nothing to merge: copy the input straight to the output
+		for(iter = inpolys->front(); iter; iter=iter->next()) {
+			outpolys->push_back(iter->get());
+		}
+		return 1;
+	}
+
+	//work on a copy so the caller's list is not modified
+	polys = *inpolys;
+
+	while(1) {
+		//find the hole point with the largest x
+		hasholes = false;
+		for(iter = polys.front(); iter; iter=iter->next()) {
+			if(!iter->get().IsHole()) continue;
+
+			if(!hasholes) {
+				//first hole seen: seed the search with its first point
+				hasholes = true;
+				holeiter = iter;
+				holepointindex = 0;
+			}
+
+			for(i=0; i < iter->get().GetNumPoints(); i++) {
+				if(iter->get().GetPoint(i).x > holeiter->get().GetPoint(holepointindex).x) {
+					holeiter = iter;
+					holepointindex = i;
+				}
+			}
+		}
+		//all holes have been merged
+		if(!hasholes) break;
+		holepoint = holeiter->get().GetPoint(holepointindex);
+
+		//search the non-hole polygons for the vertex to bridge to
+		pointfound = false;
+		for(iter = polys.front(); iter; iter=iter->next()) {
+			if(iter->get().IsHole()) continue;
+			for(i=0; i < iter->get().GetNumPoints(); i++) {
+				//only vertices strictly to the right of the hole point qualify
+				if(iter->get().GetPoint(i).x <= holepoint.x) continue;
+				//the hole point must lie inside the cone formed at this vertex by its two neighbours
+				if(!InCone(iter->get().GetPoint((i+iter->get().GetNumPoints()-1)%(iter->get().GetNumPoints())),
+					   iter->get().GetPoint(i),
+					   iter->get().GetPoint((i+1)%(iter->get().GetNumPoints())),
+					   holepoint))
+					continue;
+				polypoint = iter->get().GetPoint(i);
+				if(pointfound) {
+					//skip candidates whose unit direction from the hole point has a
+					//smaller x component than the best one found so far
+					v1 = Normalize(polypoint-holepoint);
+					v2 = Normalize(bestpolypoint-holepoint);
+					if(v2.x > v1.x) continue;
+				}
+				//the bridge must not cross any edge of any non-hole polygon
+				pointvisible = true;
+				for(iter2 = polys.front(); iter2; iter2=iter2->next()) {
+					if(iter2->get().IsHole()) continue;
+					for(i2=0; i2 < iter2->get().GetNumPoints(); i2++) {
+						linep1 = iter2->get().GetPoint(i2);
+						linep2 = iter2->get().GetPoint((i2+1)%(iter2->get().GetNumPoints()));
+						if(Intersects(holepoint,polypoint,linep1,linep2)) {
+							pointvisible = false;
+							break;
+						}
+					}
+					if(!pointvisible) break;
+				}
+				if(pointvisible) {
+					pointfound = true;
+					bestpolypoint = polypoint;
+					polyiter = iter;
+					polypointindex = i;
+				}
+			}
+		}
+
+		if(!pointfound) return 0;
+
+		//build the merged polygon; both bridge endpoints appear twice, hence +2
+		newpoly.Init(holeiter->get().GetNumPoints() + polyiter->get().GetNumPoints() + 2);
+		i2 = 0;
+		//outer polygon up to and including the bridge vertex
+		for(i=0;i<=polypointindex;i++) {
+			newpoly[i2] = polyiter->get().GetPoint(i);
+			i2++;
+		}
+		//the whole hole, starting and ending at the hole's bridge vertex
+		for(i=0;i<=holeiter->get().GetNumPoints();i++) {
+			newpoly[i2] = holeiter->get().GetPoint((i+holepointindex)%holeiter->get().GetNumPoints());
+			i2++;
+		}
+		//rest of the outer polygon, starting again at the bridge vertex
+		for(i=polypointindex;i<polyiter->get().GetNumPoints();i++) {
+			newpoly[i2] = polyiter->get().GetPoint(i);
+			i2++;
+		}
+
+		//replace the hole and the outer polygon by the merged polygon
+		polys.erase(holeiter);
+		polys.erase(polyiter);
+		polys.push_back(newpoly);
+	}
+
+	for(iter = polys.front(); iter; iter=iter->next()) {
+		outpolys->push_back(iter->get());
+	}
+
+	return 1;
+}
+
+// Returns true when the cross product (p2-p1) x (p3-p1) is strictly
+// positive; collinear points yield false.
+bool TriangulatorPartition::IsConvex(Vector2& p1, Vector2& p2, Vector2& p3) {
+	const real_t cross = (p3.y-p1.y)*(p2.x-p1.x)-(p3.x-p1.x)*(p2.y-p1.y);
+	return cross>0;
+}
+
+// Returns true when the cross product (p2-p1) x (p3-p1) is strictly
+// negative; collinear points yield false.
+bool TriangulatorPartition::IsReflex(Vector2& p1, Vector2& p2, Vector2& p3) {
+	const real_t cross = (p3.y-p1.y)*(p2.x-p1.x)-(p3.x-p1.x)*(p2.y-p1.y);
+	return cross<0;
+}
+
+// Returns true when none of the three tests IsConvex(p1,p,p2),
+// IsConvex(p2,p,p3), IsConvex(p3,p,p1) succeeds, which is how this file
+// decides that p lies inside (or on the border of) triangle (p1,p2,p3).
+bool TriangulatorPartition::IsInside(Vector2& p1, Vector2& p2, Vector2& p3, Vector2 &p) {
+	return !IsConvex(p1,p,p2)
+		&& !IsConvex(p2,p,p3)
+		&& !IsConvex(p3,p,p1);
+}
+
+// Tests whether p lies inside the cone with apex p2 bounded by the edges
+// p1->p2 and p2->p3. At a convex apex p must pass both edge tests; at a
+// non-convex apex passing either one is enough.
+bool TriangulatorPartition::InCone(Vector2 &p1, Vector2 &p2, Vector2 &p3, Vector2 &p) {
+	if(IsConvex(p1,p2,p3)) {
+		return IsConvex(p1,p2,p) && IsConvex(p2,p3,p);
+	}
+	return IsConvex(p1,p2,p) || IsConvex(p2,p3,p);
+}
+
+// Convenience overload: tests whether p lies inside the cone at vertex v
+// formed with v's previous and next neighbours.
+bool TriangulatorPartition::InCone(PartitionVertex *v, Vector2 &p) {
+	// Work on copies, as the point-based overload takes non-const references.
+	Vector2 before = v->previous->p;
+	Vector2 apex = v->p;
+	Vector2 after = v->next->p;
+	return InCone(before,apex,after,p);
+}
+
+// Sets v->isConvex to true unless v is strictly reflex with respect to its
+// previous and next neighbours (collinear vertices count as convex here).
+void TriangulatorPartition::UpdateVertexReflexity(PartitionVertex *v) {
+	PartitionVertex *before = v->previous;
+	PartitionVertex *after = v->next;
+	const bool reflex = IsReflex(before->p,v->p,after->p);
+	v->isConvex = !reflex;
+}
+
+// Recomputes the ear-clipping attributes of vertex v from its current
+// neighbours:
+//  - isConvex: strict convexity test on (previous, v, next)
+//  - angle:    dot product of the unit vectors from v to each neighbour
+//  - isEar:    v is convex and no other vertex of `vertices` (all
+//              `numvertices` entries are checked, active or not) lies inside
+//              the triangle (previous, v, next); vertices coinciding with one
+//              of the three corners are ignored
+void TriangulatorPartition::UpdateVertex(PartitionVertex *v, PartitionVertex *vertices, long numvertices) {
+	PartitionVertex *before = v->previous;
+	PartitionVertex *after = v->next;
+
+	v->isConvex = IsConvex(before->p,v->p,after->p);
+
+	Vector2 toBefore = Normalize(before->p - v->p);
+	Vector2 toAfter = Normalize(after->p - v->p);
+	v->angle = toBefore.x*toAfter.x + toBefore.y*toAfter.y;
+
+	// A non-convex vertex can never be an ear.
+	if(!v->isConvex) {
+		v->isEar = false;
+		return;
+	}
+
+	v->isEar = true;
+	for(long idx=0; idx<numvertices; idx++) {
+		Vector2 &candidate = vertices[idx].p;
+		if((candidate.x==v->p.x)&&(candidate.y==v->p.y)) continue;
+		if((candidate.x==before->p.x)&&(candidate.y==before->p.y)) continue;
+		if((candidate.x==after->p.x)&&(candidate.y==after->p.y)) continue;
+		if(IsInside(before->p,v->p,after->p,candidate)) {
+			v->isEar = false;
+			break;
+		}
+	}
+}
+
+//triangulation by ear removal
+//appends the triangles of poly to `triangles`
+//returns 1 on success; returns 0 when poly has fewer than 3 points or when
+//no ear can be found at some step (triangles pushed before the failure
+//remain in the output list)
+int TriangulatorPartition::Triangulate_EC(TriangulatorPoly *poly, List<TriangulatorPoly> *triangles) {
+	long numvertices;
+	PartitionVertex *vertices;
+	PartitionVertex *ear;
+	TriangulatorPoly triangle;
+	long i,j;
+	bool earfound;
+
+	if(poly->GetNumPoints() < 3) return 0;
+	//a triangle is already its own triangulation
+	if(poly->GetNumPoints() == 3) {
+		triangles->push_back(*poly);
+		return 1;
+	}
+
+	numvertices = poly->GetNumPoints();
+
+	//build a circular doubly linked list of the polygon's vertices
+	vertices = new PartitionVertex[numvertices];
+	for(i=0;i<numvertices;i++) {
+		vertices[i].isActive = true;
+		vertices[i].p = poly->GetPoint(i);
+		if(i==(numvertices-1)) vertices[i].next=&(vertices[0]);
+		else vertices[i].next=&(vertices[i+1]);
+		if(i==0) vertices[i].previous = &(vertices[numvertices-1]);
+		else vertices[i].previous = &(vertices[i-1]);
+	}
+	//compute convexity, angle and ear status once all links are in place
+	for(i=0;i<numvertices;i++) {
+		UpdateVertex(&vertices[i],vertices,numvertices);
+	}
+
+	//clip numvertices-3 ears; the last three vertices form the final triangle
+	for(i=0;i<numvertices-3;i++) {
+		earfound = false;
+		//find the most extruded ear
+		//(the active ear with the largest `angle` value set by UpdateVertex)
+		for(j=0;j<numvertices;j++) {
+			if(!vertices[j].isActive) continue;
+			if(!vertices[j].isEar) continue;
+			if(!earfound) {
+				earfound = true;
+				ear = &(vertices[j]);
+			} else {
+				if(vertices[j].angle > ear->angle) {
+					ear = &(vertices[j]);
+				}
+			}
+		}
+		if(!earfound) {
+			delete [] vertices;
+			return 0;
+		}
+
+		triangle.Triangle(ear->previous->p,ear->p,ear->next->p);
+		triangles->push_back(triangle);
+
+		//unlink the ear from the vertex list
+		ear->isActive = false;
+		ear->previous->next = ear->next;
+		ear->next->previous = ear->previous;
+
+		//after the last clip the neighbours need no update
+		if(i==numvertices-4) break;
+
+		//only the two neighbours of the removed ear can change status
+		UpdateVertex(ear->previous,vertices,numvertices);
+		UpdateVertex(ear->next,vertices,numvertices);
+	}
+	//emit the triangle formed by the three remaining active vertices
+	for(i=0;i<numvertices;i++) {
+		if(vertices[i].isActive) {
+			triangle.Triangle(vertices[i].previous->p,vertices[i].p,vertices[i].next->p);
+			triangles->push_back(triangle);
+			break;
+		}
+	}
+
+	delete [] vertices;
+
+	return 1;
+}
+
+// Triangulates a set of polygons by ear clipping: holes are first merged
+// into their enclosing polygons with RemoveHoles(), then every resulting
+// polygon is triangulated. Returns 1 on success, 0 as soon as either step
+// fails (triangles appended before the failure stay in `triangles`).
+int TriangulatorPartition::Triangulate_EC(List<TriangulatorPoly> *inpolys, List<TriangulatorPoly> *triangles) {
+	List<TriangulatorPoly> merged;
+	if(!RemoveHoles(inpolys,&merged)) return 0;
+
+	List<TriangulatorPoly>::Element *it = merged.front();
+	while(it) {
+		if(!Triangulate_EC(&(it->get()),triangles)) return 0;
+		it = it->next();
+	}
+	return 1;
+}
+
+//partitions poly into convex pieces and appends them to `parts`
+//a polygon without reflex vertices is output unchanged; otherwise it is
+//triangulated by ear clipping and neighbouring pieces are merged across a
+//shared diagonal whenever both endpoints of the diagonal stay convex
+//returns 1 on success, 0 when the triangulation fails
+int TriangulatorPartition::ConvexPartition_HM(TriangulatorPoly *poly, List<TriangulatorPoly> *parts) {
+	List<TriangulatorPoly> triangles;
+	List<TriangulatorPoly>::Element *iter1,*iter2;
+	TriangulatorPoly *poly1,*poly2;
+	TriangulatorPoly newpoly;
+	Vector2 d1,d2,p1,p2,p3;
+	long i11,i12,i21,i22,i13,i23,j,k;
+	bool isdiagonal;
+	long numreflex;
+
+	//check if the poly is already convex
+	numreflex = 0;
+	for(i11=0;i11<poly->GetNumPoints();i11++) {
+		if(i11==0) i12 = poly->GetNumPoints()-1;
+		else i12=i11-1;
+		if(i11==(poly->GetNumPoints()-1)) i13=0;
+		else i13=i11+1;
+		if(IsReflex(poly->GetPoint(i12),poly->GetPoint(i11),poly->GetPoint(i13))) {
+			numreflex = 1;
+			break;
+		}
+	}
+	if(numreflex == 0) {
+		parts->push_back(*poly);
+		return 1;
+	}
+
+	if(!Triangulate_EC(poly,&triangles)) return 0;
+
+	//try to merge each piece with later pieces in the list
+	for(iter1 = triangles.front(); iter1 ; iter1=iter1->next()) {
+		poly1 = &(iter1->get());
+		for(i11=0;i11<poly1->GetNumPoints();i11++) {
+			//edge (d1,d2) of poly1
+			d1 = poly1->GetPoint(i11);
+			i12 = (i11+1)%(poly1->GetNumPoints());
+			d2 = poly1->GetPoint(i12);
+
+			//look for a later piece containing the same edge in reverse (d2,d1)
+			isdiagonal = false;
+			for(iter2 = iter1; iter2 ; iter2=iter2->next()) {
+				if(iter1 == iter2) continue;
+				poly2 = &(iter2->get());
+
+				for(i21=0;i21<poly2->GetNumPoints();i21++) {
+					if((d2.x != poly2->GetPoint(i21).x)||(d2.y != poly2->GetPoint(i21).y)) continue;
+					i22 = (i21+1)%(poly2->GetNumPoints());
+					if((d1.x != poly2->GetPoint(i22).x)||(d1.y != poly2->GetPoint(i22).y)) continue;
+					isdiagonal = true;
+					break;
+				}
+				if(isdiagonal) break;
+			}
+
+			if(!isdiagonal) continue;
+
+			//the merged polygon must remain convex at d1 ...
+			p2 = poly1->GetPoint(i11);
+			if(i11 == 0) i13 = poly1->GetNumPoints()-1;
+			else i13 = i11-1;
+			p1 = poly1->GetPoint(i13);
+			if(i22 == (poly2->GetNumPoints()-1)) i23 = 0;
+			else i23 = i22+1;
+			p3 = poly2->GetPoint(i23);
+
+			if(!IsConvex(p1,p2,p3)) continue;
+
+			//... and at d2
+			p2 = poly1->GetPoint(i12);
+			if(i12 == (poly1->GetNumPoints()-1)) i13 = 0;
+			else i13 = i12+1;
+			p3 = poly1->GetPoint(i13);
+			if(i21 == 0) i23 = poly2->GetNumPoints()-1;
+			else i23 = i21-1;
+			p1 = poly2->GetPoint(i23);
+
+			if(!IsConvex(p1,p2,p3)) continue;
+
+			//build the merged polygon; the two shared vertices are stored once each
+			newpoly.Init(poly1->GetNumPoints()+poly2->GetNumPoints()-2);
+			k = 0;
+			for(j=i12;j!=i11;j=(j+1)%(poly1->GetNumPoints())) {
+				newpoly[k] = poly1->GetPoint(j);
+				k++;
+			}
+			for(j=i22;j!=i21;j=(j+1)%(poly2->GetNumPoints())) {
+				newpoly[k] = poly2->GetPoint(j);
+				k++;
+			}
+
+			//replace poly1 by the merged polygon and drop poly2
+			triangles.erase(iter2);
+			iter1->get() = newpoly;
+			poly1 = &(iter1->get());
+			//restart the edge scan of the merged polygon (loop increment makes this 0)
+			i11 = -1;
+
+			continue;
+		}
+	}
+
+	for(iter1 = triangles.front(); iter1 ; iter1=iter1->next()) {
+		parts->push_back(iter1->get());
+	}
+
+	return 1;
+}
+
+// Convex-partitions a set of polygons: holes are first merged into their
+// enclosing polygons with RemoveHoles(), then every resulting polygon is
+// partitioned. Returns 1 on success, 0 as soon as either step fails (parts
+// appended before the failure stay in `parts`).
+int TriangulatorPartition::ConvexPartition_HM(List<TriangulatorPoly> *inpolys, List<TriangulatorPoly> *parts) {
+	List<TriangulatorPoly> merged;
+	if(!RemoveHoles(inpolys,&merged)) return 0;
+
+	List<TriangulatorPoly>::Element *it = merged.front();
+	while(it) {
+		if(!ConvexPartition_HM(&(it->get()),parts)) return 0;
+		it = it->next();
+	}
+	return 1;
+}
+
+//minimum-weight polygon triangulation by dynamic programming
+//O(n^3) time complexity
+//O(n^2) space complexity
+//appends the resulting triangles to `triangles`
+//returns 1 on success; returns 0 when poly has fewer than 3 points or when
+//no valid triangulation is found
+int TriangulatorPartition::Triangulate_OPT(TriangulatorPoly *poly, List<TriangulatorPoly> *triangles) {
+	long i,j,k,gap,n;
+	DPState **dpstates;
+	Vector2 p1,p2,p3,p4;
+	long bestvertex;
+	real_t weight,minweight,d1,d2;
+	Diagonal diagonal,newdiagonal;
+	List<Diagonal> diagonals;
+	TriangulatorPoly triangle;
+	int ret = 1;
+
+	n = poly->GetNumPoints();
+	//fewer than 3 points cannot be triangulated; without this guard the code
+	//below would index dpstates[n-1], which is unallocated for n < 2
+	if(n < 3) return 0;
+
+	//lower-triangular table: dpstates[j][i] describes diagonal (i,j) with i<j
+	dpstates = new DPState *[n];
+	for(i=1;i<n;i++) {
+		dpstates[i] = new DPState[i];
+	}
+
+	//init states and visibility
+	for(i=0;i<(n-1);i++) {
+		p1 = poly->GetPoint(i);
+		for(j=i+1;j<n;j++) {
+			dpstates[j][i].visible = true;
+			dpstates[j][i].weight = 0;
+			dpstates[j][i].bestvertex = -1;
+			//polygon edges (j == i+1) are always visible
+			if(j!=(i+1)) {
+				p2 = poly->GetPoint(j);
+
+				//visibility check
+				//the diagonal must lie inside the cone at vertex i ...
+				if(i==0) p3 = poly->GetPoint(n-1);
+				else p3 = poly->GetPoint(i-1);
+				if(i==(n-1)) p4 = poly->GetPoint(0);
+				else p4 = poly->GetPoint(i+1);
+				if(!InCone(p3,p1,p4,p2)) {
+					dpstates[j][i].visible = false;
+					continue;
+				}
+
+				//... and inside the cone at vertex j ...
+				if(j==0) p3 = poly->GetPoint(n-1);
+				else p3 = poly->GetPoint(j-1);
+				if(j==(n-1)) p4 = poly->GetPoint(0);
+				else p4 = poly->GetPoint(j+1);
+				if(!InCone(p3,p2,p4,p1)) {
+					dpstates[j][i].visible = false;
+					continue;
+				}
+
+				//... and must not cross any polygon edge
+				for(k=0;k<n;k++) {
+					p3 = poly->GetPoint(k);
+					if(k==(n-1)) p4 = poly->GetPoint(0);
+					else p4 = poly->GetPoint(k+1);
+					if(Intersects(p1,p2,p3,p4)) {
+						dpstates[j][i].visible = false;
+						break;
+					}
+				}
+			}
+		}
+	}
+	//(0,n-1) is the closing polygon edge, so force it visible
+	dpstates[n-1][0].visible = true;
+	dpstates[n-1][0].weight = 0;
+	dpstates[n-1][0].bestvertex = -1;
+
+	//solve sub-polygons i..j in order of increasing size
+	for(gap = 2; gap<n; gap++) {
+		for(i=0; i<(n-gap); i++) {
+			j = i+gap;
+			if(!dpstates[j][i].visible) continue;
+			bestvertex = -1;
+			for(k=(i+1);k<j;k++) {
+				if(!dpstates[k][i].visible) continue;
+				if(!dpstates[j][k].visible) continue;
+
+				//polygon edges cost nothing; real diagonals cost their length
+				if(k<=(i+1)) d1=0;
+				else d1 = Distance(poly->GetPoint(i),poly->GetPoint(k));
+				if(j<=(k+1)) d2=0;
+				else d2 = Distance(poly->GetPoint(k),poly->GetPoint(j));
+
+				weight = dpstates[k][i].weight + dpstates[j][k].weight + d1 + d2;
+
+				if((bestvertex == -1)||(weight<minweight)) {
+					bestvertex = k;
+					minweight = weight;
+				}
+			}
+			if(bestvertex == -1) {
+				//a visible diagonal without any valid apex: give up
+				for(i=1;i<n;i++) {
+					delete [] dpstates[i];
+				}
+				delete [] dpstates;
+
+				return 0;
+			}
+
+			dpstates[j][i].bestvertex = bestvertex;
+			dpstates[j][i].weight = minweight;
+		}
+	}
+
+	//recover the triangles by walking the stored best vertices from (0,n-1)
+	newdiagonal.index1 = 0;
+	newdiagonal.index2 = n-1;
+	diagonals.push_back(newdiagonal);
+	while(!diagonals.empty()) {
+		diagonal = (diagonals.front()->get());
+		diagonals.pop_front();
+		bestvertex = dpstates[diagonal.index2][diagonal.index1].bestvertex;
+		if(bestvertex == -1) {
+			ret = 0;
+			break;
+		}
+		triangle.Triangle(poly->GetPoint(diagonal.index1),poly->GetPoint(bestvertex),poly->GetPoint(diagonal.index2));
+		triangles->push_back(triangle);
+		//recurse into each side that still spans more than one edge
+		if(bestvertex > (diagonal.index1+1)) {
+			newdiagonal.index1 = diagonal.index1;
+			newdiagonal.index2 = bestvertex;
+			diagonals.push_back(newdiagonal);
+		}
+		if(diagonal.index2 > (bestvertex+1)) {
+			newdiagonal.index1 = bestvertex;
+			newdiagonal.index2 = diagonal.index2;
+			diagonals.push_back(newdiagonal);
+		}
+	}
+
+	for(i=1;i<n;i++) {
+		delete [] dpstates[i];
+	}
+	delete [] dpstates;
+
+	return ret;
+}
+
+// Offers a candidate solution of weight `w`, described by the pair (i,j),
+// to the state dpstates[a][b]:
+//  - a heavier candidate is ignored;
+//  - a strictly lighter one replaces the stored pairs and weight;
+//  - an equally heavy one is pushed to the front of the pair list, unless
+//    its index1 does not exceed that of the current front, and after
+//    dropping front entries whose index2 is >= j.
+void TriangulatorPartition::UpdateState(long a, long b, long w, long i, long j, DPState2 **dpstates) {
+	const long stored = dpstates[a][b].weight;
+	if(w>stored) return;
+
+	List<Diagonal> *pairList = &(dpstates[a][b].pairs);
+	Diagonal candidate;
+	candidate.index1 = i;
+	candidate.index2 = j;
+
+	if(w<stored) {
+		pairList->clear();
+		pairList->push_front(candidate);
+		dpstates[a][b].weight = w;
+		return;
+	}
+
+	// Equal weight from here on.
+	if((!pairList->empty())&&(i <= pairList->front()->get().index1)) return;
+	while((!pairList->empty())&&(pairList->front()->get().index2 >= j)) {
+		pairList->pop_front();
+	}
+	pairList->push_front(candidate);
+}
+
+// Helper of ConvexPartition_OPT: evaluates splitting the sub-polygon i..k
+// at vertex j, starting from the stored solution of (i,j), and offers the
+// resulting weight to state (i,k) through UpdateState().
+// Does nothing when (i,j) is not visible, or when (j,k) is a real diagonal
+// (k-j > 1) that is not visible.
+void TriangulatorPartition::TypeA(long i, long j, long k, PartitionVertex *vertices, DPState2 **dpstates) {
+	List<Diagonal> *pairs;
+	List<Diagonal>::Element *iter,*lastiter;
+	long top;
+	long w;
+
+	if(!dpstates[i][j].visible) return;
+	top = j;
+	w = dpstates[i][j].weight;
+	if(k-j > 1) {
+		//(j,k) is a real diagonal: add its sub-solution plus one
+		if (!dpstates[j][k].visible) return;
+		w += dpstates[j][k].weight + 1;
+	}
+	if(j-i > 1) {
+		pairs = &(dpstates[i][j].pairs);
+		iter = NULL;
+		lastiter = NULL;
+		//walk the pairs of (i,j) from back to front while the corner at j is not reflex
+		while(iter!=pairs->front()) {
+			if (!iter)
+				iter=pairs->back();
+			else
+				iter=iter->prev();
+
+			if(!IsReflex(vertices[iter->get().index2].p,vertices[j].p,vertices[k].p)) lastiter = iter;
+			else break;
+		}
+		//no usable pair, or the corner at i would be reflex: weight grows by one
+		if(lastiter == NULL) w++;
+		else {
+			if(IsReflex(vertices[k].p,vertices[i].p,vertices[lastiter->get().index1].p)) w++;
+			else top = lastiter->get().index1;
+		}
+	}
+	UpdateState(i,k,w,top,j,dpstates);
+}
+
+// Helper of ConvexPartition_OPT: evaluates splitting the sub-polygon i..k
+// at vertex j, starting from the stored solution of (j,k), and offers the
+// resulting weight to state (i,k) through UpdateState().
+// Does nothing when (j,k) is not visible, or when (i,j) is a real diagonal
+// (j-i > 1) that is not visible.
+void TriangulatorPartition::TypeB(long i, long j, long k, PartitionVertex *vertices, DPState2 **dpstates) {
+	List<Diagonal> *pairs;
+	List<Diagonal>::Element* iter,*lastiter;
+	long top;
+	long w;
+
+	if(!dpstates[j][k].visible) return;
+	top = j;
+	w = dpstates[j][k].weight;
+
+	if (j-i > 1) {
+		//(i,j) is a real diagonal: add its sub-solution plus one
+		if (!dpstates[i][j].visible) return;
+		w += dpstates[i][j].weight + 1;
+	}
+	if (k-j > 1) {
+		pairs = &(dpstates[j][k].pairs);
+
+		iter = pairs->front();
+		if((!pairs->empty())&&(!IsReflex(vertices[i].p,vertices[j].p,vertices[iter->get().index1].p))) {
+			//walk the pairs of (j,k) from the front while the corner at j is not reflex
+			lastiter = iter;
+			while(iter!=NULL) {
+				if(!IsReflex(vertices[i].p,vertices[j].p,vertices[iter->get().index1].p)) {
+					lastiter = iter;
+					iter=iter->next();
+				}
+				else break;
+			}
+			//if the corner at k would be reflex, the weight grows by one
+			if(IsReflex(vertices[lastiter->get().index2].p,vertices[k].p,vertices[i].p)) w++;
+			else top = lastiter->get().index2;
+		} else w++;
+	}
+	UpdateState(i,k,w,j,top,dpstates);
+}
+
+//partitions poly into convex pieces using dynamic programming over the
+//visible diagonals (see TypeA, TypeB and UpdateState) and appends the
+//pieces to `parts`
+//returns 1 on success; returns 0 when poly has fewer than 3 points or when
+//no solution can be recovered from the table
+int TriangulatorPartition::ConvexPartition_OPT(TriangulatorPoly *poly, List<TriangulatorPoly> *parts) {
+	Vector2 p1,p2,p3,p4;
+	PartitionVertex *vertices;
+	DPState2 **dpstates;
+	long i,j,k,n,gap;
+	List<Diagonal> diagonals,diagonals2;
+	Diagonal diagonal,newdiagonal;
+	List<Diagonal> *pairs,*pairs2;
+	List<Diagonal>::Element* iter,*iter2;
+	int ret;
+	TriangulatorPoly newpoly;
+	List<long> indices;
+	List<long>::Element* iiter;
+	bool ijreal,jkreal;
+
+	n = poly->GetNumPoints();
+	//a polygon needs at least 3 points; for n == 0 the code below would
+	//write to vertices[0] and dpstates[0][n-1], both out of bounds
+	//this matches the early exit of Triangulate_EC
+	if(n < 3) return 0;
+
+	vertices = new PartitionVertex[n];
+
+	//full n x n table; only entries [i][j] with i<j are used
+	dpstates = new DPState2 *[n];
+	for(i=0;i<n;i++) {
+		dpstates[i] = new DPState2[n];
+	}
+
+	//init vertex information
+	for(i=0;i<n;i++) {
+		vertices[i].p = poly->GetPoint(i);
+		vertices[i].isActive = true;
+		if(i==0) vertices[i].previous = &(vertices[n-1]);
+		else vertices[i].previous = &(vertices[i-1]);
+		if(i==(poly->GetNumPoints()-1)) vertices[i].next = &(vertices[0]);
+		else vertices[i].next = &(vertices[i+1]);
+	}
+	//vertex 0 is skipped here; its flag is set by convention further down
+	for(i=1;i<n;i++) {
+		UpdateVertexReflexity(&(vertices[i]));
+	}
+
+	//init states and visibility
+	for(i=0;i<(n-1);i++) {
+		p1 = poly->GetPoint(i);
+		for(j=i+1;j<n;j++) {
+			dpstates[i][j].visible = true;
+			if(j==i+1) {
+				dpstates[i][j].weight = 0;
+			} else {
+				//"infinite" weight until a solution is found
+				dpstates[i][j].weight = 2147483647;
+			}
+			if(j!=(i+1)) {
+				p2 = poly->GetPoint(j);
+
+				//visibility check
+				if(!InCone(&vertices[i],p2)) {
+					dpstates[i][j].visible = false;
+					continue;
+				}
+				if(!InCone(&vertices[j],p1)) {
+					dpstates[i][j].visible = false;
+					continue;
+				}
+
+				for(k=0;k<n;k++) {
+					p3 = poly->GetPoint(k);
+					if(k==(n-1)) p4 = poly->GetPoint(0);
+					else p4 = poly->GetPoint(k+1);
+					if(Intersects(p1,p2,p3,p4)) {
+						dpstates[i][j].visible = false;
+						break;
+					}
+				}
+			}
+		}
+	}
+	//base case: a visible diagonal skipping one vertex closes a triangle
+	for(i=0;i<(n-2);i++) {
+		j = i+2;
+		if(dpstates[i][j].visible) {
+			dpstates[i][j].weight = 0;
+			newdiagonal.index1 = i+1;
+			newdiagonal.index2 = i+1;
+			dpstates[i][j].pairs.push_back(newdiagonal);
+		}
+	}
+
+	dpstates[0][n-1].visible = true;
+	vertices[0].isConvex = false; //by convention
+
+	//fill the table in order of increasing sub-polygon size
+	for(gap=3; gap<n; gap++) {
+		for(i=0;i<n-gap;i++) {
+			if(vertices[i].isConvex) continue;
+			k = i+gap;
+			if(dpstates[i][k].visible) {
+				if(!vertices[k].isConvex) {
+					for(j=i+1;j<k;j++) TypeA(i,j,k,vertices,dpstates);
+				} else {
+					for(j=i+1;j<(k-1);j++) {
+						if(vertices[j].isConvex) continue;
+						TypeA(i,j,k,vertices,dpstates);
+					}
+					TypeA(i,k-1,k,vertices,dpstates);
+				}
+			}
+		}
+		for(k=gap;k<n;k++) {
+			if(vertices[k].isConvex) continue;
+			i = k-gap;
+			if((vertices[i].isConvex)&&(dpstates[i][k].visible)) {
+				TypeB(i,i+1,k,vertices,dpstates);
+				for(j=i+2;j<k;j++) {
+					if(vertices[j].isConvex) continue;
+					TypeB(i,j,k,vertices,dpstates);
+				}
+			}
+		}
+	}
+
+
+	//recover solution
+	//first pass: walk the chosen diagonals from (0,n-1) and trim the pair
+	//lists of the sub-states so that they agree with the choice made above them
+	ret = 1;
+	newdiagonal.index1 = 0;
+	newdiagonal.index2 = n-1;
+	diagonals.push_front(newdiagonal);
+	while(!diagonals.empty()) {
+		diagonal = (diagonals.front()->get());
+		diagonals.pop_front();
+		if((diagonal.index2 - diagonal.index1) <=1) continue;
+		pairs = &(dpstates[diagonal.index1][diagonal.index2].pairs);
+		if(pairs->empty()) {
+			ret = 0;
+			break;
+		}
+		if(!vertices[diagonal.index1].isConvex) {
+			iter = pairs->back();
+
+			j = iter->get().index2;
+			newdiagonal.index1 = j;
+			newdiagonal.index2 = diagonal.index2;
+			diagonals.push_front(newdiagonal);
+			if((j - diagonal.index1)>1) {
+				if(iter->get().index1 != iter->get().index2) {
+					pairs2 = &(dpstates[diagonal.index1][j].pairs);
+					while(1) {
+						if(pairs2->empty()) {
+							ret = 0;
+							break;
+						}
+						iter2 = pairs2->back();
+
+						if(iter->get().index1 != iter2->get().index1) pairs2->pop_back();
+						else break;
+					}
+					if(ret == 0) break;
+				}
+				newdiagonal.index1 = diagonal.index1;
+				newdiagonal.index2 = j;
+				diagonals.push_front(newdiagonal);
+			}
+		} else {
+			iter = pairs->front();
+			j = iter->get().index1;
+			newdiagonal.index1 = diagonal.index1;
+			newdiagonal.index2 = j;
+			diagonals.push_front(newdiagonal);
+			if((diagonal.index2 - j) > 1) {
+				if(iter->get().index1 != iter->get().index2) {
+					pairs2 = &(dpstates[j][diagonal.index2].pairs);
+					while(1) {
+						if(pairs2->empty()) {
+							ret = 0;
+							break;
+						}
+						iter2 = pairs2->front();
+						if(iter->get().index2 != iter2->get().index2) pairs2->pop_front();
+						else break;
+					}
+					if(ret == 0) break;
+				}
+				newdiagonal.index1 = j;
+				newdiagonal.index2 = diagonal.index2;
+				diagonals.push_front(newdiagonal);
+			}
+		}
+	}
+
+	if(ret == 0) {
+		for(i=0;i<n;i++) {
+			delete [] dpstates[i];
+		}
+		delete [] dpstates;
+		delete [] vertices;
+
+		return ret;
+	}
+
+	//second pass: build the output polygons
+	//`diagonals` holds real diagonals, each starting a new output polygon;
+	//`diagonals2` holds the virtual ones that stay inside the current polygon
+	newdiagonal.index1 = 0;
+	newdiagonal.index2 = n-1;
+	diagonals.push_front(newdiagonal);
+	while(!diagonals.empty()) {
+		diagonal = (diagonals.front())->get();
+		diagonals.pop_front();
+		if((diagonal.index2 - diagonal.index1) <= 1) continue;
+
+		indices.clear();
+		diagonals2.clear();
+		indices.push_back(diagonal.index1);
+		indices.push_back(diagonal.index2);
+		diagonals2.push_front(diagonal);
+
+		while(!diagonals2.empty()) {
+			diagonal = (diagonals2.front()->get());
+			diagonals2.pop_front();
+			if((diagonal.index2 - diagonal.index1) <= 1) continue;
+			ijreal = true;
+			jkreal = true;
+			pairs = &(dpstates[diagonal.index1][diagonal.index2].pairs);
+			if(!vertices[diagonal.index1].isConvex) {
+				iter = pairs->back();
+				j = iter->get().index2;
+				if(iter->get().index1 != iter->get().index2) ijreal = false;
+			} else {
+				iter = pairs->front();
+				j = iter->get().index1;
+				if(iter->get().index1 != iter->get().index2) jkreal = false;
+			}
+
+			newdiagonal.index1 = diagonal.index1;
+			newdiagonal.index2 = j;
+			if(ijreal) {
+				diagonals.push_back(newdiagonal);
+			} else {
+				diagonals2.push_back(newdiagonal);
+			}
+
+			newdiagonal.index1 = j;
+			newdiagonal.index2 = diagonal.index2;
+			if(jkreal) {
+				diagonals.push_back(newdiagonal);
+			} else {
+				diagonals2.push_back(newdiagonal);
+			}
+
+			indices.push_back(j);
+		}
+
+		//emit the collected vertices in ascending index order
+		indices.sort();
+		newpoly.Init((long)indices.size());
+		k=0;
+		for(iiter = indices.front();iiter;iiter=iiter->next()) {
+			newpoly[k] = vertices[iiter->get()].p;
+			k++;
+		}
+		parts->push_back(newpoly);
+	}
+
+	for(i=0;i<n;i++) {
+		delete [] dpstates[i];
+	}
+	delete [] dpstates;
+	delete [] vertices;
+
+	return ret;
+}
+
+//triangulates a set of polygons by first partitioning them into monotone polygons
+//O(n*log(n)) time complexity, O(n) space complexity
+//the algorithm used here is outlined in the book
+//"Computational Geometry: Algorithms and Applications"
+//by Mark de Berg, Otfried Cheong, Marc van Kreveld and Mark Overmars
+int TriangulatorPartition::MonotonePartition(List<TriangulatorPoly> *inpolys, List<TriangulatorPoly> *monotonePolys) {
+	List<TriangulatorPoly>::Element *iter;
+	MonotoneVertex *vertices;
+	long i,numvertices,vindex,vindex2,newnumvertices,maxnumvertices;
+	long polystartindex, polyendindex;
+	TriangulatorPoly *poly;
+	MonotoneVertex *v,*v2,*vprev,*vnext;
+	ScanLineEdge newedge;
+	bool error = false;
+
+	numvertices = 0;
+	for(iter = inpolys->front(); iter ; iter=iter->next()) {
+		numvertices += iter->get().GetNumPoints();
+	}
+
+	maxnumvertices = numvertices*3;
+	vertices = new MonotoneVertex[maxnumvertices];
+	newnumvertices = numvertices;
+
+	polystartindex = 0;
+	for(iter = inpolys->front(); iter ; iter=iter->next()) {
+		poly = &(iter->get());
+		polyendindex = polystartindex + poly->GetNumPoints()-1;
+		for(i=0;i<poly->GetNumPoints();i++) {
+			vertices[i+polystartindex].p = poly->GetPoint(i);
+			if(i==0) vertices[i+polystartindex].previous = polyendindex;
+			else vertices[i+polystartindex].previous = i+polystartindex-1;
+			if(i==(poly->GetNumPoints()-1)) vertices[i+polystartindex].next = polystartindex;
+			else vertices[i+polystartindex].next = i+polystartindex+1;
+		}
+		polystartindex = polyendindex+1;
+	}
+
+	//construct the priority queue
+	long *priority = new long [numvertices];
+	for(i=0;i<numvertices;i++) priority[i] = i;
+	SortArray<long,VertexSorter> sorter;
+	sorter.compare.vertices=vertices;
+	sorter.sort(priority,numvertices);
+
+	//determine vertex types
+	char *vertextypes = new char[maxnumvertices];
+	for(i=0;i<numvertices;i++) {
+		v = &(vertices[i]);
+		vprev = &(vertices[v->previous]);
+		vnext = &(vertices[v->next]);
+
+		if(Below(vprev->p,v->p)&&Below(vnext->p,v->p)) {
+			if(IsConvex(vnext->p,vprev->p,v->p)) {
+				vertextypes[i] = TRIANGULATOR_VERTEXTYPE_START;
+			} else {
+				vertextypes[i] = TRIANGULATOR_VERTEXTYPE_SPLIT;
+			}
+		} else if(Below(v->p,vprev->p)&&Below(v->p,vnext->p)) {
+			if(IsConvex(vnext->p,vprev->p,v->p))
+			{
+				vertextypes[i] = TRIANGULATOR_VERTEXTYPE_END;
+			} else {
+				vertextypes[i] = TRIANGULATOR_VERTEXTYPE_MERGE;
+			}
+		} else {
+			vertextypes[i] = TRIANGULATOR_VERTEXTYPE_REGULAR;
+		}
+	}
+
+	//helpers
+	long *helpers = new long[maxnumvertices];
+
+	//binary search tree that holds edges intersecting the scanline
+	//note that while set doesn't actually have to be implemented as a tree
+	//complexity requirements for operations are the same as for the balanced binary search tree
+	Set<ScanLineEdge> edgeTree;
+	//store iterators to the edge tree elements
+	//this makes deleting existing edges much faster
+	Set<ScanLineEdge>::Element **edgeTreeIterators,*edgeIter;
+	edgeTreeIterators = new Set<ScanLineEdge>::Element*[maxnumvertices];
+	//Pair<Set<ScanLineEdge>::Element*,bool> edgeTreeRet;
+	for(i = 0; i<numvertices; i++) edgeTreeIterators[i] = NULL;
+
+	//for each vertex
+	for(i=0;i<numvertices;i++) {
+		vindex = priority[i];
+		v = &(vertices[vindex]);
+		vindex2 = vindex;
+		v2 = v;
+
+		//depending on the vertex type, do the appropriate action
+		//comments in the following sections are copied from "Computational Geometry: Algorithms and Applications"
+		switch(vertextypes[vindex]) {
+			case TRIANGULATOR_VERTEXTYPE_START:
+				//Insert ei in T and set helper(ei) to vi.
+				newedge.p1 = v->p;
+				newedge.p2 = vertices[v->next].p;
+				newedge.index = vindex;
+				edgeTreeIterators[vindex] = edgeTree.insert(newedge);
+				helpers[vindex] = vindex;
+				break;
+
+			case TRIANGULATOR_VERTEXTYPE_END:
+				//if helper(ei-1) is a merge vertex
+				if(vertextypes[helpers[v->previous]]==TRIANGULATOR_VERTEXTYPE_MERGE) {
+					//Insert the diagonal connecting vi to helper(ei-1) in D.
+					AddDiagonal(vertices,&newnumvertices,vindex,helpers[v->previous],
+							vertextypes, edgeTreeIterators, &edgeTree, helpers);
+				}
+				//Delete ei-1 from T
+				edgeTree.erase(edgeTreeIterators[v->previous]);
+				break;
+
+			case TRIANGULATOR_VERTEXTYPE_SPLIT:
+				//Search in T to find the edge e j directly left of vi.
+				newedge.p1 = v->p;
+				newedge.p2 = v->p;
+				edgeIter = edgeTree.lower_bound(newedge);
+				if(edgeIter == edgeTree.front()) {
+					error = true;
+					break;
+				}
+				edgeIter=edgeIter->prev();
+				//Insert the diagonal connecting vi to helper(ej) in D.
+				AddDiagonal(vertices,&newnumvertices,vindex,helpers[edgeIter->get().index],
+						vertextypes, edgeTreeIterators, &edgeTree, helpers);
+				vindex2 = newnumvertices-2;
+				v2 = &(vertices[vindex2]);
+				//helper(e j) <- vi
+				helpers[edgeIter->get().index] = vindex;
+				//Insert ei in T and set helper(ei) to vi.
+				newedge.p1 = v2->p;
+				newedge.p2 = vertices[v2->next].p;
+				newedge.index = vindex2;
+
+				edgeTreeIterators[vindex2] = edgeTree.insert(newedge);
+				helpers[vindex2] = vindex2;
+				break;
+
+			case TRIANGULATOR_VERTEXTYPE_MERGE:
+				//if helper(ei-1) is a merge vertex
+				if(vertextypes[helpers[v->previous]]==TRIANGULATOR_VERTEXTYPE_MERGE) {
+					//Insert the diagonal connecting vi to helper(ei-1) in D.
+					AddDiagonal(vertices,&newnumvertices,vindex,helpers[v->previous],
+							vertextypes, edgeTreeIterators, &edgeTree, helpers);
+					vindex2 = newnumvertices-2;
+					v2 = &(vertices[vindex2]);
+				}
+				//Delete ei-1 from T.
+				edgeTree.erase(edgeTreeIterators[v->previous]);
+				//Search in T to find the edge e j directly left of vi.
+				newedge.p1 = v->p;
+				newedge.p2 = v->p;
+				edgeIter = edgeTree.lower_bound(newedge);
+				if(edgeIter == edgeTree.front()) {
+					error = true;
+					break;
+				}
+				edgeIter=edgeIter->prev();
+				//if helper(ej) is a merge vertex
+				if(vertextypes[helpers[edgeIter->get().index]]==TRIANGULATOR_VERTEXTYPE_MERGE) {
+					//Insert the diagonal connecting vi to helper(e j) in D.
+					AddDiagonal(vertices,&newnumvertices,vindex2,helpers[edgeIter->get().index],
+							vertextypes, edgeTreeIterators, &edgeTree, helpers);
+				}
+				//helper(e j) <- vi
+				helpers[edgeIter->get().index] = vindex2;
+				break;
+
+			case TRIANGULATOR_VERTEXTYPE_REGULAR:
+				//if the interior of P lies to the right of vi
+				if(Below(v->p,vertices[v->previous].p)) {
+					//if helper(ei-1) is a merge vertex
+					if(vertextypes[helpers[v->previous]]==TRIANGULATOR_VERTEXTYPE_MERGE) {
+						//Insert the diagonal connecting vi to helper(ei-1) in D.
+						AddDiagonal(vertices,&newnumvertices,vindex,helpers[v->previous],
+								vertextypes, edgeTreeIterators, &edgeTree, helpers);
+						vindex2 = newnumvertices-2;
+						v2 = &(vertices[vindex2]);
+					}
+					//Delete ei-1 from T.
+					edgeTree.erase(edgeTreeIterators[v->previous]);
+					//Insert ei in T and set helper(ei) to vi.
+					newedge.p1 = v2->p;
+					newedge.p2 = vertices[v2->next].p;
+					newedge.index = vindex2;
+					edgeTreeIterators[vindex2] = edgeTree.insert(newedge);
+					helpers[vindex2] = vindex;
+				} else {
+					//Search in T to find the edge ej directly left of vi.
+					newedge.p1 = v->p;
+					newedge.p2 = v->p;
+					edgeIter = edgeTree.lower_bound(newedge);
+					if(edgeIter == edgeTree.front()) {
+						error = true;
+						break;
+					}
+					edgeIter=edgeIter->prev();
+					//if helper(ej) is a merge vertex
+					if(vertextypes[helpers[edgeIter->get().index]]==TRIANGULATOR_VERTEXTYPE_MERGE) {
+						//Insert the diagonal connecting vi to helper(e j) in D.
+						AddDiagonal(vertices,&newnumvertices,vindex,helpers[edgeIter->get().index],
+								vertextypes, edgeTreeIterators, &edgeTree, helpers);
+					}
+					//helper(e j) <- vi
+					helpers[edgeIter->get().index] = vindex;
+				}
+				break;
+		}
+
+		if(error) break;
+	}
+
+	char *used = new char[newnumvertices];
+	memset(used,0,newnumvertices*sizeof(char));
+
+	if(!error) {
+		//return result
+		long size;
+		TriangulatorPoly mpoly;
+		for(i=0;i<newnumvertices;i++) {
+			if(used[i]) continue;
+			v = &(vertices[i]);
+			vnext = &(vertices[v->next]);
+			size = 1;
+			while(vnext!=v) {
+				vnext = &(vertices[vnext->next]);
+				size++;
+			}
+			mpoly.Init(size);
+			v = &(vertices[i]);
+			mpoly[0] = v->p;
+			vnext = &(vertices[v->next]);
+			size = 1;
+			used[i] = 1;
+			used[v->next] = 1;
+			while(vnext!=v) {
+				mpoly[size] = vnext->p;
+				used[vnext->next] = 1;
+				vnext = &(vertices[vnext->next]);
+				size++;
+			}
+			monotonePolys->push_back(mpoly);
+		}
+	}
+
+	//cleanup
+	delete [] vertices;
+	delete [] priority;
+	delete [] vertextypes;
+	delete [] edgeTreeIterators;
+	delete [] helpers;
+	delete [] used;
+
+	if(error) {
+		return 0;
+	} else {
+		return 1;
+	}
+}
+
+//adds a diagonal between index1 and index2 to the doubly-connected list of vertices; a copy of each endpoint is appended, so *numvertices grows by 2
+void TriangulatorPartition::AddDiagonal(MonotoneVertex *vertices, long *numvertices, long index1, long index2,
+					char *vertextypes, Set<ScanLineEdge>::Element **edgeTreeIterators,
+					Set<ScanLineEdge> *edgeTree, long *helpers) //note: edgeTree is not referenced in this body
+{
+	long newindex1,newindex2;
+	//take the next two free slots; no bounds check is done here, the arrays must already have room
+	newindex1 = *numvertices; //slot for the copy of index1
+	(*numvertices)++;
+	newindex2 = *numvertices; //slot for the copy of index2
+	(*numvertices)++;
+	//the copies share the position of the originals
+	vertices[newindex1].p = vertices[index1].p;
+	vertices[newindex2].p = vertices[index2].p;
+	//the copies take over the outgoing links of the originals
+	vertices[newindex2].next = vertices[index2].next;
+	vertices[newindex1].next = vertices[index1].next;
+	//the old successors now point back at the copies
+	vertices[vertices[index2].next].previous = newindex2;
+	vertices[vertices[index1].next].previous = newindex1;
+	//first half of the diagonal: index1 -> copy of index2
+	vertices[index1].next = newindex2;
+	vertices[newindex2].previous = index1;
+	//second half of the diagonal: index2 -> copy of index1
+	vertices[index2].next = newindex1;
+	vertices[newindex1].previous = index2;
+
+	//update all relevant structures
+	vertextypes[newindex1] = vertextypes[index1]; //each copy inherits type, tree element and helper of its original
+	edgeTreeIterators[newindex1] = edgeTreeIterators[index1];
+	helpers[newindex1] = helpers[index1];
+	if(edgeTreeIterators[newindex1] != NULL)
+		edgeTreeIterators[newindex1]->get().index = newindex1; //re-tag the stored edge so it refers to the copy
+	vertextypes[newindex2] = vertextypes[index2];
+	edgeTreeIterators[newindex2] = edgeTreeIterators[index2];
+	helpers[newindex2] = helpers[index2];
+	if(edgeTreeIterators[newindex2] != NULL)
+		edgeTreeIterators[newindex2]->get().index = newindex2; //same re-tagging for the second copy
+}
+
+bool TriangulatorPartition::Below(Vector2 &p1, Vector2 &p2) {
+	//ordered by y first; x only breaks exact ties in y
+	if(p1.y != p2.y) {
+		return p1.y < p2.y;
+	}
+	return p1.x < p2.x;
+}
+
+
+
+
+
+//strict ordering of vertex indices: higher y first; for exactly equal y, higher x first
+bool TriangulatorPartition::VertexSorter::operator() (long index1, long index2) const {
+	const Vector2 &a = vertices[index1].p;
+	const Vector2 &b = vertices[index2].p;
+	//when the y values differ they decide on their own
+	if(a.y != b.y) return a.y > b.y;
+	return a.x > b.x;
+}
+
+bool TriangulatorPartition::ScanLineEdge::IsConvex(const Vector2& p1, const Vector2& p2, const Vector2& p3) const {
+	//sign test on the 2D cross product of (p2-p1) and (p3-p1);
+	//zero (collinear points) and negative values both yield false
+	const real_t cross = (p3.y-p1.y)*(p2.x-p1.x)-(p3.x-p1.x)*(p2.y-p1.y);
+	return cross>0;
+}
+
+bool TriangulatorPartition::ScanLineEdge::operator < (const ScanLineEdge & other) const {
+	//ordering used for the edges stored in the scanline set;
+	//edges whose two endpoints share the same y are handled separately
+	const bool thisFlat = (p1.y == p2.y);
+	const bool otherFlat = (other.p1.y == other.p2.y);
+
+	//both edges are flat: compare their heights directly
+	if(otherFlat && thisFlat) return p1.y < other.p1.y;
+
+	//only the other edge is flat: test its first point against this edge
+	if(otherFlat) return IsConvex(p1,p2,other.p1);
+
+	//this edge is flat, or its first point is lower than the other's:
+	//test this edge's first point against the other edge and negate
+	if(thisFlat || p1.y < other.p1.y) return !IsConvex(other.p1,other.p2,p1);
+
+	//remaining case: test the other edge's first point against this edge
+	return IsConvex(p1,p2,other.p1);
+}
+
+//triangulates a monotone polygon: appends the triangles to 'triangles' and returns 1; returns 0 when inPoly has fewer than 3 points or fails the monotonicity check
+//O(n) time, O(n) space complexity
+int TriangulatorPartition::TriangulateMonotone(TriangulatorPoly *inPoly, List<TriangulatorPoly> *triangles) {
+	long i,i2,j,topindex,bottomindex,leftindex,rightindex,vindex;
+	Vector2 *points;
+	long numpoints;
+	TriangulatorPoly triangle;
+	numpoints = inPoly->GetNumPoints();
+	points = inPoly->GetPoints();
+
+	//trivial cases
+	if(numpoints < 3) return 0;
+	if(numpoints == 3) {
+		triangles->push_back(*inPoly);
+		return 1; //a triangle is its own triangulation; falling through would emit it a second time
+	}
+	//find the topmost and bottommost points in the Below() ordering
+	topindex = 0; bottomindex=0;
+	for(i=1;i<numpoints;i++) {
+		if(Below(points[i],points[bottomindex])) bottomindex = i;
+		if(Below(points[topindex],points[i])) topindex = i;
+	}
+
+	//check if the poly is really monotone
+	i = topindex; //walking forward from top to bottom every step must go down
+	while(i!=bottomindex) {
+		i2 = i+1; if(i2>=numpoints) i2 = 0;
+		if(!Below(points[i2],points[i])) return 0;
+		i = i2;
+	}
+	i = bottomindex; //walking forward from bottom back to top every step must go up
+	while(i!=topindex) {
+		i2 = i+1; if(i2>=numpoints) i2 = 0;
+		if(!Below(points[i],points[i2])) return 0;
+		i = i2;
+	}
+
+	char *vertextypes = new char[numpoints]; //nothing is allocated before this point, so the early returns above do not leak
+	long *priority = new long[numpoints];
+
+	//merge left and right vertex chains
+	priority[0] = topindex;
+	vertextypes[topindex] = 0; //0 = top/bottom point, 1 = chain walked with increasing index, -1 = chain walked with decreasing index
+	leftindex = topindex+1; if(leftindex>=numpoints) leftindex = 0;
+	rightindex = topindex-1; if(rightindex<0) rightindex = numpoints-1;
+	for(i=1;i<(numpoints-1);i++) {
+		if(leftindex==bottomindex) {
+			priority[i] = rightindex; //one chain is exhausted: take from the other
+			rightindex--; if(rightindex<0) rightindex = numpoints-1;
+			vertextypes[priority[i]] = -1;
+		} else if(rightindex==bottomindex) {
+			priority[i] = leftindex;
+			leftindex++;  if(leftindex>=numpoints) leftindex = 0;
+			vertextypes[priority[i]] = 1;
+		} else {
+			if(Below(points[leftindex],points[rightindex])) { //otherwise take whichever chain head is higher
+				priority[i] = rightindex;
+				rightindex--; if(rightindex<0) rightindex = numpoints-1;
+				vertextypes[priority[i]] = -1;
+			} else {
+				priority[i] = leftindex;
+				leftindex++;  if(leftindex>=numpoints) leftindex = 0;
+				vertextypes[priority[i]] = 1;
+			}
+		}
+	}
+	priority[i] = bottomindex; //i == numpoints-1 after the loop
+	vertextypes[bottomindex] = 0;
+
+	long *stack = new long[numpoints];
+	long stackptr = 0;
+
+	stack[0] = priority[0];
+	stack[1] = priority[1];
+	stackptr = 2;
+
+	//for each vertex from top to bottom trim as many triangles as possible
+	for(i=2;i<(numpoints-1);i++) {
+		vindex = priority[i];
+		if(vertextypes[vindex]!=vertextypes[stack[stackptr-1]]) {
+			for(j=0;j<(stackptr-1);j++) { //vertex is on the opposite chain: fan it to every pair of consecutive stack entries
+				if(vertextypes[vindex]==1) {
+					triangle.Triangle(points[stack[j+1]],points[stack[j]],points[vindex]);
+				} else {
+					triangle.Triangle(points[stack[j]],points[stack[j+1]],points[vindex]);
+				}
+				triangles->push_back(triangle);
+			}
+			stack[0] = priority[i-1]; //restart the stack with the previous and the current vertex
+			stack[1] = priority[i];
+			stackptr = 2;
+		} else {
+			stackptr--; //same chain: cut triangles off the stack top while the corner test passes
+			while(stackptr>0) {
+				if(vertextypes[vindex]==1) {
+					if(IsConvex(points[vindex],points[stack[stackptr-1]],points[stack[stackptr]])) {
+						triangle.Triangle(points[vindex],points[stack[stackptr-1]],points[stack[stackptr]]);
+						triangles->push_back(triangle);
+						stackptr--;
+					} else {
+						break;
+					}
+				} else {
+					if(IsConvex(points[vindex],points[stack[stackptr]],points[stack[stackptr-1]])) {
+						triangle.Triangle(points[vindex],points[stack[stackptr]],points[stack[stackptr-1]]);
+						triangles->push_back(triangle);
+						stackptr--;
+					} else {
+						break;
+					}
+				}
+			}
+			stackptr++; //keep the last entry that was not cut off, then push the current vertex
+			stack[stackptr] = vindex;
+			stackptr++;
+		}
+	}
+	vindex = priority[i]; //i == numpoints-1 here, i.e. the bottom point: connect it to everything left on the stack
+	for(j=0;j<(stackptr-1);j++) {
+		if(vertextypes[stack[j+1]]==1) {
+			triangle.Triangle(points[stack[j]],points[stack[j+1]],points[vindex]);
+		} else {
+			triangle.Triangle(points[stack[j+1]],points[stack[j]],points[vindex]);
+		}
+		triangles->push_back(triangle);
+	}
+
+	delete [] priority;
+	delete [] vertextypes;
+	delete [] stack;
+
+	return 1;
+}
+
+int TriangulatorPartition::Triangulate_MONO(List<TriangulatorPoly> *inpolys, List<TriangulatorPoly> *triangles) {
+	//split the input into monotone pieces first; give up if that fails
+	List<TriangulatorPoly> pieces;
+	if(MonotonePartition(inpolys,&pieces) == 0) return 0;
+	//then triangulate piece by piece; triangles already appended stay in the output on failure
+	for(List<TriangulatorPoly>::Element *piece = pieces.front(); piece != NULL; piece = piece->next()) {
+		if(TriangulateMonotone(&(piece->get()),triangles) == 0) return 0;
+	}
+	return 1;
+}
+
+int TriangulatorPartition::Triangulate_MONO(TriangulatorPoly *poly, List<TriangulatorPoly> *triangles) {
+	//wrap a copy of the polygon in a one-element list and reuse the list overload
+	List<TriangulatorPoly> single;
+	single.push_back(*poly);
+	return Triangulate_MONO(&single, triangles);
+}
diff --git a/thirdparty/misc/triangulator.h b/thirdparty/misc/triangulator.h
new file mode 100644
index 0000000000..b6dd7e8236
--- /dev/null
+++ b/thirdparty/misc/triangulator.h
@@ -0,0 +1,306 @@
+//Copyright (C) 2011 by Ivan Fratric
+//
+//Permission is hereby granted, free of charge, to any person obtaining a copy
+//of this software and associated documentation files (the "Software"), to deal
+//in the Software without restriction, including without limitation the rights
+//to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+//copies of the Software, and to permit persons to whom the Software is
+//furnished to do so, subject to the following conditions:
+//
+//The above copyright notice and this permission notice shall be included in
+//all copies or substantial portions of the Software.
+//
+//THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+//OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+//THE SOFTWARE.
+
+#ifndef TRIANGULATOR_H
+#define TRIANGULATOR_H
+
+#include "math_2d.h"
+#include "list.h"
+#include "set.h"
+//2D point structure
+
+
+#define TRIANGULATOR_CCW 1
+#define TRIANGULATOR_CW -1
+//Polygon implemented as an array of points with a 'hole' flag
+class TriangulatorPoly {
+protected:
+	//'points' is the vertex array and 'numpoints' its length;
+	//the inline accessors below hand them out without any bounds checking
+
+	Vector2 *points;
+	long numpoints;
+	bool hole;
+
+public:
+
+	//constructors/destructors
+	TriangulatorPoly();
+	~TriangulatorPoly();
+
+	TriangulatorPoly(const TriangulatorPoly &src);
+	TriangulatorPoly& operator=(const TriangulatorPoly &src);
+
+	//getters and setters
+	long GetNumPoints() {
+		return numpoints;
+	}
+
+	bool IsHole() {
+		return hole;
+	}
+
+	void SetHole(bool hole) {
+		this->hole = hole; //the parameter shadows the member, hence the explicit this->
+	}
+
+	Vector2 &GetPoint(long i) {
+		return points[i]; //reference into the internal array, i is not range-checked
+	}
+
+	Vector2 *GetPoints() {
+		return points; //the internal array itself, not a copy
+	}
+
+	Vector2& operator[] (int i) {
+		return points[i]; //same access as GetPoint, but indexed with an int
+	}
+
+	//clears the polygon points
+	void Clear();
+
+	//inits the polygon with numpoints vertices
+	void Init(long numpoints);
+
+	//creates a triangle with points p1,p2,p3
+	void Triangle(Vector2 &p1, Vector2 &p2, Vector2 &p3);
+
+	//inverts the order of vertices
+	void Invert();
+
+	//returns the orientation of the polygon
+	//possible values:
+	//   TRIANGULATOR_CCW : polygon vertices are in counter-clockwise order
+	//   TRIANGULATOR_CW : polygon vertices are in clockwise order
+	//       0 : the polygon has no (measurable) area
+	int GetOrientation();
+
+	//sets the polygon orientation
+	//orientation can be
+	//   TRIANGULATOR_CCW : sets vertices in counter-clockwise order
+	//   TRIANGULATOR_CW : sets vertices in clockwise order
+	void SetOrientation(int orientation);
+};
+
+class TriangulatorPartition {
+protected:
+	struct PartitionVertex { //vertex of a pointer-linked polygon ring (see UpdateVertex, InCone, TypeA and TypeB below)
+		bool isActive;
+		bool isConvex;
+		bool isEar;
+
+		Vector2 p;
+		real_t angle;
+		PartitionVertex *previous;
+		PartitionVertex *next;
+	};
+
+	struct MonotoneVertex { //vertex of an index-linked polygon ring used by the monotone partition
+		Vector2 p;
+		long previous; //index of the neighbouring vertex in the same vertex array
+		long next; //index of the neighbouring vertex in the same vertex array
+	};
+
+	struct VertexSorter{ //comparator over vertex indices
+		mutable MonotoneVertex *vertices; //array that the compared indices refer to
+		bool operator() (long index1, long index2) const;
+	};
+
+	struct Diagonal { //a pair of indices describing one diagonal
+		long index1;
+		long index2;
+	};
+
+	//dynamic programming state for minimum-weight triangulation
+	struct DPState {
+		bool visible;
+		real_t weight;
+		long bestvertex;
+	};
+
+	//dynamic programming state for convex partitioning
+	struct DPState2 {
+		bool visible;
+		long weight;
+		List<Diagonal> pairs;
+	};
+
+	//edge that intersects the scanline
+	struct ScanLineEdge {
+		mutable long index; //mutable so it can be rewritten while the edge is stored in a Set (done in AddDiagonal)
+		Vector2 p1;
+		Vector2 p2;
+
+		//determines if the edge is to the left of another edge
+		bool operator< (const ScanLineEdge & other) const;
+
+		bool IsConvex(const Vector2& p1, const Vector2& p2, const Vector2& p3) const; //const variant of the class-level IsConvex, callable from operator<
+	};
+
+	//standard helper functions
+	bool IsConvex(Vector2& p1, Vector2& p2, Vector2& p3);
+	bool IsReflex(Vector2& p1, Vector2& p2, Vector2& p3);
+	bool IsInside(Vector2& p1, Vector2& p2, Vector2& p3, Vector2 &p);
+
+	bool InCone(Vector2 &p1, Vector2 &p2, Vector2 &p3, Vector2 &p);
+	bool InCone(PartitionVertex *v, Vector2 &p);
+
+	int Intersects(Vector2 &p11, Vector2 &p12, Vector2 &p21, Vector2 &p22);
+
+	Vector2 Normalize(const Vector2 &p);
+	real_t Distance(const Vector2 &p1, const Vector2 &p2);
+
+	//helper functions for Triangulate_EC
+	void UpdateVertexReflexity(PartitionVertex *v);
+	void UpdateVertex(PartitionVertex *v,PartitionVertex *vertices, long numvertices);
+
+	//helper functions for ConvexPartition_OPT
+	void UpdateState(long a, long b, long w, long i, long j, DPState2 **dpstates);
+	void TypeA(long i, long j, long k, PartitionVertex *vertices, DPState2 **dpstates);
+	void TypeB(long i, long j, long k, PartitionVertex *vertices, DPState2 **dpstates);
+
+	//helper functions for MonotonePartition
+	bool Below(Vector2 &p1, Vector2 &p2); //true if p1 has the lower y, or equal y and the lower x
+	void AddDiagonal(MonotoneVertex *vertices, long *numvertices, long index1, long index2,
+			 char *vertextypes, Set<ScanLineEdge>::Element **edgeTreeIterators,
+			 Set<ScanLineEdge> *edgeTree, long *helpers);
+
+	//triangulates a monotone polygon, used in Triangulate_MONO
+	int TriangulateMonotone(TriangulatorPoly *inPoly, List<TriangulatorPoly> *triangles);
+
+public:
+
+	//simple heuristic procedure for removing holes from a list of polygons
+	//works by creating a diagonal from the rightmost hole vertex to some visible vertex
+	//time complexity: O(h*(n^2)), h is the number of holes, n is the number of vertices
+	//space complexity: O(n)
+	//params:
+	//   inpolys : a list of polygons that can contain holes
+	//             vertices of all non-hole polys have to be in counter-clockwise order
+	//             vertices of all hole polys have to be in clockwise order
+	//   outpolys : a list of polygons without holes
+	//returns 1 on success, 0 on failure
+	int RemoveHoles(List<TriangulatorPoly> *inpolys, List<TriangulatorPoly> *outpolys);
+
+	//triangulates a polygon by ear clipping
+	//time complexity O(n^2), n is the number of vertices
+	//space complexity: O(n)
+	//params:
+	//   poly : an input polygon to be triangulated
+	//          vertices have to be in counter-clockwise order
+	//   triangles : a list of triangles (result)
+	//returns 1 on success, 0 on failure
+	int Triangulate_EC(TriangulatorPoly *poly, List<TriangulatorPoly> *triangles);
+
+	//triangulates a list of polygons that may contain holes by ear clipping algorithm
+	//first calls RemoveHoles to get rid of the holes, and then Triangulate_EC for each resulting polygon
+	//time complexity: O(h*(n^2)), h is the number of holes, n is the number of vertices
+	//space complexity: O(n)
+	//params:
+	//   inpolys : a list of polygons to be triangulated (can contain holes)
+	//             vertices of all non-hole polys have to be in counter-clockwise order
+	//             vertices of all hole polys have to be in clockwise order
+	//   triangles : a list of triangles (result)
+	//returns 1 on success, 0 on failure
+	int Triangulate_EC(List<TriangulatorPoly> *inpolys, List<TriangulatorPoly> *triangles);
+
+	//creates an optimal polygon triangulation in terms of minimal edge length
+	//time complexity: O(n^3), n is the number of vertices
+	//space complexity: O(n^2)
+	//params:
+	//   poly : an input polygon to be triangulated
+	//          vertices have to be in counter-clockwise order
+	//   triangles : a list of triangles (result)
+	//returns 1 on success, 0 on failure
+	int Triangulate_OPT(TriangulatorPoly *poly, List<TriangulatorPoly> *triangles);
+
+	//triangulates a polygon by first partitioning it into monotone polygons
+	//time complexity: O(n*log(n)), n is the number of vertices
+	//space complexity: O(n)
+	//params:
+	//   poly : an input polygon to be triangulated
+	//          vertices have to be in counter-clockwise order
+	//   triangles : a list of triangles (result)
+	//returns 1 on success, 0 on failure
+	int Triangulate_MONO(TriangulatorPoly *poly, List<TriangulatorPoly> *triangles);
+
+	//triangulates a list of polygons by first partitioning them into monotone polygons
+	//time complexity: O(n*log(n)), n is the number of vertices
+	//space complexity: O(n)
+	//params:
+	//   inpolys : a list of polygons to be triangulated (can contain holes)
+	//             vertices of all non-hole polys have to be in counter-clockwise order
+	//             vertices of all hole polys have to be in clockwise order
+	//   triangles : a list of triangles (result)
+	//returns 1 on success, 0 on failure
+	int Triangulate_MONO(List<TriangulatorPoly> *inpolys, List<TriangulatorPoly> *triangles);
+
+	//creates a monotone partition of a list of polygons that can contain holes
+	//time complexity: O(n*log(n)), n is the number of vertices
+	//space complexity: O(n)
+	//params:
+	//   inpolys : a list of polygons to be partitioned (can contain holes)
+	//             vertices of all non-hole polys have to be in counter-clockwise order
+	//             vertices of all hole polys have to be in clockwise order
+	//   monotonePolys : a list of monotone polygons (result)
+	//returns 1 on success, 0 on failure
+	int MonotonePartition(List<TriangulatorPoly> *inpolys, List<TriangulatorPoly> *monotonePolys);
+
+	//partitions a polygon into convex polygons by using Hertel-Mehlhorn algorithm
+	//the algorithm gives at most four times the number of parts as the optimal algorithm
+	//however, in practice it works much better than that and often gives optimal partition
+	//uses triangulation obtained by ear clipping as intermediate result
+	//time complexity O(n^2), n is the number of vertices
+	//space complexity: O(n)
+	//params:
+	//   poly : an input polygon to be partitioned
+	//          vertices have to be in counter-clockwise order
+	//   parts : resulting list of convex polygons
+	//returns 1 on success, 0 on failure
+	int ConvexPartition_HM(TriangulatorPoly *poly, List<TriangulatorPoly> *parts);
+
+	//partitions a list of polygons into convex parts by using Hertel-Mehlhorn algorithm
+	//the algorithm gives at most four times the number of parts as the optimal algorithm
+	//however, in practice it works much better than that and often gives optimal partition
+	//uses triangulation obtained by ear clipping as intermediate result
+	//time complexity O(n^2), n is the number of vertices
+	//space complexity: O(n)
+	//params:
+	//   inpolys : an input list of polygons to be partitioned
+	//             vertices of all non-hole polys have to be in counter-clockwise order
+	//             vertices of all hole polys have to be in clockwise order
+	//   parts : resulting list of convex polygons
+	//returns 1 on success, 0 on failure
+	int ConvexPartition_HM(List<TriangulatorPoly> *inpolys, List<TriangulatorPoly> *parts);
+
+	//optimal convex partitioning (in terms of number of resulting convex polygons)
+	//using the Keil-Snoeyink algorithm
+	//M. Keil, J. Snoeyink, "On the time bound for convex decomposition of simple polygons", 1998
+	//time complexity O(n^3), space complexity O(n^3), n is the number of vertices
+	//params:
+	//   poly : an input polygon to be partitioned
+	//          vertices have to be in counter-clockwise order
+	//   parts : resulting list of convex polygons
+	//returns 1 on success, 0 on failure
+	int ConvexPartition_OPT(TriangulatorPoly *poly, List<TriangulatorPoly> *parts);
+};
+
+
+#endif
diff --git a/thirdparty/misc/yuv2rgb.h b/thirdparty/misc/yuv2rgb.h
new file mode 100644
index 0000000000..a9bef76da8
--- /dev/null
+++ b/thirdparty/misc/yuv2rgb.h
@@ -0,0 +1,1123 @@
+/* Thirdparty code presumably from http://wss.co.uk/pinknoise/yuv2rgb/ */
+/* FIXME: Move to thirdparty dir */
+
+#ifndef YUV2RGB_H
+#define YUV2RGB_H
+
+#include "typedefs.h"
+
+static const uint32_t tables[256*3] = {
+	/* y_table */
+	0x7FFFFFEDU,
+	0x7FFFFFEFU,
+	0x7FFFFFF0U,
+	0x7FFFFFF1U,
+	0x7FFFFFF2U,
+	0x7FFFFFF3U,
+	0x7FFFFFF4U,
+	0x7FFFFFF6U,
+	0x7FFFFFF7U,
+	0x7FFFFFF8U,
+	0x7FFFFFF9U,
+	0x7FFFFFFAU,
+	0x7FFFFFFBU,
+	0x7FFFFFFDU,
+	0x7FFFFFFEU,
+	0x7FFFFFFFU,
+	0x80000000U,
+	0x80400801U,
+	0x80A01002U,
+	0x80E01803U,
+	0x81202805U,
+	0x81803006U,
+	0x81C03807U,
+	0x82004008U,
+	0x82604809U,
+	0x82A0500AU,
+	0x82E0600CU,
+	0x8340680DU,
+	0x8380700EU,
+	0x83C0780FU,
+	0x84208010U,
+	0x84608811U,
+	0x84A09813U,
+	0x8500A014U,
+	0x8540A815U,
+	0x8580B016U,
+	0x85E0B817U,
+	0x8620C018U,
+	0x8660D01AU,
+	0x86C0D81BU,
+	0x8700E01CU,
+	0x8740E81DU,
+	0x87A0F01EU,
+	0x87E0F81FU,
+	0x88210821U,
+	0x88811022U,
+	0x88C11823U,
+	0x89012024U,
+	0x89412825U,
+	0x89A13026U,
+	0x89E14028U,
+	0x8A214829U,
+	0x8A81502AU,
+	0x8AC1582BU,
+	0x8B01602CU,
+	0x8B61682DU,
+	0x8BA1782FU,
+	0x8BE18030U,
+	0x8C418831U,
+	0x8C819032U,
+	0x8CC19833U,
+	0x8D21A034U,
+	0x8D61B036U,
+	0x8DA1B837U,
+	0x8E01C038U,
+	0x8E41C839U,
+	0x8E81D03AU,
+	0x8EE1D83BU,
+	0x8F21E83DU,
+	0x8F61F03EU,
+	0x8FC1F83FU,
+	0x90020040U,
+	0x90420841U,
+	0x90A21042U,
+	0x90E22044U,
+	0x91222845U,
+	0x91823046U,
+	0x91C23847U,
+	0x92024048U,
+	0x92624849U,
+	0x92A2504AU,
+	0x92E2604CU,
+	0x9342684DU,
+	0x9382704EU,
+	0x93C2784FU,
+	0x94228050U,
+	0x94628851U,
+	0x94A29853U,
+	0x9502A054U,
+	0x9542A855U,
+	0x9582B056U,
+	0x95E2B857U,
+	0x9622C058U,
+	0x9662D05AU,
+	0x96C2D85BU,
+	0x9702E05CU,
+	0x9742E85DU,
+	0x97A2F05EU,
+	0x97E2F85FU,
+	0x98230861U,
+	0x98831062U,
+	0x98C31863U,
+	0x99032064U,
+	0x99632865U,
+	0x99A33066U,
+	0x99E34068U,
+	0x9A434869U,
+	0x9A83506AU,
+	0x9AC3586BU,
+	0x9B23606CU,
+	0x9B63686DU,
+	0x9BA3786FU,
+	0x9BE38070U,
+	0x9C438871U,
+	0x9C839072U,
+	0x9CC39873U,
+	0x9D23A074U,
+	0x9D63B076U,
+	0x9DA3B877U,
+	0x9E03C078U,
+	0x9E43C879U,
+	0x9E83D07AU,
+	0x9EE3D87BU,
+	0x9F23E87DU,
+	0x9F63F07EU,
+	0x9FC3F87FU,
+	0xA0040080U,
+	0xA0440881U,
+	0xA0A41082U,
+	0xA0E42084U,
+	0xA1242885U,
+	0xA1843086U,
+	0xA1C43887U,
+	0xA2044088U,
+	0xA2644889U,
+	0xA2A4588BU,
+	0xA2E4608CU,
+	0xA344688DU,
+	0xA384708EU,
+	0xA3C4788FU,
+	0xA4248090U,
+	0xA4649092U,
+	0xA4A49893U,
+	0xA504A094U,
+	0xA544A895U,
+	0xA584B096U,
+	0xA5E4B897U,
+	0xA624C098U,
+	0xA664D09AU,
+	0xA6C4D89BU,
+	0xA704E09CU,
+	0xA744E89DU,
+	0xA7A4F09EU,
+	0xA7E4F89FU,
+	0xA82508A1U,
+	0xA88510A2U,
+	0xA8C518A3U,
+	0xA90520A4U,
+	0xA96528A5U,
+	0xA9A530A6U,
+	0xA9E540A8U,
+	0xAA4548A9U,
+	0xAA8550AAU,
+	0xAAC558ABU,
+	0xAB2560ACU,
+	0xAB6568ADU,
+	0xABA578AFU,
+	0xAC0580B0U,
+	0xAC4588B1U,
+	0xAC8590B2U,
+	0xACE598B3U,
+	0xAD25A0B4U,
+	0xAD65B0B6U,
+	0xADA5B8B7U,
+	0xAE05C0B8U,
+	0xAE45C8B9U,
+	0xAE85D0BAU,
+	0xAEE5D8BBU,
+	0xAF25E8BDU,
+	0xAF65F0BEU,
+	0xAFC5F8BFU,
+	0xB00600C0U,
+	0xB04608C1U,
+	0xB0A610C2U,
+	0xB0E620C4U,
+	0xB12628C5U,
+	0xB18630C6U,
+	0xB1C638C7U,
+	0xB20640C8U,
+	0xB26648C9U,
+	0xB2A658CBU,
+	0xB2E660CCU,
+	0xB34668CDU,
+	0xB38670CEU,
+	0xB3C678CFU,
+	0xB42680D0U,
+	0xB46690D2U,
+	0xB4A698D3U,
+	0xB506A0D4U,
+	0xB546A8D5U,
+	0xB586B0D6U,
+	0xB5E6B8D7U,
+	0xB626C8D9U,
+	0xB666D0DAU,
+	0xB6C6D8DBU,
+	0xB706E0DCU,
+	0xB746E8DDU,
+	0xB7A6F0DEU,
+	0xB7E6F8DFU,
+	0xB82708E1U,
+	0xB88710E2U,
+	0xB8C718E3U,
+	0xB90720E4U,
+	0xB96728E5U,
+	0xB9A730E6U,
+	0xB9E740E8U,
+	0xBA4748E9U,
+	0xBA8750EAU,
+	0xBAC758EBU,
+	0xBB2760ECU,
+	0xBB6768EDU,
+	0xBBA778EFU,
+	0xBC0780F0U,
+	0xBC4788F1U,
+	0xBC8790F2U,
+	0xBCE798F3U,
+	0xBD27A0F4U,
+	0xBD67B0F6U,
+	0xBDC7B8F7U,
+	0xBE07C0F8U,
+	0xBE47C8F9U,
+	0xBEA7D0FAU,
+	0xBEE7D8FBU,
+	0xBF27E8FDU,
+	0xBF87F0FEU,
+	0xBFC7F8FFU,
+	0xC0080100U,
+	0xC0480901U,
+	0xC0A81102U,
+	0xC0E82104U,
+	0xC0E82104U,
+	0xC0E82104U,
+	0xC0E82104U,
+	0xC0E82104U,
+	0xC0E82104U,
+	0xC0E82104U,
+	0xC0E82104U,
+	0xC0E82104U,
+	0xC0E82104U,
+	0xC0E82104U,
+	0xC0E82104U,
+	0xC0E82104U,
+	0xC0E82104U,
+	0xC0E82104U,
+	0xC0E82104U,
+	0xC0E82104U,
+	/* u_table */
+	0x0C400103U,
+	0x0C200105U,
+	0x0C200107U,
+	0x0C000109U,
+	0x0BE0010BU,
+	0x0BC0010DU,
+	0x0BA0010FU,
+	0x0BA00111U,
+	0x0B800113U,
+	0x0B600115U,
+	0x0B400117U,
+	0x0B400119U,
+	0x0B20011BU,
+	0x0B00011DU,
+	0x0AE0011FU,
+	0x0AE00121U,
+	0x0AC00123U,
+	0x0AA00125U,
+	0x0A800127U,
+	0x0A600129U,
+	0x0A60012BU,
+	0x0A40012DU,
+	0x0A20012FU,
+	0x0A000131U,
+	0x0A000132U,
+	0x09E00134U,
+	0x09C00136U,
+	0x09A00138U,
+	0x09A0013AU,
+	0x0980013CU,
+	0x0960013EU,
+	0x09400140U,
+	0x09400142U,
+	0x09200144U,
+	0x09000146U,
+	0x08E00148U,
+	0x08C0014AU,
+	0x08C0014CU,
+	0x08A0014EU,
+	0x08800150U,
+	0x08600152U,
+	0x08600154U,
+	0x08400156U,
+	0x08200158U,
+	0x0800015AU,
+	0x0800015CU,
+	0x07E0015EU,
+	0x07C00160U,
+	0x07A00162U,
+	0x07A00164U,
+	0x07800166U,
+	0x07600168U,
+	0x0740016AU,
+	0x0720016CU,
+	0x0720016EU,
+	0x07000170U,
+	0x06E00172U,
+	0x06C00174U,
+	0x06C00176U,
+	0x06A00178U,
+	0x0680017AU,
+	0x0660017CU,
+	0x0660017EU,
+	0x06400180U,
+	0x06200182U,
+	0x06000184U,
+	0x05E00185U,
+	0x05E00187U,
+	0x05C00189U,
+	0x05A0018BU,
+	0x0580018DU,
+	0x0580018FU,
+	0x05600191U,
+	0x05400193U,
+	0x05200195U,
+	0x05200197U,
+	0x05000199U,
+	0x04E0019BU,
+	0x04C0019DU,
+	0x04C0019FU,
+	0x04A001A1U,
+	0x048001A3U,
+	0x046001A5U,
+	0x044001A7U,
+	0x044001A9U,
+	0x042001ABU,
+	0x040001ADU,
+	0x03E001AFU,
+	0x03E001B1U,
+	0x03C001B3U,
+	0x03A001B5U,
+	0x038001B7U,
+	0x038001B9U,
+	0x036001BBU,
+	0x034001BDU,
+	0x032001BFU,
+	0x032001C1U,
+	0x030001C3U,
+	0x02E001C5U,
+	0x02C001C7U,
+	0x02A001C9U,
+	0x02A001CBU,
+	0x028001CDU,
+	0x026001CFU,
+	0x024001D1U,
+	0x024001D3U,
+	0x022001D5U,
+	0x020001D7U,
+	0x01E001D8U,
+	0x01E001DAU,
+	0x01C001DCU,
+	0x01A001DEU,
+	0x018001E0U,
+	0x016001E2U,
+	0x016001E4U,
+	0x014001E6U,
+	0x012001E8U,
+	0x010001EAU,
+	0x010001ECU,
+	0x00E001EEU,
+	0x00C001F0U,
+	0x00A001F2U,
+	0x00A001F4U,
+	0x008001F6U,
+	0x006001F8U,
+	0x004001FAU,
+	0x004001FCU,
+	0x002001FEU,
+	0x00000200U,
+	0xFFE00202U,
+	0xFFC00204U,
+	0xFFC00206U,
+	0xFFA00208U,
+	0xFF80020AU,
+	0xFF60020CU,
+	0xFF60020EU,
+	0xFF400210U,
+	0xFF200212U,
+	0xFF000214U,
+	0xFF000216U,
+	0xFEE00218U,
+	0xFEC0021AU,
+	0xFEA0021CU,
+	0xFEA0021EU,
+	0xFE800220U,
+	0xFE600222U,
+	0xFE400224U,
+	0xFE200226U,
+	0xFE200228U,
+	0xFE000229U,
+	0xFDE0022BU,
+	0xFDC0022DU,
+	0xFDC0022FU,
+	0xFDA00231U,
+	0xFD800233U,
+	0xFD600235U,
+	0xFD600237U,
+	0xFD400239U,
+	0xFD20023BU,
+	0xFD00023DU,
+	0xFCE0023FU,
+	0xFCE00241U,
+	0xFCC00243U,
+	0xFCA00245U,
+	0xFC800247U,
+	0xFC800249U,
+	0xFC60024BU,
+	0xFC40024DU,
+	0xFC20024FU,
+	0xFC200251U,
+	0xFC000253U,
+	0xFBE00255U,
+	0xFBC00257U,
+	0xFBC00259U,
+	0xFBA0025BU,
+	0xFB80025DU,
+	0xFB60025FU,
+	0xFB400261U,
+	0xFB400263U,
+	0xFB200265U,
+	0xFB000267U,
+	0xFAE00269U,
+	0xFAE0026BU,
+	0xFAC0026DU,
+	0xFAA0026FU,
+	0xFA800271U,
+	0xFA800273U,
+	0xFA600275U,
+	0xFA400277U,
+	0xFA200279U,
+	0xFA20027BU,
+	0xFA00027CU,
+	0xF9E0027EU,
+	0xF9C00280U,
+	0xF9A00282U,
+	0xF9A00284U,
+	0xF9800286U,
+	0xF9600288U,
+	0xF940028AU,
+	0xF940028CU,
+	0xF920028EU,
+	0xF9000290U,
+	0xF8E00292U,
+	0xF8E00294U,
+	0xF8C00296U,
+	0xF8A00298U,
+	0xF880029AU,
+	0xF860029CU,
+	0xF860029EU,
+	0xF84002A0U,
+	0xF82002A2U,
+	0xF80002A4U,
+	0xF80002A6U,
+	0xF7E002A8U,
+	0xF7C002AAU,
+	0xF7A002ACU,
+	0xF7A002AEU,
+	0xF78002B0U,
+	0xF76002B2U,
+	0xF74002B4U,
+	0xF74002B6U,
+	0xF72002B8U,
+	0xF70002BAU,
+	0xF6E002BCU,
+	0xF6C002BEU,
+	0xF6C002C0U,
+	0xF6A002C2U,
+	0xF68002C4U,
+	0xF66002C6U,
+	0xF66002C8U,
+	0xF64002CAU,
+	0xF62002CCU,
+	0xF60002CEU,
+	0xF60002CFU,
+	0xF5E002D1U,
+	0xF5C002D3U,
+	0xF5A002D5U,
+	0xF5A002D7U,
+	0xF58002D9U,
+	0xF56002DBU,
+	0xF54002DDU,
+	0xF52002DFU,
+	0xF52002E1U,
+	0xF50002E3U,
+	0xF4E002E5U,
+	0xF4C002E7U,
+	0xF4C002E9U,
+	0xF4A002EBU,
+	0xF48002EDU,
+	0xF46002EFU,
+	0xF46002F1U,
+	0xF44002F3U,
+	0xF42002F5U,
+	0xF40002F7U,
+	0xF3E002F9U,
+	0xF3E002FBU,
+	/* v_table */
+	0x1A09A000U,
+	0x19E9A800U,
+	0x19A9B800U,
+	0x1969C800U,
+	0x1949D000U,
+	0x1909E000U,
+	0x18C9E800U,
+	0x18A9F800U,
+	0x186A0000U,
+	0x182A1000U,
+	0x180A2000U,
+	0x17CA2800U,
+	0x17AA3800U,
+	0x176A4000U,
+	0x172A5000U,
+	0x170A6000U,
+	0x16CA6800U,
+	0x168A7800U,
+	0x166A8000U,
+	0x162A9000U,
+	0x160AA000U,
+	0x15CAA800U,
+	0x158AB800U,
+	0x156AC000U,
+	0x152AD000U,
+	0x14EAE000U,
+	0x14CAE800U,
+	0x148AF800U,
+	0x146B0000U,
+	0x142B1000U,
+	0x13EB2000U,
+	0x13CB2800U,
+	0x138B3800U,
+	0x134B4000U,
+	0x132B5000U,
+	0x12EB6000U,
+	0x12CB6800U,
+	0x128B7800U,
+	0x124B8000U,
+	0x122B9000U,
+	0x11EBA000U,
+	0x11ABA800U,
+	0x118BB800U,
+	0x114BC000U,
+	0x112BD000U,
+	0x10EBE000U,
+	0x10ABE800U,
+	0x108BF800U,
+	0x104C0000U,
+	0x100C1000U,
+	0x0FEC2000U,
+	0x0FAC2800U,
+	0x0F8C3800U,
+	0x0F4C4000U,
+	0x0F0C5000U,
+	0x0EEC5800U,
+	0x0EAC6800U,
+	0x0E6C7800U,
+	0x0E4C8000U,
+	0x0E0C9000U,
+	0x0DEC9800U,
+	0x0DACA800U,
+	0x0D6CB800U,
+	0x0D4CC000U,
+	0x0D0CD000U,
+	0x0CCCD800U,
+	0x0CACE800U,
+	0x0C6CF800U,
+	0x0C4D0000U,
+	0x0C0D1000U,
+	0x0BCD1800U,
+	0x0BAD2800U,
+	0x0B6D3800U,
+	0x0B2D4000U,
+	0x0B0D5000U,
+	0x0ACD5800U,
+	0x0AAD6800U,
+	0x0A6D7800U,
+	0x0A2D8000U,
+	0x0A0D9000U,
+	0x09CD9800U,
+	0x098DA800U,
+	0x096DB800U,
+	0x092DC000U,
+	0x090DD000U,
+	0x08CDD800U,
+	0x088DE800U,
+	0x086DF800U,
+	0x082E0000U,
+	0x07EE1000U,
+	0x07CE1800U,
+	0x078E2800U,
+	0x076E3800U,
+	0x072E4000U,
+	0x06EE5000U,
+	0x06CE5800U,
+	0x068E6800U,
+	0x064E7800U,
+	0x062E8000U,
+	0x05EE9000U,
+	0x05CE9800U,
+	0x058EA800U,
+	0x054EB800U,
+	0x052EC000U,
+	0x04EED000U,
+	0x04AED800U,
+	0x048EE800U,
+	0x044EF000U,
+	0x042F0000U,
+	0x03EF1000U,
+	0x03AF1800U,
+	0x038F2800U,
+	0x034F3000U,
+	0x030F4000U,
+	0x02EF5000U,
+	0x02AF5800U,
+	0x028F6800U,
+	0x024F7000U,
+	0x020F8000U,
+	0x01EF9000U,
+	0x01AF9800U,
+	0x016FA800U,
+	0x014FB000U,
+	0x010FC000U,
+	0x00EFD000U,
+	0x00AFD800U,
+	0x006FE800U,
+	0x004FF000U,
+	0x00100000U,
+	0xFFD01000U,
+	0xFFB01800U,
+	0xFF702800U,
+	0xFF303000U,
+	0xFF104000U,
+	0xFED05000U,
+	0xFEB05800U,
+	0xFE706800U,
+	0xFE307000U,
+	0xFE108000U,
+	0xFDD09000U,
+	0xFD909800U,
+	0xFD70A800U,
+	0xFD30B000U,
+	0xFD10C000U,
+	0xFCD0D000U,
+	0xFC90D800U,
+	0xFC70E800U,
+	0xFC30F000U,
+	0xFBF10000U,
+	0xFBD11000U,
+	0xFB911800U,
+	0xFB712800U,
+	0xFB313000U,
+	0xFAF14000U,
+	0xFAD14800U,
+	0xFA915800U,
+	0xFA516800U,
+	0xFA317000U,
+	0xF9F18000U,
+	0xF9D18800U,
+	0xF9919800U,
+	0xF951A800U,
+	0xF931B000U,
+	0xF8F1C000U,
+	0xF8B1C800U,
+	0xF891D800U,
+	0xF851E800U,
+	0xF831F000U,
+	0xF7F20000U,
+	0xF7B20800U,
+	0xF7921800U,
+	0xF7522800U,
+	0xF7123000U,
+	0xF6F24000U,
+	0xF6B24800U,
+	0xF6925800U,
+	0xF6526800U,
+	0xF6127000U,
+	0xF5F28000U,
+	0xF5B28800U,
+	0xF5729800U,
+	0xF552A800U,
+	0xF512B000U,
+	0xF4F2C000U,
+	0xF4B2C800U,
+	0xF472D800U,
+	0xF452E800U,
+	0xF412F000U,
+	0xF3D30000U,
+	0xF3B30800U,
+	0xF3731800U,
+	0xF3532800U,
+	0xF3133000U,
+	0xF2D34000U,
+	0xF2B34800U,
+	0xF2735800U,
+	0xF2336800U,
+	0xF2137000U,
+	0xF1D38000U,
+	0xF1B38800U,
+	0xF1739800U,
+	0xF133A800U,
+	0xF113B000U,
+	0xF0D3C000U,
+	0xF093C800U,
+	0xF073D800U,
+	0xF033E000U,
+	0xF013F000U,
+	0xEFD40000U,
+	0xEF940800U,
+	0xEF741800U,
+	0xEF342000U,
+	0xEEF43000U,
+	0xEED44000U,
+	0xEE944800U,
+	0xEE745800U,
+	0xEE346000U,
+	0xEDF47000U,
+	0xEDD48000U,
+	0xED948800U,
+	0xED549800U,
+	0xED34A000U,
+	0xECF4B000U,
+	0xECD4C000U,
+	0xEC94C800U,
+	0xEC54D800U,
+	0xEC34E000U,
+	0xEBF4F000U,
+	0xEBB50000U,
+	0xEB950800U,
+	0xEB551800U,
+	0xEB352000U,
+	0xEAF53000U,
+	0xEAB54000U,
+	0xEA954800U,
+	0xEA555800U,
+	0xEA156000U,
+	0xE9F57000U,
+	0xE9B58000U,
+	0xE9958800U,
+	0xE9559800U,
+	0xE915A000U,
+	0xE8F5B000U,
+	0xE8B5C000U,
+	0xE875C800U,
+	0xE855D800U,
+	0xE815E000U,
+	0xE7F5F000U,
+	0xE7B60000U,
+	0xE7760800U,
+	0xE7561800U,
+	0xE7162000U,
+	0xE6D63000U,
+	0xE6B64000U,
+	0xE6764800U,
+	0xE6365800U
+};
+
+/*
+ * Helper macros for yuv422_2_rgb8888().
+ *
+ * A value built from tables[] is handled as three 8-bit fields packed into
+ * one 32-bit word at bits 0-7, 11-18 and 22-29 (the positions STORE()
+ * extracts). FLAGS marks the bit directly above each field: bits 8, 19
+ * and 30.
+ */
+#define FLAGS 0x40080100
+/* Sum of the U entry (tables[256 + U]) and the V entry (tables[512 + V]). */
+#define READUV(U,V) (tables[256 + (U)] + tables[512 + (V)])
+/* Y entry, indexed from the start of tables. */
+#define READY(Y)    tables[Y]
+/*
+ * Range-correct the packed value Y in place; Y must be a modifiable lvalue.
+ * Nothing happens unless at least one FLAGS bit is set in Y. Otherwise:
+ *   1. every field whose flag bit is set is forced to 0xFF;
+ *   2. 1 is added at the lowest bit of every field whose bit above the
+ *      flag (bits 9, 20, 31) is clear, which turns a field that step 1
+ *      set to 0xFF into 0x00.
+ * A flagged field therefore ends up 0xFF when the bit above its flag is
+ * set and 0x00 when it is clear.
+ * NOTE(review): step 2 is not restricted to flagged fields; presumably the
+ * tables are biased so that an unflagged field always has that bit set -
+ * confirm against the table generator.
+ */
+#define FIXUP(Y)                 \
+do {                             \
+    int tmp = (Y) & FLAGS;       \
+    if (tmp != 0)                \
+    {                            \
+	tmp  -= tmp>>8;          \
+	(Y)  |= tmp;             \
+	tmp   = FLAGS & ~(Y>>1); \
+	(Y)  += tmp>>8;          \
+    }                            \
+} while (0 == 1)
+
+/*
+ * Write one pixel as four consecutive elements through DSTPTR and advance
+ * DSTPTR by four: Y, Y>>22, Y>>11, then the constant 255. With the byte
+ * pointer used by the converter each store keeps only the low 8 bits, i.e.
+ * bits 0-7, 22-29 and 11-18 of Y. DSTPTR must be a modifiable pointer
+ * lvalue; Y is evaluated three times.
+ */
+#define STORE(Y,DSTPTR)         \
+do {                            \
+    *(DSTPTR)++ = (Y);          \
+    *(DSTPTR)++ = (Y)>>22;      \
+    *(DSTPTR)++ = (Y)>>11;      \
+    *(DSTPTR)++ = 255;            \
+} while (0 == 1)
+
+/*
+ * Convert a planar YUV 4:2:2 image to 4-byte pixels.
+ *
+ * Each horizontal pair of Y samples shares one U and one V sample, and
+ * every row has its own U/V samples. For each pixel the table entries for
+ * Y, U and V are summed, corrected by FIXUP() and written by STORE() as
+ * four bytes, the last of which is always 255.
+ *
+ * dst_ptr  - first output byte; 4*width bytes are written per row
+ * y_ptr    - Y plane, one byte per pixel
+ * u_ptr    - U plane, one byte per pixel pair
+ * v_ptr    - V plane, one byte per pixel pair
+ * width    - pixels per row; when odd, the last pixel of a row uses the
+ *            U/V sample that follows the last full pair
+ * height   - number of rows; every one of them is converted, and no pixel
+ *            is read or written when height <= 0
+ * y_span   - bytes from the start of one Y row to the next
+ * uv_span  - bytes from the start of one U (or V) row to the next
+ * dst_span - bytes from the start of one output row to the next
+ * dither   - not used
+ *
+ * Rows and columns are counted with plain loop variables rather than by
+ * packing the column count into the upper 16 bits of `height`: that
+ * packing left-shifts signed values (undefined behaviour once the operand
+ * is negative or the result overflows) and restricts both dimensions to
+ * 16 bits.
+ */
+static void yuv422_2_rgb8888(uint8_t  *dst_ptr,
+		const uint8_t  *y_ptr,
+		const uint8_t  *u_ptr,
+		const uint8_t  *v_ptr,
+		      int32_t   width,
+		      int32_t   height,
+		      int32_t   y_span,
+		      int32_t   uv_span,
+		      int32_t   dst_span,
+		      int32_t   dither)
+{
+    int32_t row, col;
+
+    for (row = 0; row < height; row++)
+    {
+	for (col = 0; col + 1 < width; col += 2)
+	{
+	    /* Two neighbouring pixels sharing one U/V sample */
+	    uint32_t uv, y0, y1;
+
+	    uv  = READUV(*u_ptr++,*v_ptr++);
+	    y0  = uv + READY(*y_ptr++);
+	    y1  = uv + READY(*y_ptr++);
+	    FIXUP(y0);
+	    FIXUP(y1);
+	    STORE(y0, dst_ptr);
+	    STORE(y1, dst_ptr);
+	}
+	if (col < width)
+	{
+	    /* Odd width: last pixel of the row. The U/V pointers are not
+	     * advanced because the row step below accounts for width>>1
+	     * samples only. */
+	    uint32_t uv, y0;
+
+	    uv = READUV(*u_ptr,*v_ptr);
+	    y0 = uv + READY(*y_ptr++);
+	    FIXUP(y0);
+	    STORE(y0, dst_ptr);
+	}
+	/* Move every pointer from the end of this row to the start of the next */
+	dst_ptr += dst_span-width*4;
+	y_ptr   += y_span-width;
+	u_ptr   += uv_span-(width>>1);
+	v_ptr   += uv_span-(width>>1);
+    }
+}
+
+
+/* Drop the helper macros defined above so they can be redefined below. */
+#undef FLAGS
+#undef READUV
+#undef READY
+#undef FIXUP
+#undef STORE
+
+
+/*
+ * Helper macros for yuv420_2_rgb8888().
+ *
+ * A value built from tables[] is handled as three 8-bit fields packed into
+ * one 32-bit word at bits 0-7, 11-18 and 22-29 (the positions STORE()
+ * extracts). FLAGS marks the bit directly above each field: bits 8, 19
+ * and 30.
+ */
+#define FLAGS 0x40080100
+/* Sum of the U entry (tables[256 + U]) and the V entry (tables[512 + V]). */
+#define READUV(U,V) (tables[256 + (U)] + tables[512 + (V)])
+/* Y entry, indexed from the start of tables. */
+#define READY(Y)    tables[Y]
+/*
+ * Range-correct the packed value Y in place; Y must be a modifiable lvalue.
+ * Nothing happens unless at least one FLAGS bit is set in Y. Otherwise:
+ *   1. every field whose flag bit is set is forced to 0xFF;
+ *   2. 1 is added at the lowest bit of every field whose bit above the
+ *      flag (bits 9, 20, 31) is clear, which turns a field that step 1
+ *      set to 0xFF into 0x00.
+ * A flagged field therefore ends up 0xFF when the bit above its flag is
+ * set and 0x00 when it is clear.
+ * NOTE(review): step 2 is not restricted to flagged fields; presumably the
+ * tables are biased so that an unflagged field always has that bit set -
+ * confirm against the table generator.
+ */
+#define FIXUP(Y)                 \
+do {                             \
+    int tmp = (Y) & FLAGS;       \
+    if (tmp != 0)                \
+    {                            \
+	tmp  -= tmp>>8;          \
+	(Y)  |= tmp;             \
+	tmp   = FLAGS & ~(Y>>1); \
+	(Y)  += tmp>>8;          \
+    }                            \
+} while (0 == 1)
+
+/*
+ * Build one pixel word from Y and assign it to DSTPTR, which despite its
+ * name must be an lvalue (e.g. *p++ or p[i]), not a pointer:
+ *   bits 24-31: 0xFF
+ *   bits 16-23: bits 11-18 of Y
+ *   bits  8-15: bits 22-29 of Y
+ *   bits  0-7 : bits 0-7 of Y
+ * Stored to memory on a little-endian target, this yields the byte order
+ * Y[0-7], Y[22-29], Y[11-18], 0xFF; on a big-endian target the bytes come
+ * out reversed.
+ */
+#define STORE(Y,DSTPTR)     \
+do {                        \
+    (DSTPTR) = 0xFF000000 | (Y & 0xFF) | (0xFF00 & (Y>>14)) | (0xFF0000 & (Y<<5));\
+} while (0 == 1)
+
+/*
+ * Convert a planar YUV 4:2:0 image to 32-bit pixels.
+ *
+ * Each 2x2 block of Y samples shares one U and one V sample. Rows are
+ * converted two at a time so each U/V pair is looked up once per block.
+ * An odd final column and an odd final row are handled separately, each
+ * using the next U/V sample in the plane.
+ *
+ * dst_ptr_ - output; written through a uint32_t pointer, one word per
+ *            pixel in the layout produced by STORE(), so it must be
+ *            suitably aligned for uint32_t
+ * y_ptr    - Y plane, one byte per pixel
+ * u_ptr    - U plane, one byte per 2x2 block
+ * v_ptr    - V plane, one byte per 2x2 block
+ * width    - pixels per row
+ * height   - number of rows; no pixel is read or written when height <= 0
+ * y_span   - bytes from the start of one Y row to the next
+ * uv_span  - bytes from the start of one U (or V) row to the next
+ * dst_span - bytes from the start of one output row to the next; used in
+ *            units of 4 bytes
+ * dither   - not used
+ *
+ * Rows and columns are counted with plain loop variables rather than by
+ * packing the column count into the upper 16 bits of `height`: that
+ * packing left-shifts signed values (undefined behaviour once the operand
+ * is negative or the result overflows) and restricts both dimensions to
+ * 16 bits.
+ */
+static void yuv420_2_rgb8888(uint8_t  *dst_ptr_,
+		const uint8_t  *y_ptr,
+		const uint8_t  *u_ptr,
+		const uint8_t  *v_ptr,
+		      int32_t   width,
+		      int32_t   height,
+		      int32_t   y_span,
+		      int32_t   uv_span,
+		      int32_t   dst_span,
+		      int32_t   dither)
+{
+    uint32_t *dst_ptr = (uint32_t *)(void *)dst_ptr_;
+    int32_t row, col;
+
+    /* From here on dst_span counts uint32_t pixels, not bytes */
+    dst_span >>= 2;
+
+    for (row = 0; row + 1 < height; row += 2)
+    {
+	for (col = 0; col + 1 < width; col += 2)
+	{
+	    /* One 2x2 block: y0 is the current row, y1 the row below */
+	    uint32_t uv, y0, y1;
+
+	    uv  = READUV(*u_ptr++,*v_ptr++);
+	    y1  = uv + READY(y_ptr[y_span]);
+	    y0  = uv + READY(*y_ptr++);
+	    FIXUP(y1);
+	    FIXUP(y0);
+	    STORE(y1, dst_ptr[dst_span]);
+	    STORE(y0, *dst_ptr++);
+	    y1  = uv + READY(y_ptr[y_span]);
+	    y0  = uv + READY(*y_ptr++);
+	    FIXUP(y1);
+	    FIXUP(y0);
+	    STORE(y1, dst_ptr[dst_span]);
+	    STORE(y0, *dst_ptr++);
+	}
+	if (col < width)
+	{
+	    /* Odd width: last column of both rows. As in the 2x2 loop, y1
+	     * (sampled from the row below) goes to the row below and y0 to
+	     * the current row. The U/V pointers are not advanced because the
+	     * row step below accounts for width>>1 samples only. */
+	    uint32_t uv, y0, y1;
+
+	    uv = READUV(*u_ptr,*v_ptr);
+	    y1 = uv + READY(y_ptr[y_span]);
+	    y0 = uv + READY(*y_ptr++);
+	    FIXUP(y1);
+	    FIXUP(y0);
+	    STORE(y1, dst_ptr[dst_span]);
+	    STORE(y0, *dst_ptr++);
+	}
+	/* Skip the row just written below and move to the next row pair;
+	 * the U/V planes advance by a single row */
+	dst_ptr += dst_span*2-width;
+	y_ptr   += y_span*2-width;
+	u_ptr   += uv_span-(width>>1);
+	v_ptr   += uv_span-(width>>1);
+    }
+    if (row < height)
+    {
+	/* Odd height: the final row has no partner row below it */
+	for (col = 0; col + 1 < width; col += 2)
+	{
+	    uint32_t uv, y0, y1;
+
+	    uv  = READUV(*u_ptr++,*v_ptr++);
+	    y1  = uv + READY(*y_ptr++);
+	    y0  = uv + READY(*y_ptr++);
+	    FIXUP(y1);
+	    FIXUP(y0);
+	    STORE(y1, *dst_ptr++);
+	    STORE(y0, *dst_ptr++);
+	}
+	if (col < width)
+	{
+	    /* Odd width: last pixel of the final row */
+	    uint32_t uv, y0;
+
+	    uv = READUV(*u_ptr++,*v_ptr++);
+	    y0 = uv + READY(*y_ptr++);
+	    FIXUP(y0);
+	    STORE(y0, *dst_ptr++);
+	}
+    }
+}
+
+
+
+/* Drop the helper macros defined above so they can be redefined below. */
+#undef FLAGS
+#undef READUV
+#undef READY
+#undef FIXUP
+#undef STORE
+
+/*
+ * Helper macros for yuv444_2_rgb8888().
+ *
+ * A value built from tables[] is handled as three 8-bit fields packed into
+ * one 32-bit word at bits 0-7, 11-18 and 22-29 (the positions STORE()
+ * extracts). FLAGS marks the bit directly above each field: bits 8, 19
+ * and 30.
+ */
+#define FLAGS 0x40080100
+/* Sum of the U entry (tables[256 + U]) and the V entry (tables[512 + V]). */
+#define READUV(U,V) (tables[256 + (U)] + tables[512 + (V)])
+/* Y entry, indexed from the start of tables. */
+#define READY(Y)    tables[Y]
+/*
+ * Range-correct the packed value Y in place; Y must be a modifiable lvalue.
+ * Nothing happens unless at least one FLAGS bit is set in Y. Otherwise:
+ *   1. every field whose flag bit is set is forced to 0xFF;
+ *   2. 1 is added at the lowest bit of every field whose bit above the
+ *      flag (bits 9, 20, 31) is clear, which turns a field that step 1
+ *      set to 0xFF into 0x00.
+ * A flagged field therefore ends up 0xFF when the bit above its flag is
+ * set and 0x00 when it is clear.
+ * NOTE(review): step 2 is not restricted to flagged fields; presumably the
+ * tables are biased so that an unflagged field always has that bit set -
+ * confirm against the table generator.
+ */
+#define FIXUP(Y)                 \
+do {                             \
+    int tmp = (Y) & FLAGS;       \
+    if (tmp != 0)                \
+    {                            \
+	tmp  -= tmp>>8;          \
+	(Y)  |= tmp;             \
+	tmp   = FLAGS & ~(Y>>1); \
+	(Y)  += tmp>>8;          \
+    }                            \
+} while (0 == 1)
+
+/*
+ * Write one pixel as four consecutive elements through DSTPTR and advance
+ * DSTPTR by four: Y, Y>>22, Y>>11, then the constant 255. With the byte
+ * pointer used by the converter each store keeps only the low 8 bits, i.e.
+ * bits 0-7, 22-29 and 11-18 of Y. DSTPTR must be a modifiable pointer
+ * lvalue; Y is evaluated three times.
+ */
+#define STORE(Y,DSTPTR)         \
+do {                            \
+    *(DSTPTR)++ = (Y);          \
+    *(DSTPTR)++ = (Y)>>22;      \
+    *(DSTPTR)++ = (Y)>>11;      \
+	*(DSTPTR)++ = 255;           \
+} while (0 == 1)
+
+/*
+ * Convert a planar YUV 4:4:4 image to 4-byte pixels.
+ *
+ * Every pixel has its own Y, U and V sample. For each pixel the table
+ * entries for Y, U and V are summed, corrected by FIXUP() and written by
+ * STORE() as four bytes, the last of which is always 255.
+ *
+ * dst_ptr  - first output byte; 4*width bytes are written per row
+ * y_ptr    - Y plane, one byte per pixel
+ * u_ptr    - U plane, one byte per pixel
+ * v_ptr    - V plane, one byte per pixel
+ * width    - pixels per row
+ * height   - number of rows; every one of them is converted, and no pixel
+ *            is read or written when height <= 0
+ * y_span   - bytes from the start of one Y row to the next
+ * uv_span  - bytes from the start of one U (or V) row to the next
+ * dst_span - bytes from the start of one output row to the next
+ * dither   - not used
+ *
+ * Rows and columns are counted with plain loop variables rather than by
+ * packing the column count into the upper 16 bits of `height`: that
+ * packing left-shifts signed values (undefined behaviour once the operand
+ * is negative or the result overflows) and restricts both dimensions to
+ * 16 bits.
+ */
+static void yuv444_2_rgb8888(uint8_t  *dst_ptr,
+		const uint8_t  *y_ptr,
+		const uint8_t  *u_ptr,
+		const uint8_t  *v_ptr,
+		      int32_t   width,
+		      int32_t   height,
+		      int32_t   y_span,
+		      int32_t   uv_span,
+		      int32_t   dst_span,
+		      int32_t   dither)
+{
+    int32_t row, col;
+
+    for (row = 0; row < height; row++)
+    {
+	for (col = 0; col < width; col++)
+	{
+	    /* No chroma sharing: one U/V lookup per pixel */
+	    uint32_t uv, y0;
+
+	    uv = READUV(*u_ptr++,*v_ptr++);
+	    y0 = uv + READY(*y_ptr++);
+	    FIXUP(y0);
+	    STORE(y0, dst_ptr);
+	}
+	/* Move every pointer from the end of this row to the start of the next */
+	dst_ptr += dst_span-width*4;
+	y_ptr   += y_span-width;
+	u_ptr   += uv_span-width;
+	v_ptr   += uv_span-width;
+    }
+}
+#endif // YUV2RGB_H