From 83b630b8c27fc3307eba36fa2b6193690bd18e4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= Date: Mon, 11 May 2020 14:36:46 +0200 Subject: thirdparty: Cleanup after #38386, document provenance and copyright Also renamed `delaunay.h` to `delaunay_2d.h` to match the class name. --- thirdparty/README.md | 19 + thirdparty/misc/r128.c | 2 + thirdparty/misc/r128.h | 2123 +++++++++++++++++++++++++++++ thirdparty/misc/stb_rect_pack.h | 628 +++++++++ thirdparty/oidn/.gitignore | 1 - thirdparty/oidn/LICENSE.txt | 202 +++ thirdparty/r128/r128.h | 2124 ------------------------------ thirdparty/stb_rect_pack/stb_rect_pack.h | 629 --------- 8 files changed, 2974 insertions(+), 2754 deletions(-) create mode 100644 thirdparty/misc/r128.c create mode 100644 thirdparty/misc/r128.h create mode 100644 thirdparty/misc/stb_rect_pack.h delete mode 100644 thirdparty/oidn/.gitignore create mode 100644 thirdparty/oidn/LICENSE.txt delete mode 100644 thirdparty/r128/r128.h delete mode 100644 thirdparty/stb_rect_pack/stb_rect_pack.h (limited to 'thirdparty') diff --git a/thirdparty/README.md b/thirdparty/README.md index e51e7d7f24..c000133fe7 100644 --- a/thirdparty/README.md +++ b/thirdparty/README.md @@ -384,11 +384,19 @@ Collection of single-file libraries used in Godot components. 
* Upstream: http://www.pcg-random.org * Version: minimal C implementation, http://www.pcg-random.org/download.html * License: Apache 2.0 +- `r128.h` + * Upstream: https://github.com/fahickman/r128 + * Version: 1.4.3 (2019) + * License: Public Domain - `smaz.{c,h}` * Upstream: https://github.com/antirez/smaz * Version: git (150e125cbae2e8fd20dd332432776ce13395d4d4, 2009) * License: BSD-3-Clause * Modifications: use `const char*` instead of `char*` for input string +- `stb_rect_pack.h` + * Upstream: https://github.com/nothings/stb + * Version: 1.00 (2019) + * License: Public Domain (Unlicense) or MIT - `triangulator.{cpp,h}` * Upstream: https://github.com/ivanfratric/polypartition (`src/polypartition.cpp`) * Version: TBD, class was renamed @@ -437,6 +445,17 @@ Files extracted from the upstream source: - LICENSE.txt +## oidn + +- Upstream: https://github.com/OpenImageDenoise/oidn +- Version: TBD +- License: Apache 2.0 + +Files extracted from upstream source: + +- TBD + + ## opus - Upstream: https://opus-codec.org diff --git a/thirdparty/misc/r128.c b/thirdparty/misc/r128.c new file mode 100644 index 0000000000..6b981aa693 --- /dev/null +++ b/thirdparty/misc/r128.c @@ -0,0 +1,2 @@ +#define R128_IMPLEMENTATION +#include "r128.h" diff --git a/thirdparty/misc/r128.h b/thirdparty/misc/r128.h new file mode 100644 index 0000000000..be7cd3024d --- /dev/null +++ b/thirdparty/misc/r128.h @@ -0,0 +1,2123 @@ +/* +r128.h: 128-bit (64.64) signed fixed-point arithmetic. Version 1.4.3 + +COMPILATION +----------- +Drop this header file somewhere in your project and include it wherever it is +needed. There is no separate .c file for this library. To get the code, in ONE +file in your project, put: + +#define R128_IMPLEMENTATION + +before you include this file. You may also provide a definition for R128_ASSERT +to force the library to use a custom assert macro. + +COMPILER/LIBRARY SUPPORT +------------------------ +This library requires a C89 compiler with support for 64-bit integers. 
If your +compiler does not support the long long data type, the R128_U64, etc. macros +must be set appropriately. On x86 and x64 targets, Intel intrinsics are used +for speed. If your compiler does not support these intrinsics, you can add +#define R128_STDC_ONLY +in your implementation file before including r128.h. + +The only C runtime library functionality used by this library is . +This can be avoided by defining an R128_ASSERT macro in your implementation +file. Since this library uses 64-bit arithmetic, this may implicitly add a +runtime library dependency on 32-bit platforms. + +C++ SUPPORT +----------- +Operator overloads are supplied for C++ files that include this file. Since all +C++ functions are declared inline (or static inline), the R128_IMPLEMENTATION +file can be either C++ or C. + +LICENSE +------- +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +#ifndef H_R128_H +#define H_R128_H + +#include + +// 64-bit integer support +// If your compiler does not have stdint.h, add appropriate defines for these macros. +#if defined(_MSC_VER) && (_MSC_VER < 1600) +# define R128_S32 __int32 +# define R128_U32 unsigned __int32 +# define R128_S64 __int64 +# define R128_U64 unsigned __int64 +# define R128_LIT_S64(x) x##i64 +# define R128_LIT_U64(x) x##ui64 +#else +# include +# define R128_S32 int32_t +# define R128_U32 uint32_t +# define R128_S64 int64_t +# define R128_U64 uint64_t +# define R128_LIT_S64(x) x##ll +# define R128_LIT_U64(x) x##ull +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct R128 { + R128_U64 lo; + R128_U64 hi; + +#ifdef __cplusplus + R128(); + R128(double); + R128(int); + R128(R128_S64); + R128(R128_U64 low, R128_U64 high); + + operator double() const; + operator R128_S64() const; + operator int() const; + operator bool() const; + + bool operator!() const; + R128 operator~() const; + R128 operator-() const; + R128 &operator|=(const R128 &rhs); + R128 &operator&=(const R128 &rhs); + R128 &operator^=(const R128 &rhs); + R128 &operator+=(const R128 &rhs); + R128 &operator-=(const R128 &rhs); + R128 &operator*=(const R128 &rhs); + R128 &operator/=(const R128 &rhs); + R128 &operator%=(const R128 &rhs); + R128 &operator<<=(int amount); + R128 &operator>>=(int amount); +#endif //__cplusplus +} R128; + +// Type conversion +extern void r128FromInt(R128 *dst, R128_S64 v); +extern void r128FromFloat(R128 *dst, double v); +extern R128_S64 r128ToInt(const R128 *v); +extern double r128ToFloat(const R128 *v); + +// Copy +extern void r128Copy(R128 *dst, const R128 *src); + +// Negate +extern void r128Neg(R128 *dst, const R128 *src); + +// Bitwise operations +extern void r128Not(R128 *dst, const R128 *src); // ~a +extern void r128Or(R128 *dst, const R128 *a, const R128 *b); // a | b +extern void r128And(R128 *dst, const R128 *a, const R128 *b); // a & b +extern void r128Xor(R128 *dst, const 
R128 *a, const R128 *b); // a ^ b +extern void r128Shl(R128 *dst, const R128 *src, int amount); // shift left by amount mod 128 +extern void r128Shr(R128 *dst, const R128 *src, int amount); // shift right logical by amount mod 128 +extern void r128Sar(R128 *dst, const R128 *src, int amount); // shift right arithmetic by amount mod 128 + +// Arithmetic +extern void r128Add(R128 *dst, const R128 *a, const R128 *b); // a + b +extern void r128Sub(R128 *dst, const R128 *a, const R128 *b); // a - b +extern void r128Mul(R128 *dst, const R128 *a, const R128 *b); // a * b +extern void r128Div(R128 *dst, const R128 *a, const R128 *b); // a / b +extern void r128Mod(R128 *dst, const R128 *a, const R128 *b); // a - toInt(a / b) * b + +extern void r128Sqrt(R128 *dst, const R128 *v); // sqrt(v) +extern void r128Rsqrt(R128 *dst, const R128 *v); // 1 / sqrt(v) + +// Comparison +extern int r128Cmp(const R128 *a, const R128 *b); // sign of a-b +extern void r128Min(R128 *dst, const R128 *a, const R128 *b); +extern void r128Max(R128 *dst, const R128 *a, const R128 *b); +extern void r128Floor(R128 *dst, const R128 *v); +extern void r128Ceil(R128 *dst, const R128 *v); +extern int r128IsNeg(const R128 *v); // quick check for < 0 + +// String conversion +// +typedef enum R128ToStringSign { + R128ToStringSign_Default, // no sign character for positive values + R128ToStringSign_Space, // leading space for positive values + R128ToStringSign_Plus, // leading '+' for positive values +} R128ToStringSign; + +// Formatting options for use with r128ToStringOpt. The "defaults" correspond +// to a format string of "%f". +// +typedef struct R128ToStringFormat { + // sign character for positive values. Default is R128ToStringSign_Default. + R128ToStringSign sign; + + // minimum number of characters to write. Default is 0. + int width; + + // place to the right of the decimal at which rounding is performed. If negative, + // a maximum of 20 decimal places will be written, with no trailing zeroes. 
+ // (20 places is sufficient to ensure that r128FromString will convert back to the + // original value.) Default is -1. NOTE: This is not the same default that the C + // standard library uses for %f. + int precision; + + // If non-zero, pads the output string with leading zeroes if the final result is + // fewer than width characters. Otherwise, leading spaces are used. Default is 0. + int zeroPad; + + // Always print a decimal point, even if the value is an integer. Default is 0. + int decimal; + + // Left-align output if width specifier requires padding. + // Default is 0 (right align). + int leftAlign; +} R128ToStringFormat; + +// r128ToStringOpt: convert R128 to a decimal string, with formatting. +// +// dst and dstSize: specify the buffer to write into. At most dstSize bytes will be written +// (including null terminator). No additional rounding is performed if dstSize is not large +// enough to hold the entire string. +// +// opt: an R128ToStringFormat struct (q.v.) with formatting options. +// +// Uses the R128_decimal global as the decimal point character. +// Always writes a null terminator, even if the destination buffer is not large enough. +// +// Number of bytes that will be written (i.e. how big does dst need to be?): +// If width is specified: width + 1 bytes. +// If precision is specified: at most precision + 22 bytes. +// If neither is specified: at most 42 bytes. +// +// Returns the number of bytes that would have been written if dst was sufficiently large, +// not including the final null terminator. +// +extern int r128ToStringOpt(char *dst, size_t dstSize, const R128 *v, const R128ToStringFormat *opt); + +// r128ToStringf: convert R128 to a decimal string, with formatting. +// +// dst and dstSize: specify the buffer to write into. At most dstSize bytes will be written +// (including null terminator). +// +// format: a printf-style format specifier, as one would use with floating point types. +// e.g. "%+5.2f". 
(The leading % and trailing f are optional.) +// NOTE: This is NOT a full replacement for sprintf. Any characters in the format string +// that do not correspond to a format placeholder are ignored. +// +// Uses the R128_decimal global as the decimal point character. +// Always writes a null terminator, even if the destination buffer is not large enough. +// +// Number of bytes that will be written (i.e. how big does dst need to be?): +// If the precision field is specified: at most max(width, precision + 21) + 1 bytes +// Otherwise: at most max(width, 41) + 1 bytes. +// +// Returns the number of bytes that would have been written if dst was sufficiently large, +// not including the final null terminator. +// +extern int r128ToStringf(char *dst, size_t dstSize, const char *format, const R128 *v); + +// r128ToString: convert R128 to a decimal string, with default formatting. +// Equivalent to r128ToStringf(dst, dstSize, "%f", v). +// +// Uses the R128_decimal global as the decimal point character. +// Always writes a null terminator, even if the destination buffer is not large enough. +// +// Will write at most 42 bytes (including NUL) to dst. +// +// Returns the number of bytes that would have been written if dst was sufficiently large, +// not including the final null terminator. +// +extern int r128ToString(char *dst, size_t dstSize, const R128 *v); + +// r128FromString: Convert string to R128. +// +// The string can be formatted either as a decimal number with optional sign +// or as hexadecimal with a prefix of 0x or 0X. +// +// endptr, if not NULL, is set to the character following the last character +// used in the conversion. 
+// +extern void r128FromString(R128 *dst, const char *s, char **endptr); + +// Constants +extern const R128 R128_min; // minimum (most negative) value +extern const R128 R128_max; // maximum (most positive) value +extern const R128 R128_smallest; // smallest positive value +extern const R128 R128_zero; // zero +extern const R128 R128_one; // 1.0 + +extern char R128_decimal; // decimal point character used by r128From/ToString. defaults to '.' + +#ifdef __cplusplus +} + +#include +namespace std { +template<> +struct numeric_limits +{ + static const bool is_specialized = true; + + static R128 min() throw() { return R128_min; } + static R128 max() throw() { return R128_max; } + + static const int digits = 127; + static const int digits10 = 38; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const int radix = 2; + static R128 epsilon() throw() { return R128_smallest; } + static R128 round_error() throw() { return R128_one; } + + static const int min_exponent = 0; + static const int min_exponent10 = 0; + static const int max_exponent = 0; + static const int max_exponent10 = 0; + + static const bool has_infinity = false; + static const bool has_quiet_NaN = false; + static const bool has_signaling_NaN = false; + static const float_denorm_style has_denorm = denorm_absent; + static const bool has_denorm_loss = false; + + static R128 infinity() throw() { return R128_zero; } + static R128 quiet_NaN() throw() { return R128_zero; } + static R128 signaling_NaN() throw() { return R128_zero; } + static R128 denorm_min() throw() { return R128_zero; } + + static const bool is_iec559 = false; + static const bool is_bounded = true; + static const bool is_modulo = true; + + static const bool traps = numeric_limits::traps; + static const bool tinyness_before = false; + static const float_round_style round_style = round_toward_zero; +}; +} //namespace std + +inline R128::R128() {} + +inline R128::R128(double v) 
+{ + r128FromFloat(this, v); +} + +inline R128::R128(int v) +{ + r128FromInt(this, v); +} + +inline R128::R128(R128_S64 v) +{ + r128FromInt(this, v); +} + +inline R128::R128(R128_U64 low, R128_U64 high) +{ + lo = low; + hi = high; +} + +inline R128::operator double() const +{ + return r128ToFloat(this); +} + +inline R128::operator R128_S64() const +{ + return r128ToInt(this); +} + +inline R128::operator int() const +{ + return (int) r128ToInt(this); +} + +inline R128::operator bool() const +{ + return lo || hi; +} + +inline bool R128::operator!() const +{ + return !lo && !hi; +} + +inline R128 R128::operator~() const +{ + R128 r; + r128Not(&r, this); + return r; +} + +inline R128 R128::operator-() const +{ + R128 r; + r128Neg(&r, this); + return r; +} + +inline R128 &R128::operator|=(const R128 &rhs) +{ + r128Or(this, this, &rhs); + return *this; +} + +inline R128 &R128::operator&=(const R128 &rhs) +{ + r128And(this, this, &rhs); + return *this; +} + +inline R128 &R128::operator^=(const R128 &rhs) +{ + r128Xor(this, this, &rhs); + return *this; +} + +inline R128 &R128::operator+=(const R128 &rhs) +{ + r128Add(this, this, &rhs); + return *this; +} + +inline R128 &R128::operator-=(const R128 &rhs) +{ + r128Sub(this, this, &rhs); + return *this; +} + +inline R128 &R128::operator*=(const R128 &rhs) +{ + r128Mul(this, this, &rhs); + return *this; +} + +inline R128 &R128::operator/=(const R128 &rhs) +{ + r128Div(this, this, &rhs); + return *this; +} + +inline R128 &R128::operator%=(const R128 &rhs) +{ + r128Mod(this, this, &rhs); + return *this; +} + +inline R128 &R128::operator<<=(int amount) +{ + r128Shl(this, this, amount); + return *this; +} + +inline R128 &R128::operator>>=(int amount) +{ + r128Sar(this, this, amount); + return *this; +} + +static inline R128 operator|(const R128 &lhs, const R128 &rhs) +{ + R128 r(lhs); + return r |= rhs; +} + +static inline R128 operator&(const R128 &lhs, const R128 &rhs) +{ + R128 r(lhs); + return r &= rhs; +} + +static inline 
R128 operator^(const R128 &lhs, const R128 &rhs) +{ + R128 r(lhs); + return r ^= rhs; +} + +static inline R128 operator+(const R128 &lhs, const R128 &rhs) +{ + R128 r(lhs); + return r += rhs; +} + +static inline R128 operator-(const R128 &lhs, const R128 &rhs) +{ + R128 r(lhs); + return r -= rhs; +} + +static inline R128 operator*(const R128 &lhs, const R128 &rhs) +{ + R128 r(lhs); + return r *= rhs; +} + +static inline R128 operator/(const R128 &lhs, const R128 &rhs) +{ + R128 r(lhs); + return r /= rhs; +} + +static inline R128 operator%(const R128 &lhs, const R128 &rhs) +{ + R128 r(lhs); + return r %= rhs; +} + +static inline R128 operator<<(const R128 &lhs, int amount) +{ + R128 r(lhs); + return r <<= amount; +} + +static inline R128 operator>>(const R128 &lhs, int amount) +{ + R128 r(lhs); + return r >>= amount; +} + +static inline bool operator<(const R128 &lhs, const R128 &rhs) +{ + return r128Cmp(&lhs, &rhs) < 0; +} + +static inline bool operator>(const R128 &lhs, const R128 &rhs) +{ + return r128Cmp(&lhs, &rhs) > 0; +} + +static inline bool operator<=(const R128 &lhs, const R128 &rhs) +{ + return r128Cmp(&lhs, &rhs) <= 0; +} + +static inline bool operator>=(const R128 &lhs, const R128 &rhs) +{ + return r128Cmp(&lhs, &rhs) >= 0; +} + +static inline bool operator==(const R128 &lhs, const R128 &rhs) +{ + return lhs.lo == rhs.lo && lhs.hi == rhs.hi; +} + +static inline bool operator!=(const R128 &lhs, const R128 &rhs) +{ + return lhs.lo != rhs.lo || lhs.hi != rhs.hi; +} + +#endif //__cplusplus +#endif //H_R128_H + +#ifdef R128_IMPLEMENTATION + +#ifdef R128_DEBUG_VIS +# define R128_DEBUG_SET(x) r128ToString(R128_last, sizeof(R128_last), x) +#else +# define R128_DEBUG_SET(x) +#endif + +#define R128_SET2(x, l, h) do { (x)->lo = (R128_U64)(l); (x)->hi = (R128_U64)(h); } while(0) +#define R128_R0(x) ((R128_U32)(x)->lo) +#define R128_R2(x) ((R128_U32)(x)->hi) +#if defined(_M_IX86) +// workaround: MSVC x86's handling of 64-bit values is not great +# define 
R128_SET4(x, r0, r1, r2, r3) do { \ + ((R128_U32*)&(x)->lo)[0] = (R128_U32)(r0); \ + ((R128_U32*)&(x)->lo)[1] = (R128_U32)(r1); \ + ((R128_U32*)&(x)->hi)[0] = (R128_U32)(r2); \ + ((R128_U32*)&(x)->hi)[1] = (R128_U32)(r3); \ + } while(0) +# define R128_R1(x) (((R128_U32*)&(x)->lo)[1]) +# define R128_R3(x) (((R128_U32*)&(x)->hi)[1]) +#else +# define R128_SET4(x, r0, r1, r2, r3) do { (x)->lo = (R128_U64)(r0) | ((R128_U64)(r1) << 32); \ + (x)->hi = (R128_U64)(r2) | ((R128_U64)(r3) << 32); } while(0) +# define R128_R1(x) ((R128_U32)((x)->lo >> 32)) +# define R128_R3(x) ((R128_U32)((x)->hi >> 32)) +#endif + +#if defined(_M_X64) +# define R128_INTEL 1 +# define R128_64BIT 1 +# ifndef R128_STDC_ONLY +# include +# endif +#elif defined(__x86_64__) +# define R128_INTEL 1 +# define R128_64BIT 1 +# ifndef R128_STDC_ONLY +# include +# endif +#elif defined(_M_IX86) +# define R128_INTEL 1 +# ifndef R128_STDC_ONLY +# include +# endif +#elif defined(__i386__) +# define R128_INTEL 1 +# ifndef R128_STDC_ONLY +# include +# endif +#elif defined(_M_ARM) +# ifndef R128_STDC_ONLY +# include +# endif +#elif defined(_M_ARM64) +# define R128_64BIT 1 +# ifndef R128_STDC_ONLY +# include +# endif +#elif defined(__aarch64__) +# define R128_64BIT 1 +#endif + +#ifndef R128_INTEL +# define R128_INTEL 0 +#endif + +#ifndef R128_64BIT +# define R128_64BIT 0 +#endif + +#ifndef R128_ASSERT +# include +# define R128_ASSERT(x) assert(x) +#endif + +#include // for NULL + +static const R128ToStringFormat R128__defaultFormat = { + R128ToStringSign_Default, + 0, + -1, + 0, + 0, + 0 +}; + +const R128 R128_min = { 0, R128_LIT_U64(0x8000000000000000) }; +const R128 R128_max = { R128_LIT_U64(0xffffffffffffffff), R128_LIT_U64(0x7fffffffffffffff) }; +const R128 R128_smallest = { 1, 0 }; +const R128 R128_zero = { 0, 0 }; +const R128 R128_one = { 0, 1 }; +char R128_decimal = '.'; +#ifdef R128_DEBUG_VIS +char R128_last[42]; +#endif + +static int r128__clz64(R128_U64 x) +{ +#if defined(R128_STDC_ONLY) + R128_U64 n = 64, 
y; + y = x >> 32; if (y) { n -= 32; x = y; } + y = x >> 16; if (y) { n -= 16; x = y; } + y = x >> 8; if (y) { n -= 8; x = y; } + y = x >> 4; if (y) { n -= 4; x = y; } + y = x >> 2; if (y) { n -= 2; x = y; } + y = x >> 1; if (y) { n -= 1; x = y; } + return (int)(n - x); +#elif defined(_M_X64) || defined(_M_ARM64) + unsigned long idx; + if (_BitScanReverse64(&idx, x)) { + return 63 - (int)idx; + } else { + return 64; + } +#elif defined(_MSC_VER) + unsigned long idx; + if (_BitScanReverse(&idx, (R128_U32)(x >> 32))) { + return 31 - (int)idx; + } else if (_BitScanReverse(&idx, (R128_U32)x)) { + return 63 - (int)idx; + } else { + return 64; + } +#else + return x ? __builtin_clzll(x) : 64; +#endif +} + +#if !R128_64BIT +// 32*32->64 +static R128_U64 r128__umul64(R128_U32 a, R128_U32 b) +{ +# if defined(_M_IX86) && !defined(R128_STDC_ONLY) + return __emulu(a, b); +# elif defined(_M_ARM) && !defined(R128_STDC_ONLY) + return _arm_umull(a, b); +# else + return a * (R128_U64)b; +# endif +} + +// 64/32->32 +static R128_U32 r128__udiv64(R128_U32 nlo, R128_U32 nhi, R128_U32 d, R128_U32 *rem) +{ +# if defined(_M_IX86) && (_MSC_VER >= 1920) && !defined(R128_STDC_ONLY) + unsigned __int64 n = ((unsigned __int64)nhi << 32) | nlo; + return _udiv64(n, d, rem); +# elif defined(_M_IX86) && !defined(R128_STDC_ONLY) + __asm { + mov eax, nlo + mov edx, nhi + div d + mov ecx, rem + mov dword ptr [ecx], edx + } +# elif defined(__i386__) && !defined(R128_STDC_ONLY) + R128_U32 q, r; + __asm("divl %4" + : "=a"(q), "=d"(r) + : "a"(nlo), "d"(nhi), "X"(d)); + *rem = r; + return q; +# else + R128_U64 n64 = ((R128_U64)nhi << 32) | nlo; + *rem = (R128_U32)(n64 % d); + return (R128_U32)(n64 / d); +# endif +} +#elif !defined(_M_X64) || defined(R128_STDC_ONLY) +#define r128__umul64(a, b) ((a) * (R128_U64)(b)) +static R128_U32 r128__udiv64(R128_U32 nlo, R128_U32 nhi, R128_U32 d, R128_U32 *rem) +{ + R128_U64 n64 = ((R128_U64)nhi << 32) | nlo; + *rem = (R128_U32)(n64 % d); + return (R128_U32)(n64 / d); +} 
+#endif //!R128_64BIT + +static void r128__neg(R128 *dst, const R128 *src) +{ + R128_ASSERT(dst != NULL); + R128_ASSERT(src != NULL); + +#if R128_INTEL && !defined(R128_STDC_ONLY) + { + unsigned char carry = 0; +# if R128_64BIT + carry = _addcarry_u64(carry, ~src->lo, 1, &dst->lo); + carry = _addcarry_u64(carry, ~src->hi, 0, &dst->hi); +# else + R128_U32 r0, r1, r2, r3; + carry = _addcarry_u32(carry, ~R128_R0(src), 1, &r0); + carry = _addcarry_u32(carry, ~R128_R1(src), 0, &r1); + carry = _addcarry_u32(carry, ~R128_R2(src), 0, &r2); + carry = _addcarry_u32(carry, ~R128_R3(src), 0, &r3); + R128_SET4(dst, r0, r1, r2, r3); +# endif //R128_64BIT + } +#else + if (src->lo) { + dst->lo = ~src->lo + 1; + dst->hi = ~src->hi; + } else { + dst->lo = 0; + dst->hi = ~src->hi + 1; + } +#endif //R128_INTEL +} + +// 64*64->128 +static void r128__umul128(R128 *dst, R128_U64 a, R128_U64 b) +{ +#if defined(_M_X64) && !defined(R128_STDC_ONLY) + dst->lo = _umul128(a, b, &dst->hi); +#elif R128_64BIT && !defined(_MSC_VER) && !defined(R128_STDC_ONLY) + unsigned __int128 p0 = a * (unsigned __int128)b; + dst->hi = (R128_U64)(p0 >> 64); + dst->lo = (R128_U64)p0; +#else + R128_U32 alo = (R128_U32)a; + R128_U32 ahi = (R128_U32)(a >> 32); + R128_U32 blo = (R128_U32)b; + R128_U32 bhi = (R128_U32)(b >> 32); + R128_U64 p0, p1, p2, p3; + + p0 = r128__umul64(alo, blo); + p1 = r128__umul64(alo, bhi); + p2 = r128__umul64(ahi, blo); + p3 = r128__umul64(ahi, bhi); + + { +#if R128_INTEL && !defined(R128_STDC_ONLY) + R128_U32 r0, r1, r2, r3; + unsigned char carry; + + r0 = (R128_U32)(p0); + r1 = (R128_U32)(p0 >> 32); + r2 = (R128_U32)(p1 >> 32); + r3 = (R128_U32)(p3 >> 32); + + carry = _addcarry_u32(0, r1, (R128_U32)p1, &r1); + carry = _addcarry_u32(carry, r2, (R128_U32)(p2 >> 32), &r2); + _addcarry_u32(carry, r3, 0, &r3); + carry = _addcarry_u32(0, r1, (R128_U32)p2, &r1); + carry = _addcarry_u32(carry, r2, (R128_U32)p3, &r2); + _addcarry_u32(carry, r3, 0, &r3); + + R128_SET4(dst, r0, r1, r2, r3); +#else + 
R128_U64 carry, lo, hi; + carry = ((R128_U64)(R128_U32)p1 + (R128_U64)(R128_U32)p2 + (p0 >> 32)) >> 32; + + lo = p0 + ((p1 + p2) << 32); + hi = p3 + ((R128_U32)(p1 >> 32) + (R128_U32)(p2 >> 32)) + carry; + + R128_SET2(dst, lo, hi); +#endif + } +#endif +} + +// 128/64->64 +#if defined(_M_X64) && (_MSC_VER < 1920) && !defined(R128_STDC_ONLY) +// MSVC x64 provides neither inline assembly nor (pre-2019) a div intrinsic, so we do fake +// "inline assembly" to avoid long division or outline assembly. +#pragma code_seg(".text") +__declspec(allocate(".text")) static const unsigned char r128__udiv128Code[] = { + 0x48, 0x8B, 0xC1, //mov rax, rcx + 0x49, 0xF7, 0xF0, //div rax, r8 + 0x49, 0x89, 0x11, //mov qword ptr [r9], rdx + 0xC3 //ret +}; +typedef R128_U64 (*r128__udiv128Proc)(R128_U64 nlo, R128_U64 nhi, R128_U64 d, R128_U64 *rem); +static const r128__udiv128Proc r128__udiv128 = (r128__udiv128Proc)(void*)r128__udiv128Code; +#else +static R128_U64 r128__udiv128(R128_U64 nlo, R128_U64 nhi, R128_U64 d, R128_U64 *rem) +{ +#if defined(_M_X64) && !defined(R128_STDC_ONLY) + return _udiv128(nhi, nlo, d, rem); +#elif defined(__x86_64__) && !defined(R128_STDC_ONLY) + R128_U64 q, r; + __asm("divq %4" + : "=a"(q), "=d"(r) + : "a"(nlo), "d"(nhi), "X"(d)); + *rem = r; + return q; +#else + R128_U64 tmp; + R128_U32 d0, d1; + R128_U32 n3, n2, n1, n0; + R128_U32 q0, q1; + R128_U32 r; + int shift; + + R128_ASSERT(d != 0); //division by zero + R128_ASSERT(nhi < d); //overflow + + // normalize + shift = r128__clz64(d); + + if (shift) { + R128 tmp128; + R128_SET2(&tmp128, nlo, nhi); + r128Shl(&tmp128, &tmp128, shift); + n3 = R128_R3(&tmp128); + n2 = R128_R2(&tmp128); + n1 = R128_R1(&tmp128); + n0 = R128_R0(&tmp128); + d <<= shift; + } else { + n3 = (R128_U32)(nhi >> 32); + n2 = (R128_U32)nhi; + n1 = (R128_U32)(nlo >> 32); + n0 = (R128_U32)nlo; + } + + d1 = (R128_U32)(d >> 32); + d0 = (R128_U32)d; + + // first digit + R128_ASSERT(n3 <= d1); + if (n3 < d1) { + q1 = r128__udiv64(n2, n3, d1, &r); + 
} else { + q1 = 0xffffffffu; + r = n2 + d1; + } +refine1: + if (r128__umul64(q1, d0) > ((R128_U64)r << 32) + n1) { + --q1; + if (r < ~d1 + 1) { + r += d1; + goto refine1; + } + } + + tmp = ((R128_U64)n2 << 32) + n1 - (r128__umul64(q1, d0) + (r128__umul64(q1, d1) << 32)); + n2 = (R128_U32)(tmp >> 32); + n1 = (R128_U32)tmp; + + // second digit + R128_ASSERT(n2 <= d1); + if (n2 < d1) { + q0 = r128__udiv64(n1, n2, d1, &r); + } else { + q0 = 0xffffffffu; + r = n1 + d1; + } +refine0: + if (r128__umul64(q0, d0) > ((R128_U64)r << 32) + n0) { + --q0; + if (r < ~d1 + 1) { + r += d1; + goto refine0; + } + } + + tmp = ((R128_U64)n1 << 32) + n0 - (r128__umul64(q0, d0) + (r128__umul64(q0, d1) << 32)); + n1 = (R128_U32)(tmp >> 32); + n0 = (R128_U32)tmp; + + *rem = (((R128_U64)n1 << 32) + n0) >> shift; + return ((R128_U64)q1 << 32) + q0; +#endif +} +#endif + +static int r128__ucmp(const R128 *a, const R128 *b) +{ + if (a->hi != b->hi) { + if (a->hi > b->hi) { + return 1; + } else { + return -1; + } + } else { + if (a->lo == b->lo) { + return 0; + } else if (a->lo > b->lo) { + return 1; + } else { + return -1; + } + } +} + +static void r128__umul(R128 *dst, const R128 *a, const R128 *b) +{ +#if defined(_M_X64) && !defined(R128_STDC_ONLY) + R128_U64 t0, t1; + R128_U64 lo, hi = 0; + unsigned char carry; + + t0 = _umul128(a->lo, b->lo, &t1); + carry = _addcarry_u64(0, t1, t0 >> 63, &lo); + _addcarry_u64(carry, hi, hi, &hi); + + t0 = _umul128(a->lo, b->hi, &t1); + carry = _addcarry_u64(0, lo, t0, &lo); + _addcarry_u64(carry, hi, t1, &hi); + + t0 = _umul128(a->hi, b->lo, &t1); + carry = _addcarry_u64(0, lo, t0, &lo); + _addcarry_u64(carry, hi, t1, &hi); + + t0 = _umul128(a->hi, b->hi, &t1); + hi += t0; + + R128_SET2(dst, lo, hi); +#elif defined(__x86_64__) && !defined(R128_STDC_ONLY) + unsigned __int128 p0, p1, p2, p3; + p0 = a->lo * (unsigned __int128)b->lo; + p1 = a->lo * (unsigned __int128)b->hi; + p2 = a->hi * (unsigned __int128)b->lo; + p3 = a->hi * (unsigned __int128)b->hi; + + p0 
= (p3 << 64) + p2 + p1 + (p0 >> 64) + ((R128_U64)p0 >> 63); + dst->lo = (R128_U64)p0; + dst->hi = (R128_U64)(p0 >> 64); +#else + R128 p0, p1, p2, p3, round; + + r128__umul128(&p0, a->lo, b->lo); + round.hi = 0; round.lo = p0.lo >> 63; + p0.lo = p0.hi; p0.hi = 0; //r128Shr(&p0, &p0, 64); + r128Add(&p0, &p0, &round); + + r128__umul128(&p1, a->hi, b->lo); + r128Add(&p0, &p0, &p1); + + r128__umul128(&p2, a->lo, b->hi); + r128Add(&p0, &p0, &p2); + + r128__umul128(&p3, a->hi, b->hi); + p3.hi = p3.lo; p3.lo = 0; //r128Shl(&p3, &p3, 64); + r128Add(&p0, &p0, &p3); + + R128_SET2(dst, p0.lo, p0.hi); +#endif +} + +// Shift d left until the high bit is set, and shift n left by the same amount. +// returns non-zero on overflow. +static int r128__norm(R128 *n, R128 *d, R128_U64 *n2) +{ + R128_U64 d0, d1; + R128_U64 n0, n1; + int shift; + + d1 = d->hi; + d0 = d->lo; + n1 = n->hi; + n0 = n->lo; + + if (d1) { + shift = r128__clz64(d1); + if (shift) { + d1 = (d1 << shift) | (d0 >> (64 - shift)); + d0 = d0 << shift; + *n2 = n1 >> (64 - shift); + n1 = (n1 << shift) | (n0 >> (64 - shift)); + n0 = n0 << shift; + } else { + *n2 = 0; + } + } else { + shift = r128__clz64(d0); + if (r128__clz64(n1) <= shift) { + return 1; // overflow + } + + if (shift) { + d1 = d0 << shift; + d0 = 0; + *n2 = (n1 << shift) | (n0 >> (64 - shift)); + n1 = n0 << shift; + n0 = 0; + } else { + d1 = d0; + d0 = 0; + *n2 = n1; + n1 = n0; + n0 = 0; + } + } + + R128_SET2(n, n0, n1); + R128_SET2(d, d0, d1); + return 0; +} + +static void r128__udiv(R128 *quotient, const R128 *dividend, const R128 *divisor) +{ + R128 tmp; + R128_U64 d0, d1; + R128_U64 n1, n2, n3; + R128 q; + + R128_ASSERT(dividend != NULL); + R128_ASSERT(divisor != NULL); + R128_ASSERT(quotient != NULL); + R128_ASSERT(divisor->hi != 0 || divisor->lo != 0); // divide by zero + + // scale dividend and normalize + { + R128 n, d; + R128_SET2(&n, dividend->lo, dividend->hi); + R128_SET2(&d, divisor->lo, divisor->hi); + if (r128__norm(&n, &d, &n3)) { + 
R128_SET2(quotient, R128_max.lo, R128_max.hi); + return; + } + + d1 = d.hi; + d0 = d.lo; + n2 = n.hi; + n1 = n.lo; + } + + // first digit + R128_ASSERT(n3 <= d1); + { + R128 t0, t1; + t0.lo = n1; + if (n3 < d1) { + q.hi = r128__udiv128(n2, n3, d1, &t0.hi); + } else { + q.hi = R128_LIT_U64(0xffffffffffffffff); + t0.hi = n2 + d1; + } + +refine1: + r128__umul128(&t1, q.hi, d0); + if (r128__ucmp(&t1, &t0) > 0) { + --q.hi; + if (t0.hi < ~d1 + 1) { + t0.hi += d1; + goto refine1; + } + } + } + + { + R128 t0, t1, t2; + t0.hi = n2; + t0.lo = n1; + + r128__umul128(&t1, q.hi, d0); + r128__umul128(&t2, q.hi, d1); + + t2.hi = t2.lo; t2.lo = 0; //r128Shl(&t2, &t2, 64); + r128Add(&tmp, &t1, &t2); + r128Sub(&tmp, &t0, &tmp); + } + n2 = tmp.hi; + n1 = tmp.lo; + + // second digit + R128_ASSERT(n2 <= d1); + { + R128 t0, t1; + t0.lo = 0; + if (n2 < d1) { + q.lo = r128__udiv128(n1, n2, d1, &t0.hi); + } else { + q.lo = R128_LIT_U64(0xffffffffffffffff); + t0.hi = n1 + d1; + } + + refine0: + r128__umul128(&t1, q.lo, d0); + if (r128__ucmp(&t1, &t0) > 0) { + --q.lo; + if (t0.hi < ~d1 + 1) { + t0.hi += d1; + goto refine0; + } + } + } + + R128_SET2(quotient, q.lo, q.hi); +} + +static R128_U64 r128__umod(R128 *n, R128 *d) +{ + R128_U64 d0, d1; + R128_U64 n3, n2, n1; + R128_U64 q; + + R128_ASSERT(d != NULL); + R128_ASSERT(n != NULL); + R128_ASSERT(d->hi != 0 || d->lo != 0); // divide by zero + + if (r128__norm(n, d, &n3)) { + return R128_LIT_U64(0xffffffffffffffff); + } + + d1 = d->hi; + d0 = d->lo; + n2 = n->hi; + n1 = n->lo; + + R128_ASSERT(n3 < d1); + { + R128 t0, t1; + t0.lo = n1; + q = r128__udiv128(n2, n3, d1, &t0.hi); + + refine1: + r128__umul128(&t1, q, d0); + if (r128__ucmp(&t1, &t0) > 0) { + --q; + if (t0.hi < ~d1 + 1) { + t0.hi += d1; + goto refine1; + } + } + } + + return q; +} + +static int r128__format(char *dst, size_t dstSize, const R128 *v, const R128ToStringFormat *format) +{ + char buf[128]; + R128 tmp; + R128_U64 whole; + char *cursor, *decimal, *dstp = dst; + int sign = 0; 
+ int fullPrecision = 1; + int width, precision; + int padCnt, trail = 0; + + R128_ASSERT(dst != NULL && dstSize > 0); + R128_ASSERT(v != NULL); + R128_ASSERT(format != NULL); + + --dstSize; + + R128_SET2(&tmp, v->lo, v->hi); + if (r128IsNeg(&tmp)) { + r128__neg(&tmp, &tmp); + sign = 1; + } + + width = format->width; + if (width < 0) { + width = 0; + } + + precision = format->precision; + if (precision < 0) { + // print a maximum of 20 digits + fullPrecision = 0; + precision = 20; + } else if (precision > sizeof(buf) - 21) { + trail = precision - (sizeof(buf) - 21); + precision -= trail; + } + + whole = tmp.hi; + decimal = cursor = buf; + + // fractional part first in case a carry into the whole part is required + if (tmp.lo || format->decimal) { + while (tmp.lo || (fullPrecision && precision)) { + if ((int)(cursor - buf) == precision) { + if ((R128_S64)tmp.lo < 0) { + // round up, propagate carry backwards + char *c; + for (c = cursor - 1; c >= buf; --c) { + char d = ++*c; + if (d <= '9') { + goto endfrac; + } else { + *c = '0'; + } + } + + // carry out into the whole part + whole++; + } + + break; + } + + r128__umul128(&tmp, tmp.lo, 10); + *cursor++ = (char)tmp.hi + '0'; + } + + endfrac: + if (format->decimal || precision) { + decimal = cursor; + *cursor++ = R128_decimal; + } + } + + // whole part + do { + char digit = (char)(whole % 10); + whole /= 10; + *cursor++ = digit + '0'; + } while (whole); + +#define R128__WRITE(c) do { if (dstp < dst + dstSize) *dstp = c; ++dstp; } while(0) + + padCnt = width - (int)(cursor - buf) - 1; + + // left padding + if (!format->leftAlign) { + char padChar = format->zeroPad ? 
'0' : ' '; + if (format->zeroPad) { + if (sign) { + R128__WRITE('-'); + } else if (format->sign == R128ToStringSign_Plus) { + R128__WRITE('+'); + } else if (format->sign == R128ToStringSign_Space) { + R128__WRITE(' '); + } else { + ++padCnt; + } + } + + for (; padCnt > 0; --padCnt) { + R128__WRITE(padChar); + } + } + + if (format->leftAlign || !format->zeroPad) { + if (sign) { + R128__WRITE('-'); + } else if (format->sign == R128ToStringSign_Plus) { + R128__WRITE('+'); + } else if (format->sign == R128ToStringSign_Space) { + R128__WRITE(' '); + } else { + ++padCnt; + } + } + + { + char *i; + + // reverse the whole part + for (i = cursor - 1; i >= decimal; --i) { + R128__WRITE(*i); + } + + // copy the fractional part + for (i = buf; i < decimal; ++i) { + R128__WRITE(*i); + } + } + + // right padding + if (format->leftAlign) { + char padChar = format->zeroPad ? '0' : ' '; + for (; padCnt > 0; --padCnt) { + R128__WRITE(padChar); + } + } + + // trailing zeroes for very large precision + while (trail--) { + R128__WRITE('0'); + } + +#undef R128__WRITE + + if (dstp <= dst + dstSize) { + *dstp = '\0'; + } else { + dst[dstSize] = '\0'; + } + return (int)(dstp - dst); +} + +void r128FromInt(R128 *dst, R128_S64 v) +{ + R128_ASSERT(dst != NULL); + dst->lo = 0; + dst->hi = (R128_U64)v; + R128_DEBUG_SET(dst); +} + +void r128FromFloat(R128 *dst, double v) +{ + R128_ASSERT(dst != NULL); + + if (v < -9223372036854775808.0) { + r128Copy(dst, &R128_min); + } else if (v >= 9223372036854775808.0) { + r128Copy(dst, &R128_max); + } else { + R128 r; + int sign = 0; + + if (v < 0) { + v = -v; + sign = 1; + } + + r.hi = (R128_U64)(R128_S64)v; + v -= (R128_S64)v; + r.lo = (R128_U64)(v * 18446744073709551616.0); + + if (sign) { + r128__neg(&r, &r); + } + + r128Copy(dst, &r); + } +} + +void r128FromString(R128 *dst, const char *s, char **endptr) +{ + R128_U64 lo = 0, hi = 0; + R128_U64 base = 10; + + int sign = 0; + + R128_ASSERT(dst != NULL); + R128_ASSERT(s != NULL); + + R128_SET2(dst, 0, 
0); + + // consume whitespace + for (;;) { + if (*s == ' ' || *s == '\t' || *s == '\r' || *s == '\n' || *s == '\v') { + ++s; + } else { + break; + } + } + + // sign + if (*s == '-') { + sign = 1; + ++s; + } else if (*s == '+') { + ++s; + } + + // parse base prefix + if (s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) { + base = 16; + s += 2; + } + + // whole part + for (;; ++s) { + R128_U64 digit; + + if ('0' <= *s && *s <= '9') { + digit = *s - '0'; + } else if (base == 16 && 'a' <= *s && *s <= 'f') { + digit = *s - 'a' + 10; + } else if (base == 16 && 'A' <= *s && *s <= 'F') { + digit = *s - 'A' + 10; + } else { + break; + } + + hi = hi * base + digit; + } + + // fractional part + if (*s == R128_decimal) { + const char *exp = ++s; + + // find the last digit and work backwards + for (;; ++s) { + if ('0' <= *s && *s <= '9') { + } else if (base == 16 && ('a' <= *s && *s <= 'f')) { + } else if (base == 16 && ('A' <= *s && *s <= 'F')) { + } else { + break; + } + } + + for (--s; s >= exp; --s) { + R128_U64 digit, unused; + + if ('0' <= *s && *s <= '9') { + digit = *s - '0'; + } else if ('a' <= *s && *s <= 'f') { + digit = *s - 'a' + 10; + } else { + digit = *s - 'A' + 10; + } + + lo = r128__udiv128(lo, digit, base, &unused); + } + } + + R128_SET2(dst, lo, hi); + if (sign) { + r128__neg(dst, dst); + } + + if (endptr) { + *endptr = (char *) s; + } +} + +R128_S64 r128ToInt(const R128 *v) +{ + R128_ASSERT(v != NULL); + return (R128_S64)v->hi; +} + +double r128ToFloat(const R128 *v) +{ + R128 tmp; + int sign = 0; + double d; + + R128_ASSERT(v != NULL); + + R128_SET2(&tmp, v->lo, v->hi); + if (r128IsNeg(&tmp)) { + r128__neg(&tmp, &tmp); + sign = 1; + } + + d = tmp.hi + tmp.lo * (1 / 18446744073709551616.0); + if (sign) { + d = -d; + } + + return d; +} + +int r128ToStringOpt(char *dst, size_t dstSize, const R128 *v, const R128ToStringFormat *opt) +{ + return r128__format(dst, dstSize, v, opt); +} + +int r128ToStringf(char *dst, size_t dstSize, const char *format, const R128 *v) 
+{ + R128ToStringFormat opts; + + R128_ASSERT(dst != NULL && dstSize); + R128_ASSERT(format != NULL); + R128_ASSERT(v != NULL); + + opts.sign = R128__defaultFormat.sign; + opts.precision = R128__defaultFormat.precision; + opts.zeroPad = R128__defaultFormat.zeroPad; + opts.decimal = R128__defaultFormat.decimal; + opts.leftAlign = R128__defaultFormat.leftAlign; + + if (*format == '%') { + ++format; + } + + // flags field + for (;; ++format) { + if (*format == ' ' && opts.sign != R128ToStringSign_Plus) { + opts.sign = R128ToStringSign_Space; + } else if (*format == '+') { + opts.sign = R128ToStringSign_Plus; + } else if (*format == '0') { + opts.zeroPad = 1; + } else if (*format == '-') { + opts.leftAlign = 1; + } else if (*format == '#') { + opts.decimal = 1; + } else { + break; + } + } + + // width field + opts.width = 0; + for (;;) { + if ('0' <= *format && *format <= '9') { + opts.width = opts.width * 10 + *format++ - '0'; + } else { + break; + } + } + + // precision field + if (*format == '.') { + opts.precision = 0; + ++format; + for (;;) { + if ('0' <= *format && *format <= '9') { + opts.precision = opts.precision * 10 + *format++ - '0'; + } else { + break; + } + } + } + + return r128__format(dst, dstSize, v, &opts); +} + +int r128ToString(char *dst, size_t dstSize, const R128 *v) +{ + return r128__format(dst, dstSize, v, &R128__defaultFormat); +} + +void r128Copy(R128 *dst, const R128 *src) +{ + R128_ASSERT(dst != NULL); + R128_ASSERT(src != NULL); + dst->lo = src->lo; + dst->hi = src->hi; + R128_DEBUG_SET(dst); +} + +void r128Neg(R128 *dst, const R128 *src) +{ + r128__neg(dst, src); + R128_DEBUG_SET(dst); +} + +void r128Not(R128 *dst, const R128 *src) +{ + R128_ASSERT(dst != NULL); + R128_ASSERT(src != NULL); + + dst->lo = ~src->lo; + dst->hi = ~src->hi; + R128_DEBUG_SET(dst); +} + +void r128Or(R128 *dst, const R128 *a, const R128 *b) +{ + R128_ASSERT(dst != NULL); + R128_ASSERT(a != NULL); + R128_ASSERT(b != NULL); + + dst->lo = a->lo | b->lo; + dst->hi = 
a->hi | b->hi; + R128_DEBUG_SET(dst); +} + +void r128And(R128 *dst, const R128 *a, const R128 *b) +{ + R128_ASSERT(dst != NULL); + R128_ASSERT(a != NULL); + R128_ASSERT(b != NULL); + + dst->lo = a->lo & b->lo; + dst->hi = a->hi & b->hi; + R128_DEBUG_SET(dst); +} + +void r128Xor(R128 *dst, const R128 *a, const R128 *b) +{ + R128_ASSERT(dst != NULL); + R128_ASSERT(a != NULL); + R128_ASSERT(b != NULL); + + dst->lo = a->lo ^ b->lo; + dst->hi = a->hi ^ b->hi; + R128_DEBUG_SET(dst); +} + +void r128Shl(R128 *dst, const R128 *src, int amount) +{ + R128_U64 r[4]; + + R128_ASSERT(dst != NULL); + R128_ASSERT(src != NULL); + +#if defined(_M_IX86) && !defined(R128_STDC_ONLY) + __asm { + // load src + mov edx, dword ptr[src] + mov ecx, amount + + mov edi, dword ptr[edx] + mov esi, dword ptr[edx + 4] + mov ebx, dword ptr[edx + 8] + mov eax, dword ptr[edx + 12] + + // shift mod 32 + shld eax, ebx, cl + shld ebx, esi, cl + shld esi, edi, cl + shl edi, cl + + // clear out low 12 bytes of stack + xor edx, edx + mov dword ptr[r], edx + mov dword ptr[r + 4], edx + mov dword ptr[r + 8], edx + + // store shifted amount offset by count/32 bits + shr ecx, 5 + and ecx, 3 + mov dword ptr[r + ecx * 4 + 0], edi + mov dword ptr[r + ecx * 4 + 4], esi + mov dword ptr[r + ecx * 4 + 8], ebx + mov dword ptr[r + ecx * 4 + 12], eax + } +#else + + r[0] = src->lo; + r[1] = src->hi; + + amount &= 127; + if (amount >= 64) { + r[1] = r[0] << (amount - 64); + r[0] = 0; + } else if (amount) { +# ifdef _M_X64 + r[1] = __shiftleft128(r[0], r[1], (char) amount); +# else + r[1] = (r[1] << amount) | (r[0] >> (64 - amount)); +# endif + r[0] = r[0] << amount; + } +#endif //_M_IX86 + + dst->lo = r[0]; + dst->hi = r[1]; + R128_DEBUG_SET(dst); +} + +void r128Shr(R128 *dst, const R128 *src, int amount) +{ + R128_U64 r[4]; + + R128_ASSERT(dst != NULL); + R128_ASSERT(src != NULL); + +#if defined(_M_IX86) && !defined(R128_STDC_ONLY) + __asm { + // load src + mov edx, dword ptr[src] + mov ecx, amount + + mov edi, dword 
ptr[edx] + mov esi, dword ptr[edx + 4] + mov ebx, dword ptr[edx + 8] + mov eax, dword ptr[edx + 12] + + // shift mod 32 + shrd edi, esi, cl + shrd esi, ebx, cl + shrd ebx, eax, cl + shr eax, cl + + // clear out high 12 bytes of stack + xor edx, edx + mov dword ptr[r + 20], edx + mov dword ptr[r + 24], edx + mov dword ptr[r + 28], edx + + // store shifted amount offset by -count/32 bits + shr ecx, 5 + and ecx, 3 + neg ecx + mov dword ptr[r + ecx * 4 + 16], edi + mov dword ptr[r + ecx * 4 + 20], esi + mov dword ptr[r + ecx * 4 + 24], ebx + mov dword ptr[r + ecx * 4 + 28], eax + } +#else + r[2] = src->lo; + r[3] = src->hi; + + amount &= 127; + if (amount >= 64) { + r[2] = r[3] >> (amount - 64); + r[3] = 0; + } else if (amount) { +#ifdef _M_X64 + r[2] = __shiftright128(r[2], r[3], (char) amount); +#else + r[2] = (r[2] >> amount) | (r[3] << (64 - amount)); +#endif + r[3] = r[3] >> amount; + } +#endif + + dst->lo = r[2]; + dst->hi = r[3]; + R128_DEBUG_SET(dst); +} + +void r128Sar(R128 *dst, const R128 *src, int amount) +{ + R128_U64 r[4]; + + R128_ASSERT(dst != NULL); + R128_ASSERT(src != NULL); + +#if defined(_M_IX86) && !defined(R128_STDC_ONLY) + __asm { + // load src + mov edx, dword ptr[src] + mov ecx, amount + + mov edi, dword ptr[edx] + mov esi, dword ptr[edx + 4] + mov ebx, dword ptr[edx + 8] + mov eax, dword ptr[edx + 12] + + // shift mod 32 + shrd edi, esi, cl + shrd esi, ebx, cl + shrd ebx, eax, cl + sar eax, cl + + // copy sign to high 12 bytes of stack + cdq + mov dword ptr[r + 20], edx + mov dword ptr[r + 24], edx + mov dword ptr[r + 28], edx + + // store shifted amount offset by -count/32 bits + shr ecx, 5 + and ecx, 3 + neg ecx + mov dword ptr[r + ecx * 4 + 16], edi + mov dword ptr[r + ecx * 4 + 20], esi + mov dword ptr[r + ecx * 4 + 24], ebx + mov dword ptr[r + ecx * 4 + 28], eax + } +#else + r[2] = src->lo; + r[3] = src->hi; + + amount &= 127; + if (amount >= 64) { + r[2] = (R128_U64)((R128_S64)r[3] >> (amount - 64)); + r[3] = (R128_U64)((R128_S64)r[3] 
>> 63); + } else if (amount) { + r[2] = (r[2] >> amount) | (R128_U64)((R128_S64)r[3] << (64 - amount)); + r[3] = (R128_U64)((R128_S64)r[3] >> amount); + } +#endif + + dst->lo = r[2]; + dst->hi = r[3]; + R128_DEBUG_SET(dst); +} + +void r128Add(R128 *dst, const R128 *a, const R128 *b) +{ + unsigned char carry = 0; + R128_ASSERT(dst != NULL); + R128_ASSERT(a != NULL); + R128_ASSERT(b != NULL); + +#if R128_INTEL && !defined(R128_STDC_ONLY) +# if R128_64BIT + carry = _addcarry_u64(carry, a->lo, b->lo, &dst->lo); + carry = _addcarry_u64(carry, a->hi, b->hi, &dst->hi); +# else + R128_U32 r0, r1, r2, r3; + carry = _addcarry_u32(carry, R128_R0(a), R128_R0(b), &r0); + carry = _addcarry_u32(carry, R128_R1(a), R128_R1(b), &r1); + carry = _addcarry_u32(carry, R128_R2(a), R128_R2(b), &r2); + carry = _addcarry_u32(carry, R128_R3(a), R128_R3(b), &r3); + R128_SET4(dst, r0, r1, r2, r3); +# endif //R128_64BIT +#else + { + R128_U64 r = a->lo + b->lo; + carry = r < a->lo; + dst->lo = r; + dst->hi = a->hi + b->hi + carry; + } +#endif //R128_INTEL + + R128_DEBUG_SET(dst); +} + +void r128Sub(R128 *dst, const R128 *a, const R128 *b) +{ + unsigned char borrow = 0; + R128_ASSERT(dst != NULL); + R128_ASSERT(a != NULL); + R128_ASSERT(b != NULL); + +#if R128_INTEL && !defined(R128_STDC_ONLY) +# if R128_64BIT + borrow = _subborrow_u64(borrow, a->lo, b->lo, &dst->lo); + borrow = _subborrow_u64(borrow, a->hi, b->hi, &dst->hi); +# else + R128_U32 r0, r1, r2, r3; + borrow = _subborrow_u32(borrow, R128_R0(a), R128_R0(b), &r0); + borrow = _subborrow_u32(borrow, R128_R1(a), R128_R1(b), &r1); + borrow = _subborrow_u32(borrow, R128_R2(a), R128_R2(b), &r2); + borrow = _subborrow_u32(borrow, R128_R3(a), R128_R3(b), &r3); + R128_SET4(dst, r0, r1, r2, r3); +# endif //R128_64BIT +#else + { + R128_U64 r = a->lo - b->lo; + borrow = r > a->lo; + dst->lo = r; + dst->hi = a->hi - b->hi - borrow; + } +#endif //R128_INTEL + + R128_DEBUG_SET(dst); +} + +void r128Mul(R128 *dst, const R128 *a, const R128 *b) +{ + int 
sign = 0; + R128 ta, tb, tc; + + R128_ASSERT(dst != NULL); + R128_ASSERT(a != NULL); + R128_ASSERT(b != NULL); + + R128_SET2(&ta, a->lo, a->hi); + R128_SET2(&tb, b->lo, b->hi); + + if (r128IsNeg(&ta)) { + r128__neg(&ta, &ta); + sign = !sign; + } + if (r128IsNeg(&tb)) { + r128__neg(&tb, &tb); + sign = !sign; + } + + r128__umul(&tc, &ta, &tb); + if (sign) { + r128__neg(&tc, &tc); + } + + r128Copy(dst, &tc); +} + +void r128Div(R128 *dst, const R128 *a, const R128 *b) +{ + int sign = 0; + R128 tn, td, tq; + + R128_ASSERT(dst != NULL); + R128_ASSERT(a != NULL); + R128_ASSERT(b != NULL); + + R128_SET2(&tn, a->lo, a->hi); + R128_SET2(&td, b->lo, b->hi); + + if (r128IsNeg(&tn)) { + r128__neg(&tn, &tn); + sign = !sign; + } + + if (td.lo == 0 && td.hi == 0) { + // divide by zero + if (sign) { + r128Copy(dst, &R128_min); + } else { + r128Copy(dst, &R128_max); + } + return; + } else if (r128IsNeg(&td)) { + r128__neg(&td, &td); + sign = !sign; + } + + r128__udiv(&tq, &tn, &td); + + if (sign) { + r128__neg(&tq, &tq); + } + + r128Copy(dst, &tq); +} + +void r128Mod(R128 *dst, const R128 *a, const R128 *b) +{ + int sign = 0; + R128 tn, td, tq; + + R128_ASSERT(dst != NULL); + R128_ASSERT(a != NULL); + R128_ASSERT(b != NULL); + + R128_SET2(&tn, a->lo, a->hi); + R128_SET2(&td, b->lo, b->hi); + + if (r128IsNeg(&tn)) { + r128__neg(&tn, &tn); + sign = !sign; + } + + if (td.lo == 0 && td.hi == 0) { + // divide by zero + if (sign) { + r128Copy(dst, &R128_min); + } else { + r128Copy(dst, &R128_max); + } + return; + } else if (r128IsNeg(&td)) { + r128__neg(&td, &td); + sign = !sign; + } + + tq.hi = r128__umod(&tn, &td); + tq.lo = 0; + + if (sign) { + tq.hi = ~tq.hi + 1; + } + + r128Mul(&tq, &tq, b); + r128Sub(dst, a, &tq); +} + +void r128Rsqrt(R128 *dst, const R128 *v) +{ + static const R128 threeHalves = { R128_LIT_U64(0x8000000000000000), 1 }; + R128 x, est; + int i; + + if ((R128_S64)v->hi < 0) { + r128Copy(dst, &R128_min); + return; + } + + R128_SET2(&x, v->lo, v->hi); + + // get initial 
estimate + if (x.hi) { + int shift = (64 + r128__clz64(x.hi)) >> 1; + est.lo = R128_LIT_U64(1) << shift; + est.hi = 0; + } else if (x.lo) { + int shift = r128__clz64(x.lo) >> 1; + est.hi = R128_LIT_U64(1) << shift; + est.lo = 0; + } else { + R128_SET2(dst, 0, 0); + return; + } + + // x /= 2 + r128Shr(&x, &x, 1); + + // Newton-Raphson iterate + for (i = 0; i < 7; ++i) { + R128 newEst; + + // newEst = est * (threeHalves - (x / 2) * est * est); + r128__umul(&newEst, &est, &est); + r128__umul(&newEst, &newEst, &x); + r128Sub(&newEst, &threeHalves, &newEst); + r128__umul(&newEst, &est, &newEst); + + if (newEst.lo == est.lo && newEst.hi == est.hi) { + break; + } + R128_SET2(&est, newEst.lo, newEst.hi); + } + + r128Copy(dst, &est); +} + +void r128Sqrt(R128 *dst, const R128 *v) +{ + R128 x, est; + int i; + + if ((R128_S64)v->hi < 0) { + r128Copy(dst, &R128_min); + return; + } + + R128_SET2(&x, v->lo, v->hi); + + // get initial estimate + if (x.hi) { + int shift = (63 - r128__clz64(x.hi)) >> 1; + r128Shr(&est, &x, shift); + } else if (x.lo) { + int shift = (1 + r128__clz64(x.lo)) >> 1; + r128Shl(&est, &x, shift); + } else { + R128_SET2(dst, 0, 0); + return; + } + + // Newton-Raphson iterate + for (i = 0; i < 7; ++i) { + R128 newEst; + + // newEst = (est + x / est) / 2 + r128__udiv(&newEst, &x, &est); + r128Add(&newEst, &newEst, &est); + r128Shr(&newEst, &newEst, 1); + + if (newEst.lo == est.lo && newEst.hi == est.hi) { + break; + } + R128_SET2(&est, newEst.lo, newEst.hi); + } + + r128Copy(dst, &est); +} + +int r128Cmp(const R128 *a, const R128 *b) +{ + R128_ASSERT(a != NULL); + R128_ASSERT(b != NULL); + + if (a->hi == b->hi) { + if (a->lo == b->lo) { + return 0; + } else if (a->lo > b->lo) { + return 1; + } else { + return -1; + } + } else if ((R128_S64)a->hi > (R128_S64)b->hi) { + return 1; + } else { + return -1; + } +} + +int r128IsNeg(const R128 *v) +{ + R128_ASSERT(v != NULL); + + return (R128_S64)v->hi < 0; +} + +void r128Min(R128 *dst, const R128 *a, const R128 *b) 
+{ + R128_ASSERT(dst != NULL); + R128_ASSERT(a != NULL); + R128_ASSERT(b != NULL); + + if (r128Cmp(a, b) < 0) { + r128Copy(dst, a); + } else { + r128Copy(dst, b); + } +} + +void r128Max(R128 *dst, const R128 *a, const R128 *b) +{ + R128_ASSERT(dst != NULL); + R128_ASSERT(a != NULL); + R128_ASSERT(b != NULL); + + if (r128Cmp(a, b) > 0) { + r128Copy(dst, a); + } else { + r128Copy(dst, b); + } +} + +void r128Floor(R128 *dst, const R128 *v) +{ + R128_ASSERT(dst != NULL); + R128_ASSERT(v != NULL); + + if ((R128_S64)v->hi < 0) { + dst->hi = v->hi - (v->lo != 0); + } else { + dst->hi = v->hi; + } + dst->lo = 0; + R128_DEBUG_SET(dst); +} + +void r128Ceil(R128 *dst, const R128 *v) +{ + R128_ASSERT(dst != NULL); + R128_ASSERT(v != NULL); + + if ((R128_S64)v->hi > 0) { + dst->hi = v->hi + (v->lo != 0); + } else { + dst->hi = v->hi; + } + dst->lo = 0; + R128_DEBUG_SET(dst); +} + +#endif //R128_IMPLEMENTATION diff --git a/thirdparty/misc/stb_rect_pack.h b/thirdparty/misc/stb_rect_pack.h new file mode 100644 index 0000000000..5c848de0e7 --- /dev/null +++ b/thirdparty/misc/stb_rect_pack.h @@ -0,0 +1,628 @@ +// stb_rect_pack.h - v1.00 - public domain - rectangle packing +// Sean Barrett 2014 +// +// Useful for e.g. packing rectangular textures into an atlas. +// Does not do rotation. +// +// Not necessarily the awesomest packing method, but better than +// the totally naive one in stb_truetype (which is primarily what +// this is meant to replace). +// +// Has only had a few tests run, may have issues. +// +// More docs to come. +// +// No memory allocations; uses qsort() and assert() from stdlib. +// Can override those by defining STBRP_SORT and STBRP_ASSERT. +// +// This library currently uses the Skyline Bottom-Left algorithm. +// +// Please note: better rectangle packers are welcome! Please +// implement them to the same API, but with a different init +// function. 
+// +// Credits +// +// Library +// Sean Barrett +// Minor features +// Martins Mozeiko +// github:IntellectualKitty +// +// Bugfixes / warning fixes +// Jeremy Jaussaud +// Fabian Giesen +// +// Version history: +// +// 1.00 (2019-02-25) avoid small space waste; gracefully fail too-wide rectangles +// 0.99 (2019-02-07) warning fixes +// 0.11 (2017-03-03) return packing success/fail result +// 0.10 (2016-10-25) remove cast-away-const to avoid warnings +// 0.09 (2016-08-27) fix compiler warnings +// 0.08 (2015-09-13) really fix bug with empty rects (w=0 or h=0) +// 0.07 (2015-09-13) fix bug with empty rects (w=0 or h=0) +// 0.06 (2015-04-15) added STBRP_SORT to allow replacing qsort +// 0.05: added STBRP_ASSERT to allow replacing assert +// 0.04: fixed minor bug in STBRP_LARGE_RECTS support +// 0.01: initial release +// +// LICENSE +// +// See end of file for license information. + +////////////////////////////////////////////////////////////////////////////// +// +// INCLUDE SECTION +// + +#ifndef STB_INCLUDE_STB_RECT_PACK_H +#define STB_INCLUDE_STB_RECT_PACK_H + +#define STB_RECT_PACK_VERSION 1 + +#ifdef STBRP_STATIC +#define STBRP_DEF static +#else +#define STBRP_DEF extern +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct stbrp_context stbrp_context; +typedef struct stbrp_node stbrp_node; +typedef struct stbrp_rect stbrp_rect; + +#ifdef STBRP_LARGE_RECTS +typedef int stbrp_coord; +#else +typedef unsigned short stbrp_coord; +#endif + +STBRP_DEF int stbrp_pack_rects (stbrp_context *context, stbrp_rect *rects, int num_rects); +// Assign packed locations to rectangles. The rectangles are of type +// 'stbrp_rect' defined below, stored in the array 'rects', and there +// are 'num_rects' many of them. +// +// Rectangles which are successfully packed have the 'was_packed' flag +// set to a non-zero value and 'x' and 'y' store the minimum location +// on each axis (i.e. 
bottom-left in cartesian coordinates, top-left +// if you imagine y increasing downwards). Rectangles which do not fit +// have the 'was_packed' flag set to 0. +// +// You should not try to access the 'rects' array from another thread +// while this function is running, as the function temporarily reorders +// the array while it executes. +// +// To pack into another rectangle, you need to call stbrp_init_target +// again. To continue packing into the same rectangle, you can call +// this function again. Calling this multiple times with multiple rect +// arrays will probably produce worse packing results than calling it +// a single time with the full rectangle array, but the option is +// available. +// +// The function returns 1 if all of the rectangles were successfully +// packed and 0 otherwise. + +struct stbrp_rect +{ + // reserved for your use: + int id; + + // input: + stbrp_coord w, h; + + // output: + stbrp_coord x, y; + int was_packed; // non-zero if valid packing + +}; // 16 bytes, nominally + + +STBRP_DEF void stbrp_init_target (stbrp_context *context, int width, int height, stbrp_node *nodes, int num_nodes); +// Initialize a rectangle packer to: +// pack a rectangle that is 'width' by 'height' in dimensions +// using temporary storage provided by the array 'nodes', which is 'num_nodes' long +// +// You must call this function every time you start packing into a new target. +// +// There is no "shutdown" function. The 'nodes' memory must stay valid for +// the following stbrp_pack_rects() call (or calls), but can be freed after +// the call (or calls) finish. +// +// Note: to guarantee best results, either: +// 1. make sure 'num_nodes' >= 'width' +// or 2. call stbrp_allow_out_of_mem() defined below with 'allow_out_of_mem = 1' +// +// If you don't do either of the above things, widths will be quantized to multiples +// of small integers to guarantee the algorithm doesn't run out of temporary storage. 
+#ifndef STBRP_SORT
+#include <stdlib.h>
+#define STBRP_SORT qsort
+#endif
+
+#ifndef STBRP_ASSERT
+#include <assert.h>
+#define STBRP_ASSERT assert
+#endif
STBRP__INIT_skyline = 1 +}; + +STBRP_DEF void stbrp_setup_heuristic(stbrp_context *context, int heuristic) +{ + switch (context->init_mode) { + case STBRP__INIT_skyline: + STBRP_ASSERT(heuristic == STBRP_HEURISTIC_Skyline_BL_sortHeight || heuristic == STBRP_HEURISTIC_Skyline_BF_sortHeight); + context->heuristic = heuristic; + break; + default: + STBRP_ASSERT(0); + } +} + +STBRP_DEF void stbrp_setup_allow_out_of_mem(stbrp_context *context, int allow_out_of_mem) +{ + if (allow_out_of_mem) + // if it's ok to run out of memory, then don't bother aligning them; + // this gives better packing, but may fail due to OOM (even though + // the rectangles easily fit). @TODO a smarter approach would be to only + // quantize once we've hit OOM, then we could get rid of this parameter. + context->align = 1; + else { + // if it's not ok to run out of memory, then quantize the widths + // so that num_nodes is always enough nodes. + // + // I.e. num_nodes * align >= width + // align >= width / num_nodes + // align = ceil(width/num_nodes) + + context->align = (context->width + context->num_nodes-1) / context->num_nodes; + } +} + +STBRP_DEF void stbrp_init_target(stbrp_context *context, int width, int height, stbrp_node *nodes, int num_nodes) +{ + int i; +#ifndef STBRP_LARGE_RECTS + STBRP_ASSERT(width <= 0xffff && height <= 0xffff); +#endif + + for (i=0; i < num_nodes-1; ++i) + nodes[i].next = &nodes[i+1]; + nodes[i].next = NULL; + context->init_mode = STBRP__INIT_skyline; + context->heuristic = STBRP_HEURISTIC_Skyline_default; + context->free_head = &nodes[0]; + context->active_head = &context->extra[0]; + context->width = width; + context->height = height; + context->num_nodes = num_nodes; + stbrp_setup_allow_out_of_mem(context, 0); + + // node 0 is the full width, node 1 is the sentinel (lets us not store width explicitly) + context->extra[0].x = 0; + context->extra[0].y = 0; + context->extra[0].next = &context->extra[1]; + context->extra[1].x = (stbrp_coord) width; +#ifdef 
STBRP_LARGE_RECTS + context->extra[1].y = (1<<30); +#else + context->extra[1].y = 65535; +#endif + context->extra[1].next = NULL; +} + +// find minimum y position if it starts at x1 +static int stbrp__skyline_find_min_y(stbrp_context *c, stbrp_node *first, int x0, int width, int *pwaste) +{ + stbrp_node *node = first; + int x1 = x0 + width; + int min_y, visited_width, waste_area; + + STBRP__NOTUSED(c); + + STBRP_ASSERT(first->x <= x0); + + #if 0 + // skip in case we're past the node + while (node->next->x <= x0) + ++node; + #else + STBRP_ASSERT(node->next->x > x0); // we ended up handling this in the caller for efficiency + #endif + + STBRP_ASSERT(node->x <= x0); + + min_y = 0; + waste_area = 0; + visited_width = 0; + while (node->x < x1) { + if (node->y > min_y) { + // raise min_y higher. + // we've accounted for all waste up to min_y, + // but we'll now add more waste for everything we've visted + waste_area += visited_width * (node->y - min_y); + min_y = node->y; + // the first time through, visited_width might be reduced + if (node->x < x0) + visited_width += node->next->x - x0; + else + visited_width += node->next->x - node->x; + } else { + // add waste area + int under_width = node->next->x - node->x; + if (under_width + visited_width > width) + under_width = width - visited_width; + waste_area += under_width * (min_y - node->y); + visited_width += under_width; + } + node = node->next; + } + + *pwaste = waste_area; + return min_y; +} + +typedef struct +{ + int x,y; + stbrp_node **prev_link; +} stbrp__findresult; + +static stbrp__findresult stbrp__skyline_find_best_pos(stbrp_context *c, int width, int height) +{ + int best_waste = (1<<30), best_x, best_y = (1 << 30); + stbrp__findresult fr; + stbrp_node **prev, *node, *tail, **best = NULL; + + // align to multiple of c->align + width = (width + c->align - 1); + width -= width % c->align; + STBRP_ASSERT(width % c->align == 0); + + // if it can't possibly fit, bail immediately + if (width > c->width || height > 
c->height) { + fr.prev_link = NULL; + fr.x = fr.y = 0; + return fr; + } + + node = c->active_head; + prev = &c->active_head; + while (node->x + width <= c->width) { + int y,waste; + y = stbrp__skyline_find_min_y(c, node, node->x, width, &waste); + if (c->heuristic == STBRP_HEURISTIC_Skyline_BL_sortHeight) { // actually just want to test BL + // bottom left + if (y < best_y) { + best_y = y; + best = prev; + } + } else { + // best-fit + if (y + height <= c->height) { + // can only use it if it first vertically + if (y < best_y || (y == best_y && waste < best_waste)) { + best_y = y; + best_waste = waste; + best = prev; + } + } + } + prev = &node->next; + node = node->next; + } + + best_x = (best == NULL) ? 0 : (*best)->x; + + // if doing best-fit (BF), we also have to try aligning right edge to each node position + // + // e.g, if fitting + // + // ____________________ + // |____________________| + // + // into + // + // | | + // | ____________| + // |____________| + // + // then right-aligned reduces waste, but bottom-left BL is always chooses left-aligned + // + // This makes BF take about 2x the time + + if (c->heuristic == STBRP_HEURISTIC_Skyline_BF_sortHeight) { + tail = c->active_head; + node = c->active_head; + prev = &c->active_head; + // find first node that's admissible + while (tail->x < width) + tail = tail->next; + while (tail) { + int xpos = tail->x - width; + int y,waste; + STBRP_ASSERT(xpos >= 0); + // find the left position that matches this + while (node->next->x <= xpos) { + prev = &node->next; + node = node->next; + } + STBRP_ASSERT(node->next->x > xpos && node->x <= xpos); + y = stbrp__skyline_find_min_y(c, node, xpos, width, &waste); + if (y + height <= c->height) { + if (y <= best_y) { + if (y < best_y || waste < best_waste || (waste==best_waste && xpos < best_x)) { + best_x = xpos; + STBRP_ASSERT(y <= best_y); + best_y = y; + best_waste = waste; + best = prev; + } + } + } + tail = tail->next; + } + } + + fr.prev_link = best; + fr.x = best_x; + 
fr.y = best_y; + return fr; +} + +static stbrp__findresult stbrp__skyline_pack_rectangle(stbrp_context *context, int width, int height) +{ + // find best position according to heuristic + stbrp__findresult res = stbrp__skyline_find_best_pos(context, width, height); + stbrp_node *node, *cur; + + // bail if: + // 1. it failed + // 2. the best node doesn't fit (we don't always check this) + // 3. we're out of memory + if (res.prev_link == NULL || res.y + height > context->height || context->free_head == NULL) { + res.prev_link = NULL; + return res; + } + + // on success, create new node + node = context->free_head; + node->x = (stbrp_coord) res.x; + node->y = (stbrp_coord) (res.y + height); + + context->free_head = node->next; + + // insert the new node into the right starting point, and + // let 'cur' point to the remaining nodes needing to be + // stiched back in + + cur = *res.prev_link; + if (cur->x < res.x) { + // preserve the existing one, so start testing with the next one + stbrp_node *next = cur->next; + cur->next = node; + cur = next; + } else { + *res.prev_link = node; + } + + // from here, traverse cur and free the nodes, until we get to one + // that shouldn't be freed + while (cur->next && cur->next->x <= res.x + width) { + stbrp_node *next = cur->next; + // move the current node to the free list + cur->next = context->free_head; + context->free_head = cur; + cur = next; + } + + // stitch the list back in + node->next = cur; + + if (cur->x < res.x + width) + cur->x = (stbrp_coord) (res.x + width); + +#ifdef _DEBUG + cur = context->active_head; + while (cur->x < context->width) { + STBRP_ASSERT(cur->x < cur->next->x); + cur = cur->next; + } + STBRP_ASSERT(cur->next == NULL); + + { + int count=0; + cur = context->active_head; + while (cur) { + cur = cur->next; + ++count; + } + cur = context->free_head; + while (cur) { + cur = cur->next; + ++count; + } + STBRP_ASSERT(count == context->num_nodes+2); + } +#endif + + return res; +} + +static int 
rect_height_compare(const void *a, const void *b) +{ + const stbrp_rect *p = (const stbrp_rect *) a; + const stbrp_rect *q = (const stbrp_rect *) b; + if (p->h > q->h) + return -1; + if (p->h < q->h) + return 1; + return (p->w > q->w) ? -1 : (p->w < q->w); +} + +static int rect_original_order(const void *a, const void *b) +{ + const stbrp_rect *p = (const stbrp_rect *) a; + const stbrp_rect *q = (const stbrp_rect *) b; + return (p->was_packed < q->was_packed) ? -1 : (p->was_packed > q->was_packed); +} + +#ifdef STBRP_LARGE_RECTS +#define STBRP__MAXVAL 0xffffffff +#else +#define STBRP__MAXVAL 0xffff +#endif + +STBRP_DEF int stbrp_pack_rects(stbrp_context *context, stbrp_rect *rects, int num_rects) +{ + int i, all_rects_packed = 1; + + // we use the 'was_packed' field internally to allow sorting/unsorting + for (i=0; i < num_rects; ++i) { + rects[i].was_packed = i; + } + + // sort according to heuristic + STBRP_SORT(rects, num_rects, sizeof(rects[0]), rect_height_compare); + + for (i=0; i < num_rects; ++i) { + if (rects[i].w == 0 || rects[i].h == 0) { + rects[i].x = rects[i].y = 0; // empty rect needs no space + } else { + stbrp__findresult fr = stbrp__skyline_pack_rectangle(context, rects[i].w, rects[i].h); + if (fr.prev_link) { + rects[i].x = (stbrp_coord) fr.x; + rects[i].y = (stbrp_coord) fr.y; + } else { + rects[i].x = rects[i].y = STBRP__MAXVAL; + } + } + } + + // unsort + STBRP_SORT(rects, num_rects, sizeof(rects[0]), rect_original_order); + + // set was_packed flags and all_rects_packed status + for (i=0; i < num_rects; ++i) { + rects[i].was_packed = !(rects[i].x == STBRP__MAXVAL && rects[i].y == STBRP__MAXVAL); + if (!rects[i].was_packed) + all_rects_packed = 0; + } + + // return the all_rects_packed status + return all_rects_packed; +} +#endif + +/* +------------------------------------------------------------------------------ +This software is available under 2 licenses -- choose whichever you prefer. 
+------------------------------------------------------------------------------ +ALTERNATIVE A - MIT License +Copyright (c) 2017 Sean Barrett +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +------------------------------------------------------------------------------ +ALTERNATIVE B - Public Domain (www.unlicense.org) +This is free and unencumbered software released into the public domain. +Anyone is free to copy, modify, publish, use, compile, sell, or distribute this +software, either in source code form or as a compiled binary, for any purpose, +commercial or non-commercial, and by any means. +In jurisdictions that recognize copyright laws, the author or authors of this +software dedicate any and all copyright interest in the software to the public +domain. We make this dedication for the benefit of the public at large and to +the detriment of our heirs and successors. 
We intend this dedication to be an +overt act of relinquishment in perpetuity of all present and future rights to +this software under copyright law. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +------------------------------------------------------------------------------ +*/ diff --git a/thirdparty/oidn/.gitignore b/thirdparty/oidn/.gitignore deleted file mode 100644 index 6be206fc29..0000000000 --- a/thirdparty/oidn/.gitignore +++ /dev/null @@ -1 +0,0 @@ -weights/rtlightmap_hdr.cpp diff --git a/thirdparty/oidn/LICENSE.txt b/thirdparty/oidn/LICENSE.txt new file mode 100644 index 0000000000..d645695673 --- /dev/null +++ b/thirdparty/oidn/LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/thirdparty/r128/r128.h b/thirdparty/r128/r128.h deleted file mode 100644 index 58933d7638..0000000000 --- a/thirdparty/r128/r128.h +++ /dev/null @@ -1,2124 +0,0 @@ -/* -r128.h: 128-bit (64.64) signed fixed-point arithmetic. Version 1.4.3 - -COMPILATION ------------ -Drop this header file somewhere in your project and include it wherever it is -needed. There is no separate .c file for this library. To get the code, in ONE -file in your project, put: - -#define R128_IMPLEMENTATION - -before you include this file. You may also provide a definition for R128_ASSERT -to force the library to use a custom assert macro. - -COMPILER/LIBRARY SUPPORT ------------------------- -This library requires a C89 compiler with support for 64-bit integers. If your -compiler does not support the long long data type, the R128_U64, etc. macros -must be set appropriately. On x86 and x64 targets, Intel intrinsics are used -for speed. If your compiler does not support these intrinsics, you can add -#define R128_STDC_ONLY -in your implementation file before including r128.h. - -The only C runtime library functionality used by this library is . -This can be avoided by defining an R128_ASSERT macro in your implementation -file. Since this library uses 64-bit arithmetic, this may implicitly add a -runtime library dependency on 32-bit platforms. 
- -C++ SUPPORT ------------ -Operator overloads are supplied for C++ files that include this file. Since all -C++ functions are declared inline (or static inline), the R128_IMPLEMENTATION -file can be either C++ or C. - -LICENSE -------- -This is free and unencumbered software released into the public domain. - -Anyone is free to copy, modify, publish, use, compile, sell, or -distribute this software, either in source code form or as a compiled -binary, for any purpose, commercial or non-commercial, and by any -means. - -In jurisdictions that recognize copyright laws, the author or authors -of this software dedicate any and all copyright interest in the -software to the public domain. We make this dedication for the benefit -of the public at large and to the detriment of our heirs and -successors. We intend this dedication to be an overt act of -relinquishment in perpetuity of all present and future rights to this -software under copyright law. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR -OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -OTHER DEALINGS IN THE SOFTWARE. -*/ - -#ifndef H_R128_H -#define H_R128_H - -#include - -// 64-bit integer support -// If your compiler does not have stdint.h, add appropriate defines for these macros. 
-#if defined(_MSC_VER) && (_MSC_VER < 1600) -# define R128_S32 __int32 -# define R128_U32 unsigned __int32 -# define R128_S64 __int64 -# define R128_U64 unsigned __int64 -# define R128_LIT_S64(x) x##i64 -# define R128_LIT_U64(x) x##ui64 -#else -# include -# define R128_S32 int32_t -# define R128_U32 uint32_t -# define R128_S64 int64_t -# define R128_U64 long long unsigned int -# define R128_LIT_S64(x) x##ll -# define R128_LIT_U64(x) x##ull -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct R128 { - R128_U64 lo; - R128_U64 hi; - -#ifdef __cplusplus - R128(); - R128(double); - R128(int); - R128(R128_S64); - R128(R128_U64 low, R128_U64 high); - - operator double() const; - operator R128_S64() const; - operator int() const; - operator bool() const; - - bool operator!() const; - R128 operator~() const; - R128 operator-() const; - R128 &operator|=(const R128 &rhs); - R128 &operator&=(const R128 &rhs); - R128 &operator^=(const R128 &rhs); - R128 &operator+=(const R128 &rhs); - R128 &operator-=(const R128 &rhs); - R128 &operator*=(const R128 &rhs); - R128 &operator/=(const R128 &rhs); - R128 &operator%=(const R128 &rhs); - R128 &operator<<=(int amount); - R128 &operator>>=(int amount); -#endif //__cplusplus -} R128; - -// Type conversion -extern void r128FromInt(R128 *dst, R128_S64 v); -extern void r128FromFloat(R128 *dst, double v); -extern R128_S64 r128ToInt(const R128 *v); -extern double r128ToFloat(const R128 *v); - -// Copy -extern void r128Copy(R128 *dst, const R128 *src); - -// Negate -extern void r128Neg(R128 *dst, const R128 *src); - -// Bitwise operations -extern void r128Not(R128 *dst, const R128 *src); // ~a -extern void r128Or(R128 *dst, const R128 *a, const R128 *b); // a | b -extern void r128And(R128 *dst, const R128 *a, const R128 *b); // a & b -extern void r128Xor(R128 *dst, const R128 *a, const R128 *b); // a ^ b -extern void r128Shl(R128 *dst, const R128 *src, int amount); // shift left by amount mod 128 -extern void r128Shr(R128 *dst, 
const R128 *src, int amount); // shift right logical by amount mod 128 -extern void r128Sar(R128 *dst, const R128 *src, int amount); // shift right arithmetic by amount mod 128 - -// Arithmetic -extern void r128Add(R128 *dst, const R128 *a, const R128 *b); // a + b -extern void r128Sub(R128 *dst, const R128 *a, const R128 *b); // a - b -extern void r128Mul(R128 *dst, const R128 *a, const R128 *b); // a * b -extern void r128Div(R128 *dst, const R128 *a, const R128 *b); // a / b -extern void r128Mod(R128 *dst, const R128 *a, const R128 *b); // a - toInt(a / b) * b - -extern void r128Sqrt(R128 *dst, const R128 *v); // sqrt(v) -extern void r128Rsqrt(R128 *dst, const R128 *v); // 1 / sqrt(v) - -// Comparison -extern int r128Cmp(const R128 *a, const R128 *b); // sign of a-b -extern void r128Min(R128 *dst, const R128 *a, const R128 *b); -extern void r128Max(R128 *dst, const R128 *a, const R128 *b); -extern void r128Floor(R128 *dst, const R128 *v); -extern void r128Ceil(R128 *dst, const R128 *v); -extern int r128IsNeg(const R128 *v); // quick check for < 0 - -// String conversion -// -typedef enum R128ToStringSign { - R128ToStringSign_Default, // no sign character for positive values - R128ToStringSign_Space, // leading space for positive values - R128ToStringSign_Plus, // leading '+' for positive values -} R128ToStringSign; - -// Formatting options for use with r128ToStringOpt. The "defaults" correspond -// to a format string of "%f". -// -typedef struct R128ToStringFormat { - // sign character for positive values. Default is R128ToStringSign_Default. - R128ToStringSign sign; - - // minimum number of characters to write. Default is 0. - int width; - - // place to the right of the decimal at which rounding is performed. If negative, - // a maximum of 20 decimal places will be written, with no trailing zeroes. - // (20 places is sufficient to ensure that r128FromString will convert back to the - // original value.) Default is -1. 
NOTE: This is not the same default that the C - // standard library uses for %f. - int precision; - - // If non-zero, pads the output string with leading zeroes if the final result is - // fewer than width characters. Otherwise, leading spaces are used. Default is 0. - int zeroPad; - - // Always print a decimal point, even if the value is an integer. Default is 0. - int decimal; - - // Left-align output if width specifier requires padding. - // Default is 0 (right align). - int leftAlign; -} R128ToStringFormat; - -// r128ToStringOpt: convert R128 to a decimal string, with formatting. -// -// dst and dstSize: specify the buffer to write into. At most dstSize bytes will be written -// (including null terminator). No additional rounding is performed if dstSize is not large -// enough to hold the entire string. -// -// opt: an R128ToStringFormat struct (q.v.) with formatting options. -// -// Uses the R128_decimal global as the decimal point character. -// Always writes a null terminator, even if the destination buffer is not large enough. -// -// Number of bytes that will be written (i.e. how big does dst need to be?): -// If width is specified: width + 1 bytes. -// If precision is specified: at most precision + 22 bytes. -// If neither is specified: at most 42 bytes. -// -// Returns the number of bytes that would have been written if dst was sufficiently large, -// not including the final null terminator. -// -extern int r128ToStringOpt(char *dst, size_t dstSize, const R128 *v, const R128ToStringFormat *opt); - -// r128ToStringf: convert R128 to a decimal string, with formatting. -// -// dst and dstSize: specify the buffer to write into. At most dstSize bytes will be written -// (including null terminator). -// -// format: a printf-style format specifier, as one would use with floating point types. -// e.g. "%+5.2f". (The leading % and trailing f are optional.) -// NOTE: This is NOT a full replacement for sprintf. 
Any characters in the format string -// that do not correspond to a format placeholder are ignored. -// -// Uses the R128_decimal global as the decimal point character. -// Always writes a null terminator, even if the destination buffer is not large enough. -// -// Number of bytes that will be written (i.e. how big does dst need to be?): -// If the precision field is specified: at most max(width, precision + 21) + 1 bytes -// Otherwise: at most max(width, 41) + 1 bytes. -// -// Returns the number of bytes that would have been written if dst was sufficiently large, -// not including the final null terminator. -// -extern int r128ToStringf(char *dst, size_t dstSize, const char *format, const R128 *v); - -// r128ToString: convert R128 to a decimal string, with default formatting. -// Equivalent to r128ToStringf(dst, dstSize, "%f", v). -// -// Uses the R128_decimal global as the decimal point character. -// Always writes a null terminator, even if the destination buffer is not large enough. -// -// Will write at most 42 bytes (including NUL) to dst. -// -// Returns the number of bytes that would have been written if dst was sufficiently large, -// not including the final null terminator. -// -extern int r128ToString(char *dst, size_t dstSize, const R128 *v); - -// r128FromString: Convert string to R128. -// -// The string can be formatted either as a decimal number with optional sign -// or as hexadecimal with a prefix of 0x or 0X. -// -// endptr, if not NULL, is set to the character following the last character -// used in the conversion. -// -extern void r128FromString(R128 *dst, const char *s, char **endptr); - -// Constants -extern const R128 R128_min; // minimum (most negative) value -extern const R128 R128_max; // maximum (most positive) value -extern const R128 R128_smallest; // smallest positive value -extern const R128 R128_zero; // zero -extern const R128 R128_one; // 1.0 - -extern char R128_decimal; // decimal point character used by r128From/ToString. 
defaults to '.' - -#ifdef __cplusplus -} - -#include -namespace std { -template<> -struct numeric_limits -{ - static const bool is_specialized = true; - - static R128 min() throw() { return R128_min; } - static R128 max() throw() { return R128_max; } - - static const int digits = 127; - static const int digits10 = 38; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const int radix = 2; - static R128 epsilon() throw() { return R128_smallest; } - static R128 round_error() throw() { return R128_one; } - - static const int min_exponent = 0; - static const int min_exponent10 = 0; - static const int max_exponent = 0; - static const int max_exponent10 = 0; - - static const bool has_infinity = false; - static const bool has_quiet_NaN = false; - static const bool has_signaling_NaN = false; - static const float_denorm_style has_denorm = denorm_absent; - static const bool has_denorm_loss = false; - - static R128 infinity() throw() { return R128_zero; } - static R128 quiet_NaN() throw() { return R128_zero; } - static R128 signaling_NaN() throw() { return R128_zero; } - static R128 denorm_min() throw() { return R128_zero; } - - static const bool is_iec559 = false; - static const bool is_bounded = true; - static const bool is_modulo = true; - - static const bool traps = numeric_limits::traps; - static const bool tinyness_before = false; - static const float_round_style round_style = round_toward_zero; -}; -} //namespace std - -inline R128::R128() {} - -inline R128::R128(double v) -{ - r128FromFloat(this, v); -} - -inline R128::R128(int v) -{ - r128FromInt(this, v); -} - -inline R128::R128(R128_S64 v) -{ - r128FromInt(this, v); -} - -inline R128::R128(R128_U64 low, R128_U64 high) -{ - lo = low; - hi = high; -} - -inline R128::operator double() const -{ - return r128ToFloat(this); -} - -inline R128::operator R128_S64() const -{ - return r128ToInt(this); -} - -inline R128::operator int() const -{ - return 
(int) r128ToInt(this); -} - -inline R128::operator bool() const -{ - return lo || hi; -} - -inline bool R128::operator!() const -{ - return !lo && !hi; -} - -inline R128 R128::operator~() const -{ - R128 r; - r128Not(&r, this); - return r; -} - -inline R128 R128::operator-() const -{ - R128 r; - r128Neg(&r, this); - return r; -} - -inline R128 &R128::operator|=(const R128 &rhs) -{ - r128Or(this, this, &rhs); - return *this; -} - -inline R128 &R128::operator&=(const R128 &rhs) -{ - r128And(this, this, &rhs); - return *this; -} - -inline R128 &R128::operator^=(const R128 &rhs) -{ - r128Xor(this, this, &rhs); - return *this; -} - -inline R128 &R128::operator+=(const R128 &rhs) -{ - r128Add(this, this, &rhs); - return *this; -} - -inline R128 &R128::operator-=(const R128 &rhs) -{ - r128Sub(this, this, &rhs); - return *this; -} - -inline R128 &R128::operator*=(const R128 &rhs) -{ - r128Mul(this, this, &rhs); - return *this; -} - -inline R128 &R128::operator/=(const R128 &rhs) -{ - r128Div(this, this, &rhs); - return *this; -} - -inline R128 &R128::operator%=(const R128 &rhs) -{ - r128Mod(this, this, &rhs); - return *this; -} - -inline R128 &R128::operator<<=(int amount) -{ - r128Shl(this, this, amount); - return *this; -} - -inline R128 &R128::operator>>=(int amount) -{ - r128Sar(this, this, amount); - return *this; -} - -static inline R128 operator|(const R128 &lhs, const R128 &rhs) -{ - R128 r(lhs); - return r |= rhs; -} - -static inline R128 operator&(const R128 &lhs, const R128 &rhs) -{ - R128 r(lhs); - return r &= rhs; -} - -static inline R128 operator^(const R128 &lhs, const R128 &rhs) -{ - R128 r(lhs); - return r ^= rhs; -} - -static inline R128 operator+(const R128 &lhs, const R128 &rhs) -{ - R128 r(lhs); - return r += rhs; -} - -static inline R128 operator-(const R128 &lhs, const R128 &rhs) -{ - R128 r(lhs); - return r -= rhs; -} - -static inline R128 operator*(const R128 &lhs, const R128 &rhs) -{ - R128 r(lhs); - return r *= rhs; -} - -static inline R128 
operator/(const R128 &lhs, const R128 &rhs) -{ - R128 r(lhs); - return r /= rhs; -} - -static inline R128 operator%(const R128 &lhs, const R128 &rhs) -{ - R128 r(lhs); - return r %= rhs; -} - -static inline R128 operator<<(const R128 &lhs, int amount) -{ - R128 r(lhs); - return r <<= amount; -} - -static inline R128 operator>>(const R128 &lhs, int amount) -{ - R128 r(lhs); - return r >>= amount; -} - -static inline bool operator<(const R128 &lhs, const R128 &rhs) -{ - return r128Cmp(&lhs, &rhs) < 0; -} - -static inline bool operator>(const R128 &lhs, const R128 &rhs) -{ - return r128Cmp(&lhs, &rhs) > 0; -} - -static inline bool operator<=(const R128 &lhs, const R128 &rhs) -{ - return r128Cmp(&lhs, &rhs) <= 0; -} - -static inline bool operator>=(const R128 &lhs, const R128 &rhs) -{ - return r128Cmp(&lhs, &rhs) >= 0; -} - -static inline bool operator==(const R128 &lhs, const R128 &rhs) -{ - return lhs.lo == rhs.lo && lhs.hi == rhs.hi; -} - -static inline bool operator!=(const R128 &lhs, const R128 &rhs) -{ - return lhs.lo != rhs.lo || lhs.hi != rhs.hi; -} - -#endif //__cplusplus -#endif //H_R128_H - -#ifdef R128_IMPLEMENTATION - -#ifdef R128_DEBUG_VIS -# define R128_DEBUG_SET(x) r128ToString(R128_last, sizeof(R128_last), x) -#else -# define R128_DEBUG_SET(x) -#endif - -#define R128_SET2(x, l, h) do { (x)->lo = (R128_U64)(l); (x)->hi = (R128_U64)(h); } while(0) -#define R128_R0(x) ((R128_U32)(x)->lo) -#define R128_R2(x) ((R128_U32)(x)->hi) -#if defined(_M_IX86) -// workaround: MSVC x86's handling of 64-bit values is not great -# define R128_SET4(x, r0, r1, r2, r3) do { \ - ((R128_U32*)&(x)->lo)[0] = (R128_U32)(r0); \ - ((R128_U32*)&(x)->lo)[1] = (R128_U32)(r1); \ - ((R128_U32*)&(x)->hi)[0] = (R128_U32)(r2); \ - ((R128_U32*)&(x)->hi)[1] = (R128_U32)(r3); \ - } while(0) -# define R128_R1(x) (((R128_U32*)&(x)->lo)[1]) -# define R128_R3(x) (((R128_U32*)&(x)->hi)[1]) -#else -# define R128_SET4(x, r0, r1, r2, r3) do { (x)->lo = (R128_U64)(r0) | ((R128_U64)(r1) << 32); \ - 
(x)->hi = (R128_U64)(r2) | ((R128_U64)(r3) << 32); } while(0) -# define R128_R1(x) ((R128_U32)((x)->lo >> 32)) -# define R128_R3(x) ((R128_U32)((x)->hi >> 32)) -#endif - -#if defined(_M_X64) -# define R128_INTEL 1 -# define R128_64BIT 1 -# ifndef R128_STDC_ONLY -# include -# endif -#elif defined(__x86_64__) -# define R128_INTEL 1 -# define R128_64BIT 1 -# ifndef R128_STDC_ONLY -# include -# endif -#elif defined(_M_IX86) -# define R128_INTEL 1 -# ifndef R128_STDC_ONLY -# include -# endif -#elif defined(__i386__) -# define R128_INTEL 1 -# ifndef R128_STDC_ONLY -# include -# endif -#elif defined(_M_ARM) -# ifndef R128_STDC_ONLY -# include -# endif -#elif defined(_M_ARM64) -# define R128_64BIT 1 -# ifndef R128_STDC_ONLY -# include -# endif -#elif defined(__aarch64__) -# define R128_64BIT 1 -#endif - -#ifndef R128_INTEL -# define R128_INTEL 0 -#endif - -#ifndef R128_64BIT -# define R128_64BIT 0 -#endif - -#ifndef R128_ASSERT -# include -# define R128_ASSERT(x) assert(x) -#endif - -#include // for NULL - -static const R128ToStringFormat R128__defaultFormat = { - R128ToStringSign_Default, - 0, - -1, - 0, - 0, - 0 -}; - -const R128 R128_min = { 0, R128_LIT_U64(0x8000000000000000) }; -const R128 R128_max = { R128_LIT_U64(0xffffffffffffffff), R128_LIT_U64(0x7fffffffffffffff) }; -const R128 R128_smallest = { 1, 0 }; -const R128 R128_zero = { 0, 0 }; -const R128 R128_one = { 0, 1 }; -char R128_decimal = '.'; -#ifdef R128_DEBUG_VIS -char R128_last[42]; -#endif - -static int r128__clz64(R128_U64 x) -{ -#if defined(R128_STDC_ONLY) - R128_U64 n = 64, y; - y = x >> 32; if (y) { n -= 32; x = y; } - y = x >> 16; if (y) { n -= 16; x = y; } - y = x >> 8; if (y) { n -= 8; x = y; } - y = x >> 4; if (y) { n -= 4; x = y; } - y = x >> 2; if (y) { n -= 2; x = y; } - y = x >> 1; if (y) { n -= 1; x = y; } - return (int)(n - x); -#elif defined(_M_X64) || defined(_M_ARM64) - unsigned long idx; - if (_BitScanReverse64(&idx, x)) { - return 63 - (int)idx; - } else { - return 64; - } -#elif 
defined(_MSC_VER) - unsigned long idx; - if (_BitScanReverse(&idx, (R128_U32)(x >> 32))) { - return 31 - (int)idx; - } else if (_BitScanReverse(&idx, (R128_U32)x)) { - return 63 - (int)idx; - } else { - return 64; - } -#else - return x ? __builtin_clzll(x) : 64; -#endif -} - -#if !R128_64BIT -// 32*32->64 -static R128_U64 r128__umul64(R128_U32 a, R128_U32 b) -{ -# if defined(_M_IX86) && !defined(R128_STDC_ONLY) - return __emulu(a, b); -# elif defined(_M_ARM) && !defined(R128_STDC_ONLY) - return _arm_umull(a, b); -# else - return a * (R128_U64)b; -# endif -} - -// 64/32->32 -static R128_U32 r128__udiv64(R128_U32 nlo, R128_U32 nhi, R128_U32 d, R128_U32 *rem) -{ -# if defined(_M_IX86) && (_MSC_VER >= 1920) && !defined(R128_STDC_ONLY) - unsigned __int64 n = ((unsigned __int64)nhi << 32) | nlo; - return _udiv64(n, d, rem); -# elif defined(_M_IX86) && !defined(R128_STDC_ONLY) - __asm { - mov eax, nlo - mov edx, nhi - div d - mov ecx, rem - mov dword ptr [ecx], edx - } -# elif defined(__i386__) && !defined(R128_STDC_ONLY) - R128_U32 q, r; - __asm("divl %4" - : "=a"(q), "=d"(r) - : "a"(nlo), "d"(nhi), "X"(d)); - *rem = r; - return q; -# else - R128_U64 n64 = ((R128_U64)nhi << 32) | nlo; - *rem = (R128_U32)(n64 % d); - return (R128_U32)(n64 / d); -# endif -} -#elif !defined(_M_X64) || defined(R128_STDC_ONLY) -#define r128__umul64(a, b) ((a) * (R128_U64)(b)) -/*static R128_U32 r128__udiv64(R128_U32 nlo, R128_U32 nhi, R128_U32 d, R128_U32 *rem) -{ - R128_U64 n64 = ((R128_U64)nhi << 32) | nlo; - *rem = (R128_U32)(n64 % d); - return (R128_U32)(n64 / d); -}*/ -#endif //!R128_64BIT - -static void r128__neg(R128 *dst, const R128 *src) -{ - R128_ASSERT(dst != NULL); - R128_ASSERT(src != NULL); - -#if R128_INTEL && !defined(R128_STDC_ONLY) - { - unsigned char carry = 0; -# if R128_64BIT - carry = _addcarry_u64(carry, ~src->lo, 1, &dst->lo); - carry = _addcarry_u64(carry, ~src->hi, 0, &dst->hi); -# else - R128_U32 r0, r1, r2, r3; - carry = _addcarry_u32(carry, ~R128_R0(src), 1, &r0); 
- carry = _addcarry_u32(carry, ~R128_R1(src), 0, &r1); - carry = _addcarry_u32(carry, ~R128_R2(src), 0, &r2); - carry = _addcarry_u32(carry, ~R128_R3(src), 0, &r3); - R128_SET4(dst, r0, r1, r2, r3); -# endif //R128_64BIT - } -#else - if (src->lo) { - dst->lo = ~src->lo + 1; - dst->hi = ~src->hi; - } else { - dst->lo = 0; - dst->hi = ~src->hi + 1; - } -#endif //R128_INTEL -} - -// 64*64->128 -static void r128__umul128(R128 *dst, R128_U64 a, R128_U64 b) -{ -#if defined(_M_X64) && !defined(R128_STDC_ONLY) - dst->lo = _umul128(a, b, &dst->hi); -#elif R128_64BIT && !defined(_MSC_VER) && !defined(R128_STDC_ONLY) - unsigned __int128 p0 = a * (unsigned __int128)b; - dst->hi = (R128_U64)(p0 >> 64); - dst->lo = (R128_U64)p0; -#else - R128_U32 alo = (R128_U32)a; - R128_U32 ahi = (R128_U32)(a >> 32); - R128_U32 blo = (R128_U32)b; - R128_U32 bhi = (R128_U32)(b >> 32); - R128_U64 p0, p1, p2, p3; - - p0 = r128__umul64(alo, blo); - p1 = r128__umul64(alo, bhi); - p2 = r128__umul64(ahi, blo); - p3 = r128__umul64(ahi, bhi); - - { -#if R128_INTEL && !defined(R128_STDC_ONLY) - R128_U32 r0, r1, r2, r3; - unsigned char carry; - - r0 = (R128_U32)(p0); - r1 = (R128_U32)(p0 >> 32); - r2 = (R128_U32)(p1 >> 32); - r3 = (R128_U32)(p3 >> 32); - - carry = _addcarry_u32(0, r1, (R128_U32)p1, &r1); - carry = _addcarry_u32(carry, r2, (R128_U32)(p2 >> 32), &r2); - _addcarry_u32(carry, r3, 0, &r3); - carry = _addcarry_u32(0, r1, (R128_U32)p2, &r1); - carry = _addcarry_u32(carry, r2, (R128_U32)p3, &r2); - _addcarry_u32(carry, r3, 0, &r3); - - R128_SET4(dst, r0, r1, r2, r3); -#else - R128_U64 carry, lo, hi; - carry = ((R128_U64)(R128_U32)p1 + (R128_U64)(R128_U32)p2 + (p0 >> 32)) >> 32; - - lo = p0 + ((p1 + p2) << 32); - hi = p3 + ((R128_U32)(p1 >> 32) + (R128_U32)(p2 >> 32)) + carry; - - R128_SET2(dst, lo, hi); -#endif - } -#endif -} - -// 128/64->64 -#if defined(_M_X64) && (_MSC_VER < 1920) && !defined(R128_STDC_ONLY) -// MSVC x64 provides neither inline assembly nor (pre-2019) a div intrinsic, so we 
do fake -// "inline assembly" to avoid long division or outline assembly. -#pragma code_seg(".text") -__declspec(allocate(".text")) static const unsigned char r128__udiv128Code[] = { - 0x48, 0x8B, 0xC1, //mov rax, rcx - 0x49, 0xF7, 0xF0, //div rax, r8 - 0x49, 0x89, 0x11, //mov qword ptr [r9], rdx - 0xC3 //ret -}; -typedef R128_U64 (*r128__udiv128Proc)(R128_U64 nlo, R128_U64 nhi, R128_U64 d, R128_U64 *rem); -static const r128__udiv128Proc r128__udiv128 = (r128__udiv128Proc)(void*)r128__udiv128Code; -#else -static R128_U64 r128__udiv128(R128_U64 nlo, R128_U64 nhi, R128_U64 d, R128_U64 *rem) -{ -#if defined(_M_X64) && !defined(R128_STDC_ONLY) - return _udiv128(nhi, nlo, d, rem); -#elif defined(__x86_64__) && !defined(R128_STDC_ONLY) - R128_U64 q, r; - __asm("divq %4" - : "=a"(q), "=d"(r) - : "a"(nlo), "d"(nhi), "X"(d)); - *rem = r; - return q; -#else - R128_U64 tmp; - R128_U32 d0, d1; - R128_U32 n3, n2, n1, n0; - R128_U32 q0, q1; - R128_U32 r; - int shift; - - R128_ASSERT(d != 0); //division by zero - R128_ASSERT(nhi < d); //overflow - - // normalize - shift = r128__clz64(d); - - if (shift) { - R128 tmp128; - R128_SET2(&tmp128, nlo, nhi); - r128Shl(&tmp128, &tmp128, shift); - n3 = R128_R3(&tmp128); - n2 = R128_R2(&tmp128); - n1 = R128_R1(&tmp128); - n0 = R128_R0(&tmp128); - d <<= shift; - } else { - n3 = (R128_U32)(nhi >> 32); - n2 = (R128_U32)nhi; - n1 = (R128_U32)(nlo >> 32); - n0 = (R128_U32)nlo; - } - - d1 = (R128_U32)(d >> 32); - d0 = (R128_U32)d; - - // first digit - R128_ASSERT(n3 <= d1); - if (n3 < d1) { - q1 = r128__udiv64(n2, n3, d1, &r); - } else { - q1 = 0xffffffffu; - r = n2 + d1; - } -refine1: - if (r128__umul64(q1, d0) > ((R128_U64)r << 32) + n1) { - --q1; - if (r < ~d1 + 1) { - r += d1; - goto refine1; - } - } - - tmp = ((R128_U64)n2 << 32) + n1 - (r128__umul64(q1, d0) + (r128__umul64(q1, d1) << 32)); - n2 = (R128_U32)(tmp >> 32); - n1 = (R128_U32)tmp; - - // second digit - R128_ASSERT(n2 <= d1); - if (n2 < d1) { - q0 = r128__udiv64(n1, n2, d1, &r); - 
} else { - q0 = 0xffffffffu; - r = n1 + d1; - } -refine0: - if (r128__umul64(q0, d0) > ((R128_U64)r << 32) + n0) { - --q0; - if (r < ~d1 + 1) { - r += d1; - goto refine0; - } - } - - tmp = ((R128_U64)n1 << 32) + n0 - (r128__umul64(q0, d0) + (r128__umul64(q0, d1) << 32)); - n1 = (R128_U32)(tmp >> 32); - n0 = (R128_U32)tmp; - - *rem = (((R128_U64)n1 << 32) + n0) >> shift; - return ((R128_U64)q1 << 32) + q0; -#endif -} -#endif - -static int r128__ucmp(const R128 *a, const R128 *b) -{ - if (a->hi != b->hi) { - if (a->hi > b->hi) { - return 1; - } else { - return -1; - } - } else { - if (a->lo == b->lo) { - return 0; - } else if (a->lo > b->lo) { - return 1; - } else { - return -1; - } - } -} - -static void r128__umul(R128 *dst, const R128 *a, const R128 *b) -{ -#if defined(_M_X64) && !defined(R128_STDC_ONLY) - R128_U64 t0, t1; - R128_U64 lo, hi = 0; - unsigned char carry; - - t0 = _umul128(a->lo, b->lo, &t1); - carry = _addcarry_u64(0, t1, t0 >> 63, &lo); - _addcarry_u64(carry, hi, hi, &hi); - - t0 = _umul128(a->lo, b->hi, &t1); - carry = _addcarry_u64(0, lo, t0, &lo); - _addcarry_u64(carry, hi, t1, &hi); - - t0 = _umul128(a->hi, b->lo, &t1); - carry = _addcarry_u64(0, lo, t0, &lo); - _addcarry_u64(carry, hi, t1, &hi); - - t0 = _umul128(a->hi, b->hi, &t1); - hi += t0; - - R128_SET2(dst, lo, hi); -#elif defined(__x86_64__) && !defined(R128_STDC_ONLY) - unsigned __int128 p0, p1, p2, p3; - p0 = a->lo * (unsigned __int128)b->lo; - p1 = a->lo * (unsigned __int128)b->hi; - p2 = a->hi * (unsigned __int128)b->lo; - p3 = a->hi * (unsigned __int128)b->hi; - - p0 = (p3 << 64) + p2 + p1 + (p0 >> 64) + ((R128_U64)p0 >> 63); - dst->lo = (R128_U64)p0; - dst->hi = (R128_U64)(p0 >> 64); -#else - R128 p0, p1, p2, p3, round; - - r128__umul128(&p0, a->lo, b->lo); - round.hi = 0; round.lo = p0.lo >> 63; - p0.lo = p0.hi; p0.hi = 0; //r128Shr(&p0, &p0, 64); - r128Add(&p0, &p0, &round); - - r128__umul128(&p1, a->hi, b->lo); - r128Add(&p0, &p0, &p1); - - r128__umul128(&p2, a->lo, b->hi); - 
r128Add(&p0, &p0, &p2); - - r128__umul128(&p3, a->hi, b->hi); - p3.hi = p3.lo; p3.lo = 0; //r128Shl(&p3, &p3, 64); - r128Add(&p0, &p0, &p3); - - R128_SET2(dst, p0.lo, p0.hi); -#endif -} - -// Shift d left until the high bit is set, and shift n left by the same amount. -// returns non-zero on overflow. -static int r128__norm(R128 *n, R128 *d, R128_U64 *n2) -{ - R128_U64 d0, d1; - R128_U64 n0, n1; - int shift; - - d1 = d->hi; - d0 = d->lo; - n1 = n->hi; - n0 = n->lo; - - if (d1) { - shift = r128__clz64(d1); - if (shift) { - d1 = (d1 << shift) | (d0 >> (64 - shift)); - d0 = d0 << shift; - *n2 = n1 >> (64 - shift); - n1 = (n1 << shift) | (n0 >> (64 - shift)); - n0 = n0 << shift; - } else { - *n2 = 0; - } - } else { - shift = r128__clz64(d0); - if (r128__clz64(n1) <= shift) { - return 1; // overflow - } - - if (shift) { - d1 = d0 << shift; - d0 = 0; - *n2 = (n1 << shift) | (n0 >> (64 - shift)); - n1 = n0 << shift; - n0 = 0; - } else { - d1 = d0; - d0 = 0; - *n2 = n1; - n1 = n0; - n0 = 0; - } - } - - R128_SET2(n, n0, n1); - R128_SET2(d, d0, d1); - return 0; -} - -static void r128__udiv(R128 *quotient, const R128 *dividend, const R128 *divisor) -{ - R128 tmp; - R128_U64 d0, d1; - R128_U64 n1, n2, n3; - R128 q; - - R128_ASSERT(dividend != NULL); - R128_ASSERT(divisor != NULL); - R128_ASSERT(quotient != NULL); - R128_ASSERT(divisor->hi != 0 || divisor->lo != 0); // divide by zero - - // scale dividend and normalize - { - R128 n, d; - R128_SET2(&n, dividend->lo, dividend->hi); - R128_SET2(&d, divisor->lo, divisor->hi); - if (r128__norm(&n, &d, &n3)) { - R128_SET2(quotient, R128_max.lo, R128_max.hi); - return; - } - - d1 = d.hi; - d0 = d.lo; - n2 = n.hi; - n1 = n.lo; - } - - // first digit - R128_ASSERT(n3 <= d1); - { - R128 t0, t1; - t0.lo = n1; - if (n3 < d1) { - q.hi = r128__udiv128(n2, n3, d1, &t0.hi); - } else { - q.hi = R128_LIT_U64(0xffffffffffffffff); - t0.hi = n2 + d1; - } - -refine1: - r128__umul128(&t1, q.hi, d0); - if (r128__ucmp(&t1, &t0) > 0) { - --q.hi; - if 
(t0.hi < ~d1 + 1) { - t0.hi += d1; - goto refine1; - } - } - } - - { - R128 t0, t1, t2; - t0.hi = n2; - t0.lo = n1; - - r128__umul128(&t1, q.hi, d0); - r128__umul128(&t2, q.hi, d1); - - t2.hi = t2.lo; t2.lo = 0; //r128Shl(&t2, &t2, 64); - r128Add(&tmp, &t1, &t2); - r128Sub(&tmp, &t0, &tmp); - } - n2 = tmp.hi; - n1 = tmp.lo; - - // second digit - R128_ASSERT(n2 <= d1); - { - R128 t0, t1; - t0.lo = 0; - if (n2 < d1) { - q.lo = r128__udiv128(n1, n2, d1, &t0.hi); - } else { - q.lo = R128_LIT_U64(0xffffffffffffffff); - t0.hi = n1 + d1; - } - - refine0: - r128__umul128(&t1, q.lo, d0); - if (r128__ucmp(&t1, &t0) > 0) { - --q.lo; - if (t0.hi < ~d1 + 1) { - t0.hi += d1; - goto refine0; - } - } - } - - R128_SET2(quotient, q.lo, q.hi); -} - -static R128_U64 r128__umod(R128 *n, R128 *d) -{ - R128_U64 d0, d1; - R128_U64 n3, n2, n1; - R128_U64 q; - - R128_ASSERT(d != NULL); - R128_ASSERT(n != NULL); - R128_ASSERT(d->hi != 0 || d->lo != 0); // divide by zero - - if (r128__norm(n, d, &n3)) { - return R128_LIT_U64(0xffffffffffffffff); - } - - d1 = d->hi; - d0 = d->lo; - n2 = n->hi; - n1 = n->lo; - - R128_ASSERT(n3 < d1); - { - R128 t0, t1; - t0.lo = n1; - q = r128__udiv128(n2, n3, d1, &t0.hi); - - refine1: - r128__umul128(&t1, q, d0); - if (r128__ucmp(&t1, &t0) > 0) { - --q; - if (t0.hi < ~d1 + 1) { - t0.hi += d1; - goto refine1; - } - } - } - - return q; -} - -static int r128__format(char *dst, size_t dstSize, const R128 *v, const R128ToStringFormat *format) -{ - char buf[128]; - R128 tmp; - R128_U64 whole; - char *cursor, *decimal, *dstp = dst; - int sign = 0; - int fullPrecision = 1; - int width, precision; - int padCnt, trail = 0; - - R128_ASSERT(dst != NULL && dstSize > 0); - R128_ASSERT(v != NULL); - R128_ASSERT(format != NULL); - - --dstSize; - - R128_SET2(&tmp, v->lo, v->hi); - if (r128IsNeg(&tmp)) { - r128__neg(&tmp, &tmp); - sign = 1; - } - - width = format->width; - if (width < 0) { - width = 0; - } - - precision = format->precision; - if (precision < 0) { - // print a 
maximum of 20 digits - fullPrecision = 0; - precision = 20; - } else if (precision > (int)sizeof(buf) - 21) { - trail = precision - (sizeof(buf) - 21); - precision -= trail; - } - - whole = tmp.hi; - decimal = cursor = buf; - - // fractional part first in case a carry into the whole part is required - if (tmp.lo || format->decimal) { - while (tmp.lo || (fullPrecision && precision)) { - if ((int)(cursor - buf) == precision) { - if ((R128_S64)tmp.lo < 0) { - // round up, propagate carry backwards - char *c; - for (c = cursor - 1; c >= buf; --c) { - char d = ++*c; - if (d <= '9') { - goto endfrac; - } else { - *c = '0'; - } - } - - // carry out into the whole part - whole++; - } - - break; - } - - r128__umul128(&tmp, tmp.lo, 10); - *cursor++ = (char)tmp.hi + '0'; - } - - endfrac: - if (format->decimal || precision) { - decimal = cursor; - *cursor++ = R128_decimal; - } - } - - // whole part - do { - char digit = (char)(whole % 10); - whole /= 10; - *cursor++ = digit + '0'; - } while (whole); - -#define R128__WRITE(c) do { if (dstp < dst + dstSize) *dstp = c; ++dstp; } while(0) - - padCnt = width - (int)(cursor - buf) - 1; - - // left padding - if (!format->leftAlign) { - char padChar = format->zeroPad ? 
'0' : ' '; - if (format->zeroPad) { - if (sign) { - R128__WRITE('-'); - } else if (format->sign == R128ToStringSign_Plus) { - R128__WRITE('+'); - } else if (format->sign == R128ToStringSign_Space) { - R128__WRITE(' '); - } else { - ++padCnt; - } - } - - for (; padCnt > 0; --padCnt) { - R128__WRITE(padChar); - } - } - - if (format->leftAlign || !format->zeroPad) { - if (sign) { - R128__WRITE('-'); - } else if (format->sign == R128ToStringSign_Plus) { - R128__WRITE('+'); - } else if (format->sign == R128ToStringSign_Space) { - R128__WRITE(' '); - } else { - ++padCnt; - } - } - - { - char *i; - - // reverse the whole part - for (i = cursor - 1; i >= decimal; --i) { - R128__WRITE(*i); - } - - // copy the fractional part - for (i = buf; i < decimal; ++i) { - R128__WRITE(*i); - } - } - - // right padding - if (format->leftAlign) { - char padChar = format->zeroPad ? '0' : ' '; - for (; padCnt > 0; --padCnt) { - R128__WRITE(padChar); - } - } - - // trailing zeroes for very large precision - while (trail--) { - R128__WRITE('0'); - } - -#undef R128__WRITE - - if (dstp <= dst + dstSize) { - *dstp = '\0'; - } else { - dst[dstSize] = '\0'; - } - return (int)(dstp - dst); -} - -void r128FromInt(R128 *dst, R128_S64 v) -{ - R128_ASSERT(dst != NULL); - dst->lo = 0; - dst->hi = (R128_U64)v; - R128_DEBUG_SET(dst); -} - -void r128FromFloat(R128 *dst, double v) -{ - R128_ASSERT(dst != NULL); - - if (v < -9223372036854775808.0) { - r128Copy(dst, &R128_min); - } else if (v >= 9223372036854775808.0) { - r128Copy(dst, &R128_max); - } else { - R128 r; - int sign = 0; - - if (v < 0) { - v = -v; - sign = 1; - } - - r.hi = (R128_U64)(R128_S64)v; - v -= (R128_S64)v; - r.lo = (R128_U64)(v * 18446744073709551616.0); - - if (sign) { - r128__neg(&r, &r); - } - - r128Copy(dst, &r); - } -} - -void r128FromString(R128 *dst, const char *s, char **endptr) -{ - R128_U64 lo = 0, hi = 0; - R128_U64 base = 10; - - int sign = 0; - - R128_ASSERT(dst != NULL); - R128_ASSERT(s != NULL); - - R128_SET2(dst, 0, 
0); - - // consume whitespace - for (;;) { - if (*s == ' ' || *s == '\t' || *s == '\r' || *s == '\n' || *s == '\v') { - ++s; - } else { - break; - } - } - - // sign - if (*s == '-') { - sign = 1; - ++s; - } else if (*s == '+') { - ++s; - } - - // parse base prefix - if (s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) { - base = 16; - s += 2; - } - - // whole part - for (;; ++s) { - R128_U64 digit; - - if ('0' <= *s && *s <= '9') { - digit = *s - '0'; - } else if (base == 16 && 'a' <= *s && *s <= 'f') { - digit = *s - 'a' + 10; - } else if (base == 16 && 'A' <= *s && *s <= 'F') { - digit = *s - 'A' + 10; - } else { - break; - } - - hi = hi * base + digit; - } - - // fractional part - if (*s == R128_decimal) { - const char *exp = ++s; - - // find the last digit and work backwards - for (;; ++s) { - if ('0' <= *s && *s <= '9') { - } else if (base == 16 && ('a' <= *s && *s <= 'f')) { - } else if (base == 16 && ('A' <= *s && *s <= 'F')) { - } else { - break; - } - } - - for (--s; s >= exp; --s) { - R128_U64 digit, unused; - - if ('0' <= *s && *s <= '9') { - digit = *s - '0'; - } else if ('a' <= *s && *s <= 'f') { - digit = *s - 'a' + 10; - } else { - digit = *s - 'A' + 10; - } - - lo = r128__udiv128(lo, digit, base, &unused); - } - } - - R128_SET2(dst, lo, hi); - if (sign) { - r128__neg(dst, dst); - } - - if (endptr) { - *endptr = (char *) s; - } -} - -R128_S64 r128ToInt(const R128 *v) -{ - R128_ASSERT(v != NULL); - return (R128_S64)v->hi; -} - -double r128ToFloat(const R128 *v) -{ - R128 tmp; - int sign = 0; - double d; - - R128_ASSERT(v != NULL); - - R128_SET2(&tmp, v->lo, v->hi); - if (r128IsNeg(&tmp)) { - r128__neg(&tmp, &tmp); - sign = 1; - } - - d = tmp.hi + tmp.lo * (1 / 18446744073709551616.0); - if (sign) { - d = -d; - } - - return d; -} - -int r128ToStringOpt(char *dst, size_t dstSize, const R128 *v, const R128ToStringFormat *opt) -{ - return r128__format(dst, dstSize, v, opt); -} - -int r128ToStringf(char *dst, size_t dstSize, const char *format, const R128 *v) 
-{ - R128ToStringFormat opts; - - R128_ASSERT(dst != NULL && dstSize); - R128_ASSERT(format != NULL); - R128_ASSERT(v != NULL); - - opts.sign = R128__defaultFormat.sign; - opts.precision = R128__defaultFormat.precision; - opts.zeroPad = R128__defaultFormat.zeroPad; - opts.decimal = R128__defaultFormat.decimal; - opts.leftAlign = R128__defaultFormat.leftAlign; - - if (*format == '%') { - ++format; - } - - // flags field - for (;; ++format) { - if (*format == ' ' && opts.sign != R128ToStringSign_Plus) { - opts.sign = R128ToStringSign_Space; - } else if (*format == '+') { - opts.sign = R128ToStringSign_Plus; - } else if (*format == '0') { - opts.zeroPad = 1; - } else if (*format == '-') { - opts.leftAlign = 1; - } else if (*format == '#') { - opts.decimal = 1; - } else { - break; - } - } - - // width field - opts.width = 0; - for (;;) { - if ('0' <= *format && *format <= '9') { - opts.width = opts.width * 10 + *format++ - '0'; - } else { - break; - } - } - - // precision field - if (*format == '.') { - opts.precision = 0; - ++format; - for (;;) { - if ('0' <= *format && *format <= '9') { - opts.precision = opts.precision * 10 + *format++ - '0'; - } else { - break; - } - } - } - - return r128__format(dst, dstSize, v, &opts); -} - -int r128ToString(char *dst, size_t dstSize, const R128 *v) -{ - return r128__format(dst, dstSize, v, &R128__defaultFormat); -} - -void r128Copy(R128 *dst, const R128 *src) -{ - R128_ASSERT(dst != NULL); - R128_ASSERT(src != NULL); - dst->lo = src->lo; - dst->hi = src->hi; - R128_DEBUG_SET(dst); -} - -void r128Neg(R128 *dst, const R128 *src) -{ - r128__neg(dst, src); - R128_DEBUG_SET(dst); -} - -void r128Not(R128 *dst, const R128 *src) -{ - R128_ASSERT(dst != NULL); - R128_ASSERT(src != NULL); - - dst->lo = ~src->lo; - dst->hi = ~src->hi; - R128_DEBUG_SET(dst); -} - -void r128Or(R128 *dst, const R128 *a, const R128 *b) -{ - R128_ASSERT(dst != NULL); - R128_ASSERT(a != NULL); - R128_ASSERT(b != NULL); - - dst->lo = a->lo | b->lo; - dst->hi = 
a->hi | b->hi; - R128_DEBUG_SET(dst); -} - -void r128And(R128 *dst, const R128 *a, const R128 *b) -{ - R128_ASSERT(dst != NULL); - R128_ASSERT(a != NULL); - R128_ASSERT(b != NULL); - - dst->lo = a->lo & b->lo; - dst->hi = a->hi & b->hi; - R128_DEBUG_SET(dst); -} - -void r128Xor(R128 *dst, const R128 *a, const R128 *b) -{ - R128_ASSERT(dst != NULL); - R128_ASSERT(a != NULL); - R128_ASSERT(b != NULL); - - dst->lo = a->lo ^ b->lo; - dst->hi = a->hi ^ b->hi; - R128_DEBUG_SET(dst); -} - -void r128Shl(R128 *dst, const R128 *src, int amount) -{ - R128_U64 r[4]; - - R128_ASSERT(dst != NULL); - R128_ASSERT(src != NULL); - -#if defined(_M_IX86) && !defined(R128_STDC_ONLY) - __asm { - // load src - mov edx, dword ptr[src] - mov ecx, amount - - mov edi, dword ptr[edx] - mov esi, dword ptr[edx + 4] - mov ebx, dword ptr[edx + 8] - mov eax, dword ptr[edx + 12] - - // shift mod 32 - shld eax, ebx, cl - shld ebx, esi, cl - shld esi, edi, cl - shl edi, cl - - // clear out low 12 bytes of stack - xor edx, edx - mov dword ptr[r], edx - mov dword ptr[r + 4], edx - mov dword ptr[r + 8], edx - - // store shifted amount offset by count/32 bits - shr ecx, 5 - and ecx, 3 - mov dword ptr[r + ecx * 4 + 0], edi - mov dword ptr[r + ecx * 4 + 4], esi - mov dword ptr[r + ecx * 4 + 8], ebx - mov dword ptr[r + ecx * 4 + 12], eax - } -#else - - r[0] = src->lo; - r[1] = src->hi; - - amount &= 127; - if (amount >= 64) { - r[1] = r[0] << (amount - 64); - r[0] = 0; - } else if (amount) { -# ifdef _M_X64 - r[1] = __shiftleft128(r[0], r[1], (char) amount); -# else - r[1] = (r[1] << amount) | (r[0] >> (64 - amount)); -# endif - r[0] = r[0] << amount; - } -#endif //_M_IX86 - - dst->lo = r[0]; - dst->hi = r[1]; - R128_DEBUG_SET(dst); -} - -void r128Shr(R128 *dst, const R128 *src, int amount) -{ - R128_U64 r[4]; - - R128_ASSERT(dst != NULL); - R128_ASSERT(src != NULL); - -#if defined(_M_IX86) && !defined(R128_STDC_ONLY) - __asm { - // load src - mov edx, dword ptr[src] - mov ecx, amount - - mov edi, dword 
ptr[edx] - mov esi, dword ptr[edx + 4] - mov ebx, dword ptr[edx + 8] - mov eax, dword ptr[edx + 12] - - // shift mod 32 - shrd edi, esi, cl - shrd esi, ebx, cl - shrd ebx, eax, cl - shr eax, cl - - // clear out high 12 bytes of stack - xor edx, edx - mov dword ptr[r + 20], edx - mov dword ptr[r + 24], edx - mov dword ptr[r + 28], edx - - // store shifted amount offset by -count/32 bits - shr ecx, 5 - and ecx, 3 - neg ecx - mov dword ptr[r + ecx * 4 + 16], edi - mov dword ptr[r + ecx * 4 + 20], esi - mov dword ptr[r + ecx * 4 + 24], ebx - mov dword ptr[r + ecx * 4 + 28], eax - } -#else - r[2] = src->lo; - r[3] = src->hi; - - amount &= 127; - if (amount >= 64) { - r[2] = r[3] >> (amount - 64); - r[3] = 0; - } else if (amount) { -#ifdef _M_X64 - r[2] = __shiftright128(r[2], r[3], (char) amount); -#else - r[2] = (r[2] >> amount) | (r[3] << (64 - amount)); -#endif - r[3] = r[3] >> amount; - } -#endif - - dst->lo = r[2]; - dst->hi = r[3]; - R128_DEBUG_SET(dst); -} - -void r128Sar(R128 *dst, const R128 *src, int amount) -{ - R128_U64 r[4]; - - R128_ASSERT(dst != NULL); - R128_ASSERT(src != NULL); - -#if defined(_M_IX86) && !defined(R128_STDC_ONLY) - __asm { - // load src - mov edx, dword ptr[src] - mov ecx, amount - - mov edi, dword ptr[edx] - mov esi, dword ptr[edx + 4] - mov ebx, dword ptr[edx + 8] - mov eax, dword ptr[edx + 12] - - // shift mod 32 - shrd edi, esi, cl - shrd esi, ebx, cl - shrd ebx, eax, cl - sar eax, cl - - // copy sign to high 12 bytes of stack - cdq - mov dword ptr[r + 20], edx - mov dword ptr[r + 24], edx - mov dword ptr[r + 28], edx - - // store shifted amount offset by -count/32 bits - shr ecx, 5 - and ecx, 3 - neg ecx - mov dword ptr[r + ecx * 4 + 16], edi - mov dword ptr[r + ecx * 4 + 20], esi - mov dword ptr[r + ecx * 4 + 24], ebx - mov dword ptr[r + ecx * 4 + 28], eax - } -#else - r[2] = src->lo; - r[3] = src->hi; - - amount &= 127; - if (amount >= 64) { - r[2] = (R128_U64)((R128_S64)r[3] >> (amount - 64)); - r[3] = (R128_U64)((R128_S64)r[3] 
>> 63); - } else if (amount) { - r[2] = (r[2] >> amount) | (R128_U64)((R128_S64)r[3] << (64 - amount)); - r[3] = (R128_U64)((R128_S64)r[3] >> amount); - } -#endif - - dst->lo = r[2]; - dst->hi = r[3]; - R128_DEBUG_SET(dst); -} - -void r128Add(R128 *dst, const R128 *a, const R128 *b) -{ - unsigned char carry = 0; - R128_ASSERT(dst != NULL); - R128_ASSERT(a != NULL); - R128_ASSERT(b != NULL); - -#if R128_INTEL && !defined(R128_STDC_ONLY) -# if R128_64BIT - carry = _addcarry_u64(carry, a->lo, b->lo, &dst->lo); - carry = _addcarry_u64(carry, a->hi, b->hi, &dst->hi); -# else - R128_U32 r0, r1, r2, r3; - carry = _addcarry_u32(carry, R128_R0(a), R128_R0(b), &r0); - carry = _addcarry_u32(carry, R128_R1(a), R128_R1(b), &r1); - carry = _addcarry_u32(carry, R128_R2(a), R128_R2(b), &r2); - carry = _addcarry_u32(carry, R128_R3(a), R128_R3(b), &r3); - R128_SET4(dst, r0, r1, r2, r3); -# endif //R128_64BIT -#else - { - R128_U64 r = a->lo + b->lo; - carry = r < a->lo; - dst->lo = r; - dst->hi = a->hi + b->hi + carry; - } -#endif //R128_INTEL - - R128_DEBUG_SET(dst); -} - -void r128Sub(R128 *dst, const R128 *a, const R128 *b) -{ - unsigned char borrow = 0; - R128_ASSERT(dst != NULL); - R128_ASSERT(a != NULL); - R128_ASSERT(b != NULL); - -#if R128_INTEL && !defined(R128_STDC_ONLY) -# if R128_64BIT - borrow = _subborrow_u64(borrow, a->lo, b->lo, &dst->lo); - borrow = _subborrow_u64(borrow, a->hi, b->hi, &dst->hi); -# else - R128_U32 r0, r1, r2, r3; - borrow = _subborrow_u32(borrow, R128_R0(a), R128_R0(b), &r0); - borrow = _subborrow_u32(borrow, R128_R1(a), R128_R1(b), &r1); - borrow = _subborrow_u32(borrow, R128_R2(a), R128_R2(b), &r2); - borrow = _subborrow_u32(borrow, R128_R3(a), R128_R3(b), &r3); - R128_SET4(dst, r0, r1, r2, r3); -# endif //R128_64BIT -#else - { - R128_U64 r = a->lo - b->lo; - borrow = r > a->lo; - dst->lo = r; - dst->hi = a->hi - b->hi - borrow; - } -#endif //R128_INTEL - - R128_DEBUG_SET(dst); -} - -void r128Mul(R128 *dst, const R128 *a, const R128 *b) -{ - int 
sign = 0; - R128 ta, tb, tc; - - R128_ASSERT(dst != NULL); - R128_ASSERT(a != NULL); - R128_ASSERT(b != NULL); - - R128_SET2(&ta, a->lo, a->hi); - R128_SET2(&tb, b->lo, b->hi); - - if (r128IsNeg(&ta)) { - r128__neg(&ta, &ta); - sign = !sign; - } - if (r128IsNeg(&tb)) { - r128__neg(&tb, &tb); - sign = !sign; - } - - r128__umul(&tc, &ta, &tb); - if (sign) { - r128__neg(&tc, &tc); - } - - r128Copy(dst, &tc); -} - -void r128Div(R128 *dst, const R128 *a, const R128 *b) -{ - int sign = 0; - R128 tn, td, tq; - - R128_ASSERT(dst != NULL); - R128_ASSERT(a != NULL); - R128_ASSERT(b != NULL); - - R128_SET2(&tn, a->lo, a->hi); - R128_SET2(&td, b->lo, b->hi); - - if (r128IsNeg(&tn)) { - r128__neg(&tn, &tn); - sign = !sign; - } - - if (td.lo == 0 && td.hi == 0) { - // divide by zero - if (sign) { - r128Copy(dst, &R128_min); - } else { - r128Copy(dst, &R128_max); - } - return; - } else if (r128IsNeg(&td)) { - r128__neg(&td, &td); - sign = !sign; - } - - r128__udiv(&tq, &tn, &td); - - if (sign) { - r128__neg(&tq, &tq); - } - - r128Copy(dst, &tq); -} - -void r128Mod(R128 *dst, const R128 *a, const R128 *b) -{ - int sign = 0; - R128 tn, td, tq; - - R128_ASSERT(dst != NULL); - R128_ASSERT(a != NULL); - R128_ASSERT(b != NULL); - - R128_SET2(&tn, a->lo, a->hi); - R128_SET2(&td, b->lo, b->hi); - - if (r128IsNeg(&tn)) { - r128__neg(&tn, &tn); - sign = !sign; - } - - if (td.lo == 0 && td.hi == 0) { - // divide by zero - if (sign) { - r128Copy(dst, &R128_min); - } else { - r128Copy(dst, &R128_max); - } - return; - } else if (r128IsNeg(&td)) { - r128__neg(&td, &td); - sign = !sign; - } - - tq.hi = r128__umod(&tn, &td); - tq.lo = 0; - - if (sign) { - tq.hi = ~tq.hi + 1; - } - - r128Mul(&tq, &tq, b); - r128Sub(dst, a, &tq); -} - -void r128Rsqrt(R128 *dst, const R128 *v) -{ - static const R128 threeHalves = { R128_LIT_U64(0x8000000000000000), 1 }; - R128 x, est; - int i; - - if ((R128_S64)v->hi < 0) { - r128Copy(dst, &R128_min); - return; - } - - R128_SET2(&x, v->lo, v->hi); - - // get initial 
estimate - if (x.hi) { - int shift = (64 + r128__clz64(x.hi)) >> 1; - est.lo = R128_LIT_U64(1) << shift; - est.hi = 0; - } else if (x.lo) { - int shift = r128__clz64(x.lo) >> 1; - est.hi = R128_LIT_U64(1) << shift; - est.lo = 0; - } else { - R128_SET2(dst, 0, 0); - return; - } - - // x /= 2 - r128Shr(&x, &x, 1); - - // Newton-Raphson iterate - for (i = 0; i < 7; ++i) { - R128 newEst; - - // newEst = est * (threeHalves - (x / 2) * est * est); - r128__umul(&newEst, &est, &est); - r128__umul(&newEst, &newEst, &x); - r128Sub(&newEst, &threeHalves, &newEst); - r128__umul(&newEst, &est, &newEst); - - if (newEst.lo == est.lo && newEst.hi == est.hi) { - break; - } - R128_SET2(&est, newEst.lo, newEst.hi); - } - - r128Copy(dst, &est); -} - -void r128Sqrt(R128 *dst, const R128 *v) -{ - R128 x, est; - int i; - - if ((R128_S64)v->hi < 0) { - r128Copy(dst, &R128_min); - return; - } - - R128_SET2(&x, v->lo, v->hi); - - // get initial estimate - if (x.hi) { - int shift = (63 - r128__clz64(x.hi)) >> 1; - r128Shr(&est, &x, shift); - } else if (x.lo) { - int shift = (1 + r128__clz64(x.lo)) >> 1; - r128Shl(&est, &x, shift); - } else { - R128_SET2(dst, 0, 0); - return; - } - - // Newton-Raphson iterate - for (i = 0; i < 7; ++i) { - R128 newEst; - - // newEst = (est + x / est) / 2 - r128__udiv(&newEst, &x, &est); - r128Add(&newEst, &newEst, &est); - r128Shr(&newEst, &newEst, 1); - - if (newEst.lo == est.lo && newEst.hi == est.hi) { - break; - } - R128_SET2(&est, newEst.lo, newEst.hi); - } - - r128Copy(dst, &est); -} - -int r128Cmp(const R128 *a, const R128 *b) -{ - R128_ASSERT(a != NULL); - R128_ASSERT(b != NULL); - - if (a->hi == b->hi) { - if (a->lo == b->lo) { - return 0; - } else if (a->lo > b->lo) { - return 1; - } else { - return -1; - } - } else if ((R128_S64)a->hi > (R128_S64)b->hi) { - return 1; - } else { - return -1; - } -} - -int r128IsNeg(const R128 *v) -{ - R128_ASSERT(v != NULL); - - return (R128_S64)v->hi < 0; -} - -void r128Min(R128 *dst, const R128 *a, const R128 *b) 
-{ - R128_ASSERT(dst != NULL); - R128_ASSERT(a != NULL); - R128_ASSERT(b != NULL); - - if (r128Cmp(a, b) < 0) { - r128Copy(dst, a); - } else { - r128Copy(dst, b); - } -} - -void r128Max(R128 *dst, const R128 *a, const R128 *b) -{ - R128_ASSERT(dst != NULL); - R128_ASSERT(a != NULL); - R128_ASSERT(b != NULL); - - if (r128Cmp(a, b) > 0) { - r128Copy(dst, a); - } else { - r128Copy(dst, b); - } -} - -void r128Floor(R128 *dst, const R128 *v) -{ - R128_ASSERT(dst != NULL); - R128_ASSERT(v != NULL); - - if ((R128_S64)v->hi < 0) { - dst->hi = v->hi - (v->lo != 0); - } else { - dst->hi = v->hi; - } - dst->lo = 0; - R128_DEBUG_SET(dst); -} - -void r128Ceil(R128 *dst, const R128 *v) -{ - R128_ASSERT(dst != NULL); - R128_ASSERT(v != NULL); - - if ((R128_S64)v->hi > 0) { - dst->hi = v->hi + (v->lo != 0); - } else { - dst->hi = v->hi; - } - dst->lo = 0; - R128_DEBUG_SET(dst); -} - -#endif //R128_IMPLEMENTATION - diff --git a/thirdparty/stb_rect_pack/stb_rect_pack.h b/thirdparty/stb_rect_pack/stb_rect_pack.h deleted file mode 100644 index 3336fe7395..0000000000 --- a/thirdparty/stb_rect_pack/stb_rect_pack.h +++ /dev/null @@ -1,629 +0,0 @@ -// stb_rect_pack.h - v1.00 - public domain - rectangle packing -// Sean Barrett 2014 -// -// Useful for e.g. packing rectangular textures into an atlas. -// Does not do rotation. -// -// Not necessarily the awesomest packing method, but better than -// the totally naive one in stb_truetype (which is primarily what -// this is meant to replace). -// -// Has only had a few tests run, may have issues. -// -// More docs to come. -// -// No memory allocations; uses qsort() and assert() from stdlib. -// Can override those by defining STBRP_SORT and STBRP_ASSERT. -// -// This library currently uses the Skyline Bottom-Left algorithm. -// -// Please note: better rectangle packers are welcome! Please -// implement them to the same API, but with a different init -// function. 
-// -// Credits -// -// Library -// Sean Barrett -// Minor features -// Martins Mozeiko -// github:IntellectualKitty -// -// Bugfixes / warning fixes -// Jeremy Jaussaud -// Fabian Giesen -// -// Version history: -// -// 1.00 (2019-02-25) avoid small space waste; gracefully fail too-wide rectangles -// 0.99 (2019-02-07) warning fixes -// 0.11 (2017-03-03) return packing success/fail result -// 0.10 (2016-10-25) remove cast-away-const to avoid warnings -// 0.09 (2016-08-27) fix compiler warnings -// 0.08 (2015-09-13) really fix bug with empty rects (w=0 or h=0) -// 0.07 (2015-09-13) fix bug with empty rects (w=0 or h=0) -// 0.06 (2015-04-15) added STBRP_SORT to allow replacing qsort -// 0.05: added STBRP_ASSERT to allow replacing assert -// 0.04: fixed minor bug in STBRP_LARGE_RECTS support -// 0.01: initial release -// -// LICENSE -// -// See end of file for license information. - -////////////////////////////////////////////////////////////////////////////// -// -// INCLUDE SECTION -// - -#ifndef STB_INCLUDE_STB_RECT_PACK_H -#define STB_INCLUDE_STB_RECT_PACK_H - -#define STB_RECT_PACK_VERSION 1 - -#ifdef STBRP_STATIC -#define STBRP_DEF static -#else -#define STBRP_DEF extern -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct stbrp_context stbrp_context; -typedef struct stbrp_node stbrp_node; -typedef struct stbrp_rect stbrp_rect; - -#ifdef STBRP_LARGE_RECTS -typedef int stbrp_coord; -#else -typedef unsigned short stbrp_coord; -#endif - -STBRP_DEF int stbrp_pack_rects (stbrp_context *context, stbrp_rect *rects, int num_rects); -// Assign packed locations to rectangles. The rectangles are of type -// 'stbrp_rect' defined below, stored in the array 'rects', and there -// are 'num_rects' many of them. -// -// Rectangles which are successfully packed have the 'was_packed' flag -// set to a non-zero value and 'x' and 'y' store the minimum location -// on each axis (i.e. 
bottom-left in cartesian coordinates, top-left -// if you imagine y increasing downwards). Rectangles which do not fit -// have the 'was_packed' flag set to 0. -// -// You should not try to access the 'rects' array from another thread -// while this function is running, as the function temporarily reorders -// the array while it executes. -// -// To pack into another rectangle, you need to call stbrp_init_target -// again. To continue packing into the same rectangle, you can call -// this function again. Calling this multiple times with multiple rect -// arrays will probably produce worse packing results than calling it -// a single time with the full rectangle array, but the option is -// available. -// -// The function returns 1 if all of the rectangles were successfully -// packed and 0 otherwise. - -struct stbrp_rect -{ - // reserved for your use: - int id; - - // input: - stbrp_coord w, h; - - // output: - stbrp_coord x, y; - int was_packed; // non-zero if valid packing - -}; // 16 bytes, nominally - - -STBRP_DEF void stbrp_init_target (stbrp_context *context, int width, int height, stbrp_node *nodes, int num_nodes); -// Initialize a rectangle packer to: -// pack a rectangle that is 'width' by 'height' in dimensions -// using temporary storage provided by the array 'nodes', which is 'num_nodes' long -// -// You must call this function every time you start packing into a new target. -// -// There is no "shutdown" function. The 'nodes' memory must stay valid for -// the following stbrp_pack_rects() call (or calls), but can be freed after -// the call (or calls) finish. -// -// Note: to guarantee best results, either: -// 1. make sure 'num_nodes' >= 'width' -// or 2. call stbrp_allow_out_of_mem() defined below with 'allow_out_of_mem = 1' -// -// If you don't do either of the above things, widths will be quantized to multiples -// of small integers to guarantee the algorithm doesn't run out of temporary storage. 
-// -// If you do #2, then the non-quantized algorithm will be used, but the algorithm -// may run out of temporary storage and be unable to pack some rectangles. - -STBRP_DEF void stbrp_setup_allow_out_of_mem (stbrp_context *context, int allow_out_of_mem); -// Optionally call this function after init but before doing any packing to -// change the handling of the out-of-temp-memory scenario, described above. -// If you call init again, this will be reset to the default (false). - - -STBRP_DEF void stbrp_setup_heuristic (stbrp_context *context, int heuristic); -// Optionally select which packing heuristic the library should use. Different -// heuristics will produce better/worse results for different data sets. -// If you call init again, this will be reset to the default. - -enum -{ - STBRP_HEURISTIC_Skyline_default=0, - STBRP_HEURISTIC_Skyline_BL_sortHeight = STBRP_HEURISTIC_Skyline_default, - STBRP_HEURISTIC_Skyline_BF_sortHeight -}; - - -////////////////////////////////////////////////////////////////////////////// -// -// the details of the following structures don't matter to you, but they must -// be visible so you can handle the memory allocations for them - -struct stbrp_node -{ - stbrp_coord x,y; - stbrp_node *next; -}; - -struct stbrp_context -{ - int width; - int height; - int align; - int init_mode; - int heuristic; - int num_nodes; - stbrp_node *active_head; - stbrp_node *free_head; - stbrp_node extra[2]; // we allocate two extra nodes so optimal user-node-count is 'width' not 'width+2' -}; - -#ifdef __cplusplus -} -#endif - -#endif - -////////////////////////////////////////////////////////////////////////////// -// -// IMPLEMENTATION SECTION -// - -#ifdef STB_RECT_PACK_IMPLEMENTATION -#ifndef STBRP_SORT -#include -#define STBRP_SORT qsort -#endif - -#ifndef STBRP_ASSERT -#include -#define STBRP_ASSERT assert -#endif - -#ifdef _MSC_VER -#define STBRP__NOTUSED(v) (void)(v) -#else -#define STBRP__NOTUSED(v) (void)sizeof(v) -#endif - -enum -{ - 
STBRP__INIT_skyline = 1 -}; - -STBRP_DEF void stbrp_setup_heuristic(stbrp_context *context, int heuristic) -{ - switch (context->init_mode) { - case STBRP__INIT_skyline: - STBRP_ASSERT(heuristic == STBRP_HEURISTIC_Skyline_BL_sortHeight || heuristic == STBRP_HEURISTIC_Skyline_BF_sortHeight); - context->heuristic = heuristic; - break; - default: - STBRP_ASSERT(0); - } -} - -STBRP_DEF void stbrp_setup_allow_out_of_mem(stbrp_context *context, int allow_out_of_mem) -{ - if (allow_out_of_mem) - // if it's ok to run out of memory, then don't bother aligning them; - // this gives better packing, but may fail due to OOM (even though - // the rectangles easily fit). @TODO a smarter approach would be to only - // quantize once we've hit OOM, then we could get rid of this parameter. - context->align = 1; - else { - // if it's not ok to run out of memory, then quantize the widths - // so that num_nodes is always enough nodes. - // - // I.e. num_nodes * align >= width - // align >= width / num_nodes - // align = ceil(width/num_nodes) - - context->align = (context->width + context->num_nodes-1) / context->num_nodes; - } -} - -STBRP_DEF void stbrp_init_target(stbrp_context *context, int width, int height, stbrp_node *nodes, int num_nodes) -{ - int i; -#ifndef STBRP_LARGE_RECTS - STBRP_ASSERT(width <= 0xffff && height <= 0xffff); -#endif - - for (i=0; i < num_nodes-1; ++i) - nodes[i].next = &nodes[i+1]; - nodes[i].next = NULL; - context->init_mode = STBRP__INIT_skyline; - context->heuristic = STBRP_HEURISTIC_Skyline_default; - context->free_head = &nodes[0]; - context->active_head = &context->extra[0]; - context->width = width; - context->height = height; - context->num_nodes = num_nodes; - stbrp_setup_allow_out_of_mem(context, 0); - - // node 0 is the full width, node 1 is the sentinel (lets us not store width explicitly) - context->extra[0].x = 0; - context->extra[0].y = 0; - context->extra[0].next = &context->extra[1]; - context->extra[1].x = (stbrp_coord) width; -#ifdef 
STBRP_LARGE_RECTS - context->extra[1].y = (1<<30); -#else - context->extra[1].y = 65535; -#endif - context->extra[1].next = NULL; -} - -// find minimum y position if it starts at x1 -static int stbrp__skyline_find_min_y(stbrp_context *c, stbrp_node *first, int x0, int width, int *pwaste) -{ - stbrp_node *node = first; - int x1 = x0 + width; - int min_y, visited_width, waste_area; - - STBRP__NOTUSED(c); - - STBRP_ASSERT(first->x <= x0); - - #if 0 - // skip in case we're past the node - while (node->next->x <= x0) - ++node; - #else - STBRP_ASSERT(node->next->x > x0); // we ended up handling this in the caller for efficiency - #endif - - STBRP_ASSERT(node->x <= x0); - - min_y = 0; - waste_area = 0; - visited_width = 0; - while (node->x < x1) { - if (node->y > min_y) { - // raise min_y higher. - // we've accounted for all waste up to min_y, - // but we'll now add more waste for everything we've visted - waste_area += visited_width * (node->y - min_y); - min_y = node->y; - // the first time through, visited_width might be reduced - if (node->x < x0) - visited_width += node->next->x - x0; - else - visited_width += node->next->x - node->x; - } else { - // add waste area - int under_width = node->next->x - node->x; - if (under_width + visited_width > width) - under_width = width - visited_width; - waste_area += under_width * (min_y - node->y); - visited_width += under_width; - } - node = node->next; - } - - *pwaste = waste_area; - return min_y; -} - -typedef struct -{ - int x,y; - stbrp_node **prev_link; -} stbrp__findresult; - -static stbrp__findresult stbrp__skyline_find_best_pos(stbrp_context *c, int width, int height) -{ - int best_waste = (1<<30), best_x, best_y = (1 << 30); - stbrp__findresult fr; - stbrp_node **prev, *node, *tail, **best = NULL; - - // align to multiple of c->align - width = (width + c->align - 1); - width -= width % c->align; - STBRP_ASSERT(width % c->align == 0); - - // if it can't possibly fit, bail immediately - if (width > c->width || height > 
c->height) { - fr.prev_link = NULL; - fr.x = fr.y = 0; - return fr; - } - - node = c->active_head; - prev = &c->active_head; - while (node->x + width <= c->width) { - int y,waste; - y = stbrp__skyline_find_min_y(c, node, node->x, width, &waste); - if (c->heuristic == STBRP_HEURISTIC_Skyline_BL_sortHeight) { // actually just want to test BL - // bottom left - if (y < best_y) { - best_y = y; - best = prev; - } - } else { - // best-fit - if (y + height <= c->height) { - // can only use it if it first vertically - if (y < best_y || (y == best_y && waste < best_waste)) { - best_y = y; - best_waste = waste; - best = prev; - } - } - } - prev = &node->next; - node = node->next; - } - - best_x = (best == NULL) ? 0 : (*best)->x; - - // if doing best-fit (BF), we also have to try aligning right edge to each node position - // - // e.g, if fitting - // - // ____________________ - // |____________________| - // - // into - // - // | | - // | ____________| - // |____________| - // - // then right-aligned reduces waste, but bottom-left BL is always chooses left-aligned - // - // This makes BF take about 2x the time - - if (c->heuristic == STBRP_HEURISTIC_Skyline_BF_sortHeight) { - tail = c->active_head; - node = c->active_head; - prev = &c->active_head; - // find first node that's admissible - while (tail->x < width) - tail = tail->next; - while (tail) { - int xpos = tail->x - width; - int y,waste; - STBRP_ASSERT(xpos >= 0); - // find the left position that matches this - while (node->next->x <= xpos) { - prev = &node->next; - node = node->next; - } - STBRP_ASSERT(node->next->x > xpos && node->x <= xpos); - y = stbrp__skyline_find_min_y(c, node, xpos, width, &waste); - if (y + height <= c->height) { - if (y <= best_y) { - if (y < best_y || waste < best_waste || (waste==best_waste && xpos < best_x)) { - best_x = xpos; - STBRP_ASSERT(y <= best_y); - best_y = y; - best_waste = waste; - best = prev; - } - } - } - tail = tail->next; - } - } - - fr.prev_link = best; - fr.x = best_x; - 
fr.y = best_y; - return fr; -} - -static stbrp__findresult stbrp__skyline_pack_rectangle(stbrp_context *context, int width, int height) -{ - // find best position according to heuristic - stbrp__findresult res = stbrp__skyline_find_best_pos(context, width, height); - stbrp_node *node, *cur; - - // bail if: - // 1. it failed - // 2. the best node doesn't fit (we don't always check this) - // 3. we're out of memory - if (res.prev_link == NULL || res.y + height > context->height || context->free_head == NULL) { - res.prev_link = NULL; - return res; - } - - // on success, create new node - node = context->free_head; - node->x = (stbrp_coord) res.x; - node->y = (stbrp_coord) (res.y + height); - - context->free_head = node->next; - - // insert the new node into the right starting point, and - // let 'cur' point to the remaining nodes needing to be - // stiched back in - - cur = *res.prev_link; - if (cur->x < res.x) { - // preserve the existing one, so start testing with the next one - stbrp_node *next = cur->next; - cur->next = node; - cur = next; - } else { - *res.prev_link = node; - } - - // from here, traverse cur and free the nodes, until we get to one - // that shouldn't be freed - while (cur->next && cur->next->x <= res.x + width) { - stbrp_node *next = cur->next; - // move the current node to the free list - cur->next = context->free_head; - context->free_head = cur; - cur = next; - } - - // stitch the list back in - node->next = cur; - - if (cur->x < res.x + width) - cur->x = (stbrp_coord) (res.x + width); - -#ifdef _DEBUG - cur = context->active_head; - while (cur->x < context->width) { - STBRP_ASSERT(cur->x < cur->next->x); - cur = cur->next; - } - STBRP_ASSERT(cur->next == NULL); - - { - int count=0; - cur = context->active_head; - while (cur) { - cur = cur->next; - ++count; - } - cur = context->free_head; - while (cur) { - cur = cur->next; - ++count; - } - STBRP_ASSERT(count == context->num_nodes+2); - } -#endif - - return res; -} - -static int 
rect_height_compare(const void *a, const void *b) -{ - const stbrp_rect *p = (const stbrp_rect *) a; - const stbrp_rect *q = (const stbrp_rect *) b; - if (p->h > q->h) - return -1; - if (p->h < q->h) - return 1; - return (p->w > q->w) ? -1 : (p->w < q->w); -} - -static int rect_original_order(const void *a, const void *b) -{ - const stbrp_rect *p = (const stbrp_rect *) a; - const stbrp_rect *q = (const stbrp_rect *) b; - return (p->was_packed < q->was_packed) ? -1 : (p->was_packed > q->was_packed); -} - -#ifdef STBRP_LARGE_RECTS -#define STBRP__MAXVAL 0xffffffff -#else -#define STBRP__MAXVAL 0xffff -#endif - -STBRP_DEF int stbrp_pack_rects(stbrp_context *context, stbrp_rect *rects, int num_rects) -{ - int i, all_rects_packed = 1; - - // we use the 'was_packed' field internally to allow sorting/unsorting - for (i=0; i < num_rects; ++i) { - rects[i].was_packed = i; - } - - // sort according to heuristic - STBRP_SORT(rects, num_rects, sizeof(rects[0]), rect_height_compare); - - for (i=0; i < num_rects; ++i) { - if (rects[i].w == 0 || rects[i].h == 0) { - rects[i].x = rects[i].y = 0; // empty rect needs no space - } else { - stbrp__findresult fr = stbrp__skyline_pack_rectangle(context, rects[i].w, rects[i].h); - if (fr.prev_link) { - rects[i].x = (stbrp_coord) fr.x; - rects[i].y = (stbrp_coord) fr.y; - } else { - rects[i].x = rects[i].y = STBRP__MAXVAL; - } - } - } - - // unsort - STBRP_SORT(rects, num_rects, sizeof(rects[0]), rect_original_order); - - // set was_packed flags and all_rects_packed status - for (i=0; i < num_rects; ++i) { - rects[i].was_packed = !(rects[i].x == STBRP__MAXVAL && rects[i].y == STBRP__MAXVAL); - if (!rects[i].was_packed) - all_rects_packed = 0; - } - - // return the all_rects_packed status - return all_rects_packed; -} -#endif - -/* ------------------------------------------------------------------------------- -This software is available under 2 licenses -- choose whichever you prefer. 
------------------------------------------------------------------------------- -ALTERNATIVE A - MIT License -Copyright (c) 2017 Sean Barrett -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. ------------------------------------------------------------------------------- -ALTERNATIVE B - Public Domain (www.unlicense.org) -This is free and unencumbered software released into the public domain. -Anyone is free to copy, modify, publish, use, compile, sell, or distribute this -software, either in source code form or as a compiled binary, for any purpose, -commercial or non-commercial, and by any means. -In jurisdictions that recognize copyright laws, the author or authors of this -software dedicate any and all copyright interest in the software to the public -domain. We make this dedication for the benefit of the public at large and to -the detriment of our heirs and successors. 
We intend this dedication to be an -overt act of relinquishment in perpetuity of all present and future rights to -this software under copyright law. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION -WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ------------------------------------------------------------------------------- -*/ - -- cgit v1.2.3