99 files changed, 12104 insertions, 9183 deletions
diff --git a/core/config/engine.cpp b/core/config/engine.cpp
index 21e910be5b..7aa5f4d06d 100644
--- a/core/config/engine.cpp
+++ b/core/config/engine.cpp
@@ -48,6 +48,15 @@ int Engine::get_physics_ticks_per_second() const {
 	return ips;
 }
 
+void Engine::set_max_physics_steps_per_frame(int p_max_physics_steps) { // Sets the cap on physics steps simulated per rendered frame.
+	ERR_FAIL_COND_MSG(p_max_physics_steps <= 0, "Maximum number of physics steps per frame must be greater than 0."); // Only strictly positive caps are accepted.
+	max_physics_steps_per_frame = p_max_physics_steps;
+}
+
+int Engine::get_max_physics_steps_per_frame() const { // Returns the cap that Main::iteration() clamps advance.physics_steps to.
+	return max_physics_steps_per_frame;
+}
+
 void Engine::set_physics_jitter_fix(double p_threshold) {
 	if (p_threshold < 0) {
 		p_threshold = 0;
diff --git a/core/config/engine.h b/core/config/engine.h
index 21517e46b7..1b179c5727 100644
--- a/core/config/engine.h
+++ b/core/config/engine.h
@@ -63,6 +63,7 @@ private:
 	int _max_fps = 0;
 	double _time_scale = 1.0;
 	uint64_t _physics_frames = 0;
+	int max_physics_steps_per_frame = 8;
 	double _physics_interpolation_fraction = 0.0f;
 	bool abort_on_gpu_errors = false;
 	bool use_validation_layers = false;
@@ -93,6 +94,9 @@ public:
 	virtual void set_physics_ticks_per_second(int p_ips);
 	virtual int get_physics_ticks_per_second() const;
 
+	virtual void set_max_physics_steps_per_frame(int p_max_physics_steps);
+	virtual int get_max_physics_steps_per_frame() const;
+
 	void set_physics_jitter_fix(double p_threshold);
 	double get_physics_jitter_fix() const;
 
diff --git a/core/core_bind.cpp b/core/core_bind.cpp
index 87b36f7a21..1fe34cb4fd 100644
--- a/core/core_bind.cpp
+++ b/core/core_bind.cpp
@@ -1477,6 +1477,14 @@ int Engine::get_physics_ticks_per_second() const {
 	return ::Engine::get_singleton()->get_physics_ticks_per_second();
 }
 
+void Engine::set_max_physics_steps_per_frame(int p_max_physics_steps) { // Binding-side wrapper registered in _bind_methods().
+	::Engine::get_singleton()->set_max_physics_steps_per_frame(p_max_physics_steps); // Forwards unchanged to the global ::Engine singleton.
+}
+
+int Engine::get_max_physics_steps_per_frame() const { // Binding-side wrapper registered in _bind_methods().
+	return ::Engine::get_singleton()->get_max_physics_steps_per_frame(); // Reads the value straight from the global ::Engine singleton.
+}
+
 void Engine::set_physics_jitter_fix(double p_threshold) {
 	::Engine::get_singleton()->set_physics_jitter_fix(p_threshold);
 }
@@ -1628,6 +1636,8 @@ bool Engine::is_printing_error_messages() const {
 void Engine::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_physics_ticks_per_second", "physics_ticks_per_second"), &Engine::set_physics_ticks_per_second);
 	ClassDB::bind_method(D_METHOD("get_physics_ticks_per_second"), &Engine::get_physics_ticks_per_second);
+	ClassDB::bind_method(D_METHOD("set_max_physics_steps_per_frame", "max_physics_steps"), &Engine::set_max_physics_steps_per_frame);
+	ClassDB::bind_method(D_METHOD("get_max_physics_steps_per_frame"), &Engine::get_max_physics_steps_per_frame);
 	ClassDB::bind_method(D_METHOD("set_physics_jitter_fix", "physics_jitter_fix"), &Engine::set_physics_jitter_fix);
 	ClassDB::bind_method(D_METHOD("get_physics_jitter_fix"), &Engine::get_physics_jitter_fix);
 	ClassDB::bind_method(D_METHOD("get_physics_interpolation_fraction"), &Engine::get_physics_interpolation_fraction);
@@ -1675,6 +1685,7 @@ void Engine::_bind_methods() {
 
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "print_error_messages"), "set_print_error_messages", "is_printing_error_messages");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "physics_ticks_per_second"), "set_physics_ticks_per_second", "get_physics_ticks_per_second");
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "max_physics_steps_per_frame"), "set_max_physics_steps_per_frame", "get_max_physics_steps_per_frame");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "max_fps"), "set_max_fps", "get_max_fps");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "time_scale"), "set_time_scale", "get_time_scale");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "physics_jitter_fix"), "set_physics_jitter_fix", "get_physics_jitter_fix");
diff --git a/core/core_bind.h b/core/core_bind.h
index 784f3e63b1..748ecb4929 100644
--- a/core/core_bind.h
+++ b/core/core_bind.h
@@ -486,6 +486,9 @@ public:
 	void set_physics_ticks_per_second(int p_ips);
 	int get_physics_ticks_per_second() const;
 
+	void set_max_physics_steps_per_frame(int p_max_physics_steps);
+	int get_max_physics_steps_per_frame() const;
+
 	void set_physics_jitter_fix(double p_threshold);
 	double get_physics_jitter_fix() const;
 	double get_physics_interpolation_fraction() const;
diff --git a/doc/classes/Engine.xml b/doc/classes/Engine.xml
index 821fae37a6..d583e07f59 100644
--- a/doc/classes/Engine.xml
+++ b/doc/classes/Engine.xml
@@ -275,13 +275,16 @@
 			If [member ProjectSettings.display/window/vsync/vsync_mode] is [code]Disabled[/code], limiting the FPS to a high value that can be consistently reached on the system can reduce input lag compared to an uncapped framerate. Since this works by ensuring the GPU load is lower than 100%, this latency reduction is only effective in GPU-bottlenecked scenarios, not CPU-bottlenecked scenarios.
 			See also [member physics_ticks_per_second] and [member ProjectSettings.application/run/max_fps].
 		</member>
+		<member name="max_physics_steps_per_frame" type="int" setter="set_max_physics_steps_per_frame" getter="get_max_physics_steps_per_frame" default="8">
+			Controls the maximum number of physics steps that can be simulated each rendered frame. The default value is tuned to avoid "spiral of death" situations where expensive physics simulations trigger more expensive simulations indefinitely. However, the game will appear to slow down if the rendering FPS drops below [member physics_ticks_per_second] divided by [member max_physics_steps_per_frame] (7.5 FPS with the default values of 60 and 8). This occurs even if [code]delta[/code] is consistently used in physics calculations. To avoid this, increase [member max_physics_steps_per_frame] if you have increased [member physics_ticks_per_second] significantly above its default value.
+		</member>
 		<member name="physics_jitter_fix" type="float" setter="set_physics_jitter_fix" getter="get_physics_jitter_fix" default="0.5">
 			Controls how much physics ticks are synchronized with real time. For 0 or less, the ticks are synchronized. Such values are recommended for network games, where clock synchronization matters. Higher values cause higher deviation of the in-game clock and real clock but smooth out framerate jitters. The default value of 0.5 should be fine for most; values above 2 could cause the game to react to dropped frames with a noticeable delay and are not recommended.
 			[b]Note:[/b] For best results, when using a custom physics interpolation solution, the physics jitter fix should be disabled by setting [member physics_jitter_fix] to [code]0[/code].
 		</member>
 		<member name="physics_ticks_per_second" type="int" setter="set_physics_ticks_per_second" getter="get_physics_ticks_per_second" default="60">
 			The number of fixed iterations per second. This controls how often physics simulation and [method Node._physics_process] methods are run. This value should generally always be set to [code]60[/code] or above, as Godot doesn't interpolate the physics step. As a result, values lower than [code]60[/code] will look stuttery. This value can be increased to make input more reactive or work around collision tunneling issues, but keep in mind doing so will increase CPU usage. See also [member max_fps] and [member ProjectSettings.physics/common/physics_ticks_per_second].
-			[b]Note:[/b] Only 8 physics ticks may be simulated per rendered frame at most. If more than 8 physics ticks have to be simulated per rendered frame to keep up with rendering, the game will appear to slow down (even if [code]delta[/code] is used consistently in physics calculations). Therefore, it is recommended not to increase [member physics_ticks_per_second] above 240. Otherwise, the game will slow down when the rendering framerate goes below 30 FPS.
+			[b]Note:[/b] Only [member max_physics_steps_per_frame] physics ticks may be simulated per rendered frame at most. If more physics ticks have to be simulated per rendered frame to keep up with rendering, the project will appear to slow down (even if [code]delta[/code] is used consistently in physics calculations). Therefore, it is recommended to also increase [member max_physics_steps_per_frame] if increasing [member physics_ticks_per_second] significantly above its default value.
 		</member>
 		<member name="print_error_messages" type="bool" setter="set_print_error_messages" getter="is_printing_error_messages" default="true">
 			If [code]false[/code], stops printing error and warning messages to the console and editor Output log. This can be used to hide error and warning messages during unit test suite runs. This property is equivalent to the [member ProjectSettings.application/run/disable_stderr] project setting.
diff --git a/doc/classes/ProjectSettings.xml b/doc/classes/ProjectSettings.xml
index 9990b38925..fe6f384603 100644
--- a/doc/classes/ProjectSettings.xml
+++ b/doc/classes/ProjectSettings.xml
@@ -1812,6 +1812,10 @@
 		<member name="physics/common/enable_object_picking" type="bool" setter="" getter="" default="true">
 			Enables [member Viewport.physics_object_picking] on the root viewport.
 		</member>
+		<member name="physics/common/max_physics_steps_per_frame" type="int" setter="" getter="" default="8">
+			Controls the maximum number of physics steps that can be simulated each rendered frame. The default value is tuned to avoid "spiral of death" situations where expensive physics simulations trigger more expensive simulations indefinitely. However, the game will appear to slow down if the rendering FPS drops below [member physics/common/physics_ticks_per_second] divided by [member physics/common/max_physics_steps_per_frame] (7.5 FPS with the default values of 60 and 8). This occurs even if [code]delta[/code] is consistently used in physics calculations. To avoid this, increase [member physics/common/max_physics_steps_per_frame] if you have increased [member physics/common/physics_ticks_per_second] significantly above its default value.
+			[b]Note:[/b] This property is only read when the project starts. To change the maximum number of simulated physics steps per frame at runtime, set [member Engine.max_physics_steps_per_frame] instead.
+		</member>
 		<member name="physics/common/physics_jitter_fix" type="float" setter="" getter="" default="0.5">
 			Controls how much physics ticks are synchronized with real time. For 0 or less, the ticks are synchronized. Such values are recommended for network games, where clock synchronization matters. Higher values cause higher deviation of in-game clock and real clock, but allows smoothing out framerate jitters. The default value of 0.5 should be fine for most; values above 2 could cause the game to react to dropped frames with a noticeable delay and are not recommended.
 			[b]Note:[/b] For best results, when using a custom physics interpolation solution, the physics jitter fix should be disabled by setting [member physics/common/physics_jitter_fix] to [code]0[/code].
@@ -1820,7 +1824,7 @@
 		<member name="physics/common/physics_ticks_per_second" type="int" setter="" getter="" default="60">
 			The number of fixed iterations per second. This controls how often physics simulation and [method Node._physics_process] methods are run. See also [member application/run/max_fps].
 			[b]Note:[/b] This property is only read when the project starts. To change the physics FPS at runtime, set [member Engine.physics_ticks_per_second] instead.
-			[b]Note:[/b] Only 8 physics ticks may be simulated per rendered frame at most. If more than 8 physics ticks have to be simulated per rendered frame to keep up with rendering, the game will appear to slow down (even if [code]delta[/code] is used consistently in physics calculations). Therefore, it is recommended not to increase [member physics/common/physics_ticks_per_second] above 240. Otherwise, the game will slow down when the rendering framerate goes below 30 FPS.
+			[b]Note:[/b] Only [member physics/common/max_physics_steps_per_frame] physics ticks may be simulated per rendered frame at most. If more physics ticks have to be simulated per rendered frame to keep up with rendering, the project will appear to slow down (even if [code]delta[/code] is used consistently in physics calculations). Therefore, it is recommended to also increase [member physics/common/max_physics_steps_per_frame] if increasing [member physics/common/physics_ticks_per_second] significantly above its default value.
 		</member>
 		<member name="rendering/2d/sdf/oversize" type="int" setter="" getter="" default="1">
 		</member>
diff --git a/editor/editor_help.cpp b/editor/editor_help.cpp
index 973e4acbcb..efa85dadee 100644
--- a/editor/editor_help.cpp
+++ b/editor/editor_help.cpp
@@ -203,13 +203,20 @@ void EditorHelp::_class_desc_resized(bool p_force_update_theme) {
 }
 
 void EditorHelp::_add_type(const String &p_type, const String &p_enum) {
-	String t = p_type;
-	if (t.is_empty()) {
-		t = "void";
+	if (p_type.is_empty() || p_type == "void") {
+		class_desc->push_color(Color(type_color, 0.5));
+		class_desc->push_hint(TTR("No return value."));
+		class_desc->add_text("void");
+		class_desc->pop();
+		class_desc->pop();
+		return;
 	}
-	bool can_ref = (t != "void" && !t.contains("*")) || !p_enum.is_empty();
 
-	if (!p_enum.is_empty()) {
+	bool is_enum_type = !p_enum.is_empty();
+	bool can_ref = !p_type.contains("*") || is_enum_type;
+
+	String t = p_type;
+	if (is_enum_type) {
 		if (p_enum.get_slice_count(".") > 1) {
 			t = p_enum.get_slice(".", 1);
 		} else {
@@ -223,21 +230,24 @@ void EditorHelp::_add_type(const String &p_type, const String &p_enum) {
 		if (t.ends_with("[]")) {
 			add_array = true;
 			t = t.replace("[]", "");
+
+			class_desc->push_meta("#Array"); //class
+			class_desc->add_text("Array");
+			class_desc->pop();
+			class_desc->add_text("[");
 		}
-		if (p_enum.is_empty()) {
-			class_desc->push_meta("#" + t); //class
-		} else {
+
+		if (is_enum_type) {
 			class_desc->push_meta("$" + p_enum); //class
+		} else {
+			class_desc->push_meta("#" + t); //class
 		}
 	}
 	class_desc->add_text(t);
 	if (can_ref) {
-		class_desc->pop();
+		class_desc->pop(); // Pushed meta above.
 		if (add_array) {
-			class_desc->add_text(" ");
-			class_desc->push_meta("#Array"); //class
-			class_desc->add_text("[]");
-			class_desc->pop();
+			class_desc->add_text("]");
 		}
 	}
 	class_desc->pop();
diff --git a/main/main.cpp b/main/main.cpp
index 0a8de56e01..91d38ff6d9 100644
--- a/main/main.cpp
+++ b/main/main.cpp
@@ -1789,6 +1789,12 @@ Error Main::setup(const char *execpath, int argc, char *argv[], bool p_second_ph
 	ProjectSettings::get_singleton()->set_custom_property_info("physics/common/physics_ticks_per_second",
 			PropertyInfo(Variant::INT, "physics/common/physics_ticks_per_second",
 					PROPERTY_HINT_RANGE, "1,1000,1"));
+
+	Engine::get_singleton()->set_max_physics_steps_per_frame(GLOBAL_DEF("physics/common/max_physics_steps_per_frame", 8));
+	ProjectSettings::get_singleton()->set_custom_property_info("physics/common/max_physics_steps_per_frame",
+			PropertyInfo(Variant::INT, "physics/common/max_physics_steps_per_frame",
+					PROPERTY_HINT_RANGE, "1,100,1"));
+
 	Engine::get_singleton()->set_physics_jitter_fix(GLOBAL_DEF("physics/common/physics_jitter_fix", 0.5));
 	Engine::get_singleton()->set_max_fps(GLOBAL_DEF("application/run/max_fps", 0));
 	ProjectSettings::get_singleton()->set_custom_property_info("application/run/max_fps",
@@ -3105,7 +3111,7 @@ bool Main::iteration() {
 
 	last_ticks = ticks;
 
-	static const int max_physics_steps = 8;
+	const int max_physics_steps = Engine::get_singleton()->get_max_physics_steps_per_frame();
 	if (fixed_fps == -1 && advance.physics_steps > max_physics_steps) {
 		process_step -= (advance.physics_steps - max_physics_steps) * physics_step;
 		advance.physics_steps = max_physics_steps;
diff --git a/misc/scripts/file_format.sh b/misc/scripts/file_format.sh
index 1200b96ea0..a2f33692f9 100755
--- a/misc/scripts/file_format.sh
+++ b/misc/scripts/file_format.sh
@@ -31,7 +31,9 @@ while IFS= read -rd '' f; do
         continue
     elif [[ "$f" == *"po" ]]; then
         continue
-    elif [[ "$f" == "thirdparty"* ]]; then
+    elif [[ "$f" == "thirdparty/"* ]]; then
+        continue
+    elif [[ "$f" == *"/thirdparty/"* ]]; then
         continue
     elif [[ "$f" == "platform/android/java/lib/src/com/google"* ]]; then
         continue
diff --git a/modules/gdscript/gdscript_editor.cpp b/modules/gdscript/gdscript_editor.cpp
index 68508b20da..48a6e3fb51 100644
--- a/modules/gdscript/gdscript_editor.cpp
+++ b/modules/gdscript/gdscript_editor.cpp
@@ -2511,6 +2511,57 @@ static void _find_call_arguments(GDScriptParser::CompletionContext &p_context, c
 	}
 }
 
+// Resolves the base of `p_subscript` to a scene node's class when that base is an identifier bound to a get-node expression.
+// Returns true and fills `r_base_type` (plus `*r_base` when provided) on success; returns false when nothing could be resolved.
+static bool _get_subscript_type(GDScriptParser::CompletionContext &p_context, const GDScriptParser::SubscriptNode *p_subscript, GDScriptParser::DataType &r_base_type, Variant *r_base = nullptr) {
+	// The node lookup below goes through `p_context.base`, so bail out when there is no base object to query.
+	if (p_context.base == nullptr || p_subscript->base->type != GDScriptParser::Node::IDENTIFIER) {
+		return false;
+	}
+	const GDScriptParser::GetNodeNode *get_node = nullptr;
+	const GDScriptParser::IdentifierNode *identifier_node = static_cast<GDScriptParser::IdentifierNode *>(p_subscript->base);
+
+	switch (identifier_node->source) {
+		case GDScriptParser::IdentifierNode::Source::MEMBER_VARIABLE: {
+			if (p_context.current_class != nullptr) {
+				const StringName &member_name = identifier_node->name;
+				const GDScriptParser::ClassNode *current_class = p_context.current_class;
+				if (current_class->has_member(member_name)) {
+					const GDScriptParser::ClassNode::Member &member = current_class->get_member(member_name);
+					if (member.type == GDScriptParser::ClassNode::Member::VARIABLE) {
+						const GDScriptParser::VariableNode *variable = static_cast<GDScriptParser::VariableNode *>(member.variable);
+						if (variable->initializer && variable->initializer->type == GDScriptParser::Node::GET_NODE) {
+							get_node = static_cast<GDScriptParser::GetNodeNode *>(variable->initializer);
+						}
+					}
+				}
+			}
+		} break;
+		case GDScriptParser::IdentifierNode::Source::LOCAL_VARIABLE: {
+			if (identifier_node->next != nullptr && identifier_node->next->type == GDScriptParser::ClassNode::Node::GET_NODE) {
+				get_node = static_cast<GDScriptParser::GetNodeNode *>(identifier_node->next);
+			}
+		} break;
+		default:
+			break;
+	}
+
+	if (get_node != nullptr) {
+		const Object *node = p_context.base->call("get_node_or_null", NodePath(get_node->full_path));
+		if (node != nullptr) {
+			if (r_base != nullptr) {
+				*r_base = node;
+			}
+			r_base_type.type_source = GDScriptParser::DataType::ANNOTATED_EXPLICIT;
+			r_base_type.kind = GDScriptParser::DataType::NATIVE;
+			r_base_type.native_type = node->get_class_name();
+			r_base_type.builtin_type = Variant::OBJECT;
+			return true;
+		}
+	}
+	return false;
+}
+
 static void _find_call_arguments(GDScriptParser::CompletionContext &p_context, const GDScriptParser::Node *p_call, int p_argidx, HashMap<String, ScriptLanguage::CodeCompletionOption> &r_result, bool &r_forced, String &r_arghint) {
 	if (p_call->type == GDScriptParser::Node::PRELOAD) {
 		if (p_argidx == 0 && bool(EDITOR_GET("text_editor/completion/complete_file_paths"))) {
@@ -2561,13 +2612,17 @@ static void _find_call_arguments(GDScriptParser::CompletionContext &p_context, c
 			}
 		}
 
-		if (subscript->is_attribute) {
-			GDScriptCompletionIdentifier ci;
-			if (_guess_expression_type(p_context, subscript->base, ci)) {
-				base_type = ci.type;
-				base = ci.value;
-			} else {
-				return;
+		if (p_context.base != nullptr && subscript->is_attribute) {
+			bool found_type = _get_subscript_type(p_context, subscript, base_type, &base);
+
+			if (!found_type) {
+				GDScriptCompletionIdentifier ci;
+				if (_guess_expression_type(p_context, subscript->base, ci)) {
+					base_type = ci.type;
+					base = ci.value;
+				} else {
+					return;
+				}
 			}
 
 			_static = base_type.is_meta_type;
@@ -2765,60 +2820,7 @@ static void _find_call_arguments(GDScriptParser::CompletionContext &p_context, c
 			const GDScriptParser::SubscriptNode *attr = static_cast<const GDScriptParser::SubscriptNode *>(completion_context.node);
 			if (attr->base) {
 				GDScriptCompletionIdentifier base;
-				bool found_type = false;
-
-				if (p_owner != nullptr && attr->base->type == GDScriptParser::Node::IDENTIFIER) {
-					const GDScriptParser::GetNodeNode *get_node = nullptr;
-					const GDScriptParser::IdentifierNode *identifier_node = static_cast<GDScriptParser::IdentifierNode *>(attr->base);
-
-					switch (identifier_node->source) {
-						case GDScriptParser::IdentifierNode::Source::MEMBER_VARIABLE: {
-							if (completion_context.current_class != nullptr) {
-								const StringName &member_name = identifier_node->name;
-								const GDScriptParser::ClassNode *current_class = completion_context.current_class;
-
-								if (current_class->has_member(member_name)) {
-									const GDScriptParser::ClassNode::Member &member = current_class->get_member(member_name);
-
-									if (member.type == GDScriptParser::ClassNode::Member::VARIABLE) {
-										const GDScriptParser::VariableNode *variable = static_cast<GDScriptParser::VariableNode *>(member.variable);
-
-										if (variable->initializer && variable->initializer->type == GDScriptParser::Node::GET_NODE) {
-											get_node = static_cast<GDScriptParser::GetNodeNode *>(variable->initializer);
-										}
-									}
-								}
-							}
-						} break;
-						case GDScriptParser::IdentifierNode::Source::LOCAL_VARIABLE: {
-							if (identifier_node->next != nullptr && identifier_node->next->type == GDScriptParser::ClassNode::Node::GET_NODE) {
-								get_node = static_cast<GDScriptParser::GetNodeNode *>(identifier_node->next);
-							}
-						} break;
-						default:
-							break;
-					}
-
-					if (get_node != nullptr) {
-						const Object *node = p_owner->call("get_node_or_null", NodePath(get_node->full_path));
-						if (node != nullptr) {
-							found_type = true;
-
-							GDScriptParser::DataType type;
-							type.type_source = GDScriptParser::DataType::ANNOTATED_EXPLICIT;
-							type.kind = GDScriptParser::DataType::NATIVE;
-							type.native_type = node->get_class_name();
-							type.builtin_type = Variant::OBJECT;
-
-							base.type = type;
-						}
-
-						if (!found_type) {
-							break;
-						}
-					}
-				}
-
+				bool found_type = _get_subscript_type(completion_context, attr, base.type);
 				if (!found_type && !_guess_expression_type(completion_context, attr->base, base)) {
 					break;
 				}
diff --git a/modules/theora/SCsub b/modules/theora/SCsub
index 6038ea086a..ca666050dd 100644
--- a/modules/theora/SCsub
+++ b/modules/theora/SCsub
@@ -15,7 +15,7 @@ if env["builtin_libtheora"]:
         # "analyze.c",
         # "apiwrapper.c",
         "bitpack.c",
-        "cpu.c",
+        # "collect.c",
         # "decapiwrapper.c",
         "decinfo.c",
         "decode.c",
@@ -47,8 +47,12 @@ if env["builtin_libtheora"]:
         "x86/mmxfrag.c",
         "x86/mmxidct.c",
         "x86/mmxstate.c",
+        # "x86/sse2encfrag.c",
         # "x86/sse2fdct.c",
+        "x86/sse2idct.c",
+        "x86/x86cpu.c",
         # "x86/x86enc.c",
+        # "x86/x86enquant.c",
         "x86/x86state.c",
     ]
 
@@ -58,6 +62,7 @@ if env["builtin_libtheora"]:
         "x86_vc/mmxfrag.c",
         "x86_vc/mmxidct.c",
         "x86_vc/mmxstate.c",
+        "x86_vc/x86cpu.c",
         # "x86_vc/x86enc.c",
         "x86_vc/x86state.c",
     ]
diff --git a/platform/linuxbsd/detect.py b/platform/linuxbsd/detect.py
index ac69f3806b..004bcb8674 100644
--- a/platform/linuxbsd/detect.py
+++ b/platform/linuxbsd/detect.py
@@ -356,7 +356,6 @@ def configure(env: "Environment"):
 
     if env["opengl3"]:
         env.Append(CPPDEFINES=["GLES3_ENABLED"])
-        env.ParseConfig("pkg-config gl --cflags --libs")
 
     env.Append(LIBS=["pthread"])
 
diff --git a/platform/linuxbsd/x11/SCsub b/platform/linuxbsd/x11/SCsub
index 974ad98fb9..30c6080355 100644
--- a/platform/linuxbsd/x11/SCsub
+++ b/platform/linuxbsd/x11/SCsub
@@ -11,7 +11,7 @@ if env["vulkan"]:
     source_files.append("vulkan_context_x11.cpp")
 
 if env["opengl3"]:
-    source_files.append(["gl_manager_x11.cpp", "detect_prime_x11.cpp"])
+    source_files.append(["gl_manager_x11.cpp", "detect_prime_x11.cpp", "#thirdparty/glad/glx.c"])
 
 objects = []
 
diff --git a/platform/linuxbsd/x11/detect_prime_x11.cpp b/platform/linuxbsd/x11/detect_prime_x11.cpp
index 78a10fa2b0..ed046432d8 100644
--- a/platform/linuxbsd/x11/detect_prime_x11.cpp
+++ b/platform/linuxbsd/x11/detect_prime_x11.cpp
@@ -38,8 +38,9 @@
 
 #include <stdlib.h>
 
-#include <GL/gl.h>
-#include <GL/glx.h>
+#include "thirdparty/glad/glad/gl.h"
+#include "thirdparty/glad/glad/glx.h"
+
 #include <X11/Xlib.h>
 #include <X11/Xutil.h>
 
@@ -77,8 +78,6 @@ void create_context() {
 	Window x11_window;
 	GLXContext glx_context;
 
-	GLXCREATECONTEXTATTRIBSARBPROC glXCreateContextAttribsARB = (GLXCREATECONTEXTATTRIBSARBPROC)glXGetProcAddress((const GLubyte *)"glXCreateContextAttribsARB");
-
 	static int visual_attribs[] = {
 		GLX_RENDER_TYPE, GLX_RGBA_BIT,
 		GLX_DRAWABLE_TYPE, GLX_WINDOW_BIT,
@@ -101,7 +100,7 @@ void create_context() {
 
 	GLXFBConfig *fbc = glXChooseFBConfig(x11_display, DefaultScreen(x11_display), visual_attribs, &fbcount);
 	if (!fbc) {
-		exit(1);
+		quick_exit(1);
 	}
 
 	vi = glXGetVisualFromFBConfig(x11_display, fbc[0]);
@@ -122,7 +121,7 @@ void create_context() {
 	x11_window = XCreateWindow(x11_display, RootWindow(x11_display, vi->screen), 0, 0, 10, 10, 0, vi->depth, InputOutput, vi->visual, valuemask, &swa);
 
 	if (!x11_window) {
-		exit(1);
+		quick_exit(1);
 	}
 
 	glXMakeCurrent(x11_display, x11_window, glx_context);
@@ -189,8 +188,20 @@ int detect_prime() {
 			if (i) {
 				setenv("DRI_PRIME", "1", 1);
 			}
+
+			if (gladLoaderLoadGLX(NULL, 0) == 0) {
+				print_verbose("Unable to load GLX, GPU detection skipped.");
+				quick_exit(1);
+			}
+
 			create_context();
 
+			PFNGLGETSTRINGPROC glGetString = (PFNGLGETSTRINGPROC)glXGetProcAddressARB((GLubyte *)"glGetString");
+			if (!glGetString) {
+				print_verbose("Unable to get glGetString, GPU detection skipped.");
+				quick_exit(1);
+			}
+
 			const char *vendor = (const char *)glGetString(GL_VENDOR);
 			const char *renderer = (const char *)glGetString(GL_RENDERER);
 
diff --git a/platform/linuxbsd/x11/gl_manager_x11.cpp b/platform/linuxbsd/x11/gl_manager_x11.cpp
index 893a22e75e..4d8d63c64a 100644
--- a/platform/linuxbsd/x11/gl_manager_x11.cpp
+++ b/platform/linuxbsd/x11/gl_manager_x11.cpp
@@ -37,9 +37,7 @@
 #include <stdlib.h>
 #include <unistd.h>
 
-#define GLX_GLXEXT_PROTOTYPES
-#include <GL/glx.h>
-#include <GL/glxext.h>
+#include "thirdparty/glad/glad/glx.h"
 
 #define GLX_CONTEXT_MAJOR_VERSION_ARB 0x2091
 #define GLX_CONTEXT_MINOR_VERSION_ARB 0x2092
@@ -324,11 +322,14 @@ void GLManager_X11::swap_buffers() {
 }
 
 Error GLManager_X11::initialize() {
+	if (!gladLoaderLoadGLX(nullptr, 0)) { // Load GLX through the glad loader; a zero result is treated as failure.
+		return ERR_CANT_CREATE;
+	}
+
 	return OK;
 }
 
 void GLManager_X11::set_use_vsync(bool p_use) {
-	static bool setup = false;
 	static PFNGLXSWAPINTERVALEXTPROC glXSwapIntervalEXT = nullptr;
 	static PFNGLXSWAPINTERVALSGIPROC glXSwapIntervalMESA = nullptr;
 	static PFNGLXSWAPINTERVALSGIPROC glXSwapIntervalSGI = nullptr;
@@ -345,25 +346,12 @@ void GLManager_X11::set_use_vsync(bool p_use) {
 	}
 	const GLDisplay &disp = get_current_display();
 
-	if (!setup) {
-		setup = true;
-		String extensions = glXQueryExtensionsString(disp.x11_display, DefaultScreen(disp.x11_display));
-		if (extensions.find("GLX_EXT_swap_control") != -1) {
-			glXSwapIntervalEXT = (PFNGLXSWAPINTERVALEXTPROC)glXGetProcAddressARB((const GLubyte *)"glXSwapIntervalEXT");
-		}
-		if (extensions.find("GLX_MESA_swap_control") != -1) {
-			glXSwapIntervalMESA = (PFNGLXSWAPINTERVALSGIPROC)glXGetProcAddressARB((const GLubyte *)"glXSwapIntervalMESA");
-		}
-		if (extensions.find("GLX_SGI_swap_control") != -1) {
-			glXSwapIntervalSGI = (PFNGLXSWAPINTERVALSGIPROC)glXGetProcAddressARB((const GLubyte *)"glXSwapIntervalSGI");
-		}
-	}
 	int val = p_use ? 1 : 0;
-	if (glXSwapIntervalMESA) {
+	if (GLAD_GLX_MESA_swap_control) {
 		glXSwapIntervalMESA(val);
-	} else if (glXSwapIntervalSGI) {
+	} else if (GLAD_GLX_SGI_swap_control) {
 		glXSwapIntervalSGI(val);
-	} else if (glXSwapIntervalEXT) {
+	} else if (GLAD_GLX_EXT_swap_control) {
 		GLXDrawable drawable = glXGetCurrentDrawable();
 		glXSwapIntervalEXT(disp.x11_display, drawable, val);
 	} else {
diff --git a/scene/2d/joint_2d.cpp b/scene/2d/joint_2d.cpp
index 6000508f36..8de4c281f4 100644
--- a/scene/2d/joint_2d.cpp
+++ b/scene/2d/joint_2d.cpp
@@ -133,7 +133,13 @@ void Joint2D::set_node_a(const NodePath &p_node_a) {
 	}
 
 	a = p_node_a;
-	_update_joint();
+	if (Engine::get_singleton()->is_editor_hint()) {
+		// When in editor, the setter may be called as a result of node rename.
+		// It happens before the node actually changes its name, which triggers false warning.
+		callable_mp(this, &Joint2D::_update_joint).call_deferred();
+	} else {
+		_update_joint();
+	}
 }
 
 NodePath Joint2D::get_node_a() const {
@@ -150,7 +156,11 @@ void Joint2D::set_node_b(const NodePath &p_node_b) {
 	}
 
 	b = p_node_b;
-	_update_joint();
+	if (Engine::get_singleton()->is_editor_hint()) {
+		callable_mp(this, &Joint2D::_update_joint).call_deferred();
+	} else {
+		_update_joint();
+	}
 }
 
 NodePath Joint2D::get_node_b() const {
diff --git a/thirdparty/README.md b/thirdparty/README.md
index 2dced503a8..84ed095fd9 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -186,6 +186,8 @@ Files generated from [upstream web instance](https://gen.glad.sh/):
 - `KHR/khrplatform.h`
 - `gl.c`
 - `glad/gl.h`
+- `glx.c`
+- `glad/glx.h`
 
 
 ## glslang
@@ -305,18 +307,15 @@ Files extracted from upstream source:
 ## libtheora
 
 - Upstream: https://www.theora.org
-- Version: 1.1.1 (2010)
+- Version: git (7180717276af1ebc7da15c83162d6c5d6203aabf, 2020)
 - License: BSD-3-Clause
 
 Files extracted from upstream source:
 
-- all .c, .h in lib/
+- all .c, .h in lib/, except arm/ and c64x/ folders
 - all .h files in include/theora/ as theora/
 - COPYING and LICENSE
 
-Upstream patches included in the `patches` directory have been applied
-on top of the 1.1.1 source (not included in any stable release yet).
-
 
 ## libvorbis
 
diff --git a/thirdparty/glad/glad/glx.h b/thirdparty/glad/glad/glx.h
new file mode 100644
index 0000000000..ac115fa63b
--- /dev/null
+++ b/thirdparty/glad/glad/glx.h
@@ -0,0 +1,605 @@
+/**
+ * Loader generated by glad 2.0.2 on Tue Nov 15 09:49:49 2022
+ *
+ * SPDX-License-Identifier: (WTFPL OR CC0-1.0) AND Apache-2.0
+ *
+ * Generator: C/C++
+ * Specification: glx
+ * Extensions: 6
+ *
+ * APIs:
+ *  - glx=1.4
+ *
+ * Options:
+ *  - ALIAS = False
+ *  - DEBUG = False
+ *  - HEADER_ONLY = False
+ *  - LOADER = True
+ *  - MX = False
+ *  - ON_DEMAND = False
+ *
+ * Commandline:
+ *    --api='glx=1.4' --extensions='GLX_ARB_create_context,GLX_ARB_create_context_profile,GLX_ARB_get_proc_address,GLX_EXT_swap_control,GLX_MESA_swap_control,GLX_SGI_swap_control' c --loader
+ *
+ * Online:
+ *    http://glad.sh/#api=glx%3D1.4&extensions=GLX_ARB_create_context%2CGLX_ARB_create_context_profile%2CGLX_ARB_get_proc_address%2CGLX_EXT_swap_control%2CGLX_MESA_swap_control%2CGLX_SGI_swap_control&generator=c&options=LOADER
+ *
+ */
+
+#ifndef GLAD_GLX_H_
+#define GLAD_GLX_H_
+
+#ifdef GLX_H
+  #error GLX header already included (API: glx), remove previous include!
+#endif
+#define GLX_H 1
+
+
+#include <X11/X.h>
+#include <X11/Xlib.h>
+#include <X11/Xutil.h>
+
+#include <glad/gl.h>
+
+#define GLAD_GLX
+#define GLAD_OPTION_GLX_LOADER
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef GLAD_PLATFORM_H_
+#define GLAD_PLATFORM_H_
+
+#ifndef GLAD_PLATFORM_WIN32
+  #if defined(_WIN32) || defined(__WIN32__) || defined(WIN32) || defined(__MINGW32__)
+    #define GLAD_PLATFORM_WIN32 1
+  #else
+    #define GLAD_PLATFORM_WIN32 0
+  #endif
+#endif
+
+#ifndef GLAD_PLATFORM_APPLE
+  #ifdef __APPLE__
+    #define GLAD_PLATFORM_APPLE 1
+  #else
+    #define GLAD_PLATFORM_APPLE 0
+  #endif
+#endif
+
+#ifndef GLAD_PLATFORM_EMSCRIPTEN
+  #ifdef __EMSCRIPTEN__
+    #define GLAD_PLATFORM_EMSCRIPTEN 1
+  #else
+    #define GLAD_PLATFORM_EMSCRIPTEN 0
+  #endif
+#endif
+
+#ifndef GLAD_PLATFORM_UWP
+  #if defined(_MSC_VER) && !defined(GLAD_INTERNAL_HAVE_WINAPIFAMILY)
+    #ifdef __has_include
+      #if __has_include(<winapifamily.h>)
+        #define GLAD_INTERNAL_HAVE_WINAPIFAMILY 1
+      #endif
+    #elif _MSC_VER >= 1700 && !_USING_V110_SDK71_
+      #define GLAD_INTERNAL_HAVE_WINAPIFAMILY 1
+    #endif
+  #endif
+
+  #ifdef GLAD_INTERNAL_HAVE_WINAPIFAMILY
+    #include <winapifamily.h>
+    #if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) && WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
+      #define GLAD_PLATFORM_UWP 1
+    #endif
+  #endif
+
+  #ifndef GLAD_PLATFORM_UWP
+    #define GLAD_PLATFORM_UWP 0
+  #endif
+#endif
+
+#ifdef __GNUC__
+  #define GLAD_GNUC_EXTENSION __extension__
+#else
+  #define GLAD_GNUC_EXTENSION
+#endif
+
+#define GLAD_UNUSED(x) (void)(x)
+
+#ifndef GLAD_API_CALL
+  #if defined(GLAD_API_CALL_EXPORT)
+    #if GLAD_PLATFORM_WIN32 || defined(__CYGWIN__)
+      #if defined(GLAD_API_CALL_EXPORT_BUILD)
+        #if defined(__GNUC__)
+          #define GLAD_API_CALL __attribute__ ((dllexport)) extern
+        #else
+          #define GLAD_API_CALL __declspec(dllexport) extern
+        #endif
+      #else
+        #if defined(__GNUC__)
+          #define GLAD_API_CALL __attribute__ ((dllimport)) extern
+        #else
+          #define GLAD_API_CALL __declspec(dllimport) extern
+        #endif
+      #endif
+    #elif defined(__GNUC__) && defined(GLAD_API_CALL_EXPORT_BUILD)
+      #define GLAD_API_CALL __attribute__ ((visibility ("default"))) extern
+    #else
+      #define GLAD_API_CALL extern
+    #endif
+  #else
+    #define GLAD_API_CALL extern
+  #endif
+#endif
+
+#ifdef APIENTRY
+  #define GLAD_API_PTR APIENTRY
+#elif GLAD_PLATFORM_WIN32
+  #define GLAD_API_PTR __stdcall
+#else
+  #define GLAD_API_PTR
+#endif
+
+#ifndef GLAPI
+#define GLAPI GLAD_API_CALL
+#endif
+
+#ifndef GLAPIENTRY
+#define GLAPIENTRY GLAD_API_PTR
+#endif
+
+#define GLAD_MAKE_VERSION(major, minor) (major * 10000 + minor)
+#define GLAD_VERSION_MAJOR(version) (version / 10000)
+#define GLAD_VERSION_MINOR(version) (version % 10000)
+
+#define GLAD_GENERATOR_VERSION "2.0.2"
+
+typedef void (*GLADapiproc)(void);
+
+typedef GLADapiproc (*GLADloadfunc)(const char *name);
+typedef GLADapiproc (*GLADuserptrloadfunc)(void *userptr, const char *name);
+
+typedef void (*GLADprecallback)(const char *name, GLADapiproc apiproc, int len_args, ...);
+typedef void (*GLADpostcallback)(void *ret, const char *name, GLADapiproc apiproc, int len_args, ...);
+
+#endif /* GLAD_PLATFORM_H_ */
+
+#define GLX_ACCUM_ALPHA_SIZE 17
+#define GLX_ACCUM_BLUE_SIZE 16
+#define GLX_ACCUM_BUFFER_BIT 0x00000080
+#define GLX_ACCUM_GREEN_SIZE 15
+#define GLX_ACCUM_RED_SIZE 14
+#define GLX_ALPHA_SIZE 11
+#define GLX_AUX_BUFFERS 7
+#define GLX_AUX_BUFFERS_BIT 0x00000010
+#define GLX_BACK_LEFT_BUFFER_BIT 0x00000004
+#define GLX_BACK_RIGHT_BUFFER_BIT 0x00000008
+#define GLX_BAD_ATTRIBUTE 2
+#define GLX_BAD_CONTEXT 5
+#define GLX_BAD_ENUM 7
+#define GLX_BAD_SCREEN 1
+#define GLX_BAD_VALUE 6
+#define GLX_BAD_VISUAL 4
+#define GLX_BLUE_SIZE 10
+#define GLX_BUFFER_SIZE 2
+#define GLX_BufferSwapComplete 1
+#define GLX_COLOR_INDEX_BIT 0x00000002
+#define GLX_COLOR_INDEX_TYPE 0x8015
+#define GLX_CONFIG_CAVEAT 0x20
+#define GLX_CONTEXT_COMPATIBILITY_PROFILE_BIT_ARB 0x00000002
+#define GLX_CONTEXT_CORE_PROFILE_BIT_ARB 0x00000001
+#define GLX_CONTEXT_DEBUG_BIT_ARB 0x00000001
+#define GLX_CONTEXT_FLAGS_ARB 0x2094
+#define GLX_CONTEXT_FORWARD_COMPATIBLE_BIT_ARB 0x00000002
+#define GLX_CONTEXT_MAJOR_VERSION_ARB 0x2091
+#define GLX_CONTEXT_MINOR_VERSION_ARB 0x2092
+#define GLX_CONTEXT_PROFILE_MASK_ARB 0x9126
+#define GLX_DAMAGED 0x8020
+#define GLX_DEPTH_BUFFER_BIT 0x00000020
+#define GLX_DEPTH_SIZE 12
+#define GLX_DIRECT_COLOR 0x8003
+#define GLX_DONT_CARE 0xFFFFFFFF
+#define GLX_DOUBLEBUFFER 5
+#define GLX_DRAWABLE_TYPE 0x8010
+#define GLX_EVENT_MASK 0x801F
+#define GLX_EXTENSIONS 0x3
+#define GLX_EXTENSION_NAME "GLX"
+#define GLX_FBCONFIG_ID 0x8013
+#define GLX_FRONT_LEFT_BUFFER_BIT 0x00000001
+#define GLX_FRONT_RIGHT_BUFFER_BIT 0x00000002
+#define GLX_GRAY_SCALE 0x8006
+#define GLX_GREEN_SIZE 9
+#define GLX_HEIGHT 0x801E
+#define GLX_LARGEST_PBUFFER 0x801C
+#define GLX_LEVEL 3
+#define GLX_MAX_PBUFFER_HEIGHT 0x8017
+#define GLX_MAX_PBUFFER_PIXELS 0x8018
+#define GLX_MAX_PBUFFER_WIDTH 0x8016
+#define GLX_MAX_SWAP_INTERVAL_EXT 0x20F2
+#define GLX_NONE 0x8000
+#define GLX_NON_CONFORMANT_CONFIG 0x800D
+#define GLX_NO_EXTENSION 3
+#define GLX_PBUFFER 0x8023
+#define GLX_PBUFFER_BIT 0x00000004
+#define GLX_PBUFFER_CLOBBER_MASK 0x08000000
+#define GLX_PBUFFER_HEIGHT 0x8040
+#define GLX_PBUFFER_WIDTH 0x8041
+#define GLX_PIXMAP_BIT 0x00000002
+#define GLX_PRESERVED_CONTENTS 0x801B
+#define GLX_PSEUDO_COLOR 0x8004
+#define GLX_PbufferClobber 0
+#define GLX_RED_SIZE 8
+#define GLX_RENDER_TYPE 0x8011
+#define GLX_RGBA 4
+#define GLX_RGBA_BIT 0x00000001
+#define GLX_RGBA_TYPE 0x8014
+#define GLX_SAMPLES 100001
+#define GLX_SAMPLE_BUFFERS 100000
+#define GLX_SAVED 0x8021
+#define GLX_SCREEN 0x800C
+#define GLX_SLOW_CONFIG 0x8001
+#define GLX_STATIC_COLOR 0x8005
+#define GLX_STATIC_GRAY 0x8007
+#define GLX_STENCIL_BUFFER_BIT 0x00000040
+#define GLX_STENCIL_SIZE 13
+#define GLX_STEREO 6
+#define GLX_SWAP_INTERVAL_EXT 0x20F1
+#define GLX_TRANSPARENT_ALPHA_VALUE 0x28
+#define GLX_TRANSPARENT_BLUE_VALUE 0x27
+#define GLX_TRANSPARENT_GREEN_VALUE 0x26
+#define GLX_TRANSPARENT_INDEX 0x8009
+#define GLX_TRANSPARENT_INDEX_VALUE 0x24
+#define GLX_TRANSPARENT_RED_VALUE 0x25
+#define GLX_TRANSPARENT_RGB 0x8008
+#define GLX_TRANSPARENT_TYPE 0x23
+#define GLX_TRUE_COLOR 0x8002
+#define GLX_USE_GL 1
+#define GLX_VENDOR 0x1
+#define GLX_VERSION 0x2
+#define GLX_VISUAL_ID 0x800B
+#define GLX_WIDTH 0x801D
+#define GLX_WINDOW 0x8022
+#define GLX_WINDOW_BIT 0x00000001
+#define GLX_X_RENDERABLE 0x8012
+#define GLX_X_VISUAL_TYPE 0x22
+#define __GLX_NUMBER_EVENTS 17
+
+
+#ifndef GLEXT_64_TYPES_DEFINED
+/* This code block is duplicated in glext.h, so must be protected */
+#define GLEXT_64_TYPES_DEFINED
+/* Define int32_t, int64_t, and uint64_t types for UST/MSC */
+/* (as used in the GLX_OML_sync_control extension). */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#include <inttypes.h>
+#elif defined(__sun__) || defined(__digital__)
+#include <inttypes.h>
+#if defined(__STDC__)
+#if defined(__arch64__) || defined(_LP64)
+typedef long int int64_t;
+typedef unsigned long int uint64_t;
+#else
+typedef long long int int64_t;
+typedef unsigned long long int uint64_t;
+#endif /* __arch64__ */
+#endif /* __STDC__ */
+#elif defined( __VMS ) || defined(__sgi)
+#include <inttypes.h>
+#elif defined(__SCO__) || defined(__USLC__)
+#include <stdint.h>
+#elif defined(__UNIXOS2__) || defined(__SOL64__)
+typedef long int int32_t;
+typedef long long int int64_t;
+typedef unsigned long long int uint64_t;
+#elif defined(_WIN32) && defined(__GNUC__)
+#include <stdint.h>
+#elif defined(_WIN32)
+typedef __int32 int32_t;
+typedef __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+#else
+/* Fallback if nothing above works */
+#include <inttypes.h>
+#endif
+#endif
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+#if defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && (__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ > 1060)
+
+#else
+
+#endif
+
+#if defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && (__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ > 1060)
+
+#else
+
+#endif
+
+
+
+
+
+
+
+typedef XID GLXFBConfigID;
+typedef struct __GLXFBConfigRec *GLXFBConfig;
+typedef XID GLXContextID;
+typedef struct __GLXcontextRec *GLXContext;
+typedef XID GLXPixmap;
+typedef XID GLXDrawable;
+typedef XID GLXWindow;
+typedef XID GLXPbuffer;
+typedef void (GLAD_API_PTR *__GLXextFuncPtr)(void);
+typedef XID GLXVideoCaptureDeviceNV;
+typedef unsigned int GLXVideoDeviceNV;
+typedef XID GLXVideoSourceSGIX;
+typedef XID GLXFBConfigIDSGIX;
+typedef struct __GLXFBConfigRec *GLXFBConfigSGIX;
+typedef XID GLXPbufferSGIX;
+typedef struct {
+    int event_type;             /* GLX_DAMAGED or GLX_SAVED */
+    int draw_type;              /* GLX_WINDOW or GLX_PBUFFER */
+    unsigned long serial;       /* # of last request processed by server */
+    Bool send_event;            /* true if this came from a SendEvent request */
+    Display *display;           /* display the event was read from */
+    GLXDrawable drawable;       /* XID of Drawable */
+    unsigned int buffer_mask;   /* mask indicating which buffers are affected */
+    unsigned int aux_buffer;    /* which aux buffer was affected */
+    int x, y;
+    int width, height;
+    int count;                  /* if nonzero, at least this many more */
+} GLXPbufferClobberEvent;
+typedef struct {
+    int type;
+    unsigned long serial;       /* # of last request processed by server */
+    Bool send_event;            /* true if this came from a SendEvent request */
+    Display *display;           /* Display the event was read from */
+    GLXDrawable drawable;       /* drawable on which event was requested in event mask */
+    int event_type;
+    int64_t ust;
+    int64_t msc;
+    int64_t sbc;
+} GLXBufferSwapComplete;
+typedef union __GLXEvent {
+    GLXPbufferClobberEvent glxpbufferclobber;
+    GLXBufferSwapComplete glxbufferswapcomplete;
+    long pad[24];
+} GLXEvent;
+typedef struct {
+    int type;
+    unsigned long serial;
+    Bool send_event;
+    Display *display;
+    int extension;
+    int evtype;
+    GLXDrawable window;
+    Bool stereo_tree;
+} GLXStereoNotifyEventEXT;
+typedef struct {
+    int type;
+    unsigned long serial;   /* # of last request processed by server */
+    Bool send_event;        /* true if this came from a SendEvent request */
+    Display *display;       /* display the event was read from */
+    GLXDrawable drawable;   /* XID of Drawable */
+    int event_type;         /* GLX_DAMAGED_SGIX or GLX_SAVED_SGIX */
+    int draw_type;          /* GLX_WINDOW_SGIX or GLX_PBUFFER_SGIX */
+    unsigned int mask;      /* mask indicating which buffers are affected */
+    int x, y;
+    int width, height;
+    int count;              /* if nonzero, at least this many more */
+} GLXBufferClobberEventSGIX;
+typedef struct {
+    char    pipeName[80]; /* Should be [GLX_HYPERPIPE_PIPE_NAME_LENGTH_SGIX] */
+    int     networkId;
+} GLXHyperpipeNetworkSGIX;
+typedef struct {
+    char    pipeName[80]; /* Should be [GLX_HYPERPIPE_PIPE_NAME_LENGTH_SGIX] */
+    int     channel;
+    unsigned int participationType;
+    int     timeSlice;
+} GLXHyperpipeConfigSGIX;
+typedef struct {
+    char pipeName[80]; /* Should be [GLX_HYPERPIPE_PIPE_NAME_LENGTH_SGIX] */
+    int srcXOrigin, srcYOrigin, srcWidth, srcHeight;
+    int destXOrigin, destYOrigin, destWidth, destHeight;
+} GLXPipeRect;
+typedef struct {
+    char pipeName[80]; /* Should be [GLX_HYPERPIPE_PIPE_NAME_LENGTH_SGIX] */
+    int XOrigin, YOrigin, maxHeight, maxWidth;
+} GLXPipeRectLimits;
+
+
+#define GLX_VERSION_1_0 1
+GLAD_API_CALL int GLAD_GLX_VERSION_1_0;
+#define GLX_VERSION_1_1 1
+GLAD_API_CALL int GLAD_GLX_VERSION_1_1;
+#define GLX_VERSION_1_2 1
+GLAD_API_CALL int GLAD_GLX_VERSION_1_2;
+#define GLX_VERSION_1_3 1
+GLAD_API_CALL int GLAD_GLX_VERSION_1_3;
+#define GLX_VERSION_1_4 1
+GLAD_API_CALL int GLAD_GLX_VERSION_1_4;
+#define GLX_ARB_create_context 1
+GLAD_API_CALL int GLAD_GLX_ARB_create_context;
+#define GLX_ARB_create_context_profile 1
+GLAD_API_CALL int GLAD_GLX_ARB_create_context_profile;
+#define GLX_ARB_get_proc_address 1
+GLAD_API_CALL int GLAD_GLX_ARB_get_proc_address;
+#define GLX_EXT_swap_control 1
+GLAD_API_CALL int GLAD_GLX_EXT_swap_control;
+#define GLX_MESA_swap_control 1
+GLAD_API_CALL int GLAD_GLX_MESA_swap_control;
+#define GLX_SGI_swap_control 1
+GLAD_API_CALL int GLAD_GLX_SGI_swap_control;
+
+
+typedef GLXFBConfig * (GLAD_API_PTR *PFNGLXCHOOSEFBCONFIGPROC)(Display * dpy, int screen, const int * attrib_list, int * nelements);
+typedef XVisualInfo * (GLAD_API_PTR *PFNGLXCHOOSEVISUALPROC)(Display * dpy, int screen, int * attribList);
+typedef void (GLAD_API_PTR *PFNGLXCOPYCONTEXTPROC)(Display * dpy, GLXContext src, GLXContext dst, unsigned long mask);
+typedef GLXContext (GLAD_API_PTR *PFNGLXCREATECONTEXTPROC)(Display * dpy, XVisualInfo * vis, GLXContext shareList, Bool direct);
+typedef GLXContext (GLAD_API_PTR *PFNGLXCREATECONTEXTATTRIBSARBPROC)(Display * dpy, GLXFBConfig config, GLXContext share_context, Bool direct, const int * attrib_list);
+typedef GLXPixmap (GLAD_API_PTR *PFNGLXCREATEGLXPIXMAPPROC)(Display * dpy, XVisualInfo * visual, Pixmap pixmap);
+typedef GLXContext (GLAD_API_PTR *PFNGLXCREATENEWCONTEXTPROC)(Display * dpy, GLXFBConfig config, int render_type, GLXContext share_list, Bool direct);
+typedef GLXPbuffer (GLAD_API_PTR *PFNGLXCREATEPBUFFERPROC)(Display * dpy, GLXFBConfig config, const int * attrib_list);
+typedef GLXPixmap (GLAD_API_PTR *PFNGLXCREATEPIXMAPPROC)(Display * dpy, GLXFBConfig config, Pixmap pixmap, const int * attrib_list);
+typedef GLXWindow (GLAD_API_PTR *PFNGLXCREATEWINDOWPROC)(Display * dpy, GLXFBConfig config, Window win, const int * attrib_list);
+typedef void (GLAD_API_PTR *PFNGLXDESTROYCONTEXTPROC)(Display * dpy, GLXContext ctx);
+typedef void (GLAD_API_PTR *PFNGLXDESTROYGLXPIXMAPPROC)(Display * dpy, GLXPixmap pixmap);
+typedef void (GLAD_API_PTR *PFNGLXDESTROYPBUFFERPROC)(Display * dpy, GLXPbuffer pbuf);
+typedef void (GLAD_API_PTR *PFNGLXDESTROYPIXMAPPROC)(Display * dpy, GLXPixmap pixmap);
+typedef void (GLAD_API_PTR *PFNGLXDESTROYWINDOWPROC)(Display * dpy, GLXWindow win);
+typedef const char * (GLAD_API_PTR *PFNGLXGETCLIENTSTRINGPROC)(Display * dpy, int name);
+typedef int (GLAD_API_PTR *PFNGLXGETCONFIGPROC)(Display * dpy, XVisualInfo * visual, int attrib, int * value);
+typedef GLXContext (GLAD_API_PTR *PFNGLXGETCURRENTCONTEXTPROC)(void);
+typedef Display * (GLAD_API_PTR *PFNGLXGETCURRENTDISPLAYPROC)(void);
+typedef GLXDrawable (GLAD_API_PTR *PFNGLXGETCURRENTDRAWABLEPROC)(void);
+typedef GLXDrawable (GLAD_API_PTR *PFNGLXGETCURRENTREADDRAWABLEPROC)(void);
+typedef int (GLAD_API_PTR *PFNGLXGETFBCONFIGATTRIBPROC)(Display * dpy, GLXFBConfig config, int attribute, int * value);
+typedef GLXFBConfig * (GLAD_API_PTR *PFNGLXGETFBCONFIGSPROC)(Display * dpy, int screen, int * nelements);
+typedef __GLXextFuncPtr (GLAD_API_PTR *PFNGLXGETPROCADDRESSPROC)(const GLubyte * procName);
+typedef __GLXextFuncPtr (GLAD_API_PTR *PFNGLXGETPROCADDRESSARBPROC)(const GLubyte * procName);
+typedef void (GLAD_API_PTR *PFNGLXGETSELECTEDEVENTPROC)(Display * dpy, GLXDrawable draw, unsigned long * event_mask);
+typedef int (GLAD_API_PTR *PFNGLXGETSWAPINTERVALMESAPROC)(void);
+typedef XVisualInfo * (GLAD_API_PTR *PFNGLXGETVISUALFROMFBCONFIGPROC)(Display * dpy, GLXFBConfig config);
+typedef Bool (GLAD_API_PTR *PFNGLXISDIRECTPROC)(Display * dpy, GLXContext ctx);
+typedef Bool (GLAD_API_PTR *PFNGLXMAKECONTEXTCURRENTPROC)(Display * dpy, GLXDrawable draw, GLXDrawable read, GLXContext ctx);
+typedef Bool (GLAD_API_PTR *PFNGLXMAKECURRENTPROC)(Display * dpy, GLXDrawable drawable, GLXContext ctx);
+typedef int (GLAD_API_PTR *PFNGLXQUERYCONTEXTPROC)(Display * dpy, GLXContext ctx, int attribute, int * value);
+typedef void (GLAD_API_PTR *PFNGLXQUERYDRAWABLEPROC)(Display * dpy, GLXDrawable draw, int attribute, unsigned int * value);
+typedef Bool (GLAD_API_PTR *PFNGLXQUERYEXTENSIONPROC)(Display * dpy, int * errorb, int * event);
+typedef const char * (GLAD_API_PTR *PFNGLXQUERYEXTENSIONSSTRINGPROC)(Display * dpy, int screen);
+typedef const char * (GLAD_API_PTR *PFNGLXQUERYSERVERSTRINGPROC)(Display * dpy, int screen, int name);
+typedef Bool (GLAD_API_PTR *PFNGLXQUERYVERSIONPROC)(Display * dpy, int * maj, int * min);
+typedef void (GLAD_API_PTR *PFNGLXSELECTEVENTPROC)(Display * dpy, GLXDrawable draw, unsigned long event_mask);
+typedef void (GLAD_API_PTR *PFNGLXSWAPBUFFERSPROC)(Display * dpy, GLXDrawable drawable);
+typedef void (GLAD_API_PTR *PFNGLXSWAPINTERVALEXTPROC)(Display * dpy, GLXDrawable drawable, int interval);
+typedef int (GLAD_API_PTR *PFNGLXSWAPINTERVALMESAPROC)(unsigned int interval);
+typedef int (GLAD_API_PTR *PFNGLXSWAPINTERVALSGIPROC)(int interval);
+typedef void (GLAD_API_PTR *PFNGLXUSEXFONTPROC)(Font font, int first, int count, int list);
+typedef void (GLAD_API_PTR *PFNGLXWAITGLPROC)(void);
+typedef void (GLAD_API_PTR *PFNGLXWAITXPROC)(void);
+
+GLAD_API_CALL PFNGLXCHOOSEFBCONFIGPROC glad_glXChooseFBConfig;
+#define glXChooseFBConfig glad_glXChooseFBConfig
+GLAD_API_CALL PFNGLXCHOOSEVISUALPROC glad_glXChooseVisual;
+#define glXChooseVisual glad_glXChooseVisual
+GLAD_API_CALL PFNGLXCOPYCONTEXTPROC glad_glXCopyContext;
+#define glXCopyContext glad_glXCopyContext
+GLAD_API_CALL PFNGLXCREATECONTEXTPROC glad_glXCreateContext;
+#define glXCreateContext glad_glXCreateContext
+GLAD_API_CALL PFNGLXCREATECONTEXTATTRIBSARBPROC glad_glXCreateContextAttribsARB;
+#define glXCreateContextAttribsARB glad_glXCreateContextAttribsARB
+GLAD_API_CALL PFNGLXCREATEGLXPIXMAPPROC glad_glXCreateGLXPixmap;
+#define glXCreateGLXPixmap glad_glXCreateGLXPixmap
+GLAD_API_CALL PFNGLXCREATENEWCONTEXTPROC glad_glXCreateNewContext;
+#define glXCreateNewContext glad_glXCreateNewContext
+GLAD_API_CALL PFNGLXCREATEPBUFFERPROC glad_glXCreatePbuffer;
+#define glXCreatePbuffer glad_glXCreatePbuffer
+GLAD_API_CALL PFNGLXCREATEPIXMAPPROC glad_glXCreatePixmap;
+#define glXCreatePixmap glad_glXCreatePixmap
+GLAD_API_CALL PFNGLXCREATEWINDOWPROC glad_glXCreateWindow;
+#define glXCreateWindow glad_glXCreateWindow
+GLAD_API_CALL PFNGLXDESTROYCONTEXTPROC glad_glXDestroyContext;
+#define glXDestroyContext glad_glXDestroyContext
+GLAD_API_CALL PFNGLXDESTROYGLXPIXMAPPROC glad_glXDestroyGLXPixmap;
+#define glXDestroyGLXPixmap glad_glXDestroyGLXPixmap
+GLAD_API_CALL PFNGLXDESTROYPBUFFERPROC glad_glXDestroyPbuffer;
+#define glXDestroyPbuffer glad_glXDestroyPbuffer
+GLAD_API_CALL PFNGLXDESTROYPIXMAPPROC glad_glXDestroyPixmap;
+#define glXDestroyPixmap glad_glXDestroyPixmap
+GLAD_API_CALL PFNGLXDESTROYWINDOWPROC glad_glXDestroyWindow;
+#define glXDestroyWindow glad_glXDestroyWindow
+GLAD_API_CALL PFNGLXGETCLIENTSTRINGPROC glad_glXGetClientString;
+#define glXGetClientString glad_glXGetClientString
+GLAD_API_CALL PFNGLXGETCONFIGPROC glad_glXGetConfig;
+#define glXGetConfig glad_glXGetConfig
+GLAD_API_CALL PFNGLXGETCURRENTCONTEXTPROC glad_glXGetCurrentContext;
+#define glXGetCurrentContext glad_glXGetCurrentContext
+GLAD_API_CALL PFNGLXGETCURRENTDISPLAYPROC glad_glXGetCurrentDisplay;
+#define glXGetCurrentDisplay glad_glXGetCurrentDisplay
+GLAD_API_CALL PFNGLXGETCURRENTDRAWABLEPROC glad_glXGetCurrentDrawable;
+#define glXGetCurrentDrawable glad_glXGetCurrentDrawable
+GLAD_API_CALL PFNGLXGETCURRENTREADDRAWABLEPROC glad_glXGetCurrentReadDrawable;
+#define glXGetCurrentReadDrawable glad_glXGetCurrentReadDrawable
+GLAD_API_CALL PFNGLXGETFBCONFIGATTRIBPROC glad_glXGetFBConfigAttrib;
+#define glXGetFBConfigAttrib glad_glXGetFBConfigAttrib
+GLAD_API_CALL PFNGLXGETFBCONFIGSPROC glad_glXGetFBConfigs;
+#define glXGetFBConfigs glad_glXGetFBConfigs
+GLAD_API_CALL PFNGLXGETPROCADDRESSPROC glad_glXGetProcAddress;
+#define glXGetProcAddress glad_glXGetProcAddress
+GLAD_API_CALL PFNGLXGETPROCADDRESSARBPROC glad_glXGetProcAddressARB;
+#define glXGetProcAddressARB glad_glXGetProcAddressARB
+GLAD_API_CALL PFNGLXGETSELECTEDEVENTPROC glad_glXGetSelectedEvent;
+#define glXGetSelectedEvent glad_glXGetSelectedEvent
+GLAD_API_CALL PFNGLXGETSWAPINTERVALMESAPROC glad_glXGetSwapIntervalMESA;
+#define glXGetSwapIntervalMESA glad_glXGetSwapIntervalMESA
+GLAD_API_CALL PFNGLXGETVISUALFROMFBCONFIGPROC glad_glXGetVisualFromFBConfig;
+#define glXGetVisualFromFBConfig glad_glXGetVisualFromFBConfig
+GLAD_API_CALL PFNGLXISDIRECTPROC glad_glXIsDirect;
+#define glXIsDirect glad_glXIsDirect
+GLAD_API_CALL PFNGLXMAKECONTEXTCURRENTPROC glad_glXMakeContextCurrent;
+#define glXMakeContextCurrent glad_glXMakeContextCurrent
+GLAD_API_CALL PFNGLXMAKECURRENTPROC glad_glXMakeCurrent;
+#define glXMakeCurrent glad_glXMakeCurrent
+GLAD_API_CALL PFNGLXQUERYCONTEXTPROC glad_glXQueryContext;
+#define glXQueryContext glad_glXQueryContext
+GLAD_API_CALL PFNGLXQUERYDRAWABLEPROC glad_glXQueryDrawable;
+#define glXQueryDrawable glad_glXQueryDrawable
+GLAD_API_CALL PFNGLXQUERYEXTENSIONPROC glad_glXQueryExtension;
+#define glXQueryExtension glad_glXQueryExtension
+GLAD_API_CALL PFNGLXQUERYEXTENSIONSSTRINGPROC glad_glXQueryExtensionsString;
+#define glXQueryExtensionsString glad_glXQueryExtensionsString
+GLAD_API_CALL PFNGLXQUERYSERVERSTRINGPROC glad_glXQueryServerString;
+#define glXQueryServerString glad_glXQueryServerString
+GLAD_API_CALL PFNGLXQUERYVERSIONPROC glad_glXQueryVersion;
+#define glXQueryVersion glad_glXQueryVersion
+GLAD_API_CALL PFNGLXSELECTEVENTPROC glad_glXSelectEvent;
+#define glXSelectEvent glad_glXSelectEvent
+GLAD_API_CALL PFNGLXSWAPBUFFERSPROC glad_glXSwapBuffers;
+#define glXSwapBuffers glad_glXSwapBuffers
+GLAD_API_CALL PFNGLXSWAPINTERVALEXTPROC glad_glXSwapIntervalEXT;
+#define glXSwapIntervalEXT glad_glXSwapIntervalEXT
+GLAD_API_CALL PFNGLXSWAPINTERVALMESAPROC glad_glXSwapIntervalMESA;
+#define glXSwapIntervalMESA glad_glXSwapIntervalMESA
+GLAD_API_CALL PFNGLXSWAPINTERVALSGIPROC glad_glXSwapIntervalSGI;
+#define glXSwapIntervalSGI glad_glXSwapIntervalSGI
+GLAD_API_CALL PFNGLXUSEXFONTPROC glad_glXUseXFont;
+#define glXUseXFont glad_glXUseXFont
+GLAD_API_CALL PFNGLXWAITGLPROC glad_glXWaitGL;
+#define glXWaitGL glad_glXWaitGL
+GLAD_API_CALL PFNGLXWAITXPROC glad_glXWaitX;
+#define glXWaitX glad_glXWaitX
+
+
+
+
+
+GLAD_API_CALL int gladLoadGLXUserPtr(Display *display, int screen, GLADuserptrloadfunc load, void *userptr);
+GLAD_API_CALL int gladLoadGLX(Display *display, int screen, GLADloadfunc load);
+
+#ifdef GLAD_GLX
+
+GLAD_API_CALL int gladLoaderLoadGLX(Display *display, int screen);
+
+GLAD_API_CALL void gladLoaderUnloadGLX(void);
+
+#endif
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/thirdparty/glad/glx.c b/thirdparty/glad/glx.c
new file mode 100644
index 0000000000..6391027db2
--- /dev/null
+++ b/thirdparty/glad/glx.c
@@ -0,0 +1,395 @@
+/**
+ * SPDX-License-Identifier: (WTFPL OR CC0-1.0) AND Apache-2.0
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <glad/glx.h>
+
+#ifndef GLAD_IMPL_UTIL_C_
+#define GLAD_IMPL_UTIL_C_
+
+#ifdef _MSC_VER
+#define GLAD_IMPL_UTIL_SSCANF sscanf_s
+#else
+#define GLAD_IMPL_UTIL_SSCANF sscanf
+#endif
+
+#endif /* GLAD_IMPL_UTIL_C_ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+int GLAD_GLX_VERSION_1_0 = 0; /* Feature flags: all start at 0; glad_glx_find_core_glx() sets the VERSION ones, glad_glx_find_extensions() the extension ones. */
+int GLAD_GLX_VERSION_1_1 = 0;
+int GLAD_GLX_VERSION_1_2 = 0;
+int GLAD_GLX_VERSION_1_3 = 0;
+int GLAD_GLX_VERSION_1_4 = 0;
+int GLAD_GLX_ARB_create_context = 0;
+int GLAD_GLX_ARB_create_context_profile = 0;
+int GLAD_GLX_ARB_get_proc_address = 0;
+int GLAD_GLX_EXT_swap_control = 0;
+int GLAD_GLX_MESA_swap_control = 0;
+int GLAD_GLX_SGI_swap_control = 0;
+
+
+
+PFNGLXCHOOSEFBCONFIGPROC glad_glXChooseFBConfig = NULL; /* Entry-point pointers: each stays NULL until assigned during loading (see the glad_glx_load_* routines). */
+PFNGLXCHOOSEVISUALPROC glad_glXChooseVisual = NULL;
+PFNGLXCOPYCONTEXTPROC glad_glXCopyContext = NULL;
+PFNGLXCREATECONTEXTPROC glad_glXCreateContext = NULL;
+PFNGLXCREATECONTEXTATTRIBSARBPROC glad_glXCreateContextAttribsARB = NULL;
+PFNGLXCREATEGLXPIXMAPPROC glad_glXCreateGLXPixmap = NULL;
+PFNGLXCREATENEWCONTEXTPROC glad_glXCreateNewContext = NULL;
+PFNGLXCREATEPBUFFERPROC glad_glXCreatePbuffer = NULL;
+PFNGLXCREATEPIXMAPPROC glad_glXCreatePixmap = NULL;
+PFNGLXCREATEWINDOWPROC glad_glXCreateWindow = NULL;
+PFNGLXDESTROYCONTEXTPROC glad_glXDestroyContext = NULL;
+PFNGLXDESTROYGLXPIXMAPPROC glad_glXDestroyGLXPixmap = NULL;
+PFNGLXDESTROYPBUFFERPROC glad_glXDestroyPbuffer = NULL;
+PFNGLXDESTROYPIXMAPPROC glad_glXDestroyPixmap = NULL;
+PFNGLXDESTROYWINDOWPROC glad_glXDestroyWindow = NULL;
+PFNGLXGETCLIENTSTRINGPROC glad_glXGetClientString = NULL;
+PFNGLXGETCONFIGPROC glad_glXGetConfig = NULL;
+PFNGLXGETCURRENTCONTEXTPROC glad_glXGetCurrentContext = NULL;
+PFNGLXGETCURRENTDISPLAYPROC glad_glXGetCurrentDisplay = NULL;
+PFNGLXGETCURRENTDRAWABLEPROC glad_glXGetCurrentDrawable = NULL;
+PFNGLXGETCURRENTREADDRAWABLEPROC glad_glXGetCurrentReadDrawable = NULL;
+PFNGLXGETFBCONFIGATTRIBPROC glad_glXGetFBConfigAttrib = NULL;
+PFNGLXGETFBCONFIGSPROC glad_glXGetFBConfigs = NULL;
+PFNGLXGETPROCADDRESSPROC glad_glXGetProcAddress = NULL;
+PFNGLXGETPROCADDRESSARBPROC glad_glXGetProcAddressARB = NULL;
+PFNGLXGETSELECTEDEVENTPROC glad_glXGetSelectedEvent = NULL;
+PFNGLXGETSWAPINTERVALMESAPROC glad_glXGetSwapIntervalMESA = NULL;
+PFNGLXGETVISUALFROMFBCONFIGPROC glad_glXGetVisualFromFBConfig = NULL;
+PFNGLXISDIRECTPROC glad_glXIsDirect = NULL;
+PFNGLXMAKECONTEXTCURRENTPROC glad_glXMakeContextCurrent = NULL;
+PFNGLXMAKECURRENTPROC glad_glXMakeCurrent = NULL;
+PFNGLXQUERYCONTEXTPROC glad_glXQueryContext = NULL;
+PFNGLXQUERYDRAWABLEPROC glad_glXQueryDrawable = NULL;
+PFNGLXQUERYEXTENSIONPROC glad_glXQueryExtension = NULL;
+PFNGLXQUERYEXTENSIONSSTRINGPROC glad_glXQueryExtensionsString = NULL;
+PFNGLXQUERYSERVERSTRINGPROC glad_glXQueryServerString = NULL;
+PFNGLXQUERYVERSIONPROC glad_glXQueryVersion = NULL;
+PFNGLXSELECTEVENTPROC glad_glXSelectEvent = NULL;
+PFNGLXSWAPBUFFERSPROC glad_glXSwapBuffers = NULL;
+PFNGLXSWAPINTERVALEXTPROC glad_glXSwapIntervalEXT = NULL;
+PFNGLXSWAPINTERVALMESAPROC glad_glXSwapIntervalMESA = NULL;
+PFNGLXSWAPINTERVALSGIPROC glad_glXSwapIntervalSGI = NULL;
+PFNGLXUSEXFONTPROC glad_glXUseXFont = NULL;
+PFNGLXWAITGLPROC glad_glXWaitGL = NULL;
+PFNGLXWAITXPROC glad_glXWaitX = NULL;
+
+
+static void glad_glx_load_GLX_VERSION_1_0( GLADuserptrloadfunc load, void* userptr) { /* Resolve the GLX 1.0 entry points by name through load(userptr, name). */
+    if(!GLAD_GLX_VERSION_1_0) return; /* No-op unless the 1.0 flag has been set. */
+    glad_glXChooseVisual = (PFNGLXCHOOSEVISUALPROC) load(userptr, "glXChooseVisual");
+    glad_glXCopyContext = (PFNGLXCOPYCONTEXTPROC) load(userptr, "glXCopyContext");
+    glad_glXCreateContext = (PFNGLXCREATECONTEXTPROC) load(userptr, "glXCreateContext");
+    glad_glXCreateGLXPixmap = (PFNGLXCREATEGLXPIXMAPPROC) load(userptr, "glXCreateGLXPixmap");
+    glad_glXDestroyContext = (PFNGLXDESTROYCONTEXTPROC) load(userptr, "glXDestroyContext");
+    glad_glXDestroyGLXPixmap = (PFNGLXDESTROYGLXPIXMAPPROC) load(userptr, "glXDestroyGLXPixmap");
+    glad_glXGetConfig = (PFNGLXGETCONFIGPROC) load(userptr, "glXGetConfig");
+    glad_glXGetCurrentContext = (PFNGLXGETCURRENTCONTEXTPROC) load(userptr, "glXGetCurrentContext");
+    glad_glXGetCurrentDrawable = (PFNGLXGETCURRENTDRAWABLEPROC) load(userptr, "glXGetCurrentDrawable");
+    glad_glXIsDirect = (PFNGLXISDIRECTPROC) load(userptr, "glXIsDirect");
+    glad_glXMakeCurrent = (PFNGLXMAKECURRENTPROC) load(userptr, "glXMakeCurrent");
+    glad_glXQueryExtension = (PFNGLXQUERYEXTENSIONPROC) load(userptr, "glXQueryExtension");
+    glad_glXQueryVersion = (PFNGLXQUERYVERSIONPROC) load(userptr, "glXQueryVersion");
+    glad_glXSwapBuffers = (PFNGLXSWAPBUFFERSPROC) load(userptr, "glXSwapBuffers");
+    glad_glXUseXFont = (PFNGLXUSEXFONTPROC) load(userptr, "glXUseXFont");
+    glad_glXWaitGL = (PFNGLXWAITGLPROC) load(userptr, "glXWaitGL");
+    glad_glXWaitX = (PFNGLXWAITXPROC) load(userptr, "glXWaitX");
+}
+static void glad_glx_load_GLX_VERSION_1_1( GLADuserptrloadfunc load, void* userptr) { /* Resolve the GLX 1.1 string-query entry points through load(userptr, name). */
+    if(!GLAD_GLX_VERSION_1_1) return; /* No-op unless the 1.1 flag has been set. */
+    glad_glXGetClientString = (PFNGLXGETCLIENTSTRINGPROC) load(userptr, "glXGetClientString");
+    glad_glXQueryExtensionsString = (PFNGLXQUERYEXTENSIONSSTRINGPROC) load(userptr, "glXQueryExtensionsString");
+    glad_glXQueryServerString = (PFNGLXQUERYSERVERSTRINGPROC) load(userptr, "glXQueryServerString");
+}
+static void glad_glx_load_GLX_VERSION_1_2( GLADuserptrloadfunc load, void* userptr) { /* Resolve the single GLX 1.2 entry point through load(userptr, name). */
+    if(!GLAD_GLX_VERSION_1_2) return; /* No-op unless the 1.2 flag has been set. */
+    glad_glXGetCurrentDisplay = (PFNGLXGETCURRENTDISPLAYPROC) load(userptr, "glXGetCurrentDisplay");
+}
+static void glad_glx_load_GLX_VERSION_1_3( GLADuserptrloadfunc load, void* userptr) { /* Resolve the GLX 1.3 entry points by name through load(userptr, name). */
+    if(!GLAD_GLX_VERSION_1_3) return; /* No-op unless the 1.3 flag has been set. */
+    glad_glXChooseFBConfig = (PFNGLXCHOOSEFBCONFIGPROC) load(userptr, "glXChooseFBConfig");
+    glad_glXCreateNewContext = (PFNGLXCREATENEWCONTEXTPROC) load(userptr, "glXCreateNewContext");
+    glad_glXCreatePbuffer = (PFNGLXCREATEPBUFFERPROC) load(userptr, "glXCreatePbuffer");
+    glad_glXCreatePixmap = (PFNGLXCREATEPIXMAPPROC) load(userptr, "glXCreatePixmap");
+    glad_glXCreateWindow = (PFNGLXCREATEWINDOWPROC) load(userptr, "glXCreateWindow");
+    glad_glXDestroyPbuffer = (PFNGLXDESTROYPBUFFERPROC) load(userptr, "glXDestroyPbuffer");
+    glad_glXDestroyPixmap = (PFNGLXDESTROYPIXMAPPROC) load(userptr, "glXDestroyPixmap");
+    glad_glXDestroyWindow = (PFNGLXDESTROYWINDOWPROC) load(userptr, "glXDestroyWindow");
+    glad_glXGetCurrentReadDrawable = (PFNGLXGETCURRENTREADDRAWABLEPROC) load(userptr, "glXGetCurrentReadDrawable");
+    glad_glXGetFBConfigAttrib = (PFNGLXGETFBCONFIGATTRIBPROC) load(userptr, "glXGetFBConfigAttrib");
+    glad_glXGetFBConfigs = (PFNGLXGETFBCONFIGSPROC) load(userptr, "glXGetFBConfigs");
+    glad_glXGetSelectedEvent = (PFNGLXGETSELECTEDEVENTPROC) load(userptr, "glXGetSelectedEvent");
+    glad_glXGetVisualFromFBConfig = (PFNGLXGETVISUALFROMFBCONFIGPROC) load(userptr, "glXGetVisualFromFBConfig");
+    glad_glXMakeContextCurrent = (PFNGLXMAKECONTEXTCURRENTPROC) load(userptr, "glXMakeContextCurrent");
+    glad_glXQueryContext = (PFNGLXQUERYCONTEXTPROC) load(userptr, "glXQueryContext");
+    glad_glXQueryDrawable = (PFNGLXQUERYDRAWABLEPROC) load(userptr, "glXQueryDrawable");
+    glad_glXSelectEvent = (PFNGLXSELECTEVENTPROC) load(userptr, "glXSelectEvent");
+}
+static void glad_glx_load_GLX_VERSION_1_4( GLADuserptrloadfunc load, void* userptr) { /* Resolve the single GLX 1.4 entry point through load(userptr, name). */
+    if(!GLAD_GLX_VERSION_1_4) return; /* No-op unless the 1.4 flag has been set. */
+    glad_glXGetProcAddress = (PFNGLXGETPROCADDRESSPROC) load(userptr, "glXGetProcAddress");
+}
+static void glad_glx_load_GLX_ARB_create_context( GLADuserptrloadfunc load, void* userptr) { /* Resolve glXCreateContextAttribsARB through load(userptr, name). */
+    if(!GLAD_GLX_ARB_create_context) return; /* No-op unless the extension flag has been set. */
+    glad_glXCreateContextAttribsARB = (PFNGLXCREATECONTEXTATTRIBSARBPROC) load(userptr, "glXCreateContextAttribsARB");
+}
+static void glad_glx_load_GLX_ARB_get_proc_address( GLADuserptrloadfunc load, void* userptr) { /* Resolve glXGetProcAddressARB through load(userptr, name). */
+    if(!GLAD_GLX_ARB_get_proc_address) return; /* No-op unless the extension flag has been set. */
+    glad_glXGetProcAddressARB = (PFNGLXGETPROCADDRESSARBPROC) load(userptr, "glXGetProcAddressARB");
+}
+static void glad_glx_load_GLX_EXT_swap_control( GLADuserptrloadfunc load, void* userptr) { /* Resolve glXSwapIntervalEXT through load(userptr, name). */
+    if(!GLAD_GLX_EXT_swap_control) return; /* No-op unless the extension flag has been set. */
+    glad_glXSwapIntervalEXT = (PFNGLXSWAPINTERVALEXTPROC) load(userptr, "glXSwapIntervalEXT");
+}
+static void glad_glx_load_GLX_MESA_swap_control( GLADuserptrloadfunc load, void* userptr) { /* Resolve the MESA swap-interval getter and setter through load(userptr, name). */
+    if(!GLAD_GLX_MESA_swap_control) return; /* No-op unless the extension flag has been set. */
+    glad_glXGetSwapIntervalMESA = (PFNGLXGETSWAPINTERVALMESAPROC) load(userptr, "glXGetSwapIntervalMESA");
+    glad_glXSwapIntervalMESA = (PFNGLXSWAPINTERVALMESAPROC) load(userptr, "glXSwapIntervalMESA");
+}
+static void glad_glx_load_GLX_SGI_swap_control( GLADuserptrloadfunc load, void* userptr) { /* Resolve glXSwapIntervalSGI through load(userptr, name). */
+    if(!GLAD_GLX_SGI_swap_control) return; /* No-op unless the extension flag has been set. */
+    glad_glXSwapIntervalSGI = (PFNGLXSWAPINTERVALSGIPROC) load(userptr, "glXSwapIntervalSGI");
+}
+
+
+
+static int glad_glx_has_extension(Display *display, int screen, const char *ext) { /* Returns 1 if ext is a whole space-delimited token of glXQueryExtensionsString(display, screen), else 0. */
+#ifndef GLX_VERSION_1_1 /* Without the GLX 1.1 definitions the lookup is compiled out and the result is always 0. */
+    GLAD_UNUSED(display);
+    GLAD_UNUSED(screen);
+    GLAD_UNUSED(ext);
+#else
+    const char *terminator;
+    const char *loc;
+    const char *extensions;
+    const char *start; /* Fixed beginning of the list; `extensions` below is the moving search cursor. */
+    if (glXQueryExtensionsString == NULL) { /* Pointer is NULL until the GLX 1.1 loader has assigned it. */
+        return 0;
+    }
+
+    extensions = glXQueryExtensionsString(display, screen);
+    start = extensions;
+    if(extensions == NULL || ext == NULL || *ext == '\0') { /* An empty name can never be a token and would make the scan below loop forever. */
+        return 0;
+    }
+
+    while(1) {
+        loc = strstr(extensions, ext);
+        if(loc == NULL)
+            break;
+
+        terminator = loc + strlen(ext);
+        if((loc == start || *(loc - 1) == ' ') && /* Compare with the list start, not the cursor: a hit right after a rejected one is preceded by a non-space. */
+            (*terminator == ' ' || *terminator == '\0')) {
+            return 1;
+        }
+        extensions = terminator; /* Substring hit only; keep searching after it. */
+    }
+#endif
+
+    return 0;
+}
+
+static GLADapiproc glad_glx_get_proc_from_userptr(void *userptr, const char* name) { /* Adapter used by gladLoadGLX: userptr carries a name-only loader function. */
+    return (GLAD_GNUC_EXTENSION (GLADapiproc (*)(const char *name)) userptr)(name); /* Cast userptr back to a function pointer and call it with name. */
+}
+
+static int glad_glx_find_extensions(Display *display, int screen) { /* Set each GLAD_GLX_<extension> flag from glad_glx_has_extension(). */
+    GLAD_GLX_ARB_create_context = glad_glx_has_extension(display, screen, "GLX_ARB_create_context");
+    GLAD_GLX_ARB_create_context_profile = glad_glx_has_extension(display, screen, "GLX_ARB_create_context_profile");
+    GLAD_GLX_ARB_get_proc_address = glad_glx_has_extension(display, screen, "GLX_ARB_get_proc_address");
+    GLAD_GLX_EXT_swap_control = glad_glx_has_extension(display, screen, "GLX_EXT_swap_control");
+    GLAD_GLX_MESA_swap_control = glad_glx_has_extension(display, screen, "GLX_MESA_swap_control");
+    GLAD_GLX_SGI_swap_control = glad_glx_has_extension(display, screen, "GLX_SGI_swap_control");
+    return 1; /* Always reports success, whether or not any extension was found. */
+}
+
+static int glad_glx_find_core_glx(Display **display, int *screen) { /* Query the GLX version, set the GLAD_GLX_VERSION_1_x flags, and return it as GLAD_MAKE_VERSION(major, minor). */
+    int major = 0, minor = 0;
+    if(*display == NULL) { /* No display supplied by the caller. */
+#ifdef GLAD_GLX_NO_X11
+        GLAD_UNUSED(screen);
+        return 0; /* Built without X11: cannot open a display here. */
+#else
+        *display = XOpenDisplay(0); /* NOTE(review): no matching XCloseDisplay is visible in this file - confirm who owns this connection. */
+        if (*display == NULL) {
+            return 0;
+        }
+        *screen = XScreenNumberOfScreen(XDefaultScreenOfDisplay(*display)); /* Overwrites the caller's screen with the default screen of the opened display. */
+#endif
+    }
+    glXQueryVersion(*display, &major, &minor); /* Return value is ignored; major/minor keep their 0 initialisers if the call writes nothing. */
+    GLAD_GLX_VERSION_1_0 = (major == 1 && minor >= 0) || major > 1;
+    GLAD_GLX_VERSION_1_1 = (major == 1 && minor >= 1) || major > 1;
+    GLAD_GLX_VERSION_1_2 = (major == 1 && minor >= 2) || major > 1;
+    GLAD_GLX_VERSION_1_3 = (major == 1 && minor >= 3) || major > 1;
+    GLAD_GLX_VERSION_1_4 = (major == 1 && minor >= 4) || major > 1;
+    return GLAD_MAKE_VERSION(major, minor);
+}
+
+int gladLoadGLXUserPtr(Display *display, int screen, GLADuserptrloadfunc load, void *userptr) { /* Load all GLX core and extension pointers via load(userptr, name); returns the detected version, 0 on failure. */
+    int version;
+    glXQueryVersion = (PFNGLXQUERYVERSIONPROC) load(userptr, "glXQueryVersion"); /* Bootstrapped first: glad_glx_find_core_glx() calls it to learn the version. */
+    if(glXQueryVersion == NULL) return 0;
+    version = glad_glx_find_core_glx(&display, &screen); /* May replace the local display/screen when display is NULL. */
+
+    glad_glx_load_GLX_VERSION_1_0(load, userptr); /* Each loader returns immediately unless its version flag was set above. */
+    glad_glx_load_GLX_VERSION_1_1(load, userptr);
+    glad_glx_load_GLX_VERSION_1_2(load, userptr);
+    glad_glx_load_GLX_VERSION_1_3(load, userptr);
+    glad_glx_load_GLX_VERSION_1_4(load, userptr);
+
+    if (!glad_glx_find_extensions(display, screen)) return 0; /* Runs after the 1.1 loader because the extension check needs glXQueryExtensionsString. */
+    glad_glx_load_GLX_ARB_create_context(load, userptr);
+    glad_glx_load_GLX_ARB_get_proc_address(load, userptr);
+    glad_glx_load_GLX_EXT_swap_control(load, userptr);
+    glad_glx_load_GLX_MESA_swap_control(load, userptr);
+    glad_glx_load_GLX_SGI_swap_control(load, userptr);
+
+
+    return version;
+}
+
+int gladLoadGLX(Display *display, int screen, GLADloadfunc load) { /* Convenience wrapper for a name-only loader callback. */
+    return gladLoadGLXUserPtr(display, screen, glad_glx_get_proc_from_userptr, GLAD_GNUC_EXTENSION (void*) load); /* load travels in the userptr slot and is invoked by glad_glx_get_proc_from_userptr. */
+}
+
+ 
+
+#ifdef GLAD_GLX
+
+#ifndef GLAD_LOADER_LIBRARY_C_
+#define GLAD_LOADER_LIBRARY_C_
+
+#include <stddef.h>
+#include <stdlib.h>
+
+#if GLAD_PLATFORM_WIN32
+#include <windows.h>
+#else
+#include <dlfcn.h>
+#endif
+
+
+static void* glad_get_dlopen_handle(const char *lib_names[], int length) { /* Try each of the `length` library names in order; return the first handle that opens, or NULL. */
+    void *handle = NULL;
+    int i;
+
+    for (i = 0; i < length; ++i) {
+#if GLAD_PLATFORM_WIN32
+  #if GLAD_PLATFORM_UWP
+        size_t buffer_size = (strlen(lib_names[i]) + 1) * sizeof(WCHAR); /* Size in bytes, as malloc expects. */
+        LPWSTR buffer = (LPWSTR) malloc(buffer_size);
+        if (buffer != NULL) {
+            int ret = MultiByteToWideChar(CP_ACP, 0, lib_names[i], -1, buffer, (int) (buffer_size / sizeof(WCHAR))); /* cchWideChar is a count of WCHARs, not bytes. */
+            if (ret != 0) {
+                handle = (void*) LoadPackagedLibrary(buffer, 0);
+            }
+            free((void*) buffer);
+        }
+  #else
+        handle = (void*) LoadLibraryA(lib_names[i]);
+  #endif
+#else
+        handle = dlopen(lib_names[i], RTLD_LAZY | RTLD_LOCAL);
+#endif
+        if (handle != NULL) {
+            return handle;
+        }
+    }
+
+    return NULL;
+}
+
+static void glad_close_dlopen_handle(void* handle) { /* Release a library handle; a NULL handle is ignored. */
+    if (handle != NULL) {
+#if GLAD_PLATFORM_WIN32
+        FreeLibrary((HMODULE) handle);
+#else
+        dlclose(handle);
+#endif
+    }
+}
+
+static GLADapiproc glad_dlsym_handle(void* handle, const char *name) { /* Look up symbol `name` in `handle`; returns NULL when handle is NULL. */
+    if (handle == NULL) {
+        return NULL;
+    }
+
+#if GLAD_PLATFORM_WIN32
+    return (GLADapiproc) GetProcAddress((HMODULE) handle, name);
+#else
+    return GLAD_GNUC_EXTENSION (GLADapiproc) dlsym(handle, name);
+#endif
+}
+
+#endif /* GLAD_LOADER_LIBRARY_C_ */
+
+typedef void* (GLAD_API_PTR *GLADglxprocaddrfunc)(const char*);
+
+static GLADapiproc glad_glx_get_proc(void *userptr, const char *name) { /* Loader callback for gladLoaderLoadGLX: userptr holds the glXGetProcAddressARB pointer. */
+    return GLAD_GNUC_EXTENSION ((GLADapiproc (*)(const char *name)) userptr)(name); /* Cast userptr back to a function pointer and call it with name. */
+}
+
+static void* _glx_handle;
+
+static void* glad_glx_dlopen_handle(void) { /* Return the cached libGL handle, opening it on first use; NULL if no candidate name loads. */
+    static const char *NAMES[] = { /* Candidate library names, tried in this order. */
+#if defined __CYGWIN__
+        "libGL-1.so",
+#endif
+        "libGL.so.1",
+        "libGL.so"
+    };
+
+    if (_glx_handle == NULL) { /* Only open the library when no handle is cached in _glx_handle. */
+        _glx_handle = glad_get_dlopen_handle(NAMES, sizeof(NAMES) / sizeof(NAMES[0]));
+    }
+
+    return _glx_handle;
+}
+
+int gladLoaderLoadGLX(Display *display, int screen) { /* Open libGL, fetch glXGetProcAddressARB from it and load GLX through that; returns the version, or 0 on failure. */
+    int version = 0;
+    void *handle = NULL;
+    int did_load = 0;
+    GLADglxprocaddrfunc loader;
+
+    did_load = _glx_handle == NULL; /* Remember whether this call is the one that opens the library. */
+    handle = glad_glx_dlopen_handle();
+    if (handle != NULL) {
+        loader = (GLADglxprocaddrfunc) glad_dlsym_handle(handle, "glXGetProcAddressARB");
+        if (loader != NULL) {
+            version = gladLoadGLXUserPtr(display, screen, glad_glx_get_proc, GLAD_GNUC_EXTENSION (void*) loader);
+        }
+
+        if (!version && did_load) { /* Loading failed: close the handle only if this call opened it. */
+            gladLoaderUnloadGLX();
+        }
+    }
+
+    return version;
+}
+
+
+void gladLoaderUnloadGLX(void) { /* Close the cached libGL handle, if any; spelled (void) to match the prototype in glx.h. */
+    if (_glx_handle != NULL) {
+        glad_close_dlopen_handle(_glx_handle);
+        _glx_handle = NULL; /* Reset so a later gladLoaderLoadGLX() reopens the library. */
+    }
+}
+
+#endif /* GLAD_GLX */
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/thirdparty/libtheora/LICENSE b/thirdparty/libtheora/LICENSE
index 5e5ec08469..97e8431790 100644
--- a/thirdparty/libtheora/LICENSE
+++ b/thirdparty/libtheora/LICENSE
@@ -4,13 +4,13 @@ In addition to and irrespective of the copyright license associated
 with this software, On2 Technologies, Inc. makes the following statement
 regarding technology used in this software:
 
-  On2 represents and warrants that it shall not assert any rights 
+  On2 represents and warrants that it shall not assert any rights
   relating to infringement of On2's registered patents, nor initiate
   any litigation asserting such rights, against any person who, or
-  entity which utilizes the On2 VP3 Codec Software, including any 
-  use, distribution, and sale of said Software; which make changes, 
+  entity which utilizes the On2 VP3 Codec Software, including any
+  use, distribution, and sale of said Software; which make changes,
   modifications, and improvements in said Software; and to use,
-  distribute, and sell said changes as well as applications for other 
+  distribute, and sell said changes as well as applications for other
   fields of use.
 
 This reference implementation is originally derived from the On2 VP3
diff --git a/thirdparty/libtheora/analyze.c b/thirdparty/libtheora/analyze.c
index af01b60dff..19d7612d23 100644
--- a/thirdparty/libtheora/analyze.c
+++ b/thirdparty/libtheora/analyze.c
@@ -18,12 +18,12 @@
 #include <string.h>
 #include "encint.h"
 #include "modedec.h"
+#if defined(OC_COLLECT_METRICS)
+# include "collect.c"
+#endif
 
 
 
-typedef struct oc_fr_state           oc_fr_state;
-typedef struct oc_qii_state          oc_qii_state;
-typedef struct oc_enc_pipeline_state oc_enc_pipeline_state;
 typedef struct oc_rd_metric          oc_rd_metric;
 typedef struct oc_mode_choice        oc_mode_choice;
 
@@ -42,7 +42,7 @@ typedef struct oc_mode_choice        oc_mode_choice;
   This is the inverse of the equivalent table OC_MODE_ALPHABETS in the
    decoder.*/
 static const unsigned char OC_MODE_RANKS[7][OC_NMODES]={
-  /*Last MV dominates.*/ 
+  /*Last MV dominates.*/
   /*L P M N I G GM 4*/
   {3,4,2,0,1,5,6,7},
   /*L P N M I G GM 4*/
@@ -87,6 +87,29 @@ static void oc_mode_scheme_chooser_reset(oc_mode_scheme_chooser *_chooser){
   }
 }
 
+/*Return the OC_MODE_BITS entry for coding _mb_mode in scheme _scheme; _chooser is only read.*/
+static int oc_mode_scheme_chooser_scheme_mb_cost(
+ const oc_mode_scheme_chooser *_chooser,int _scheme,int _mb_mode){
+  int codebook;
+  int ri;
+  codebook=_scheme+1>>3; /*Parsed as (_scheme+1)>>3: 0 for schemes 0...6, 1 for scheme 7.*/
+  /*For any scheme except 0, we can just use the bit cost of the mode's rank
+     in that scheme.*/
+  ri=_chooser->mode_ranks[_scheme][_mb_mode];
+  if(_scheme==0){
+    int mc;
+    /*For scheme 0, incrementing the mode count could potentially change the
+       mode's rank.
+      Find the index where the mode would be moved to in the optimal list,
+       and use its bit cost instead of the one for the mode's current
+       position in the list.*/
+    /*We don't actually reorder the list; this is for computing opportunity
+       cost, not an update.*/
+    mc=_chooser->mode_counts[_mb_mode];
+    while(ri>0&&mc>=_chooser->mode_counts[_chooser->scheme0_list[ri-1]])ri--; /*Move up while the current count is at least that of the mode ranked just above.*/
+  }
+  return OC_MODE_BITS[codebook][ri];
+}
 
 /*This is the real purpose of this data structure: not actually selecting a
    mode scheme, but estimating the cost of coding a given mode given all the
@@ -108,46 +131,32 @@ static int oc_mode_scheme_chooser_cost(oc_mode_scheme_chooser *_chooser,
   int best_bits;
   int mode_bits;
   int si;
-  int scheme_bits;
+  int scheme0_bits;
+  int scheme1_bits;
   scheme0=_chooser->scheme_list[0];
   scheme1=_chooser->scheme_list[1];
-  best_bits=_chooser->scheme_bits[scheme0];
-  mode_bits=OC_MODE_BITS[scheme0+1>>3][_chooser->mode_ranks[scheme0][_mb_mode]];
+  scheme0_bits=_chooser->scheme_bits[scheme0];
+  scheme1_bits=_chooser->scheme_bits[scheme1];
+  mode_bits=oc_mode_scheme_chooser_scheme_mb_cost(_chooser,scheme0,_mb_mode);
   /*Typical case: If the difference between the best scheme and the next best
      is greater than 6 bits, then adding just one mode cannot change which
      scheme we use.*/
-  if(_chooser->scheme_bits[scheme1]-best_bits>6)return mode_bits;
+  if(scheme1_bits-scheme0_bits>6)return mode_bits;
   /*Otherwise, check to see if adding this mode selects a different scheme as
      the best.*/
   si=1;
-  best_bits+=mode_bits;
+  best_bits=scheme0_bits+mode_bits;
   do{
-    /*For any scheme except 0, we can just use the bit cost of the mode's rank
-       in that scheme.*/
-    if(scheme1!=0){
-      scheme_bits=_chooser->scheme_bits[scheme1]+
-       OC_MODE_BITS[scheme1+1>>3][_chooser->mode_ranks[scheme1][_mb_mode]];
-    }
-    else{
-      int ri;
-      /*For scheme 0, incrementing the mode count could potentially change the
-         mode's rank.
-        Find the index where the mode would be moved to in the optimal list,
-         and use its bit cost instead of the one for the mode's current
-         position in the list.*/
-      /*We don't recompute scheme bits; this is computing opportunity cost, not
-         an update.*/
-      for(ri=_chooser->scheme0_ranks[_mb_mode];ri>0&&
-       _chooser->mode_counts[_mb_mode]>=
-       _chooser->mode_counts[_chooser->scheme0_list[ri-1]];ri--);
-      scheme_bits=_chooser->scheme_bits[0]+OC_MODE_BITS[0][ri];
-    }
-    if(scheme_bits<best_bits)best_bits=scheme_bits;
+    int cur_bits;
+    cur_bits=scheme1_bits+
+     oc_mode_scheme_chooser_scheme_mb_cost(_chooser,scheme1,_mb_mode);
+    if(cur_bits<best_bits)best_bits=cur_bits;
     if(++si>=8)break;
     scheme1=_chooser->scheme_list[si];
+    scheme1_bits=_chooser->scheme_bits[scheme1];
   }
-  while(_chooser->scheme_bits[scheme1]-_chooser->scheme_bits[scheme0]<=6);
-  return best_bits-_chooser->scheme_bits[scheme0];
+  while(scheme1_bits-scheme0_bits<=6);
+  return best_bits-scheme0_bits;
 }
 
 /*Incrementally update the mode counts and per-scheme bit counts and re-order
@@ -211,22 +220,6 @@ static int oc_block_run_bits(int _run_count){
 
 
 
-/*State to track coded block flags and their bit cost.*/
-struct oc_fr_state{
-  ptrdiff_t  bits;
-  unsigned   sb_partial_count:16;
-  unsigned   sb_full_count:16;
-  unsigned   b_coded_count_prev:8;
-  unsigned   b_coded_count:8;
-  unsigned   b_count:8;
-  signed int sb_partial:2;
-  signed int sb_full:2;
-  signed int b_coded_prev:2;
-  signed int b_coded:2;
-};
-
-
-
 static void oc_fr_state_init(oc_fr_state *_fr){
   _fr->bits=0;
   _fr->sb_partial_count=0;
@@ -234,6 +227,8 @@ static void oc_fr_state_init(oc_fr_state *_fr){
   _fr->b_coded_count_prev=0;
   _fr->b_coded_count=0;
   _fr->b_count=0;
+  _fr->sb_prefer_partial=0;
+  _fr->sb_bits=0;
   _fr->sb_partial=-1;
   _fr->sb_full=-1;
   _fr->b_coded_prev=-1;
@@ -241,14 +236,14 @@ static void oc_fr_state_init(oc_fr_state *_fr){
 }
 
 
-static void oc_fr_state_advance_sb(oc_fr_state *_fr,
+static int oc_fr_state_sb_cost(const oc_fr_state *_fr,
  int _sb_partial,int _sb_full){
-  ptrdiff_t bits;
-  int       sb_partial_count;
-  int       sb_full_count;
-  bits=_fr->bits;
+  int bits;
+  int sb_partial_count;
+  int sb_full_count;
+  bits=0;
+  sb_partial_count=_fr->sb_partial_count;
   /*Extend the sb_partial run, or start a new one.*/
-  sb_partial_count=_fr->sb_partial;
   if(_fr->sb_partial==_sb_partial){
     if(sb_partial_count>=4129){
       bits++;
@@ -257,8 +252,7 @@ static void oc_fr_state_advance_sb(oc_fr_state *_fr,
     else bits-=oc_sb_run_bits(sb_partial_count);
   }
   else sb_partial_count=0;
-  sb_partial_count++;
-  bits+=oc_sb_run_bits(sb_partial_count);
+  bits+=oc_sb_run_bits(++sb_partial_count);
   if(!_sb_partial){
     /*Extend the sb_full run, or start a new one.*/
     sb_full_count=_fr->sb_full_count;
@@ -270,98 +264,161 @@ static void oc_fr_state_advance_sb(oc_fr_state *_fr,
       else bits-=oc_sb_run_bits(sb_full_count);
     }
     else sb_full_count=0;
+    bits+=oc_sb_run_bits(++sb_full_count);
+  }
+  return bits;
+}
+
+static void oc_fr_state_advance_sb(oc_fr_state *_fr,
+ int _sb_partial,int _sb_full){
+  int sb_partial_count;
+  int sb_full_count;
+  sb_partial_count=_fr->sb_partial_count;
+  if(_fr->sb_partial!=_sb_partial||sb_partial_count>=4129)sb_partial_count=0;
+  sb_partial_count++;
+  if(!_sb_partial){
+    sb_full_count=_fr->sb_full_count;
+    if(_fr->sb_full!=_sb_full||sb_full_count>=4129)sb_full_count=0;
     sb_full_count++;
-    bits+=oc_sb_run_bits(sb_full_count);
-    _fr->sb_full=_sb_full;
     _fr->sb_full_count=sb_full_count;
+    _fr->sb_full=_sb_full;
+    /*Roll back the partial block state.*/
+    _fr->b_coded=_fr->b_coded_prev;
+    _fr->b_coded_count=_fr->b_coded_count_prev;
+  }
+  else{
+    /*Commit back the partial block state.*/
+    _fr->b_coded_prev=_fr->b_coded;
+    _fr->b_coded_count_prev=_fr->b_coded_count;
   }
-  _fr->bits=bits;
-  _fr->sb_partial=_sb_partial;
   _fr->sb_partial_count=sb_partial_count;
+  _fr->sb_partial=_sb_partial;
+  _fr->b_count=0;
+  _fr->sb_prefer_partial=0;
+  _fr->sb_bits=0;
 }
 
-/*Flush any outstanding block flags for a SB (e.g., one with fewer than 16
-   blocks).*/
+/*Commit the state of the current super block and advance to the next.*/
 static void oc_fr_state_flush_sb(oc_fr_state *_fr){
-  ptrdiff_t bits;
-  int       sb_partial;
-  int       sb_full=sb_full;
-  int       b_coded_count;
-  int       b_coded;
-  int       b_count;
+  int sb_partial;
+  int sb_full;
+  int b_coded_count;
+  int b_count;
   b_count=_fr->b_count;
-  if(b_count>0){
-    bits=_fr->bits;
-    b_coded=_fr->b_coded;
-    b_coded_count=_fr->b_coded_count;
-    if(b_coded_count>=b_count){
-      /*This SB was fully coded/uncoded; roll back the partial block flags.*/
-      bits-=oc_block_run_bits(b_coded_count);
-      if(b_coded_count>b_count)bits+=oc_block_run_bits(b_coded_count-b_count);
-      sb_partial=0;
-      sb_full=b_coded;
-      b_coded=_fr->b_coded_prev;
-      b_coded_count=_fr->b_coded_count_prev;
-    }
-    else{
-      /*It was partially coded.*/
-      sb_partial=1;
-      /*sb_full is unused.*/
+  b_coded_count=_fr->b_coded_count;
+  sb_full=_fr->b_coded;
+  sb_partial=b_coded_count<b_count;
+  if(!sb_partial){
+    /*If the super block is fully coded/uncoded...*/
+    if(_fr->sb_prefer_partial){
+      /*So far coding this super block as partial was cheaper anyway.*/
+      if(b_coded_count>15||_fr->b_coded_prev<0){
+        int sb_bits;
+        /*If the block run is too long, this will limit how far it can be
+           extended into the next partial super block.
+          If we need to extend it farther, we don't want to have to roll all
+           the way back here (since there could be many full SBs between now
+           and then), so we disallow this.
+          Similarly, if this is the start of a stripe, we don't know the
+           length of the outstanding block run from the previous stripe.*/
+        sb_bits=oc_fr_state_sb_cost(_fr,sb_partial,sb_full);
+        _fr->bits+=sb_bits-_fr->sb_bits;
+        _fr->sb_bits=sb_bits;
+      }
+      else sb_partial=1;
     }
-    _fr->bits=bits;
-    _fr->b_coded_count=b_coded_count;
-    _fr->b_coded_count_prev=b_coded_count;
-    _fr->b_count=0;
-    _fr->b_coded=b_coded;
-    _fr->b_coded_prev=b_coded;
-    oc_fr_state_advance_sb(_fr,sb_partial,sb_full);
   }
+  oc_fr_state_advance_sb(_fr,sb_partial,sb_full);
 }
 
 static void oc_fr_state_advance_block(oc_fr_state *_fr,int _b_coded){
   ptrdiff_t bits;
+  int       sb_bits;
   int       b_coded_count;
   int       b_count;
-  int       sb_partial;
-  int       sb_full=sb_full;
-  bits=_fr->bits;
-  /*Extend the b_coded run, or start a new one.*/
+  int       sb_prefer_partial;
+  sb_bits=_fr->sb_bits;
+  bits=_fr->bits-sb_bits;
+  b_count=_fr->b_count;
   b_coded_count=_fr->b_coded_count;
-  if(_fr->b_coded==_b_coded)bits-=oc_block_run_bits(b_coded_count);
-  else b_coded_count=0;
-  b_coded_count++;
-  b_count=_fr->b_count+1;
-  if(b_count>=16){
-    /*We finished a superblock.*/
-    if(b_coded_count>=16){
-      /*It was fully coded/uncoded; roll back the partial block flags.*/
-      if(b_coded_count>16)bits+=oc_block_run_bits(b_coded_count-16);
-      sb_partial=0;
-      sb_full=_b_coded;
-      _b_coded=_fr->b_coded_prev;
-      b_coded_count=_fr->b_coded_count_prev;
+  sb_prefer_partial=_fr->sb_prefer_partial;
+  if(b_coded_count>=b_count){
+    int sb_partial_bits;
+    /*This super block is currently fully coded/uncoded.*/
+    if(b_count<=0){
+      /*This is the first block in this SB.*/
+      b_count=1;
+      /*Check to see whether it's cheaper to code it partially or fully.*/
+      if(_fr->b_coded==_b_coded){
+        sb_partial_bits=-oc_block_run_bits(b_coded_count);
+        sb_partial_bits+=oc_block_run_bits(++b_coded_count);
+      }
+      else{
+        b_coded_count=1;
+        sb_partial_bits=2;
+      }
+      sb_partial_bits+=oc_fr_state_sb_cost(_fr,1,_b_coded);
+      sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded);
+      sb_prefer_partial=sb_partial_bits<sb_bits;
+      sb_bits^=(sb_partial_bits^sb_bits)&-sb_prefer_partial;
+    }
+    else if(_fr->b_coded==_b_coded){
+      b_coded_count++;
+      if(++b_count<16){
+        if(sb_prefer_partial){
+          /*Check to see if it's cheaper to code it fully.*/
+          sb_partial_bits=sb_bits;
+          sb_partial_bits+=oc_block_run_bits(b_coded_count);
+          if(b_coded_count>0){
+            sb_partial_bits-=oc_block_run_bits(b_coded_count-1);
+          }
+          sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded);
+          sb_prefer_partial=sb_partial_bits<sb_bits;
+          sb_bits^=(sb_partial_bits^sb_bits)&-sb_prefer_partial;
+        }
+        /*There's no need to check the converse (whether it's cheaper to code
+           this SB partially if we were coding it fully), since the cost to
+           code a SB partially can only increase as we add more blocks, whereas
+           the cost to code it fully stays constant.*/
+      }
+      else{
+        /*If we get to the end and this SB is still full, then force it to be
+           coded full.
+          Otherwise we might not be able to extend the block run far enough
+           into the next partial SB.*/
+        if(sb_prefer_partial){
+          sb_prefer_partial=0;
+          sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded);
+        }
+      }
     }
     else{
-      bits+=oc_block_run_bits(b_coded_count);
-      /*It was partially coded.*/
-      sb_partial=1;
-      /*sb_full is unused.*/
+      /*This SB was full, but now must be made partial.*/
+      if(!sb_prefer_partial){
+        sb_bits=oc_block_run_bits(b_coded_count);
+        if(b_coded_count>b_count){
+          sb_bits-=oc_block_run_bits(b_coded_count-b_count);
+        }
+        sb_bits+=oc_fr_state_sb_cost(_fr,1,_b_coded);
+      }
+      b_count++;
+      b_coded_count=1;
+      sb_prefer_partial=1;
+      sb_bits+=2;
     }
-    _fr->bits=bits;
-    _fr->b_coded_count=b_coded_count;
-    _fr->b_coded_count_prev=b_coded_count;
-    _fr->b_count=0;
-    _fr->b_coded=_b_coded;
-    _fr->b_coded_prev=_b_coded;
-    oc_fr_state_advance_sb(_fr,sb_partial,sb_full);
   }
   else{
-    bits+=oc_block_run_bits(b_coded_count);
-    _fr->bits=bits;
-    _fr->b_coded_count=b_coded_count;
-    _fr->b_count=b_count;
-    _fr->b_coded=_b_coded;
+    b_count++;
+    if(_fr->b_coded==_b_coded)sb_bits-=oc_block_run_bits(b_coded_count);
+    else b_coded_count=0;
+    sb_bits+=oc_block_run_bits(++b_coded_count);
   }
+  _fr->bits=bits+sb_bits;
+  _fr->b_coded_count=b_coded_count;
+  _fr->b_coded=_b_coded;
+  _fr->b_count=b_count;
+  _fr->sb_prefer_partial=sb_prefer_partial;
+  _fr->sb_bits=sb_bits;
 }
 
 static void oc_fr_skip_block(oc_fr_state *_fr){
@@ -395,16 +452,6 @@ static int oc_fr_cost4(const oc_fr_state *_pre,const oc_fr_state *_post){
 
 
 
-struct oc_qii_state{
-  ptrdiff_t  bits;
-  unsigned   qi01_count:14;
-  signed int qi01:2;
-  unsigned   qi12_count:14;
-  signed int qi12:2;
-};
-
-
-
 static void oc_qii_state_init(oc_qii_state *_qs){
   _qs->bits=0;
   _qs->qi01_count=0;
@@ -458,49 +505,17 @@ static void oc_qii_state_advance(oc_qii_state *_qd,
 
 
 
-/*Temporary encoder state for the analysis pipeline.*/
-struct oc_enc_pipeline_state{
-  int                 bounding_values[256];
-  oc_fr_state         fr[3];
-  oc_qii_state        qs[3];
-  /*Condensed dequantization tables.*/
-  const ogg_uint16_t *dequant[3][3][2];
-  /*Condensed quantization tables.*/
-  const oc_iquant    *enquant[3][3][2];
-  /*Skip SSD storage for the current MCU in each plane.*/
-  unsigned           *skip_ssd[3];
-  /*Coded/uncoded fragment lists for each plane for the current MCU.*/
-  ptrdiff_t          *coded_fragis[3];
-  ptrdiff_t          *uncoded_fragis[3];
-  ptrdiff_t           ncoded_fragis[3];
-  ptrdiff_t           nuncoded_fragis[3];
-  /*The starting fragment for the current MCU in each plane.*/
-  ptrdiff_t           froffset[3];
-  /*The starting row for the current MCU in each plane.*/
-  int                 fragy0[3];
-  /*The ending row for the current MCU in each plane.*/
-  int                 fragy_end[3];
-  /*The starting superblock for the current MCU in each plane.*/
-  unsigned            sbi0[3];
-  /*The ending superblock for the current MCU in each plane.*/
-  unsigned            sbi_end[3];
-  /*The number of tokens for zzi=1 for each color plane.*/
-  int                 ndct_tokens1[3];
-  /*The outstanding eob_run count for zzi=1 for each color plane.*/
-  int                 eob_run1[3];
-  /*Whether or not the loop filter is enabled.*/
-  int                 loop_filter;
-};
-
-
 static void oc_enc_pipeline_init(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe){
   ptrdiff_t *coded_fragis;
   unsigned   mcu_nvsbs;
   ptrdiff_t  mcu_nfrags;
+  int        flimit;
   int        hdec;
   int        vdec;
   int        pli;
+  int        nqis;
   int        qii;
+  int        qi0;
   int        qti;
   /*Initialize the per-plane coded block flag trackers.
     These are used for bit-estimation purposes only; the real flag bits span
@@ -529,24 +544,36 @@ static void oc_enc_pipeline_init(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe){
   memset(_pipe->ncoded_fragis,0,sizeof(_pipe->ncoded_fragis));
   memset(_pipe->nuncoded_fragis,0,sizeof(_pipe->nuncoded_fragis));
   /*Set up condensed quantizer tables.*/
+  qi0=_enc->state.qis[0];
+  nqis=_enc->state.nqis;
   for(pli=0;pli<3;pli++){
-    for(qii=0;qii<_enc->state.nqis;qii++){
+    for(qii=0;qii<nqis;qii++){
       int qi;
       qi=_enc->state.qis[qii];
       for(qti=0;qti<2;qti++){
-        _pipe->dequant[pli][qii][qti]=_enc->state.dequant_tables[qi][pli][qti];
-        _pipe->enquant[pli][qii][qti]=_enc->enquant_tables[qi][pli][qti];
+        /*Set the DC coefficient in the dequantization table.*/
+        _enc->state.dequant_tables[qi][pli][qti][0]=
+         _enc->dequant_dc[qi0][pli][qti];
+        _enc->dequant[pli][qii][qti]=_enc->state.dequant_tables[qi][pli][qti];
+        /*Copy over the quantization table.*/
+        memcpy(_enc->enquant[pli][qii][qti],_enc->enquant_tables[qi][pli][qti],
+         _enc->opt_data.enquant_table_size);
       }
     }
   }
+  /*Fix up the DC coefficients in the quantization tables.*/
+  oc_enc_enquant_table_fixup(_enc,_enc->enquant,nqis);
   /*Initialize the tokenization state.*/
   for(pli=0;pli<3;pli++){
     _pipe->ndct_tokens1[pli]=0;
     _pipe->eob_run1[pli]=0;
   }
   /*Initialize the bounding value array for the loop filter.*/
-  _pipe->loop_filter=!oc_state_loop_filter_init(&_enc->state,
-   _pipe->bounding_values);
+  flimit=_enc->state.loop_filter_limits[_enc->state.qis[0]];
+  _pipe->loop_filter=flimit!=0;
+  if(flimit!=0)oc_loop_filter_init(&_enc->state,_pipe->bounding_values,flimit);
+  /*Clear the temporary DCT scratch space.*/
+  memset(_pipe->dct_data,0,sizeof(_pipe->dct_data));
 }
 
 /*Sets the current MCU stripe to super block row _sby.
@@ -585,13 +612,17 @@ static int oc_enc_pipeline_set_stripe(oc_enc_ctx *_enc,
 
 static void oc_enc_pipeline_finish_mcu_plane(oc_enc_ctx *_enc,
  oc_enc_pipeline_state *_pipe,int _pli,int _sdelay,int _edelay){
-  int refi;
   /*Copy over all the uncoded fragments from this plane and advance the uncoded
      fragment list.*/
-  _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
-  oc_state_frag_copy_list(&_enc->state,_pipe->uncoded_fragis[_pli],
-   _pipe->nuncoded_fragis[_pli],OC_FRAME_SELF,OC_FRAME_PREV,_pli);
-  _pipe->nuncoded_fragis[_pli]=0;
+  if(_pipe->nuncoded_fragis[_pli]>0){
+    _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
+    oc_frag_copy_list(&_enc->state,
+     _enc->state.ref_frame_data[OC_FRAME_SELF],
+     _enc->state.ref_frame_data[OC_FRAME_PREV],
+     _enc->state.ref_ystride[_pli],_pipe->uncoded_fragis[_pli],
+     _pipe->nuncoded_fragis[_pli],_enc->state.frag_buf_offs);
+    _pipe->nuncoded_fragis[_pli]=0;
+  }
   /*Perform DC prediction.*/
   oc_enc_pred_dc_frag_rows(_enc,_pli,
    _pipe->fragy0[_pli],_pipe->fragy_end[_pli]);
@@ -606,17 +637,18 @@ static void oc_enc_pipeline_finish_mcu_plane(oc_enc_ctx *_enc,
   _pipe->coded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
   _pipe->ncoded_fragis[_pli]=0;
   /*Apply the loop filter if necessary.*/
-  refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
   if(_pipe->loop_filter){
-    oc_state_loop_filter_frag_rows(&_enc->state,_pipe->bounding_values,
-     refi,_pli,_pipe->fragy0[_pli]-_sdelay,_pipe->fragy_end[_pli]-_edelay);
+    oc_state_loop_filter_frag_rows(&_enc->state,
+     _pipe->bounding_values,OC_FRAME_SELF,_pli,
+     _pipe->fragy0[_pli]-_sdelay,_pipe->fragy_end[_pli]-_edelay);
   }
   else _sdelay=_edelay=0;
   /*To fill borders, we have an additional two pixel delay, since a fragment
      in the next row could filter its top edge, using two pixels from a
      fragment in this row.
     But there's no reason to delay a full fragment between the two.*/
-  oc_state_borders_fill_rows(&_enc->state,refi,_pli,
+  oc_state_borders_fill_rows(&_enc->state,
+   _enc->state.ref_frame_idx[OC_FRAME_SELF],_pli,
    (_pipe->fragy0[_pli]-_sdelay<<3)-(_sdelay<<1),
    (_pipe->fragy_end[_pli]-_edelay<<3)-(_edelay<<1));
 }
@@ -634,62 +666,62 @@ struct oc_rd_metric{
 
 
 static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
- oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi,int _overhead_bits,
- oc_rd_metric *_mo,oc_token_checkpoint **_stack){
-  OC_ALIGN16(ogg_int16_t  dct[64]);
-  OC_ALIGN16(ogg_int16_t  data[64]);
-  ogg_uint16_t            dc_dequant;
+ oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi,
+ unsigned _rd_scale,unsigned _rd_iscale,oc_rd_metric *_mo,
+ oc_fr_state *_fr,oc_token_checkpoint **_stack){
+  ogg_int16_t            *data;
+  ogg_int16_t            *dct;
+  ogg_int16_t            *idct;
+  oc_qii_state            qs;
   const ogg_uint16_t     *dequant;
-  const oc_iquant        *enquant;
+  ogg_uint16_t            dequant_dc;
   ptrdiff_t               frag_offs;
   int                     ystride;
   const unsigned char    *src;
   const unsigned char    *ref;
   unsigned char          *dst;
-  int                     frame_type;
   int                     nonzero;
   unsigned                uncoded_ssd;
   unsigned                coded_ssd;
-  int                     coded_dc;
   oc_token_checkpoint    *checkpoint;
   oc_fragment            *frags;
   int                     mb_mode;
+  int                     refi;
   int                     mv_offs[2];
   int                     nmv_offs;
   int                     ac_bits;
   int                     borderi;
+  int                     nqis;
   int                     qti;
   int                     qii;
-  int                     pi;
-  int                     zzi;
-  int                     v;
-  int                     val;
-  int                     d;
-  int                     s;
   int                     dc;
+  nqis=_enc->state.nqis;
   frags=_enc->state.frags;
   frag_offs=_enc->state.frag_buf_offs[_fragi];
   ystride=_enc->state.ref_ystride[_pli];
   src=_enc->state.ref_frame_data[OC_FRAME_IO]+frag_offs;
   borderi=frags[_fragi].borderi;
   qii=frags[_fragi].qii;
+  data=_enc->pipe.dct_data;
+  dct=data+64;
+  idct=data+128;
   if(qii&~3){
 #if !defined(OC_COLLECT_METRICS)
     if(_enc->sp_level>=OC_SP_LEVEL_EARLY_SKIP){
       /*Enable early skip detection.*/
       frags[_fragi].coded=0;
+      frags[_fragi].refi=OC_FRAME_NONE;
+      oc_fr_skip_block(_fr);
       return 0;
     }
 #endif
     /*Try and code this block anyway.*/
     qii&=3;
-    frags[_fragi].qii=qii;
   }
+  refi=frags[_fragi].refi;
   mb_mode=frags[_fragi].mb_mode;
-  ref=_enc->state.ref_frame_data[
-   _enc->state.ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]+frag_offs;
-  dst=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_SELF]]
-   +frag_offs;
+  ref=_enc->state.ref_frame_data[refi]+frag_offs;
+  dst=_enc->state.ref_frame_data[OC_FRAME_SELF]+frag_offs;
   /*Motion compensation:*/
   switch(mb_mode){
     case OC_MODE_INTRA:{
@@ -704,9 +736,9 @@ static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
     }break;
     default:{
       const oc_mv *frag_mvs;
-      frag_mvs=(const oc_mv *)_enc->state.frag_mvs;
-      nmv_offs=oc_state_get_mv_offsets(&_enc->state,mv_offs,_pli,
-       frag_mvs[_fragi][0],frag_mvs[_fragi][1]);
+      frag_mvs=_enc->state.frag_mvs;
+      nmv_offs=oc_state_get_mv_offsets(&_enc->state,mv_offs,
+       _pli,frag_mvs[_fragi]);
       if(nmv_offs>1){
         oc_enc_frag_copy2(_enc,dst,
          ref+mv_offs[0],ref+mv_offs[1],ystride);
@@ -717,126 +749,121 @@ static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
   }
 #if defined(OC_COLLECT_METRICS)
   {
+    unsigned sad;
     unsigned satd;
     switch(nmv_offs){
-      case 0:satd=oc_enc_frag_intra_satd(_enc,src,ystride);break;
+      case 0:{
+        sad=oc_enc_frag_intra_sad(_enc,src,ystride);
+        satd=oc_enc_frag_intra_satd(_enc,&dc,src,ystride);
+      }break;
       case 1:{
-        satd=oc_enc_frag_satd_thresh(_enc,src,ref+mv_offs[0],ystride,UINT_MAX);
+        sad=oc_enc_frag_sad_thresh(_enc,src,ref+mv_offs[0],ystride,UINT_MAX);
+        satd=oc_enc_frag_satd(_enc,&dc,src,ref+mv_offs[0],ystride);
+        satd+=abs(dc);
       }break;
       default:{
-        satd=oc_enc_frag_satd_thresh(_enc,src,dst,ystride,UINT_MAX);
-      }
+        sad=oc_enc_frag_sad_thresh(_enc,src,dst,ystride,UINT_MAX);
+        satd=oc_enc_frag_satd(_enc,&dc,src,dst,ystride);
+        satd+=abs(dc);
+      }break;
     }
+    _enc->frag_sad[_fragi]=sad;
     _enc->frag_satd[_fragi]=satd;
   }
 #endif
   /*Transform:*/
   oc_enc_fdct8x8(_enc,dct,data);
-  /*Quantize the DC coefficient:*/
+  /*Quantize:*/
   qti=mb_mode!=OC_MODE_INTRA;
-  enquant=_pipe->enquant[_pli][0][qti];
-  dc_dequant=_pipe->dequant[_pli][0][qti][0];
-  v=dct[0];
-  val=v<<1;
-  s=OC_SIGNMASK(val);
-  val+=dc_dequant+s^s;
-  val=((enquant[0].m*(ogg_int32_t)val>>16)+val>>enquant[0].l)-s;
-  dc=OC_CLAMPI(-580,val,580);
-  nonzero=0;
-  /*Quantize the AC coefficients:*/
-  dequant=_pipe->dequant[_pli][qii][qti];
-  enquant=_pipe->enquant[_pli][qii][qti];
-  for(zzi=1;zzi<64;zzi++){
-    v=dct[OC_FZIG_ZAG[zzi]];
-    d=dequant[zzi];
-    val=v<<1;
-    v=abs(val);
-    if(v>=d){
-      s=OC_SIGNMASK(val);
-      /*The bias added here rounds ties away from zero, since token
-         optimization can only decrease the magnitude of the quantized
-         value.*/
-      val+=d+s^s;
-      /*Note the arithmetic right shift is not guaranteed by ANSI C.
-        Hopefully no one still uses ones-complement architectures.*/
-      val=((enquant[zzi].m*(ogg_int32_t)val>>16)+val>>enquant[zzi].l)-s;
-      data[zzi]=OC_CLAMPI(-580,val,580);
-      nonzero=zzi;
-    }
-    else data[zzi]=0;
-  }
+  dequant=_enc->dequant[_pli][qii][qti];
+  nonzero=oc_enc_quantize(_enc,data,dct,dequant,_enc->enquant[_pli][qii][qti]);
+  dc=data[0];
   /*Tokenize.*/
   checkpoint=*_stack;
-  ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,data,dequant,dct,nonzero+1,
-   _stack,qti?0:3);
+  if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
+    ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,idct,data,dequant,dct,
+     nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
+  }
+  else{
+    ac_bits=oc_enc_tokenize_ac_fast(_enc,_pli,_fragi,idct,data,dequant,dct,
+     nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
+  }
   /*Reconstruct.
     TODO: nonzero may need to be adjusted after tokenization.*/
+  dequant_dc=dequant[0];
   if(nonzero==0){
     ogg_int16_t p;
     int         ci;
+    int         qi01;
+    int         qi12;
     /*We round this dequant product (and not any of the others) because there's
        no iDCT rounding.*/
-    p=(ogg_int16_t)(dc*(ogg_int32_t)dc_dequant+15>>5);
+    p=(ogg_int16_t)(dc*(ogg_int32_t)dequant_dc+15>>5);
     /*LOOP VECTORIZES.*/
     for(ci=0;ci<64;ci++)data[ci]=p;
+    /*We didn't code any AC coefficients, so don't change the quantizer.*/
+    qi01=_pipe->qs[_pli].qi01;
+    qi12=_pipe->qs[_pli].qi12;
+    if(qi01>0)qii=1+qi12;
+    else if(qi01>=0)qii=0;
   }
   else{
-    data[0]=dc*dc_dequant;
-    oc_idct8x8(&_enc->state,data,nonzero+1);
+    idct[0]=dc*dequant_dc;
+    /*Note: This clears idct[] back to zero for the next block.*/
+    oc_idct8x8(&_enc->state,data,idct,nonzero+1);
+  }
+  frags[_fragi].qii=qii;
+  if(nqis>1){
+    oc_qii_state_advance(&qs,_pipe->qs+_pli,qii);
+    ac_bits+=qs.bits-_pipe->qs[_pli].bits;
   }
   if(!qti)oc_enc_frag_recon_intra(_enc,dst,ystride,data);
   else{
     oc_enc_frag_recon_inter(_enc,dst,
      nmv_offs==1?ref+mv_offs[0]:dst,ystride,data);
   }
-  frame_type=_enc->state.frame_type;
+  /*If _fr is NULL, then this is an INTRA frame, and we can't skip blocks.*/
 #if !defined(OC_COLLECT_METRICS)
-  if(frame_type!=OC_INTRA_FRAME)
+  if(_fr!=NULL)
 #endif
   {
     /*In retrospect, should we have skipped this block?*/
-    oc_enc_frag_sub(_enc,data,src,dst,ystride);
-    coded_ssd=coded_dc=0;
     if(borderi<0){
-      for(pi=0;pi<64;pi++){
-        coded_ssd+=data[pi]*data[pi];
-        coded_dc+=data[pi];
-      }
+      coded_ssd=oc_enc_frag_ssd(_enc,src,dst,ystride);
     }
     else{
-      ogg_int64_t mask;
-      mask=_enc->state.borders[borderi].mask;
-      for(pi=0;pi<64;pi++,mask>>=1)if(mask&1){
-        coded_ssd+=data[pi]*data[pi];
-        coded_dc+=data[pi];
-      }
+      coded_ssd=oc_enc_frag_border_ssd(_enc,src,dst,ystride,
+       _enc->state.borders[borderi].mask);
     }
     /*Scale to match DCT domain.*/
     coded_ssd<<=4;
-    /*We actually only want the AC contribution to the SSD.*/
-    coded_ssd-=coded_dc*coded_dc>>2;
 #if defined(OC_COLLECT_METRICS)
     _enc->frag_ssd[_fragi]=coded_ssd;
   }
-  if(frame_type!=OC_INTRA_FRAME){
+  if(_fr!=NULL){
 #endif
+    coded_ssd=OC_RD_SCALE(coded_ssd,_rd_scale);
     uncoded_ssd=_pipe->skip_ssd[_pli][_fragi-_pipe->froffset[_pli]];
-    if(uncoded_ssd<UINT_MAX){
+    if(uncoded_ssd<UINT_MAX&&
+     /*Don't allow luma blocks to be skipped in 4MV mode when VP3 compatibility
+        is enabled.*/
+     (!_enc->vp3_compatible||mb_mode!=OC_MODE_INTER_MV_FOUR||_pli)){
+      int overhead_bits;
+      overhead_bits=oc_fr_cost1(_fr);
       /*Although the fragment coding overhead determination is accurate, it is
          greedy, using very coarse-grained local information.
         Allowing it to mildly discourage coding turns out to be beneficial, but
          it's not clear that allowing it to encourage coding through negative
          coding overhead deltas is useful.
-        For that reason, we disallow negative coding_overheads.*/
-      if(_overhead_bits<0)_overhead_bits=0;
-      if(uncoded_ssd<=coded_ssd+(_overhead_bits+ac_bits)*_enc->lambda&&
-       /*Don't allow luma blocks to be skipped in 4MV mode when VP3
-          compatibility is enabled.*/
-       (!_enc->vp3_compatible||mb_mode!=OC_MODE_INTER_MV_FOUR||_pli)){
+        For that reason, we disallow negative coding overheads.*/
+      if(overhead_bits<0)overhead_bits=0;
+      if(uncoded_ssd<=coded_ssd+(overhead_bits+ac_bits)*_enc->lambda){
         /*Hm, not worth it; roll back.*/
         oc_enc_tokenlog_rollback(_enc,checkpoint,(*_stack)-checkpoint);
         *_stack=checkpoint;
         frags[_fragi].coded=0;
+        frags[_fragi].refi=OC_FRAME_NONE;
+        oc_fr_skip_block(_fr);
         return 0;
       }
     }
@@ -844,15 +871,20 @@ static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
     _mo->uncoded_ac_ssd+=uncoded_ssd;
     _mo->coded_ac_ssd+=coded_ssd;
     _mo->ac_bits+=ac_bits;
+    oc_fr_code_block(_fr);
   }
-  oc_qii_state_advance(_pipe->qs+_pli,_pipe->qs+_pli,qii);
+  /*GCC 4.4.4 generates a spurious warning about qs here because it can't
+     tell that the nqis check above, which initializes qs, will have run
+     anytime this line runs.*/
+  if(nqis>1)*(_pipe->qs+_pli)=*&qs;
   frags[_fragi].dc=dc;
   frags[_fragi].coded=1;
   return 1;
 }
 
-static int oc_enc_mb_transform_quantize_luma(oc_enc_ctx *_enc,
- oc_enc_pipeline_state *_pipe,unsigned _mbi,int _mode_overhead){
+static int oc_enc_mb_transform_quantize_inter_luma(oc_enc_ctx *_enc,
+ oc_enc_pipeline_state *_pipe,unsigned _mbi,int _mode_overhead,
+ const unsigned _rd_scale[4],const unsigned _rd_iscale[4]){
   /*Worst case token stack usage for 4 fragments.*/
   oc_token_checkpoint  stack[64*4];
   oc_token_checkpoint *stackptr;
@@ -867,6 +899,7 @@ static int oc_enc_mb_transform_quantize_luma(oc_enc_ctx *_enc,
   oc_fr_state          fr_checkpoint;
   oc_qii_state         qs_checkpoint;
   int                  mb_mode;
+  int                  refi;
   int                  ncoded;
   ptrdiff_t            fragi;
   int                  bi;
@@ -880,78 +913,83 @@ static int oc_enc_mb_transform_quantize_luma(oc_enc_ctx *_enc,
   uncoded_fragis=_pipe->uncoded_fragis[0];
   nuncoded_fragis=_pipe->nuncoded_fragis[0];
   mb_mode=mb_modes[_mbi];
+  refi=OC_FRAME_FOR_MODE(mb_mode);
   ncoded=0;
   stackptr=stack;
   memset(&mo,0,sizeof(mo));
   for(bi=0;bi<4;bi++){
     fragi=sb_maps[_mbi>>2][_mbi&3][bi];
+    frags[fragi].refi=refi;
     frags[fragi].mb_mode=mb_mode;
-    if(oc_enc_block_transform_quantize(_enc,
-     _pipe,0,fragi,oc_fr_cost1(_pipe->fr+0),&mo,&stackptr)){
-      oc_fr_code_block(_pipe->fr+0);
+    if(oc_enc_block_transform_quantize(_enc,_pipe,0,fragi,
+     _rd_scale[bi],_rd_iscale[bi],&mo,_pipe->fr+0,&stackptr)){
       coded_fragis[ncoded_fragis++]=fragi;
       ncoded++;
     }
-    else{
-      *(uncoded_fragis-++nuncoded_fragis)=fragi;
-      oc_fr_skip_block(_pipe->fr+0);
-    }
+    else *(uncoded_fragis-++nuncoded_fragis)=fragi;
   }
-  if(_enc->state.frame_type!=OC_INTRA_FRAME){
-    if(ncoded>0&&!mo.dc_flag){
-      int cost;
-      /*Some individual blocks were worth coding.
-        See if that's still true when accounting for mode and MV overhead.*/
-      cost=mo.coded_ac_ssd+_enc->lambda*(mo.ac_bits
-       +oc_fr_cost4(&fr_checkpoint,_pipe->fr+0)+_mode_overhead);
-      if(mo.uncoded_ac_ssd<=cost){
-        /*Taking macroblock overhead into account, it is not worth coding this
-           MB.*/
-        oc_enc_tokenlog_rollback(_enc,stack,stackptr-stack);
-        *(_pipe->fr+0)=*&fr_checkpoint;
-        *(_pipe->qs+0)=*&qs_checkpoint;
-        for(bi=0;bi<4;bi++){
-          fragi=sb_maps[_mbi>>2][_mbi&3][bi];
-          if(frags[fragi].coded){
-            *(uncoded_fragis-++nuncoded_fragis)=fragi;
-            frags[fragi].coded=0;
-          }
-          oc_fr_skip_block(_pipe->fr+0);
+  if(ncoded>0&&!mo.dc_flag){
+    int cost;
+    /*Some individual blocks were worth coding.
+      See if that's still true when accounting for mode and MV overhead.*/
+    cost=mo.coded_ac_ssd+_enc->lambda*(mo.ac_bits
+     +oc_fr_cost4(&fr_checkpoint,_pipe->fr+0)+_mode_overhead);
+    if(mo.uncoded_ac_ssd<=cost){
+      /*Taking macroblock overhead into account, it is not worth coding this
+         MB.*/
+      oc_enc_tokenlog_rollback(_enc,stack,stackptr-stack);
+      *(_pipe->fr+0)=*&fr_checkpoint;
+      *(_pipe->qs+0)=*&qs_checkpoint;
+      for(bi=0;bi<4;bi++){
+        fragi=sb_maps[_mbi>>2][_mbi&3][bi];
+        if(frags[fragi].coded){
+          *(uncoded_fragis-++nuncoded_fragis)=fragi;
+          frags[fragi].coded=0;
+          frags[fragi].refi=OC_FRAME_NONE;
         }
-        ncoded_fragis-=ncoded;
-        ncoded=0;
+        oc_fr_skip_block(_pipe->fr+0);
       }
-    }
-    /*If no luma blocks coded, the mode is forced.*/
-    if(ncoded==0)mb_modes[_mbi]=OC_MODE_INTER_NOMV;
-    /*Assume that a 1MV with a single coded block is always cheaper than a 4MV
-       with a single coded block.
-      This may not be strictly true: a 4MV computes chroma MVs using (0,0) for
-       skipped blocks, while a 1MV does not.*/
-    else if(ncoded==1&&mb_mode==OC_MODE_INTER_MV_FOUR){
-      mb_modes[_mbi]=OC_MODE_INTER_MV;
+      ncoded_fragis-=ncoded;
+      ncoded=0;
     }
   }
+  /*If no luma blocks coded, the mode is forced.*/
+  if(ncoded==0)mb_modes[_mbi]=OC_MODE_INTER_NOMV;
+  /*Assume that a 1MV with a single coded block is always cheaper than a 4MV
+     with a single coded block.
+    This may not be strictly true: a 4MV computes chroma MVs using (0,0) for
+     skipped blocks, while a 1MV does not.*/
+  else if(ncoded==1&&mb_mode==OC_MODE_INTER_MV_FOUR){
+    mb_modes[_mbi]=OC_MODE_INTER_MV;
+  }
   _pipe->ncoded_fragis[0]=ncoded_fragis;
   _pipe->nuncoded_fragis[0]=nuncoded_fragis;
   return ncoded;
 }
 
-static void oc_enc_sb_transform_quantize_chroma(oc_enc_ctx *_enc,
+static void oc_enc_sb_transform_quantize_inter_chroma(oc_enc_ctx *_enc,
  oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){
-  const oc_sb_map *sb_maps;
-  oc_sb_flags     *sb_flags;
-  ptrdiff_t       *coded_fragis;
-  ptrdiff_t        ncoded_fragis;
-  ptrdiff_t       *uncoded_fragis;
-  ptrdiff_t        nuncoded_fragis;
-  int              sbi;
+  const ogg_uint16_t *mcu_rd_scale;
+  const ogg_uint16_t *mcu_rd_iscale;
+  const oc_sb_map    *sb_maps;
+  oc_sb_flags        *sb_flags;
+  oc_fr_state        *fr;
+  ptrdiff_t          *coded_fragis;
+  ptrdiff_t           ncoded_fragis;
+  ptrdiff_t          *uncoded_fragis;
+  ptrdiff_t           nuncoded_fragis;
+  ptrdiff_t           froffset;
+  int                 sbi;
+  fr=_pipe->fr+_pli;
+  mcu_rd_scale=(const ogg_uint16_t *)_enc->mcu_rd_scale;
+  mcu_rd_iscale=(const ogg_uint16_t *)_enc->mcu_rd_iscale;
   sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
   sb_flags=_enc->state.sb_flags;
   coded_fragis=_pipe->coded_fragis[_pli];
   ncoded_fragis=_pipe->ncoded_fragis[_pli];
   uncoded_fragis=_pipe->uncoded_fragis[_pli];
   nuncoded_fragis=_pipe->nuncoded_fragis[_pli];
+  froffset=_pipe->froffset[_pli];
   for(sbi=_sbi_start;sbi<_sbi_end;sbi++){
     /*Worst case token stack usage for 1 fragment.*/
     oc_token_checkpoint stack[64];
@@ -964,21 +1002,21 @@ static void oc_enc_sb_transform_quantize_chroma(oc_enc_ctx *_enc,
       fragi=sb_maps[sbi][quadi][bi];
       if(fragi>=0){
         oc_token_checkpoint *stackptr;
+        unsigned             rd_scale;
+        unsigned             rd_iscale;
+        rd_scale=mcu_rd_scale[fragi-froffset];
+        rd_iscale=mcu_rd_iscale[fragi-froffset];
         stackptr=stack;
-        if(oc_enc_block_transform_quantize(_enc,
-         _pipe,_pli,fragi,oc_fr_cost1(_pipe->fr+_pli),&mo,&stackptr)){
+        if(oc_enc_block_transform_quantize(_enc,_pipe,_pli,fragi,
+         rd_scale,rd_iscale,&mo,fr,&stackptr)){
           coded_fragis[ncoded_fragis++]=fragi;
-          oc_fr_code_block(_pipe->fr+_pli);
-        }
-        else{
-          *(uncoded_fragis-++nuncoded_fragis)=fragi;
-          oc_fr_skip_block(_pipe->fr+_pli);
         }
+        else *(uncoded_fragis-++nuncoded_fragis)=fragi;
       }
     }
-    oc_fr_state_flush_sb(_pipe->fr+_pli);
-    sb_flags[sbi].coded_fully=_pipe->fr[_pli].sb_full;
-    sb_flags[sbi].coded_partially=_pipe->fr[_pli].sb_partial;
+    oc_fr_state_flush_sb(fr);
+    sb_flags[sbi].coded_fully=fr->sb_full;
+    sb_flags[sbi].coded_partially=fr->sb_partial;
   }
   _pipe->ncoded_fragis[_pli]=ncoded_fragis;
   _pipe->nuncoded_fragis[_pli]=nuncoded_fragis;
@@ -1006,8 +1044,8 @@ static void oc_enc_sb_transform_quantize_chroma(oc_enc_ctx *_enc,
   The bit counts and SSD measurements are obtained by examining actual encoded
    frames, with appropriate lambda values and optimal Huffman codes selected.
   EOB bits are assigned to the fragment that started the EOB run (as opposed to
-   dividing them among all the blocks in the run; though the latter approach
-   seems more theoretically correct, Monty's testing showed a small improvement
+   dividing them among all the blocks in the run; the latter approach seems
+   more theoretically correct, but Monty's testing showed a small improvement
    with the former, though that may have been merely statistical noise).
 
   @ARTICLE{Kim03,
@@ -1028,11 +1066,63 @@ static void oc_enc_sb_transform_quantize_chroma(oc_enc_ctx *_enc,
  +(((_ssd)&(1<<OC_BIT_SCALE)-1)+((_rate)&(1<<OC_BIT_SCALE)-1)*(_lambda) \
  +((1<<OC_BIT_SCALE)>>1)>>OC_BIT_SCALE)
 
+static void oc_enc_mode_rd_init(oc_enc_ctx *_enc){
+#if !defined(OC_COLLECT_METRICS)
+  const
+#endif
+  oc_mode_rd (*oc_mode_rd_table)[3][2][OC_COMP_BINS]=
+   _enc->sp_level<OC_SP_LEVEL_NOSATD?OC_MODE_RD_SATD:OC_MODE_RD_SAD;
+  int qii;
+#if defined(OC_COLLECT_METRICS)
+  oc_enc_mode_metrics_load(_enc);
+#endif
+  for(qii=0;qii<_enc->state.nqis;qii++){
+    int qi;
+    int pli;
+    qi=_enc->state.qis[qii];
+    for(pli=0;pli<3;pli++){
+      int qti;
+      for(qti=0;qti<2;qti++){
+        int log_plq;
+        int modeline;
+        int bin;
+        int dx;
+        int dq;
+        log_plq=_enc->log_plq[qi][pli][qti];
+        /*Find the pair of rows in the mode table that bracket this quantizer.
+          If it falls outside the range the table covers, then we just use a
+           pair on the edge for linear extrapolation.*/
+        for(modeline=0;modeline<OC_LOGQ_BINS-1&&
+         OC_MODE_LOGQ[modeline+1][pli][qti]>log_plq;modeline++);
+        /*Interpolate a row for this quantizer.*/
+        dx=OC_MODE_LOGQ[modeline][pli][qti]-log_plq;
+        dq=OC_MODE_LOGQ[modeline][pli][qti]-OC_MODE_LOGQ[modeline+1][pli][qti];
+        if(dq==0)dq=1;
+        for(bin=0;bin<OC_COMP_BINS;bin++){
+          int y0;
+          int z0;
+          int dy;
+          int dz;
+          y0=oc_mode_rd_table[modeline][pli][qti][bin].rate;
+          z0=oc_mode_rd_table[modeline][pli][qti][bin].rmse;
+          dy=oc_mode_rd_table[modeline+1][pli][qti][bin].rate-y0;
+          dz=oc_mode_rd_table[modeline+1][pli][qti][bin].rmse-z0;
+          _enc->mode_rd[qii][pli][qti][bin].rate=
+           (ogg_int16_t)OC_CLAMPI(-32768,y0+(dy*dx+(dq>>1))/dq,32767);
+          _enc->mode_rd[qii][pli][qti][bin].rmse=
+           (ogg_int16_t)OC_CLAMPI(-32768,z0+(dz*dx+(dq>>1))/dq,32767);
+        }
+      }
+    }
+  }
+}
+
 /*Estimate the R-D cost of the DCT coefficients given the SATD of a block after
    prediction.*/
-static unsigned oc_dct_cost2(unsigned *_ssd,
- int _qi,int _pli,int _qti,int _satd){
+static unsigned oc_dct_cost2(oc_enc_ctx *_enc,unsigned *_ssd,
+ int _qii,int _pli,int _qti,int _satd){
   unsigned rmse;
+  int      shift;
   int      bin;
   int      dx;
   int      y0;
@@ -1042,20 +1132,279 @@ static unsigned oc_dct_cost2(unsigned *_ssd,
   /*SATD metrics for chroma planes vary much less than luma, so we scale them
      by 4 to distribute them into the mode decision bins more evenly.*/
   _satd<<=_pli+1&2;
-  bin=OC_MINI(_satd>>OC_SAD_SHIFT,OC_SAD_BINS-2);
-  dx=_satd-(bin<<OC_SAD_SHIFT);
-  y0=OC_MODE_RD[_qi][_pli][_qti][bin].rate;
-  z0=OC_MODE_RD[_qi][_pli][_qti][bin].rmse;
-  dy=OC_MODE_RD[_qi][_pli][_qti][bin+1].rate-y0;
-  dz=OC_MODE_RD[_qi][_pli][_qti][bin+1].rmse-z0;
-  rmse=OC_MAXI(z0+(dz*dx>>OC_SAD_SHIFT),0);
+  shift=_enc->sp_level<OC_SP_LEVEL_NOSATD?OC_SATD_SHIFT:OC_SAD_SHIFT;
+  bin=OC_MINI(_satd>>shift,OC_COMP_BINS-2);
+  dx=_satd-(bin<<shift);
+  y0=_enc->mode_rd[_qii][_pli][_qti][bin].rate;
+  z0=_enc->mode_rd[_qii][_pli][_qti][bin].rmse;
+  dy=_enc->mode_rd[_qii][_pli][_qti][bin+1].rate-y0;
+  dz=_enc->mode_rd[_qii][_pli][_qti][bin+1].rmse-z0;
+  rmse=OC_MAXI(z0+(dz*dx>>shift),0);
   *_ssd=rmse*rmse>>2*OC_RMSE_SCALE-OC_BIT_SCALE;
-  return OC_MAXI(y0+(dy*dx>>OC_SAD_SHIFT),0);
+  return OC_MAXI(y0+(dy*dx>>shift),0);
+}
+
+/*activity_avg must be positive, or flat regions could get a zero weight, which
+   confounds analysis.
+  We set the minimum to this value so that it also avoids the need for divide
+   by zero checks in oc_mb_masking().*/
+# define OC_ACTIVITY_AVG_MIN (1<<OC_RD_SCALE_BITS)
+
+static unsigned oc_mb_activity(oc_enc_ctx *_enc,unsigned _mbi,
+ unsigned _activity[4]){/*Fills _activity[0..3]; returns the summed pixels.*/
+  const unsigned char *src;
+  const ptrdiff_t     *frag_buf_offs;
+  const ptrdiff_t     *sb_map;
+  unsigned             luma;
+  int                  ystride;
+  ptrdiff_t            frag_offs;
+  ptrdiff_t            fragi;
+  int                  bi;
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ystride=_enc->state.ref_ystride[0];
+  luma=0;
+  for(bi=0;bi<4;bi++){
+    const unsigned char *s;
+    unsigned             x;/*Sum of the block's 64 pixels.*/
+    unsigned             x2;/*Sum of the squares of those pixels.*/
+    unsigned             act;
+    int                  i;
+    int                  j;
+    fragi=sb_map[bi];
+    frag_offs=frag_buf_offs[fragi];
+    /*TODO: This could be replaced with SATD^2, since we already have to
+       compute SATD.*/
+    x=x2=0;
+    s=src+frag_offs;
+    for(i=0;i<8;i++){
+      for(j=0;j<8;j++){
+        unsigned c;
+        c=s[j];
+        x+=c;
+        x2+=c*c;
+      }
+      s+=ystride;
+    }
+    luma+=x;
+    act=(x2<<6)-x*x;/*64*x2-x*x, i.e., 64*64 times the pixel variance.*/
+    if(act<8<<12){
+      /*The region is flat (variance below 8); cap it at a variance of 5.*/
+      act=OC_MINI(act,5<<12);
+    }
+    else{
+      unsigned e1;/*Sum of |left-to-right differences| over 3x3 windows.*/
+      unsigned e2;/*Sum of |top-to-bottom differences| over 3x3 windows.*/
+      unsigned e3;/*Sum of |upper-left to lower-right differences|.*/
+      unsigned e4;/*Sum of |upper-right to lower-left differences|.*/
+      /*Test for an edge.
+        TODO: There are probably much simpler ways to do this (e.g., it could
+         probably be combined with the SATD calculation).
+        Alternatively, we could split the block around the mean and compute the
+         reduction in variance in each half.
+        For a Gaussian source the reduction should be
+         (1-2/pi) ~= 0.36338022763241865692446494650994.
+        Significantly more reduction is a good indication of a bi-level image.
+        This has the advantage of identifying, in addition to straight edges,
+         small text regions, which would otherwise be classified as "texture".*/
+      e1=e2=e3=e4=0;
+      s=src+frag_offs-1;/*The loop below also reads one pixel around the block.*/
+      for(i=0;i<8;i++){
+        for(j=0;j<8;j++){
+          e1+=abs((s[j+2]-s[j]<<1)+(s-ystride)[j+2]-(s-ystride)[j]
+           +(s+ystride)[j+2]-(s+ystride)[j]);
+          e2+=abs(((s+ystride)[j+1]-(s-ystride)[j+1]<<1)
+           +(s+ystride)[j]-(s-ystride)[j]+(s+ystride)[j+2]-(s-ystride)[j+2]);
+          e3+=abs(((s+ystride)[j+2]-(s-ystride)[j]<<1)
+           +(s+ystride)[j+1]-s[j]+s[j+2]-(s-ystride)[j+1]);
+          e4+=abs(((s+ystride)[j]-(s-ystride)[j+2]<<1)
+           +(s+ystride)[j+1]-s[j+2]+s[j]-(s-ystride)[j+1]);
+        }
+        s+=ystride;
+      }
+      /*If the largest component of the edge energy is more than 40% of the
+         total, then classify the block as an edge block.*/
+      if(5*OC_MAXI(OC_MAXI(e1,e2),OC_MAXI(e3,e4))>2*(e1+e2+e3+e4)){
+         /*act=act_th*(act/act_th)**0.7
+              =exp(log(act_th)+0.7*(log(act)-log(act_th))).
+           Here act_th=5.0 and 0x394A=oc_blog32_q10(5<<12).*/
+         act=oc_bexp32_q10(0x394A+(7*(oc_blog32_q10(act)-0x394A+5)/10));
+      }
+    }
+    _activity[bi]=act;
+  }
+  return luma;
+}
+
+static void oc_mb_activity_fast(oc_enc_ctx *_enc,unsigned _mbi,
+ unsigned _activity[4],const unsigned _intra_satd[12]){/*_enc,_mbi unused.*/
+  int bi;
+  for(bi=0;bi<4;bi++){/*Only the first 4 entries of _intra_satd are read.*/
+    unsigned act;
+    act=(11*_intra_satd[bi]>>8)*_intra_satd[bi];/*~11/256*satd*satd.*/
+    if(act<8<<12){
+      /*The region is flat; cap the activity at 5<<12.*/
+      act=OC_MINI(act,5<<12);
+    }
+    _activity[bi]=act;
+  }
+}
+
+/*Compute the masking scales for the blocks in a macro block.
+  All masking is computed from the luma blocks.
+  We derive scaling factors for the chroma blocks from these, and use the same
+   ones (index 4) for all chroma blocks, regardless of the subsampling.
+  It's possible for luma to be perfectly flat and yet have high chroma energy,
+   but this is unlikely in non-artificial images, and not a case that has been
+   addressed by any research to my knowledge.
+  The output of the masking process is two scale factors, which are fed into
+   the various R-D optimizations.
+  The first, rd_scale, is applied to D in the equation
+    D*rd_scale+lambda*R.
+  This is the form that must be used to properly combine scores from multiple
+   blocks, and can be interpreted as scaling distortions by their visibility.
+  The inverse, rd_iscale, is applied to lambda in the equation
+    D+rd_iscale*lambda*R.
+  This is equivalent to the first form within a single block, but much faster
+   to use when evaluating many possible distortions (e.g., during actual
+   quantization, where separate distortions are evaluated for every
+   coefficient).
+  The two macros OC_RD_SCALE(rd_scale,d) and OC_RD_ISCALE(rd_iscale,lambda) are
+   used to perform the multiplications with the proper re-scaling for the range
+   of the scaling factors.
+  Many researchers apply masking values directly to the quantizers used, and
+   not to the R-D cost.
+  Since we generally use MSE for D, rd_scale must use the square of their
+   values to generate an equivalent effect.*/
+static unsigned oc_mb_masking(unsigned _rd_scale[5],unsigned _rd_iscale[5],
+ const ogg_uint16_t _chroma_rd_scale[2],const unsigned _activity[4],
+ unsigned _activity_avg,unsigned _luma,unsigned _luma_avg){/*Returns the sum of _activity[0..3].*/
+  unsigned activity_sum;
+  unsigned la;
+  unsigned lb;
+  unsigned d;
+  int      bi;
+  int      bi_min;
+  int      bi_min2;
+  /*The ratio lb/la is meant to approximate
+     ((((_luma-16)/219)*(255/128))**0.649**0.4**2), which is the
+     effective luminance masking from~\cite{LKW06} (including the self-masking
+     deflator).
+    The following actually turns out to be a pretty good approximation for
+     _luma>75 or so.
+    For smaller values luminance does not really follow Weber's Law anyway, and
+     this approximation gives a much less aggressive bitrate boost in this
+     region.
+    Though some researchers claim that contrast sensitivity actually decreases
+     for very low luminance values, in my experience excessive brightness on
+     LCDs or buggy color conversions (e.g., treating Y' as full-range instead
+     of the CCIR 601 range) make artifacts in such regions extremely visible.
+    We substitute _luma_avg for 128 to allow the strength of the masking to
+     vary with the actual average image luminance, within certain limits (the
+     caller has clamped _luma_avg to the range [90,160], inclusive).
+    @ARTICLE{LKW06,
+      author="Zhen Liu and Lina J. Karam and Andrew B. Watson",
+      title="{JPEG2000} Encoding With Perceptual Distortion Control",
+      journal="{IEEE} Transactions on Image Processing",
+      volume=15,
+      number=7,
+      pages="1763--1778",
+      month=Jul,
+      year=2006
+    }*/
+#if 0
+  la=_luma+4*_luma_avg;
+  lb=4*_luma+_luma_avg;
+#else
+  /*Disable luminance masking; _luma and _luma_avg are then unused.*/
+  la=lb=1;
+#endif
+  activity_sum=0;
+  for(bi=0;bi<4;bi++){
+    unsigned a;
+    unsigned b;
+    activity_sum+=_activity[bi];
+    /*Apply activity masking.*/
+    a=_activity[bi]+4*_activity_avg;
+    b=4*_activity[bi]+_activity_avg;
+    d=OC_RD_SCALE(b,1);
+    /*Take the rounded ratio a/d, then apply luminance masking.*/
+    d=(a+(d>>1))/d;
+    _rd_scale[bi]=(d*la+(lb>>1))/lb;
+    /*And now the inverse.*/
+    d=OC_MAXI(OC_RD_ISCALE(a,1),1);
+    d=(b+(d>>1))/d;
+    _rd_iscale[bi]=(d*lb+(la>>1))/la;
+  }
+  /*Now compute scaling factors for chroma blocks.
+    We start by finding the two smallest iscales from the luma blocks.*/
+  bi_min=_rd_iscale[1]<_rd_iscale[0];/*0 or 1: the smaller of the first two.*/
+  bi_min2=1-bi_min;
+  for(bi=2;bi<4;bi++){
+    if(_rd_iscale[bi]<_rd_iscale[bi_min]){
+      bi_min2=bi_min;
+      bi_min=bi;
+    }
+    else if(_rd_iscale[bi]<_rd_iscale[bi_min2])bi_min2=bi;
+  }
+  /*If the minimum iscale is less than 1.0, use the second smallest instead,
+     and force the value to at least 1.0 (inflating chroma is a waste).*/
+  if(_rd_iscale[bi_min]<(1<<OC_RD_ISCALE_BITS))bi_min=bi_min2;
+  d=OC_MINI(_rd_scale[bi_min],1<<OC_RD_SCALE_BITS);
+  _rd_scale[4]=OC_RD_SCALE(d,_chroma_rd_scale[0]);
+  d=OC_MAXI(_rd_iscale[bi_min],1<<OC_RD_ISCALE_BITS);
+  _rd_iscale[4]=OC_RD_ISCALE(d,_chroma_rd_scale[1]);
+  return activity_sum;
+}
+
+static int oc_mb_intra_satd(oc_enc_ctx *_enc,unsigned _mbi,
+ unsigned _frag_satd[12]){/*Fills _frag_satd[]; returns the 4 luma dc sum.*/
+  const unsigned char   *src;
+  const ptrdiff_t       *frag_buf_offs;
+  const ptrdiff_t       *sb_map;
+  const oc_mb_map_plane *mb_map;
+  const unsigned char   *map_idxs;
+  int                    map_nidxs;
+  int                    mapii;
+  int                    mapi;
+  int                    ystride;
+  int                    pli;
+  int                    bi;
+  ptrdiff_t              fragi;
+  ptrdiff_t              frag_offs;
+  unsigned               luma;
+  int                    dc;
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ystride=_enc->state.ref_ystride[0];
+  luma=0;
+  for(bi=0;bi<4;bi++){/*The 4 blocks of plane 0 go in _frag_satd[0..3].*/
+    fragi=sb_map[bi];
+    frag_offs=frag_buf_offs[fragi];
+    _frag_satd[bi]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
+    luma+=dc;
+  }
+  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
+  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
+  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
+  /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
+  ystride=_enc->state.ref_ystride[1];
+  for(mapii=4;mapii<map_nidxs;mapii++){/*Remaining blocks; dc is ignored.*/
+    mapi=map_idxs[mapii];
+    pli=mapi>>2;
+    bi=mapi&3;
+    fragi=mb_map[pli][bi];
+    frag_offs=frag_buf_offs[fragi];
+    _frag_satd[mapii]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
+  }
+  return luma;
 }
 
 /*Select luma block-level quantizers for a MB in an INTRA frame.*/
 static unsigned oc_analyze_intra_mb_luma(oc_enc_ctx *_enc,
- const oc_qii_state *_qs,unsigned _mbi){
+ const oc_qii_state *_qs,unsigned _mbi,const unsigned _rd_scale[4]){
   const unsigned char *src;
   const ptrdiff_t     *frag_buf_offs;
   const oc_sb_map     *sb_maps;
@@ -1068,6 +1417,7 @@ static unsigned oc_analyze_intra_mb_luma(oc_enc_ctx *_enc,
   unsigned             rate[4][3];
   int                  prev[3][3];
   unsigned             satd;
+  int                  dc;
   unsigned             best_cost;
   unsigned             best_ssd;
   unsigned             best_rate;
@@ -1083,19 +1433,30 @@ static unsigned oc_analyze_intra_mb_luma(oc_enc_ctx *_enc,
   ystride=_enc->state.ref_ystride[0];
   fragi=sb_maps[_mbi>>2][_mbi&3][0];
   frag_offs=frag_buf_offs[fragi];
-  satd=oc_enc_frag_intra_satd(_enc,src+frag_offs,ystride);
+  if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
+    satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
+  }
+  else{
+    satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
+  }
   nqis=_enc->state.nqis;
   lambda=_enc->lambda;
   for(qii=0;qii<nqis;qii++){
     oc_qii_state_advance(qs[0]+qii,_qs,qii);
-    rate[0][qii]=oc_dct_cost2(ssd[0]+qii,_enc->state.qis[qii],0,0,satd)
+    rate[0][qii]=oc_dct_cost2(_enc,ssd[0]+qii,qii,0,0,satd)
      +(qs[0][qii].bits-_qs->bits<<OC_BIT_SCALE);
+    ssd[0][qii]=OC_RD_SCALE(ssd[0][qii],_rd_scale[0]);
     cost[0][qii]=OC_MODE_RD_COST(ssd[0][qii],rate[0][qii],lambda);
   }
   for(bi=1;bi<4;bi++){
     fragi=sb_maps[_mbi>>2][_mbi&3][bi];
     frag_offs=frag_buf_offs[fragi];
-    satd=oc_enc_frag_intra_satd(_enc,src+frag_offs,ystride);
+    if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
+      satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
+    }
+    else{
+      satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
+    }
     for(qii=0;qii<nqis;qii++){
       oc_qii_state qt[3];
       unsigned     cur_ssd;
@@ -1103,7 +1464,8 @@ static unsigned oc_analyze_intra_mb_luma(oc_enc_ctx *_enc,
       int          best_qij;
       int          qij;
       oc_qii_state_advance(qt+0,qs[bi-1]+0,qii);
-      cur_rate=oc_dct_cost2(&cur_ssd,_enc->state.qis[qii],0,0,satd);
+      cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,0,0,satd);
+      cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale[bi]);
       best_ssd=ssd[bi-1][0]+cur_ssd;
       best_rate=rate[bi-1][0]+cur_rate
        +(qt[0].bits-qs[bi-1][0].bits<<OC_BIT_SCALE);
@@ -1152,13 +1514,14 @@ static unsigned oc_analyze_intra_mb_luma(oc_enc_ctx *_enc,
 
 /*Select a block-level quantizer for a single chroma block in an INTRA frame.*/
 static unsigned oc_analyze_intra_chroma_block(oc_enc_ctx *_enc,
- const oc_qii_state *_qs,int _pli,ptrdiff_t _fragi){
+ const oc_qii_state *_qs,int _pli,ptrdiff_t _fragi,unsigned _rd_scale){
   const unsigned char *src;
   oc_fragment         *frags;
   ptrdiff_t            frag_offs;
   oc_qii_state         qt[3];
   unsigned             cost[3];
   unsigned             satd;
+  int                  dc;
   unsigned             best_cost;
   int                  best_qii;
   int                  qii;
@@ -1168,16 +1531,30 @@ static unsigned oc_analyze_intra_chroma_block(oc_enc_ctx *_enc,
   src=_enc->state.ref_frame_data[OC_FRAME_IO];
   ystride=_enc->state.ref_ystride[_pli];
   frag_offs=_enc->state.frag_buf_offs[_fragi];
-  satd=oc_enc_frag_intra_satd(_enc,src+frag_offs,ystride);
+  if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
+    satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
+  }
+  else{
+    satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
+  }
+  /*Most chroma blocks have no AC coefficients to speak of anyway, so it's not
+     worth spending the bits to change the AC quantizer.
+    TODO: This may be worth revisiting when we separate out DC and AC
+     predictions from SATD.*/
+#if 0
   nqis=_enc->state.nqis;
+#else
+  nqis=1;
+#endif
   lambda=_enc->lambda;
   best_qii=0;
   for(qii=0;qii<nqis;qii++){
     unsigned cur_rate;
     unsigned cur_ssd;
     oc_qii_state_advance(qt+qii,_qs,qii);
-    cur_rate=oc_dct_cost2(&cur_ssd,_enc->state.qis[qii],_pli,0,satd)
+    cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,_pli,0,satd)
      +(qt[qii].bits-_qs->bits<<OC_BIT_SCALE);
+    cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale);
     cost[qii]=OC_MODE_RD_COST(cur_ssd,cur_rate,lambda);
   }
   best_cost=cost[0];
@@ -1192,17 +1569,49 @@ static unsigned oc_analyze_intra_chroma_block(oc_enc_ctx *_enc,
   return best_cost;
 }
 
+static void oc_enc_mb_transform_quantize_intra_luma(oc_enc_ctx *_enc,
+ oc_enc_pipeline_state *_pipe,unsigned _mbi,
+ const unsigned _rd_scale[4],const unsigned _rd_iscale[4]){/*One scale per block.*/
+  /*Worst case token stack usage for 4 fragments.*/
+  oc_token_checkpoint  stack[64*4];
+  oc_token_checkpoint *stackptr;
+  const oc_sb_map     *sb_maps;
+  oc_fragment         *frags;
+  ptrdiff_t           *coded_fragis;
+  ptrdiff_t            ncoded_fragis;
+  ptrdiff_t            fragi;
+  int                  bi;
+  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
+  frags=_enc->state.frags;
+  coded_fragis=_pipe->coded_fragis[0];
+  ncoded_fragis=_pipe->ncoded_fragis[0];
+  stackptr=stack;/*Shared by all 4 blocks; it is not reset between them.*/
+  for(bi=0;bi<4;bi++){
+    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
+    frags[fragi].refi=OC_FRAME_SELF;
+    frags[fragi].mb_mode=OC_MODE_INTRA;
+    oc_enc_block_transform_quantize(_enc,_pipe,0,fragi,
+     _rd_scale[bi],_rd_iscale[bi],NULL,NULL,&stackptr);
+    coded_fragis[ncoded_fragis++]=fragi;/*Every block is appended as coded.*/
+  }
+  _pipe->ncoded_fragis[0]=ncoded_fragis;/*Write the new count back.*/
+}
+
 static void oc_enc_sb_transform_quantize_intra_chroma(oc_enc_ctx *_enc,
  oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){
-  const oc_sb_map *sb_maps;
-  oc_sb_flags     *sb_flags;
-  ptrdiff_t       *coded_fragis;
-  ptrdiff_t        ncoded_fragis;
-  int              sbi;
+  const ogg_uint16_t *mcu_rd_scale;
+  const ogg_uint16_t *mcu_rd_iscale;
+  const oc_sb_map    *sb_maps;
+  ptrdiff_t          *coded_fragis;
+  ptrdiff_t           ncoded_fragis;
+  ptrdiff_t           froffset;
+  int                 sbi;
+  mcu_rd_scale=(const ogg_uint16_t *)_enc->mcu_rd_scale;
+  mcu_rd_iscale=(const ogg_uint16_t *)_enc->mcu_rd_iscale;
   sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
-  sb_flags=_enc->state.sb_flags;
   coded_fragis=_pipe->coded_fragis[_pli];
   ncoded_fragis=_pipe->ncoded_fragis[_pli];
+  froffset=_pipe->froffset[_pli];
   for(sbi=_sbi_start;sbi<_sbi_end;sbi++){
     /*Worst case token stack usage for 1 fragment.*/
     oc_token_checkpoint stack[64];
@@ -1213,10 +1622,14 @@ static void oc_enc_sb_transform_quantize_intra_chroma(oc_enc_ctx *_enc,
       fragi=sb_maps[sbi][quadi][bi];
       if(fragi>=0){
         oc_token_checkpoint *stackptr;
-        oc_analyze_intra_chroma_block(_enc,_pipe->qs+_pli,_pli,fragi);
+        unsigned             rd_scale;
+        unsigned             rd_iscale;
+        rd_scale=mcu_rd_scale[fragi-froffset];
+        rd_iscale=mcu_rd_iscale[fragi-froffset];
+        oc_analyze_intra_chroma_block(_enc,_pipe->qs+_pli,_pli,fragi,rd_scale);
         stackptr=stack;
-        oc_enc_block_transform_quantize(_enc,
-         _pipe,_pli,fragi,0,NULL,&stackptr);
+        oc_enc_block_transform_quantize(_enc,_pipe,_pli,fragi,
+         rd_scale,rd_iscale,NULL,NULL,&stackptr);
         coded_fragis[ncoded_fragis++]=fragi;
       }
     }
@@ -1226,13 +1639,19 @@ static void oc_enc_sb_transform_quantize_intra_chroma(oc_enc_ctx *_enc,
 
 /*Analysis stage for an INTRA frame.*/
 void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode){
-  oc_enc_pipeline_state   pipe;
+  ogg_int64_t             activity_sum;
+  ogg_int64_t             luma_sum;
+  unsigned                activity_avg;
+  unsigned                luma_avg;
+  const ogg_uint16_t     *chroma_rd_scale;
+  ogg_uint16_t           *mcu_rd_scale;
+  ogg_uint16_t           *mcu_rd_iscale;
   const unsigned char    *map_idxs;
   int                     nmap_idxs;
   oc_sb_flags            *sb_flags;
   signed char            *mb_modes;
   const oc_mb_map        *mb_maps;
-  oc_mb_enc_info         *embs;
+  const oc_sb_map        *sb_maps;
   oc_fragment            *frags;
   unsigned                stripe_sby;
   unsigned                mcu_nvsbs;
@@ -1242,7 +1661,14 @@ void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode){
   int                     pli;
   _enc->state.frame_type=OC_INTRA_FRAME;
   oc_enc_tokenize_start(_enc);
-  oc_enc_pipeline_init(_enc,&pipe);
+  oc_enc_pipeline_init(_enc,&_enc->pipe);
+  oc_enc_mode_rd_init(_enc);
+  activity_sum=luma_sum=0;
+  activity_avg=_enc->activity_avg;
+  luma_avg=OC_CLAMPI(90<<8,_enc->luma_avg,160<<8);
+  chroma_rd_scale=_enc->chroma_rd_scale[OC_INTRA_FRAME][_enc->state.qis[0]];
+  mcu_rd_scale=_enc->mcu_rd_scale;
+  mcu_rd_iscale=_enc->mcu_rd_iscale;
   /*Choose MVs and MB modes and quantize and code luma.
     Must be done in Hilbert order.*/
   map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
@@ -1253,52 +1679,91 @@ void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode){
   sb_flags=_enc->state.sb_flags;
   mb_modes=_enc->state.mb_modes;
   mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
-  embs=_enc->mb_info;
+  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
   frags=_enc->state.frags;
   notstart=0;
   notdone=1;
   mcu_nvsbs=_enc->mcu_nvsbs;
   for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){
-    unsigned sbi;
-    unsigned sbi_end;
-    notdone=oc_enc_pipeline_set_stripe(_enc,&pipe,stripe_sby);
-    sbi_end=pipe.sbi_end[0];
-    for(sbi=pipe.sbi0[0];sbi<sbi_end;sbi++){
+    ptrdiff_t cfroffset;
+    unsigned  sbi;
+    unsigned  sbi_end;
+    notdone=oc_enc_pipeline_set_stripe(_enc,&_enc->pipe,stripe_sby);
+    sbi_end=_enc->pipe.sbi_end[0];
+    cfroffset=_enc->pipe.froffset[1];
+    for(sbi=_enc->pipe.sbi0[0];sbi<sbi_end;sbi++){
       int quadi;
       /*Mode addressing is through Y plane, always 4 MB per SB.*/
       for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
+        unsigned  activity[4];
+        unsigned  rd_scale[5];
+        unsigned  rd_iscale[5];
+        unsigned  luma;
         unsigned  mbi;
         int       mapii;
         int       mapi;
         int       bi;
         ptrdiff_t fragi;
         mbi=sbi<<2|quadi;
+        /*Activity masking.*/
+        if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
+          luma=oc_mb_activity(_enc,mbi,activity);
+        }
+        else{
+          unsigned intra_satd[12];
+          luma=oc_mb_intra_satd(_enc,mbi,intra_satd);
+          oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
+          for(bi=0;bi<4;bi++)frags[sb_maps[mbi>>2][mbi&3][bi]].qii=0;
+        }
+        activity_sum+=oc_mb_masking(rd_scale,rd_iscale,
+         chroma_rd_scale,activity,activity_avg,luma,luma_avg);
+        luma_sum+=luma;
         /*Motion estimation:
-          We always do a basic 1MV search for all macroblocks, coded or not,
-           keyframe or not.*/
-        if(!_recode&&_enc->state.curframe_num>0)oc_mcenc_search(_enc,mbi);
-        oc_analyze_intra_mb_luma(_enc,pipe.qs+0,mbi);
+          We do a basic 1MV search for all macroblocks, coded or not,
+           keyframe or not, unless we aren't using motion estimation at all.*/
+        if(!_recode&&_enc->state.curframe_num>0&&
+         _enc->sp_level<OC_SP_LEVEL_NOMC&&_enc->keyframe_frequency_force>1){
+          oc_mcenc_search(_enc,mbi);
+        }
+        if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
+          oc_analyze_intra_mb_luma(_enc,_enc->pipe.qs+0,mbi,rd_scale);
+        }
         mb_modes[mbi]=OC_MODE_INTRA;
-        oc_enc_mb_transform_quantize_luma(_enc,&pipe,mbi,0);
+        oc_enc_mb_transform_quantize_intra_luma(_enc,&_enc->pipe,
+         mbi,rd_scale,rd_iscale);
         /*Propagate final MB mode and MVs to the chroma blocks.*/
         for(mapii=4;mapii<nmap_idxs;mapii++){
           mapi=map_idxs[mapii];
           pli=mapi>>2;
           bi=mapi&3;
           fragi=mb_maps[mbi][pli][bi];
+          frags[fragi].refi=OC_FRAME_SELF;
           frags[fragi].mb_mode=OC_MODE_INTRA;
         }
+        /*Save masking scale factors for chroma blocks.*/
+        for(mapii=4;mapii<(nmap_idxs-4>>1)+4;mapii++){
+          mapi=map_idxs[mapii];
+          bi=mapi&3;
+          fragi=mb_maps[mbi][1][bi];
+          mcu_rd_scale[fragi-cfroffset]=(ogg_uint16_t)rd_scale[4];
+          mcu_rd_iscale[fragi-cfroffset]=(ogg_uint16_t)rd_iscale[4];
+        }
       }
     }
-    oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,0,notstart,notdone);
+    oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,0,notstart,notdone);
     /*Code chroma planes.*/
     for(pli=1;pli<3;pli++){
-      oc_enc_sb_transform_quantize_intra_chroma(_enc,&pipe,
-       pli,pipe.sbi0[pli],pipe.sbi_end[pli]);
-      oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,pli,notstart,notdone);
+      oc_enc_sb_transform_quantize_intra_chroma(_enc,&_enc->pipe,
+       pli,_enc->pipe.sbi0[pli],_enc->pipe.sbi_end[pli]);
+      oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,pli,notstart,notdone);
     }
     notstart=1;
   }
+  /*Compute the average block activity and MB luma score for the frame.*/
+  _enc->activity_avg=OC_MAXI(OC_ACTIVITY_AVG_MIN,
+   (unsigned)((activity_sum+(_enc->state.fplanes[0].nfrags>>1))/
+   _enc->state.fplanes[0].nfrags));
+  _enc->luma_avg=(unsigned)((luma_sum+(_enc->state.nmbs>>1))/_enc->state.nmbs);
   /*Finish filling in the reference frame borders.*/
   refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
   for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
@@ -1339,27 +1804,21 @@ static const unsigned OC_NOSKIP[12]={
 
 static void oc_analyze_mb_mode_luma(oc_enc_ctx *_enc,
  oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs,
- const unsigned _frag_satd[12],const unsigned _skip_ssd[12],int _qti){
+ const unsigned _frag_satd[12],const unsigned _skip_ssd[12],
+ const unsigned _rd_scale[4],int _qti){
   oc_fr_state  fr;
   oc_qii_state qs;
   unsigned     ssd;
   unsigned     rate;
-  int          overhead;
   unsigned     satd;
   unsigned     best_ssd;
   unsigned     best_rate;
-  int          best_overhead;
   int          best_fri;
   int          best_qii;
-  unsigned     cur_cost;
-  unsigned     cur_ssd;
-  unsigned     cur_rate;
-  int          cur_overhead;
   int          lambda;
   int          nqis;
   int          nskipped;
   int          bi;
-  int          qii;
   lambda=_enc->lambda;
   nqis=_enc->state.nqis;
   /*We could do a trellis optimization here, but we don't make final skip
@@ -1370,26 +1829,36 @@ static void oc_analyze_mb_mode_luma(oc_enc_ctx *_enc,
      code the flags, anyway.*/
   *&fr=*_fr;
   *&qs=*_qs;
-  ssd=rate=overhead=nskipped=0;
+  ssd=rate=nskipped=0;
   for(bi=0;bi<4;bi++){
     oc_fr_state  ft[2];
     oc_qii_state qt[3];
     unsigned     best_cost;
+    unsigned     cur_cost;
+    unsigned     cur_ssd;
+    unsigned     cur_rate;
+    unsigned     cur_overhead;
+    int          qii;
     satd=_frag_satd[bi];
     *(ft+0)=*&fr;
     oc_fr_code_block(ft+0);
-    oc_qii_state_advance(qt+0,&qs,0);
-    best_overhead=(ft[0].bits-fr.bits<<OC_BIT_SCALE);
-    best_rate=oc_dct_cost2(&best_ssd,_enc->state.qis[0],0,_qti,satd)
-     +(qt[0].bits-qs.bits<<OC_BIT_SCALE);
-    best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate+best_overhead,lambda);
+    cur_overhead=ft[0].bits-fr.bits;
+    best_rate=oc_dct_cost2(_enc,&best_ssd,0,0,_qti,satd)
+     +(cur_overhead<<OC_BIT_SCALE);
+    if(nqis>1){
+      oc_qii_state_advance(qt+0,&qs,0);
+      best_rate+=qt[0].bits-qs.bits<<OC_BIT_SCALE;
+    }
+    best_ssd=OC_RD_SCALE(best_ssd,_rd_scale[bi]);
+    best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate,lambda);
     best_fri=0;
     best_qii=0;
     for(qii=1;qii<nqis;qii++){
       oc_qii_state_advance(qt+qii,&qs,qii);
-      cur_rate=oc_dct_cost2(&cur_ssd,_enc->state.qis[qii],0,_qti,satd)
-       +(qt[qii].bits-qs.bits<<OC_BIT_SCALE);
-      cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate+best_overhead,lambda);
+      cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,0,_qti,satd)
+       +(cur_overhead+qt[qii].bits-qs.bits<<OC_BIT_SCALE);
+      cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale[bi]);
+      cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate,lambda);
       if(cur_cost<best_cost){
         best_cost=cur_cost;
         best_ssd=cur_ssd;
@@ -1397,7 +1866,7 @@ static void oc_analyze_mb_mode_luma(oc_enc_ctx *_enc,
         best_qii=qii;
       }
     }
-    if(_skip_ssd[bi]<UINT_MAX&&nskipped<3){
+    if(_skip_ssd[bi]<(UINT_MAX>>OC_BIT_SCALE+2)&&nskipped<3){
       *(ft+1)=*&fr;
       oc_fr_skip_block(ft+1);
       cur_overhead=ft[1].bits-fr.bits<<OC_BIT_SCALE;
@@ -1405,15 +1874,13 @@ static void oc_analyze_mb_mode_luma(oc_enc_ctx *_enc,
       cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_overhead,lambda);
       if(cur_cost<=best_cost){
         best_ssd=cur_ssd;
-        best_rate=0;
-        best_overhead=cur_overhead;
+        best_rate=cur_overhead;
         best_fri=1;
         best_qii+=4;
       }
     }
     rate+=best_rate;
     ssd+=best_ssd;
-    overhead+=best_overhead;
     *&fr=*(ft+best_fri);
     if(best_fri==0)*&qs=*(qt+best_qii);
     else nskipped++;
@@ -1421,12 +1888,12 @@ static void oc_analyze_mb_mode_luma(oc_enc_ctx *_enc,
   }
   _modec->ssd=ssd;
   _modec->rate=rate;
-  _modec->overhead=OC_MAXI(overhead,0);
 }
 
 static void oc_analyze_mb_mode_chroma(oc_enc_ctx *_enc,
  oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs,
- const unsigned _frag_satd[12],const unsigned _skip_ssd[12],int _qti){
+ const unsigned _frag_satd[12],const unsigned _skip_ssd[12],
+ unsigned _rd_scale,int _qti){
   unsigned ssd;
   unsigned rate;
   unsigned satd;
@@ -1443,7 +1910,15 @@ static void oc_analyze_mb_mode_chroma(oc_enc_ctx *_enc,
   int      bi;
   int      qii;
   lambda=_enc->lambda;
+  /*Most chroma blocks have no AC coefficients to speak of anyway, so it's not
+     worth spending the bits to change the AC quantizer.
+    TODO: This may be worth revisiting when we separate out DC and AC
+     predictions from SATD.*/
+#if 0
   nqis=_enc->state.nqis;
+#else
+  nqis=1;
+#endif
   ssd=_modec->ssd;
   rate=_modec->rate;
   /*Because (except in 4:4:4 mode) we aren't considering chroma blocks in coded
@@ -1455,13 +1930,15 @@ static void oc_analyze_mb_mode_chroma(oc_enc_ctx *_enc,
     for(;bi<nblocks;bi++){
       unsigned best_cost;
       satd=_frag_satd[bi];
-      best_rate=oc_dct_cost2(&best_ssd,_enc->state.qis[0],pli,_qti,satd)
+      best_rate=oc_dct_cost2(_enc,&best_ssd,0,pli,_qti,satd)
        +OC_CHROMA_QII_RATE;
+      best_ssd=OC_RD_SCALE(best_ssd,_rd_scale);
       best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate,lambda);
       best_qii=0;
       for(qii=1;qii<nqis;qii++){
-        cur_rate=oc_dct_cost2(&cur_ssd,_enc->state.qis[qii],0,_qti,satd)
+        cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,pli,_qti,satd)
          +OC_CHROMA_QII_RATE;
+        cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale);
         cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate,lambda);
         if(cur_cost<best_cost){
           best_cost=cur_cost;
@@ -1470,7 +1947,7 @@ static void oc_analyze_mb_mode_chroma(oc_enc_ctx *_enc,
           best_qii=qii;
         }
       }
-      if(_skip_ssd[bi]<UINT_MAX){
+      if(_skip_ssd[bi]<(UINT_MAX>>OC_BIT_SCALE+2)){
         cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE;
         cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate,lambda);
         if(cur_cost<=best_cost){
@@ -1490,65 +1967,50 @@ static void oc_analyze_mb_mode_chroma(oc_enc_ctx *_enc,
 }
 
 static void oc_skip_cost(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe,
- unsigned _mbi,unsigned _ssd[12]){
-  OC_ALIGN16(ogg_int16_t  buffer[64]);
-  const unsigned char    *src;
-  const unsigned char    *ref;
-  int                     ystride;
-  const oc_fragment      *frags;
-  const ptrdiff_t        *frag_buf_offs;
-  const ptrdiff_t        *sb_map;
-  const oc_mb_map_plane  *mb_map;
-  const unsigned char    *map_idxs;
-  int                     map_nidxs;
-  ogg_int64_t             mask;
-  unsigned                uncoded_ssd;
-  int                     uncoded_dc;
-  unsigned                dc_dequant;
-  int                     dc_flag;
-  int                     mapii;
-  int                     mapi;
-  int                     pli;
-  int                     bi;
-  ptrdiff_t               fragi;
-  ptrdiff_t               frag_offs;
-  int                     borderi;
-  int                     pi;
+ unsigned _mbi,const unsigned _rd_scale[4],unsigned _ssd[12]){
+  const unsigned char   *src;
+  const unsigned char   *ref;
+  int                    ystride;
+  const oc_fragment     *frags;
+  const ptrdiff_t       *frag_buf_offs;
+  const ptrdiff_t       *sb_map;
+  const oc_mb_map_plane *mb_map;
+  const unsigned char   *map_idxs;
+  oc_mv                 *mvs;
+  int                    map_nidxs;
+  unsigned               uncoded_ssd;
+  int                    mapii;
+  int                    mapi;
+  int                    pli;
+  int                    bi;
+  ptrdiff_t              fragi;
+  ptrdiff_t              frag_offs;
+  int                    borderi;
   src=_enc->state.ref_frame_data[OC_FRAME_IO];
-  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]];
+  ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
   ystride=_enc->state.ref_ystride[0];
   frags=_enc->state.frags;
   frag_buf_offs=_enc->state.frag_buf_offs;
   sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
-  dc_dequant=_enc->state.dequant_tables[_enc->state.qis[0]][0][1][0];
+  mvs=_enc->mb_info[_mbi].block_mv;
   for(bi=0;bi<4;bi++){
     fragi=sb_map[bi];
-    frag_offs=frag_buf_offs[fragi];
-    oc_enc_frag_sub(_enc,buffer,src+frag_offs,ref+frag_offs,ystride);
     borderi=frags[fragi].borderi;
-    uncoded_ssd=uncoded_dc=0;
+    frag_offs=frag_buf_offs[fragi];
     if(borderi<0){
-      for(pi=0;pi<64;pi++){
-        uncoded_ssd+=buffer[pi]*buffer[pi];
-        uncoded_dc+=buffer[pi];
-      }
+      uncoded_ssd=oc_enc_frag_ssd(_enc,src+frag_offs,ref+frag_offs,ystride);
     }
     else{
-      ogg_int64_t mask;
-      mask=_enc->state.borders[borderi].mask;
-      for(pi=0;pi<64;pi++,mask>>=1)if(mask&1){
-        uncoded_ssd+=buffer[pi]*buffer[pi];
-        uncoded_dc+=buffer[pi];
-      }
+      uncoded_ssd=oc_enc_frag_border_ssd(_enc,
+       src+frag_offs,ref+frag_offs,ystride,_enc->state.borders[borderi].mask);
     }
-    /*Scale to match DCT domain.*/
-    uncoded_ssd<<=4;
-    /*We actually only want the AC contribution to the SSD.*/
-    uncoded_ssd-=uncoded_dc*uncoded_dc>>2;
-    /*DC is a special case; if there's more than a full-quantizer improvement
-       in the effective DC component, always force-code the block.*/
-    dc_flag=abs(uncoded_dc)>dc_dequant<<1;
-    uncoded_ssd|=-dc_flag;
+    /*Scale to match DCT domain and RD.*/
+    uncoded_ssd=OC_RD_SKIP_SCALE(uncoded_ssd,_rd_scale[bi]);
+    /*Motion is a special case; if there is more than a full-pixel motion
+       against the prior frame, penalize skipping.
+      TODO: The factor of two here is a kludge, but it tested out better than a
+       hard limit.*/
+    if(mvs[bi]!=0)uncoded_ssd*=2;
     _pipe->skip_ssd[0][fragi-_pipe->froffset[0]]=_ssd[bi]=uncoded_ssd;
   }
   mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
@@ -1556,96 +2018,52 @@ static void oc_skip_cost(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe,
   map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
   map_nidxs=(map_nidxs-4>>1)+4;
   mapii=4;
+  mvs=_enc->mb_info[_mbi].unref_mv;
   for(pli=1;pli<3;pli++){
     ystride=_enc->state.ref_ystride[pli];
-    dc_dequant=_enc->state.dequant_tables[_enc->state.qis[0]][pli][1][0];
     for(;mapii<map_nidxs;mapii++){
       mapi=map_idxs[mapii];
       bi=mapi&3;
       fragi=mb_map[pli][bi];
-      frag_offs=frag_buf_offs[fragi];
-      oc_enc_frag_sub(_enc,buffer,src+frag_offs,ref+frag_offs,ystride);
       borderi=frags[fragi].borderi;
-      uncoded_ssd=uncoded_dc=0;
+      frag_offs=frag_buf_offs[fragi];
       if(borderi<0){
-        for(pi=0;pi<64;pi++){
-          uncoded_ssd+=buffer[pi]*buffer[pi];
-          uncoded_dc+=buffer[pi];
-        }
+        uncoded_ssd=oc_enc_frag_ssd(_enc,src+frag_offs,ref+frag_offs,ystride);
       }
       else{
-        mask=_enc->state.borders[borderi].mask;
-        for(pi=0;pi<64;pi++,mask>>=1)if(mask&1){
-          uncoded_ssd+=buffer[pi]*buffer[pi];
-          uncoded_dc+=buffer[pi];
-        }
+        uncoded_ssd=oc_enc_frag_border_ssd(_enc,
+         src+frag_offs,ref+frag_offs,ystride,_enc->state.borders[borderi].mask);
       }
-      /*Scale to match DCT domain.*/
-      uncoded_ssd<<=4;
-      /*We actually only want the AC contribution to the SSD.*/
-      uncoded_ssd-=uncoded_dc*uncoded_dc>>2;
-      /*DC is a special case; if there's more than a full-quantizer improvement
-         in the effective DC component, always force-code the block.*/
-      dc_flag=abs(uncoded_dc)>dc_dequant<<1;
-      uncoded_ssd|=-dc_flag;
+      /*Scale to match DCT domain and RD.*/
+      uncoded_ssd=OC_RD_SKIP_SCALE(uncoded_ssd,_rd_scale[4]);
+      /*Motion is a special case; if there is more than a full-pixel motion
+         against the prior frame, penalize skipping.
+        TODO: The factor of two here is a kludge, but it tested out better than
+         a hard limit.*/
+      if(mvs[OC_FRAME_PREV]!=0)uncoded_ssd*=2;
       _pipe->skip_ssd[pli][fragi-_pipe->froffset[pli]]=_ssd[mapii]=uncoded_ssd;
     }
     map_nidxs=(map_nidxs-4<<1)+4;
   }
 }
 
-static void oc_mb_intra_satd(oc_enc_ctx *_enc,unsigned _mbi,
- unsigned _frag_satd[12]){
-  const unsigned char   *src;
-  const ptrdiff_t       *frag_buf_offs;
-  const ptrdiff_t       *sb_map;
-  const oc_mb_map_plane *mb_map;
-  const unsigned char   *map_idxs;
-  int                    map_nidxs;
-  int                    mapii;
-  int                    mapi;
-  int                    ystride;
-  int                    pli;
-  int                    bi;
-  ptrdiff_t              fragi;
-  ptrdiff_t              frag_offs;
-  frag_buf_offs=_enc->state.frag_buf_offs;
-  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
-  src=_enc->state.ref_frame_data[OC_FRAME_IO];
-  ystride=_enc->state.ref_ystride[0];
-  for(bi=0;bi<4;bi++){
-    fragi=sb_map[bi];
-    frag_offs=frag_buf_offs[fragi];
-    _frag_satd[bi]=oc_enc_frag_intra_satd(_enc,src+frag_offs,ystride);
-  }
-  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
-  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
-  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
-  /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
-  ystride=_enc->state.ref_ystride[1];
-  for(mapii=4;mapii<map_nidxs;mapii++){
-    mapi=map_idxs[mapii];
-    pli=mapi>>2;
-    bi=mapi&3;
-    fragi=mb_map[pli][bi];
-    frag_offs=frag_buf_offs[fragi];
-    _frag_satd[mapii]=oc_enc_frag_intra_satd(_enc,src+frag_offs,ystride);
-  }
-}
 
 static void oc_cost_intra(oc_enc_ctx *_enc,oc_mode_choice *_modec,
  unsigned _mbi,const oc_fr_state *_fr,const oc_qii_state *_qs,
- const unsigned _frag_satd[12],const unsigned _skip_ssd[12]){
-  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,_frag_satd,_skip_ssd,0);
-  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,_frag_satd,_skip_ssd,0);
-  _modec->overhead+=
+ const unsigned _frag_satd[12],const unsigned _skip_ssd[12],
+ const unsigned _rd_scale[5]){
+  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,_frag_satd,_skip_ssd,_rd_scale,0);
+  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
+   _frag_satd,_skip_ssd,_rd_scale[4],0);
+  _modec->overhead=
    oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTRA)<<OC_BIT_SCALE;
   oc_mode_set_cost(_modec,_enc->lambda);
 }
 
 static void oc_cost_inter(oc_enc_ctx *_enc,oc_mode_choice *_modec,
- unsigned _mbi,int _mb_mode,const signed char *_mv,
- const oc_fr_state *_fr,const oc_qii_state *_qs,const unsigned _skip_ssd[12]){
+ unsigned _mbi,int _mb_mode,oc_mv _mv,
+ const oc_fr_state *_fr,const oc_qii_state *_qs,
+ const unsigned _skip_ssd[12],const unsigned _rd_scale[5]){
   unsigned               frag_satd[12];
   const unsigned char   *src;
   const unsigned char   *ref;
@@ -1658,35 +2076,45 @@ static void oc_cost_inter(oc_enc_ctx *_enc,oc_mode_choice *_modec,
   int                    mapii;
   int                    mapi;
   int                    mv_offs[2];
-  int                    dx;
-  int                    dy;
   int                    pli;
   int                    bi;
   ptrdiff_t              fragi;
   ptrdiff_t              frag_offs;
+  int                    dc;
   src=_enc->state.ref_frame_data[OC_FRAME_IO];
-  ref=_enc->state.ref_frame_data[
-   _enc->state.ref_frame_idx[OC_FRAME_FOR_MODE(_mb_mode)]];
+  ref=_enc->state.ref_frame_data[OC_FRAME_FOR_MODE(_mb_mode)];
   ystride=_enc->state.ref_ystride[0];
   frag_buf_offs=_enc->state.frag_buf_offs;
   sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
-  dx=_mv[0];
-  dy=_mv[1];
   _modec->rate=_modec->ssd=0;
-  if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,dx,dy)>1){
+  if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,_mv)>1){
     for(bi=0;bi<4;bi++){
       fragi=sb_map[bi];
       frag_offs=frag_buf_offs[fragi];
-      frag_satd[bi]=oc_enc_frag_satd2_thresh(_enc,src+frag_offs,
-       ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
+      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
+        frag_satd[bi]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
+         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
+        frag_satd[bi]+=abs(dc);
+      }
+      else{
+        frag_satd[bi]=oc_enc_frag_sad2_thresh(_enc,src+frag_offs,
+         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
+      }
     }
   }
   else{
     for(bi=0;bi<4;bi++){
       fragi=sb_map[bi];
       frag_offs=frag_buf_offs[fragi];
-      frag_satd[bi]=oc_enc_frag_satd_thresh(_enc,src+frag_offs,
-       ref+frag_offs+mv_offs[0],ystride,UINT_MAX);
+      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
+        frag_satd[bi]=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
+         ref+frag_offs+mv_offs[0],ystride);
+        frag_satd[bi]+=abs(dc);
+      }
+      else{
+        frag_satd[bi]=oc_enc_frag_sad(_enc,src+frag_offs,
+         ref+frag_offs+mv_offs[0],ystride);
+      }
     }
   }
   mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
@@ -1694,15 +2122,22 @@ static void oc_cost_inter(oc_enc_ctx *_enc,oc_mode_choice *_modec,
   map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
   /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
   ystride=_enc->state.ref_ystride[1];
-  if(oc_state_get_mv_offsets(&_enc->state,mv_offs,1,dx,dy)>1){
+  if(oc_state_get_mv_offsets(&_enc->state,mv_offs,1,_mv)>1){
     for(mapii=4;mapii<map_nidxs;mapii++){
       mapi=map_idxs[mapii];
       pli=mapi>>2;
       bi=mapi&3;
       fragi=mb_map[pli][bi];
       frag_offs=frag_buf_offs[fragi];
-      frag_satd[mapii]=oc_enc_frag_satd2_thresh(_enc,src+frag_offs,
-       ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
+      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
+        frag_satd[mapii]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
+         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
+        frag_satd[mapii]+=abs(dc);
+      }
+      else{
+        frag_satd[mapii]=oc_enc_frag_sad2_thresh(_enc,src+frag_offs,
+         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
+      }
     }
   }
   else{
@@ -1712,30 +2147,38 @@ static void oc_cost_inter(oc_enc_ctx *_enc,oc_mode_choice *_modec,
       bi=mapi&3;
       fragi=mb_map[pli][bi];
       frag_offs=frag_buf_offs[fragi];
-      frag_satd[mapii]=oc_enc_frag_satd_thresh(_enc,src+frag_offs,
-       ref+frag_offs+mv_offs[0],ystride,UINT_MAX);
+      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
+        frag_satd[mapii]=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
+         ref+frag_offs+mv_offs[0],ystride);
+        frag_satd[mapii]+=abs(dc);
+      }
+      else{
+        frag_satd[mapii]=oc_enc_frag_sad(_enc,src+frag_offs,
+         ref+frag_offs+mv_offs[0],ystride);
+      }
     }
   }
-  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,1);
-  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,1);
-  _modec->overhead+=
+  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,_rd_scale,1);
+  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
+   frag_satd,_skip_ssd,_rd_scale[4],1);
+  _modec->overhead=
    oc_mode_scheme_chooser_cost(&_enc->chooser,_mb_mode)<<OC_BIT_SCALE;
   oc_mode_set_cost(_modec,_enc->lambda);
 }
 
 static void oc_cost_inter_nomv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
  unsigned _mbi,int _mb_mode,const oc_fr_state *_fr,const oc_qii_state *_qs,
- const unsigned _skip_ssd[12]){
-  static const oc_mv OC_MV_ZERO;
-  oc_cost_inter(_enc,_modec,_mbi,_mb_mode,OC_MV_ZERO,_fr,_qs,_skip_ssd);
+ const unsigned _skip_ssd[12],const unsigned _rd_scale[4]){
+  oc_cost_inter(_enc,_modec,_mbi,_mb_mode,0,_fr,_qs,_skip_ssd,_rd_scale);
 }
 
 static int oc_cost_inter1mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
- unsigned _mbi,int _mb_mode,const signed char *_mv,
- const oc_fr_state *_fr,const oc_qii_state *_qs,const unsigned _skip_ssd[12]){
+ unsigned _mbi,int _mb_mode,oc_mv _mv,
+ const oc_fr_state *_fr,const oc_qii_state *_qs,const unsigned _skip_ssd[12],
+ const unsigned _rd_scale[4]){
   int bits0;
-  oc_cost_inter(_enc,_modec,_mbi,_mb_mode,_mv,_fr,_qs,_skip_ssd);
-  bits0=OC_MV_BITS[0][_mv[0]+31]+OC_MV_BITS[0][_mv[1]+31];
+  oc_cost_inter(_enc,_modec,_mbi,_mb_mode,_mv,_fr,_qs,_skip_ssd,_rd_scale);
+  bits0=OC_MV_BITS[0][OC_MV_X(_mv)+31]+OC_MV_BITS[0][OC_MV_Y(_mv)+31];
   _modec->overhead+=OC_MINI(_enc->mv_bits[0]+bits0,_enc->mv_bits[1]+12)
    -OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE;
   oc_mode_set_cost(_modec,_enc->lambda);
@@ -1749,7 +2192,7 @@ static const unsigned char OC_MB_PHASE[4][4]={
 
 static void oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
  unsigned _mbi,oc_mv _mv[4],const oc_fr_state *_fr,const oc_qii_state *_qs,
- const unsigned _skip_ssd[12]){
+ const unsigned _skip_ssd[12],const unsigned _rd_scale[5]){
   unsigned               frag_satd[12];
   oc_mv                  lbmvs[4];
   oc_mv                  cbmvs[4];
@@ -1765,8 +2208,6 @@ static void oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
   int                    mapii;
   int                    mapi;
   int                    mv_offs[2];
-  int                    dx;
-  int                    dy;
   int                    pli;
   int                    bi;
   ptrdiff_t              fragi;
@@ -1774,8 +2215,9 @@ static void oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
   int                    bits0;
   int                    bits1;
   unsigned               satd;
+  int                    dc;
   src=_enc->state.ref_frame_data[OC_FRAME_IO];
-  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]];
+  ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
   ystride=_enc->state.ref_ystride[0];
   frag_buf_offs=_enc->state.frag_buf_offs;
   frag_mvs=_enc->state.frag_mvs;
@@ -1783,41 +2225,36 @@ static void oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
   _modec->rate=_modec->ssd=0;
   for(bi=0;bi<4;bi++){
     fragi=mb_map[0][bi];
-    dx=_mv[bi][0];
-    dy=_mv[bi][1];
     /*Save the block MVs as the current ones while we're here; we'll replace
        them if we don't ultimately choose 4MV mode.*/
-    frag_mvs[fragi][0]=(signed char)dx;
-    frag_mvs[fragi][1]=(signed char)dy;
+    frag_mvs[fragi]=_mv[bi];
     frag_offs=frag_buf_offs[fragi];
-    if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,dx,dy)>1){
-      satd=oc_enc_frag_satd2_thresh(_enc,src+frag_offs,
-       ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
+    if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,_mv[bi])>1){
+      satd=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
+       ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
     }
     else{
-      satd=oc_enc_frag_satd_thresh(_enc,src+frag_offs,
-       ref+frag_offs+mv_offs[0],ystride,UINT_MAX);
+      satd=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
+       ref+frag_offs+mv_offs[0],ystride);
     }
-    frag_satd[OC_MB_PHASE[_mbi&3][bi]]=satd;
+    frag_satd[OC_MB_PHASE[_mbi&3][bi]]=satd+abs(dc);
   }
   oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,
-   _enc->vp3_compatible?OC_NOSKIP:_skip_ssd,1);
+   _enc->vp3_compatible?OC_NOSKIP:_skip_ssd,_rd_scale,1);
   /*Figure out which blocks are being skipped and give them (0,0) MVs.*/
   bits0=0;
   bits1=0;
   nqis=_enc->state.nqis;
   for(bi=0;bi<4;bi++){
-    if(_modec->qii[OC_MB_PHASE[_mbi&3][bi]]>=nqis){
-      memset(lbmvs+bi,0,sizeof(*lbmvs));
-    }
+    if(_modec->qii[OC_MB_PHASE[_mbi&3][bi]]>=nqis)lbmvs[bi]=0;
     else{
-      memcpy(lbmvs+bi,_mv+bi,sizeof(*lbmvs));
-      bits0+=OC_MV_BITS[0][_mv[bi][0]+31]+OC_MV_BITS[0][_mv[bi][1]+31];
+      lbmvs[bi]=_mv[bi];
+      bits0+=OC_MV_BITS[0][OC_MV_X(_mv[bi])+31]
+       +OC_MV_BITS[0][OC_MV_Y(_mv[bi])+31];
       bits1+=12;
     }
   }
-  (*OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt])(cbmvs,
-   (const oc_mv *)lbmvs);
+  (*OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt])(cbmvs,lbmvs);
   map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
   map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
   /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
@@ -1827,23 +2264,22 @@ static void oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
     pli=mapi>>2;
     bi=mapi&3;
     fragi=mb_map[pli][bi];
-    dx=cbmvs[bi][0];
-    dy=cbmvs[bi][1];
     frag_offs=frag_buf_offs[fragi];
     /*TODO: We could save half these calls by re-using the results for the Cb
        and Cr planes; is it worth it?*/
-    if(oc_state_get_mv_offsets(&_enc->state,mv_offs,pli,dx,dy)>1){
-      satd=oc_enc_frag_satd2_thresh(_enc,src+frag_offs,
-       ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
+    if(oc_state_get_mv_offsets(&_enc->state,mv_offs,pli,cbmvs[bi])>1){
+      satd=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
+       ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
     }
     else{
-      satd=oc_enc_frag_satd_thresh(_enc,src+frag_offs,
-       ref+frag_offs+mv_offs[0],ystride,UINT_MAX);
+      satd=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
+       ref+frag_offs+mv_offs[0],ystride);
     }
-    frag_satd[mapii]=satd;
+    frag_satd[mapii]=satd+abs(dc);
   }
-  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,1);
-  _modec->overhead+=
+  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
+   frag_satd,_skip_ssd,_rd_scale[4],1);
+  _modec->overhead=
    oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTER_MV_FOUR)
    +OC_MINI(_enc->mv_bits[0]+bits0,_enc->mv_bits[1]+bits1)
    -OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE;
@@ -1852,12 +2288,18 @@ static void oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
 
 int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
   oc_set_chroma_mvs_func  set_chroma_mvs;
-  oc_enc_pipeline_state   pipe;
   oc_qii_state            intra_luma_qs;
   oc_mv                   last_mv;
   oc_mv                   prior_mv;
   ogg_int64_t             interbits;
   ogg_int64_t             intrabits;
+  ogg_int64_t             activity_sum;
+  ogg_int64_t             luma_sum;
+  unsigned                activity_avg;
+  unsigned                luma_avg;
+  const ogg_uint16_t     *chroma_rd_scale;
+  ogg_uint16_t           *mcu_rd_scale;
+  ogg_uint16_t           *mcu_rd_iscale;
   const unsigned char    *map_idxs;
   int                     nmap_idxs;
   unsigned               *coded_mbis;
@@ -1871,30 +2313,36 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
   oc_mb_enc_info         *embs;
   oc_fragment            *frags;
   oc_mv                  *frag_mvs;
-  int                     qi;
   unsigned                stripe_sby;
   unsigned                mcu_nvsbs;
   int                     notstart;
   int                     notdone;
-  int                     vdec;
   unsigned                sbi;
   unsigned                sbi_end;
   int                     refi;
   int                     pli;
+  int                     sp_level;
+  sp_level=_enc->sp_level;
   set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt];
   _enc->state.frame_type=OC_INTER_FRAME;
   oc_mode_scheme_chooser_reset(&_enc->chooser);
   oc_enc_tokenize_start(_enc);
-  oc_enc_pipeline_init(_enc,&pipe);
+  oc_enc_pipeline_init(_enc,&_enc->pipe);
+  oc_enc_mode_rd_init(_enc);
   if(_allow_keyframe)oc_qii_state_init(&intra_luma_qs);
   _enc->mv_bits[0]=_enc->mv_bits[1]=0;
   interbits=intrabits=0;
-  last_mv[0]=last_mv[1]=prior_mv[0]=prior_mv[1]=0;
+  activity_sum=luma_sum=0;
+  activity_avg=_enc->activity_avg;
+  luma_avg=OC_CLAMPI(90<<8,_enc->luma_avg,160<<8);
+  chroma_rd_scale=_enc->chroma_rd_scale[OC_INTER_FRAME][_enc->state.qis[0]];
+  mcu_rd_scale=_enc->mcu_rd_scale;
+  mcu_rd_iscale=_enc->mcu_rd_iscale;
+  last_mv=prior_mv=0;
   /*Choose MVs and MB modes and quantize and code luma.
     Must be done in Hilbert order.*/
   map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
   nmap_idxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
-  qi=_enc->state.qis[0];
   coded_mbis=_enc->coded_mbis;
   uncoded_mbis=coded_mbis+_enc->state.nmbs;
   ncoded_mbis=0;
@@ -1909,37 +2357,51 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
   embs=_enc->mb_info;
   frags=_enc->state.frags;
   frag_mvs=_enc->state.frag_mvs;
-  vdec=!(_enc->state.info.pixel_fmt&2);
   notstart=0;
   notdone=1;
   mcu_nvsbs=_enc->mcu_nvsbs;
   for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){
-    notdone=oc_enc_pipeline_set_stripe(_enc,&pipe,stripe_sby);
-    sbi_end=pipe.sbi_end[0];
-    for(sbi=pipe.sbi0[0];sbi<sbi_end;sbi++){
+    ptrdiff_t cfroffset;
+    notdone=oc_enc_pipeline_set_stripe(_enc,&_enc->pipe,stripe_sby);
+    sbi_end=_enc->pipe.sbi_end[0];
+    cfroffset=_enc->pipe.froffset[1];
+    for(sbi=_enc->pipe.sbi0[0];sbi<sbi_end;sbi++){
       int quadi;
       /*Mode addressing is through Y plane, always 4 MB per SB.*/
       for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
         oc_mode_choice modes[8];
+        unsigned       activity[4];
+        unsigned       rd_scale[5];
+        unsigned       rd_iscale[5];
         unsigned       skip_ssd[12];
         unsigned       intra_satd[12];
+        unsigned       luma;
         int            mb_mv_bits_0;
         int            mb_gmv_bits_0;
         int            inter_mv_pref;
         int            mb_mode;
-        int            dx;
-        int            dy;
+        int            refi;
+        int            mv;
         unsigned       mbi;
         int            mapii;
         int            mapi;
         int            bi;
         ptrdiff_t      fragi;
         mbi=sbi<<2|quadi;
+        luma=oc_mb_intra_satd(_enc,mbi,intra_satd);
+        /*Activity masking.*/
+        if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
+          oc_mb_activity(_enc,mbi,activity);
+        }
+        else oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
+        luma_sum+=luma;
+        activity_sum+=oc_mb_masking(rd_scale,rd_iscale,
+         chroma_rd_scale,activity,activity_avg,luma,luma_avg);
         /*Motion estimation:
           We always do a basic 1MV search for all macroblocks, coded or not,
            keyframe or not.*/
-        if(!_recode&&_enc->sp_level<OC_SP_LEVEL_NOMC)oc_mcenc_search(_enc,mbi);
-        dx=dy=0;
+        if(!_recode&&sp_level<OC_SP_LEVEL_NOMC)oc_mcenc_search(_enc,mbi);
+        mv=0;
         /*Find the block choice with the lowest estimated coding cost.
           If a Cb or Cr block is coded but no Y' block from a macro block then
            the mode MUST be OC_MODE_INTER_NOMV.
@@ -1948,15 +2410,16 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
         /*Block coding cost is estimated from correlated SATD metrics.*/
         /*At this point, all blocks that are in frame are still marked coded.*/
         if(!_recode){
-          memcpy(embs[mbi].unref_mv,
-           embs[mbi].analysis_mv[0],sizeof(embs[mbi].unref_mv));
+          embs[mbi].unref_mv[OC_FRAME_GOLD]=
+           embs[mbi].analysis_mv[0][OC_FRAME_GOLD];
+          embs[mbi].unref_mv[OC_FRAME_PREV]=
+           embs[mbi].analysis_mv[0][OC_FRAME_PREV];
           embs[mbi].refined=0;
         }
-        oc_mb_intra_satd(_enc,mbi,intra_satd);
         /*Estimate the cost of coding this MB in a keyframe.*/
         if(_allow_keyframe){
           oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
-           pipe.fr+0,&intra_luma_qs,intra_satd,OC_NOSKIP);
+           _enc->pipe.fr+0,&intra_luma_qs,intra_satd,OC_NOSKIP,rd_scale);
           intrabits+=modes[OC_MODE_INTRA].rate;
           for(bi=0;bi<4;bi++){
             oc_qii_state_advance(&intra_luma_qs,&intra_luma_qs,
@@ -1964,26 +2427,28 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
           }
         }
         /*Estimate the cost in a delta frame for various modes.*/
-        oc_skip_cost(_enc,&pipe,mbi,skip_ssd);
-        oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
-         OC_MODE_INTER_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd);
-        if(_enc->sp_level<OC_SP_LEVEL_NOMC){
+        oc_skip_cost(_enc,&_enc->pipe,mbi,rd_scale,skip_ssd);
+        if(sp_level<OC_SP_LEVEL_NOMC){
+          oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
+           OC_MODE_INTER_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
+           skip_ssd,rd_scale);
           oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
-           pipe.fr+0,pipe.qs+0,intra_satd,skip_ssd);
+           _enc->pipe.fr+0,_enc->pipe.qs+0,intra_satd,skip_ssd,rd_scale);
           mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
            OC_MODE_INTER_MV,embs[mbi].unref_mv[OC_FRAME_PREV],
-           pipe.fr+0,pipe.qs+0,skip_ssd);
+           _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
           oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST,mbi,
-           OC_MODE_INTER_MV_LAST,last_mv,pipe.fr+0,pipe.qs+0,skip_ssd);
+           OC_MODE_INTER_MV_LAST,last_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
+           skip_ssd,rd_scale);
           oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST2,mbi,
-           OC_MODE_INTER_MV_LAST2,prior_mv,pipe.fr+0,pipe.qs+0,skip_ssd);
-          oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
-           embs[mbi].block_mv,pipe.fr+0,pipe.qs+0,skip_ssd);
+           OC_MODE_INTER_MV_LAST2,prior_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
+           skip_ssd,rd_scale);
           oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
-           OC_MODE_GOLDEN_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd);
+           OC_MODE_GOLDEN_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
+           skip_ssd,rd_scale);
           mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
            OC_MODE_GOLDEN_MV,embs[mbi].unref_mv[OC_FRAME_GOLD],
-           pipe.fr+0,pipe.qs+0,skip_ssd);
+           _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
           /*The explicit MV modes (2,6,7) have not yet gone through halfpel
              refinement.
             We choose the explicit MV mode that's already furthest ahead on
@@ -1991,6 +2456,14 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
             We have to be careful to remember which ones we've refined so that
              we don't refine it again if we re-encode this frame.*/
           inter_mv_pref=_enc->lambda*3;
+          if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
+            oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
+             embs[mbi].block_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
+             skip_ssd,rd_scale);
+          }
+          else{
+            modes[OC_MODE_INTER_MV_FOUR].cost=UINT_MAX;
+          }
           if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_INTER_MV].cost&&
            modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_GOLDEN_MV].cost){
             if(!(embs[mbi].refined&0x80)){
@@ -1998,7 +2471,8 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
               embs[mbi].refined|=0x80;
             }
             oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
-             embs[mbi].ref_mv,pipe.fr+0,pipe.qs+0,skip_ssd);
+             embs[mbi].ref_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
+             skip_ssd,rd_scale);
           }
           else if(modes[OC_MODE_GOLDEN_MV].cost+inter_mv_pref<
            modes[OC_MODE_INTER_MV].cost){
@@ -2008,7 +2482,7 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
             }
             mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
              OC_MODE_GOLDEN_MV,embs[mbi].analysis_mv[0][OC_FRAME_GOLD],
-             pipe.fr+0,pipe.qs+0,skip_ssd);
+             _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
           }
           if(!(embs[mbi].refined&0x04)){
             oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_PREV);
@@ -2016,7 +2490,7 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
           }
           mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
            OC_MODE_INTER_MV,embs[mbi].analysis_mv[0][OC_FRAME_PREV],
-           pipe.fr+0,pipe.qs+0,skip_ssd);
+           _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
           /*Finally, pick the mode with the cheapest estimated R-D cost.*/
           mb_mode=OC_MODE_INTER_NOMV;
           if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
@@ -2046,8 +2520,14 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
           }
         }
         else{
+          oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
+           OC_MODE_INTER_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
+           skip_ssd,rd_scale);
+          oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
+           _enc->pipe.fr+0,_enc->pipe.qs+0,intra_satd,skip_ssd,rd_scale);
           oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
-           OC_MODE_GOLDEN_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd);
+           OC_MODE_GOLDEN_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
+           skip_ssd,rd_scale);
           mb_mode=OC_MODE_INTER_NOMV;
           if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
             mb_mode=OC_MODE_INTRA;
@@ -2062,67 +2542,55 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
         if(mb_mode!=OC_MODE_INTER_MV_FOUR){
           switch(mb_mode){
             case OC_MODE_INTER_MV:{
-              dx=embs[mbi].analysis_mv[0][OC_FRAME_PREV][0];
-              dy=embs[mbi].analysis_mv[0][OC_FRAME_PREV][1];
-            }break;
-            case OC_MODE_INTER_MV_LAST:{
-              dx=last_mv[0];
-              dy=last_mv[1];
-            }break;
-            case OC_MODE_INTER_MV_LAST2:{
-              dx=prior_mv[0];
-              dy=prior_mv[1];
+              mv=embs[mbi].analysis_mv[0][OC_FRAME_PREV];
             }break;
+            case OC_MODE_INTER_MV_LAST:mv=last_mv;break;
+            case OC_MODE_INTER_MV_LAST2:mv=prior_mv;break;
             case OC_MODE_GOLDEN_MV:{
-              dx=embs[mbi].analysis_mv[0][OC_FRAME_GOLD][0];
-              dy=embs[mbi].analysis_mv[0][OC_FRAME_GOLD][1];
+              mv=embs[mbi].analysis_mv[0][OC_FRAME_GOLD];
             }break;
           }
           for(bi=0;bi<4;bi++){
             fragi=mb_maps[mbi][0][bi];
-            frag_mvs[fragi][0]=(signed char)dx;
-            frag_mvs[fragi][1]=(signed char)dy;
+            frag_mvs[fragi]=mv;
           }
         }
         for(bi=0;bi<4;bi++){
           fragi=sb_maps[mbi>>2][mbi&3][bi];
           frags[fragi].qii=modes[mb_mode].qii[bi];
         }
-        if(oc_enc_mb_transform_quantize_luma(_enc,&pipe,mbi,
-         modes[mb_mode].overhead>>OC_BIT_SCALE)>0){
+        if(oc_enc_mb_transform_quantize_inter_luma(_enc,&_enc->pipe,mbi,
+         modes[mb_mode].overhead>>OC_BIT_SCALE,rd_scale,rd_iscale)>0){
           int orig_mb_mode;
           orig_mb_mode=mb_mode;
           mb_mode=mb_modes[mbi];
+          refi=OC_FRAME_FOR_MODE(mb_mode);
           switch(mb_mode){
             case OC_MODE_INTER_MV:{
-              memcpy(prior_mv,last_mv,sizeof(prior_mv));
+              prior_mv=last_mv;
               /*If we're backing out from 4MV, find the MV we're actually
                  using.*/
               if(orig_mb_mode==OC_MODE_INTER_MV_FOUR){
                 for(bi=0;;bi++){
                   fragi=mb_maps[mbi][0][bi];
                   if(frags[fragi].coded){
-                    memcpy(last_mv,frag_mvs[fragi],sizeof(last_mv));
-                    dx=frag_mvs[fragi][0];
-                    dy=frag_mvs[fragi][1];
+                    mv=last_mv=frag_mvs[fragi];
                     break;
                   }
                 }
-                mb_mv_bits_0=OC_MV_BITS[0][dx+31]+OC_MV_BITS[0][dy+31];
+                mb_mv_bits_0=OC_MV_BITS[0][OC_MV_X(mv)+31]
+                 +OC_MV_BITS[0][OC_MV_Y(mv)+31];
               }
               /*Otherwise we used the original analysis MV.*/
-              else{
-                memcpy(last_mv,
-                 embs[mbi].analysis_mv[0][OC_FRAME_PREV],sizeof(last_mv));
-              }
+              else last_mv=embs[mbi].analysis_mv[0][OC_FRAME_PREV];
               _enc->mv_bits[0]+=mb_mv_bits_0;
               _enc->mv_bits[1]+=12;
             }break;
             case OC_MODE_INTER_MV_LAST2:{
               oc_mv tmp_mv;
-              memcpy(tmp_mv,prior_mv,sizeof(tmp_mv));
-              memcpy(prior_mv,last_mv,sizeof(prior_mv));
-              memcpy(last_mv,tmp_mv,sizeof(last_mv));
+              tmp_mv=prior_mv;
+              prior_mv=last_mv;
+              last_mv=tmp_mv;
             }break;
             case OC_MODE_GOLDEN_MV:{
               _enc->mv_bits[0]+=mb_gmv_bits_0;
@@ -2131,28 +2599,28 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
             case OC_MODE_INTER_MV_FOUR:{
               oc_mv lbmvs[4];
               oc_mv cbmvs[4];
-              memcpy(prior_mv,last_mv,sizeof(prior_mv));
+              prior_mv=last_mv;
               for(bi=0;bi<4;bi++){
                 fragi=mb_maps[mbi][0][bi];
                 if(frags[fragi].coded){
-                  memcpy(last_mv,frag_mvs[fragi],sizeof(last_mv));
-                  memcpy(lbmvs[bi],frag_mvs[fragi],sizeof(lbmvs[bi]));
-                  _enc->mv_bits[0]+=OC_MV_BITS[0][frag_mvs[fragi][0]+31]
-                   +OC_MV_BITS[0][frag_mvs[fragi][1]+31];
+                  lbmvs[bi]=last_mv=frag_mvs[fragi];
+                  _enc->mv_bits[0]+=OC_MV_BITS[0][OC_MV_X(last_mv)+31]
+                   +OC_MV_BITS[0][OC_MV_Y(last_mv)+31];
                   _enc->mv_bits[1]+=12;
                 }
                 /*Replace the block MVs for not-coded blocks with (0,0).*/
-                else memset(lbmvs[bi],0,sizeof(lbmvs[bi]));
+                else lbmvs[bi]=0;
               }
-              (*set_chroma_mvs)(cbmvs,(const oc_mv *)lbmvs);
+              (*set_chroma_mvs)(cbmvs,lbmvs);
               for(mapii=4;mapii<nmap_idxs;mapii++){
                 mapi=map_idxs[mapii];
                 pli=mapi>>2;
                 bi=mapi&3;
                 fragi=mb_maps[mbi][pli][bi];
-                frags[fragi].mb_mode=mb_mode;
                 frags[fragi].qii=modes[OC_MODE_INTER_MV_FOUR].qii[mapii];
-                memcpy(frag_mvs[fragi],cbmvs[bi],sizeof(frag_mvs[fragi]));
+                frags[fragi].refi=refi;
+                frags[fragi].mb_mode=mb_mode;
+                frag_mvs[fragi]=cbmvs[bi];
               }
             }break;
           }
@@ -2163,7 +2631,8 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
         else{
           *(uncoded_mbis-++nuncoded_mbis)=mbi;
           mb_mode=OC_MODE_INTER_NOMV;
-          dx=dy=0;
+          refi=OC_FRAME_PREV;
+          mv=0;
         }
         /*Propagate final MB mode and MVs to the chroma blocks.
           This has already been done for 4MV mode, since it requires individual
@@ -2174,43 +2643,56 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
             pli=mapi>>2;
             bi=mapi&3;
             fragi=mb_maps[mbi][pli][bi];
-            frags[fragi].mb_mode=mb_mode;
             /*If we switched from 4MV mode to INTER_MV mode, then the qii
                values won't have been chosen with the right MV, but it's
                probaby not worth re-estimating them.*/
             frags[fragi].qii=modes[mb_mode].qii[mapii];
-            frag_mvs[fragi][0]=(signed char)dx;
-            frag_mvs[fragi][1]=(signed char)dy;
+            frags[fragi].refi=refi;
+            frags[fragi].mb_mode=mb_mode;
+            frag_mvs[fragi]=mv;
           }
         }
+        /*Save masking scale factors for chroma blocks.*/
+        for(mapii=4;mapii<(nmap_idxs-4>>1)+4;mapii++){
+          mapi=map_idxs[mapii];
+          bi=mapi&3;
+          fragi=mb_maps[mbi][1][bi];
+          mcu_rd_scale[fragi-cfroffset]=(ogg_uint16_t)rd_scale[4];
+          mcu_rd_iscale[fragi-cfroffset]=(ogg_uint16_t)rd_iscale[4];
+        }
       }
-      oc_fr_state_flush_sb(pipe.fr+0);
-      sb_flags[sbi].coded_fully=pipe.fr[0].sb_full;
-      sb_flags[sbi].coded_partially=pipe.fr[0].sb_partial;
+      oc_fr_state_flush_sb(_enc->pipe.fr+0);
+      sb_flags[sbi].coded_fully=_enc->pipe.fr[0].sb_full;
+      sb_flags[sbi].coded_partially=_enc->pipe.fr[0].sb_partial;
     }
-    oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,0,notstart,notdone);
+    oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,0,notstart,notdone);
     /*Code chroma planes.*/
     for(pli=1;pli<3;pli++){
-      oc_enc_sb_transform_quantize_chroma(_enc,&pipe,
-       pli,pipe.sbi0[pli],pipe.sbi_end[pli]);
-      oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,pli,notstart,notdone);
+      oc_enc_sb_transform_quantize_inter_chroma(_enc,&_enc->pipe,
+       pli,_enc->pipe.sbi0[pli],_enc->pipe.sbi_end[pli]);
+      oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,pli,notstart,notdone);
     }
     notstart=1;
   }
+  /*Update the average block activity and MB luma score for the frame.
+    We could use a Bessel follower here, but fast reaction is probably almost
+     always best.*/
+  _enc->activity_avg=OC_MAXI(OC_ACTIVITY_AVG_MIN,
+   (unsigned)((activity_sum+(_enc->state.fplanes[0].nfrags>>1))/
+   _enc->state.fplanes[0].nfrags));
+  _enc->luma_avg=(unsigned)((luma_sum+(_enc->state.nmbs>>1))/_enc->state.nmbs);
   /*Finish filling in the reference frame borders.*/
   refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
   for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
   /*Finish adding flagging overhead costs to inter bit counts to determine if
      we should have coded a key frame instead.*/
   if(_allow_keyframe){
-    if(interbits>intrabits)return 1;
     /*Technically the chroma plane counts are over-estimations, because they
        don't account for continuing runs from the luma planes, but the
-       inaccuracy is small.*/
-    for(pli=0;pli<3;pli++)interbits+=pipe.fr[pli].bits<<OC_BIT_SCALE;
-    interbits+=OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE;
-    interbits+=
-     _enc->chooser.scheme_bits[_enc->chooser.scheme_list[0]]<<OC_BIT_SCALE;
+       inaccuracy is small.
+      We don't need to add the luma plane coding flag costs, because they are
+       already included in the MB rate estimates.*/
+    for(pli=1;pli<3;pli++)interbits+=_enc->pipe.fr[pli].bits<<OC_BIT_SCALE;
     if(interbits>intrabits)return 1;
   }
   _enc->ncoded_mbis=ncoded_mbis;
@@ -2228,482 +2710,3 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
   }
   return 0;
 }
-
-#if defined(OC_COLLECT_METRICS)
-# include <stdio.h>
-# include <math.h>
-
-/*TODO: It may be helpful (for block-level quantizers especially) to separate
-   out the contributions from AC and DC into separate tables.*/
-
-# define OC_ZWEIGHT   (0.25)
-
-static void oc_mode_metrics_add(oc_mode_metrics *_metrics,
- double _w,int _satd,int _rate,double _rmse){
-  double rate;
-  /*Accumulate statistics without the scaling; this lets us change the scale
-     factor yet still use old data.*/
-  rate=ldexp(_rate,-OC_BIT_SCALE);
-  if(_metrics->fragw>0){
-    double dsatd;
-    double drate;
-    double drmse;
-    double w;
-    dsatd=_satd-_metrics->satd/_metrics->fragw;
-    drate=rate-_metrics->rate/_metrics->fragw;
-    drmse=_rmse-_metrics->rmse/_metrics->fragw;
-    w=_metrics->fragw*_w/(_metrics->fragw+_w);
-    _metrics->satd2+=dsatd*dsatd*w;
-    _metrics->satdrate+=dsatd*drate*w;
-    _metrics->rate2+=drate*drate*w;
-    _metrics->satdrmse+=dsatd*drmse*w;
-    _metrics->rmse2+=drmse*drmse*w;
-  }
-  _metrics->fragw+=_w;
-  _metrics->satd+=_satd*_w;
-  _metrics->rate+=rate*_w;
-  _metrics->rmse+=_rmse*_w;
-}
-
-static void oc_mode_metrics_merge(oc_mode_metrics *_dst,
- const oc_mode_metrics *_src,int _n){
-  int i;
-  /*Find a non-empty set of metrics.*/
-  for(i=0;i<_n&&_src[i].fragw<=0;i++);
-  if(i>=_n){
-    memset(_dst,0,sizeof(*_dst));
-    return;
-  }
-  memcpy(_dst,_src+i,sizeof(*_dst));
-  /*And iterate over the remaining non-empty sets of metrics.*/
-  for(i++;i<_n;i++)if(_src[i].fragw>0){
-    double wa;
-    double wb;
-    double dsatd;
-    double drate;
-    double drmse;
-    double w;
-    wa=_dst->fragw;
-    wb=_src[i].fragw;
-    dsatd=_src[i].satd/wb-_dst->satd/wa;
-    drate=_src[i].rate/wb-_dst->rate/wa;
-    drmse=_src[i].rmse/wb-_dst->rmse/wa;
-    w=wa*wb/(wa+wb);
-    _dst->fragw+=_src[i].fragw;
-    _dst->satd+=_src[i].satd;
-    _dst->rate+=_src[i].rate;
-    _dst->rmse+=_src[i].rmse;
-    _dst->satd2+=_src[i].satd2+dsatd*dsatd*w;
-    _dst->satdrate+=_src[i].satdrate+dsatd*drate*w;
-    _dst->rate2+=_src[i].rate2+drate*drate*w;
-    _dst->satdrmse+=_src[i].satdrmse+dsatd*drmse*w;
-    _dst->rmse2+=_src[i].rmse2+drmse*drmse*w;
-  }
-}
-
-/*Compile collected SATD/rate/RMSE metrics into a form that's immediately
-   useful for mode decision.*/
-static void oc_enc_mode_metrics_update(oc_enc_ctx *_enc,int _qi){
-  int pli;
-  int qti;
-  oc_restore_fpu(&_enc->state);
-  /*Convert raw collected data into cleaned up sample points.*/
-  for(pli=0;pli<3;pli++){
-    for(qti=0;qti<2;qti++){
-      double fragw;
-      int    bin0;
-      int    bin1;
-      int    bin;
-      fragw=0;
-      bin0=bin1=0;
-      for(bin=0;bin<OC_SAD_BINS;bin++){
-        oc_mode_metrics metrics;
-        OC_MODE_RD[_qi][pli][qti][bin].rate=0;
-        OC_MODE_RD[_qi][pli][qti][bin].rmse=0;
-        /*Find some points on either side of the current bin.*/
-        while((bin1<bin+1||fragw<OC_ZWEIGHT)&&bin1<OC_SAD_BINS-1){
-          fragw+=OC_MODE_METRICS[_qi][pli][qti][bin1++].fragw;
-        }
-        while(bin0+1<bin&&bin0+1<bin1&&
-         fragw-OC_MODE_METRICS[_qi][pli][qti][bin0].fragw>=OC_ZWEIGHT){
-          fragw-=OC_MODE_METRICS[_qi][pli][qti][bin0++].fragw;
-        }
-        /*Merge statistics and fit lines.*/
-        oc_mode_metrics_merge(&metrics,
-         OC_MODE_METRICS[_qi][pli][qti]+bin0,bin1-bin0);
-        if(metrics.fragw>0&&metrics.satd2>0){
-          double a;
-          double b;
-          double msatd;
-          double mrate;
-          double mrmse;
-          double rate;
-          double rmse;
-          msatd=metrics.satd/metrics.fragw;
-          mrate=metrics.rate/metrics.fragw;
-          mrmse=metrics.rmse/metrics.fragw;
-          /*Compute the points on these lines corresponding to the actual bin
-             value.*/
-          b=metrics.satdrate/metrics.satd2;
-          a=mrate-b*msatd;
-          rate=ldexp(a+b*(bin<<OC_SAD_SHIFT),OC_BIT_SCALE);
-          OC_MODE_RD[_qi][pli][qti][bin].rate=
-           (ogg_int16_t)OC_CLAMPI(-32768,(int)(rate+0.5),32767);
-          b=metrics.satdrmse/metrics.satd2;
-          a=mrmse-b*msatd;
-          rmse=ldexp(a+b*(bin<<OC_SAD_SHIFT),OC_RMSE_SCALE);
-          OC_MODE_RD[_qi][pli][qti][bin].rmse=
-           (ogg_int16_t)OC_CLAMPI(-32768,(int)(rmse+0.5),32767);
-        }
-      }
-    }
-  }
-}
-
-
-
-/*The following token skipping code used to also be used in the decoder (and
-   even at one point other places in the encoder).
-  However, it was obsoleted by other optimizations, and is now only used here.
-  It has been moved here to avoid generating the code when it's not needed.*/
-
-/*Determines the number of blocks or coefficients to be skipped for a given
-   token value.
-  _token:      The token value to skip.
-  _extra_bits: The extra bits attached to this token.
-  Return: A positive value indicates that number of coefficients are to be
-           skipped in the current block.
-          Otherwise, the negative of the return value indicates that number of
-           blocks are to be ended.*/
-typedef ptrdiff_t (*oc_token_skip_func)(int _token,int _extra_bits);
-
-/*Handles the simple end of block tokens.*/
-static ptrdiff_t oc_token_skip_eob(int _token,int _extra_bits){
-  int nblocks_adjust;
-  nblocks_adjust=OC_UNIBBLE_TABLE32(0,1,2,3,7,15,0,0,_token)+1;
-  return -_extra_bits-nblocks_adjust;
-}
-
-/*The last EOB token has a special case, where an EOB run of size zero ends all
-   the remaining blocks in the frame.*/
-static ptrdiff_t oc_token_skip_eob6(int _token,int _extra_bits){
-  /*Note: We want to return -PTRDIFF_MAX, but that requires C99, which is not
-     yet available everywhere; this should be equivalent.*/
-  if(!_extra_bits)return -(~(size_t)0>>1);
-  return -_extra_bits;
-}
-
-/*Handles the pure zero run tokens.*/
-static ptrdiff_t oc_token_skip_zrl(int _token,int _extra_bits){
-  return _extra_bits+1;
-}
-
-/*Handles a normal coefficient value token.*/
-static ptrdiff_t oc_token_skip_val(void){
-  return 1;
-}
-
-/*Handles a category 1A zero run/coefficient value combo token.*/
-static ptrdiff_t oc_token_skip_run_cat1a(int _token){
-  return _token-OC_DCT_RUN_CAT1A+2;
-}
-
-/*Handles category 1b, 1c, 2a, and 2b zero run/coefficient value combo tokens.*/
-static ptrdiff_t oc_token_skip_run(int _token,int _extra_bits){
-  int run_cati;
-  int ncoeffs_mask;
-  int ncoeffs_adjust;
-  run_cati=_token-OC_DCT_RUN_CAT1B;
-  ncoeffs_mask=OC_BYTE_TABLE32(3,7,0,1,run_cati);
-  ncoeffs_adjust=OC_BYTE_TABLE32(7,11,2,3,run_cati);
-  return (_extra_bits&ncoeffs_mask)+ncoeffs_adjust;
-}
-
-/*A jump table for computing the number of coefficients or blocks to skip for
-   a given token value.
-  This reduces all the conditional branches, etc., needed to parse these token
-   values down to one indirect jump.*/
-static const oc_token_skip_func OC_TOKEN_SKIP_TABLE[TH_NDCT_TOKENS]={
-  oc_token_skip_eob,
-  oc_token_skip_eob,
-  oc_token_skip_eob,
-  oc_token_skip_eob,
-  oc_token_skip_eob,
-  oc_token_skip_eob,
-  oc_token_skip_eob6,
-  oc_token_skip_zrl,
-  oc_token_skip_zrl,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_run_cat1a,
-  (oc_token_skip_func)oc_token_skip_run_cat1a,
-  (oc_token_skip_func)oc_token_skip_run_cat1a,
-  (oc_token_skip_func)oc_token_skip_run_cat1a,
-  (oc_token_skip_func)oc_token_skip_run_cat1a,
-  oc_token_skip_run,
-  oc_token_skip_run,
-  oc_token_skip_run,
-  oc_token_skip_run
-};
-
-/*Determines the number of blocks or coefficients to be skipped for a given
-   token value.
-  _token:      The token value to skip.
-  _extra_bits: The extra bits attached to this token.
-  Return: A positive value indicates that number of coefficients are to be
-           skipped in the current block.
-          Otherwise, the negative of the return value indicates that number of
-           blocks are to be ended.
-          0 will never be returned, so that at least one coefficient in one
-           block will always be decoded for every token.*/
-static ptrdiff_t oc_dct_token_skip(int _token,int _extra_bits){
-  return (*OC_TOKEN_SKIP_TABLE[_token])(_token,_extra_bits);
-}
-
-
-
-void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc){
-  static const unsigned char OC_ZZI_HUFF_OFFSET[64]={
-     0,16,16,16,16,16,32,32,
-    32,32,32,32,32,32,32,48,
-    48,48,48,48,48,48,48,48,
-    48,48,48,48,64,64,64,64,
-    64,64,64,64,64,64,64,64,
-    64,64,64,64,64,64,64,64,
-    64,64,64,64,64,64,64,64
-  };
-  const oc_fragment *frags;
-  const unsigned    *frag_satd;
-  const unsigned    *frag_ssd;
-  const ptrdiff_t   *coded_fragis;
-  ptrdiff_t          ncoded_fragis;
-  ptrdiff_t          fragii;
-  double             fragw;
-  int                qti;
-  int                qii;
-  int                qi;
-  int                pli;
-  int                zzi;
-  int                token;
-  int                eb;
-  oc_restore_fpu(&_enc->state);
-  /*Load any existing mode metrics if we haven't already.*/
-  if(!oc_has_mode_metrics){
-    FILE *fmetrics;
-    memset(OC_MODE_METRICS,0,sizeof(OC_MODE_METRICS));
-    fmetrics=fopen("modedec.stats","rb");
-    if(fmetrics!=NULL){
-      fread(OC_MODE_METRICS,sizeof(OC_MODE_METRICS),1,fmetrics);
-      fclose(fmetrics);
-    }
-    for(qi=0;qi<64;qi++)oc_enc_mode_metrics_update(_enc,qi);
-    oc_has_mode_metrics=1;
-  }
-  qti=_enc->state.frame_type;
-  frags=_enc->state.frags;
-  frag_satd=_enc->frag_satd;
-  frag_ssd=_enc->frag_ssd;
-  coded_fragis=_enc->state.coded_fragis;
-  ncoded_fragis=fragii=0;
-  /*Weight the fragments by the inverse frame size; this prevents HD content
-     from dominating the statistics.*/
-  fragw=1.0/_enc->state.nfrags;
-  for(pli=0;pli<3;pli++){
-    ptrdiff_t ti[64];
-    int       eob_token[64];
-    int       eob_run[64];
-    /*Set up token indices and eob run counts.
-      We don't bother trying to figure out the real cost of the runs that span
-       coefficients; instead we use the costs that were available when R-D
-       token optimization was done.*/
-    for(zzi=0;zzi<64;zzi++){
-      ti[zzi]=_enc->dct_token_offs[pli][zzi];
-      if(ti[zzi]>0){
-        token=_enc->dct_tokens[pli][zzi][0];
-        eb=_enc->extra_bits[pli][zzi][0];
-        eob_token[zzi]=token;
-        eob_run[zzi]=-oc_dct_token_skip(token,eb);
-      }
-      else{
-        eob_token[zzi]=OC_NDCT_EOB_TOKEN_MAX;
-        eob_run[zzi]=0;
-      }
-    }
-    /*Scan the list of coded fragments for this plane.*/
-    ncoded_fragis+=_enc->state.ncoded_fragis[pli];
-    for(;fragii<ncoded_fragis;fragii++){
-      ptrdiff_t    fragi;
-      ogg_uint32_t frag_bits;
-      int          huffi;
-      int          skip;
-      int          mb_mode;
-      unsigned     satd;
-      int          bin;
-      fragi=coded_fragis[fragii];
-      frag_bits=0;
-      for(zzi=0;zzi<64;){
-        if(eob_run[zzi]>0){
-          /*We've reached the end of the block.*/
-          eob_run[zzi]--;
-          break;
-        }
-        huffi=_enc->huff_idxs[qti][zzi>0][pli+1>>1]
-         +OC_ZZI_HUFF_OFFSET[zzi];
-        if(eob_token[zzi]<OC_NDCT_EOB_TOKEN_MAX){
-          /*This token caused an EOB run to be flushed.
-            Therefore it gets the bits associated with it.*/
-          frag_bits+=_enc->huff_codes[huffi][eob_token[zzi]].nbits
-           +OC_DCT_TOKEN_EXTRA_BITS[eob_token[zzi]];
-          eob_token[zzi]=OC_NDCT_EOB_TOKEN_MAX;
-        }
-        token=_enc->dct_tokens[pli][zzi][ti[zzi]];
-        eb=_enc->extra_bits[pli][zzi][ti[zzi]];
-        ti[zzi]++;
-        skip=oc_dct_token_skip(token,eb);
-        if(skip<0){
-          eob_token[zzi]=token;
-          eob_run[zzi]=-skip;
-        }
-        else{
-          /*A regular DCT value token; accumulate the bits for it.*/
-          frag_bits+=_enc->huff_codes[huffi][token].nbits
-           +OC_DCT_TOKEN_EXTRA_BITS[token];
-          zzi+=skip;
-        }
-      }
-      mb_mode=frags[fragi].mb_mode;
-      qi=_enc->state.qis[frags[fragi].qii];
-      satd=frag_satd[fragi]<<(pli+1&2);
-      bin=OC_MINI(satd>>OC_SAD_SHIFT,OC_SAD_BINS-1);
-      oc_mode_metrics_add(OC_MODE_METRICS[qi][pli][mb_mode!=OC_MODE_INTRA]+bin,
-       fragw,satd,frag_bits<<OC_BIT_SCALE,sqrt(frag_ssd[fragi]));
-    }
-  }
-  /*Update global SATD/rate/RMSE estimation matrix.*/
-  for(qii=0;qii<_enc->state.nqis;qii++){
-    oc_enc_mode_metrics_update(_enc,_enc->state.qis[qii]);
-  }
-}
-
-void oc_enc_mode_metrics_dump(oc_enc_ctx *_enc){
-  FILE *fmetrics;
-  int   qi;
-  /*Generate sample points for complete list of QI values.*/
-  for(qi=0;qi<64;qi++)oc_enc_mode_metrics_update(_enc,qi);
-  fmetrics=fopen("modedec.stats","wb");
-  if(fmetrics!=NULL){
-    fwrite(OC_MODE_METRICS,sizeof(OC_MODE_METRICS),1,fmetrics);
-    fclose(fmetrics);
-  }
-  fprintf(stdout,
-   "/*File generated by libtheora with OC_COLLECT_METRICS"
-   " defined at compile time.*/\n"
-   "#if !defined(_modedec_H)\n"
-   "# define _modedec_H (1)\n"
-   "\n"
-   "\n"
-   "\n"
-   "# if defined(OC_COLLECT_METRICS)\n"
-   "typedef struct oc_mode_metrics oc_mode_metrics;\n"
-   "# endif\n"
-   "typedef struct oc_mode_rd      oc_mode_rd;\n"
-   "\n"
-   "\n"
-   "\n"
-   "/*The number of extra bits of precision at which to store rate"
-   " metrics.*/\n"
-   "# define OC_BIT_SCALE  (%i)\n"
-   "/*The number of extra bits of precision at which to store RMSE metrics.\n"
-   "  This must be at least half OC_BIT_SCALE (rounded up).*/\n"
-   "# define OC_RMSE_SCALE (%i)\n"
-   "/*The number of bins to partition statistics into.*/\n"
-   "# define OC_SAD_BINS   (%i)\n"
-   "/*The number of bits of precision to drop"
-   " from SAD scores to assign them to a\n"
-   "   bin.*/\n"
-   "# define OC_SAD_SHIFT  (%i)\n"
-   "\n"
-   "\n"
-   "\n"
-   "# if defined(OC_COLLECT_METRICS)\n"
-   "struct oc_mode_metrics{\n"
-   "  double fragw;\n"
-   "  double satd;\n"
-   "  double rate;\n"
-   "  double rmse;\n"
-   "  double satd2;\n"
-   "  double satdrate;\n"
-   "  double rate2;\n"
-   "  double satdrmse;\n"
-   "  double rmse2;\n"
-   "};\n"
-   "\n"
-   "\n"
-   "int             oc_has_mode_metrics;\n"
-   "oc_mode_metrics OC_MODE_METRICS[64][3][2][OC_SAD_BINS];\n"
-   "# endif\n"
-   "\n"
-   "\n"
-   "\n"
-   "struct oc_mode_rd{\n"
-   "  ogg_int16_t rate;\n"
-   "  ogg_int16_t rmse;\n"
-   "};\n"
-   "\n"
-   "\n"
-   "# if !defined(OC_COLLECT_METRICS)\n"
-   "static const\n"
-   "# endif\n"
-   "oc_mode_rd OC_MODE_RD[64][3][2][OC_SAD_BINS]={\n",
-   OC_BIT_SCALE,OC_RMSE_SCALE,OC_SAD_BINS,OC_SAD_SHIFT);
-  for(qi=0;qi<64;qi++){
-    int pli;
-    fprintf(stdout,"  {\n");
-    for(pli=0;pli<3;pli++){
-      int qti;
-      fprintf(stdout,"    {\n");
-      for(qti=0;qti<2;qti++){
-        int bin;
-        static const char *pl_names[3]={"Y'","Cb","Cr"};
-        static const char *qti_names[2]={"INTRA","INTER"};
-        fprintf(stdout,"      /*%s  qi=%i  %s*/\n",
-         pl_names[pli],qi,qti_names[qti]);
-        fprintf(stdout,"      {\n");
-        fprintf(stdout,"        ");
-        for(bin=0;bin<OC_SAD_BINS;bin++){
-          if(bin&&!(bin&0x3))fprintf(stdout,"\n        ");
-          fprintf(stdout,"{%5i,%5i}",
-           OC_MODE_RD[qi][pli][qti][bin].rate,
-           OC_MODE_RD[qi][pli][qti][bin].rmse);
-          if(bin+1<OC_SAD_BINS)fprintf(stdout,",");
-        }
-        fprintf(stdout,"\n      }");
-        if(qti<1)fprintf(stdout,",");
-        fprintf(stdout,"\n");
-      }
-      fprintf(stdout,"    }");
-      if(pli<2)fprintf(stdout,",");
-      fprintf(stdout,"\n");
-    }
-    fprintf(stdout,"  }");
-    if(qi<63)fprintf(stdout,",");
-    fprintf(stdout,"\n");
-  }
-  fprintf(stdout,
-   "};\n"
-   "\n"
-   "#endif\n");
-}
-#endif
diff --git a/thirdparty/libtheora/apiwrapper.c b/thirdparty/libtheora/apiwrapper.c
index dc959b8d13..87b4e939f2 100644
--- a/thirdparty/libtheora/apiwrapper.c
+++ b/thirdparty/libtheora/apiwrapper.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: apiwrapper.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
diff --git a/thirdparty/libtheora/apiwrapper.h b/thirdparty/libtheora/apiwrapper.h
index 93454d7bda..ff45e0a4d6 100644
--- a/thirdparty/libtheora/apiwrapper.h
+++ b/thirdparty/libtheora/apiwrapper.h
@@ -21,7 +21,7 @@
 # include <theora/theora.h>
 # include "theora/theoradec.h"
 # include "theora/theoraenc.h"
-# include "internal.h"
+# include "state.h"
 
 typedef struct th_api_wrapper th_api_wrapper;
 typedef struct th_api_info    th_api_info;
diff --git a/thirdparty/libtheora/bitpack.c b/thirdparty/libtheora/bitpack.c
index 8195003bad..5125dde6b0 100644
--- a/thirdparty/libtheora/bitpack.c
+++ b/thirdparty/libtheora/bitpack.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function: packing variable sized words into an octet stream
-  last mod: $Id: bitpack.c 16503 2009-08-22 18:14:02Z giles $
+  last mod: $Id$
 
  ********************************************************************/
 #include <string.h>
@@ -32,15 +32,18 @@ static oc_pb_window oc_pack_refill(oc_pack_buf *_b,int _bits){
   const unsigned char *stop;
   oc_pb_window         window;
   int                  available;
+  unsigned             shift;
+  stop=_b->stop;
+  ptr=_b->ptr;
   window=_b->window;
   available=_b->bits;
-  ptr=_b->ptr;
-  stop=_b->stop;
-  while(available<=OC_PB_WINDOW_SIZE-8&&ptr<stop){
-    available+=8;
-    window|=(oc_pb_window)*ptr++<<OC_PB_WINDOW_SIZE-available;
+  shift=OC_PB_WINDOW_SIZE-available;
+  while(7<shift&&ptr<stop){
+    shift-=8;
+    window|=(oc_pb_window)*ptr++<<shift;
   }
   _b->ptr=ptr;
+  available=OC_PB_WINDOW_SIZE-shift;
   if(_bits>available){
     if(ptr>=stop){
       _b->eof=1;
@@ -67,7 +70,7 @@ void oc_pack_adv1(oc_pack_buf *_b){
 }
 
 /*Here we assume that 0<=_bits&&_bits<=32.*/
-long oc_pack_read(oc_pack_buf *_b,int _bits){
+long oc_pack_read_c(oc_pack_buf *_b,int _bits){
   oc_pb_window window;
   int          available;
   long         result;
@@ -82,12 +85,12 @@ long oc_pack_read(oc_pack_buf *_b,int _bits){
   available-=_bits;
   window<<=1;
   window<<=_bits-1;
-  _b->bits=available;
   _b->window=window;
+  _b->bits=available;
   return result;
 }
 
-int oc_pack_read1(oc_pack_buf *_b){
+int oc_pack_read1_c(oc_pack_buf *_b){
   oc_pb_window window;
   int          available;
   int          result;
@@ -100,8 +103,8 @@ int oc_pack_read1(oc_pack_buf *_b){
   result=window>>OC_PB_WINDOW_SIZE-1;
   available--;
   window<<=1;
-  _b->bits=available;
   _b->window=window;
+  _b->bits=available;
   return result;
 }
 
diff --git a/thirdparty/libtheora/bitpack.h b/thirdparty/libtheora/bitpack.h
index a020a292f5..237b584055 100644
--- a/thirdparty/libtheora/bitpack.h
+++ b/thirdparty/libtheora/bitpack.h
@@ -16,15 +16,32 @@
  ********************************************************************/
 #if !defined(_bitpack_H)
 # define _bitpack_H (1)
+# include <stddef.h>
 # include <limits.h>
+# include "internal.h"
 
 
 
-typedef unsigned long      oc_pb_window;
+typedef size_t             oc_pb_window;
 typedef struct oc_pack_buf oc_pack_buf;
 
 
 
+/*Custom bitpacker implementations.*/
+# if defined(OC_ARM_ASM)
+#  include "arm/armbits.h"
+# endif
+
+# if !defined(oc_pack_read)
+#  define oc_pack_read oc_pack_read_c
+# endif
+# if !defined(oc_pack_read1)
+#  define oc_pack_read1 oc_pack_read1_c
+# endif
+# if !defined(oc_huff_token_decode)
+#  define oc_huff_token_decode oc_huff_token_decode_c
+# endif
+
 # define OC_PB_WINDOW_SIZE ((int)sizeof(oc_pb_window)*CHAR_BIT)
 /*This is meant to be a large, positive constant that can still be efficiently
    loaded as an immediate (on platforms like ARM, for example).
@@ -34,9 +51,9 @@ typedef struct oc_pack_buf oc_pack_buf;
 
 
 struct oc_pack_buf{
-  oc_pb_window         window;
-  const unsigned char *ptr;
   const unsigned char *stop;
+  const unsigned char *ptr;
+  oc_pb_window         window;
   int                  bits;
   int                  eof;
 };
@@ -45,8 +62,8 @@ void oc_pack_readinit(oc_pack_buf *_b,unsigned char *_buf,long _bytes);
 int oc_pack_look1(oc_pack_buf *_b);
 void oc_pack_adv1(oc_pack_buf *_b);
 /*Here we assume 0<=_bits&&_bits<=32.*/
-long oc_pack_read(oc_pack_buf *_b,int _bits);
-int oc_pack_read1(oc_pack_buf *_b);
+long oc_pack_read_c(oc_pack_buf *_b,int _bits);
+int oc_pack_read1_c(oc_pack_buf *_b);
 /* returns -1 for read beyond EOF, or the number of whole bytes available */
 long oc_pack_bytes_left(oc_pack_buf *_b);
 
diff --git a/thirdparty/libtheora/collect.c b/thirdparty/libtheora/collect.c
new file mode 100644
index 0000000000..c0d8a2733f
--- /dev/null
+++ b/thirdparty/libtheora/collect.c
@@ -0,0 +1,974 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2011                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function: mode selection code
+  last mod: $Id$
+
+ ********************************************************************/
+#include <stdio.h>
+#include <limits.h>
+#include <math.h>
+#include <string.h>
+#include "collect.h"
+
+#if defined(OC_COLLECT_METRICS)
+
+int              OC_HAS_MODE_METRICS;
+double           OC_MODE_RD_WEIGHT_SATD[OC_LOGQ_BINS][3][2][OC_COMP_BINS];
+double           OC_MODE_RD_WEIGHT_SAD[OC_LOGQ_BINS][3][2][OC_COMP_BINS];
+oc_mode_metrics  OC_MODE_METRICS_SATD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS];
+oc_mode_metrics  OC_MODE_METRICS_SAD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS];
+const char      *OC_MODE_METRICS_FILENAME="modedec.stats";
+
+void oc_mode_metrics_add(oc_mode_metrics *_metrics,
+ double _w,int _s,int _q,int _r,double _d){/*Add one sample of weight _w.*/
+  if(_metrics->w>0){/*Prior samples exist: update the central (co-)moments.*/
+    double ds;
+    double dq;
+    double dr;
+    double dd;
+    double ds2;
+    double dq2;
+    double s2;
+    double sq;
+    double q2;
+    double sr;
+    double qr;
+    double sd;
+    double qd;
+    double s2q;
+    double sq2;
+    double w;
+    double wa;
+    double rwa;
+    double rwa2;
+    double rwb;
+    double rwb2;
+    double rw2;
+    double rw3;
+    double rw4;
+    wa=_metrics->w;/*Weight accumulated before this sample.*/
+    ds=_s-_metrics->s/wa;/*Offsets of the new sample from the old means.*/
+    dq=_q-_metrics->q/wa;
+    dr=_r-_metrics->r/wa;
+    dd=_d-_metrics->d/wa;
+    ds2=ds*ds;
+    dq2=dq*dq;
+    s2=_metrics->s2;/*Cache the old moments; the updates below read these.*/
+    sq=_metrics->sq;
+    q2=_metrics->q2;
+    sr=_metrics->sr;
+    qr=_metrics->qr;
+    sd=_metrics->sd;
+    qd=_metrics->qd;
+    s2q=_metrics->s2q;
+    sq2=_metrics->sq2;
+    w=wa+_w;/*Weight once this sample is included.*/
+    rwa=wa/w;/*Fractions of the new total held by old data and the sample.*/
+    rwb=_w/w;
+    rwa2=rwa*rwa;
+    rwb2=rwb*rwb;
+    rw2=wa*rwb;
+    rw3=rw2*(rwa2-rwb2);
+    rw4=_w*rwa2*rwa2+wa*rwb2*rwb2;
+    _metrics->s2q2+=-2*(ds*sq2+dq*s2q)*rwb
+     +(ds2*q2+4*ds*dq*sq+dq2*s2)*rwb2+ds2*dq2*rw4;
+    _metrics->s2q+=(-2*ds*sq-dq*s2)*rwb+ds2*dq*rw3;
+    _metrics->sq2+=(-ds*q2-2*dq*sq)*rwb+ds*dq2*rw3;
+    _metrics->sqr+=(-ds*qr-dq*sr-dr*sq)*rwb+ds*dq*dr*rw3;
+    _metrics->sqd+=(-ds*qd-dq*sd-dd*sq)*rwb+ds*dq*dd*rw3;
+    _metrics->s2+=ds2*rw2;
+    _metrics->sq+=ds*dq*rw2;
+    _metrics->q2+=dq2*rw2;
+    _metrics->sr+=ds*dr*rw2;
+    _metrics->qr+=dq*dr*rw2;
+    _metrics->r2+=dr*dr*rw2;
+    _metrics->sd+=ds*dd*rw2;
+    _metrics->qd+=dq*dd*rw2;
+    _metrics->d2+=dd*dd*rw2;
+  }
+  _metrics->w+=_w;/*The plain weighted sums are updated unconditionally.*/
+  _metrics->s+=_s*_w;
+  _metrics->q+=_q*_w;
+  _metrics->r+=_r*_w;
+  _metrics->d+=_d*_w;
+}
+
+void oc_mode_metrics_merge(oc_mode_metrics *_dst,
+ const oc_mode_metrics *_src,int _n){/*Merge _n sets from _src into _dst.*/
+  int i;
+  /*Find a non-empty set of metrics.*/
+  for(i=0;i<_n&&_src[i].w==0;i++);
+  if(i>=_n){/*Every source set is empty, so the result is all zeros.*/
+    memset(_dst,0,sizeof(*_dst));
+    return;
+  }
+  memcpy(_dst,_src+i,sizeof(*_dst));
+  /*And iterate over the remaining non-empty sets of metrics.*/
+  for(i++;i<_n;i++)if(_src[i].w!=0){
+    double ds;
+    double dq;
+    double dr;
+    double dd;
+    double ds2;
+    double dq2;
+    double s2a;
+    double s2b;
+    double sqa;
+    double sqb;
+    double q2a;
+    double q2b;
+    double sra;
+    double srb;
+    double qra;
+    double qrb;
+    double sda;
+    double sdb;
+    double qda;
+    double qdb;
+    double s2qa;
+    double s2qb;
+    double sq2a;
+    double sq2b;
+    double w;
+    double wa;
+    double wb;
+    double rwa;
+    double rwb;
+    double rwa2;
+    double rwb2;
+    double rw2;
+    double rw3;
+    double rw4;
+    wa=_dst->w;/*"a" is the total merged so far; "b" is the set being added.*/
+    wb=_src[i].w;
+    ds=_src[i].s/wb-_dst->s/wa;/*Differences between the two sets' means.*/
+    dq=_src[i].q/wb-_dst->q/wa;
+    dr=_src[i].r/wb-_dst->r/wa;
+    dd=_src[i].d/wb-_dst->d/wa;
+    ds2=ds*ds;
+    dq2=dq*dq;
+    s2a=_dst->s2;
+    sqa=_dst->sq;
+    q2a=_dst->q2;
+    sra=_dst->sr;
+    qra=_dst->qr;
+    sda=_dst->sd;
+    qda=_dst->qd;
+    s2qa=_dst->s2q;
+    sq2a=_dst->sq2;
+    s2b=_src[i].s2;
+    sqb=_src[i].sq;
+    q2b=_src[i].q2;
+    srb=_src[i].sr;
+    qrb=_src[i].qr;
+    sdb=_src[i].sd;
+    qdb=_src[i].qd;
+    s2qb=_src[i].s2q;
+    sq2b=_src[i].sq2;
+    w=wa+wb;
+    if(w==0)rwa=rwb=0;/*Avoid dividing by a zero combined weight.*/
+    else{
+      rwa=wa/w;
+      rwb=wb/w;
+    }
+    rwa2=rwa*rwa;
+    rwb2=rwb*rwb;
+    rw2=wa*rwb;
+    rw3=rw2*(rwa2-rwb2);
+    rw4=wb*rwa2*rwa2+wa*rwb2*rwb2;
+    /*
+    (1,1,1) ->
+     (0,0,0)#
+     (1,0,0) C(1,1)*C(1,0)*C(1,0)->  d^{1,0,0}*(rwa*B_{0,1,1}-rwb*A_{0,1,1})
+     (0,1,0) C(1,0)*C(1,1)*C(1,0)->  d^{0,1,0}*(rwa*B_{1,0,1}-rwb*A_{1,0,1})
+     (0,0,1) C(1,0)*C(1,0)*C(1,1)->  d^{0,0,1}*(rwa*B_{1,1,0}-rwb*A_{1,1,0})
+     (1,1,0)*
+     (1,0,1)*
+     (0,1,1)*
+     (1,1,1) C(1,1)*C(1,1)*C(1,1)->  d^{1,1,1}*(rwa^3*wb-rwb^3*wa)
+    (2,1) ->
+     (0,0)#
+     (1,0) C(2,1)*C(1,1)->2*d^{1,0}*(rwa*B_{1,1}-rwb*A_{1,1})
+     (0,1) C(2,0)*C(1,1)->  d^{0,1}*(rwa*B_{2,0}-rwb*A_{2,0})
+     (2,0)*
+     (1,1)*
+     (2,1) C(2,2)*C(1,1)->  d^{2,1}*(rwa^3*wb-rwb^3*wa)
+    (2,2) ->
+     (0,0)#
+     (1,0) C(2,1)*C(2,0)->2*d^{1,0}*(rwa*B_{1,2}-rwb*A_{1,2})
+     (0,1) C(2,0)*C(2,1)->2*d^{0,1}*(rwa*B_{2,1}-rwb*A_{2,1})
+     (2,0) C(2,2)*C(2,0)->  d^{2,0}*(rwa^2*B_{0,2}+rwb^2*A_{0,2})
+     (1,1) C(2,1)*C(2,1)->4*d^{1,1}*(rwa^2*B_{1,1}+rwb^2*A_{1,1})
+     (0,2) C(2,0)*C(2,2)->  d^{0,2}*(rwa^2*B_{2,0}+rwb^2*A_{2,0})
+     (1,2)*
+     (2,1)*
+     (2,2) C(2,2)*C(2,2)*d^{2,2}*(rwa^4*wb+rwb^4*wa)
+    */
+    _dst->s2q2+=_src[i].s2q2+2*(ds*(rwa*sq2b-rwb*sq2a)+dq*(rwa*s2qb-rwb*s2qa))
+     +ds2*(rwa2*q2b+rwb2*q2a)+4*ds*dq*(rwa2*sqb+rwb2*sqa)
+     +dq2*(rwa2*s2b+rwb2*s2a)+ds2*dq2*rw4;
+    _dst->s2q+=_src[i].s2q+2*ds*(rwa*sqb-rwb*sqa)
+     +dq*(rwa*s2b-rwb*s2a)+ds2*dq*rw3;
+    _dst->sq2+=_src[i].sq2+ds*(rwa*q2b-rwb*q2a)
+     +2*dq*(rwa*sqb-rwb*sqa)+ds*dq2*rw3;
+    _dst->sqr+=_src[i].sqr+ds*(rwa*qrb-rwb*qra)+dq*(rwa*srb-rwb*sra)
+     +dr*(rwa*sqb-rwb*sqa)+ds*dq*dr*rw3;
+    _dst->sqd+=_src[i].sqd+ds*(rwa*qdb-rwb*qda)+dq*(rwa*sdb-rwb*sda)
+     +dd*(rwa*sqb-rwb*sqa)+ds*dq*dd*rw3;
+    _dst->s2+=_src[i].s2+ds2*rw2;
+    _dst->sq+=_src[i].sq+ds*dq*rw2;
+    _dst->q2+=_src[i].q2+dq2*rw2;
+    _dst->sr+=_src[i].sr+ds*dr*rw2;
+    _dst->qr+=_src[i].qr+dq*dr*rw2;
+    _dst->r2+=_src[i].r2+dr*dr*rw2;
+    _dst->sd+=_src[i].sd+ds*dd*rw2;
+    _dst->qd+=_src[i].qd+dq*dd*rw2;
+    _dst->d2+=_src[i].d2+dd*dd*rw2;
+    _dst->w+=_src[i].w;/*The plain weighted sums simply add.*/
+    _dst->s+=_src[i].s;
+    _dst->q+=_src[i].q;
+    _dst->r+=_src[i].r;
+    _dst->d+=_src[i].d;
+  }
+}
+
+/*Adjust a single corner of a set of metric bins to minimize the squared
+   prediction error of R and D.
+  Each bin is assumed to cover a quad like so:
+    (s0,q0)    (s1,q0)
+       A----------B
+       |          |
+       |          |
+       |          |
+       |          |
+       C----------Z
+    (s0,q1)    (s1,q1)
+  The values A, B, and C are fixed, and Z is the free parameter.
+  Then, for example, R_i is predicted via bilinear interpolation as
+    x_i=(s_i-s0)/(s1-s0)
+    y_i=(q_i-q0)/(q1-q0)
+    dRds1_i=A+(B-A)*x_i
+    dRds2_i=C+(Z-C)*x_i
+    R_i=dRds1_i+(dRds2_i-dRds1_i)*y_i
+  To find the Z that minimizes the squared prediction error over i, this can
+   be rewritten as
+    R_i-(A+(B-A)*x_i+(C-A)*y_i+(A-B-C)*x_i*y_i)=x_i*y_i*Z
+  Letting X={...,x_i*y_i,...}^T and
+   Y={...,R_i-(A+(B-A)*x_i+(C-A)*y_i+(A-B-C)*x_i*y_i),...}^T,
+   the optimal Z is given by Z=(X^T.Y)/(X^T.X).
+  Now, we need to compute these dot products without actually storing data for
+   each sample.
+  Starting with X^T.X, we have
+   X^T.X = sum(x_i^2*y_i^2) = sum((s_i-s0)^2*(q_i-q0)^2)/((s1-s0)^2*(q1-q0)^2).
+  Expanding the interior of the sum in a monomial basis of s_i and q_i gives
+    s0^2*q0^2  *(1)
+     -2*s0*q0^2*(s_i)
+     -2*s0^2*q0*(q_i)
+     +q0^2     *(s_i^2)
+     +4*s0*q0  *(s_i*q_i)
+     +s0^2     *(q_i^2)
+     -2*q0     *(s_i^2*q_i)
+     -2*s0     *(s_i*q_i^2)
+     +1        *(s_i^2*q_i^2).
+  However, computing things directly in this basis leads to gross numerical
+   errors, as most of the terms will have similar size and destructive
+   cancellation results.
+  A much better basis is the central (co-)moment basis:
+    {1,s_i-sbar,q_i-qbar,(s_i-sbar)^2,(s_i-sbar)*(q_i-qbar),(q_i-qbar)^2,
+     (s_i-sbar)^2*(q_i-qbar),(s_i-sbar)*(q_i-qbar)^2,(s_i-sbar)^2*(q_i-qbar)^2},
+   where sbar and qbar are the average s and q values over the bin,
+   respectively.
+  In that basis, letting ds=sbar-s0 and dq=qbar-q0, (s_i-s0)^2*(q_i-q0)^2 is
+    ds^2*dq^2*(1)
+     +dq^2   *((s_i-sbar)^2)
+     +4*ds*dq*((s_i-sbar)*(q_i-qbar))
+     +ds^2   *((q_i-qbar)^2)
+     +2*dq   *((s_i-sbar)^2*(q_i-qbar))
+     +2*ds   *((s_i-sbar)*(q_i-qbar)^2)
+     +1      *((s_i-sbar)^2*(q_i-qbar)^2).
+  With these expressions in the central (co-)moment bases, all we need to do
+   is compute sums over the (co-)moment terms, which can be done
+   incrementally (see oc_mode_metrics_add() and oc_mode_metrics_merge()),
+   with no need to store the individual samples.
+  Now, for X^T.Y, we have
+    X^T.Y = sum((R_i-A-((B-A)/(s1-s0))*(s_i-s0)-((C-A)/(q1-q0))*(q_i-q0)
+     -((A-B-C)/((s1-s0)*(q1-q0)))*(s_i-s0)*(q_i-q0))*(s_i-s0)*(q_i-q0))/
+     ((s1-s0)*(q1-q0)),
+   or, rewriting the constants to simplify notation,
+    X^T.Y = sum((C0+C1*(s_i-s0)+C2*(q_i-q0)
+     +C3*(s_i-s0)*(q_i-q0)+R_i)*(s_i-s0)*(q_i-q0))/((s1-s0)*(q1-q0)).
+  Again, converting to the central (co-)moment basis, the interior of the
+   above sum is
+    ds*dq*(rbar+C0+C1*ds+C2*dq+C3*ds*dq)  *(1)
+     +(C1*dq+C3*dq^2)                     *((s_i-sbar)^2)
+     +(rbar+C0+2*C1*ds+2*C2*dq+4*C3*ds*dq)*((s_i-sbar)*(q_i-qbar))
+     +(C2*ds+C3*ds^2)                     *((q_i-qbar)^2)
+     +dq                                  *((s_i-sbar)*(r_i-rbar))
+     +ds                                  *((q_i-qbar)*(r_i-rbar))
+     +(C1+2*C3*dq)                        *((s_i-sbar)^2*(q_i-qbar))
+     +(C2+2*C3*ds)                        *((s_i-sbar)*(q_i-qbar)^2)
+     +1                                   *((s_i-sbar)*(q_i-qbar)*(r_i-rbar))
+     +C3                                  *((s_i-sbar)^2*(q_i-qbar)^2).
+  You might think it would be easier (if perhaps slightly less robust) to
+   accumulate terms directly around s0 and q0.
+  However, we update each corner of the bins in turn, so we would have to
+   change basis to move the sums from corner to corner anyway.*/
+double oc_mode_metrics_solve(double *_r,double *_d,
+ const oc_mode_metrics *_metrics,const int *_s0,const int *_s1,
+ const int *_q0,const int *_q1,
+ const double *_ra,const double *_rb,const double *_rc,
+ const double *_da,const double *_db,const double *_dc,int _n){
+  double xx;
+  double rxy;
+  double dxy;
+  double wt;
+  int i;
+  xx=rxy=dxy=wt=0;
+  for(i=0;i<_n;i++)if(_metrics[i].w>0){/*Empty bins contribute nothing.*/
+    double s10;
+    double q10;
+    double sq10;
+    double ds;
+    double dq;
+    double ds2;
+    double dq2;
+    double r;
+    double d;
+    double s2;
+    double sq;
+    double q2;
+    double sr;
+    double qr;
+    double sd;
+    double qd;
+    double s2q;
+    double sq2;
+    double sqr;
+    double sqd;
+    double s2q2;
+    double c0;
+    double c1;
+    double c2;
+    double c3;
+    double w;
+    w=_metrics[i].w;
+    wt+=w;
+    s10=_s1[i]-_s0[i];/*Bin extents along s and q, and their product.*/
+    q10=_q1[i]-_q0[i];
+    sq10=s10*q10;
+    ds=_metrics[i].s/w-_s0[i];/*Bin means measured from the (s0,q0) corner.*/
+    dq=_metrics[i].q/w-_q0[i];
+    ds2=ds*ds;
+    dq2=dq*dq;
+    s2=_metrics[i].s2;
+    sq=_metrics[i].sq;
+    q2=_metrics[i].q2;
+    s2q=_metrics[i].s2q;
+    sq2=_metrics[i].sq2;
+    s2q2=_metrics[i].s2q2;
+    xx+=(dq2*(ds2*w+s2)+4*ds*dq*sq+ds2*q2+2*(dq*s2q+ds*sq2)+s2q2)/(sq10*sq10);
+    r=_metrics[i].r/w;
+    sr=_metrics[i].sr;
+    qr=_metrics[i].qr;
+    sqr=_metrics[i].sqr;
+    c0=-_ra[i];/*C0..C3 from the derivation above, for the rate fit.*/
+    c1=-(_rb[i]-_ra[i])/s10;
+    c2=-(_rc[i]-_ra[i])/q10;
+    c3=-(_ra[i]-_rb[i]-_rc[i])/sq10;
+    rxy+=(ds*dq*(r+c0+c1*ds+c2*dq+c3*ds*dq)*w+(c1*dq+c3*dq2)*s2
+     +(r+c0+2*(c1*ds+(c2+2*c3*ds)*dq))*sq+(c2*ds+c3*ds2)*q2+dq*sr+ds*qr
+     +(c1+2*c3*dq)*s2q+(c2+2*c3*ds)*sq2+sqr+c3*s2q2)/sq10;
+    d=_metrics[i].d/w;
+    sd=_metrics[i].sd;
+    qd=_metrics[i].qd;
+    sqd=_metrics[i].sqd;
+    c0=-_da[i];/*The same constants, recomputed for the distortion fit.*/
+    c1=-(_db[i]-_da[i])/s10;
+    c2=-(_dc[i]-_da[i])/q10;
+    c3=-(_da[i]-_db[i]-_dc[i])/sq10;
+    dxy+=(ds*dq*(d+c0+c1*ds+c2*dq+c3*ds*dq)*w+(c1*dq+c3*dq2)*s2
+     +(d+c0+2*(c1*ds+(c2+2*c3*ds)*dq))*sq+(c2*ds+c3*ds2)*q2+dq*sd+ds*qd
+     +(c1+2*c3*dq)*s2q+(c2+2*c3*ds)*sq2+sqd+c3*s2q2)/sq10;
+  }
+  if(xx>1E-3){/*Only divide when X^T.X is not negligibly small.*/
+    *_r=rxy/xx;
+    *_d=dxy/xx;
+  }
+  else{
+    *_r=0;
+    *_d=0;
+  }
+  return wt;/*Total weight of the non-empty bins visited.*/
+}
+
+/*Compile collected SATD/logq/rate/RMSE metrics into a form that's immediately
+   useful for mode decision.*/
+void oc_mode_metrics_update(oc_mode_metrics (*_metrics)[3][2][OC_COMP_BINS],
+ int _niters_min,int _reweight,oc_mode_rd (*_table)[3][2][OC_COMP_BINS],
+ int _shift,double (*_weight)[3][2][OC_COMP_BINS]){
+  int niters;
+  int prevdr;
+  int prevdd;
+  int dr;
+  int dd;
+  int pli;
+  int qti;
+  int qi;
+  int si;
+  dd=dr=INT_MAX;/*So the first "still decreasing" comparison can succeed.*/
+  niters=0;
+  /*The encoder interpolates rate and RMSE terms bilinearly from an
+     OC_LOGQ_BINS by OC_COMP_BINS grid of sample points in _table.
+    To find the sample values at the grid points that minimize the total
+     squared prediction error actually requires solving a relatively sparse
+     linear system with a number of variables equal to the number of grid
+     points.
+    Instead of writing a general sparse linear system solver, we just use
+     Gauss-Seidel iteration, i.e., we update one grid point at time until
+     they stop changing.*/
+  do{
+    prevdr=dr;
+    prevdd=dd;
+    dd=dr=0;
+    for(pli=0;pli<3;pli++){
+      for(qti=0;qti<2;qti++){
+        for(qi=0;qi<OC_LOGQ_BINS;qi++){
+          for(si=0;si<OC_COMP_BINS;si++){
+            oc_mode_metrics m[4];
+            int             s0[4];
+            int             s1[4];
+            int             q0[4];
+            int             q1[4];
+            double          ra[4];
+            double          rb[4];
+            double          rc[4];
+            double          da[4];
+            double          db[4];
+            double          dc[4];
+            double          r;
+            double          d;
+            int             rate;
+            int             rmse;
+            int             ds;
+            int             n;
+            n=0;
+            /*Collect the statistics for the (up to) four bins grid point
+               (si,qi) touches.*/
+            if(qi>0&&si>0){/*Metrics bin (qi-1,si-1).*/
+              q0[n]=OC_MODE_LOGQ[qi-1][pli][qti];
+              q1[n]=OC_MODE_LOGQ[qi][pli][qti];
+              s0[n]=si-1<<_shift;
+              s1[n]=si<<_shift;
+              ra[n]=ldexp(_table[qi-1][pli][qti][si-1].rate,-OC_BIT_SCALE);
+              da[n]=ldexp(_table[qi-1][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
+              rb[n]=ldexp(_table[qi-1][pli][qti][si].rate,-OC_BIT_SCALE);
+              db[n]=ldexp(_table[qi-1][pli][qti][si].rmse,-OC_RMSE_SCALE);
+              rc[n]=ldexp(_table[qi][pli][qti][si-1].rate,-OC_BIT_SCALE);
+              dc[n]=ldexp(_table[qi][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
+              *(m+n++)=*(_metrics[qi-1][pli][qti]+si-1);
+            }
+            if(qi>0){/*Metrics bin (qi-1,si).*/
+              ds=si+1<OC_COMP_BINS?1:-1;/*At the last si, look back instead.*/
+              q0[n]=OC_MODE_LOGQ[qi-1][pli][qti];
+              q1[n]=OC_MODE_LOGQ[qi][pli][qti];
+              s0[n]=si+ds<<_shift;
+              s1[n]=si<<_shift;
+              ra[n]=ldexp(_table[qi-1][pli][qti][si+ds].rate,-OC_BIT_SCALE);
+              da[n]=
+               ldexp(_table[qi-1][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
+              rb[n]=ldexp(_table[qi-1][pli][qti][si].rate,-OC_BIT_SCALE);
+              db[n]=ldexp(_table[qi-1][pli][qti][si].rmse,-OC_RMSE_SCALE);
+              rc[n]=ldexp(_table[qi][pli][qti][si+ds].rate,-OC_BIT_SCALE);
+              dc[n]=ldexp(_table[qi][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
+              *(m+n++)=*(_metrics[qi-1][pli][qti]+si);
+            }
+            if(qi+1<OC_LOGQ_BINS&&si>0){/*Metrics bin (qi,si-1).*/
+              q0[n]=OC_MODE_LOGQ[qi+1][pli][qti];
+              q1[n]=OC_MODE_LOGQ[qi][pli][qti];
+              s0[n]=si-1<<_shift;
+              s1[n]=si<<_shift;
+              ra[n]=ldexp(_table[qi+1][pli][qti][si-1].rate,-OC_BIT_SCALE);
+              da[n]=ldexp(_table[qi+1][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
+              rb[n]=ldexp(_table[qi+1][pli][qti][si].rate,-OC_BIT_SCALE);
+              db[n]=ldexp(_table[qi+1][pli][qti][si].rmse,-OC_RMSE_SCALE);
+              rc[n]=ldexp(_table[qi][pli][qti][si-1].rate,-OC_BIT_SCALE);
+              dc[n]=ldexp(_table[qi][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
+              *(m+n++)=*(_metrics[qi][pli][qti]+si-1);
+            }
+            if(qi+1<OC_LOGQ_BINS){/*Metrics bin (qi,si).*/
+              ds=si+1<OC_COMP_BINS?1:-1;/*At the last si, look back instead.*/
+              q0[n]=OC_MODE_LOGQ[qi+1][pli][qti];
+              q1[n]=OC_MODE_LOGQ[qi][pli][qti];
+              s0[n]=si+ds<<_shift;
+              s1[n]=si<<_shift;
+              ra[n]=ldexp(_table[qi+1][pli][qti][si+ds].rate,-OC_BIT_SCALE);
+              da[n]=
+               ldexp(_table[qi+1][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
+              rb[n]=ldexp(_table[qi+1][pli][qti][si].rate,-OC_BIT_SCALE);
+              db[n]=ldexp(_table[qi+1][pli][qti][si].rmse,-OC_RMSE_SCALE);
+              rc[n]=ldexp(_table[qi][pli][qti][si+ds].rate,-OC_BIT_SCALE);
+              dc[n]=ldexp(_table[qi][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
+              *(m+n++)=*(_metrics[qi][pli][qti]+si);
+            }
+            /*On the first pass, initialize with a simple weighted average of
+               the neighboring bins.*/
+            if(!OC_HAS_MODE_METRICS&&niters==0){
+              double w;
+              w=r=d=0;
+              while(n-->0){
+                w+=m[n].w;
+                r+=m[n].r;
+                d+=m[n].d;
+              }
+              r=w>1E-3?r/w:0;/*Negligible total weight yields zero.*/
+              d=w>1E-3?d/w:0;
+              _weight[qi][pli][qti][si]=w;
+            }
+            else{
+              /*Update the grid point and save the weight for later.*/
+              _weight[qi][pli][qti][si]=
+               oc_mode_metrics_solve(&r,&d,m,s0,s1,q0,q1,ra,rb,rc,da,db,dc,n);
+            }
+            rate=OC_CLAMPI(-32768,(int)(ldexp(r,OC_BIT_SCALE)+0.5),32767);
+            rmse=OC_CLAMPI(-32768,(int)(ldexp(d,OC_RMSE_SCALE)+0.5),32767);
+            dr+=abs(rate-_table[qi][pli][qti][si].rate);/*Total change made.*/
+            dd+=abs(rmse-_table[qi][pli][qti][si].rmse);
+            _table[qi][pli][qti][si].rate=(ogg_int16_t)rate;
+            _table[qi][pli][qti][si].rmse=(ogg_int16_t)rmse;
+          }
+        }
+      }
+    }
+  }
+  /*After a fixed number of initial iterations, only iterate so long as the
+     total change is decreasing.
+    This ensures we don't oscillate forever, which is a danger, as all of our
+     results are rounded fairly coarsely.*/
+  while((dr>0||dd>0)&&(niters++<_niters_min||(dr<prevdr&&dd<prevdd)));
+  if(_reweight){
+    /*Now, reduce the values of the optimal solution until we get enough
+       samples in each bin to overcome the constant OC_ZWEIGHT factor.
+      This encourages sampling under-populated bins and prevents a single large
+       sample early on from discouraging coding in that bin ever again.*/
+    for(pli=0;pli<3;pli++){
+      for(qti=0;qti<2;qti++){
+        for(qi=0;qi<OC_LOGQ_BINS;qi++){
+          for(si=0;si<OC_COMP_BINS;si++){
+            double wt;
+            wt=_weight[qi][pli][qti][si];
+            wt/=OC_ZWEIGHT+wt;/*Scale factor is wt/(OC_ZWEIGHT+wt).*/
+            _table[qi][pli][qti][si].rate=(ogg_int16_t)
+             (_table[qi][pli][qti][si].rate*wt+0.5);
+            _table[qi][pli][qti][si].rmse=(ogg_int16_t)
+             (_table[qi][pli][qti][si].rmse*wt+0.5);
+          }
+        }
+      }
+    }
+  }
+}
+
+/*Writes the in-memory mode metrics out to OC_MODE_METRICS_FILENAME.
+  The raw structures are stored as-is, so the file is not portable.*/
+void oc_mode_metrics_dump(void){
+  FILE *fout;
+  fout=fopen(OC_MODE_METRICS_FILENAME,"wb");
+  /*Failing to open the file silently skips the dump.*/
+  if(fout==NULL)return;
+  (void)fwrite(OC_MODE_LOGQ,sizeof(OC_MODE_LOGQ),1,fout);
+  (void)fwrite(OC_MODE_METRICS_SATD,sizeof(OC_MODE_METRICS_SATD),1,fout);
+  (void)fwrite(OC_MODE_METRICS_SAD,sizeof(OC_MODE_METRICS_SAD),1,fout);
+  fclose(fout);
+}
+
+void oc_mode_metrics_print_rd(FILE *_fout,const char *_table_name,
+#if !defined(OC_COLLECT_METRICS)/*NOTE(review): seems dead under the file guard.*/
+ const oc_mode_rd (*_mode_rd_table)[3][2][OC_COMP_BINS]){
+#else
+ oc_mode_rd (*_mode_rd_table)[3][2][OC_COMP_BINS]){/*Print it as a C array.*/
+#endif
+  int qii;
+  fprintf(_fout,
+   "# if !defined(OC_COLLECT_METRICS)\n"
+   "static const\n"
+   "# endif\n"
+   "oc_mode_rd %s[OC_LOGQ_BINS][3][2][OC_COMP_BINS]={\n",_table_name);
+  for(qii=0;qii<OC_LOGQ_BINS;qii++){/*One brace group per log-quantizer bin.*/
+    int pli;
+    fprintf(_fout,"  {\n");
+    for(pli=0;pli<3;pli++){
+      int qti;
+      fprintf(_fout,"    {\n");
+      for(qti=0;qti<2;qti++){
+        int bin;
+        int qi;
+        static const char *pl_names[3]={"Y'","Cb","Cr"};
+        static const char *qti_names[2]={"INTRA","INTER"};
+        /*qi is only used to label the rows in the emitted comment.*/
+        qi=(63*qii+(OC_LOGQ_BINS-1>>1))/(OC_LOGQ_BINS-1);
+        fprintf(_fout,"      /*%s  qi=%i  %s*/\n",
+         pl_names[pli],qi,qti_names[qti]);
+        fprintf(_fout,"      {\n");
+        fprintf(_fout,"        ");
+        for(bin=0;bin<OC_COMP_BINS;bin++){
+          if(bin&&!(bin&0x3))fprintf(_fout,"\n        ");/*Four per line.*/
+          fprintf(_fout,"{%5i,%5i}",
+           _mode_rd_table[qii][pli][qti][bin].rate,
+           _mode_rd_table[qii][pli][qti][bin].rmse);
+          if(bin+1<OC_COMP_BINS)fprintf(_fout,",");
+        }
+        fprintf(_fout,"\n      }");
+        if(qti<1)fprintf(_fout,",");
+        fprintf(_fout,"\n");
+      }
+      fprintf(_fout,"    }");
+      if(pli<2)fprintf(_fout,",");
+      fprintf(_fout,"\n");
+    }
+    fprintf(_fout,"  }");
+    if(qii+1<OC_LOGQ_BINS)fprintf(_fout,",");
+    fprintf(_fout,"\n");
+  }
+  fprintf(_fout,
+   "};\n"
+   "\n");
+}
+
+void oc_mode_metrics_print(FILE *_fout){/*Emit the tables as a C header.*/
+  int qii;
+  fprintf(_fout,
+   "/*File generated by libtheora with OC_COLLECT_METRICS"
+   " defined at compile time.*/\n"
+   "#if !defined(_modedec_H)\n"
+   "# define _modedec_H (1)\n"
+   "# include \"encint.h\"\n"
+   "\n"
+   "\n"
+   "\n"
+   "/*The log of the average quantizer for each of the OC_MODE_RD table rows\n"
+   "   (e.g., for the represented qi's, and each pli and qti), in Q10 format.\n"
+   "  The actual statistics used by the encoder will be interpolated from\n"
+   "   that table based on log_plq for the actual quantization matrix used.*/\n"
+   "# if !defined(OC_COLLECT_METRICS)\n"
+   "static const\n"
+   "# endif\n"
+   "ogg_int16_t OC_MODE_LOGQ[OC_LOGQ_BINS][3][2]={\n");
+  for(qii=0;qii<OC_LOGQ_BINS;qii++){/*One initializer row per bin.*/
+    fprintf(_fout,"  { {0x%04X,0x%04X},{0x%04X,0x%04X},{0x%04X,0x%04X} }%s\n",
+     OC_MODE_LOGQ[qii][0][0],OC_MODE_LOGQ[qii][0][1],OC_MODE_LOGQ[qii][1][0],
+     OC_MODE_LOGQ[qii][1][1],OC_MODE_LOGQ[qii][2][0],OC_MODE_LOGQ[qii][2][1],
+     qii+1<OC_LOGQ_BINS?",":"");
+  }
+  fprintf(_fout,
+   "};\n"
+   "\n");
+  /*The SATD and SAD rate/RMSE tables follow, then the include guard closes.*/
+  oc_mode_metrics_print_rd(_fout,"OC_MODE_RD_SATD",OC_MODE_RD_SATD);
+  oc_mode_metrics_print_rd(_fout,"OC_MODE_RD_SAD",OC_MODE_RD_SAD);
+  fprintf(_fout,
+   "#endif\n");
+}
+
+
+# if !defined(OC_COLLECT_NO_ENC_FUNCS)
+void oc_enc_mode_metrics_load(oc_enc_ctx *_enc){/*One-time global setup.*/
+  oc_restore_fpu(&_enc->state);
+  /*Load any existing mode metrics if we haven't already.*/
+  if(!OC_HAS_MODE_METRICS){
+    FILE *fmetrics;
+    memset(OC_MODE_METRICS_SATD,0,sizeof(OC_MODE_METRICS_SATD));
+    memset(OC_MODE_METRICS_SAD,0,sizeof(OC_MODE_METRICS_SAD));
+    fmetrics=fopen(OC_MODE_METRICS_FILENAME,"rb");
+    if(fmetrics!=NULL){
+      /*Read in the binary structures as written by oc_mode_metrics_dump().
+        Note this format isn't portable between different platforms.*/
+      (void)fread(OC_MODE_LOGQ,sizeof(OC_MODE_LOGQ),1,fmetrics);
+      (void)fread(OC_MODE_METRICS_SATD,sizeof(OC_MODE_METRICS_SATD),1,fmetrics);
+      (void)fread(OC_MODE_METRICS_SAD,sizeof(OC_MODE_METRICS_SAD),1,fmetrics);
+      fclose(fmetrics);/*NOTE(review): short reads above go undetected.*/
+    }
+    else{/*No stats file: seed OC_MODE_LOGQ from this encoder's log_plq.*/
+      int qii;
+      int qi;
+      int pli;
+      int qti;
+      for(qii=0;qii<OC_LOGQ_BINS;qii++){
+        qi=(63*qii+(OC_LOGQ_BINS-1>>1))/(OC_LOGQ_BINS-1);
+        for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
+          OC_MODE_LOGQ[qii][pli][qti]=_enc->log_plq[qi][pli][qti];
+        }
+      }
+    }
+    /*Build both R-D tables from whatever metrics are now in memory.*/
+    oc_mode_metrics_update(OC_MODE_METRICS_SATD,100,1,
+     OC_MODE_RD_SATD,OC_SATD_SHIFT,OC_MODE_RD_WEIGHT_SATD);
+    oc_mode_metrics_update(OC_MODE_METRICS_SAD,100,1,
+     OC_MODE_RD_SAD,OC_SAD_SHIFT,OC_MODE_RD_WEIGHT_SAD);
+    OC_HAS_MODE_METRICS=1;
+  }
+}
+
+/*The following token skipping code used to also be used in the decoder (and
+   even at one point other places in the encoder).
+  However, it was obsoleted by other optimizations, and is now only used here.
+  It has been moved here to avoid generating the code when it's not needed.*/
+
+/*Determines the number of blocks or coefficients to be skipped for a given
+   token value.
+  _token:      The token value to skip.
+  _extra_bits: The extra bits attached to this token.
+  Return: A positive value indicates that number of coefficients are to be
+           skipped in the current block.
+          Otherwise, the negative of the return value indicates that number of
+           blocks are to be ended.*/
+typedef ptrdiff_t (*oc_token_skip_func)(int _token,int _extra_bits);
+
+/*Handles the simple end of block tokens.*/
+static ptrdiff_t oc_token_skip_eob(int _token,int _extra_bits){
+  int nblocks_adjust;
+  nblocks_adjust=OC_UNIBBLE_TABLE32(0,1,2,3,7,15,0,0,_token)+1;
+  return -_extra_bits-nblocks_adjust;
+}
+
+/*The last EOB token has a special case, where an EOB run of size zero ends all
+   the remaining blocks in the frame.*/
+static ptrdiff_t oc_token_skip_eob6(int _token,int _extra_bits){
+  /*Note: We want to return -PTRDIFF_MAX, but that requires C99, which is not
+     yet available everywhere; this should be equivalent.*/
+  if(!_extra_bits)return -(~(size_t)0>>1);
+  return -_extra_bits;
+}
+
+/*Handles the pure zero run tokens.*/
+static ptrdiff_t oc_token_skip_zrl(int _token,int _extra_bits){
+  return _extra_bits+1;
+}
+
+/*Handles a normal coefficient value token (both arguments are ignored).*/
+static ptrdiff_t oc_token_skip_val(int _token,int _extra_bits){
+  return 1;
+}
+
+/*Handles a category 1A zero run/value combo token; _extra_bits is unused.*/
+static ptrdiff_t oc_token_skip_run_cat1a(int _token,int _extra_bits){
+  return _token-OC_DCT_RUN_CAT1A+2;
+}
+
+/*Handles category 1b, 1c, 2a, and 2b zero run/coefficient value combo tokens.*/
+static ptrdiff_t oc_token_skip_run(int _token,int _extra_bits){
+  int run_cati;
+  int ncoeffs_mask;
+  int ncoeffs_adjust;
+  run_cati=_token-OC_DCT_RUN_CAT1B;
+  ncoeffs_mask=OC_BYTE_TABLE32(3,7,0,1,run_cati);
+  ncoeffs_adjust=OC_BYTE_TABLE32(7,11,2,3,run_cati);
+  return (_extra_bits&ncoeffs_mask)+ncoeffs_adjust;
+}
+
+/*A jump table for computing the number of coefficients or blocks to skip for
+   a given token value, replacing the conditional branches otherwise needed
+   with one indirect jump.
+  Every handler matches oc_token_skip_func exactly, so no casts are needed.*/
+static const oc_token_skip_func OC_TOKEN_SKIP_TABLE[TH_NDCT_TOKENS]={
+  oc_token_skip_eob,
+  oc_token_skip_eob,
+  oc_token_skip_eob,
+  oc_token_skip_eob,
+  oc_token_skip_eob,
+  oc_token_skip_eob,
+  oc_token_skip_eob6,
+  oc_token_skip_zrl,
+  oc_token_skip_zrl,
+  oc_token_skip_val,
+  oc_token_skip_val,
+  oc_token_skip_val,
+  oc_token_skip_val,
+  oc_token_skip_val,
+  oc_token_skip_val,
+  oc_token_skip_val,
+  oc_token_skip_val,
+  oc_token_skip_val,
+  oc_token_skip_val,
+  oc_token_skip_val,
+  oc_token_skip_val,
+  oc_token_skip_val,
+  oc_token_skip_val,
+  oc_token_skip_run_cat1a,
+  oc_token_skip_run_cat1a,
+  oc_token_skip_run_cat1a,
+  oc_token_skip_run_cat1a,
+  oc_token_skip_run_cat1a,
+  oc_token_skip_run,
+  oc_token_skip_run,
+  oc_token_skip_run,
+  oc_token_skip_run
+};
+
+/*Looks up how far a token advances decoding.
+  _token:      The token value to skip.
+  _extra_bits: The extra bits attached to this token.
+  Return: If positive, the number of coefficients to skip in the current
+           block.
+          Otherwise, the negated count of blocks that are ended by this
+           token.
+          The result is never 0, so every token accounts for at least one
+           coefficient in one block.
+  The work is delegated to the per-token handler in OC_TOKEN_SKIP_TABLE.*/
+static ptrdiff_t oc_dct_token_skip(int _token,int _extra_bits){
+  return OC_TOKEN_SKIP_TABLE[_token](_token,_extra_bits);
+}
+
+
+void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc){/*Add one frame's stats.*/
+  static const unsigned char OC_ZZI_HUFF_OFFSET[64]={
+     0,16,16,16,16,16,32,32,
+    32,32,32,32,32,32,32,48,
+    48,48,48,48,48,48,48,48,
+    48,48,48,48,64,64,64,64,
+    64,64,64,64,64,64,64,64,
+    64,64,64,64,64,64,64,64,
+    64,64,64,64,64,64,64,64
+  };
+  const oc_fragment *frags;
+  const unsigned    *frag_sad;
+  const unsigned    *frag_satd;
+  const unsigned    *frag_ssd;
+  const ptrdiff_t   *coded_fragis;
+  ptrdiff_t          ncoded_fragis;
+  ptrdiff_t          fragii;
+  double             fragw;
+  int                modelines[3][3][2];
+  int                qti;
+  int                qii;
+  int                qi;
+  int                pli;
+  int                zzi;
+  int                token;
+  int                eb;
+  oc_restore_fpu(&_enc->state);
+  /*Figure out which metric bins to use for this frame's quantizers.*/
+  for(qii=0;qii<_enc->state.nqis;qii++){
+    for(pli=0;pli<3;pli++){
+      for(qti=0;qti<2;qti++){
+        int log_plq;
+        int modeline;
+        log_plq=_enc->log_plq[_enc->state.qis[qii]][pli][qti];
+        for(modeline=0;modeline<OC_LOGQ_BINS-1&&
+         OC_MODE_LOGQ[modeline+1][pli][qti]>log_plq;modeline++);
+        modelines[qii][pli][qti]=modeline;
+      }
+    }
+  }
+  qti=_enc->state.frame_type;
+  frags=_enc->state.frags;
+  frag_sad=_enc->frag_sad;
+  frag_satd=_enc->frag_satd;
+  frag_ssd=_enc->frag_ssd;
+  coded_fragis=_enc->state.coded_fragis;
+  ncoded_fragis=fragii=0;
+  /*Weight the fragments by the inverse frame size; this prevents HD content
+     from dominating the statistics.*/
+  fragw=1.0/_enc->state.nfrags;
+  for(pli=0;pli<3;pli++){
+    ptrdiff_t ti[64];
+    int       eob_token[64];
+    int       eob_run[64];
+    /*Set up token indices and eob run counts.
+      We don't bother trying to figure out the real cost of the runs that span
+       coefficients; instead we use the costs that were available when R-D
+       token optimization was done.*/
+    for(zzi=0;zzi<64;zzi++){
+      ti[zzi]=_enc->dct_token_offs[pli][zzi];
+      if(ti[zzi]>0){
+        token=_enc->dct_tokens[pli][zzi][0];
+        eb=_enc->extra_bits[pli][zzi][0];
+        eob_token[zzi]=token;
+        eob_run[zzi]=-oc_dct_token_skip(token,eb);
+      }
+      else{
+        eob_token[zzi]=OC_NDCT_EOB_TOKEN_MAX;
+        eob_run[zzi]=0;
+      }
+    }
+    /*Scan the list of coded fragments for this plane.*/
+    ncoded_fragis+=_enc->state.ncoded_fragis[pli];
+    for(;fragii<ncoded_fragis;fragii++){
+      ptrdiff_t fragi;
+      int       frag_bits;
+      int       huffi;
+      int       skip;
+      int       mb_mode;
+      unsigned  sad;
+      unsigned  satd;
+      double    sqrt_ssd;
+      int       bin;
+      int       qtj;
+      fragi=coded_fragis[fragii];
+      frag_bits=0;
+      for(zzi=0;zzi<64;){
+        if(eob_run[zzi]>0){
+          /*We've reached the end of the block.*/
+          eob_run[zzi]--;
+          break;
+        }
+        huffi=_enc->huff_idxs[qti][zzi>0][pli+1>>1]
+         +OC_ZZI_HUFF_OFFSET[zzi];
+        if(eob_token[zzi]<OC_NDCT_EOB_TOKEN_MAX){
+          /*This token caused an EOB run to be flushed.
+            Therefore it gets the bits associated with it.*/
+          frag_bits+=_enc->huff_codes[huffi][eob_token[zzi]].nbits
+           +OC_DCT_TOKEN_EXTRA_BITS[eob_token[zzi]];
+          eob_token[zzi]=OC_NDCT_EOB_TOKEN_MAX;
+        }
+        token=_enc->dct_tokens[pli][zzi][ti[zzi]];
+        eb=_enc->extra_bits[pli][zzi][ti[zzi]];
+        ti[zzi]++;
+        /*NOTE(review): the ptrdiff_t result is narrowed to int here; confirm
+           the oc_token_skip_eob6() sentinel value can never reach this.*/
+        skip=oc_dct_token_skip(token,eb);
+        if(skip<0){/*A negative result starts a new EOB run.*/
+          eob_token[zzi]=token;
+          eob_run[zzi]=-skip;
+        }
+        else{
+          /*A regular DCT value token; accumulate the bits for it.*/
+          frag_bits+=_enc->huff_codes[huffi][token].nbits
+           +OC_DCT_TOKEN_EXTRA_BITS[token];
+          zzi+=skip;
+        }
+      }
+      mb_mode=frags[fragi].mb_mode;
+      qii=frags[fragi].qii;
+      qi=_enc->state.qis[qii];
+      sad=frag_sad[fragi]<<(pli+1&2);/*Shift by 0 for pli 0, by 2 otherwise.*/
+      satd=frag_satd[fragi]<<(pli+1&2);
+      sqrt_ssd=sqrt(frag_ssd[fragi]);
+      qtj=mb_mode!=OC_MODE_INTRA;/*0 for OC_MODE_INTRA, 1 for anything else.*/
+      /*Accumulate statistics.
+        The rate (frag_bits) and RMSE (sqrt(frag_ssd)) are not scaled by
+         OC_BIT_SCALE and OC_RMSE_SCALE; this lets us change the scale factor
+         yet still use old data.*/
+      bin=OC_MINI(satd>>OC_SATD_SHIFT,OC_COMP_BINS-1);
+      oc_mode_metrics_add(
+       OC_MODE_METRICS_SATD[modelines[qii][pli][qtj]][pli][qtj]+bin,
+       fragw,satd,_enc->log_plq[qi][pli][qtj],frag_bits,sqrt_ssd);
+      bin=OC_MINI(sad>>OC_SAD_SHIFT,OC_COMP_BINS-1);
+      oc_mode_metrics_add(
+       OC_MODE_METRICS_SAD[modelines[qii][pli][qtj]][pli][qtj]+bin,
+       fragw,sad,_enc->log_plq[qi][pli][qtj],frag_bits,sqrt_ssd);
+    }
+  }
+  /*Update global SA(T)D/logq/rate/RMSE estimation matrix.*/
+  oc_mode_metrics_update(OC_MODE_METRICS_SATD,4,1,
+   OC_MODE_RD_SATD,OC_SATD_SHIFT,OC_MODE_RD_WEIGHT_SATD);
+  oc_mode_metrics_update(OC_MODE_METRICS_SAD,4,1,
+   OC_MODE_RD_SAD,OC_SAD_SHIFT,OC_MODE_RD_WEIGHT_SAD);
+}
+# endif
+
+#endif
diff --git a/thirdparty/libtheora/collect.h b/thirdparty/libtheora/collect.h
new file mode 100644
index 0000000000..9458b84e3f
--- /dev/null
+++ b/thirdparty/libtheora/collect.h
@@ -0,0 +1,109 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function: mode selection code
+  last mod: $Id$
+
+ ********************************************************************/
+#if !defined(_collect_H)
+# define _collect_H (1)
+# include "encint.h"
+# if defined(OC_COLLECT_METRICS)
+#  include <stdio.h>
+
+
+
+typedef struct oc_mode_metrics oc_mode_metrics;
+
+
+
+/**Sets the file name to load/store mode metrics from/to.
+ * The file name string is stored by reference, and so must be valid for the
+ *  lifetime of the encoder.
+ * Mode metric collection uses global tables; do not attempt to perform
+ *  multiple collections at once.
+ * \param[in] _buf <tt>char[]</tt> The file name.
+ * \retval TH_EIMPL   Not supported by this implementation.*/
+#define TH_ENCCTL_SET_METRICS_FILE (0x8000)
+
+
+
+/*Accumulates various weighted sums of the measurements.
+  w -> weight
+  s -> SATD
+  q -> log quantizer
+  r -> rate (in bits)
+  d -> RMSE
+  All of the single letters correspond to direct, weighted sums, e.g.,
+   w=sum(w_i), s=sum(s_i*w_i), etc.
+  The others correspond to central moments (or co-moments) of the given order,
+   e.g., sq=sum((s_i-s/w)*(q_i-q/w)*w_i).
+  Because we need some moments up to fourth order, we use central moments to
+   minimize the dynamic range and prevent rounding error from dominating the
+   calculations.*/
+struct oc_mode_metrics{
+  double w;
+  double s;
+  double q;
+  double r;
+  double d;
+  double s2;
+  double sq;
+  double q2;
+  double sr;
+  double qr;
+  double r2;
+  double sd;
+  double qd;
+  double d2;
+  double s2q;
+  double sq2;
+  double sqr;
+  double sqd;
+  double s2q2;
+};
+
+
+# define OC_ZWEIGHT   (0.25)
+
+/*TODO: It may be helpful (for block-level quantizers especially) to separate
+   out the contributions from AC and DC into separate tables.*/
+
+extern ogg_int16_t OC_MODE_LOGQ[OC_LOGQ_BINS][3][2];
+extern oc_mode_rd  OC_MODE_RD_SATD[OC_LOGQ_BINS][3][2][OC_COMP_BINS];
+extern oc_mode_rd  OC_MODE_RD_SAD[OC_LOGQ_BINS][3][2][OC_COMP_BINS];
+
+extern int              OC_HAS_MODE_METRICS;
+extern oc_mode_metrics  OC_MODE_METRICS_SATD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS];
+extern oc_mode_metrics  OC_MODE_METRICS_SAD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS];
+extern const char      *OC_MODE_METRICS_FILENAME;
+
+void oc_mode_metrics_dump();
+void oc_mode_metrics_print(FILE *_fout);
+
+void oc_mode_metrics_add(oc_mode_metrics *_metrics,
+ double _w,int _s,int _q,int _r,double _d);
+void oc_mode_metrics_merge(oc_mode_metrics *_dst,
+ const oc_mode_metrics *_src,int _n);
+double oc_mode_metrics_solve(double *_r,double *_d,
+ const oc_mode_metrics *_metrics,const int *_s0,const int *_s1,
+ const int *_q0,const int *_q1,
+ const double *_ra,const double *_rb,const double *_rc,
+ const double *_da,const double *_db,const double *_dc,int _n);
+void oc_mode_metrics_update(oc_mode_metrics (*_metrics)[3][2][OC_COMP_BINS],
+ int _niters_min,int _reweight,oc_mode_rd (*_table)[3][2][OC_COMP_BINS],
+ int shift,double (*_weight)[3][2][OC_COMP_BINS]);
+void oc_enc_mode_metrics_load(oc_enc_ctx *_enc);
+void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc);
+
+# endif
+#endif
diff --git a/thirdparty/libtheora/dct.h b/thirdparty/libtheora/dct.h
index 24ba6f111a..8052ea6bc1 100644
--- a/thirdparty/libtheora/dct.h
+++ b/thirdparty/libtheora/dct.h
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-  last mod: $Id: dct.h 16503 2009-08-22 18:14:02Z giles $
+  last mod: $Id$
 
  ********************************************************************/
 
diff --git a/thirdparty/libtheora/decinfo.c b/thirdparty/libtheora/decinfo.c
index 845eb1361c..a91e740b15 100644
--- a/thirdparty/libtheora/decinfo.c
+++ b/thirdparty/libtheora/decinfo.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: decinfo.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
@@ -20,6 +20,11 @@
 #include <limits.h>
 #include "decint.h"
 
+/*Only used for fuzzing.*/
+#if defined(HAVE_MEMORY_CONSTRAINT)
+static const int MAX_FUZZING_WIDTH = 16384;
+static const int MAX_FUZZING_HEIGHT = 16384;
+#endif
 
 
 /*Unpacks a series of octets from a given byte array into the pack buffer.
@@ -55,8 +60,8 @@ static int oc_info_unpack(oc_pack_buf *_opb,th_info *_info){
   /*verify we can parse this bitstream version.
      We accept earlier minors and all subminors, by spec*/
   if(_info->version_major>TH_VERSION_MAJOR||
-   _info->version_major==TH_VERSION_MAJOR&&
-   _info->version_minor>TH_VERSION_MINOR){
+   (_info->version_major==TH_VERSION_MAJOR&&
+   _info->version_minor>TH_VERSION_MINOR)){
     return TH_EVERSION;
   }
   /*Read the encoded frame description.*/
@@ -82,6 +87,11 @@ static int oc_info_unpack(oc_pack_buf *_opb,th_info *_info){
    _info->fps_numerator==0||_info->fps_denominator==0){
     return TH_EBADHEADER;
   }
+#if defined(HAVE_MEMORY_CONSTRAINT)
+  if(_info->frame_width>=MAX_FUZZING_WIDTH&&_info->frame_height>=MAX_FUZZING_HEIGHT){
+    return TH_EBADHEADER;
+  }
+#endif
   /*Note: The sense of pic_y is inverted in what we pass back to the
      application compared to how it is stored in the bitstream.
     This is because the bitstream uses a right-handed coordinate system, while
@@ -128,6 +138,10 @@ static int oc_comment_unpack(oc_pack_buf *_opb,th_comment *_tc){
    _tc->comments*sizeof(_tc->comment_lengths[0]));
   _tc->user_comments=(char **)_ogg_malloc(
    _tc->comments*sizeof(_tc->user_comments[0]));
+  if(_tc->comment_lengths==NULL||_tc->user_comments==NULL){
+    _tc->comments=0;
+    return TH_EFAULT;
+  }
   for(i=0;i<_tc->comments;i++){
     len=oc_unpack_length(_opb);
     if(len<0||len>oc_pack_bytes_left(_opb)){
@@ -168,9 +182,23 @@ static int oc_dec_headerin(oc_pack_buf *_opb,th_info *_info,
   int  ret;
   val=oc_pack_read(_opb,8);
   packtype=(int)val;
-  /*If we're at a data packet and we have received all three headers, we're
-     done.*/
-  if(!(packtype&0x80)&&_info->frame_width>0&&_tc->vendor!=NULL&&*_setup!=NULL){
+  /*If we're at a data packet...*/
+  if(!(packtype&0x80)){
+    /*Check to make sure we received all three headers...
+      If we haven't seen any valid headers, assume this is not actually
+       Theora.*/
+    if(_info->frame_width<=0)return TH_ENOTFORMAT;
+    /*Follow our documentation, which says we'll return TH_EFAULT if these
+       are NULL (_info was checked by our caller).*/
+    if(_tc==NULL)return TH_EFAULT;
+    /*And if any other headers were missing, declare this packet "out of
+       sequence" instead.*/
+    if(_tc->vendor==NULL)return TH_EBADHEADER;
+    /*Don't check this until it's needed, since we allow passing NULL for the
+       arguments that we're not expecting the next header to fill in yet.*/
+    if(_setup==NULL)return TH_EFAULT;
+    if(*_setup==NULL)return TH_EBADHEADER;
+    /*If we got everything, we're done.*/
     return 0;
   }
   /*Check the codec string.*/
diff --git a/thirdparty/libtheora/decint.h b/thirdparty/libtheora/decint.h
index 261b67631a..3cea6b1439 100644
--- a/thirdparty/libtheora/decint.h
+++ b/thirdparty/libtheora/decint.h
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: decint.h 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
@@ -19,15 +19,39 @@
 #if !defined(_decint_H)
 # define _decint_H (1)
 # include "theora/theoradec.h"
-# include "internal.h"
+# include "state.h"
 # include "bitpack.h"
-
-typedef struct th_setup_info oc_setup_info;
-typedef struct th_dec_ctx    oc_dec_ctx;
-
 # include "huffdec.h"
 # include "dequant.h"
 
+typedef struct th_setup_info         oc_setup_info;
+typedef struct oc_dec_opt_vtable     oc_dec_opt_vtable;
+typedef struct oc_dec_pipeline_state oc_dec_pipeline_state;
+typedef struct th_dec_ctx            oc_dec_ctx;
+
+
+
+/*Decoder-specific accelerated functions.*/
+# if defined(OC_C64X_ASM)
+#  include "c64x/c64xdec.h"
+# endif
+
+# if !defined(oc_dec_accel_init)
+#  define oc_dec_accel_init oc_dec_accel_init_c
+# endif
+# if defined(OC_DEC_USE_VTABLE)
+#  if !defined(oc_dec_dc_unpredict_mcu_plane)
+#   define oc_dec_dc_unpredict_mcu_plane(_dec,_pipe,_pli) \
+ ((*(_dec)->opt_vtable.dc_unpredict_mcu_plane)(_dec,_pipe,_pli))
+#  endif
+# else
+#  if !defined(oc_dec_dc_unpredict_mcu_plane)
+#   define oc_dec_dc_unpredict_mcu_plane oc_dec_dc_unpredict_mcu_plane_c
+#  endif
+# endif
+
+
+
 /*Constants for the packet-in state machine specific to the decoder.*/
 
 /*Next packet to read: Data packet.*/
@@ -37,71 +61,125 @@ typedef struct th_dec_ctx    oc_dec_ctx;
 
 struct th_setup_info{
   /*The Huffman codes.*/
-  oc_huff_node      *huff_tables[TH_NHUFFMAN_TABLES];
+  ogg_int16_t   *huff_tables[TH_NHUFFMAN_TABLES];
   /*The quantization parameters.*/
   th_quant_info  qinfo;
 };
 
 
 
+/*Decoder specific functions with accelerated variants.*/
+struct oc_dec_opt_vtable{
+  void (*dc_unpredict_mcu_plane)(oc_dec_ctx *_dec,
+   oc_dec_pipeline_state *_pipe,int _pli);
+};
+
+
+
+struct oc_dec_pipeline_state{
+  /*Decoded DCT coefficients.
+    These are placed here instead of on the stack so that they can persist
+     between blocks, which makes clearing them back to zero much faster when
+     only a few non-zero coefficients were decoded.
+    It requires at least 65 elements because the zig-zag index array uses the
+     65th element as a dumping ground for out-of-range indices to protect us
+     from buffer overflow.
+    We make it fully twice as large so that the second half can serve as the
+     reconstruction buffer, which saves passing another parameter to all the
+     acceleration functions.
+    It also solves problems with 16-byte alignment for NEON on ARM.
+    gcc (as of 4.2.1) only seems to be able to give stack variables 8-byte
+     alignment, and silently produces incorrect results if you ask for 16.
+    Finally, keeping it off the stack means there's less likely to be a data
+     hazard between the NEON co-processor and the regular ARM core, which avoids
+     unnecessary stalls.*/
+  OC_ALIGN16(ogg_int16_t dct_coeffs[128]);
+  OC_ALIGN16(signed char bounding_values[256]);
+  ptrdiff_t           ti[3][64];
+  ptrdiff_t           ebi[3][64];
+  ptrdiff_t           eob_runs[3][64];
+  const ptrdiff_t    *coded_fragis[3];
+  const ptrdiff_t    *uncoded_fragis[3];
+  ptrdiff_t           ncoded_fragis[3];
+  ptrdiff_t           nuncoded_fragis[3];
+  const ogg_uint16_t *dequant[3][3][2];
+  int                 fragy0[3];
+  int                 fragy_end[3];
+  int                 pred_last[3][4];
+  int                 mcu_nvfrags;
+  int                 loop_filter;
+  int                 pp_level;
+};
+
+
 struct th_dec_ctx{
   /*Shared encoder/decoder state.*/
-  oc_theora_state      state;
+  oc_theora_state        state;
   /*Whether or not packets are ready to be emitted.
     This takes on negative values while there are remaining header packets to
      be emitted, reaches 0 when the codec is ready for input, and goes to 1
      when a frame has been processed and a data packet is ready.*/
-  int                  packet_state;
+  int                    packet_state;
   /*Buffer in which to assemble packets.*/
-  oc_pack_buf          opb;
+  oc_pack_buf            opb;
   /*Huffman decode trees.*/
-  oc_huff_node        *huff_tables[TH_NHUFFMAN_TABLES];
+  ogg_int16_t           *huff_tables[TH_NHUFFMAN_TABLES];
   /*The index of the first token in each plane for each coefficient.*/
-  ptrdiff_t            ti0[3][64];
+  ptrdiff_t              ti0[3][64];
   /*The number of outstanding EOB runs at the start of each coefficient in each
      plane.*/
-  ptrdiff_t            eob_runs[3][64];
+  ptrdiff_t              eob_runs[3][64];
   /*The DCT token lists.*/
-  unsigned char       *dct_tokens;
+  unsigned char         *dct_tokens;
   /*The extra bits associated with DCT tokens.*/
-  unsigned char       *extra_bits;
+  unsigned char         *extra_bits;
   /*The number of dct tokens unpacked so far.*/
-  int                  dct_tokens_count;
+  int                    dct_tokens_count;
   /*The out-of-loop post-processing level.*/
-  int                  pp_level;
+  int                    pp_level;
   /*The DC scale used for out-of-loop deblocking.*/
-  int                  pp_dc_scale[64];
+  int                    pp_dc_scale[64];
   /*The sharpen modifier used for out-of-loop deringing.*/
-  int                  pp_sharp_mod[64];
+  int                    pp_sharp_mod[64];
   /*The DC quantization index of each block.*/
-  unsigned char       *dc_qis;
+  unsigned char         *dc_qis;
   /*The variance of each block.*/
-  int                 *variances;
+  int                   *variances;
   /*The storage for the post-processed frame buffer.*/
-  unsigned char       *pp_frame_data;
+  unsigned char         *pp_frame_data;
   /*Whether or not the post-processsed frame buffer has space for chroma.*/
-  int                  pp_frame_state;
+  int                    pp_frame_state;
   /*The buffer used for the post-processed frame.
     Note that this is _not_ guaranteed to have the same strides and offsets as
      the reference frame buffers.*/
-  th_ycbcr_buffer      pp_frame_buf;
+  th_ycbcr_buffer        pp_frame_buf;
   /*The striped decode callback function.*/
-  th_stripe_callback   stripe_cb;
+  th_stripe_callback     stripe_cb;
+  oc_dec_pipeline_state  pipe;
+# if defined(OC_DEC_USE_VTABLE)
+  /*Table for decoder acceleration functions.*/
+  oc_dec_opt_vtable      opt_vtable;
+# endif
 # if defined(HAVE_CAIRO)
   /*Output metrics for debugging.*/
-  int                  telemetry;
-  int                  telemetry_mbmode;
-  int                  telemetry_mv;
-  int                  telemetry_qi;
-  int                  telemetry_bits;
-  int                  telemetry_frame_bytes;
-  int                  telemetry_coding_bytes;
-  int                  telemetry_mode_bytes;
-  int                  telemetry_mv_bytes;
-  int                  telemetry_qi_bytes;
-  int                  telemetry_dc_bytes;
-  unsigned char       *telemetry_frame_data;
+  int                    telemetry_mbmode;
+  int                    telemetry_mv;
+  int                    telemetry_qi;
+  int                    telemetry_bits;
+  int                    telemetry_frame_bytes;
+  int                    telemetry_coding_bytes;
+  int                    telemetry_mode_bytes;
+  int                    telemetry_mv_bytes;
+  int                    telemetry_qi_bytes;
+  int                    telemetry_dc_bytes;
+  unsigned char         *telemetry_frame_data;
 # endif
 };
 
+/*Default pure-C implementations of decoder-specific accelerated functions.*/
+void oc_dec_accel_init_c(oc_dec_ctx *_dec);
+
+void oc_dec_dc_unpredict_mcu_plane_c(oc_dec_ctx *_dec,
+ oc_dec_pipeline_state *_pipe,int _pli);
+
 #endif
diff --git a/thirdparty/libtheora/decode.c b/thirdparty/libtheora/decode.c
index bde967b794..fad26e0927 100644
--- a/thirdparty/libtheora/decode.c
+++ b/thirdparty/libtheora/decode.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: decode.c 16581 2009-09-25 22:56:16Z gmaxwell $
+    last mod: $Id$
 
  ********************************************************************/
 
@@ -118,7 +118,7 @@ static const unsigned char OC_INTERNAL_DCT_TOKEN_EXTRA_BITS[15]={
 
 /*Whether or not an internal token needs any additional extra bits.*/
 #define OC_DCT_TOKEN_NEEDS_MORE(token) \
- (token<(sizeof(OC_INTERNAL_DCT_TOKEN_EXTRA_BITS)/ \
+ (token<(int)(sizeof(OC_INTERNAL_DCT_TOKEN_EXTRA_BITS)/ \
   sizeof(*OC_INTERNAL_DCT_TOKEN_EXTRA_BITS)))
 
 /*This token (OC_DCT_REPEAT_RUN3_TOKEN) requires more than 8 extra bits.*/
@@ -129,7 +129,7 @@ static const unsigned char OC_INTERNAL_DCT_TOKEN_EXTRA_BITS[15]={
    is not yet available everywhere; this should be equivalent.*/
 #define OC_DCT_EOB_FINISH (~(size_t)0>>1)
 
-/*The location of the (6) run legth bits in the code word.
+/*The location of the (6) run length bits in the code word.
   These are placed at index 0 and given 8 bits (even though 6 would suffice)
    because it may be faster to extract the lower byte on some platforms.*/
 #define OC_DCT_CW_RLEN_SHIFT (0)
@@ -297,8 +297,6 @@ static const ogg_int32_t OC_DCT_CODE_WORD[92]={
 
 
 static int oc_sb_run_unpack(oc_pack_buf *_opb){
-  long bits;
-  int ret;
   /*Coding scheme:
        Codeword            Run Length
      0                       1
@@ -308,32 +306,26 @@ static int oc_sb_run_unpack(oc_pack_buf *_opb){
      11110xxx                10-17
      111110xxxx              18-33
      111111xxxxxxxxxxxx      34-4129*/
-  bits=oc_pack_read1(_opb);
-  if(bits==0)return 1;
-  bits=oc_pack_read(_opb,2);
-  if((bits&2)==0)return 2+(int)bits;
-  else if((bits&1)==0){
-    bits=oc_pack_read1(_opb);
-    return 4+(int)bits;
-  }
-  bits=oc_pack_read(_opb,3);
-  if((bits&4)==0)return 6+(int)bits;
-  else if((bits&2)==0){
-    ret=10+((bits&1)<<2);
-    bits=oc_pack_read(_opb,2);
-    return ret+(int)bits;
-  }
-  else if((bits&1)==0){
-    bits=oc_pack_read(_opb,4);
-    return 18+(int)bits;
+  static const ogg_int16_t OC_SB_RUN_TREE[22]={
+    4,
+     -(1<<8|1),-(1<<8|1),-(1<<8|1),-(1<<8|1),
+     -(1<<8|1),-(1<<8|1),-(1<<8|1),-(1<<8|1),
+     -(3<<8|2),-(3<<8|2),-(3<<8|3),-(3<<8|3),
+     -(4<<8|4),-(4<<8|5),-(4<<8|2<<4|6-6),17,
+      2,
+       -(2<<8|2<<4|10-6),-(2<<8|2<<4|14-6),-(2<<8|4<<4|18-6),-(2<<8|12<<4|34-6)
+  };
+  int ret;
+  ret=oc_huff_token_decode(_opb,OC_SB_RUN_TREE);
+  if(ret>=0x10){
+    int offs;
+    offs=ret&0x1F;
+    ret=6+offs+(int)oc_pack_read(_opb,ret-offs>>4);
   }
-  bits=oc_pack_read(_opb,12);
-  return 34+(int)bits;
+  return ret;
 }
 
 static int oc_block_run_unpack(oc_pack_buf *_opb){
-  long bits;
-  long bits2;
   /*Coding scheme:
      Codeword             Run Length
      0x                      1-2
@@ -342,26 +334,37 @@ static int oc_block_run_unpack(oc_pack_buf *_opb){
      1110xx                  7-10
      11110xx                 11-14
      11111xxxx               15-30*/
-  bits=oc_pack_read(_opb,2);
-  if((bits&2)==0)return 1+(int)bits;
-  else if((bits&1)==0){
-    bits=oc_pack_read1(_opb);
-    return 3+(int)bits;
-  }
-  bits=oc_pack_read(_opb,2);
-  if((bits&2)==0)return 5+(int)bits;
-  else if((bits&1)==0){
-    bits=oc_pack_read(_opb,2);
-    return 7+(int)bits;
-  }
-  bits=oc_pack_read(_opb,3);
-  if((bits&4)==0)return 11+bits;
-  bits2=oc_pack_read(_opb,2);
-  return 15+((bits&3)<<2)+bits2;
+  static const ogg_int16_t OC_BLOCK_RUN_TREE[61]={
+    5,
+     -(2<<8|1),-(2<<8|1),-(2<<8|1),-(2<<8|1),
+     -(2<<8|1),-(2<<8|1),-(2<<8|1),-(2<<8|1),
+     -(2<<8|2),-(2<<8|2),-(2<<8|2),-(2<<8|2),
+     -(2<<8|2),-(2<<8|2),-(2<<8|2),-(2<<8|2),
+     -(3<<8|3),-(3<<8|3),-(3<<8|3),-(3<<8|3),
+     -(3<<8|4),-(3<<8|4),-(3<<8|4),-(3<<8|4),
+     -(4<<8|5),-(4<<8|5),-(4<<8|6),-(4<<8|6),
+     33,       36,       39,       44,
+      1,-(1<<8|7),-(1<<8|8),
+      1,-(1<<8|9),-(1<<8|10),
+      2,-(2<<8|11),-(2<<8|12),-(2<<8|13),-(2<<8|14),
+      4,
+       -(4<<8|15),-(4<<8|16),-(4<<8|17),-(4<<8|18),
+       -(4<<8|19),-(4<<8|20),-(4<<8|21),-(4<<8|22),
+       -(4<<8|23),-(4<<8|24),-(4<<8|25),-(4<<8|26),
+       -(4<<8|27),-(4<<8|28),-(4<<8|29),-(4<<8|30)
+  };
+  return oc_huff_token_decode(_opb,OC_BLOCK_RUN_TREE);
 }
 
 
 
+void oc_dec_accel_init_c(oc_dec_ctx *_dec){
+# if defined(OC_DEC_USE_VTABLE)
+  _dec->opt_vtable.dc_unpredict_mcu_plane=
+   oc_dec_dc_unpredict_mcu_plane_c;
+# endif
+}
+
 static int oc_dec_init(oc_dec_ctx *_dec,const th_info *_info,
  const th_setup_info *_setup){
   int qti;
@@ -371,7 +374,7 @@ static int oc_dec_init(oc_dec_ctx *_dec,const th_info *_info,
   ret=oc_state_init(&_dec->state,_info,3);
   if(ret<0)return ret;
   ret=oc_huff_trees_copy(_dec->huff_tables,
-   (const oc_huff_node *const *)_setup->huff_tables);
+   (const ogg_int16_t *const *)_setup->huff_tables);
   if(ret<0){
     oc_state_clear(&_dec->state);
     return ret;
@@ -406,6 +409,7 @@ static int oc_dec_init(oc_dec_ctx *_dec,const th_info *_info,
   }
   memcpy(_dec->state.loop_filter_limits,_setup->qinfo.loop_filter_limits,
    sizeof(_dec->state.loop_filter_limits));
+  oc_dec_accel_init(_dec);
   _dec->pp_level=OC_PP_LEVEL_DISABLED;
   _dec->dc_qis=NULL;
   _dec->variances=NULL;
@@ -413,7 +417,6 @@ static int oc_dec_init(oc_dec_ctx *_dec,const th_info *_info,
   _dec->stripe_cb.ctx=NULL;
   _dec->stripe_cb.stripe_decoded=NULL;
 #if defined(HAVE_CAIRO)
-  _dec->telemetry=0;
   _dec->telemetry_bits=0;
   _dec->telemetry_qi=0;
   _dec->telemetry_mbmode=0;
@@ -504,6 +507,7 @@ static void oc_dec_mark_all_intra(oc_dec_ctx *_dec){
           fragi=sb_maps[sbi][quadi][bi];
           if(fragi>=0){
             frags[fragi].coded=1;
+            frags[fragi].refi=OC_FRAME_SELF;
             frags[fragi].mb_mode=OC_MODE_INTRA;
             coded_fragis[ncoded_fragis++]=fragi;
           }
@@ -595,6 +599,7 @@ static void oc_dec_coded_sb_flags_unpack(oc_dec_ctx *_dec){
 static void oc_dec_coded_flags_unpack(oc_dec_ctx *_dec){
   const oc_sb_map   *sb_maps;
   const oc_sb_flags *sb_flags;
+  signed char       *mb_modes;
   oc_fragment       *frags;
   unsigned           nsbs;
   unsigned           sbi;
@@ -617,6 +622,7 @@ static void oc_dec_coded_flags_unpack(oc_dec_ctx *_dec){
   else flag=0;
   sb_maps=(const oc_sb_map *)_dec->state.sb_maps;
   sb_flags=_dec->state.sb_flags;
+  mb_modes=_dec->state.mb_modes;
   frags=_dec->state.frags;
   sbi=nsbs=run_count=0;
   coded_fragis=_dec->state.coded_fragis;
@@ -627,7 +633,9 @@ static void oc_dec_coded_flags_unpack(oc_dec_ctx *_dec){
     for(;sbi<nsbs;sbi++){
       int quadi;
       for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
+        int quad_coded;
         int bi;
+        quad_coded=0;
         for(bi=0;bi<4;bi++){
           ptrdiff_t fragi;
           fragi=sb_maps[sbi][quadi][bi];
@@ -645,9 +653,13 @@ static void oc_dec_coded_flags_unpack(oc_dec_ctx *_dec){
             }
             if(coded)coded_fragis[ncoded_fragis++]=fragi;
             else *(uncoded_fragis-++nuncoded_fragis)=fragi;
+            quad_coded|=coded;
             frags[fragi].coded=coded;
+            frags[fragi].refi=OC_FRAME_NONE;
           }
         }
+        /*Remember if there's a coded luma block in this macro block.*/
+        if(!pli)mb_modes[sbi<<2|quadi]=quad_coded;
       }
     }
     _dec->state.ncoded_fragis[pli]=ncoded_fragis-prev_ncoded_fragis;
@@ -659,33 +671,39 @@ static void oc_dec_coded_flags_unpack(oc_dec_ctx *_dec){
 }
 
 
+/*Coding scheme:
+   Codeword            Mode Index
+   0                       0
+   10                      1
+   110                     2
+   1110                    3
+   11110                   4
+   111110                  5
+   1111110                 6
+   1111111                 7*/
+static const ogg_int16_t OC_VLC_MODE_TREE[26]={
+  4,
+   -(1<<8|0),-(1<<8|0),-(1<<8|0),-(1<<8|0),
+   -(1<<8|0),-(1<<8|0),-(1<<8|0),-(1<<8|0),
+   -(2<<8|1),-(2<<8|1),-(2<<8|1),-(2<<8|1),
+   -(3<<8|2),-(3<<8|2),-(4<<8|3),17,
+    3,
+     -(1<<8|4),-(1<<8|4),-(1<<8|4),-(1<<8|4),
+     -(2<<8|5),-(2<<8|5),-(3<<8|6),-(3<<8|7)
+};
 
-typedef int (*oc_mode_unpack_func)(oc_pack_buf *_opb);
-
-static int oc_vlc_mode_unpack(oc_pack_buf *_opb){
-  long val;
-  int  i;
-  for(i=0;i<7;i++){
-    val=oc_pack_read1(_opb);
-    if(!val)break;
-  }
-  return i;
-}
-
-static int oc_clc_mode_unpack(oc_pack_buf *_opb){
-  long val;
-  val=oc_pack_read(_opb,3);
-  return (int)val;
-}
+static const ogg_int16_t OC_CLC_MODE_TREE[9]={
+  3,
+   -(3<<8|0),-(3<<8|1),-(3<<8|2),-(3<<8|3),
+   -(3<<8|4),-(3<<8|5),-(3<<8|6),-(3<<8|7)
+};
 
 /*Unpacks the list of macro block modes for INTER frames.*/
 static void oc_dec_mb_modes_unpack(oc_dec_ctx *_dec){
-  const oc_mb_map     *mb_maps;
   signed char         *mb_modes;
-  const oc_fragment   *frags;
   const unsigned char *alphabet;
   unsigned char        scheme0_alphabet[8];
-  oc_mode_unpack_func  mode_unpack;
+  const ogg_int16_t   *mode_tree;
   size_t               nmbs;
   size_t               mbi;
   long                 val;
@@ -707,65 +725,80 @@ static void oc_dec_mb_modes_unpack(oc_dec_ctx *_dec){
     alphabet=scheme0_alphabet;
   }
   else alphabet=OC_MODE_ALPHABETS[mode_scheme-1];
-  if(mode_scheme==7)mode_unpack=oc_clc_mode_unpack;
-  else mode_unpack=oc_vlc_mode_unpack;
+  mode_tree=mode_scheme==7?OC_CLC_MODE_TREE:OC_VLC_MODE_TREE;
   mb_modes=_dec->state.mb_modes;
-  mb_maps=(const oc_mb_map *)_dec->state.mb_maps;
   nmbs=_dec->state.nmbs;
-  frags=_dec->state.frags;
   for(mbi=0;mbi<nmbs;mbi++){
-    if(mb_modes[mbi]!=OC_MODE_INVALID){
-      int bi;
-      /*Check for a coded luma block in this macro block.*/
-      for(bi=0;bi<4&&!frags[mb_maps[mbi][0][bi]].coded;bi++);
-      /*We found one, decode a mode.*/
-      if(bi<4)mb_modes[mbi]=alphabet[(*mode_unpack)(&_dec->opb)];
-      /*There were none: INTER_NOMV is forced.*/
-      else mb_modes[mbi]=OC_MODE_INTER_NOMV;
+    if(mb_modes[mbi]>0){
+      /*We have a coded luma block; decode a mode.*/
+      mb_modes[mbi]=alphabet[oc_huff_token_decode(&_dec->opb,mode_tree)];
     }
+    /*For other valid macro blocks, INTER_NOMV is forced, but we rely on the
+       fact that OC_MODE_INTER_NOMV is already 0.*/
   }
 }
 
 
 
-typedef int (*oc_mv_comp_unpack_func)(oc_pack_buf *_opb);
+static const ogg_int16_t OC_VLC_MV_COMP_TREE[101]={
+  5,
+   -(3<<8|32+0),-(3<<8|32+0),-(3<<8|32+0),-(3<<8|32+0),
+   -(3<<8|32+1),-(3<<8|32+1),-(3<<8|32+1),-(3<<8|32+1),
+   -(3<<8|32-1),-(3<<8|32-1),-(3<<8|32-1),-(3<<8|32-1),
+   -(4<<8|32+2),-(4<<8|32+2),-(4<<8|32-2),-(4<<8|32-2),
+   -(4<<8|32+3),-(4<<8|32+3),-(4<<8|32-3),-(4<<8|32-3),
+   33,          36,          39,          42,
+   45,          50,          55,          60,
+   65,          74,          83,          92,
+    1,-(1<<8|32+4),-(1<<8|32-4),
+    1,-(1<<8|32+5),-(1<<8|32-5),
+    1,-(1<<8|32+6),-(1<<8|32-6),
+    1,-(1<<8|32+7),-(1<<8|32-7),
+    2,-(2<<8|32+8),-(2<<8|32-8),-(2<<8|32+9),-(2<<8|32-9),
+    2,-(2<<8|32+10),-(2<<8|32-10),-(2<<8|32+11),-(2<<8|32-11),
+    2,-(2<<8|32+12),-(2<<8|32-12),-(2<<8|32+13),-(2<<8|32-13),
+    2,-(2<<8|32+14),-(2<<8|32-14),-(2<<8|32+15),-(2<<8|32-15),
+    3,
+     -(3<<8|32+16),-(3<<8|32-16),-(3<<8|32+17),-(3<<8|32-17),
+     -(3<<8|32+18),-(3<<8|32-18),-(3<<8|32+19),-(3<<8|32-19),
+    3,
+     -(3<<8|32+20),-(3<<8|32-20),-(3<<8|32+21),-(3<<8|32-21),
+     -(3<<8|32+22),-(3<<8|32-22),-(3<<8|32+23),-(3<<8|32-23),
+    3,
+     -(3<<8|32+24),-(3<<8|32-24),-(3<<8|32+25),-(3<<8|32-25),
+     -(3<<8|32+26),-(3<<8|32-26),-(3<<8|32+27),-(3<<8|32-27),
+    3,
+     -(3<<8|32+28),-(3<<8|32-28),-(3<<8|32+29),-(3<<8|32-29),
+     -(3<<8|32+30),-(3<<8|32-30),-(3<<8|32+31),-(3<<8|32-31)
+};
+
+static const ogg_int16_t OC_CLC_MV_COMP_TREE[65]={
+  6,
+   -(6<<8|32 +0),-(6<<8|32 -0),-(6<<8|32 +1),-(6<<8|32 -1),
+   -(6<<8|32 +2),-(6<<8|32 -2),-(6<<8|32 +3),-(6<<8|32 -3),
+   -(6<<8|32 +4),-(6<<8|32 -4),-(6<<8|32 +5),-(6<<8|32 -5),
+   -(6<<8|32 +6),-(6<<8|32 -6),-(6<<8|32 +7),-(6<<8|32 -7),
+   -(6<<8|32 +8),-(6<<8|32 -8),-(6<<8|32 +9),-(6<<8|32 -9),
+   -(6<<8|32+10),-(6<<8|32-10),-(6<<8|32+11),-(6<<8|32-11),
+   -(6<<8|32+12),-(6<<8|32-12),-(6<<8|32+13),-(6<<8|32-13),
+   -(6<<8|32+14),-(6<<8|32-14),-(6<<8|32+15),-(6<<8|32-15),
+   -(6<<8|32+16),-(6<<8|32-16),-(6<<8|32+17),-(6<<8|32-17),
+   -(6<<8|32+18),-(6<<8|32-18),-(6<<8|32+19),-(6<<8|32-19),
+   -(6<<8|32+20),-(6<<8|32-20),-(6<<8|32+21),-(6<<8|32-21),
+   -(6<<8|32+22),-(6<<8|32-22),-(6<<8|32+23),-(6<<8|32-23),
+   -(6<<8|32+24),-(6<<8|32-24),-(6<<8|32+25),-(6<<8|32-25),
+   -(6<<8|32+26),-(6<<8|32-26),-(6<<8|32+27),-(6<<8|32-27),
+   -(6<<8|32+28),-(6<<8|32-28),-(6<<8|32+29),-(6<<8|32-29),
+   -(6<<8|32+30),-(6<<8|32-30),-(6<<8|32+31),-(6<<8|32-31)
+};
 
-static int oc_vlc_mv_comp_unpack(oc_pack_buf *_opb){
-  long bits;
-  int  mask;
-  int  mv;
-  bits=oc_pack_read(_opb,3);
-  switch(bits){
-    case  0:return 0;
-    case  1:return 1;
-    case  2:return -1;
-    case  3:
-    case  4:{
-      mv=(int)(bits-1);
-      bits=oc_pack_read1(_opb);
-    }break;
-    /*case  5:
-    case  6:
-    case  7:*/
-    default:{
-      mv=1<<bits-3;
-      bits=oc_pack_read(_opb,bits-2);
-      mv+=(int)(bits>>1);
-      bits&=1;
-    }break;
-  }
-  mask=-(int)bits;
-  return mv+mask^mask;
-}
 
-static int oc_clc_mv_comp_unpack(oc_pack_buf *_opb){
-  long bits;
-  int  mask;
-  int  mv;
-  bits=oc_pack_read(_opb,6);
-  mv=(int)bits>>1;
-  mask=-((int)bits&1);
-  return mv+mask^mask;
+static oc_mv oc_mv_unpack(oc_pack_buf *_opb,const ogg_int16_t *_tree){
+  int dx;
+  int dy;
+  dx=oc_huff_token_decode(_opb,_tree)-32;
+  dy=oc_huff_token_decode(_opb,_tree)-32;
+  return OC_MV(dx,dy);
 }
 
 /*Unpacks the list of motion vectors for INTER frames, and propagtes the macro
@@ -774,105 +807,93 @@ static void oc_dec_mv_unpack_and_frag_modes_fill(oc_dec_ctx *_dec){
   const oc_mb_map        *mb_maps;
   const signed char      *mb_modes;
   oc_set_chroma_mvs_func  set_chroma_mvs;
-  oc_mv_comp_unpack_func  mv_comp_unpack;
+  const ogg_int16_t      *mv_comp_tree;
   oc_fragment            *frags;
   oc_mv                  *frag_mvs;
   const unsigned char    *map_idxs;
   int                     map_nidxs;
-  oc_mv                   last_mv[2];
+  oc_mv                   last_mv;
+  oc_mv                   prior_mv;
   oc_mv                   cbmvs[4];
   size_t                  nmbs;
   size_t                  mbi;
   long                    val;
   set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_dec->state.info.pixel_fmt];
   val=oc_pack_read1(&_dec->opb);
-  mv_comp_unpack=val?oc_clc_mv_comp_unpack:oc_vlc_mv_comp_unpack;
+  mv_comp_tree=val?OC_CLC_MV_COMP_TREE:OC_VLC_MV_COMP_TREE;
   map_idxs=OC_MB_MAP_IDXS[_dec->state.info.pixel_fmt];
   map_nidxs=OC_MB_MAP_NIDXS[_dec->state.info.pixel_fmt];
-  memset(last_mv,0,sizeof(last_mv));
+  prior_mv=last_mv=0;
   frags=_dec->state.frags;
   frag_mvs=_dec->state.frag_mvs;
   mb_maps=(const oc_mb_map *)_dec->state.mb_maps;
   mb_modes=_dec->state.mb_modes;
   nmbs=_dec->state.nmbs;
   for(mbi=0;mbi<nmbs;mbi++){
-    int          mb_mode;
+    int mb_mode;
     mb_mode=mb_modes[mbi];
     if(mb_mode!=OC_MODE_INVALID){
-      oc_mv        mbmv;
-      ptrdiff_t    fragi;
-      int          coded[13];
-      int          codedi;
-      int          ncoded;
-      int          mapi;
-      int          mapii;
-      /*Search for at least one coded fragment.*/
-      ncoded=mapii=0;
-      do{
-        mapi=map_idxs[mapii];
-        fragi=mb_maps[mbi][mapi>>2][mapi&3];
-        if(frags[fragi].coded)coded[ncoded++]=mapi;
-      }
-      while(++mapii<map_nidxs);
-      if(ncoded<=0)continue;
-      switch(mb_mode){
-        case OC_MODE_INTER_MV_FOUR:{
-          oc_mv       lbmvs[4];
-          int         bi;
-          /*Mark the tail of the list, so we don't accidentally go past it.*/
-          coded[ncoded]=-1;
-          for(bi=codedi=0;bi<4;bi++){
-            if(coded[codedi]==bi){
-              codedi++;
-              fragi=mb_maps[mbi][0][bi];
-              frags[fragi].mb_mode=mb_mode;
-              lbmvs[bi][0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
-              lbmvs[bi][1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
-              memcpy(frag_mvs[fragi],lbmvs[bi],sizeof(lbmvs[bi]));
-            }
-            else lbmvs[bi][0]=lbmvs[bi][1]=0;
-          }
-          if(codedi>0){
-            memcpy(last_mv[1],last_mv[0],sizeof(last_mv[1]));
-            memcpy(last_mv[0],lbmvs[coded[codedi-1]],sizeof(last_mv[0]));
+      oc_mv     mbmv;
+      ptrdiff_t fragi;
+      int       mapi;
+      int       mapii;
+      int       refi;
+      if(mb_mode==OC_MODE_INTER_MV_FOUR){
+        oc_mv lbmvs[4];
+        int   bi;
+        prior_mv=last_mv;
+        for(bi=0;bi<4;bi++){
+          fragi=mb_maps[mbi][0][bi];
+          if(frags[fragi].coded){
+            frags[fragi].refi=OC_FRAME_PREV;
+            frags[fragi].mb_mode=OC_MODE_INTER_MV_FOUR;
+            lbmvs[bi]=last_mv=oc_mv_unpack(&_dec->opb,mv_comp_tree);
+            frag_mvs[fragi]=lbmvs[bi];
           }
-          if(codedi<ncoded){
-            (*set_chroma_mvs)(cbmvs,(const oc_mv *)lbmvs);
-            for(;codedi<ncoded;codedi++){
-              mapi=coded[codedi];
-              bi=mapi&3;
-              fragi=mb_maps[mbi][mapi>>2][bi];
-              frags[fragi].mb_mode=mb_mode;
-              memcpy(frag_mvs[fragi],cbmvs[bi],sizeof(cbmvs[bi]));
-            }
+          else lbmvs[bi]=0;
+        }
+        (*set_chroma_mvs)(cbmvs,lbmvs);
+        for(mapii=4;mapii<map_nidxs;mapii++){
+          mapi=map_idxs[mapii];
+          bi=mapi&3;
+          fragi=mb_maps[mbi][mapi>>2][bi];
+          if(frags[fragi].coded){
+            frags[fragi].refi=OC_FRAME_PREV;
+            frags[fragi].mb_mode=OC_MODE_INTER_MV_FOUR;
+            frag_mvs[fragi]=cbmvs[bi];
           }
-        }break;
-        case OC_MODE_INTER_MV:{
-          memcpy(last_mv[1],last_mv[0],sizeof(last_mv[1]));
-          mbmv[0]=last_mv[0][0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
-          mbmv[1]=last_mv[0][1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
-        }break;
-        case OC_MODE_INTER_MV_LAST:memcpy(mbmv,last_mv[0],sizeof(mbmv));break;
-        case OC_MODE_INTER_MV_LAST2:{
-          memcpy(mbmv,last_mv[1],sizeof(mbmv));
-          memcpy(last_mv[1],last_mv[0],sizeof(last_mv[1]));
-          memcpy(last_mv[0],mbmv,sizeof(last_mv[0]));
-        }break;
-        case OC_MODE_GOLDEN_MV:{
-          mbmv[0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
-          mbmv[1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
-        }break;
-        default:memset(mbmv,0,sizeof(mbmv));break;
+        }
       }
-      /*4MV mode fills in the fragments itself.
-        For all other modes we can use this common code.*/
-      if(mb_mode!=OC_MODE_INTER_MV_FOUR){
-        for(codedi=0;codedi<ncoded;codedi++){
-          mapi=coded[codedi];
+      else{
+        switch(mb_mode){
+          case OC_MODE_INTER_MV:{
+            prior_mv=last_mv;
+            last_mv=mbmv=oc_mv_unpack(&_dec->opb,mv_comp_tree);
+          }break;
+          case OC_MODE_INTER_MV_LAST:mbmv=last_mv;break;
+          case OC_MODE_INTER_MV_LAST2:{
+            mbmv=prior_mv;
+            prior_mv=last_mv;
+            last_mv=mbmv;
+          }break;
+          case OC_MODE_GOLDEN_MV:{
+            mbmv=oc_mv_unpack(&_dec->opb,mv_comp_tree);
+          }break;
+          default:mbmv=0;break;
+        }
+        /*Fill in the MVs for the fragments.*/
+        refi=OC_FRAME_FOR_MODE(mb_mode);
+        mapii=0;
+        do{
+          mapi=map_idxs[mapii];
           fragi=mb_maps[mbi][mapi>>2][mapi&3];
-          frags[fragi].mb_mode=mb_mode;
-          memcpy(frag_mvs[fragi],mbmv,sizeof(mbmv));
+          if(frags[fragi].coded){
+            frags[fragi].refi=refi;
+            frags[fragi].mb_mode=mb_mode;
+            frag_mvs[fragi]=mbmv;
+          }
         }
+        while(++mapii<map_nidxs);
       }
     }
   }
@@ -1181,6 +1202,9 @@ static void oc_dec_residual_tokens_unpack(oc_dec_ctx *_dec){
 
 
 static int oc_dec_postprocess_init(oc_dec_ctx *_dec){
+  /*musl libc malloc()/realloc() calls might use floating point, so make sure
+     we've cleared the MMX state for them.*/
+  oc_restore_fpu(&_dec->state);
   /*pp_level 0: disabled; free any memory used and return*/
   if(_dec->pp_level<=OC_PP_LEVEL_DISABLED){
     if(_dec->dc_qis!=NULL){
@@ -1301,34 +1325,16 @@ static int oc_dec_postprocess_init(oc_dec_ctx *_dec){
 }
 
 
-
-typedef struct{
-  int                 bounding_values[256];
-  ptrdiff_t           ti[3][64];
-  ptrdiff_t           eob_runs[3][64];
-  const ptrdiff_t    *coded_fragis[3];
-  const ptrdiff_t    *uncoded_fragis[3];
-  ptrdiff_t           ncoded_fragis[3];
-  ptrdiff_t           nuncoded_fragis[3];
-  const ogg_uint16_t *dequant[3][3][2];
-  int                 fragy0[3];
-  int                 fragy_end[3];
-  int                 pred_last[3][3];
-  int                 mcu_nvfrags;
-  int                 loop_filter;
-  int                 pp_level;
-}oc_dec_pipeline_state;
-
-
-
 /*Initialize the main decoding pipeline.*/
 static void oc_dec_pipeline_init(oc_dec_ctx *_dec,
  oc_dec_pipeline_state *_pipe){
   const ptrdiff_t *coded_fragis;
   const ptrdiff_t *uncoded_fragis;
+  int              flimit;
   int              pli;
   int              qii;
   int              qti;
+  int              zzi;
   /*If chroma is sub-sampled in the vertical direction, we have to decode two
      super block rows of Y' for each super block row of Cb and Cr.*/
   _pipe->mcu_nvfrags=4<<!(_dec->state.info.pixel_fmt&2);
@@ -1360,8 +1366,9 @@ static void oc_dec_pipeline_init(oc_dec_ctx *_dec,
   /*Set the previous DC predictor to 0 for all color planes and frame types.*/
   memset(_pipe->pred_last,0,sizeof(_pipe->pred_last));
   /*Initialize the bounding value array for the loop filter.*/
-  _pipe->loop_filter=!oc_state_loop_filter_init(&_dec->state,
-   _pipe->bounding_values);
+  flimit=_dec->state.loop_filter_limits[_dec->state.qis[0]];
+  _pipe->loop_filter=flimit!=0;
+  if(flimit!=0)oc_loop_filter_init(&_dec->state,_pipe->bounding_values,flimit);
   /*Initialize any buffers needed for post-processing.
     We also save the current post-processing level, to guard against the user
      changing it from a callback.*/
@@ -1374,13 +1381,15 @@ static void oc_dec_pipeline_init(oc_dec_ctx *_dec,
      _dec->state.ref_frame_bufs[_dec->state.ref_frame_idx[OC_FRAME_SELF]],
      sizeof(_dec->pp_frame_buf[0])*3);
   }
+  /*Clear down the DCT coefficient buffer for the first block.*/
+  for(zzi=0;zzi<64;zzi++)_pipe->dct_coeffs[zzi]=0;
 }
 
 /*Undo the DC prediction in a single plane of an MCU (one or two super block
    rows).
   As a side effect, the number of coded and uncoded fragments in this plane of
    the MCU is also computed.*/
-static void oc_dec_dc_unpredict_mcu_plane(oc_dec_ctx *_dec,
+void oc_dec_dc_unpredict_mcu_plane_c(oc_dec_ctx *_dec,
  oc_dec_pipeline_state *_pipe,int _pli){
   const oc_fragment_plane *fplane;
   oc_fragment             *frags;
@@ -1408,9 +1417,9 @@ static void oc_dec_dc_unpredict_mcu_plane(oc_dec_ctx *_dec,
          predictor for the same reference frame.*/
       for(fragx=0;fragx<nhfrags;fragx++,fragi++){
         if(frags[fragi].coded){
-          int ref;
-          ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
-          pred_last[ref]=frags[fragi].dc+=pred_last[ref];
+          int refi;
+          refi=frags[fragi].refi;
+          pred_last[refi]=frags[fragi].dc+=pred_last[refi];
           ncoded_fragis++;
         }
       }
@@ -1423,27 +1432,24 @@ static void oc_dec_dc_unpredict_mcu_plane(oc_dec_ctx *_dec,
       u_frags=frags-nhfrags;
       l_ref=-1;
       ul_ref=-1;
-      u_ref=u_frags[fragi].coded?OC_FRAME_FOR_MODE(u_frags[fragi].mb_mode):-1;
+      u_ref=u_frags[fragi].refi;
       for(fragx=0;fragx<nhfrags;fragx++,fragi++){
         int ur_ref;
         if(fragx+1>=nhfrags)ur_ref=-1;
-        else{
-          ur_ref=u_frags[fragi+1].coded?
-           OC_FRAME_FOR_MODE(u_frags[fragi+1].mb_mode):-1;
-        }
+        else ur_ref=u_frags[fragi+1].refi;
         if(frags[fragi].coded){
           int pred;
-          int ref;
-          ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
+          int refi;
+          refi=frags[fragi].refi;
           /*We break out a separate case based on which of our neighbors use
              the same reference frames.
             This is somewhat faster than trying to make a generic case which
              handles all of them, since it reduces lots of poorly predicted
              jumps to one switch statement, and also lets a number of the
              multiplications be optimized out by strength reduction.*/
-          switch((l_ref==ref)|(ul_ref==ref)<<1|
-           (u_ref==ref)<<2|(ur_ref==ref)<<3){
-            default:pred=pred_last[ref];break;
+          switch((l_ref==refi)|(ul_ref==refi)<<1|
+           (u_ref==refi)<<2|(ur_ref==refi)<<3){
+            default:pred=pred_last[refi];break;
             case  1:
             case  3:pred=frags[fragi-1].dc;break;
             case  2:pred=u_frags[fragi-1].dc;break;
@@ -1455,6 +1461,7 @@ static void oc_dec_dc_unpredict_mcu_plane(oc_dec_ctx *_dec,
             case  9:
             case 11:
             case 13:{
+              /*The TI compiler mis-compiles this line.*/
               pred=(75*frags[fragi-1].dc+53*u_frags[fragi+1].dc)/128;
             }break;
             case 10:pred=(u_frags[fragi-1].dc+u_frags[fragi+1].dc)/2;break;
@@ -1476,9 +1483,9 @@ static void oc_dec_dc_unpredict_mcu_plane(oc_dec_ctx *_dec,
               else if(abs(pred-p1)>128)pred=p1;
             }break;
           }
-          pred_last[ref]=frags[fragi].dc+=pred;
+          pred_last[refi]=frags[fragi].dc+=pred;
           ncoded_fragis++;
-          l_ref=ref;
+          l_ref=refi;
         }
         else l_ref=-1;
         ul_ref=u_ref;
@@ -1495,7 +1502,7 @@ static void oc_dec_dc_unpredict_mcu_plane(oc_dec_ctx *_dec,
 /*Reconstructs all coded fragments in a single MCU (one or two super block
    rows).
   This requires that each coded fragment have a proper macro block mode and
-   motion vector (if not in INTRA mode), and have it's DC value decoded, with
+   motion vector (if not in INTRA mode), and have its DC value decoded, with
    the DC prediction process reversed, and the number of coded and uncoded
    fragments in this plane of the MCU be counted.
   The token lists for each color plane and coefficient should also be filled
@@ -1522,16 +1529,11 @@ static void oc_dec_frags_recon_mcu_plane(oc_dec_ctx *_dec,
   eob_runs=_pipe->eob_runs[_pli];
   for(qti=0;qti<2;qti++)dc_quant[qti]=_pipe->dequant[_pli][0][qti][0];
   for(fragii=0;fragii<ncoded_fragis;fragii++){
-    /*This array is made one element larger because the zig-zag index array
-       uses the final element as a dumping ground for out-of-range indices
-       to protect us from buffer overflow.*/
-    OC_ALIGN8(ogg_int16_t dct_coeffs[65]);
     const ogg_uint16_t *ac_quant;
     ptrdiff_t           fragi;
     int                 last_zzi;
     int                 zzi;
     fragi=coded_fragis[fragii];
-    for(zzi=0;zzi<64;zzi++)dct_coeffs[zzi]=0;
     qti=frags[fragi].mb_mode!=OC_MODE_INTRA;
     ac_quant=_pipe->dequant[_pli][frags[fragi].qii][qti];
     /*Decode the AC coefficients.*/
@@ -1568,18 +1570,19 @@ static void oc_dec_frags_recon_mcu_plane(oc_dec_ctx *_dec,
         eob_runs[zzi]=eob;
         ti[zzi]=lti;
         zzi+=rlen;
-        dct_coeffs[dct_fzig_zag[zzi]]=(ogg_int16_t)(coeff*(int)ac_quant[zzi]);
+        _pipe->dct_coeffs[dct_fzig_zag[zzi]]=
+         (ogg_int16_t)(coeff*(int)ac_quant[zzi]);
         zzi+=!eob;
       }
     }
     /*TODO: zzi should be exactly 64 here.
       If it's not, we should report some kind of warning.*/
     zzi=OC_MINI(zzi,64);
-    dct_coeffs[0]=(ogg_int16_t)frags[fragi].dc;
+    _pipe->dct_coeffs[0]=(ogg_int16_t)frags[fragi].dc;
     /*last_zzi is always initialized.
       If your compiler thinks otherwise, it is dumb.*/
     oc_state_frag_recon(&_dec->state,fragi,_pli,
-     dct_coeffs,last_zzi,dc_quant[qti]);
+     _pipe->dct_coeffs,last_zzi,dc_quant[qti]);
   }
   _pipe->coded_fragis[_pli]+=ncoded_fragis;
   /*Right now the reconstructed MCU has only the coded blocks in it.*/
@@ -1593,9 +1596,14 @@ static void oc_dec_frags_recon_mcu_plane(oc_dec_ctx *_dec,
      code, and the hard case (high bitrate, high resolution) is handled
      correctly.*/
   /*Copy the uncoded blocks from the previous reference frame.*/
-  _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
-  oc_state_frag_copy_list(&_dec->state,_pipe->uncoded_fragis[_pli],
-   _pipe->nuncoded_fragis[_pli],OC_FRAME_SELF,OC_FRAME_PREV,_pli);
+  if(_pipe->nuncoded_fragis[_pli]>0){
+    _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
+    oc_frag_copy_list(&_dec->state,
+     _dec->state.ref_frame_data[OC_FRAME_SELF],
+     _dec->state.ref_frame_data[OC_FRAME_PREV],
+     _dec->state.ref_ystride[_pli],_pipe->uncoded_fragis[_pli],
+     _pipe->nuncoded_fragis[_pli],_dec->state.frag_buf_offs);
+  }
 }
 
 /*Filter a horizontal block edge.*/
@@ -1953,9 +1961,9 @@ static void oc_dec_dering_frag_rows(oc_dec_ctx *_dec,th_img_plane *_img,
 th_dec_ctx *th_decode_alloc(const th_info *_info,const th_setup_info *_setup){
   oc_dec_ctx *dec;
   if(_info==NULL||_setup==NULL)return NULL;
-  dec=_ogg_malloc(sizeof(*dec));
+  dec=oc_aligned_malloc(sizeof(*dec),16);
   if(dec==NULL||oc_dec_init(dec,_info,_setup)<0){
-    _ogg_free(dec);
+    oc_aligned_free(dec);
     return NULL;
   }
   dec->state.curframe_num=0;
@@ -1965,7 +1973,7 @@ th_dec_ctx *th_decode_alloc(const th_info *_info,const th_setup_info *_setup){
 void th_decode_free(th_dec_ctx *_dec){
   if(_dec!=NULL){
     oc_dec_clear(_dec);
-    _ogg_free(_dec);
+    oc_aligned_free(_dec); /*th_decode_alloc() uses oc_aligned_malloc().*/
   }
 }
 
@@ -2013,28 +2021,24 @@ int th_decode_ctl(th_dec_ctx *_dec,int _req,void *_buf,
   case TH_DECCTL_SET_TELEMETRY_MBMODE:{
     if(_dec==NULL||_buf==NULL)return TH_EFAULT;
     if(_buf_sz!=sizeof(int))return TH_EINVAL;
-    _dec->telemetry=1;
     _dec->telemetry_mbmode=*(int *)_buf;
     return 0;
   }break;
   case TH_DECCTL_SET_TELEMETRY_MV:{
     if(_dec==NULL||_buf==NULL)return TH_EFAULT;
     if(_buf_sz!=sizeof(int))return TH_EINVAL;
-    _dec->telemetry=1;
     _dec->telemetry_mv=*(int *)_buf;
     return 0;
   }break;
   case TH_DECCTL_SET_TELEMETRY_QI:{
     if(_dec==NULL||_buf==NULL)return TH_EFAULT;
     if(_buf_sz!=sizeof(int))return TH_EINVAL;
-    _dec->telemetry=1;
     _dec->telemetry_qi=*(int *)_buf;
     return 0;
   }break;
   case TH_DECCTL_SET_TELEMETRY_BITS:{
     if(_dec==NULL||_buf==NULL)return TH_EFAULT;
     if(_buf_sz!=sizeof(int))return TH_EINVAL;
-    _dec->telemetry=1;
     _dec->telemetry_bits=*(int *)_buf;
     return 0;
   }break;
@@ -2047,63 +2051,751 @@ int th_decode_ctl(th_dec_ctx *_dec,int _req,void *_buf,
    buffers (i.e., decoding did not start on a key frame).
   We initialize them to a solid gray here.*/
 static void oc_dec_init_dummy_frame(th_dec_ctx *_dec){
-  th_info *info;
-  size_t   yplane_sz;
-  size_t   cplane_sz;
-  int      yhstride;
-  int      yheight;
-  int      chstride;
-  int      cheight;
+  th_info   *info;
+  size_t     yplane_sz;
+  size_t     cplane_sz;
+  ptrdiff_t  yoffset; /*Bytes subtracted from ref_frame_data[0] for the fill.*/
+  int        yhstride;
+  int        yheight;
+  int        chstride;
+  int        cheight;
   _dec->state.ref_frame_idx[OC_FRAME_GOLD]=0;
   _dec->state.ref_frame_idx[OC_FRAME_PREV]=0;
-  _dec->state.ref_frame_idx[OC_FRAME_SELF]=1;
+  _dec->state.ref_frame_idx[OC_FRAME_SELF]=0; /*SELF shares buffer 0 too.*/
+  _dec->state.ref_frame_data[OC_FRAME_GOLD]=
+   _dec->state.ref_frame_data[OC_FRAME_PREV]=
+   _dec->state.ref_frame_data[OC_FRAME_SELF]=
+   _dec->state.ref_frame_bufs[0][0].data; /*All three use buffer 0's data.*/
+  memcpy(_dec->pp_frame_buf,_dec->state.ref_frame_bufs[0],
+   sizeof(_dec->pp_frame_buf[0])*3); /*Copies three entries from buffer 0.*/
   info=&_dec->state.info;
-  yhstride=info->frame_width+2*OC_UMV_PADDING;
+  yhstride=abs(_dec->state.ref_ystride[0]); /*Magnitude only; sign dropped.*/
   yheight=info->frame_height+2*OC_UMV_PADDING;
-  chstride=yhstride>>!(info->pixel_fmt&1);
+  chstride=abs(_dec->state.ref_ystride[1]); /*Magnitude only; sign dropped.*/
   cheight=yheight>>!(info->pixel_fmt&2);
-  yplane_sz=yhstride*(size_t)yheight;
+  yplane_sz=yhstride*(size_t)yheight+16; /*NOTE(review): +16 presumably mirrors padding in the frame allocation; confirm.*/
   cplane_sz=chstride*(size_t)cheight;
-  memset(_dec->state.ref_frame_data[0],0x80,yplane_sz+2*cplane_sz);
+  yoffset=yhstride*(ptrdiff_t)(yheight-OC_UMV_PADDING-1)+OC_UMV_PADDING; /*NOTE(review): presumably the distance from the start of the plane storage to ref_frame_data[0]; confirm.*/
+  memset(_dec->state.ref_frame_data[0]-yoffset,0x80,yplane_sz+2*cplane_sz); /*0x80 in every byte gives the solid gray.*/
+}
+
+#if defined(HAVE_CAIRO)
+static void oc_render_telemetry(th_dec_ctx *_dec,th_ycbcr_buffer _ycbcr,
+ int _telemetry){
+  /*Stuff the plane into cairo.*/
+  cairo_surface_t *cs;
+  unsigned char   *data;
+  unsigned char   *y_row;
+  unsigned char   *u_row;
+  unsigned char   *v_row;
+  unsigned char   *rgb_row;
+  int              cstride;
+  int              w;
+  int              h;
+  int              x;
+  int              y;
+  int              hdec;
+  int              vdec;
+  w=_ycbcr[0].width;
+  h=_ycbcr[0].height;
+  hdec=!(_dec->state.info.pixel_fmt&1);
+  vdec=!(_dec->state.info.pixel_fmt&2);
+  /*Lazy data buffer init.
+    We could try to re-use the post-processing buffer, which would save
+     memory, but complicate the allocation logic there.
+    I don't think anyone cares about memory usage when using telemetry; it is
+     not meant for embedded devices.*/
+  if(_dec->telemetry_frame_data==NULL){
+    _dec->telemetry_frame_data=_ogg_malloc(
+     (w*h+2*(w>>hdec)*(h>>vdec))*sizeof(*_dec->telemetry_frame_data));
+    if(_dec->telemetry_frame_data==NULL)return;
+  }
+  cs=cairo_image_surface_create(CAIRO_FORMAT_RGB24,w,h);
+  /*Sadly, no YUV support in Cairo (yet); convert into the RGB buffer.*/
+  data=cairo_image_surface_get_data(cs);
+  if(data==NULL){
+    cairo_surface_destroy(cs);
+    return;
+  }
+  cstride=cairo_image_surface_get_stride(cs);
+  y_row=_ycbcr[0].data;
+  u_row=_ycbcr[1].data;
+  v_row=_ycbcr[2].data;
+  rgb_row=data;
+  for(y=0;y<h;y++){
+    for(x=0;x<w;x++){
+      int r;
+      int g;
+      int b;
+      r=(1904000*y_row[x]+2609823*v_row[x>>hdec]-363703744)/1635200;
+      g=(3827562*y_row[x]-1287801*u_row[x>>hdec]
+       -2672387*v_row[x>>hdec]+447306710)/3287200;
+      b=(952000*y_row[x]+1649289*u_row[x>>hdec]-225932192)/817600;
+      rgb_row[4*x+0]=OC_CLAMP255(b);
+      rgb_row[4*x+1]=OC_CLAMP255(g);
+      rgb_row[4*x+2]=OC_CLAMP255(r);
+    }
+    y_row+=_ycbcr[0].stride;
+    u_row+=_ycbcr[1].stride&-((y&1)|!vdec);
+    v_row+=_ycbcr[2].stride&-((y&1)|!vdec);
+    rgb_row+=cstride;
+  }
+  /*Draw coded identifier for each macroblock (stored in Hilbert order).*/
+  {
+    cairo_t           *c;
+    const oc_fragment *frags;
+    oc_mv             *frag_mvs;
+    const signed char *mb_modes;
+    oc_mb_map         *mb_maps;
+    size_t             nmbs;
+    size_t             mbi;
+    int                row2;
+    int                col2;
+    int                qim[3]={0,0,0};
+    if(_dec->state.nqis==2){
+      int bqi;
+      bqi=_dec->state.qis[0];
+      if(_dec->state.qis[1]>bqi)qim[1]=1;
+      if(_dec->state.qis[1]<bqi)qim[1]=-1;
+    }
+    if(_dec->state.nqis==3){
+      int bqi;
+      int cqi;
+      int dqi;
+      bqi=_dec->state.qis[0];
+      cqi=_dec->state.qis[1];
+      dqi=_dec->state.qis[2];
+      if(cqi>bqi&&dqi>bqi){
+        if(dqi>cqi){
+          qim[1]=1;
+          qim[2]=2;
+        }
+        else{
+          qim[1]=2;
+          qim[2]=1;
+        }
+      }
+      else if(cqi<bqi&&dqi<bqi){
+        if(dqi<cqi){
+          qim[1]=-1;
+          qim[2]=-2;
+        }
+        else{
+          qim[1]=-2;
+          qim[2]=-1;
+        }
+      }
+      else{
+        if(cqi<bqi)qim[1]=-1;
+        else qim[1]=1;
+        if(dqi<bqi)qim[2]=-1;
+        else qim[2]=1;
+      }
+    }
+    c=cairo_create(cs);
+    frags=_dec->state.frags;
+    frag_mvs=_dec->state.frag_mvs;
+    mb_modes=_dec->state.mb_modes;
+    mb_maps=_dec->state.mb_maps;
+    nmbs=_dec->state.nmbs;
+    row2=0;
+    col2=0;
+    for(mbi=0;mbi<nmbs;mbi++){
+      float x;
+      float y;
+      int   bi;
+      y=h-(row2+((col2+1>>1)&1))*16-16;
+      x=(col2>>1)*16;
+      cairo_set_line_width(c,1.);
+      /*Keyframe (all intra) red box.*/
+      if(_dec->state.frame_type==OC_INTRA_FRAME){
+        if(_dec->telemetry_mbmode&0x02){
+          cairo_set_source_rgba(c,1.,0,0,.5);
+          cairo_rectangle(c,x+2.5,y+2.5,11,11);
+          cairo_stroke_preserve(c);
+          cairo_set_source_rgba(c,1.,0,0,.25);
+          cairo_fill(c);
+        }
+      }
+      else{
+        ptrdiff_t fragi;
+        int       frag_mvx;
+        int       frag_mvy;
+        for(bi=0;bi<4;bi++){
+          fragi=mb_maps[mbi][0][bi];
+          if(fragi>=0&&frags[fragi].coded){
+            frag_mvx=OC_MV_X(frag_mvs[fragi]);
+            frag_mvy=OC_MV_Y(frag_mvs[fragi]);
+            break;
+          }
+        }
+        if(bi<4){
+          switch(mb_modes[mbi]){
+            case OC_MODE_INTRA:{
+              if(_dec->telemetry_mbmode&0x02){
+                cairo_set_source_rgba(c,1.,0,0,.5);
+                cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                cairo_stroke_preserve(c);
+                cairo_set_source_rgba(c,1.,0,0,.25);
+                cairo_fill(c);
+              }
+            }break;
+            case OC_MODE_INTER_NOMV:{
+              if(_dec->telemetry_mbmode&0x01){
+                cairo_set_source_rgba(c,0,0,1.,.5);
+                cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                cairo_stroke_preserve(c);
+                cairo_set_source_rgba(c,0,0,1.,.25);
+                cairo_fill(c);
+              }
+            }break;
+            case OC_MODE_INTER_MV:{
+              if(_dec->telemetry_mbmode&0x04){
+                cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                cairo_set_source_rgba(c,0,1.,0,.5);
+                cairo_stroke(c);
+              }
+              if(_dec->telemetry_mv&0x04){
+                cairo_move_to(c,x+8+frag_mvx,y+8-frag_mvy);
+                cairo_set_source_rgba(c,1.,1.,1.,.9);
+                cairo_set_line_width(c,3.);
+                cairo_line_to(c,x+8+frag_mvx*.66,y+8-frag_mvy*.66);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,2.);
+                cairo_line_to(c,x+8+frag_mvx*.33,y+8-frag_mvy*.33);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,1.);
+                cairo_line_to(c,x+8,y+8);
+                cairo_stroke(c);
+              }
+            }break;
+            case OC_MODE_INTER_MV_LAST:{
+              if(_dec->telemetry_mbmode&0x08){
+                cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                cairo_set_source_rgba(c,0,1.,0,.5);
+                cairo_move_to(c,x+13.5,y+2.5);
+                cairo_line_to(c,x+2.5,y+8);
+                cairo_line_to(c,x+13.5,y+13.5);
+                cairo_stroke(c);
+              }
+              if(_dec->telemetry_mv&0x08){
+                cairo_move_to(c,x+8+frag_mvx,y+8-frag_mvy);
+                cairo_set_source_rgba(c,1.,1.,1.,.9);
+                cairo_set_line_width(c,3.);
+                cairo_line_to(c,x+8+frag_mvx*.66,y+8-frag_mvy*.66);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,2.);
+                cairo_line_to(c,x+8+frag_mvx*.33,y+8-frag_mvy*.33);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,1.);
+                cairo_line_to(c,x+8,y+8);
+                cairo_stroke(c);
+              }
+            }break;
+            case OC_MODE_INTER_MV_LAST2:{
+              if(_dec->telemetry_mbmode&0x10){
+                cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                cairo_set_source_rgba(c,0,1.,0,.5);
+                cairo_move_to(c,x+8,y+2.5);
+                cairo_line_to(c,x+2.5,y+8);
+                cairo_line_to(c,x+8,y+13.5);
+                cairo_move_to(c,x+13.5,y+2.5);
+                cairo_line_to(c,x+8,y+8);
+                cairo_line_to(c,x+13.5,y+13.5);
+                cairo_stroke(c);
+              }
+              if(_dec->telemetry_mv&0x10){
+                cairo_move_to(c,x+8+frag_mvx,y+8-frag_mvy);
+                cairo_set_source_rgba(c,1.,1.,1.,.9);
+                cairo_set_line_width(c,3.);
+                cairo_line_to(c,x+8+frag_mvx*.66,y+8-frag_mvy*.66);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,2.);
+                cairo_line_to(c,x+8+frag_mvx*.33,y+8-frag_mvy*.33);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,1.);
+                cairo_line_to(c,x+8,y+8);
+                cairo_stroke(c);
+              }
+            }break;
+            case OC_MODE_GOLDEN_NOMV:{
+              if(_dec->telemetry_mbmode&0x20){
+                cairo_set_source_rgba(c,1.,1.,0,.5);
+                cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                cairo_stroke_preserve(c);
+                cairo_set_source_rgba(c,1.,1.,0,.25);
+                cairo_fill(c);
+              }
+            }break;
+            case OC_MODE_GOLDEN_MV:{
+              if(_dec->telemetry_mbmode&0x40){
+                cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                cairo_set_source_rgba(c,1.,1.,0,.5);
+                cairo_stroke(c);
+              }
+              if(_dec->telemetry_mv&0x40){
+                cairo_move_to(c,x+8+frag_mvx,y+8-frag_mvy);
+                cairo_set_source_rgba(c,1.,1.,1.,.9);
+                cairo_set_line_width(c,3.);
+                cairo_line_to(c,x+8+frag_mvx*.66,y+8-frag_mvy*.66);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,2.);
+                cairo_line_to(c,x+8+frag_mvx*.33,y+8-frag_mvy*.33);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,1.);
+                cairo_line_to(c,x+8,y+8);
+                cairo_stroke(c);
+              }
+            }break;
+            case OC_MODE_INTER_MV_FOUR:{ /*One MV arrow per coded luma block.*/
+              if(_dec->telemetry_mbmode&0x80){
+                cairo_rectangle(c,x+2.5,y+2.5,4,4);
+                cairo_rectangle(c,x+9.5,y+2.5,4,4);
+                cairo_rectangle(c,x+2.5,y+9.5,4,4);
+                cairo_rectangle(c,x+9.5,y+9.5,4,4);
+                cairo_set_source_rgba(c,0,1.,0,.5);
+                cairo_stroke(c);
+              }
+              /*4mv is odd, coded in raster order.*/
+              fragi=mb_maps[mbi][0][0]; /*Block 0: arrow ends at (x+4,y+12).*/
+              if(frags[fragi].coded&&_dec->telemetry_mv&0x80){
+                frag_mvx=OC_MV_X(frag_mvs[fragi]);
+                frag_mvy=OC_MV_Y(frag_mvs[fragi]);
+                cairo_move_to(c,x+4+frag_mvx,y+12-frag_mvy);
+                cairo_set_source_rgba(c,1.,1.,1.,.9);
+                cairo_set_line_width(c,3.);
+                cairo_line_to(c,x+4+frag_mvx*.66,y+12-frag_mvy*.66);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,2.);
+                cairo_line_to(c,x+4+frag_mvx*.33,y+12-frag_mvy*.33);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,1.);
+                cairo_line_to(c,x+4,y+12);
+                cairo_stroke(c);
+              }
+              fragi=mb_maps[mbi][0][1]; /*Block 1: arrow ends at (x+12,y+12).*/
+              if(frags[fragi].coded&&_dec->telemetry_mv&0x80){
+                frag_mvx=OC_MV_X(frag_mvs[fragi]);
+                frag_mvy=OC_MV_Y(frag_mvs[fragi]);
+                cairo_move_to(c,x+12+frag_mvx,y+12-frag_mvy);
+                cairo_set_source_rgba(c,1.,1.,1.,.9);
+                cairo_set_line_width(c,3.);
+                cairo_line_to(c,x+12+frag_mvx*.66,y+12-frag_mvy*.66);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,2.);
+                cairo_line_to(c,x+12+frag_mvx*.33,y+12-frag_mvy*.33);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,1.);
+                cairo_line_to(c,x+12,y+12);
+                cairo_stroke(c);
+              }
+              fragi=mb_maps[mbi][0][2]; /*Block 2: arrow ends at (x+4,y+4).*/
+              if(frags[fragi].coded&&_dec->telemetry_mv&0x80){
+                frag_mvx=OC_MV_X(frag_mvs[fragi]);
+                frag_mvy=OC_MV_Y(frag_mvs[fragi]);
+                cairo_move_to(c,x+4+frag_mvx,y+4-frag_mvy);
+                cairo_set_source_rgba(c,1.,1.,1.,.9);
+                cairo_set_line_width(c,3.);
+                cairo_line_to(c,x+4+frag_mvx*.66,y+4-frag_mvy*.66);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,2.);
+                cairo_line_to(c,x+4+frag_mvx*.33,y+4-frag_mvy*.33);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,1.);
+                cairo_line_to(c,x+4,y+4);
+                cairo_stroke(c);
+              }
+              fragi=mb_maps[mbi][0][3]; /*Block 3: arrow ends at (x+12,y+4).*/
+              if(frags[fragi].coded&&_dec->telemetry_mv&0x80){
+                frag_mvx=OC_MV_X(frag_mvs[fragi]);
+                frag_mvy=OC_MV_Y(frag_mvs[fragi]);
+                cairo_move_to(c,x+12+frag_mvx,y+4-frag_mvy);
+                cairo_set_source_rgba(c,1.,1.,1.,.9);
+                cairo_set_line_width(c,3.);
+                cairo_line_to(c,x+12+frag_mvx*.66,y+4-frag_mvy*.66);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,2.);
+                cairo_line_to(c,x+12+frag_mvx*.33,y+4-frag_mvy*.33);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,1.);
+                cairo_line_to(c,x+12,y+4);
+                cairo_stroke(c);
+              }
+            }break;
+          }
+        }
+      }
+      /*qii illustration.*/
+      if(_dec->telemetry_qi&0x2){
+        cairo_set_line_cap(c,CAIRO_LINE_CAP_SQUARE);
+        for(bi=0;bi<4;bi++){
+          ptrdiff_t fragi;
+          int       qiv;
+          int       xp;
+          int       yp;
+          xp=x+(bi&1)*8;
+          yp=y+8-(bi&2)*4;
+          fragi=mb_maps[mbi][0][bi];
+          if(fragi>=0&&frags[fragi].coded){
+            qiv=qim[frags[fragi].qii];
+            cairo_set_line_width(c,3.);
+            cairo_set_source_rgba(c,0.,0.,0.,.5);
+            switch(qiv){
+              /*Double plus:*/
+              case 2:{
+                if((bi&1)^((bi&2)>>1)){
+                  cairo_move_to(c,xp+2.5,yp+1.5);
+                  cairo_line_to(c,xp+2.5,yp+3.5);
+                  cairo_move_to(c,xp+1.5,yp+2.5);
+                  cairo_line_to(c,xp+3.5,yp+2.5);
+                  cairo_move_to(c,xp+5.5,yp+4.5);
+                  cairo_line_to(c,xp+5.5,yp+6.5);
+                  cairo_move_to(c,xp+4.5,yp+5.5);
+                  cairo_line_to(c,xp+6.5,yp+5.5);
+                  cairo_stroke_preserve(c);
+                  cairo_set_source_rgba(c,0.,1.,1.,1.);
+                }
+                else{
+                  cairo_move_to(c,xp+5.5,yp+1.5);
+                  cairo_line_to(c,xp+5.5,yp+3.5);
+                  cairo_move_to(c,xp+4.5,yp+2.5);
+                  cairo_line_to(c,xp+6.5,yp+2.5);
+                  cairo_move_to(c,xp+2.5,yp+4.5);
+                  cairo_line_to(c,xp+2.5,yp+6.5);
+                  cairo_move_to(c,xp+1.5,yp+5.5);
+                  cairo_line_to(c,xp+3.5,yp+5.5);
+                  cairo_stroke_preserve(c);
+                  cairo_set_source_rgba(c,0.,1.,1.,1.);
+                }
+              }break;
+              /*Double minus:*/
+              case -2:{
+                cairo_move_to(c,xp+2.5,yp+2.5);
+                cairo_line_to(c,xp+5.5,yp+2.5);
+                cairo_move_to(c,xp+2.5,yp+5.5);
+                cairo_line_to(c,xp+5.5,yp+5.5);
+                cairo_stroke_preserve(c);
+                cairo_set_source_rgba(c,1.,1.,1.,1.);
+              }break;
+              /*Plus:*/
+              case 1:{
+                if((bi&2)==0)yp-=2;
+                if((bi&1)==0)xp-=2;
+                cairo_move_to(c,xp+4.5,yp+2.5);
+                cairo_line_to(c,xp+4.5,yp+6.5);
+                cairo_move_to(c,xp+2.5,yp+4.5);
+                cairo_line_to(c,xp+6.5,yp+4.5);
+                cairo_stroke_preserve(c);
+                cairo_set_source_rgba(c,.1,1.,.3,1.);
+                break;
+              }
+              /*No fall through: plus draws both of its strokes above.*/
+              /*Minus:*/
+              case -1:{
+                cairo_move_to(c,xp+2.5,yp+4.5);
+                cairo_line_to(c,xp+6.5,yp+4.5);
+                cairo_stroke_preserve(c);
+                cairo_set_source_rgba(c,1.,.3,.1,1.);
+              }break;
+              default:continue;
+            }
+            cairo_set_line_width(c,1.);
+            cairo_stroke(c);
+          }
+        }
+      }
+      col2++;
+      if((col2>>1)>=_dec->state.nhmbs){
+        col2=0;
+        row2+=2;
+      }
+    }
+    /*Bit usage indicator[s]:*/
+    if(_dec->telemetry_bits){
+      int widths[6];
+      int fpsn;
+      int fpsd;
+      int mult;
+      int fullw;
+      int padw;
+      int i;
+      fpsn=_dec->state.info.fps_numerator;
+      fpsd=_dec->state.info.fps_denominator;
+      mult=(_dec->telemetry_bits>=0xFF?1:_dec->telemetry_bits);
+      fullw=250.f*h*fpsd*mult/fpsn;
+      padw=w-24;
+      /*Header and coded block bits.*/
+      if(_dec->telemetry_frame_bytes<0||
+       _dec->telemetry_frame_bytes==OC_LOTS_OF_BITS){
+        _dec->telemetry_frame_bytes=0;
+      }
+      if(_dec->telemetry_coding_bytes<0||
+       _dec->telemetry_coding_bytes>_dec->telemetry_frame_bytes){
+        _dec->telemetry_coding_bytes=0;
+      }
+      if(_dec->telemetry_mode_bytes<0||
+       _dec->telemetry_mode_bytes>_dec->telemetry_frame_bytes){
+        _dec->telemetry_mode_bytes=0;
+      }
+      if(_dec->telemetry_mv_bytes<0||
+       _dec->telemetry_mv_bytes>_dec->telemetry_frame_bytes){
+        _dec->telemetry_mv_bytes=0;
+      }
+      if(_dec->telemetry_qi_bytes<0||
+       _dec->telemetry_qi_bytes>_dec->telemetry_frame_bytes){
+        _dec->telemetry_qi_bytes=0;
+      }
+      if(_dec->telemetry_dc_bytes<0||
+       _dec->telemetry_dc_bytes>_dec->telemetry_frame_bytes){
+        _dec->telemetry_dc_bytes=0;
+      }
+      widths[0]=padw*
+       (_dec->telemetry_frame_bytes-_dec->telemetry_coding_bytes)/fullw;
+      widths[1]=padw*
+       (_dec->telemetry_coding_bytes-_dec->telemetry_mode_bytes)/fullw;
+      widths[2]=padw*
+       (_dec->telemetry_mode_bytes-_dec->telemetry_mv_bytes)/fullw;
+      widths[3]=padw*(_dec->telemetry_mv_bytes-_dec->telemetry_qi_bytes)/fullw;
+      widths[4]=padw*(_dec->telemetry_qi_bytes-_dec->telemetry_dc_bytes)/fullw;
+      widths[5]=padw*(_dec->telemetry_dc_bytes)/fullw;
+      for(i=0;i<6;i++)if(widths[i]>w)widths[i]=w;
+      cairo_set_source_rgba(c,.0,.0,.0,.6);
+      cairo_rectangle(c,10,h-33,widths[0]+1,5);
+      cairo_rectangle(c,10,h-29,widths[1]+1,5);
+      cairo_rectangle(c,10,h-25,widths[2]+1,5);
+      cairo_rectangle(c,10,h-21,widths[3]+1,5);
+      cairo_rectangle(c,10,h-17,widths[4]+1,5);
+      cairo_rectangle(c,10,h-13,widths[5]+1,5);
+      cairo_fill(c);
+      cairo_set_source_rgb(c,1,0,0);
+      cairo_rectangle(c,10.5,h-32.5,widths[0],4);
+      cairo_fill(c);
+      cairo_set_source_rgb(c,0,1,0);
+      cairo_rectangle(c,10.5,h-28.5,widths[1],4);
+      cairo_fill(c);
+      cairo_set_source_rgb(c,0,0,1);
+      cairo_rectangle(c,10.5,h-24.5,widths[2],4);
+      cairo_fill(c);
+      cairo_set_source_rgb(c,.6,.4,.0);
+      cairo_rectangle(c,10.5,h-20.5,widths[3],4);
+      cairo_fill(c);
+      cairo_set_source_rgb(c,.3,.3,.3);
+      cairo_rectangle(c,10.5,h-16.5,widths[4],4);
+      cairo_fill(c);
+      cairo_set_source_rgb(c,.5,.5,.8);
+      cairo_rectangle(c,10.5,h-12.5,widths[5],4);
+      cairo_fill(c);
+    }
+    /*Master qi indicator[s]:*/
+    if(_dec->telemetry_qi&0x1){
+      cairo_text_extents_t extents;
+      char                 buffer[10];
+      int                  p;
+      int                  y;
+      p=0;
+      y=h-7.5;
+      if(_dec->state.qis[0]>=10)buffer[p++]=48+_dec->state.qis[0]/10;
+      buffer[p++]=48+_dec->state.qis[0]%10;
+      if(_dec->state.nqis>=2){
+        buffer[p++]=' ';
+        if(_dec->state.qis[1]>=10)buffer[p++]=48+_dec->state.qis[1]/10;
+        buffer[p++]=48+_dec->state.qis[1]%10;
+      }
+      if(_dec->state.nqis==3){
+        buffer[p++]=' ';
+        if(_dec->state.qis[2]>=10)buffer[p++]=48+_dec->state.qis[2]/10;
+        buffer[p++]=48+_dec->state.qis[2]%10;
+      }
+      buffer[p++]='\0';
+      cairo_select_font_face(c,"sans",
+       CAIRO_FONT_SLANT_NORMAL,CAIRO_FONT_WEIGHT_BOLD);
+      cairo_set_font_size(c,18);
+      cairo_text_extents(c,buffer,&extents);
+      cairo_set_source_rgb(c,1,1,1);
+      cairo_move_to(c,w-extents.x_advance-10,y);
+      cairo_show_text(c,buffer);
+      cairo_set_source_rgb(c,0,0,0);
+      cairo_move_to(c,w-extents.x_advance-10,y);
+      cairo_text_path(c,buffer);
+      cairo_set_line_width(c,.8);
+      cairo_set_line_join(c,CAIRO_LINE_JOIN_ROUND);
+      cairo_stroke(c);
+    }
+    cairo_destroy(c);
+  }
+  /*Out of the Cairo plane into the telemetry YUV buffer.*/
+  _ycbcr[0].data=_dec->telemetry_frame_data;
+  _ycbcr[0].stride=_ycbcr[0].width;
+  _ycbcr[1].data=_ycbcr[0].data+h*_ycbcr[0].stride;
+  _ycbcr[1].stride=_ycbcr[1].width;
+  _ycbcr[2].data=_ycbcr[1].data+(h>>vdec)*_ycbcr[1].stride;
+  _ycbcr[2].stride=_ycbcr[2].width;
+  y_row=_ycbcr[0].data;
+  u_row=_ycbcr[1].data;
+  v_row=_ycbcr[2].data;
+  rgb_row=data;
+  /*This is one of the few places it's worth handling chroma on a
+     case-by-case basis.*/
+  switch(_dec->state.info.pixel_fmt){
+    case TH_PF_420:{
+      for(y=0;y<h;y+=2){
+        unsigned char *y_row2;
+        unsigned char *rgb_row2;
+        y_row2=y_row+_ycbcr[0].stride;
+        rgb_row2=rgb_row+cstride;
+        for(x=0;x<w;x+=2){
+          int y;
+          int u;
+          int v;
+          y=(65481*rgb_row[4*x+2]+128553*rgb_row[4*x+1]
+           +24966*rgb_row[4*x+0]+4207500)/255000;
+          y_row[x]=OC_CLAMP255(y);
+          y=(65481*rgb_row[4*x+6]+128553*rgb_row[4*x+5]
+           +24966*rgb_row[4*x+4]+4207500)/255000;
+          y_row[x+1]=OC_CLAMP255(y);
+          y=(65481*rgb_row2[4*x+2]+128553*rgb_row2[4*x+1]
+           +24966*rgb_row2[4*x+0]+4207500)/255000;
+          y_row2[x]=OC_CLAMP255(y);
+          y=(65481*rgb_row2[4*x+6]+128553*rgb_row2[4*x+5]
+           +24966*rgb_row2[4*x+4]+4207500)/255000;
+          y_row2[x+1]=OC_CLAMP255(y);
+          u=(-8372*(rgb_row[4*x+2]+rgb_row[4*x+6]
+           +rgb_row2[4*x+2]+rgb_row2[4*x+6])
+           -16436*(rgb_row[4*x+1]+rgb_row[4*x+5]
+           +rgb_row2[4*x+1]+rgb_row2[4*x+5])
+           +24808*(rgb_row[4*x+0]+rgb_row[4*x+4]
+           +rgb_row2[4*x+0]+rgb_row2[4*x+4])+29032005)/225930;
+          v=(39256*(rgb_row[4*x+2]+rgb_row[4*x+6]
+           +rgb_row2[4*x+2]+rgb_row2[4*x+6])
+           -32872*(rgb_row[4*x+1]+rgb_row[4*x+5]
+            +rgb_row2[4*x+1]+rgb_row2[4*x+5])
+           -6384*(rgb_row[4*x+0]+rgb_row[4*x+4]
+            +rgb_row2[4*x+0]+rgb_row2[4*x+4])+45940035)/357510;
+          u_row[x>>1]=OC_CLAMP255(u);
+          v_row[x>>1]=OC_CLAMP255(v);
+        }
+        y_row+=_ycbcr[0].stride<<1;
+        u_row+=_ycbcr[1].stride;
+        v_row+=_ycbcr[2].stride;
+        rgb_row+=cstride<<1;
+      }
+    }break;
+    case TH_PF_422:{
+      for(y=0;y<h;y++){
+        for(x=0;x<w;x+=2){
+          int y;
+          int u;
+          int v;
+          y=(65481*rgb_row[4*x+2]+128553*rgb_row[4*x+1]
+           +24966*rgb_row[4*x+0]+4207500)/255000;
+          y_row[x]=OC_CLAMP255(y);
+          y=(65481*rgb_row[4*x+6]+128553*rgb_row[4*x+5]
+           +24966*rgb_row[4*x+4]+4207500)/255000;
+          y_row[x+1]=OC_CLAMP255(y);
+          u=(-16744*(rgb_row[4*x+2]+rgb_row[4*x+6])
+           -32872*(rgb_row[4*x+1]+rgb_row[4*x+5])
+           +49616*(rgb_row[4*x+0]+rgb_row[4*x+4])+29032005)/225930;
+          v=(78512*(rgb_row[4*x+2]+rgb_row[4*x+6])
+           -65744*(rgb_row[4*x+1]+rgb_row[4*x+5])
+           -12768*(rgb_row[4*x+0]+rgb_row[4*x+4])+45940035)/357510;
+          u_row[x>>1]=OC_CLAMP255(u);
+          v_row[x>>1]=OC_CLAMP255(v);
+        }
+        y_row+=_ycbcr[0].stride;
+        u_row+=_ycbcr[1].stride;
+        v_row+=_ycbcr[2].stride;
+        rgb_row+=cstride;
+      }
+    }break;
+    /*case TH_PF_444:*/
+    default:{
+      for(y=0;y<h;y++){
+        for(x=0;x<w;x++){
+          int y;
+          int u;
+          int v;
+          y=(65481*rgb_row[4*x+2]+128553*rgb_row[4*x+1]
+           +24966*rgb_row[4*x+0]+4207500)/255000;
+          u=(-33488*rgb_row[4*x+2]-65744*rgb_row[4*x+1]
+           +99232*rgb_row[4*x+0]+29032005)/225930;
+          v=(157024*rgb_row[4*x+2]-131488*rgb_row[4*x+1]
+           -25536*rgb_row[4*x+0]+45940035)/357510;
+          y_row[x]=OC_CLAMP255(y);
+          u_row[x]=OC_CLAMP255(u);
+          v_row[x]=OC_CLAMP255(v);
+        }
+        y_row+=_ycbcr[0].stride;
+        u_row+=_ycbcr[1].stride;
+        v_row+=_ycbcr[2].stride;
+        rgb_row+=cstride;
+      }
+    }break;
+  }
+  /*Finished.
+    Destroy the surface.*/
+  cairo_surface_destroy(cs);
 }
+#endif
 
 int th_decode_packetin(th_dec_ctx *_dec,const ogg_packet *_op,
  ogg_int64_t *_granpos){
   int ret;
   if(_dec==NULL||_op==NULL)return TH_EFAULT;
   /*A completely empty packet indicates a dropped frame and is treated exactly
-     like an inter frame with no coded blocks.
-    Only proceed if we have a non-empty packet.*/
-  if(_op->bytes!=0){
-    oc_dec_pipeline_state pipe;
-    th_ycbcr_buffer       stripe_buf;
-    int                   stripe_fragy;
-    int                   refi;
-    int                   pli;
-    int                   notstart;
-    int                   notdone;
+     like an inter frame with no coded blocks.*/
+  if(_op->bytes==0){
+    _dec->state.frame_type=OC_INTER_FRAME;
+    _dec->state.ntotal_coded_fragis=0;
+  }
+  else{
     oc_pack_readinit(&_dec->opb,_op->packet,_op->bytes);
+    ret=oc_dec_frame_header_unpack(_dec);
+    if(ret<0)return ret;
+    if(_dec->state.frame_type==OC_INTRA_FRAME)oc_dec_mark_all_intra(_dec);
+    else oc_dec_coded_flags_unpack(_dec);
+  }
+  /*If there have been no reference frames, and we need one, initialize one.*/
+  if(_dec->state.frame_type!=OC_INTRA_FRAME&&
+   (_dec->state.ref_frame_idx[OC_FRAME_GOLD]<0||
+   _dec->state.ref_frame_idx[OC_FRAME_PREV]<0)){
+    oc_dec_init_dummy_frame(_dec);
+  }
+  /*If this was an inter frame with no coded blocks...*/
+  if(_dec->state.ntotal_coded_fragis<=0){
+    /*Just update the granule position and return.*/
+    _dec->state.granpos=(_dec->state.keyframe_num+_dec->state.granpos_bias<<
+     _dec->state.info.keyframe_granule_shift)
+     +(_dec->state.curframe_num-_dec->state.keyframe_num);
+    _dec->state.curframe_num++;
+    if(_granpos!=NULL)*_granpos=_dec->state.granpos;
+    return TH_DUPFRAME;
+  }
+  else{
+    th_ycbcr_buffer stripe_buf;
+    int             stripe_fragy;
+    int             refi;
+    int             pli;
+    int             notstart;
+    int             notdone;
+#ifdef HAVE_CAIRO
+    int             telemetry;
+    /*Save the current telemetry state.
+      This prevents it from being modified in the middle of decoding this
+       frame, which could cause us to skip calls to the striped decoding
+       callback.*/
+    telemetry=_dec->telemetry_mbmode||_dec->telemetry_mv||
+     _dec->telemetry_qi||_dec->telemetry_bits;
+#endif
+    /*Select a free buffer to use for the reconstructed version of this frame.*/
+    for(refi=0;refi==_dec->state.ref_frame_idx[OC_FRAME_GOLD]||
+     refi==_dec->state.ref_frame_idx[OC_FRAME_PREV];refi++);
+    _dec->state.ref_frame_idx[OC_FRAME_SELF]=refi;
+    _dec->state.ref_frame_data[OC_FRAME_SELF]=
+     _dec->state.ref_frame_bufs[refi][0].data;
 #if defined(HAVE_CAIRO)
     _dec->telemetry_frame_bytes=_op->bytes;
 #endif
-    ret=oc_dec_frame_header_unpack(_dec);
-    if(ret<0)return ret;
-    /*Select a free buffer to use for the reconstructed version of this
-       frame.*/
-    if(_dec->state.frame_type!=OC_INTRA_FRAME&&
-     (_dec->state.ref_frame_idx[OC_FRAME_GOLD]<0||
-     _dec->state.ref_frame_idx[OC_FRAME_PREV]<0)){
-      /*No reference frames yet!*/
-      oc_dec_init_dummy_frame(_dec);
-      refi=_dec->state.ref_frame_idx[OC_FRAME_SELF];
-    }
-    else{
-      for(refi=0;refi==_dec->state.ref_frame_idx[OC_FRAME_GOLD]||
-       refi==_dec->state.ref_frame_idx[OC_FRAME_PREV];refi++);
-      _dec->state.ref_frame_idx[OC_FRAME_SELF]=refi;
-    }
     if(_dec->state.frame_type==OC_INTRA_FRAME){
-      oc_dec_mark_all_intra(_dec);
       _dec->state.keyframe_num=_dec->state.curframe_num;
 #if defined(HAVE_CAIRO)
       _dec->telemetry_coding_bytes=
@@ -2112,7 +2804,6 @@ int th_decode_packetin(th_dec_ctx *_dec,const ogg_packet *_op,
 #endif
     }
     else{
-      oc_dec_coded_flags_unpack(_dec);
 #if defined(HAVE_CAIRO)
       _dec->telemetry_coding_bytes=oc_pack_bytes_left(&_dec->opb);
 #endif
@@ -2160,15 +2851,15 @@ int th_decode_packetin(th_dec_ctx *_dec,const ogg_packet *_op,
       An application callback allows further application processing (blitting
        to video memory, color conversion, etc.) to also use the data while it's
        in cache.*/
-    oc_dec_pipeline_init(_dec,&pipe);
+    oc_dec_pipeline_init(_dec,&_dec->pipe);
     oc_ycbcr_buffer_flip(stripe_buf,_dec->pp_frame_buf);
     notstart=0;
     notdone=1;
-    for(stripe_fragy=0;notdone;stripe_fragy+=pipe.mcu_nvfrags){
+    for(stripe_fragy=0;notdone;stripe_fragy+=_dec->pipe.mcu_nvfrags){
       int avail_fragy0;
       int avail_fragy_end;
       avail_fragy0=avail_fragy_end=_dec->state.fplanes[0].nvfrags;
-      notdone=stripe_fragy+pipe.mcu_nvfrags<avail_fragy_end;
+      notdone=stripe_fragy+_dec->pipe.mcu_nvfrags<avail_fragy_end;
       for(pli=0;pli<3;pli++){
         oc_fragment_plane *fplane;
         int                frag_shift;
@@ -2179,45 +2870,46 @@ int th_decode_packetin(th_dec_ctx *_dec,const ogg_packet *_op,
         /*Compute the first and last fragment row of the current MCU for this
            plane.*/
         frag_shift=pli!=0&&!(_dec->state.info.pixel_fmt&2);
-        pipe.fragy0[pli]=stripe_fragy>>frag_shift;
-        pipe.fragy_end[pli]=OC_MINI(fplane->nvfrags,
-         pipe.fragy0[pli]+(pipe.mcu_nvfrags>>frag_shift));
-        oc_dec_dc_unpredict_mcu_plane(_dec,&pipe,pli);
-        oc_dec_frags_recon_mcu_plane(_dec,&pipe,pli);
+        _dec->pipe.fragy0[pli]=stripe_fragy>>frag_shift;
+        _dec->pipe.fragy_end[pli]=OC_MINI(fplane->nvfrags,
+         _dec->pipe.fragy0[pli]+(_dec->pipe.mcu_nvfrags>>frag_shift));
+        oc_dec_dc_unpredict_mcu_plane(_dec,&_dec->pipe,pli);
+        oc_dec_frags_recon_mcu_plane(_dec,&_dec->pipe,pli);
         sdelay=edelay=0;
-        if(pipe.loop_filter){
+        if(_dec->pipe.loop_filter){
           sdelay+=notstart;
           edelay+=notdone;
-          oc_state_loop_filter_frag_rows(&_dec->state,pipe.bounding_values,
-           refi,pli,pipe.fragy0[pli]-sdelay,pipe.fragy_end[pli]-edelay);
+          oc_state_loop_filter_frag_rows(&_dec->state,
+           _dec->pipe.bounding_values,OC_FRAME_SELF,pli,
+           _dec->pipe.fragy0[pli]-sdelay,_dec->pipe.fragy_end[pli]-edelay);
         }
         /*To fill the borders, we have an additional two pixel delay, since a
            fragment in the next row could filter its top edge, using two pixels
            from a fragment in this row.
           But there's no reason to delay a full fragment between the two.*/
         oc_state_borders_fill_rows(&_dec->state,refi,pli,
-         (pipe.fragy0[pli]-sdelay<<3)-(sdelay<<1),
-         (pipe.fragy_end[pli]-edelay<<3)-(edelay<<1));
+         (_dec->pipe.fragy0[pli]-sdelay<<3)-(sdelay<<1),
+         (_dec->pipe.fragy_end[pli]-edelay<<3)-(edelay<<1));
         /*Out-of-loop post-processing.*/
         pp_offset=3*(pli!=0);
-        if(pipe.pp_level>=OC_PP_LEVEL_DEBLOCKY+pp_offset){
+        if(_dec->pipe.pp_level>=OC_PP_LEVEL_DEBLOCKY+pp_offset){
           /*Perform de-blocking in one plane.*/
           sdelay+=notstart;
           edelay+=notdone;
           oc_dec_deblock_frag_rows(_dec,_dec->pp_frame_buf,
            _dec->state.ref_frame_bufs[refi],pli,
-           pipe.fragy0[pli]-sdelay,pipe.fragy_end[pli]-edelay);
-          if(pipe.pp_level>=OC_PP_LEVEL_DERINGY+pp_offset){
+           _dec->pipe.fragy0[pli]-sdelay,_dec->pipe.fragy_end[pli]-edelay);
+          if(_dec->pipe.pp_level>=OC_PP_LEVEL_DERINGY+pp_offset){
             /*Perform de-ringing in one plane.*/
             sdelay+=notstart;
             edelay+=notdone;
             oc_dec_dering_frag_rows(_dec,_dec->pp_frame_buf,pli,
-             pipe.fragy0[pli]-sdelay,pipe.fragy_end[pli]-edelay);
+             _dec->pipe.fragy0[pli]-sdelay,_dec->pipe.fragy_end[pli]-edelay);
           }
         }
         /*If no post-processing is done, we still need to delay a row for the
            loop filter, thanks to the strange filtering order VP3 chose.*/
-        else if(pipe.loop_filter){
+        else if(_dec->pipe.loop_filter){
           sdelay+=notstart;
           edelay+=notdone;
         }
@@ -2226,11 +2918,16 @@ int th_decode_packetin(th_dec_ctx *_dec,const ogg_packet *_op,
            doubled, but luma might have more post-processing filters enabled
            than chroma, so we don't know up front which one is the limiting
            factor.*/
-        avail_fragy0=OC_MINI(avail_fragy0,pipe.fragy0[pli]-sdelay<<frag_shift);
+        avail_fragy0=OC_MINI(avail_fragy0,
+         _dec->pipe.fragy0[pli]-sdelay<<frag_shift);
         avail_fragy_end=OC_MINI(avail_fragy_end,
-         pipe.fragy_end[pli]-edelay<<frag_shift);
+         _dec->pipe.fragy_end[pli]-edelay<<frag_shift);
       }
+#ifdef HAVE_CAIRO
+      if(_dec->stripe_cb.stripe_decoded!=NULL&&!telemetry){
+#else
       if(_dec->stripe_cb.stripe_decoded!=NULL){
+#endif
         /*The callback might want to use the FPU, so let's make sure they can.
           We violate all kinds of ABI restrictions by not doing this until
            now, but none of them actually matter since we don't use floating
@@ -2252,692 +2949,44 @@ int th_decode_packetin(th_dec_ctx *_dec,const ogg_packet *_op,
       _dec->state.ref_frame_idx[OC_FRAME_GOLD]=
        _dec->state.ref_frame_idx[OC_FRAME_PREV]=
        _dec->state.ref_frame_idx[OC_FRAME_SELF];
+      _dec->state.ref_frame_data[OC_FRAME_GOLD]=
+       _dec->state.ref_frame_data[OC_FRAME_PREV]=
+       _dec->state.ref_frame_data[OC_FRAME_SELF];
     }
     else{
       /*Otherwise, just replace the previous reference frame.*/
       _dec->state.ref_frame_idx[OC_FRAME_PREV]=
        _dec->state.ref_frame_idx[OC_FRAME_SELF];
+      _dec->state.ref_frame_data[OC_FRAME_PREV]=
+       _dec->state.ref_frame_data[OC_FRAME_SELF];
     }
     /*Restore the FPU before dump_frame, since that _does_ use the FPU (for PNG
        gamma values, if nothing else).*/
     oc_restore_fpu(&_dec->state);
+#ifdef HAVE_CAIRO
+    /*If telemetry ioctls are active, we need to draw to the output buffer.*/
+    if(telemetry){
+      oc_render_telemetry(_dec,stripe_buf,telemetry);
+      oc_ycbcr_buffer_flip(_dec->pp_frame_buf,stripe_buf);
+      /*If we had a striped decoding callback, we skipped calling it above
+         (because the telemetry wasn't rendered yet).
+        Call it now with the whole frame.*/
+      if(_dec->stripe_cb.stripe_decoded!=NULL){
+        (*_dec->stripe_cb.stripe_decoded)(_dec->stripe_cb.ctx,
+         stripe_buf,0,_dec->state.fplanes[0].nvfrags);
+      }
+    }
+#endif
 #if defined(OC_DUMP_IMAGES)
-    /*Don't dump images for dropped frames.*/
+    /*We only dump images if there were some coded blocks.*/
     oc_state_dump_frame(&_dec->state,OC_FRAME_SELF,"dec");
 #endif
     return 0;
   }
-  else{
-    if(_dec->state.ref_frame_idx[OC_FRAME_GOLD]<0||
-     _dec->state.ref_frame_idx[OC_FRAME_PREV]<0){
-      int refi;
-      /*No reference frames yet!*/
-      oc_dec_init_dummy_frame(_dec);
-      refi=_dec->state.ref_frame_idx[OC_FRAME_PREV];
-      _dec->state.ref_frame_idx[OC_FRAME_SELF]=refi;
-      memcpy(_dec->pp_frame_buf,_dec->state.ref_frame_bufs[refi],
-       sizeof(_dec->pp_frame_buf[0])*3);
-    }
-    /*Just update the granule position and return.*/
-    _dec->state.granpos=(_dec->state.keyframe_num+_dec->state.granpos_bias<<
-     _dec->state.info.keyframe_granule_shift)
-     +(_dec->state.curframe_num-_dec->state.keyframe_num);
-    _dec->state.curframe_num++;
-    if(_granpos!=NULL)*_granpos=_dec->state.granpos;
-    return TH_DUPFRAME;
-  }
 }
 
 int th_decode_ycbcr_out(th_dec_ctx *_dec,th_ycbcr_buffer _ycbcr){
   if(_dec==NULL||_ycbcr==NULL)return TH_EFAULT;
   oc_ycbcr_buffer_flip(_ycbcr,_dec->pp_frame_buf);
-#if defined(HAVE_CAIRO)
-  /*If telemetry ioctls are active, we need to draw to the output buffer.
-    Stuff the plane into cairo.*/
-  if(_dec->telemetry){
-    cairo_surface_t *cs;
-    unsigned char   *data;
-    unsigned char   *y_row;
-    unsigned char   *u_row;
-    unsigned char   *v_row;
-    unsigned char   *rgb_row;
-    int              cstride;
-    int              w;
-    int              h;
-    int              x;
-    int              y;
-    int              hdec;
-    int              vdec;
-    w=_ycbcr[0].width;
-    h=_ycbcr[0].height;
-    hdec=!(_dec->state.info.pixel_fmt&1);
-    vdec=!(_dec->state.info.pixel_fmt&2);
-    /*Lazy data buffer init.
-      We could try to re-use the post-processing buffer, which would save
-       memory, but complicate the allocation logic there.
-      I don't think anyone cares about memory usage when using telemetry; it is
-       not meant for embedded devices.*/
-    if(_dec->telemetry_frame_data==NULL){
-      _dec->telemetry_frame_data=_ogg_malloc(
-       (w*h+2*(w>>hdec)*(h>>vdec))*sizeof(*_dec->telemetry_frame_data));
-      if(_dec->telemetry_frame_data==NULL)return 0;
-    }
-    cs=cairo_image_surface_create(CAIRO_FORMAT_RGB24,w,h);
-    /*Sadly, no YUV support in Cairo (yet); convert into the RGB buffer.*/
-    data=cairo_image_surface_get_data(cs);
-    if(data==NULL){
-      cairo_surface_destroy(cs);
-      return 0;
-    }
-    cstride=cairo_image_surface_get_stride(cs);
-    y_row=_ycbcr[0].data;
-    u_row=_ycbcr[1].data;
-    v_row=_ycbcr[2].data;
-    rgb_row=data;
-    for(y=0;y<h;y++){
-      for(x=0;x<w;x++){
-        int r;
-        int g;
-        int b;
-        r=(1904000*y_row[x]+2609823*v_row[x>>hdec]-363703744)/1635200;
-        g=(3827562*y_row[x]-1287801*u_row[x>>hdec]
-         -2672387*v_row[x>>hdec]+447306710)/3287200;
-        b=(952000*y_row[x]+1649289*u_row[x>>hdec]-225932192)/817600;
-        rgb_row[4*x+0]=OC_CLAMP255(b);
-        rgb_row[4*x+1]=OC_CLAMP255(g);
-        rgb_row[4*x+2]=OC_CLAMP255(r);
-      }
-      y_row+=_ycbcr[0].stride;
-      u_row+=_ycbcr[1].stride&-((y&1)|!vdec);
-      v_row+=_ycbcr[2].stride&-((y&1)|!vdec);
-      rgb_row+=cstride;
-    }
-    /*Draw coded identifier for each macroblock (stored in Hilbert order).*/
-    {
-      cairo_t           *c;
-      const oc_fragment *frags;
-      oc_mv             *frag_mvs;
-      const signed char *mb_modes;
-      oc_mb_map         *mb_maps;
-      size_t             nmbs;
-      size_t             mbi;
-      int                row2;
-      int                col2;
-      int                qim[3]={0,0,0};
-      if(_dec->state.nqis==2){
-        int bqi;
-        bqi=_dec->state.qis[0];
-        if(_dec->state.qis[1]>bqi)qim[1]=1;
-        if(_dec->state.qis[1]<bqi)qim[1]=-1;
-      }
-      if(_dec->state.nqis==3){
-        int bqi;
-        int cqi;
-        int dqi;
-        bqi=_dec->state.qis[0];
-        cqi=_dec->state.qis[1];
-        dqi=_dec->state.qis[2];
-        if(cqi>bqi&&dqi>bqi){
-          if(dqi>cqi){
-            qim[1]=1;
-            qim[2]=2;
-          }
-          else{
-            qim[1]=2;
-            qim[2]=1;
-          }
-        }
-        else if(cqi<bqi&&dqi<bqi){
-          if(dqi<cqi){
-            qim[1]=-1;
-            qim[2]=-2;
-          }
-          else{
-            qim[1]=-2;
-            qim[2]=-1;
-          }
-        }
-        else{
-          if(cqi<bqi)qim[1]=-1;
-          else qim[1]=1;
-          if(dqi<bqi)qim[2]=-1;
-          else qim[2]=1;
-        }
-      }
-      c=cairo_create(cs);
-      frags=_dec->state.frags;
-      frag_mvs=_dec->state.frag_mvs;
-      mb_modes=_dec->state.mb_modes;
-      mb_maps=_dec->state.mb_maps;
-      nmbs=_dec->state.nmbs;
-      row2=0;
-      col2=0;
-      for(mbi=0;mbi<nmbs;mbi++){
-        float x;
-        float y;
-        int   bi;
-        y=h-(row2+((col2+1>>1)&1))*16-16;
-        x=(col2>>1)*16;
-        cairo_set_line_width(c,1.);
-        /*Keyframe (all intra) red box.*/
-        if(_dec->state.frame_type==OC_INTRA_FRAME){
-          if(_dec->telemetry_mbmode&0x02){
-            cairo_set_source_rgba(c,1.,0,0,.5);
-            cairo_rectangle(c,x+2.5,y+2.5,11,11);
-            cairo_stroke_preserve(c);
-            cairo_set_source_rgba(c,1.,0,0,.25);
-            cairo_fill(c);
-          }
-        }
-        else{
-          const signed char *frag_mv;
-          ptrdiff_t          fragi;
-          for(bi=0;bi<4;bi++){
-            fragi=mb_maps[mbi][0][bi];
-            if(fragi>=0&&frags[fragi].coded){
-              frag_mv=frag_mvs[fragi];
-              break;
-            }
-          }
-          if(bi<4){
-            switch(mb_modes[mbi]){
-              case OC_MODE_INTRA:{
-                if(_dec->telemetry_mbmode&0x02){
-                  cairo_set_source_rgba(c,1.,0,0,.5);
-                  cairo_rectangle(c,x+2.5,y+2.5,11,11);
-                  cairo_stroke_preserve(c);
-                  cairo_set_source_rgba(c,1.,0,0,.25);
-                  cairo_fill(c);
-                }
-              }break;
-              case OC_MODE_INTER_NOMV:{
-                if(_dec->telemetry_mbmode&0x01){
-                  cairo_set_source_rgba(c,0,0,1.,.5);
-                  cairo_rectangle(c,x+2.5,y+2.5,11,11);
-                  cairo_stroke_preserve(c);
-                  cairo_set_source_rgba(c,0,0,1.,.25);
-                  cairo_fill(c);
-                }
-              }break;
-              case OC_MODE_INTER_MV:{
-                if(_dec->telemetry_mbmode&0x04){
-                  cairo_rectangle(c,x+2.5,y+2.5,11,11);
-                  cairo_set_source_rgba(c,0,1.,0,.5);
-                  cairo_stroke(c);
-                }
-                if(_dec->telemetry_mv&0x04){
-                  cairo_move_to(c,x+8+frag_mv[0],y+8-frag_mv[1]);
-                  cairo_set_source_rgba(c,1.,1.,1.,.9);
-                  cairo_set_line_width(c,3.);
-                  cairo_line_to(c,x+8+frag_mv[0]*.66,y+8-frag_mv[1]*.66);
-                  cairo_stroke_preserve(c);
-                  cairo_set_line_width(c,2.);
-                  cairo_line_to(c,x+8+frag_mv[0]*.33,y+8-frag_mv[1]*.33);
-                  cairo_stroke_preserve(c);
-                  cairo_set_line_width(c,1.);
-                  cairo_line_to(c,x+8,y+8);
-                  cairo_stroke(c);
-                }
-              }break;
-              case OC_MODE_INTER_MV_LAST:{
-                if(_dec->telemetry_mbmode&0x08){
-                  cairo_rectangle(c,x+2.5,y+2.5,11,11);
-                  cairo_set_source_rgba(c,0,1.,0,.5);
-                  cairo_move_to(c,x+13.5,y+2.5);
-                  cairo_line_to(c,x+2.5,y+8);
-                  cairo_line_to(c,x+13.5,y+13.5);
-                  cairo_stroke(c);
-                }
-                if(_dec->telemetry_mv&0x08){
-                  cairo_move_to(c,x+8+frag_mv[0],y+8-frag_mv[1]);
-                  cairo_set_source_rgba(c,1.,1.,1.,.9);
-                  cairo_set_line_width(c,3.);
-                  cairo_line_to(c,x+8+frag_mv[0]*.66,y+8-frag_mv[1]*.66);
-                  cairo_stroke_preserve(c);
-                  cairo_set_line_width(c,2.);
-                  cairo_line_to(c,x+8+frag_mv[0]*.33,y+8-frag_mv[1]*.33);
-                  cairo_stroke_preserve(c);
-                  cairo_set_line_width(c,1.);
-                  cairo_line_to(c,x+8,y+8);
-                  cairo_stroke(c);
-                }
-              }break;
-              case OC_MODE_INTER_MV_LAST2:{
-                if(_dec->telemetry_mbmode&0x10){
-                  cairo_rectangle(c,x+2.5,y+2.5,11,11);
-                  cairo_set_source_rgba(c,0,1.,0,.5);
-                  cairo_move_to(c,x+8,y+2.5);
-                  cairo_line_to(c,x+2.5,y+8);
-                  cairo_line_to(c,x+8,y+13.5);
-                  cairo_move_to(c,x+13.5,y+2.5);
-                  cairo_line_to(c,x+8,y+8);
-                  cairo_line_to(c,x+13.5,y+13.5);
-                  cairo_stroke(c);
-                }
-                if(_dec->telemetry_mv&0x10){
-                  cairo_move_to(c,x+8+frag_mv[0],y+8-frag_mv[1]);
-                  cairo_set_source_rgba(c,1.,1.,1.,.9);
-                  cairo_set_line_width(c,3.);
-                  cairo_line_to(c,x+8+frag_mv[0]*.66,y+8-frag_mv[1]*.66);
-                  cairo_stroke_preserve(c);
-                  cairo_set_line_width(c,2.);
-                  cairo_line_to(c,x+8+frag_mv[0]*.33,y+8-frag_mv[1]*.33);
-                  cairo_stroke_preserve(c);
-                  cairo_set_line_width(c,1.);
-                  cairo_line_to(c,x+8,y+8);
-                  cairo_stroke(c);
-                }
-              }break;
-              case OC_MODE_GOLDEN_NOMV:{
-                if(_dec->telemetry_mbmode&0x20){
-                  cairo_set_source_rgba(c,1.,1.,0,.5);
-                  cairo_rectangle(c,x+2.5,y+2.5,11,11);
-                  cairo_stroke_preserve(c);
-                  cairo_set_source_rgba(c,1.,1.,0,.25);
-                  cairo_fill(c);
-                }
-              }break;
-              case OC_MODE_GOLDEN_MV:{
-                if(_dec->telemetry_mbmode&0x40){
-                  cairo_rectangle(c,x+2.5,y+2.5,11,11);
-                  cairo_set_source_rgba(c,1.,1.,0,.5);
-                  cairo_stroke(c);
-                }
-                if(_dec->telemetry_mv&0x40){
-                  cairo_move_to(c,x+8+frag_mv[0],y+8-frag_mv[1]);
-                  cairo_set_source_rgba(c,1.,1.,1.,.9);
-                  cairo_set_line_width(c,3.);
-                  cairo_line_to(c,x+8+frag_mv[0]*.66,y+8-frag_mv[1]*.66);
-                  cairo_stroke_preserve(c);
-                  cairo_set_line_width(c,2.);
-                  cairo_line_to(c,x+8+frag_mv[0]*.33,y+8-frag_mv[1]*.33);
-                  cairo_stroke_preserve(c);
-                  cairo_set_line_width(c,1.);
-                  cairo_line_to(c,x+8,y+8);
-                  cairo_stroke(c);
-                }
-              }break;
-              case OC_MODE_INTER_MV_FOUR:{
-                if(_dec->telemetry_mbmode&0x80){
-                  cairo_rectangle(c,x+2.5,y+2.5,4,4);
-                  cairo_rectangle(c,x+9.5,y+2.5,4,4);
-                  cairo_rectangle(c,x+2.5,y+9.5,4,4);
-                  cairo_rectangle(c,x+9.5,y+9.5,4,4);
-                  cairo_set_source_rgba(c,0,1.,0,.5);
-                  cairo_stroke(c);
-                }
-                /*4mv is odd, coded in raster order.*/
-                fragi=mb_maps[mbi][0][0];
-                if(frags[fragi].coded&&_dec->telemetry_mv&0x80){
-                  frag_mv=frag_mvs[fragi];
-                  cairo_move_to(c,x+4+frag_mv[0],y+12-frag_mv[1]);
-                  cairo_set_source_rgba(c,1.,1.,1.,.9);
-                  cairo_set_line_width(c,3.);
-                  cairo_line_to(c,x+4+frag_mv[0]*.66,y+12-frag_mv[1]*.66);
-                  cairo_stroke_preserve(c);
-                  cairo_set_line_width(c,2.);
-                  cairo_line_to(c,x+4+frag_mv[0]*.33,y+12-frag_mv[1]*.33);
-                  cairo_stroke_preserve(c);
-                  cairo_set_line_width(c,1.);
-                  cairo_line_to(c,x+4,y+12);
-                  cairo_stroke(c);
-                }
-                fragi=mb_maps[mbi][0][1];
-                if(frags[fragi].coded&&_dec->telemetry_mv&0x80){
-                  frag_mv=frag_mvs[fragi];
-                  cairo_move_to(c,x+12+frag_mv[0],y+12-frag_mv[1]);
-                  cairo_set_source_rgba(c,1.,1.,1.,.9);
-                  cairo_set_line_width(c,3.);
-                  cairo_line_to(c,x+12+frag_mv[0]*.66,y+12-frag_mv[1]*.66);
-                  cairo_stroke_preserve(c);
-                  cairo_set_line_width(c,2.);
-                  cairo_line_to(c,x+12+frag_mv[0]*.33,y+12-frag_mv[1]*.33);
-                  cairo_stroke_preserve(c);
-                  cairo_set_line_width(c,1.);
-                  cairo_line_to(c,x+12,y+12);
-                  cairo_stroke(c);
-                }
-                fragi=mb_maps[mbi][0][2];
-                if(frags[fragi].coded&&_dec->telemetry_mv&0x80){
-                  frag_mv=frag_mvs[fragi];
-                  cairo_move_to(c,x+4+frag_mv[0],y+4-frag_mv[1]);
-                  cairo_set_source_rgba(c,1.,1.,1.,.9);
-                  cairo_set_line_width(c,3.);
-                  cairo_line_to(c,x+4+frag_mv[0]*.66,y+4-frag_mv[1]*.66);
-                  cairo_stroke_preserve(c);
-                  cairo_set_line_width(c,2.);
-                  cairo_line_to(c,x+4+frag_mv[0]*.33,y+4-frag_mv[1]*.33);
-                  cairo_stroke_preserve(c);
-                  cairo_set_line_width(c,1.);
-                  cairo_line_to(c,x+4,y+4);
-                  cairo_stroke(c);
-                }
-                fragi=mb_maps[mbi][0][3];
-                if(frags[fragi].coded&&_dec->telemetry_mv&0x80){
-                  frag_mv=frag_mvs[fragi];
-                  cairo_move_to(c,x+12+frag_mv[0],y+4-frag_mv[1]);
-                  cairo_set_source_rgba(c,1.,1.,1.,.9);
-                  cairo_set_line_width(c,3.);
-                  cairo_line_to(c,x+12+frag_mv[0]*.66,y+4-frag_mv[1]*.66);
-                  cairo_stroke_preserve(c);
-                  cairo_set_line_width(c,2.);
-                  cairo_line_to(c,x+12+frag_mv[0]*.33,y+4-frag_mv[1]*.33);
-                  cairo_stroke_preserve(c);
-                  cairo_set_line_width(c,1.);
-                  cairo_line_to(c,x+12,y+4);
-                  cairo_stroke(c);
-                }
-              }break;
-            }
-          }
-        }
-        /*qii illustration.*/
-        if(_dec->telemetry_qi&0x2){
-          cairo_set_line_cap(c,CAIRO_LINE_CAP_SQUARE);
-          for(bi=0;bi<4;bi++){
-            ptrdiff_t fragi;
-            int       qiv;
-            int       xp;
-            int       yp;
-            xp=x+(bi&1)*8;
-            yp=y+8-(bi&2)*4;
-            fragi=mb_maps[mbi][0][bi];
-            if(fragi>=0&&frags[fragi].coded){
-              qiv=qim[frags[fragi].qii];
-              cairo_set_line_width(c,3.);
-              cairo_set_source_rgba(c,0.,0.,0.,.5);
-              switch(qiv){
-                /*Double plus:*/
-                case 2:{
-                  if((bi&1)^((bi&2)>>1)){
-                    cairo_move_to(c,xp+2.5,yp+1.5);
-                    cairo_line_to(c,xp+2.5,yp+3.5);
-                    cairo_move_to(c,xp+1.5,yp+2.5);
-                    cairo_line_to(c,xp+3.5,yp+2.5);
-                    cairo_move_to(c,xp+5.5,yp+4.5);
-                    cairo_line_to(c,xp+5.5,yp+6.5);
-                    cairo_move_to(c,xp+4.5,yp+5.5);
-                    cairo_line_to(c,xp+6.5,yp+5.5);
-                    cairo_stroke_preserve(c);
-                    cairo_set_source_rgba(c,0.,1.,1.,1.);
-                  }
-                  else{
-                    cairo_move_to(c,xp+5.5,yp+1.5);
-                    cairo_line_to(c,xp+5.5,yp+3.5);
-                    cairo_move_to(c,xp+4.5,yp+2.5);
-                    cairo_line_to(c,xp+6.5,yp+2.5);
-                    cairo_move_to(c,xp+2.5,yp+4.5);
-                    cairo_line_to(c,xp+2.5,yp+6.5);
-                    cairo_move_to(c,xp+1.5,yp+5.5);
-                    cairo_line_to(c,xp+3.5,yp+5.5);
-                    cairo_stroke_preserve(c);
-                    cairo_set_source_rgba(c,0.,1.,1.,1.);
-                  }
-                }break;
-                /*Double minus:*/
-                case -2:{
-                  cairo_move_to(c,xp+2.5,yp+2.5);
-                  cairo_line_to(c,xp+5.5,yp+2.5);
-                  cairo_move_to(c,xp+2.5,yp+5.5);
-                  cairo_line_to(c,xp+5.5,yp+5.5);
-                  cairo_stroke_preserve(c);
-                  cairo_set_source_rgba(c,1.,1.,1.,1.);
-                }break;
-                /*Plus:*/
-                case 1:{
-                  if(bi&2==0)yp-=2;
-                  if(bi&1==0)xp-=2;
-                  cairo_move_to(c,xp+4.5,yp+2.5);
-                  cairo_line_to(c,xp+4.5,yp+6.5);
-                  cairo_move_to(c,xp+2.5,yp+4.5);
-                  cairo_line_to(c,xp+6.5,yp+4.5);
-                  cairo_stroke_preserve(c);
-                  cairo_set_source_rgba(c,.1,1.,.3,1.);
-                  break;
-                }
-                /*Fall through.*/
-                /*Minus:*/
-                case -1:{
-                  cairo_move_to(c,xp+2.5,yp+4.5);
-                  cairo_line_to(c,xp+6.5,yp+4.5);
-                  cairo_stroke_preserve(c);
-                  cairo_set_source_rgba(c,1.,.3,.1,1.);
-                }break;
-                default:continue;
-              }
-              cairo_set_line_width(c,1.);
-              cairo_stroke(c);
-            }
-          }
-        }
-        col2++;
-        if((col2>>1)>=_dec->state.nhmbs){
-          col2=0;
-          row2+=2;
-        }
-      }
-      /*Bit usage indicator[s]:*/
-      if(_dec->telemetry_bits){
-        int widths[6];
-        int fpsn;
-        int fpsd;
-        int mult;
-        int fullw;
-        int padw;
-        int i;
-        fpsn=_dec->state.info.fps_numerator;
-        fpsd=_dec->state.info.fps_denominator;
-        mult=(_dec->telemetry_bits>=0xFF?1:_dec->telemetry_bits);
-        fullw=250.f*h*fpsd*mult/fpsn;
-        padw=w-24;
-        /*Header and coded block bits.*/
-        if(_dec->telemetry_frame_bytes<0||
-         _dec->telemetry_frame_bytes==OC_LOTS_OF_BITS){
-          _dec->telemetry_frame_bytes=0;
-        }
-        if(_dec->telemetry_coding_bytes<0||
-         _dec->telemetry_coding_bytes>_dec->telemetry_frame_bytes){
-          _dec->telemetry_coding_bytes=0;
-        }
-        if(_dec->telemetry_mode_bytes<0||
-         _dec->telemetry_mode_bytes>_dec->telemetry_frame_bytes){
-          _dec->telemetry_mode_bytes=0;
-        }
-        if(_dec->telemetry_mv_bytes<0||
-         _dec->telemetry_mv_bytes>_dec->telemetry_frame_bytes){
-          _dec->telemetry_mv_bytes=0;
-        }
-        if(_dec->telemetry_qi_bytes<0||
-         _dec->telemetry_qi_bytes>_dec->telemetry_frame_bytes){
-          _dec->telemetry_qi_bytes=0;
-        }
-        if(_dec->telemetry_dc_bytes<0||
-         _dec->telemetry_dc_bytes>_dec->telemetry_frame_bytes){
-          _dec->telemetry_dc_bytes=0;
-        }
-        widths[0]=padw*(_dec->telemetry_frame_bytes-_dec->telemetry_coding_bytes)/fullw;
-        widths[1]=padw*(_dec->telemetry_coding_bytes-_dec->telemetry_mode_bytes)/fullw;
-        widths[2]=padw*(_dec->telemetry_mode_bytes-_dec->telemetry_mv_bytes)/fullw;
-        widths[3]=padw*(_dec->telemetry_mv_bytes-_dec->telemetry_qi_bytes)/fullw;
-        widths[4]=padw*(_dec->telemetry_qi_bytes-_dec->telemetry_dc_bytes)/fullw;
-        widths[5]=padw*(_dec->telemetry_dc_bytes)/fullw;
-        for(i=0;i<6;i++)if(widths[i]>w)widths[i]=w;
-        cairo_set_source_rgba(c,.0,.0,.0,.6);
-        cairo_rectangle(c,10,h-33,widths[0]+1,5);
-        cairo_rectangle(c,10,h-29,widths[1]+1,5);
-        cairo_rectangle(c,10,h-25,widths[2]+1,5);
-        cairo_rectangle(c,10,h-21,widths[3]+1,5);
-        cairo_rectangle(c,10,h-17,widths[4]+1,5);
-        cairo_rectangle(c,10,h-13,widths[5]+1,5);
-        cairo_fill(c);
-        cairo_set_source_rgb(c,1,0,0);
-        cairo_rectangle(c,10.5,h-32.5,widths[0],4);
-        cairo_fill(c);
-        cairo_set_source_rgb(c,0,1,0);
-        cairo_rectangle(c,10.5,h-28.5,widths[1],4);
-        cairo_fill(c);
-        cairo_set_source_rgb(c,0,0,1);
-        cairo_rectangle(c,10.5,h-24.5,widths[2],4);
-        cairo_fill(c);
-        cairo_set_source_rgb(c,.6,.4,.0);
-        cairo_rectangle(c,10.5,h-20.5,widths[3],4);
-        cairo_fill(c);
-        cairo_set_source_rgb(c,.3,.3,.3);
-        cairo_rectangle(c,10.5,h-16.5,widths[4],4);
-        cairo_fill(c);
-        cairo_set_source_rgb(c,.5,.5,.8);
-        cairo_rectangle(c,10.5,h-12.5,widths[5],4);
-        cairo_fill(c);
-      }
-      /*Master qi indicator[s]:*/
-      if(_dec->telemetry_qi&0x1){
-        cairo_text_extents_t extents;
-        char                 buffer[10];
-        int                  p;
-        int                  y;
-        p=0;
-        y=h-7.5;
-        if(_dec->state.qis[0]>=10)buffer[p++]=48+_dec->state.qis[0]/10;
-        buffer[p++]=48+_dec->state.qis[0]%10;
-        if(_dec->state.nqis>=2){
-          buffer[p++]=' ';
-          if(_dec->state.qis[1]>=10)buffer[p++]=48+_dec->state.qis[1]/10;
-          buffer[p++]=48+_dec->state.qis[1]%10;
-        }
-        if(_dec->state.nqis==3){
-          buffer[p++]=' ';
-          if(_dec->state.qis[2]>=10)buffer[p++]=48+_dec->state.qis[2]/10;
-          buffer[p++]=48+_dec->state.qis[2]%10;
-        }
-        buffer[p++]='\0';
-        cairo_select_font_face(c,"sans",
-         CAIRO_FONT_SLANT_NORMAL,CAIRO_FONT_WEIGHT_BOLD);
-        cairo_set_font_size(c,18);
-        cairo_text_extents(c,buffer,&extents);
-        cairo_set_source_rgb(c,1,1,1);
-        cairo_move_to(c,w-extents.x_advance-10,y);
-        cairo_show_text(c,buffer);
-        cairo_set_source_rgb(c,0,0,0);
-        cairo_move_to(c,w-extents.x_advance-10,y);
-        cairo_text_path(c,buffer);
-        cairo_set_line_width(c,.8);
-        cairo_set_line_join(c,CAIRO_LINE_JOIN_ROUND);
-        cairo_stroke(c);
-      }
-      cairo_destroy(c);
-    }
-    /*Out of the Cairo plane into the telemetry YUV buffer.*/
-    _ycbcr[0].data=_dec->telemetry_frame_data;
-    _ycbcr[0].stride=_ycbcr[0].width;
-    _ycbcr[1].data=_ycbcr[0].data+h*_ycbcr[0].stride;
-    _ycbcr[1].stride=_ycbcr[1].width;
-    _ycbcr[2].data=_ycbcr[1].data+(h>>vdec)*_ycbcr[1].stride;
-    _ycbcr[2].stride=_ycbcr[2].width;
-    y_row=_ycbcr[0].data;
-    u_row=_ycbcr[1].data;
-    v_row=_ycbcr[2].data;
-    rgb_row=data;
-    /*This is one of the few places it's worth handling chroma on a
-       case-by-case basis.*/
-    switch(_dec->state.info.pixel_fmt){
-      case TH_PF_420:{
-        for(y=0;y<h;y+=2){
-          unsigned char *y_row2;
-          unsigned char *rgb_row2;
-          y_row2=y_row+_ycbcr[0].stride;
-          rgb_row2=rgb_row+cstride;
-          for(x=0;x<w;x+=2){
-            int y;
-            int u;
-            int v;
-            y=(65481*rgb_row[4*x+2]+128553*rgb_row[4*x+1]
-             +24966*rgb_row[4*x+0]+4207500)/255000;
-            y_row[x]=OC_CLAMP255(y);
-            y=(65481*rgb_row[4*x+6]+128553*rgb_row[4*x+5]
-             +24966*rgb_row[4*x+4]+4207500)/255000;
-            y_row[x+1]=OC_CLAMP255(y);
-            y=(65481*rgb_row2[4*x+2]+128553*rgb_row2[4*x+1]
-             +24966*rgb_row2[4*x+0]+4207500)/255000;
-            y_row2[x]=OC_CLAMP255(y);
-            y=(65481*rgb_row2[4*x+6]+128553*rgb_row2[4*x+5]
-             +24966*rgb_row2[4*x+4]+4207500)/255000;
-            y_row2[x+1]=OC_CLAMP255(y);
-            u=(-8372*(rgb_row[4*x+2]+rgb_row[4*x+6]
-             +rgb_row2[4*x+2]+rgb_row2[4*x+6])
-             -16436*(rgb_row[4*x+1]+rgb_row[4*x+5]
-             +rgb_row2[4*x+1]+rgb_row2[4*x+5])
-             +24808*(rgb_row[4*x+0]+rgb_row[4*x+4]
-             +rgb_row2[4*x+0]+rgb_row2[4*x+4])+29032005)/225930;
-            v=(39256*(rgb_row[4*x+2]+rgb_row[4*x+6]
-             +rgb_row2[4*x+2]+rgb_row2[4*x+6])
-             -32872*(rgb_row[4*x+1]+rgb_row[4*x+5]
-              +rgb_row2[4*x+1]+rgb_row2[4*x+5])
-             -6384*(rgb_row[4*x+0]+rgb_row[4*x+4]
-              +rgb_row2[4*x+0]+rgb_row2[4*x+4])+45940035)/357510;
-            u_row[x>>1]=OC_CLAMP255(u);
-            v_row[x>>1]=OC_CLAMP255(v);
-          }
-          y_row+=_ycbcr[0].stride<<1;
-          u_row+=_ycbcr[1].stride;
-          v_row+=_ycbcr[2].stride;
-          rgb_row+=cstride<<1;
-        }
-      }break;
-      case TH_PF_422:{
-        for(y=0;y<h;y++){
-          for(x=0;x<w;x+=2){
-            int y;
-            int u;
-            int v;
-            y=(65481*rgb_row[4*x+2]+128553*rgb_row[4*x+1]
-             +24966*rgb_row[4*x+0]+4207500)/255000;
-            y_row[x]=OC_CLAMP255(y);
-            y=(65481*rgb_row[4*x+6]+128553*rgb_row[4*x+5]
-             +24966*rgb_row[4*x+4]+4207500)/255000;
-            y_row[x+1]=OC_CLAMP255(y);
-            u=(-16744*(rgb_row[4*x+2]+rgb_row[4*x+6])
-             -32872*(rgb_row[4*x+1]+rgb_row[4*x+5])
-             +49616*(rgb_row[4*x+0]+rgb_row[4*x+4])+29032005)/225930;
-            v=(78512*(rgb_row[4*x+2]+rgb_row[4*x+6])
-             -65744*(rgb_row[4*x+1]+rgb_row[4*x+5])
-             -12768*(rgb_row[4*x+0]+rgb_row[4*x+4])+45940035)/357510;
-            u_row[x>>1]=OC_CLAMP255(u);
-            v_row[x>>1]=OC_CLAMP255(v);
-          }
-          y_row+=_ycbcr[0].stride;
-          u_row+=_ycbcr[1].stride;
-          v_row+=_ycbcr[2].stride;
-          rgb_row+=cstride;
-        }
-      }break;
-      /*case TH_PF_444:*/
-      default:{
-        for(y=0;y<h;y++){
-          for(x=0;x<w;x++){
-            int y;
-            int u;
-            int v;
-            y=(65481*rgb_row[4*x+2]+128553*rgb_row[4*x+1]
-             +24966*rgb_row[4*x+0]+4207500)/255000;
-            u=(-33488*rgb_row[4*x+2]-65744*rgb_row[4*x+1]
-             +99232*rgb_row[4*x+0]+29032005)/225930;
-            v=(157024*rgb_row[4*x+2]-131488*rgb_row[4*x+1]
-             -25536*rgb_row[4*x+0]+45940035)/357510;
-            y_row[x]=OC_CLAMP255(y);
-            u_row[x]=OC_CLAMP255(u);
-            v_row[x]=OC_CLAMP255(v);
-          }
-          y_row+=_ycbcr[0].stride;
-          u_row+=_ycbcr[1].stride;
-          v_row+=_ycbcr[2].stride;
-          rgb_row+=cstride;
-        }
-      }break;
-    }
-    /*Finished.
-      Destroy the surface.*/
-    cairo_surface_destroy(cs);
-  }
-#endif
   return 0;
 }
diff --git a/thirdparty/libtheora/dequant.c b/thirdparty/libtheora/dequant.c
index e554872d4e..860536f72d 100644
--- a/thirdparty/libtheora/dequant.c
+++ b/thirdparty/libtheora/dequant.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: dequant.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
diff --git a/thirdparty/libtheora/dequant.h b/thirdparty/libtheora/dequant.h
index ef25838e35..9d6cd6be56 100644
--- a/thirdparty/libtheora/dequant.h
+++ b/thirdparty/libtheora/dequant.h
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: dequant.h 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
diff --git a/thirdparty/libtheora/encfrag.c b/thirdparty/libtheora/encfrag.c
index bb814c8e4a..0e18111ac7 100644
--- a/thirdparty/libtheora/encfrag.c
+++ b/thirdparty/libtheora/encfrag.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-  last mod: $Id: encfrag.c 16503 2009-08-22 18:14:02Z giles $
+  last mod: $Id$
 
  ********************************************************************/
 #include <stdlib.h>
@@ -19,11 +19,6 @@
 #include "encint.h"
 
 
-void oc_enc_frag_sub(const oc_enc_ctx *_enc,ogg_int16_t _diff[64],
- const unsigned char *_src,const unsigned char *_ref,int _ystride){
-  (*_enc->opt_vtable.frag_sub)(_diff,_src,_ref,_ystride);
-}
-
 void oc_enc_frag_sub_c(ogg_int16_t _diff[64],const unsigned char *_src,
  const unsigned char *_ref,int _ystride){
   int i;
@@ -35,11 +30,6 @@ void oc_enc_frag_sub_c(ogg_int16_t _diff[64],const unsigned char *_src,
   }
 }
 
-void oc_enc_frag_sub_128(const oc_enc_ctx *_enc,ogg_int16_t _diff[64],
- const unsigned char *_src,int _ystride){
-  (*_enc->opt_vtable.frag_sub_128)(_diff,_src,_ystride);
-}
-
 void oc_enc_frag_sub_128_c(ogg_int16_t *_diff,
  const unsigned char *_src,int _ystride){
   int i;
@@ -50,11 +40,6 @@ void oc_enc_frag_sub_128_c(ogg_int16_t *_diff,
   }
 }
 
-unsigned oc_enc_frag_sad(const oc_enc_ctx *_enc,const unsigned char *_x,
- const unsigned char *_y,int _ystride){
-  return (*_enc->opt_vtable.frag_sad)(_x,_y,_ystride);
-}
-
 unsigned oc_enc_frag_sad_c(const unsigned char *_src,
  const unsigned char *_ref,int _ystride){
   unsigned sad;
@@ -69,12 +54,6 @@ unsigned oc_enc_frag_sad_c(const unsigned char *_src,
   return sad;
 }
 
-unsigned oc_enc_frag_sad_thresh(const oc_enc_ctx *_enc,
- const unsigned char *_src,const unsigned char *_ref,int _ystride,
- unsigned _thresh){
-  return (*_enc->opt_vtable.frag_sad_thresh)(_src,_ref,_ystride,_thresh);
-}
-
 unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
  const unsigned char *_ref,int _ystride,unsigned _thresh){
   unsigned sad;
@@ -90,13 +69,6 @@ unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
   return sad;
 }
 
-unsigned oc_enc_frag_sad2_thresh(const oc_enc_ctx *_enc,
- const unsigned char *_src,const unsigned char *_ref1,
- const unsigned char *_ref2,int _ystride,unsigned _thresh){
-  return (*_enc->opt_vtable.frag_sad2_thresh)(_src,_ref1,_ref2,_ystride,
-   _thresh);
-}
-
 unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src,
  const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
  unsigned _thresh){
@@ -114,6 +86,27 @@ unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src,
   return sad;
 }
 
+unsigned oc_enc_frag_intra_sad_c(const unsigned char *_src, int _ystride){
+  const unsigned char *src = _src;
+  unsigned dc;
+  unsigned sad;
+  int      i;
+  dc=0;
+  for(i=8;i-->0;){
+    int j;
+    for(j=0;j<8;j++)dc+=src[j];
+    src+=_ystride;
+  }
+  dc=dc+32>>6;
+  sad=0;
+  for(i=8;i-->0;){
+    int j;
+    for(j=0;j<8;j++)sad+=abs(_src[j]-dc);
+    _src+=_ystride;
+  }
+  return sad;
+}
+
 static void oc_diff_hadamard(ogg_int16_t _buf[64],const unsigned char *_src,
  const unsigned char *_ref,int _ystride){
   int i;
@@ -269,19 +262,20 @@ static void oc_intra_hadamard(ogg_int16_t _buf[64],const unsigned char *_src,
   }
 }
 
-unsigned oc_hadamard_sad_thresh(const ogg_int16_t _buf[64],unsigned _thresh){
-  unsigned    sad;
-  int         t0;
-  int         t1;
-  int         t2;
-  int         t3;
-  int         t4;
-  int         t5;
-  int         t6;
-  int         t7;
-  int         r;
-  int         i;
-  sad=0;
+unsigned oc_hadamard_sad(int *_dc,const ogg_int16_t _buf[64]){
+  unsigned sad;
+  int      dc;
+  int      t0;
+  int      t1;
+  int      t2;
+  int      t3;
+  int      t4;
+  int      t5;
+  int      t6;
+  int      t7;
+  int      r;
+  int      i;
+  sad=dc=0;
   for(i=0;i<8;i++){
     /*Hadamard stage 1:*/
     t0=_buf[i*8+0]+_buf[i*8+4];
@@ -306,7 +300,7 @@ unsigned oc_hadamard_sad_thresh(const ogg_int16_t _buf[64],unsigned _thresh){
     t5+=t7;
     t7=r-t7;
     /*Hadamard stage 3:*/
-    r=abs(t0+t1);
+    r=abs(t0+t1)&-(i>0);
     r+=abs(t0-t1);
     r+=abs(t2+t3);
     r+=abs(t2-t3);
@@ -315,54 +309,61 @@ unsigned oc_hadamard_sad_thresh(const ogg_int16_t _buf[64],unsigned _thresh){
     r+=abs(t6+t7);
     r+=abs(t6-t7);
     sad+=r;
-    if(sad>_thresh)break;
   }
+  dc=_buf[0]+_buf[1]+_buf[2]+_buf[3]+_buf[4]+_buf[5]+_buf[6]+_buf[7];
+  *_dc=dc;
   return sad;
 }
 
-unsigned oc_enc_frag_satd_thresh(const oc_enc_ctx *_enc,
- const unsigned char *_src,const unsigned char *_ref,int _ystride,
- unsigned _thresh){
-  return (*_enc->opt_vtable.frag_satd_thresh)(_src,_ref,_ystride,_thresh);
-}
-
-unsigned oc_enc_frag_satd_thresh_c(const unsigned char *_src,
- const unsigned char *_ref,int _ystride,unsigned _thresh){
+unsigned oc_enc_frag_satd_c(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
   ogg_int16_t buf[64];
   oc_diff_hadamard(buf,_src,_ref,_ystride);
-  return oc_hadamard_sad_thresh(buf,_thresh);
-}
-
-unsigned oc_enc_frag_satd2_thresh(const oc_enc_ctx *_enc,
- const unsigned char *_src,const unsigned char *_ref1,
- const unsigned char *_ref2,int _ystride,unsigned _thresh){
-  return (*_enc->opt_vtable.frag_satd2_thresh)(_src,_ref1,_ref2,_ystride,
-   _thresh);
+  return oc_hadamard_sad(_dc,buf);
 }
 
-unsigned oc_enc_frag_satd2_thresh_c(const unsigned char *_src,
- const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
- unsigned _thresh){
+unsigned oc_enc_frag_satd2_c(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
   ogg_int16_t buf[64];
   oc_diff_hadamard2(buf,_src,_ref1,_ref2,_ystride);
-  return oc_hadamard_sad_thresh(buf,_thresh);
+  return oc_hadamard_sad(_dc,buf);
 }
 
-unsigned oc_enc_frag_intra_satd(const oc_enc_ctx *_enc,
+unsigned oc_enc_frag_intra_satd_c(int *_dc,
  const unsigned char *_src,int _ystride){
-  return (*_enc->opt_vtable.frag_intra_satd)(_src,_ystride);
-}
-
-unsigned oc_enc_frag_intra_satd_c(const unsigned char *_src,int _ystride){
   ogg_int16_t buf[64];
   oc_intra_hadamard(buf,_src,_ystride);
-  return oc_hadamard_sad_thresh(buf,UINT_MAX)
-   -abs(buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]);
+  return oc_hadamard_sad(_dc,buf);
 }
 
-void oc_enc_frag_copy2(const oc_enc_ctx *_enc,unsigned char *_dst,
- const unsigned char *_src1,const unsigned char *_src2,int _ystride){
-  (*_enc->opt_vtable.frag_copy2)(_dst,_src1,_src2,_ystride);
+unsigned oc_enc_frag_ssd_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  unsigned ret;
+  int      y;
+  int      x;
+  ret=0;
+  for(y=0;y<8;y++){
+    for(x=0;x<8;x++)ret+=(_src[x]-_ref[x])*(_src[x]-_ref[x]);
+    _src+=_ystride;
+    _ref+=_ystride;
+  }
+  return ret;
+}
+
+unsigned oc_enc_frag_border_ssd_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,ogg_int64_t _mask){
+  unsigned ret;
+  int      y;
+  int      x;
+  ret=0;
+  for(y=0;y<8;y++){
+    for(x=0;x<8;x++,_mask>>=1){
+      if(_mask&1)ret+=(_src[x]-_ref[x])*(_src[x]-_ref[x]);
+    }
+    _src+=_ystride;
+    _ref+=_ystride;
+  }
+  return ret;
 }
 
 void oc_enc_frag_copy2_c(unsigned char *_dst,
@@ -376,13 +377,3 @@ void oc_enc_frag_copy2_c(unsigned char *_dst,
     _src2+=_ystride;
   }
 }
-
-void oc_enc_frag_recon_intra(const oc_enc_ctx *_enc,
- unsigned char *_dst,int _ystride,const ogg_int16_t _residue[64]){
-  (*_enc->opt_vtable.frag_recon_intra)(_dst,_ystride,_residue);
-}
-
-void oc_enc_frag_recon_inter(const oc_enc_ctx *_enc,unsigned char *_dst,
- const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]){
-  (*_enc->opt_vtable.frag_recon_inter)(_dst,_src,_ystride,_residue);
-}
diff --git a/thirdparty/libtheora/encinfo.c b/thirdparty/libtheora/encinfo.c
index 83be1dae72..41db6bad45 100644
--- a/thirdparty/libtheora/encinfo.c
+++ b/thirdparty/libtheora/encinfo.c
@@ -1,6 +1,6 @@
 #include <stdlib.h>
 #include <string.h>
-#include "internal.h"
+#include "state.h"
 #include "enquant.h"
 #include "huffenc.h"
 
diff --git a/thirdparty/libtheora/encint.h b/thirdparty/libtheora/encint.h
index 97897d5a04..d25de4b8f6 100644
--- a/thirdparty/libtheora/encint.h
+++ b/thirdparty/libtheora/encint.h
@@ -11,17 +11,13 @@
  ********************************************************************
 
   function:
-  last mod: $Id: encint.h 16503 2009-08-22 18:14:02Z giles $
+  last mod: $Id$
 
  ********************************************************************/
 #if !defined(_encint_H)
 # define _encint_H (1)
-# if defined(HAVE_CONFIG_H)
-#  include "config.h"
-# endif
 # include "theora/theoraenc.h"
-# include "internal.h"
-# include "ocintrin.h"
+# include "state.h"
 # include "mathops.h"
 # include "enquant.h"
 # include "huffenc.h"
@@ -32,8 +28,13 @@
 typedef oc_mv                         oc_mv2[2];
 
 typedef struct oc_enc_opt_vtable      oc_enc_opt_vtable;
+typedef struct oc_enc_opt_data        oc_enc_opt_data;
 typedef struct oc_mb_enc_info         oc_mb_enc_info;
 typedef struct oc_mode_scheme_chooser oc_mode_scheme_chooser;
+typedef struct oc_fr_state            oc_fr_state;
+typedef struct oc_qii_state           oc_qii_state;
+typedef struct oc_enc_pipeline_state  oc_enc_pipeline_state;
+typedef struct oc_mode_rd             oc_mode_rd;
 typedef struct oc_iir_filter          oc_iir_filter;
 typedef struct oc_frame_metrics       oc_frame_metrics;
 typedef struct oc_rc_state            oc_rc_state;
@@ -42,6 +43,170 @@ typedef struct oc_token_checkpoint    oc_token_checkpoint;
 
 
 
+/*Encoder-specific accelerated functions.*/
+# if defined(OC_X86_ASM)
+#  if defined(_MSC_VER)
+#   include "x86_vc/x86enc.h"
+#  else
+#   include "x86/x86enc.h"
+#  endif
+# endif
+# if defined(OC_ARM_ASM)
+#  include "arm/armenc.h"
+# endif
+
+# if !defined(oc_enc_accel_init)
+#  define oc_enc_accel_init oc_enc_accel_init_c
+# endif
+# if defined(OC_ENC_USE_VTABLE)
+#  if !defined(oc_enc_frag_sub)
+#   define oc_enc_frag_sub(_enc,_diff,_src,_ref,_ystride) \
+  ((*(_enc)->opt_vtable.frag_sub)(_diff,_src,_ref,_ystride))
+#  endif
+#  if !defined(oc_enc_frag_sub_128)
+#   define oc_enc_frag_sub_128(_enc,_diff,_src,_ystride) \
+  ((*(_enc)->opt_vtable.frag_sub_128)(_diff,_src,_ystride))
+#  endif
+#  if !defined(oc_enc_frag_sad)
+#   define oc_enc_frag_sad(_enc,_src,_ref,_ystride) \
+  ((*(_enc)->opt_vtable.frag_sad)(_src,_ref,_ystride))
+#  endif
+#  if !defined(oc_enc_frag_sad_thresh)
+#   define oc_enc_frag_sad_thresh(_enc,_src,_ref,_ystride,_thresh) \
+  ((*(_enc)->opt_vtable.frag_sad_thresh)(_src,_ref,_ystride,_thresh))
+#  endif
+#  if !defined(oc_enc_frag_sad2_thresh)
+#   define oc_enc_frag_sad2_thresh(_enc,_src,_ref1,_ref2,_ystride,_thresh) \
+  ((*(_enc)->opt_vtable.frag_sad2_thresh)(_src,_ref1,_ref2,_ystride,_thresh))
+#  endif
+#  if !defined(oc_enc_frag_intra_sad)
+#   define oc_enc_frag_intra_sad(_enc,_src,_ystride) \
+  ((*(_enc)->opt_vtable.frag_intra_sad)(_src,_ystride))
+#  endif
+#  if !defined(oc_enc_frag_satd)
+#   define oc_enc_frag_satd(_enc,_dc,_src,_ref,_ystride) \
+  ((*(_enc)->opt_vtable.frag_satd)(_dc,_src,_ref,_ystride))
+#  endif
+#  if !defined(oc_enc_frag_satd2)
+#   define oc_enc_frag_satd2(_enc,_dc,_src,_ref1,_ref2,_ystride) \
+  ((*(_enc)->opt_vtable.frag_satd2)(_dc,_src,_ref1,_ref2,_ystride))
+#  endif
+#  if !defined(oc_enc_frag_intra_satd)
+#   define oc_enc_frag_intra_satd(_enc,_dc,_src,_ystride) \
+  ((*(_enc)->opt_vtable.frag_intra_satd)(_dc,_src,_ystride))
+#  endif
+#  if !defined(oc_enc_frag_ssd)
+#   define oc_enc_frag_ssd(_enc,_src,_ref,_ystride) \
+  ((*(_enc)->opt_vtable.frag_ssd)(_src,_ref,_ystride))
+#  endif
+#  if !defined(oc_enc_frag_border_ssd)
+#   define oc_enc_frag_border_ssd(_enc,_src,_ref,_ystride,_mask) \
+  ((*(_enc)->opt_vtable.frag_border_ssd)(_src,_ref,_ystride,_mask))
+#  endif
+#  if !defined(oc_enc_frag_copy2)
+#   define oc_enc_frag_copy2(_enc,_dst,_src1,_src2,_ystride) \
+  ((*(_enc)->opt_vtable.frag_copy2)(_dst,_src1,_src2,_ystride))
+#  endif
+#  if !defined(oc_enc_enquant_table_init)
+#   define oc_enc_enquant_table_init(_enc,_enquant,_dequant) \
+  ((*(_enc)->opt_vtable.enquant_table_init)(_enquant,_dequant))
+#  endif
+#  if !defined(oc_enc_enquant_table_fixup)
+#   define oc_enc_enquant_table_fixup(_enc,_enquant,_nqis) \
+  ((*(_enc)->opt_vtable.enquant_table_fixup)(_enquant,_nqis))
+#  endif
+#  if !defined(oc_enc_quantize)
+#   define oc_enc_quantize(_enc,_qdct,_dct,_dequant,_enquant) \
+  ((*(_enc)->opt_vtable.quantize)(_qdct,_dct,_dequant,_enquant))
+#  endif
+#  if !defined(oc_enc_frag_recon_intra)
+#   define oc_enc_frag_recon_intra(_enc,_dst,_ystride,_residue) \
+  ((*(_enc)->opt_vtable.frag_recon_intra)(_dst,_ystride,_residue))
+#  endif
+#  if !defined(oc_enc_frag_recon_inter)
+#   define oc_enc_frag_recon_inter(_enc,_dst,_src,_ystride,_residue) \
+  ((*(_enc)->opt_vtable.frag_recon_inter)(_dst,_src,_ystride,_residue))
+#  endif
+#  if !defined(oc_enc_fdct8x8)
+#   define oc_enc_fdct8x8(_enc,_y,_x) \
+  ((*(_enc)->opt_vtable.fdct8x8)(_y,_x))
+#  endif
+# else
+#  if !defined(oc_enc_frag_sub)
+#   define oc_enc_frag_sub(_enc,_diff,_src,_ref,_ystride) \
+  oc_enc_frag_sub_c(_diff,_src,_ref,_ystride)
+#  endif
+#  if !defined(oc_enc_frag_sub_128)
+#   define oc_enc_frag_sub_128(_enc,_diff,_src,_ystride) \
+  oc_enc_frag_sub_128_c(_diff,_src,_ystride)
+#  endif
+#  if !defined(oc_enc_frag_sad)
+#   define oc_enc_frag_sad(_enc,_src,_ref,_ystride) \
+  oc_enc_frag_sad_c(_src,_ref,_ystride)
+#  endif
+#  if !defined(oc_enc_frag_sad_thresh)
+#   define oc_enc_frag_sad_thresh(_enc,_src,_ref,_ystride,_thresh) \
+  oc_enc_frag_sad_thresh_c(_src,_ref,_ystride,_thresh)
+#  endif
+#  if !defined(oc_enc_frag_sad2_thresh)
+#   define oc_enc_frag_sad2_thresh(_enc,_src,_ref1,_ref2,_ystride,_thresh) \
+  oc_enc_frag_sad2_thresh_c(_src,_ref1,_ref2,_ystride,_thresh)
+#  endif
+#  if !defined(oc_enc_frag_intra_sad)
+#   define oc_enc_frag_intra_sad(_enc,_src,_ystride) \
+  oc_enc_frag_intra_sad_c(_src,_ystride)
+#  endif
+#  if !defined(oc_enc_frag_satd)
+#   define oc_enc_frag_satd(_enc,_dc,_src,_ref,_ystride) \
+  oc_enc_frag_satd_c(_dc,_src,_ref,_ystride)
+#  endif
+#  if !defined(oc_enc_frag_satd2)
+#   define oc_enc_frag_satd2(_enc,_dc,_src,_ref1,_ref2,_ystride) \
+  oc_enc_frag_satd2_c(_dc,_src,_ref1,_ref2,_ystride)
+#  endif
+#  if !defined(oc_enc_frag_intra_satd)
+#   define oc_enc_frag_intra_satd(_enc,_dc,_src,_ystride) \
+  oc_enc_frag_intra_satd_c(_dc,_src,_ystride)
+#  endif
+#  if !defined(oc_enc_frag_ssd)
+#   define oc_enc_frag_ssd(_enc,_src,_ref,_ystride) \
+  oc_enc_frag_ssd_c(_src,_ref,_ystride)
+#  endif
+#  if !defined(oc_enc_frag_border_ssd)
+#   define oc_enc_frag_border_ssd(_enc,_src,_ref,_ystride,_mask) \
+  oc_enc_frag_border_ssd_c(_src,_ref,_ystride,_mask)
+#  endif
+#  if !defined(oc_enc_frag_copy2)
+#   define oc_enc_frag_copy2(_enc,_dst,_src1,_src2,_ystride) \
+  oc_enc_frag_copy2_c(_dst,_src1,_src2,_ystride)
+#  endif
+#  if !defined(oc_enc_enquant_table_init)
+#   define oc_enc_enquant_table_init(_enc,_enquant,_dequant) \
+  oc_enc_enquant_table_init_c(_enquant,_dequant)
+#  endif
+#  if !defined(oc_enc_enquant_table_fixup)
+#   define oc_enc_enquant_table_fixup(_enc,_enquant,_nqis) \
+  oc_enc_enquant_table_fixup_c(_enquant,_nqis)
+#  endif
+#  if !defined(oc_enc_quantize)
+#   define oc_enc_quantize(_enc,_qdct,_dct,_dequant,_enquant) \
+  oc_enc_quantize_c(_qdct,_dct,_dequant,_enquant)
+#  endif
+#  if !defined(oc_enc_frag_recon_intra)
+#   define oc_enc_frag_recon_intra(_enc,_dst,_ystride,_residue) \
+  oc_frag_recon_intra_c(_dst,_ystride,_residue)
+#  endif
+#  if !defined(oc_enc_frag_recon_inter)
+#   define oc_enc_frag_recon_inter(_enc,_dst,_src,_ystride,_residue) \
+  oc_frag_recon_inter_c(_dst,_src,_ystride,_residue)
+#  endif
+#  if !defined(oc_enc_fdct8x8)
+#   define oc_enc_fdct8x8(_enc,_y,_x) oc_enc_fdct8x8_c(_y,_x)
+#  endif
+# endif
+
+
+
 /*Constants for the packet-out state machine specific to the encoder.*/
 
 /*Next packet to emit: Data packet, but none are ready yet.*/
@@ -50,13 +215,61 @@ typedef struct oc_token_checkpoint    oc_token_checkpoint;
 #define OC_PACKET_READY (1)
 
 /*All features enabled.*/
-#define OC_SP_LEVEL_SLOW       (0)
+#define OC_SP_LEVEL_SLOW          (0)
 /*Enable early skip.*/
-#define OC_SP_LEVEL_EARLY_SKIP (1)
+#define OC_SP_LEVEL_EARLY_SKIP    (1)
+/*Use analysis shortcuts, single quantizer, and faster tokenization.*/
+#define OC_SP_LEVEL_FAST_ANALYSIS (2)
+/*Use SAD instead of SATD.*/
+#define OC_SP_LEVEL_NOSATD        (3)
 /*Disable motion compensation.*/
-#define OC_SP_LEVEL_NOMC       (2)
+#define OC_SP_LEVEL_NOMC          (4)
 /*Maximum valid speed level.*/
-#define OC_SP_LEVEL_MAX        (2)
+#define OC_SP_LEVEL_MAX           (4)
+
+
+/*The number of extra bits of precision at which to store rate metrics.*/
+# define OC_BIT_SCALE  (6)
+/*The number of extra bits of precision at which to store RMSE metrics.
+  This must be at least half OC_BIT_SCALE (rounded up).*/
+# define OC_RMSE_SCALE (5)
+/*The number of quantizer bins to partition statistics into.*/
+# define OC_LOGQ_BINS  (8)
+/*The number of SAD/SATD bins to partition statistics into.*/
+# define OC_COMP_BINS   (24)
+/*The number of bits of precision to drop from SAD and SATD scores
+   to assign them to a bin.*/
+# define OC_SAD_SHIFT  (6)
+# define OC_SATD_SHIFT (9)
+
+/*Masking is applied by scaling the D used in R-D optimization (via rd_scale)
+   or the lambda parameter (via rd_iscale).
+  These are only equivalent within a single block; when more than one block is
+   being considered, the former is the interpretation used.*/
+
+/*This must be at least 4 for OC_RD_SKIP_SCALE() to work below.*/
+# define OC_RD_SCALE_BITS (12-OC_BIT_SCALE)
+# define OC_RD_ISCALE_BITS (11)
+
+/*This macro is applied to _ssd values with just 4 bits of headroom
+   ((15-OC_RMSE_SCALE)*2+OC_BIT_SCALE+2); since we want to allow rd_scales as
+   large as 16, and need additional fractional bits, our only recourse that
+   doesn't lose precision on blocks with very small SSDs is to use a wider
+   multiply.*/
+# if LONG_MAX>2147483647
+#  define OC_RD_SCALE(_ssd,_rd_scale) \
+ ((unsigned)((unsigned long)(_ssd)*(_rd_scale) \
+ +((1<<OC_RD_SCALE_BITS)>>1)>>OC_RD_SCALE_BITS))
+# else
+#  define OC_RD_SCALE(_ssd,_rd_scale) \
+ (((_ssd)>>OC_RD_SCALE_BITS)*(_rd_scale) \
+ +(((_ssd)&(1<<OC_RD_SCALE_BITS)-1)*(_rd_scale) \
+ +((1<<OC_RD_SCALE_BITS)>>1)>>OC_RD_SCALE_BITS))
+# endif
+# define OC_RD_SKIP_SCALE(_ssd,_rd_scale) \
+ ((_ssd)*(_rd_scale)+((1<<OC_RD_SCALE_BITS-4)>>1)>>OC_RD_SCALE_BITS-4)
+# define OC_RD_ISCALE(_lambda,_rd_iscale) \
+ ((_lambda)*(_rd_iscale)+((1<<OC_RD_ISCALE_BITS)>>1)>>OC_RD_ISCALE_BITS)
 
 
 /*The bits used for each of the MB mode codebooks.*/
@@ -78,6 +291,10 @@ extern const unsigned char OC_BLOCK_RUN_CODE_NBITS[30];
 
 /*Encoder specific functions with accelerated variants.*/
 struct oc_enc_opt_vtable{
+  void     (*frag_sub)(ogg_int16_t _diff[64],const unsigned char *_src,
+   const unsigned char *_ref,int _ystride);
+  void     (*frag_sub_128)(ogg_int16_t _diff[64],
+   const unsigned char *_src,int _ystride);
   unsigned (*frag_sad)(const unsigned char *_src,
    const unsigned char *_ref,int _ystride);
   unsigned (*frag_sad_thresh)(const unsigned char *_src,
@@ -85,18 +302,23 @@ struct oc_enc_opt_vtable{
   unsigned (*frag_sad2_thresh)(const unsigned char *_src,
    const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
    unsigned _thresh);
-  unsigned (*frag_satd_thresh)(const unsigned char *_src,
-   const unsigned char *_ref,int _ystride,unsigned _thresh);
-  unsigned (*frag_satd2_thresh)(const unsigned char *_src,
-   const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
-   unsigned _thresh);
-  unsigned (*frag_intra_satd)(const unsigned char *_src,int _ystride);
-  void     (*frag_sub)(ogg_int16_t _diff[64],const unsigned char *_src,
+  unsigned (*frag_intra_sad)(const unsigned char *_src,int _ystride);
+  unsigned (*frag_satd)(int *_dc,const unsigned char *_src,
    const unsigned char *_ref,int _ystride);
-  void     (*frag_sub_128)(ogg_int16_t _diff[64],
-   const unsigned char *_src,int _ystride);
+  unsigned (*frag_satd2)(int *_dc,const unsigned char *_src,
+   const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
+  unsigned (*frag_intra_satd)(int *_dc,const unsigned char *_src,int _ystride);
+  unsigned (*frag_ssd)(const unsigned char *_src,
+   const unsigned char *_ref,int _ystride);
+  unsigned (*frag_border_ssd)(const unsigned char *_src,
+   const unsigned char *_ref,int _ystride,ogg_int64_t _mask);
   void     (*frag_copy2)(unsigned char *_dst,
    const unsigned char *_src1,const unsigned char *_src2,int _ystride);
+  void     (*enquant_table_init)(void *_enquant,
+   const ogg_uint16_t _dequant[64]);
+  void     (*enquant_table_fixup)(void *_enquant[3][3][2],int _nqis);
+  int      (*quantize)(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+   const ogg_uint16_t _dequant[64],const void *_enquant);
   void     (*frag_recon_intra)(unsigned char *_dst,int _ystride,
    const ogg_int16_t _residue[64]);
   void     (*frag_recon_inter)(unsigned char *_dst,
@@ -105,7 +327,19 @@ struct oc_enc_opt_vtable{
 };
 
 
-void oc_enc_vtable_init(oc_enc_ctx *_enc);
+/*Encoder specific data that varies according to which variants of the above
+   functions are used.*/
+struct oc_enc_opt_data{
+  /*The size of a single quantizer table.
+    This must be a multiple of enquant_table_alignment.*/
+  size_t               enquant_table_size;
+  /*The alignment required for the quantizer tables.
+    This must be a positive power of two.*/
+  int                  enquant_table_alignment;
+};
+
+
+void oc_enc_accel_init(oc_enc_ctx *_enc);
 
 
 
@@ -158,7 +392,7 @@ struct oc_mode_scheme_chooser{
     corresponds to the ranks above.*/
   unsigned char        scheme0_list[OC_NMODES];
   /*The number of times each mode has been chosen so far.*/
-  int                  mode_counts[OC_NMODES];
+  unsigned             mode_counts[OC_NMODES];
   /*The list of mode coding schemes, sorted in ascending order of bit cost.*/
   unsigned char        scheme_list[8];
   /*The number of bits used by each mode coding scheme.*/
@@ -170,6 +404,106 @@ void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser);
 
 
 
+/*State to track coded block flags and their bit cost.
+  We use opportunity cost to measure the bits required to code or skip the next
+   block, using the cheaper of the cost to code it fully or partially, so long
+   as both are possible.*/
+struct oc_fr_state{
+  /*The number of bits required for the coded block flags so far this frame.*/
+  ptrdiff_t  bits;
+  /*The length of the current run for the partial super block flag, not
+     including the current super block.*/
+  unsigned   sb_partial_count:16;
+  /*The length of the current run for the full super block flag, not
+     including the current super block.*/
+  unsigned   sb_full_count:16;
+  /*The length of the coded block flag run when the current super block
+     started.*/
+  unsigned   b_coded_count_prev:6;
+  /*The coded block flag when the current super block started.*/
+  signed int b_coded_prev:2;
+  /*The length of the current coded block flag run.*/
+  unsigned   b_coded_count:6;
+  /*The current coded block flag.*/
+  signed int b_coded:2;
+  /*The number of blocks processed in the current super block.*/
+  unsigned   b_count:5;
+  /*Whether or not it is cheaper to code the current super block partially,
+     even if it could still be coded fully.*/
+  unsigned   sb_prefer_partial:1;
+  /*Whether the last super block was coded partially.*/
+  signed int sb_partial:2;
+  /*The number of bits required for the flags for the current super block.*/
+  unsigned   sb_bits:6;
+  /*Whether the last non-partial super block was coded fully.*/
+  signed int sb_full:2;
+};
+
+
+
+struct oc_qii_state{
+  ptrdiff_t  bits;
+  unsigned   qi01_count:14;
+  signed int qi01:2;
+  unsigned   qi12_count:14;
+  signed int qi12:2;
+};
+
+
+
+/*Temporary encoder state for the analysis pipeline.*/
+struct oc_enc_pipeline_state{
+  /*DCT coefficient storage.
+    This is kept off the stack because a) gcc can't align things on the stack
+     reliably on ARM, and b) it avoids (unintentional) data hazards between
+     ARM and NEON code.*/
+  OC_ALIGN16(ogg_int16_t dct_data[64*3]);
+  OC_ALIGN16(signed char bounding_values[256]);
+  oc_fr_state         fr[3];
+  oc_qii_state        qs[3];
+  /*Skip SSD storage for the current MCU in each plane.*/
+  unsigned           *skip_ssd[3];
+  /*Coded/uncoded fragment lists for each plane for the current MCU.*/
+  ptrdiff_t          *coded_fragis[3];
+  ptrdiff_t          *uncoded_fragis[3];
+  ptrdiff_t           ncoded_fragis[3];
+  ptrdiff_t           nuncoded_fragis[3];
+  /*The starting fragment for the current MCU in each plane.*/
+  ptrdiff_t           froffset[3];
+  /*The starting row for the current MCU in each plane.*/
+  int                 fragy0[3];
+  /*The ending row for the current MCU in each plane.*/
+  int                 fragy_end[3];
+  /*The starting superblock for the current MCU in each plane.*/
+  unsigned            sbi0[3];
+  /*The ending superblock for the current MCU in each plane.*/
+  unsigned            sbi_end[3];
+  /*The number of tokens for zzi=1 for each color plane.*/
+  int                 ndct_tokens1[3];
+  /*The outstanding eob_run count for zzi=1 for each color plane.*/
+  int                 eob_run1[3];
+  /*Whether or not the loop filter is enabled.*/
+  int                 loop_filter;
+};
+
+
+
+/*Statistics used to estimate R-D cost of a block in a given coding mode.
+  See modedec.h for more details.*/
+struct oc_mode_rd{
+  /*The expected bits used by the DCT tokens, shifted by OC_BIT_SCALE.*/
+  ogg_int16_t rate;
+  /*The expected square root of the sum of squared errors, shifted by
+     OC_RMSE_SCALE.*/
+  ogg_int16_t rmse;
+};
+
+# if defined(OC_COLLECT_METRICS)
+#  include "collect.h"
+# endif
+
+
+
 /*A 2nd order low-pass Bessel follower.
   We use this for rate control because it has fast reaction time, but is
    critically damped.*/
@@ -190,6 +524,8 @@ struct oc_frame_metrics{
   unsigned      dup_count:31;
   /*The frame type from pass 1.*/
   unsigned      frame_type:1;
+  /*The frame activity average from pass 1.*/
+  unsigned      activity_avg;
 };
 
 
@@ -335,10 +671,15 @@ struct th_enc_ctx{
   size_t                   mv_bits[2];
   /*The mode scheme chooser for estimating mode coding costs.*/
   oc_mode_scheme_chooser   chooser;
+  /*Temporary encoder state for the analysis pipeline.*/
+  oc_enc_pipeline_state    pipe;
   /*The number of vertical super blocks in an MCU.*/
   int                      mcu_nvsbs;
   /*The SSD error for skipping each fragment in the current MCU.*/
   unsigned                *mcu_skip_ssd;
+  /*The masking scale factors for chroma blocks in the current MCU.*/
+  ogg_uint16_t            *mcu_rd_scale;
+  ogg_uint16_t            *mcu_rd_iscale;
   /*The DCT token lists for each coefficient and each plane.*/
   unsigned char          **dct_tokens[3];
   /*The extra bits associated with each DCT token.*/
@@ -350,8 +691,10 @@ struct th_enc_ctx{
   /*The offset of the first DCT token for each coefficient for each plane.*/
   unsigned char            dct_token_offs[3][64];
   /*The last DC coefficient for each plane and reference frame.*/
-  int                      dc_pred_last[3][3];
+  int                      dc_pred_last[3][4];
 #if defined(OC_COLLECT_METRICS)
+  /*Fragment SAD statistics for MB mode estimation metrics.*/
+  unsigned                *frag_sad;
   /*Fragment SATD statistics for MB mode estimation metrics.*/
   unsigned                *frag_satd;
   /*Fragment SSD statistics for MB mode estimation metrics.*/
@@ -359,32 +702,56 @@ struct th_enc_ctx{
 #endif
   /*The R-D optimization parameter.*/
   int                      lambda;
+  /*The average block "activity" of the previous frame.*/
+  unsigned                 activity_avg;
+  /*The average MB luma of the previous frame.*/
+  unsigned                 luma_avg;
   /*The huffman tables in use.*/
   th_huff_code             huff_codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
   /*The quantization parameters in use.*/
   th_quant_info            qinfo;
-  oc_iquant               *enquant_tables[64][3][2];
-  oc_iquant_table          enquant_table_data[64][3][2];
-  /*An "average" quantizer for each quantizer type (INTRA or INTER) and qi
-     value.
-    This is used to paramterize the rate control decisions.
+  /*The original DC coefficients saved off from the dequantization tables.*/
+  ogg_uint16_t             dequant_dc[64][3][2];
+  /*Condensed dequantization tables.*/
+  const ogg_uint16_t      *dequant[3][3][2];
+  /*Condensed quantization tables.*/
+  void                    *enquant[3][3][2];
+  /*The full set of quantization tables.*/
+  void                    *enquant_tables[64][3][2];
+  /*Storage for the quantization tables.*/
+  unsigned char           *enquant_table_data;
+  /*An "average" quantizer for each frame type (INTRA or INTER) and qi value.
+    This is used to parameterize the rate control decisions.
     They are kept in the log domain to simplify later processing.
-    Keep in mind these are DCT domain quantizers, and so are scaled by an
-     additional factor of 4 from the pixel domain.*/
+    These are DCT domain quantizers, and so are scaled by an additional factor
+     of 4 from the pixel domain.*/
   ogg_int64_t              log_qavg[2][64];
+  /*The "average" quantizer further partitioned by color plane.
+    This is used to parameterize mode decision.
+    These are DCT domain quantizers, and so are scaled by an additional factor
+     of 4 from the pixel domain.*/
+  ogg_int16_t              log_plq[64][3][2];
+  /*The R-D scale factors to apply to chroma blocks for a given frame type
+     (INTRA or INTER) and qi value.
+    The first is the "D" modifier (rd_scale), while the second is the "lambda"
+     modifier (rd_iscale).*/
+  ogg_uint16_t             chroma_rd_scale[2][64][2];
+  /*The interpolated mode decision R-D lookup tables for the current
+     quantizers, color plane, and quantization type.*/
+  oc_mode_rd               mode_rd[3][3][2][OC_COMP_BINS];
   /*The buffer state used to drive rate control.*/
   oc_rc_state              rc;
+# if defined(OC_ENC_USE_VTABLE)
   /*Table for encoder acceleration functions.*/
   oc_enc_opt_vtable        opt_vtable;
+# endif
+  /*Table for encoder data used by accelerated functions.*/
+  oc_enc_opt_data          opt_data;
 };
 
 
 void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode);
 int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode);
-#if defined(OC_COLLECT_METRICS)
-void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc);
-void oc_enc_mode_metrics_dump(oc_enc_ctx *_enc);
-#endif
 
 
 
@@ -415,8 +782,13 @@ struct oc_token_checkpoint{
 
 void oc_enc_tokenize_start(oc_enc_ctx *_enc);
 int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
- ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
- int _zzi,oc_token_checkpoint **_stack,int _acmin);
+ ogg_int16_t *_qdct_out,const ogg_int16_t *_qdct_in,
+ const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
+ int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin);
+int oc_enc_tokenize_ac_fast(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
+ ogg_int16_t *_qdct_out,const ogg_int16_t *_qdct_in,
+ const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
+ int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin);
 void oc_enc_tokenlog_rollback(oc_enc_ctx *_enc,
  const oc_token_checkpoint *_stack,int _n);
 void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,
@@ -436,45 +808,13 @@ int oc_state_flushheader(oc_theora_state *_state,int *_packet_state,
 
 
 
-/*Encoder-specific accelerated functions.*/
-void oc_enc_frag_sub(const oc_enc_ctx *_enc,ogg_int16_t _diff[64],
- const unsigned char *_src,const unsigned char *_ref,int _ystride);
-void oc_enc_frag_sub_128(const oc_enc_ctx *_enc,ogg_int16_t _diff[64],
- const unsigned char *_src,int _ystride);
-unsigned oc_enc_frag_sad(const oc_enc_ctx *_enc,const unsigned char *_src,
- const unsigned char *_ref,int _ystride);
-unsigned oc_enc_frag_sad_thresh(const oc_enc_ctx *_enc,
- const unsigned char *_src,const unsigned char *_ref,int _ystride,
- unsigned _thresh);
-unsigned oc_enc_frag_sad2_thresh(const oc_enc_ctx *_enc,
- const unsigned char *_src,const unsigned char *_ref1,
- const unsigned char *_ref2,int _ystride,unsigned _thresh);
-unsigned oc_enc_frag_satd_thresh(const oc_enc_ctx *_enc,
- const unsigned char *_src,const unsigned char *_ref,int _ystride,
- unsigned _thresh);
-unsigned oc_enc_frag_satd2_thresh(const oc_enc_ctx *_enc,
- const unsigned char *_src,const unsigned char *_ref1,
- const unsigned char *_ref2,int _ystride,unsigned _thresh);
-unsigned oc_enc_frag_intra_satd(const oc_enc_ctx *_enc,
- const unsigned char *_src,int _ystride);
-void oc_enc_frag_copy2(const oc_enc_ctx *_enc,unsigned char *_dst,
- const unsigned char *_src1,const unsigned char *_src2,int _ystride);
-void oc_enc_frag_recon_intra(const oc_enc_ctx *_enc,
- unsigned char *_dst,int _ystride,const ogg_int16_t _residue[64]);
-void oc_enc_frag_recon_inter(const oc_enc_ctx *_enc,unsigned char *_dst,
- const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
-void oc_enc_fdct8x8(const oc_enc_ctx *_enc,ogg_int16_t _y[64],
- const ogg_int16_t _x[64]);
-
-/*Default pure-C implementations.*/
-void oc_enc_vtable_init_c(oc_enc_ctx *_enc);
+/*Default pure-C implementations of encoder-specific accelerated functions.*/
+void oc_enc_accel_init_c(oc_enc_ctx *_enc);
 
 void oc_enc_frag_sub_c(ogg_int16_t _diff[64],
  const unsigned char *_src,const unsigned char *_ref,int _ystride);
 void oc_enc_frag_sub_128_c(ogg_int16_t _diff[64],
  const unsigned char *_src,int _ystride);
-void oc_enc_frag_copy2_c(unsigned char *_dst,
- const unsigned char *_src1,const unsigned char *_src2,int _ystride);
 unsigned oc_enc_frag_sad_c(const unsigned char *_src,
  const unsigned char *_ref,int _ystride);
 unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
@@ -482,12 +822,24 @@ unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
 unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src,
  const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
  unsigned _thresh);
-unsigned oc_enc_frag_satd_thresh_c(const unsigned char *_src,
- const unsigned char *_ref,int _ystride,unsigned _thresh);
-unsigned oc_enc_frag_satd2_thresh_c(const unsigned char *_src,
- const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
- unsigned _thresh);
-unsigned oc_enc_frag_intra_satd_c(const unsigned char *_src,int _ystride);
+unsigned oc_enc_frag_intra_sad_c(const unsigned char *_src, int _ystride);
+unsigned oc_enc_frag_satd_c(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_satd2_c(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
+unsigned oc_enc_frag_intra_satd_c(int *_dc,
+ const unsigned char *_src,int _ystride);
+unsigned oc_enc_frag_ssd_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_border_ssd_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,ogg_int64_t _mask);
+void oc_enc_frag_copy2_c(unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride);
+void oc_enc_enquant_table_init_c(void *_enquant,
+ const ogg_uint16_t _dequant[64]);
+void oc_enc_enquant_table_fixup_c(void *_enquant[3][3][2],int _nqis);
+int oc_enc_quantize_c(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+ const ogg_uint16_t _dequant[64],const void *_enquant);
 void oc_enc_fdct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
 
 #endif
diff --git a/thirdparty/libtheora/encode.c b/thirdparty/libtheora/encode.c
index 0c5ea6a172..3309f97c03 100644
--- a/thirdparty/libtheora/encode.c
+++ b/thirdparty/libtheora/encode.c
@@ -11,15 +11,13 @@
  ********************************************************************
 
   function:
-  last mod: $Id: encode.c 16503 2009-08-22 18:14:02Z giles $
+  last mod: $Id$
 
  ********************************************************************/
 #include <stdlib.h>
 #include <string.h>
 #include "encint.h"
-#if defined(OC_X86_ASM)
-# include "x86/x86enc.h"
-#endif
+#include "dequant.h"
 
 
 
@@ -288,12 +286,12 @@ const th_quant_info TH_DEF_QUANT_INFO={
      28, 25, 24, 22, 20, 17, 14, 10
   },
   {
-    30,25,20,20,15,15,14,14,
-    13,13,12,12,11,11,10,10,
-     9, 9, 8, 8, 7, 7, 7, 7,
-     6, 6, 6, 6, 5, 5, 5, 5,
-     4, 4, 4, 4, 3, 3, 3, 3,
+    15,12, 9, 8, 6, 6, 5, 5,
+     5, 5, 5, 5, 5, 5, 5, 5,
+     4, 4, 4, 4, 4, 4, 3, 3,
+     3, 3, 3, 3, 3, 3, 3, 3,
      2, 2, 2, 2, 2, 2, 2, 2,
+     2, 2, 2, 2, 2, 2, 2, 0,
      0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0
   },
@@ -623,11 +621,15 @@ static void oc_enc_mb_modes_pack(oc_enc_ctx *_enc){
   }
 }
 
-static void oc_enc_mv_pack(oc_enc_ctx *_enc,int _mv_scheme,int _dx,int _dy){
+static void oc_enc_mv_pack(oc_enc_ctx *_enc,int _mv_scheme,oc_mv _mv){
+  int dx;
+  int dy;
+  dx=OC_MV_X(_mv);
+  dy=OC_MV_Y(_mv);
   oggpackB_write(&_enc->opb,
-   OC_MV_CODES[_mv_scheme][_dx+31],OC_MV_BITS[_mv_scheme][_dx+31]);
+   OC_MV_CODES[_mv_scheme][dx+31],OC_MV_BITS[_mv_scheme][dx+31]);
   oggpackB_write(&_enc->opb,
-   OC_MV_CODES[_mv_scheme][_dy+31],OC_MV_BITS[_mv_scheme][_dy+31]);
+   OC_MV_CODES[_mv_scheme][dy+31],OC_MV_BITS[_mv_scheme][dy+31]);
 }
 
 static void oc_enc_mvs_pack(oc_enc_ctx *_enc){
@@ -650,7 +652,7 @@ static void oc_enc_mvs_pack(oc_enc_ctx *_enc){
   mb_modes=_enc->state.mb_modes;
   mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
   frags=_enc->state.frags;
-  frag_mvs=(const oc_mv *)_enc->state.frag_mvs;
+  frag_mvs=_enc->state.frag_mvs;
   for(mbii=0;mbii<ncoded_mbis;mbii++){
     ptrdiff_t fragi;
     unsigned  mbi;
@@ -662,8 +664,7 @@ static void oc_enc_mvs_pack(oc_enc_ctx *_enc){
         for(bi=0;;bi++){
           fragi=mb_maps[mbi][0][bi];
           if(frags[fragi].coded){
-            oc_enc_mv_pack(_enc,mv_scheme,
-             frag_mvs[fragi][0],frag_mvs[fragi][1]);
+            oc_enc_mv_pack(_enc,mv_scheme,frag_mvs[fragi]);
             /*Only code a single MV for this macro block.*/
             break;
           }
@@ -673,8 +674,7 @@ static void oc_enc_mvs_pack(oc_enc_ctx *_enc){
         for(bi=0;bi<4;bi++){
           fragi=mb_maps[mbi][0][bi];
           if(frags[fragi].coded){
-            oc_enc_mv_pack(_enc,mv_scheme,
-             frag_mvs[fragi][0],frag_mvs[fragi][1]);
+            oc_enc_mv_pack(_enc,mv_scheme,frag_mvs[fragi]);
             /*Keep coding all the MVs for this macro block.*/
           }
         }
@@ -863,11 +863,55 @@ static void oc_enc_residual_tokens_pack(oc_enc_ctx *_enc){
   }
 }
 
+/*Packs an explicit drop frame, instead of using the more efficient 0-byte
+   packet.
+  This is only enabled in VP3-compatibility mode, even though it is not
+   strictly required for VP3 compatibility (VP3 could be encoded in AVI, which
+   also supports dropping frames by inserting 0 byte packets).
+  However, almost every _Theora_ player used to get this wrong (and many still
+   do), and it wasn't until we started shipping a post-VP3 encoder that
+   actually used non-VP3 features that this began to be discovered and fixed,
+   despite being in the standard since 2004.
+  The pack buffer must be reset before calling this function.*/
+static void oc_enc_drop_frame_pack(oc_enc_ctx *_enc){
+  unsigned nsbs;
+  /*Mark this as a data packet.*/
+  oggpackB_write(&_enc->opb,0,1);
+  /*Output the frame type (key frame or delta frame).*/
+  oggpackB_write(&_enc->opb,OC_INTER_FRAME,1);
+  /*Write out the current qi list.
+    We always use just 1 qi, to avoid wasting bits on the others.*/
+  oggpackB_write(&_enc->opb,_enc->state.qis[0],6);
+  oggpackB_write(&_enc->opb,0,1);
+  /*Coded block flags: everything is uncoded.*/
+  nsbs=_enc->state.nsbs;
+  /*No partially coded SBs.*/
+  oggpackB_write(&_enc->opb,0,1);
+  oc_sb_run_pack(&_enc->opb,nsbs,0,1);
+  /*No fully coded SBs.*/
+  oggpackB_write(&_enc->opb,0,1);
+  oc_sb_run_pack(&_enc->opb,nsbs,0,1);
+  /*MB modes: just need to write which scheme to use.
+    Since we have no coded MBs, we can pick any of them except 0, which would
+     require writing out an additional mode list.*/
+  oggpackB_write(&_enc->opb,7,3);
+  /*MVs: just need to write which scheme to use.
+    We can pick either one, since we have no MVs.*/
+  oggpackB_write(&_enc->opb,1,1);
+  /*Write the chosen DC token tables.*/
+  oggpackB_write(&_enc->opb,_enc->huff_idxs[OC_INTER_FRAME][0][0],4);
+  oggpackB_write(&_enc->opb,_enc->huff_idxs[OC_INTER_FRAME][0][1],4);
+  /*Write the chosen AC token tables.*/
+  oggpackB_write(&_enc->opb,_enc->huff_idxs[OC_INTER_FRAME][1][0],4);
+  oggpackB_write(&_enc->opb,_enc->huff_idxs[OC_INTER_FRAME][1][1],4);
+}
+
 static void oc_enc_frame_pack(oc_enc_ctx *_enc){
+  /*musl libc malloc()/realloc() calls might use floating point, so make sure
+     we've cleared the MMX state for them.*/
+  oc_restore_fpu(&_enc->state);
   oggpackB_reset(&_enc->opb);
-  /*Only proceed if we have some coded blocks.
-    If there are no coded blocks, we can drop this frame simply by emitting a
-     0 byte packet.*/
+  /*Only proceed if we have some coded blocks.*/
   if(_enc->state.ntotal_coded_fragis>0){
     oc_enc_frame_header_pack(_enc);
     if(_enc->state.frame_type==OC_INTER_FRAME){
@@ -880,6 +924,10 @@ static void oc_enc_frame_pack(oc_enc_ctx *_enc){
     oc_enc_tokenize_finish(_enc);
     oc_enc_residual_tokens_pack(_enc);
   }
+  /*If there are no coded blocks, we can drop this frame simply by emitting a
+     0 byte packet.
+    We emit an inter frame with no coded blocks in VP3-compatibility mode.*/
+  else if(_enc->vp3_compatible)oc_enc_drop_frame_pack(_enc);
   /*Success: Mark the packet as ready to be flushed.*/
   _enc->packet_state=OC_PACKET_READY;
 #if defined(OC_COLLECT_METRICS)
@@ -888,21 +936,31 @@ static void oc_enc_frame_pack(oc_enc_ctx *_enc){
 }
 
 
-void oc_enc_vtable_init_c(oc_enc_ctx *_enc){
+void oc_enc_accel_init_c(oc_enc_ctx *_enc){
   /*The implementations prefixed with oc_enc_ are encoder-specific.
     The rest we re-use from the decoder.*/
+# if defined(OC_ENC_USE_VTABLE)
+  _enc->opt_vtable.frag_sub=oc_enc_frag_sub_c;
+  _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_c;
   _enc->opt_vtable.frag_sad=oc_enc_frag_sad_c;
   _enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_c;
   _enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_c;
-  _enc->opt_vtable.frag_satd_thresh=oc_enc_frag_satd_thresh_c;
-  _enc->opt_vtable.frag_satd2_thresh=oc_enc_frag_satd2_thresh_c;
+  _enc->opt_vtable.frag_intra_sad=oc_enc_frag_intra_sad_c;
+  _enc->opt_vtable.frag_satd=oc_enc_frag_satd_c;
+  _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_c;
   _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_c;
-  _enc->opt_vtable.frag_sub=oc_enc_frag_sub_c;
-  _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_c;
+  _enc->opt_vtable.frag_ssd=oc_enc_frag_ssd_c;
+  _enc->opt_vtable.frag_border_ssd=oc_enc_frag_border_ssd_c;
   _enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_c;
+  _enc->opt_vtable.enquant_table_init=oc_enc_enquant_table_init_c;
+  _enc->opt_vtable.enquant_table_fixup=oc_enc_enquant_table_fixup_c;
+  _enc->opt_vtable.quantize=oc_enc_quantize_c;
   _enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;
   _enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c;
   _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_c;
+# endif
+  _enc->opt_data.enquant_table_size=64*sizeof(oc_iquant);
+  _enc->opt_data.enquant_table_alignment=16;
 }
 
 /*Initialize the macro block neighbor lists for MC analysis.
@@ -1003,6 +1061,55 @@ static int oc_enc_set_huffman_codes(oc_enc_ctx *_enc,
   return 0;
 }
 
+static void oc_enc_enquant_tables_init(oc_enc_ctx *_enc,
+ const th_quant_info *_qinfo){
+  unsigned char *etd;
+  size_t         ets;
+  int            align;
+  int            qii;
+  int            qi;
+  int            pli;
+  int            qti;
+  for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
+    _enc->state.dequant_tables[qi][pli][qti]=
+     _enc->state.dequant_table_data[qi][pli][qti];
+  }
+  /*Initialize the dequantization tables.*/
+  oc_dequant_tables_init(_enc->state.dequant_tables,NULL,_qinfo);
+  /*And save off the DC values.*/
+  for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
+    _enc->dequant_dc[qi][pli][qti]=_enc->state.dequant_tables[qi][pli][qti][0];
+  }
+  /*Set up storage for the quantization tables.*/
+  etd=_enc->enquant_table_data;
+  ets=_enc->opt_data.enquant_table_size;
+  align=-(etd-(unsigned char *)0)&_enc->opt_data.enquant_table_alignment-1;
+  etd+=align;
+  /*Set up the main tables.*/
+  for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
+    _enc->enquant_tables[qi][pli][qti]=etd;
+    oc_enc_enquant_table_init(_enc,etd,
+     _enc->state.dequant_tables[qi][pli][qti]);
+    etd+=ets;
+  }
+  /*Set up storage for the local copies we modify for each frame.*/
+  for(pli=0;pli<3;pli++)for(qii=0;qii<3;qii++)for(qti=0;qti<2;qti++){
+    _enc->enquant[pli][qii][qti]=etd;
+    etd+=ets;
+  }
+}
+
+/*Updates the encoder state after the quantization parameters have been
+   changed.*/
+static void oc_enc_quant_params_updated(oc_enc_ctx *_enc,
+ const th_quant_info *_qinfo){
+  oc_enc_enquant_tables_init(_enc,_qinfo);
+  memcpy(_enc->state.loop_filter_limits,_qinfo->loop_filter_limits,
+   sizeof(_enc->state.loop_filter_limits));
+  oc_enquant_qavg_init(_enc->log_qavg,_enc->log_plq,_enc->chroma_rd_scale,
+   _enc->state.dequant_tables,_enc->state.info.pixel_fmt);
+}
+
 /*Sets the quantization parameters to use.
   This may only be called before the setup header is written.
   If it is called multiple times, only the last call has any effect.
@@ -1012,25 +1119,20 @@ static int oc_enc_set_huffman_codes(oc_enc_ctx *_enc,
            will be used.*/
 static int oc_enc_set_quant_params(oc_enc_ctx *_enc,
  const th_quant_info *_qinfo){
-  int qi;
-  int pli;
-  int qti;
+  th_quant_info old_qinfo;
+  int           ret;
   if(_enc==NULL)return TH_EFAULT;
   if(_enc->packet_state>OC_PACKET_SETUP_HDR)return TH_EINVAL;
   if(_qinfo==NULL)_qinfo=&TH_DEF_QUANT_INFO;
-  /*TODO: Analyze for packing purposes instead of just doing a shallow copy.*/
-  memcpy(&_enc->qinfo,_qinfo,sizeof(_enc->qinfo));
-  for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
-    _enc->state.dequant_tables[qi][pli][qti]=
-     _enc->state.dequant_table_data[qi][pli][qti];
-    _enc->enquant_tables[qi][pli][qti]=_enc->enquant_table_data[qi][pli][qti];
+  memcpy(&old_qinfo,&_enc->qinfo,sizeof(old_qinfo));
+  ret=oc_quant_params_clone(&_enc->qinfo,_qinfo);
+  if(ret<0){
+    oc_quant_params_clear(&_enc->qinfo);
+    memcpy(&_enc->qinfo,&old_qinfo,sizeof(old_qinfo));
+    return ret;
   }
-  oc_enquant_tables_init(_enc->state.dequant_tables,
-   _enc->enquant_tables,_qinfo);
-  memcpy(_enc->state.loop_filter_limits,_qinfo->loop_filter_limits,
-   sizeof(_enc->state.loop_filter_limits));
-  oc_enquant_qavg_init(_enc->log_qavg,_enc->state.dequant_tables,
-   _enc->state.info.pixel_fmt);
+  else oc_quant_params_clear(&old_qinfo);
+  oc_enc_quant_params_updated(_enc,_qinfo);
   return 0;
 }
 
@@ -1039,6 +1141,7 @@ static void oc_enc_clear(oc_enc_ctx *_enc);
 static int oc_enc_init(oc_enc_ctx *_enc,const th_info *_info){
   th_info   info;
   size_t    mcu_nmbs;
+  ptrdiff_t mcu_ncfrags;
   ptrdiff_t mcu_nfrags;
   int       hdec;
   int       vdec;
@@ -1053,8 +1156,9 @@ static int oc_enc_init(oc_enc_ctx *_enc,const th_info *_info){
   if(info.quality<0)info.quality=32;
   if(info.target_bitrate<0)info.target_bitrate=0;
   /*Initialize the shared encoder/decoder state.*/
-  ret=oc_state_init(&_enc->state,&info,4);
+  ret=oc_state_init(&_enc->state,&info,6);
   if(ret<0)return ret;
+  oc_enc_accel_init(_enc);
   _enc->mb_info=_ogg_calloc(_enc->state.nmbs,sizeof(*_enc->mb_info));
   _enc->frag_dc=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_dc));
   _enc->coded_mbis=
@@ -1065,9 +1169,14 @@ static int oc_enc_init(oc_enc_ctx *_enc,const th_info *_info){
      super block rows of Y' for each super block row of Cb and Cr.*/
   _enc->mcu_nvsbs=1<<vdec;
   mcu_nmbs=_enc->mcu_nvsbs*_enc->state.fplanes[0].nhsbs*(size_t)4;
-  mcu_nfrags=4*mcu_nmbs+(8*mcu_nmbs>>hdec+vdec);
+  mcu_ncfrags=mcu_nmbs<<3-(hdec+vdec);
+  mcu_nfrags=4*mcu_nmbs+mcu_ncfrags;
   _enc->mcu_skip_ssd=(unsigned *)_ogg_malloc(
    mcu_nfrags*sizeof(*_enc->mcu_skip_ssd));
+  _enc->mcu_rd_scale=(ogg_uint16_t *)_ogg_malloc(
+   (mcu_ncfrags>>1)*sizeof(*_enc->mcu_rd_scale));
+  _enc->mcu_rd_iscale=(ogg_uint16_t *)_ogg_malloc(
+   (mcu_ncfrags>>1)*sizeof(*_enc->mcu_rd_iscale));
   for(pli=0;pli<3;pli++){
     _enc->dct_tokens[pli]=(unsigned char **)oc_malloc_2d(64,
      _enc->state.fplanes[pli].nfrags,sizeof(**_enc->dct_tokens));
@@ -1075,34 +1184,22 @@ static int oc_enc_init(oc_enc_ctx *_enc,const th_info *_info){
      _enc->state.fplanes[pli].nfrags,sizeof(**_enc->extra_bits));
   }
 #if defined(OC_COLLECT_METRICS)
+  _enc->frag_sad=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_sad));
   _enc->frag_satd=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_satd));
   _enc->frag_ssd=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_ssd));
 #endif
-#if defined(OC_X86_ASM)
-  oc_enc_vtable_init_x86(_enc);
-#else
-  oc_enc_vtable_init_c(_enc);
-#endif
+  _enc->enquant_table_data=(unsigned char *)_ogg_malloc(
+   (64+3)*3*2*_enc->opt_data.enquant_table_size
+   +_enc->opt_data.enquant_table_alignment-1);
   _enc->keyframe_frequency_force=1<<_enc->state.info.keyframe_granule_shift;
   _enc->state.qis[0]=_enc->state.info.quality;
   _enc->state.nqis=1;
+  _enc->activity_avg=90<<12;
+  _enc->luma_avg=128<<8;
   oc_rc_state_init(&_enc->rc,_enc);
   oggpackB_writeinit(&_enc->opb);
-  if(_enc->mb_info==NULL||_enc->frag_dc==NULL||_enc->coded_mbis==NULL||
-   _enc->mcu_skip_ssd==NULL||_enc->dct_tokens[0]==NULL||
-   _enc->dct_tokens[1]==NULL||_enc->dct_tokens[2]==NULL||
-   _enc->extra_bits[0]==NULL||_enc->extra_bits[1]==NULL||
-   _enc->extra_bits[2]==NULL
-#if defined(OC_COLLECT_METRICS)
-   ||_enc->frag_satd==NULL||_enc->frag_ssd==NULL
-#endif
-   ){
-    oc_enc_clear(_enc);
-    return TH_EFAULT;
-  }
-  oc_mode_scheme_chooser_init(&_enc->chooser);
-  oc_enc_mb_info_init(_enc);
-  memset(_enc->huff_idxs,0,sizeof(_enc->huff_idxs));
+  memcpy(_enc->huff_codes,TH_VP31_HUFF_CODES,sizeof(_enc->huff_codes));
+  memset(_enc->qinfo.qi_ranges,0,sizeof(_enc->qinfo.qi_ranges));
   /*Reset the packet-out state machine.*/
   _enc->packet_state=OC_PACKET_INFO_HDR;
   _enc->dup_count=0;
@@ -1114,26 +1211,45 @@ static int oc_enc_init(oc_enc_ctx *_enc,const th_info *_info){
   _enc->vp3_compatible=0;
   /*No INTER frames coded yet.*/
   _enc->coded_inter_frame=0;
-  memcpy(_enc->huff_codes,TH_VP31_HUFF_CODES,sizeof(_enc->huff_codes));
-  oc_enc_set_quant_params(_enc,NULL);
+  if(_enc->mb_info==NULL||_enc->frag_dc==NULL||_enc->coded_mbis==NULL
+   ||_enc->mcu_skip_ssd==NULL||_enc->dct_tokens[0]==NULL
+   ||_enc->dct_tokens[1]==NULL||_enc->dct_tokens[2]==NULL
+   ||_enc->extra_bits[0]==NULL||_enc->extra_bits[1]==NULL
+   ||_enc->extra_bits[2]==NULL
+#if defined(OC_COLLECT_METRICS)
+   ||_enc->frag_sad==NULL||_enc->frag_satd==NULL||_enc->frag_ssd==NULL
+#endif
+   ||oc_enc_set_quant_params(_enc,NULL)<0){
+    oc_enc_clear(_enc);
+    return TH_EFAULT;
+  }
+  oc_mode_scheme_chooser_init(&_enc->chooser);
+  oc_enc_mb_info_init(_enc);
+  memset(_enc->huff_idxs,0,sizeof(_enc->huff_idxs));
   return 0;
 }
 
 static void oc_enc_clear(oc_enc_ctx *_enc){
   int pli;
   oc_rc_state_clear(&_enc->rc);
-#if defined(OC_COLLECT_METRICS)
-  oc_enc_mode_metrics_dump(_enc);
-#endif
   oggpackB_writeclear(&_enc->opb);
+  oc_quant_params_clear(&_enc->qinfo);
+  _ogg_free(_enc->enquant_table_data);
 #if defined(OC_COLLECT_METRICS)
+  /*Save the collected metrics from this run.
+    Use tools/process_modedec_stats to actually generate modedec.h from the
+     resulting file.*/
+  oc_mode_metrics_dump();
   _ogg_free(_enc->frag_ssd);
   _ogg_free(_enc->frag_satd);
+  _ogg_free(_enc->frag_sad);
 #endif
   for(pli=3;pli-->0;){
     oc_free_2d(_enc->extra_bits[pli]);
     oc_free_2d(_enc->dct_tokens[pli]);
   }
+  _ogg_free(_enc->mcu_rd_iscale);
+  _ogg_free(_enc->mcu_rd_scale);
   _ogg_free(_enc->mcu_skip_ssd);
   _ogg_free(_enc->coded_mbis);
   _ogg_free(_enc->frag_dc);
@@ -1145,10 +1261,14 @@ static void oc_enc_drop_frame(th_enc_ctx *_enc){
   /*Use the previous frame's reconstruction.*/
   _enc->state.ref_frame_idx[OC_FRAME_SELF]=
    _enc->state.ref_frame_idx[OC_FRAME_PREV];
+  _enc->state.ref_frame_data[OC_FRAME_SELF]=
+   _enc->state.ref_frame_data[OC_FRAME_PREV];
   /*Flag motion vector analysis about the frame drop.*/
   _enc->prevframe_dropped=1;
   /*Zero the packet.*/
   oggpackB_reset(&_enc->opb);
+  /*Emit an inter frame with no coded blocks in VP3-compatibility mode.*/
+  if(_enc->vp3_compatible)oc_enc_drop_frame_pack(_enc);
 }
 
 static void oc_enc_compress_keyframe(oc_enc_ctx *_enc,int _recode){
@@ -1222,9 +1342,9 @@ static void oc_enc_set_granpos(oc_enc_ctx *_enc){
 th_enc_ctx *th_encode_alloc(const th_info *_info){
   oc_enc_ctx *enc;
   if(_info==NULL)return NULL;
-  enc=_ogg_malloc(sizeof(*enc));
+  enc=oc_aligned_malloc(sizeof(*enc),16);
   if(enc==NULL||oc_enc_init(enc,_info)<0){
-    _ogg_free(enc);
+    oc_aligned_free(enc);
     return NULL;
   }
   return enc;
@@ -1233,7 +1353,7 @@ th_enc_ctx *th_encode_alloc(const th_info *_info){
 void th_encode_free(th_enc_ctx *_enc){
   if(_enc!=NULL){
     oc_enc_clear(_enc);
-    _ogg_free(_enc);
+    oc_aligned_free(_enc);
   }
 }
 
@@ -1272,12 +1392,17 @@ int th_encode_ctl(th_enc_ctx *_enc,int _req,void *_buf,size_t _buf_sz){
     }break;
     case TH_ENCCTL_SET_VP3_COMPATIBLE:{
       int vp3_compatible;
+      int ret;
       if(_enc==NULL||_buf==NULL)return TH_EFAULT;
       if(_buf_sz!=sizeof(vp3_compatible))return TH_EINVAL;
+      /*Try this before we change anything else, because it can fail.*/
+      ret=oc_enc_set_quant_params(_enc,&TH_VP31_QUANT_INFO);
+      /*If we can't allocate enough memory, don't change any of the state.*/
+      if(ret==TH_EFAULT)return ret;
       vp3_compatible=*(int *)_buf;
       _enc->vp3_compatible=vp3_compatible;
       if(oc_enc_set_huffman_codes(_enc,TH_VP31_HUFF_CODES)<0)vp3_compatible=0;
-      if(oc_enc_set_quant_params(_enc,&TH_VP31_QUANT_INFO)<0)vp3_compatible=0;
+      if(ret<0)vp3_compatible=0;
       if(_enc->state.info.pixel_fmt!=TH_PF_420||
        _enc->state.info.pic_width<_enc->state.info.frame_width||
        _enc->state.info.pic_height<_enc->state.info.frame_height||
@@ -1386,6 +1511,44 @@ int th_encode_ctl(th_enc_ctx *_enc,int _req,void *_buf,size_t _buf_sz){
       }
       return oc_enc_rc_2pass_in(_enc,_buf,_buf_sz);
     }break;
+    case TH_ENCCTL_SET_COMPAT_CONFIG:{
+      unsigned char buf[7];
+      oc_pack_buf   opb;
+      th_quant_info qinfo;
+      th_huff_code  huff_codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
+      int           ret;
+      int           i;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_enc->packet_state>OC_PACKET_SETUP_HDR)return TH_EINVAL;
+      oc_pack_readinit(&opb,_buf,_buf_sz);
+      /*Validate the setup packet header.*/
+      for(i=0;i<7;i++)buf[i]=(unsigned char)oc_pack_read(&opb,8);
+      if(!(buf[0]&0x80)||memcmp(buf+1,"theora",6)!=0)return TH_ENOTFORMAT;
+      if(buf[0]!=0x82)return TH_EBADHEADER;
+      /*Reads its contents.*/
+      ret=oc_quant_params_unpack(&opb,&qinfo);
+      if(ret<0){
+        oc_quant_params_clear(&qinfo);
+        return ret;
+      }
+      ret=oc_huff_codes_unpack(&opb,huff_codes);
+      if(ret<0){
+        oc_quant_params_clear(&qinfo);
+        return ret;
+      }
+      /*Install the new state.*/
+      oc_quant_params_clear(&_enc->qinfo);
+      memcpy(&_enc->qinfo,&qinfo,sizeof(qinfo));
+      oc_enc_quant_params_updated(_enc,&qinfo);
+      memcpy(_enc->huff_codes,huff_codes,sizeof(_enc->huff_codes));
+      return 0;
+    }
+#if defined(OC_COLLECT_METRICS)
+    case TH_ENCCTL_SET_METRICS_FILE:{
+      OC_MODE_METRICS_FILENAME=(const char *)_buf;
+      return 0;
+    }
+#endif
     default:return TH_EIMPL;
   }
 }
@@ -1477,6 +1640,12 @@ static void oc_img_plane_copy_pad(th_img_plane *_dst,th_img_plane *_src,
 
 int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _img){
   th_ycbcr_buffer img;
+  int             frame_width;
+  int             frame_height;
+  int             pic_width;
+  int             pic_height;
+  int             pic_x;
+  int             pic_y;
   int             cframe_width;
   int             cframe_height;
   int             cpic_width;
@@ -1492,53 +1661,94 @@ int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _img){
   if(_enc==NULL||_img==NULL)return TH_EFAULT;
   if(_enc->packet_state==OC_PACKET_DONE)return TH_EINVAL;
   if(_enc->rc.twopass&&_enc->rc.twopass_buffer_bytes==0)return TH_EINVAL;
-  if((ogg_uint32_t)_img[0].width!=_enc->state.info.frame_width||
-   (ogg_uint32_t)_img[0].height!=_enc->state.info.frame_height){
-    return TH_EINVAL;
-  }
   hdec=!(_enc->state.info.pixel_fmt&1);
   vdec=!(_enc->state.info.pixel_fmt&2);
-  cframe_width=_enc->state.info.frame_width>>hdec;
-  cframe_height=_enc->state.info.frame_height>>vdec;
-  if(_img[1].width!=cframe_width||_img[2].width!=cframe_width||
-   _img[1].height!=cframe_height||_img[2].height!=cframe_height){
-    return TH_EINVAL;
-  }
-  /*Step 2: Copy the input to our internal buffer.
-    This lets us add padding, if necessary, so we don't have to worry about
-     dereferencing possibly invalid addresses, and allows us to use the same
-     strides and fragment offsets for both the input frame and the reference
-     frames.*/
+  frame_width=_enc->state.info.frame_width;
+  frame_height=_enc->state.info.frame_height;
+  pic_x=_enc->state.info.pic_x;
+  pic_y=_enc->state.info.pic_y;
+  pic_width=_enc->state.info.pic_width;
+  pic_height=_enc->state.info.pic_height;
+  cframe_width=frame_width>>hdec;
+  cframe_height=frame_height>>vdec;
+  cpic_x=pic_x>>hdec;
+  cpic_y=pic_y>>vdec;
+  cpic_width=(pic_x+pic_width+hdec>>hdec)-cpic_x;
+  cpic_height=(pic_y+pic_height+vdec>>vdec)-cpic_y;
   /*Flip the input buffer upside down.*/
   oc_ycbcr_buffer_flip(img,_img);
-  oc_img_plane_copy_pad(_enc->state.ref_frame_bufs[OC_FRAME_IO]+0,img+0,
-   _enc->state.info.pic_x,_enc->state.info.pic_y,
-   _enc->state.info.pic_width,_enc->state.info.pic_height);
-  cpic_x=_enc->state.info.pic_x>>hdec;
-  cpic_y=_enc->state.info.pic_y>>vdec;
-  cpic_width=(_enc->state.info.pic_x+_enc->state.info.pic_width+hdec>>hdec)
-   -cpic_x;
-  cpic_height=(_enc->state.info.pic_y+_enc->state.info.pic_height+vdec>>vdec)
-   -cpic_y;
-  for(pli=1;pli<3;pli++){
-    oc_img_plane_copy_pad(_enc->state.ref_frame_bufs[OC_FRAME_IO]+pli,img+pli,
-     cpic_x,cpic_y,cpic_width,cpic_height);
+  if(img[0].width!=frame_width||img[0].height!=frame_height||
+   img[1].width!=cframe_width||img[2].width!=cframe_width||
+   img[1].height!=cframe_height||img[2].height!=cframe_height){
+    /*The buffer does not match the frame size.
+      Check to see if it matches the picture size.*/
+    if(img[0].width!=pic_width||img[0].height!=pic_height||
+     img[1].width!=cpic_width||img[2].width!=cpic_width||
+     img[1].height!=cpic_height||img[2].height!=cpic_height){
+      /*It doesn't; we don't know how to handle it.*/
+      return TH_EINVAL;
+    }
+    /*Adjust the pointers to address a full frame.
+      We still only use the picture region, however.*/
+    img[0].data-=pic_y*(ptrdiff_t)img[0].stride+pic_x;
+    img[1].data-=cpic_y*(ptrdiff_t)img[1].stride+cpic_x;
+    img[2].data-=cpic_y*(ptrdiff_t)img[2].stride+cpic_x;
   }
-  /*Step 3: Update the buffer state.*/
+  /*Step 2: Update the buffer state.*/
   if(_enc->state.ref_frame_idx[OC_FRAME_SELF]>=0){
     _enc->state.ref_frame_idx[OC_FRAME_PREV]=
      _enc->state.ref_frame_idx[OC_FRAME_SELF];
+    _enc->state.ref_frame_data[OC_FRAME_PREV]=
+     _enc->state.ref_frame_data[OC_FRAME_SELF];
     if(_enc->state.frame_type==OC_INTRA_FRAME){
       /*The new frame becomes both the previous and gold reference frames.*/
       _enc->state.keyframe_num=_enc->state.curframe_num;
       _enc->state.ref_frame_idx[OC_FRAME_GOLD]=
        _enc->state.ref_frame_idx[OC_FRAME_SELF];
+      _enc->state.ref_frame_data[OC_FRAME_GOLD]=
+       _enc->state.ref_frame_data[OC_FRAME_SELF];
+    }
+  }
+  if(_enc->state.ref_frame_idx[OC_FRAME_IO]>=0&&_enc->prevframe_dropped==0){
+    _enc->state.ref_frame_idx[OC_FRAME_PREV_ORIG]=
+     _enc->state.ref_frame_idx[OC_FRAME_IO];
+    _enc->state.ref_frame_data[OC_FRAME_PREV_ORIG]=
+     _enc->state.ref_frame_data[OC_FRAME_IO];
+    if(_enc->state.frame_type==OC_INTRA_FRAME){
+      /*The new input frame becomes both the previous and gold
+         original-reference frames.*/
+      _enc->state.ref_frame_idx[OC_FRAME_GOLD_ORIG]=
+       _enc->state.ref_frame_idx[OC_FRAME_IO];
+      _enc->state.ref_frame_data[OC_FRAME_GOLD_ORIG]=
+       _enc->state.ref_frame_data[OC_FRAME_IO];
     }
   }
+  /*Select a free buffer to use for the incoming frame*/
+  for(refi=3;refi==_enc->state.ref_frame_idx[OC_FRAME_GOLD_ORIG]||
+   refi==_enc->state.ref_frame_idx[OC_FRAME_PREV_ORIG];refi++);
+  _enc->state.ref_frame_idx[OC_FRAME_IO]=refi;
+  _enc->state.ref_frame_data[OC_FRAME_IO]=
+   _enc->state.ref_frame_bufs[refi][0].data;
+  /*Step 3: Copy the input to our internal buffer.
+    This lets us add padding, so we don't have to worry about dereferencing
+     possibly invalid addresses, and allows us to use the same strides and
+     fragment offsets for both the input frame and the reference frames.*/
+  oc_img_plane_copy_pad(_enc->state.ref_frame_bufs[refi]+0,img+0,
+   pic_x,pic_y,pic_width,pic_height);
+  oc_state_borders_fill_rows(&_enc->state,refi,0,0,frame_height);
+  oc_state_borders_fill_caps(&_enc->state,refi,0);
+  for(pli=1;pli<3;pli++){
+    oc_img_plane_copy_pad(_enc->state.ref_frame_bufs[refi]+pli,img+pli,
+     cpic_x,cpic_y,cpic_width,cpic_height);
+    oc_state_borders_fill_rows(&_enc->state,refi,pli,0,cframe_height);
+    oc_state_borders_fill_caps(&_enc->state,refi,pli);
+  }
   /*Select a free buffer to use for the reconstructed version of this frame.*/
   for(refi=0;refi==_enc->state.ref_frame_idx[OC_FRAME_GOLD]||
    refi==_enc->state.ref_frame_idx[OC_FRAME_PREV];refi++);
   _enc->state.ref_frame_idx[OC_FRAME_SELF]=refi;
+  _enc->state.ref_frame_data[OC_FRAME_SELF]=
+   _enc->state.ref_frame_bufs[refi][0].data;
   _enc->state.curframe_num+=_enc->prev_dup_count+1;
   /*Step 4: Compress the frame.*/
   /*Start with a keyframe, and don't allow the generation of invalid files that
@@ -1575,11 +1785,11 @@ int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _img){
 }
 
 int th_encode_packetout(th_enc_ctx *_enc,int _last_p,ogg_packet *_op){
+  unsigned char *packet;
   if(_enc==NULL||_op==NULL)return TH_EFAULT;
   if(_enc->packet_state==OC_PACKET_READY){
     _enc->packet_state=OC_PACKET_EMPTY;
     if(_enc->rc.twopass!=1){
-      unsigned char *packet;
       packet=oggpackB_get_buffer(&_enc->opb);
       /*If there's no packet, malloc failed while writing; it's lost forever.*/
       if(packet==NULL)return TH_EFAULT;
@@ -1595,8 +1805,22 @@ int th_encode_packetout(th_enc_ctx *_enc,int _last_p,ogg_packet *_op){
   else if(_enc->packet_state==OC_PACKET_EMPTY){
     if(_enc->nqueued_dups>0){
       _enc->nqueued_dups--;
-      _op->packet=NULL;
-      _op->bytes=0;
+      /*Emit an inter frame with no coded blocks in VP3-compatibility mode.*/
+      if(_enc->vp3_compatible){
+        oggpackB_reset(&_enc->opb);
+        oc_enc_drop_frame_pack(_enc);
+        packet=oggpackB_get_buffer(&_enc->opb);
+        /*If there's no packet, malloc failed while writing; it's lost
+           forever.*/
+        if(packet==NULL)return TH_EFAULT;
+        _op->packet=packet;
+        _op->bytes=oggpackB_bytes(&_enc->opb);
+      }
+      /*Otherwise emit a 0-byte packet.*/
+      else{
+        _op->packet=NULL;
+        _op->bytes=0;
+      }
     }
     else{
       if(_last_p)_enc->packet_state=OC_PACKET_DONE;
diff --git a/thirdparty/libtheora/encoder_disabled.c b/thirdparty/libtheora/encoder_disabled.c
index 0cbf6645ac..ba6d995505 100644
--- a/thirdparty/libtheora/encoder_disabled.c
+++ b/thirdparty/libtheora/encoder_disabled.c
@@ -11,12 +11,15 @@
  ********************************************************************
 
   function:
-  last mod: $Id: encoder_disabled.c 16503 2009-08-22 18:14:02Z giles $
+  last mod: $Id$
 
  ********************************************************************/
 #include "apiwrapper.h"
 #include "encint.h"
 
+/*Zero-filled definitions; {0} is used as "={}" is not valid C before C23.*/
+const th_quant_info TH_VP31_QUANT_INFO = {0};
+const th_huff_code TH_VP31_HUFF_CODES[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS] = {0};
 th_enc_ctx *th_encode_alloc(const th_info *_info){
   return NULL;
 }
diff --git a/thirdparty/libtheora/enquant.c b/thirdparty/libtheora/enquant.c
index 3372fed221..8fd220edd7 100644
--- a/thirdparty/libtheora/enquant.c
+++ b/thirdparty/libtheora/enquant.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-  last mod: $Id: enquant.c 16503 2009-08-22 18:14:02Z giles $
+  last mod: $Id$
 
  ********************************************************************/
 #include <stdlib.h>
@@ -20,6 +20,69 @@
 
 
 
+/*Makes _dst an independent deep copy of _src, sharing arrays between entries
+   of _dst->qi_ranges only where oc_quant_params_clear() can cope with it.
+  Return: 0 on success, or TH_EFAULT if an allocation fails, in which case the
+   caller is responsible for cleaning up the partially constructed _dst.*/
+int oc_quant_params_clone(th_quant_info *_dst,const th_quant_info *_src){
+  int idx;
+  memcpy(_dst,_src,sizeof(*_dst));
+  /*The range pointers just copied still refer to _src's arrays; drop them.*/
+  memset(_dst->qi_ranges,0,sizeof(_dst->qi_ranges));
+  for(idx=0;idx<6;idx++){
+    const th_quant_ranges *src;
+    th_quant_ranges       *dst;
+    int                    qti;
+    int                    pli;
+    int                    prev_qti;
+    int                    prev_pli;
+    int                    count;
+    int                    like_prev;
+    int                    like_qt0;
+    qti=idx/3;
+    pli=idx%3;
+    prev_qti=(idx-1)/3;
+    prev_pli=(idx-1)%3;
+    src=&_src->qi_ranges[qti][pli];
+    dst=&_dst->qi_ranges[qti][pli];
+    count=src->nranges;
+    /*Only these two kinds of duplicates suit oc_quant_params_clear().*/
+    like_prev=idx>0&&count<=_src->qi_ranges[prev_qti][prev_pli].nranges;
+    like_qt0=qti>0&&count<=_src->qi_ranges[0][pli].nranges;
+    dst->nranges=count;
+    if(like_prev&&src->sizes==_src->qi_ranges[prev_qti][prev_pli].sizes){
+      dst->sizes=_dst->qi_ranges[prev_qti][prev_pli].sizes;
+    }
+    else if(like_qt0&&src->sizes==_src->qi_ranges[0][pli].sizes){
+      dst->sizes=_dst->qi_ranges[0][pli].sizes;
+    }
+    else{
+      int *sizes;
+      sizes=(int *)_ogg_malloc(count*sizeof(*sizes));
+      if(sizes==NULL)return TH_EFAULT;
+      memcpy(sizes,src->sizes,count*sizeof(*sizes));
+      dst->sizes=sizes;
+    }
+    if(like_prev
+     &&src->base_matrices==_src->qi_ranges[prev_qti][prev_pli].base_matrices){
+      dst->base_matrices=_dst->qi_ranges[prev_qti][prev_pli].base_matrices;
+    }
+    else if(like_qt0&&src->base_matrices==_src->qi_ranges[0][pli].base_matrices){
+      dst->base_matrices=_dst->qi_ranges[0][pli].base_matrices;
+    }
+    else{
+      th_quant_base *base_matrices;
+      size_t         nbytes;
+      nbytes=(count+1)*sizeof(*base_matrices);
+      base_matrices=(th_quant_base *)_ogg_malloc(nbytes);
+      if(base_matrices==NULL)return TH_EFAULT;
+      memcpy(base_matrices,src->base_matrices,nbytes);
+      dst->base_matrices=(const th_quant_base *)base_matrices;
+    }
+  }
+  return 0;
+}
+
 void oc_quant_params_pack(oggpack_buffer *_opb,const th_quant_info *_qinfo){
   const th_quant_ranges *qranges;
   const th_quant_base   *base_mats[2*3*64];
@@ -119,7 +182,7 @@ void oc_quant_params_pack(oggpack_buffer *_opb,const th_quant_info *_qinfo){
   }
 }
 
-static void oc_iquant_init(oc_iquant *_this,ogg_uint16_t _d){
+void oc_iquant_init(oc_iquant *_this,ogg_uint16_t _d){
   ogg_uint32_t t;
   int          l;
   _d<<=1;
@@ -129,48 +192,61 @@ static void oc_iquant_init(oc_iquant *_this,ogg_uint16_t _d){
   _this->l=l;
 }
 
-/*See comments at oc_dequant_tables_init() for how the quantization tables'
-   storage should be initialized.*/
-void oc_enquant_tables_init(ogg_uint16_t *_dequant[64][3][2],
- oc_iquant *_enquant[64][3][2],const th_quant_info *_qinfo){
-  int qi;
+/*Fills the 64 oc_iquant entries at _enquant, initializing each one from the
+   matching entry of _dequant with oc_iquant_init().*/
+void oc_enc_enquant_table_init_c(void *_enquant,
+ const ogg_uint16_t _dequant[64]){
+  oc_iquant *iq;
+  int        ci;
+  /*VP3.2 used a "sharpness" parameter to control the rounding offset and the
+     size of the dead zone around 0.
+    Tokens are now R-D optimized per block after quantization, so the offset
+     should always be 1/2 and an explicit dead zone is unnecessary.
+    That code is gone, and what floating point code remained is implemented
+     here as equivalent integer code with exact precision.*/
+  iq=(oc_iquant *)_enquant;
+  for(ci=0;ci<64;ci++)oc_iquant_init(iq++,_dequant[ci]);
+}
+
+void oc_enc_enquant_table_fixup_c(void *_enquant[3][3][2],int _nqis){
   int pli;
+  int qii;
   int qti;
-  /*Initialize the dequantization tables first.*/
-  oc_dequant_tables_init(_dequant,NULL,_qinfo);
-  /*Derive the quantization tables directly from the dequantization tables.*/
-  for(qi=0;qi<64;qi++)for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){
-    int zzi;
-    int plj;
-    int qtj;
-    int dupe;
-    dupe=0;
-    for(qtj=0;qtj<=qti;qtj++){
-      for(plj=0;plj<(qtj<qti?3:pli);plj++){
-        if(_dequant[qi][pli][qti]==_dequant[qi][plj][qtj]){
-          dupe=1;
-          break;
-        }
-      }
-      if(dupe)break;
-    }
-    if(dupe){
-      _enquant[qi][pli][qti]=_enquant[qi][plj][qtj];
-      continue;
-    }
-    /*In the original VP3.2 code, the rounding offset and the size of the
-       dead zone around 0 were controlled by a "sharpness" parameter.
-      We now R-D optimize the tokens for each block after quantization,
-       so the rounding offset should always be 1/2, and an explicit dead
-       zone is unnecessary.
-      Hence, all of that VP3.2 code is gone from here, and the remaining
-       floating point code has been implemented as equivalent integer
-       code with exact precision.*/
-    for(zzi=0;zzi<64;zzi++){
-      oc_iquant_init(_enquant[qi][pli][qti]+zzi,
-       _dequant[qi][pli][qti][zzi]);
+  for(pli=0;pli<3;pli++)for(qii=1;qii<_nqis;qii++)for(qti=0;qti<2;qti++){
+    *((oc_iquant *)_enquant[pli][qii][qti])=
+     *((oc_iquant *)_enquant[pli][0][qti]);
+  }
+}
+
+int oc_enc_quantize_c(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+ const ogg_uint16_t _dequant[64],const void *_enquant){
+  const oc_iquant *enquant;
+  int              nonzero;
+  int              zzi;
+  int              val;
+  int              d;
+  int              s;
+  enquant=(const oc_iquant *)_enquant;
+  nonzero=0;
+  for(zzi=0;zzi<64;zzi++){
+    val=_dct[zzi];
+    d=_dequant[zzi];
+    val=val<<1;
+    if(abs(val)>=d){
+      s=OC_SIGNMASK(val);
+      /*The bias added here rounds ties away from zero, since token
+         optimization can only decrease the magnitude of the quantized
+         value.*/
+      val+=d+s^s;
+      /*Note the arithmetic right shift is not guaranteed by ANSI C.
+        Hopefully no one still uses ones-complement architectures.*/
+      val=((enquant[zzi].m*(ogg_int32_t)val>>16)+val>>enquant[zzi].l)-s;
+      _qdct[zzi]=(ogg_int16_t)val;
+      nonzero=zzi;
     }
+    else _qdct[zzi]=0;
   }
+  return nonzero;
 }
 
 
@@ -226,7 +302,7 @@ static const ogg_uint16_t OC_RPSD[2][64]={
    relative to the total, scaled by 2**16, for each pixel format.
   These values were measured after motion-compensated prediction, before
    quantization, over a large set of test video encoded at all possible rates.
-  TODO: These values are only from INTER frames; it should be re-measured for
+  TODO: These values are only from INTER frames; they should be re-measured for
    INTRA frames.*/
 static const ogg_uint16_t OC_PCD[4][3]={
   {59926, 3038, 2572},
@@ -236,38 +312,58 @@ static const ogg_uint16_t OC_PCD[4][3]={
 };
 
 
-/*Compute an "average" quantizer for each qi level.
-  We do one for INTER and one for INTRA, since their behavior is very
-   different, but average across chroma channels.
+/*Compute "average" quantizers for each qi level to use for rate control.
+  We do one for each color channel, as well as an average across color
+   channels, separately for INTER and INTRA, since their behavior is very
+   different.
   The basic approach is to compute a harmonic average of the squared quantizer,
    weighted by the expected squared magnitude of the DCT coefficients.
   Under the (not quite true) assumption that DCT coefficients are
    Laplacian-distributed, this preserves the product Q*lambda, where
    lambda=sqrt(2/sigma**2) is the Laplacian distribution parameter (not to be
    confused with the lambda used in R-D optimization throughout most of the
-   rest of the code).
-  The value Q*lambda completely determines the entropy of the coefficients.*/
+   rest of the code), when the distributions from multiple coefficients are
+   pooled.
+  The value Q*lambda completely determines the entropy of coefficients drawn
+   from a Laplacian distribution, and thus the expected bitrate.*/
 void oc_enquant_qavg_init(ogg_int64_t _log_qavg[2][64],
+ ogg_int16_t _log_plq[64][3][2],ogg_uint16_t _chroma_rd_scale[2][64][2],
  ogg_uint16_t *_dequant[64][3][2],int _pixel_fmt){
   int qi;
   int pli;
   int qti;
   int ci;
   for(qti=0;qti<2;qti++)for(qi=0;qi<64;qi++){
-    ogg_int64_t q2;
+    ogg_int64_t  q2;
+    ogg_uint32_t qp[3];
+    ogg_uint32_t cqp;
+    ogg_uint32_t d;
     q2=0;
     for(pli=0;pli<3;pli++){
-      ogg_uint32_t qp;
-      qp=0;
+      qp[pli]=0;
       for(ci=0;ci<64;ci++){
         unsigned rq;
         unsigned qd;
         qd=_dequant[qi][pli][qti][OC_IZIG_ZAG[ci]];
         rq=(OC_RPSD[qti][ci]+(qd>>1))/qd;
-        qp+=rq*(ogg_uint32_t)rq;
+        qp[pli]+=rq*(ogg_uint32_t)rq;
       }
-      q2+=OC_PCD[_pixel_fmt][pli]*(ogg_int64_t)qp;
+      q2+=OC_PCD[_pixel_fmt][pli]*(ogg_int64_t)qp[pli];
+      /*plq=1.0/sqrt(qp)*/
+      _log_plq[qi][pli][qti]=
+       (ogg_int16_t)(OC_Q10(32)-oc_blog32_q10(qp[pli])>>1);
     }
+    d=OC_PCD[_pixel_fmt][1]+OC_PCD[_pixel_fmt][2];
+    cqp=(ogg_uint32_t)((OC_PCD[_pixel_fmt][1]*(ogg_int64_t)qp[1]+
+     OC_PCD[_pixel_fmt][2]*(ogg_int64_t)qp[2]+(d>>1))/d);
+    /*chroma_rd_scale=clamp(0.25,cqp/qp[0],4)*/
+    d=OC_MAXI(qp[0]+(1<<OC_RD_SCALE_BITS-1)>>OC_RD_SCALE_BITS,1);
+    d=OC_CLAMPI(1<<OC_RD_SCALE_BITS-2,(cqp+(d>>1))/d,4<<OC_RD_SCALE_BITS);
+    _chroma_rd_scale[qti][qi][0]=(ogg_int16_t)d;
+    /*chroma_rd_iscale=clamp(0.25,qp[0]/cqp,4)*/
+    d=OC_MAXI(OC_RD_ISCALE(cqp,1),1);
+    d=OC_CLAMPI(1<<OC_RD_ISCALE_BITS-2,(qp[0]+(d>>1))/d,4<<OC_RD_ISCALE_BITS);
+    _chroma_rd_scale[qti][qi][1]=(ogg_int16_t)d;
     /*qavg=1.0/sqrt(q2).*/
     _log_qavg[qti][qi]=OC_Q57(48)-oc_blog64(q2)>>1;
   }
diff --git a/thirdparty/libtheora/enquant.h b/thirdparty/libtheora/enquant.h
index d62df10d1a..e5f78144cc 100644
--- a/thirdparty/libtheora/enquant.h
+++ b/thirdparty/libtheora/enquant.h
@@ -14,14 +14,13 @@ struct oc_iquant{
   ogg_int16_t l;
 };
 
-typedef oc_iquant        oc_iquant_table[64];
-
 
 
+int oc_quant_params_clone(th_quant_info *_dst,const th_quant_info *_src);
 void oc_quant_params_pack(oggpack_buffer *_opb,const th_quant_info *_qinfo);
-void oc_enquant_tables_init(ogg_uint16_t *_dequant[64][3][2],
- oc_iquant *_enquant[64][3][2],const th_quant_info *_qinfo);
+void oc_iquant_init(oc_iquant *_this,ogg_uint16_t _d);
 void oc_enquant_qavg_init(ogg_int64_t _log_qavg[2][64],
+ ogg_int16_t _log_plq[64][3][2],ogg_uint16_t _pl_rd_scale[2][64][2],
  ogg_uint16_t *_dequant[64][3][2],int _pixel_fmt);
 
 #endif
diff --git a/thirdparty/libtheora/fdct.c b/thirdparty/libtheora/fdct.c
index dc3a66f245..9c2f8b0446 100644
--- a/thirdparty/libtheora/fdct.c
+++ b/thirdparty/libtheora/fdct.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-  last mod: $Id: fdct.c 16503 2009-08-22 18:14:02Z giles $
+  last mod: $Id$
 
  ********************************************************************/
 #include "encint.h"
@@ -120,11 +120,6 @@ static void oc_fdct8(ogg_int16_t _y[8],const ogg_int16_t *_x){
   _y[7]=v;
 }
 
-void oc_enc_fdct8x8(const oc_enc_ctx *_enc,ogg_int16_t _y[64],
- const ogg_int16_t _x[64]){
-  (*_enc->opt_vtable.fdct8x8)(_y,_x);
-}
-
 /*Performs a forward 8x8 Type-II DCT transform.
   The output is scaled by a factor of 4 relative to the orthonormal version
    of the transform.
@@ -152,7 +147,7 @@ void oc_enc_fdct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
   /*Round the result back to the external working precision (which is still
      scaled by four relative to the orthogonal result).
     TODO: We should just update the external working precision.*/
-  for(i=0;i<64;i++)_y[i]=w[i]+2>>2;
+  for(i=0;i<64;i++)_y[i]=w[OC_FZIG_ZAG[i]]+2>>2;
 }
 
 
diff --git a/thirdparty/libtheora/fragment.c b/thirdparty/libtheora/fragment.c
index 15372e9d9f..14c38be507 100644
--- a/thirdparty/libtheora/fragment.c
+++ b/thirdparty/libtheora/fragment.c
@@ -11,17 +11,12 @@
  ********************************************************************
 
   function:
-    last mod: $Id: fragment.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 #include <string.h>
 #include "internal.h"
 
-void oc_frag_copy(const oc_theora_state *_state,unsigned char *_dst,
- const unsigned char *_src,int _ystride){
-  (*_state->opt_vtable.frag_copy)(_dst,_src,_ystride);
-}
-
 void oc_frag_copy_c(unsigned char *_dst,const unsigned char *_src,int _ystride){
   int i;
   for(i=8;i-->0;){
@@ -31,9 +26,24 @@ void oc_frag_copy_c(unsigned char *_dst,const unsigned char *_src,int _ystride){
   }
 }
 
-void oc_frag_recon_intra(const oc_theora_state *_state,unsigned char *_dst,
- int _ystride,const ogg_int16_t _residue[64]){
-  _state->opt_vtable.frag_recon_intra(_dst,_ystride,_residue);
+/*Copies each listed fragment from one reference frame into the same position
+   in another, one oc_frag_copy_c() call per fragment.
+  _dst_frame:     The reference frame that receives the fragments.
+  _src_frame:     The reference frame the fragments are read from.
+  _ystride:       The row stride of the reference frames.
+  _fragis:        The list of fragment indices to copy.
+  _nfragis:       The number of entries of _fragis to process.
+  _frag_buf_offs: Fragment offsets in the reference frames, looked up by
+                   fragment index.*/
+void oc_frag_copy_list_c(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
+  ptrdiff_t n;
+  for(n=0;n<_nfragis;n++){
+    ptrdiff_t off;
+    off=_frag_buf_offs[_fragis[n]];
+    oc_frag_copy_c(_dst_frame+off,_src_frame+off,_ystride);
+  }
 }
 
 void oc_frag_recon_intra_c(unsigned char *_dst,int _ystride,
@@ -46,11 +56,6 @@ void oc_frag_recon_intra_c(unsigned char *_dst,int _ystride,
   }
 }
 
-void oc_frag_recon_inter(const oc_theora_state *_state,unsigned char *_dst,
- const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]){
-  _state->opt_vtable.frag_recon_inter(_dst,_src,_ystride,_residue);
-}
-
 void oc_frag_recon_inter_c(unsigned char *_dst,
  const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]){
   int i;
@@ -62,12 +67,6 @@ void oc_frag_recon_inter_c(unsigned char *_dst,
   }
 }
 
-void oc_frag_recon_inter2(const oc_theora_state *_state,unsigned char *_dst,
- const unsigned char *_src1,const unsigned char *_src2,int _ystride,
- const ogg_int16_t _residue[64]){
-  _state->opt_vtable.frag_recon_inter2(_dst,_src1,_src2,_ystride,_residue);
-}
-
 void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
  const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]){
   int i;
@@ -80,8 +79,4 @@ void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
   }
 }
 
-void oc_restore_fpu(const oc_theora_state *_state){
-  _state->opt_vtable.restore_fpu();
-}
-
 void oc_restore_fpu_c(void){}
diff --git a/thirdparty/libtheora/huffdec.c b/thirdparty/libtheora/huffdec.c
index 8cf27f0341..5a83c5f150 100644
--- a/thirdparty/libtheora/huffdec.c
+++ b/thirdparty/libtheora/huffdec.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: huffdec.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
@@ -22,14 +22,60 @@
 #include "decint.h"
 
 
-/*The ANSI offsetof macro is broken on some platforms (e.g., older DECs).*/
-#define _ogg_offsetof(_type,_field)\
- ((size_t)((char *)&((_type *)0)->_field-(char *)0))
 
-/*The number of internal tokens associated with each of the spec tokens.*/
-static const unsigned char OC_DCT_TOKEN_MAP_ENTRIES[TH_NDCT_TOKENS]={
-  1,1,1,4,8,1,1,8,1,1,1,1,1,2,2,2,2,4,8,2,2,2,4,2,2,2,2,2,8,2,4,8
-};
+/*Instead of storing every branching in the tree, subtrees can be collapsed
+   into one node, with a table of size 1<<nbits pointing directly to its
+   descendants nbits levels down.
+  This allows more than one bit to be read at a time, and avoids following all
+   the intermediate branches with next to no increased code complexity once
+   the collapsed tree has been built.
+  We do _not_ require that a subtree be complete to be collapsed, but instead
+   store duplicate pointers in the table, and record the actual depth of the
+   node below its parent.
+  This tells us the number of bits to advance the stream after reaching it.
+
+  This turns out to be equivalent to the method described in \cite{Hash95},
+   without the requirement that codewords be sorted by length.
+  If the codewords were sorted by length (so-called ``canonical-codes''), they
+   could be decoded much faster via either Lindell and Moffat's approach or
+   Hashemian's Condensed Huffman Code approach, the latter of which has an
+   extremely small memory footprint.
+  We can't use Choueka et al.'s finite state machine approach, which is
+   extremely fast, because we can't allow multiple symbols to be output at a
+   time; the codebook can and does change between symbols.
+  It also has very large memory requirements, which impairs cache coherency.
+
+  We store the tree packed in an array of 16-bit integers (words).
+  Each node consists of a single word, followed consecutively by two or more
+   indices of its children.
+  Let n be the value of this first word.
+  This is the number of bits that need to be read to traverse the node, and
+   must be positive.
+  1<<n entries follow in the array, each an index to a child node.
+  If the child is positive, then it is the index of another internal node in
+   the table.
+  If the child is negative or zero, then it is a leaf node.
+  These are stored directly in the child pointer to save space, since they only
+   require a single word.
+  If a leaf node would have been encountered before reading n bits, then it is
+   duplicated the necessary number of times in this table.
+  Leaf nodes pack both a token value and their actual depth in the tree.
+  The token in the leaf node is (-leaf&255).
+  The number of bits that need to be consumed to reach the leaf, starting from
+   the current node, is (-leaf>>8).
+
+  @ARTICLE{Hash95,
+    author="Reza Hashemian",
+    title="Memory Efficient and High-Speed Search {Huffman} Coding",
+    journal="{IEEE} Transactions on Communications",
+    volume=43,
+    number=10,
+    pages="2576--2581",
+    month=Oct,
+    year=1995
+  }*/
+
+
 
 /*The map from external spec-defined tokens to internal tokens.
   This is constructed so that any extra bits read with the original token value
@@ -99,391 +145,371 @@ static const unsigned char OC_DCT_TOKEN_MAP[TH_NDCT_TOKENS]={
   40
 };
 
-/*These three functions are really part of the bitpack.c module, but
-   they are only used here.
-  Declaring local static versions so they can be inlined saves considerable
-   function call overhead.*/
-
-static oc_pb_window oc_pack_refill(oc_pack_buf *_b,int _bits){
-  const unsigned char *ptr;
-  const unsigned char *stop;
-  oc_pb_window         window;
-  int                  available;
-  window=_b->window;
-  available=_b->bits;
-  ptr=_b->ptr;
-  stop=_b->stop;
-  /*This version of _refill() doesn't bother setting eof because we won't
-     check for it after we've started decoding DCT tokens.*/
-  if(ptr>=stop)available=OC_LOTS_OF_BITS;
-  while(available<=OC_PB_WINDOW_SIZE-8){
-    available+=8;
-    window|=(oc_pb_window)*ptr++<<OC_PB_WINDOW_SIZE-available;
-    if(ptr>=stop)available=OC_LOTS_OF_BITS;
-  }
-  _b->ptr=ptr;
-  if(_bits>available)window|=*ptr>>(available&7);
-  _b->bits=available;
-  return window;
-}
-
-
-/*Read in bits without advancing the bit pointer.
-  Here we assume 0<=_bits&&_bits<=32.*/
-static long oc_pack_look(oc_pack_buf *_b,int _bits){
-  oc_pb_window window;
-  int          available;
-  long         result;
-  window=_b->window;
-  available=_b->bits;
-  if(_bits==0)return 0;
-  if(_bits>available)_b->window=window=oc_pack_refill(_b,_bits);
-  result=window>>OC_PB_WINDOW_SIZE-_bits;
-  return result;
-}
-
-/*Advance the bit pointer.*/
-static void oc_pack_adv(oc_pack_buf *_b,int _bits){
-  /*We ignore the special cases for _bits==0 and _bits==32 here, since they are
-     never used actually used.
-    OC_HUFF_SLUSH (defined below) would have to be at least 27 to actually read
-     32 bits in a single go, and would require a 32 GB lookup table (assuming
-     8 byte pointers, since 4 byte pointers couldn't fit such a table).*/
-  _b->window<<=_bits;
-  _b->bits-=_bits;
-}
+/*The log base 2 of number of internal tokens associated with each of the spec
+   tokens (i.e., how many of the extra bits are folded into the token value).
+  Increasing the maximum value beyond 3 will enlarge the amount of stack
+   required for tree construction.*/
+static const unsigned char OC_DCT_TOKEN_MAP_LOG_NENTRIES[TH_NDCT_TOKENS]={
+  0,0,0,2,3,0,0,3,0,0,0,0,0,1,1,1,1,2,3,1,1,1,2,1,1,1,1,1,3,1,2,3
+};
 
 
-/*The log_2 of the size of a lookup table is allowed to grow to relative to
-   the number of unique nodes it contains.
-  E.g., if OC_HUFF_SLUSH is 2, then at most 75% of the space in the tree is
-   wasted (each node will have an amortized cost of at most 20 bytes when using
-   4-byte pointers).
+/*The size a lookup table is allowed to grow to relative to the number of
+   unique nodes it contains.
+  E.g., if OC_HUFF_SLUSH is 4, then at most 75% of the space in the tree is
+   wasted (1/4 of the space must be used).
   Larger numbers can decode tokens with fewer read operations, while smaller
-   numbers may save more space (requiring as little as 8 bytes amortized per
-   node, though there will be more nodes).
+   numbers may save more space.
   With a sample file:
   32233473 read calls are required when no tree collapsing is done (100.0%).
-  19269269 read calls are required when OC_HUFF_SLUSH is 0 (59.8%).
-  11144969 read calls are required when OC_HUFF_SLUSH is 1 (34.6%).
-  10538563 read calls are required when OC_HUFF_SLUSH is 2 (32.7%).
-  10192578 read calls are required when OC_HUFF_SLUSH is 3 (31.6%).
-  Since a value of 1 gets us the vast majority of the speed-up with only a
-   small amount of wasted memory, this is what we use.*/
-#define OC_HUFF_SLUSH (1)
-
-
-/*Determines the size in bytes of a Huffman tree node that represents a
-   subtree of depth _nbits.
-  _nbits: The depth of the subtree.
-          If this is 0, the node is a leaf node.
-          Otherwise 1<<_nbits pointers are allocated for children.
-  Return: The number of bytes required to store the node.*/
-static size_t oc_huff_node_size(int _nbits){
-  size_t size;
-  size=_ogg_offsetof(oc_huff_node,nodes);
-  if(_nbits>0)size+=sizeof(oc_huff_node *)*(1<<_nbits);
-  return size;
-}
-
-static oc_huff_node *oc_huff_node_init(char **_storage,size_t _size,int _nbits){
-  oc_huff_node *ret;
-  ret=(oc_huff_node *)*_storage;
-  ret->nbits=(unsigned char)_nbits;
-  (*_storage)+=_size;
-  return ret;
-}
-
-
-/*Determines the size in bytes of a Huffman tree.
-  _nbits: The depth of the subtree.
-          If this is 0, the node is a leaf node.
-          Otherwise storage for 1<<_nbits pointers are added for children.
-  Return: The number of bytes required to store the tree.*/
-static size_t oc_huff_tree_size(const oc_huff_node *_node){
-  size_t size;
-  size=oc_huff_node_size(_node->nbits);
-  if(_node->nbits){
-    int nchildren;
-    int i;
-    nchildren=1<<_node->nbits;
-    for(i=0;i<nchildren;i+=1<<_node->nbits-_node->nodes[i]->depth){
-      size+=oc_huff_tree_size(_node->nodes[i]);
-    }
-  }
-  return size;
-}
-
-
-/*Unpacks a sub-tree from the given buffer.
-  _opb:      The buffer to unpack from.
-  _binodes:  The nodes to store the sub-tree in.
-  _nbinodes: The number of nodes available for the sub-tree.
-  Return: 0 on success, or a negative value on error.*/
-static int oc_huff_tree_unpack(oc_pack_buf *_opb,
- oc_huff_node *_binodes,int _nbinodes){
-  oc_huff_node *binode;
-  long          bits;
-  int           nused;
-  if(_nbinodes<1)return TH_EBADHEADER;
-  binode=_binodes;
-  nused=0;
-  bits=oc_pack_read1(_opb);
-  if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
-  /*Read an internal node:*/
-  if(!bits){
-    int ret;
-    nused++;
-    binode->nbits=1;
-    binode->depth=1;
-    binode->nodes[0]=_binodes+nused;
-    ret=oc_huff_tree_unpack(_opb,_binodes+nused,_nbinodes-nused);
-    if(ret>=0){
-      nused+=ret;
-      binode->nodes[1]=_binodes+nused;
-      ret=oc_huff_tree_unpack(_opb,_binodes+nused,_nbinodes-nused);
-    }
-    if(ret<0)return ret;
-    nused+=ret;
-  }
-  /*Read a leaf node:*/
-  else{
-    int ntokens;
-    int token;
-    int i;
-    bits=oc_pack_read(_opb,OC_NDCT_TOKEN_BITS);
+  19269269 read calls are required when OC_HUFF_SLUSH is 1 (59.8%).
+  11144969 read calls are required when OC_HUFF_SLUSH is 2 (34.6%).
+  10538563 read calls are required when OC_HUFF_SLUSH is 4 (32.7%).
+  10192578 read calls are required when OC_HUFF_SLUSH is 8 (31.6%).
+  Since a value of 2 gets us the vast majority of the speed-up with only a
+   small amount of wasted memory, this is what we use.
+  This value must be less than 128, or you could create a tree with more than
+   32767 entries, which would overflow the 16-bit words used to index it.*/
+#define OC_HUFF_SLUSH (2)
+/*The root of the tree is on the fast path, and a larger value here is more
+   beneficial than elsewhere in the tree.
+  7 appears to give the best performance, trading off between increased use of
+   the single-read fast path and cache footprint for the tables, though
+   obviously this will depend on your cache size.
+  Using 7 here, the VP3 tables are about twice as large compared to using 2.*/
+#define OC_ROOT_HUFF_SLUSH (7)
+
+
+
+/*Unpacks a Huffman codebook.
+  _opb:    The buffer to unpack from.
+  _tokens: Stores a list of internal tokens, in the order they were found in
+            the codebook, and the lengths of their corresponding codewords.
+           This is enough to completely define the codebook, while minimizing
+            stack usage and avoiding temporary allocations (for platforms
+            where free() is a no-op).
+  Return: The number of internal tokens in the codebook, or a negative value
+   on error.*/
+int oc_huff_tree_unpack(oc_pack_buf *_opb,unsigned char _tokens[256][2]){
+  ogg_uint32_t code;
+  int          len;
+  int          ntokens;
+  int          nleaves;
+  code=0;
+  len=ntokens=nleaves=0;
+  for(;;){
+    long bits;
+    bits=oc_pack_read1(_opb);
+    /*Only process nodes so long as there's more bits in the buffer.*/
     if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
-    /*Find out how many internal tokens we translate this external token into.*/
-    ntokens=OC_DCT_TOKEN_MAP_ENTRIES[bits];
-    if(_nbinodes<2*ntokens-1)return TH_EBADHEADER;
-    /*Fill in a complete binary tree pointing to the internal tokens.*/
-    for(i=1;i<ntokens;i<<=1){
-      int j;
-      binode=_binodes+nused;
-      nused+=i;
-      for(j=0;j<i;j++){
-        binode[j].nbits=1;
-        binode[j].depth=1;
-        binode[j].nodes[0]=_binodes+nused+2*j;
-        binode[j].nodes[1]=_binodes+nused+2*j+1;
-      }
+    /*Read an internal node:*/
+    if(!bits){
+      len++;
+      /*Don't allow codewords longer than 32 bits.*/
+      if(len>32)return TH_EBADHEADER;
     }
-    /*And now the leaf nodes with those tokens.*/
-    token=OC_DCT_TOKEN_MAP[bits];
-    for(i=0;i<ntokens;i++){
-      binode=_binodes+nused++;
-      binode->nbits=0;
-      binode->depth=1;
-      binode->token=token+i;
+    /*Read a leaf node:*/
+    else{
+      ogg_uint32_t code_bit;
+      int          neb;
+      int          nentries;
+      int          token;
+      /*Don't allow more than 32 spec-tokens per codebook.*/
+      if(++nleaves>32)return TH_EBADHEADER;
+      bits=oc_pack_read(_opb,OC_NDCT_TOKEN_BITS);
+      neb=OC_DCT_TOKEN_MAP_LOG_NENTRIES[bits];
+      token=OC_DCT_TOKEN_MAP[bits];
+      nentries=1<<neb;
+      while(nentries-->0){
+        _tokens[ntokens][0]=(unsigned char)token++;
+        _tokens[ntokens][1]=(unsigned char)(len+neb);
+        ntokens++;
+      }
+      code_bit=0x80000000U>>len-1;
+      while(len>0&&(code&code_bit)){
+        code^=code_bit;
+        code_bit<<=1;
+        len--;
+      }
+      if(len<=0)break;
+      code|=code_bit;
     }
   }
-  return nused;
-}
-
-/*Finds the depth of shortest branch of the given sub-tree.
-  The tree must be binary.
-  _binode: The root of the given sub-tree.
-           _binode->nbits must be 0 or 1.
-  Return: The smallest depth of a leaf node in this sub-tree.
-          0 indicates this sub-tree is a leaf node.*/
-static int oc_huff_tree_mindepth(oc_huff_node *_binode){
-  int depth0;
-  int depth1;
-  if(_binode->nbits==0)return 0;
-  depth0=oc_huff_tree_mindepth(_binode->nodes[0]);
-  depth1=oc_huff_tree_mindepth(_binode->nodes[1]);
-  return OC_MINI(depth0,depth1)+1;
-}
-
-/*Finds the number of internal nodes at a given depth, plus the number of
-   leaves at that depth or shallower.
-  The tree must be binary.
-  _binode: The root of the given sub-tree.
-           _binode->nbits must be 0 or 1.
-  Return: The number of entries that would be contained in a jump table of the
-           given depth.*/
-static int oc_huff_tree_occupancy(oc_huff_node *_binode,int _depth){
-  if(_binode->nbits==0||_depth<=0)return 1;
-  else{
-    return oc_huff_tree_occupancy(_binode->nodes[0],_depth-1)+
-     oc_huff_tree_occupancy(_binode->nodes[1],_depth-1);
-  }
+  return ntokens;
 }
 
-/*Makes a copy of the given Huffman tree.
-  _node: The Huffman tree to copy.
-  Return: The copy of the Huffman tree.*/
-static oc_huff_node *oc_huff_tree_copy(const oc_huff_node *_node,
- char **_storage){
-  oc_huff_node *ret;
-  ret=oc_huff_node_init(_storage,oc_huff_node_size(_node->nbits),_node->nbits);
-  ret->depth=_node->depth;
-  if(_node->nbits){
-    int nchildren;
-    int i;
-    int inext;
-    nchildren=1<<_node->nbits;
-    for(i=0;i<nchildren;){
-      ret->nodes[i]=oc_huff_tree_copy(_node->nodes[i],_storage);
-      inext=i+(1<<_node->nbits-ret->nodes[i]->depth);
-      while(++i<inext)ret->nodes[i]=ret->nodes[i-1];
+/*Count how many tokens would be required to fill a subtree at depth _depth.
+  _tokens: A list of internal tokens, in the order they are found in the
+            codebook, and the lengths of their corresponding codewords.
+  _depth:  The depth of the desired node in the corresponding tree structure.
+  Return: The number of tokens that belong to that subtree.*/
+static int oc_huff_subtree_tokens(unsigned char _tokens[][2],int _depth){
+  ogg_uint32_t code;
+  int          ti;
+  code=0;
+  ti=0;
+  do{
+    if(_tokens[ti][1]-_depth<32)code+=0x80000000U>>_tokens[ti++][1]-_depth;
+    else{
+      /*Because of the expanded internal tokens, we can have codewords as long
+         as 35 bits.
+        A single recursion here is enough to advance past them.*/
+      code++;
+      ti+=oc_huff_subtree_tokens(_tokens+ti,_depth+31);
     }
   }
-  else ret->token=_node->token;
-  return ret;
+  while(code<0x80000000U);
+  return ti;
 }
 
-static size_t oc_huff_tree_collapse_size(oc_huff_node *_binode,int _depth){
-  size_t size;
-  int    mindepth;
-  int    depth;
-  int    loccupancy;
-  int    occupancy;
-  if(_binode->nbits!=0&&_depth>0){
-    return oc_huff_tree_collapse_size(_binode->nodes[0],_depth-1)+
-     oc_huff_tree_collapse_size(_binode->nodes[1],_depth-1);
-  }
-  depth=mindepth=oc_huff_tree_mindepth(_binode);
-  occupancy=1<<mindepth;
+/*Compute the number of bits to use for a collapsed tree node at the given
+   depth.
+  _tokens:  A list of internal tokens, in the order they are found in the
+             codebook, and the lengths of their corresponding codewords.
+  _ntokens: The number of tokens corresponding to this tree node.
+  _depth:   The depth of this tree node.
+  Return: The number of bits to use for a collapsed tree node rooted here.
+          This is always at least one, even if this was a leaf node.*/
+static int oc_huff_tree_collapse_depth(unsigned char _tokens[][2],
+ int _ntokens,int _depth){
+  int got_leaves;
+  int loccupancy;
+  int occupancy;
+  int slush;
+  int nbits;
+  int best_nbits;
+  slush=_depth>0?OC_HUFF_SLUSH:OC_ROOT_HUFF_SLUSH;
+  /*It's legal to have a tree with just a single node, which requires no bits
+     to decode and always returns the same token.
+    However, no encoder actually does this (yet).
+    To avoid a special case in oc_huff_token_decode(), we force the number of
+     lookahead bits to be at least one.
+    This will produce a tree that looks ahead one bit and then advances the
+     stream zero bits.*/
+  nbits=1;
+  occupancy=2;
+  got_leaves=1;
   do{
+    int ti;
+    if(got_leaves)best_nbits=nbits;
+    nbits++;
+    got_leaves=0;
     loccupancy=occupancy;
-    occupancy=oc_huff_tree_occupancy(_binode,++depth);
-  }
-  while(occupancy>loccupancy&&occupancy>=1<<OC_MAXI(depth-OC_HUFF_SLUSH,0));
-  depth--;
-  size=oc_huff_node_size(depth);
-  if(depth>0){
-    size+=oc_huff_tree_collapse_size(_binode->nodes[0],depth-1);
-    size+=oc_huff_tree_collapse_size(_binode->nodes[1],depth-1);
+    for(occupancy=ti=0;ti<_ntokens;occupancy++){
+      if(_tokens[ti][1]<_depth+nbits)ti++;
+      else if(_tokens[ti][1]==_depth+nbits){
+        got_leaves=1;
+        ti++;
+      }
+      else ti+=oc_huff_subtree_tokens(_tokens+ti,_depth+nbits);
+    }
   }
-  return size;
+  while(occupancy>loccupancy&&occupancy*slush>=1<<nbits);
+  return best_nbits;
 }
 
-static oc_huff_node *oc_huff_tree_collapse(oc_huff_node *_binode,
- char **_storage);
-
-/*Fills the given nodes table with all the children in the sub-tree at the
-   given depth.
-  The nodes in the sub-tree with a depth less than that stored in the table
-   are freed.
-  The sub-tree must be binary and complete up until the given depth.
-  _nodes:  The nodes table to fill.
-  _binode: The root of the sub-tree to fill it with.
-           _binode->nbits must be 0 or 1.
-  _level:  The current level in the table.
-           0 indicates that the current node should be stored, regardless of
-            whether it is a leaf node or an internal node.
-  _depth:  The depth of the nodes to fill the table with, relative to their
-            parent.*/
-static void oc_huff_node_fill(oc_huff_node **_nodes,
- oc_huff_node *_binode,int _level,int _depth,char **_storage){
-  if(_level<=0||_binode->nbits==0){
-    int i;
-    _binode->depth=(unsigned char)(_depth-_level);
-    _nodes[0]=oc_huff_tree_collapse(_binode,_storage);
-    for(i=1;i<1<<_level;i++)_nodes[i]=_nodes[0];
-  }
-  else{
-    _level--;
-    oc_huff_node_fill(_nodes,_binode->nodes[0],_level,_depth,_storage);
-    _nodes+=1<<_level;
-    oc_huff_node_fill(_nodes,_binode->nodes[1],_level,_depth,_storage);
-  }
+/*Determines the size in words of a Huffman tree node that represents a
+   subtree of depth _nbits.
+  _nbits: The depth of the subtree.
+          This must be greater than zero.
+  Return: The number of words required to store the node.*/
+static size_t oc_huff_node_size(int _nbits){
+  return 1+(1<<_nbits);
 }
 
-/*Finds the largest complete sub-tree rooted at the current node and collapses
-   it into a single node.
-  This procedure is then applied recursively to all the children of that node.
-  _binode: The root of the sub-tree to collapse.
-           _binode->nbits must be 0 or 1.
-  Return: The new root of the collapsed sub-tree.*/
-static oc_huff_node *oc_huff_tree_collapse(oc_huff_node *_binode,
- char **_storage){
-  oc_huff_node *root;
-  size_t        size;
-  int           mindepth;
-  int           depth;
-  int           loccupancy;
-  int           occupancy;
-  depth=mindepth=oc_huff_tree_mindepth(_binode);
-  occupancy=1<<mindepth;
+/*Produces a collapsed-tree representation of the given token list.
+  _tree: The storage for the collapsed Huffman tree.
+         This may be NULL to compute the required storage size instead of
+          constructing the tree.
+  _tokens:  A list of internal tokens, in the order they are found in the
+             codebook, and the lengths of their corresponding codewords.
+  _ntokens: The number of tokens corresponding to this tree node.
+  Return: The number of words required to store the tree.*/
+static size_t oc_huff_tree_collapse(ogg_int16_t *_tree,
+ unsigned char _tokens[][2],int _ntokens){
+  ogg_int16_t   node[34];
+  unsigned char depth[34];
+  unsigned char last[34];
+  size_t        ntree;
+  int           ti;
+  int           l;
+  depth[0]=0;
+  last[0]=(unsigned char)(_ntokens-1);
+  ntree=0;
+  ti=0;
+  l=0;
   do{
-    loccupancy=occupancy;
-    occupancy=oc_huff_tree_occupancy(_binode,++depth);
+    int nbits;
+    nbits=oc_huff_tree_collapse_depth(_tokens+ti,last[l]+1-ti,depth[l]);
+    node[l]=(ogg_int16_t)ntree;
+    ntree+=oc_huff_node_size(nbits);
+    if(_tree!=NULL)_tree[node[l]++]=(ogg_int16_t)nbits;
+    do{
+      while(ti<=last[l]&&_tokens[ti][1]<=depth[l]+nbits){
+        if(_tree!=NULL){
+          ogg_int16_t leaf;
+          int         nentries;
+          nentries=1<<depth[l]+nbits-_tokens[ti][1];
+          leaf=(ogg_int16_t)-(_tokens[ti][1]-depth[l]<<8|_tokens[ti][0]);
+          while(nentries-->0)_tree[node[l]++]=leaf;
+        }
+        ti++;
+      }
+      if(ti<=last[l]){
+        /*We need to recurse*/
+        depth[l+1]=(unsigned char)(depth[l]+nbits);
+        if(_tree!=NULL)_tree[node[l]++]=(ogg_int16_t)ntree;
+        l++;
+        last[l]=
+         (unsigned char)(ti+oc_huff_subtree_tokens(_tokens+ti,depth[l])-1);
+        break;
+      }
+      /*Pop back up a level of recursion.*/
+      else if(l-->0)nbits=depth[l+1]-depth[l];
+    }
+    while(l>=0);
   }
-  while(occupancy>loccupancy&&occupancy>=1<<OC_MAXI(depth-OC_HUFF_SLUSH,0));
-  depth--;
-  if(depth<=1)return oc_huff_tree_copy(_binode,_storage);
-  size=oc_huff_node_size(depth);
-  root=oc_huff_node_init(_storage,size,depth);
-  root->depth=_binode->depth;
-  oc_huff_node_fill(root->nodes,_binode,depth,depth,_storage);
-  return root;
+  while(l>=0);
+  return ntree;
 }
 
 /*Unpacks a set of Huffman trees, and reduces them to a collapsed
    representation.
   _opb:   The buffer to unpack the trees from.
   _nodes: The table to fill with the Huffman trees.
-  Return: 0 on success, or a negative value on error.*/
+  Return: 0 on success, or a negative value on error.
+          The caller is responsible for cleaning up any partially initialized
+           _nodes on failure.*/
 int oc_huff_trees_unpack(oc_pack_buf *_opb,
- oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]){
+ ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]){
   int i;
   for(i=0;i<TH_NHUFFMAN_TABLES;i++){
-    oc_huff_node  nodes[511];
-    char         *storage;
-    size_t        size;
-    int           ret;
+    unsigned char  tokens[256][2];
+    int            ntokens;
+    ogg_int16_t   *tree;
+    size_t         size;
     /*Unpack the full tree into a temporary buffer.*/
-    ret=oc_huff_tree_unpack(_opb,nodes,sizeof(nodes)/sizeof(*nodes));
-    if(ret<0)return ret;
-    /*Figure out how big the collapsed tree will be.*/
-    size=oc_huff_tree_collapse_size(nodes,0);
-    storage=(char *)_ogg_calloc(1,size);
-    if(storage==NULL)return TH_EFAULT;
-    /*And collapse it.*/
-    _nodes[i]=oc_huff_tree_collapse(nodes,&storage);
+    ntokens=oc_huff_tree_unpack(_opb,tokens);
+    if(ntokens<0)return ntokens;
+    /*Figure out how big the collapsed tree will be and allocate space for it.*/
+    size=oc_huff_tree_collapse(NULL,tokens,ntokens);
+    /*This should never happen; if it does it means you set OC_HUFF_SLUSH or
+       OC_ROOT_HUFF_SLUSH too large.*/
+    if(size>32767)return TH_EIMPL;
+    tree=(ogg_int16_t *)_ogg_malloc(size*sizeof(*tree));
+    if(tree==NULL)return TH_EFAULT;
+    /*Construct the collapsed tree.*/
+    oc_huff_tree_collapse(tree,tokens,ntokens);
+    _nodes[i]=tree;
   }
   return 0;
 }
 
+/*Determines the size in words of a Huffman subtree.
+  _tree: The complete Huffman tree.
+  _node: The index of the root of the desired subtree.
+  Return: The number of words required to store the tree.*/
+static size_t oc_huff_tree_size(const ogg_int16_t *_tree,int _node){
+  size_t size;
+  int    nchildren;
+  int    n;
+  int    i;
+  n=_tree[_node];
+  size=oc_huff_node_size(n);
+  nchildren=1<<n;
+  i=0;
+  do{
+    int child;
+    child=_tree[_node+i+1];
+    if(child<=0)i+=1<<n-(-child>>8);
+    else{
+      size+=oc_huff_tree_size(_tree,child);
+      i++;
+    }
+  }
+  while(i<nchildren);
+  return size;
+}
+
 /*Makes a copy of the given set of Huffman trees.
   _dst: The array to store the copy in.
   _src: The array of trees to copy.*/
-int oc_huff_trees_copy(oc_huff_node *_dst[TH_NHUFFMAN_TABLES],
- const oc_huff_node *const _src[TH_NHUFFMAN_TABLES]){
+int oc_huff_trees_copy(ogg_int16_t *_dst[TH_NHUFFMAN_TABLES],
+ const ogg_int16_t *const _src[TH_NHUFFMAN_TABLES]){
+  int total;
   int i;
+  total=0;
   for(i=0;i<TH_NHUFFMAN_TABLES;i++){
-    size_t  size;
-    char   *storage;
-    size=oc_huff_tree_size(_src[i]);
-    storage=(char *)_ogg_calloc(1,size);
-    if(storage==NULL){
+    size_t size;
+    size=oc_huff_tree_size(_src[i],0);
+    total+=size;
+    _dst[i]=(ogg_int16_t *)_ogg_malloc(size*sizeof(*_dst[i]));
+    if(_dst[i]==NULL){
       while(i-->0)_ogg_free(_dst[i]);
       return TH_EFAULT;
     }
-    _dst[i]=oc_huff_tree_copy(_src[i],&storage);
+    memcpy(_dst[i],_src[i],size*sizeof(*_dst[i]));
   }
   return 0;
 }
 
 /*Frees the memory used by a set of Huffman trees.
   _nodes: The array of trees to free.*/
-void oc_huff_trees_clear(oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]){
+void oc_huff_trees_clear(ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]){
   int i;
   for(i=0;i<TH_NHUFFMAN_TABLES;i++)_ogg_free(_nodes[i]);
 }
 
+
 /*Unpacks a single token using the given Huffman tree.
   _opb:  The buffer to unpack the token from.
   _node: The tree to unpack the token with.
   Return: The token value.*/
-int oc_huff_token_decode(oc_pack_buf *_opb,const oc_huff_node *_node){
-  long bits;
-  while(_node->nbits!=0){
-    bits=oc_pack_look(_opb,_node->nbits);
-    _node=_node->nodes[bits];
-    oc_pack_adv(_opb,_node->depth);
+int oc_huff_token_decode_c(oc_pack_buf *_opb,const ogg_int16_t *_tree){
+  const unsigned char *ptr;
+  const unsigned char *stop;
+  oc_pb_window         window;
+  int                  available;
+  long                 bits;
+  int                  node;
+  int                  n;
+  ptr=_opb->ptr;
+  window=_opb->window;
+  stop=_opb->stop;
+  available=_opb->bits;
+  node=0;
+  for(;;){
+    n=_tree[node];
+    if(n>available){
+      unsigned shift;
+      shift=OC_PB_WINDOW_SIZE-available;
+      do{
+        /*We don't bother setting eof because we won't check for it after we've
+           started decoding DCT tokens.*/
+        if(ptr>=stop){
+          shift=(unsigned)-OC_LOTS_OF_BITS;
+          break;
+        }
+        shift-=8;
+        window|=(oc_pb_window)*ptr++<<shift;
+      }
+      while(shift>=8);
+      /*Note: We never request more than 24 bits, so there's no need to fill in
+         the last partial byte here.*/
+      available=OC_PB_WINDOW_SIZE-shift;
+    }
+    bits=window>>OC_PB_WINDOW_SIZE-n;
+    node=_tree[node+1+bits];
+    if(node<=0)break;
+    window<<=n;
+    available-=n;
   }
-  return _node->token;
+  node=-node;
+  n=node>>8;
+  window<<=n;
+  available-=n;
+  _opb->ptr=ptr;
+  _opb->window=window;
+  _opb->bits=available;
+  return node&255;
 }
diff --git a/thirdparty/libtheora/huffdec.h b/thirdparty/libtheora/huffdec.h
index d7ffa0e99b..03d25dcd1e 100644
--- a/thirdparty/libtheora/huffdec.h
+++ b/thirdparty/libtheora/huffdec.h
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: huffdec.h 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
@@ -22,71 +22,11 @@
 
 
 
-typedef struct oc_huff_node oc_huff_node;
-
-/*A node in the Huffman tree.
-  Instead of storing every branching in the tree, subtrees can be collapsed
-   into one node, with a table of size 1<<nbits pointing directly to its
-   descedents nbits levels down.
-  This allows more than one bit to be read at a time, and avoids following all
-   the intermediate branches with next to no increased code complexity once
-   the collapsed tree has been built.
-  We do _not_ require that a subtree be complete to be collapsed, but instead
-   store duplicate pointers in the table, and record the actual depth of the
-   node below its parent.
-  This tells us the number of bits to advance the stream after reaching it.
-
-  This turns out to be equivalent to the method described in \cite{Hash95},
-   without the requirement that codewords be sorted by length.
-  If the codewords were sorted by length (so-called ``canonical-codes''), they
-   could be decoded much faster via either Lindell and Moffat's approach or
-   Hashemian's Condensed Huffman Code approach, the latter of which has an
-   extremely small memory footprint.
-  We can't use Choueka et al.'s finite state machine approach, which is
-   extremely fast, because we can't allow multiple symbols to be output at a
-   time; the codebook can and does change between symbols.
-  It also has very large memory requirements, which impairs cache coherency.
-
-  @ARTICLE{Hash95,
-    author="Reza Hashemian",
-    title="Memory Efficient and High-Speed Search {Huffman} Coding",
-    journal="{IEEE} Transactions on Communications",
-    volume=43,
-    number=10,
-    pages="2576--2581",
-    month=Oct,
-    year=1995
-  }*/
-struct oc_huff_node{
-  /*The number of bits of the code needed to descend through this node.
-    0 indicates a leaf node.
-    Otherwise there are 1<<nbits nodes in the nodes table, which can be
-     indexed by reading nbits bits from the stream.*/
-  unsigned char  nbits;
-  /*The value of a token stored in a leaf node.
-    The value in non-leaf nodes is undefined.*/
-  unsigned char  token;
-  /*The depth of the current node, relative to its parent in the collapsed
-     tree.
-    This can be less than its parent's nbits value, in which case there are
-     1<<nbits-depth copies of this node in the table, and the bitstream should
-     only be advanced depth bits after reaching this node.*/
-  unsigned char  depth;
-  /*The table of child nodes.
-    The ACTUAL size of this array is 1<<nbits, despite what the declaration
-     below claims.
-    The exception is that for leaf nodes the size is 0.*/
-  oc_huff_node  *nodes[2];
-};
-
-
-
 int oc_huff_trees_unpack(oc_pack_buf *_opb,
- oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]);
-int oc_huff_trees_copy(oc_huff_node *_dst[TH_NHUFFMAN_TABLES],
- const oc_huff_node *const _src[TH_NHUFFMAN_TABLES]);
-void oc_huff_trees_clear(oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]);
-int oc_huff_token_decode(oc_pack_buf *_opb,const oc_huff_node *_node);
-
+ ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]);
+int oc_huff_trees_copy(ogg_int16_t *_dst[TH_NHUFFMAN_TABLES],
+ const ogg_int16_t *const _src[TH_NHUFFMAN_TABLES]);
+void oc_huff_trees_clear(ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]);
+int oc_huff_token_decode_c(oc_pack_buf *_opb,const ogg_int16_t *_node);
 
 #endif
diff --git a/thirdparty/libtheora/huffenc.c b/thirdparty/libtheora/huffenc.c
index bf624e0523..77ab584a19 100644
--- a/thirdparty/libtheora/huffenc.c
+++ b/thirdparty/libtheora/huffenc.c
@@ -859,9 +859,10 @@ int oc_huff_codes_pack(oggpack_buffer *_opb,
     /*First, find the maximum code length so we can align all the bit
        patterns.*/
     maxlen=_codes[i][0].nbits;
-    for(j=1;j<TH_NDCT_TOKENS;j++){
-      maxlen=OC_MAXI(_codes[i][j].nbits,maxlen);
-    }
+    for(j=1;j<TH_NDCT_TOKENS;j++)maxlen=OC_MAXI(_codes[i][j].nbits,maxlen);
+    /*It's improbable that a code with more than 32 bits could pass the
+       validation below, but abort early in any case.*/
+    if(maxlen>32)return TH_EINVAL;
     mask=(1<<(maxlen>>1)<<(maxlen+1>>1))-1;
     /*Copy over the codes into our temporary workspace.
       The bit patterns are aligned, and the original entry each code is from
@@ -877,34 +878,89 @@ int oc_huff_codes_pack(oggpack_buffer *_opb,
     /*For each leaf of the tree:*/
     bpos=maxlen;
     for(j=0;j<TH_NDCT_TOKENS;j++){
-      int bit;
-      /*If this code has any bits at all.*/
-      if(entries[j].shift<maxlen){
-        /*Descend into the tree, writing a bit for each branch.*/
-        for(;bpos>entries[j].shift;bpos--)oggpackB_write(_opb,0,1);
-        /*Mark this as a leaf node, and write its value.*/
-        oggpackB_write(_opb,1,1);
-        oggpackB_write(_opb,entries[j].token,5);
-        /*For each 1 branch we've descended, back up the tree until we reach a
-           0 branch.*/
-        bit=1<<bpos;
-        for(;entries[j].pattern&bit;bpos++)bit<<=1;
-        /*Validate the code.*/
-        if(j+1<TH_NDCT_TOKENS){
-          mask=~(bit-1)<<1;
-          /*The next entry should have a 1 bit where we had a 0, and should
-             match our code above that bit.
-            This verifies both fullness and prefix-freeness simultaneously.*/
-          if(!(entries[j+1].pattern&bit)||
-           (entries[j].pattern&mask)!=(entries[j+1].pattern&mask)){
-            return TH_EINVAL;
-          }
+      ogg_uint32_t bit;
+      /*Fail if this code has no bits at all.
+        Technically a codebook with a single 0-bit entry is legal, but the
+         encoder currently does not support codebooks which do not contain all
+         the tokens.*/
+      if(entries[j].shift>=maxlen)return TH_EINVAL;
+      /*Descend into the tree, writing a bit for each branch.*/
+      for(;bpos>entries[j].shift;bpos--)oggpackB_write(_opb,0,1);
+      /*Mark this as a leaf node, and write its value.*/
+      oggpackB_write(_opb,1,1);
+      oggpackB_write(_opb,entries[j].token,5);
+      /*For each 1 branch we've descended, back up the tree until we reach a
+         0 branch.*/
+      bit=(ogg_uint32_t)1<<bpos;
+      for(;entries[j].pattern&bit;bpos++)bit<<=1;
+      /*Validate the code.*/
+      if(j+1<TH_NDCT_TOKENS){
+        mask=~(bit-1)<<1;
+        /*The next entry should have a 1 bit where we had a 0, and should
+           match our code above that bit.
+          This verifies both fullness and prefix-freeness simultaneously.*/
+        if(!(entries[j+1].pattern&bit)||
+         (entries[j].pattern&mask)!=(entries[j+1].pattern&mask)){
+          return TH_EINVAL;
+        }
+      }
+      /*If there are no more codes, we should have ascended back to the top
+         of the tree.*/
+      else if(bpos<maxlen)return TH_EINVAL;
+    }
+  }
+  return 0;
+}
+
+/*This is used to copy the configuration of an existing setup header for use by
+   the encoder.
+  The decoder uses a completely different data structure for the Huffman
+   codebooks.*/
+int oc_huff_codes_unpack(oc_pack_buf *_opb,
+ th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]){
+  int i;
+  for(i=0;i<TH_NHUFFMAN_TABLES;i++){
+    ogg_uint32_t code;
+    int          len;
+    int          nleaves;
+    code=0;
+    len=nleaves=0;
+    memset(_codes[i],0,TH_NDCT_TOKENS*sizeof(*_codes[i]));
+    for(;;){
+      long bits;
+      bits=oc_pack_read1(_opb);
+      /*Only process nodes so long as there are more bits in the buffer.*/
+      if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
+      /*Read an internal node:*/
+      if(!bits){
+        len++;
+        /*Don't allow codewords longer than 32 bits.*/
+        if(len>32)return TH_EBADHEADER;
+      }
+      /*Read a leaf node:*/
+      else{
+        ogg_uint32_t code_bit;
+        /*Don't allow more than 32 tokens per codebook.*/
+        if(++nleaves>32)return TH_EBADHEADER;
+        bits=oc_pack_read(_opb,OC_NDCT_TOKEN_BITS);
+        /*The current encoder does not support codebooks that do not contain
+           all of the tokens.*/
+        if(_codes[i][bits].nbits>0)return TH_EINVAL;
+        _codes[i][bits].pattern=code>>32-len;
+        _codes[i][bits].nbits=len;
+        code_bit=0x80000000U>>len-1;
+        while(len>0&&(code&code_bit)){
+          code^=code_bit;
+          code_bit<<=1;
+          len--;
         }
-        /*If there are no more codes, we should have ascended back to the top
-           of the tree.*/
-        else if(bpos<maxlen)return TH_EINVAL;
+        if(len<=0)break;
+        code|=code_bit;
       }
     }
+    /*The current encoder does not support codebooks that do not contain all of
+       the tokens.*/
+    if(nleaves<32)return TH_EINVAL;
   }
   return 0;
 }
diff --git a/thirdparty/libtheora/huffenc.h b/thirdparty/libtheora/huffenc.h
index c5a3956f1f..0554cc4060 100644
--- a/thirdparty/libtheora/huffenc.h
+++ b/thirdparty/libtheora/huffenc.h
@@ -1,6 +1,7 @@
 #if !defined(_huffenc_H)
 # define _huffenc_H (1)
 # include "huffman.h"
+# include "bitpack.h"
 
 
 
@@ -15,5 +16,7 @@ extern const th_huff_code
 
 int oc_huff_codes_pack(oggpack_buffer *_opb,
  const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]);
+int oc_huff_codes_unpack(oc_pack_buf *_opb,
+ th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]);
 
 #endif
diff --git a/thirdparty/libtheora/huffman.h b/thirdparty/libtheora/huffman.h
index 36cf7572e5..eb805866b9 100644
--- a/thirdparty/libtheora/huffman.h
+++ b/thirdparty/libtheora/huffman.h
@@ -11,12 +11,12 @@
  ********************************************************************
 
   function:
-    last mod: $Id: huffman.h 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
 #if !defined(_huffman_H)
-# define _hufffman_H (1)
+# define _huffman_H (1)
 # include "theora/codec.h"
 # include "ocintrin.h"
 
diff --git a/thirdparty/libtheora/idct.c b/thirdparty/libtheora/idct.c
index 0e68ac7658..838e3ad8ca 100644
--- a/thirdparty/libtheora/idct.c
+++ b/thirdparty/libtheora/idct.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: idct.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
@@ -231,18 +231,18 @@ static void idct8_1(ogg_int16_t *_y,const ogg_int16_t _x[1]){
   _y: The buffer to store the result in.
       This may be the same as _x.
   _x: The input coefficients.*/
-static void oc_idct8x8_3(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
-  const ogg_int16_t *in;
-  ogg_int16_t       *end;
-  ogg_int16_t       *out;
-  ogg_int16_t        w[64];
+static void oc_idct8x8_3(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  ogg_int16_t w[64];
+  int         i;
   /*Transform rows of x into columns of w.*/
   idct8_2(w,_x);
   idct8_1(w+1,_x+8);
   /*Transform rows of w into columns of y.*/
-  for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8_2(out,in);
+  for(i=0;i<8;i++)idct8_2(_y+i,w+i*8);
   /*Adjust for the scale factor.*/
-  for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
+  for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
+  /*Clear input data for next block.*/
+  _x[0]=_x[1]=_x[8]=0;
 }
 
 /*Performs an inverse 8x8 Type-II DCT transform.
@@ -260,20 +260,20 @@ static void oc_idct8x8_3(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
   _y: The buffer to store the result in.
       This may be the same as _x.
   _x: The input coefficients.*/
-static void oc_idct8x8_10(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
-  const ogg_int16_t *in;
-  ogg_int16_t       *end;
-  ogg_int16_t       *out;
-  ogg_int16_t        w[64];
+static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  ogg_int16_t w[64];
+  int         i;
   /*Transform rows of x into columns of w.*/
   idct8_4(w,_x);
   idct8_3(w+1,_x+8);
   idct8_2(w+2,_x+16);
   idct8_1(w+3,_x+24);
   /*Transform rows of w into columns of y.*/
-  for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8_4(out,in);
+  for(i=0;i<8;i++)idct8_4(_y+i,w+i*8);
   /*Adjust for the scale factor.*/
-  for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
+  for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
+  /*Clear input data for next block.*/
+  _x[0]=_x[1]=_x[2]=_x[3]=_x[8]=_x[9]=_x[10]=_x[16]=_x[17]=_x[24]=0;
 }
 
 /*Performs an inverse 8x8 Type-II DCT transform.
@@ -282,28 +282,23 @@ static void oc_idct8x8_10(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
   _y: The buffer to store the result in.
       This may be the same as _x.
   _x: The input coefficients.*/
-static void oc_idct8x8_slow(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
-  const ogg_int16_t *in;
-  ogg_int16_t       *end;
-  ogg_int16_t       *out;
-  ogg_int16_t        w[64];
+static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  ogg_int16_t w[64];
+  int         i;
   /*Transform rows of x into columns of w.*/
-  for(in=_x,out=w,end=out+8;out<end;in+=8,out++)idct8(out,in);
+  for(i=0;i<8;i++)idct8(w+i,_x+i*8);
   /*Transform rows of w into columns of y.*/
-  for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8(out,in);
+  for(i=0;i<8;i++)idct8(_y+i,w+i*8);
   /*Adjust for the scale factor.*/
-  for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
-}
-
-void oc_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64],
- int _last_zzi){
-  (*_state->opt_vtable.idct8x8)(_y,_last_zzi);
+  for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
+  /*Clear input data for next block.*/
+  for(i=0;i<64;i++)_x[i]=0;
 }
 
 /*Performs an inverse 8x8 Type-II DCT transform.
   The input is assumed to be scaled by a factor of 4 relative to orthonormal
    version of the transform.*/
-void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi){
+void oc_idct8x8_c(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
   /*_last_zzi is subtly different from an actual count of the number of
      coefficients we decoded for this block.
     It contains the value of zzi BEFORE the final token in the block was
@@ -329,7 +324,7 @@ void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi){
      gets.
     Needless to say we inherited this approach from VP3.*/
   /*Then perform the iDCT.*/
-  if(_last_zzi<3)oc_idct8x8_3(_y,_y);
-  else if(_last_zzi<10)oc_idct8x8_10(_y,_y);
-  else oc_idct8x8_slow(_y,_y);
+  if(_last_zzi<=3)oc_idct8x8_3(_y,_x);
+  else if(_last_zzi<=10)oc_idct8x8_10(_y,_x);
+  else oc_idct8x8_slow(_y,_x);
 }
diff --git a/thirdparty/libtheora/info.c b/thirdparty/libtheora/info.c
index 6b9762978b..e5cecd2de5 100644
--- a/thirdparty/libtheora/info.c
+++ b/thirdparty/libtheora/info.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: info.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
@@ -54,7 +54,7 @@ void th_comment_init(th_comment *_tc){
   memset(_tc,0,sizeof(*_tc));
 }
 
-void th_comment_add(th_comment *_tc,char *_comment){
+void th_comment_add(th_comment *_tc,const char *_comment){
   char **user_comments;
   int   *comment_lengths;
   int    comment_len;
@@ -75,7 +75,7 @@ void th_comment_add(th_comment *_tc,char *_comment){
   _tc->user_comments[_tc->comments]=NULL;
 }
 
-void th_comment_add_tag(th_comment *_tc,char *_tag,char *_val){
+void th_comment_add_tag(th_comment *_tc,const char *_tag,const char *_val){
   char *comment;
   int   tag_len;
   int   val_len;
@@ -91,7 +91,7 @@ void th_comment_add_tag(th_comment *_tc,char *_tag,char *_val){
   _ogg_free(comment);
 }
 
-char *th_comment_query(th_comment *_tc,char *_tag,int _count){
+char *th_comment_query(th_comment *_tc,const char *_tag,int _count){
   long i;
   int  found;
   int  tag_len;
@@ -107,7 +107,7 @@ char *th_comment_query(th_comment *_tc,char *_tag,int _count){
   return NULL;
 }
 
-int th_comment_query_count(th_comment *_tc,char *_tag){
+int th_comment_query_count(th_comment *_tc,const char *_tag){
   long i;
   int  tag_len;
   int  count;
diff --git a/thirdparty/libtheora/internal.c b/thirdparty/libtheora/internal.c
index 0fe4f63e72..afbb6efae7 100644
--- a/thirdparty/libtheora/internal.c
+++ b/thirdparty/libtheora/internal.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: internal.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
@@ -97,79 +97,29 @@ int oc_ilog(unsigned _v){
 
 
 
-/*The function used to fill in the chroma plane motion vectors for a macro
-   block when 4 different motion vectors are specified in the luma plane.
-  This version is for use with chroma decimated in the X and Y directions
-   (4:2:0).
-  _cbmvs: The chroma block-level motion vectors to fill in.
-  _lbmvs: The luma block-level motion vectors.*/
-static void oc_set_chroma_mvs00(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
-  int dx;
-  int dy;
-  dx=_lbmvs[0][0]+_lbmvs[1][0]+_lbmvs[2][0]+_lbmvs[3][0];
-  dy=_lbmvs[0][1]+_lbmvs[1][1]+_lbmvs[2][1]+_lbmvs[3][1];
-  _cbmvs[0][0]=(signed char)OC_DIV_ROUND_POW2(dx,2,2);
-  _cbmvs[0][1]=(signed char)OC_DIV_ROUND_POW2(dy,2,2);
-}
-
-/*The function used to fill in the chroma plane motion vectors for a macro
-   block when 4 different motion vectors are specified in the luma plane.
-  This version is for use with chroma decimated in the Y direction.
-  _cbmvs: The chroma block-level motion vectors to fill in.
-  _lbmvs: The luma block-level motion vectors.*/
-static void oc_set_chroma_mvs01(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
-  int dx;
-  int dy;
-  dx=_lbmvs[0][0]+_lbmvs[2][0];
-  dy=_lbmvs[0][1]+_lbmvs[2][1];
-  _cbmvs[0][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
-  _cbmvs[0][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
-  dx=_lbmvs[1][0]+_lbmvs[3][0];
-  dy=_lbmvs[1][1]+_lbmvs[3][1];
-  _cbmvs[1][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
-  _cbmvs[1][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
-}
-
-/*The function used to fill in the chroma plane motion vectors for a macro
-   block when 4 different motion vectors are specified in the luma plane.
-  This version is for use with chroma decimated in the X direction (4:2:2).
-  _cbmvs: The chroma block-level motion vectors to fill in.
-  _lbmvs: The luma block-level motion vectors.*/
-static void oc_set_chroma_mvs10(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
-  int dx;
-  int dy;
-  dx=_lbmvs[0][0]+_lbmvs[1][0];
-  dy=_lbmvs[0][1]+_lbmvs[1][1];
-  _cbmvs[0][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
-  _cbmvs[0][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
-  dx=_lbmvs[2][0]+_lbmvs[3][0];
-  dy=_lbmvs[2][1]+_lbmvs[3][1];
-  _cbmvs[2][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
-  _cbmvs[2][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
+void *oc_aligned_malloc(size_t _sz,size_t _align){
+  unsigned char *p;
+  if(_align-1>UCHAR_MAX||(_align&_align-1)||_sz>~(size_t)0-_align)return NULL;
+  p=(unsigned char *)_ogg_malloc(_sz+_align);
+  if(p!=NULL){
+    int offs;
+    offs=((p-(unsigned char *)0)-1&_align-1);
+    p[offs]=offs;
+    p+=offs+1;
+  }
+  return p;
 }
 
-/*The function used to fill in the chroma plane motion vectors for a macro
-   block when 4 different motion vectors are specified in the luma plane.
-  This version is for use with no chroma decimation (4:4:4).
-  _cbmvs: The chroma block-level motion vectors to fill in.
-  _lmbmv: The luma macro-block level motion vector to fill in for use in
-           prediction.
-  _lbmvs: The luma block-level motion vectors.*/
-static void oc_set_chroma_mvs11(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
-  memcpy(_cbmvs,_lbmvs,4*sizeof(_lbmvs[0]));
+void oc_aligned_free(void *_ptr){
+  unsigned char *p;
+  p=(unsigned char *)_ptr;
+  if(p!=NULL){
+    int offs;
+    offs=*--p;
+    _ogg_free(p-offs);
+  }
 }
 
-/*A table of functions used to fill in the chroma plane motion vectors for a
-   macro block when 4 different motion vectors are specified in the luma
-   plane.*/
-const oc_set_chroma_mvs_func OC_SET_CHROMA_MVS_TABLE[TH_PF_NFORMATS]={
-  (oc_set_chroma_mvs_func)oc_set_chroma_mvs00,
-  (oc_set_chroma_mvs_func)oc_set_chroma_mvs01,
-  (oc_set_chroma_mvs_func)oc_set_chroma_mvs10,
-  (oc_set_chroma_mvs_func)oc_set_chroma_mvs11
-};
-
-
 
 void **oc_malloc_2d(size_t _height,size_t _width,size_t _sz){
   size_t  rowsz;
@@ -181,7 +131,6 @@ void **oc_malloc_2d(size_t _height,size_t _width,size_t _sz){
   datsz=rowsz*_height;
   /*Alloc array and row pointers.*/
   ret=(char *)_ogg_malloc(datsz+colsz);
-  if(ret==NULL)return NULL;
   /*Initialize the array.*/
   if(ret!=NULL){
     size_t   i;
@@ -204,7 +153,6 @@ void **oc_calloc_2d(size_t _height,size_t _width,size_t _sz){
   datsz=rowsz*_height;
   /*Alloc array and row pointers.*/
   ret=(char *)_ogg_calloc(datsz+colsz,1);
-  if(ret==NULL)return NULL;
   /*Initialize the array.*/
   if(ret!=NULL){
     size_t   i;
diff --git a/thirdparty/libtheora/internal.h b/thirdparty/libtheora/internal.h
index d81263e13e..53c77b88be 100644
--- a/thirdparty/libtheora/internal.h
+++ b/thirdparty/libtheora/internal.h
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: internal.h 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 #if !defined(_internal_H)
@@ -19,10 +19,20 @@
 # include <stdlib.h>
 # include <limits.h>
 # if defined(HAVE_CONFIG_H)
-#  include <config.h>
+#  include "config.h"
 # endif
 # include "theora/codec.h"
 # include "theora/theora.h"
+# include "ocintrin.h"
+
+# if !defined(__GNUC_PREREQ)
+#  if defined(__GNUC__)&&defined(__GNUC_MINOR__)
+#   define __GNUC_PREREQ(_maj,_min) \
+ ((__GNUC__<<16)+__GNUC_MINOR__>=((_maj)<<16)+(_min))
+#  else
+#   define __GNUC_PREREQ(_maj,_min) 0
+#  endif
+# endif
 
 # if defined(_MSC_VER)
 /*Disable missing EMMS warnings.*/
@@ -31,24 +41,25 @@
 #  pragma warning(disable:4554)
 # endif
 /*You, too, gcc.*/
-# if defined(__GNUC_PREREQ)
-#  if __GNUC_PREREQ(4,2)
-#   pragma GCC diagnostic ignored "-Wparentheses"
-#  endif
+# if __GNUC_PREREQ(4,2)
+#  pragma GCC diagnostic ignored "-Wparentheses"
 # endif
 
-# include "ocintrin.h"
-# include "huffman.h"
-# include "quant.h"
-
-/*Some assembly constructs require aligned operands.*/
-# if defined(OC_X86_ASM)
+/*Some assembly constructs require aligned operands.
+  The following macros are _only_ intended for structure member declarations.
+  Although they will sometimes work on stack variables, gcc will often silently
+   ignore them.
+  A separate set of macros could be made for manual stack alignment, but we
+   don't actually require it anywhere.*/
+# if defined(OC_X86_ASM)||defined(OC_ARM_ASM)
 #  if defined(__GNUC__)
 #   define OC_ALIGN8(expr) expr __attribute__((aligned(8)))
 #   define OC_ALIGN16(expr) expr __attribute__((aligned(16)))
 #  elif defined(_MSC_VER)
 #   define OC_ALIGN8(expr) __declspec (align(8)) expr
 #   define OC_ALIGN16(expr) __declspec (align(16)) expr
+#  else
+#   error "Alignment macros required for this platform."
 #  endif
 # endif
 # if !defined(OC_ALIGN8)
@@ -60,19 +71,8 @@
 
 
 
-typedef struct oc_sb_flags              oc_sb_flags;
-typedef struct oc_border_info           oc_border_info;
-typedef struct oc_fragment              oc_fragment;
-typedef struct oc_fragment_plane        oc_fragment_plane;
-typedef struct oc_base_opt_vtable       oc_base_opt_vtable;
-typedef struct oc_base_opt_data         oc_base_opt_data;
-typedef struct oc_state_dispatch_vtable oc_state_dispatch_vtable;
-typedef struct oc_theora_state          oc_theora_state;
-
-
-
 /*This library's version.*/
-# define OC_VENDOR_STRING "Xiph.Org libtheora 1.1 20090822 (Thusnelda)"
+# define OC_VENDOR_STRING "Xiph.Org libtheora 1.2.0alpha 20100924 (Ptalarbvorm)"
 
 /*Theora bitstream version.*/
 # define TH_VERSION_MAJOR (3)
@@ -83,315 +83,6 @@ typedef struct oc_theora_state          oc_theora_state;
  ((_info)->version_minor>(_min)||(_info)->version_minor==(_min)&& \
  (_info)->version_subminor>=(_sub)))
 
-/*A keyframe.*/
-#define OC_INTRA_FRAME (0)
-/*A predicted frame.*/
-#define OC_INTER_FRAME (1)
-/*A frame of unknown type (frame type decision has not yet been made).*/
-#define OC_UNKWN_FRAME (-1)
-
-/*The amount of padding to add to the reconstructed frame buffers on all
-   sides.
-  This is used to allow unrestricted motion vectors without special casing.
-  This must be a multiple of 2.*/
-#define OC_UMV_PADDING (16)
-
-/*Frame classification indices.*/
-/*The previous golden frame.*/
-#define OC_FRAME_GOLD (0)
-/*The previous frame.*/
-#define OC_FRAME_PREV (1)
-/*The current frame.*/
-#define OC_FRAME_SELF (2)
-
-/*The input or output buffer.*/
-#define OC_FRAME_IO   (3)
-
-/*Macroblock modes.*/
-/*Macro block is invalid: It is never coded.*/
-#define OC_MODE_INVALID        (-1)
-/*Encoded difference from the same macro block in the previous frame.*/
-#define OC_MODE_INTER_NOMV     (0)
-/*Encoded with no motion compensated prediction.*/
-#define OC_MODE_INTRA          (1)
-/*Encoded difference from the previous frame offset by the given motion 
-  vector.*/
-#define OC_MODE_INTER_MV       (2)
-/*Encoded difference from the previous frame offset by the last coded motion 
-  vector.*/
-#define OC_MODE_INTER_MV_LAST  (3)
-/*Encoded difference from the previous frame offset by the second to last 
-  coded motion vector.*/
-#define OC_MODE_INTER_MV_LAST2 (4)
-/*Encoded difference from the same macro block in the previous golden 
-  frame.*/
-#define OC_MODE_GOLDEN_NOMV    (5)
-/*Encoded difference from the previous golden frame offset by the given motion 
-  vector.*/
-#define OC_MODE_GOLDEN_MV      (6)
-/*Encoded difference from the previous frame offset by the individual motion 
-  vectors given for each block.*/
-#define OC_MODE_INTER_MV_FOUR  (7)
-/*The number of (coded) modes.*/
-#define OC_NMODES              (8)
-
-/*Determines the reference frame used for a given MB mode.*/
-#define OC_FRAME_FOR_MODE(_x) \
- OC_UNIBBLE_TABLE32(OC_FRAME_PREV,OC_FRAME_SELF,OC_FRAME_PREV,OC_FRAME_PREV, \
-  OC_FRAME_PREV,OC_FRAME_GOLD,OC_FRAME_GOLD,OC_FRAME_PREV,(_x))
-
-/*Constants for the packet state machine common between encoder and decoder.*/
-
-/*Next packet to emit/read: Codec info header.*/
-#define OC_PACKET_INFO_HDR    (-3)
-/*Next packet to emit/read: Comment header.*/
-#define OC_PACKET_COMMENT_HDR (-2)
-/*Next packet to emit/read: Codec setup header.*/
-#define OC_PACKET_SETUP_HDR   (-1)
-/*No more packets to emit/read.*/
-#define OC_PACKET_DONE        (INT_MAX)
-
-
-
-/*Super blocks are 32x32 segments of pixels in a single color plane indexed
-   in image order.
-  Internally, super blocks are broken up into four quadrants, each of which
-   contains a 2x2 pattern of blocks, each of which is an 8x8 block of pixels.
-  Quadrants, and the blocks within them, are indexed in a special order called
-   a "Hilbert curve" within the super block.
-
-  In order to differentiate between the Hilbert-curve indexing strategy and
-   the regular image order indexing strategy, blocks indexed in image order
-   are called "fragments".
-  Fragments are indexed in image order, left to right, then bottom to top,
-   from Y' plane to Cb plane to Cr plane.
-
-  The co-located fragments in all image planes corresponding to the location
-   of a single quadrant of a luma plane super block form a macro block.
-  Thus there is only a single set of macro blocks for all planes, each of which
-   contains between 6 and 12 fragments, depending on the pixel format.
-  Therefore macro block information is kept in a separate set of arrays from
-   super blocks to avoid unused space in the other planes.
-  The lists are indexed in super block order.
-  That is, the macro block corresponding to the macro block mbi in (luma plane)
-   super block sbi is at index (sbi<<2|mbi).
-  Thus the number of macro blocks in each dimension is always twice the number
-   of super blocks, even when only an odd number fall inside the coded frame.
-  These "extra" macro blocks are just an artifact of our internal data layout,
-   and not part of the coded stream; they are flagged with a negative MB mode.*/
-
-
-
-/*A single quadrant of the map from a super block to fragment numbers.*/
-typedef ptrdiff_t       oc_sb_map_quad[4];
-/*A map from a super block to fragment numbers.*/
-typedef oc_sb_map_quad  oc_sb_map[4];
-/*A single plane of the map from a macro block to fragment numbers.*/
-typedef ptrdiff_t       oc_mb_map_plane[4];
-/*A map from a macro block to fragment numbers.*/
-typedef oc_mb_map_plane oc_mb_map[3];
-/*A motion vector.*/
-typedef signed char     oc_mv[2];
-
-
-
-/*Super block information.*/
-struct oc_sb_flags{
-  unsigned char coded_fully:1;
-  unsigned char coded_partially:1;
-  unsigned char quad_valid:4;
-};
-
-
-
-/*Information about a fragment which intersects the border of the displayable
-   region.
-  This marks which pixels belong to the displayable region.*/
-struct oc_border_info{
-  /*A bit mask marking which pixels are in the displayable region.
-    Pixel (x,y) corresponds to bit (y<<3|x).*/
-  ogg_int64_t mask;
-  /*The number of pixels in the displayable region.
-    This is always positive, and always less than 64.*/
-  int         npixels;
-};
-
-
-
-/*Fragment information.*/
-struct oc_fragment{
-  /*A flag indicating whether or not this fragment is coded.*/
-  unsigned   coded:1;
-  /*A flag indicating that this entire fragment lies outside the displayable
-     region of the frame.
-    Note the contrast with an invalid macro block, which is outside the coded
-     frame, not just the displayable one.
-    There are no fragments outside the coded frame by construction.*/
-  unsigned   invalid:1;
-  /*The index of the quality index used for this fragment's AC coefficients.*/
-  unsigned   qii:6;
-  /*The mode of the macroblock this fragment belongs to.*/
-  unsigned   mb_mode:3;
-  /*The index of the associated border information for fragments which lie
-     partially outside the displayable region.
-    For fragments completely inside or outside this region, this is -1.
-    Note that the C standard requires an explicit signed keyword for bitfield
-     types, since some compilers may treat them as unsigned without it.*/
-  signed int borderi:5;
-  /*The prediction-corrected DC component.
-    Note that the C standard requires an explicit signed keyword for bitfield
-     types, since some compilers may treat them as unsigned without it.*/
-  signed int dc:16;
-};
-
-
-
-/*A description of each fragment plane.*/
-struct oc_fragment_plane{
-  /*The number of fragments in the horizontal direction.*/
-  int       nhfrags;
-  /*The number of fragments in the vertical direction.*/
-  int       nvfrags;
-  /*The offset of the first fragment in the plane.*/
-  ptrdiff_t froffset;
-  /*The total number of fragments in the plane.*/
-  ptrdiff_t nfrags;
-  /*The number of super blocks in the horizontal direction.*/
-  unsigned  nhsbs;
-  /*The number of super blocks in the vertical direction.*/
-  unsigned  nvsbs;
-  /*The offset of the first super block in the plane.*/
-  unsigned  sboffset;
-  /*The total number of super blocks in the plane.*/
-  unsigned  nsbs;
-};
-
-
-
-/*The shared (encoder and decoder) functions that have accelerated variants.*/
-struct oc_base_opt_vtable{
-  void (*frag_copy)(unsigned char *_dst,
-   const unsigned char *_src,int _ystride);
-  void (*frag_recon_intra)(unsigned char *_dst,int _ystride,
-   const ogg_int16_t _residue[64]);
-  void (*frag_recon_inter)(unsigned char *_dst,
-   const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
-  void (*frag_recon_inter2)(unsigned char *_dst,const unsigned char *_src1,
-   const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
-  void (*idct8x8)(ogg_int16_t _y[64],int _last_zzi);
-  void (*state_frag_recon)(const oc_theora_state *_state,ptrdiff_t _fragi,
-   int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
-  void (*state_frag_copy_list)(const oc_theora_state *_state,
-   const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
-   int _dst_frame,int _src_frame,int _pli);
-  void (*state_loop_filter_frag_rows)(const oc_theora_state *_state,
-   int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);  
-  void (*restore_fpu)(void);
-};
-
-/*The shared (encoder and decoder) tables that vary according to which variants
-   of the above functions are used.*/
-struct oc_base_opt_data{
-  const unsigned char *dct_fzig_zag;
-};
-
-
-/*State information common to both the encoder and decoder.*/
-struct oc_theora_state{
-  /*The stream information.*/
-  th_info             info;
-  /*Table for shared accelerated functions.*/
-  oc_base_opt_vtable  opt_vtable;
-  /*Table for shared data used by accelerated functions.*/
-  oc_base_opt_data    opt_data;
-  /*CPU flags to detect the presence of extended instruction sets.*/
-  ogg_uint32_t        cpu_flags;
-  /*The fragment plane descriptions.*/
-  oc_fragment_plane   fplanes[3];
-  /*The list of fragments, indexed in image order.*/
-  oc_fragment        *frags;
-  /*The the offset into the reference frame buffer to the upper-left pixel of
-     each fragment.*/
-  ptrdiff_t          *frag_buf_offs;
-  /*The motion vector for each fragment.*/
-  oc_mv              *frag_mvs;
-  /*The total number of fragments in a single frame.*/
-  ptrdiff_t           nfrags;
-  /*The list of super block maps, indexed in image order.*/
-  oc_sb_map          *sb_maps;
-  /*The list of super block flags, indexed in image order.*/
-  oc_sb_flags        *sb_flags;
-  /*The total number of super blocks in a single frame.*/
-  unsigned            nsbs;
-  /*The fragments from each color plane that belong to each macro block.
-    Fragments are stored in image order (left to right then top to bottom).
-    When chroma components are decimated, the extra fragments have an index of
-     -1.*/
-  oc_mb_map          *mb_maps;
-  /*The list of macro block modes.
-    A negative number indicates the macro block lies entirely outside the
-     coded frame.*/
-  signed char        *mb_modes;
-  /*The number of macro blocks in the X direction.*/
-  unsigned            nhmbs;
-  /*The number of macro blocks in the Y direction.*/
-  unsigned            nvmbs;
-  /*The total number of macro blocks.*/
-  size_t              nmbs;
-  /*The list of coded fragments, in coded order.
-    Uncoded fragments are stored in reverse order from the end of the list.*/
-  ptrdiff_t          *coded_fragis;
-  /*The number of coded fragments in each plane.*/
-  ptrdiff_t           ncoded_fragis[3];
-  /*The total number of coded fragments.*/
-  ptrdiff_t           ntotal_coded_fragis;
-  /*The index of the buffers being used for each OC_FRAME_* reference frame.*/
-  int                 ref_frame_idx[4];
-  /*The actual buffers used for the previously decoded frames.*/
-  th_ycbcr_buffer     ref_frame_bufs[4];
-  /*The storage for the reference frame buffers.*/
-  unsigned char      *ref_frame_data[4];
-  /*The strides for each plane in the reference frames.*/
-  int                 ref_ystride[3];
-  /*The number of unique border patterns.*/
-  int                 nborders;
-  /*The unique border patterns for all border fragments.
-    The borderi field of fragments which straddle the border indexes this
-     list.*/
-  oc_border_info      borders[16];
-  /*The frame number of the last keyframe.*/
-  ogg_int64_t         keyframe_num;
-  /*The frame number of the current frame.*/
-  ogg_int64_t         curframe_num;
-  /*The granpos of the current frame.*/
-  ogg_int64_t         granpos;
-  /*The type of the current frame.*/
-  unsigned char       frame_type;
-  /*The bias to add to the frame count when computing granule positions.*/
-  unsigned char       granpos_bias;
-  /*The number of quality indices used in the current frame.*/
-  unsigned char       nqis;
-  /*The quality indices of the current frame.*/
-  unsigned char       qis[3];
-  /*The dequantization tables, stored in zig-zag order, and indexed by
-     qi, pli, qti, and zzi.*/
-  ogg_uint16_t       *dequant_tables[64][3][2];
-  OC_ALIGN16(oc_quant_table      dequant_table_data[64][3][2]);
-  /*Loop filter strength parameters.*/
-  unsigned char       loop_filter_limits[64];
-};
-
-
-
-/*The function type used to fill in the chroma plane motion vectors for a
-   macro block when 4 different motion vectors are specified in the luma
-   plane.
-  _cbmvs: The chroma block-level motion vectors to fill in.
-  _lmbmv: The luma macro-block level motion vector to fill in for use in
-           prediction.
-  _lbmvs: The luma block-level motion vectors.*/
-typedef void (*oc_set_chroma_mvs_func)(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]);
-
 
 
 /*A map from the index in the zig zag scan to the coefficient number in a
@@ -409,14 +100,12 @@ extern const unsigned char OC_MB_MAP_IDXS[TH_PF_NFORMATS][12];
 /*The number of indices in the oc_mb_map array that can be valid for each of
    the various chroma decimation types.*/
 extern const unsigned char OC_MB_MAP_NIDXS[TH_PF_NFORMATS];
-/*A table of functions used to fill in the Cb,Cr plane motion vectors for a
-   macro block when 4 different motion vectors are specified in the luma
-   plane.*/
-extern const oc_set_chroma_mvs_func OC_SET_CHROMA_MVS_TABLE[TH_PF_NFORMATS];
 
 
 
 int oc_ilog(unsigned _v);
+void *oc_aligned_malloc(size_t _sz,size_t _align);
+void oc_aligned_free(void *_ptr);
 void **oc_malloc_2d(size_t _height,size_t _width,size_t _sz);
 void **oc_calloc_2d(size_t _height,size_t _width,size_t _sz);
 void oc_free_2d(void *_ptr);
@@ -424,86 +113,4 @@ void oc_free_2d(void *_ptr);
 void oc_ycbcr_buffer_flip(th_ycbcr_buffer _dst,
  const th_ycbcr_buffer _src);
 
-int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs);
-void oc_state_clear(oc_theora_state *_state);
-void oc_state_vtable_init_c(oc_theora_state *_state);
-void oc_state_borders_fill_rows(oc_theora_state *_state,int _refi,int _pli,
- int _y0,int _yend);
-void oc_state_borders_fill_caps(oc_theora_state *_state,int _refi,int _pli);
-void oc_state_borders_fill(oc_theora_state *_state,int _refi);
-void oc_state_fill_buffer_ptrs(oc_theora_state *_state,int _buf_idx,
- th_ycbcr_buffer _img);
-int oc_state_mbi_for_pos(oc_theora_state *_state,int _mbx,int _mby);
-int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
- int _pli,int _dx,int _dy);
-
-int oc_state_loop_filter_init(oc_theora_state *_state,int *_bv);
-void oc_state_loop_filter(oc_theora_state *_state,int _frame);
-#if defined(OC_DUMP_IMAGES)
-int oc_state_dump_frame(const oc_theora_state *_state,int _frame,
- const char *_suf);
-#endif
-
-/*Shared accelerated functions.*/
-void oc_frag_copy(const oc_theora_state *_state,unsigned char *_dst,
- const unsigned char *_src,int _ystride);
-void oc_frag_recon_intra(const oc_theora_state *_state,
- unsigned char *_dst,int _dst_ystride,const ogg_int16_t _residue[64]);
-void oc_frag_recon_inter(const oc_theora_state *_state,unsigned char *_dst,
- const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
-void oc_frag_recon_inter2(const oc_theora_state *_state,
- unsigned char *_dst,const unsigned char *_src1,const unsigned char *_src2,
- int _ystride,const ogg_int16_t _residue[64]);
-void oc_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64],int _last_zzi);
-void oc_state_frag_recon(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
-void oc_state_frag_copy_list(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli);
-void oc_state_loop_filter_frag_rows(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
-void oc_restore_fpu(const oc_theora_state *_state);
-
-/*Default pure-C implementations.*/
-void oc_frag_copy_c(unsigned char *_dst,
- const unsigned char *_src,int _src_ystride);
-void oc_frag_recon_intra_c(unsigned char *_dst,int _dst_ystride,
- const ogg_int16_t _residue[64]);
-void oc_frag_recon_inter_c(unsigned char *_dst,
- const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
-void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
- const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
-void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi);
-void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
-void oc_state_frag_copy_list_c(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli);
-void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
-void oc_restore_fpu_c(void);
-
-/*We need a way to call a few encoder functions without introducing a link-time
-   dependency into the decoder, while still allowing the old alpha API which
-   does not distinguish between encoder and decoder objects to be used.
-  We do this by placing a function table at the start of the encoder object
-   which can dispatch into the encoder library.
-  We do a similar thing for the decoder in case we ever decide to split off a
-   common base library.*/
-typedef void (*oc_state_clear_func)(theora_state *_th);
-typedef int (*oc_state_control_func)(theora_state *th,int _req,
- void *_buf,size_t _buf_sz);
-typedef ogg_int64_t (*oc_state_granule_frame_func)(theora_state *_th,
- ogg_int64_t _granulepos);
-typedef double (*oc_state_granule_time_func)(theora_state *_th,
- ogg_int64_t _granulepos);
-
-
-struct oc_state_dispatch_vtable{
-  oc_state_clear_func         clear;
-  oc_state_control_func       control;
-  oc_state_granule_frame_func granule_frame;
-  oc_state_granule_time_func  granule_time;
-};
-
 #endif
diff --git a/thirdparty/libtheora/mathops.c b/thirdparty/libtheora/mathops.c
index d3fb909194..23c8f6e1ba 100644
--- a/thirdparty/libtheora/mathops.c
+++ b/thirdparty/libtheora/mathops.c
@@ -1,10 +1,8 @@
+#include "internal.h"
 #include "mathops.h"
-#include <limits.h>
 
 /*The fastest fallback strategy for platforms with fast multiplication appears
    to be based on de Bruijn sequences~\cite{LP98}.
-  Tests confirmed this to be true even on an ARM11, where it is actually faster
-   than using the native clz instruction.
   Define OC_ILOG_NODEBRUIJN to use a simpler fallback on platforms where
    multiplication or table lookups are too expensive.
 
@@ -15,8 +13,7 @@
     year=1998,
     note="\url{http://supertech.csail.mit.edu/papers/debruijn.pdf}"
   }*/
-#if !defined(OC_ILOG_NODEBRUIJN)&& \
- !defined(OC_CLZ32)||!defined(OC_CLZ64)&&LONG_MAX<9223372036854775807LL
+#if !defined(OC_ILOG_NODEBRUIJN)&&!defined(OC_CLZ32)
 static const unsigned char OC_DEBRUIJN_IDX32[32]={
    0, 1,28, 2,29,14,24, 3,30,22,20,15,25,17, 4, 8,
   31,27,13,23,21,19,16, 7,26,12,18, 6,11, 5,10, 9
@@ -25,7 +22,7 @@ static const unsigned char OC_DEBRUIJN_IDX32[32]={
 
 int oc_ilog32(ogg_uint32_t _v){
 #if defined(OC_CLZ32)
-  return (OC_CLZ32_OFFS-OC_CLZ32(_v))&-!!_v;
+  return OC_CLZ32_OFFS-OC_CLZ32(_v)&-!!_v;
 #else
 /*On a Pentium M, this branchless version tested as the fastest version without
    multiplications on 1,000,000,000 random 32-bit integers, edging out a
@@ -51,12 +48,12 @@ int oc_ilog32(ogg_uint32_t _v){
 /*This de Bruijn sequence version is faster if you have a fast multiplier.*/
 # else
   int ret;
-  ret=_v>0;
   _v|=_v>>1;
   _v|=_v>>2;
   _v|=_v>>4;
   _v|=_v>>8;
   _v|=_v>>16;
+  ret=_v&1;
   _v=(_v>>1)+1;
   ret+=OC_DEBRUIJN_IDX32[_v*0x77CB531U>>27&0x1F];
   return ret;
@@ -66,16 +63,21 @@ int oc_ilog32(ogg_uint32_t _v){
 
 int oc_ilog64(ogg_int64_t _v){
 #if defined(OC_CLZ64)
-  return (OC_CLZ64_OFFS-OC_CLZ64(_v))&-!!_v;
+  return OC_CLZ64_OFFS-OC_CLZ64(_v)&-!!_v;
 #else
-# if defined(OC_ILOG_NODEBRUIJN)
+/*If we don't have a fast 64-bit word implementation, split it into two 32-bit
+   halves.*/
+# if defined(OC_ILOG_NODEBRUIJN)|| \
+ defined(OC_CLZ32)||LONG_MAX<9223372036854775807LL
   ogg_uint32_t v;
   int          ret;
   int          m;
-  ret=_v>0;
   m=(_v>0xFFFFFFFFU)<<5;
   v=(ogg_uint32_t)(_v>>m);
-  ret|=m;
+#  if defined(OC_CLZ32)
+  ret=m+OC_CLZ32_OFFS-OC_CLZ32(v)&-!!v;
+#  elif defined(OC_ILOG_NODEBRUIJN)
+  ret=v>0|m;
   m=(v>0xFFFFU)<<4;
   v>>=m;
   ret|=m;
@@ -90,26 +92,19 @@ int oc_ilog64(ogg_int64_t _v){
   ret|=m;
   ret+=v>1;
   return ret;
-# else
-/*If we don't have a 64-bit word, split it into two 32-bit halves.*/
-#  if LONG_MAX<9223372036854775807LL
-  ogg_uint32_t v;
-  int          ret;
-  int          m;
-  ret=_v>0;
-  m=(_v>0xFFFFFFFFU)<<5;
-  v=(ogg_uint32_t)(_v>>m);
-  ret|=m;
+#  else
   v|=v>>1;
   v|=v>>2;
   v|=v>>4;
   v|=v>>8;
   v|=v>>16;
+  ret=v&1|m;
   v=(v>>1)+1;
   ret+=OC_DEBRUIJN_IDX32[v*0x77CB531U>>27&0x1F];
+#  endif
   return ret;
-/*Otherwise do it in one 64-bit operation.*/
-#  else
+/*Otherwise do it in one 64-bit multiply.*/
+# else
   static const unsigned char OC_DEBRUIJN_IDX64[64]={
      0, 1, 2, 7, 3,13, 8,19, 4,25,14,28, 9,34,20,40,
      5,17,26,38,15,46,29,48,10,31,35,54,21,50,41,57,
@@ -117,17 +112,16 @@ int oc_ilog64(ogg_int64_t _v){
     62,11,23,32,36,44,52,55,61,22,43,51,60,42,59,58
   };
   int ret;
-  ret=_v>0;
   _v|=_v>>1;
   _v|=_v>>2;
   _v|=_v>>4;
   _v|=_v>>8;
   _v|=_v>>16;
   _v|=_v>>32;
+  ret=(int)_v&1;
   _v=(_v>>1)+1;
   ret+=OC_DEBRUIJN_IDX64[_v*0x218A392CD3D5DBF>>58&0x3F];
   return ret;
-#  endif
 # endif
 #endif
 }
@@ -294,3 +288,27 @@ ogg_int64_t oc_blog64(ogg_int64_t _w){
   }
   return OC_Q57(ipart)+z;
 }
+
+/*Polynomial approximation of a binary exponential, roughly 2**(_z/1024).
+  Q10 input, Q0 output (rounded to nearest when 14-ipart>0).*/
+ogg_uint32_t oc_bexp32_q10(int _z){
+  unsigned n;      /*Fraction of _z, then the Q14 estimate of 2**fraction.*/
+  int      ipart;  /*Integer part of _z; sets the final Q14->Q0 shift.*/
+  ipart=_z>>10;
+  n=(_z&(1<<10)-1)<<4;  /*Low 10 bits of _z, rescaled from Q10 to Q14.*/
+  n=(n*((n*((n*((n*3548>>15)+6817)>>15)+15823)>>15)+22708)>>15)+16384;
+  return 14-ipart>0?n+(1<<13-ipart)>>14-ipart:n<<ipart-14;
+}
+
+/*Polynomial approximation of a binary logarithm.
+  Q0 input, Q10 output; returns -1 when _w is zero.*/
+int oc_blog32_q10(ogg_uint32_t _w){
+  int n;      /*_w shifted by ipart-16 bits, less 32768+16384.*/
+  int ipart;  /*OC_ILOGNZ_32(_w); presumably the bit length of _w (confirm).*/
+  int fpart;  /*Polynomial in n; dropped by 4 bits for the Q10 result.*/
+  if(_w<=0)return -1;
+  ipart=OC_ILOGNZ_32(_w);
+  n=(ipart-16>0?_w>>ipart-16:_w<<16-ipart)-32768-16384;
+  fpart=(n*((n*((n*((n*-1402>>15)+2546)>>15)-5216)>>15)+15745)>>15)-6793;
+  return (ipart<<10)+(fpart>>4);
+}
diff --git a/thirdparty/libtheora/mathops.h b/thirdparty/libtheora/mathops.h
index efbc5377b0..a1a4f9df0e 100644
--- a/thirdparty/libtheora/mathops.h
+++ b/thirdparty/libtheora/mathops.h
@@ -2,29 +2,27 @@
 # define _mathops_H (1)
 # include <ogg/ogg.h>
 
-# ifdef __GNUC_PREREQ
-#  if __GNUC_PREREQ(3,4)
-#   include <limits.h>
+# if __GNUC_PREREQ(3,4)
+#  include <limits.h>
 /*Note the casts to (int) below: this prevents OC_CLZ{32|64}_OFFS from
    "upgrading" the type of an entire expression to an (unsigned) size_t.*/
-#   if INT_MAX>=2147483647
-#    define OC_CLZ32_OFFS ((int)sizeof(unsigned)*CHAR_BIT)
-#    define OC_CLZ32(_x) (__builtin_clz(_x))
-#   elif LONG_MAX>=2147483647L
-#    define OC_CLZ32_OFFS ((int)sizeof(unsigned long)*CHAR_BIT)
-#    define OC_CLZ32(_x) (__builtin_clzl(_x))
-#   endif
-#   if INT_MAX>=9223372036854775807LL
-#    define OC_CLZ64_OFFS ((int)sizeof(unsigned)*CHAR_BIT)
-#    define OC_CLZ64(_x) (__builtin_clz(_x))
-#   elif LONG_MAX>=9223372036854775807LL
-#    define OC_CLZ64_OFFS ((int)sizeof(unsigned long)*CHAR_BIT)
-#    define OC_CLZ64(_x) (__builtin_clzl(_x))
-#   elif LLONG_MAX>=9223372036854775807LL|| \
-     __LONG_LONG_MAX__>=9223372036854775807LL
-#    define OC_CLZ64_OFFS ((int)sizeof(unsigned long long)*CHAR_BIT)
-#    define OC_CLZ64(_x) (__builtin_clzll(_x))
-#   endif
+#  if INT_MAX>=2147483647
+#   define OC_CLZ32_OFFS ((int)sizeof(unsigned)*CHAR_BIT)
+#   define OC_CLZ32(_x) (__builtin_clz(_x))
+#  elif LONG_MAX>=2147483647L
+#   define OC_CLZ32_OFFS ((int)sizeof(unsigned long)*CHAR_BIT)
+#   define OC_CLZ32(_x) (__builtin_clzl(_x))
+#  endif
+#  if INT_MAX>=9223372036854775807LL
+#   define OC_CLZ64_OFFS ((int)sizeof(unsigned)*CHAR_BIT)
+#   define OC_CLZ64(_x) (__builtin_clz(_x))
+#  elif LONG_MAX>=9223372036854775807LL
+#   define OC_CLZ64_OFFS ((int)sizeof(unsigned long)*CHAR_BIT)
+#   define OC_CLZ64(_x) (__builtin_clzl(_x))
+#  elif LLONG_MAX>=9223372036854775807LL|| \
+    __LONG_LONG_MAX__>=9223372036854775807LL
+#   define OC_CLZ64_OFFS ((int)sizeof(unsigned long long)*CHAR_BIT)
+#   define OC_CLZ64(_x) (__builtin_clzll(_x))
 #  endif
 # endif
 
@@ -134,8 +132,12 @@ int oc_ilog64(ogg_int64_t _v);
 # define OC_STATIC_ILOG_64(_v) (OC_STATIC_ILOG6((ogg_int64_t)(_v)))
 
 #define OC_Q57(_v) ((ogg_int64_t)(_v)<<57)
+#define OC_Q10(_v) ((_v)<<10)
 
 ogg_int64_t oc_bexp64(ogg_int64_t _z);
 ogg_int64_t oc_blog64(ogg_int64_t _w);
 
+ogg_uint32_t oc_bexp32_q10(int _z);
+int oc_blog32_q10(ogg_uint32_t _w);
+
 #endif
diff --git a/thirdparty/libtheora/mcenc.c b/thirdparty/libtheora/mcenc.c
index 797e81f4f9..82eb824a80 100644
--- a/thirdparty/libtheora/mcenc.c
+++ b/thirdparty/libtheora/mcenc.c
@@ -88,9 +88,11 @@ static const int OC_SQUARE_SITES[11][8]={
 };
 
 
-static void oc_mcenc_find_candidates(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc,
- int _accum[2],int _mbi,int _frame){
+static void oc_mcenc_find_candidates_a(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc,
+ oc_mv _accum,int _mbi,int _frame){
   oc_mb_enc_info *embs;
+  int             accum_x;
+  int             accum_y;
   int             a[3][2];
   int             ncandidates;
   unsigned        nmbi;
@@ -102,20 +104,24 @@ static void oc_mcenc_find_candidates(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc,
     /*Fill in the first part of set A: the vectors from adjacent blocks.*/
     for(i=0;i<embs[_mbi].ncneighbors;i++){
       nmbi=embs[_mbi].cneighbors[i];
-      _mcenc->candidates[ncandidates][0]=embs[nmbi].analysis_mv[0][_frame][0];
-      _mcenc->candidates[ncandidates][1]=embs[nmbi].analysis_mv[0][_frame][1];
+      _mcenc->candidates[ncandidates][0]=
+       OC_MV_X(embs[nmbi].analysis_mv[0][_frame]);
+      _mcenc->candidates[ncandidates][1]=
+       OC_MV_Y(embs[nmbi].analysis_mv[0][_frame]);
       ncandidates++;
     }
   }
+  accum_x=OC_MV_X(_accum);
+  accum_y=OC_MV_Y(_accum);
   /*Add a few additional vectors to set A: the vectors used in the previous
      frames and the (0,0) vector.*/
-  _mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31,_accum[0],31);
-  _mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31,_accum[1],31);
+  _mcenc->candidates[ncandidates][0]=accum_x;
+  _mcenc->candidates[ncandidates][1]=accum_y;
   ncandidates++;
   _mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31,
-   embs[_mbi].analysis_mv[1][_frame][0]+_accum[0],31);
+   OC_MV_X(embs[_mbi].analysis_mv[1][_frame])+accum_x,31);
   _mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31,
-   embs[_mbi].analysis_mv[1][_frame][1]+_accum[1],31);
+   OC_MV_Y(embs[_mbi].analysis_mv[1][_frame])+accum_y,31);
   ncandidates++;
   _mcenc->candidates[ncandidates][0]=0;
   _mcenc->candidates[ncandidates][1]=0;
@@ -131,30 +137,33 @@ static void oc_mcenc_find_candidates(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc,
   OC_SORT2I(a[0][1],a[1][1]);
   _mcenc->candidates[0][0]=a[1][0];
   _mcenc->candidates[0][1]=a[1][1];
-  /*Fill in set B: accelerated predictors for this and adjacent macro blocks.*/
   _mcenc->setb0=ncandidates;
-  /*The first time through the loop use the current macro block.*/
-  nmbi=_mbi;
-  for(i=0;;i++){
-    _mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31,
-     2*embs[_mbi].analysis_mv[1][_frame][0]
-     -embs[_mbi].analysis_mv[2][_frame][0]+_accum[0],31);
-    _mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31,
-     2*embs[_mbi].analysis_mv[1][_frame][1]
-     -embs[_mbi].analysis_mv[2][_frame][1]+_accum[1],31);
-    ncandidates++;
-    if(i>=embs[_mbi].npneighbors)break;
-    nmbi=embs[_mbi].pneighbors[i];
-  }
-  /*Truncate to full-pel positions.*/
-  for(i=0;i<ncandidates;i++){
-    _mcenc->candidates[i][0]=OC_DIV2(_mcenc->candidates[i][0]);
-    _mcenc->candidates[i][1]=OC_DIV2(_mcenc->candidates[i][1]);
-  }
+}
+
+static void oc_mcenc_find_candidates_b(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc,
+ oc_mv _accum,int _mbi,int _frame){
+  oc_mb_enc_info *embs;
+  int             accum_x;  /*X component of _accum.*/
+  int             accum_y;  /*Y component of _accum.*/
+  int             ncandidates;
+  embs=_enc->mb_info;
+  accum_x=OC_MV_X(_accum);
+  accum_y=OC_MV_Y(_accum);
+  /*Fill in set B: the accelerated predictor for the current macro block.*/
+  ncandidates=_mcenc->setb0;  /*Set B is written starting at index setb0.*/
+  /*Use only the current block (2*analysis_mv[1]-analysis_mv[2]+_accum,
+    through OC_CLAMPI); using more did not appear to be helpful with the
+    current selection logic due to escaping the local search too quickly.*/
+  _mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31,
+   2*OC_MV_X(embs[_mbi].analysis_mv[1][_frame])
+   -OC_MV_X(embs[_mbi].analysis_mv[2][_frame])+accum_x,31);
+  _mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31,
+   2*OC_MV_Y(embs[_mbi].analysis_mv[1][_frame])
+   -OC_MV_Y(embs[_mbi].analysis_mv[2][_frame])+accum_y,31);
+  ncandidates++;
   _mcenc->ncandidates=ncandidates;
 }
 
-#if 0
 static unsigned oc_sad16_halfpel(const oc_enc_ctx *_enc,
  const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],
  int _mvoffset0,int _mvoffset1,const unsigned char *_src,
@@ -170,20 +179,21 @@ static unsigned oc_sad16_halfpel(const oc_enc_ctx *_enc,
   }
   return err;
 }
-#endif
 
 static unsigned oc_satd16_halfpel(const oc_enc_ctx *_enc,
  const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],
  int _mvoffset0,int _mvoffset1,const unsigned char *_src,
  const unsigned char *_ref,int _ystride,unsigned _best_err){
   unsigned err;
+  int      dc;
   int      bi;
   err=0;
   for(bi=0;bi<4;bi++){
     ptrdiff_t frag_offs;
     frag_offs=_frag_buf_offs[_fragis[bi]];
-    err+=oc_enc_frag_satd2_thresh(_enc,_src+frag_offs,_ref+frag_offs+_mvoffset0,
-     _ref+frag_offs+_mvoffset1,_ystride,_best_err-err);
+    err+=oc_enc_frag_satd2(_enc,&dc,_src+frag_offs,
+     _ref+frag_offs+_mvoffset0,_ref+frag_offs+_mvoffset1,_ystride);
+    err+=abs(dc);
   }
   return err;
 }
@@ -219,9 +229,17 @@ static int oc_mcenc_ysatd_check_mbcandidate_fullpel(const oc_enc_ctx *_enc,
   err=0;
   for(bi=0;bi<4;bi++){
     ptrdiff_t frag_offs;
+    int       dc;
     frag_offs=_frag_buf_offs[_fragis[bi]];
-    err+=oc_enc_frag_satd_thresh(_enc,
-     _src+frag_offs,_ref+frag_offs+mvoffset,_ystride,UINT_MAX);
+    if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
+      err+=oc_enc_frag_satd(_enc,&dc,
+       _src+frag_offs,_ref+frag_offs+mvoffset,_ystride);
+      err+=abs(dc);
+    }
+    else{
+      err+=oc_enc_frag_sad(_enc,
+       _src+frag_offs,_ref+frag_offs+mvoffset,_ystride);
+    }
   }
   return err;
 }
@@ -229,8 +247,11 @@ static int oc_mcenc_ysatd_check_mbcandidate_fullpel(const oc_enc_ctx *_enc,
 static unsigned oc_mcenc_ysatd_check_bcandidate_fullpel(const oc_enc_ctx *_enc,
  ptrdiff_t _frag_offs,int _dx,int _dy,
  const unsigned char *_src,const unsigned char *_ref,int _ystride){
-  return oc_enc_frag_satd_thresh(_enc,
-   _src+_frag_offs,_ref+_frag_offs+_dx+_dy*_ystride,_ystride,UINT_MAX);
+  unsigned err;
+  int      dc;
+  err=oc_enc_frag_satd(_enc,&dc,
+   _src+_frag_offs,_ref+_frag_offs+_dx+_dy*_ystride,_ystride);
+  return err+abs(dc);
 }
 
 /*Perform a motion vector search for this macro block against a single
@@ -239,11 +260,14 @@ static unsigned oc_mcenc_ysatd_check_bcandidate_fullpel(const oc_enc_ctx *_enc,
    the work can be shared.
   The actual motion vector is stored in the appropriate place in the
    oc_mb_enc_info structure.
-  _mcenc:    The motion compensation context.
-  _accum:    Drop frame/golden MV accumulators.
-  _mbi:      The macro block index.
-  _frame:    The frame to search, either OC_FRAME_PREV or OC_FRAME_GOLD.*/
-void oc_mcenc_search_frame(oc_enc_ctx *_enc,int _accum[2],int _mbi,int _frame){
+  _accum:      Drop frame/golden MV accumulators.
+  _mbi:        The macro block index.
+  _frame:      The frame to use for SATD calculations and refinement,
+                either OC_FRAME_PREV or OC_FRAME_GOLD.
+  _frame_full: The frame to perform the 1px search on, one of OC_FRAME_PREV,
+                OC_FRAME_GOLD, OC_FRAME_PREV_ORIG, or OC_FRAME_GOLD_ORIG.*/
+void oc_mcenc_search_frame(oc_enc_ctx *_enc,oc_mv _accum,int _mbi,int _frame,
+ int _frame_full){
   /*Note: Traditionally this search is done using a rate-distortion objective
      function of the form D+lambda*R.
     However, xiphmont tested this and found it produced a small degredation,
@@ -264,6 +288,7 @@ void oc_mcenc_search_frame(oc_enc_ctx *_enc,int _accum[2],int _mbi,int _frame){
   const ptrdiff_t     *fragis;
   const unsigned char *src;
   const unsigned char *ref;
+  const unsigned char *satd_ref;
   int                  ystride;
   oc_mb_enc_info      *embs;
   ogg_int32_t          hit_cache[31];
@@ -278,17 +303,18 @@ void oc_mcenc_search_frame(oc_enc_ctx *_enc,int _accum[2],int _mbi,int _frame){
   int                  bi;
   embs=_enc->mb_info;
   /*Find some candidate motion vectors.*/
-  oc_mcenc_find_candidates(_enc,&mcenc,_accum,_mbi,_frame);
+  oc_mcenc_find_candidates_a(_enc,&mcenc,_accum,_mbi,_frame);
   /*Clear the cache of locations we've examined.*/
   memset(hit_cache,0,sizeof(hit_cache));
   /*Start with the median predictor.*/
-  candx=mcenc.candidates[0][0];
-  candy=mcenc.candidates[0][1];
+  candx=OC_DIV2(mcenc.candidates[0][0]);
+  candy=OC_DIV2(mcenc.candidates[0][1]);
   hit_cache[candy+15]|=(ogg_int32_t)1<<candx+15;
   frag_buf_offs=_enc->state.frag_buf_offs;
   fragis=_enc->state.mb_maps[_mbi][0];
   src=_enc->state.ref_frame_data[OC_FRAME_IO];
-  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_frame]];
+  ref=_enc->state.ref_frame_data[_frame_full];
+  satd_ref=_enc->state.ref_frame_data[_frame];
   ystride=_enc->state.ref_ystride[0];
   /*TODO: customize error function for speed/(quality+size) tradeoff.*/
   best_err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
@@ -317,8 +343,8 @@ void oc_mcenc_search_frame(oc_enc_ctx *_enc,int _accum[2],int _mbi,int _frame){
     t2+=(t2>>OC_YSAD_THRESH2_SCALE_BITS)+OC_YSAD_THRESH2_OFFSET;
     /*Examine the candidates in set A.*/
     for(ci=1;ci<mcenc.setb0;ci++){
-      candx=mcenc.candidates[ci][0];
-      candy=mcenc.candidates[ci][1];
+      candx=OC_DIV2(mcenc.candidates[ci][0]);
+      candy=OC_DIV2(mcenc.candidates[ci][1]);
       /*If we've already examined this vector, then we would be using it if it
          was better than what we are using.*/
       hitbit=(ogg_int32_t)1<<candx+15;
@@ -340,10 +366,11 @@ void oc_mcenc_search_frame(oc_enc_ctx *_enc,int _accum[2],int _mbi,int _frame){
       }
     }
     if(best_err>t2){
+      oc_mcenc_find_candidates_b(_enc,&mcenc,_accum,_mbi,_frame);
       /*Examine the candidates in set B.*/
       for(;ci<mcenc.ncandidates;ci++){
-        candx=mcenc.candidates[ci][0];
-        candy=mcenc.candidates[ci][1];
+        candx=OC_DIV2(mcenc.candidates[ci][0]);
+        candy=OC_DIV2(mcenc.candidates[ci][1]);
         hitbit=(ogg_int32_t)1<<candx+15;
         if(hit_cache[candy+15]&hitbit)continue;
         hit_cache[candy+15]|=hitbit;
@@ -475,58 +502,50 @@ void oc_mcenc_search_frame(oc_enc_ctx *_enc,int _accum[2],int _mbi,int _frame){
   candx=best_vec[0];
   candy=best_vec[1];
   embs[_mbi].satd[_frame]=oc_mcenc_ysatd_check_mbcandidate_fullpel(_enc,
-   frag_buf_offs,fragis,candx,candy,src,ref,ystride);
-  embs[_mbi].analysis_mv[0][_frame][0]=(signed char)(candx<<1);
-  embs[_mbi].analysis_mv[0][_frame][1]=(signed char)(candy<<1);
-  if(_frame==OC_FRAME_PREV){
+   frag_buf_offs,fragis,candx,candy,src,satd_ref,ystride);
+  embs[_mbi].analysis_mv[0][_frame]=OC_MV(candx<<1,candy<<1);
+  if(_frame==OC_FRAME_PREV&&_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
     for(bi=0;bi<4;bi++){
       candx=best_block_vec[bi][0];
       candy=best_block_vec[bi][1];
       embs[_mbi].block_satd[bi]=oc_mcenc_ysatd_check_bcandidate_fullpel(_enc,
-       frag_buf_offs[fragis[bi]],candx,candy,src,ref,ystride);
-      embs[_mbi].block_mv[bi][0]=(signed char)(candx<<1);
-      embs[_mbi].block_mv[bi][1]=(signed char)(candy<<1);
+       frag_buf_offs[fragis[bi]],candx,candy,src,satd_ref,ystride);
+      embs[_mbi].block_mv[bi]=OC_MV(candx<<1,candy<<1);
     }
   }
 }
 
 void oc_mcenc_search(oc_enc_ctx *_enc,int _mbi){
-  oc_mv2         *mvs;
-  int             accum_p[2];
-  int             accum_g[2];
+  oc_mv2 *mvs;
+  oc_mv   accum_p;
+  oc_mv   accum_g;
+  oc_mv   mv2_p;
   mvs=_enc->mb_info[_mbi].analysis_mv;
-  if(_enc->prevframe_dropped){
-    accum_p[0]=mvs[0][OC_FRAME_PREV][0];
-    accum_p[1]=mvs[0][OC_FRAME_PREV][1];
-  }
-  else accum_p[1]=accum_p[0]=0;
-  accum_g[0]=mvs[2][OC_FRAME_GOLD][0];
-  accum_g[1]=mvs[2][OC_FRAME_GOLD][1];
-  mvs[0][OC_FRAME_PREV][0]-=mvs[2][OC_FRAME_PREV][0];
-  mvs[0][OC_FRAME_PREV][1]-=mvs[2][OC_FRAME_PREV][1];
+  if(_enc->prevframe_dropped)accum_p=mvs[0][OC_FRAME_PREV];
+  else accum_p=0;
+  accum_g=mvs[2][OC_FRAME_GOLD];
   /*Move the motion vector predictors back a frame.*/
-  memmove(mvs+1,mvs,2*sizeof(*mvs));
+  mv2_p=mvs[2][OC_FRAME_PREV];
+  mvs[2][OC_FRAME_GOLD]=mvs[1][OC_FRAME_GOLD];
+  mvs[2][OC_FRAME_PREV]=mvs[1][OC_FRAME_PREV];
+  mvs[1][OC_FRAME_GOLD]=mvs[0][OC_FRAME_GOLD];
+  mvs[1][OC_FRAME_PREV]=OC_MV_SUB(mvs[0][OC_FRAME_PREV],mv2_p);
   /*Search the last frame.*/
-  oc_mcenc_search_frame(_enc,accum_p,_mbi,OC_FRAME_PREV);
-  mvs[2][OC_FRAME_PREV][0]=accum_p[0];
-  mvs[2][OC_FRAME_PREV][1]=accum_p[1];
+  oc_mcenc_search_frame(_enc,accum_p,_mbi,OC_FRAME_PREV,OC_FRAME_PREV_ORIG);
+  mvs[2][OC_FRAME_PREV]=accum_p;
   /*GOLDEN MVs are different from PREV MVs in that they're each absolute
      offsets from some frame in the past rather than relative offsets from the
      frame before.
     For predictor calculation to make sense, we need them to be in the same
      form as PREV MVs.*/
-  mvs[1][OC_FRAME_GOLD][0]-=mvs[2][OC_FRAME_GOLD][0];
-  mvs[1][OC_FRAME_GOLD][1]-=mvs[2][OC_FRAME_GOLD][1];
-  mvs[2][OC_FRAME_GOLD][0]-=accum_g[0];
-  mvs[2][OC_FRAME_GOLD][1]-=accum_g[1];
+  mvs[1][OC_FRAME_GOLD]=OC_MV_SUB(mvs[1][OC_FRAME_GOLD],mvs[2][OC_FRAME_GOLD]);
+  mvs[2][OC_FRAME_GOLD]=OC_MV_SUB(mvs[2][OC_FRAME_GOLD],accum_g);
   /*Search the golden frame.*/
-  oc_mcenc_search_frame(_enc,accum_g,_mbi,OC_FRAME_GOLD);
+  oc_mcenc_search_frame(_enc,accum_g,_mbi,OC_FRAME_GOLD,OC_FRAME_GOLD_ORIG);
   /*Put GOLDEN MVs back into absolute offset form.
     The newest MV is already an absolute offset.*/
-  mvs[2][OC_FRAME_GOLD][0]+=accum_g[0];
-  mvs[2][OC_FRAME_GOLD][1]+=accum_g[1];
-  mvs[1][OC_FRAME_GOLD][0]+=mvs[2][OC_FRAME_GOLD][0];
-  mvs[1][OC_FRAME_GOLD][1]+=mvs[2][OC_FRAME_GOLD][1];
+  mvs[2][OC_FRAME_GOLD]=OC_MV_ADD(mvs[2][OC_FRAME_GOLD],accum_g);
+  mvs[1][OC_FRAME_GOLD]=OC_MV_ADD(mvs[1][OC_FRAME_GOLD],mvs[2][OC_FRAME_GOLD]);
 }
 
 #if 0
@@ -543,7 +562,7 @@ static int oc_mcenc_ysad_halfpel_mbrefine(const oc_enc_ctx *_enc,int _mbi,
   int                  sitei;
   int                  err;
   src=_enc->state.ref_frame_data[OC_FRAME_IO];
-  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_framei]];
+  ref=_enc->state.ref_frame_data[_framei];
   frag_buf_offs=_enc->state.frag_buf_offs;
   fragis=_enc->state.mb_maps[_mbi][0];
   ystride=_enc->state.ref_ystride[0];
@@ -598,7 +617,7 @@ static unsigned oc_mcenc_ysatd_halfpel_mbrefine(const oc_enc_ctx *_enc,
   int                  sitei;
   int                  err;
   src=_enc->state.ref_frame_data[OC_FRAME_IO];
-  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_frame]];
+  ref=_enc->state.ref_frame_data[_frame];
   frag_buf_offs=_enc->state.frag_buf_offs;
   fragis=_enc->state.mb_maps[_mbi][0];
   ystride=_enc->state.ref_ystride[0];
@@ -627,8 +646,14 @@ static unsigned oc_mcenc_ysatd_halfpel_mbrefine(const oc_enc_ctx *_enc,
     ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
     mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask);
     mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask);
-    err=oc_satd16_halfpel(_enc,frag_buf_offs,fragis,
-     mvoffset0,mvoffset1,src,ref,ystride,_best_err);
+    if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
+      err=oc_satd16_halfpel(_enc,frag_buf_offs,fragis,
+       mvoffset0,mvoffset1,src,ref,ystride,_best_err);
+    }
+    else{
+      err=oc_sad16_halfpel(_enc,frag_buf_offs,fragis,
+           mvoffset0,mvoffset1,src,ref,ystride,_best_err);
+    }
     if(err<_best_err){
       _best_err=err;
       best_site=site;
@@ -643,12 +668,11 @@ void oc_mcenc_refine1mv(oc_enc_ctx *_enc,int _mbi,int _frame){
   oc_mb_enc_info *embs;
   int             vec[2];
   embs=_enc->mb_info;
-  vec[0]=OC_DIV2(embs[_mbi].analysis_mv[0][_frame][0]);
-  vec[1]=OC_DIV2(embs[_mbi].analysis_mv[0][_frame][1]);
+  vec[0]=OC_DIV2(OC_MV_X(embs[_mbi].analysis_mv[0][_frame]));
+  vec[1]=OC_DIV2(OC_MV_Y(embs[_mbi].analysis_mv[0][_frame]));
   embs[_mbi].satd[_frame]=oc_mcenc_ysatd_halfpel_mbrefine(_enc,
    _mbi,vec,embs[_mbi].satd[_frame],_frame);
-  embs[_mbi].analysis_mv[0][_frame][0]=(signed char)vec[0];
-  embs[_mbi].analysis_mv[0][_frame][1]=(signed char)vec[1];
+  embs[_mbi].analysis_mv[0][_frame]=OC_MV(vec[0],vec[1]);
 }
 
 #if 0
@@ -704,6 +728,7 @@ static unsigned oc_mcenc_ysatd_halfpel_brefine(const oc_enc_ctx *_enc,
   best_site=4;
   for(sitei=0;sitei<8;sitei++){
     unsigned err;
+    int      dc;
     int      site;
     int      xmask;
     int      ymask;
@@ -723,8 +748,9 @@ static unsigned oc_mcenc_ysatd_halfpel_brefine(const oc_enc_ctx *_enc,
     ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
     mvoffset0=mvoffset_base+(dx&xmask)+(_offset_y[site]&ymask);
     mvoffset1=mvoffset_base+(dx&~xmask)+(_offset_y[site]&~ymask);
-    err=oc_enc_frag_satd2_thresh(_enc,_src,
-     _ref+mvoffset0,_ref+mvoffset1,_ystride,_best_err);
+    err=oc_enc_frag_satd2(_enc,&dc,_src,
+     _ref+mvoffset0,_ref+mvoffset1,_ystride);
+    err+=abs(dc);
     if(err<_best_err){
       _best_err=err;
       best_site=site;
@@ -748,7 +774,7 @@ void oc_mcenc_refine4mv(oc_enc_ctx *_enc,int _mbi){
   frag_buf_offs=_enc->state.frag_buf_offs;
   fragis=_enc->state.mb_maps[_mbi][0];
   src=_enc->state.ref_frame_data[OC_FRAME_IO];
-  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]];
+  ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
   offset_y[0]=offset_y[1]=offset_y[2]=-ystride;
   offset_y[3]=offset_y[5]=0;
   offset_y[6]=offset_y[7]=offset_y[8]=ystride;
@@ -757,11 +783,10 @@ void oc_mcenc_refine4mv(oc_enc_ctx *_enc,int _mbi){
     ptrdiff_t frag_offs;
     int       vec[2];
     frag_offs=frag_buf_offs[fragis[bi]];
-    vec[0]=OC_DIV2(embs[_mbi].block_mv[bi][0]);
-    vec[1]=OC_DIV2(embs[_mbi].block_mv[bi][1]);
+    vec[0]=OC_DIV2(OC_MV_X(embs[_mbi].block_mv[bi]));
+    vec[1]=OC_DIV2(OC_MV_Y(embs[_mbi].block_mv[bi]));
     embs[_mbi].block_satd[bi]=oc_mcenc_ysatd_halfpel_brefine(_enc,vec,
      src+frag_offs,ref+frag_offs,ystride,offset_y,embs[_mbi].block_satd[bi]);
-    embs[_mbi].ref_mv[bi][0]=(signed char)vec[0];
-    embs[_mbi].ref_mv[bi][1]=(signed char)vec[1];
+    embs[_mbi].ref_mv[bi]=OC_MV(vec[0],vec[1]);
   }
 }
diff --git a/thirdparty/libtheora/modedec.h b/thirdparty/libtheora/modedec.h
index ea12c64afd..efe640e263 100644
--- a/thirdparty/libtheora/modedec.h
+++ b/thirdparty/libtheora/modedec.h
@@ -1,614 +1,91 @@
 /*File generated by libtheora with OC_COLLECT_METRICS defined at compile time.*/
 #if !defined(_modedec_H)
 # define _modedec_H (1)
+# include "encint.h"
 
 
 
-# if defined(OC_COLLECT_METRICS)
-typedef struct oc_mode_metrics oc_mode_metrics;
+/*The log of the average quantizer for each of the OC_MODE_RD table rows
+   (e.g., for the represented qi's, and each pli and qti), in Q10 format.
+  The actual statistics used by the encoder will be interpolated from
+   that table based on log_plq for the actual quantization matrix used.*/
+# if !defined(OC_COLLECT_METRICS)
+static const
 # endif
-typedef struct oc_mode_rd      oc_mode_rd;
-
-
-
-/*The number of extra bits of precision at which to store rate metrics.*/
-# define OC_BIT_SCALE  (6)
-/*The number of extra bits of precision at which to store RMSE metrics.
-  This must be at least half OC_BIT_SCALE (rounded up).*/
-# define OC_RMSE_SCALE (5)
-/*The number of bins to partition statistics into.*/
-# define OC_SAD_BINS   (24)
-/*The number of bits of precision to drop from SAD scores to assign them to a
-   bin.*/
-# define OC_SAD_SHIFT  (9)
-
-
-
-# if defined(OC_COLLECT_METRICS)
-struct oc_mode_metrics{
-  double fragw;
-  double satd;
-  double rate;
-  double rmse;
-  double satd2;
-  double satdrate;
-  double rate2;
-  double satdrmse;
-  double rmse2;
+ogg_int16_t OC_MODE_LOGQ[OC_LOGQ_BINS][3][2]={
+  { {0x1F05,0x2101},{0x206E,0x2101},{0x206E,0x2101} },
+  { {0x1C9A,0x1EAC},{0x1E0E,0x1EAC},{0x1E0E,0x1EAC} },
+  { {0x1A31,0x1C48},{0x1B6F,0x1C48},{0x1B6F,0x1C48} },
+  { {0x17B0,0x19E7},{0x1938,0x19E7},{0x1938,0x19E7} },
+  { {0x152F,0x178F},{0x16AB,0x178F},{0x16AB,0x178F} },
+  { {0x12F1,0x1534},{0x145D,0x1534},{0x145D,0x1534} },
+  { {0x0FF3,0x1321},{0x11BE,0x1321},{0x11BE,0x1321} },
+  { {0x0E1F,0x1073},{0x0E93,0x1073},{0x0E93,0x1073} }
 };
 
-
-int             oc_has_mode_metrics;
-oc_mode_metrics OC_MODE_METRICS[64][3][2][OC_SAD_BINS];
-# endif
-
-
-
-struct oc_mode_rd{
-  ogg_int16_t rate;
-  ogg_int16_t rmse;
-};
-
-
 # if !defined(OC_COLLECT_METRICS)
 static const
 # endif
-oc_mode_rd OC_MODE_RD[64][3][2][OC_SAD_BINS]={
+oc_mode_rd OC_MODE_RD_SATD[OC_LOGQ_BINS][3][2][OC_COMP_BINS]={
   {
     {
       /*Y'  qi=0  INTRA*/
       {
-        {   87,  -66},{  132, 1611},{  197, 3474},{  285, 5130},
-        {  376, 6419},{  450, 7545},{  521, 8587},{  600, 9587},
-        {  689,10498},{  790,11348},{  899,12158},{ 1030,12855},
-        { 1166,13459},{ 1276,14052},{ 1353,14732},{ 1444,15425},
-        { 1535,16101},{ 1609,16856},{ 1697,17532},{ 1823,17995},
-        { 1962,18426},{ 2085,18919},{ 2201,19503},{ 2304,20307}
+        {   57, 1550},{  121, 2460},{  185, 3901},{  336, 5189},
+        {  406, 6243},{  501, 7329},{  565, 8292},{  674, 9257},
+        {  746,10219},{  843,11056},{  961,11822},{ 1120,12512},
+        { 1208,13233},{ 1394,13600},{ 1409,14381},{ 1492,15129},
+        { 1593,15804},{ 1639,16573},{ 1731,17161},{ 1844,17707},
+        { 1949,18300},{ 2073,18654},{ 2140,19465},{ 2278,19794}
       },
       /*Y'  qi=0  INTER*/
       {
-        {   32, -105},{   40, 1268},{   54, 2919},{   91, 4559},
-        {  118, 6244},{  132, 7932},{  142, 9514},{  149,10989},
-        {  155,12375},{  161,13679},{  168,14958},{  176,16215},
-        {  187,17431},{  196,18623},{  207,19790},{  218,20941},
-        {  230,22083},{  246,23213},{  265,24333},{  292,25439},
-        {  328,26512},{  372,27538},{  427,28522},{  494,29479}
+        {  -18, 1274},{   23, 2505},{   32, 3612},{   57, 5153},
+        {   79, 6636},{   97, 8082},{  109, 9505},{  122,10924},
+        {  134,12293},{  145,13634},{  158,14942},{  172,16212},
+        {  186,17422},{  198,18604},{  209,19757},{  218,20875},
+        {  235,21980},{  253,23056},{  276,24121},{  305,25184},
+        {  342,26202},{  393,27140},{  439,28140},{  556,28659}
       }
     },
     {
       /*Cb  qi=0  INTRA*/
       {
-        {    1,    6},{   27,  368},{   52,  738},{   67, 1171},
-        {   80, 1642},{   99, 2134},{  110, 2642},{  112, 3144},
-        {  126, 3578},{  154, 3967},{  167, 4387},{  172, 4839},
-        {  191, 5278},{  208, 5666},{  220, 6036},{  223, 6398},
-        {  227, 6814},{  253, 7157},{  284, 7403},{  292, 7699},
-        {  314, 7983},{  339, 8203},{  363, 8460},{  399, 8919}
+        {   32, 1763},{   56, 2150},{   78, 2336},{   88, 2608},
+        {  105, 2975},{  121, 3297},{  113, 3460},{  126, 3993},
+        {  142, 4432},{  177, 4733},{  185, 5058},{  194, 5447},
+        {  220, 5812},{  227, 6202},{  246, 6415},{  269, 6821},
+        {  279, 7026},{  313, 7313},{  321, 7708},{  316, 8021},
+        {  370, 8203},{  389, 8573},{  410, 8607},{  431, 8816}
       },
       /*Cb  qi=0  INTER*/
       {
-        {   68,  -55},{   63,  275},{   58,  602},{   53,  936},
-        {   50, 1290},{   54, 1691},{   58, 2116},{   62, 2553},
-        {   67, 2992},{   72, 3422},{   78, 3843},{   84, 4253},
-        {   89, 4658},{   94, 5062},{   98, 5455},{  100, 5848},
-        {  102, 6231},{  104, 6604},{  104, 6982},{  105, 7359},
-        {  105, 7733},{  104, 8104},{  105, 8465},{  111, 8828}
+        {    3,  282},{    3, 1200},{    3, 1605},{    6, 2190},
+        {   15, 2519},{   18, 2798},{   21, 3115},{   25, 3460},
+        {   33, 3839},{   40, 4217},{   47, 4592},{   51, 4958},
+        {   56, 5326},{   59, 5710},{   63, 6066},{   65, 6412},
+        {   67, 6762},{   68, 7104},{   70, 7461},{   72, 7829},
+        {   77, 8200},{   80, 8566},{   86, 8906},{   90, 9203}
       }
     },
     {
       /*Cr  qi=0  INTRA*/
       {
-        {    1,    8},{   23,  375},{   47,  759},{   63, 1220},
-        {   71, 1693},{   82, 2171},{   94, 2652},{  109, 3103},
-        {  125, 3567},{  133, 3995},{  151, 4375},{  168, 4819},
-        {  174, 5244},{  190, 5635},{  215, 6005},{  242, 6347},
-        {  257, 6758},{  280, 7068},{  311, 7336},{  326, 7652},
-        {  346, 7968},{  372, 8213},{  388, 8515},{  408, 9060}
+        {   27, 1720},{   44, 1920},{   66, 2255},{   73, 2429},
+        {   95, 2988},{  103, 3279},{  123, 3691},{  129, 4012},
+        {  151, 4415},{  150, 4760},{  183, 5008},{  193, 5351},
+        {  211, 5788},{  235, 6134},{  263, 6400},{  276, 6711},
+        {  291, 7100},{  346, 7285},{  329, 7616},{  387, 7827},
+        {  361, 8214},{  430, 8534},{  429, 8608},{  450, 8823}
       },
       /*Cr  qi=0  INTER*/
       {
-        {   69,    0},{   60,  314},{   49,  624},{   45,  943},
-        {   45, 1285},{   49, 1691},{   55, 2130},{   62, 2560},
-        {   71, 2973},{   79, 3385},{   85, 3800},{   89, 4207},
-        {   92, 4620},{   95, 5037},{   96, 5436},{   97, 5839},
-        {   98, 6252},{   99, 6653},{   99, 7038},{  103, 7426},
-        {  107, 7810},{  108, 8178},{  107, 8539},{  106, 8937}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=1  INTRA*/
-      {
-        {   81,  -71},{  133, 1610},{  203, 3460},{  296, 5083},
-        {  392, 6342},{  467, 7454},{  541, 8486},{  625, 9466},
-        {  716,10352},{  823,11181},{  940,11961},{ 1074,12643},
-        { 1211,13233},{ 1324,13807},{ 1408,14489},{ 1504,15167},
-        { 1598,15824},{ 1679,16544},{ 1788,17161},{ 1928,17579},
-        { 2070,17991},{ 2202,18456},{ 2324,19021},{ 2425,19894}
-      },
-      /*Y'  qi=1  INTER*/
-      {
-        {   34,    4},{   40, 1307},{   55, 2914},{   93, 4555},
-        {  120, 6243},{  134, 7912},{  144, 9468},{  152,10918},
-        {  158,12275},{  164,13569},{  171,14846},{  180,16098},
-        {  191,17310},{  204,18484},{  216,19636},{  228,20779},
-        {  242,21912},{  261,23036},{  286,24146},{  320,25221},
-        {  363,26265},{  418,27261},{  485,28203},{  551,29148}
-      }
-    },
-    {
-      /*Cb  qi=1  INTRA*/
-      {
-        {    1,    6},{   28,  367},{   52,  738},{   68, 1172},
-        {   86, 1644},{  106, 2135},{  115, 2642},{  119, 3141},
-        {  132, 3569},{  157, 3951},{  172, 4366},{  177, 4819},
-        {  194, 5258},{  211, 5638},{  224, 6006},{  233, 6367},
-        {  236, 6784},{  258, 7121},{  299, 7357},{  319, 7637},
-        {  337, 7921},{  358, 8141},{  381, 8367},{  401, 8768}
-      },
-      /*Cb  qi=1  INTER*/
-      {
-        {   95,  -31},{   81,  295},{   67,  614},{   53,  953},
-        {   48, 1305},{   51, 1700},{   56, 2125},{   61, 2563},
-        {   67, 3008},{   73, 3435},{   79, 3844},{   85, 4251},
-        {   90, 4663},{   95, 5073},{   98, 5458},{  100, 5844},
-        {  101, 6231},{  102, 6606},{  102, 6980},{  103, 7347},
-        {  104, 7726},{  105, 8096},{  105, 8453},{  105, 8789}
-      }
-    },
-    {
-      /*Cr  qi=1  INTRA*/
-      {
-        {    1,    8},{   25,  375},{   50,  759},{   65, 1221},
-        {   74, 1695},{   86, 2172},{  101, 2651},{  117, 3101},
-        {  129, 3561},{  135, 3985},{  153, 4368},{  171, 4807},
-        {  182, 5223},{  202, 5608},{  225, 5964},{  251, 6300},
-        {  271, 6697},{  295, 6978},{  324, 7235},{  348, 7558},
-        {  367, 7877},{  394, 8101},{  413, 8386},{  409, 8945}
-      },
-      /*Cr  qi=1  INTER*/
-      {
-        {   66,   11},{   59,  323},{   51,  631},{   44,  949},
-        {   44, 1292},{   49, 1703},{   56, 2140},{   62, 2566},
-        {   69, 2991},{   77, 3397},{   84, 3799},{   89, 4211},
-        {   93, 4634},{   94, 5049},{   95, 5444},{   96, 5854},
-        {   94, 6260},{   95, 6640},{   96, 7032},{  101, 7423},
-        {  104, 7790},{  105, 8158},{  109, 8527},{  108, 8872}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=2  INTRA*/
-      {
-        {   87,  -72},{  139, 1607},{  213, 3426},{  315, 4992},
-        {  416, 6217},{  495, 7315},{  574, 8317},{  666, 9265},
-        {  763,10124},{  875,10906},{ 1001,11654},{ 1147,12305},
-        { 1289,12865},{ 1407,13424},{ 1503,14076},{ 1610,14724},
-        { 1720,15342},{ 1815,16020},{ 1937,16579},{ 2084,16981},
-        { 2236,17371},{ 2385,17779},{ 2536,18250},{ 2689,18931}
-      },
-      /*Y'  qi=2  INTER*/
-      {
-        {   30,   -2},{   40, 1308},{   57, 2921},{   96, 4567},
-        {  122, 6260},{  136, 7902},{  148, 9418},{  156,10826},
-        {  162,12157},{  169,13448},{  177,14709},{  188,15938},
-        {  200,17133},{  213,18295},{  228,19433},{  245,20564},
-        {  264,21685},{  289,22790},{  323,23876},{  368,24916},
-        {  427,25906},{  499,26837},{  585,27700},{  680,28514}
-      }
-    },
-    {
-      /*Cb  qi=2  INTRA*/
-      {
-        {    1,    6},{   30,  367},{   58,  738},{   77, 1172},
-        {   93, 1645},{  111, 2137},{  123, 2642},{  126, 3133},
-        {  136, 3553},{  162, 3934},{  178, 4352},{  183, 4803},
-        {  199, 5231},{  220, 5596},{  235, 5957},{  245, 6314},
-        {  256, 6718},{  286, 7048},{  320, 7285},{  336, 7568},
-        {  366, 7829},{  387, 8045},{  405, 8261},{  445, 8550}
-      },
-      /*Cb  qi=2  INTER*/
-      {
-        {  115,  -61},{   93,  277},{   71,  609},{   54,  963},
-        {   49, 1329},{   53, 1715},{   58, 2138},{   63, 2583},
-        {   69, 3017},{   75, 3442},{   81, 3857},{   88, 4263},
-        {   93, 4667},{   96, 5065},{  101, 5451},{  101, 5832},
-        {  102, 6213},{  103, 6593},{  103, 6968},{  104, 7336},
-        {  104, 7710},{  105, 8076},{  106, 8440},{  106, 8822}
-      }
-    },
-    {
-      /*Cr  qi=2  INTRA*/
-      {
-        {    1,    8},{   27,  375},{   54,  759},{   70, 1222},
-        {   79, 1696},{   89, 2173},{  106, 2652},{  123, 3098},
-        {  135, 3553},{  143, 3972},{  161, 4348},{  181, 4782},
-        {  194, 5189},{  213, 5565},{  235, 5907},{  266, 6229},
-        {  286, 6618},{  311, 6897},{  339, 7152},{  362, 7454},
-        {  392, 7721},{  416, 7946},{  429, 8227},{  458, 8540}
-      },
-      /*Cr  qi=2  INTER*/
-      {
-        {   74,   20},{   63,  330},{   51,  635},{   44,  942},
-        {   47, 1287},{   54, 1710},{   59, 2147},{   65, 2571},
-        {   72, 2996},{   79, 3413},{   86, 3820},{   91, 4230},
-        {   93, 4642},{   95, 5046},{   95, 5442},{   95, 5839},
-        {   96, 6243},{   97, 6641},{   99, 7021},{  101, 7396},
-        {  103, 7764},{  106, 8138},{  109, 8507},{  114, 8851}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=3  INTRA*/
-      {
-        {   91,  -67},{  141, 1606},{  219, 3405},{  328, 4929},
-        {  433, 6122},{  515, 7209},{  598, 8204},{  693, 9145},
-        {  796, 9986},{  912,10756},{ 1045,11471},{ 1200,12079},
-        { 1345,12640},{ 1471,13179},{ 1571,13809},{ 1678,14450},
-        { 1798,15047},{ 1905,15701},{ 2043,16205},{ 2202,16569},
-        { 2351,16971},{ 2501,17393},{ 2660,17851},{ 2825,18455}
-      },
-      /*Y'  qi=3  INTER*/
-      {
-        {   53, -164},{   38, 1314},{   59, 2917},{   99, 4563},
-        {  124, 6253},{  139, 7882},{  150, 9375},{  159,10749},
-        {  166,12059},{  173,13349},{  183,14608},{  194,15826},
-        {  208,17003},{  223,18150},{  240,19287},{  259,20411},
-        {  284,21508},{  317,22593},{  359,23656},{  414,24671},
-        {  483,25634},{  569,26519},{  670,27332},{  786,28072}
-      }
-    },
-    {
-      /*Cb  qi=3  INTRA*/
-      {
-        {    1,    5},{   31,  367},{   58,  739},{   78, 1173},
-        {   96, 1645},{  113, 2134},{  125, 2638},{  133, 3127},
-        {  148, 3542},{  171, 3915},{  184, 4328},{  192, 4776},
-        {  209, 5197},{  230, 5556},{  245, 5909},{  252, 6261},
-        {  272, 6641},{  304, 6942},{  330, 7184},{  342, 7477},
-        {  380, 7736},{  404, 7962},{  428, 8151},{  469, 8430}
-      },
-      /*Cb  qi=3  INTER*/
-      {
-        {   86,  -29},{   72,  296},{   58,  618},{   46,  964},
-        {   47, 1338},{   51, 1743},{   56, 2158},{   63, 2594},
-        {   69, 3035},{   77, 3455},{   84, 3859},{   89, 4266},
-        {   94, 4673},{   98, 5074},{  101, 5460},{  101, 5842},
-        {  101, 6217},{  101, 6593},{  102, 6964},{  104, 7325},
-        {  103, 7696},{  103, 8056},{  104, 8430},{  103, 8792}
-      }
-    },
-    {
-      /*Cr  qi=3  INTRA*/
-      {
-        {    1,    8},{   27,  374},{   56,  759},{   74, 1221},
-        {   83, 1696},{   96, 2173},{  113, 2650},{  127, 3091},
-        {  140, 3542},{  151, 3960},{  164, 4334},{  188, 4764},
-        {  208, 5144},{  224, 5493},{  250, 5841},{  278, 6162},
-        {  298, 6548},{  334, 6816},{  365, 7045},{  388, 7343},
-        {  419, 7613},{  443, 7836},{  455, 8105},{  484, 8445}
-      },
-      /*Cr  qi=3  INTER*/
-      {
-        {   76,   26},{   65,  332},{   53,  638},{   45,  945},
-        {   45, 1304},{   53, 1725},{   60, 2153},{   68, 2584},
-        {   74, 3007},{   81, 3425},{   87, 3844},{   91, 4253},
-        {   94, 4657},{   95, 5061},{   94, 5462},{   94, 5856},
-        {   95, 6250},{   96, 6635},{   97, 7014},{  101, 7393},
-        {  104, 7761},{  106, 8137},{  109, 8506},{  111, 8823}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=4  INTRA*/
-      {
-        {   80,  -67},{  143, 1603},{  227, 3378},{  344, 4861},
-        {  454, 6026},{  537, 7104},{  626, 8089},{  725, 9006},
-        {  830, 9827},{  950,10581},{ 1089,11270},{ 1257,11826},
-        { 1409,12366},{ 1535,12912},{ 1640,13528},{ 1753,14173},
-        { 1884,14756},{ 2007,15368},{ 2148,15852},{ 2307,16212},
-        { 2464,16591},{ 2614,17019},{ 2785,17455},{ 2970,17963}
-      },
-      /*Y'  qi=4  INTER*/
-      {
-        {   50, -145},{   38, 1324},{   61, 2921},{  102, 4566},
-        {  127, 6248},{  142, 7845},{  154, 9300},{  163,10656},
-        {  169,11965},{  177,13246},{  188,14495},{  202,15702},
-        {  218,16864},{  236,18003},{  256,19124},{  278,20233},
-        {  307,21330},{  347,22398},{  398,23437},{  463,24429},
-        {  546,25343},{  649,26170},{  767,26935},{  888,27674}
-      }
-    },
-    {
-      /*Cb  qi=4  INTRA*/
-      {
-        {    1,    5},{   33,  367},{   61,  739},{   80, 1173},
-        {   98, 1646},{  114, 2136},{  126, 2639},{  137, 3124},
-        {  152, 3535},{  176, 3903},{  194, 4307},{  206, 4753},
-        {  222, 5165},{  242, 5508},{  260, 5857},{  272, 6205},
-        {  294, 6559},{  332, 6848},{  356, 7104},{  364, 7389},
-        {  396, 7637},{  415, 7878},{  446, 8064},{  506, 8294}
-      },
-      /*Cb  qi=4  INTER*/
-      {
-        {   86,  -15},{   73,  308},{   60,  627},{   46,  967},
-        {   47, 1343},{   51, 1754},{   56, 2183},{   63, 2615},
-        {   70, 3044},{   79, 3459},{   85, 3866},{   90, 4276},
-        {   94, 4686},{   97, 5088},{  100, 5467},{  102, 5837},
-        {  102, 6205},{  101, 6569},{  103, 6939},{  104, 7317},
-        {  105, 7690},{  107, 8043},{  107, 8394},{  111, 8736}
-      }
-    },
-    {
-      /*Cr  qi=4  INTRA*/
-      {
-        {    1,    7},{   28,  375},{   57,  759},{   79, 1221},
-        {   92, 1697},{  105, 2174},{  122, 2648},{  135, 3085},
-        {  146, 3530},{  157, 3947},{  171, 4316},{  195, 4737},
-        {  218, 5117},{  239, 5445},{  268, 5767},{  295, 6074},
-        {  315, 6460},{  355, 6735},{  392, 6933},{  418, 7218},
-        {  448, 7495},{  471, 7688},{  481, 7954},{  504, 8313}
-      },
-      /*Cr  qi=4  INTER*/
-      {
-        {   68,   28},{   57,  334},{   47,  639},{   43,  953},
-        {   48, 1314},{   54, 1736},{   59, 2169},{   69, 2592},
-        {   78, 3017},{   84, 3434},{   88, 3850},{   92, 4260},
-        {   95, 4663},{   96, 5068},{   95, 5455},{   95, 5839},
-        {   96, 6243},{   97, 6626},{   98, 7006},{  101, 7390},
-        {  104, 7755},{  108, 8115},{  111, 8471},{  110, 8825}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=5  INTRA*/
-      {
-        {   84,  -69},{  147, 1599},{  237, 3350},{  360, 4796},
-        {  475, 5934},{  562, 6992},{  657, 7953},{  765, 8837},
-        {  874, 9641},{  998,10384},{ 1146,11047},{ 1322,11572},
-        { 1484,12076},{ 1617,12609},{ 1731,13203},{ 1856,13806},
-        { 1995,14367},{ 2132,14936},{ 2289,15386},{ 2460,15721},
-        { 2635,16066},{ 2802,16442},{ 2980,16805},{ 3177,17272}
-      },
-      /*Y'  qi=5  INTER*/
-      {
-        {   38,  -86},{   37, 1349},{   64, 2920},{  105, 4563},
-        {  129, 6236},{  145, 7809},{  158, 9236},{  167,10572},
-        {  174,11871},{  182,13141},{  195,14368},{  212,15558},
-        {  230,16706},{  250,17828},{  274,18944},{  303,20041},
-        {  342,21116},{  394,22152},{  460,23144},{  543,24073},
-        {  648,24919},{  773,25673},{  922,26323},{ 1084,26924}
-      }
-    },
-    {
-      /*Cb  qi=5  INTRA*/
-      {
-        {    1,    5},{   34,  367},{   63,  739},{   82, 1174},
-        {  102, 1647},{  119, 2137},{  134, 2639},{  145, 3121},
-        {  161, 3529},{  189, 3891},{  207, 4290},{  216, 4721},
-        {  232, 5113},{  258, 5455},{  277, 5798},{  294, 6124},
-        {  322, 6427},{  352, 6697},{  370, 6982},{  384, 7283},
-        {  423, 7529},{  448, 7766},{  478, 7943},{  527, 8151}
-      },
-      /*Cb  qi=5  INTER*/
-      {
-        {   83,  -49},{   69,  284},{   55,  611},{   48,  961},
-        {   49, 1355},{   52, 1769},{   58, 2191},{   65, 2616},
-        {   73, 3041},{   80, 3460},{   87, 3868},{   92, 4276},
-        {   95, 4682},{   98, 5077},{  100, 5459},{  102, 5827},
-        {  102, 6200},{  102, 6568},{  103, 6930},{  103, 7303},
-        {  104, 7672},{  106, 8032},{  106, 8391},{  106, 8727}
-      }
-    },
-    {
-      /*Cr  qi=5  INTRA*/
-      {
-        {    1,    8},{   28,  375},{   57,  760},{   81, 1222},
-        {   99, 1696},{  111, 2175},{  125, 2648},{  140, 3079},
-        {  152, 3520},{  162, 3927},{  179, 4294},{  203, 4714},
-        {  225, 5080},{  254, 5389},{  286, 5703},{  318, 5997},
-        {  342, 6364},{  380, 6640},{  416, 6837},{  445, 7103},
-        {  473, 7370},{  497, 7562},{  514, 7811},{  549, 8148}
-      },
-      /*Cr  qi=5  INTER*/
-      {
-        {   60,    6},{   54,  323},{   46,  638},{   43,  958},
-        {   45, 1329},{   54, 1749},{   61, 2175},{   70, 2600},
-        {   79, 3021},{   85, 3437},{   89, 3847},{   93, 4254},
-        {   95, 4660},{   96, 5065},{   95, 5456},{   95, 5849},
-        {   96, 6243},{   96, 6621},{   97, 6996},{  101, 7366},
-        {  104, 7722},{  107, 8088},{  111, 8448},{  119, 8816}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=6  INTRA*/
-      {
-        {   88,  -69},{  151, 1593},{  251, 3294},{  387, 4681},
-        {  507, 5790},{  601, 6837},{  702, 7787},{  813, 8648},
-        {  927, 9427},{ 1059,10152},{ 1213,10787},{ 1399,11284},
-        { 1568,11781},{ 1705,12312},{ 1823,12890},{ 1957,13482},
-        { 2106,14036},{ 2249,14600},{ 2411,15042},{ 2588,15359},
-        { 2772,15699},{ 2947,16062},{ 3127,16429},{ 3320,16849}
-      },
-      /*Y'  qi=6  INTER*/
-      {
-        {   44,  -80},{   36, 1346},{   69, 2919},{  111, 4563},
-        {  136, 6216},{  154, 7746},{  168, 9139},{  178,10461},
-        {  185,11747},{  195,13007},{  211,14229},{  230,15408},
-        {  250,16547},{  274,17663},{  302,18769},{  339,19851},
-        {  386,20907},{  446,21933},{  527,22884},{  631,23746},
-        {  760,24512},{  914,25178},{ 1087,25758},{ 1278,26262}
-      }
-    },
-    {
-      /*Cb  qi=6  INTRA*/
-      {
-        {    1,    4},{   36,  367},{   66,  739},{   84, 1174},
-        {  105, 1648},{  126, 2139},{  140, 2639},{  149, 3116},
-        {  164, 3523},{  194, 3880},{  217, 4271},{  226, 4694},
-        {  243, 5077},{  270, 5407},{  291, 5742},{  310, 6061},
-        {  340, 6340},{  373, 6609},{  394, 6890},{  409, 7189},
-        {  444, 7434},{  469, 7652},{  499, 7853},{  559, 8135}
-      },
-      /*Cb  qi=6  INTER*/
-      {
-        {   68,  -46},{   60,  291},{   50,  623},{   49,  971},
-        {   50, 1357},{   55, 1781},{   61, 2211},{   69, 2634},
-        {   78, 3052},{   86, 3466},{   91, 3882},{   95, 4292},
-        {   98, 4691},{  101, 5080},{  102, 5458},{  103, 5830},
-        {  103, 6192},{  104, 6554},{  104, 6916},{  106, 7278},
-        {  108, 7641},{  110, 8004},{  112, 8371},{  112, 8758}
-      }
-    },
-    {
-      /*Cr  qi=6  INTRA*/
-      {
-        {    1,    8},{   29,  375},{   59,  760},{   84, 1223},
-        {   99, 1698},{  112, 2176},{  129, 2647},{  143, 3076},
-        {  156, 3510},{  168, 3906},{  189, 4269},{  220, 4682},
-        {  241, 5047},{  266, 5342},{  299, 5649},{  331, 5954},
-        {  357, 6309},{  393, 6579},{  431, 6765},{  467, 6997},
-        {  501, 7276},{  520, 7488},{  525, 7749},{  548, 8146}
-      },
-      /*Cr  qi=6  INTER*/
-      {
-        {   94,   31},{   69,  335},{   47,  641},{   43,  967},
-        {   50, 1350},{   57, 1772},{   65, 2197},{   74, 2625},
-        {   83, 3043},{   90, 3454},{   94, 3867},{   97, 4273},
-        {   98, 4671},{   99, 5068},{   99, 5461},{   98, 5857},
-        {   98, 6245},{   99, 6610},{  103, 6975},{  105, 7345},
-        {  108, 7712},{  111, 8073},{  113, 8415},{  119, 8768}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=7  INTRA*/
-      {
-        {   92,  -70},{  156, 1590},{  261, 3267},{  403, 4618},
-        {  529, 5704},{  628, 6730},{  736, 7657},{  856, 8491},
-        {  978, 9246},{ 1118, 9943},{ 1281,10550},{ 1472,11028},
-        { 1645,11507},{ 1793,12008},{ 1924,12565},{ 2067,13130},
-        { 2229,13638},{ 2388,14160},{ 2558,14584},{ 2744,14886},
-        { 2932,15194},{ 3116,15531},{ 3311,15858},{ 3538,16197}
-      },
-      /*Y'  qi=7  INTER*/
-      {
-        {   43,   -8},{   36, 1351},{   71, 2923},{  112, 4568},
-        {  138, 6201},{  157, 7705},{  171, 9083},{  181,10390},
-        {  189,11664},{  202,12910},{  220,14121},{  241,15281},
-        {  266,16401},{  295,17507},{  328,18608},{  371,19677},
-        {  430,20701},{  508,21676},{  604,22588},{  727,23397},
-        {  878,24093},{ 1055,24690},{ 1263,25151},{ 1496,25504}
-      }
-    },
-    {
-      /*Cb  qi=7  INTRA*/
-      {
-        {    1,    5},{   40,  367},{   72,  740},{   89, 1175},
-        {  108, 1649},{  129, 2140},{  143, 2637},{  154, 3110},
-        {  169, 3507},{  198, 3860},{  224, 4237},{  235, 4652},
-        {  253, 5037},{  282, 5358},{  307, 5674},{  329, 5986},
-        {  361, 6273},{  393, 6527},{  419, 6777},{  435, 7078},
-        {  467, 7342},{  495, 7554},{  529, 7757},{  591, 8053}
-      },
-      /*Cb  qi=7  INTER*/
-      {
-        {   79,  -33},{   68,  299},{   56,  627},{   50,  978},
-        {   51, 1366},{   55, 1786},{   61, 2213},{   70, 2642},
-        {   80, 3062},{   87, 3474},{   92, 3886},{   96, 4292},
-        {   99, 4684},{  102, 5072},{  103, 5450},{  104, 5814},
-        {  104, 6176},{  104, 6538},{  107, 6905},{  110, 7270},
-        {  110, 7625},{  110, 7978},{  111, 8340},{  117, 8674}
-      }
-    },
-    {
-      /*Cr  qi=7  INTRA*/
-      {
-        {    2,    7},{   31,  375},{   62,  760},{   87, 1223},
-        {  103, 1698},{  115, 2175},{  131, 2644},{  147, 3066},
-        {  161, 3494},{  175, 3889},{  199, 4250},{  229, 4653},
-        {  250, 5001},{  279, 5275},{  311, 5577},{  343, 5889},
-        {  376, 6227},{  417, 6486},{  457, 6689},{  484, 6925},
-        {  518, 7174},{  544, 7393},{  549, 7662},{  577, 8050}
-      },
-      /*Cr  qi=7  INTER*/
-      {
-        {   89,   22},{   62,  332},{   45,  641},{   47,  976},
-        {   52, 1363},{   59, 1779},{   67, 2203},{   76, 2628},
-        {   84, 3046},{   90, 3460},{   94, 3875},{   98, 4272},
-        {   99, 4666},{   98, 5063},{   98, 5459},{   98, 5849},
-        {   99, 6226},{  101, 6594},{  104, 6957},{  109, 7324},
-        {  109, 7686},{  111, 8042},{  115, 8379},{  119, 8699}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=8  INTRA*/
-      {
-        {   91,  -69},{  160, 1585},{  274, 3226},{  423, 4538},
-        {  557, 5596},{  664, 6595},{  778, 7506},{  905, 8319},
-        { 1038, 9035},{ 1186, 9701},{ 1355,10292},{ 1554,10754},
-        { 1739,11196},{ 1904,11639},{ 2047,12184},{ 2194,12763},
-        { 2361,13256},{ 2529,13753},{ 2709,14155},{ 2902,14433},
-        { 3100,14723},{ 3292,15026},{ 3489,15327},{ 3714,15705}
-      },
-      /*Y'  qi=8  INTER*/
-      {
-        {   32, -157},{   33, 1346},{   74, 2914},{  116, 4554},
-        {  142, 6172},{  162, 7648},{  177, 9004},{  186,10300},
-        {  196,11570},{  210,12808},{  231,14001},{  256,15150},
-        {  285,16259},{  319,17352},{  359,18435},{  415,19475},
-        {  489,20470},{  584,21400},{  703,22246},{  852,22968},
-        { 1038,23556},{ 1253,24032},{ 1503,24367},{ 1778,24628}
-      }
-    },
-    {
-      /*Cb  qi=8  INTRA*/
-      {
-        {    1,    4},{   42,  367},{   75,  740},{   93, 1176},
-        {  111, 1649},{  128, 2139},{  144, 2635},{  157, 3103},
-        {  174, 3494},{  206, 3844},{  233, 4207},{  251, 4605},
-        {  277, 4980},{  304, 5284},{  335, 5584},{  359, 5888},
-        {  393, 6152},{  432, 6398},{  455, 6656},{  471, 6956},
-        {  502, 7193},{  528, 7405},{  562, 7630},{  603, 7922}
-      },
-      /*Cb  qi=8  INTER*/
-      {
-        {   77,  -37},{   68,  299},{   58,  632},{   50,  991},
-        {   50, 1382},{   55, 1799},{   62, 2226},{   73, 2647},
-        {   82, 3066},{   90, 3480},{   94, 3891},{   96, 4296},
-        {   98, 4687},{  101, 5073},{  103, 5456},{  104, 5817},
-        {  105, 6170},{  106, 6523},{  107, 6886},{  108, 7250},
-        {  109, 7600},{  110, 7955},{  111, 8305},{  112, 8641}
-      }
-    },
-    {
-      /*Cr  qi=8  INTRA*/
-      {
-        {    2,    7},{   33,  375},{   64,  760},{   92, 1224},
-        {  111, 1700},{  122, 2173},{  137, 2637},{  156, 3055},
-        {  172, 3476},{  186, 3856},{  211, 4211},{  242, 4597},
-        {  263, 4939},{  292, 5214},{  335, 5489},{  376, 5772},
-        {  406, 6099},{  440, 6378},{  483, 6578},{  517, 6797},
-        {  550, 7049},{  571, 7283},{  583, 7560},{  618, 7967}
-      },
-      /*Cr  qi=8  INTER*/
-      {
-        {   74,   25},{   58,  328},{   43,  637},{   45,  980},
-        {   51, 1371},{   59, 1788},{   69, 2207},{   79, 2630},
-        {   86, 3051},{   91, 3470},{   95, 3880},{   97, 4280},
-        {   98, 4680},{   97, 5074},{   96, 5456},{   97, 5839},
-        {   99, 6219},{  101, 6583},{  103, 6945},{  106, 7312},
-        {  110, 7671},{  114, 8009},{  115, 8345},{  117, 8686}
+        {    4,  439},{    2, 1131},{    3, 1593},{    6, 2130},
+        {   14, 2535},{   17, 2786},{   21, 3128},{   27, 3494},
+        {   35, 3875},{   42, 4256},{   48, 4637},{   53, 5019},
+        {   57, 5395},{   61, 5777},{   64, 6156},{   66, 6512},
+        {   68, 6853},{   71, 7183},{   77, 7511},{   81, 7841},
+        {   83, 8192},{   88, 8510},{   93, 8834},{   98, 9138}
       }
     }
   },
@@ -616,557 +93,61 @@ oc_mode_rd OC_MODE_RD[64][3][2][OC_SAD_BINS]={
     {
       /*Y'  qi=9  INTRA*/
       {
-        {  104,  -68},{  164, 1580},{  288, 3173},{  448, 4439},
-        {  587, 5485},{  702, 6465},{  824, 7351},{  958, 8148},
-        { 1096, 8845},{ 1253, 9480},{ 1432,10047},{ 1640,10494},
-        { 1835,10926},{ 2015,11350},{ 2166,11871},{ 2321,12428},
-        { 2508,12876},{ 2684,13345},{ 2866,13741},{ 3069,13991},
-        { 3281,14243},{ 3487,14518},{ 3689,14813},{ 3911,15175}
+        {   76,  777},{  178, 1995},{  340, 3162},{  591, 4097},
+        {  746, 4973},{  916, 5847},{ 1047, 6687},{ 1218, 7430},
+        { 1385, 8079},{ 1566, 8685},{ 1755, 9167},{ 1992, 9572},
+        { 2164,10023},{ 2395,10270},{ 2536,10755},{ 2694,11285},
+        { 2895,11580},{ 3029,12143},{ 3182,12543},{ 3377,12800},
+        { 3525,13228},{ 3718,13463},{ 3878,13852},{ 4077,14001}
       },
       /*Y'  qi=9  INTER*/
       {
-        {   47, -140},{   34, 1348},{   77, 2915},{  119, 4552},
-        {  145, 6150},{  166, 7600},{  182, 8936},{  192,10221},
-        {  203,11482},{  220,12711},{  244,13886},{  274,15012},
-        {  308,16111},{  349,17190},{  401,18244},{  470,19257},
-        {  561,20209},{  680,21069},{  830,21822},{ 1010,22463},
-        { 1227,22971},{ 1482,23328},{ 1769,23544},{ 2077,23655}
+        {   10,  770},{   45, 1845},{   59, 3227},{   99, 4708},
+        {  135, 6092},{  164, 7425},{  190, 8729},{  218, 9991},
+        {  246,11234},{  281,12427},{  315,13573},{  354,14678},
+        {  402,15734},{  467,16728},{  543,17709},{  639,18610},
+        {  736,19503},{  855,20312},{  995,21033},{ 1151,21656},
+        { 1341,22130},{ 1525,22582},{ 1735,22922},{ 1922,23102}
       }
     },
     {
       /*Cb  qi=9  INTRA*/
       {
-        {    1,    5},{   43,  367},{   76,  740},{   95, 1176},
-        {  114, 1649},{  135, 2138},{  153, 2629},{  165, 3091},
-        {  184, 3481},{  217, 3831},{  244, 4187},{  260, 4572},
-        {  290, 4930},{  320, 5231},{  351, 5521},{  379, 5812},
-        {  414, 6055},{  452, 6307},{  483, 6564},{  502, 6848},
-        {  525, 7115},{  554, 7321},{  589, 7533},{  626, 7833}
+        {   41, 1227},{   70, 1452},{  102, 1697},{  110, 1967},
+        {  134, 2326},{  153, 2695},{  160, 3007},{  196, 3393},
+        {  232, 3769},{  266, 4067},{  297, 4376},{  326, 4728},
+        {  351, 5040},{  390, 5299},{  398, 5538},{  443, 5900},
+        {  448, 6107},{  506, 6370},{  519, 6636},{  525, 6953},
+        {  567, 7177},{  625, 7386},{  622, 7613},{  654, 7764}
       },
       /*Cb  qi=9  INTER*/
       {
-        {  101,  -43},{   81,  298},{   62,  637},{   49,  989},
-        {   51, 1381},{   56, 1806},{   65, 2231},{   74, 2653},
-        {   84, 3071},{   91, 3482},{   95, 3892},{   97, 4293},
-        {   99, 4684},{  101, 5066},{  103, 5437},{  103, 5793},
-        {  103, 6148},{  104, 6511},{  105, 6867},{  107, 7221},
-        {  110, 7572},{  111, 7926},{  112, 8283},{  116, 8625}
+        {    7,  377},{    2, 1102},{    7, 1262},{   19, 1693},
+        {   22, 1957},{   27, 2302},{   35, 2654},{   43, 3034},
+        {   52, 3431},{   58, 3826},{   63, 4207},{   67, 4570},
+        {   71, 4927},{   75, 5283},{   79, 5624},{   82, 5944},
+        {   85, 6279},{   88, 6616},{   94, 6955},{  102, 7284},
+        {  108, 7622},{  116, 7944},{  124, 8293},{  133, 8568}
       }
     },
     {
       /*Cr  qi=9  INTRA*/
       {
-        {    2,    7},{   35,  375},{   66,  761},{   93, 1224},
-        {  112, 1700},{  126, 2173},{  144, 2633},{  165, 3047},
-        {  183, 3458},{  199, 3835},{  224, 4191},{  257, 4558},
-        {  283, 4887},{  309, 5176},{  351, 5446},{  397, 5713},
-        {  433, 6017},{  469, 6283},{  508, 6480},{  546, 6687},
-        {  579, 6945},{  600, 7182},{  610, 7434},{  623, 7793}
+        {   38, 1217},{   61, 1473},{   88, 1650},{  100, 1908},
+        {  137, 2400},{  147, 2777},{  176, 3149},{  205, 3433},
+        {  227, 3772},{  249, 4092},{  286, 4370},{  313, 4746},
+        {  342, 5053},{  368, 5261},{  411, 5530},{  442, 5859},
+        {  494, 6061},{  526, 6340},{  532, 6646},{  580, 6799},
+        {  567, 7203},{  649, 7357},{  625, 7559},{  660, 7709}
       },
       /*Cr  qi=9  INTER*/
       {
-        {   77,   15},{   57,  330},{   45,  640},{   48,  980},
-        {   54, 1380},{   61, 1802},{   70, 2220},{   80, 2639},
-        {   87, 3057},{   92, 3474},{   94, 3882},{   98, 4282},
-        {   98, 4675},{   97, 5062},{   97, 5450},{   98, 5829},
-        {  100, 6197},{  101, 6561},{  104, 6927},{  107, 7289},
-        {  113, 7638},{  117, 7978},{  119, 8311},{  117, 8629}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=10  INTRA*/
-      {
-        {  101,  -69},{  168, 1574},{  299, 3143},{  465, 4386},
-        {  610, 5410},{  736, 6353},{  866, 7207},{ 1006, 7982},
-        { 1153, 8655},{ 1319, 9261},{ 1504, 9812},{ 1719,10248},
-        { 1928,10653},{ 2116,11056},{ 2282,11550},{ 2458,12070},
-        { 2654,12492},{ 2846,12923},{ 3043,13291},{ 3249,13537},
-        { 3466,13764},{ 3682,13999},{ 3896,14268},{ 4145,14548}
-      },
-      /*Y'  qi=10  INTER*/
-      {
-        {   48,  -94},{   34, 1355},{   81, 2920},{  124, 4545},
-        {  151, 6113},{  174, 7532},{  190, 8850},{  201,10125},
-        {  214,11379},{  235,12591},{  264,13745},{  299,14859},
-        {  338,15948},{  388,17008},{  456,18029},{  546,18988},
-        {  661,19877},{  808,20666},{  993,21321},{ 1218,21835},
-        { 1481,22203},{ 1783,22420},{ 2117,22504},{ 2469,22481}
-      }
-    },
-    {
-      /*Cb  qi=10  INTRA*/
-      {
-        {    2,    4},{   44,  367},{   79,  740},{   99, 1178},
-        {  117, 1652},{  137, 2141},{  156, 2630},{  170, 3089},
-        {  192, 3474},{  227, 3813},{  259, 4157},{  282, 4526},
-        {  310, 4860},{  342, 5140},{  377, 5425},{  400, 5714},
-        {  436, 5952},{  475, 6194},{  496, 6468},{  522, 6748},
-        {  559, 6996},{  587, 7216},{  617, 7433},{  673, 7678}
-      },
-      /*Cb  qi=10  INTER*/
-      {
-        {   87,  -37},{   72,  301},{   58,  636},{   49,  995},
-        {   51, 1394},{   57, 1819},{   66, 2241},{   78, 2660},
-        {   87, 3074},{   93, 3482},{   97, 3891},{   99, 4294},
-        {  101, 4678},{  103, 5050},{  105, 5414},{  106, 5773},
-        {  107, 6134},{  108, 6485},{  110, 6832},{  113, 7187},
-        {  113, 7547},{  114, 7887},{  117, 8230},{  112, 8590}
-      }
-    },
-    {
-      /*Cr  qi=10  INTRA*/
-      {
-        {    2,    7},{   38,  375},{   69,  761},{   96, 1224},
-        {  116, 1701},{  131, 2175},{  148, 2634},{  168, 3041},
-        {  190, 3439},{  211, 3802},{  238, 4151},{  271, 4506},
-        {  297, 4824},{  331, 5103},{  373, 5360},{  415, 5632},
-        {  459, 5928},{  500, 6176},{  535, 6386},{  573, 6586},
-        {  608, 6834},{  629, 7079},{  642, 7337},{  686, 7680}
-      },
-      /*Cr  qi=10  INTER*/
-      {
-        {   81,   34},{   63,  333},{   50,  633},{   48,  987},
-        {   53, 1397},{   61, 1820},{   71, 2237},{   83, 2651},
-        {   91, 3065},{   95, 3479},{   98, 3882},{  100, 4279},
-        {  101, 4673},{  101, 5054},{  100, 5429},{  101, 5801},
-        {  102, 6173},{  104, 6541},{  108, 6904},{  110, 7264},
-        {  114, 7609},{  119, 7945},{  123, 8275},{  128, 8615}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=11  INTRA*/
-      {
-        {  110,  -66},{  176, 1564},{  316, 3087},{  492, 4296},
-        {  645, 5299},{  781, 6217},{  924, 7039},{ 1075, 7776},
-        { 1232, 8421},{ 1410, 9005},{ 1607, 9532},{ 1834, 9929},
-        { 2053,10300},{ 2249,10697},{ 2427,11184},{ 2619,11682},
-        { 2826,12083},{ 3019,12508},{ 3225,12869},{ 3452,13064},
-        { 3670,13280},{ 3890,13519},{ 4123,13750},{ 4367,14059}
-      },
-      /*Y'  qi=11  INTER*/
-      {
-        {   72, -115},{   32, 1354},{   83, 2911},{  126, 4534},
-        {  154, 6080},{  178, 7475},{  194, 8779},{  205,10047},
-        {  222,11290},{  246,12488},{  281,13621},{  322,14714},
-        {  372,15786},{  436,16821},{  519,17813},{  628,18728},
-        {  770,19549},{  950,20254},{ 1175,20800},{ 1443,21197},
-        { 1752,21446},{ 2095,21555},{ 2457,21553},{ 2808,21544}
-      }
-    },
-    {
-      /*Cb  qi=11  INTRA*/
-      {
-        {    2,    4},{   45,  367},{   81,  740},{  101, 1177},
-        {  121, 1650},{  142, 2136},{  159, 2621},{  174, 3075},
-        {  199, 3451},{  234, 3778},{  265, 4117},{  297, 4473},
-        {  333, 4789},{  367, 5054},{  402, 5319},{  427, 5613},
-        {  462, 5871},{  503, 6107},{  532, 6336},{  560, 6584},
-        {  601, 6842},{  631, 7092},{  662, 7292},{  721, 7497}
-      },
-      /*Cb  qi=11  INTER*/
-      {
-        {  117,  -24},{   93,  308},{   69,  638},{   52,  993},
-        {   52, 1395},{   58, 1822},{   68, 2246},{   80, 2665},
-        {   89, 3082},{   94, 3492},{   96, 3900},{   98, 4299},
-        {  101, 4679},{  103, 5047},{  104, 5405},{  106, 5763},
-        {  106, 6120},{  107, 6474},{  109, 6823},{  112, 7163},
-        {  115, 7516},{  117, 7868},{  118, 8213},{  119, 8561}
-      }
-    },
-    {
-      /*Cr  qi=11  INTRA*/
-      {
-        {    2,    7},{   40,  375},{   75,  761},{  100, 1224},
-        {  119, 1700},{  137, 2169},{  154, 2622},{  178, 3025},
-        {  198, 3416},{  220, 3770},{  255, 4114},{  294, 4459},
-        {  323, 4756},{  359, 5028},{  399, 5292},{  438, 5556},
-        {  483, 5827},{  518, 6073},{  551, 6298},{  598, 6501},
-        {  634, 6754},{  652, 6997},{  670, 7211},{  689, 7560}
-      },
-      /*Cr  qi=11  INTER*/
-      {
-        {   75,   30},{   61,  334},{   51,  639},{   49,  995},
-        {   53, 1403},{   62, 1821},{   73, 2237},{   84, 2654},
-        {   91, 3070},{   95, 3485},{   96, 3890},{   98, 4287},
-        {   98, 4672},{   99, 5050},{   99, 5427},{  100, 5798},
-        {  103, 6169},{  105, 6528},{  107, 6881},{  113, 7233},
-        {  118, 7580},{  121, 7916},{  125, 8240},{  130, 8551}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=12  INTRA*/
-      {
-        {  104,  -69},{  182, 1557},{  335, 3040},{  521, 4205},
-        {  684, 5178},{  831, 6068},{  986, 6854},{ 1151, 7559},
-        { 1323, 8169},{ 1523, 8704},{ 1736, 9192},{ 1978, 9558},
-        { 2213, 9908},{ 2421,10298},{ 2613,10757},{ 2822,11208},
-        { 3042,11585},{ 3250,11991},{ 3474,12308},{ 3710,12480},
-        { 3939,12687},{ 4174,12902},{ 4416,13102},{ 4672,13369}
-      },
-      /*Y'  qi=12  INTER*/
-      {
-        {   52,  -91},{   34, 1355},{   86, 2911},{  129, 4518},
-        {  159, 6037},{  184, 7405},{  200, 8694},{  213, 9955},
-        {  232,11185},{  263,12360},{  304,13479},{  354,14555},
-        {  415,15601},{  495,16608},{  601,17549},{  738,18400},
-        {  915,19136},{ 1139,19724},{ 1414,20150},{ 1731,20412},
-        { 2090,20520},{ 2473,20509},{ 2851,20442},{ 3227,20328}
-      }
-    },
-    {
-      /*Cb  qi=12  INTRA*/
-      {
-        {    1,    4},{   46,  367},{   85,  740},{  109, 1178},
-        {  126, 1650},{  145, 2134},{  165, 2617},{  182, 3061},
-        {  209, 3428},{  245, 3749},{  281, 4077},{  316, 4417},
-        {  354, 4718},{  392, 4970},{  430, 5217},{  456, 5501},
-        {  490, 5771},{  534, 5996},{  571, 6207},{  600, 6458},
-        {  644, 6697},{  675, 6942},{  707, 7151},{  766, 7342}
-      },
-      /*Cb  qi=12  INTER*/
-      {
-        {   84,  -24},{   73,  311},{   60,  644},{   52,  998},
-        {   53, 1398},{   60, 1825},{   71, 2249},{   83, 2665},
-        {   90, 3081},{   94, 3490},{   97, 3893},{   99, 4286},
-        {  102, 4663},{  104, 5032},{  105, 5393},{  106, 5751},
-        {  107, 6102},{  108, 6445},{  111, 6788},{  113, 7136},
-        {  114, 7483},{  117, 7828},{  121, 8163},{  122, 8496}
-      }
-    },
-    {
-      /*Cr  qi=12  INTRA*/
-      {
-        {    3,    7},{   41,  375},{   78,  761},{  106, 1225},
-        {  124, 1700},{  140, 2167},{  163, 2616},{  188, 3010},
-        {  213, 3385},{  240, 3718},{  271, 4062},{  309, 4406},
-        {  345, 4691},{  387, 4956},{  430, 5212},{  469, 5467},
-        {  513, 5729},{  554, 5970},{  587, 6176},{  633, 6395},
-        {  673, 6659},{  692, 6868},{  712, 7061},{  758, 7259}
-      },
-      /*Cr  qi=12  INTER*/
-      {
-        {   73,   31},{   59,  335},{   48,  638},{   50,  998},
-        {   56, 1410},{   65, 1827},{   75, 2240},{   85, 2657},
-        {   92, 3073},{   95, 3485},{   97, 3888},{   99, 4279},
-        {   98, 4663},{   99, 5042},{  101, 5412},{  102, 5779},
-        {  105, 6142},{  107, 6498},{  108, 6848},{  113, 7198},
-        {  118, 7540},{  121, 7867},{  127, 8188},{  132, 8508}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=13  INTRA*/
-      {
-        {  109,  -68},{  187, 1551},{  347, 3010},{  541, 4153},
-        {  709, 5107},{  864, 5975},{ 1026, 6745},{ 1194, 7433},
-        { 1375, 8021},{ 1581, 8550},{ 1803, 9026},{ 2054, 9371},
-        { 2301, 9713},{ 2522,10082},{ 2728,10515},{ 2949,10956},
-        { 3184,11297},{ 3408,11653},{ 3643,11946},{ 3886,12100},
-        { 4124,12277},{ 4377,12459},{ 4632,12635},{ 4898,12861}
-      },
-      /*Y'  qi=13  INTER*/
-      {
-        {   48,  -78},{   35, 1357},{   89, 2914},{  133, 4512},
-        {  164, 6004},{  190, 7348},{  207, 8627},{  222, 9881},
-        {  247,11096},{  284,12251},{  333,13350},{  392,14407},
-        {  466,15426},{  565,16391},{  696,17279},{  865,18058},
-        { 1085,18689},{ 1358,19156},{ 1684,19456},{ 2050,19605},
-        { 2447,19614},{ 2855,19524},{ 3243,19398},{ 3611,19201}
-      }
-    },
-    {
-      /*Cb  qi=13  INTRA*/
-      {
-        {    2,    4},{   47,  367},{   86,  741},{  108, 1179},
-        {  127, 1651},{  150, 2133},{  173, 2611},{  194, 3050},
-        {  222, 3417},{  262, 3733},{  303, 4048},{  337, 4375},
-        {  378, 4657},{  420, 4897},{  456, 5148},{  486, 5422},
-        {  518, 5682},{  558, 5903},{  592, 6113},{  623, 6372},
-        {  662, 6628},{  700, 6833},{  751, 6989},{  805, 7147}
-      },
-      /*Cb  qi=13  INTER*/
-      {
-        {   94,  -34},{   78,  303},{   60,  638},{   51,  994},
-        {   54, 1406},{   61, 1836},{   73, 2253},{   84, 2668},
-        {   92, 3082},{   96, 3492},{   99, 3894},{  101, 4284},
-        {  103, 4659},{  105, 5023},{  106, 5376},{  108, 5726},
-        {  109, 6070},{  110, 6418},{  113, 6765},{  117, 7105},
-        {  119, 7448},{  122, 7784},{  126, 8119},{  131, 8463}
-      }
-    },
-    {
-      /*Cr  qi=13  INTRA*/
-      {
-        {    3,    7},{   43,  375},{   80,  762},{  110, 1226},
-        {  131, 1701},{  149, 2166},{  172, 2610},{  196, 2999},
-        {  221, 3359},{  254, 3679},{  292, 4005},{  332, 4329},
-        {  369, 4612},{  408, 4880},{  456, 5139},{  500, 5388},
-        {  544, 5631},{  581, 5877},{  615, 6101},{  660, 6316},
-        {  692, 6594},{  714, 6795},{  736, 6997},{  789, 7290}
-      },
-      /*Cr  qi=13  INTER*/
-      {
-        {   73,   28},{   61,  336},{   46,  642},{   50, 1003},
-        {   58, 1414},{   67, 1832},{   79, 2245},{   87, 2660},
-        {   93, 3075},{   97, 3484},{   99, 3888},{  100, 4277},
-        {  100, 4651},{  100, 5027},{  101, 5403},{  102, 5765},
-        {  105, 6116},{  109, 6470},{  113, 6825},{  119, 7163},
-        {  124, 7497},{  127, 7827},{  131, 8137},{  135, 8437}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=14  INTRA*/
-      {
-        {  113,  -68},{  191, 1545},{  358, 2981},{  559, 4104},
-        {  733, 5044},{  896, 5890},{ 1066, 6636},{ 1241, 7304},
-        { 1428, 7886},{ 1642, 8402},{ 1872, 8871},{ 2128, 9219},
-        { 2380, 9547},{ 2609, 9908},{ 2825,10321},{ 3055,10728},
-        { 3294,11076},{ 3523,11425},{ 3766,11689},{ 4013,11845},
-        { 4254,12022},{ 4506,12209},{ 4759,12383},{ 5013,12637}
-      },
-      /*Y'  qi=14  INTER*/
-      {
-        {   58,  -82},{   38, 1362},{   93, 2914},{  138, 4492},
-        {  171, 5962},{  198, 7289},{  216, 8559},{  234, 9804},
-        {  263,11005},{  306,12143},{  363,13222},{  434,14259},
-        {  523,15255},{  639,16188},{  794,17021},{ 1000,17717},
-        { 1262,18260},{ 1575,18645},{ 1943,18841},{ 2356,18872},
-        { 2782,18802},{ 3194,18682},{ 3576,18559},{ 3923,18447}
-      }
-    },
-    {
-      /*Cb  qi=14  INTRA*/
-      {
-        {    2,    3},{   50,  367},{   91,  741},{  114, 1180},
-        {  134, 1651},{  157, 2131},{  181, 2601},{  208, 3028},
-        {  239, 3391},{  279, 3706},{  322, 4000},{  361, 4309},
-        {  406, 4587},{  445, 4822},{  482, 5067},{  515, 5344},
-        {  546, 5612},{  589, 5821},{  626, 6020},{  655, 6276},
-        {  701, 6523},{  748, 6717},{  796, 6876},{  815, 7151}
-      },
-      /*Cb  qi=14  INTER*/
-      {
-        {   80,  -43},{   68,  301},{   56,  644},{   50, 1004},
-        {   54, 1412},{   63, 1836},{   75, 2253},{   87, 2670},
-        {   94, 3083},{   98, 3487},{  101, 3885},{  103, 4271},
-        {  106, 4645},{  107, 5004},{  108, 5358},{  109, 5705},
-        {  112, 6047},{  115, 6388},{  118, 6731},{  121, 7081},
-        {  126, 7421},{  129, 7747},{  132, 8076},{  137, 8419}
-      }
-    },
-    {
-      /*Cr  qi=14  INTRA*/
-      {
-        {    3,    6},{   45,  375},{   85,  762},{  116, 1226},
-        {  138, 1700},{  158, 2163},{  180, 2602},{  206, 2985},
-        {  236, 3333},{  270, 3639},{  310, 3956},{  359, 4258},
-        {  397, 4524},{  430, 4802},{  478, 5068},{  527, 5316},
-        {  572, 5560},{  613, 5802},{  654, 6012},{  699, 6216},
-        {  734, 6489},{  755, 6707},{  775, 6898},{  841, 7111}
-      },
-      /*Cr  qi=14  INTER*/
-      {
-        {   78,    0},{   59,  322},{   46,  649},{   51, 1016},
-        {   58, 1422},{   68, 1839},{   81, 2253},{   90, 2666},
-        {   95, 3080},{   98, 3486},{  101, 3881},{  102, 4268},
-        {  102, 4644},{  103, 5017},{  105, 5382},{  106, 5743},
-        {  108, 6093},{  112, 6442},{  118, 6791},{  124, 7130},
-        {  127, 7463},{  133, 7784},{  138, 8085},{  142, 8395}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=15  INTRA*/
-      {
-        {  111,  -66},{  197, 1538},{  370, 2949},{  579, 4050},
-        {  762, 4968},{  933, 5798},{ 1112, 6520},{ 1299, 7161},
-        { 1497, 7725},{ 1723, 8219},{ 1967, 8654},{ 2234, 8990},
-        { 2499, 9302},{ 2740, 9637},{ 2968,10039},{ 3215,10414},
-        { 3473,10709},{ 3721,11015},{ 3971,11270},{ 4228,11402},
-        { 4487,11543},{ 4752,11707},{ 5011,11871},{ 5290,12099}
-      },
-      /*Y'  qi=15  INTER*/
-      {
-        {   59, -113},{   37, 1349},{   95, 2904},{  139, 4478},
-        {  174, 5929},{  201, 7244},{  220, 8505},{  241, 9736},
-        {  275,10922},{  327,12040},{  395,13097},{  477,14114},
-        {  585,15071},{  730,15947},{  917,16714},{ 1162,17326},
-        { 1468,17770},{ 1833,18029},{ 2251,18111},{ 2694,18068},
-        { 3125,17968},{ 3529,17845},{ 3908,17713},{ 4260,17587}
-      }
-    },
-    {
-      /*Cb  qi=15  INTRA*/
-      {
-        {    2,    3},{   51,  367},{   94,  741},{  120, 1180},
-        {  140, 1651},{  160, 2129},{  184, 2591},{  213, 3010},
-        {  246, 3371},{  289, 3680},{  335, 3969},{  374, 4274},
-        {  418, 4546},{  460, 4783},{  498, 5019},{  532, 5280},
-        {  565, 5553},{  608, 5765},{  647, 5958},{  683, 6193},
-        {  732, 6433},{  782, 6620},{  832, 6769},{  848, 7027}
-      },
-      /*Cb  qi=15  INTER*/
-      {
-        {   71,  -52},{   63,  296},{   54,  644},{   50, 1010},
-        {   53, 1417},{   64, 1837},{   77, 2253},{   88, 2666},
-        {   95, 3079},{   98, 3487},{  100, 3882},{  103, 4264},
-        {  106, 4633},{  108, 4991},{  109, 5343},{  109, 5693},
-        {  112, 6038},{  114, 6371},{  119, 6709},{  123, 7051},
-        {  125, 7385},{  130, 7716},{  135, 8050},{  140, 8374}
-      }
-    },
-    {
-      /*Cr  qi=15  INTRA*/
-      {
-        {    2,    6},{   47,  375},{   87,  763},{  119, 1225},
-        {  143, 1699},{  162, 2158},{  185, 2595},{  213, 2971},
-        {  246, 3315},{  279, 3618},{  320, 3920},{  372, 4210},
-        {  409, 4480},{  446, 4756},{  496, 5017},{  542, 5263},
-        {  590, 5487},{  639, 5721},{  687, 5923},{  724, 6132},
-        {  753, 6417},{  781, 6622},{  805, 6806},{  856, 6977}
-      },
-      /*Cr  qi=15  INTER*/
-      {
-        {   71,    3},{   61,  326},{   52,  651},{   50, 1017},
-        {   58, 1422},{   69, 1837},{   82, 2251},{   90, 2668},
-        {   95, 3080},{   98, 3484},{  101, 3877},{  102, 4257},
-        {  102, 4632},{  101, 5005},{  103, 5370},{  106, 5733},
-        {  110, 6082},{  116, 6424},{  120, 6774},{  124, 7106},
-        {  130, 7427},{  135, 7748},{  141, 8052},{  147, 8333}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=16  INTRA*/
-      {
-        {  114,  -63},{  206, 1525},{  396, 2887},{  618, 3945},
-        {  816, 4832},{ 1002, 5626},{ 1196, 6319},{ 1401, 6923},
-        { 1616, 7458},{ 1857, 7928},{ 2121, 8334},{ 2405, 8645},
-        { 2685, 8934},{ 2938, 9255},{ 3175, 9638},{ 3433, 9990},
-        { 3707,10263},{ 3958,10577},{ 4218,10807},{ 4488,10906},
-        { 4760,11028},{ 5037,11148},{ 5306,11286},{ 5625,11463}
-      },
-      /*Y'  qi=16  INTER*/
-      {
-        {   69, -153},{   39, 1348},{   98, 2894},{  144, 4448},
-        {  181, 5872},{  209, 7167},{  228, 8422},{  254, 9644},
-        {  297,10810},{  359,11908},{  438,12944},{  539,13930},
-        {  672,14842},{  850,15650},{ 1085,16318},{ 1391,16793},
-        { 1769,17082},{ 2200,17198},{ 2659,17174},{ 3116,17072},
-        { 3547,16948},{ 3943,16819},{ 4299,16701},{ 4611,16644}
-      }
-    },
-    {
-      /*Cb  qi=16  INTRA*/
-      {
-        {    3,    4},{   54,  367},{   97,  742},{  122, 1181},
-        {  143, 1651},{  168, 2123},{  197, 2575},{  226, 2985},
-        {  263, 3338},{  314, 3631},{  367, 3903},{  409, 4200},
-        {  453, 4468},{  491, 4703},{  528, 4932},{  566, 5188},
-        {  601, 5459},{  647, 5672},{  693, 5844},{  734, 6058},
-        {  784, 6305},{  836, 6460},{  882, 6602},{  905, 6891}
-      },
-      /*Cb  qi=16  INTER*/
-      {
-        {   75,  -64},{   67,  292},{   56,  645},{   51, 1016},
-        {   54, 1421},{   66, 1842},{   79, 2257},{   89, 2670},
-        {   95, 3082},{   98, 3488},{  101, 3879},{  104, 4258},
-        {  106, 4623},{  108, 4974},{  109, 5321},{  113, 5664},
-        {  116, 6001},{  117, 6341},{  123, 6677},{  128, 7004},
-        {  130, 7336},{  136, 7671},{  143, 7996},{  148, 8310}
-      }
-    },
-    {
-      /*Cr  qi=16  INTRA*/
-      {
-        {    4,    7},{   50,  375},{   90,  763},{  124, 1225},
-        {  148, 1698},{  168, 2154},{  195, 2582},{  227, 2948},
-        {  263, 3279},{  302, 3575},{  343, 3865},{  394, 4137},
-        {  439, 4402},{  482, 4672},{  533, 4925},{  579, 5165},
-        {  626, 5382},{  675, 5616},{  725, 5812},{  769, 5991},
-        {  810, 6242},{  848, 6430},{  868, 6615},{  944, 6732}
-      },
-      /*Cr  qi=16  INTER*/
-      {
-        {   78,   11},{   62,  327},{   49,  650},{   50, 1025},
-        {   59, 1431},{   72, 1841},{   83, 2253},{   90, 2671},
-        {   95, 3084},{   98, 3487},{  100, 3879},{  101, 4254},
-        {  102, 4625},{  103, 4994},{  106, 5355},{  108, 5708},
-        {  111, 6058},{  115, 6400},{  121, 6733},{  128, 7058},
-        {  134, 7374},{  140, 7691},{  146, 7993},{  146, 8317}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=17  INTRA*/
-      {
-        {  112,  -59},{  210, 1515},{  409, 2850},{  640, 3882},
-        {  844, 4748},{ 1038, 5529},{ 1240, 6206},{ 1452, 6803},
-        { 1676, 7330},{ 1925, 7792},{ 2194, 8201},{ 2483, 8512},
-        { 2766, 8801},{ 3027, 9121},{ 3279, 9482},{ 3548, 9810},
-        { 3825,10069},{ 4088,10345},{ 4362,10544},{ 4638,10644},
-        { 4915,10744},{ 5196,10850},{ 5471,10981},{ 5802,11136}
-      },
-      /*Y'  qi=17  INTER*/
-      {
-        {   70, -147},{   45, 1349},{  106, 2894},{  155, 4425},
-        {  195, 5818},{  225, 7099},{  247, 8348},{  278, 9565},
-        {  328,10717},{  399,11794},{  491,12807},{  609,13760},
-        {  766,14623},{  984,15349},{ 1274,15902},{ 1642,16256},
-        { 2082,16411},{ 2563,16409},{ 3048,16315},{ 3508,16194},
-        { 3924,16064},{ 4306,15938},{ 4656,15828},{ 4966,15733}
-      }
-    },
-    {
-      /*Cb  qi=17  INTRA*/
-      {
-        {    3,    4},{   57,  367},{  101,  742},{  126, 1182},
-        {  148, 1650},{  175, 2118},{  207, 2565},{  241, 2966},
-        {  279, 3307},{  331, 3588},{  389, 3845},{  435, 4132},
-        {  474, 4408},{  517, 4641},{  560, 4869},{  602, 5122},
-        {  638, 5389},{  672, 5610},{  716, 5787},{  758, 6002},
-        {  817, 6226},{  869, 6393},{  916, 6530},{  950, 6799}
-      },
-      /*Cb  qi=17  INTER*/
-      {
-        {  105,  -65},{   86,  288},{   66,  638},{   54, 1014},
-        {   59, 1427},{   71, 1844},{   86, 2257},{   95, 2668},
-        {  100, 3075},{  103, 3476},{  106, 3867},{  110, 4241},
-        {  112, 4598},{  114, 4948},{  117, 5294},{  121, 5633},
-        {  123, 5968},{  126, 6301},{  131, 6637},{  136, 6968},
-        {  144, 7287},{  152, 7606},{  158, 7931},{  162, 8262}
-      }
-    },
-    {
-      /*Cr  qi=17  INTRA*/
-      {
-        {    4,    6},{   55,  376},{   97,  765},{  128, 1226},
-        {  152, 1696},{  175, 2144},{  204, 2568},{  241, 2928},
-        {  282, 3250},{  323, 3530},{  368, 3811},{  420, 4089},
-        {  463, 4347},{  505, 4609},{  562, 4860},{  609, 5094},
-        {  655, 5303},{  709, 5535},{  759, 5740},{  803, 5913},
-        {  844, 6153},{  879, 6350},{  905, 6527},{  972, 6637}
-      },
-      /*Cr  qi=17  INTER*/
-      {
-        {   88,    8},{   68,  330},{   51,  653},{   54, 1028},
-        {   65, 1433},{   77, 1845},{   89, 2257},{   96, 2669},
-        {  100, 3081},{  102, 3481},{  105, 3867},{  106, 4245},
-        {  108, 4613},{  110, 4971},{  112, 5328},{  115, 5679},
-        {  120, 6019},{  127, 6355},{  133, 6686},{  140, 7007},
-        {  149, 7316},{  158, 7618},{  166, 7924},{  170, 8232}
+        {    5,  408},{    3, 1197},{    7, 1275},{   16, 1695},
+        {   22, 1979},{   30, 2324},{   38, 2691},{   47, 3071},
+        {   53, 3462},{   59, 3857},{   64, 4255},{   69, 4612},
+        {   74, 4975},{   76, 5347},{   81, 5694},{   86, 6020},
+        {   91, 6357},{   96, 6687},{  102, 7020},{  108, 7351},
+        {  115, 7663},{  122, 7979},{  125, 8298},{  136, 8576}
       }
     }
   },
@@ -1174,557 +155,61 @@ oc_mode_rd OC_MODE_RD[64][3][2][OC_SAD_BINS]={
     {
       /*Y'  qi=18  INTRA*/
       {
-        {  122,  -58},{  216, 1506},{  425, 2815},{  665, 3822},
-        {  882, 4666},{ 1088, 5425},{ 1301, 6084},{ 1529, 6653},
-        { 1766, 7162},{ 2026, 7611},{ 2312, 7987},{ 2612, 8278},
-        { 2913, 8551},{ 3196, 8840},{ 3454, 9184},{ 3734, 9490},
-        { 4030, 9725},{ 4305, 9973},{ 4585,10162},{ 4864,10251},
-        { 5150,10324},{ 5443,10420},{ 5727,10536},{ 6053,10682}
+        {   83,  534},{  261, 1697},{  507, 2691},{  852, 3418},
+        { 1127, 4094},{ 1378, 4775},{ 1626, 5442},{ 1905, 5975},
+        { 2164, 6468},{ 2445, 6913},{ 2704, 7301},{ 3001, 7631},
+        { 3285, 7934},{ 3536, 8217},{ 3837, 8489},{ 4076, 8814},
+        { 4325, 9046},{ 4590, 9313},{ 4794, 9546},{ 5062, 9751},
+        { 5285, 9963},{ 5578,10079},{ 5777,10302},{ 6054,10296}
       },
       /*Y'  qi=18  INTER*/
       {
-        {   66, -143},{   47, 1351},{  108, 2886},{  158, 4401},
-        {  200, 5775},{  232, 7044},{  256, 8288},{  292, 9493},
-        {  351,10625},{  434,11679},{  541,12665},{  681,13578},
-        {  875,14379},{ 1136,15025},{ 1483,15475},{ 1914,15709},
-        { 2399,15767},{ 2907,15699},{ 3400,15579},{ 3852,15453},
-        { 4259,15332},{ 4630,15221},{ 4976,15121},{ 5294,15061}
+        {   33,  490},{   62, 1599},{   96, 3015},{  164, 4378},
+        {  225, 5633},{  285, 6831},{  351, 7999},{  427, 9133},
+        {  526,10181},{  652,11141},{  829,11991},{ 1049,12732},
+        { 1310,13367},{ 1592,13896},{ 1881,14350},{ 2207,14667},
+        { 2529,14877},{ 2873,14980},{ 3231,14949},{ 3571,14926},
+        { 3922,14816},{ 4246,14715},{ 4559,14579},{ 4778,14590}
       }
     },
     {
       /*Cb  qi=18  INTRA*/
       {
-        {    2,    3},{   61,  367},{  107,  743},{  131, 1182},
-        {  155, 1648},{  183, 2110},{  220, 2542},{  260, 2927},
-        {  303, 3265},{  359, 3540},{  416, 3785},{  462, 4063},
-        {  506, 4334},{  553, 4567},{  595, 4797},{  636, 5049},
-        {  676, 5304},{  717, 5516},{  759, 5698},{  801, 5904},
-        {  861, 6133},{  911, 6311},{  962, 6443},{ 1021, 6645}
+        {   55,  825},{   95, 1021},{  131, 1276},{  150, 1618},
+        {  180, 1958},{  220, 2306},{  256, 2608},{  322, 2939},
+        {  385, 3239},{  436, 3530},{  475, 3771},{  518, 4078},
+        {  557, 4348},{  604, 4592},{  620, 4851},{  676, 5083},
+        {  704, 5363},{  739, 5582},{  788, 5782},{  819, 6000},
+        {  893, 6158},{  940, 6418},{  984, 6499},{ 1035, 6596}
       },
       /*Cb  qi=18  INTER*/
       {
-        {  126,    5},{   95,  326},{   66,  643},{   55, 1015},
-        {   60, 1427},{   73, 1843},{   87, 2256},{   96, 2667},
-        {  101, 3073},{  104, 3470},{  108, 3853},{  111, 4226},
-        {  114, 4584},{  117, 4928},{  119, 5274},{  122, 5612},
-        {  126, 5942},{  130, 6271},{  136, 6606},{  141, 6931},
-        {  148, 7247},{  156, 7568},{  164, 7891},{  173, 8211}
+        {   -2,  642},{   12,  771},{   20, 1054},{   29, 1394},
+        {   35, 1721},{   45, 2080},{   53, 2450},{   63, 2835},
+        {   73, 3225},{   81, 3596},{   87, 3952},{   95, 4300},
+        {  102, 4634},{  109, 4959},{  115, 5283},{  120, 5608},
+        {  130, 5931},{  139, 6254},{  152, 6571},{  163, 6887},
+        {  179, 7204},{  191, 7508},{  198, 7834},{  224, 8066}
       }
     },
     {
       /*Cr  qi=18  INTRA*/
       {
-        {    4,    6},{   59,  376},{  104,  765},{  133, 1226},
-        {  156, 1692},{  184, 2136},{  218, 2548},{  260, 2893},
-        {  308, 3204},{  348, 3481},{  397, 3751},{  448, 4024},
-        {  490, 4281},{  541, 4523},{  593, 4776},{  634, 5022},
-        {  685, 5236},{  748, 5455},{  812, 5638},{  856, 5818},
-        {  891, 6048},{  928, 6230},{  961, 6405},{ 1055, 6449}
+        {   49,  780},{   86,  986},{  120, 1261},{  137, 1588},
+        {  183, 1998},{  228, 2339},{  291, 2670},{  334, 2938},
+        {  376, 3239},{  412, 3522},{  459, 3783},{  490, 4113},
+        {  547, 4321},{  593, 4571},{  640, 4828},{  675, 5137},
+        {  730, 5254},{  774, 5524},{  821, 5754},{  859, 5911},
+        {  887, 6178},{  982, 6266},{  941, 6536},{  996, 6630}
       },
       /*Cr  qi=18  INTER*/
       {
-        {   81,   34},{   68,  342},{   57,  652},{   59, 1027},
-        {   67, 1439},{   80, 1848},{   91, 2257},{   97, 2670},
-        {  100, 3076},{  103, 3473},{  106, 3857},{  108, 4231},
-        {  109, 4599},{  110, 4958},{  113, 5307},{  119, 5650},
-        {  125, 5991},{  130, 6325},{  138, 6651},{  147, 6971},
-        {  153, 7278},{  162, 7578},{  172, 7874},{  177, 8156}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=19  INTRA*/
-      {
-        {  128,  -55},{  228, 1495},{  448, 2775},{  699, 3758},
-        {  931, 4571},{ 1154, 5296},{ 1386, 5914},{ 1636, 6450},
-        { 1894, 6930},{ 2177, 7342},{ 2479, 7698},{ 2792, 7976},
-        { 3099, 8235},{ 3392, 8517},{ 3658, 8853},{ 3938, 9155},
-        { 4242, 9371},{ 4527, 9605},{ 4810, 9781},{ 5089, 9853},
-        { 5378, 9920},{ 5674,10009},{ 5972,10110},{ 6336,10196}
-      },
-      /*Y'  qi=19  INTER*/
-      {
-        {   69, -147},{   49, 1353},{  111, 2883},{  162, 4381},
-        {  205, 5737},{  237, 6996},{  264, 8232},{  307, 9421},
-        {  376,10534},{  472,11567},{  596,12525},{  761,13395},
-        {  990,14130},{ 1298,14694},{ 1695,15053},{ 2172,15195},
-        { 2696,15173},{ 3213,15075},{ 3696,14948},{ 4141,14829},
-        { 4541,14721},{ 4910,14609},{ 5245,14506},{ 5536,14399}
-      }
-    },
-    {
-      /*Cb  qi=19  INTRA*/
-      {
-        {    3,    3},{   61,  367},{  109,  743},{  135, 1182},
-        {  161, 1646},{  191, 2101},{  229, 2524},{  273, 2898},
-        {  318, 3221},{  376, 3490},{  436, 3731},{  487, 3994},
-        {  539, 4251},{  584, 4485},{  621, 4721},{  664, 4967},
-        {  709, 5225},{  752, 5431},{  801, 5595},{  846, 5796},
-        {  912, 6011},{  959, 6193},{ 1015, 6321},{ 1121, 6504}
-      },
-      /*Cb  qi=19  INTER*/
-      {
-        {  126,    4},{   97,  329},{   69,  649},{   56, 1017},
-        {   61, 1432},{   74, 1846},{   88, 2255},{   98, 2663},
-        {  103, 3065},{  106, 3460},{  110, 3844},{  114, 4211},
-        {  117, 4564},{  120, 4911},{  122, 5253},{  125, 5588},
-        {  129, 5916},{  135, 6241},{  142, 6567},{  149, 6885},
-        {  155, 7206},{  163, 7527},{  174, 7843},{  188, 8145}
-      }
-    },
-    {
-      /*Cr  qi=19  INTRA*/
-      {
-        {    5,    6},{   61,  376},{  106,  765},{  135, 1225},
-        {  160, 1689},{  192, 2126},{  229, 2531},{  271, 2869},
-        {  321, 3168},{  370, 3433},{  421, 3704},{  476, 3965},
-        {  520, 4212},{  572, 4452},{  629, 4691},{  671, 4939},
-        {  724, 5152},{  792, 5347},{  858, 5510},{  895, 5696},
-        {  939, 5905},{  991, 6056},{ 1027, 6244},{ 1127, 6333}
-      },
-      /*Cr  qi=19  INTER*/
-      {
-        {   80,   45},{   66,  344},{   55,  654},{   56, 1030},
-        {   66, 1440},{   80, 1850},{   91, 2259},{   98, 2668},
-        {  102, 3072},{  104, 3466},{  107, 3845},{  109, 4215},
-        {  110, 4578},{  112, 4933},{  116, 5283},{  122, 5625},
-        {  129, 5963},{  136, 6287},{  143, 6611},{  151, 6927},
-        {  160, 7229},{  170, 7528},{  181, 7818},{  191, 8092}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=20  INTRA*/
-      {
-        {  129,  -50},{  238, 1481},{  469, 2728},{  730, 3684},
-        {  974, 4473},{ 1213, 5171},{ 1463, 5763},{ 1729, 6281},
-        { 2002, 6744},{ 2299, 7146},{ 2613, 7492},{ 2940, 7746},
-        { 3265, 7978},{ 3571, 8228},{ 3853, 8543},{ 4156, 8815},
-        { 4476, 9001},{ 4775, 9218},{ 5070, 9373},{ 5352, 9446},
-        { 5649, 9510},{ 5956, 9580},{ 6268, 9660},{ 6647, 9705}
-      },
-      /*Y'  qi=20  INTER*/
-      {
-        {   64,  -93},{   52, 1340},{  116, 2862},{  170, 4344},
-        {  216, 5678},{  249, 6928},{  281, 8155},{  333, 9326},
-        {  418,10410},{  533,11411},{  683,12329},{  890,13127},
-        { 1183,13750},{ 1579,14162},{ 2066,14357},{ 2611,14370},
-        { 3159,14284},{ 3675,14167},{ 4142,14053},{ 4568,13953},
-        { 4961,13852},{ 5320,13755},{ 5649,13675},{ 5933,13610}
-      }
-    },
-    {
-      /*Cb  qi=20  INTRA*/
-      {
-        {    3,    3},{   62,  367},{  112,  743},{  140, 1183},
-        {  165, 1646},{  196, 2099},{  235, 2517},{  284, 2883},
-        {  334, 3198},{  393, 3460},{  457, 3690},{  509, 3945},
-        {  560, 4198},{  605, 4435},{  647, 4658},{  699, 4888},
-        {  742, 5155},{  788, 5350},{  835, 5517},{  880, 5730},
-        {  956, 5914},{ 1007, 6060},{ 1053, 6199},{ 1158, 6358}
-      },
-      /*Cb  qi=20  INTER*/
-      {
-        {  128,   -6},{   96,  322},{   66,  653},{   54, 1025},
-        {   63, 1431},{   79, 1844},{   91, 2256},{   99, 2665},
-        {  104, 3065},{  107, 3455},{  111, 3831},{  115, 4189},
-        {  120, 4539},{  123, 4885},{  126, 5219},{  130, 5548},
-        {  135, 5876},{  141, 6199},{  149, 6519},{  156, 6837},
-        {  166, 7153},{  179, 7468},{  189, 7784},{  194, 8102}
-      }
-    },
-    {
-      /*Cr  qi=20  INTRA*/
-      {
-        {    4,    6},{   63,  376},{  109,  765},{  139, 1225},
-        {  165, 1689},{  199, 2124},{  239, 2523},{  285, 2852},
-        {  340, 3140},{  388, 3398},{  438, 3662},{  499, 3914},
-        {  547, 4155},{  596, 4392},{  652, 4634},{  699, 4877},
-        {  759, 5074},{  824, 5257},{  883, 5428},{  936, 5589},
-        {  986, 5790},{ 1030, 5960},{ 1074, 6119},{ 1172, 6191}
-      },
-      /*Cr  qi=20  INTER*/
-      {
-        {   92,   40},{   70,  345},{   55,  658},{   57, 1034},
-        {   69, 1441},{   84, 1852},{   94, 2261},{   98, 2669},
-        {  102, 3074},{  105, 3465},{  107, 3841},{  110, 4206},
-        {  112, 4562},{  116, 4915},{  121, 5260},{  127, 5591},
-        {  134, 5920},{  142, 6246},{  153, 6562},{  163, 6870},
-        {  173, 7170},{  186, 7463},{  198, 7746},{  199, 8030}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=21  INTRA*/
-      {
-        {  130,  -51},{  244, 1476},{  483, 2705},{  756, 3635},
-        { 1013, 4396},{ 1266, 5070},{ 1530, 5647},{ 1806, 6153},
-        { 2093, 6600},{ 2411, 6976},{ 2739, 7299},{ 3079, 7534},
-        { 3422, 7744},{ 3738, 7987},{ 4032, 8274},{ 4348, 8533},
-        { 4675, 8721},{ 4989, 8909},{ 5291, 9051},{ 5577, 9111},
-        { 5879, 9163},{ 6190, 9228},{ 6506, 9286},{ 6899, 9295}
-      },
-      /*Y'  qi=21  INTER*/
-      {
-        {   64,  -56},{   55, 1341},{  119, 2859},{  174, 4324},
-        {  223, 5640},{  258, 6880},{  295, 8096},{  359, 9246},
-        {  460,10302},{  595,11268},{  778,12131},{ 1032,12857},
-        { 1387,13385},{ 1850,13683},{ 2399,13774},{ 2976,13729},
-        { 3527,13619},{ 4034,13504},{ 4492,13401},{ 4912,13291},
-        { 5298,13209},{ 5648,13137},{ 5974,13046},{ 6308,12977}
-      }
-    },
-    {
-      /*Cb  qi=21  INTRA*/
-      {
-        {    4,    3},{   64,  367},{  114,  743},{  141, 1183},
-        {  166, 1645},{  201, 2092},{  247, 2502},{  299, 2856},
-        {  352, 3158},{  413, 3412},{  480, 3642},{  536, 3893},
-        {  588, 4137},{  637, 4367},{  678, 4598},{  725, 4834},
-        {  774, 5083},{  827, 5269},{  883, 5420},{  930, 5633},
-        {  999, 5829},{ 1057, 5959},{ 1113, 6082},{ 1200, 6265}
-      },
-      /*Cb  qi=21  INTER*/
-      {
-        {  109,   -8},{   84,  321},{   62,  654},{   54, 1028},
-        {   64, 1434},{   80, 1847},{   92, 2259},{  100, 2664},
-        {  105, 3060},{  109, 3445},{  114, 3815},{  118, 4172},
-        {  122, 4519},{  126, 4861},{  128, 5194},{  133, 5520},
-        {  139, 5847},{  146, 6169},{  155, 6487},{  166, 6801},
-        {  177, 7114},{  189, 7423},{  201, 7729},{  208, 8035}
-      }
-    },
-    {
-      /*Cr  qi=21  INTRA*/
-      {
-        {    4,    6},{   64,  377},{  111,  766},{  144, 1225},
-        {  174, 1683},{  206, 2114},{  248, 2506},{  302, 2824},
-        {  357, 3099},{  404, 3357},{  455, 3622},{  519, 3867},
-        {  573, 4098},{  625, 4331},{  683, 4571},{  733, 4802},
-        {  793, 4994},{  863, 5173},{  926, 5337},{  978, 5492},
-        { 1030, 5685},{ 1079, 5856},{ 1126, 6027},{ 1217, 6159}
-      },
-      /*Cr  qi=21  INTER*/
-      {
-        {   82,   29},{   67,  341},{   55,  660},{   58, 1038},
-        {   71, 1443},{   85, 1851},{   95, 2258},{   99, 2666},
-        {  103, 3069},{  107, 3456},{  110, 3826},{  112, 4188},
-        {  114, 4544},{  118, 4891},{  124, 5231},{  132, 5567},
-        {  139, 5894},{  148, 6210},{  159, 6520},{  171, 6822},
-        {  185, 7111},{  196, 7403},{  209, 7691},{  225, 7945}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=22  INTRA*/
-      {
-        {  128,  -45},{  254, 1463},{  507, 2662},{  794, 3562},
-        { 1070, 4292},{ 1340, 4941},{ 1622, 5492},{ 1920, 5968},
-        { 2229, 6387},{ 2565, 6742},{ 2911, 7047},{ 3263, 7264},
-        { 3615, 7464},{ 3944, 7689},{ 4258, 7950},{ 4591, 8183},
-        { 4934, 8347},{ 5259, 8517},{ 5573, 8634},{ 5870, 8683},
-        { 6186, 8723},{ 6508, 8762},{ 6831, 8801},{ 7232, 8830}
-      },
-      /*Y'  qi=22  INTER*/
-      {
-        {   77,  -48},{   57, 1343},{  122, 2853},{  180, 4299},
-        {  231, 5597},{  269, 6826},{  314, 8025},{  393, 9150},
-        {  512,10179},{  673,11103},{  894,11908},{ 1207,12542},
-        { 1635,12956},{ 2166,13148},{ 2755,13167},{ 3345,13088},
-        { 3895,12966},{ 4386,12848},{ 4832,12746},{ 5252,12647},
-        { 5634,12563},{ 5978,12497},{ 6299,12412},{ 6633,12338}
-      }
-    },
-    {
-      /*Cb  qi=22  INTRA*/
-      {
-        {    4,    3},{   66,  367},{  122,  744},{  153, 1182},
-        {  177, 1640},{  213, 2080},{  263, 2475},{  323, 2811},
-        {  382, 3103},{  451, 3346},{  522, 3568},{  581, 3814},
-        {  633, 4054},{  674, 4288},{  719, 4523},{  768, 4756},
-        {  823, 4979},{  883, 5162},{  937, 5325},{  996, 5510},
-        { 1070, 5687},{ 1129, 5807},{ 1193, 5929},{ 1311, 6099}
-      },
-      /*Cb  qi=22  INTER*/
-      {
-        {  107,   -5},{   83,  322},{   61,  653},{   55, 1030},
-        {   66, 1436},{   81, 1845},{   94, 2253},{  102, 2656},
-        {  107, 3050},{  111, 3435},{  115, 3804},{  119, 4158},
-        {  124, 4501},{  128, 4835},{  132, 5164},{  138, 5490},
-        {  146, 5812},{  154, 6128},{  163, 6442},{  174, 6754},
-        {  188, 7060},{  205, 7361},{  219, 7662},{  233, 7953}
-      }
-    },
-    {
-      /*Cr  qi=22  INTRA*/
-      {
-        {    4,    6},{   67,  378},{  118,  767},{  151, 1222},
-        {  182, 1675},{  221, 2097},{  269, 2476},{  329, 2774},
-        {  389, 3039},{  444, 3292},{  500, 3545},{  560, 3788},
-        {  615, 4020},{  671, 4251},{  734, 4484},{  781, 4712},
-        {  850, 4887},{  925, 5060},{  981, 5229},{ 1031, 5369},
-        { 1092, 5549},{ 1148, 5715},{ 1200, 5861},{ 1291, 5943}
-      },
-      /*Cr  qi=22  INTER*/
-      {
-        {   88,   34},{   69,  340},{   57,  657},{   60, 1039},
-        {   73, 1445},{   87, 1851},{   96, 2257},{  100, 2662},
-        {  103, 3058},{  107, 3442},{  111, 3812},{  115, 4172},
-        {  118, 4524},{  123, 4864},{  129, 5199},{  136, 5531},
-        {  145, 5855},{  156, 6168},{  170, 6468},{  184, 6765},
-        {  193, 7066},{  207, 7353},{  222, 7628},{  230, 7900}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=23  INTRA*/
-      {
-        {  126,  -40},{  257, 1458},{  521, 2636},{  825, 3501},
-        { 1111, 4207},{ 1391, 4842},{ 1684, 5385},{ 1992, 5858},
-        { 2311, 6277},{ 2653, 6626},{ 3005, 6929},{ 3366, 7134},
-        { 3729, 7311},{ 4071, 7526},{ 4396, 7770},{ 4734, 7986},
-        { 5086, 8131},{ 5421, 8286},{ 5735, 8404},{ 6033, 8456},
-        { 6357, 8486},{ 6682, 8525},{ 7003, 8573},{ 7387, 8604}
-      },
-      /*Y'  qi=23  INTER*/
-      {
-        {   64,  -57},{   60, 1345},{  124, 2853},{  185, 4284},
-        {  239, 5565},{  282, 6783},{  336, 7967},{  429, 9069},
-        {  568,10063},{  758,10943},{ 1028,11679},{ 1407,12216},
-        { 1909,12520},{ 2502,12616},{ 3126,12573},{ 3722,12461},
-        { 4258,12344},{ 4742,12236},{ 5185,12136},{ 5590,12052},
-        { 5970,11980},{ 6315,11901},{ 6631,11826},{ 6954,11769}
-      }
-    },
-    {
-      /*Cb  qi=23  INTRA*/
-      {
-        {    3,    3},{   70,  367},{  124,  744},{  151, 1182},
-        {  181, 1637},{  222, 2071},{  276, 2460},{  343, 2785},
-        {  403, 3072},{  468, 3317},{  542, 3534},{  605, 3773},
-        {  659, 4009},{  703, 4243},{  747, 4479},{  795, 4707},
-        {  852, 4923},{  908, 5105},{  972, 5254},{ 1043, 5423},
-        { 1118, 5594},{ 1172, 5731},{ 1240, 5853},{ 1365, 6005}
-      },
-      /*Cb  qi=23  INTER*/
-      {
-        {  109,  -10},{   87,  325},{   63,  650},{   57, 1031},
-        {   67, 1439},{   83, 1847},{   96, 2253},{  103, 2652},
-        {  109, 3041},{  114, 3421},{  117, 3789},{  122, 4141},
-        {  128, 4480},{  134, 4811},{  139, 5138},{  144, 5463},
-        {  152, 5781},{  161, 6096},{  174, 6404},{  185, 6714},
-        {  198, 7023},{  216, 7320},{  233, 7621},{  245, 7935}
-      }
-    },
-    {
-      /*Cr  qi=23  INTRA*/
-      {
-        {    5,    6},{   70,  379},{  122,  768},{  155, 1222},
-        {  187, 1671},{  231, 2088},{  283, 2459},{  346, 2750},
-        {  411, 3009},{  465, 3261},{  523, 3509},{  585, 3746},
-        {  639, 3980},{  695, 4219},{  754, 4449},{  803, 4671},
-        {  873, 4840},{  953, 5001},{ 1015, 5156},{ 1071, 5286},
-        { 1137, 5464},{ 1191, 5629},{ 1249, 5782},{ 1359, 5885}
-      },
-      /*Cr  qi=23  INTER*/
-      {
-        {   84,   29},{   69,  343},{   58,  660},{   62, 1041},
-        {   75, 1448},{   88, 1853},{   97, 2258},{  102, 2659},
-        {  105, 3050},{  108, 3430},{  113, 3799},{  116, 4155},
-        {  121, 4505},{  126, 4845},{  132, 5176},{  142, 5504},
-        {  153, 5826},{  165, 6133},{  180, 6432},{  197, 6722},
-        {  212, 7005},{  226, 7287},{  244, 7555},{  258, 7828}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=24  INTRA*/
-      {
-        {  125,  -34},{  268, 1444},{  547, 2590},{  866, 3422},
-        { 1172, 4098},{ 1476, 4702},{ 1790, 5222},{ 2117, 5678},
-        { 2453, 6080},{ 2811, 6418},{ 3178, 6700},{ 3552, 6895},
-        { 3928, 7055},{ 4286, 7243},{ 4627, 7477},{ 4981, 7674},
-        { 5344, 7802},{ 5683, 7944},{ 6009, 8043},{ 6313, 8082},
-        { 6633, 8111},{ 6959, 8151},{ 7280, 8197},{ 7660, 8221}
-      },
-      /*Y'  qi=24  INTER*/
-      {
-        {   62,  -63},{   68, 1345},{  134, 2840},{  199, 4245},
-        {  256, 5508},{  304, 6715},{  371, 7880},{  484, 8950},
-        {  652, 9899},{  892,10709},{ 1238,11334},{ 1722,11722},
-        { 2326,11875},{ 2983,11864},{ 3616,11783},{ 4189,11678},
-        { 4707,11570},{ 5178,11476},{ 5617,11395},{ 6017,11319},
-        { 6380,11252},{ 6720,11185},{ 7044,11126},{ 7377,11118}
-      }
-    },
-    {
-      /*Cb  qi=24  INTRA*/
-      {
-        {    4,    3},{   75,  367},{  132,  745},{  159, 1182},
-        {  187, 1634},{  230, 2061},{  289, 2439},{  361, 2753},
-        {  425, 3034},{  492, 3278},{  566, 3490},{  630, 3720},
-        {  686, 3956},{  732, 4190},{  777, 4420},{  829, 4637},
-        {  894, 4840},{  958, 5012},{ 1023, 5155},{ 1090, 5326},
-        { 1165, 5502},{ 1226, 5622},{ 1299, 5717},{ 1408, 5887}
-      },
-      /*Cb  qi=24  INTER*/
-      {
-        {  110,   35},{   92,  337},{   70,  651},{   63, 1033},
-        {   74, 1440},{   91, 1846},{  102, 2248},{  109, 2644},
-        {  114, 3031},{  120, 3404},{  127, 3762},{  133, 4109},
-        {  138, 4445},{  144, 4772},{  151, 5094},{  159, 5411},
-        {  168, 5728},{  180, 6037},{  195, 6338},{  210, 6640},
-        {  227, 6944},{  249, 7236},{  272, 7528},{  299, 7809}
-      }
-    },
-    {
-      /*Cr  qi=24  INTRA*/
-      {
-        {    5,    6},{   72,  380},{  124,  770},{  158, 1222},
-        {  195, 1668},{  240, 2079},{  297, 2438},{  367, 2715},
-        {  433, 2966},{  488, 3218},{  549, 3467},{  609, 3701},
-        {  664, 3935},{  728, 4165},{  792, 4379},{  845, 4586},
-        {  917, 4744},{  995, 4898},{ 1063, 5049},{ 1120, 5187},
-        { 1190, 5359},{ 1249, 5522},{ 1304, 5672},{ 1397, 5806}
-      },
-      /*Cr  qi=24  INTER*/
-      {
-        {   91,   56},{   73,  353},{   61,  664},{   66, 1045},
-        {   80, 1449},{   95, 1851},{  103, 2250},{  107, 2648},
-        {  111, 3038},{  116, 3413},{  120, 3774},{  124, 4128},
-        {  130, 4471},{  138, 4802},{  145, 5130},{  156, 5453},
-        {  171, 5764},{  187, 6061},{  204, 6355},{  220, 6643},
-        {  238, 6923},{  254, 7204},{  275, 7475},{  289, 7752}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=25  INTRA*/
-      {
-        {  125,  -28},{  285, 1426},{  582, 2540},{  917, 3351},
-        { 1244, 3997},{ 1569, 4570},{ 1903, 5071},{ 2258, 5498},
-        { 2626, 5866},{ 3002, 6182},{ 3382, 6448},{ 3770, 6623},
-        { 4162, 6760},{ 4528, 6934},{ 4882, 7144},{ 5249, 7328},
-        { 5610, 7453},{ 5958, 7578},{ 6291, 7672},{ 6597, 7708},
-        { 6928, 7715},{ 7258, 7737},{ 7575, 7781},{ 7950, 7829}
-      },
-      /*Y'  qi=25  INTER*/
-      {
-        {   64,  -16},{   72, 1348},{  139, 2832},{  206, 4218},
-        {  268, 5465},{  322, 6659},{  403, 7803},{  540, 8838},
-        {  747, 9734},{ 1044,10465},{ 1473,10981},{ 2048,11249},
-        { 2717,11311},{ 3397,11257},{ 4025,11161},{ 4589,11052},
-        { 5099,10947},{ 5560,10859},{ 5989,10786},{ 6389,10717},
-        { 6753,10652},{ 7078,10592},{ 7389,10535},{ 7697,10460}
-      }
-    },
-    {
-      /*Cb  qi=25  INTRA*/
-      {
-        {    3,    3},{   78,  368},{  133,  745},{  159, 1180},
-        {  193, 1627},{  242, 2046},{  304, 2411},{  381, 2714},
-        {  456, 2983},{  527, 3224},{  598, 3437},{  667, 3655},
-        {  726, 3888},{  776, 4117},{  826, 4333},{  883, 4543},
-        {  954, 4727},{ 1019, 4878},{ 1095, 5014},{ 1171, 5187},
-        { 1255, 5342},{ 1319, 5458},{ 1396, 5546},{ 1536, 5678}
-      },
-      /*Cb  qi=25  INTER*/
-      {
-        {  117,   32},{   89,  342},{   67,  660},{   64, 1037},
-        {   77, 1441},{   93, 1845},{  105, 2243},{  113, 2633},
-        {  120, 3016},{  125, 3387},{  131, 3739},{  137, 4080},
-        {  144, 4416},{  152, 4741},{  160, 5057},{  169, 5369},
-        {  180, 5680},{  193, 5990},{  209, 6294},{  227, 6594},
-        {  249, 6888},{  269, 7180},{  294, 7467},{  317, 7768}
-      }
-    },
-    {
-      /*Cr  qi=25  INTRA*/
-      {
-        {    6,    6},{   74,  380},{  129,  770},{  165, 1220},
-        {  201, 1658},{  253, 2061},{  315, 2410},{  388, 2676},
-        {  462, 2920},{  523, 3166},{  584, 3404},{  647, 3637},
-        {  701, 3870},{  769, 4086},{  838, 4296},{  898, 4491},
-        {  980, 4627},{ 1065, 4759},{ 1126, 4920},{ 1187, 5058},
-        { 1283, 5180},{ 1347, 5332},{ 1404, 5475},{ 1527, 5534}
-      },
-      /*Cr  qi=25  INTER*/
-      {
-        {   92,   41},{   75,  347},{   64,  664},{   70, 1045},
-        {   85, 1448},{   98, 1849},{  105, 2245},{  110, 2637},
-        {  115, 3023},{  120, 3395},{  126, 3753},{  131, 4102},
-        {  136, 4439},{  145, 4768},{  156, 5094},{  168, 5410},
-        {  184, 5717},{  203, 6010},{  221, 6300},{  239, 6577},
-        {  262, 6847},{  282, 7123},{  303, 7390},{  322, 7665}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=26  INTRA*/
-      {
-        {  130,  -24},{  292, 1423},{  594, 2525},{  943, 3307},
-        { 1289, 3921},{ 1633, 4467},{ 1991, 4943},{ 2368, 5348},
-        { 2753, 5696},{ 3148, 5991},{ 3545, 6247},{ 3942, 6415},
-        { 4342, 6535},{ 4726, 6690},{ 5093, 6883},{ 5466, 7047},
-        { 5840, 7159},{ 6202, 7274},{ 6545, 7351},{ 6855, 7375},
-        { 7186, 7384},{ 7517, 7416},{ 7840, 7447},{ 8238, 7450}
-      },
-      /*Y'  qi=26  INTER*/
-      {
-        {   52,   16},{   75, 1336},{  143, 2815},{  213, 4191},
-        {  278, 5427},{  339, 6611},{  436, 7734},{  600, 8732},
-        {  843, 9579},{ 1195,10243},{ 1702,10660},{ 2355,10825},
-        { 3070,10820},{ 3755,10743},{ 4372,10643},{ 4925,10538},
-        { 5426,10440},{ 5882,10354},{ 6296,10290},{ 6686,10224},
-        { 7049,10163},{ 7380,10113},{ 7672,10062},{ 7937,10021}
-      }
-    },
-    {
-      /*Cb  qi=26  INTRA*/
-      {
-        {    4,    3},{   79,  368},{  138,  745},{  167, 1180},
-        {  200, 1623},{  252, 2034},{  322, 2389},{  403, 2682},
-        {  480, 2941},{  558, 3176},{  631, 3393},{  700, 3608},
-        {  766, 3825},{  819, 4046},{  868, 4265},{  926, 4472},
-        { 1002, 4645},{ 1070, 4800},{ 1151, 4924},{ 1242, 5063},
-        { 1325, 5221},{ 1393, 5338},{ 1464, 5431},{ 1595, 5559}
-      },
-      /*Cb  qi=26  INTER*/
-      {
-        {   98,   33},{   83,  343},{   65,  662},{   65, 1037},
-        {   80, 1437},{   96, 1839},{  107, 2238},{  115, 2628},
-        {  122, 3007},{  128, 3373},{  134, 3722},{  142, 4060},
-        {  149, 4390},{  158, 4713},{  167, 5029},{  178, 5341},
-        {  191, 5647},{  208, 5948},{  227, 6244},{  247, 6539},
-        {  269, 6833},{  295, 7114},{  328, 7388},{  369, 7658}
-      }
-    },
-    {
-      /*Cr  qi=26  INTRA*/
-      {
-        {    5,    6},{   75,  380},{  133,  769},{  172, 1217},
-        {  212, 1652},{  266, 2048},{  333, 2384},{  412, 2643},
-        {  490, 2880},{  552, 3124},{  616, 3365},{  681, 3594},
-        {  739, 3816},{  810, 4024},{  880, 4224},{  945, 4405},
-        { 1029, 4538},{ 1114, 4674},{ 1183, 4822},{ 1254, 4946},
-        { 1346, 5063},{ 1417, 5201},{ 1478, 5345},{ 1597, 5411}
-      },
-      /*Cr  qi=26  INTER*/
-      {
-        {   97,   29},{   75,  342},{   62,  667},{   70, 1047},
-        {   87, 1447},{  100, 1846},{  107, 2242},{  113, 2633},
-        {  118, 3016},{  123, 3382},{  128, 3737},{  135, 4082},
-        {  142, 4417},{  151, 4746},{  162, 5066},{  176, 5377},
-        {  194, 5679},{  217, 5963},{  239, 6244},{  260, 6522},
-        {  284, 6789},{  309, 7052},{  335, 7313},{  355, 7582}
+        {    0,  741},{    9,  743},{   16, 1034},{   26, 1385},
+        {   39, 1741},{   48, 2090},{   56, 2459},{   64, 2850},
+        {   72, 3242},{   81, 3622},{   89, 3980},{   98, 4323},
+        {  104, 4667},{  110, 5005},{  118, 5337},{  126, 5675},
+        {  137, 5998},{  146, 6311},{  156, 6621},{  170, 6914},
+        {  181, 7205},{  196, 7490},{  203, 7779},{  232, 8012}
       }
     }
   },
@@ -1732,557 +217,61 @@ oc_mode_rd OC_MODE_RD[64][3][2][OC_SAD_BINS]={
     {
       /*Y'  qi=27  INTRA*/
       {
-        {  118,  -10},{  308, 1404},{  630, 2473},{  997, 3227},
-        { 1360, 3819},{ 1719, 4354},{ 2086, 4829},{ 2470, 5233},
-        { 2863, 5576},{ 3267, 5870},{ 3677, 6117},{ 4085, 6268},
-        { 4499, 6376},{ 4888, 6521},{ 5257, 6705},{ 5638, 6865},
-        { 6020, 6962},{ 6394, 7056},{ 6744, 7130},{ 7051, 7158},
-        { 7386, 7164},{ 7717, 7185},{ 8042, 7209},{ 8444, 7206}
+        {  121,  378},{  379, 1464},{  810, 2335},{ 1447, 2725},
+        { 1851, 3194},{ 2311, 3655},{ 2747, 4081},{ 3211, 4393},
+        { 3640, 4672},{ 4056, 4933},{ 4427, 5150},{ 4842, 5259},
+        { 5220, 5381},{ 5584, 5443},{ 5925, 5648},{ 6233, 5783},
+        { 6547, 5944},{ 6905, 6056},{ 7203, 6181},{ 7526, 6207},
+        { 7800, 6330},{ 8175, 6312},{ 8415, 6437},{ 8705, 6459}
       },
       /*Y'  qi=27  INTER*/
       {
-        {   54,   19},{   77, 1333},{  147, 2806},{  221, 4166},
-        {  290, 5390},{  360, 6564},{  474, 7665},{  664, 8630},
-        {  949, 9423},{ 1370,10002},{ 1958,10323},{ 2670,10414},
-        { 3406,10375},{ 4086,10285},{ 4691,10182},{ 5233,10085},
-        { 5724, 9994},{ 6169, 9918},{ 6582, 9863},{ 6962, 9813},
-        { 7316, 9759},{ 7645, 9707},{ 7948, 9660},{ 8262, 9623}
+        {   48,  199},{   90, 1458},{  167, 2824},{  291, 4050},
+        {  434, 5144},{  638, 6133},{  901, 7011},{ 1249, 7743},
+        { 1726, 8280},{ 2317, 8616},{ 2957, 8789},{ 3561, 8896},
+        { 4126, 8936},{ 4646, 8933},{ 5115, 8931},{ 5579, 8890},
+        { 6008, 8804},{ 6411, 8744},{ 6774, 8646},{ 7153, 8549},
+        { 7475, 8462},{ 7790, 8372},{ 8069, 8280},{ 8299, 8278}
       }
     },
     {
       /*Cb  qi=27  INTRA*/
       {
-        {    4,    3},{   79,  368},{  137,  745},{  166, 1180},
-        {  200, 1622},{  253, 2030},{  324, 2381},{  407, 2671},
-        {  487, 2925},{  567, 3156},{  640, 3372},{  712, 3580},
-        {  782, 3792},{  833, 4015},{  887, 4227},{  954, 4422},
-        { 1031, 4592},{ 1103, 4738},{ 1187, 4856},{ 1280, 4990},
-        { 1371, 5135},{ 1442, 5244},{ 1520, 5321},{ 1684, 5398}
+        {   75,  612},{  117,  751},{  160, 1068},{  195, 1406},
+        {  240, 1741},{  305, 2066},{  364, 2359},{  454, 2639},
+        {  538, 2899},{  609, 3149},{  664, 3384},{  730, 3625},
+        {  785, 3860},{  836, 4094},{  872, 4312},{  948, 4507},
+        { 1023, 4677},{ 1081, 4843},{ 1165, 4985},{ 1238, 5092},
+        { 1316, 5235},{ 1418, 5345},{ 1430, 5478},{ 1505, 5538}
       },
       /*Cb  qi=27  INTER*/
       {
-        {  113,   20},{   90,  338},{   66,  661},{   67, 1034},
-        {   82, 1438},{   97, 1842},{  108, 2238},{  115, 2624},
-        {  123, 3000},{  130, 3361},{  138, 3708},{  146, 4040},
-        {  155, 4367},{  164, 4688},{  174, 4999},{  186, 5306},
-        {  203, 5609},{  222, 5908},{  243, 6202},{  268, 6494},
-        {  295, 6781},{  326, 7058},{  367, 7319},{  420, 7551}
+        {   16,  637},{   13,  634},{   32,  869},{   46, 1230},
+        {   55, 1583},{   67, 1950},{   79, 2320},{   93, 2690},
+        {  107, 3052},{  120, 3399},{  133, 3733},{  146, 4054},
+        {  162, 4367},{  175, 4679},{  191, 4984},{  211, 5285},
+        {  232, 5581},{  252, 5875},{  276, 6155},{  305, 6433},
+        {  333, 6706},{  364, 6967},{  398, 7244},{  474, 7394}
       }
     },
     {
       /*Cr  qi=27  INTRA*/
       {
-        {    5,    6},{   75,  380},{  133,  770},{  173, 1217},
-        {  214, 1650},{  268, 2040},{  337, 2375},{  418, 2631},
-        {  496, 2862},{  558, 3104},{  625, 3346},{  692, 3571},
-        {  753, 3786},{  825, 3989},{  896, 4182},{  969, 4352},
-        { 1059, 4479},{ 1144, 4614},{ 1212, 4757},{ 1284, 4871},
-        { 1380, 4982},{ 1457, 5125},{ 1528, 5267},{ 1651, 5346}
+        {   64,  632},{  107,  763},{  147, 1054},{  176, 1411},
+        {  255, 1770},{  324, 2079},{  411, 2359},{  475, 2621},
+        {  545, 2880},{  590, 3158},{  647, 3425},{  709, 3648},
+        {  766, 3878},{  831, 4082},{  911, 4260},{  960, 4493},
+        { 1042, 4558},{ 1115, 4760},{ 1200, 4852},{ 1280, 4950},
+        { 1327, 5186},{ 1445, 5157},{ 1443, 5431},{ 1518, 5493}
       },
       /*Cr  qi=27  INTER*/
       {
-        {   92,   24},{   74,  341},{   61,  669},{   71, 1049},
-        {   88, 1448},{  100, 1849},{  107, 2243},{  113, 2631},
-        {  119, 3010},{  125, 3373},{  131, 3723},{  137, 4064},
-        {  146, 4396},{  159, 4720},{  172, 5033},{  189, 5340},
-        {  210, 5636},{  233, 5920},{  256, 6197},{  282, 6465},
-        {  310, 6730},{  332, 7000},{  359, 7259},{  385, 7515}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=28  INTRA*/
-      {
-        {  116,   -8},{  314, 1400},{  640, 2458},{ 1013, 3197},
-        { 1386, 3768},{ 1762, 4279},{ 2151, 4733},{ 2558, 5117},
-        { 2970, 5442},{ 3393, 5714},{ 3820, 5935},{ 4243, 6069},
-        { 4671, 6161},{ 5074, 6289},{ 5456, 6457},{ 5849, 6598},
-        { 6244, 6689},{ 6632, 6777},{ 6984, 6833},{ 7294, 6855},
-        { 7625, 6862},{ 7961, 6875},{ 8302, 6890},{ 8720, 6883}
-      },
-      /*Y'  qi=28  INTER*/
-      {
-        {   54,    8},{   81, 1333},{  154, 2793},{  231, 4138},
-        {  304, 5352},{  384, 6512},{  519, 7585},{  743, 8508},
-        { 1082, 9236},{ 1587, 9717},{ 2267, 9928},{ 3034, 9944},
-        { 3775, 9878},{ 4438, 9786},{ 5031, 9686},{ 5563, 9601},
-        { 6042, 9523},{ 6481, 9456},{ 6890, 9405},{ 7266, 9356},
-        { 7614, 9313},{ 7933, 9265},{ 8238, 9220},{ 8545, 9193}
-      }
-    },
-    {
-      /*Cb  qi=28  INTRA*/
-      {
-        {    3,    3},{   80,  368},{  138,  746},{  168, 1179},
-        {  208, 1615},{  268, 2014},{  345, 2354},{  432, 2637},
-        {  515, 2884},{  595, 3108},{  669, 3323},{  745, 3533},
-        {  818, 3740},{  876, 3953},{  932, 4160},{ 1003, 4349},
-        { 1088, 4501},{ 1154, 4648},{ 1241, 4768},{ 1349, 4889},
-        { 1441, 5023},{ 1524, 5113},{ 1611, 5187},{ 1783, 5283}
-      },
-      /*Cb  qi=28  INTER*/
-      {
-        {  117,   29},{   91,  341},{   65,  663},{   68, 1038},
-        {   85, 1440},{  100, 1841},{  110, 2234},{  119, 2616},
-        {  127, 2985},{  135, 3342},{  142, 3685},{  151, 4015},
-        {  162, 4337},{  174, 4652},{  186, 4960},{  201, 5264},
-        {  218, 5567},{  239, 5863},{  266, 6149},{  295, 6434},
-        {  328, 6715},{  371, 6976},{  409, 7239},{  460, 7477}
-      }
-    },
-    {
-      /*Cr  qi=28  INTRA*/
-      {
-        {    6,    7},{   79,  381},{  138,  771},{  178, 1215},
-        {  222, 1644},{  285, 2026},{  359, 2347},{  441, 2597},
-        {  521, 2827},{  588, 3066},{  655, 3303},{  725, 3523},
-        {  791, 3728},{  870, 3920},{  950, 4103},{ 1030, 4265},
-        { 1121, 4388},{ 1198, 4520},{ 1266, 4659},{ 1356, 4759},
-        { 1461, 4865},{ 1540, 4993},{ 1619, 5115},{ 1786, 5160}
-      },
-      /*Cr  qi=28  INTER*/
-      {
-        {   96,   18},{   78,  340},{   66,  672},{   74, 1051},
-        {   90, 1450},{  103, 1845},{  110, 2235},{  116, 2619},
-        {  122, 2995},{  129, 3356},{  137, 3702},{  146, 4038},
-        {  156, 4365},{  168, 4684},{  182, 4995},{  203, 5297},
-        {  227, 5588},{  253, 5866},{  282, 6131},{  311, 6394},
-        {  339, 6664},{  366, 6918},{  400, 7171},{  424, 7450}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=29  INTRA*/
-      {
-        {  112,    7},{  334, 1382},{  681, 2410},{ 1081, 3112},
-        { 1484, 3650},{ 1894, 4128},{ 2316, 4547},{ 2749, 4905},
-        { 3188, 5208},{ 3634, 5458},{ 4079, 5666},{ 4517, 5791},
-        { 4952, 5870},{ 5359, 5983},{ 5754, 6137},{ 6165, 6268},
-        { 6568, 6351},{ 6958, 6423},{ 7320, 6471},{ 7638, 6490},
-        { 7979, 6490},{ 8313, 6499},{ 8651, 6517},{ 9085, 6499}
-      },
-      /*Y'  qi=29  INTER*/
-      {
-        {   55,   15},{   85, 1336},{  160, 2780},{  242, 4104},
-        {  323, 5302},{  418, 6443},{  586, 7480},{  859, 8342},
-        { 1278, 8982},{ 1888, 9347},{ 2658, 9457},{ 3457, 9425},
-        { 4192, 9343},{ 4842, 9247},{ 5417, 9162},{ 5935, 9086},
-        { 6404, 9011},{ 6841, 8952},{ 7241, 8907},{ 7609, 8867},
-        { 7953, 8832},{ 8267, 8792},{ 8562, 8740},{ 8836, 8701}
-      }
-    },
-    {
-      /*Cb  qi=29  INTRA*/
-      {
-        {    5,    3},{   84,  368},{  144,  746},{  176, 1175},
-        {  219, 1604},{  285, 1991},{  372, 2318},{  462, 2591},
-        {  546, 2833},{  628, 3058},{  704, 3274},{  788, 3473},
-        {  870, 3664},{  935, 3865},{  995, 4059},{ 1072, 4239},
-        { 1167, 4388},{ 1248, 4518},{ 1334, 4634},{ 1429, 4765},
-        { 1536, 4884},{ 1628, 4964},{ 1716, 5038},{ 1885, 5128}
-      },
-      /*Cb  qi=29  INTER*/
-      {
-        {  126,   25},{   95,  340},{   69,  662},{   71, 1039},
-        {   88, 1440},{  102, 1839},{  113, 2227},{  122, 2604},
-        {  132, 2969},{  141, 3320},{  151, 3659},{  161, 3985},
-        {  172, 4301},{  186, 4612},{  200, 4917},{  219, 5213},
-        {  241, 5509},{  265, 5800},{  296, 6081},{  329, 6360},
-        {  369, 6633},{  414, 6899},{  465, 7148},{  520, 7387}
-      }
-    },
-    {
-      /*Cr  qi=29  INTRA*/
-      {
-        {    6,    7},{   82,  382},{  142,  772},{  185, 1211},
-        {  233, 1632},{  303, 2000},{  388, 2306},{  475, 2550},
-        {  556, 2779},{  627, 3007},{  707, 3237},{  778, 3459},
-        {  843, 3654},{  927, 3834},{ 1012, 4012},{ 1101, 4152},
-        { 1197, 4262},{ 1275, 4399},{ 1359, 4511},{ 1455, 4596},
-        { 1562, 4708},{ 1644, 4833},{ 1719, 4954},{ 1888, 4988}
-      },
-      /*Cr  qi=29  INTER*/
-      {
-        {  101,   28},{   81,  343},{   67,  673},{   75, 1053},
-        {   93, 1450},{  106, 1844},{  113, 2230},{  119, 2610},
-        {  127, 2980},{  135, 3334},{  143, 3676},{  153, 4007},
-        {  165, 4330},{  180, 4645},{  201, 4951},{  224, 5243},
-        {  253, 5522},{  284, 5794},{  314, 6060},{  345, 6322},
-        {  381, 6578},{  419, 6828},{  455, 7073},{  495, 7316}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=30  INTRA*/
-      {
-        {  112,    8},{  335, 1380},{  682, 2401},{ 1083, 3093},
-        { 1489, 3619},{ 1902, 4092},{ 2332, 4511},{ 2777, 4865},
-        { 3231, 5156},{ 3693, 5394},{ 4153, 5585},{ 4605, 5689},
-        { 5049, 5764},{ 5468, 5871},{ 5875, 6004},{ 6295, 6120},
-        { 6706, 6201},{ 7099, 6273},{ 7461, 6311},{ 7785, 6320},
-        { 8128, 6322},{ 8469, 6331},{ 8806, 6342},{ 9220, 6338}
-      },
-      /*Y'  qi=30  INTER*/
-      {
-        {   58,    8},{   90, 1340},{  169, 2771},{  257, 4079},
-        {  345, 5266},{  459, 6387},{  660, 7383},{  990, 8180},
-        { 1496, 8726},{ 2203, 8992},{ 3029, 9038},{ 3833, 8984},
-        { 4549, 8900},{ 5183, 8813},{ 5745, 8735},{ 6250, 8674},
-        { 6715, 8619},{ 7138, 8565},{ 7529, 8528},{ 7899, 8495},
-        { 8234, 8465},{ 8550, 8429},{ 8856, 8395},{ 9160, 8374}
-      }
-    },
-    {
-      /*Cb  qi=30  INTRA*/
-      {
-        {    7,    3},{   88,  369},{  149,  747},{  185, 1175},
-        {  232, 1599},{  304, 1976},{  392, 2293},{  486, 2557},
-        {  573, 2797},{  656, 3027},{  735, 3243},{  819, 3442},
-        {  903, 3629},{  966, 3828},{ 1025, 4027},{ 1105, 4204},
-        { 1201, 4343},{ 1282, 4469},{ 1379, 4575},{ 1486, 4689},
-        { 1588, 4813},{ 1678, 4900},{ 1767, 4969},{ 1911, 5080}
-      },
-      /*Cb  qi=30  INTER*/
-      {
-        {  120,   23},{   96,  336},{   72,  661},{   75, 1043},
-        {   91, 1441},{  105, 1837},{  117, 2221},{  127, 2592},
-        {  137, 2953},{  148, 3301},{  159, 3635},{  170, 3959},
-        {  184, 4271},{  199, 4578},{  216, 4879},{  238, 5175},
-        {  262, 5466},{  294, 5750},{  332, 6027},{  373, 6298},
-        {  421, 6559},{  473, 6805},{  526, 7053},{  587, 7298}
-      }
-    },
-    {
-      /*Cr  qi=30  INTRA*/
-      {
-        {   10,    7},{   89,  384},{  147,  773},{  192, 1211},
-        {  245, 1627},{  322, 1984},{  412, 2280},{  501, 2520},
-        {  583, 2750},{  654, 2982},{  736, 3207},{  810, 3419},
-        {  873, 3614},{  957, 3794},{ 1048, 3965},{ 1139, 4102},
-        { 1237, 4208},{ 1327, 4328},{ 1408, 4448},{ 1496, 4545},
-        { 1604, 4652},{ 1699, 4760},{ 1780, 4877},{ 1937, 4942}
-      },
-      /*Cr  qi=30  INTER*/
-      {
-        {  115,   26},{   89,  342},{   70,  672},{   79, 1055},
-        {   96, 1451},{  108, 1841},{  116, 2222},{  124, 2599},
-        {  132, 2965},{  141, 3316},{  151, 3655},{  163, 3984},
-        {  178, 4301},{  197, 4609},{  219, 4909},{  247, 5195},
-        {  280, 5469},{  317, 5734},{  351, 5991},{  383, 6248},
-        {  423, 6500},{  467, 6744},{  502, 6995},{  558, 7226}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=31  INTRA*/
-      {
-        {  116,   20},{  359, 1361},{  732, 2350},{ 1162, 3010},
-        { 1597, 3507},{ 2042, 3950},{ 2503, 4339},{ 2974, 4670},
-        { 3446, 4951},{ 3922, 5179},{ 4394, 5357},{ 4858, 5454},
-        { 5313, 5519},{ 5734, 5626},{ 6154, 5755},{ 6585, 5859},
-        { 7004, 5928},{ 7408, 5998},{ 7775, 6039},{ 8102, 6048},
-        { 8442, 6051},{ 8790, 6054},{ 9136, 6057},{ 9554, 6041}
-      },
-      /*Y'  qi=31  INTER*/
-      {
-        {   53,   12},{   90, 1340},{  169, 2765},{  259, 4062},
-        {  353, 5236},{  483, 6340},{  713, 7305},{ 1086, 8059},
-        { 1651, 8548},{ 2423, 8751},{ 3288, 8754},{ 4106, 8674},
-        { 4827, 8572},{ 5451, 8482},{ 6007, 8407},{ 6514, 8344},
-        { 6970, 8282},{ 7397, 8225},{ 7795, 8193},{ 8159, 8161},
-        { 8498, 8120},{ 8814, 8093},{ 9127, 8066},{ 9432, 8040}
-      }
-    },
-    {
-      /*Cb  qi=31  INTRA*/
-      {
-        {    7,    3},{   88,  369},{  149,  746},{  185, 1173},
-        {  234, 1595},{  308, 1967},{  399, 2278},{  494, 2537},
-        {  583, 2774},{  669, 2997},{  755, 3204},{  847, 3390},
-        {  936, 3569},{ 1008, 3759},{ 1078, 3942},{ 1162, 4104},
-        { 1262, 4238},{ 1352, 4364},{ 1442, 4470},{ 1557, 4567},
-        { 1676, 4674},{ 1759, 4781},{ 1850, 4853},{ 2043, 4897}
-      },
-      /*Cb  qi=31  INTER*/
-      {
-        {  121,   23},{   96,  335},{   72,  660},{   74, 1043},
-        {   90, 1440},{  105, 1834},{  116, 2217},{  127, 2586},
-        {  138, 2945},{  148, 3293},{  159, 3626},{  172, 3945},
-        {  185, 4256},{  202, 4559},{  223, 4856},{  245, 5150},
-        {  272, 5440},{  306, 5719},{  346, 5989},{  391, 6253},
-        {  443, 6511},{  510, 6743},{  583, 6965},{  651, 7182}
-      }
-    },
-    {
-      /*Cr  qi=31  INTRA*/
-      {
-        {   10,    7},{   88,  384},{  147,  773},{  192, 1209},
-        {  247, 1622},{  326, 1974},{  417, 2262},{  509, 2500},
-        {  596, 2726},{  670, 2949},{  754, 3170},{  836, 3370},
-        {  912, 3548},{  999, 3724},{ 1093, 3888},{ 1198, 4000},
-        { 1304, 4095},{ 1384, 4230},{ 1470, 4347},{ 1577, 4422},
-        { 1696, 4513},{ 1798, 4620},{ 1869, 4746},{ 1991, 4798}
-      },
-      /*Cr  qi=31  INTER*/
-      {
-        {  113,   32},{   88,  345},{   69,  674},{   79, 1055},
-        {   96, 1451},{  108, 1839},{  115, 2218},{  123, 2592},
-        {  132, 2957},{  141, 3308},{  151, 3643},{  163, 3968},
-        {  179, 4285},{  200, 4590},{  225, 4886},{  254, 5169},
-        {  291, 5436},{  330, 5696},{  368, 5951},{  409, 6200},
-        {  452, 6448},{  493, 6695},{  536, 6940},{  571, 7204}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=32  INTRA*/
-      {
-        {  123,   26},{  370, 1356},{  756, 2321},{ 1211, 2944},
-        { 1674, 3408},{ 2148, 3826},{ 2639, 4193},{ 3138, 4504},
-        { 3634, 4765},{ 4133, 4973},{ 4625, 5137},{ 5101, 5225},
-        { 5567, 5274},{ 6002, 5363},{ 6437, 5482},{ 6885, 5566},
-        { 7312, 5625},{ 7723, 5686},{ 8101, 5721},{ 8429, 5732},
-        { 8769, 5728},{ 9120, 5726},{ 9472, 5723},{ 9918, 5700}
-      },
-      /*Y'  qi=32  INTER*/
-      {
-        {   54,   -3},{   95, 1343},{  179, 2750},{  276, 4027},
-        {  382, 5185},{  543, 6256},{  830, 7161},{ 1301, 7815},
-        { 2003, 8172},{ 2883, 8266},{ 3779, 8217},{ 4578, 8127},
-        { 5274, 8035},{ 5886, 7952},{ 6430, 7887},{ 6929, 7835},
-        { 7380, 7779},{ 7796, 7737},{ 8190, 7705},{ 8552, 7672},
-        { 8896, 7640},{ 9210, 7612},{ 9510, 7589},{ 9746, 7552}
-      }
-    },
-    {
-      /*Cb  qi=32  INTRA*/
-      {
-        {    6,    3},{   89,  369},{  153,  746},{  193, 1167},
-        {  247, 1577},{  330, 1935},{  429, 2236},{  528, 2494},
-        {  620, 2732},{  712, 2948},{  801, 3146},{  898, 3325},
-        {  999, 3489},{ 1078, 3664},{ 1155, 3832},{ 1251, 3985},
-        { 1360, 4115},{ 1451, 4236},{ 1549, 4338},{ 1667, 4433},
-        { 1797, 4522},{ 1891, 4613},{ 1989, 4687},{ 2162, 4776}
-      },
-      /*Cb  qi=32  INTER*/
-      {
-        {  116,   -1},{   98,  321},{   80,  656},{   80, 1042},
-        {   96, 1438},{  110, 1827},{  122, 2205},{  133, 2570},
-        {  144, 2925},{  157, 3268},{  170, 3597},{  185, 3911},
-        {  202, 4216},{  221, 4516},{  244, 4809},{  273, 5096},
-        {  308, 5376},{  350, 5644},{  401, 5907},{  459, 6160},
-        {  520, 6401},{  592, 6630},{  676, 6837},{  758, 7050}
-      }
-    },
-    {
-      /*Cr  qi=32  INTRA*/
-      {
-        {   12,    7},{   91,  386},{  152,  773},{  201, 1202},
-        {  261, 1603},{  347, 1942},{  447, 2223},{  540, 2460},
-        {  626, 2684},{  711, 2901},{  801, 3115},{  887, 3312},
-        {  969, 3480},{ 1068, 3633},{ 1176, 3779},{ 1283, 3885},
-        { 1392, 3969},{ 1485, 4090},{ 1573, 4206},{ 1686, 4274},
-        { 1813, 4354},{ 1911, 4459},{ 2004, 4563},{ 2162, 4590}
-      },
-      /*Cr  qi=32  INTER*/
-      {
-        {  129,    5},{   98,  334},{   75,  673},{   84, 1055},
-        {  101, 1448},{  113, 1832},{  121, 2206},{  129, 2577},
-        {  140, 2937},{  151, 3282},{  163, 3614},{  179, 3932},
-        {  198, 4240},{  221, 4542},{  252, 4830},{  290, 5102},
-        {  329, 5364},{  373, 5618},{  420, 5864},{  468, 6105},
-        {  513, 6351},{  564, 6587},{  624, 6810},{  697, 7017}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=33  INTRA*/
-      {
-        {  115,   36},{  388, 1338},{  791, 2289},{ 1258, 2899},
-        { 1732, 3352},{ 2220, 3760},{ 2730, 4117},{ 3244, 4415},
-        { 3751, 4662},{ 4261, 4858},{ 4766, 5012},{ 5249, 5094},
-        { 5719, 5141},{ 6159, 5225},{ 6597, 5333},{ 7044, 5416},
-        { 7474, 5472},{ 7893, 5531},{ 8268, 5570},{ 8591, 5580},
-        { 8931, 5578},{ 9283, 5579},{ 9634, 5582},{10067, 5560}
-      },
-      /*Y'  qi=33  INTER*/
-      {
-        {   65,  -14},{  102, 1345},{  190, 2736},{  294, 3999},
-        {  411, 5146},{  597, 6192},{  934, 7045},{ 1488, 7622},
-        { 2281, 7895},{ 3213, 7937},{ 4108, 7871},{ 4883, 7784},
-        { 5556, 7709},{ 6150, 7643},{ 6685, 7585},{ 7176, 7539},
-        { 7620, 7502},{ 8034, 7466},{ 8427, 7435},{ 8793, 7409},
-        { 9136, 7386},{ 9446, 7364},{ 9743, 7339},{10025, 7303}
-      }
-    },
-    {
-      /*Cb  qi=33  INTRA*/
-      {
-        {    5,    3},{   92,  369},{  159,  746},{  203, 1163},
-        {  263, 1564},{  353, 1911},{  458, 2204},{  557, 2460},
-        {  650, 2697},{  744, 2913},{  836, 3110},{  934, 3292},
-        { 1036, 3454},{ 1125, 3616},{ 1204, 3781},{ 1298, 3932},
-        { 1410, 4058},{ 1507, 4170},{ 1606, 4265},{ 1725, 4358},
-        { 1853, 4445},{ 1955, 4535},{ 2067, 4597},{ 2258, 4663}
-      },
-      /*Cb  qi=33  INTER*/
-      {
-        {  109,   37},{   94,  343},{   81,  662},{   85, 1042},
-        {  102, 1436},{  116, 1823},{  128, 2195},{  141, 2554},
-        {  154, 2906},{  167, 3246},{  183, 3570},{  202, 3881},
-        {  220, 4185},{  241, 4482},{  268, 4772},{  302, 5053},
-        {  341, 5328},{  388, 5592},{  446, 5846},{  507, 6096},
-        {  581, 6328},{  670, 6534},{  762, 6731},{  842, 6922}
-      }
-    },
-    {
-      /*Cr  qi=33  INTRA*/
-      {
-        {   11,    7},{   93,  387},{  158,  774},{  211, 1197},
-        {  278, 1589},{  372, 1917},{  475, 2191},{  569, 2429},
-        {  658, 2655},{  744, 2868},{  835, 3083},{  926, 3271},
-        { 1010, 3430},{ 1110, 3586},{ 1224, 3724},{ 1336, 3826},
-        { 1449, 3908},{ 1547, 4021},{ 1636, 4136},{ 1751, 4200},
-        { 1886, 4277},{ 1977, 4384},{ 2070, 4474},{ 2232, 4510}
-      },
-      /*Cr  qi=33  INTER*/
-      {
-        {   77,    9},{   90,  347},{   80,  674},{   91, 1053},
-        {  107, 1444},{  119, 1825},{  127, 2196},{  137, 2563},
-        {  149, 2919},{  161, 3259},{  176, 3588},{  194, 3905},
-        {  217, 4209},{  246, 4504},{  280, 4786},{  320, 5055},
-        {  364, 5316},{  409, 5565},{  460, 5804},{  517, 6039},
-        {  578, 6264},{  640, 6489},{  701, 6721},{  772, 6948}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=34  INTRA*/
-      {
-        {  124,   40},{  401, 1333},{  823, 2262},{ 1318, 2842},
-        { 1823, 3265},{ 2339, 3650},{ 2872, 3991},{ 3405, 4274},
-        { 3926, 4513},{ 4448, 4704},{ 4961, 4845},{ 5450, 4921},
-        { 5925, 4971},{ 6372, 5053},{ 6813, 5160},{ 7264, 5242},
-        { 7704, 5291},{ 8124, 5346},{ 8500, 5382},{ 8831, 5384},
-        { 9178, 5380},{ 9525, 5387},{ 9869, 5389},{10310, 5356}
-      },
-      /*Y'  qi=34  INTER*/
-      {
-        {   64,  -17},{  101, 1344},{  190, 2730},{  299, 3981},
-        {  430, 5110},{  648, 6127},{ 1036, 6933},{ 1664, 7445},
-        { 2535, 7652},{ 3504, 7653},{ 4402, 7572},{ 5173, 7479},
-        { 5843, 7400},{ 6441, 7334},{ 6976, 7280},{ 7464, 7231},
-        { 7910, 7189},{ 8332, 7157},{ 8730, 7125},{ 9091, 7103},
-        { 9422, 7086},{ 9753, 7061},{10067, 7036},{10316, 7029}
-      }
-    },
-    {
-      /*Cb  qi=34  INTRA*/
-      {
-        {    5,    3},{   91,  369},{  158,  746},{  204, 1162},
-        {  266, 1561},{  358, 1903},{  466, 2189},{  570, 2439},
-        {  665, 2671},{  765, 2880},{  864, 3069},{  970, 3238},
-        { 1079, 3392},{ 1174, 3545},{ 1265, 3693},{ 1360, 3841},
-        { 1471, 3968},{ 1572, 4083},{ 1675, 4181},{ 1804, 4255},
-        { 1939, 4332},{ 2048, 4411},{ 2155, 4484},{ 2339, 4584}
-      },
-      /*Cb  qi=34  INTER*/
-      {
-        {   99,   44},{   92,  345},{   82,  661},{   86, 1043},
-        {  101, 1436},{  116, 1821},{  128, 2191},{  140, 2549},
-        {  154, 2898},{  168, 3235},{  185, 3556},{  203, 3865},
-        {  224, 4166},{  248, 4457},{  278, 4741},{  315, 5021},
-        {  361, 5289},{  416, 5546},{  483, 5792},{  559, 6025},
-        {  651, 6237},{  752, 6432},{  849, 6626},{  967, 6790}
-      }
-    },
-    {
-      /*Cr  qi=34  INTRA*/
-      {
-        {   11,    7},{   93,  387},{  158,  773},{  212, 1195},
-        {  282, 1584},{  378, 1909},{  483, 2179},{  578, 2414},
-        {  671, 2633},{  766, 2837},{  866, 3038},{  960, 3223},
-        { 1049, 3376},{ 1158, 3520},{ 1285, 3644},{ 1400, 3740},
-        { 1505, 3828},{ 1616, 3928},{ 1713, 4030},{ 1820, 4104},
-        { 1957, 4185},{ 2063, 4280},{ 2160, 4355},{ 2320, 4341}
-      },
-      /*Cr  qi=34  INTER*/
-      {
-        {   78,   11},{   89,  347},{   79,  674},{   90, 1053},
-        {  106, 1444},{  117, 1823},{  127, 2192},{  137, 2558},
-        {  149, 2912},{  163, 3249},{  178, 3574},{  197, 3888},
-        {  222, 4189},{  252, 4481},{  293, 4755},{  341, 5013},
-        {  386, 5268},{  436, 5512},{  498, 5743},{  563, 5970},
-        {  622, 6200},{  694, 6415},{  776, 6622},{  871, 6818}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=35  INTRA*/
-      {
-        {  116,   51},{  433, 1312},{  881, 2221},{ 1406, 2771},
-        { 1948, 3156},{ 2511, 3501},{ 3085, 3811},{ 3654, 4066},
-        { 4212, 4273},{ 4763, 4444},{ 5298, 4572},{ 5799, 4638},
-        { 6285, 4678},{ 6747, 4746},{ 7203, 4838},{ 7673, 4905},
-        { 8124, 4950},{ 8552, 5003},{ 8938, 5027},{ 9275, 5026},
-        { 9628, 5019},{ 9981, 5024},{10331, 5030},{10795, 5000}
-      },
-      /*Y'  qi=35  INTER*/
-      {
-        {   71,  -10},{  108, 1348},{  203, 2710},{  325, 3938},
-        {  485, 5040},{  766, 6000},{ 1267, 6706},{ 2048, 7089},
-        { 3037, 7191},{ 4032, 7146},{ 4903, 7061},{ 5648, 6977},
-        { 6301, 6912},{ 6884, 6857},{ 7413, 6812},{ 7898, 6775},
-        { 8342, 6739},{ 8764, 6710},{ 9160, 6688},{ 9519, 6668},
-        { 9859, 6646},{10190, 6625},{10492, 6612},{10755, 6595}
-      }
-    },
-    {
-      /*Cb  qi=35  INTRA*/
-      {
-        {    6,    3},{   95,  369},{  164,  746},{  214, 1156},
-        {  287, 1542},{  390, 1869},{  504, 2143},{  611, 2388},
-        {  712, 2613},{  822, 2811},{  937, 2987},{ 1055, 3147},
-        { 1174, 3285},{ 1286, 3420},{ 1386, 3560},{ 1488, 3698},
-        { 1604, 3814},{ 1714, 3916},{ 1825, 4008},{ 1958, 4088},
-        { 2101, 4159},{ 2224, 4226},{ 2339, 4292},{ 2538, 4383}
-      },
-      /*Cb  qi=35  INTER*/
-      {
-        {   98,   41},{   90,  348},{   86,  665},{   92, 1042},
-        {  108, 1432},{  122, 1812},{  136, 2175},{  151, 2528},
-        {  165, 2872},{  182, 3202},{  202, 3516},{  225, 3819},
-        {  251, 4112},{  281, 4398},{  320, 4675},{  367, 4944},
-        {  421, 5204},{  493, 5450},{  579, 5679},{  672, 5892},
-        {  785, 6082},{  906, 6258},{ 1026, 6432},{ 1153, 6592}
-      }
-    },
-    {
-      /*Cr  qi=35  INTRA*/
-      {
-        {   12,    7},{   98,  388},{  166,  773},{  226, 1187},
-        {  306, 1563},{  411, 1874},{  524, 2134},{  622, 2365},
-        {  721, 2577},{  826, 2768},{  947, 2946},{ 1066, 3106},
-        { 1163, 3250},{ 1274, 3395},{ 1417, 3508},{ 1539, 3590},
-        { 1639, 3671},{ 1754, 3765},{ 1865, 3855},{ 1979, 3921},
-        { 2127, 3998},{ 2249, 4085},{ 2346, 4172},{ 2473, 4210}
-      },
-      /*Cr  qi=35  INTER*/
-      {
-        {   86,   12},{   94,  354},{   85,  677},{   96, 1052},
-        {  113, 1439},{  125, 1811},{  135, 2177},{  147, 2537},
-        {  160, 2884},{  177, 3215},{  195, 3535},{  219, 3842},
-        {  252, 4133},{  292, 4413},{  339, 4680},{  396, 4928},
-        {  455, 5169},{  514, 5408},{  588, 5626},{  672, 5835},
-        {  750, 6051},{  837, 6257},{  943, 6442},{ 1073, 6595}
+        {   12,  688},{   11,  660},{   28,  869},{   46, 1227},
+        {   60, 1598},{   68, 1954},{   79, 2318},{   93, 2693},
+        {  108, 3054},{  123, 3406},{  138, 3748},{  151, 4078},
+        {  165, 4400},{  180, 4716},{  197, 5024},{  217, 5314},
+        {  243, 5599},{  275, 5866},{  301, 6128},{  327, 6394},
+        {  352, 6644},{  375, 6894},{  376, 7180},{  458, 7334}
       }
     }
   },
@@ -2290,557 +279,61 @@ oc_mode_rd OC_MODE_RD[64][3][2][OC_SAD_BINS]={
     {
       /*Y'  qi=36  INTRA*/
       {
-        {  116,   52},{  432, 1312},{  881, 2215},{ 1407, 2759},
-        { 1948, 3140},{ 2511, 3484},{ 3090, 3789},{ 3672, 4036},
-        { 4243, 4236},{ 4803, 4397},{ 5346, 4517},{ 5856, 4581},
-        { 6350, 4614},{ 6821, 4675},{ 7286, 4763},{ 7754, 4832},
-        { 8201, 4875},{ 8631, 4922},{ 9015, 4948},{ 9351, 4945},
-        { 9706, 4941},{10061, 4948},{10408, 4949},{10878, 4923}
+        {  156,  263},{  484, 1370},{ 1174, 2110},{ 1914, 2456},
+        { 2601, 2695},{ 3221, 2984},{ 3865, 3284},{ 4450, 3530},
+        { 4979, 3739},{ 5470, 3928},{ 5905, 4080},{ 6375, 4200},
+        { 6761, 4373},{ 7175, 4429},{ 7615, 4616},{ 8069, 4687},
+        { 8417, 4820},{ 8813, 4908},{ 9211, 5001},{ 9508, 5073},
+        { 9888, 5133},{10209, 5140},{10529, 5196},{10830, 5173}
       },
       /*Y'  qi=36  INTER*/
       {
-        {   63,  -16},{  114, 1332},{  216, 2690},{  343, 3914},
-        {  515, 5009},{  829, 5939},{ 1399, 6586},{ 2263, 6901},
-        { 3290, 6967},{ 4272, 6920},{ 5115, 6847},{ 5839, 6779},
-        { 6478, 6726},{ 7051, 6685},{ 7571, 6649},{ 8050, 6614},
-        { 8495, 6587},{ 8908, 6567},{ 9298, 6550},{ 9673, 6530},
-        {10005, 6512},{10324, 6499},{10640, 6483},{10936, 6487}
+        {   68,  151},{  107, 1413},{  262, 2665},{  542, 3715},
+        {  946, 4584},{ 1508, 5279},{ 2167, 5829},{ 2968, 6179},
+        { 3758, 6392},{ 4481, 6517},{ 5139, 6577},{ 5706, 6636},
+        { 6271, 6612},{ 6746, 6585},{ 7216, 6533},{ 7622, 6496},
+        { 8045, 6403},{ 8393, 6389},{ 8799, 6272},{ 9062, 6281},
+        { 9436, 6184},{ 9637, 6238},{ 9864, 6215},{10147, 6215}
       }
     },
     {
       /*Cb  qi=36  INTRA*/
       {
-        {    6,    3},{   98,  370},{  170,  746},{  225, 1150},
-        {  306, 1527},{  416, 1845},{  534, 2116},{  642, 2363},
-        {  743, 2591},{  851, 2794},{  964, 2972},{ 1081, 3133},
-        { 1198, 3275},{ 1311, 3410},{ 1411, 3547},{ 1519, 3680},
-        { 1642, 3789},{ 1750, 3892},{ 1860, 3982},{ 1998, 4054},
-        { 2141, 4129},{ 2256, 4204},{ 2372, 4278},{ 2567, 4356}
+        {   91,  385},{  138,  613},{  205,  932},{  265, 1239},
+        {  353, 1549},{  443, 1839},{  518, 2104},{  655, 2341},
+        {  764, 2559},{  876, 2756},{  967, 2950},{ 1088, 3107},
+        { 1184, 3266},{ 1295, 3396},{ 1375, 3548},{ 1502, 3664},
+        { 1610, 3764},{ 1731, 3844},{ 1839, 3938},{ 1954, 4016},
+        { 2069, 4100},{ 2207, 4167},{ 2274, 4253},{ 2374, 4289}
       },
       /*Cb  qi=36  INTER*/
       {
-        {  107,   30},{   96,  346},{   88,  667},{  100, 1039},
-        {  115, 1426},{  128, 1804},{  142, 2164},{  158, 2512},
-        {  176, 2851},{  195, 3178},{  218, 3491},{  243, 3791},
-        {  270, 4084},{  307, 4365},{  348, 4638},{  397, 4908},
-        {  464, 5157},{  545, 5392},{  635, 5620},{  734, 5831},
-        {  854, 6015},{  993, 6170},{ 1124, 6327},{ 1234, 6502}
+        {   59,   18},{   56,  463},{   50,  790},{   76, 1155},
+        {   90, 1515},{  108, 1877},{  125, 2226},{  150, 2562},
+        {  177, 2890},{  203, 3203},{  231, 3501},{  259, 3789},
+        {  289, 4074},{  325, 4348},{  367, 4608},{  418, 4857},
+        {  486, 5093},{  574, 5307},{  677, 5494},{  784, 5688},
+        {  914, 5844},{ 1033, 6004},{ 1142, 6179},{ 1307, 6220}
       }
     },
     {
       /*Cr  qi=36  INTRA*/
       {
-        {   12,    7},{  102,  388},{  172,  773},{  239, 1182},
-        {  328, 1546},{  439, 1848},{  554, 2106},{  651, 2341},
-        {  747, 2561},{  850, 2757},{  972, 2934},{ 1086, 3097},
-        { 1182, 3245},{ 1302, 3382},{ 1447, 3491},{ 1572, 3567},
-        { 1677, 3641},{ 1793, 3733},{ 1899, 3828},{ 2013, 3894},
-        { 2163, 3967},{ 2283, 4059},{ 2387, 4142},{ 2559, 4145}
+        {   87,  376},{  132,  616},{  190,  931},{  268, 1260},
+        {  358, 1550},{  457, 1833},{  592, 2082},{  685, 2318},
+        {  781, 2548},{  867, 2757},{  968, 2953},{ 1080, 3124},
+        { 1173, 3255},{ 1282, 3390},{ 1410, 3477},{ 1528, 3593},
+        { 1645, 3612},{ 1766, 3739},{ 1885, 3789},{ 1954, 3892},
+        { 2115, 3987},{ 2202, 4052},{ 2280, 4172},{ 2379, 4213}
       },
       /*Cr  qi=36  INTER*/
       {
-        {   98,  -10},{   96,  347},{   89,  676},{  102, 1048},
-        {  118, 1433},{  130, 1804},{  141, 2167},{  154, 2523},
-        {  171, 2866},{  190, 3194},{  212, 3508},{  240, 3809},
-        {  276, 4099},{  320, 4377},{  372, 4638},{  428, 4887},
-        {  492, 5122},{  560, 5353},{  638, 5572},{  725, 5779},
-        {  814, 5985},{  902, 6192},{ 1013, 6377},{ 1155, 6527}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=37  INTRA*/
-      {
-        {  109,   58},{  445, 1302},{  927, 2177},{ 1489, 2689},
-        { 2053, 3052},{ 2632, 3387},{ 3230, 3683},{ 3830, 3922},
-        { 4417, 4114},{ 4992, 4266},{ 5546, 4375},{ 6067, 4430},
-        { 6571, 4459},{ 7046, 4516},{ 7513, 4599},{ 7991, 4663},
-        { 8445, 4706},{ 8883, 4749},{ 9273, 4771},{ 9612, 4770},
-        { 9970, 4765},{10325, 4773},{10672, 4778},{11106, 4758}
-      },
-      /*Y'  qi=37  INTER*/
-      {
-        {   56,  -14},{  114, 1333},{  218, 2683},{  354, 3894},
-        {  550, 4966},{  916, 5854},{ 1569, 6437},{ 2520, 6685},
-        { 3596, 6704},{ 4585, 6635},{ 5424, 6556},{ 6147, 6489},
-        { 6787, 6437},{ 7358, 6395},{ 7876, 6358},{ 8361, 6325},
-        { 8807, 6294},{ 9229, 6271},{ 9631, 6253},{10002, 6238},
-        {10356, 6228},{10678, 6212},{10975, 6197},{11274, 6185}
-      }
-    },
-    {
-      /*Cb  qi=37  INTRA*/
-      {
-        {    6,    3},{   99,  370},{  171,  746},{  227, 1149},
-        {  309, 1522},{  421, 1836},{  541, 2104},{  652, 2347},
-        {  757, 2572},{  871, 2768},{  989, 2936},{ 1111, 3087},
-        { 1238, 3223},{ 1357, 3352},{ 1465, 3486},{ 1576, 3612},
-        { 1709, 3705},{ 1828, 3801},{ 1937, 3895},{ 2076, 3967},
-        { 2220, 4035},{ 2345, 4104},{ 2466, 4173},{ 2680, 4265}
-      },
-      /*Cb  qi=37  INTER*/
-      {
-        {  111,   27},{   97,  344},{   87,  667},{   99, 1038},
-        {  115, 1425},{  128, 1802},{  143, 2160},{  159, 2506},
-        {  176, 2843},{  198, 3167},{  220, 3477},{  247, 3774},
-        {  280, 4061},{  321, 4338},{  368, 4608},{  427, 4867},
-        {  501, 5109},{  595, 5332},{  701, 5544},{  818, 5738},
-        {  956, 5905},{ 1105, 6066},{ 1248, 6217},{ 1381, 6353}
-      }
-    },
-    {
-      /*Cr  qi=37  INTRA*/
-      {
-        {   12,    7},{  102,  388},{  173,  773},{  242, 1180},
-        {  331, 1541},{  444, 1839},{  562, 2095},{  662, 2326},
-        {  763, 2540},{  871, 2728},{ 1003, 2892},{ 1130, 3045},
-        { 1230, 3188},{ 1350, 3321},{ 1503, 3418},{ 1634, 3492},
-        { 1737, 3568},{ 1856, 3653},{ 1970, 3744},{ 2091, 3802},
-        { 2247, 3871},{ 2371, 3962},{ 2477, 4041},{ 2655, 4052}
-      },
-      /*Cr  qi=37  INTER*/
-      {
-        {   89,   -9},{   97,  347},{   88,  677},{  102, 1048},
-        {  118, 1432},{  130, 1802},{  141, 2163},{  154, 2517},
-        {  172, 2857},{  192, 3181},{  216, 3494},{  246, 3793},
-        {  286, 4074},{  337, 4343},{  395, 4600},{  464, 4837},
-        {  534, 5066},{  608, 5289},{  694, 5501},{  788, 5704},
-        {  893, 5901},{ 1010, 6088},{ 1151, 6249},{ 1331, 6374}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=38  INTRA*/
-      {
-        {  107,   65},{  476, 1286},{  968, 2148},{ 1548, 2641},
-        { 2141, 2979},{ 2757, 3289},{ 3390, 3564},{ 4020, 3784},
-        { 4632, 3957},{ 5224, 4097},{ 5794, 4201},{ 6326, 4250},
-        { 6828, 4274},{ 7309, 4322},{ 7790, 4401},{ 8271, 4463},
-        { 8729, 4498},{ 9165, 4540},{ 9552, 4566},{ 9901, 4560},
-        {10266, 4552},{10617, 4563},{10964, 4572},{11393, 4567}
-      },
-      /*Y'  qi=38  INTER*/
-      {
-        {   57,  -13},{  118, 1332},{  233, 2665},{  386, 3856},
-        {  620, 4899},{ 1070, 5722},{ 1849, 6211},{ 2898, 6384},
-        { 3989, 6376},{ 4947, 6311},{ 5754, 6249},{ 6454, 6199},
-        { 7077, 6161},{ 7640, 6132},{ 8159, 6101},{ 8639, 6076},
-        { 9081, 6054},{ 9502, 6037},{ 9900, 6027},{10274, 6012},
-        {10621, 5999},{10938, 5991},{11237, 5977},{11557, 5966}
-      }
-    },
-    {
-      /*Cb  qi=38  INTRA*/
-      {
-        {    8,    3},{  104,  370},{  179,  744},{  243, 1139},
-        {  338, 1498},{  458, 1801},{  584, 2060},{  700, 2297},
-        {  812, 2514},{  935, 2699},{ 1061, 2858},{ 1189, 3007},
-        { 1321, 3141},{ 1446, 3266},{ 1563, 3388},{ 1684, 3512},
-        { 1816, 3614},{ 1942, 3702},{ 2055, 3793},{ 2201, 3857},
-        { 2357, 3923},{ 2477, 3994},{ 2593, 4061},{ 2768, 4178}
-      },
-      /*Cb  qi=38  INTER*/
-      {
-        {  118,   24},{  102,  342},{   91,  663},{  101, 1040},
-        {  116, 1427},{  131, 1799},{  147, 2152},{  168, 2491},
-        {  191, 2822},{  215, 3139},{  244, 3441},{  276, 3731},
-        {  316, 4013},{  363, 4286},{  423, 4546},{  495, 4795},
-        {  584, 5028},{  691, 5242},{  814, 5439},{  959, 5608},
-        { 1119, 5759},{ 1277, 5906},{ 1449, 6035},{ 1655, 6144}
-      }
-    },
-    {
-      /*Cr  qi=38  INTRA*/
-      {
-        {   12,    6},{  106,  387},{  182,  771},{  261, 1168},
-        {  364, 1514},{  483, 1802},{  603, 2053},{  707, 2282},
-        {  817, 2489},{  933, 2670},{ 1074, 2825},{ 1210, 2967},
-        { 1320, 3104},{ 1444, 3229},{ 1599, 3324},{ 1735, 3396},
-        { 1846, 3464},{ 1971, 3547},{ 2086, 3646},{ 2206, 3711},
-        { 2366, 3773},{ 2499, 3859},{ 2603, 3945},{ 2766, 3952}
-      },
-      /*Cr  qi=38  INTER*/
-      {
-        {   86,   -9},{   91,  352},{   85,  680},{  102, 1053},
-        {  119, 1435},{  132, 1799},{  146, 2153},{  162, 2501},
-        {  183, 2835},{  209, 3154},{  240, 3458},{  278, 3751},
-        {  327, 4025},{  388, 4284},{  455, 4532},{  529, 4766},
-        {  616, 4980},{  711, 5188},{  815, 5386},{  920, 5583},
-        { 1042, 5770},{ 1186, 5936},{ 1348, 6080},{ 1542, 6196}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=39  INTRA*/
-      {
-        {  103,   66},{  479, 1283},{  998, 2125},{ 1610, 2591},
-        { 2223, 2913},{ 2855, 3214},{ 3501, 3482},{ 4146, 3698},
-        { 4772, 3868},{ 5376, 3999},{ 5956, 4095},{ 6496, 4140},
-        { 7008, 4162},{ 7499, 4209},{ 7987, 4282},{ 8478, 4338},
-        { 8947, 4374},{ 9385, 4417},{ 9783, 4437},{10143, 4433},
-        {10504, 4424},{10866, 4435},{11225, 4444},{11665, 4430}
-      },
-      /*Y'  qi=39  INTER*/
-      {
-        {   56,    2},{  118, 1332},{  235, 2660},{  395, 3843},
-        {  653, 4867},{ 1153, 5652},{ 2003, 6089},{ 3113, 6214},
-        { 4228, 6178},{ 5189, 6102},{ 6002, 6031},{ 6707, 5976},
-        { 7336, 5936},{ 7901, 5900},{ 8424, 5870},{ 8915, 5844},
-        { 9361, 5822},{ 9784, 5807},{10187, 5794},{10571, 5778},
-        {10931, 5763},{11264, 5751},{11582, 5742},{11916, 5730}
-      }
-    },
-    {
-      /*Cb  qi=39  INTRA*/
-      {
-        {    8,    3},{  104,  370},{  179,  744},{  244, 1138},
-        {  340, 1496},{  461, 1796},{  588, 2053},{  705, 2288},
-        {  820, 2503},{  945, 2684},{ 1073, 2840},{ 1210, 2981},
-        { 1352, 3106},{ 1480, 3225},{ 1603, 3342},{ 1728, 3464},
-        { 1865, 3559},{ 1990, 3645},{ 2106, 3734},{ 2258, 3796},
-        { 2413, 3856},{ 2540, 3920},{ 2667, 3986},{ 2887, 4060}
-      },
-      /*Cb  qi=39  INTER*/
-      {
-        {  119,   19},{  103,  340},{   90,  664},{  100, 1040},
-        {  115, 1426},{  131, 1797},{  148, 2148},{  169, 2486},
-        {  192, 2816},{  217, 3131},{  247, 3432},{  282, 3721},
-        {  324, 3999},{  374, 4268},{  435, 4526},{  520, 4766},
-        {  621, 4990},{  738, 5194},{  878, 5376},{ 1035, 5543},
-        { 1202, 5686},{ 1374, 5819},{ 1545, 5950},{ 1729, 6064}
-      }
-    },
-    {
-      /*Cr  qi=39  INTRA*/
-      {
-        {   12,    6},{  106,  387},{  182,  771},{  262, 1167},
-        {  365, 1512},{  486, 1798},{  608, 2047},{  713, 2274},
-        {  824, 2479},{  945, 2655},{ 1091, 2804},{ 1231, 2941},
-        { 1346, 3073},{ 1475, 3194},{ 1633, 3282},{ 1778, 3345},
-        { 1891, 3414},{ 2013, 3501},{ 2138, 3584},{ 2266, 3640},
-        { 2428, 3701},{ 2568, 3782},{ 2674, 3863},{ 2816, 3894}
-      },
-      /*Cr  qi=39  INTER*/
-      {
-        {   88,   -7},{   92,  352},{   85,  680},{  102, 1053},
-        {  119, 1434},{  132, 1797},{  146, 2151},{  163, 2498},
-        {  185, 2830},{  211, 3147},{  243, 3451},{  285, 3735},
-        {  337, 4005},{  401, 4260},{  477, 4499},{  565, 4721},
-        {  655, 4937},{  749, 5148},{  858, 5344},{  979, 5529},
-        { 1110, 5710},{ 1264, 5871},{ 1460, 5990},{ 1677, 6086}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=40  INTRA*/
-      {
-        {   98,   71},{  491, 1274},{ 1023, 2103},{ 1641, 2559},
-        { 2257, 2877},{ 2898, 3171},{ 3566, 3429},{ 4233, 3629},
-        { 4881, 3784},{ 5499, 3906},{ 6088, 3997},{ 6631, 4040},
-        { 7145, 4060},{ 7640, 4107},{ 8128, 4178},{ 8618, 4233},
-        { 9077, 4267},{ 9514, 4304},{ 9919, 4324},{10277, 4317},
-        {10635, 4312},{10985, 4324},{11338, 4331},{11792, 4334}
-      },
-      /*Y'  qi=40  INTER*/
-      {
-        {   63,  -26},{  125, 1331},{  256, 2640},{  439, 3801},
-        {  757, 4782},{ 1391, 5474},{ 2399, 5805},{ 3582, 5870},
-        { 4678, 5824},{ 5600, 5763},{ 6386, 5710},{ 7076, 5667},
-        { 7693, 5637},{ 8252, 5610},{ 8775, 5586},{ 9255, 5571},
-        { 9694, 5556},{10115, 5541},{10530, 5530},{10903, 5522},
-        {11242, 5515},{11596, 5501},{11904, 5482},{12205, 5475}
-      }
-    },
-    {
-      /*Cb  qi=40  INTRA*/
-      {
-        {    8,    3},{  108,  371},{  189,  743},{  265, 1128},
-        {  371, 1475},{  499, 1767},{  628, 2022},{  746, 2256},
-        {  864, 2467},{  991, 2647},{ 1124, 2801},{ 1270, 2933},
-        { 1412, 3054},{ 1547, 3165},{ 1677, 3277},{ 1804, 3393},
-        { 1946, 3483},{ 2078, 3569},{ 2201, 3651},{ 2352, 3711},
-        { 2513, 3766},{ 2643, 3826},{ 2775, 3880},{ 3025, 3919}
-      },
-      /*Cb  qi=40  INTER*/
-      {
-        {  114,   35},{  104,  349},{   96,  667},{  106, 1040},
-        {  121, 1423},{  138, 1789},{  158, 2132},{  184, 2464},
-        {  212, 2787},{  242, 3095},{  279, 3389},{  321, 3671},
-        {  374, 3941},{  438, 4199},{  517, 4446},{  617, 4673},
-        {  740, 4881},{  891, 5064},{ 1058, 5225},{ 1239, 5372},
-        { 1441, 5499},{ 1638, 5610},{ 1840, 5719},{ 2076, 5814}
-      }
-    },
-    {
-      /*Cr  qi=40  INTRA*/
-      {
-        {   14,    7},{  114,  389},{  193,  771},{  283, 1156},
-        {  399, 1488},{  523, 1768},{  643, 2018},{  752, 2245},
-        {  865, 2450},{  984, 2626},{ 1139, 2763},{ 1290, 2887},
-        { 1413, 3014},{ 1550, 3128},{ 1711, 3211},{ 1865, 3268},
-        { 1981, 3334},{ 2103, 3415},{ 2237, 3486},{ 2365, 3543},
-        { 2529, 3610},{ 2666, 3700},{ 2775, 3779},{ 2929, 3803}
-      },
-      /*Cr  qi=40  INTER*/
-      {
-        {   89,   -8},{   95,  353},{   90,  681},{  107, 1053},
-        {  124, 1430},{  139, 1787},{  156, 2136},{  177, 2477},
-        {  203, 2803},{  237, 3112},{  276, 3406},{  329, 3683},
-        {  395, 3942},{  475, 4182},{  567, 4407},{  665, 4624},
-        {  767, 4834},{  879, 5032},{ 1011, 5213},{ 1169, 5375},
-        { 1348, 5525},{ 1547, 5654},{ 1785, 5743},{ 2066, 5787}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=41  INTRA*/
-      {
-        {   98,   71},{  495, 1272},{ 1040, 2090},{ 1675, 2533},
-        { 2302, 2842},{ 2953, 3132},{ 3631, 3381},{ 4309, 3574},
-        { 4966, 3726},{ 5593, 3846},{ 6189, 3934},{ 6738, 3972},
-        { 7256, 3991},{ 7754, 4036},{ 8250, 4099},{ 8747, 4150},
-        { 9207, 4185},{ 9650, 4222},{10057, 4242},{10411, 4237},
-        {10771, 4230},{11127, 4244},{11486, 4254},{11933, 4252}
-      },
-      /*Y'  qi=41  INTER*/
-      {
-        {   65,  -25},{  125, 1331},{  260, 2633},{  457, 3782},
-        {  807, 4740},{ 1499, 5397},{ 2562, 5693},{ 3766, 5743},
-        { 4859, 5695},{ 5776, 5638},{ 6556, 5590},{ 7243, 5554},
-        { 7859, 5529},{ 8417, 5506},{ 8935, 5486},{ 9419, 5473},
-        { 9869, 5460},{10296, 5446},{10711, 5436},{11089, 5430},
-        {11445, 5421},{11802, 5412},{12129, 5404},{12465, 5393}
-      }
-    },
-    {
-      /*Cb  qi=41  INTRA*/
-      {
-        {    8,    3},{  108,  371},{  189,  743},{  267, 1126},
-        {  374, 1471},{  504, 1760},{  635, 2011},{  758, 2241},
-        {  881, 2447},{ 1013, 2621},{ 1147, 2773},{ 1293, 2906},
-        { 1441, 3023},{ 1580, 3131},{ 1712, 3243},{ 1844, 3360},
-        { 1985, 3451},{ 2114, 3532},{ 2240, 3613},{ 2390, 3680},
-        { 2550, 3740},{ 2687, 3800},{ 2825, 3862},{ 3052, 3944}
-      },
-      /*Cb  qi=41  INTER*/
-      {
-        {  104,   39},{  100,  350},{   95,  667},{  105, 1040},
-        {  121, 1422},{  137, 1787},{  159, 2129},{  185, 2459},
-        {  216, 2778},{  249, 3083},{  287, 3374},{  335, 3653},
-        {  393, 3920},{  462, 4175},{  549, 4414},{  660, 4636},
-        {  791, 4839},{  952, 5014},{ 1135, 5166},{ 1337, 5297},
-        { 1552, 5411},{ 1752, 5530},{ 1972, 5634},{ 2224, 5724}
-      }
-    },
-    {
-      /*Cr  qi=41  INTRA*/
-      {
-        {   15,    7},{  115,  389},{  193,  770},{  284, 1154},
-        {  401, 1484},{  528, 1761},{  652, 2005},{  764, 2228},
-        {  882, 2427},{ 1008, 2599},{ 1167, 2734},{ 1320, 2859},
-        { 1443, 2990},{ 1580, 3103},{ 1743, 3181},{ 1894, 3241},
-        { 2012, 3309},{ 2141, 3385},{ 2272, 3459},{ 2398, 3519},
-        { 2566, 3584},{ 2707, 3680},{ 2816, 3762},{ 2991, 3770}
-      },
-      /*Cr  qi=41  INTER*/
-      {
-        {   92,   -9},{   98,  354},{   90,  682},{  107, 1052},
-        {  124, 1429},{  139, 1786},{  156, 2132},{  178, 2471},
-        {  207, 2794},{  241, 3100},{  285, 3391},{  345, 3662},
-        {  417, 3915},{  503, 4151},{  600, 4375},{  703, 4589},
-        {  815, 4791},{  942, 4981},{ 1088, 5155},{ 1250, 5316},
-        { 1432, 5462},{ 1653, 5575},{ 1930, 5639},{ 2250, 5655}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=42  INTRA*/
-      {
-        {  109,   75},{  534, 1257},{ 1114, 2047},{ 1793, 2456},
-        { 2461, 2735},{ 3157, 2994},{ 3879, 3221},{ 4595, 3396},
-        { 5282, 3531},{ 5931, 3638},{ 6546, 3714},{ 7105, 3749},
-        { 7633, 3766},{ 8147, 3803},{ 8652, 3865},{ 9148, 3915},
-        { 9613, 3946},{10075, 3976},{10489, 3997},{10835, 3994},
-        {11195, 3985},{11553, 3997},{11909, 4004},{12369, 3990}
-      },
-      /*Y'  qi=42  INTER*/
-      {
-        {   69,  -23},{  134, 1332},{  287, 2611},{  521, 3730},
-        {  970, 4624},{ 1827, 5176},{ 3028, 5382},{ 4262, 5389},
-        { 5325, 5338},{ 6214, 5291},{ 6976, 5255},{ 7651, 5228},
-        { 8260, 5206},{ 8821, 5190},{ 9343, 5177},{ 9823, 5165},
-        {10273, 5152},{10709, 5143},{11121, 5136},{11502, 5129},
-        {11857, 5125},{12193, 5115},{12520, 5107},{12802, 5097}
-      }
-    },
-    {
-      /*Cb  qi=42  INTRA*/
-      {
-        {    9,    3},{  113,  371},{  199,  743},{  279, 1123},
-        {  390, 1462},{  525, 1743},{  662, 1986},{  789, 2208},
-        {  916, 2406},{ 1057, 2571},{ 1204, 2712},{ 1362, 2835},
-        { 1524, 2943},{ 1676, 3040},{ 1815, 3145},{ 1959, 3249},
-        { 2117, 3325},{ 2249, 3406},{ 2377, 3488},{ 2537, 3547},
-        { 2706, 3597},{ 2854, 3646},{ 2999, 3705},{ 3236, 3759}
-      },
-      /*Cb  qi=42  INTER*/
-      {
-        {  114,   44},{  107,  353},{  101,  670},{  111, 1041},
-        {  129, 1418},{  148, 1775},{  174, 2110},{  208, 2432},
-        {  244, 2746},{  283, 3046},{  330, 3330},{  388, 3602},
-        {  460, 3858},{  546, 4101},{  655, 4326},{  793, 4530},
-        {  966, 4703},{ 1165, 4851},{ 1388, 4980},{ 1630, 5088},
-        { 1869, 5189},{ 2122, 5268},{ 2403, 5328},{ 2667, 5417}
-      }
-    },
-    {
-      /*Cr  qi=42  INTRA*/
-      {
-        {   15,    7},{  120,  390},{  202,  771},{  298, 1150},
-        {  421, 1473},{  553, 1743},{  681, 1982},{  796, 2199},
-        {  923, 2388},{ 1062, 2547},{ 1225, 2678},{ 1392, 2792},
-        { 1531, 2907},{ 1682, 3007},{ 1856, 3074},{ 2009, 3134},
-        { 2138, 3192},{ 2274, 3257},{ 2407, 3333},{ 2536, 3393},
-        { 2711, 3455},{ 2875, 3531},{ 3000, 3598},{ 3186, 3599}
-      },
-      /*Cr  qi=42  INTER*/
-      {
-        {   87,   -4},{   95,  358},{   97,  683},{  113, 1052},
-        {  131, 1423},{  148, 1774},{  170, 2116},{  198, 2448},
-        {  234, 2762},{  276, 3062},{  331, 3343},{  404, 3603},
-        {  494, 3844},{  598, 4067},{  715, 4276},{  842, 4471},
-        {  977, 4661},{ 1128, 4840},{ 1311, 4991},{ 1516, 5127},
-        { 1759, 5233},{ 2050, 5300},{ 2377, 5323},{ 2710, 5304}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=43  INTRA*/
-      {
-        {   99,   79},{  557, 1244},{ 1175, 2016},{ 1882, 2408},
-        { 2570, 2677},{ 3288, 2926},{ 4030, 3141},{ 4760, 3307},
-        { 5458, 3435},{ 6115, 3537},{ 6743, 3608},{ 7312, 3636},
-        { 7841, 3652},{ 8357, 3687},{ 8870, 3742},{ 9376, 3788},
-        { 9850, 3821},{10315, 3853},{10734, 3873},{11084, 3870},
-        {11442, 3862},{11800, 3874},{12160, 3879},{12618, 3876}
-      },
-      /*Y'  qi=43  INTER*/
-      {
-        {   69,  -22},{  134, 1331},{  294, 2601},{  551, 3703},
-        { 1056, 4563},{ 2003, 5061},{ 3276, 5215},{ 4534, 5194},
-        { 5599, 5133},{ 6488, 5083},{ 7257, 5044},{ 7938, 5014},
-        { 8556, 4992},{ 9124, 4975},{ 9648, 4960},{10138, 4948},
-        {10594, 4939},{11039, 4926},{11462, 4919},{11847, 4912},
-        {12216, 4904},{12570, 4896},{12883, 4889},{13189, 4879}
-      }
-    },
-    {
-      /*Cb  qi=43  INTRA*/
-      {
-        {    9,    3},{  114,  371},{  202,  740},{  294, 1110},
-        {  417, 1440},{  558, 1716},{  700, 1956},{  833, 2172},
-        {  966, 2365},{ 1116, 2524},{ 1269, 2661},{ 1431, 2781},
-        { 1599, 2885},{ 1756, 2980},{ 1902, 3082},{ 2051, 3185},
-        { 2209, 3261},{ 2337, 3342},{ 2464, 3420},{ 2633, 3475},
-        { 2809, 3525},{ 2948, 3579},{ 3094, 3633},{ 3347, 3678}
-      },
-      /*Cb  qi=43  INTER*/
-      {
-        {  111,   44},{  106,  353},{  102,  670},{  112, 1040},
-        {  128, 1416},{  148, 1771},{  176, 2104},{  211, 2424},
-        {  250, 2734},{  293, 3030},{  347, 3309},{  411, 3575},
-        {  490, 3828},{  589, 4064},{  716, 4278},{  869, 4472},
-        { 1050, 4640},{ 1264, 4781},{ 1512, 4895},{ 1775, 4991},
-        { 2042, 5069},{ 2310, 5141},{ 2593, 5207},{ 2912, 5239}
-      }
-    },
-    {
-      /*Cr  qi=43  INTRA*/
-      {
-        {   15,    7},{  121,  390},{  208,  767},{  315, 1135},
-        {  449, 1449},{  586, 1715},{  718, 1950},{  843, 2158},
-        {  977, 2342},{ 1120, 2501},{ 1290, 2632},{ 1466, 2739},
-        { 1613, 2845},{ 1763, 2945},{ 1937, 3015},{ 2093, 3070},
-        { 2225, 3126},{ 2366, 3194},{ 2501, 3267},{ 2634, 3324},
-        { 2815, 3385},{ 2964, 3466},{ 3087, 3538},{ 3263, 3555}
-      },
-      /*Cr  qi=43  INTER*/
-      {
-        {   84,   -4},{   93,  358},{   95,  683},{  113, 1052},
-        {  131, 1421},{  148, 1770},{  171, 2110},{  201, 2439},
-        {  240, 2750},{  287, 3046},{  348, 3322},{  429, 3576},
-        {  527, 3811},{  641, 4029},{  767, 4230},{  904, 4422},
-        { 1053, 4603},{ 1225, 4765},{ 1433, 4903},{ 1661, 5030},
-        { 1928, 5121},{ 2252, 5160},{ 2604, 5164},{ 2979, 5125}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=44  INTRA*/
-      {
-        {  103,   80},{  560, 1244},{ 1183, 2009},{ 1891, 2391},
-        { 2586, 2649},{ 3324, 2884},{ 4093, 3089},{ 4850, 3243},
-        { 5575, 3358},{ 6252, 3452},{ 6886, 3518},{ 7459, 3546},
-        { 7993, 3562},{ 8515, 3594},{ 9030, 3645},{ 9534, 3691},
-        {10004, 3723},{10469, 3750},{10887, 3765},{11236, 3766},
-        {11596, 3762},{11960, 3775},{12317, 3784},{12766, 3789}
-      },
-      /*Y'  qi=44  INTER*/
-      {
-        {   77,  -24},{  145, 1332},{  332, 2580},{  642, 3649},
-        { 1270, 4438},{ 2360, 4860},{ 3685, 4982},{ 4910, 4966},
-        { 5929, 4928},{ 6785, 4900},{ 7529, 4880},{ 8198, 4863},
-        { 8804, 4850},{ 9361, 4842},{ 9882, 4836},{10371, 4830},
-        {10827, 4822},{11262, 4816},{11672, 4811},{12052, 4807},
-        {12431, 4806},{12780, 4798},{13095, 4792},{13401, 4791}
-      }
-    },
-    {
-      /*Cb  qi=44  INTRA*/
-      {
-        {    9,    2},{  122,  371},{  214,  741},{  307, 1109},
-        {  433, 1432},{  576, 1704},{  718, 1939},{  855, 2152},
-        {  991, 2340},{ 1141, 2497},{ 1298, 2632},{ 1463, 2749},
-        { 1636, 2851},{ 1796, 2944},{ 1947, 3041},{ 2101, 3140},
-        { 2260, 3219},{ 2392, 3297},{ 2527, 3366},{ 2693, 3424},
-        { 2872, 3477},{ 3025, 3525},{ 3175, 3584},{ 3451, 3626}
-      },
-      /*Cb  qi=44  INTER*/
-      {
-        {  111,   14},{  110,  339},{  109,  671},{  120, 1040},
-        {  139, 1410},{  162, 1758},{  197, 2084},{  243, 2397},
-        {  291, 2702},{  342, 2992},{  405, 3265},{  484, 3521},
-        {  584, 3760},{  705, 3983},{  855, 4185},{ 1048, 4356},
-        { 1274, 4500},{ 1531, 4617},{ 1816, 4707},{ 2111, 4783},
-        { 2409, 4846},{ 2720, 4901},{ 3044, 4957},{ 3391, 4985}
-      }
-    },
-    {
-      /*Cr  qi=44  INTRA*/
-      {
-        {   17,    7},{  128,  392},{  219,  770},{  329, 1135},
-        {  465, 1442},{  601, 1703},{  734, 1935},{  862, 2142},
-        {  998, 2325},{ 1147, 2482},{ 1321, 2606},{ 1496, 2710},
-        { 1649, 2813},{ 1809, 2908},{ 1984, 2977},{ 2143, 3032},
-        { 2279, 3087},{ 2423, 3152},{ 2559, 3225},{ 2684, 3288},
-        { 2866, 3351},{ 3025, 3426},{ 3161, 3492},{ 3372, 3500}
-      },
-      /*Cr  qi=44  INTER*/
-      {
-        {   89,    0},{  101,  352},{  104,  683},{  121, 1051},
-        {  141, 1414},{  163, 1757},{  192, 2092},{  231, 2415},
-        {  278, 2720},{  336, 3007},{  412, 3273},{  510, 3516},
-        {  633, 3733},{  769, 3936},{  914, 4130},{ 1076, 4307},
-        { 1256, 4472},{ 1469, 4617},{ 1723, 4732},{ 2012, 4822},
-        { 2347, 4871},{ 2716, 4875},{ 3082, 4866},{ 3422, 4826}
+        {   53,   45},{   50,  467},{   45,  789},{   76, 1150},
+        {   92, 1531},{  107, 1877},{  125, 2219},{  147, 2561},
+        {  176, 2893},{  206, 3209},{  231, 3514},{  260, 3808},
+        {  298, 4085},{  350, 4344},{  411, 4587},{  475, 4814},
+        {  532, 5037},{  587, 5261},{  647, 5480},{  707, 5694},
+        {  793, 5900},{  891, 6093},{ 1017, 6292},{ 1205, 6307}
       }
     }
   },
@@ -2848,557 +341,61 @@ oc_mode_rd OC_MODE_RD[64][3][2][OC_SAD_BINS]={
     {
       /*Y'  qi=45  INTRA*/
       {
-        {  119,   78},{  610, 1226},{ 1271, 1965},{ 2026, 2319},
-        { 2768, 2550},{ 3556, 2757},{ 4369, 2938},{ 5157, 3076},
-        { 5901, 3182},{ 6598, 3268},{ 7253, 3326},{ 7844, 3343},
-        { 8392, 3356},{ 8922, 3386},{ 9453, 3433},{ 9973, 3474},
-        {10457, 3503},{10929, 3530},{11351, 3543},{11709, 3541},
-        {12068, 3537},{12434, 3547},{12805, 3555},{13268, 3563}
+        {   47,  170},{  955, 1217},{ 1713, 2014},{ 3050, 2094},
+        { 3954, 2179},{ 4801, 2357},{ 5629, 2494},{ 6313, 2614},
+        { 6962, 2716},{ 7566, 2820},{ 8138, 2886},{ 8613, 2949},
+        { 9097, 3031},{ 9574, 3044},{10053, 3142},{10514, 3134},
+        {10897, 3241},{11397, 3275},{11775, 3297},{12200, 3350},
+        {12527, 3350},{12959, 3393},{13246, 3401},{13573, 3397}
       },
       /*Y'  qi=45  INTER*/
       {
-        {   77,  -20},{  146, 1330},{  342, 2566},{  699, 3604},
-        { 1439, 4332},{ 2669, 4672},{ 4075, 4727},{ 5318, 4679},
-        { 6345, 4630},{ 7209, 4595},{ 7963, 4570},{ 8644, 4551},
-        { 9262, 4535},{ 9831, 4525},{10370, 4515},{10872, 4506},
-        {11334, 4500},{11783, 4492},{12219, 4489},{12617, 4483},
-        {12995, 4477},{13350, 4472},{13674, 4466},{13968, 4468}
+        {   53,   73},{  175, 1343},{  649, 2439},{ 1339, 3250},
+        { 2297, 3837},{ 3395, 4203},{ 4438, 4400},{ 5401, 4529},
+        { 6222, 4588},{ 7018, 4564},{ 7713, 4532},{ 8378, 4464},
+        { 8959, 4414},{ 9464, 4364},{ 9980, 4315},{10401, 4291},
+        {10805, 4260},{11172, 4260},{11501, 4231},{11798, 4248},
+        {12082, 4254},{12381, 4262},{12572, 4285},{12877, 4289}
       }
     },
     {
       /*Cb  qi=45  INTRA*/
       {
-        {    9,    2},{  122,  370},{  219,  735},{  324, 1096},
-        {  465, 1414},{  619, 1679},{  771, 1905},{  920, 2103},
-        { 1070, 2276},{ 1236, 2419},{ 1410, 2539},{ 1595, 2644},
-        { 1784, 2736},{ 1949, 2831},{ 2104, 2931},{ 2275, 3021},
-        { 2443, 3092},{ 2586, 3166},{ 2735, 3234},{ 2904, 3288},
-        { 3093, 3338},{ 3262, 3382},{ 3419, 3427},{ 3708, 3456}
+        {  112,  -14},{  173,  495},{  260,  827},{  355, 1122},
+        {  451, 1420},{  579, 1695},{  697, 1934},{  917, 2101},
+        { 1104, 2244},{ 1266, 2381},{ 1417, 2520},{ 1609, 2611},
+        { 1801, 2689},{ 1973, 2764},{ 2108, 2864},{ 2298, 2948},
+        { 2452, 3008},{ 2588, 3080},{ 2732, 3161},{ 2888, 3203},
+        { 3052, 3266},{ 3240, 3294},{ 3342, 3351},{ 3467, 3373}
       },
       /*Cb  qi=45  INTER*/
       {
-        {  103,    0},{  109,  339},{  109,  670},{  119, 1039},
-        {  137, 1408},{  162, 1754},{  199, 2076},{  248, 2386},
-        {  301, 2684},{  360, 2967},{  433, 3234},{  525, 3481},
-        {  640, 3713},{  780, 3924},{  956, 4110},{ 1176, 4266},
-        { 1438, 4390},{ 1736, 4481},{ 2057, 4553},{ 2385, 4613},
-        { 2718, 4656},{ 3056, 4698},{ 3416, 4733},{ 3799, 4755}
+        {   41,  -49},{   52,  385},{   87,  743},{  110, 1102},
+        {  135, 1453},{  162, 1788},{  207, 2096},{  272, 2391},
+        {  330, 2677},{  392, 2950},{  464, 3205},{  556, 3442},
+        {  674, 3656},{  827, 3847},{ 1030, 4006},{ 1275, 4132},
+        { 1544, 4234},{ 1809, 4317},{ 2089, 4408},{ 2377, 4456},
+        { 2647, 4532},{ 2919, 4595},{ 3256, 4659},{ 3465, 4657}
       }
     },
     {
       /*Cr  qi=45  INTRA*/
       {
-        {   16,    7},{  128,  391},{  225,  763},{  350, 1120},
-        {  500, 1420},{  649, 1673},{  792, 1893},{  929, 2089},
-        { 1084, 2257},{ 1250, 2401},{ 1440, 2518},{ 1633, 2614},
-        { 1799, 2708},{ 1968, 2798},{ 2151, 2863},{ 2314, 2914},
-        { 2453, 2968},{ 2611, 3025},{ 2759, 3095},{ 2887, 3160},
-        { 3082, 3210},{ 3259, 3278},{ 3403, 3342},{ 3593, 3354}
+        {   99,  -14},{  164,  493},{  247,  832},{  358, 1123},
+        {  468, 1416},{  599, 1680},{  795, 1886},{  958, 2063},
+        { 1133, 2211},{ 1300, 2345},{ 1480, 2461},{ 1664, 2554},
+        { 1807, 2656},{ 1995, 2742},{ 2146, 2799},{ 2331, 2856},
+        { 2440, 2894},{ 2592, 2996},{ 2751, 3033},{ 2865, 3112},
+        { 3073, 3162},{ 3210, 3208},{ 3330, 3306},{ 3454, 3332}
       },
       /*Cr  qi=45  INTER*/
       {
-        {   92,    0},{  101,  352},{  103,  682},{  120, 1049},
-        {  140, 1412},{  163, 1752},{  193, 2083},{  234, 2402},
-        {  287, 2702},{  353, 2983},{  442, 3240},{  557, 3471},
-        {  694, 3680},{  846, 3873},{ 1014, 4056},{ 1200, 4224},
-        { 1414, 4369},{ 1664, 4495},{ 1946, 4595},{ 2278, 4654},
-        { 2654, 4673},{ 3047, 4658},{ 3438, 4627},{ 3825, 4585}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=46  INTRA*/
-      {
-        {  119,   78},{  610, 1227},{ 1277, 1960},{ 2043, 2309},
-        { 2805, 2529},{ 3618, 2719},{ 4452, 2887},{ 5257, 3016},
-        { 6017, 3115},{ 6727, 3195},{ 7392, 3248},{ 7984, 3267},
-        { 8528, 3281},{ 9059, 3310},{ 9593, 3354},{10119, 3395},
-        {10599, 3425},{11064, 3450},{11493, 3464},{11850, 3466},
-        {12207, 3462},{12578, 3471},{12948, 3480},{13407, 3487}
-      },
-      /*Y'  qi=46  INTER*/
-      {
-        {   74,  -14},{  149, 1326},{  382, 2538},{  807, 3541},
-        { 1670, 4211},{ 3000, 4499},{ 4416, 4533},{ 5628, 4490},
-        { 6628, 4453},{ 7479, 4425},{ 8228, 4406},{ 8902, 4393},
-        { 9521, 4380},{10090, 4371},{10623, 4364},{11124, 4356},
-        {11586, 4351},{12043, 4344},{12476, 4341},{12863, 4340},
-        {13244, 4337},{13610, 4329},{13936, 4324},{14246, 4329}
-      }
-    },
-    {
-      /*Cb  qi=46  INTRA*/
-      {
-        {   11,    2},{  132,  371},{  234,  737},{  340, 1094},
-        {  481, 1405},{  637, 1667},{  791, 1891},{  944, 2084},
-        { 1099, 2253},{ 1268, 2392},{ 1444, 2507},{ 1633, 2610},
-        { 1825, 2700},{ 1990, 2794},{ 2147, 2895},{ 2321, 2984},
-        { 2493, 3053},{ 2640, 3126},{ 2787, 3198},{ 2954, 3253},
-        { 3146, 3297},{ 3313, 3344},{ 3473, 3393},{ 3757, 3434}
-      },
-      /*Cb  qi=46  INTER*/
-      {
-        {   97,    0},{  109,  339},{  108,  669},{  120, 1035},
-        {  142, 1398},{  173, 1737},{  221, 2052},{  281, 2353},
-        {  345, 2646},{  415, 2924},{  504, 3183},{  616, 3421},
-        {  749, 3643},{  914, 3842},{ 1123, 4012},{ 1379, 4150},
-        { 1685, 4250},{ 2014, 4327},{ 2366, 4382},{ 2731, 4426},
-        { 3083, 4470},{ 3445, 4490},{ 3805, 4511},{ 4146, 4539}
-      }
-    },
-    {
-      /*Cr  qi=46  INTRA*/
-      {
-        {   19,    7},{  137,  393},{  237,  765},{  364, 1116},
-        {  516, 1411},{  665, 1662},{  809, 1880},{  951, 2072},
-        { 1109, 2236},{ 1278, 2378},{ 1474, 2491},{ 1669, 2584},
-        { 1835, 2678},{ 2014, 2766},{ 2203, 2828},{ 2366, 2880},
-        { 2506, 2933},{ 2661, 2988},{ 2810, 3053},{ 2941, 3116},
-        { 3131, 3175},{ 3310, 3243},{ 3461, 3303},{ 3656, 3321}
-      },
-      /*Cr  qi=46  INTER*/
-      {
-        {   91,    1},{  103,  351},{  104,  681},{  121, 1046},
-        {  144, 1401},{  173, 1736},{  213, 2060},{  265, 2373},
-        {  330, 2666},{  410, 2938},{  517, 3185},{  655, 3404},
-        {  815, 3601},{  989, 3784},{ 1183, 3951},{ 1400, 4104},
-        { 1649, 4241},{ 1933, 4352},{ 2261, 4427},{ 2646, 4458},
-        { 3057, 4446},{ 3453, 4418},{ 3820, 4385},{ 4171, 4352}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=47  INTRA*/
-      {
-        {  117,   83},{  670, 1205},{ 1408, 1904},{ 2239, 2219},
-        { 3049, 2414},{ 3905, 2584},{ 4775, 2734},{ 5610, 2852},
-        { 6393, 2944},{ 7121, 3017},{ 7804, 3066},{ 8407, 3081},
-        { 8957, 3093},{ 9498, 3119},{10043, 3160},{10582, 3199},
-        {11083, 3226},{11561, 3250},{11993, 3263},{12352, 3264},
-        {12711, 3259},{13092, 3266},{13463, 3271},{13918, 3275}
-      },
-      /*Y'  qi=47  INTER*/
-      {
-        {   74,  -11},{  148, 1325},{  404, 2518},{  910, 3478},
-        { 1916, 4080},{ 3369, 4298},{ 4823, 4292},{ 6035, 4238},
-        { 7037, 4197},{ 7894, 4168},{ 8650, 4146},{ 9337, 4129},
-        { 9968, 4116},{10549, 4105},{11096, 4096},{11605, 4089},
-        {12081, 4083},{12547, 4076},{12990, 4070},{13399, 4070},
-        {13776, 4065},{14133, 4059},{14486, 4057},{14842, 4053}
-      }
-    },
-    {
-      /*Cb  qi=47  INTRA*/
-      {
-        {   11,    2},{  133,  370},{  242,  731},{  367, 1077},
-        {  524, 1378},{  692, 1630},{  860, 1844},{ 1028, 2024},
-        { 1203, 2178},{ 1393, 2305},{ 1582, 2413},{ 1787, 2507},
-        { 1992, 2590},{ 2175, 2676},{ 2351, 2767},{ 2534, 2851},
-        { 2707, 2923},{ 2862, 2994},{ 3021, 3060},{ 3193, 3111},
-        { 3396, 3147},{ 3573, 3184},{ 3752, 3220},{ 4038, 3255}
-      },
-      /*Cb  qi=47  INTER*/
-      {
-        {  101,    0},{  107,  339},{  108,  667},{  120, 1033},
-        {  142, 1394},{  175, 1729},{  227, 2040},{  295, 2335},
-        {  369, 2619},{  452, 2888},{  556, 3138},{  686, 3368},
-        {  850, 3574},{ 1050, 3758},{ 1299, 3910},{ 1605, 4024},
-        { 1950, 4104},{ 2317, 4163},{ 2689, 4210},{ 3077, 4239},
-        { 3466, 4258},{ 3840, 4278},{ 4205, 4298},{ 4515, 4340}
-      }
-    },
-    {
-      /*Cr  qi=47  INTRA*/
-      {
-        {   19,    7},{  138,  392},{  248,  758},{  396, 1094},
-        {  563, 1378},{  723, 1621},{  881, 1829},{ 1037, 2011},
-        { 1214, 2165},{ 1410, 2290},{ 1623, 2393},{ 1834, 2480},
-        { 2016, 2564},{ 2203, 2647},{ 2405, 2707},{ 2569, 2757},
-        { 2709, 2810},{ 2871, 2860},{ 3027, 2924},{ 3178, 2980},
-        { 3375, 3034},{ 3563, 3097},{ 3724, 3151},{ 3952, 3153}
-      },
-      /*Cr  qi=47  INTER*/
-      {
-        {   91,    1},{  100,  351},{  102,  681},{  120, 1043},
-        {  144, 1397},{  175, 1729},{  219, 2049},{  277, 2356},
-        {  353, 2640},{  451, 2902},{  579, 3136},{  739, 3342},
-        {  926, 3525},{ 1125, 3698},{ 1343, 3859},{ 1595, 3998},
-        { 1881, 4113},{ 2208, 4205},{ 2589, 4253},{ 3014, 4250},
-        { 3444, 4220},{ 3838, 4183},{ 4196, 4147},{ 4521, 4116}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=48  INTRA*/
-      {
-        {  107,   87},{  681, 1200},{ 1456, 1883},{ 2306, 2193},
-        { 3122, 2386},{ 3984, 2548},{ 4862, 2693},{ 5704, 2808},
-        { 6495, 2899},{ 7232, 2970},{ 7915, 3018},{ 8524, 3034},
-        { 9085, 3043},{ 9635, 3068},{10192, 3108},{10735, 3145},
-        {11237, 3171},{11719, 3194},{12153, 3207},{12516, 3206},
-        {12888, 3202},{13266, 3210},{13637, 3218},{14101, 3219}
-      },
-      /*Y'  qi=48  INTER*/
-      {
-        {   83,  -18},{  147, 1328},{  398, 2519},{  923, 3468},
-        { 1979, 4047},{ 3472, 4246},{ 4936, 4232},{ 6148, 4178},
-        { 7150, 4139},{ 8007, 4111},{ 8765, 4091},{ 9458, 4076},
-        {10090, 4063},{10676, 4054},{11226, 4045},{11742, 4038},
-        {12223, 4033},{12686, 4029},{13127, 4022},{13527, 4015},
-        {13915, 4012},{14277, 4007},{14619, 4004},{14966, 4001}
-      }
-    },
-    {
-      /*Cb  qi=48  INTRA*/
-      {
-        {   11,    2},{  134,  369},{  245,  730},{  373, 1075},
-        {  531, 1374},{  698, 1625},{  865, 1839},{ 1033, 2019},
-        { 1207, 2173},{ 1397, 2300},{ 1588, 2408},{ 1795, 2501},
-        { 2003, 2581},{ 2187, 2666},{ 2362, 2757},{ 2548, 2841},
-        { 2719, 2912},{ 2876, 2983},{ 3034, 3047},{ 3209, 3097},
-        { 3409, 3137},{ 3589, 3178},{ 3762, 3216},{ 4004, 3252}
-      },
-      /*Cb  qi=48  INTER*/
-      {
-        {  113,   26},{  112,  344},{  111,  668},{  120, 1032},
-        {  141, 1392},{  173, 1727},{  224, 2036},{  290, 2330},
-        {  363, 2612},{  447, 2880},{  551, 3130},{  685, 3358},
-        {  852, 3563},{ 1061, 3742},{ 1332, 3884},{ 1654, 3993},
-        { 2011, 4068},{ 2394, 4120},{ 2782, 4160},{ 3172, 4186},
-        { 3557, 4209},{ 3932, 4228},{ 4306, 4237},{ 4675, 4236}
-      }
-    },
-    {
-      /*Cr  qi=48  INTRA*/
-      {
-        {   18,    7},{  139,  389},{  252,  755},{  404, 1090},
-        {  573, 1372},{  732, 1615},{  889, 1823},{ 1045, 2005},
-        { 1222, 2159},{ 1417, 2285},{ 1631, 2387},{ 1843, 2474},
-        { 2027, 2558},{ 2212, 2639},{ 2413, 2697},{ 2578, 2746},
-        { 2720, 2798},{ 2887, 2852},{ 3040, 2913},{ 3181, 2970},
-        { 3381, 3024},{ 3581, 3081},{ 3743, 3130},{ 3948, 3133}
-      },
-      /*Cr  qi=48  INTER*/
-      {
-        {   89,    0},{  106,  352},{  105,  682},{  120, 1044},
-        {  144, 1395},{  174, 1724},{  215, 2044},{  270, 2350},
-        {  343, 2635},{  441, 2895},{  571, 3129},{  735, 3334},
-        {  926, 3518},{ 1139, 3684},{ 1371, 3836},{ 1628, 3977},
-        { 1933, 4089},{ 2279, 4164},{ 2672, 4204},{ 3105, 4205},
-        { 3533, 4176},{ 3931, 4135},{ 4290, 4089},{ 4624, 4057}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=49  INTRA*/
-      {
-        {  120,   85},{  706, 1194},{ 1485, 1875},{ 2348, 2187},
-        { 3190, 2372},{ 4076, 2521},{ 4967, 2658},{ 5819, 2771},
-        { 6611, 2861},{ 7345, 2936},{ 8026, 2990},{ 8626, 3013},
-        { 9182, 3030},{ 9723, 3059},{10266, 3100},{10802, 3143},
-        {11293, 3179},{11768, 3206},{12201, 3221},{12556, 3225},
-        {12914, 3226},{13281, 3237},{13639, 3247},{14089, 3257}
-      },
-      /*Y'  qi=49  INTER*/
-      {
-        {   72,  -11},{  155, 1320},{  458, 2485},{ 1090, 3386},
-        { 2284, 3907},{ 3835, 4075},{ 5272, 4064},{ 6449, 4026},
-        { 7426, 4003},{ 8267, 3987},{ 9017, 3976},{ 9698, 3967},
-        {10328, 3962},{10913, 3959},{11452, 3954},{11961, 3950},
-        {12442, 3947},{12904, 3946},{13347, 3945},{13749, 3943},
-        {14123, 3941},{14490, 3941},{14826, 3939},{15153, 3937}
-      }
-    },
-    {
-      /*Cb  qi=49  INTRA*/
-      {
-        {   11,    2},{  145,  369},{  262,  729},{  393, 1070},
-        {  557, 1363},{  731, 1607},{  907, 1811},{ 1085, 1983},
-        { 1268, 2130},{ 1465, 2251},{ 1658, 2359},{ 1868, 2454},
-        { 2079, 2534},{ 2264, 2621},{ 2440, 2717},{ 2625, 2802},
-        { 2792, 2878},{ 2945, 2954},{ 3106, 3021},{ 3277, 3075},
-        { 3466, 3119},{ 3638, 3170},{ 3824, 3213},{ 4100, 3243}
-      },
-      /*Cb  qi=49  INTER*/
-      {
-        {   98,   -6},{  113,  343},{  110,  669},{  122, 1029},
-        {  149, 1380},{  192, 1706},{  258, 2007},{  340, 2293},
-        {  426, 2569},{  525, 2831},{  653, 3071},{  814, 3287},
-        { 1013, 3478},{ 1262, 3637},{ 1575, 3761},{ 1936, 3851},
-        { 2328, 3910},{ 2741, 3949},{ 3163, 3970},{ 3559, 3994},
-        { 3936, 4025},{ 4300, 4050},{ 4655, 4060},{ 4962, 4062}
-      }
-    },
-    {
-      /*Cr  qi=49  INTRA*/
-      {
-        {   19,    7},{  151,  389},{  270,  753},{  427, 1084},
-        {  602, 1360},{  767, 1595},{  933, 1794},{ 1098, 1968},
-        { 1285, 2115},{ 1489, 2237},{ 1699, 2342},{ 1912, 2435},
-        { 2101, 2519},{ 2288, 2601},{ 2486, 2663},{ 2651, 2715},
-        { 2799, 2769},{ 2958, 2825},{ 3106, 2890},{ 3257, 2948},
-        { 3452, 3007},{ 3634, 3075},{ 3786, 3136},{ 3959, 3164}
-      },
-      /*Cr  qi=49  INTER*/
-      {
-        {   85,    1},{  103,  352},{  104,  681},{  121, 1039},
-        {  152, 1382},{  195, 1702},{  248, 2015},{  316, 2316},
-        {  403, 2595},{  520, 2847},{  676, 3068},{  870, 3258},
-        { 1091, 3429},{ 1329, 3585},{ 1597, 3725},{ 1894, 3849},
-        { 2242, 3940},{ 2656, 3984},{ 3098, 3992},{ 3531, 3981},
-        { 3936, 3950},{ 4304, 3915},{ 4646, 3879},{ 4915, 3861}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=50  INTRA*/
-      {
-        {  122,   89},{  798, 1170},{ 1682, 1812},{ 2613, 2096},
-        { 3501, 2260},{ 4430, 2388},{ 5352, 2510},{ 6228, 2613},
-        { 7043, 2698},{ 7793, 2770},{ 8486, 2823},{ 9092, 2846},
-        { 9652, 2865},{10210, 2895},{10773, 2936},{11315, 2979},
-        {11817, 3014},{12297, 3041},{12734, 3057},{13097, 3064},
-        {13443, 3067},{13813, 3078},{14190, 3088},{14646, 3103}
-      },
-      /*Y'  qi=50  INTER*/
-      {
-        {   73,  -11},{  154, 1318},{  501, 2457},{ 1281, 3291},
-        { 2685, 3719},{ 4356, 3810},{ 5811, 3769},{ 6988, 3726},
-        { 7976, 3700},{ 8835, 3682},{ 9606, 3669},{10307, 3659},
-        {10953, 3652},{11556, 3645},{12115, 3643},{12641, 3640},
-        {13138, 3636},{13613, 3634},{14068, 3629},{14488, 3627},
-        {14876, 3625},{15237, 3621},{15585, 3623},{15922, 3629}
-      }
-    },
-    {
-      /*Cb  qi=50  INTRA*/
-      {
-        {   11,    2},{  148,  368},{  278,  724},{  431, 1052},
-        {  613, 1334},{  806, 1567},{ 1004, 1756},{ 1203, 1915},
-        { 1405, 2051},{ 1621, 2163},{ 1833, 2262},{ 2059, 2347},
-        { 2280, 2424},{ 2476, 2512},{ 2670, 2598},{ 2864, 2679},
-        { 3037, 2754},{ 3201, 2826},{ 3376, 2887},{ 3562, 2936},
-        { 3756, 2976},{ 3932, 3022},{ 4117, 3065},{ 4385, 3094}
-      },
-      /*Cb  qi=50  INTER*/
-      {
-        {   92,   -3},{  112,  343},{  109,  669},{  121, 1027},
-        {  149, 1375},{  196, 1697},{  270, 1992},{  366, 2267},
-        {  471, 2532},{  594, 2782},{  747, 3011},{  942, 3212},
-        { 1189, 3384},{ 1497, 3521},{ 1875, 3613},{ 2297, 3673},
-        { 2739, 3710},{ 3195, 3725},{ 3644, 3737},{ 4057, 3751},
-        { 4445, 3763},{ 4841, 3769},{ 5211, 3779},{ 5568, 3769}
-      }
-    },
-    {
-      /*Cr  qi=50  INTRA*/
-      {
-        {   19,    7},{  155,  388},{  290,  744},{  474, 1060},
-        {  666, 1324},{  847, 1549},{ 1033, 1737},{ 1219, 1898},
-        { 1428, 2034},{ 1653, 2147},{ 1885, 2245},{ 2115, 2329},
-        { 2316, 2410},{ 2517, 2486},{ 2730, 2539},{ 2901, 2586},
-        { 3042, 2638},{ 3199, 2693},{ 3366, 2755},{ 3534, 2805},
-        { 3738, 2858},{ 3934, 2916},{ 4079, 2975},{ 4257, 2992}
-      },
-      /*Cr  qi=50  INTER*/
-      {
-        {   87,    1},{  102,  353},{  103,  680},{  121, 1036},
-        {  153, 1377},{  199, 1694},{  260, 1999},{  339, 2291},
-        {  446, 2559},{  590, 2797},{  780, 3003},{ 1010, 3176},
-        { 1267, 3331},{ 1547, 3474},{ 1874, 3594},{ 2245, 3688},
-        { 2666, 3742},{ 3130, 3758},{ 3594, 3748},{ 4028, 3711},
-        { 4415, 3674},{ 4771, 3641},{ 5122, 3605},{ 5482, 3569}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=51  INTRA*/
-      {
-        {  115,   93},{  819, 1164},{ 1739, 1806},{ 2695, 2101},
-        { 3612, 2257},{ 4552, 2374},{ 5479, 2490},{ 6352, 2593},
-        { 7158, 2683},{ 7898, 2761},{ 8580, 2823},{ 9177, 2854},
-        { 9728, 2880},{10268, 2917},{10816, 2966},{11350, 3016},
-        {11834, 3058},{12311, 3089},{12741, 3109},{13092, 3119},
-        {13434, 3126},{13791, 3142},{14156, 3155},{14590, 3171}
-      },
-      /*Y'  qi=51  INTER*/
-      {
-        {   58,    0},{  171, 1307},{  610, 2407},{ 1563, 3175},
-        { 3116, 3545},{ 4789, 3624},{ 6185, 3602},{ 7320, 3583},
-        { 8282, 3574},{ 9124, 3569},{ 9878, 3567},{10569, 3565},
-        {11207, 3563},{11801, 3564},{12359, 3566},{12884, 3567},
-        {13373, 3568},{13841, 3567},{14289, 3566},{14699, 3568},
-        {15086, 3568},{15446, 3566},{15788, 3564},{16103, 3568}
-      }
-    },
-    {
-      /*Cb  qi=51  INTRA*/
-      {
-        {   14,    3},{  161,  369},{  297,  722},{  454, 1047},
-        {  639, 1325},{  833, 1554},{ 1033, 1742},{ 1236, 1897},
-        { 1440, 2032},{ 1653, 2148},{ 1860, 2253},{ 2077, 2347},
-        { 2288, 2432},{ 2476, 2525},{ 2661, 2621},{ 2841, 2714},
-        { 3010, 2797},{ 3170, 2876},{ 3333, 2945},{ 3510, 3000},
-        { 3696, 3054},{ 3865, 3114},{ 4046, 3164},{ 4317, 3200}
-      },
-      /*Cb  qi=51  INTER*/
-      {
-        {   88,  -11},{  109,  341},{  109,  668},{  126, 1019},
-        {  168, 1358},{  233, 1670},{  329, 1955},{  451, 2219},
-        {  584, 2472},{  736, 2711},{  931, 2923},{ 1179, 3104},
-        { 1480, 3254},{ 1846, 3368},{ 2265, 3448},{ 2714, 3501},
-        { 3180, 3524},{ 3638, 3529},{ 4074, 3543},{ 4485, 3560},
-        { 4868, 3571},{ 5238, 3581},{ 5597, 3594},{ 5953, 3591}
-      }
-    },
-    {
-      /*Cr  qi=51  INTRA*/
-      {
-        {   24,    7},{  168,  388},{  309,  742},{  496, 1054},
-        {  688, 1316},{  873, 1538},{ 1063, 1723},{ 1252, 1882},
-        { 1460, 2018},{ 1682, 2134},{ 1907, 2238},{ 2125, 2332},
-        { 2317, 2422},{ 2507, 2510},{ 2705, 2575},{ 2869, 2630},
-        { 3015, 2684},{ 3178, 2744},{ 3329, 2815},{ 3477, 2878},
-        { 3667, 2945},{ 3848, 3016},{ 3997, 3082},{ 4174, 3121}
-      },
-      /*Cr  qi=51  INTER*/
-      {
-        {   83,   -2},{  102,  351},{  102,  680},{  126, 1029},
-        {  172, 1359},{  238, 1665},{  321, 1962},{  422, 2246},
-        {  552, 2505},{  733, 2728},{  970, 2912},{ 1247, 3069},
-        { 1552, 3209},{ 1876, 3338},{ 2251, 3440},{ 2692, 3502},
-        { 3161, 3529},{ 3637, 3525},{ 4084, 3509},{ 4487, 3479},
-        { 4850, 3444},{ 5181, 3419},{ 5507, 3406},{ 5786, 3398}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=52  INTRA*/
-      {
-        {  117,   93},{  814, 1168},{ 1729, 1822},{ 2706, 2119},
-        { 3655, 2262},{ 4604, 2374},{ 5528, 2490},{ 6394, 2596},
-        { 7189, 2691},{ 7921, 2777},{ 8596, 2846},{ 9184, 2885},
-        { 9728, 2918},{10260, 2961},{10796, 3014},{11316, 3069},
-        {11793, 3115},{12267, 3150},{12692, 3172},{13037, 3185},
-        {13367, 3196},{13717, 3214},{14087, 3227},{14521, 3249}
-      },
-      /*Y'  qi=52  INTER*/
-      {
-        {   52,    0},{  169, 1308},{  668, 2382},{ 1735, 3112},
-        { 3384, 3451},{ 5077, 3519},{ 6461, 3506},{ 7587, 3496},
-        { 8545, 3494},{ 9384, 3494},{10142, 3498},{10838, 3501},
-        {11475, 3503},{12078, 3508},{12640, 3511},{13162, 3513},
-        {13654, 3517},{14130, 3521},{14576, 3522},{14980, 3523},
-        {15369, 3523},{15737, 3522},{16071, 3521},{16382, 3516}
-      }
-    },
-    {
-      /*Cb  qi=52  INTRA*/
-      {
-        {   14,    3},{  163,  369},{  299,  722},{  457, 1044},
-        {  645, 1319},{  843, 1545},{ 1050, 1728},{ 1261, 1879},
-        { 1468, 2013},{ 1678, 2132},{ 1883, 2240},{ 2093, 2338},
-        { 2301, 2428},{ 2488, 2523},{ 2667, 2619},{ 2843, 2718},
-        { 3010, 2805},{ 3163, 2887},{ 3323, 2963},{ 3490, 3028},
-        { 3665, 3087},{ 3841, 3145},{ 4011, 3197},{ 4289, 3230}
-      },
-      /*Cb  qi=52  INTER*/
-      {
-        {   98,   -7},{  109,  342},{  109,  668},{  126, 1018},
-        {  170, 1355},{  242, 1663},{  352, 1941},{  490, 2195},
-        {  642, 2439},{  823, 2666},{ 1052, 2868},{ 1333, 3039},
-        { 1670, 3178},{ 2074, 3280},{ 2524, 3348},{ 2996, 3390},
-        { 3469, 3410},{ 3923, 3420},{ 4355, 3434},{ 4771, 3451},
-        { 5166, 3468},{ 5532, 3483},{ 5885, 3499},{ 6263, 3501}
-      }
-    },
-    {
-      /*Cr  qi=52  INTRA*/
-      {
-        {   25,    7},{  170,  388},{  312,  741},{  500, 1051},
-        {  694, 1310},{  883, 1529},{ 1082, 1709},{ 1280, 1864},
-        { 1491, 1998},{ 1710, 2117},{ 1932, 2225},{ 2143, 2324},
-        { 2328, 2418},{ 2516, 2506},{ 2708, 2578},{ 2870, 2637},
-        { 3017, 2693},{ 3170, 2758},{ 3312, 2835},{ 3455, 2901},
-        { 3644, 2972},{ 3827, 3049},{ 3968, 3121},{ 4115, 3166}
-      },
-      /*Cr  qi=52  INTER*/
-      {
-        {   86,   -2},{  101,  352},{  100,  680},{  126, 1028},
-        {  175, 1356},{  247, 1657},{  341, 1948},{  458, 2224},
-        {  615, 2471},{  828, 2681},{ 1091, 2857},{ 1395, 3008},
-        { 1732, 3140},{ 2095, 3257},{ 2502, 3348},{ 2968, 3402},
-        { 3457, 3420},{ 3926, 3413},{ 4360, 3388},{ 4759, 3357},
-        { 5128, 3329},{ 5449, 3306},{ 5741, 3295},{ 6071, 3296}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=53  INTRA*/
-      {
-        {  138,   93},{  850, 1161},{ 1773, 1810},{ 2763, 2103},
-        { 3722, 2245},{ 4675, 2360},{ 5600, 2483},{ 6464, 2597},
-        { 7255, 2700},{ 7982, 2792},{ 8652, 2867},{ 9237, 2913},
-        { 9775, 2950},{10302, 2998},{10834, 3058},{11347, 3121},
-        {11826, 3169},{12299, 3207},{12713, 3235},{13054, 3250},
-        {13387, 3265},{13744, 3286},{14110, 3302},{14515, 3323}
-      },
-      /*Y'  qi=53  INTER*/
-      {
-        {   52,    2},{  169, 1308},{  680, 2377},{ 1763, 3103},
-        { 3410, 3450},{ 5094, 3531},{ 6469, 3526},{ 7590, 3525},
-        { 8547, 3530},{ 9385, 3534},{10139, 3540},{10835, 3548},
-        {11479, 3553},{12075, 3559},{12634, 3565},{13159, 3570},
-        {13650, 3573},{14124, 3576},{14575, 3580},{14993, 3583},
-        {15375, 3584},{15744, 3584},{16091, 3583},{16421, 3586}
-      }
-    },
-    {
-      /*Cb  qi=53  INTRA*/
-      {
-        {   14,    3},{  167,  367},{  317,  717},{  492, 1033},
-        {  687, 1306},{  887, 1531},{ 1095, 1715},{ 1309, 1866},
-        { 1517, 2000},{ 1729, 2119},{ 1932, 2227},{ 2146, 2325},
-        { 2358, 2414},{ 2544, 2511},{ 2724, 2611},{ 2902, 2711},
-        { 3070, 2800},{ 3227, 2878},{ 3381, 2954},{ 3548, 3021},
-        { 3724, 3077},{ 3888, 3140},{ 4065, 3196},{ 4359, 3225}
-      },
-      /*Cb  qi=53  INTER*/
-      {
-        {   93,   -8},{  110,  342},{  108,  668},{  125, 1018},
-        {  170, 1355},{  242, 1663},{  353, 1939},{  494, 2192},
-        {  651, 2433},{  838, 2658},{ 1076, 2856},{ 1368, 3022},
-        { 1716, 3158},{ 2123, 3260},{ 2575, 3330},{ 3042, 3373},
-        { 3507, 3396},{ 3962, 3413},{ 4394, 3430},{ 4797, 3452},
-        { 5169, 3476},{ 5547, 3496},{ 5914, 3510},{ 6235, 3525}
-      }
-    },
-    {
-      /*Cr  qi=53  INTRA*/
-      {
-        {   25,    7},{  175,  386},{  335,  734},{  541, 1037},
-        {  737, 1296},{  926, 1516},{ 1125, 1696},{ 1324, 1851},
-        { 1540, 1984},{ 1763, 2102},{ 1989, 2210},{ 2202, 2310},
-        { 2386, 2404},{ 2572, 2495},{ 2768, 2569},{ 2929, 2627},
-        { 3071, 2684},{ 3231, 2749},{ 3374, 2825},{ 3514, 2894},
-        { 3703, 2963},{ 3882, 3040},{ 4024, 3111},{ 4190, 3150}
-      },
-      /*Cr  qi=53  INTER*/
-      {
-        {   87,   -1},{   99,  352},{  100,  680},{  125, 1027},
-        {  175, 1355},{  249, 1657},{  343, 1946},{  462, 2220},
-        {  624, 2465},{  844, 2671},{ 1122, 2841},{ 1435, 2989},
-        { 1768, 3125},{ 2134, 3243},{ 2545, 3334},{ 3002, 3393},
-        { 3490, 3412},{ 3965, 3405},{ 4401, 3384},{ 4797, 3359},
-        { 5156, 3328},{ 5482, 3297},{ 5800, 3292},{ 6135, 3293}
+        {   39,  -33},{   48,  403},{   86,  744},{  110, 1101},
+        {  134, 1461},{  165, 1779},{  205, 2095},{  259, 2401},
+        {  318, 2686},{  386, 2958},{  481, 3204},{  610, 3415},
+        {  753, 3603},{  908, 3780},{ 1055, 3959},{ 1220, 4132},
+        { 1422, 4281},{ 1656, 4419},{ 1939, 4512},{ 2259, 4574},
+        { 2593, 4593},{ 2950, 4569},{ 3339, 4505},{ 3542, 4497}
       }
     }
   },
@@ -3406,557 +403,563 @@ oc_mode_rd OC_MODE_RD[64][3][2][OC_SAD_BINS]={
     {
       /*Y'  qi=54  INTRA*/
       {
-        {  184,   94},{  902, 1151},{ 1876, 1776},{ 2881, 2057},
-        { 3832, 2200},{ 4785, 2315},{ 5709, 2442},{ 6570, 2562},
-        { 7362, 2672},{ 8092, 2771},{ 8760, 2852},{ 9337, 2901},
-        { 9874, 2943},{10402, 2995},{10928, 3059},{11443, 3126},
-        {11926, 3178},{12396, 3220},{12805, 3251},{13139, 3266},
-        {13466, 3280},{13822, 3304},{14184, 3322},{14585, 3342}
+        {  339,   30},{  785, 1251},{ 2395, 1971},{ 4075, 2063},
+        { 4924, 2135},{ 5806, 2270},{ 6604, 2372},{ 7224, 2497},
+        { 7879, 2608},{ 8400, 2729},{ 8951, 2829},{ 9379, 2864},
+        { 9782, 2955},{10230, 3020},{10704, 3132},{11264, 3272},
+        {11618, 3284},{12034, 3394},{12500, 3482},{12767, 3484},
+        {13162, 3580},{13552, 3565},{13997, 3732},{14320, 3715}
       },
       /*Y'  qi=54  INTER*/
       {
-        {   60,    5},{  169, 1308},{  683, 2375},{ 1791, 3090},
-        { 3478, 3412},{ 5184, 3470},{ 6568, 3455},{ 7697, 3446},
-        { 8659, 3446},{ 9503, 3447},{10266, 3450},{10971, 3454},
-        {11619, 3458},{12223, 3462},{12789, 3467},{13315, 3471},
-        {13811, 3475},{14291, 3479},{14743, 3479},{15148, 3481},
-        {15535, 3483},{15913, 3481},{16252, 3479},{16569, 3472}
+        {   65,   95},{  269, 1312},{ 1152, 2242},{ 2336, 2863},
+        { 3728, 3239},{ 4944, 3439},{ 6034, 3543},{ 7064, 3580},
+        { 7991, 3586},{ 8849, 3568},{ 9605, 3561},{10306, 3550},
+        {10919, 3544},{11466, 3530},{11972, 3528},{12401, 3536},
+        {12818, 3511},{13185, 3522},{13523, 3505},{13827, 3505},
+        {14114, 3522},{14395, 3521},{14625, 3533},{14909, 3532}
       }
     },
     {
       /*Cb  qi=54  INTRA*/
       {
-        {   13,    2},{  165,  367},{  318,  715},{  498, 1030},
-        {  698, 1301},{  906, 1523},{ 1121, 1703},{ 1336, 1853},
-        { 1549, 1984},{ 1765, 2100},{ 1974, 2207},{ 2192, 2306},
-        { 2402, 2396},{ 2587, 2493},{ 2773, 2591},{ 2953, 2691},
-        { 3119, 2778},{ 3277, 2858},{ 3430, 2940},{ 3603, 3004},
-        { 3788, 3059},{ 3950, 3121},{ 4128, 3173},{ 4398, 3215}
+        {  148,   -3},{  218,  480},{  351,  787},{  437, 1069},
+        {  550, 1350},{  730, 1592},{  931, 1784},{ 1243, 1884},
+        { 1499, 1984},{ 1680, 2115},{ 1864, 2244},{ 2062, 2334},
+        { 2278, 2407},{ 2442, 2496},{ 2602, 2603},{ 2783, 2686},
+        { 2928, 2771},{ 3073, 2856},{ 3207, 2938},{ 3368, 2998},
+        { 3516, 3077},{ 3699, 3122},{ 3818, 3202},{ 3939, 3230}
       },
       /*Cb  qi=54  INTER*/
       {
-        {  100,   -3},{  109,  343},{  107,  668},{  125, 1018},
-        {  169, 1354},{  241, 1662},{  353, 1938},{  496, 2190},
-        {  655, 2431},{  843, 2655},{ 1082, 2851},{ 1381, 3015},
-        { 1739, 3146},{ 2154, 3243},{ 2610, 3310},{ 3094, 3344},
-        { 3581, 3358},{ 4034, 3371},{ 4457, 3384},{ 4867, 3399},
-        { 5255, 3413},{ 5630, 3425},{ 6003, 3440},{ 6346, 3440}
+        {   48,  -11},{   54,  407},{   86,  743},{  122, 1083},
+        {  176, 1400},{  241, 1699},{  347, 1968},{  496, 2208},
+        {  664, 2431},{  863, 2637},{ 1120, 2816},{ 1442, 2961},
+        { 1835, 3066},{ 2261, 3140},{ 2676, 3203},{ 3092, 3245},
+        { 3480, 3266},{ 3862, 3286},{ 4254, 3305},{ 4604, 3316},
+        { 4989, 3335},{ 5306, 3351},{ 5654, 3339},{ 5855, 3345}
       }
     },
     {
       /*Cr  qi=54  INTRA*/
       {
-        {   23,    7},{  174,  386},{  338,  732},{  549, 1034},
-        {  751, 1289},{  947, 1506},{ 1150, 1685},{ 1353, 1837},
-        { 1572, 1969},{ 1800, 2087},{ 2031, 2192},{ 2248, 2291},
-        { 2434, 2387},{ 2622, 2477},{ 2815, 2549},{ 2976, 2607},
-        { 3126, 2663},{ 3286, 2727},{ 3427, 2807},{ 3569, 2877},
-        { 3761, 2941},{ 3942, 3016},{ 4084, 3093},{ 4226, 3131}
+        {  137,   10},{  212,  492},{  315,  795},{  470, 1061},
+        {  612, 1333},{  821, 1539},{ 1105, 1680},{ 1335, 1811},
+        { 1566, 1927},{ 1773, 2038},{ 1973, 2153},{ 2148, 2259},
+        { 2311, 2352},{ 2474, 2460},{ 2647, 2516},{ 2810, 2607},
+        { 2928, 2638},{ 3085, 2742},{ 3232, 2815},{ 3348, 2899},
+        { 3533, 2993},{ 3679, 3029},{ 3803, 3138},{ 3925, 3170}
       },
       /*Cr  qi=54  INTER*/
       {
-        {   88,   -2},{   99,  351},{  100,  680},{  125, 1027},
-        {  175, 1354},{  248, 1656},{  343, 1945},{  463, 2219},
-        {  626, 2463},{  850, 2668},{ 1128, 2837},{ 1445, 2983},
-        { 1791, 3111},{ 2168, 3224},{ 2597, 3309},{ 3075, 3351},
-        { 3560, 3364},{ 4029, 3356},{ 4464, 3335},{ 4858, 3307},
-        { 5218, 3275},{ 5547, 3256},{ 5850, 3247},{ 6171, 3214}
+        {   46,    2},{   47,  419},{   87,  746},{  125, 1083},
+        {  177, 1401},{  249, 1687},{  342, 1964},{  453, 2226},
+        {  627, 2454},{  869, 2641},{ 1152, 2800},{ 1455, 2942},
+        { 1776, 3077},{ 2135, 3187},{ 2524, 3287},{ 2984, 3325},
+        { 3425, 3344},{ 3881, 3328},{ 4313, 3274},{ 4701, 3218},
+        { 5027, 3171},{ 5299, 3130},{ 5597, 3107},{ 5791, 3120}
       }
     }
   },
   {
     {
-      /*Y'  qi=55  INTRA*/
-      {
-        {  178,   95},{  968, 1137},{ 2000, 1747},{ 3013, 2027},
-        { 3966, 2173},{ 4920, 2294},{ 5842, 2427},{ 6702, 2553},
-        { 7489, 2668},{ 8213, 2773},{ 8875, 2858},{ 9452, 2913},
-        { 9986, 2959},{10504, 3016},{11023, 3085},{11530, 3157},
-        {12011, 3213},{12480, 3257},{12882, 3291},{13214, 3310},
-        {13542, 3325},{13890, 3350},{14248, 3371},{14671, 3398}
+      /*Y'  qi=63  INTRA*/
+      {
+        {  -86,  167},{ 2070, 1104},{ 5138, 1428},{ 7014, 1535},
+        { 8430, 1629},{ 9663, 1690},{10576, 1745},{11277, 1809},
+        {12003, 1869},{12663, 1925},{13258, 1983},{13701, 2016},
+        {14228, 2073},{14756, 2088},{15203, 2164},{15993, 2175},
+        {16378, 2256},{16917, 2240},{17361, 2332},{17782, 2312},
+        {18376, 2381},{18728, 2362},{19224, 2408},{19705, 2392}
       },
-      /*Y'  qi=55  INTER*/
-      {
-        {   59,    5},{  170, 1307},{  725, 2358},{ 1886, 3058},
-        { 3589, 3385},{ 5284, 3459},{ 6654, 3458},{ 7771, 3461},
-        { 8727, 3470},{ 9564, 3478},{10322, 3488},{11019, 3497},
-        {11658, 3505},{12258, 3513},{12819, 3520},{13344, 3527},
-        {13840, 3533},{14314, 3537},{14755, 3541},{15161, 3544},
-        {15552, 3548},{15916, 3548},{16257, 3548},{16576, 3540}
+      /*Y'  qi=63  INTER*/
+      {
+        { -529,  154},{  967, 1233},{ 4201, 1610},{ 6285, 1800},
+        { 8058, 1908},{ 9439, 1968},{10737, 1987},{11999, 1979},
+        {13003, 1972},{13854, 1963},{14584, 1965},{15217, 1955},
+        {15773, 1956},{16229, 1949},{16735, 1952},{17085, 1956},
+        {17508, 1956},{17821, 1961},{18191, 1961},{18465, 1982},
+        {18792, 1975},{19158, 1995},{19378, 2010},{19817, 2021}
       }
     },
     {
-      /*Cb  qi=55  INTRA*/
-      {
-        {   13,    2},{  167,  366},{  322,  714},{  508, 1026},
-        {  716, 1292},{  930, 1511},{ 1148, 1690},{ 1366, 1839},
-        { 1578, 1972},{ 1793, 2090},{ 2001, 2199},{ 2217, 2300},
-        { 2427, 2393},{ 2609, 2495},{ 2784, 2600},{ 2961, 2704},
-        { 3121, 2797},{ 3268, 2884},{ 3423, 2965},{ 3590, 3032},
-        { 3764, 3096},{ 3926, 3165},{ 4101, 3223},{ 4405, 3258}
+      /*Cb  qi=63  INTRA*/
+      {
+        {  136,    4},{  338,  438},{  593,  730},{  835,  974},
+        { 1168, 1188},{ 1602, 1345},{ 2004, 1467},{ 2465, 1505},
+        { 2799, 1574},{ 3091, 1669},{ 3384, 1758},{ 3673, 1817},
+        { 3950, 1861},{ 4190, 1924},{ 4444, 1993},{ 4701, 2051},
+        { 4915, 2123},{ 5119, 2166},{ 5329, 2231},{ 5576, 2259},
+        { 5793, 2310},{ 6001, 2334},{ 6198, 2384},{ 6344, 2401}
       },
-      /*Cb  qi=55  INTER*/
-      {
-        {   90,   -4},{  109,  344},{  107,  668},{  126, 1017},
-        {  172, 1351},{  249, 1657},{  370, 1928},{  527, 2174},
-        {  702, 2407},{  909, 2624},{ 1170, 2814},{ 1493, 2970},
-        { 1869, 3097},{ 2292, 3192},{ 2752, 3258},{ 3232, 3295},
-        { 3709, 3314},{ 4156, 3335},{ 4592, 3355},{ 5004, 3373},
-        { 5377, 3389},{ 5737, 3411},{ 6092, 3432},{ 6473, 3423}
+      /*Cb  qi=63  INTER*/
+      {
+        {   49,    4},{   51,  403},{   98,  729},{  185, 1034},
+        {  352, 1304},{  622, 1533},{ 1068, 1696},{ 1604, 1821},
+        { 2203, 1924},{ 2890, 1988},{ 3622, 2017},{ 4359, 2019},
+        { 5025, 2005},{ 5586, 2002},{ 6090, 1989},{ 6519, 1977},
+        { 6927, 1977},{ 7305, 1968},{ 7730, 1984},{ 8087, 1981},
+        { 8435, 1991},{ 8822, 1987},{ 9155, 2008},{ 9392, 2011}
       }
     },
     {
-      /*Cr  qi=55  INTRA*/
-      {
-        {   23,    7},{  175,  385},{  342,  730},{  561, 1028},
-        {  771, 1279},{  973, 1493},{ 1181, 1669},{ 1384, 1822},
-        { 1602, 1956},{ 1830, 2076},{ 2057, 2184},{ 2270, 2288},
-        { 2452, 2389},{ 2637, 2484},{ 2823, 2559},{ 2983, 2621},
-        { 3129, 2682},{ 3280, 2753},{ 3417, 2833},{ 3554, 2904},
-        { 3743, 2977},{ 3921, 3060},{ 4055, 3137},{ 4185, 3186}
+      /*Cr  qi=63  INTRA*/
+      {
+        {  131,   11},{  334,  448},{  569,  739},{  929,  946},
+        { 1285, 1145},{ 1718, 1274},{ 2176, 1343},{ 2531, 1424},
+        { 2866, 1504},{ 3176, 1580},{ 3475, 1657},{ 3736, 1728},
+        { 3962, 1807},{ 4232, 1872},{ 4425, 1921},{ 4657, 1976},
+        { 4817, 2009},{ 5063, 2082},{ 5281, 2129},{ 5480, 2199},
+        { 5743, 2258},{ 5887, 2283},{ 6124, 2358},{ 6273, 2378}
       },
-      /*Cr  qi=55  INTER*/
-      {
-        {   85,    0},{   99,  352},{  100,  679},{  126, 1025},
-        {  178, 1351},{  256, 1650},{  359, 1935},{  493, 2202},
-        {  675, 2439},{  921, 2636},{ 1220, 2799},{ 1552, 2941},
-        { 1910, 3068},{ 2303, 3177},{ 2735, 3262},{ 3206, 3311},
-        { 3689, 3333},{ 4152, 3327},{ 4588, 3299},{ 4978, 3272},
-        { 5325, 3243},{ 5651, 3221},{ 5969, 3210},{ 6218, 3185}
+      /*Cr  qi=63  INTER*/
+      {
+        {   47,   15},{   40,  405},{  100,  730},{  189, 1037},
+        {  351, 1303},{  625, 1526},{  984, 1719},{ 1512, 1862},
+        { 2189, 1947},{ 2895, 2003},{ 3576, 2046},{ 4249, 2072},
+        { 4901, 2068},{ 5514, 2043},{ 6079, 2009},{ 6528, 1977},
+        { 6927, 1940},{ 7274, 1915},{ 7580, 1894},{ 7910, 1910},
+        { 8211, 1902},{ 8472, 1920},{ 8742, 1926},{ 8981, 1930}
       }
     }
-  },
+  }
+};
+
+# if !defined(OC_COLLECT_METRICS)
+static const
+# endif
+oc_mode_rd OC_MODE_RD_SAD[OC_LOGQ_BINS][3][2][OC_COMP_BINS]={
   {
     {
-      /*Y'  qi=56  INTRA*/
-      {
-        {  137,  104},{ 1048, 1128},{ 2147, 1760},{ 3261, 2029},
-        { 4319, 2131},{ 5310, 2234},{ 6245, 2351},{ 7101, 2464},
-        { 7886, 2572},{ 8610, 2675},{ 9270, 2762},{ 9840, 2818},
-        {10365, 2869},{10875, 2928},{11393, 2997},{11900, 3071},
-        {12371, 3128},{12834, 3172},{13233, 3208},{13562, 3228},
-        {13878, 3245},{14221, 3271},{14584, 3292},{15008, 3320}
+      /*Y'  qi=0  INTRA*/
+      {
+        {   33,  122},{   57, 1297},{   13, 2226},{  157, 3890},
+        {  227, 3682},{  169, 3084},{  197, 2700},{  227, 3238},
+        {  290, 4294},{  354, 5230},{  406, 5615},{  417, 5322},
+        {  452, 5462},{  455, 5683},{  493, 5938},{  553, 6374},
+        {  558, 6464},{  606, 6493},{  616, 6417},{  643, 6557},
+        {  641, 6664},{  716, 7285},{  748, 7518},{  747, 7502}
       },
-      /*Y'  qi=56  INTER*/
-      {
-        {   19,   21},{  207, 1292},{ 1031, 2252},{ 2553, 2846},
-        { 4463, 3085},{ 6137, 3131},{ 7441, 3151},{ 8526, 3172},
-        { 9468, 3193},{10301, 3209},{11059, 3224},{11760, 3237},
-        {12405, 3249},{13008, 3261},{13570, 3270},{14100, 3278},
-        {14597, 3284},{15074, 3289},{15524, 3297},{15929, 3302},
-        {16314, 3306},{16675, 3307},{17004, 3305},{17288, 3301}
+      /*Y'  qi=0  INTER*/
+      {
+        {   16,  205},{    5, 1338},{   16, 2554},{    6, 3809},
+        {    9, 5188},{   58, 6446},{   76, 7561},{   95, 8648},
+        {  124, 9713},{  158,10787},{  193,11887},{  233,12991},
+        {  270,14116},{  307,15236},{  341,16346},{  372,17426},
+        {  398,18499},{  422,19594},{  448,20669},{  479,21732},
+        {  526,22720},{  583,23572},{  655,24516},{  758,24647}
       }
     },
     {
-      /*Cb  qi=56  INTRA*/
-      {
-        {   16,    3},{  188,  367},{  353,  712},{  546, 1017},
-        {  765, 1275},{  989, 1484},{ 1221, 1653},{ 1459, 1791},
-        { 1681, 1920},{ 1893, 2046},{ 2102, 2160},{ 2323, 2257},
-        { 2534, 2347},{ 2720, 2447},{ 2902, 2549},{ 3075, 2654},
-        { 3239, 2749},{ 3392, 2835},{ 3544, 2920},{ 3712, 2988},
-        { 3882, 3052},{ 4052, 3123},{ 4227, 3181},{ 4483, 3213}
+      /*Cb  qi=0  INTRA*/
+      {
+        {   26,   40},{   23,  589},{   27,  784},{   27, 1079},
+        {   24, 1186},{   25, 1641},{   25, 1915},{   29, 2207},
+        {   39, 2361},{   39, 2746},{   32, 3020},{   16, 3387},
+        {   31, 3604},{   36, 4076},{   69, 4426},{  102, 4724},
+        {  139, 4923},{  196, 5061},{  211, 5103},{  214, 5063},
+        {  161, 4466},{  208, 4793},{  218, 4537},{  219, 4539}
       },
-      /*Cb  qi=56  INTER*/
-      {
-        {   92,   -1},{  111,  343},{  114,  665},{  148, 1003},
-        {  224, 1321},{  345, 1609},{  526, 1858},{  754, 2077},
-        { 1009, 2281},{ 1319, 2464},{ 1702, 2614},{ 2145, 2732},
-        { 2625, 2824},{ 3123, 2890},{ 3634, 2933},{ 4137, 2954},
-        { 4614, 2965},{ 5052, 2988},{ 5468, 3015},{ 5852, 3035},
-        { 6213, 3060},{ 6557, 3081},{ 6906, 3094},{ 7243, 3112}
+      /*Cb  qi=0  INTER*/
+      {
+        {    3,  164},{    1,  535},{    1,  779},{    2, 1048},
+        {    3, 1267},{    1, 1625},{    2, 1921},{    5, 2224},
+        {    8, 2481},{    8, 2813},{    4, 3089},{   -2, 3386},
+        {   -9, 3642},{  -14, 3993},{  -11, 4300},{   -6, 4628},
+        {    4, 4929},{   25, 5299},{   44, 5623},{   83, 5915},
+        {   93, 6186},{   91, 6483},{   90, 6775},{   95, 6952}
       }
     },
     {
-      /*Cr  qi=56  INTRA*/
-      {
-        {   28,    8},{  195,  385},{  373,  727},{  598, 1019},
-        {  816, 1263},{ 1033, 1465},{ 1260, 1630},{ 1482, 1773},
-        { 1717, 1900},{ 1949, 2018},{ 2178, 2128},{ 2393, 2233},
-        { 2570, 2338},{ 2749, 2435},{ 2937, 2514},{ 3097, 2577},
-        { 3240, 2638},{ 3398, 2709},{ 3540, 2791},{ 3673, 2865},
-        { 3869, 2938},{ 4049, 3019},{ 4179, 3095},{ 4330, 3137}
+      /*Cr  qi=0  INTRA*/
+      {
+        {   22,   49},{   26,  579},{   23,  762},{   15, 1050},
+        {   20, 1191},{   24, 1608},{   26, 1875},{   35, 2173},
+        {   39, 2359},{   30, 2736},{   16, 2987},{    0, 3334},
+        {   14, 3625},{   11, 4095},{   57, 4512},{   95, 4793},
+        {  141, 4949},{  206, 5242},{  230, 5191},{  242, 5177},
+        {  178, 4775},{  237, 5010},{  223, 4656},{  224, 4657}
       },
-      /*Cr  qi=56  INTER*/
-      {
-        {   83,    0},{   99,  353},{  103,  676},{  146, 1010},
-        {  232, 1320},{  355, 1601},{  512, 1866},{  713, 2109},
-        {  988, 2312},{ 1344, 2471},{ 1750, 2602},{ 2180, 2719},
-        { 2642, 2819},{ 3141, 2892},{ 3653, 2939},{ 4159, 2961},
-        { 4636, 2961},{ 5072, 2945},{ 5464, 2917},{ 5813, 2895},
-        { 6134, 2890},{ 6458, 2883},{ 6735, 2881},{ 6953, 2902}
+      /*Cr  qi=0  INTER*/
+      {
+        {    3,  163},{    1,  536},{    1,  773},{    3, 1023},
+        {    2, 1225},{    1, 1607},{    1, 1900},{    5, 2204},
+        {    9, 2453},{    8, 2781},{    3, 3049},{   -5, 3338},
+        {  -13, 3570},{  -17, 3950},{  -13, 4255},{   -6, 4596},
+        {    7, 4893},{   33, 5300},{   53, 5632},{   97, 5942},
+        {  103, 6216},{   96, 6522},{   91, 6849},{   98, 6995}
       }
     }
   },
   {
     {
-      /*Y'  qi=57  INTRA*/
-      {
-        {  170,  106},{ 1106, 1120},{ 2246, 1740},{ 3399, 1993},
-        { 4482, 2077},{ 5492, 2167},{ 6446, 2273},{ 7324, 2379},
-        { 8130, 2482},{ 8866, 2578},{ 9537, 2661},{10119, 2715},
-        {10646, 2762},{11161, 2820},{11694, 2886},{12214, 2957},
-        {12693, 3013},{13166, 3053},{13569, 3087},{13897, 3106},
-        {14224, 3122},{14568, 3148},{14931, 3167},{15390, 3192}
+      /*Y'  qi=9  INTRA*/
+      {
+        {   47,  152},{   50, 1213},{  144, 2543},{  242, 2332},
+        {  210, 1894},{  250, 2386},{  328, 3094},{  407, 3419},
+        {  464, 3507},{  522, 3770},{  613, 4194},{  657, 4618},
+        {  753, 5137},{  796, 5248},{  842, 5110},{  927, 5330},
+        {  994, 5487},{ 1008, 5463},{ 1101, 5794},{ 1169, 5966},
+        { 1208, 6121},{ 1331, 6447},{ 1445, 6618},{ 1449, 6616}
       },
-      /*Y'  qi=57  INTER*/
-      {
-        {   19,   20},{  205, 1292},{ 1096, 2229},{ 2775, 2766},
-        { 4811, 2943},{ 6512, 2964},{ 7832, 2976},{ 8940, 2990},
-        { 9903, 3004},{10755, 3017},{11532, 3029},{12243, 3039},
-        {12891, 3047},{13502, 3058},{14073, 3065},{14603, 3071},
-        {15097, 3078},{15581, 3083},{16036, 3086},{16452, 3090},
-        {16855, 3093},{17222, 3094},{17552, 3092},{17851, 3098}
+      /*Y'  qi=9  INTER*/
+      {
+        {    4,  218},{   16, 1314},{    4, 2563},{   37, 3882},
+        {   83, 5058},{  109, 6184},{  161, 7292},{  224, 8389},
+        {  287, 9485},{  349,10565},{  411,11608},{  464,12648},
+        {  518,13664},{  575,14650},{  649,15585},{  742,16451},
+        {  862,17214},{ 1003,17860},{ 1179,18325},{ 1372,18648},
+        { 1576,18878},{ 1795,18903},{ 2040,18880},{ 2116,18759}
       }
     },
     {
-      /*Cb  qi=57  INTRA*/
-      {
-        {   16,    3},{  197,  365},{  384,  704},{  603, 1001},
-        {  837, 1252},{ 1077, 1455},{ 1326, 1618},{ 1581, 1748},
-        { 1819, 1871},{ 2042, 1993},{ 2264, 2104},{ 2500, 2196},
-        { 2722, 2280},{ 2916, 2375},{ 3103, 2473},{ 3290, 2575},
-        { 3456, 2667},{ 3612, 2748},{ 3775, 2829},{ 3958, 2896},
-        { 4145, 2947},{ 4307, 3012},{ 4476, 3070},{ 4733, 3110}
+      /*Cb  qi=9  INTRA*/
+      {
+        {   27,   42},{   23,  587},{   34,  782},{   37, 1079},
+        {   34, 1204},{   42, 1630},{   37, 1887},{   25, 2210},
+        {   40, 2455},{   71, 2880},{  112, 3193},{  156, 3427},
+        {  168, 3403},{  217, 3488},{  203, 3335},{  224, 3200},
+        {  191, 2742},{  195, 2810},{  207, 2665},{  201, 2661},
+        {  169, 2078},{  211, 2720},{  226, 2813},{  228, 2824}
       },
-      /*Cb  qi=57  INTER*/
-      {
-        {   94,   -1},{  111,  344},{  112,  665},{  147, 1002},
-        {  227, 1319},{  353, 1604},{  543, 1849},{  785, 2062},
-        { 1066, 2257},{ 1408, 2430},{ 1827, 2568},{ 2320, 2670},
-        { 2848, 2743},{ 3386, 2791},{ 3934, 2812},{ 4453, 2820},
-        { 4929, 2830},{ 5368, 2842},{ 5787, 2856},{ 6190, 2875},
-        { 6554, 2896},{ 6895, 2913},{ 7229, 2927},{ 7572, 2932}
+      /*Cb  qi=9  INTER*/
+      {
+        {    4,  158},{    2,  537},{    3,  779},{    2, 1045},
+        {    3, 1284},{    7, 1629},{    7, 1917},{    1, 2218},
+        {   -4, 2497},{   -3, 2845},{    6, 3162},{   23, 3482},
+        {   42, 3788},{   62, 4116},{   76, 4416},{   84, 4700},
+        {   91, 4975},{   95, 5259},{   97, 5518},{   94, 5790},
+        {   99, 6052},{  111, 6311},{  126, 6601},{  136, 6719}
       }
     },
     {
-      /*Cr  qi=57  INTRA*/
-      {
-        {   28,    8},{  207,  383},{  413,  716},{  661,  999},
-        {  889, 1237},{ 1123, 1433},{ 1365, 1592},{ 1603, 1731},
-        { 1853, 1852},{ 2103, 1965},{ 2345, 2072},{ 2571, 2173},
-        { 2763, 2271},{ 2949, 2364},{ 3146, 2438},{ 3315, 2497},
-        { 3459, 2552},{ 3618, 2616},{ 3767, 2697},{ 3906, 2773},
-        { 4099, 2841},{ 4281, 2916},{ 4429, 2987},{ 4569, 3030}
+      /*Cr  qi=9  INTRA*/
+      {
+        {   25,   50},{   32,  576},{   32,  762},{   21, 1049},
+        {   28, 1207},{   41, 1603},{   36, 1839},{   26, 2170},
+        {   34, 2462},{   59, 2872},{  109, 3176},{  157, 3364},
+        {  188, 3397},{  231, 3418},{  250, 3341},{  261, 3228},
+        {  222, 2814},{  258, 3091},{  234, 2915},{  228, 3042},
+        {  210, 2610},{  273, 3210},{  274, 3231},{  276, 3239}
       },
-      /*Cr  qi=57  INTER*/
-      {
-        {   85,    0},{   99,  352},{  102,  675},{  147, 1008},
-        {  235, 1317},{  363, 1597},{  529, 1858},{  748, 2094},
-        { 1050, 2287},{ 1439, 2436},{ 1877, 2557},{ 2352, 2660},
-        { 2869, 2740},{ 3413, 2791},{ 3962, 2815},{ 4485, 2819},
-        { 4955, 2816},{ 5382, 2800},{ 5769, 2772},{ 6107, 2748},
-        { 6443, 2740},{ 6754, 2739},{ 7029, 2737},{ 7284, 2745}
+      /*Cr  qi=9  INTER*/
+      {
+        {    4,  156},{    2,  538},{    3,  772},{    2, 1028},
+        {    3, 1254},{    7, 1613},{    7, 1893},{    0, 2191},
+        {   -8, 2454},{   -4, 2811},{    7, 3121},{   27, 3442},
+        {   48, 3749},{   72, 4101},{   88, 4410},{   91, 4698},
+        {   99, 4988},{   99, 5279},{  101, 5542},{   95, 5813},
+        {   99, 6088},{  114, 6367},{  125, 6683},{  137, 6761}
       }
     }
   },
   {
     {
-      /*Y'  qi=58  INTRA*/
-      {
-        {  164,  109},{ 1198, 1111},{ 2396, 1737},{ 3606, 1978},
-        { 4727, 2048},{ 5749, 2138},{ 6708, 2243},{ 7584, 2347},
-        { 8388, 2449},{ 9122, 2549},{ 9784, 2635},{10354, 2691},
-        {10876, 2740},{11385, 2800},{11912, 2869},{12429, 2941},
-        {12902, 2997},{13375, 3040},{13779, 3075},{14103, 3096},
-        {14435, 3112},{14783, 3140},{15141, 3160},{15599, 3186}
+      /*Y'  qi=18  INTRA*/
+      {
+        {   51,   88},{   88, 1344},{  258, 1643},{  228, 1325},
+        {  372, 2208},{  443, 2371},{  520, 2382},{  584, 2477},
+        {  739, 2906},{  859, 3348},{ 1008, 3697},{ 1131, 3884},
+        { 1278, 4110},{ 1349, 4229},{ 1431, 4329},{ 1544, 4395},
+        { 1602, 4439},{ 1669, 4535},{ 1814, 4656},{ 1883, 4716},
+        { 1957, 4940},{ 2101, 5019},{ 2259, 5249},{ 2265, 5246}
       },
-      /*Y'  qi=58  INTER*/
-      {
-        {   14,   23},{  210, 1290},{ 1277, 2178},{ 3118, 2677},
-        { 5207, 2834},{ 6902, 2857},{ 8218, 2878},{ 9323, 2900},
-        {10285, 2919},{11132, 2934},{11899, 2949},{12599, 2961},
-        {13235, 2971},{13835, 2982},{14394, 2991},{14917, 2997},
-        {15412, 3005},{15882, 3009},{16325, 3013},{16735, 3016},
-        {17131, 3018},{17501, 3021},{17824, 3021},{18125, 3016}
+      /*Y'  qi=18  INTER*/
+      {
+        {   26,  195},{    1, 1317},{   45, 2595},{  103, 3750},
+        {  168, 4903},{  281, 6007},{  397, 7062},{  513, 8064},
+        {  630, 9010},{  758, 9902},{  906,10732},{ 1095,11463},
+        { 1338,12060},{ 1629,12490},{ 1969,12724},{ 2313,12842},
+        { 2666,12828},{ 2993,12747},{ 3294,12670},{ 3558,12553},
+        { 3813,12440},{ 3990,12379},{ 4177,12291},{ 4226,12265}
       }
     },
     {
-      /*Cb  qi=58  INTRA*/
-      {
-        {   17,    3},{  200,  365},{  389,  703},{  613,  996},
-        {  853, 1243},{ 1095, 1445},{ 1349, 1604},{ 1613, 1731},
-        { 1853, 1853},{ 2074, 1978},{ 2292, 2091},{ 2526, 2184},
-        { 2750, 2266},{ 2945, 2360},{ 3134, 2458},{ 3320, 2561},
-        { 3482, 2654},{ 3641, 2737},{ 3804, 2818},{ 3985, 2881},
-        { 4168, 2935},{ 4331, 3003},{ 4499, 3060},{ 4751, 3100}
+      /*Cb  qi=18  INTRA*/
+      {
+        {   31,   43},{   33,  585},{   40,  781},{   58, 1077},
+        {   45, 1189},{   58, 1655},{   66, 1983},{  123, 2221},
+        {  168, 2193},{  227, 2321},{  241, 2246},{  250, 2208},
+        {  221, 1786},{  250, 2087},{  247, 2036},{  250, 2164},
+        {  241, 2054},{  287, 2453},{  302, 2551},{  335, 2758},
+        {  279, 2511},{  379, 2973},{  404, 3028},{  406, 3029}
       },
-      /*Cb  qi=58  INTER*/
-      {
-        {   94,   -1},{  112,  345},{  112,  665},{  152,  998},
-        {  247, 1307},{  406, 1580},{  644, 1810},{  938, 2007},
-        { 1271, 2189},{ 1668, 2348},{ 2151, 2470},{ 2691, 2558},
-        { 3249, 2619},{ 3798, 2659},{ 4334, 2682},{ 4849, 2692},
-        { 5314, 2700},{ 5747, 2721},{ 6167, 2742},{ 6547, 2765},
-        { 6902, 2790},{ 7251, 2804},{ 7583, 2819},{ 7924, 2833}
+      /*Cb  qi=18  INTER*/
+      {
+        {    7,  153},{    4,  537},{    3,  777},{    9, 1034},
+        {    6, 1282},{    0, 1630},{    0, 1943},{   21, 2252},
+        {   48, 2567},{   67, 2881},{   83, 3178},{   89, 3463},
+        {   92, 3738},{   99, 4024},{  114, 4289},{  131, 4552},
+        {  153, 4814},{  179, 5081},{  207, 5333},{  241, 5581},
+        {  273, 5822},{  303, 6068},{  335, 6368},{  353, 6432}
       }
     },
     {
-      /*Cr  qi=58  INTRA*/
-      {
-        {   29,    8},{  210,  382},{  419,  714},{  671,  993},
-        {  903, 1229},{ 1141, 1422},{ 1390, 1578},{ 1635, 1713},
-        { 1889, 1833},{ 2140, 1946},{ 2379, 2055},{ 2604, 2157},
-        { 2794, 2256},{ 2977, 2349},{ 3174, 2422},{ 3339, 2482},
-        { 3483, 2537},{ 3643, 2604},{ 3790, 2684},{ 3927, 2757},
-        { 4112, 2826},{ 4294, 2900},{ 4451, 2975},{ 4600, 3011}
+      /*Cr  qi=18  INTRA*/
+      {
+        {   31,   49},{   42,  575},{   42,  763},{   38, 1045},
+        {   41, 1184},{   56, 1631},{   87, 1968},{  163, 2177},
+        {  191, 2188},{  236, 2264},{  240, 2101},{  234, 2047},
+        {  206, 1651},{  222, 1966},{  238, 2013},{  240, 2176},
+        {  229, 2098},{  321, 2592},{  341, 2748},{  378, 3025},
+        {  367, 2849},{  442, 3283},{  453, 3315},{  455, 3313}
       },
-      /*Cr  qi=58  INTER*/
-      {
-        {   86,    0},{   99,  352},{  103,  675},{  151, 1004},
-        {  256, 1306},{  417, 1573},{  628, 1819},{  901, 2040},
-        { 1262, 2217},{ 1705, 2353},{ 2191, 2466},{ 2713, 2556},
-        { 3268, 2622},{ 3831, 2664},{ 4374, 2682},{ 4881, 2686},
-        { 5339, 2685},{ 5747, 2668},{ 6123, 2646},{ 6465, 2630},
-        { 6783, 2618},{ 7082, 2623},{ 7366, 2632},{ 7673, 2654}
+      /*Cr  qi=18  INTER*/
+      {
+        {    6,  151},{    3,  539},{    3,  775},{    8, 1027},
+        {    6, 1260},{   -3, 1619},{    0, 1927},{   24, 2238},
+        {   58, 2558},{   76, 2871},{   92, 3173},{   96, 3461},
+        {   98, 3742},{  104, 4032},{  116, 4306},{  136, 4578},
+        {  158, 4839},{  185, 5123},{  217, 5383},{  250, 5642},
+        {  279, 5910},{  306, 6169},{  333, 6502},{  350, 6522}
       }
     }
   },
   {
     {
-      /*Y'  qi=59  INTRA*/
-      {
-        {  142,  112},{ 1259, 1100},{ 2552, 1711},{ 3815, 1933},
-        { 4955, 1987},{ 5983, 2068},{ 6949, 2165},{ 7832, 2263},
-        { 8645, 2359},{ 9392, 2454},{10066, 2536},{10643, 2589},
-        {11174, 2636},{11696, 2693},{12230, 2758},{12752, 2826},
-        {13239, 2883},{13721, 2926},{14139, 2959},{14479, 2978},
-        {14811, 2993},{15166, 3020},{15532, 3039},{16000, 3062}
+      /*Y'  qi=27  INTRA*/
+      {
+        {   10,   85},{  280, 1349},{  278,  815},{  497, 1699},
+        {  600, 1569},{  744, 1944},{  894, 2114},{ 1040, 2292},
+        { 1216, 2484},{ 1485, 2816},{ 1778, 3065},{ 1990, 3243},
+        { 2199, 3381},{ 2326, 3515},{ 2370, 3422},{ 2512, 3581},
+        { 2548, 3526},{ 2656, 3615},{ 2803, 3679},{ 2946, 3766},
+        { 3023, 3824},{ 3179, 3908},{ 3374, 4035},{ 3377, 4030}
       },
-      /*Y'  qi=59  INTER*/
-      {
-        {    8,   25},{  211, 1289},{ 1394, 2144},{ 3421, 2580},
-        { 5611, 2689},{ 7316, 2701},{ 8643, 2717},{ 9762, 2734},
-        {10735, 2750},{11587, 2763},{12353, 2775},{13056, 2785},
-        {13693, 2793},{14288, 2805},{14843, 2814},{15361, 2821},
-        {15857, 2827},{16328, 2831},{16763, 2834},{17171, 2838},
-        {17568, 2840},{17941, 2842},{18285, 2843},{18586, 2839}
+      /*Y'  qi=27  INTER*/
+      {
+        {   -2,  172},{   31, 1347},{  117, 2488},{  245, 3651},
+        {  448, 4719},{  668, 5679},{  918, 6524},{ 1204, 7255},
+        { 1557, 7848},{ 1998, 8281},{ 2511, 8531},{ 3055, 8642},
+        { 3582, 8648},{ 4062, 8611},{ 4482, 8582},{ 4845, 8560},
+        { 5140, 8560},{ 5423, 8581},{ 5645, 8596},{ 5855, 8586},
+        { 6061, 8608},{ 6211, 8558},{ 6402, 8583},{ 6472, 8575}
       }
     },
     {
-      /*Cb  qi=59  INTRA*/
-      {
-        {   17,    3},{  224,  363},{  441,  696},{  689,  982},
-        {  945, 1222},{ 1204, 1416},{ 1474, 1571},{ 1751, 1695},
-        { 2001, 1816},{ 2228, 1941},{ 2453, 2055},{ 2693, 2147},
-        { 2924, 2227},{ 3125, 2321},{ 3321, 2416},{ 3510, 2520},
-        { 3676, 2616},{ 3839, 2699},{ 4008, 2778},{ 4193, 2842},
-        { 4371, 2898},{ 4535, 2965},{ 4710, 3023},{ 4921, 3068}
+      /*Cb  qi=27  INTRA*/
+      {
+        {   47,   49},{   35,  580},{   64,  778},{   69, 1071},
+        {   98, 1289},{  186, 1556},{  177, 1654},{  197, 1736},
+        {  211, 1373},{  284, 1742},{  321, 1840},{  344, 2024},
+        {  321, 1969},{  386, 2254},{  397, 2281},{  425, 2320},
+        {  396, 2088},{  448, 2284},{  462, 2213},{  482, 2274},
+        {  410, 1894},{  513, 2310},{  546, 2332},{  549, 2334}
       },
-      /*Cb  qi=59  INTER*/
-      {
-        {   95,   -5},{  111,  343},{  112,  664},{  157,  995},
-        {  258, 1302},{  429, 1569},{  691, 1790},{ 1017, 1977},
-        { 1387, 2148},{ 1832, 2294},{ 2368, 2401},{ 2961, 2472},
-        { 3553, 2518},{ 4133, 2545},{ 4688, 2557},{ 5198, 2563},
-        { 5663, 2574},{ 6100, 2590},{ 6511, 2608},{ 6898, 2621},
-        { 7274, 2634},{ 7631, 2655},{ 7984, 2669},{ 8361, 2669}
+      /*Cb  qi=27  INTER*/
+      {
+        {   11,  145},{    5,  539},{   11,  771},{    0, 1033},
+        {    9, 1334},{   44, 1644},{   70, 1934},{   87, 2227},
+        {   96, 2508},{  113, 2812},{  139, 3085},{  174, 3352},
+        {  216, 3614},{  261, 3873},{  305, 4123},{  349, 4372},
+        {  396, 4611},{  442, 4853},{  493, 5088},{  543, 5313},
+        {  600, 5537},{  662, 5752},{  737, 6018},{  775, 6037}
       }
     },
     {
-      /*Cr  qi=59  INTRA*/
-      {
-        {   31,    8},{  240,  379},{  480,  706},{  748,  978},
-        {  993, 1208},{ 1250, 1394},{ 1519, 1543},{ 1779, 1674},
-        { 2047, 1792},{ 2307, 1904},{ 2552, 2013},{ 2780, 2116},
-        { 2973, 2216},{ 3165, 2309},{ 3362, 2383},{ 3528, 2444},
-        { 3677, 2499},{ 3841, 2566},{ 3995, 2646},{ 4139, 2720},
-        { 4324, 2793},{ 4504, 2867},{ 4658, 2939},{ 4806, 2975}
+      /*Cr  qi=27  INTRA*/
+      {
+        {   49,   52},{   57,  570},{   61,  762},{   44, 1048},
+        {   80, 1291},{  196, 1513},{  224, 1522},{  242, 1532},
+        {  213, 1293},{  260, 1639},{  253, 1691},{  291, 1915},
+        {  294, 1897},{  367, 2178},{  395, 2258},{  432, 2310},
+        {  407, 2105},{  503, 2369},{  492, 2293},{  552, 2421},
+        {  496, 2099},{  598, 2549},{  624, 2531},{  627, 2532}
       },
-      /*Cr  qi=59  INTER*/
-      {
-        {   89,   -3},{   98,  352},{  103,  674},{  156, 1002},
-        {  268, 1300},{  441, 1562},{  673, 1801},{  980, 2010},
-        { 1385, 2175},{ 1868, 2301},{ 2401, 2402},{ 2984, 2474},
-        { 3591, 2520},{ 4179, 2545},{ 4729, 2555},{ 5232, 2553},
-        { 5679, 2545},{ 6081, 2530},{ 6447, 2510},{ 6791, 2496},
-        { 7101, 2487},{ 7393, 2489},{ 7684, 2499},{ 7950, 2501}
+      /*Cr  qi=27  INTER*/
+      {
+        {   10,  147},{    4,  538},{   11,  769},{    0, 1022},
+        {    9, 1318},{   51, 1635},{   80, 1925},{   97, 2214},
+        {  101, 2493},{  115, 2805},{  143, 3083},{  182, 3361},
+        {  226, 3625},{  270, 3898},{  319, 4157},{  366, 4405},
+        {  418, 4649},{  467, 4904},{  509, 5157},{  548, 5412},
+        {  589, 5659},{  636, 5909},{  683, 6208},{  710, 6190}
       }
     }
   },
   {
     {
-      /*Y'  qi=60  INTRA*/
-      {
-        {   92,  116},{ 1361, 1085},{ 2746, 1686},{ 4050, 1895},
-        { 5209, 1939},{ 6244, 2012},{ 7213, 2103},{ 8105, 2197},
-        { 8928, 2290},{ 9685, 2381},{10371, 2460},{10952, 2511},
-        {11487, 2556},{12026, 2611},{12574, 2674},{13102, 2739},
-        {13597, 2793},{14092, 2831},{14523, 2862},{14862, 2881},
-        {15198, 2897},{15568, 2923},{15949, 2941},{16416, 2964}
+      /*Y'  qi=36  INTRA*/
+      {
+        {   86,  252},{  345,  662},{  476, 1143},{  698, 1169},
+        {  894, 1457},{ 1218, 1728},{ 1465, 1849},{ 1731, 2019},
+        { 2183, 2298},{ 2666, 2511},{ 3116, 2731},{ 3371, 2813},
+        { 3621, 2923},{ 3675, 2949},{ 3710, 2921},{ 3740, 2896},
+        { 3746, 2895},{ 3886, 2978},{ 4069, 2991},{ 4229, 3016},
+        { 4338, 3102},{ 4530, 3124},{ 4751, 3248},{ 4753, 3244}
       },
-      /*Y'  qi=60  INTER*/
-      {
-        {    4,   30},{  215, 1287},{ 1547, 2104},{ 3729, 2491},
-        { 5973, 2568},{ 7672, 2577},{ 9001, 2591},{10123, 2606},
-        {11094, 2620},{11943, 2632},{12709, 2643},{13409, 2652},
-        {14044, 2660},{14641, 2669},{15193, 2677},{15709, 2684},
-        {16201, 2689},{16675, 2693},{17118, 2696},{17522, 2701},
-        {17920, 2704},{18293, 2706},{18620, 2702},{18923, 2700}
+      /*Y'  qi=36  INTER*/
+      {
+        {    0,  208},{   73, 1293},{  248, 2449},{  616, 3461},
+        { 1061, 4329},{ 1601, 4986},{ 2189, 5447},{ 2875, 5723},
+        { 3620, 5844},{ 4328, 5879},{ 4954, 5880},{ 5490, 5890},
+        { 5934, 5901},{ 6353, 5926},{ 6706, 5924},{ 7036, 5930},
+        { 7338, 5938},{ 7600, 5930},{ 7870, 5939},{ 8065, 5921},
+        { 8318, 5914},{ 8451, 5912},{ 8648, 5923},{ 8734, 5926}
       }
     },
     {
-      /*Cb  qi=60  INTRA*/
-      {
-        {   18,    3},{  227,  362},{  447,  694},{  708,  974},
-        {  981, 1207},{ 1252, 1397},{ 1532, 1547},{ 1822, 1663},
-        { 2082, 1780},{ 2316, 1903},{ 2548, 2013},{ 2794, 2101},
-        { 3029, 2178},{ 3242, 2266},{ 3445, 2360},{ 3638, 2459},
-        { 3816, 2547},{ 3980, 2628},{ 4146, 2708},{ 4344, 2766},
-        { 4546, 2812},{ 4725, 2872},{ 4880, 2930},{ 5054, 2966}
+      /*Cb  qi=36  INTRA*/
+      {
+        {   52,   54},{   52,  575},{  103,  776},{  185, 1072},
+        {  172, 1069},{  211, 1302},{  217, 1413},{  285, 1586},
+        {  330, 1463},{  453, 1694},{  500, 1741},{  545, 1852},
+        {  501, 1650},{  584, 1874},{  587, 1856},{  638, 1919},
+        {  581, 1742},{  670, 1953},{  688, 1934},{  731, 2030},
+        {  637, 1794},{  806, 2123},{  840, 2091},{  843, 2091}
       },
-      /*Cb  qi=60  INTER*/
-      {
-        {   97,   -4},{  112,  343},{  114,  664},{  162,  993},
-        {  273, 1294},{  472, 1553},{  774, 1762},{ 1138, 1939},
-        { 1543, 2102},{ 2034, 2236},{ 2620, 2329},{ 3244, 2389},
-        { 3860, 2423},{ 4443, 2440},{ 4997, 2449},{ 5502, 2455},
-        { 5962, 2458},{ 6413, 2466},{ 6836, 2485},{ 7217, 2506},
-        { 7592, 2518},{ 7957, 2533},{ 8291, 2543},{ 8574, 2545}
+      /*Cb  qi=36  INTER*/
+      {
+        {   19,  142},{   17,  534},{    6,  772},{   44, 1023},
+        {   82, 1296},{   94, 1614},{  117, 1903},{  158, 2187},
+        {  218, 2450},{  285, 2703},{  352, 2943},{  421, 3181},
+        {  489, 3415},{  564, 3644},{  647, 3861},{  748, 4060},
+        {  861, 4246},{  993, 4419},{ 1132, 4576},{ 1282, 4744},
+        { 1445, 4894},{ 1600, 5034},{ 1782, 5211},{ 1837, 5200}
       }
     },
     {
-      /*Cr  qi=60  INTRA*/
-      {
-        {   32,    8},{  243,  379},{  488,  702},{  771,  968},
-        { 1030, 1192},{ 1300, 1373},{ 1581, 1517},{ 1854, 1643},
-        { 2127, 1757},{ 2393, 1864},{ 2645, 1968},{ 2879, 2068},
-        { 3078, 2166},{ 3277, 2256},{ 3484, 2325},{ 3660, 2381},
-        { 3808, 2433},{ 3970, 2496},{ 4138, 2571},{ 4288, 2643},
-        { 4475, 2710},{ 4655, 2778},{ 4810, 2843},{ 4959, 2879}
+      /*Cr  qi=36  INTRA*/
+      {
+        {   62,   55},{   90,  561},{   56,  767},{  148, 1014},
+        {  207,  981},{  258, 1216},{  273, 1253},{  326, 1392},
+        {  338, 1383},{  417, 1613},{  443, 1629},{  497, 1734},
+        {  466, 1525},{  561, 1778},{  577, 1787},{  631, 1892},
+        {  591, 1706},{  715, 1980},{  730, 1958},{  822, 2113},
+        {  755, 1935},{  928, 2228},{  935, 2205},{  938, 2205}
       },
-      /*Cr  qi=60  INTER*/
-      {
-        {   86,   -2},{   99,  352},{  103,  673},{  160,  998},
-        {  284, 1292},{  484, 1546},{  753, 1774},{ 1100, 1973},
-        { 1546, 2129},{ 2072, 2246},{ 2652, 2334},{ 3279, 2392},
-        { 3911, 2425},{ 4504, 2440},{ 5044, 2443},{ 5536, 2440},
-        { 5979, 2430},{ 6381, 2413},{ 6735, 2397},{ 7062, 2382},
-        { 7383, 2376},{ 7680, 2375},{ 7962, 2373},{ 8203, 2379}
+      /*Cr  qi=36  INTER*/
+      {
+        {   14,  145},{   16,  535},{    5,  772},{   44, 1017},
+        {   91, 1296},{  100, 1605},{  122, 1891},{  163, 2174},
+        {  225, 2443},{  294, 2707},{  362, 2962},{  436, 3210},
+        {  518, 3437},{  607, 3664},{  702, 3876},{  795, 4094},
+        {  886, 4310},{  980, 4538},{ 1089, 4749},{ 1216, 4927},
+        { 1357, 5116},{ 1506, 5247},{ 1758, 5338},{ 1787, 5306}
       }
     }
   },
   {
     {
-      /*Y'  qi=61  INTRA*/
-      {
-        {   54,  121},{ 1477, 1069},{ 3061, 1638},{ 4465, 1808},
-        { 5649, 1827},{ 6710, 1884},{ 7716, 1958},{ 8648, 2037},
-        { 9514, 2116},{10311, 2192},{11033, 2261},{11641, 2305},
-        {12202, 2342},{12771, 2387},{13356, 2440},{13924, 2493},
-        {14444, 2541},{14951, 2576},{15409, 2600},{15779, 2615},
-        {16131, 2626},{16521, 2648},{16921, 2663},{17409, 2694}
+      /*Y'  qi=45  INTRA*/
+      {
+        {  185,  246},{  513,  647},{  883,  891},{ 1313, 1142},
+        { 1760, 1351},{ 2368, 1595},{ 2828, 1718},{ 3097, 1780},
+        { 3762, 1951},{ 4454, 2121},{ 4986, 2227},{ 5281, 2281},
+        { 5477, 2299},{ 5431, 2288},{ 5425, 2283},{ 5439, 2290},
+        { 5324, 2249},{ 5509, 2279},{ 5703, 2321},{ 5896, 2348},
+        { 6049, 2370},{ 6253, 2425},{ 6415, 2432},{ 6419, 2430}
       },
-      /*Y'  qi=61  INTER*/
-      {
-        {   -1,   32},{  216, 1286},{ 1806, 2036},{ 4279, 2327},
-        { 6629, 2352},{ 8347, 2352},{ 9707, 2357},{10860, 2364},
-        {11857, 2372},{12726, 2377},{13508, 2382},{14225, 2387},
-        {14877, 2392},{15484, 2398},{16048, 2401},{16581, 2405},
-        {17092, 2409},{17573, 2409},{18016, 2410},{18427, 2413},
-        {18829, 2415},{19221, 2415},{19578, 2415},{19980, 2413}
+      /*Y'  qi=45  INTER*/
+      {
+        {    6,  215},{  152, 1261},{  691, 2314},{ 1538, 3095},
+        { 2505, 3632},{ 3475, 3935},{ 4355, 4084},{ 5209, 4139},
+        { 5985, 4162},{ 6644, 4185},{ 7235, 4190},{ 7768, 4196},
+        { 8266, 4200},{ 8736, 4210},{ 9143, 4207},{ 9511, 4215},
+        { 9828, 4209},{10112, 4224},{10374, 4226},{10642, 4232},
+        {10842, 4219},{10971, 4208},{11200, 4211},{11299, 4216}
       }
     },
     {
-      /*Cb  qi=61  INTRA*/
-      {
-        {   19,    3},{  231,  362},{  456,  693},{  733,  965},
-        { 1032, 1188},{ 1330, 1369},{ 1637, 1508},{ 1956, 1612},
-        { 2241, 1718},{ 2496, 1832},{ 2750, 1932},{ 3019, 2007},
-        { 3274, 2074},{ 3505, 2154},{ 3725, 2236},{ 3943, 2323},
-        { 4138, 2403},{ 4323, 2476},{ 4505, 2543},{ 4706, 2592},
-        { 4909, 2630},{ 5109, 2675},{ 5292, 2724},{ 5495, 2768}
+      /*Cb  qi=45  INTRA*/
+      {
+        {   58,   71},{   66,  548},{  155,  762},{  213,  944},
+        {  192,  731},{  324, 1147},{  401, 1366},{  481, 1480},
+        {  508, 1238},{  657, 1522},{  727, 1563},{  794, 1611},
+        {  761, 1470},{  885, 1710},{  893, 1700},{  958, 1760},
+        {  893, 1543},{  985, 1719},{ 1014, 1732},{ 1082, 1784},
+        {  963, 1519},{ 1152, 1800},{ 1221, 1830},{ 1226, 1830}
       },
-      /*Cb  qi=61  INTER*/
-      {
-        {   91,   -2},{  111,  344},{  114,  663},{  166,  989},
-        {  291, 1285},{  522, 1534},{  875, 1729},{ 1302, 1889},
-        { 1786, 2031},{ 2368, 2141},{ 3042, 2207},{ 3734, 2243},
-        { 4388, 2259},{ 4982, 2264},{ 5533, 2265},{ 6043, 2262},
-        { 6524, 2264},{ 6982, 2274},{ 7422, 2283},{ 7831, 2295},
-        { 8198, 2308},{ 8593, 2319},{ 8965, 2329},{ 9258, 2340}
+      /*Cb  qi=45  INTER*/
+      {
+        {   35,  135},{   12,  532},{   54,  769},{  106, 1007},
+        {  127, 1258},{  198, 1565},{  289, 1832},{  398, 2082},
+        {  520, 2302},{  653, 2511},{  800, 2705},{  956, 2897},
+        { 1143, 3064},{ 1358, 3220},{ 1623, 3335},{ 1913, 3444},
+        { 2198, 3534},{ 2502, 3626},{ 2787, 3711},{ 3114, 3783},
+        { 3454, 3831},{ 3711, 3871},{ 4163, 3901},{ 4221, 3890}
       }
     },
     {
-      /*Cr  qi=61  INTRA*/
-      {
-        {   33,    9},{  245,  378},{  497,  699},{  801,  958},
-        { 1087, 1171},{ 1384, 1342},{ 1692, 1474},{ 1992, 1589},
-        { 2290, 1692},{ 2576, 1789},{ 2852, 1884},{ 3109, 1973},
-        { 3324, 2061},{ 3544, 2142},{ 3763, 2199},{ 3945, 2244},
-        { 4103, 2292},{ 4283, 2349},{ 4469, 2413},{ 4635, 2476},
-        { 4836, 2534},{ 5038, 2592},{ 5210, 2649},{ 5358, 2682}
+      /*Cr  qi=45  INTRA*/
+      {
+        {   93,   68},{   72,  541},{  154,  769},{  239,  848},
+        {  214,  623},{  377, 1060},{  437, 1200},{  514, 1280},
+        {  512, 1160},{  625, 1453},{  657, 1470},{  718, 1516},
+        {  692, 1331},{  831, 1617},{  875, 1609},{  944, 1678},
+        {  886, 1469},{ 1061, 1699},{ 1082, 1714},{ 1226, 1823},
+        { 1113, 1581},{ 1324, 1872},{ 1370, 1925},{ 1374, 1924}
       },
-      /*Cr  qi=61  INTER*/
-      {
-        {   82,    0},{   97,  353},{  104,  672},{  165,  995},
-        {  303, 1284},{  532, 1529},{  852, 1742},{ 1273, 1921},
-        { 1798, 2057},{ 2409, 2154},{ 3090, 2212},{ 3794, 2240},
-        { 4460, 2251},{ 5057, 2249},{ 5596, 2249},{ 6085, 2245},
-        { 6519, 2234},{ 6908, 2220},{ 7269, 2203},{ 7618, 2196},
-        { 7949, 2198},{ 8269, 2195},{ 8554, 2196},{ 8928, 2217}
+      /*Cr  qi=45  INTER*/
+      {
+        {   31,  140},{   13,  533},{   52,  770},{  109, 1000},
+        {  134, 1253},{  201, 1555},{  298, 1821},{  411, 2076},
+        {  525, 2314},{  659, 2545},{  828, 2747},{ 1019, 2918},
+        { 1205, 3082},{ 1405, 3266},{ 1609, 3443},{ 1847, 3606},
+        { 2085, 3730},{ 2404, 3835},{ 2709, 3876},{ 3049, 3886},
+        { 3381, 3821},{ 3708, 3780},{ 4026, 3663},{ 4043, 3646}
       }
     }
   },
   {
     {
-      /*Y'  qi=62  INTRA*/
-      {
-        {   29,  124},{ 1527, 1067},{ 3221, 1618},{ 4703, 1751},
-        { 5909, 1744},{ 7001, 1779},{ 8057, 1829},{ 9049, 1885},
-        { 9968, 1943},{10813, 1999},{11572, 2050},{12206, 2082},
-        {12801, 2107},{13402, 2140},{14020, 2180},{14625, 2223},
-        {15179, 2260},{15718, 2288},{16196, 2305},{16581, 2313},
-        {16963, 2324},{17382, 2341},{17800, 2351},{18318, 2376}
+      /*Y'  qi=54  INTRA*/
+      {
+        {  316,  203},{  720,  585},{ 1596, 1077},{ 2316, 1289},
+        { 2687, 1439},{ 3133, 1593},{ 3495, 1706},{ 3836, 1775},
+        { 4249, 1892},{ 4804, 2031},{ 5320, 2139},{ 5617, 2203},
+        { 5726, 2199},{ 5726, 2176},{ 5682, 2146},{ 5677, 2127},
+        { 5717, 2124},{ 5707, 2129},{ 5853, 2148},{ 6110, 2180},
+        { 6454, 2247},{ 6714, 2287},{ 6845, 2304},{ 6854, 2303}
       },
-      /*Y'  qi=62  INTER*/
-      {
-        {   -8,   36},{  218, 1284},{ 2073, 1965},{ 4814, 2159},
-        { 7237, 2138},{ 8979, 2124},{10378, 2115},{11570, 2109},
-        {12601, 2106},{13503, 2103},{14320, 2103},{15064, 2103},
-        {15746, 2103},{16384, 2104},{16975, 2105},{17534, 2105},
-        {18062, 2106},{18564, 2107},{19035, 2106},{19471, 2107},
-        {19890, 2107},{20288, 2107},{20651, 2107},{21012, 2108}
+      /*Y'  qi=54  INTER*/
+      {
+        {  -48,  217},{  314, 1261},{ 1450, 2126},{ 2761, 2728},
+        { 4275, 3012},{ 5408, 3167},{ 6305, 3245},{ 7165, 3290},
+        { 7966, 3325},{ 8698, 3359},{ 9352, 3377},{ 9907, 3391},
+        {10389, 3390},{10856, 3395},{11170, 3385},{11530, 3385},
+        {11780, 3362},{12018, 3362},{12266, 3361},{12443, 3339},
+        {12683, 3342},{12713, 3317},{12967, 3325},{13082, 3332}
       }
     },
     {
-      /*Cb  qi=62  INTRA*/
-      {
-        {   21,    3},{  283,  360},{  565,  683},{  907,  938},
-        { 1269, 1143},{ 1611, 1311},{ 1949, 1441},{ 2290, 1535},
-        { 2596, 1632},{ 2877, 1738},{ 3162, 1828},{ 3458, 1893},
-        { 3745, 1948},{ 4011, 2016},{ 4253, 2089},{ 4506, 2164},
-        { 4734, 2233},{ 4943, 2294},{ 5162, 2353},{ 5381, 2393},
-        { 5593, 2420},{ 5807, 2454},{ 6003, 2496},{ 6210, 2543}
+      /*Cb  qi=54  INTRA*/
+      {
+        {   94,   73},{   83,  557},{  152,  818},{  304,  919},
+        {  341,  819},{  506, 1128},{  593, 1281},{  700, 1389},
+        {  714, 1225},{  907, 1502},{  981, 1549},{ 1062, 1641},
+        { 1032, 1523},{ 1170, 1710},{ 1217, 1727},{ 1258, 1714},
+        { 1216, 1575},{ 1309, 1682},{ 1331, 1656},{ 1393, 1712},
+        { 1247, 1456},{ 1469, 1728},{ 1530, 1711},{ 1532, 1711}
       },
-      /*Cb  qi=62  INTER*/
-      {
-        {   91,   -1},{  110,  344},{  113,  663},{  169,  987},
-        {  306, 1279},{  562, 1519},{  961, 1701},{ 1450, 1845},
-        { 2013, 1967},{ 2686, 2053},{ 3437, 2095},{ 4171, 2109},
-        { 4841, 2109},{ 5441, 2105},{ 6002, 2097},{ 6542, 2089},
-        { 7028, 2087},{ 7491, 2088},{ 7949, 2090},{ 8377, 2089},
-        { 8789, 2095},{ 9195, 2103},{ 9569, 2104},{ 9937, 2102}
+      /*Cb  qi=54  INTER*/
+      {
+        {   33,  133},{   12,  532},{   70,  770},{  171,  996},
+        {  279, 1233},{  427, 1503},{  600, 1736},{  824, 1939},
+        { 1101, 2097},{ 1411, 2237},{ 1735, 2374},{ 2097, 2493},
+        { 2486, 2606},{ 2916, 2691},{ 3297, 2771},{ 3715, 2826},
+        { 4088, 2855},{ 4460, 2886},{ 4849, 2911},{ 5198, 2932},
+        { 5489, 2940},{ 5875, 2981},{ 6208, 3017},{ 6270, 3012}
       }
     },
     {
-      /*Cr  qi=62  INTRA*/
-      {
-        {   38,    8},{  308,  374},{  619,  685},{  984,  925},
-        { 1326, 1126},{ 1662, 1285},{ 1999, 1407},{ 2328, 1512},
-        { 2659, 1604},{ 2976, 1691},{ 3285, 1774},{ 3570, 1853},
-        { 3815, 1931},{ 4068, 1998},{ 4304, 2044},{ 4491, 2082},
-        { 4666, 2124},{ 4870, 2174},{ 5078, 2231},{ 5262, 2285},
-        { 5480, 2335},{ 5703, 2378},{ 5905, 2423},{ 6075, 2454}
+      /*Cr  qi=54  INTRA*/
+      {
+        {  103,   63},{   83,  580},{  258,  796},{  301,  802},
+        {  361,  675},{  538, 1001},{  625, 1097},{  713, 1171},
+        {  699, 1103},{  868, 1380},{  915, 1400},{  970, 1491},
+        {  923, 1365},{ 1070, 1603},{ 1154, 1655},{ 1206, 1677},
+        { 1157, 1541},{ 1366, 1736},{ 1391, 1723},{ 1506, 1797},
+        { 1388, 1556},{ 1616, 1828},{ 1655, 1797},{ 1658, 1796}
       },
-      /*Cr  qi=62  INTER*/
-      {
-        {   79,    1},{   95,  353},{  102,  671},{  169,  992},
-        {  318, 1277},{  569, 1515},{  936, 1716},{ 1428, 1876},
-        { 2034, 1993},{ 2738, 2067},{ 3511, 2095},{ 4268, 2094},
-        { 4943, 2087},{ 5543, 2079},{ 6074, 2074},{ 6552, 2069},
-        { 6985, 2057},{ 7366, 2043},{ 7728, 2030},{ 8086, 2021},
-        { 8423, 2017},{ 8752, 2016},{ 9057, 2014},{ 9376, 2008}
+      /*Cr  qi=54  INTER*/
+      {
+        {   30,  138},{   14,  532},{   63,  771},{  176,  990},
+        {  299, 1226},{  438, 1496},{  606, 1735},{  814, 1950},
+        { 1089, 2127},{ 1417, 2281},{ 1761, 2421},{ 2104, 2571},
+        { 2467, 2701},{ 2881, 2827},{ 3303, 2900},{ 3735, 2917},
+        { 4183, 2913},{ 4529, 2882},{ 4915, 2844},{ 5168, 2796},
+        { 5410, 2763},{ 5562, 2753},{ 5815, 2764},{ 5832, 2755}
       }
     }
   },
@@ -3964,61 +967,61 @@ oc_mode_rd OC_MODE_RD[64][3][2][OC_SAD_BINS]={
     {
       /*Y'  qi=63  INTRA*/
       {
-        {  -59,  134},{ 1734, 1036},{ 3743, 1521},{ 5309, 1618},
-        { 6520, 1597},{ 7664, 1609},{ 8809, 1630},{ 9894, 1657},
-        {10907, 1687},{11838, 1717},{12673, 1744},{13379, 1758},
-        {14038, 1767},{14698, 1784},{15379, 1806},{16062, 1831},
-        {16694, 1852},{17300, 1867},{17827, 1878},{18250, 1881},
-        {18702, 1884},{19199, 1892},{19665, 1896},{20273, 1908}
+        {  421,  194},{ 1272,  564},{ 3016,  943},{ 3831, 1079},
+        { 4282, 1174},{ 4799, 1290},{ 5166, 1348},{ 5259, 1350},
+        { 5720, 1426},{ 6501, 1539},{ 7048, 1606},{ 7328, 1642},
+        { 7374, 1622},{ 7349, 1612},{ 7192, 1578},{ 7207, 1571},
+        { 7161, 1555},{ 7259, 1573},{ 7432, 1592},{ 7710, 1613},
+        { 8167, 1672},{ 8425, 1697},{ 8597, 1710},{ 8602, 1710}
       },
       /*Y'  qi=63  INTER*/
       {
-        {   -7,   33},{  209, 1285},{ 2309, 1904},{ 5274, 2025},
-        { 7801, 1966},{ 9637, 1924},{11126, 1892},{12403, 1868},
-        {13515, 1849},{14491, 1834},{15380, 1822},{16197, 1814},
-        {16944, 1806},{17645, 1799},{18303, 1794},{18916, 1789},
-        {19494, 1785},{20056, 1782},{20568, 1779},{21047, 1776},
-        {21508, 1775},{21925, 1772},{22327, 1770},{22678, 1771}
+        { -584,  286},{ 1231, 1186},{ 3939, 1663},{ 6096, 1865},
+        { 7849, 1929},{ 8934, 1995},{ 9962, 2039},{11038, 2078},
+        {12016, 2092},{12889, 2100},{13617, 2096},{14221, 2089},
+        {14743, 2083},{15240, 2081},{15619, 2074},{15992, 2065},
+        {16314, 2065},{16529, 2059},{16822, 2056},{17041, 2049},
+        {17321, 2052},{17408, 2043},{17670, 2051},{17801, 2053}
       }
     },
     {
       /*Cb  qi=63  INTRA*/
       {
-        {   20,    3},{  294,  357},{  608,  673},{ 1047,  908},
-        { 1501, 1090},{ 1898, 1240},{ 2275, 1353},{ 2654, 1427},
-        { 3014, 1502},{ 3366, 1579},{ 3726, 1637},{ 4084, 1674},
-        { 4425, 1703},{ 4752, 1743},{ 5058, 1791},{ 5377, 1838},
-        { 5676, 1877},{ 5946, 1912},{ 6213, 1945},{ 6458, 1969},
-        { 6704, 1982},{ 6969, 1997},{ 7210, 2017},{ 7439, 2037}
+        {  154,   55},{  280,  582},{  507,  731},{  788,  853},
+        {  763,  738},{ 1141, 1008},{ 1323, 1090},{ 1540, 1220},
+        { 1487, 1089},{ 1861, 1322},{ 1983, 1347},{ 2145, 1425},
+        { 2047, 1317},{ 2334, 1475},{ 2352, 1413},{ 2458, 1467},
+        { 2243, 1270},{ 2464, 1413},{ 2423, 1335},{ 2506, 1385},
+        { 2182, 1180},{ 2565, 1376},{ 2555, 1321},{ 2557, 1321}
       },
       /*Cb  qi=63  INTER*/
       {
-        {   86,    1},{  108,  345},{  111,  663},{  168,  985},
-        {  307, 1276},{  577, 1513},{ 1007, 1688},{ 1550, 1819},
-        { 2189, 1921},{ 2938, 1981},{ 3744, 2002},{ 4512, 2002},
-        { 5199, 1996},{ 5824, 1986},{ 6419, 1971},{ 6978, 1954},
-        { 7507, 1940},{ 8015, 1932},{ 8502, 1928},{ 8978, 1920},
-        { 9410, 1915},{ 9842, 1910},{10262, 1901},{10634, 1896}
+        {   34,  133},{    6,  531},{  139,  767},{  344,  975},
+        {  608, 1180},{ 1048, 1367},{ 1651, 1495},{ 2376, 1572},
+        { 3103, 1609},{ 3752, 1646},{ 4373, 1680},{ 4980, 1718},
+        { 5540, 1744},{ 6023, 1764},{ 6431, 1766},{ 6800, 1769},
+        { 7149, 1775},{ 7529, 1777},{ 7920, 1817},{ 8198, 1808},
+        { 8691, 1848},{ 8965, 1845},{ 9372, 1865},{ 9459, 1863}
       }
     },
     {
       /*Cr  qi=63  INTRA*/
       {
-        {   38,    7},{  324,  367},{  677,  670},{ 1136,  892},
-        { 1562, 1070},{ 1951, 1209},{ 2326, 1313},{ 2694, 1399},
-        { 3074, 1471},{ 3460, 1531},{ 3850, 1575},{ 4214, 1622},
-        { 4522, 1679},{ 4819, 1723},{ 5089, 1749},{ 5315, 1769},
-        { 5530, 1792},{ 5756, 1825},{ 6006, 1860},{ 6244, 1889},
-        { 6514, 1924},{ 6792, 1946},{ 7026, 1962},{ 7191, 1971}
+        {  121,   59},{  392,  570},{  609,  654},{  800,  760},
+        {  720,  598},{ 1192,  892},{ 1298,  897},{ 1470, 1027},
+        { 1411,  962},{ 1761, 1184},{ 1826, 1197},{ 1981, 1308},
+        { 1854, 1198},{ 2229, 1427},{ 2269, 1365},{ 2428, 1453},
+        { 2217, 1265},{ 2558, 1435},{ 2541, 1356},{ 2660, 1417},
+        { 2337, 1199},{ 2688, 1382},{ 2603, 1301},{ 2605, 1300}
       },
       /*Cr  qi=63  INTER*/
       {
-        {   80,    2},{   95,  354},{  101,  671},{  167,  990},
-        {  321, 1274},{  585, 1509},{  984, 1702},{ 1534, 1849},
-        { 2217, 1947},{ 3005, 1995},{ 3839, 1999},{ 4619, 1986},
-        { 5310, 1973},{ 5933, 1961},{ 6486, 1952},{ 6988, 1942},
-        { 7435, 1927},{ 7817, 1911},{ 8198, 1900},{ 8552, 1895},
-        { 8881, 1890},{ 9253, 1883},{ 9598, 1876},{ 9923, 1859}
+        {   31,  137},{   10,  531},{  136,  768},{  360,  971},
+        {  638, 1166},{ 1029, 1373},{ 1604, 1519},{ 2351, 1595},
+        { 3129, 1640},{ 3861, 1691},{ 4491, 1751},{ 5101, 1783},
+        { 5635, 1784},{ 6136, 1779},{ 6550, 1763},{ 6905, 1746},
+        { 7172, 1726},{ 7495, 1732},{ 7738, 1735},{ 7949, 1735},
+        { 8211, 1744},{ 8424, 1740},{ 8779, 1764},{ 8812, 1760}
       }
     }
   }
diff --git a/thirdparty/libtheora/ocintrin.h b/thirdparty/libtheora/ocintrin.h
index d49ebb2159..b200ceafce 100644
--- a/thirdparty/libtheora/ocintrin.h
+++ b/thirdparty/libtheora/ocintrin.h
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: ocintrin.h 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
diff --git a/thirdparty/libtheora/patches/theora.git-0ae66d565e6bead8604d312bc1a4e9dccf245c88.patch b/thirdparty/libtheora/patches/theora.git-0ae66d565e6bead8604d312bc1a4e9dccf245c88.patch
deleted file mode 100644
index 1b9c8e20be..0000000000
--- a/thirdparty/libtheora/patches/theora.git-0ae66d565e6bead8604d312bc1a4e9dccf245c88.patch
+++ /dev/null
@@ -1,38 +0,0 @@
-From 0ae66d565e6bead8604d312bc1a4e9dccf245c88 Mon Sep 17 00:00:00 2001
-From: Tim Terriberry <tterribe@xiph.org>
-Date: Tue, 8 May 2012 02:51:57 +0000
-Subject: [PATCH] Fix pp_sharp_mod calculation.
-
-This was broken when the dequant_tables indexing changed in commit
- r16102, but it only affected post-processing quality, so we never
- noticed.
-With gcc 4.8.0, this can now trigger a segfault during decoder
- initialization.
-
-svn path=/trunk/theora/; revision=18268
----
- decode.c | 8 ++++----
- 1 file changed, 4 insertions(+), 4 deletions(-)
-
-diff --git a/decode.c b/decode.c
-index b803505..9f2516a 100644
---- a/decode.c
-+++ b/decode.c
-@@ -400,10 +400,10 @@ static int oc_dec_init(oc_dec_ctx *_dec,const th_info *_info,
-     int qsum;
-     qsum=0;
-     for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){
--      qsum+=_dec->state.dequant_tables[qti][pli][qi][12]+
--       _dec->state.dequant_tables[qti][pli][qi][17]+
--       _dec->state.dequant_tables[qti][pli][qi][18]+
--       _dec->state.dequant_tables[qti][pli][qi][24]<<(pli==0);
-+      qsum+=_dec->state.dequant_tables[qi][pli][qti][12]+
-+       _dec->state.dequant_tables[qi][pli][qti][17]+
-+       _dec->state.dequant_tables[qi][pli][qti][18]+
-+       _dec->state.dequant_tables[qi][pli][qti][24]<<(pli==0);
-     }
-     _dec->pp_sharp_mod[qi]=-(qsum>>11);
-   }
--- 
-2.11.0
-
diff --git a/thirdparty/libtheora/quant.c b/thirdparty/libtheora/quant.c
index 8359f5abea..e206202844 100644
--- a/thirdparty/libtheora/quant.c
+++ b/thirdparty/libtheora/quant.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: quant.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
@@ -21,6 +21,14 @@
 #include "quant.h"
 #include "decint.h"
 
+/*The maximum output of the DCT with +/- 255 inputs is +/- 8157.
+  These minimum quantizers ensure the result after quantization (and after
+   prediction for DC) will be no more than +/- 510.
+  The tokenization system can handle values up to +/- 580, so there is no need
+   to do any coefficient clamping.
+  I would rather have allowed smaller quantizers and had to clamp, but these
+   minimums were required when constructing the original VP3 matrices and have
+   been formalized in the spec.*/
 static const unsigned OC_DC_QUANT_MIN[2]={4<<2,8<<2};
 static const unsigned OC_AC_QUANT_MIN[2]={2<<2,4<<2};
 
diff --git a/thirdparty/libtheora/quant.h b/thirdparty/libtheora/quant.h
index 49ce13a65c..247210eaae 100644
--- a/thirdparty/libtheora/quant.h
+++ b/thirdparty/libtheora/quant.h
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: quant.h 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
diff --git a/thirdparty/libtheora/rate.c b/thirdparty/libtheora/rate.c
index 4f43bb2e5f..bf2b1396a1 100644
--- a/thirdparty/libtheora/rate.c
+++ b/thirdparty/libtheora/rate.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-  last mod: $Id: rate.c 16503 2009-08-22 18:14:02Z giles $
+  last mod: $Id$
 
  ********************************************************************/
 #include <stdlib.h>
@@ -190,7 +190,8 @@ void oc_enc_calc_lambda(oc_enc_ctx *_enc,int _qti){
     This may need to be revised if the R-D cost estimation or qii flag
      optimization strategies change.*/
   nqis=1;
-  if(lq<(OC_Q57(56)>>3)&&!_enc->vp3_compatible){
+  if(lq<(OC_Q57(56)>>3)&&!_enc->vp3_compatible&&
+   _enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
     qi1=oc_enc_find_qi_for_target(_enc,_qti,OC_MAXI(qi-1,0),0,
      lq+(OC_Q57(7)+5)/10);
     if(qi1!=qi)_enc->state.qis[nqis++]=qi1;
@@ -761,6 +762,7 @@ int oc_enc_update_rc_state(oc_enc_ctx *_enc,
       _enc->rc.cur_metrics.log_scale=oc_q57_to_q24(log_scale);
       _enc->rc.cur_metrics.dup_count=_enc->dup_count;
       _enc->rc.cur_metrics.frame_type=_enc->state.frame_type;
+      _enc->rc.cur_metrics.activity_avg=_enc->activity_avg;
       _enc->rc.twopass_buffer_bytes=0;
     }break;
     case 2:{
@@ -863,9 +865,9 @@ int oc_enc_update_rc_state(oc_enc_ctx *_enc,
   return dropped;
 }
 
-#define OC_RC_2PASS_VERSION   (1)
+#define OC_RC_2PASS_VERSION   (2)
 #define OC_RC_2PASS_HDR_SZ    (38)
-#define OC_RC_2PASS_PACKET_SZ (8)
+#define OC_RC_2PASS_PACKET_SZ (12)
 
 static void oc_rc_buffer_val(oc_rc_state *_rc,ogg_int64_t _val,int _bytes){
   while(_bytes-->0){
@@ -900,6 +902,7 @@ int oc_enc_rc_2pass_out(oc_enc_ctx *_enc,unsigned char **_buf){
       oc_rc_buffer_val(&_enc->rc,
        _enc->rc.cur_metrics.dup_count|_enc->rc.cur_metrics.frame_type<<31,4);
       oc_rc_buffer_val(&_enc->rc,_enc->rc.cur_metrics.log_scale,4);
+      oc_rc_buffer_val(&_enc->rc,_enc->rc.cur_metrics.activity_avg,4);
     }
   }
   else if(_enc->packet_state==OC_PACKET_DONE&&
@@ -1050,16 +1053,19 @@ int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes){
         if(_enc->rc.twopass_buffer_fill>=OC_RC_2PASS_PACKET_SZ){
           ogg_uint32_t dup_count;
           ogg_int32_t  log_scale;
+          unsigned     activity;
           int          qti;
           int          arg;
           /*Read the metrics for the next frame.*/
           dup_count=oc_rc_unbuffer_val(&_enc->rc,4);
           log_scale=oc_rc_unbuffer_val(&_enc->rc,4);
+          activity=oc_rc_unbuffer_val(&_enc->rc,4);
           _enc->rc.cur_metrics.log_scale=log_scale;
           qti=(dup_count&0x80000000)>>31;
           _enc->rc.cur_metrics.dup_count=dup_count&0x7FFFFFFF;
           _enc->rc.cur_metrics.frame_type=qti;
           _enc->rc.twopass_force_kf=qti==OC_INTRA_FRAME;
+          _enc->activity_avg=_enc->rc.cur_metrics.activity_avg=activity;
           /*"Helpfully" set the dup count back to what it was in pass 1.*/
           arg=_enc->rc.cur_metrics.dup_count;
           th_encode_ctl(_enc,TH_ENCCTL_SET_DUP_COUNT,&arg,sizeof(arg));
@@ -1070,8 +1076,8 @@ int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes){
       else{
         int frames_needed;
         /*We're using a finite buffer:*/
-        frames_needed=OC_CLAMPI(0,_enc->rc.buf_delay
-         -(_enc->rc.scale_window_end-_enc->rc.scale_window0),
+        frames_needed=OC_MINI(_enc->rc.buf_delay-OC_MINI(_enc->rc.buf_delay,
+         _enc->rc.scale_window_end-_enc->rc.scale_window0),
          _enc->rc.frames_left[0]+_enc->rc.frames_left[1]
          -_enc->rc.nframes[0]-_enc->rc.nframes[1]);
         while(frames_needed>0){
@@ -1087,9 +1093,11 @@ int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes){
             ogg_uint32_t      dup_count;
             ogg_int32_t       log_scale;
             int               qti;
+            unsigned          activity;
             /*Read the metrics for the next frame.*/
             dup_count=oc_rc_unbuffer_val(&_enc->rc,4);
             log_scale=oc_rc_unbuffer_val(&_enc->rc,4);
+            activity=oc_rc_unbuffer_val(&_enc->rc,4);
             /*Add the to the circular buffer.*/
             fmi=_enc->rc.frame_metrics_head+_enc->rc.nframe_metrics++;
             if(fmi>=_enc->rc.cframe_metrics)fmi-=_enc->rc.cframe_metrics;
@@ -1098,6 +1106,7 @@ int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes){
             qti=(dup_count&0x80000000)>>31;
             m->dup_count=dup_count&0x7FFFFFFF;
             m->frame_type=qti;
+            m->activity_avg=activity;
             /*And accumulate the statistics over the window.*/
             _enc->rc.nframes[qti]++;
             _enc->rc.nframes[2]+=m->dup_count;
@@ -1105,8 +1114,8 @@ int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes){
             _enc->rc.scale_window_end+=m->dup_count+1;
             /*Compute an upper bound on the number of remaining packets needed
                for the current window.*/
-            frames_needed=OC_CLAMPI(0,_enc->rc.buf_delay
-             -(_enc->rc.scale_window_end-_enc->rc.scale_window0),
+            frames_needed=OC_MINI(_enc->rc.buf_delay-OC_MINI(_enc->rc.buf_delay,
+             _enc->rc.scale_window_end-_enc->rc.scale_window0),
              _enc->rc.frames_left[0]+_enc->rc.frames_left[1]
              -_enc->rc.nframes[0]-_enc->rc.nframes[1]);
             /*Clear the buffer for the next frame.*/
@@ -1124,6 +1133,7 @@ int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes){
            *(_enc->rc.frame_metrics+_enc->rc.frame_metrics_head);
           _enc->rc.twopass_force_kf=
            _enc->rc.cur_metrics.frame_type==OC_INTRA_FRAME;
+          _enc->activity_avg=_enc->rc.cur_metrics.activity_avg;
           /*"Helpfully" set the dup count back to what it was in pass 1.*/
           arg=_enc->rc.cur_metrics.dup_count;
           th_encode_ctl(_enc,TH_ENCCTL_SET_DUP_COUNT,&arg,sizeof(arg));
diff --git a/thirdparty/libtheora/state.c b/thirdparty/libtheora/state.c
index 42ed33a9a3..f4c6240387 100644
--- a/thirdparty/libtheora/state.c
+++ b/thirdparty/libtheora/state.c
@@ -11,25 +11,93 @@
  ********************************************************************
 
   function:
-    last mod: $Id: state.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
 #include <stdlib.h>
 #include <string.h>
-#include "internal.h"
-#if defined(OC_X86_ASM)
-#if defined(_MSC_VER)
-# include "x86_vc/x86int.h"
-#else
-# include "x86/x86int.h"
-#endif
-#endif
+#include "state.h"
 #if defined(OC_DUMP_IMAGES)
 # include <stdio.h>
 # include "png.h"
+# include "zlib.h"
 #endif
 
+/*The function used to fill in the chroma plane motion vectors for a macro
+   block when 4 different motion vectors are specified in the luma plane.
+  This version is for use with chroma decimated in the X and Y directions
+   (4:2:0).
+  _cbmvs: The chroma block-level motion vectors to fill in.
+  _lbmvs: The luma block-level motion vectors.*/
+static void oc_set_chroma_mvs00(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
+  int dx;
+  int dy;
+  dx=OC_MV_X(_lbmvs[0])+OC_MV_X(_lbmvs[1])
+   +OC_MV_X(_lbmvs[2])+OC_MV_X(_lbmvs[3]);
+  dy=OC_MV_Y(_lbmvs[0])+OC_MV_Y(_lbmvs[1])
+   +OC_MV_Y(_lbmvs[2])+OC_MV_Y(_lbmvs[3]);
+  _cbmvs[0]=OC_MV(OC_DIV_ROUND_POW2(dx,2,2),OC_DIV_ROUND_POW2(dy,2,2));
+}
+
+/*The function used to fill in the chroma plane motion vectors for a macro
+   block when 4 different motion vectors are specified in the luma plane.
+  This version is for use with chroma decimated in the Y direction.
+  _cbmvs: The chroma block-level motion vectors to fill in.
+  _lbmvs: The luma block-level motion vectors.*/
+static void oc_set_chroma_mvs01(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
+  int dx;
+  int dy;
+  dx=OC_MV_X(_lbmvs[0])+OC_MV_X(_lbmvs[2]);
+  dy=OC_MV_Y(_lbmvs[0])+OC_MV_Y(_lbmvs[2]);
+  _cbmvs[0]=OC_MV(OC_DIV_ROUND_POW2(dx,1,1),OC_DIV_ROUND_POW2(dy,1,1));
+  dx=OC_MV_X(_lbmvs[1])+OC_MV_X(_lbmvs[3]);
+  dy=OC_MV_Y(_lbmvs[1])+OC_MV_Y(_lbmvs[3]);
+  _cbmvs[1]=OC_MV(OC_DIV_ROUND_POW2(dx,1,1),OC_DIV_ROUND_POW2(dy,1,1));
+}
+
+/*The function used to fill in the chroma plane motion vectors for a macro
+   block when 4 different motion vectors are specified in the luma plane.
+  This version is for use with chroma decimated in the X direction (4:2:2).
+  _cbmvs: The chroma block-level motion vectors to fill in.
+  _lbmvs: The luma block-level motion vectors.*/
+static void oc_set_chroma_mvs10(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
+  int dx;
+  int dy;
+  dx=OC_MV_X(_lbmvs[0])+OC_MV_X(_lbmvs[1]);
+  dy=OC_MV_Y(_lbmvs[0])+OC_MV_Y(_lbmvs[1]);
+  _cbmvs[0]=OC_MV(OC_DIV_ROUND_POW2(dx,1,1),OC_DIV_ROUND_POW2(dy,1,1));
+  dx=OC_MV_X(_lbmvs[2])+OC_MV_X(_lbmvs[3]);
+  dy=OC_MV_Y(_lbmvs[2])+OC_MV_Y(_lbmvs[3]);
+  _cbmvs[2]=OC_MV(OC_DIV_ROUND_POW2(dx,1,1),OC_DIV_ROUND_POW2(dy,1,1));
+}
+
+/*The function used to fill in the chroma plane motion vectors for a macro
+   block when 4 different motion vectors are specified in the luma plane.
+  This version is for use with no chroma decimation (4:4:4).
+  _cbmvs: The chroma block-level motion vectors to fill in.
+  _lbmvs: The luma block-level motion vectors; each one is copied unchanged to
+           the corresponding entry of _cbmvs, since the chroma planes are not
+           decimated.*/
+static void oc_set_chroma_mvs11(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
+  _cbmvs[0]=_lbmvs[0];
+  _cbmvs[1]=_lbmvs[1];
+  _cbmvs[2]=_lbmvs[2];
+  _cbmvs[3]=_lbmvs[3];
+}
+
+/*A table of functions used to fill in the chroma plane motion vectors for a
+   macro block when 4 different motion vectors are specified in the luma
+   plane.*/
+const oc_set_chroma_mvs_func OC_SET_CHROMA_MVS_TABLE[TH_PF_NFORMATS]={
+  (oc_set_chroma_mvs_func)oc_set_chroma_mvs00,
+  (oc_set_chroma_mvs_func)oc_set_chroma_mvs01,
+  (oc_set_chroma_mvs_func)oc_set_chroma_mvs10,
+  (oc_set_chroma_mvs_func)oc_set_chroma_mvs11
+};
+
+
+
 /*Returns the fragment index of the top-left block in a macro block.
   This can be used to test whether or not the whole macro block is valid.
   _sb_map: The super block map.
@@ -92,7 +160,7 @@ static void oc_sb_create_plane_mapping(oc_sb_map _sb_maps[],
       if(jmax>4)jmax=4;
       else if(jmax<=0)break;
       /*By default, set all fragment indices to -1.*/
-      memset(_sb_maps[sbi][0],0xFF,sizeof(_sb_maps[sbi]));
+      memset(_sb_maps[sbi],0xFF,sizeof(_sb_maps[sbi]));
       /*Fill in the fragment map for this super block.*/
       xfrag=yfrag+x;
       for(i=0;i<imax;i++){
@@ -186,10 +254,14 @@ static void oc_mb_fill_cmapping10(oc_mb_map_plane _mb_map[3],
   This version is for use with no chroma decimation (4:4:4).
   This uses the already filled-in luma plane values.
   _mb_map:  The macro block map to fill.
-  _fplanes: The descriptions of the fragment planes.*/
+  _fplanes: The descriptions of the fragment planes.
+  _xfrag0:  The X location of the upper-left hand fragment in the luma plane.
+  _yfrag0:  The Y location of the upper-left hand fragment in the luma plane.*/
 static void oc_mb_fill_cmapping11(oc_mb_map_plane _mb_map[3],
- const oc_fragment_plane _fplanes[3]){
+ const oc_fragment_plane _fplanes[3],int _xfrag0,int _yfrag0){
   int k;
+  (void)_xfrag0;
+  (void)_yfrag0;
   for(k=0;k<4;k++){
     _mb_map[1][k]=_mb_map[0][k]+_fplanes[1].froffset;
     _mb_map[2][k]=_mb_map[0][k]+_fplanes[2].froffset;
@@ -211,7 +283,7 @@ static const oc_mb_fill_cmapping_func OC_MB_FILL_CMAPPING_TABLE[4]={
   oc_mb_fill_cmapping00,
   oc_mb_fill_cmapping01,
   oc_mb_fill_cmapping10,
-  (oc_mb_fill_cmapping_func)oc_mb_fill_cmapping11
+  oc_mb_fill_cmapping11
 };
 
 /*Fills in the mapping from macro blocks to their corresponding fragment
@@ -469,7 +541,7 @@ static void oc_state_frarray_clear(oc_theora_state *_state){
    unrestricted motion vectors without special casing the boundary.
   If chroma is decimated in either direction, the padding is reduced by a
    factor of 2 on the appropriate sides.
-  _nrefs: The number of reference buffers to init; must be 3 or 4.*/
+  _nrefs: The number of reference buffers to init; must be in the range 3...6.*/
 static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
   th_info       *info;
   unsigned char *ref_frame_data;
@@ -481,6 +553,7 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
   int            yheight;
   int            chstride;
   int            cheight;
+  ptrdiff_t      align;
   ptrdiff_t      yoffset;
   ptrdiff_t      coffset;
   ptrdiff_t     *frag_buf_offs;
@@ -489,33 +562,38 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
   int            vdec;
   int            rfi;
   int            pli;
-  if(_nrefs<3||_nrefs>4)return TH_EINVAL;
+  if(_nrefs<3||_nrefs>6)return TH_EINVAL;
   info=&_state->info;
   /*Compute the image buffer parameters for each plane.*/
   hdec=!(info->pixel_fmt&1);
   vdec=!(info->pixel_fmt&2);
   yhstride=info->frame_width+2*OC_UMV_PADDING;
   yheight=info->frame_height+2*OC_UMV_PADDING;
-  chstride=yhstride>>hdec;
+  /*Require 16-byte aligned rows in the chroma planes.*/
+  chstride=(yhstride>>hdec)+15&~15;
   cheight=yheight>>vdec;
   yplane_sz=yhstride*(size_t)yheight;
   cplane_sz=chstride*(size_t)cheight;
   yoffset=OC_UMV_PADDING+OC_UMV_PADDING*(ptrdiff_t)yhstride;
   coffset=(OC_UMV_PADDING>>hdec)+(OC_UMV_PADDING>>vdec)*(ptrdiff_t)chstride;
-  ref_frame_sz=yplane_sz+2*cplane_sz;
+  /*Although we guarantee the rows of the chroma planes are a multiple of 16
+     bytes, the initial padding on the first row may only be 8 bytes.
+    Compute the offset needed to align the actual image data to 16 bytes.*/
+  align=-coffset&15;
+  ref_frame_sz=yplane_sz+2*cplane_sz+16;
   ref_frame_data_sz=_nrefs*ref_frame_sz;
   /*Check for overflow.
     The same caveats apply as for oc_state_frarray_init().*/
-  if(yplane_sz/yhstride!=yheight||2*cplane_sz<cplane_sz||
+  if(yplane_sz/yhstride!=(size_t)yheight||2*cplane_sz+16<cplane_sz||
    ref_frame_sz<yplane_sz||ref_frame_data_sz/_nrefs!=ref_frame_sz){
     return TH_EIMPL;
   }
-  ref_frame_data=_ogg_malloc(ref_frame_data_sz);
+  ref_frame_data=oc_aligned_malloc(ref_frame_data_sz,16);
   frag_buf_offs=_state->frag_buf_offs=
    _ogg_malloc(_state->nfrags*sizeof(*frag_buf_offs));
   if(ref_frame_data==NULL||frag_buf_offs==NULL){
     _ogg_free(frag_buf_offs);
-    _ogg_free(ref_frame_data);
+    oc_aligned_free(ref_frame_data);
     return TH_EFAULT;
   }
   /*Set up the width, height and stride for the image buffers.*/
@@ -532,15 +610,15 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
     memcpy(_state->ref_frame_bufs[rfi],_state->ref_frame_bufs[0],
      sizeof(_state->ref_frame_bufs[0]));
   }
+  _state->ref_frame_handle=ref_frame_data;
   /*Set up the data pointers for the image buffers.*/
   for(rfi=0;rfi<_nrefs;rfi++){
-    _state->ref_frame_data[rfi]=ref_frame_data;
     _state->ref_frame_bufs[rfi][0].data=ref_frame_data+yoffset;
-    ref_frame_data+=yplane_sz;
+    ref_frame_data+=yplane_sz+align;
     _state->ref_frame_bufs[rfi][1].data=ref_frame_data+coffset;
     ref_frame_data+=cplane_sz;
     _state->ref_frame_bufs[rfi][2].data=ref_frame_data+coffset;
-    ref_frame_data+=cplane_sz;
+    ref_frame_data+=cplane_sz+(16-align);
     /*Flip the buffer upside down.
       This allows us to decode Theora's bottom-up frames in their natural
        order, yet return a top-down buffer with a positive stride to the user.*/
@@ -550,7 +628,7 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
   _state->ref_ystride[0]=-yhstride;
   _state->ref_ystride[1]=_state->ref_ystride[2]=-chstride;
   /*Initialize the fragment buffer offsets.*/
-  ref_frame_data=_state->ref_frame_data[0];
+  ref_frame_data=_state->ref_frame_bufs[0][0].data;
   fragi=0;
   for(pli=0;pli<3;pli++){
     th_img_plane      *iplane;
@@ -576,41 +654,44 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
       vpix+=stride<<3;
     }
   }
-  /*Initialize the reference frame indices.*/
+  /*Initialize the reference frame pointers and indices.*/
   _state->ref_frame_idx[OC_FRAME_GOLD]=
    _state->ref_frame_idx[OC_FRAME_PREV]=
-   _state->ref_frame_idx[OC_FRAME_SELF]=-1;
-  _state->ref_frame_idx[OC_FRAME_IO]=_nrefs>3?3:-1;
+   _state->ref_frame_idx[OC_FRAME_GOLD_ORIG]=
+   _state->ref_frame_idx[OC_FRAME_PREV_ORIG]=
+   _state->ref_frame_idx[OC_FRAME_SELF]=
+   _state->ref_frame_idx[OC_FRAME_IO]=-1;
+  _state->ref_frame_data[OC_FRAME_GOLD]=
+   _state->ref_frame_data[OC_FRAME_PREV]=
+   _state->ref_frame_data[OC_FRAME_GOLD_ORIG]=
+   _state->ref_frame_data[OC_FRAME_PREV_ORIG]=
+   _state->ref_frame_data[OC_FRAME_SELF]=
+   _state->ref_frame_data[OC_FRAME_IO]=NULL;
   return 0;
 }
 
 static void oc_state_ref_bufs_clear(oc_theora_state *_state){
   _ogg_free(_state->frag_buf_offs);
-  _ogg_free(_state->ref_frame_data[0]);
+  oc_aligned_free(_state->ref_frame_handle);
 }
 
 
-void oc_state_vtable_init_c(oc_theora_state *_state){
+void oc_state_accel_init_c(oc_theora_state *_state){
+  _state->cpu_flags=0;
+#if defined(OC_STATE_USE_VTABLE)
   _state->opt_vtable.frag_copy=oc_frag_copy_c;
+  _state->opt_vtable.frag_copy_list=oc_frag_copy_list_c;
   _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;
   _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c;
   _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_c;
   _state->opt_vtable.idct8x8=oc_idct8x8_c;
   _state->opt_vtable.state_frag_recon=oc_state_frag_recon_c;
-  _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_c;
+  _state->opt_vtable.loop_filter_init=oc_loop_filter_init_c;
   _state->opt_vtable.state_loop_filter_frag_rows=
    oc_state_loop_filter_frag_rows_c;
   _state->opt_vtable.restore_fpu=oc_restore_fpu_c;
-  _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG;
-}
-
-/*Initialize the accelerated function pointers.*/
-void oc_state_vtable_init(oc_theora_state *_state){
-#if defined(OC_X86_ASM)
-  oc_state_vtable_init_x86(_state);
-#else
-  oc_state_vtable_init_c(_state);
 #endif
+  _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG;
 }
 
 
@@ -626,7 +707,8 @@ int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs){
      how it is specified in the bitstream, because the Y axis is flipped in
      the bitstream.
     The displayable frame must fit inside the encoded frame.
-    The color space must be one known by the encoder.*/
+    The color space must be one known by the encoder.
+    The framerate ratio must not contain a zero value.*/
   if((_info->frame_width&0xF)||(_info->frame_height&0xF)||
    _info->frame_width<=0||_info->frame_width>=0x100000||
    _info->frame_height<=0||_info->frame_height>=0x100000||
@@ -639,7 +721,8 @@ int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs){
       but there are a number of compilers which will mis-optimize this.
      It's better to live with the spurious warnings.*/
    _info->colorspace<0||_info->colorspace>=TH_CS_NSPACES||
-   _info->pixel_fmt<0||_info->pixel_fmt>=TH_PF_NFORMATS){
+   _info->pixel_fmt<0||_info->pixel_fmt>=TH_PF_NFORMATS||
+   _info->fps_numerator<1||_info->fps_denominator<1){
     return TH_EINVAL;
   }
   memset(_state,0,sizeof(*_state));
@@ -648,7 +731,7 @@ int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs){
      system.*/
   _state->info.pic_y=_info->frame_height-_info->pic_height-_info->pic_y;
   _state->frame_type=OC_UNKWN_FRAME;
-  oc_state_vtable_init(_state);
+  oc_state_accel_init(_state);
   ret=oc_state_frarray_init(_state);
   if(ret>=0)ret=oc_state_ref_bufs_init(_state,_nrefs);
   if(ret<0){
@@ -758,11 +841,10 @@ void oc_state_borders_fill(oc_theora_state *_state,int _refi){
             _offsets[1] is set if the motion vector has non-zero fractional
              components.
   _pli:     The color plane index.
-  _dx:      The X component of the motion vector.
-  _dy:      The Y component of the motion vector.
+  _mv:      The motion vector.
   Return: The number of offsets returned: 1 or 2.*/
 int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
- int _pli,int _dx,int _dy){
+ int _pli,oc_mv _mv){
   /*Here is a brief description of how Theora handles motion vectors:
     Motion vector components are specified to half-pixel accuracy in
      undecimated directions of each plane, and quarter-pixel accuracy in
@@ -785,21 +867,25 @@ int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
   int xfrac;
   int yfrac;
   int offs;
+  int dx;
+  int dy;
   ystride=_state->ref_ystride[_pli];
   /*These two variables decide whether we are in half- or quarter-pixel
      precision in each component.*/
   xprec=1+(_pli!=0&&!(_state->info.pixel_fmt&1));
   yprec=1+(_pli!=0&&!(_state->info.pixel_fmt&2));
+  dx=OC_MV_X(_mv);
+  dy=OC_MV_Y(_mv);
   /*These two variables are either 0 if all the fractional bits are zero or -1
      if any of them are non-zero.*/
-  xfrac=OC_SIGNMASK(-(_dx&(xprec|1)));
-  yfrac=OC_SIGNMASK(-(_dy&(yprec|1)));
-  offs=(_dx>>xprec)+(_dy>>yprec)*ystride;
+  xfrac=OC_SIGNMASK(-(dx&(xprec|1)));
+  yfrac=OC_SIGNMASK(-(dy&(yprec|1)));
+  offs=(dx>>xprec)+(dy>>yprec)*ystride;
   if(xfrac||yfrac){
     int xmask;
     int ymask;
-    xmask=OC_SIGNMASK(_dx);
-    ymask=OC_SIGNMASK(_dy);
+    xmask=OC_SIGNMASK(dx);
+    ymask=OC_SIGNMASK(dy);
     yfrac&=ystride;
     _offsets[0]=offs-(xfrac&xmask)+(yfrac&ymask);
     _offsets[1]=offs-(xfrac&~xmask)+(yfrac&~ymask);
@@ -848,13 +934,17 @@ int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
   int mx2;
   int my2;
   int offs;
+  int dx;
+  int dy;
   ystride=_state->ref_ystride[_pli];
   qpy=_pli!=0&&!(_state->info.pixel_fmt&2);
-  my=OC_MVMAP[qpy][_dy+31];
-  my2=OC_MVMAP2[qpy][_dy+31];
+  dx=OC_MV_X(_mv);
+  dy=OC_MV_Y(_mv);
+  my=OC_MVMAP[qpy][dy+31];
+  my2=OC_MVMAP2[qpy][dy+31];
   qpx=_pli!=0&&!(_state->info.pixel_fmt&1);
-  mx=OC_MVMAP[qpx][_dx+31];
-  mx2=OC_MVMAP2[qpx][_dx+31];
+  mx=OC_MVMAP[qpx][dx+31];
+  mx2=OC_MVMAP2[qpx][dx+31];
   offs=my*ystride+mx;
   if(mx2||my2){
     _offsets[1]=offs+my2*ystride+mx2;
@@ -866,18 +956,12 @@ int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
 #endif
 }
 
-void oc_state_frag_recon(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
-  _state->opt_vtable.state_frag_recon(_state,_fragi,_pli,_dct_coeffs,
-   _last_zzi,_dc_quant);
-}
-
 void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
   unsigned char *dst;
   ptrdiff_t      frag_buf_off;
   int            ystride;
-  int            mb_mode;
+  int            refi;
   /*Apply the inverse transform.*/
   /*Special case only having a DC component.*/
   if(_last_zzi<2){
@@ -887,69 +971,35 @@ void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
        no iDCT rounding.*/
     p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
     /*LOOP VECTORIZES.*/
-    for(ci=0;ci<64;ci++)_dct_coeffs[ci]=p;
+    for(ci=0;ci<64;ci++)_dct_coeffs[64+ci]=p;
   }
   else{
     /*First, dequantize the DC coefficient.*/
     _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
-    oc_idct8x8(_state,_dct_coeffs,_last_zzi);
+    oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi);
   }
   /*Fill in the target buffer.*/
   frag_buf_off=_state->frag_buf_offs[_fragi];
-  mb_mode=_state->frags[_fragi].mb_mode;
+  refi=_state->frags[_fragi].refi;
   ystride=_state->ref_ystride[_pli];
-  dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
-  if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra(_state,dst,ystride,_dct_coeffs);
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra(_state,dst,ystride,_dct_coeffs+64);
   else{
     const unsigned char *ref;
     int                  mvoffsets[2];
-    ref=
-     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
-     +frag_buf_off;
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
-     _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
+     _state->frag_mvs[_fragi])>1){
       oc_frag_recon_inter2(_state,
-       dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,_dct_coeffs);
+       dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,_dct_coeffs+64);
+    }
+    else{
+      oc_frag_recon_inter(_state,dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
     }
-    else oc_frag_recon_inter(_state,dst,ref+mvoffsets[0],ystride,_dct_coeffs);
-  }
-}
-
-/*Copies the fragments specified by the lists of fragment indices from one
-   frame to another.
-  _fragis:    A pointer to a list of fragment indices.
-  _nfragis:   The number of fragment indices to copy.
-  _dst_frame: The reference frame to copy to.
-  _src_frame: The reference frame to copy from.
-  _pli:       The color plane the fragments lie in.*/
-void oc_state_frag_copy_list(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli){
-  _state->opt_vtable.state_frag_copy_list(_state,_fragis,_nfragis,_dst_frame,
-   _src_frame,_pli);
-}
-
-void oc_state_frag_copy_list_c(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli){
-  const ptrdiff_t     *frag_buf_offs;
-  const unsigned char *src_frame_data;
-  unsigned char       *dst_frame_data;
-  ptrdiff_t            fragii;
-  int                  ystride;
-  dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
-  src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
-  ystride=_state->ref_ystride[_pli];
-  frag_buf_offs=_state->frag_buf_offs;
-  for(fragii=0;fragii<_nfragis;fragii++){
-    ptrdiff_t frag_buf_off;
-    frag_buf_off=frag_buf_offs[_fragis[fragii]];
-    oc_frag_copy(_state,dst_frame_data+frag_buf_off,
-     src_frame_data+frag_buf_off,ystride);
   }
 }
 
-static void loop_filter_h(unsigned char *_pix,int _ystride,int *_bv){
+static void loop_filter_h(unsigned char *_pix,int _ystride,signed char *_bv){
   int y;
   _pix-=2;
   for(y=0;y<8;y++){
@@ -965,7 +1015,7 @@ static void loop_filter_h(unsigned char *_pix,int _ystride,int *_bv){
   }
 }
 
-static void loop_filter_v(unsigned char *_pix,int _ystride,int *_bv){
+static void loop_filter_v(unsigned char *_pix,int _ystride,signed char *_bv){
   int x;
   _pix-=_ystride*2;
   for(x=0;x<8;x++){
@@ -982,20 +1032,16 @@ static void loop_filter_v(unsigned char *_pix,int _ystride,int *_bv){
 
 /*Initialize the bounding values array used by the loop filter.
   _bv: Storage for the array.
-  Return: 0 on success, or a non-zero value if no filtering need be applied.*/
-int oc_state_loop_filter_init(oc_theora_state *_state,int _bv[256]){
-  int flimit;
+  _flimit: The filter limit as defined in Section 7.10 of the spec.*/
+void oc_loop_filter_init_c(signed char _bv[256],int _flimit){
   int i;
-  flimit=_state->loop_filter_limits[_state->qis[0]];
-  if(flimit==0)return 1;
   memset(_bv,0,sizeof(_bv[0])*256);
-  for(i=0;i<flimit;i++){
-    if(127-i-flimit>=0)_bv[127-i-flimit]=i-flimit;
-    _bv[127-i]=-i;
-    _bv[127+i]=i;
-    if(127+i+flimit<256)_bv[127+i+flimit]=flimit-i;
+  for(i=0;i<_flimit;i++){
+    if(127-i-_flimit>=0)_bv[127-i-_flimit]=(signed char)(i-_flimit);
+    _bv[127-i]=(signed char)(-i);
+    _bv[127+i]=(signed char)(i);
+    if(127+i+_flimit<256)_bv[127+i+_flimit]=(signed char)(_flimit-i);
   }
-  return 0;
 }
 
 /*Apply the loop filter to a given set of fragment rows in the given plane.
@@ -1006,14 +1052,8 @@ int oc_state_loop_filter_init(oc_theora_state *_state,int _bv[256]){
   _pli:       The color plane to filter.
   _fragy0:    The Y coordinate of the first fragment row to filter.
   _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
-void oc_state_loop_filter_frag_rows(const oc_theora_state *_state,int _bv[256],
- int _refi,int _pli,int _fragy0,int _fragy_end){
-  _state->opt_vtable.state_loop_filter_frag_rows(_state,_bv,_refi,_pli,
-   _fragy0,_fragy_end);
-}
-
-void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,int *_bv,
- int _refi,int _pli,int _fragy0,int _fragy_end){
+void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,
+ signed char *_bv,int _refi,int _pli,int _fragy0,int _fragy_end){
   const oc_fragment_plane *fplane;
   const oc_fragment       *frags;
   const ptrdiff_t         *frag_buf_offs;
@@ -1030,7 +1070,7 @@ void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,int *_bv,
   fragi_top=fplane->froffset;
   fragi_bot=fragi_top+fplane->nfrags;
   fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
-  fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
+  fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
   ystride=_state->ref_ystride[_pli];
   frags=_state->frags;
   frag_buf_offs=_state->frag_buf_offs;
diff --git a/thirdparty/libtheora/state.h b/thirdparty/libtheora/state.h
new file mode 100644
index 0000000000..f176a53ce9
--- /dev/null
+++ b/thirdparty/libtheora/state.h
@@ -0,0 +1,552 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: internal.h 17337 2010-07-19 16:08:54Z tterribe $
+
+ ********************************************************************/
+#if !defined(_state_H)
+# define _state_H (1)
+# include "internal.h"
+# include "huffman.h"
+# include "quant.h"
+
+
+
+/*A single quadrant of the map from a super block to fragment numbers.*/
+typedef ptrdiff_t       oc_sb_map_quad[4];
+/*A map from a super block to fragment numbers.*/
+typedef oc_sb_map_quad  oc_sb_map[4];
+/*A single plane of the map from a macro block to fragment numbers.*/
+typedef ptrdiff_t       oc_mb_map_plane[4];
+/*A map from a macro block to fragment numbers.*/
+typedef oc_mb_map_plane oc_mb_map[3];
+/*A motion vector.*/
+typedef ogg_int16_t     oc_mv;
+
+typedef struct oc_sb_flags              oc_sb_flags;
+typedef struct oc_border_info           oc_border_info;
+typedef struct oc_fragment              oc_fragment;
+typedef struct oc_fragment_plane        oc_fragment_plane;
+typedef struct oc_base_opt_vtable       oc_base_opt_vtable;
+typedef struct oc_base_opt_data         oc_base_opt_data;
+typedef struct oc_state_dispatch_vtable oc_state_dispatch_vtable;
+typedef struct oc_theora_state          oc_theora_state;
+
+
+
+/*Shared accelerated functions.*/
+# if defined(OC_X86_ASM)
+#  if defined(_MSC_VER)
+#   include "x86_vc/x86int.h"
+#  else
+#   include "x86/x86int.h"
+#  endif
+# endif
+# if defined(OC_ARM_ASM)
+#  include "arm/armint.h"
+# endif
+# if defined(OC_C64X_ASM)
+#  include "c64x/c64xint.h"
+# endif
+
+# if !defined(oc_state_accel_init)
+#  define oc_state_accel_init oc_state_accel_init_c
+# endif
+# if defined(OC_STATE_USE_VTABLE)
+#  if !defined(oc_frag_copy)
+#   define oc_frag_copy(_state,_dst,_src,_ystride) \
+  ((*(_state)->opt_vtable.frag_copy)(_dst,_src,_ystride))
+#  endif
+#  if !defined(oc_frag_copy_list)
+#   define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs) \
+ ((*(_state)->opt_vtable.frag_copy_list)(_dst_frame,_src_frame,_ystride, \
+  _fragis,_nfragis,_frag_buf_offs))
+#  endif
+#  if !defined(oc_frag_recon_intra)
+#   define oc_frag_recon_intra(_state,_dst,_dst_ystride,_residue) \
+  ((*(_state)->opt_vtable.frag_recon_intra)(_dst,_dst_ystride,_residue))
+#  endif
+#  if !defined(oc_frag_recon_inter)
+#   define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
+  ((*(_state)->opt_vtable.frag_recon_inter)(_dst,_src,_ystride,_residue))
+#  endif
+#  if !defined(oc_frag_recon_inter2)
+#   define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
+  ((*(_state)->opt_vtable.frag_recon_inter2)(_dst, \
+   _src1,_src2,_ystride,_residue))
+#  endif
+# if !defined(oc_idct8x8)
+#   define oc_idct8x8(_state,_y,_x,_last_zzi) \
+  ((*(_state)->opt_vtable.idct8x8)(_y,_x,_last_zzi))
+#  endif
+#  if !defined(oc_state_frag_recon)
+#   define oc_state_frag_recon(_state,_fragi, \
+ _pli,_dct_coeffs,_last_zzi,_dc_quant) \
+  ((*(_state)->opt_vtable.state_frag_recon)(_state,_fragi, \
+   _pli,_dct_coeffs,_last_zzi,_dc_quant))
+#  endif
+#  if !defined(oc_loop_filter_init)
+#   define oc_loop_filter_init(_state,_bv,_flimit) \
+  ((*(_state)->opt_vtable.loop_filter_init)(_bv,_flimit))
+#  endif
+#  if !defined(oc_state_loop_filter_frag_rows)
+#   define oc_state_loop_filter_frag_rows(_state, \
+ _bv,_refi,_pli,_fragy0,_fragy_end) \
+  ((*(_state)->opt_vtable.state_loop_filter_frag_rows)(_state, \
+   _bv,_refi,_pli,_fragy0,_fragy_end))
+#  endif
+#  if !defined(oc_restore_fpu)
+#   define oc_restore_fpu(_state) \
+  ((*(_state)->opt_vtable.restore_fpu)())
+#  endif
+# else
+#  if !defined(oc_frag_copy)
+#   define oc_frag_copy(_state,_dst,_src,_ystride) \
+  oc_frag_copy_c(_dst,_src,_ystride)
+#  endif
+#  if !defined(oc_frag_copy_list)
+#   define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs) \
+  oc_frag_copy_list_c(_dst_frame,_src_frame,_ystride, \
+  _fragis,_nfragis,_frag_buf_offs)
+#  endif
+#  if !defined(oc_frag_recon_intra)
+#   define oc_frag_recon_intra(_state,_dst,_dst_ystride,_residue) \
+  oc_frag_recon_intra_c(_dst,_dst_ystride,_residue)
+#  endif
+#  if !defined(oc_frag_recon_inter)
+#   define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
+  oc_frag_recon_inter_c(_dst,_src,_ystride,_residue)
+#  endif
+#  if !defined(oc_frag_recon_inter2)
+#   define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
+  oc_frag_recon_inter2_c(_dst,_src1,_src2,_ystride,_residue)
+#  endif
+#  if !defined(oc_idct8x8)
+#   define oc_idct8x8(_state,_y,_x,_last_zzi) oc_idct8x8_c(_y,_x,_last_zzi)
+#  endif
+#  if !defined(oc_state_frag_recon)
+#   define oc_state_frag_recon oc_state_frag_recon_c
+#  endif
+#  if !defined(oc_loop_filter_init)
+#   define oc_loop_filter_init(_state,_bv,_flimit) \
+  oc_loop_filter_init_c(_bv,_flimit)
+#  endif
+#  if !defined(oc_state_loop_filter_frag_rows)
+#   define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_c
+#  endif
+#  if !defined(oc_restore_fpu)
+#   define oc_restore_fpu(_state) do{}while(0)
+#  endif
+# endif
+
+
+
+/*A keyframe.*/
+# define OC_INTRA_FRAME (0)
+/*A predicted frame.*/
+# define OC_INTER_FRAME (1)
+/*A frame of unknown type (frame type decision has not yet been made).*/
+# define OC_UNKWN_FRAME (-1)
+
+/*The amount of padding to add to the reconstructed frame buffers on all
+   sides.
+  This is used to allow unrestricted motion vectors without special casing.
+  This must be a multiple of 2.*/
+# define OC_UMV_PADDING (16)
+
+/*Frame classification indices.*/
+/*The previous golden frame.*/
+# define OC_FRAME_GOLD      (0)
+/*The previous frame.*/
+# define OC_FRAME_PREV      (1)
+/*The current frame.*/
+# define OC_FRAME_SELF      (2)
+/*Used to mark uncoded fragments (for DC prediction).*/
+# define OC_FRAME_NONE      (3)
+
+/*The input or output buffer.*/
+# define OC_FRAME_IO        (3)
+/*Uncompressed prev golden frame.*/
+# define OC_FRAME_GOLD_ORIG (4)
+/*Uncompressed previous frame. */
+# define OC_FRAME_PREV_ORIG (5)
+
+/*Macroblock modes.*/
+/*Macro block is invalid: It is never coded.*/
+# define OC_MODE_INVALID        (-1)
+/*Encoded difference from the same macro block in the previous frame.*/
+# define OC_MODE_INTER_NOMV     (0)
+/*Encoded with no motion compensated prediction.*/
+# define OC_MODE_INTRA          (1)
+/*Encoded difference from the previous frame offset by the given motion
+   vector.*/
+# define OC_MODE_INTER_MV       (2)
+/*Encoded difference from the previous frame offset by the last coded motion
+   vector.*/
+# define OC_MODE_INTER_MV_LAST  (3)
+/*Encoded difference from the previous frame offset by the second to last
+   coded motion vector.*/
+# define OC_MODE_INTER_MV_LAST2 (4)
+/*Encoded difference from the same macro block in the previous golden
+   frame.*/
+# define OC_MODE_GOLDEN_NOMV    (5)
+/*Encoded difference from the previous golden frame offset by the given motion
+   vector.*/
+# define OC_MODE_GOLDEN_MV      (6)
+/*Encoded difference from the previous frame offset by the individual motion
+   vectors given for each block.*/
+# define OC_MODE_INTER_MV_FOUR  (7)
+/*The number of (coded) modes.*/
+# define OC_NMODES              (8)
+
+/*Determines the reference frame used for a given MB mode.*/
+# define OC_FRAME_FOR_MODE(_x) \
+ OC_UNIBBLE_TABLE32(OC_FRAME_PREV,OC_FRAME_SELF,OC_FRAME_PREV,OC_FRAME_PREV, \
+  OC_FRAME_PREV,OC_FRAME_GOLD,OC_FRAME_GOLD,OC_FRAME_PREV,(_x))
+
+/*Constants for the packet state machine common between encoder and decoder.*/
+
+/*Next packet to emit/read: Codec info header.*/
+# define OC_PACKET_INFO_HDR    (-3)
+/*Next packet to emit/read: Comment header.*/
+# define OC_PACKET_COMMENT_HDR (-2)
+/*Next packet to emit/read: Codec setup header.*/
+# define OC_PACKET_SETUP_HDR   (-1)
+/*No more packets to emit/read.*/
+# define OC_PACKET_DONE        (INT_MAX)
+
+
+/*An oc_mv packs x into its low 8 bits and y into the bits above them.*/
+#define OC_MV(_x,_y)         ((oc_mv)((_x)&0xFF|(_y)<<8))
+#define OC_MV_X(_mv)         ((signed char)(_mv))
+#define OC_MV_Y(_mv)         ((_mv)>>8)
+#define OC_MV_ADD(_mv1,_mv2) \
+  OC_MV(OC_MV_X(_mv1)+OC_MV_X(_mv2), \
+   OC_MV_Y(_mv1)+OC_MV_Y(_mv2))
+#define OC_MV_SUB(_mv1,_mv2) \
+  OC_MV(OC_MV_X(_mv1)-OC_MV_X(_mv2), \
+   OC_MV_Y(_mv1)-OC_MV_Y(_mv2))
+
+
+
+/*Super blocks are 32x32 segments of pixels in a single color plane indexed
+   in image order.
+  Internally, super blocks are broken up into four quadrants, each of which
+   contains a 2x2 pattern of blocks, each of which is an 8x8 block of pixels.
+  Quadrants, and the blocks within them, are indexed in a special order called
+   a "Hilbert curve" within the super block.
+
+  In order to differentiate between the Hilbert-curve indexing strategy and
+   the regular image order indexing strategy, blocks indexed in image order
+   are called "fragments".
+  Fragments are indexed in image order, left to right, then bottom to top,
+   from Y' plane to Cb plane to Cr plane.
+
+  The co-located fragments in all image planes corresponding to the location
+   of a single quadrant of a luma plane super block form a macro block.
+  Thus there is only a single set of macro blocks for all planes, each of which
+   contains between 6 and 12 fragments, depending on the pixel format.
+  Therefore macro block information is kept in a separate set of arrays from
+   super blocks to avoid unused space in the other planes.
+  The lists are indexed in super block order.
+  That is, the macro block corresponding to the macro block mbi in (luma plane)
+   super block sbi is at index (sbi<<2|mbi).
+  Thus the number of macro blocks in each dimension is always twice the number
+   of super blocks, even when only an odd number fall inside the coded frame.
+  These "extra" macro blocks are just an artifact of our internal data layout,
+   and not part of the coded stream; they are flagged with a negative MB mode.*/
+
+
+
+/*Super block information.*/
+struct oc_sb_flags{
+  unsigned char coded_fully:1;
+  unsigned char coded_partially:1;
+  unsigned char quad_valid:4;
+};
+
+
+
+/*Information about a fragment which intersects the border of the displayable
+   region.
+  This marks which pixels belong to the displayable region.*/
+struct oc_border_info{
+  /*A bit mask marking which pixels are in the displayable region.
+    Pixel (x,y) corresponds to bit (y<<3|x).*/
+  ogg_int64_t mask;
+  /*The number of pixels in the displayable region.
+    This is always positive, and always less than 64.*/
+  int         npixels;
+};
+
+
+
+/*Fragment information.*/
+struct oc_fragment{
+  /*A flag indicating whether or not this fragment is coded.*/
+  unsigned   coded:1;
+  /*A flag indicating that this entire fragment lies outside the displayable
+     region of the frame.
+    Note the contrast with an invalid macro block, which is outside the coded
+     frame, not just the displayable one.
+    There are no fragments outside the coded frame by construction.*/
+  unsigned   invalid:1;
+  /*The index of the quality index used for this fragment's AC coefficients.*/
+  unsigned   qii:4;
+  /*The index of the reference frame this fragment is predicted from.*/
+  unsigned   refi:2;
+  /*The mode of the macroblock this fragment belongs to.*/
+  unsigned   mb_mode:3;
+  /*The index of the associated border information for fragments which lie
+     partially outside the displayable region.
+    For fragments completely inside or outside this region, this is -1.
+    Note that the C standard requires an explicit signed keyword for bitfield
+     types, since some compilers may treat them as unsigned without it.*/
+  signed int borderi:5;
+  /*The prediction-corrected DC component.
+    Note that the C standard requires an explicit signed keyword for bitfield
+     types, since some compilers may treat them as unsigned without it.*/
+  signed int dc:16;
+};
+
+
+
+/*A description of each fragment plane.*/
+struct oc_fragment_plane{
+  /*The number of fragments in the horizontal direction.*/
+  int       nhfrags;
+  /*The number of fragments in the vertical direction.*/
+  int       nvfrags;
+  /*The offset of the first fragment in the plane.*/
+  ptrdiff_t froffset;
+  /*The total number of fragments in the plane.*/
+  ptrdiff_t nfrags;
+  /*The number of super blocks in the horizontal direction.*/
+  unsigned  nhsbs;
+  /*The number of super blocks in the vertical direction.*/
+  unsigned  nvsbs;
+  /*The offset of the first super block in the plane.*/
+  unsigned  sboffset;
+  /*The total number of super blocks in the plane.*/
+  unsigned  nsbs;
+};
+
+/*The type of the state_loop_filter_frag_rows entry in oc_base_opt_vtable.*/
+typedef void (*oc_state_loop_filter_frag_rows_func)(
+ const oc_theora_state *_state,signed char _bv[256],int _refi,int _pli,
+ int _fragy0,int _fragy_end);
+
+/*The shared (encoder and decoder) functions that have accelerated variants.*/
+struct oc_base_opt_vtable{
+  void (*frag_copy)(unsigned char *_dst,
+   const unsigned char *_src,int _ystride);
+  void (*frag_copy_list)(unsigned char *_dst_frame,
+   const unsigned char *_src_frame,int _ystride,
+   const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+  void (*frag_recon_intra)(unsigned char *_dst,int _ystride,
+   const ogg_int16_t _residue[64]);
+  void (*frag_recon_inter)(unsigned char *_dst,
+   const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
+  void (*frag_recon_inter2)(unsigned char *_dst,const unsigned char *_src1,
+   const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
+  void (*idct8x8)(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+  void (*state_frag_recon)(const oc_theora_state *_state,ptrdiff_t _fragi,
+   int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+  void (*loop_filter_init)(signed char _bv[256],int _flimit);
+  oc_state_loop_filter_frag_rows_func state_loop_filter_frag_rows;
+  void (*restore_fpu)(void);
+};
+
+/*The shared (encoder and decoder) tables that vary according to which variants
+   of the above functions are used.*/
+struct oc_base_opt_data{
+  const unsigned char *dct_fzig_zag;
+};
+
+
+/*State information common to both the encoder and decoder.*/
+struct oc_theora_state{
+  /*The stream information.*/
+  th_info             info;
+# if defined(OC_STATE_USE_VTABLE)
+  /*Table for shared accelerated functions.*/
+  oc_base_opt_vtable  opt_vtable;
+# endif
+  /*Table for shared data used by accelerated functions.*/
+  oc_base_opt_data    opt_data;
+  /*CPU flags to detect the presence of extended instruction sets.*/
+  ogg_uint32_t        cpu_flags;
+  /*The fragment plane descriptions.*/
+  oc_fragment_plane   fplanes[3];
+  /*The list of fragments, indexed in image order.*/
+  oc_fragment        *frags;
+  /*The offset into the reference frame buffer to the upper-left pixel of
+     each fragment.*/
+  ptrdiff_t          *frag_buf_offs;
+  /*The motion vector for each fragment.*/
+  oc_mv              *frag_mvs;
+  /*The total number of fragments in a single frame.*/
+  ptrdiff_t           nfrags;
+  /*The list of super block maps, indexed in image order.*/
+  oc_sb_map          *sb_maps;
+  /*The list of super block flags, indexed in image order.*/
+  oc_sb_flags        *sb_flags;
+  /*The total number of super blocks in a single frame.*/
+  unsigned            nsbs;
+  /*The fragments from each color plane that belong to each macro block.
+    Fragments are stored in image order (left to right then top to bottom).
+    When chroma components are decimated, the extra fragments have an index of
+     -1.*/
+  oc_mb_map          *mb_maps;
+  /*The list of macro block modes.
+    A negative number indicates the macro block lies entirely outside the
+     coded frame.*/
+  signed char        *mb_modes;
+  /*The number of macro blocks in the X direction.*/
+  unsigned            nhmbs;
+  /*The number of macro blocks in the Y direction.*/
+  unsigned            nvmbs;
+  /*The total number of macro blocks.*/
+  size_t              nmbs;
+  /*The list of coded fragments, in coded order.
+    Uncoded fragments are stored in reverse order from the end of the list.*/
+  ptrdiff_t          *coded_fragis;
+  /*The number of coded fragments in each plane.*/
+  ptrdiff_t           ncoded_fragis[3];
+  /*The total number of coded fragments.*/
+  ptrdiff_t           ntotal_coded_fragis;
+  /*The actual buffers used for the reference frames.*/
+  th_ycbcr_buffer     ref_frame_bufs[6];
+  /*The index of the buffers being used for each OC_FRAME_* reference frame.*/
+  int                 ref_frame_idx[6];
+  /*The storage for the reference frame buffers.
+    This is just ref_frame_bufs[ref_frame_idx[i]][0].data, but is cached here
+     for faster look-up.*/
+  unsigned char      *ref_frame_data[6];
+  /*The handle used to allocate the reference frame buffers.*/
+  unsigned char      *ref_frame_handle;
+  /*The strides for each plane in the reference frames.*/
+  int                 ref_ystride[3];
+  /*The number of unique border patterns.*/
+  int                 nborders;
+  /*The unique border patterns for all border fragments.
+    The borderi field of fragments which straddle the border indexes this
+     list.*/
+  oc_border_info      borders[16];
+  /*The frame number of the last keyframe.*/
+  ogg_int64_t         keyframe_num;
+  /*The frame number of the current frame.*/
+  ogg_int64_t         curframe_num;
+  /*The granpos of the current frame.*/
+  ogg_int64_t         granpos;
+  /*The type of the current frame.*/
+  signed char         frame_type;
+  /*The bias to add to the frame count when computing granule positions.*/
+  unsigned char       granpos_bias;
+  /*The number of quality indices used in the current frame.*/
+  unsigned char       nqis;
+  /*The quality indices of the current frame.*/
+  unsigned char       qis[3];
+  /*The dequantization tables, stored in zig-zag order, and indexed by
+     qi, pli, qti, and zzi.*/
+  ogg_uint16_t       *dequant_tables[64][3][2];
+  OC_ALIGN16(oc_quant_table      dequant_table_data[64][3][2]);
+  /*Loop filter strength parameters.*/
+  unsigned char       loop_filter_limits[64];
+};
+
+
+
+/*The function type used to fill in the chroma plane motion vectors for a
+   macro block when 4 different motion vectors are specified in the luma
+   plane.
+  _cbmvs: The chroma block-level motion vectors to fill in.
+  _lbmvs: The luma block-level motion vectors.
+  NOTE(review): This comment used to describe an _lmbmv argument as well; the
+   signature below takes only _cbmvs and _lbmvs.*/
+typedef void (*oc_set_chroma_mvs_func)(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]);
+
+
+
+/*A table of functions used to fill in the Cb,Cr plane motion vectors for a
+   macro block when 4 different motion vectors are specified in the luma
+   plane.*/
+extern const oc_set_chroma_mvs_func OC_SET_CHROMA_MVS_TABLE[TH_PF_NFORMATS];
+
+
+
+int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs);
+void oc_state_clear(oc_theora_state *_state);
+void oc_state_accel_init_c(oc_theora_state *_state);
+void oc_state_borders_fill_rows(oc_theora_state *_state,int _refi,int _pli,
+ int _y0,int _yend);
+void oc_state_borders_fill_caps(oc_theora_state *_state,int _refi,int _pli);
+void oc_state_borders_fill(oc_theora_state *_state,int _refi);
+void oc_state_fill_buffer_ptrs(oc_theora_state *_state,int _buf_idx,
+ th_ycbcr_buffer _img);
+int oc_state_mbi_for_pos(oc_theora_state *_state,int _mbx,int _mby);
+int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
+ int _pli,oc_mv _mv);
+
+void oc_loop_filter_init_c(signed char _bv[256],int _flimit);
+void oc_state_loop_filter(oc_theora_state *_state,int _frame);
+# if defined(OC_DUMP_IMAGES)
+int oc_state_dump_frame(const oc_theora_state *_state,int _frame,
+ const char *_suf);
+# endif
+
+/*Default pure-C implementations of shared accelerated functions.*/
+void oc_frag_copy_c(unsigned char *_dst,
+ const unsigned char *_src,int _src_ystride);
+void oc_frag_copy_list_c(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+void oc_frag_recon_intra_c(unsigned char *_dst,int _dst_ystride,
+ const ogg_int16_t _residue[64]);
+void oc_frag_recon_inter_c(unsigned char *_dst,
+ const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
+void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
+void oc_idct8x8_c(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+void oc_restore_fpu_c(void);
+
+/*We need a way to call a few encoder functions without introducing a link-time
+   dependency into the decoder, while still allowing the old alpha API which
+   does not distinguish between encoder and decoder objects to be used.
+  We do this by placing a function table at the start of the encoder object
+   which can dispatch into the encoder library.
+  We do a similar thing for the decoder in case we ever decide to split off a
+   common base library.*/
+typedef void (*oc_state_clear_func)(theora_state *_th);
+typedef int (*oc_state_control_func)(theora_state *th,int _req,
+ void *_buf,size_t _buf_sz);
+typedef ogg_int64_t (*oc_state_granule_frame_func)(theora_state *_th,
+ ogg_int64_t _granulepos);
+typedef double (*oc_state_granule_time_func)(theora_state *_th,
+ ogg_int64_t _granulepos);
+
+/*The dispatch function table described above, one slot per function type.*/
+struct oc_state_dispatch_vtable{
+  oc_state_clear_func         clear;
+  oc_state_control_func       control;
+  oc_state_granule_frame_func granule_frame;
+  oc_state_granule_time_func  granule_time;
+};
+
+#endif
diff --git a/thirdparty/libtheora/theora/codec.h b/thirdparty/libtheora/theora/codec.h
index 5c2669630c..29b8602325 100644
--- a/thirdparty/libtheora/theora/codec.h
+++ b/thirdparty/libtheora/theora/codec.h
@@ -16,11 +16,12 @@
  ********************************************************************/
 
 /**\mainpage
- * 
+ *
  * \section intro Introduction
  *
- * This is the documentation for <tt>libtheora</tt> C API.
- * The current reference
+ * This is the documentation for the <tt>libtheora</tt> C API.
+ *
+ * The \c libtheora package is the current reference
  * implementation for <a href="http://www.theora.org/">Theora</a>, a free,
  * patent-unencumbered video codec.
  * Theora is derived from On2's VP3 codec with additional features and
@@ -30,29 +31,31 @@
  * <a href="http://www.theora.org/doc/Theora.pdf">the Theora
  *  specification</a>.
  *
- * \subsection Organization
+ * \section Organization
  *
- * The functions documented here are actually subdivided into three 
+ * The functions documented here are divided between two
  * separate libraries:
- * - <tt>libtheoraenc</tt> contains the encoder interface,
+ * - \c libtheoraenc contains the encoder interface,
  *   described in \ref encfuncs.
- * - <tt>libtheoradec</tt> contains the decoder interface and
- *   routines shared with the encoder.
- *   You must also link to this if you link to <tt>libtheoraenc</tt>.
- *   The routines in this library are described in \ref decfuncs and 
- *   \ref basefuncs.
- * - <tt>libtheora</tt> contains the \ref oldfuncs.
+ * - \c libtheoradec contains the decoder interface,
+ *   described in \ref decfuncs, \n
+ *   and additional \ref basefuncs.
+ *
+ * New code should link to \c libtheoradec. If using encoder
+ * features, it must also link to \c libtheoraenc.
  *
- * New code should link to <tt>libtheoradec</tt> and, if using encoder
- * features, <tt>libtheoraenc</tt>. Together these two export both
- * the standard and the legacy API, so this is all that is needed by
- * any code. The older <tt>libtheora</tt> library is provided just for
- * compatibility with older build configurations.
+ * During initial development, prior to the 1.0 release,
+ * \c libtheora exported a different \ref oldfuncs which
+ * combined both encode and decode functions.
+ * In general, legacy API symbols can be identified
+ * by their \c theora_ or \c OC_ namespace prefixes.
+ * The current API uses \c th_ or \c TH_ instead.
  *
- * In general the recommended 1.x API symbols can be distinguished
- * by their <tt>th_</tt> or <tt>TH_</tt> namespace prefix.
- * The older, legacy API uses <tt>theora_</tt> or <tt>OC_</tt>
- * prefixes instead.
+ * While deprecated, \c libtheoraenc and \c libtheoradec
+ * together export the legacy API as well as the one documented above.
+ * Likewise, the legacy \c libtheora included with this package
+ * exports the new 1.x API. Older code and build scripts can therefore
+ * be updated independently to the current scheme.
  */
 
 /**\file
@@ -168,7 +171,7 @@ typedef struct{
 typedef th_img_plane th_ycbcr_buffer[3];
 
 /**Theora bitstream information.
- * This contains the basic playback parameters for a stream, and corresponds to 
+ * This contains the basic playback parameters for a stream, and corresponds to
  *  the initial 'info' header packet.
  * To initialize an encoder, the application fills in this structure and
  *  passes it to th_encode_alloc().
@@ -317,7 +320,7 @@ typedef struct{
  * In filling in this structure, th_decode_headerin() will null-terminate
  *  the user_comment strings for safety.
  * However, the bitstream format itself treats them as 8-bit clean vectors,
- *  possibly containing null characters, and so the length array should be
+ *  possibly containing null characters, so the length array should be
  *  treated as their authoritative length.
  */
 typedef struct th_comment{
@@ -448,7 +451,13 @@ typedef struct{
 
 /**\defgroup basefuncs Functions Shared by Encode and Decode*/
 /*@{*/
-/**\name Basic shared functions*/
+/**\name Basic shared functions
+ * These functions return information about the library itself,
+ * or provide high-level information about codec state
+ * and packet type.
+ *
+ * You must link to \c libtheoradec if you use any of the
+ * functions in this section.*/
 /*@{*/
 /**Retrieves a human-readable string to identify the library vendor and
  *  version.
@@ -510,7 +519,12 @@ extern int th_packet_iskeyframe(ogg_packet *_op);
 /*@}*/
 
 
-/**\name Functions for manipulating header data*/
+/**\name Functions for manipulating header data
+ * These functions manipulate the #th_info and #th_comment structures
+ * which describe video parameters and key-value metadata, respectively.
+ *
+ * You must link to \c libtheoradec if you use any of the
+ * functions in this section.*/
 /*@{*/
 /**Initializes a th_info structure.
  * This should be called on a freshly allocated #th_info structure before
@@ -537,7 +551,7 @@ extern void th_comment_init(th_comment *_tc);
  * \param _tc      The #th_comment struct to add the comment to.
  * \param _comment Must be a null-terminated UTF-8 string containing the
  *                  comment in "TAG=the value" form.*/
-extern void th_comment_add(th_comment *_tc, char *_comment);
+extern void th_comment_add(th_comment *_tc,const char *_comment);
 /**Add a comment to an initialized #th_comment structure.
  * \note Neither th_comment_add() nor th_comment_add_tag() support
  *  comments containing null values, although the bitstream format does
@@ -545,10 +559,11 @@ extern void th_comment_add(th_comment *_tc, char *_comment);
  * To add such comments you will need to manipulate the #th_comment
  *  structure directly.
  * \param _tc  The #th_comment struct to add the comment to.
- * \param _tag A null-terminated string containing the tag  associated with
+ * \param _tag A null-terminated string containing the tag associated with
  *              the comment.
  * \param _val The corresponding value as a null-terminated string.*/
-extern void th_comment_add_tag(th_comment *_tc,char *_tag,char *_val);
+extern void th_comment_add_tag(th_comment *_tc,const char *_tag,
+ const char *_val);
 /**Look up a comment value by its tag.
  * \param _tc    An initialized #th_comment structure.
  * \param _tag   The tag to look up.
@@ -564,15 +579,15 @@ extern void th_comment_add_tag(th_comment *_tc,char *_tag,char *_val);
  *         It should not be modified or freed by the application, and
  *          modifications to the structure may invalidate the pointer.
  * \retval NULL If no matching tag is found.*/
-extern char *th_comment_query(th_comment *_tc,char *_tag,int _count);
+extern char *th_comment_query(th_comment *_tc,const char *_tag,int _count);
 /**Look up the number of instances of a tag.
  * Call this first when querying for a specific tag and then iterate over the
  *  number of instances with separate calls to th_comment_query() to
  *  retrieve all the values for that tag in order.
  * \param _tc    An initialized #th_comment structure.
  * \param _tag   The tag to look up.
- * \return The number on instances of this particular tag.*/
-extern int th_comment_query_count(th_comment *_tc,char *_tag);
+ * \return The number of instances of this particular tag.*/
+extern int th_comment_query_count(th_comment *_tc,const char *_tag);
 /**Clears a #th_comment structure.
  * This should be called on a #th_comment structure after it is no longer
  *  needed.
diff --git a/thirdparty/libtheora/theora/theora.h b/thirdparty/libtheora/theora/theora.h
index af6eb6f380..a729a76890 100644
--- a/thirdparty/libtheora/theora/theora.h
+++ b/thirdparty/libtheora/theora/theora.h
@@ -34,41 +34,41 @@ extern "C"
  *
  * \section intro Introduction
  *
- * This is the documentation for the libtheora legacy C API, declared in 
+ * This is the documentation for the libtheora legacy C API, declared in
  * the theora.h header, which describes the old interface used before
  * the 1.0 release. This API was widely deployed for several years and
- * remains supported, but for new code we recommend the cleaner API 
+ * remains supported, but for new code we recommend the cleaner API
  * declared in theoradec.h and theoraenc.h.
  *
  * libtheora is the reference implementation for
  * <a href="http://www.theora.org/">Theora</a>, a free video codec.
  * Theora is derived from On2's VP3 codec with improved integration with
  * Ogg multimedia formats by <a href="http://www.xiph.org/">Xiph.Org</a>.
- * 
+ *
  * \section overview Overview
  *
- * This library will both decode and encode theora packets to/from raw YUV 
+ * This library will both decode and encode theora packets to/from raw YUV
  * frames.  In either case, the packets will most likely either come from or
- * need to be embedded in an Ogg stream.  Use 
- * <a href="http://xiph.org/ogg/">libogg</a> or 
+ * need to be embedded in an Ogg stream.  Use
+ * <a href="http://xiph.org/ogg/">libogg</a> or
  * <a href="http://www.annodex.net/software/liboggz/index.html">liboggz</a>
  * to extract/package these packets.
  *
  * \section decoding Decoding Process
  *
  * Decoding can be separated into the following steps:
- * -# initialise theora_info and theora_comment structures using 
+ * -# initialise theora_info and theora_comment structures using
  *    theora_info_init() and theora_comment_init():
  \verbatim
  theora_info     info;
  theora_comment  comment;
-   
+
  theora_info_init(&info);
  theora_comment_init(&comment);
  \endverbatim
- * -# retrieve header packets from Ogg stream (there should be 3) and decode 
- *    into theora_info and theora_comment structures using 
- *    theora_decode_header().  See \ref identification for more information on 
+ * -# retrieve header packets from Ogg stream (there should be 3) and decode
+ *    into theora_info and theora_comment structures using
+ *    theora_decode_header().  See \ref identification for more information on
  *    identifying which packets are theora packets.
  \verbatim
  int i;
@@ -79,14 +79,14 @@ extern "C"
  }
  \endverbatim
  * -# initialise the decoder based on the information retrieved into the
- *    theora_info struct by theora_decode_header().  You will need a 
+ *    theora_info struct by theora_decode_header().  You will need a
  *    theora_state struct.
  \verbatim
  theora_state state;
- 
+
  theora_decode_init(&state, &info);
  \endverbatim
- * -# pass in packets and retrieve decoded frames!  See the yuv_buffer 
+ * -# pass in packets and retrieve decoded frames!  See the yuv_buffer
  *    documentation for information on how to retrieve raw YUV data.
  \verbatim
  yuf_buffer buffer;
@@ -96,20 +96,20 @@ extern "C"
    theora_decode_YUVout(&state, &buffer);
  }
  \endverbatim
- *  
+ *
  *
  * \subsection identification Identifying Theora Packets
  *
- * All streams inside an Ogg file have a unique serial_no attached to the 
- * stream.  Typically, you will want to 
- *  - retrieve the serial_no for each b_o_s (beginning of stream) page 
- *    encountered within the Ogg file; 
- *  - test the first (only) packet on that page to determine if it is a theora 
+ * All streams inside an Ogg file have a unique serial_no attached to the
+ * stream.  Typically, you will want to
+ *  - retrieve the serial_no for each b_o_s (beginning of stream) page
+ *    encountered within the Ogg file;
+ *  - test the first (only) packet on that page to determine if it is a theora
  *    packet;
- *  - once you have found a theora b_o_s page then use the retrieved serial_no 
+ *  - once you have found a theora b_o_s page then use the retrieved serial_no
  *    to identify future packets belonging to the same theora stream.
- * 
- * Note that you \e cannot use theora_packet_isheader() to determine if a 
+ *
+ * Note that you \e cannot use theora_packet_isheader() to determine if a
  * packet is a theora packet or not, as this function does not perform any
  * checking beyond whether a header bit is present.  Instead, use the
  * theora_decode_header() function and check the return value; or examine the
@@ -124,9 +124,9 @@ extern "C"
  * A YUV buffer for passing uncompressed frames to and from the codec.
  * This holds a Y'CbCr frame in planar format. The CbCr planes can be
  * subsampled and have their own separate dimensions and row stride
- * offsets. Note that the strides may be negative in some 
+ * offsets. Note that the strides may be negative in some
  * configurations. For theora the width and height of the largest plane
- * must be a multiple of 16. The actual meaningful picture size and 
+ * must be a multiple of 16. The actual meaningful picture size and
  * offset are stored in the theora_info structure; frames returned by
  * the decoder may need to be cropped for display.
  *
@@ -135,8 +135,8 @@ extern "C"
  * are ordered from left to right.
  *
  * During decode, the yuv_buffer struct is allocated by the user, but all
- * fields (including luma and chroma pointers) are filled by the library.  
- * These pointers address library-internal memory and their contents should 
+ * fields (including luma and chroma pointers) are filled by the library.
+ * These pointers address library-internal memory and their contents should
  * not be modified.
  *
  * Conversely, during encode the user allocates the struct and fills out all
@@ -179,14 +179,14 @@ typedef enum {
   OC_PF_420,    /**< Chroma subsampling by 2 in each direction (4:2:0) */
   OC_PF_RSVD,   /**< Reserved value */
   OC_PF_422,    /**< Horizonatal chroma subsampling by 2 (4:2:2) */
-  OC_PF_444,    /**< No chroma subsampling at all (4:4:4) */
+  OC_PF_444     /**< No chroma subsampling at all (4:4:4) */
 } theora_pixelformat;
 
 /**
  * Theora bitstream info.
  * Contains the basic playback parameters for a stream,
  * corresponding to the initial 'info' header packet.
- * 
+ *
  * Encoded theora frames must be a multiple of 16 in width and height.
  * To handle other frame sizes, a crop rectangle is specified in
  * frame_height and frame_width, offset_x and * offset_y. The offset
@@ -198,10 +198,10 @@ typedef enum {
  * fraction. Aspect ratio is also stored as a rational fraction, and
  * refers to the aspect ratio of the frame pixels, not of the
  * overall frame itself.
- * 
+ *
  * See <a href="http://svn.xiph.org/trunk/theora/examples/encoder_example.c">
  * examples/encoder_example.c</a> for usage examples of the
- * other paramters and good default settings for the encoder parameters.
+ * other parameters and good default settings for the encoder parameters.
  */
 typedef struct {
   ogg_uint32_t  width;		/**< encoded frame width  */
@@ -253,14 +253,14 @@ typedef struct{
 
 } theora_state;
 
-/** 
+/**
  * Comment header metadata.
  *
  * This structure holds the in-stream metadata corresponding to
  * the 'comment' header packet.
  *
  * Meta data is stored as a series of (tag, value) pairs, in
- * length-encoded string vectors. The first occurence of the 
+ * length-encoded string vectors. The first occurrence of the
  * '=' character delimits the tag and value. A particular tag
  * may occur more than once. The character set encoding for
  * the strings is always UTF-8, but the tag names are limited
@@ -285,7 +285,7 @@ typedef struct theora_comment{
 /* \anchor decctlcodes_old
  * These are the available request codes for theora_control()
  * when called with a decoder instance.
- * By convention decoder control codes are odd, to distinguish 
+ * By convention decoder control codes are odd, to distinguish
  * them from \ref encctlcodes_old "encoder control codes" which
  * are even.
  *
@@ -306,7 +306,7 @@ typedef struct theora_comment{
 #define TH_DECCTL_GET_PPLEVEL_MAX (1)
 
 /**Set the post-processing level.
- * Sets the level of post-processing to use when decoding the 
+ * Sets the level of post-processing to use when decoding the
  * compressed stream. This must be a value between zero (off)
  * and the maximum returned by TH_DECCTL_GET_PPLEVEL_MAX.
  */
@@ -345,9 +345,9 @@ typedef struct theora_comment{
  * \param[in] buf #th_quant_info
  * \retval OC_FAULT  \a theora_state is <tt>NULL</tt>.
  * \retval OC_EINVAL Encoding has already begun, the quantization parameters
- *                    are not acceptable to this version of the encoder, 
- *                    \a buf is <tt>NULL</tt> and \a buf_sz is not zero, 
- *                    or \a buf is non-<tt>NULL</tt> and \a buf_sz is 
+ *                    are not acceptable to this version of the encoder,
+ *                    \a buf is <tt>NULL</tt> and \a buf_sz is not zero,
+ *                    or \a buf is non-<tt>NULL</tt> and \a buf_sz is
  *                    not <tt>sizeof(#th_quant_info)</tt>.
  * \retval OC_IMPL   Not supported by this implementation.*/
 #define TH_ENCCTL_SET_QUANT_PARAMS (2)
@@ -424,7 +424,7 @@ typedef struct theora_comment{
 #define OC_NEWPACKET   -25      /**< Packet is an (ignorable) unhandled extension */
 #define OC_DUPFRAME    1        /**< Packet is a dropped frame */
 
-/** 
+/**
  * Retrieve a human-readable string to identify the encoder vendor and version.
  * \returns A version string.
  */
@@ -462,7 +462,7 @@ extern int theora_encode_init(theora_state *th, theora_info *ti);
 extern int theora_encode_YUVin(theora_state *t, yuv_buffer *yuv);
 
 /**
- * Request the next packet of encoded video. 
+ * Request the next packet of encoded video.
  * The encoded data is placed in a user-provided ogg_packet structure.
  * \param t A theora_state handle previously initialized for encoding.
  * \param last_p whether this is the last packet the encoder should produce.
@@ -496,7 +496,11 @@ extern int theora_encode_header(theora_state *t, ogg_packet *op);
  * \param op An ogg_packet structure to fill. libtheora will set all
  *           elements of this structure, including a pointer to the encoded
  *           comment data. The memory for the comment data is owned by
- *           libtheora.
+ *           the application, and must be freed by it using _ogg_free().
+ *           On some systems (such as Windows when using dynamic linking), this
+ *           may mean the free is executed in a different module from the
+ *           malloc, which will crash; there is no way to free this memory on
+ *           such systems.
  * \retval 0 Success
  */
 extern int theora_encode_comment(theora_comment *tc, ogg_packet *op);
@@ -581,8 +585,8 @@ extern int theora_decode_packetin(theora_state *th,ogg_packet *op);
  * \param th A theora_state handle previously initialized for decoding.
  * \param yuv A yuv_buffer in which libtheora should place the decoded data.
  *            Note that the buffer struct itself is allocated by the user, but
- *            that the luma and chroma pointers will be filled in by the 
- *            library.  Also note that these luma and chroma regions should be 
+ *            that the luma and chroma pointers will be filled in by the
+ *            library.  Also note that these luma and chroma regions should be
  *            considered read-only by the user.
  * \retval 0 Success
  */
@@ -617,22 +621,22 @@ extern int theora_packet_iskeyframe(ogg_packet *op);
 /**
  * Report the granulepos shift radix
  *
- * When embedded in Ogg, Theora uses a two-part granulepos, 
+ * When embedded in Ogg, Theora uses a two-part granulepos,
  * splitting the 64-bit field into two pieces. The more-significant
  * section represents the frame count at the last keyframe,
  * and the less-significant section represents the count of
  * frames since the last keyframe. In this way the overall
  * field is still non-decreasing with time, but usefully encodes
  * a pointer to the last keyframe, which is necessary for
- * correctly restarting decode after a seek. 
+ * correctly restarting decode after a seek.
  *
  * This function reports the number of bits used to represent
  * the distance to the last keyframe, and thus how the granulepos
  * field must be shifted or masked to obtain the two parts.
- * 
+ *
  * Since libtheora returns compressed data in an ogg_packet
  * structure, this may be generally useful even if the Theora
- * packets are not being used in an Ogg container. 
+ * packets are not being used in an Ogg container.
  *
  * \param ti A previously initialized theora_info struct
  * \returns The bit shift dividing the two granulepos fields
@@ -644,7 +648,7 @@ int theora_granule_shift(theora_info *ti);
 /**
  * Convert a granulepos to an absolute frame index, starting at 0.
  * The granulepos is interpreted in the context of a given theora_state handle.
- * 
+ *
  * Note that while the granulepos encodes the frame count (i.e. starting
  * from 1) this call returns the frame index, starting from zero. Thus
  * One can calculate the presentation time by multiplying the index by
@@ -670,9 +674,7 @@ extern ogg_int64_t theora_granule_frame(theora_state *th,ogg_int64_t granulepos)
  *          This is the "end time" for the frame, or the latest time it should
  *           be displayed.
  *          It is not the presentation time.
- * \retval -1. The given granulepos is undefined (i.e. negative), or
- * \retval -1. The function has been disabled because floating 
- *              point support is not available.
+ * \retval -1. The given granulepos is undefined (i.e. negative).
  */
 extern double theora_granule_time(theora_state *th,ogg_int64_t granulepos);
 
@@ -699,7 +701,7 @@ extern void theora_clear(theora_state *t);
 
 /**
  * Initialize an allocated theora_comment structure
- * \param tc An allocated theora_comment structure 
+ * \param tc An allocated theora_comment structure
  **/
 extern void theora_comment_init(theora_comment *tc);
 
@@ -720,7 +722,7 @@ extern void theora_comment_add(theora_comment *tc, char *comment);
 /**
  * Add a comment to an initialized theora_comment structure.
  * \param tc A previously initialized theora comment structure
- * \param tag A null-terminated string containing the tag 
+ * \param tag A null-terminated string containing the tag
  *            associated with the comment.
  * \param value The corresponding value as a null-terminated string
  *
@@ -752,9 +754,9 @@ extern char *theora_comment_query(theora_comment *tc, char *tag, int count);
  *  \param tc An initialized theora_comment structure
  *  \param tag The tag to look up
  *  \returns The number on instances of a particular tag.
- * 
+ *
  *  Call this first when querying for a specific tag and then interate
- *  over the number of instances with separate calls to 
+ *  over the number of instances with separate calls to
  *  theora_comment_query() to retrieve all instances in order.
  **/
 extern int   theora_comment_query_count(theora_comment *tc, char *tag);
@@ -769,7 +771,7 @@ extern void  theora_comment_clear(theora_comment *tc);
  * This is used to provide advanced control the encoding process.
  * \param th     A #theora_state handle.
  * \param req    The control code to process.
- *                See \ref encctlcodes_old "the list of available 
+ *                See \ref encctlcodes_old "the list of available
  *			control codes" for details.
  * \param buf    The parameters for this control code.
  * \param buf_sz The size of the parameter buffer.*/
diff --git a/thirdparty/libtheora/theora/theoradec.h b/thirdparty/libtheora/theora/theoradec.h
index b20f0e3a64..77bef81909 100644
--- a/thirdparty/libtheora/theora/theoradec.h
+++ b/thirdparty/libtheora/theora/theoradec.h
@@ -92,13 +92,17 @@ extern "C" {
  *                     <tt>sizeof(th_stripe_callback)</tt>.*/
 #define TH_DECCTL_SET_STRIPE_CB (7)
 
-/**Enables telemetry and sets the macroblock display mode */
+/**Sets the macroblock display mode. Set to 0 to disable displaying
+ * macroblocks.*/
 #define TH_DECCTL_SET_TELEMETRY_MBMODE (9)
-/**Enables telemetry and sets the motion vector display mode */
+/**Sets the motion vector display mode. Set to 0 to disable displaying motion
+ * vectors.*/
 #define TH_DECCTL_SET_TELEMETRY_MV (11)
-/**Enables telemetry and sets the adaptive quantization display mode */
+/**Sets the adaptive quantization display mode. Set to 0 to disable displaying
+ * adaptive quantization. */
 #define TH_DECCTL_SET_TELEMETRY_QI (13)
-/**Enables telemetry and sets the bitstream breakdown visualization mode */
+/**Sets the bitstream breakdown visualization mode. Set to 0 to disable
+ * displaying bitstream breakdown.*/
 #define TH_DECCTL_SET_TELEMETRY_BITS (15)
 /*@}*/
 
@@ -171,7 +175,7 @@ typedef struct th_setup_info th_setup_info;
 /**\defgroup decfuncs Functions for Decoding*/
 /*@{*/
 /**\name Functions for decoding
- * You must link to <tt>libtheoradec</tt> if you use any of the 
+ * You must link to <tt>libtheoradec</tt> if you use any of the
  * functions in this section.
  *
  * The functions are listed in the order they are used in a typical decode.
@@ -267,7 +271,10 @@ extern void th_setup_free(th_setup_info *_setup);
  *                See \ref decctlcodes "the list of available control codes"
  *                 for details.
  * \param _buf    The parameters for this control code.
- * \param _buf_sz The size of the parameter buffer.*/
+ * \param _buf_sz The size of the parameter buffer.
+ * \return Possible return values depend on the control code used.
+ *          See \ref decctlcodes "the list of control codes" for
+ *          specific values. Generally 0 indicates success.*/
 extern int th_decode_ctl(th_dec_ctx *_dec,int _req,void *_buf,
  size_t _buf_sz);
 /**Submits a packet containing encoded video data to the decoder.
@@ -283,7 +290,8 @@ extern int th_decode_ctl(th_dec_ctx *_dec,int _req,void *_buf,
  * \retval 0             Success.
  *                       A new decoded frame can be retrieved by calling
  *                        th_decode_ycbcr_out().
- * \retval TH_DUPFRAME   The packet represented a dropped (0-byte) frame.
+ * \retval TH_DUPFRAME   The packet represented a dropped frame (either a
+ *                        0-byte frame or an INTER frame with no coded blocks).
  *                       The player can skip the call to th_decode_ycbcr_out(),
  *                        as the contents of the decoded frame buffer have not
  *                        changed.
diff --git a/thirdparty/libtheora/theora/theoraenc.h b/thirdparty/libtheora/theora/theoraenc.h
index fdf2ab21e2..79b1c2b880 100644
--- a/thirdparty/libtheora/theora/theoraenc.h
+++ b/thirdparty/libtheora/theora/theoraenc.h
@@ -43,7 +43,7 @@ extern "C" {
  * <tt>NULL</tt> may be specified to revert to the default tables.
  *
  * \param[in] _buf <tt>#th_huff_code[#TH_NHUFFMAN_TABLES][#TH_NDCT_TOKENS]</tt>
- * \retval TH_EFAULT \a _enc_ctx is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc is <tt>NULL</tt>.
  * \retval TH_EINVAL Encoding has already begun or one or more of the given
  *                     tables is not full or prefix-free, \a _buf is
  *                     <tt>NULL</tt> and \a _buf_sz is not zero, or \a _buf is
@@ -57,8 +57,8 @@ extern "C" {
  * <tt>NULL</tt> may be specified to revert to the default parameters.
  *
  * \param[in] _buf #th_quant_info
- * \retval TH_EFAULT \a _enc_ctx is <tt>NULL</tt>.
- * \retval TH_EINVAL Encoding has already begun, \a _buf is 
+ * \retval TH_EFAULT \a _enc is <tt>NULL</tt>.
+ * \retval TH_EINVAL Encoding has already begun, \a _buf is
  *                    <tt>NULL</tt> and \a _buf_sz is not zero,
  *                    or \a _buf is non-<tt>NULL</tt> and
  *                    \a _buf_sz is not <tt>sizeof(#th_quant_info)</tt>.
@@ -73,7 +73,7 @@ extern "C" {
  * \param[in]  _buf <tt>ogg_uint32_t</tt>: The maximum distance between key
  *                   frames.
  * \param[out] _buf <tt>ogg_uint32_t</tt>: The actual maximum distance set.
- * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
  * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(ogg_uint32_t)</tt>.
  * \retval TH_EIMPL   Not supported by this implementation.*/
 #define TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE (4)
@@ -101,7 +101,7 @@ extern "C" {
  *                   4:2:0, the picture region is smaller than the full frame,
  *                   or if encoding has begun, preventing the quantization
  *                   tables and codebooks from being set.
- * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
  * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
  * \retval TH_EIMPL   Not supported by this implementation.*/
 #define TH_ENCCTL_SET_VP3_COMPATIBLE (10)
@@ -114,7 +114,7 @@ extern "C" {
  *  the current encoding mode (VBR vs. constant quality, etc.).
  *
  * \param[out] _buf <tt>int</tt>: The maximum encoding speed level.
- * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
  * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
  * \retval TH_EIMPL   Not supported by this implementation in the current
  *                    encoding mode.*/
@@ -124,7 +124,7 @@ extern "C" {
  *
  * \param[in] _buf <tt>int</tt>: The new encoding speed level.
  *                 0 is slowest, larger values use less CPU.
- * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
  * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or the
  *                    encoding speed level is out of bounds.
  *                   The maximum encoding speed level may be
@@ -142,7 +142,7 @@ extern "C" {
  *
  * \param[out] _buf <tt>int</tt>: The current encoding speed level.
  *                  0 is slowest, larger values use less CPU.
- * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
  * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
  * \retval TH_EIMPL   Not supported by this implementation in the current
  *                    encoding mode.*/
@@ -162,7 +162,7 @@ extern "C" {
  *
  * \param[in] _buf <tt>int</tt>: The number of duplicates to produce.
  *                 If this is negative or zero, no duplicates will be produced.
- * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
  * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or the
  *                    number of duplicates is greater than or equal to the
  *                    maximum keyframe interval.
@@ -187,7 +187,7 @@ extern "C" {
  *                    use.
  *                 - #TH_RATECTL_CAP_UNDERFLOW: Don't try to make up shortfalls
  *                    later.
- * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
  * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt> or rate control
  *                    is not enabled.
  * \retval TH_EIMPL   Not supported by this implementation in the current
@@ -211,7 +211,7 @@ extern "C" {
  * \param[in]  _buf <tt>int</tt>: Requested size of the reservoir measured in
  *                   frames.
  * \param[out] _buf <tt>int</tt>: The actual size of the reservoir set.
- * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
  * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or rate control
  *                    is not enabled.  The buffer has an implementation
  *                    defined minimum and maximum size and the value in _buf
@@ -243,7 +243,7 @@ extern "C" {
  *              application.
  * \retval >=0       The number of bytes of metric data available in the
  *                    returned buffer.
- * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
  * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(char *)</tt>, no target
  *                    bitrate has been set, or the first call was made after
  *                    the first frame was submitted for encoding.
@@ -283,7 +283,7 @@ extern "C" {
  *                  of bytes consumed.
  * \retval >0            The number of bytes of metric data required/consumed.
  * \retval 0             No more data is required before the next frame.
- * \retval TH_EFAULT     \a _enc_ctx is <tt>NULL</tt>.
+ * \retval TH_EFAULT     \a _enc is <tt>NULL</tt>.
  * \retval TH_EINVAL     No target bitrate has been set, or the first call was
  *                        made after the first frame was submitted for
  *                        encoding.
@@ -306,7 +306,7 @@ extern "C" {
  * \param[in] _buf <tt>int</tt>: The new target quality, in the range 0...63,
  *                  inclusive.
  * \retval 0             Success.
- * \retval TH_EFAULT     \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT     \a _enc or \a _buf is <tt>NULL</tt>.
  * \retval TH_EINVAL     A target bitrate has already been specified, or the
  *                        quality index was not in the range 0...63.
  * \retval TH_EIMPL       Not supported by this implementation.*/
@@ -328,10 +328,54 @@ extern "C" {
  *
  * \param[in] _buf <tt>long</tt>: The new target bitrate, in bits per second.
  * \retval 0             Success.
- * \retval TH_EFAULT     \a _enc_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EFAULT     \a _enc or \a _buf is <tt>NULL</tt>.
  * \retval TH_EINVAL     The target bitrate was not positive.
- * \retval TH_EIMPL       Not supported by this implementation.*/
+ *                       A future version of this library may allow passing 0
+ *                        to disable rate-controlled mode and return to a
+ *                        quality-based mode, in which case this function will
+ *                        not return an error for that value.
+ * \retval TH_EIMPL      Not supported by this implementation.*/
 #define TH_ENCCTL_SET_BITRATE (30)
+/**Sets the configuration to be compatible with that from the given setup
+ *  header.
+ * This sets the Huffman codebooks and quantization parameters to match those
+ *  found in the given setup header.
+ * This guarantees that packets encoded by this encoder will be decodable using
+ *  a decoder configured with the passed-in setup header.
+ * It does <em>not</em> guarantee that th_encode_flushheader() will produce a
+ *  bit-identical setup header, only that they will be compatible.
+ * If you need a bit-identical setup header, then use the one you passed into
+ *  this command, and not the one returned by th_encode_flushheader().
+ *
+ * This also does <em>not</em> enable or disable VP3 compatibility; that is not
+ *  signaled in the setup header (or anywhere else in the encoded stream), and
+ *  is controlled independently by the #TH_ENCCTL_SET_VP3_COMPATIBLE function.
+ * If you wish to enable VP3 compatibility mode <em>and</em> want the codebooks
+ *  and quantization parameters to match the given setup header, you should
+ *  enable VP3 compatibility before invoking this command, otherwise the
+ *  codebooks and quantization parameters will be reset to the VP3 defaults.
+ *
+ * The current encoder does not support Huffman codebooks which do not contain
+ *  codewords for all 32 tokens.
+ * Such codebooks are legal, according to the specification, but cannot be
+ *  configured with this function.
+ *
+ * \param[in] _buf <tt>unsigned char[]</tt>: The encoded setup header to copy
+ *                                            the configuration from.
+ *                                           This should be the original,
+ *                                            undecoded setup header packet,
+ *                                            and <em>not</em> a #th_setup_info
+ *                                            structure filled in by
+ *                                            th_decode_headerin().
+ * \retval TH_EFAULT     \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL     Encoding has already begun, so the codebooks and
+ *                        quantization parameters cannot be changed, or the
+ *                        data in the setup header was not supported by this
+ *                        encoder.
+ * \retval TH_EBADHEADER \a _buf did not contain a valid setup header packet.
+ * \retval TH_ENOTFORMAT \a _buf did not contain a Theora header at all.
+ * \retval TH_EIMPL   Not supported by this implementation.*/
+#define TH_ENCCTL_SET_COMPAT_CONFIG (32)
 
 /*@}*/
 
@@ -342,7 +386,8 @@ extern "C" {
 /*@{*/
 /**Drop frames to keep within bitrate buffer constraints.
  * This can have a severe impact on quality, but is the only way to ensure that
- *  bitrate targets are met at low rates during sudden bursts of activity.*/
+ *  bitrate targets are met at low rates during sudden bursts of activity.
+ * It is enabled by default.*/
 #define TH_RATECTL_DROP_FRAMES   (0x1)
 /**Ignore bitrate buffer overflows.
  * If the encoder uses so few bits that the reservoir of available bits
@@ -350,14 +395,14 @@ extern "C" {
  * The encoder will not try to use these extra bits in future frames.
  * At high rates this may cause the result to be undersized, but allows a
  *  client to play the stream using a finite buffer; it should normally be
- *  enabled.*/
+ *  enabled, which is the default.*/
 #define TH_RATECTL_CAP_OVERFLOW  (0x2)
 /**Ignore bitrate buffer underflows.
  * If the encoder uses so many bits that the reservoir of available bits
  *  underflows, ignore the deficit.
  * The encoder will not try to make up these extra bits in future frames.
  * At low rates this may cause the result to be oversized; it should normally
- *  be disabled.*/
+ *  be disabled, which is the default.*/
 #define TH_RATECTL_CAP_UNDERFLOW (0x4)
 /*@}*/
 
@@ -401,8 +446,8 @@ typedef struct th_enc_ctx    th_enc_ctx;
  *    packets.
  * - For each uncompressed frame:
  *   - Submit the uncompressed frame via th_encode_ycbcr_in()
- *   - Repeatedly call th_encode_packetout() to retrieve any video data packets
- *      that are ready.
+ *   - Repeatedly call th_encode_packetout() to retrieve any video
+ *      data packets that are ready.
  * - Call th_encode_free() to release all encoder memory.*/
 /*@{*/
 /**Allocates an encoder instance.
@@ -417,7 +462,10 @@ extern th_enc_ctx *th_encode_alloc(const th_info *_info);
  *                See \ref encctlcodes "the list of available control codes"
  *                 for details.
  * \param _buf    The parameters for this control code.
- * \param _buf_sz The size of the parameter buffer.*/
+ * \param _buf_sz The size of the parameter buffer.
+ * \return Possible return values depend on the control code used.
+ *          See \ref encctlcodes "the list of control codes" for
+ *          specific values. Generally 0 indicates success.*/
 extern int th_encode_ctl(th_enc_ctx *_enc,int _req,void *_buf,size_t _buf_sz);
 /**Outputs the next header packet.
  * This should be called repeatedly after encoder initialization until it
@@ -441,11 +489,25 @@ extern int th_encode_flushheader(th_enc_ctx *_enc,
 /**Submits an uncompressed frame to the encoder.
  * \param _enc   A #th_enc_ctx handle.
  * \param _ycbcr A buffer of Y'CbCr data to encode.
+ *               If the width and height of the buffer match the frame size
+ *                the encoder was initialized with, the encoder will only
+ *                reference the portion inside the picture region.
+ *               Any data outside this region will be ignored, and need not map
+ *                to a valid address.
+ *               Alternatively, you can pass a buffer equal to the size of the
+ *                picture region, if this is less than the full frame size.
+ *               When using subsampled chroma planes, odd picture sizes or odd
+ *                picture offsets may require an unexpected chroma plane size,
+ *                and their use is generally discouraged, as they will not be
+ *                well-supported by players and other media frameworks.
+ *               See Section 4.4 of
+ *                <a href="http://www.theora.org/doc/Theora.pdf">the Theora
+ *                specification</a> for details if you wish to use them anyway.
  * \retval 0         Success.
  * \retval TH_EFAULT \a _enc or \a _ycbcr is <tt>NULL</tt>.
- * \retval TH_EINVAL The buffer size does not match the frame size the encoder
- *                    was initialized with, or encoding has already
- *                    completed.*/
+ * \retval TH_EINVAL The buffer size matches neither the frame size nor the
+ *                    picture size the encoder was initialized with, or
+ *                    encoding has already completed.*/
 extern int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _ycbcr);
 /**Retrieves encoded video data packets.
  * This should be called repeatedly after each frame is submitted to flush any
diff --git a/thirdparty/libtheora/tokenize.c b/thirdparty/libtheora/tokenize.c
index 60574c3594..57b7aa8da9 100644
--- a/thirdparty/libtheora/tokenize.c
+++ b/thirdparty/libtheora/tokenize.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-  last mod: $Id: tokenize.c 16503 2009-08-22 18:14:02Z giles $
+  last mod: $Id$
 
  ********************************************************************/
 #include <stdlib.h>
@@ -20,27 +20,26 @@
 
 
 
+static unsigned char OC_DCT_EOB_TOKEN[31]={
+  0,1,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
+};
+
 static int oc_make_eob_token(int _run_count){
-  if(_run_count<4)return OC_DCT_EOB1_TOKEN+_run_count-1;
-  else{
-    int cat;
-    cat=OC_ILOGNZ_32(_run_count)-3;
-    cat=OC_MINI(cat,3);
-    return OC_DCT_REPEAT_RUN0_TOKEN+cat;
-  }
+  return _run_count<32?OC_DCT_EOB_TOKEN[_run_count-1]:OC_DCT_REPEAT_RUN3_TOKEN;
 }
 
+static unsigned char OC_DCT_EOB_EB[31]={
+  0,0,0,0,1,2,3,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+};
+
 static int oc_make_eob_token_full(int _run_count,int *_eb){
-  if(_run_count<4){
-    *_eb=0;
-    return OC_DCT_EOB1_TOKEN+_run_count-1;
+  if(_run_count<32){
+    *_eb=OC_DCT_EOB_EB[_run_count-1];
+    return OC_DCT_EOB_TOKEN[_run_count-1];
   }
   else{
-    int cat;
-    cat=OC_ILOGNZ_32(_run_count)-3;
-    cat=OC_MINI(cat,3);
-    *_eb=_run_count-OC_BYTE_TABLE32(4,8,16,0,cat);
-    return OC_DCT_REPEAT_RUN0_TOKEN+cat;
+    *_eb=_run_count;
+    return OC_DCT_REPEAT_RUN3_TOKEN;
   }
 }
 
@@ -49,86 +48,330 @@ static int oc_decode_eob_token(int _token,int _eb){
   return (0x20820C41U>>_token*5&0x1F)+_eb;
 }
 
-/*TODO: This is now only used during DCT tokenization, and never for runs; it
-   should be simplified.*/
-static int oc_make_dct_token_full(int _zzi,int _zzj,int _val,int *_eb){
-  int neg;
-  int zero_run;
-  int token;
-  int eb;
-  neg=_val<0;
-  _val=abs(_val);
-  zero_run=_zzj-_zzi;
-  if(zero_run>0){
-    int adj;
-    /*Implement a minor restriction on stack 1 so that we know during DC fixups
-       that extending a dctrun token from stack 1 will never overflow.*/
-    adj=_zzi!=1;
-    if(_val<2&&zero_run<17+adj){
-      if(zero_run<6){
-        token=OC_DCT_RUN_CAT1A+zero_run-1;
-        eb=neg;
-      }
-      else if(zero_run<10){
-        token=OC_DCT_RUN_CAT1B;
-        eb=zero_run-6+(neg<<2);
-      }
-      else{
-        token=OC_DCT_RUN_CAT1C;
-        eb=zero_run-10+(neg<<3);
-      }
-    }
-    else if(_val<4&&zero_run<3+adj){
-      if(zero_run<2){
-        token=OC_DCT_RUN_CAT2A;
-        eb=_val-2+(neg<<1);
-      }
-      else{
-        token=OC_DCT_RUN_CAT2B;
-        eb=zero_run-2+(_val-2<<1)+(neg<<2);
-      }
-    }
-    else{
-      if(zero_run<9)token=OC_DCT_SHORT_ZRL_TOKEN;
-      else token=OC_DCT_ZRL_TOKEN;
-      eb=zero_run-1;
-    }
-  }
-  else if(_val<3){
-    token=OC_ONE_TOKEN+(_val-1<<1)+neg;
-    eb=0;
-  }
-  else if(_val<7){
-    token=OC_DCT_VAL_CAT2+_val-3;
-    eb=neg;
-  }
-  else if(_val<9){
-    token=OC_DCT_VAL_CAT3;
-    eb=_val-7+(neg<<1);
-  }
-  else if(_val<13){
-    token=OC_DCT_VAL_CAT4;
-    eb=_val-9+(neg<<2);
-  }
-  else if(_val<21){
-    token=OC_DCT_VAL_CAT5;
-    eb=_val-13+(neg<<3);
-  }
-  else if(_val<37){
-    token=OC_DCT_VAL_CAT6;
-    eb=_val-21+(neg<<4);
-  }
-  else if(_val<69){
-    token=OC_DCT_VAL_CAT7;
-    eb=_val-37+(neg<<5);
-  }
-  else{
-    token=OC_DCT_VAL_CAT8;
-    eb=_val-69+(neg<<9);
-  }
-  *_eb=eb;
-  return token;
-}
+/*Some tables for fast construction of value tokens.*/
+
+static const unsigned char OC_DCT_VALUE_TOKEN[1161]={
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,21,21,21,21,21,21,21,21,
+  21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,
+  21,21,21,21,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,
+  19,19,19,19,19,19,19,19,18,18,18,18,17,17,16,15,14,13,12,10,
+   7,
+   9,11,13,14,15,16,17,17,18,18,18,18,19,19,19,19,19,19,19,19,
+  20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,21,21,21,21,
+  21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,
+  21,21,21,21,21,21,21,21,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,
+  22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22
+};
+
+static const ogg_uint16_t OC_DCT_VALUE_EB[1161]={
+  1023,1022,1021,1020,1019,1018,1017,1016,1015,1014,
+  1013,1012,1011,1010,1009,1008,1007,1006,1005,1004,
+  1003,1002,1001,1000, 999, 998, 997, 996, 995, 994,
+   993, 992, 991, 990, 989, 988, 987, 986, 985, 984,
+   983, 982, 981, 980, 979, 978, 977, 976, 975, 974,
+   973, 972, 971, 970, 969, 968, 967, 966, 965, 964,
+   963, 962, 961, 960, 959, 958, 957, 956, 955, 954,
+   953, 952, 951, 950, 949, 948, 947, 946, 945, 944,
+   943, 942, 941, 940, 939, 938, 937, 936, 935, 934,
+   933, 932, 931, 930, 929, 928, 927, 926, 925, 924,
+   923, 922, 921, 920, 919, 918, 917, 916, 915, 914,
+   913, 912, 911, 910, 909, 908, 907, 906, 905, 904,
+   903, 902, 901, 900, 899, 898, 897, 896, 895, 894,
+   893, 892, 891, 890, 889, 888, 887, 886, 885, 884,
+   883, 882, 881, 880, 879, 878, 877, 876, 875, 874,
+   873, 872, 871, 870, 869, 868, 867, 866, 865, 864,
+   863, 862, 861, 860, 859, 858, 857, 856, 855, 854,
+   853, 852, 851, 850, 849, 848, 847, 846, 845, 844,
+   843, 842, 841, 840, 839, 838, 837, 836, 835, 834,
+   833, 832, 831, 830, 829, 828, 827, 826, 825, 824,
+   823, 822, 821, 820, 819, 818, 817, 816, 815, 814,
+   813, 812, 811, 810, 809, 808, 807, 806, 805, 804,
+   803, 802, 801, 800, 799, 798, 797, 796, 795, 794,
+   793, 792, 791, 790, 789, 788, 787, 786, 785, 784,
+   783, 782, 781, 780, 779, 778, 777, 776, 775, 774,
+   773, 772, 771, 770, 769, 768, 767, 766, 765, 764,
+   763, 762, 761, 760, 759, 758, 757, 756, 755, 754,
+   753, 752, 751, 750, 749, 748, 747, 746, 745, 744,
+   743, 742, 741, 740, 739, 738, 737, 736, 735, 734,
+   733, 732, 731, 730, 729, 728, 727, 726, 725, 724,
+   723, 722, 721, 720, 719, 718, 717, 716, 715, 714,
+   713, 712, 711, 710, 709, 708, 707, 706, 705, 704,
+   703, 702, 701, 700, 699, 698, 697, 696, 695, 694,
+   693, 692, 691, 690, 689, 688, 687, 686, 685, 684,
+   683, 682, 681, 680, 679, 678, 677, 676, 675, 674,
+   673, 672, 671, 670, 669, 668, 667, 666, 665, 664,
+   663, 662, 661, 660, 659, 658, 657, 656, 655, 654,
+   653, 652, 651, 650, 649, 648, 647, 646, 645, 644,
+   643, 642, 641, 640, 639, 638, 637, 636, 635, 634,
+   633, 632, 631, 630, 629, 628, 627, 626, 625, 624,
+   623, 622, 621, 620, 619, 618, 617, 616, 615, 614,
+   613, 612, 611, 610, 609, 608, 607, 606, 605, 604,
+   603, 602, 601, 600, 599, 598, 597, 596, 595, 594,
+   593, 592, 591, 590, 589, 588, 587, 586, 585, 584,
+   583, 582, 581, 580, 579, 578, 577, 576, 575, 574,
+   573, 572, 571, 570, 569, 568, 567, 566, 565, 564,
+   563, 562, 561, 560, 559, 558, 557, 556, 555, 554,
+   553, 552, 551, 550, 549, 548, 547, 546, 545, 544,
+   543, 542, 541, 540, 539, 538, 537, 536, 535, 534,
+   533, 532, 531, 530, 529, 528, 527, 526, 525, 524,
+   523, 522, 521, 520, 519, 518, 517, 516, 515, 514,
+   513, 512,  63,  62,  61,  60,  59,  58,  57,  56,
+    55,  54,  53,  52,  51,  50,  49,  48,  47,  46,
+    45,  44,  43,  42,  41,  40,  39,  38,  37,  36,
+    35,  34,  33,  32,  31,  30,  29,  28,  27,  26,
+    25,  24,  23,  22,  21,  20,  19,  18,  17,  16,
+    15,  14,  13,  12,  11,  10,   9,   8,   7,   6,
+     5,   4,   3,   2,   1,   1,   1,   1,   0,   0,
+     0,
+     0,   0,   0,   0,   0,   0,   0,   1,   0,   1,
+     2,   3,   0,   1,   2,   3,   4,   5,   6,   7,
+     0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
+    10,  11,  12,  13,  14,  15,   0,   1,   2,   3,
+     4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
+    14,  15,  16,  17,  18,  19,  20,  21,  22,  23,
+    24,  25,  26,  27,  28,  29,  30,  31,   0,   1,
+     2,   3,   4,   5,   6,   7,   8,   9,  10,  11,
+    12,  13,  14,  15,  16,  17,  18,  19,  20,  21,
+    22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
+    32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
+    42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
+    52,  53,  54,  55,  56,  57,  58,  59,  60,  61,
+    62,  63,  64,  65,  66,  67,  68,  69,  70,  71,
+    72,  73,  74,  75,  76,  77,  78,  79,  80,  81,
+    82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
+    92,  93,  94,  95,  96,  97,  98,  99, 100, 101,
+   102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+   112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
+   122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
+   132, 133, 134, 135, 136, 137, 138, 139, 140, 141,
+   142, 143, 144, 145, 146, 147, 148, 149, 150, 151,
+   152, 153, 154, 155, 156, 157, 158, 159, 160, 161,
+   162, 163, 164, 165, 166, 167, 168, 169, 170, 171,
+   172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
+   182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
+   192, 193, 194, 195, 196, 197, 198, 199, 200, 201,
+   202, 203, 204, 205, 206, 207, 208, 209, 210, 211,
+   212, 213, 214, 215, 216, 217, 218, 219, 220, 221,
+   222, 223, 224, 225, 226, 227, 228, 229, 230, 231,
+   232, 233, 234, 235, 236, 237, 238, 239, 240, 241,
+   242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
+   252, 253, 254, 255, 256, 257, 258, 259, 260, 261,
+   262, 263, 264, 265, 266, 267, 268, 269, 270, 271,
+   272, 273, 274, 275, 276, 277, 278, 279, 280, 281,
+   282, 283, 284, 285, 286, 287, 288, 289, 290, 291,
+   292, 293, 294, 295, 296, 297, 298, 299, 300, 301,
+   302, 303, 304, 305, 306, 307, 308, 309, 310, 311,
+   312, 313, 314, 315, 316, 317, 318, 319, 320, 321,
+   322, 323, 324, 325, 326, 327, 328, 329, 330, 331,
+   332, 333, 334, 335, 336, 337, 338, 339, 340, 341,
+   342, 343, 344, 345, 346, 347, 348, 349, 350, 351,
+   352, 353, 354, 355, 356, 357, 358, 359, 360, 361,
+   362, 363, 364, 365, 366, 367, 368, 369, 370, 371,
+   372, 373, 374, 375, 376, 377, 378, 379, 380, 381,
+   382, 383, 384, 385, 386, 387, 388, 389, 390, 391,
+   392, 393, 394, 395, 396, 397, 398, 399, 400, 401,
+   402, 403, 404, 405, 406, 407, 408, 409, 410, 411,
+   412, 413, 414, 415, 416, 417, 418, 419, 420, 421,
+   422, 423, 424, 425, 426, 427, 428, 429, 430, 431,
+   432, 433, 434, 435, 436, 437, 438, 439, 440, 441,
+   442, 443, 444, 445, 446, 447, 448, 449, 450, 451,
+   452, 453, 454, 455, 456, 457, 458, 459, 460, 461,
+   462, 463, 464, 465, 466, 467, 468, 469, 470, 471,
+   472, 473, 474, 475, 476, 477, 478, 479, 480, 481,
+   482, 483, 484, 485, 486, 487, 488, 489, 490, 491,
+   492, 493, 494, 495, 496, 497, 498, 499, 500, 501,
+   502, 503, 504, 505, 506, 507, 508, 509, 510, 511
+};
+
+/*Indexed by quantized value plus 580: the nearest value that both has a
+   smaller magnitude and gets coded with a different token (0 maps to 0).*/
+static const ogg_int16_t OC_DCT_TRELLIS_ALT_VALUE[1161]={
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -68, -68, -68, -68, -68, -68, -68, -68,
+   -68, -68, -36, -36, -36, -36, -36, -36, -36, -36,
+   -36, -36, -36, -36, -36, -36, -36, -36, -36, -36,
+   -36, -36, -36, -36, -36, -36, -36, -36, -36, -36,
+   -36, -36, -36, -36, -20, -20, -20, -20, -20, -20,
+   -20, -20, -20, -20, -20, -20, -20, -20, -20, -20,
+   -12, -12, -12, -12, -12, -12, -12, -12,  -8,  -8,
+    -8,  -8,  -6,  -6,  -5,  -4,  -3,  -2,  -1,   0,
+     0,
+     0,   1,   2,   3,   4,   5,   6,   6,   8,   8,
+     8,   8,  12,  12,  12,  12,  12,  12,  12,  12,
+    20,  20,  20,  20,  20,  20,  20,  20,  20,  20,
+    20,  20,  20,  20,  20,  20,  36,  36,  36,  36,
+    36,  36,  36,  36,  36,  36,  36,  36,  36,  36,
+    36,  36,  36,  36,  36,  36,  36,  36,  36,  36,
+    36,  36,  36,  36,  36,  36,  36,  36,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68,
+    68,  68,  68,  68,  68,  68,  68,  68,  68,  68
+};
+
+#define OC_DCT_VALUE_TOKEN_PTR (OC_DCT_VALUE_TOKEN+580) /*Signed index.*/
+#define OC_DCT_VALUE_EB_PTR (OC_DCT_VALUE_EB+580) /*Signed index.*/
+#define OC_DCT_TRELLIS_ALT_VALUE_PTR (OC_DCT_TRELLIS_ALT_VALUE+580) /*Ditto.*/
+
+/*Some tables for fast construction of combo (zero run plus value) tokens.*/
+
+static const unsigned char OC_DCT_RUN_CAT1_TOKEN[17]={
+  23,24,25,26,27,28,28,28,28,29,29,29,29,29,29,29,29
+}; /*Token for a run of nzeros zeros plus a +/-1: [nzeros-1].*/
+
+static const unsigned char OC_DCT_RUN_CAT1_EB[17][2]={
+  {0,1},{0,1},{0, 1},{0, 1},{0, 1},{0, 4},{1, 5},{2, 6},{3,7},
+  {0,8},{1,9},{2,10},{3,11},{4,12},{5,13},{6,14},{7,15}
+}; /*Extra bits for the +/-1 combos: [nzeros-1][value is negative].*/
+
+static const unsigned char OC_DCT_RUN_CAT2_EB[3][2][2]={
+  { {0,1},{2,3} },{ {0,2},{4,6} },{ {1,3},{5,7} }
+}; /*Extra bits, +/-2 or 3 combos: [nzeros-1][negative][magnitude-2].*/
 
 /*Token logging to allow a few fragments of efficient rollback.
   Late SKIP analysis is tied up in the tokenization process, so we need to be
@@ -211,10 +454,11 @@ struct oc_quant_token{
 
 /*Tokenizes the AC coefficients, possibly adjusting the quantization, and then
    dequantizes and de-zig-zags the result.
-  The DC coefficient is not preserved; it should be restored by the caller.*/
+  The AC coefficients of _idct must be pre-initialized to zero.*/
 int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
- ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
- int _zzi,oc_token_checkpoint **_stack,int _acmin){
+ ogg_int16_t *_idct,const ogg_int16_t *_qdct,
+ const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
+ int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin){
   oc_token_checkpoint *stack;
   ogg_int64_t          zflags;
   ogg_int64_t          nzflags;
@@ -242,31 +486,29 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
   d2_accum[0]=0;
   zzj=64;
   for(zzi=OC_MINI(_zzi,63);zzi>0;zzi--){
-    ogg_int32_t  lambda;
     ogg_uint32_t best_cost;
     int          best_bits=best_bits;
     int          best_next=best_next;
     int          best_token=best_token;
     int          best_eb=best_eb;
     int          best_qc=best_qc;
-    int          flush_bits;
     ogg_uint32_t d2;
     int          dq;
+    int          qc_m;
     int          e;
     int          c;
     int          s;
     int          tj;
-    lambda=_enc->lambda;
     qc=_qdct[zzi];
     s=-(qc<0);
-    qc=qc+s^s;
-    c=_dct[OC_FZIG_ZAG[zzi]];
-    if(qc<=1){
+    qc_m=qc+s^s;
+    c=_dct[zzi];
+    /*The hard case: try a zero run.*/
+    if(qc_m<=1){
       ogg_uint32_t sum_d2;
       int          nzeros;
       int          dc_reserve;
-      /*The hard case: try a zero run.*/
-      if(!qc){
+      if(!qc_m){
         /*Skip runs that are already quantized to zeros.
           If we considered each zero coefficient in turn, we might
            theoretically find a better way to partition long zero runs (e.g.,
@@ -281,15 +523,14 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
         d2=0;
       }
       else{
-        c=c+s^s;
         d2=c*(ogg_int32_t)c;
+        c=c+s^s;
       }
       eob=eob_run[zzi];
       nzeros=zzj-zzi;
       zzj&=63;
       sum_d2=d2+d2_accum[zzj];
       d2_accum[zzi]=sum_d2;
-      flush_bits=eob>0?oc_token_bits(_enc,huffi,zzi,oc_make_eob_token(eob)):0;
       /*We reserve 1 spot for combo run tokens that start in the 1st AC stack
          to ensure they can be extended to include the DC coefficient if
          necessary; this greatly simplifies stack-rewriting later on.*/
@@ -297,7 +538,6 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
       best_cost=0xFFFFFFFF;
       for(;;){
         if(nzflags>>zzj&1){
-          int cat;
           int val;
           int val_s;
           int zzk;
@@ -306,11 +546,10 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
           tk=next&1;
           zzk=next>>1;
           /*Try a pure zero run to this point.*/
-          cat=nzeros+55>>6;
-          token=OC_DCT_SHORT_ZRL_TOKEN+cat;
-          bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
+          token=OC_DCT_SHORT_ZRL_TOKEN+(nzeros+55>>6);
+          bits=oc_token_bits(_enc,huffi,zzi,token);
           d2=sum_d2-d2_accum[zzj];
-          cost=d2+lambda*bits+tokens[zzj][1].cost;
+          cost=d2+_lambda*bits+tokens[zzj][1].cost;
           if(cost<=best_cost){
             best_next=(zzj<<1)+1;
             best_token=token;
@@ -319,25 +558,18 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
             best_bits=bits+tokens[zzj][1].bits;
             best_qc=0;
           }
-          if(nzeros<16+dc_reserve){
+          if(nzeros<17+dc_reserve){
             val=_qdct[zzj];
             val_s=-(val<0);
             val=val+val_s^val_s;
             if(val<=2){
               /*Try a +/- 1 combo token.*/
-              if(nzeros<6){
-                token=OC_DCT_RUN_CAT1A+nzeros-1;
-                eb=-val_s;
-              }
-              else{
-                cat=nzeros+54>>6;
-                token=OC_DCT_RUN_CAT1B+cat;
-                eb=(-val_s<<cat+2)+nzeros-6-(cat<<2);
-              }
-              e=(_dct[OC_FZIG_ZAG[zzj]]+val_s^val_s)-_dequant[zzj];
+              token=OC_DCT_RUN_CAT1_TOKEN[nzeros-1];
+              eb=OC_DCT_RUN_CAT1_EB[nzeros-1][-val_s];
+              e=_dct[zzj]-(_dequant[zzj]+val_s^val_s);
               d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj];
-              bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
-              cost=d2+lambda*bits+tokens[zzk][tk].cost;
+              bits=oc_token_bits(_enc,huffi,zzi,token);
+              cost=d2+_lambda*bits+tokens[zzk][tk].cost;
               if(cost<=best_cost){
                 best_next=next;
                 best_token=token;
@@ -347,22 +579,23 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
                 best_qc=1+val_s^val_s;
               }
             }
-            if(nzeros<2+dc_reserve&&2<=val&&val<=4){
+            if(nzeros<3+dc_reserve&&2<=val&&val<=4){
+              int sval;
               /*Try a +/- 2/3 combo token.*/
-              cat=nzeros>>1;
-              token=OC_DCT_RUN_CAT2A+cat;
-              bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
-              val=2+((val+val_s^val_s)>2);
-              e=(_dct[OC_FZIG_ZAG[zzj]]+val_s^val_s)-_dequant[zzj]*val;
+              token=OC_DCT_RUN_CAT2A+(nzeros>>1);
+              bits=oc_token_bits(_enc,huffi,zzi,token);
+              val=2+(val>2);
+              sval=val+val_s^val_s;
+              e=_dct[zzj]-_dequant[zzj]*sval;
               d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj];
-              cost=d2+lambda*bits+tokens[zzk][tk].cost;
+              cost=d2+_lambda*bits+tokens[zzk][tk].cost;
               if(cost<=best_cost){
                 best_cost=cost;
                 best_bits=bits+tokens[zzk][tk].bits;
                 best_next=next;
                 best_token=token;
-                best_eb=(-val_s<<1+cat)+(val-2<<cat)+(nzeros-1>>1);
-                best_qc=val+val_s^val_s;
+                best_eb=OC_DCT_RUN_CAT2_EB[nzeros-1][-val_s][val-2];
+                best_qc=sval;
               }
             }
           }
@@ -378,10 +611,10 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
           /*We made it all the way to the end of the block; try an EOB token.*/
           if(eob<4095){
             bits=oc_token_bits(_enc,huffi,zzi,oc_make_eob_token(eob+1))
-             -flush_bits;
+             -(eob>0?oc_token_bits(_enc,huffi,zzi,oc_make_eob_token(eob)):0);
           }
           else bits=oc_token_bits(_enc,huffi,zzi,OC_DCT_EOB1_TOKEN);
-          cost=sum_d2+bits*lambda;
+          cost=sum_d2+bits*_lambda;
           /*If the best route so far is still a pure zero run to the end of the
              block, force coding it as an EOB.
             Even if it's not optimal for this block, it has a good chance of
@@ -408,20 +641,20 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
       tokens[zzi][0].bits=best_bits;
       tokens[zzi][0].qc=best_qc;
       zflags|=(ogg_int64_t)1<<zzi;
-      if(qc){
+      if(qc_m){
         dq=_dequant[zzi];
-        if(zzi<_acmin)lambda=0;
+        if(zzi<_acmin)_lambda=0;
         e=dq-c;
         d2=e*(ogg_int32_t)e;
         token=OC_ONE_TOKEN-s;
-        bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
+        bits=oc_token_bits(_enc,huffi,zzi,token);
         zzj=zzi+1&63;
         tj=best_flags>>zzj&1;
         next=(zzj<<1)+tj;
         tokens[zzi][1].next=(unsigned char)next;
         tokens[zzi][1].token=(signed char)token;
         tokens[zzi][1].eb=0;
-        tokens[zzi][1].cost=d2+lambda*bits+tokens[zzj][tj].cost;
+        tokens[zzi][1].cost=d2+_lambda*bits+tokens[zzj][tj].cost;
         tokens[zzi][1].bits=bits+tokens[zzj][tj].bits;
         tokens[zzi][1].qc=1+s^s;
         nzflags|=(ogg_int64_t)1<<zzi;
@@ -430,200 +663,38 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
       }
     }
     else{
+      int alt_qc;
       eob=eob_run[zzi];
-      if(zzi<_acmin)lambda=0;
-      c=c+s^s;
+      if(zzi<_acmin)_lambda=0;
       dq=_dequant[zzi];
       /*No zero run can extend past this point.*/
       d2_accum[zzi]=0;
-      flush_bits=eob>0?oc_token_bits(_enc,huffi,zzi,oc_make_eob_token(eob)):0;
-      if(qc<=2){
-        e=2*dq-c;
-        d2=e*(ogg_int32_t)e;
-        best_token=OC_TWO_TOKEN-s;
-        best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
-        best_cost=d2+lambda*best_bits;
-        e-=dq;
-        d2=e*(ogg_int32_t)e;
-        token=OC_ONE_TOKEN-s;
-        bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
-        cost=d2+lambda*bits;
-        if(cost<=best_cost){
-          best_token=token;
-          best_bits=bits;
-          best_cost=cost;
-          qc--;
-        }
-        best_eb=0;
-      }
-      else if(qc<=3){
-        e=3*dq-c;
-        d2=e*(ogg_int32_t)e;
-        best_token=OC_DCT_VAL_CAT2;
-        best_eb=-s;
-        best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
-        best_cost=d2+lambda*best_bits;
-        e-=dq;
-        d2=e*(ogg_int32_t)e;
-        token=OC_TWO_TOKEN-s;
-        bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
-        cost=d2+lambda*bits;
-        if(cost<=best_cost){
-          best_token=token;
-          best_eb=0;
-          best_bits=bits;
-          best_cost=cost;
-          qc--;
-        }
-      }
-      else if(qc<=6){
-        e=qc*dq-c;
-        d2=e*(ogg_int32_t)e;
-        best_token=OC_DCT_VAL_CAT2+qc-3;
-        best_eb=-s;
-        best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
-        best_cost=d2+lambda*best_bits;
-        e-=dq;
-        d2=e*(ogg_int32_t)e;
-        token=best_token-1;
-        bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
-        cost=d2+lambda*bits;
-        if(cost<=best_cost){
-          best_token=token;
-          best_bits=bits;
-          best_cost=cost;
-          qc--;
-        }
-      }
-      else if(qc<=8){
-        e=qc*dq-c;
-        d2=e*(ogg_int32_t)e;
-        best_token=OC_DCT_VAL_CAT3;
-        best_eb=(-s<<1)+qc-7;
-        best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
-        best_cost=d2+lambda*best_bits;
-        e=6*dq-c;
-        d2=e*(ogg_int32_t)e;
-        token=OC_DCT_VAL_CAT2+3;
-        bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
-        cost=d2+lambda*bits;
-        if(cost<=best_cost){
-          best_token=token;
-          best_eb=-s;
-          best_bits=bits;
-          best_cost=cost;
-          qc=6;
-        }
-      }
-      else if(qc<=12){
-        e=qc*dq-c;
-        d2=e*(ogg_int32_t)e;
-        best_token=OC_DCT_VAL_CAT4;
-        best_eb=(-s<<2)+qc-9;
-        best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
-        best_cost=d2+lambda*best_bits;
-        e=8*dq-c;
-        d2=e*(ogg_int32_t)e;
-        token=best_token-1;
-        bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
-        cost=d2+lambda*bits;
-        if(cost<=best_cost){
-          best_token=token;
-          best_eb=(-s<<1)+1;
-          best_bits=bits;
-          best_cost=cost;
-          qc=8;
-        }
-      }
-      else if(qc<=20){
-        e=qc*dq-c;
-        d2=e*(ogg_int32_t)e;
-        best_token=OC_DCT_VAL_CAT5;
-        best_eb=(-s<<3)+qc-13;
-        best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
-        best_cost=d2+lambda*best_bits;
-        e=12*dq-c;
-        d2=e*(ogg_int32_t)e;
-        token=best_token-1;
-        bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
-        cost=d2+lambda*bits;
-        if(cost<=best_cost){
-          best_token=token;
-          best_eb=(-s<<2)+3;
-          best_bits=bits;
-          best_cost=cost;
-          qc=12;
-        }
-      }
-      else if(qc<=36){
-        e=qc*dq-c;
-        d2=e*(ogg_int32_t)e;
-        best_token=OC_DCT_VAL_CAT6;
-        best_eb=(-s<<4)+qc-21;
-        best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
-        best_cost=d2+lambda*best_bits;
-        e=20*dq-c;
-        d2=e*(ogg_int32_t)e;
-        token=best_token-1;
-        bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
-        cost=d2+lambda*bits;
-        if(cost<=best_cost){
-          best_token=token;
-          best_eb=(-s<<3)+7;
-          best_bits=bits;
-          best_cost=cost;
-          qc=20;
-        }
-      }
-      else if(qc<=68){
-        e=qc*dq-c;
-        d2=e*(ogg_int32_t)e;
-        best_token=OC_DCT_VAL_CAT7;
-        best_eb=(-s<<5)+qc-37;
-        best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
-        best_cost=d2+lambda*best_bits;
-        e=36*dq-c;
-        d2=e*(ogg_int32_t)e;
-        token=best_token-1;
-        bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
-        cost=d2+lambda*bits;
-        if(cost<best_cost){
-          best_token=token;
-          best_eb=(-s<<4)+15;
-          best_bits=bits;
-          best_cost=cost;
-          qc=36;
-        }
-      }
-      else{
-        e=qc*dq-c;
-        d2=e*(ogg_int32_t)e;
-        best_token=OC_DCT_VAL_CAT8;
-        best_eb=(-s<<9)+qc-69;
-        best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
-        best_cost=d2+lambda*best_bits;
-        e=68*dq-c;
-        d2=e*(ogg_int32_t)e;
-        token=best_token-1;
-        bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
-        cost=d2+lambda*bits;
-        if(cost<best_cost){
-          best_token=token;
-          best_eb=(-s<<5)+31;
-          best_bits=bits;
-          best_cost=cost;
-          qc=68;
-        }
+      e=qc*dq-c;
+      d2=e*(ogg_int32_t)e;
+      best_token=*(OC_DCT_VALUE_TOKEN_PTR+qc);
+      best_bits=oc_token_bits(_enc,huffi,zzi,best_token);
+      best_cost=d2+_lambda*best_bits;
+      alt_qc=*(OC_DCT_TRELLIS_ALT_VALUE_PTR+qc);
+      e=alt_qc*dq-c;
+      d2=e*(ogg_int32_t)e;
+      token=*(OC_DCT_VALUE_TOKEN_PTR+alt_qc);
+      bits=oc_token_bits(_enc,huffi,zzi,token);
+      cost=d2+_lambda*bits;
+      if(cost<best_cost){
+        best_token=token;
+        best_bits=bits;
+        best_cost=cost;
+        qc=alt_qc;
       }
       zzj=zzi+1&63;
       tj=best_flags>>zzj&1;
       next=(zzj<<1)+tj;
       tokens[zzi][1].next=(unsigned char)next;
       tokens[zzi][1].token=(signed char)best_token;
-      tokens[zzi][1].eb=best_eb;
+      tokens[zzi][1].eb=*(OC_DCT_VALUE_EB_PTR+qc);
       tokens[zzi][1].cost=best_cost+tokens[zzj][tj].cost;
       tokens[zzi][1].bits=best_bits+tokens[zzj][tj].bits;
-      tokens[zzi][1].qc=qc+s^s;
+      tokens[zzi][1].qc=qc;
       nzflags|=(ogg_int64_t)1<<zzi;
       best_flags|=(ogg_int64_t)1<<zzi;
     }
@@ -631,9 +702,6 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
   }
   /*Emit the tokens from the best path through the trellis.*/
   stack=*_stack;
-  /*We blow away the first entry here so that things vectorize better.
-    The DC coefficient is not actually stored in the array yet.*/
-  for(zzi=0;zzi<64;zzi++)_qdct[zzi]=0;
   dct_fzig_zag=_enc->state.opt_data.dct_fzig_zag;
   zzi=1;
   ti=best_flags>>1&1;
@@ -643,12 +711,15 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
     eob=eob_run[zzi];
     if(tokens[zzi][ti].token<OC_NDCT_EOB_TOKEN_MAX){
       if(++eob>=4095){
-        oc_enc_eob_log(_enc,_pli,zzi,eob);
+        oc_enc_token_log(_enc,_pli,zzi,OC_DCT_REPEAT_RUN3_TOKEN,eob);
         eob=0;
       }
       eob_run[zzi]=eob;
       /*We don't include the actual EOB cost for this block in the return value.
-        It will be paid for by the fragment that terminates the EOB run.*/
+        It is very likely to eventually be spread over several blocks, and
+         including it more harshly penalizes the first few blocks in a long EOB
+         run.
+        Omitting it here gives a small PSNR and SSIM gain.*/
       bits-=tokens[zzi][ti].bits;
       zzi=_zzi;
       break;
@@ -664,7 +735,7 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
     zzj=(next>>1)-1&63;
     /*TODO: It may be worth saving the dequantized coefficient in the trellis
        above; we had to compute it to measure the error anyway.*/
-    _qdct[dct_fzig_zag[zzj]]=(ogg_int16_t)(qc*(int)_dequant[zzj]);
+    _idct[dct_fzig_zag[zzj]]=(ogg_int16_t)(qc*(int)_dequant[zzj]);
     zzi=next>>1;
     ti=next&1;
   }
@@ -673,6 +744,237 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
   return bits;
 }
 
+/*Simplistic R/D tokenizer: each non-zero quantized AC coefficient is either
+   kept or (past _acmin) reduced in magnitude by one, whichever costs less.
+  The AC coefficients of _idct must be pre-initialized to zero.
+  Returns the bit estimate for the tokens logged, excluding EOB run costs.
+  It could be more accurate with more sophisticated rate predictions for zeros,
+   and faster with static lambda-derived rounding biases, not R/D decisions.*/
+int oc_enc_tokenize_ac_fast(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
+ ogg_int16_t *_idct,const ogg_int16_t *_qdct,
+ const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
+ int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin){
+  const unsigned char *dct_fzig_zag;
+  ogg_uint16_t        *eob_run;
+  oc_token_checkpoint *stack;
+  int                  huffi;
+  int                  zzi;
+  int                  zzj;
+  int                  zzk;
+  int                  total_bits;
+  int                  zr[4];
+  stack=*_stack;
+  total_bits=0;
+  /*The apparent bit-cost of coding a zero from observing the trellis
+     quantizer is pre-combined with lambda.
+    Four predictive cases are considered: the last optimized value is zero (+2)
+     or non-zero and the non-optimized value is zero (+1) or non-zero.*/
+  zr[0]=3*_lambda>>1;
+  zr[1]=_lambda;
+  zr[2]=4*_lambda;
+  zr[3]=7*_lambda>>1;
+  eob_run=_enc->eob_run[_pli];
+  dct_fzig_zag=_enc->state.opt_data.dct_fzig_zag;
+  huffi=_enc->huff_idxs[_enc->state.frame_type][1][_pli+1>>1];
+  for(zzj=zzi=1;zzj<_zzi&&!_qdct[zzj];zzj++); /*Find the first non-zero.*/
+  while(zzj<_zzi){
+    int v;
+    int d0;
+    int d1;
+    int sign;
+    int k;
+    int eob;
+    int dq0;
+    int dq1;
+    int dd0;
+    int dd1;
+    int next_zero;
+    int eob_bits;
+    int dct_fzig_zzj;
+    dct_fzig_zzj=dct_fzig_zag[zzj];
+    v=_dct[zzj];
+    d0=_qdct[zzj];
+    eob=eob_run[zzi];
+    for(zzk=zzj+1;zzk<_zzi&&!_qdct[zzk];zzk++); /*Next non-zero, or _zzi.*/
+    next_zero=zzk-zzj+62>>6; /*0 iff zzk==zzj+1.*/
+    dq0=d0*_dequant[zzj];
+    dd0=dq0-v;
+    dd0*=dd0;
+    sign=-(d0<0);
+    k=d0+sign^sign; /*k=|d0|.*/
+    d1=(k-(zzj>_acmin))+sign^sign; /*NOTE(review): '>' excludes zzj==_acmin; confirm.*/
+    dq1=d1*_dequant[zzj];
+    dd1=dq1-v;
+    dd1*=dd1;
+    /*The cost of ending an eob run is included when the alternative is to
+       extend this eob run.
+      A per qi/zzi weight would probably be useful.
+      Including it in the overall tokenization cost was not helpful.
+      The same is true at the far end of the zero run plus token case.*/
+    if(eob>0&&d1==0&&zzk==_zzi){
+      eob_bits=oc_token_bits(_enc,huffi,zzi,OC_DCT_EOB1_TOKEN);
+    }
+    else eob_bits=0;
+    if(zzj==zzi){
+      /*No active zero run.*/
+      int best_token;
+      int best_eb;
+      int token;
+      int best_bits;
+      int bits;
+      int cost;
+      best_token=*(OC_DCT_VALUE_TOKEN_PTR+d0);
+      best_bits=oc_token_bits(_enc,huffi,zzi,best_token);
+      if(d1!=0){
+        token=*(OC_DCT_VALUE_TOKEN_PTR+d1);
+        bits=oc_token_bits(_enc,huffi,zzi,token);
+        cost=dd1+(bits+eob_bits)*_lambda;
+      }
+      else{
+        token=bits=0; /*Never read: a zero d1 logs no token.*/
+        cost=dd1+zr[next_zero];
+      }
+      if((dd0+(best_bits+eob_bits)*_lambda)>cost){
+        _idct[dct_fzig_zzj]=dq1;
+        if(d1==0){
+          zzj=zzk; /*Value dropped; zzi stays, so a zero run starts here.*/
+          continue;
+        }
+        best_bits=bits;
+        best_token=token;
+        best_eb=*(OC_DCT_VALUE_EB_PTR+d1);
+      }
+      else{
+        best_eb=*(OC_DCT_VALUE_EB_PTR+d0);
+        _idct[dct_fzig_zzj]=dq0;
+      }
+      oc_enc_tokenlog_checkpoint(_enc,stack++,_pli,zzi);
+      if(eob>0){
+        oc_enc_eob_log(_enc,_pli,zzi,eob);
+        eob_run[zzi]=0;
+      }
+      oc_enc_token_log(_enc,_pli,zzi,best_token,best_eb);
+      total_bits+=best_bits;
+    }
+    else{
+      int d;
+      int dc_reserve;
+      int best_token;
+      int best_eb;
+      int best_bits;
+      int best_cost;
+      int best_bits1;
+      int best_token1;
+      int best_eb1;
+      int zr_bits;
+      int eob2;
+      int eob_bits2;
+      int bits;
+      int token;
+      int nzeros;
+      nzeros=zzj-zzi;
+      dc_reserve=zzi+62>>6; /*0 iff zzi==1.*/
+      /*A zero run, followed by the value alone.*/
+      best_token=best_token1=OC_DCT_SHORT_ZRL_TOKEN+(nzeros+55>>6);
+      best_eb=best_eb1=nzeros-1;
+      eob2=eob_run[zzj];
+      eob_bits2=eob2>0?oc_token_bits(_enc,huffi,zzj,OC_DCT_EOB1_TOKEN):0;
+      zr_bits=oc_token_bits(_enc,huffi,zzi,best_token)+eob_bits2;
+      best_bits=zr_bits
+       +oc_token_bits(_enc,huffi,zzj,*(OC_DCT_VALUE_TOKEN_PTR+d0));
+      d=d0;
+      best_bits1=0;
+      if(d1!=0){
+        best_bits1=zr_bits
+         +oc_token_bits(_enc,huffi,zzj,*(OC_DCT_VALUE_TOKEN_PTR+d1));
+      }
+      if(nzeros<17+dc_reserve){
+        if(k<=2){
+          /*+/- 1 combo token.*/
+          token=OC_DCT_RUN_CAT1_TOKEN[nzeros-1];
+          bits=oc_token_bits(_enc,huffi,zzi,token);
+          if(k==2&&bits<=best_bits1){
+            best_bits1=bits;
+            best_token1=token;
+            best_eb1=OC_DCT_RUN_CAT1_EB[nzeros-1][-sign];
+          }
+          if(k==1&&bits<=best_bits){
+            best_bits=bits;
+            best_token=token;
+            best_eb=OC_DCT_RUN_CAT1_EB[nzeros-1][-sign];
+          }
+        }
+        if(nzeros<3+dc_reserve&&2<=k&&k<=4){
+          /*+/- 2/3 combo token.*/
+          token=OC_DCT_RUN_CAT2A+(nzeros>>1);
+          bits=oc_token_bits(_enc,huffi,zzi,token);
+          if(k==4&&bits<=best_bits1){
+            best_bits1=bits;
+            best_token1=token;
+            best_eb1=OC_DCT_RUN_CAT2_EB[nzeros-1][-sign][1];
+          }
+          if(k!=4&&bits<=best_bits){
+            best_bits=bits;
+            best_token=token;
+            best_eb=OC_DCT_RUN_CAT2_EB[nzeros-1][-sign][k-2];
+          }
+        }
+      }
+      best_cost=dd0+(best_bits+eob_bits)*_lambda;
+      if(d1==0&&(dd1+zr[2+next_zero])<=best_cost){
+        zzj=zzk; /*Value dropped; the active zero run keeps growing.*/
+        continue;
+      }
+      if(d1!=0&&dd1+(best_bits1+eob_bits)*_lambda<best_cost){
+        best_bits=best_bits1;
+        best_token=best_token1;
+        best_eb=best_eb1;
+        d=d1;
+        _idct[dct_fzig_zzj]=dq1;
+      }
+      else _idct[dct_fzig_zzj]=dq0;
+      oc_enc_tokenlog_checkpoint(_enc,stack++,_pli,zzi);
+      if(eob){
+        oc_enc_eob_log(_enc,_pli,zzi,eob);
+        eob_run[zzi]=0;
+      }
+      oc_enc_token_log(_enc,_pli,zzi,best_token,best_eb);
+      /*If a zero run won vs. the combo token we still need to code this
+         value.*/
+      if(best_token<=OC_DCT_ZRL_TOKEN){
+        oc_enc_tokenlog_checkpoint(_enc,stack++,_pli,zzj);
+        if(eob2){
+          oc_enc_eob_log(_enc,_pli,zzj,eob2);
+          /*The cost of any EOB run we disrupted is ignored because doing so
+             improved PSNR/SSIM by a small amount.*/
+          best_bits-=eob_bits2;
+          eob_run[zzj]=0;
+        }
+        oc_enc_token_log(_enc,_pli,zzj,
+         *(OC_DCT_VALUE_TOKEN_PTR+d),*(OC_DCT_VALUE_EB_PTR+d));
+      }
+      total_bits+=best_bits;
+    }
+    zzi=zzj+1;
+    zzj=zzk;
+  }
+  /*Code an EOB run to complete this block.
+    The cost of the EOB run is not included in the total, as explained in
+     a comment in the trellis tokenizer above.*/
+  if(zzi<64){
+    int eob;
+    eob=eob_run[zzi]+1;
+    oc_enc_tokenlog_checkpoint(_enc,stack++,_pli,zzi);
+    if(eob>=4095){
+      oc_enc_token_log(_enc,_pli,zzi,OC_DCT_REPEAT_RUN3_TOKEN,eob);
+      eob=0;
+    }
+    eob_run[zzi]=eob;
+  }
+  *_stack=stack;
+  return total_bits;
+}
+
 void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,
  int _pli,int _fragy0,int _frag_yend){
   const oc_fragment_plane *fplane;
@@ -695,10 +997,10 @@ void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,
          predictor for the same reference frame.*/
       for(fragx=0;fragx<nhfrags;fragx++,fragi++){
         if(frags[fragi].coded){
-          int ref;
-          ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
-          frag_dc[fragi]=(ogg_int16_t)(frags[fragi].dc-pred_last[ref]);
-          pred_last[ref]=frags[fragi].dc;
+          int refi;
+          refi=frags[fragi].refi;
+          frag_dc[fragi]=(ogg_int16_t)(frags[fragi].dc-pred_last[refi]);
+          pred_last[refi]=frags[fragi].dc;
         }
       }
     }
@@ -710,27 +1012,24 @@ void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,
       u_frags=frags-nhfrags;
       l_ref=-1;
       ul_ref=-1;
-      u_ref=u_frags[fragi].coded?OC_FRAME_FOR_MODE(u_frags[fragi].mb_mode):-1;
+      u_ref=u_frags[fragi].refi;
       for(fragx=0;fragx<nhfrags;fragx++,fragi++){
         int ur_ref;
         if(fragx+1>=nhfrags)ur_ref=-1;
-        else{
-          ur_ref=u_frags[fragi+1].coded?
-           OC_FRAME_FOR_MODE(u_frags[fragi+1].mb_mode):-1;
-        }
+        else ur_ref=u_frags[fragi+1].refi;
         if(frags[fragi].coded){
           int pred;
-          int ref;
-          ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
+          int refi;
+          refi=frags[fragi].refi;
           /*We break out a separate case based on which of our neighbors use
              the same reference frames.
             This is somewhat faster than trying to make a generic case which
              handles all of them, since it reduces lots of poorly predicted
              jumps to one switch statement, and also lets a number of the
              multiplications be optimized out by strength reduction.*/
-          switch((l_ref==ref)|(ul_ref==ref)<<1|
-           (u_ref==ref)<<2|(ur_ref==ref)<<3){
-            default:pred=pred_last[ref];break;
+          switch((l_ref==refi)|(ul_ref==refi)<<1|
+           (u_ref==refi)<<2|(ur_ref==refi)<<3){
+            default:pred=pred_last[refi];break;
             case  1:
             case  3:pred=frags[fragi-1].dc;break;
             case  2:pred=u_frags[fragi-1].dc;break;
@@ -764,8 +1063,8 @@ void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,
             }break;
           }
           frag_dc[fragi]=(ogg_int16_t)(frags[fragi].dc-pred);
-          pred_last[ref]=frags[fragi].dc;
-          l_ref=ref;
+          pred_last[refi]=frags[fragi].dc;
+          l_ref=refi;
         }
         else l_ref=-1;
         ul_ref=u_ref;
@@ -850,9 +1149,8 @@ void oc_enc_tokenize_dc_frag_list(oc_enc_ctx *_enc,int _pli,
         ti0++;
         eob_run0=0;
       }
-      token=oc_make_dct_token_full(0,0,val,&eb);
-      dct_tokens0[ti0]=(unsigned char)token;
-      extra_bits0[ti0]=(ogg_uint16_t)eb;
+      dct_tokens0[ti0]=*(OC_DCT_VALUE_TOKEN_PTR+val);
+      extra_bits0[ti0]=*(OC_DCT_VALUE_EB_PTR+val);
       ti0++;
     }
     else{
@@ -863,9 +1161,8 @@ void oc_enc_tokenize_dc_frag_list(oc_enc_ctx *_enc,int _pli,
         /*We're in the middle of an active EOB run in stack 1.
           Move it to stack 0.*/
         if(++eob_run0>=4095){
-          token=oc_make_eob_token_full(eob_run0,&eb);
-          dct_tokens0[ti0]=(unsigned char)token;
-          extra_bits0[ti0]=(ogg_uint16_t)eb;
+          dct_tokens0[ti0]=OC_DCT_REPEAT_RUN3_TOKEN;
+          extra_bits0[ti0]=eob_run0;
           ti0++;
           eob_run0=0;
         }
@@ -996,9 +1293,8 @@ void oc_enc_tokenize_dc_frag_list(oc_enc_ctx *_enc,int _pli,
       neobs1--;
       /*If we have more than 4095 EOBs outstanding in stack1, flush the run.*/
       if(eob_run1-neobs1>=4095){
-        token=oc_make_eob_token_full(4095,&eb);
-        dct_tokens1[ti1w]=(unsigned char)token;
-        extra_bits1[ti1w]=(ogg_uint16_t)eb;
+        dct_tokens1[ti1w]=OC_DCT_REPEAT_RUN3_TOKEN;
+        extra_bits1[ti1w]=4095;
         ti1w++;
         eob_run1-=4095;
       }
diff --git a/thirdparty/libtheora/x86/mmxencfrag.c b/thirdparty/libtheora/x86/mmxencfrag.c
index c79ff01fcc..cc9be8d867 100644
--- a/thirdparty/libtheora/x86/mmxencfrag.c
+++ b/thirdparty/libtheora/x86/mmxencfrag.c
@@ -65,7 +65,7 @@ unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
     "paddw %%mm6,%%mm0\n\t"
     "paddw %%mm2,%%mm0\n\t"
     "movd %%mm0,%[ret]\n\t"
-    :[ret]"=a"(ret),[src]"+%r"(_src),[ref]"+r"(_ref),[ystride3]"=&r"(ystride3)
+    :[ret]"=a"(ret),[src]"+r"(_src),[ref]"+r"(_ref),[ystride3]"=&r"(ystride3)
     :[ystride]"r"((ptrdiff_t)_ystride)
   );
   return (unsigned)ret;
@@ -87,7 +87,9 @@ unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
    The latter is exactly 1 too large when the low bit of two corresponding \
     bytes is only set in one of them. \
    Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
-    correct the output of pavgb.*/ \
+    correct the output of pavgb. \
+   TODO: This should be rewritten to compute ~pavgb(~a,~b) instead, which \
+    schedules better; currently, however, this function is unused.*/ \
  "movq %%mm0,%%mm6\n\t" \
  "lea (%[ref1],%[ystride],2),%[ref1]\n\t" \
  "pxor %%mm1,%%mm0\n\t" \
@@ -153,7 +155,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
     OC_SAD2_LOOP
     OC_SAD2_LOOP
     OC_SAD2_TAIL
-    :[ret]"=&a"(ret),[src]"+r"(_src),[ref1]"+%r"(_ref1),[ref2]"+r"(_ref2)
+    :[ret]"=&a"(ret),[src]"+r"(_src),[ref1]"+r"(_ref1),[ref2]"+r"(_ref2)
     :[ystride]"r"((ptrdiff_t)_ystride)
   );
   return (unsigned)ret;
@@ -163,54 +165,54 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
    16-bit difference in %%mm0...%%mm7.*/
 #define OC_LOAD_SUB_8x4(_off) \
  "#OC_LOAD_SUB_8x4\n\t" \
- "movd "_off"(%[src]),%%mm0\n\t" \
- "movd "_off"(%[ref]),%%mm4\n\t" \
- "movd "_off"(%[src],%[src_ystride]),%%mm1\n\t" \
+ "movd "#_off"(%[src]),%%mm0\n\t" \
+ "movd "#_off"(%[ref]),%%mm4\n\t" \
+ "movd "#_off"(%[src],%[src_ystride]),%%mm1\n\t" \
  "lea (%[src],%[src_ystride],2),%[src]\n\t" \
- "movd "_off"(%[ref],%[ref_ystride]),%%mm5\n\t" \
+ "movd "#_off"(%[ref],%[ref_ystride]),%%mm5\n\t" \
  "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
- "movd "_off"(%[src]),%%mm2\n\t" \
- "movd "_off"(%[ref]),%%mm7\n\t" \
- "movd "_off"(%[src],%[src_ystride]),%%mm3\n\t" \
- "movd "_off"(%[ref],%[ref_ystride]),%%mm6\n\t" \
+ "movd "#_off"(%[src]),%%mm2\n\t" \
+ "movd "#_off"(%[ref]),%%mm7\n\t" \
+ "movd "#_off"(%[src],%[src_ystride]),%%mm3\n\t" \
+ "movd "#_off"(%[ref],%[ref_ystride]),%%mm6\n\t" \
  "punpcklbw %%mm4,%%mm0\n\t" \
  "lea (%[src],%[src_ystride],2),%[src]\n\t" \
  "punpcklbw %%mm4,%%mm4\n\t" \
  "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
  "psubw %%mm4,%%mm0\n\t" \
- "movd "_off"(%[src]),%%mm4\n\t" \
- "movq %%mm0,"_off"*2(%[buf])\n\t" \
- "movd "_off"(%[ref]),%%mm0\n\t" \
+ "movd "#_off"(%[src]),%%mm4\n\t" \
+ "movq %%mm0,"OC_MEM_OFFS(_off*2,buf)"\n\t" \
+ "movd "#_off"(%[ref]),%%mm0\n\t" \
  "punpcklbw %%mm5,%%mm1\n\t" \
  "punpcklbw %%mm5,%%mm5\n\t" \
  "psubw %%mm5,%%mm1\n\t" \
- "movd "_off"(%[src],%[src_ystride]),%%mm5\n\t" \
+ "movd "#_off"(%[src],%[src_ystride]),%%mm5\n\t" \
  "punpcklbw %%mm7,%%mm2\n\t" \
  "punpcklbw %%mm7,%%mm7\n\t" \
  "psubw %%mm7,%%mm2\n\t" \
- "movd "_off"(%[ref],%[ref_ystride]),%%mm7\n\t" \
+ "movd "#_off"(%[ref],%[ref_ystride]),%%mm7\n\t" \
  "punpcklbw %%mm6,%%mm3\n\t" \
  "lea (%[src],%[src_ystride],2),%[src]\n\t" \
  "punpcklbw %%mm6,%%mm6\n\t" \
  "psubw %%mm6,%%mm3\n\t" \
- "movd "_off"(%[src]),%%mm6\n\t" \
+ "movd "#_off"(%[src]),%%mm6\n\t" \
  "punpcklbw %%mm0,%%mm4\n\t" \
  "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
  "punpcklbw %%mm0,%%mm0\n\t" \
  "lea (%[src],%[src_ystride],2),%[src]\n\t" \
  "psubw %%mm0,%%mm4\n\t" \
- "movd "_off"(%[ref]),%%mm0\n\t" \
+ "movd "#_off"(%[ref]),%%mm0\n\t" \
  "punpcklbw %%mm7,%%mm5\n\t" \
  "neg %[src_ystride]\n\t" \
  "punpcklbw %%mm7,%%mm7\n\t" \
  "psubw %%mm7,%%mm5\n\t" \
- "movd "_off"(%[src],%[src_ystride]),%%mm7\n\t" \
+ "movd "#_off"(%[src],%[src_ystride]),%%mm7\n\t" \
  "punpcklbw %%mm0,%%mm6\n\t" \
  "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
  "punpcklbw %%mm0,%%mm0\n\t" \
  "neg %[ref_ystride]\n\t" \
  "psubw %%mm0,%%mm6\n\t" \
- "movd "_off"(%[ref],%[ref_ystride]),%%mm0\n\t" \
+ "movd "#_off"(%[ref],%[ref_ystride]),%%mm0\n\t" \
  "lea (%[src],%[src_ystride],8),%[src]\n\t" \
  "punpcklbw %%mm0,%%mm7\n\t" \
  "neg %[src_ystride]\n\t" \
@@ -218,24 +220,24 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
  "lea (%[ref],%[ref_ystride],8),%[ref]\n\t" \
  "psubw %%mm0,%%mm7\n\t" \
  "neg %[ref_ystride]\n\t" \
- "movq "_off"*2(%[buf]),%%mm0\n\t" \
+ "movq "OC_MEM_OFFS(_off*2,buf)",%%mm0\n\t" \
 
 /*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.*/
 #define OC_LOAD_8x4(_off) \
  "#OC_LOAD_8x4\n\t" \
- "movd "_off"(%[src]),%%mm0\n\t" \
- "movd "_off"(%[src],%[ystride]),%%mm1\n\t" \
- "movd "_off"(%[src],%[ystride],2),%%mm2\n\t" \
+ "movd "#_off"(%[src]),%%mm0\n\t" \
+ "movd "#_off"(%[src],%[ystride]),%%mm1\n\t" \
+ "movd "#_off"(%[src],%[ystride],2),%%mm2\n\t" \
  "pxor %%mm7,%%mm7\n\t" \
- "movd "_off"(%[src],%[ystride3]),%%mm3\n\t" \
+ "movd "#_off"(%[src],%[ystride3]),%%mm3\n\t" \
  "punpcklbw %%mm7,%%mm0\n\t" \
- "movd "_off"(%[src4]),%%mm4\n\t" \
+ "movd "#_off"(%[src4]),%%mm4\n\t" \
  "punpcklbw %%mm7,%%mm1\n\t" \
- "movd "_off"(%[src4],%[ystride]),%%mm5\n\t" \
+ "movd "#_off"(%[src4],%[ystride]),%%mm5\n\t" \
  "punpcklbw %%mm7,%%mm2\n\t" \
- "movd "_off"(%[src4],%[ystride],2),%%mm6\n\t" \
+ "movd "#_off"(%[src4],%[ystride],2),%%mm6\n\t" \
  "punpcklbw %%mm7,%%mm3\n\t" \
- "movd "_off"(%[src4],%[ystride3]),%%mm7\n\t" \
+ "movd "#_off"(%[src4],%[ystride3]),%%mm7\n\t" \
  "punpcklbw %%mm4,%%mm4\n\t" \
  "punpcklbw %%mm5,%%mm5\n\t" \
  "psrlw $8,%%mm4\n\t" \
@@ -248,7 +250,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
 /*Performs the first two stages of an 8-point 1-D Hadamard transform.
   The transform is performed in place, except that outputs 0-3 are swapped with
    outputs 4-7.
-  Outputs 2, 3, 6 and 7 from the second stage are negated (which allows us to
+  Outputs 2, 3, 6, and 7 from the second stage are negated (which allows us to
    perform this stage in place with no temporary registers).*/
 #define OC_HADAMARD_AB_8x4 \
  "#OC_HADAMARD_AB_8x4\n\t" \
@@ -281,7 +283,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
  "psubw %%mm5,%%mm7\n\t" \
 
 /*Performs the last stage of an 8-point 1-D Hadamard transform in place.
-  Ouputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
+  Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
    place with no temporary registers).*/
 #define OC_HADAMARD_C_8x4 \
  "#OC_HADAMARD_C_8x4\n\t" \
@@ -324,8 +326,8 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
    Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
    This implementation is only 26 (+4 for spilling registers).*/ \
  "#OC_HADAMARD_C_ABS_ACCUM_A_8x4\n\t" \
- "movq %%mm7,"_r7"(%[buf])\n\t" \
- "movq %%mm6,"_r6"(%[buf])\n\t" \
+ "movq %%mm7,"OC_MEM_OFFS(_r7,buf)"\n\t" \
+ "movq %%mm6,"OC_MEM_OFFS(_r6,buf)"\n\t" \
  /*mm7={0x7FFF}x4 \
    mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
  "pcmpeqb %%mm7,%%mm7\n\t" \
@@ -343,14 +345,14 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
  "pmaxsw %%mm5,%%mm4\n\t" \
  "paddw %%mm3,%%mm6\n\t" \
  "paddw %%mm5,%%mm1\n\t" \
- "movq "_r7"(%[buf]),%%mm3\n\t" \
+ "movq "OC_MEM_OFFS(_r7,buf)",%%mm3\n\t" \
 
 /*Performs the second part of the final stage of the Hadamard transform and
    summing of absolute values.*/
 #define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
  "#OC_HADAMARD_C_ABS_ACCUM_B_8x4\n\t" \
  "paddsw %%mm7,%%mm6\n\t" \
- "movq "_r6"(%[buf]),%%mm5\n\t" \
+ "movq "OC_MEM_OFFS(_r6,buf)",%%mm5\n\t" \
  "paddsw %%mm7,%%mm1\n\t" \
  "psubw %%mm6,%%mm2\n\t" \
  "psubw %%mm1,%%mm4\n\t" \
@@ -391,7 +393,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
 #define OC_TRANSPOSE_4x4x2(_off) \
  "#OC_TRANSPOSE_4x4x2\n\t" \
  /*First 4x4 transpose:*/ \
- "movq %%mm5,0x10+"_off"(%[buf])\n\t" \
+ "movq %%mm5,"OC_MEM_OFFS(0x10+(_off),buf)"\n\t" \
  /*mm0 = e3 e2 e1 e0 \
    mm1 = f3 f2 f1 f0 \
    mm2 = g3 g2 g1 g0 \
@@ -411,13 +413,13 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
  "punpckhdq %%mm2,%%mm1\n\t" \
  "movq %%mm3,%%mm2\n\t" \
  "punpckhdq %%mm5,%%mm3\n\t" \
- "movq %%mm0,0x40+"_off"(%[buf])\n\t" \
+ "movq %%mm0,"OC_MEM_OFFS(0x40+(_off),buf)"\n\t" \
  "punpckldq %%mm5,%%mm2\n\t" \
  /*mm0 = h0 g0 f0 e0 \
    mm1 = h1 g1 f1 e1 \
    mm2 = h2 g2 f2 e2 \
    mm3 = h3 g3 f3 e3*/ \
- "movq 0x10+"_off"(%[buf]),%%mm5\n\t" \
+ "movq "OC_MEM_OFFS(0x10+(_off),buf)",%%mm5\n\t" \
  /*Second 4x4 transpose:*/ \
  /*mm4 = a3 a2 a1 a0 \
    mm5 = b3 b2 b1 b0 \
@@ -425,11 +427,11 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
    mm7 = d3 d2 d1 d0*/ \
  "movq %%mm6,%%mm0\n\t" \
  "punpcklwd %%mm7,%%mm6\n\t" \
- "movq %%mm1,0x50+"_off"(%[buf])\n\t" \
+ "movq %%mm1,"OC_MEM_OFFS(0x50+(_off),buf)"\n\t" \
  "punpckhwd %%mm7,%%mm0\n\t" \
  "movq %%mm4,%%mm7\n\t" \
  "punpcklwd %%mm5,%%mm4\n\t" \
- "movq %%mm2,0x60+"_off"(%[buf])\n\t" \
+ "movq %%mm2,"OC_MEM_OFFS(0x60+(_off),buf)"\n\t" \
  "punpckhwd %%mm5,%%mm7\n\t" \
  /*mm4 = b1 a1 b0 a0 \
    mm7 = b3 a3 b2 a2 \
@@ -437,7 +439,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
    mm0 = d3 c3 d2 c2*/ \
  "movq %%mm4,%%mm5\n\t" \
  "punpckldq %%mm6,%%mm4\n\t" \
- "movq %%mm3,0x70+"_off"(%[buf])\n\t" \
+ "movq %%mm3,"OC_MEM_OFFS(0x70+(_off),buf)"\n\t" \
  "punpckhdq %%mm6,%%mm5\n\t" \
  "movq %%mm7,%%mm6\n\t" \
  "punpckhdq %%mm0,%%mm7\n\t" \
@@ -447,100 +449,102 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
    mm6 = d2 c2 b2 a2 \
    mm7 = d3 c3 b3 a3*/ \
 
-static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
- int _src_ystride,const unsigned char *_ref,int _ref_ystride,unsigned _thresh){
-  OC_ALIGN8(ogg_int16_t  buf[64]);
-  ogg_int16_t *bufp;
-  unsigned     ret;
-  unsigned     ret2;
-  bufp=buf;
+static unsigned oc_int_frag_satd_mmxext(int *_dc,
+ const unsigned char *_src,int _src_ystride,
+ const unsigned char *_ref,int _ref_ystride){
+  OC_ALIGN8(ogg_int16_t buf[64]);
+  unsigned ret;
+  unsigned ret2;
+  int      dc;
   __asm__ __volatile__(
-    OC_LOAD_SUB_8x4("0x00")
+    OC_LOAD_SUB_8x4(0x00)
     OC_HADAMARD_8x4
-    OC_TRANSPOSE_4x4x2("0x00")
+    OC_TRANSPOSE_4x4x2(0x00)
     /*Finish swapping out this 8x4 block to make room for the next one.
       mm0...mm3 have been swapped out already.*/
-    "movq %%mm4,0x00(%[buf])\n\t"
-    "movq %%mm5,0x10(%[buf])\n\t"
-    "movq %%mm6,0x20(%[buf])\n\t"
-    "movq %%mm7,0x30(%[buf])\n\t"
-    OC_LOAD_SUB_8x4("0x04")
+    "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
+    "movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t"
+    "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
+    "movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t"
+    OC_LOAD_SUB_8x4(0x04)
     OC_HADAMARD_8x4
-    OC_TRANSPOSE_4x4x2("0x08")
+    OC_TRANSPOSE_4x4x2(0x08)
     /*Here the first 4x4 block of output from the last transpose is the second
        4x4 block of input for the next transform.
       We have cleverly arranged that it already be in the appropriate place, so
        we only have to do half the loads.*/
-    "movq 0x10(%[buf]),%%mm1\n\t"
-    "movq 0x20(%[buf]),%%mm2\n\t"
-    "movq 0x30(%[buf]),%%mm3\n\t"
-    "movq 0x00(%[buf]),%%mm0\n\t"
-    OC_HADAMARD_ABS_ACCUM_8x4("0x28","0x38")
+    "movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t"
+    "movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t"
+    "movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t"
+    "movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t"
+    /*We split out the stages here so we can save the DC coefficient in the
+       middle.*/
+    OC_HADAMARD_AB_8x4
+    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
+    "movd %%mm1,%[dc]\n\t"
+    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
     /*Up to this point, everything fit in 16 bits (8 input + 1 for the
        difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
        for the factor of two we dropped + 3 for the vertical accumulation).
       Now we finally have to promote things to dwords.
       We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
        latency of pmaddwd by starting the next series of loads now.*/
-    "mov %[thresh],%[ret2]\n\t"
     "pmaddwd %%mm7,%%mm0\n\t"
-    "movq 0x50(%[buf]),%%mm1\n\t"
-    "movq 0x58(%[buf]),%%mm5\n\t"
+    "movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t"
+    "movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t"
     "movq %%mm0,%%mm4\n\t"
-    "movq 0x60(%[buf]),%%mm2\n\t"
+    "movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t"
     "punpckhdq %%mm0,%%mm0\n\t"
-    "movq 0x68(%[buf]),%%mm6\n\t"
+    "movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t"
     "paddd %%mm0,%%mm4\n\t"
-    "movq 0x70(%[buf]),%%mm3\n\t"
-    "movd %%mm4,%[ret]\n\t"
-    "movq 0x78(%[buf]),%%mm7\n\t"
-    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
-       added to them, and a factor of two removed; correct the final sum here.*/
-    "lea -32(%[ret],%[ret]),%[ret]\n\t"
-    "movq 0x40(%[buf]),%%mm0\n\t"
-    "cmp %[ret2],%[ret]\n\t"
-    "movq 0x48(%[buf]),%%mm4\n\t"
-    "jae 1f\n\t"
-    OC_HADAMARD_ABS_ACCUM_8x4("0x68","0x78")
+    "movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t"
+    "movd %%mm4,%[ret2]\n\t"
+    "movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t"
+    "movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t"
+    "movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t"
+    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
     "pmaddwd %%mm7,%%mm0\n\t"
-    /*There isn't much to stick in here to hide the latency this time, but the
-       alternative to pmaddwd is movq->punpcklwd->punpckhwd->paddd, whose
-       latency is even worse.*/
-    "sub $32,%[ret]\n\t"
+    /*Subtract abs(dc) from 2*ret2.*/
+    "movsx %w[dc],%[dc]\n\t"
+    "cdq\n\t"
+    "lea (%[ret],%[ret2],2),%[ret2]\n\t"
     "movq %%mm0,%%mm4\n\t"
     "punpckhdq %%mm0,%%mm0\n\t"
+    "xor %[dc],%[ret]\n\t"
     "paddd %%mm0,%%mm4\n\t"
-    "movd %%mm4,%[ret2]\n\t"
-    "lea (%[ret],%[ret2],2),%[ret]\n\t"
-    ".p2align 4,,15\n\t"
-    "1:\n\t"
-    /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
+    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
+       added to them, a factor of two removed, and the DC value included;
+       correct the final sum here.*/
+    "sub %[ret],%[ret2]\n\t"
+    "movd %%mm4,%[ret]\n\t"
+    "lea -64(%[ret2],%[ret],2),%[ret]\n\t"
+    /*Although it looks like we're using 8 registers here, gcc can alias %[ret]
        and %[ret2] with some of the inputs, since for once we don't write to
-       them until after we're done using everything but %[buf] (which is also
-       listed as an output to ensure gcc _doesn't_ alias them against it).*/
+       them until after we're done using everything but %[buf].*/
     /*Note that _src_ystride and _ref_ystride must be given non-overlapping
        constraints, otherewise if gcc can prove they're equal it will allocate
        them to the same register (which is bad); _src and _ref face a similar
        problem, though those are never actually the same.*/
-    :[ret]"=a"(ret),[ret2]"=r"(ret2),[buf]"+r"(bufp)
+    :[ret]"=d"(ret),[ret2]"=r"(ret2),[dc]"=a"(dc),
+     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
     :[src]"r"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
-     [ref]"r"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride),
-     [thresh]"m"(_thresh)
+     [ref]"r"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
     /*We have to use neg, so we actually clobber the condition codes for once
        (not to mention cmp, sub, and add).*/
     :"cc"
   );
+  *_dc=dc;
   return ret;
 }
 
-unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src,
- const unsigned char *_ref,int _ystride,unsigned _thresh){
-  return oc_int_frag_satd_thresh_mmxext(_src,_ystride,_ref,_ystride,_thresh);
+unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  return oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride);
 }
 
 /*Our internal implementation of frag_copy2 takes an extra stride parameter so
-   we can share code with oc_enc_frag_satd2_thresh_mmxext().*/
-static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
+   we can share code with oc_enc_frag_satd2_mmxext().*/
+void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
  const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
   __asm__ __volatile__(
     /*Load the first 3 rows.*/
@@ -649,55 +653,53 @@ static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
     "psubb %%mm4,%%mm2\n\t"
     /*%%mm2 (row 7) is done, write it out.*/
     "movq %%mm2,(%[dst],%[dst_ystride])\n\t"
-    :[dst]"+r"(_dst),[src1]"+%r"(_src1),[src2]"+r"(_src2)
+    :[dst]"+r"(_dst),[src1]"+r"(_src1),[src2]"+r"(_src2)
     :[dst_ystride]"r"((ptrdiff_t)_dst_ystride),
      [src_ystride]"r"((ptrdiff_t)_src_ystride)
     :"memory"
   );
 }
 
-unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src,
- const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
- unsigned _thresh){
+unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
   OC_ALIGN8(unsigned char ref[64]);
   oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
-  return oc_int_frag_satd_thresh_mmxext(_src,_ystride,ref,8,_thresh);
+  return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8);
 }
 
-unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
- int _ystride){
-  OC_ALIGN8(ogg_int16_t  buf[64]);
-  ogg_int16_t *bufp;
-  unsigned     ret;
-  unsigned     ret2;
-  bufp=buf;
+unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,
+ const unsigned char *_src,int _ystride){
+  OC_ALIGN8(ogg_int16_t buf[64]);
+  unsigned ret;
+  unsigned ret2;
+  int      dc;
   __asm__ __volatile__(
-    OC_LOAD_8x4("0x00")
+    OC_LOAD_8x4(0x00)
     OC_HADAMARD_8x4
-    OC_TRANSPOSE_4x4x2("0x00")
+    OC_TRANSPOSE_4x4x2(0x00)
     /*Finish swapping out this 8x4 block to make room for the next one.
       mm0...mm3 have been swapped out already.*/
-    "movq %%mm4,0x00(%[buf])\n\t"
-    "movq %%mm5,0x10(%[buf])\n\t"
-    "movq %%mm6,0x20(%[buf])\n\t"
-    "movq %%mm7,0x30(%[buf])\n\t"
-    OC_LOAD_8x4("0x04")
+    "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
+    "movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t"
+    "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
+    "movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t"
+    OC_LOAD_8x4(0x04)
     OC_HADAMARD_8x4
-    OC_TRANSPOSE_4x4x2("0x08")
+    OC_TRANSPOSE_4x4x2(0x08)
     /*Here the first 4x4 block of output from the last transpose is the second
        4x4 block of input for the next transform.
       We have cleverly arranged that it already be in the appropriate place, so
        we only have to do half the loads.*/
-    "movq 0x10(%[buf]),%%mm1\n\t"
-    "movq 0x20(%[buf]),%%mm2\n\t"
-    "movq 0x30(%[buf]),%%mm3\n\t"
-    "movq 0x00(%[buf]),%%mm0\n\t"
+    "movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t"
+    "movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t"
+    "movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t"
+    "movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t"
     /*We split out the stages here so we can save the DC coefficient in the
        middle.*/
     OC_HADAMARD_AB_8x4
-    OC_HADAMARD_C_ABS_ACCUM_A_8x4("0x28","0x38")
-    "movd %%mm1,%[ret]\n\t"
-    OC_HADAMARD_C_ABS_ACCUM_B_8x4("0x28","0x38")
+    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
+    "movd %%mm1,%[dc]\n\t"
+    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
     /*Up to this point, everything fit in 16 bits (8 input + 1 for the
        difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
        for the factor of two we dropped + 3 for the vertical accumulation).
@@ -705,41 +707,43 @@ unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
       We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
        latency of pmaddwd by starting the next series of loads now.*/
     "pmaddwd %%mm7,%%mm0\n\t"
-    "movq 0x50(%[buf]),%%mm1\n\t"
-    "movq 0x58(%[buf]),%%mm5\n\t"
-    "movq 0x60(%[buf]),%%mm2\n\t"
+    "movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t"
+    "movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t"
+    "movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t"
     "movq %%mm0,%%mm4\n\t"
-    "movq 0x68(%[buf]),%%mm6\n\t"
+    "movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t"
     "punpckhdq %%mm0,%%mm0\n\t"
-    "movq 0x70(%[buf]),%%mm3\n\t"
+    "movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t"
     "paddd %%mm0,%%mm4\n\t"
-    "movq 0x78(%[buf]),%%mm7\n\t"
-    "movd %%mm4,%[ret2]\n\t"
-    "movq 0x40(%[buf]),%%mm0\n\t"
-    "movq 0x48(%[buf]),%%mm4\n\t"
-    OC_HADAMARD_ABS_ACCUM_8x4("0x68","0x78")
+    "movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t"
+    "movd %%mm4,%[ret]\n\t"
+    "movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t"
+    "movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t"
+    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
     "pmaddwd %%mm7,%%mm0\n\t"
     /*We assume that the DC coefficient is always positive (which is true,
        because the input to the INTRA transform was not a difference).*/
-    "movzx %w[ret],%[ret]\n\t"
-    "add %[ret2],%[ret2]\n\t"
-    "sub %[ret],%[ret2]\n\t"
+    "movzx %w[dc],%[dc]\n\t"
+    "add %[ret],%[ret]\n\t"
+    "sub %[dc],%[ret]\n\t"
     "movq %%mm0,%%mm4\n\t"
     "punpckhdq %%mm0,%%mm0\n\t"
     "paddd %%mm0,%%mm4\n\t"
-    "movd %%mm4,%[ret]\n\t"
-    "lea -64(%[ret2],%[ret],2),%[ret]\n\t"
-    /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
+    "movd %%mm4,%[ret2]\n\t"
+    "lea -64(%[ret],%[ret2],2),%[ret]\n\t"
+    /*Although it looks like we're using 8 registers here, gcc can alias %[ret]
        and %[ret2] with some of the inputs, since for once we don't write to
        them until after we're done using everything but %[buf] (which is also
        listed as an output to ensure gcc _doesn't_ alias them against it).*/
-    :[ret]"=a"(ret),[ret2]"=r"(ret2),[buf]"+r"(bufp)
+    :[ret]"=a"(ret),[ret2]"=r"(ret2),[dc]"=r"(dc),
+     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
     :[src]"r"(_src),[src4]"r"(_src+4*_ystride),
      [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
     /*We have to use sub, so we actually clobber the condition codes for once
        (not to mention add).*/
     :"cc"
   );
+  *_dc=dc;
   return ret;
 }
 
diff --git a/thirdparty/libtheora/x86/mmxfdct.c b/thirdparty/libtheora/x86/mmxfdct.c
index 211875255e..17668358b8 100644
--- a/thirdparty/libtheora/x86/mmxfdct.c
+++ b/thirdparty/libtheora/x86/mmxfdct.c
@@ -12,6 +12,7 @@
 /*MMX fDCT implementation for x86_32*/
 /*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
 #include "x86enc.h"
+#include "x86zigzag.h"
 
 #if defined(OC_X86_ASM)
 
@@ -462,8 +463,9 @@
    mm7 = d3 c3 b3 a3*/ \
 
 /*MMX implementation of the fDCT.*/
-void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
-  ptrdiff_t a;
+void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+  OC_ALIGN8(ogg_int16_t buf[64]);
+  ptrdiff_t   a;
   __asm__ __volatile__(
     /*Add two extra bits of working precision to improve accuracy; any more and
        we could overflow.*/
@@ -586,77 +588,88 @@ void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
     "movq 0x30(%[y]),%%mm3\n\t"
     OC_FDCT_STAGE1_8x4
     OC_FDCT8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38")
-    OC_TRANSPOSE8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38")
-    /*mm0={-2}x4*/
-    "pcmpeqw %%mm0,%%mm0\n\t"
-    "paddw %%mm0,%%mm0\n\t"
-    /*Round the results.*/
-    "psubw %%mm0,%%mm1\n\t"
-    "psubw %%mm0,%%mm2\n\t"
-    "psraw $2,%%mm1\n\t"
-    "psubw %%mm0,%%mm3\n\t"
-    "movq %%mm1,0x18(%[y])\n\t"
-    "psraw $2,%%mm2\n\t"
-    "psubw %%mm0,%%mm4\n\t"
-    "movq 0x08(%[y]),%%mm1\n\t"
-    "psraw $2,%%mm3\n\t"
-    "psubw %%mm0,%%mm5\n\t"
+    /*mm2={-2}x4*/
+    "pcmpeqw %%mm2,%%mm2\n\t"
+    "paddw %%mm2,%%mm2\n\t"
+    /*Round and store the results (no transpose).*/
+    "movq 0x10(%[y]),%%mm7\n\t"
+    "psubw %%mm2,%%mm4\n\t"
+    "psubw %%mm2,%%mm6\n\t"
     "psraw $2,%%mm4\n\t"
-    "psubw %%mm0,%%mm6\n\t"
-    "psraw $2,%%mm5\n\t"
-    "psubw %%mm0,%%mm7\n\t"
+    "psubw %%mm2,%%mm0\n\t"
+    "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
+    "movq 0x30(%[y]),%%mm4\n\t"
     "psraw $2,%%mm6\n\t"
-    "psubw %%mm0,%%mm1\n\t"
+    "psubw %%mm2,%%mm5\n\t"
+    "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
+    "psraw $2,%%mm0\n\t"
+    "psubw %%mm2,%%mm3\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x40,buf)"\n\t"
+    "psraw $2,%%mm5\n\t"
+    "psubw %%mm2,%%mm1\n\t"
+    "movq %%mm5,"OC_MEM_OFFS(0x50,buf)"\n\t"
+    "psraw $2,%%mm3\n\t"
+    "psubw %%mm2,%%mm7\n\t"
+    "movq %%mm3,"OC_MEM_OFFS(0x60,buf)"\n\t"
+    "psraw $2,%%mm1\n\t"
+    "psubw %%mm2,%%mm4\n\t"
+    "movq %%mm1,"OC_MEM_OFFS(0x70,buf)"\n\t"
     "psraw $2,%%mm7\n\t"
+    "movq %%mm7,"OC_MEM_OFFS(0x10,buf)"\n\t"
+    "psraw $2,%%mm4\n\t"
+    "movq %%mm4,"OC_MEM_OFFS(0x30,buf)"\n\t"
+    /*Load the next block.*/
     "movq 0x40(%[y]),%%mm0\n\t"
-    "psraw $2,%%mm1\n\t"
-    "movq %%mm7,0x30(%[y])\n\t"
     "movq 0x78(%[y]),%%mm7\n\t"
-    "movq %%mm1,0x08(%[y])\n\t"
     "movq 0x50(%[y]),%%mm1\n\t"
-    "movq %%mm6,0x20(%[y])\n\t"
     "movq 0x68(%[y]),%%mm6\n\t"
-    "movq %%mm2,0x28(%[y])\n\t"
     "movq 0x60(%[y]),%%mm2\n\t"
-    "movq %%mm5,0x10(%[y])\n\t"
     "movq 0x58(%[y]),%%mm5\n\t"
-    "movq %%mm3,0x38(%[y])\n\t"
     "movq 0x70(%[y]),%%mm3\n\t"
-    "movq %%mm4,0x00(%[y])\n\t"
     "movq 0x48(%[y]),%%mm4\n\t"
     OC_FDCT_STAGE1_8x4
     OC_FDCT8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78")
-    OC_TRANSPOSE8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78")
-    /*mm0={-2}x4*/
-    "pcmpeqw %%mm0,%%mm0\n\t"
-    "paddw %%mm0,%%mm0\n\t"
-    /*Round the results.*/
-    "psubw %%mm0,%%mm1\n\t"
-    "psubw %%mm0,%%mm2\n\t"
-    "psraw $2,%%mm1\n\t"
-    "psubw %%mm0,%%mm3\n\t"
-    "movq %%mm1,0x58(%[y])\n\t"
-    "psraw $2,%%mm2\n\t"
-    "psubw %%mm0,%%mm4\n\t"
-    "movq 0x48(%[y]),%%mm1\n\t"
-    "psraw $2,%%mm3\n\t"
-    "psubw %%mm0,%%mm5\n\t"
-    "movq %%mm2,0x68(%[y])\n\t"
+    /*mm2={-2}x4*/
+    "pcmpeqw %%mm2,%%mm2\n\t"
+    "paddw %%mm2,%%mm2\n\t"
+    /*Round and store the results (no transpose).*/
+    "movq 0x50(%[y]),%%mm7\n\t"
+    "psubw %%mm2,%%mm4\n\t"
+    "psubw %%mm2,%%mm6\n\t"
     "psraw $2,%%mm4\n\t"
-    "psubw %%mm0,%%mm6\n\t"
-    "movq %%mm3,0x78(%[y])\n\t"
-    "psraw $2,%%mm5\n\t"
-    "psubw %%mm0,%%mm7\n\t"
-    "movq %%mm4,0x40(%[y])\n\t"
+    "psubw %%mm2,%%mm0\n\t"
+    "movq %%mm4,"OC_MEM_OFFS(0x08,buf)"\n\t"
+    "movq 0x70(%[y]),%%mm4\n\t"
     "psraw $2,%%mm6\n\t"
-    "psubw %%mm0,%%mm1\n\t"
-    "movq %%mm5,0x50(%[y])\n\t"
-    "psraw $2,%%mm7\n\t"
-    "movq %%mm6,0x60(%[y])\n\t"
+    "psubw %%mm2,%%mm5\n\t"
+    "movq %%mm6,"OC_MEM_OFFS(0x28,buf)"\n\t"
+    "psraw $2,%%mm0\n\t"
+    "psubw %%mm2,%%mm3\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x48,buf)"\n\t"
+    "psraw $2,%%mm5\n\t"
+    "psubw %%mm2,%%mm1\n\t"
+    "movq %%mm5,"OC_MEM_OFFS(0x58,buf)"\n\t"
+    "psraw $2,%%mm3\n\t"
+    "psubw %%mm2,%%mm7\n\t"
+    "movq %%mm3,"OC_MEM_OFFS(0x68,buf)"\n\t"
     "psraw $2,%%mm1\n\t"
-    "movq %%mm7,0x70(%[y])\n\t"
-    "movq %%mm1,0x48(%[y])\n\t"
-    :[a]"=&r"(a)
+    "psubw %%mm2,%%mm4\n\t"
+    "movq %%mm1,"OC_MEM_OFFS(0x78,buf)"\n\t"
+    "psraw $2,%%mm7\n\t"
+    "movq %%mm7,"OC_MEM_OFFS(0x18,buf)"\n\t"
+    "psraw $2,%%mm4\n\t"
+    "movq %%mm4,"OC_MEM_OFFS(0x38,buf)"\n\t"
+    /*Final transpose and zig-zag.*/
+#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
+    "movq "OC_MEM_OFFS(16*_row,buf)","_reg"\n\t" \
+
+#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
+    "movq "OC_MEM_OFFS(16*_row+8,buf)","_reg"\n\t" \
+
+    OC_TRANSPOSE_ZIG_ZAG_MMXEXT
+#undef OC_ZZ_LOAD_ROW_LO
+#undef OC_ZZ_LOAD_ROW_HI
+    :[a]"=&r"(a),[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
     :[y]"r"(_y),[x]"r"(_x)
     :"memory"
   );
diff --git a/thirdparty/libtheora/x86/mmxfrag.c b/thirdparty/libtheora/x86/mmxfrag.c
index 2c732939c3..b3ec508956 100644
--- a/thirdparty/libtheora/x86/mmxfrag.c
+++ b/thirdparty/libtheora/x86/mmxfrag.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: mmxfrag.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
@@ -22,17 +22,92 @@
   The iteration each instruction belongs to is marked in the comments as #i.*/
 #include <stddef.h>
 #include "x86int.h"
-#include "mmxfrag.h"
 
 #if defined(OC_X86_ASM)
 
 /*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
    between rows.*/
+# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
+  do{ \
+    const unsigned char *src; \
+    unsigned char       *dst; \
+    ptrdiff_t            ystride3; \
+    src=(_src); \
+    dst=(_dst); \
+    __asm__ __volatile__( \
+      /*src+0*ystride*/ \
+      "movq (%[src]),%%mm0\n\t" \
+      /*src+1*ystride*/ \
+      "movq (%[src],%[ystride]),%%mm1\n\t" \
+      /*ystride3=ystride*3*/ \
+      "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
+      /*src+2*ystride*/ \
+      "movq (%[src],%[ystride],2),%%mm2\n\t" \
+      /*src+3*ystride*/ \
+      "movq (%[src],%[ystride3]),%%mm3\n\t" \
+      /*dst+0*ystride*/ \
+      "movq %%mm0,(%[dst])\n\t" \
+      /*dst+1*ystride*/ \
+      "movq %%mm1,(%[dst],%[ystride])\n\t" \
+      /*Pointer to next 4.*/ \
+      "lea (%[src],%[ystride],4),%[src]\n\t" \
+      /*dst+2*ystride*/ \
+      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
+      /*dst+3*ystride*/ \
+      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
+      /*Pointer to next 4.*/ \
+      "lea (%[dst],%[ystride],4),%[dst]\n\t" \
+      /*src+0*ystride*/ \
+      "movq (%[src]),%%mm0\n\t" \
+      /*src+1*ystride*/ \
+      "movq (%[src],%[ystride]),%%mm1\n\t" \
+      /*src+2*ystride*/ \
+      "movq (%[src],%[ystride],2),%%mm2\n\t" \
+      /*src+3*ystride*/ \
+      "movq (%[src],%[ystride3]),%%mm3\n\t" \
+      /*dst+0*ystride*/ \
+      "movq %%mm0,(%[dst])\n\t" \
+      /*dst+1*ystride*/ \
+      "movq %%mm1,(%[dst],%[ystride])\n\t" \
+      /*dst+2*ystride*/ \
+      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
+      /*dst+3*ystride*/ \
+      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
+      :[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
+      :[ystride]"r"((ptrdiff_t)(_ystride)) \
+      :"memory" \
+    ); \
+  } \
+  while(0)
+
+/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
+   between rows.*/
 void oc_frag_copy_mmx(unsigned char *_dst,
  const unsigned char *_src,int _ystride){
   OC_FRAG_COPY_MMX(_dst,_src,_ystride);
 }
 
+/*Copies the fragments specified by the lists of fragment indices from one
+   frame to another.
+  _dst_frame:     The reference frame to copy to.
+  _src_frame:     The reference frame to copy from.
+  _ystride:       The row stride of the reference frames.
+  _fragis:        A pointer to a list of fragment indices.
+  _nfragis:       The number of fragment indices to copy.
+  _frag_buf_offs: The offsets of fragments in the reference frames.*/
+void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
+  ptrdiff_t fragii;
+  for(fragii=0;fragii<_nfragis;fragii++){
+    ptrdiff_t frag_buf_off;
+    frag_buf_off=_frag_buf_offs[_fragis[fragii]];
+    OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
+     _src_frame+frag_buf_off,_ystride);
+  }
+}
+
+
 void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
  const ogg_int16_t *_residue){
   __asm__ __volatile__(
@@ -280,7 +355,7 @@ void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
       /*Advance dest ptr.*/
       "lea (%[dst],%[ystride],2),%[dst]\n\t"
      :[dst]"+r"(_dst),[residue]"+r"(_residue),
-      [src1]"+%r"(_src1),[src2]"+r"(_src2)
+      [src1]"+r"(_src1),[src2]"+r"(_src2)
      :[ystride]"r"((ptrdiff_t)_ystride)
      :"memory"
     );
diff --git a/thirdparty/libtheora/x86/mmxfrag.h b/thirdparty/libtheora/x86/mmxfrag.h
deleted file mode 100644
index a398427629..0000000000
--- a/thirdparty/libtheora/x86/mmxfrag.h
+++ /dev/null
@@ -1,64 +0,0 @@
-#if !defined(_x86_mmxfrag_H)
-# define _x86_mmxfrag_H (1)
-# include <stddef.h>
-# include "x86int.h"
-
-#if defined(OC_X86_ASM)
-
-/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
-   between rows.*/
-#define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
-  do{ \
-    const unsigned char *src; \
-    unsigned char       *dst; \
-    ptrdiff_t            ystride3; \
-    src=(_src); \
-    dst=(_dst); \
-    __asm__ __volatile__( \
-      /*src+0*ystride*/ \
-      "movq (%[src]),%%mm0\n\t" \
-      /*src+1*ystride*/ \
-      "movq (%[src],%[ystride]),%%mm1\n\t" \
-      /*ystride3=ystride*3*/ \
-      "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
-      /*src+2*ystride*/ \
-      "movq (%[src],%[ystride],2),%%mm2\n\t" \
-      /*src+3*ystride*/ \
-      "movq (%[src],%[ystride3]),%%mm3\n\t" \
-      /*dst+0*ystride*/ \
-      "movq %%mm0,(%[dst])\n\t" \
-      /*dst+1*ystride*/ \
-      "movq %%mm1,(%[dst],%[ystride])\n\t" \
-      /*Pointer to next 4.*/ \
-      "lea (%[src],%[ystride],4),%[src]\n\t" \
-      /*dst+2*ystride*/ \
-      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
-      /*dst+3*ystride*/ \
-      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
-      /*Pointer to next 4.*/ \
-      "lea (%[dst],%[ystride],4),%[dst]\n\t" \
-      /*src+0*ystride*/ \
-      "movq (%[src]),%%mm0\n\t" \
-      /*src+1*ystride*/ \
-      "movq (%[src],%[ystride]),%%mm1\n\t" \
-      /*src+2*ystride*/ \
-      "movq (%[src],%[ystride],2),%%mm2\n\t" \
-      /*src+3*ystride*/ \
-      "movq (%[src],%[ystride3]),%%mm3\n\t" \
-      /*dst+0*ystride*/ \
-      "movq %%mm0,(%[dst])\n\t" \
-      /*dst+1*ystride*/ \
-      "movq %%mm1,(%[dst],%[ystride])\n\t" \
-      /*dst+2*ystride*/ \
-      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
-      /*dst+3*ystride*/ \
-      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
-      :[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
-      :[ystride]"r"((ptrdiff_t)(_ystride)) \
-      :"memory" \
-    ); \
-  } \
-  while(0)
-
-# endif
-#endif
diff --git a/thirdparty/libtheora/x86/mmxidct.c b/thirdparty/libtheora/x86/mmxidct.c
index 76424e6364..b8e3077066 100644
--- a/thirdparty/libtheora/x86/mmxidct.c
+++ b/thirdparty/libtheora/x86/mmxidct.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
@@ -30,89 +30,66 @@
 
 
 
-/*A table of constants used by the MMX routines.*/
-static const ogg_uint16_t __attribute__((aligned(8),used))
- OC_IDCT_CONSTS[(7+1)*4]={
-  (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
-  (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
-  (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
-  (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
-  (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
-  (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
-  (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
-  (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
-  (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
-  (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
-  (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
-  (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
-  (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
-  (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
-      8,    8,    8,    8
-};
-
-/*Converts the expression in the argument to a string.*/
-#define OC_M2STR(_s) #_s
-
 /*38 cycles*/
-#define OC_IDCT_BEGIN \
+#define OC_IDCT_BEGIN(_y,_x) \
   "#OC_IDCT_BEGIN\n\t" \
-  "movq "OC_I(3)",%%mm2\n\t" \
-  "movq "OC_C(3)",%%mm6\n\t" \
+  "movq "OC_I(3,_x)",%%mm2\n\t" \
+  "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
   "movq %%mm2,%%mm4\n\t" \
-  "movq "OC_J(5)",%%mm7\n\t" \
+  "movq "OC_J(5,_x)",%%mm7\n\t" \
   "pmulhw %%mm6,%%mm4\n\t" \
-  "movq "OC_C(5)",%%mm1\n\t" \
+  "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
   "pmulhw %%mm7,%%mm6\n\t" \
   "movq %%mm1,%%mm5\n\t" \
   "pmulhw %%mm2,%%mm1\n\t" \
-  "movq "OC_I(1)",%%mm3\n\t" \
+  "movq "OC_I(1,_x)",%%mm3\n\t" \
   "pmulhw %%mm7,%%mm5\n\t" \
-  "movq "OC_C(1)",%%mm0\n\t" \
+  "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
   "paddw %%mm2,%%mm4\n\t" \
   "paddw %%mm7,%%mm6\n\t" \
   "paddw %%mm1,%%mm2\n\t" \
-  "movq "OC_J(7)",%%mm1\n\t" \
+  "movq "OC_J(7,_x)",%%mm1\n\t" \
   "paddw %%mm5,%%mm7\n\t" \
   "movq %%mm0,%%mm5\n\t" \
   "pmulhw %%mm3,%%mm0\n\t" \
   "paddw %%mm7,%%mm4\n\t" \
   "pmulhw %%mm1,%%mm5\n\t" \
-  "movq "OC_C(7)",%%mm7\n\t" \
+  "movq "OC_MEM_OFFS(0x70,c)",%%mm7\n\t" \
   "psubw %%mm2,%%mm6\n\t" \
   "paddw %%mm3,%%mm0\n\t" \
   "pmulhw %%mm7,%%mm3\n\t" \
-  "movq "OC_I(2)",%%mm2\n\t" \
+  "movq "OC_I(2,_x)",%%mm2\n\t" \
   "pmulhw %%mm1,%%mm7\n\t" \
   "paddw %%mm1,%%mm5\n\t" \
   "movq %%mm2,%%mm1\n\t" \
-  "pmulhw "OC_C(2)",%%mm2\n\t" \
+  "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm2\n\t" \
   "psubw %%mm5,%%mm3\n\t" \
-  "movq "OC_J(6)",%%mm5\n\t" \
+  "movq "OC_J(6,_x)",%%mm5\n\t" \
   "paddw %%mm7,%%mm0\n\t" \
   "movq %%mm5,%%mm7\n\t" \
   "psubw %%mm4,%%mm0\n\t" \
-  "pmulhw "OC_C(2)",%%mm5\n\t" \
+  "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
   "paddw %%mm1,%%mm2\n\t" \
-  "pmulhw "OC_C(6)",%%mm1\n\t" \
+  "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
   "paddw %%mm4,%%mm4\n\t" \
   "paddw %%mm0,%%mm4\n\t" \
   "psubw %%mm6,%%mm3\n\t" \
   "paddw %%mm7,%%mm5\n\t" \
   "paddw %%mm6,%%mm6\n\t" \
-  "pmulhw "OC_C(6)",%%mm7\n\t" \
+  "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
   "paddw %%mm3,%%mm6\n\t" \
-  "movq %%mm4,"OC_I(1)"\n\t" \
+  "movq %%mm4,"OC_I(1,_y)"\n\t" \
   "psubw %%mm5,%%mm1\n\t" \
-  "movq "OC_C(4)",%%mm4\n\t" \
+  "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
   "movq %%mm3,%%mm5\n\t" \
   "pmulhw %%mm4,%%mm3\n\t" \
   "paddw %%mm2,%%mm7\n\t" \
-  "movq %%mm6,"OC_I(2)"\n\t" \
+  "movq %%mm6,"OC_I(2,_y)"\n\t" \
   "movq %%mm0,%%mm2\n\t" \
-  "movq "OC_I(0)",%%mm6\n\t" \
+  "movq "OC_I(0,_x)",%%mm6\n\t" \
   "pmulhw %%mm4,%%mm0\n\t" \
   "paddw %%mm3,%%mm5\n\t" \
-  "movq "OC_J(4)",%%mm3\n\t" \
+  "movq "OC_J(4,_x)",%%mm3\n\t" \
   "psubw %%mm1,%%mm5\n\t" \
   "paddw %%mm0,%%mm2\n\t" \
   "psubw %%mm3,%%mm6\n\t" \
@@ -126,18 +103,18 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
   "paddw %%mm0,%%mm6\n\t" \
   "psubw %%mm2,%%mm6\n\t" \
   "paddw %%mm2,%%mm2\n\t" \
-  "movq "OC_I(1)",%%mm0\n\t" \
+  "movq "OC_I(1,_y)",%%mm0\n\t" \
   "paddw %%mm6,%%mm2\n\t" \
   "paddw %%mm3,%%mm4\n\t" \
   "psubw %%mm1,%%mm2\n\t" \
   "#end OC_IDCT_BEGIN\n\t" \
 
 /*38+8=46 cycles.*/
-#define OC_ROW_IDCT \
+#define OC_ROW_IDCT(_y,_x) \
   "#OC_ROW_IDCT\n" \
-  OC_IDCT_BEGIN \
+  OC_IDCT_BEGIN(_y,_x) \
   /*r3=D'*/ \
-  "movq "OC_I(2)",%%mm3\n\t" \
+  "movq "OC_I(2,_y)",%%mm3\n\t" \
   /*r4=E'=E-G*/ \
   "psubw %%mm7,%%mm4\n\t" \
   /*r1=H'+H'*/ \
@@ -162,7 +139,7 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
   "psubw %%mm0,%%mm7\n\t" \
   "paddw %%mm0,%%mm0\n\t" \
   /*Save R1.*/ \
-  "movq %%mm1,"OC_I(1)"\n\t" \
+  "movq %%mm1,"OC_I(1,_y)"\n\t" \
   /*r0=R0=G.+C.*/ \
   "paddw %%mm7,%%mm0\n\t" \
   "#end OC_ROW_IDCT\n\t" \
@@ -195,11 +172,11 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
 
   Since r1 is free at entry, we calculate the Js first.*/
 /*19 cycles.*/
-#define OC_TRANSPOSE \
+#define OC_TRANSPOSE(_y) \
   "#OC_TRANSPOSE\n\t" \
   "movq %%mm4,%%mm1\n\t" \
   "punpcklwd %%mm5,%%mm4\n\t" \
-  "movq %%mm0,"OC_I(0)"\n\t" \
+  "movq %%mm0,"OC_I(0,_y)"\n\t" \
   "punpckhwd %%mm5,%%mm1\n\t" \
   "movq %%mm6,%%mm0\n\t" \
   "punpcklwd %%mm7,%%mm6\n\t" \
@@ -207,17 +184,17 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
   "punpckldq %%mm6,%%mm4\n\t" \
   "punpckhdq %%mm6,%%mm5\n\t" \
   "movq %%mm1,%%mm6\n\t" \
-  "movq %%mm4,"OC_J(4)"\n\t" \
+  "movq %%mm4,"OC_J(4,_y)"\n\t" \
   "punpckhwd %%mm7,%%mm0\n\t" \
-  "movq %%mm5,"OC_J(5)"\n\t" \
+  "movq %%mm5,"OC_J(5,_y)"\n\t" \
   "punpckhdq %%mm0,%%mm6\n\t" \
-  "movq "OC_I(0)",%%mm4\n\t" \
+  "movq "OC_I(0,_y)",%%mm4\n\t" \
   "punpckldq %%mm0,%%mm1\n\t" \
-  "movq "OC_I(1)",%%mm5\n\t" \
+  "movq "OC_I(1,_y)",%%mm5\n\t" \
   "movq %%mm4,%%mm0\n\t" \
-  "movq %%mm6,"OC_J(7)"\n\t" \
+  "movq %%mm6,"OC_J(7,_y)"\n\t" \
   "punpcklwd %%mm5,%%mm0\n\t" \
-  "movq %%mm1,"OC_J(6)"\n\t" \
+  "movq %%mm1,"OC_J(6,_y)"\n\t" \
   "punpckhwd %%mm5,%%mm4\n\t" \
   "movq %%mm2,%%mm5\n\t" \
   "punpcklwd %%mm3,%%mm2\n\t" \
@@ -225,20 +202,20 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
   "punpckldq %%mm2,%%mm0\n\t" \
   "punpckhdq %%mm2,%%mm1\n\t" \
   "movq %%mm4,%%mm2\n\t" \
-  "movq %%mm0,"OC_I(0)"\n\t" \
+  "movq %%mm0,"OC_I(0,_y)"\n\t" \
   "punpckhwd %%mm3,%%mm5\n\t" \
-  "movq %%mm1,"OC_I(1)"\n\t" \
+  "movq %%mm1,"OC_I(1,_y)"\n\t" \
   "punpckhdq %%mm5,%%mm4\n\t" \
   "punpckldq %%mm5,%%mm2\n\t" \
-  "movq %%mm4,"OC_I(3)"\n\t" \
-  "movq %%mm2,"OC_I(2)"\n\t" \
+  "movq %%mm4,"OC_I(3,_y)"\n\t" \
+  "movq %%mm2,"OC_I(2,_y)"\n\t" \
   "#end OC_TRANSPOSE\n\t" \
 
 /*38+19=57 cycles.*/
-#define OC_COLUMN_IDCT \
+#define OC_COLUMN_IDCT(_y) \
   "#OC_COLUMN_IDCT\n" \
-  OC_IDCT_BEGIN \
-  "paddw "OC_8",%%mm2\n\t" \
+  OC_IDCT_BEGIN(_y,_y) \
+  "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
   /*r1=H'+H'*/ \
   "paddw %%mm1,%%mm1\n\t" \
   /*r1=R1=A''+H'*/ \
@@ -250,18 +227,18 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
   /*r1=NR1*/ \
   "psraw $4,%%mm1\n\t" \
   /*r3=D'*/ \
-  "movq "OC_I(2)",%%mm3\n\t" \
+  "movq "OC_I(2,_y)",%%mm3\n\t" \
   /*r7=G+G*/ \
   "paddw %%mm7,%%mm7\n\t" \
   /*Store NR2 at I(2).*/ \
-  "movq %%mm2,"OC_I(2)"\n\t" \
+  "movq %%mm2,"OC_I(2,_y)"\n\t" \
   /*r7=G'=E+G*/ \
   "paddw %%mm4,%%mm7\n\t" \
   /*Store NR1 at I(1).*/ \
-  "movq %%mm1,"OC_I(1)"\n\t" \
+  "movq %%mm1,"OC_I(1,_y)"\n\t" \
   /*r4=R4=E'-D'*/ \
   "psubw %%mm3,%%mm4\n\t" \
-  "paddw "OC_8",%%mm4\n\t" \
+  "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
   /*r3=D'+D'*/ \
   "paddw %%mm3,%%mm3\n\t" \
   /*r3=R3=E'+D'*/ \
@@ -272,7 +249,7 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
   "psubw %%mm5,%%mm6\n\t" \
   /*r3=NR3*/ \
   "psraw $4,%%mm3\n\t" \
-  "paddw "OC_8",%%mm6\n\t" \
+  "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
   /*r5=B''+B''*/ \
   "paddw %%mm5,%%mm5\n\t" \
   /*r5=R5=F'+B''*/ \
@@ -280,14 +257,14 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
   /*r6=NR6*/ \
   "psraw $4,%%mm6\n\t" \
   /*Store NR4 at J(4).*/ \
-  "movq %%mm4,"OC_J(4)"\n\t" \
+  "movq %%mm4,"OC_J(4,_y)"\n\t" \
   /*r5=NR5*/ \
   "psraw $4,%%mm5\n\t" \
   /*Store NR3 at I(3).*/ \
-  "movq %%mm3,"OC_I(3)"\n\t" \
+  "movq %%mm3,"OC_I(3,_y)"\n\t" \
   /*r7=R7=G'-C'*/ \
   "psubw %%mm0,%%mm7\n\t" \
-  "paddw "OC_8",%%mm7\n\t" \
+  "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
   /*r0=C'+C'*/ \
   "paddw %%mm0,%%mm0\n\t" \
   /*r0=R0=G'+C'*/ \
@@ -295,113 +272,121 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
   /*r7=NR7*/ \
   "psraw $4,%%mm7\n\t" \
   /*Store NR6 at J(6).*/ \
-  "movq %%mm6,"OC_J(6)"\n\t" \
+  "movq %%mm6,"OC_J(6,_y)"\n\t" \
   /*r0=NR0*/ \
   "psraw $4,%%mm0\n\t" \
   /*Store NR5 at J(5).*/ \
-  "movq %%mm5,"OC_J(5)"\n\t" \
+  "movq %%mm5,"OC_J(5,_y)"\n\t" \
   /*Store NR7 at J(7).*/ \
-  "movq %%mm7,"OC_J(7)"\n\t" \
+  "movq %%mm7,"OC_J(7,_y)"\n\t" \
   /*Store NR0 at I(0).*/ \
-  "movq %%mm0,"OC_I(0)"\n\t" \
+  "movq %%mm0,"OC_I(0,_y)"\n\t" \
   "#end OC_COLUMN_IDCT\n\t" \
 
-#define OC_MID(_m,_i) OC_M2STR(_m+(_i)*8)"(%[c])"
-#define OC_C(_i)      OC_MID(OC_COSINE_OFFSET,_i-1)
-#define OC_8          OC_MID(OC_EIGHT_OFFSET,0)
-
-static void oc_idct8x8_slow(ogg_int16_t _y[64]){
+static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  int i;
   /*This routine accepts an 8x8 matrix, but in partially transposed form.
     Every 4x4 block is transposed.*/
   __asm__ __volatile__(
-#define OC_I(_k)      OC_M2STR((_k*16))"(%[y])"
-#define OC_J(_k)      OC_M2STR(((_k-4)*16)+8)"(%[y])"
-    OC_ROW_IDCT
-    OC_TRANSPOSE
+#define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16,_y)
+#define OC_J(_k,_y)   OC_MEM_OFFS(((_k)-4)*16+8,_y)
+    OC_ROW_IDCT(y,x)
+    OC_TRANSPOSE(y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k)      OC_M2STR((_k*16)+64)"(%[y])"
-#define OC_J(_k)      OC_M2STR(((_k-4)*16)+72)"(%[y])"
-    OC_ROW_IDCT
-    OC_TRANSPOSE
+#define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16+64,_y)
+#define OC_J(_k,_y)   OC_MEM_OFFS(((_k)-4)*16+72,_y)
+    OC_ROW_IDCT(y,x)
+    OC_TRANSPOSE(y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k)      OC_M2STR((_k*16))"(%[y])"
-#define OC_J(_k)      OC_I(_k)
-    OC_COLUMN_IDCT
+#define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16,_y)
+#define OC_J(_k,_y)   OC_I(_k,_y)
+    OC_COLUMN_IDCT(y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k)      OC_M2STR((_k*16)+8)"(%[y])"
-#define OC_J(_k)      OC_I(_k)
-    OC_COLUMN_IDCT
+#define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16+8,_y)
+#define OC_J(_k,_y)   OC_I(_k,_y)
+    OC_COLUMN_IDCT(y)
 #undef  OC_I
 #undef  OC_J
-    :
-    :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS)
+    :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
+    :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
+     [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
   );
+  __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
+  for(i=0;i<4;i++){
+    __asm__ __volatile__(
+      "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+      "movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t"
+      "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+      "movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t"
+      :[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16)
+    );
+  }
 }
 
 /*25 cycles.*/
-#define OC_IDCT_BEGIN_10 \
+#define OC_IDCT_BEGIN_10(_y,_x) \
  "#OC_IDCT_BEGIN_10\n\t" \
- "movq "OC_I(3)",%%mm2\n\t" \
+ "movq "OC_I(3,_x)",%%mm2\n\t" \
  "nop\n\t" \
- "movq "OC_C(3)",%%mm6\n\t" \
+ "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
  "movq %%mm2,%%mm4\n\t" \
- "movq "OC_C(5)",%%mm1\n\t" \
+ "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
  "pmulhw %%mm6,%%mm4\n\t" \
- "movq "OC_I(1)",%%mm3\n\t" \
+ "movq "OC_I(1,_x)",%%mm3\n\t" \
  "pmulhw %%mm2,%%mm1\n\t" \
- "movq "OC_C(1)",%%mm0\n\t" \
+ "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
  "paddw %%mm2,%%mm4\n\t" \
  "pxor %%mm6,%%mm6\n\t" \
  "paddw %%mm1,%%mm2\n\t" \
- "movq "OC_I(2)",%%mm5\n\t" \
+ "movq "OC_I(2,_x)",%%mm5\n\t" \
  "pmulhw %%mm3,%%mm0\n\t" \
  "movq %%mm5,%%mm1\n\t" \
  "paddw %%mm3,%%mm0\n\t" \
- "pmulhw "OC_C(7)",%%mm3\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
  "psubw %%mm2,%%mm6\n\t" \
- "pmulhw "OC_C(2)",%%mm5\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
  "psubw %%mm4,%%mm0\n\t" \
- "movq "OC_I(2)",%%mm7\n\t" \
+ "movq "OC_I(2,_x)",%%mm7\n\t" \
  "paddw %%mm4,%%mm4\n\t" \
  "paddw %%mm5,%%mm7\n\t" \
  "paddw %%mm0,%%mm4\n\t" \
- "pmulhw "OC_C(6)",%%mm1\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
  "psubw %%mm6,%%mm3\n\t" \
- "movq %%mm4,"OC_I(1)"\n\t" \
+ "movq %%mm4,"OC_I(1,_y)"\n\t" \
  "paddw %%mm6,%%mm6\n\t" \
- "movq "OC_C(4)",%%mm4\n\t" \
+ "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
  "paddw %%mm3,%%mm6\n\t" \
  "movq %%mm3,%%mm5\n\t" \
  "pmulhw %%mm4,%%mm3\n\t" \
- "movq %%mm6,"OC_I(2)"\n\t" \
+ "movq %%mm6,"OC_I(2,_y)"\n\t" \
  "movq %%mm0,%%mm2\n\t" \
- "movq "OC_I(0)",%%mm6\n\t" \
+ "movq "OC_I(0,_x)",%%mm6\n\t" \
  "pmulhw %%mm4,%%mm0\n\t" \
  "paddw %%mm3,%%mm5\n\t" \
  "paddw %%mm0,%%mm2\n\t" \
  "psubw %%mm1,%%mm5\n\t" \
  "pmulhw %%mm4,%%mm6\n\t" \
- "paddw "OC_I(0)",%%mm6\n\t" \
+ "paddw "OC_I(0,_x)",%%mm6\n\t" \
  "paddw %%mm1,%%mm1\n\t" \
  "movq %%mm6,%%mm4\n\t" \
  "paddw %%mm5,%%mm1\n\t" \
  "psubw %%mm2,%%mm6\n\t" \
  "paddw %%mm2,%%mm2\n\t" \
- "movq "OC_I(1)",%%mm0\n\t" \
+ "movq "OC_I(1,_y)",%%mm0\n\t" \
  "paddw %%mm6,%%mm2\n\t" \
  "psubw %%mm1,%%mm2\n\t" \
  "nop\n\t" \
  "#end OC_IDCT_BEGIN_10\n\t" \
 
 /*25+8=33 cycles.*/
-#define OC_ROW_IDCT_10 \
+#define OC_ROW_IDCT_10(_y,_x) \
  "#OC_ROW_IDCT_10\n\t" \
- OC_IDCT_BEGIN_10 \
+ OC_IDCT_BEGIN_10(_y,_x) \
  /*r3=D'*/ \
- "movq "OC_I(2)",%%mm3\n\t" \
+ "movq "OC_I(2,_y)",%%mm3\n\t" \
  /*r4=E'=E-G*/ \
  "psubw %%mm7,%%mm4\n\t" \
  /*r1=H'+H'*/ \
@@ -426,16 +411,16 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
  "psubw %%mm0,%%mm7\n\t" \
  "paddw %%mm0,%%mm0\n\t" \
  /*Save R1.*/ \
- "movq %%mm1,"OC_I(1)"\n\t" \
+ "movq %%mm1,"OC_I(1,_y)"\n\t" \
  /*r0=R0=G'+C'*/ \
  "paddw %%mm7,%%mm0\n\t" \
  "#end OC_ROW_IDCT_10\n\t" \
 
 /*25+19=44 cycles'*/
-#define OC_COLUMN_IDCT_10 \
+#define OC_COLUMN_IDCT_10(_y) \
  "#OC_COLUMN_IDCT_10\n\t" \
- OC_IDCT_BEGIN_10 \
- "paddw "OC_8",%%mm2\n\t" \
+ OC_IDCT_BEGIN_10(_y,_y) \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
  /*r1=H'+H'*/ \
  "paddw %%mm1,%%mm1\n\t" \
  /*r1=R1=A''+H'*/ \
@@ -447,18 +432,18 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
  /*r1=NR1*/ \
  "psraw $4,%%mm1\n\t" \
  /*r3=D'*/ \
- "movq "OC_I(2)",%%mm3\n\t" \
+ "movq "OC_I(2,_y)",%%mm3\n\t" \
  /*r7=G+G*/ \
  "paddw %%mm7,%%mm7\n\t" \
  /*Store NR2 at I(2).*/ \
- "movq %%mm2,"OC_I(2)"\n\t" \
+ "movq %%mm2,"OC_I(2,_y)"\n\t" \
  /*r7=G'=E+G*/ \
  "paddw %%mm4,%%mm7\n\t" \
  /*Store NR1 at I(1).*/ \
- "movq %%mm1,"OC_I(1)"\n\t" \
+ "movq %%mm1,"OC_I(1,_y)"\n\t" \
  /*r4=R4=E'-D'*/ \
  "psubw %%mm3,%%mm4\n\t" \
- "paddw "OC_8",%%mm4\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
  /*r3=D'+D'*/ \
  "paddw %%mm3,%%mm3\n\t" \
  /*r3=R3=E'+D'*/ \
@@ -469,7 +454,7 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
  "psubw %%mm5,%%mm6\n\t" \
  /*r3=NR3*/ \
  "psraw $4,%%mm3\n\t" \
- "paddw "OC_8",%%mm6\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
  /*r5=B''+B''*/ \
  "paddw %%mm5,%%mm5\n\t" \
  /*r5=R5=F'+B''*/ \
@@ -477,14 +462,14 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
  /*r6=NR6*/ \
  "psraw $4,%%mm6\n\t" \
  /*Store NR4 at J(4).*/ \
- "movq %%mm4,"OC_J(4)"\n\t" \
+ "movq %%mm4,"OC_J(4,_y)"\n\t" \
  /*r5=NR5*/ \
  "psraw $4,%%mm5\n\t" \
  /*Store NR3 at I(3).*/ \
- "movq %%mm3,"OC_I(3)"\n\t" \
+ "movq %%mm3,"OC_I(3,_y)"\n\t" \
  /*r7=R7=G'-C'*/ \
  "psubw %%mm0,%%mm7\n\t" \
- "paddw "OC_8",%%mm7\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
  /*r0=C'+C'*/ \
  "paddw %%mm0,%%mm0\n\t" \
  /*r0=R0=G'+C'*/ \
@@ -492,46 +477,55 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
  /*r7=NR7*/ \
  "psraw $4,%%mm7\n\t" \
  /*Store NR6 at J(6).*/ \
- "movq %%mm6,"OC_J(6)"\n\t" \
+ "movq %%mm6,"OC_J(6,_y)"\n\t" \
  /*r0=NR0*/ \
  "psraw $4,%%mm0\n\t" \
  /*Store NR5 at J(5).*/ \
- "movq %%mm5,"OC_J(5)"\n\t" \
+ "movq %%mm5,"OC_J(5,_y)"\n\t" \
  /*Store NR7 at J(7).*/ \
- "movq %%mm7,"OC_J(7)"\n\t" \
+ "movq %%mm7,"OC_J(7,_y)"\n\t" \
  /*Store NR0 at I(0).*/ \
- "movq %%mm0,"OC_I(0)"\n\t" \
+ "movq %%mm0,"OC_I(0,_y)"\n\t" \
  "#end OC_COLUMN_IDCT_10\n\t" \
 
-static void oc_idct8x8_10(ogg_int16_t _y[64]){
+static void oc_idct8x8_10_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
   __asm__ __volatile__(
-#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
-#define OC_J(_k) OC_M2STR(((_k-4)*16)+8)"(%[y])"
+#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
+#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y)
     /*Done with dequant, descramble, and partial transpose.
       Now do the iDCT itself.*/
-    OC_ROW_IDCT_10
-    OC_TRANSPOSE
+    OC_ROW_IDCT_10(y,x)
+    OC_TRANSPOSE(y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
-#define OC_J(_k) OC_I(_k)
-    OC_COLUMN_IDCT_10
+#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
+#define OC_J(_k,_y) OC_I(_k,_y)
+    OC_COLUMN_IDCT_10(y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k) OC_M2STR((_k*16)+8)"(%[y])"
-#define OC_J(_k) OC_I(_k)
-    OC_COLUMN_IDCT_10
+#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y)
+#define OC_J(_k,_y) OC_I(_k,_y)
+    OC_COLUMN_IDCT_10(y)
 #undef  OC_I
 #undef  OC_J
-    :
-    :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS)
+    :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
+    :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
+     [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
+  );
+  __asm__ __volatile__(
+    "pxor %%mm0,%%mm0\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+    :[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28)
   );
 }
 
 /*Performs an inverse 8x8 Type-II DCT transform.
   The input is assumed to be scaled by a factor of 4 relative to orthonormal
    version of the transform.*/
-void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
+void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
   /*_last_zzi is subtly different from an actual count of the number of
      coefficients we decoded for this block.
     It contains the value of zzi BEFORE the final token in the block was
@@ -557,8 +551,8 @@ void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
      gets.
     Needless to say we inherited this approach from VP3.*/
   /*Then perform the iDCT.*/
-  if(_last_zzi<10)oc_idct8x8_10(_y);
-  else oc_idct8x8_slow(_y);
+  if(_last_zzi<=10)oc_idct8x8_10_mmx(_y,_x);
+  else oc_idct8x8_slow_mmx(_y,_x);
 }
 
 #endif
diff --git a/thirdparty/libtheora/x86/mmxloop.h b/thirdparty/libtheora/x86/mmxloop.h
index 2e870c795d..1f6090b567 100644
--- a/thirdparty/libtheora/x86/mmxloop.h
+++ b/thirdparty/libtheora/x86/mmxloop.h
@@ -9,88 +9,191 @@
   On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
    mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3 are clobbered.*/
 #define OC_LOOP_FILTER8_MMX \
- "#OC_LOOP_FILTER8_MMX\n\t" \
- /*mm7=0*/ \
- "pxor %%mm7,%%mm7\n\t" \
- /*mm6:mm0={a0,...,a7}*/ \
- "movq %%mm0,%%mm6\n\t" \
- "punpcklbw %%mm7,%%mm0\n\t" \
- "punpckhbw %%mm7,%%mm6\n\t" \
- /*mm3:mm5={d0,...,d7}*/ \
- "movq %%mm3,%%mm5\n\t" \
- "punpcklbw %%mm7,%%mm3\n\t" \
- "punpckhbw %%mm7,%%mm5\n\t" \
- /*mm6:mm0={a0-d0,...,a7-d7}*/ \
- "psubw %%mm3,%%mm0\n\t" \
- "psubw %%mm5,%%mm6\n\t" \
- /*mm3:mm1={b0,...,b7}*/ \
- "movq %%mm1,%%mm3\n\t" \
- "punpcklbw %%mm7,%%mm1\n\t" \
- "movq %%mm2,%%mm4\n\t" \
- "punpckhbw %%mm7,%%mm3\n\t" \
- /*mm5:mm4={c0,...,c7}*/ \
- "movq %%mm2,%%mm5\n\t" \
- "punpcklbw %%mm7,%%mm4\n\t" \
- "punpckhbw %%mm7,%%mm5\n\t" \
- /*mm7={3}x4 \
-   mm5:mm4={c0-b0,...,c7-b7}*/ \
- "pcmpeqw %%mm7,%%mm7\n\t" \
- "psubw %%mm1,%%mm4\n\t" \
- "psrlw $14,%%mm7\n\t" \
- "psubw %%mm3,%%mm5\n\t" \
- /*Scale by 3.*/ \
- "pmullw %%mm7,%%mm4\n\t" \
- "pmullw %%mm7,%%mm5\n\t" \
- /*mm7={4}x4 \
-   mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \
- "psrlw $1,%%mm7\n\t" \
- "paddw %%mm0,%%mm4\n\t" \
- "psllw $2,%%mm7\n\t" \
- "movq (%[ll]),%%mm0\n\t" \
- "paddw %%mm6,%%mm5\n\t" \
- /*R_i has the range [-127,128], so we compute -R_i instead. \
-   mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
- "psubw %%mm7,%%mm4\n\t" \
- "psubw %%mm7,%%mm5\n\t" \
- "psraw $3,%%mm4\n\t" \
- "psraw $3,%%mm5\n\t" \
- "pcmpeqb %%mm7,%%mm7\n\t" \
- "packsswb %%mm5,%%mm4\n\t" \
- "pxor %%mm6,%%mm6\n\t" \
- "pxor %%mm7,%%mm4\n\t" \
- "packuswb %%mm3,%%mm1\n\t" \
- /*Now compute lflim of -mm4 cf. Section 7.10 of the sepc.*/ \
- /*There's no unsigned byte+signed byte with unsigned saturation op code, so \
-    we have to split things by sign (the other option is to work in 16 bits, \
-    but working in 8 bits gives much better parallelism). \
-   We compute abs(R_i), but save a mask of which terms were negative in mm6. \
-   Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
-   Finally, we split mm4 into positive and negative pieces using the mask in \
-    mm6, and add and subtract them as appropriate.*/ \
- /*mm4=abs(-R_i)*/ \
- /*mm7=255-2*L*/ \
- "pcmpgtb %%mm4,%%mm6\n\t" \
- "psubb %%mm0,%%mm7\n\t" \
- "pxor %%mm6,%%mm4\n\t" \
- "psubb %%mm0,%%mm7\n\t" \
- "psubb %%mm6,%%mm4\n\t" \
- /*mm7=255-max(2*L-abs(R_i),0)*/ \
- "paddusb %%mm4,%%mm7\n\t" \
- /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \
- "paddusb %%mm7,%%mm4\n\t" \
- "psubusb %%mm7,%%mm4\n\t" \
- /*Now split mm4 by the original sign of -R_i.*/ \
- "movq %%mm4,%%mm5\n\t" \
- "pand %%mm6,%%mm4\n\t" \
- "pandn %%mm5,%%mm6\n\t" \
- /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
- /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
- "paddusb %%mm4,%%mm1\n\t" \
- "psubusb %%mm4,%%mm2\n\t" \
- "psubusb %%mm6,%%mm1\n\t" \
- "paddusb %%mm6,%%mm2\n\t" \
+  "#OC_LOOP_FILTER8_MMX\n\t" \
+  /*mm7=0*/ \
+  "pxor %%mm7,%%mm7\n\t" \
+  /*mm6:mm0={a0,...,a7}*/ \
+  "movq %%mm0,%%mm6\n\t" \
+  "punpcklbw %%mm7,%%mm0\n\t" \
+  "punpckhbw %%mm7,%%mm6\n\t" \
+  /*mm3:mm5={d0,...,d7}*/ \
+  "movq %%mm3,%%mm5\n\t" \
+  "punpcklbw %%mm7,%%mm3\n\t" \
+  "punpckhbw %%mm7,%%mm5\n\t" \
+  /*mm6:mm0={a0-d0,...,a7-d7}*/ \
+  "psubw %%mm3,%%mm0\n\t" \
+  "psubw %%mm5,%%mm6\n\t" \
+  /*mm3:mm1={b0,...,b7}*/ \
+  "movq %%mm1,%%mm3\n\t" \
+  "punpcklbw %%mm7,%%mm1\n\t" \
+  "movq %%mm2,%%mm4\n\t" \
+  "punpckhbw %%mm7,%%mm3\n\t" \
+  /*mm5:mm4={c0,...,c7}*/ \
+  "movq %%mm2,%%mm5\n\t" \
+  "punpcklbw %%mm7,%%mm4\n\t" \
+  "punpckhbw %%mm7,%%mm5\n\t" \
+  /*mm7={3}x4 \
+    mm5:mm4={c0-b0,...,c7-b7}*/ \
+  "pcmpeqw %%mm7,%%mm7\n\t" \
+  "psubw %%mm1,%%mm4\n\t" \
+  "psrlw $14,%%mm7\n\t" \
+  "psubw %%mm3,%%mm5\n\t" \
+  /*Scale by 3.*/ \
+  "pmullw %%mm7,%%mm4\n\t" \
+  "pmullw %%mm7,%%mm5\n\t" \
+  /*mm7={4}x4 \
+    mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \
+  "psrlw $1,%%mm7\n\t" \
+  "paddw %%mm0,%%mm4\n\t" \
+  "psllw $2,%%mm7\n\t" \
+  "movq (%[ll]),%%mm0\n\t" \
+  "paddw %%mm6,%%mm5\n\t" \
+  /*R_i has the range [-127,128], so we compute -R_i instead. \
+    mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
+  "psubw %%mm7,%%mm4\n\t" \
+  "psubw %%mm7,%%mm5\n\t" \
+  "psraw $3,%%mm4\n\t" \
+  "psraw $3,%%mm5\n\t" \
+  "pcmpeqb %%mm7,%%mm7\n\t" \
+  "packsswb %%mm5,%%mm4\n\t" \
+  "pxor %%mm6,%%mm6\n\t" \
+  "pxor %%mm7,%%mm4\n\t" \
+  "packuswb %%mm3,%%mm1\n\t" \
+  /*Now compute lflim of -mm4 cf. Section 7.10 of the spec.*/ \
+  /*There's no unsigned byte+signed byte with unsigned saturation op code, so \
+     we have to split things by sign (the other option is to work in 16 bits, \
+     but working in 8 bits gives much better parallelism). \
+    We compute abs(R_i), but save a mask of which terms were negative in mm6. \
+    Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
+    Finally, we split mm4 into positive and negative pieces using the mask in \
+     mm6, and add and subtract them as appropriate.*/ \
+  /*mm4=abs(-R_i)*/ \
+  /*mm7=255-2*L*/ \
+  "pcmpgtb %%mm4,%%mm6\n\t" \
+  "psubb %%mm0,%%mm7\n\t" \
+  "pxor %%mm6,%%mm4\n\t" \
+  "psubb %%mm0,%%mm7\n\t" \
+  "psubb %%mm6,%%mm4\n\t" \
+  /*mm7=255-max(2*L-abs(R_i),0)*/ \
+  "paddusb %%mm4,%%mm7\n\t" \
+  /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \
+  "paddusb %%mm7,%%mm4\n\t" \
+  "psubusb %%mm7,%%mm4\n\t" \
+  /*Now split mm4 by the original sign of -R_i.*/ \
+  "movq %%mm4,%%mm5\n\t" \
+  "pand %%mm6,%%mm4\n\t" \
+  "pandn %%mm5,%%mm6\n\t" \
+  /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
+  /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
+  "paddusb %%mm4,%%mm1\n\t" \
+  "psubusb %%mm4,%%mm2\n\t" \
+  "psubusb %%mm6,%%mm1\n\t" \
+  "paddusb %%mm6,%%mm2\n\t" \
 
-#define OC_LOOP_FILTER_V_MMX(_pix,_ystride,_ll) \
+/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...,d7}.
+  On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
+   mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}.
+  All other MMX registers are clobbered.*/
+#define OC_LOOP_FILTER8_MMXEXT \
+  "#OC_LOOP_FILTER8_MMXEXT\n\t" \
+  /*R_i=(a_i-3*b_i+3*c_i-d_i+4>>3) has the range [-127,128], so we compute \
+     -R_i=(-a_i+3*b_i-3*c_i+d_i+3>>3) instead.*/ \
+  /*This first part is based on the transformation \
+      f = -(3*(c-b)+a-d+4>>3) \
+        = -(3*(c+255-b)+(a+255-d)+4-1020>>3) \
+        = -(3*(c+~b)+(a+~d)-1016>>3) \
+        = 127-(3*(c+~b)+(a+~d)>>3) \
+        = 128+~(3*(c+~b)+(a+~d)>>3) (mod 256). \
+    Although pavgb(a,b) = (a+b+1>>1) (biased up), we rely heavily on the \
+     fact that ~pavgb(~a,~b) = (a+b>>1) (biased down). \
+    Using this, the last expression above can be computed in 8 bits of working \
+     precision via: \
+      u = ~pavgb(~b,c); \
+      v = pavgb(b,~c); \
+      This mask is 0 or 0xFF, and controls whether t is biased up or down: \
+      m = u-v; \
+      t = m^pavgb(m^~a,m^d); \
+      f = 128+pavgb(pavgb(t,u),v); \
+    This required some careful analysis to ensure that carries are propagated \
+     correctly in all cases, but has been checked exhaustively.*/ \
+  /*input (a, b, c, d, ., ., ., .)*/ \
+  /*ff=0xFF; \
+    u=b; \
+    v=c; \
+    ll=255-2*L;*/ \
+  "pcmpeqb %%mm7,%%mm7\n\t" \
+  "movq %%mm1,%%mm4\n\t" \
+  "movq %%mm2,%%mm5\n\t" \
+  "movq (%[ll]),%%mm6\n\t" \
+  /*allocated u, v, ll, ff: (a, b, c, d, u, v, ll, ff)*/ \
+  /*u^=ff; \
+    v^=ff;*/ \
+  "pxor %%mm7,%%mm4\n\t" \
+  "pxor %%mm7,%%mm5\n\t" \
+  /*allocated ll: (a, b, c, d, u, v, ll, ff)*/ \
+  /*u=pavgb(u,c); \
+    v=pavgb(v,b);*/ \
+  "pavgb %%mm2,%%mm4\n\t" \
+  "pavgb %%mm1,%%mm5\n\t" \
+  /*u^=ff; \
+    a^=ff;*/ \
+  "pxor %%mm7,%%mm4\n\t" \
+  "pxor %%mm7,%%mm0\n\t" \
+  /*m=u-v;*/ \
+  "psubb %%mm5,%%mm4\n\t" \
+  /*freed u, allocated m: (a, b, c, d, m, v, ll, ff)*/ \
+  /*a^=m; \
+    d^=m;*/ \
+  "pxor %%mm4,%%mm0\n\t" \
+  "pxor %%mm4,%%mm3\n\t" \
+  /*t=pavgb(a,d);*/ \
+  "pavgb %%mm3,%%mm0\n\t" \
+  "psllw $7,%%mm7\n\t" \
+  /*freed a, d, ff, allocated t, of: (t, b, c, ., m, v, ll, of)*/ \
+  /*t^=m; \
+    u=m+v;*/ \
+  "pxor %%mm4,%%mm0\n\t" \
+  "paddb %%mm5,%%mm4\n\t" \
+  /*freed t, m, allocated f, u: (f, b, c, ., u, v, ll, of)*/ \
+  /*f=pavgb(f,u); \
+    of=128;*/ \
+  "pavgb %%mm4,%%mm0\n\t" \
+  "packsswb %%mm7,%%mm7\n\t" \
+  /*freed u, ff, allocated ll: (f, b, c, ., ll, v, ll, of)*/ \
+  /*f=pavgb(f,v);*/ \
+  "pavgb %%mm5,%%mm0\n\t" \
+  "movq %%mm7,%%mm3\n\t" \
+  "movq %%mm6,%%mm4\n\t" \
+  /*freed v, allocated of: (f, b, c, of, ll, ., ll, of)*/ \
+  /*Now compute lflim of R_i=-(128+mm0) cf. Section 7.10 of the spec.*/ \
+  /*There's no unsigned byte+signed byte with unsigned saturation op code, so \
+     we have to split things by sign (the other option is to work in 16 bits, \
+     but staying in 8 bits gives much better parallelism).*/ \
+  /*Instead of adding the offset of 128 in mm3, we use it to split mm0. \
+    This is the same number of instructions as computing a mask and splitting \
+     after the lflim computation, but has shorter dependency chains.*/ \
+  /*mm0=R_i<0?-R_i:0 (denoted abs(R_i<0)) \
+    mm3=R_i>0?R_i:0 (denoted abs(R_i>0))*/ \
+  "psubusb %%mm0,%%mm3\n\t" \
+  "psubusb %%mm7,%%mm0\n\t" \
+  /*mm6=255-max(2*L-abs(R_i<0),0) \
+    mm4=255-max(2*L-abs(R_i>0),0)*/ \
+  "paddusb %%mm3,%%mm4\n\t" \
+  "paddusb %%mm0,%%mm6\n\t" \
+  /*mm0=min(abs(R_i<0),max(2*L-abs(R_i<0),0)) \
+    mm3=min(abs(R_i>0),max(2*L-abs(R_i>0),0))*/ \
+  "paddusb %%mm4,%%mm3\n\t" \
+  "paddusb %%mm6,%%mm0\n\t" \
+  "psubusb %%mm4,%%mm3\n\t" \
+  "psubusb %%mm6,%%mm0\n\t" \
+  /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
+  /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
+  "paddusb %%mm3,%%mm1\n\t" \
+  "psubusb %%mm3,%%mm2\n\t" \
+  "psubusb %%mm0,%%mm1\n\t" \
+  "paddusb %%mm0,%%mm2\n\t" \
+
+#define OC_LOOP_FILTER_V(_filter,_pix,_ystride,_ll) \
   do{ \
     ptrdiff_t ystride3__; \
     __asm__ __volatile__( \
@@ -104,7 +207,7 @@
       "movq (%[pix],%[ystride]),%%mm1\n\t" \
       /*mm2={c0,...,c7}*/ \
       "movq (%[pix],%[ystride],2),%%mm2\n\t" \
-      OC_LOOP_FILTER8_MMX \
+      _filter \
       /*Write it back out.*/ \
       "movq %%mm1,(%[pix],%[ystride])\n\t" \
       "movq %%mm2,(%[pix],%[ystride],2)\n\t" \
@@ -116,7 +219,7 @@
   } \
   while(0)
 
-#define OC_LOOP_FILTER_H_MMX(_pix,_ystride,_ll) \
+#define OC_LOOP_FILTER_H(_filter,_pix,_ystride,_ll) \
   do{ \
     unsigned char *pix__; \
     ptrdiff_t      ystride3__; \
@@ -174,7 +277,7 @@
       "punpckldq %%mm5,%%mm2\n\t" \
       /*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \
       "punpckhdq %%mm5,%%mm3\n\t" \
-      OC_LOOP_FILTER8_MMX \
+      _filter \
       /*mm2={b0+R_0'',...,b7+R_7''}*/ \
       "movq %%mm1,%%mm0\n\t" \
       /*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \
diff --git a/thirdparty/libtheora/x86/mmxstate.c b/thirdparty/libtheora/x86/mmxstate.c
index 808b0a789b..eebea14fba 100644
--- a/thirdparty/libtheora/x86/mmxstate.c
+++ b/thirdparty/libtheora/x86/mmxstate.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: mmxstate.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
@@ -19,23 +19,23 @@
   Originally written by Rudolf Marek.*/
 #include <string.h>
 #include "x86int.h"
-#include "mmxfrag.h"
 #include "mmxloop.h"
 
 #if defined(OC_X86_ASM)
 
 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
   unsigned char *dst;
   ptrdiff_t      frag_buf_off;
   int            ystride;
-  int            mb_mode;
+  int            refi;
   /*Apply the inverse transform.*/
   /*Special case only having a DC component.*/
   if(_last_zzi<2){
     /*Note that this value must be unsigned, to keep the __asm__ block from
        sign-extending it when it puts it in a register.*/
     ogg_uint16_t p;
+    int          i;
     /*We round this dequant product (and not any of the others) because there's
        no iDCT rounding.*/
     p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
@@ -47,81 +47,48 @@ void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
       "punpcklwd %%mm0,%%mm0\n\t"
       /*mm0=AAAA AAAA AAAA AAAA*/
       "punpckldq %%mm0,%%mm0\n\t"
-      "movq %%mm0,(%[y])\n\t"
-      "movq %%mm0,8(%[y])\n\t"
-      "movq %%mm0,16(%[y])\n\t"
-      "movq %%mm0,24(%[y])\n\t"
-      "movq %%mm0,32(%[y])\n\t"
-      "movq %%mm0,40(%[y])\n\t"
-      "movq %%mm0,48(%[y])\n\t"
-      "movq %%mm0,56(%[y])\n\t"
-      "movq %%mm0,64(%[y])\n\t"
-      "movq %%mm0,72(%[y])\n\t"
-      "movq %%mm0,80(%[y])\n\t"
-      "movq %%mm0,88(%[y])\n\t"
-      "movq %%mm0,96(%[y])\n\t"
-      "movq %%mm0,104(%[y])\n\t"
-      "movq %%mm0,112(%[y])\n\t"
-      "movq %%mm0,120(%[y])\n\t"
       :
-      :[y]"r"(_dct_coeffs),[p]"r"((unsigned)p)
-      :"memory"
+      :[p]"r"((unsigned)p)
     );
+    for(i=0;i<4;i++){
+      __asm__ __volatile__(
+        "movq %%mm0,"OC_MEM_OFFS(0x00,y)"\n\t"
+        "movq %%mm0,"OC_MEM_OFFS(0x08,y)"\n\t"
+        "movq %%mm0,"OC_MEM_OFFS(0x10,y)"\n\t"
+        "movq %%mm0,"OC_MEM_OFFS(0x18,y)"\n\t"
+        :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_dct_coeffs+64+16*i,16)
+      );
+    }
   }
   else{
     /*Dequantize the DC coefficient.*/
     _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
-    oc_idct8x8_mmx(_dct_coeffs,_last_zzi);
+    oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi);
   }
   /*Fill in the target buffer.*/
   frag_buf_off=_state->frag_buf_offs[_fragi];
-  mb_mode=_state->frags[_fragi].mb_mode;
+  refi=_state->frags[_fragi].refi;
   ystride=_state->ref_ystride[_pli];
-  dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
-  if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs);
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
   else{
     const unsigned char *ref;
     int                  mvoffsets[2];
-    ref=
-     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
-     +frag_buf_off;
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
-     _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
+     _state->frag_mvs[_fragi])>1){
       oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
-       _dct_coeffs);
+       _dct_coeffs+64);
     }
-    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs);
+    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
   }
 }
 
 /*We copy these entire function to inline the actual MMX routines so that we
    use only a single indirect call.*/
 
-/*Copies the fragments specified by the lists of fragment indices from one
-   frame to another.
-  _fragis:    A pointer to a list of fragment indices.
-  _nfragis:   The number of fragment indices to copy.
-  _dst_frame: The reference frame to copy to.
-  _src_frame: The reference frame to copy from.
-  _pli:       The color plane the fragments lie in.*/
-void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli){
-  const ptrdiff_t     *frag_buf_offs;
-  const unsigned char *src_frame_data;
-  unsigned char       *dst_frame_data;
-  ptrdiff_t            fragii;
-  int                  ystride;
-  dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
-  src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
-  ystride=_state->ref_ystride[_pli];
-  frag_buf_offs=_state->frag_buf_offs;
-  for(fragii=0;fragii<_nfragis;fragii++){
-    ptrdiff_t frag_buf_off;
-    frag_buf_off=frag_buf_offs[_fragis[fragii]];
-    OC_FRAG_COPY_MMX(dst_frame_data+frag_buf_off,
-     src_frame_data+frag_buf_off,ystride);
-  }
+void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
+  memset(_bv,_flimit,8);
 }
 
 /*Apply the loop filter to a given set of fragment rows in the given plane.
@@ -133,7 +100,7 @@ void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
   _fragy0:    The Y coordinate of the first fragment row to filter.
   _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
 void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
   OC_ALIGN8(unsigned char   ll[8]);
   const oc_fragment_plane *fplane;
   const oc_fragment       *frags;
@@ -170,13 +137,84 @@ void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
       if(frags[fragi].coded){
         unsigned char *ref;
         ref=ref_frame_data+frag_buf_offs[fragi];
-        if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,ll);
-        if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,ll);
+        if(fragi>fragi0){
+          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
+        }
+        if(fragi0>fragi_top){
+          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
+        }
+        if(fragi+1<fragi_end&&!frags[fragi+1].coded){
+          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref+8,ystride,ll);
+        }
+        if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
+          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref+(ystride<<3),ystride,ll);
+        }
+      }
+      fragi++;
+    }
+    fragi0+=nhfrags;
+  }
+}
+
+void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit){
+  memset(_bv,~(_flimit<<1),8);
+}
+
+/*Apply the loop filter to a given set of fragment rows in the given plane.
+  The filter may be run on the bottom edge, affecting pixels in the next row of
+   fragments, so this row also needs to be available.
+  _bv:        The bounding values array.
+  _refi:      The index of the frame buffer to filter.
+  _pli:       The color plane to filter.
+  _fragy0:    The Y coordinate of the first fragment row to filter.
+  _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
+void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
+  const oc_fragment_plane *fplane;
+  const oc_fragment       *frags;
+  const ptrdiff_t         *frag_buf_offs;
+  unsigned char           *ref_frame_data;
+  ptrdiff_t                fragi_top;
+  ptrdiff_t                fragi_bot;
+  ptrdiff_t                fragi0;
+  ptrdiff_t                fragi0_end;
+  int                      ystride;
+  int                      nhfrags;
+  fplane=_state->fplanes+_pli;
+  nhfrags=fplane->nhfrags;
+  fragi_top=fplane->froffset;
+  fragi_bot=fragi_top+fplane->nfrags;
+  fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
+  fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
+  ystride=_state->ref_ystride[_pli];
+  frags=_state->frags;
+  frag_buf_offs=_state->frag_buf_offs;
+  ref_frame_data=_state->ref_frame_data[_refi];
+  /*The following loops are constructed somewhat non-intuitively on purpose.
+    The main idea is: if a block boundary has at least one coded fragment on
+     it, the filter is applied to it.
+    However, the order that the filters are applied in matters, and VP3 chose
+     the somewhat strange ordering used below.*/
+  while(fragi0<fragi0_end){
+    ptrdiff_t fragi;
+    ptrdiff_t fragi_end;
+    fragi=fragi0;
+    fragi_end=fragi+nhfrags;
+    while(fragi<fragi_end){
+      if(frags[fragi].coded){
+        unsigned char *ref;
+        ref=ref_frame_data+frag_buf_offs[fragi];
+        if(fragi>fragi0){
+          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
+        }
+        if(fragi0>fragi_top){
+          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
+        }
         if(fragi+1<fragi_end&&!frags[fragi+1].coded){
-          OC_LOOP_FILTER_H_MMX(ref+8,ystride,ll);
+          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref+8,ystride,_bv);
         }
         if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
-          OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,ll);
+          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref+(ystride<<3),ystride,_bv);
         }
       }
       fragi++;
diff --git a/thirdparty/libtheora/x86/sse2encfrag.c b/thirdparty/libtheora/x86/sse2encfrag.c
new file mode 100644
index 0000000000..43aeb17711
--- /dev/null
+++ b/thirdparty/libtheora/x86/sse2encfrag.c
@@ -0,0 +1,501 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $
+
+ ********************************************************************/
+#include <stddef.h>
+#include "x86enc.h"
+#include "sse2trans.h"
+
+#if defined(OC_X86_ASM)
+
+/*Load a 4x8 array of pixels values from %[src] and %[ref] and compute their
+   16-bit differences.
+  On output, these are stored in _m0, xmm1, xmm2, and xmm3.
+  xmm4 and xmm5 are clobbered.*/
+#define OC_LOAD_SUB_4x8(_m0) \
+ "#OC_LOAD_SUB_4x8\n\t" \
+ /*Load the first three rows.*/ \
+ "movq (%[src]),"_m0"\n\t" \
+ "movq (%[ref]),%%xmm4\n\t" \
+ "movq (%[src],%[ystride]),%%xmm1\n\t" \
+ "movq (%[ref],%[ystride]),%%xmm3\n\t" \
+ "movq (%[src],%[ystride],2),%%xmm2\n\t" \
+ "movq (%[ref],%[ystride],2),%%xmm5\n\t" \
+ /*Unpack and subtract.*/ \
+ "punpcklbw %%xmm4,"_m0"\n\t" \
+ "punpcklbw %%xmm4,%%xmm4\n\t" \
+ "punpcklbw %%xmm3,%%xmm1\n\t" \
+ "punpcklbw %%xmm3,%%xmm3\n\t" \
+ "psubw %%xmm4,"_m0"\n\t" \
+ "psubw %%xmm3,%%xmm1\n\t" \
+ /*Load the last row.*/ \
+ "movq (%[src],%[ystride3]),%%xmm3\n\t" \
+ "movq (%[ref],%[ystride3]),%%xmm4\n\t" \
+ /*Unpack, subtract, and advance the pointers.*/ \
+ "punpcklbw %%xmm5,%%xmm2\n\t" \
+ "punpcklbw %%xmm5,%%xmm5\n\t" \
+ "lea (%[src],%[ystride],4),%[src]\n\t" \
+ "psubw %%xmm5,%%xmm2\n\t" \
+ "punpcklbw %%xmm4,%%xmm3\n\t" \
+ "punpcklbw %%xmm4,%%xmm4\n\t" \
+ "lea (%[ref],%[ystride],4),%[ref]\n\t" \
+ "psubw %%xmm4,%%xmm3\n\t" \
+
+/*Square and accumulate four rows of differences in _m0, xmm1, xmm2, and xmm3.
+  On output, _m0 contains the sum of two of the rows, and the other two are
+   added to xmm7.*/
+#define OC_SSD_4x8(_m0) \
+ "pmaddwd "_m0","_m0"\n\t" \
+ "pmaddwd %%xmm1,%%xmm1\n\t" \
+ "pmaddwd %%xmm2,%%xmm2\n\t" \
+ "pmaddwd %%xmm3,%%xmm3\n\t" \
+ "paddd %%xmm1,"_m0"\n\t" \
+ "paddd %%xmm3,%%xmm2\n\t" \
+ "paddd %%xmm2,%%xmm7\n\t" \
+
+/*Returns the sum of squared differences between the 8x8 pixel blocks at _src
+   and _ref, whose rows are both _ystride bytes apart.*/
+unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  unsigned ret;
+  __asm__ __volatile__(
+    OC_LOAD_SUB_4x8("%%xmm7") OC_SSD_4x8("%%xmm7")
+    OC_LOAD_SUB_4x8("%%xmm0") OC_SSD_4x8("%%xmm0")
+    "paddd %%xmm0,%%xmm7\n\t"
+    "movdqa %%xmm7,%%xmm6\n\t"
+    "punpckhqdq %%xmm7,%%xmm7\n\t"
+    "paddd %%xmm6,%%xmm7\n\t"
+    "pshufd $1,%%xmm7,%%xmm6\n\t"
+    "paddd %%xmm6,%%xmm7\n\t"
+    "movd %%xmm7,%[ret]\n\t"
+    /*OC_LOAD_SUB_4x8 advances %[src] and %[ref], so they must be read-write.*/
+    :[ret]"=a"(ret),[src]"+r"(_src),[ref]"+r"(_ref)
+    :[ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)_ystride*3)
+  );
+  return ret;
+}
+
+static const unsigned char __attribute__((aligned(16))) OC_MASK_CONSTS[8]={
+  0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80
+};
+
+/*Load a 2x8 array of pixels values from %[src] and %[ref] and compute their
+   horizontal sums as well as their 16-bit differences subject to a mask.
+  %%xmm6 must contain OC_MASK_CONSTS[0...7] (%%xmm5 is not used).*/
+#define OC_LOAD_SUB_MASK_2x8 \
+ "#OC_LOAD_SUB_MASK_2x8\n\t" \
+ /*Start the loads and expand the next 8 bits of the mask.*/ \
+ "shl $8,%[m]\n\t" \
+ "movq (%[src]),%%xmm0\n\t" \
+ "mov %h[m],%b[m]\n\t" \
+ "movq (%[ref]),%%xmm2\n\t" \
+ "movd %[m],%%xmm4\n\t" \
+ "shr $8,%[m]\n\t" \
+ "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
+ "mov %h[m],%b[m]\n\t" \
+ "pand %%xmm6,%%xmm4\n\t" \
+ "pcmpeqb %%xmm6,%%xmm4\n\t" \
+ /*Perform the masking.*/ \
+ "pand %%xmm4,%%xmm0\n\t" \
+ "pand %%xmm4,%%xmm2\n\t" \
+ /*Finish the loads while unpacking the first set of rows, and expand the next
+    8 bits of the mask.*/ \
+ "movd %[m],%%xmm4\n\t" \
+ "movq (%[src],%[ystride]),%%xmm1\n\t" \
+ "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
+ "movq (%[ref],%[ystride]),%%xmm3\n\t" \
+ "pand %%xmm6,%%xmm4\n\t" \
+ "punpcklbw %%xmm2,%%xmm0\n\t" \
+ "pcmpeqb %%xmm6,%%xmm4\n\t" \
+ "punpcklbw %%xmm2,%%xmm2\n\t" \
+ /*Mask and unpack the second set of rows.*/ \
+ "pand %%xmm4,%%xmm1\n\t" \
+ "pand %%xmm4,%%xmm3\n\t" \
+ "punpcklbw %%xmm3,%%xmm1\n\t" \
+ "punpcklbw %%xmm3,%%xmm3\n\t" \
+ "psubw %%xmm2,%%xmm0\n\t" \
+ "psubw %%xmm3,%%xmm1\n\t" \
+
+unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,ogg_int64_t _mask){
+  ptrdiff_t ystride;
+  unsigned  ret;
+  int       i;
+  ystride=_ystride;
+  __asm__ __volatile__(
+    "pxor %%xmm7,%%xmm7\n\t"
+    "movq %[c],%%xmm6\n\t"
+    :
+    :[c]"m"(OC_CONST_ARRAY_OPERAND(unsigned char,OC_MASK_CONSTS,8))
+  );
+  for(i=0;i<4;i++){
+    unsigned m;
+    m=_mask&0xFFFF;
+    _mask>>=16;
+    if(m){
+      __asm__ __volatile__(
+        OC_LOAD_SUB_MASK_2x8
+        "pmaddwd %%xmm0,%%xmm0\n\t"
+        "pmaddwd %%xmm1,%%xmm1\n\t"
+        "paddd %%xmm0,%%xmm7\n\t"
+        "paddd %%xmm1,%%xmm7\n\t"
+        :[src]"+r"(_src),[ref]"+r"(_ref),[ystride]"+r"(ystride),[m]"+Q"(m)
+      );
+    }
+    _src+=2*ystride;
+    _ref+=2*ystride;
+  }
+  __asm__ __volatile__(
+    "movdqa %%xmm7,%%xmm6\n\t"
+    "punpckhqdq %%xmm7,%%xmm7\n\t"
+    "paddd %%xmm6,%%xmm7\n\t"
+    "pshufd $1,%%xmm7,%%xmm6\n\t"
+    "paddd %%xmm6,%%xmm7\n\t"
+    "movd %%xmm7,%[ret]\n\t"
+    :[ret]"=a"(ret)
+  );
+  return ret;
+}
+
+
+/*Load an 8x8 array of pixel values from %[src] and %[ref] and compute their
+   16-bit difference in %%xmm0...%%xmm7.*/
+#define OC_LOAD_SUB_8x8 \
+ "#OC_LOAD_SUB_8x8\n\t" \
+ "movq (%[src]),%%xmm0\n\t" \
+ "movq (%[ref]),%%xmm4\n\t" \
+ "movq (%[src],%[src_ystride]),%%xmm1\n\t" \
+ "lea (%[src],%[src_ystride],2),%[src]\n\t" \
+ "movq (%[ref],%[ref_ystride]),%%xmm5\n\t" \
+ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
+ "movq (%[src]),%%xmm2\n\t" \
+ "movq (%[ref]),%%xmm7\n\t" \
+ "movq (%[src],%[src_ystride]),%%xmm3\n\t" \
+ "movq (%[ref],%[ref_ystride]),%%xmm6\n\t" \
+ "punpcklbw %%xmm4,%%xmm0\n\t" \
+ "lea (%[src],%[src_ystride],2),%[src]\n\t" \
+ "punpcklbw %%xmm4,%%xmm4\n\t" \
+ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
+ "psubw %%xmm4,%%xmm0\n\t" \
+ "movq (%[src]),%%xmm4\n\t" \
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+ "movq (%[ref]),%%xmm0\n\t" \
+ "punpcklbw %%xmm5,%%xmm1\n\t" \
+ "punpcklbw %%xmm5,%%xmm5\n\t" \
+ "psubw %%xmm5,%%xmm1\n\t" \
+ "movq (%[src],%[src_ystride]),%%xmm5\n\t" \
+ "punpcklbw %%xmm7,%%xmm2\n\t" \
+ "punpcklbw %%xmm7,%%xmm7\n\t" \
+ "psubw %%xmm7,%%xmm2\n\t" \
+ "movq (%[ref],%[ref_ystride]),%%xmm7\n\t" \
+ "punpcklbw %%xmm6,%%xmm3\n\t" \
+ "lea (%[src],%[src_ystride],2),%[src]\n\t" \
+ "punpcklbw %%xmm6,%%xmm6\n\t" \
+ "psubw %%xmm6,%%xmm3\n\t" \
+ "movq (%[src]),%%xmm6\n\t" \
+ "punpcklbw %%xmm0,%%xmm4\n\t" \
+ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
+ "punpcklbw %%xmm0,%%xmm0\n\t" \
+ "lea (%[src],%[src_ystride],2),%[src]\n\t" \
+ "psubw %%xmm0,%%xmm4\n\t" \
+ "movq (%[ref]),%%xmm0\n\t" \
+ "punpcklbw %%xmm7,%%xmm5\n\t" \
+ "neg %[src_ystride]\n\t" \
+ "punpcklbw %%xmm7,%%xmm7\n\t" \
+ "psubw %%xmm7,%%xmm5\n\t" \
+ "movq (%[src],%[src_ystride]),%%xmm7\n\t" \
+ "punpcklbw %%xmm0,%%xmm6\n\t" \
+ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
+ "punpcklbw %%xmm0,%%xmm0\n\t" \
+ "neg %[ref_ystride]\n\t" \
+ "psubw %%xmm0,%%xmm6\n\t" \
+ "movq (%[ref],%[ref_ystride]),%%xmm0\n\t" \
+ "punpcklbw %%xmm0,%%xmm7\n\t" \
+ "punpcklbw %%xmm0,%%xmm0\n\t" \
+ "psubw %%xmm0,%%xmm7\n\t" \
+ "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \
+
+/*Load an 8x8 array of pixel values from %[src] into %%xmm0...%%xmm7.*/
+#define OC_LOAD_8x8 \
+ "#OC_LOAD_8x8\n\t" \
+ "movq (%[src]),%%xmm0\n\t" \
+ "movq (%[src],%[ystride]),%%xmm1\n\t" \
+ "movq (%[src],%[ystride],2),%%xmm2\n\t" \
+ "pxor %%xmm7,%%xmm7\n\t" \
+ "movq (%[src],%[ystride3]),%%xmm3\n\t" \
+ "punpcklbw %%xmm7,%%xmm0\n\t" \
+ "movq (%[src4]),%%xmm4\n\t" \
+ "punpcklbw %%xmm7,%%xmm1\n\t" \
+ "movq (%[src4],%[ystride]),%%xmm5\n\t" \
+ "punpcklbw %%xmm7,%%xmm2\n\t" \
+ "movq (%[src4],%[ystride],2),%%xmm6\n\t" \
+ "punpcklbw %%xmm7,%%xmm3\n\t" \
+ "movq (%[src4],%[ystride3]),%%xmm7\n\t" \
+ "punpcklbw %%xmm4,%%xmm4\n\t" \
+ "punpcklbw %%xmm5,%%xmm5\n\t" \
+ "psrlw $8,%%xmm4\n\t" \
+ "psrlw $8,%%xmm5\n\t" \
+ "punpcklbw %%xmm6,%%xmm6\n\t" \
+ "punpcklbw %%xmm7,%%xmm7\n\t" \
+ "psrlw $8,%%xmm6\n\t" \
+ "psrlw $8,%%xmm7\n\t" \
+
+/*Performs the first two stages of an 8-point 1-D Hadamard transform in place.
+  Outputs 1, 3, 4, and 5 from the second stage are negated (which allows us to
+   perform this stage in place with no temporary registers).*/
+#define OC_HADAMARD_AB_8x8 \
+ "#OC_HADAMARD_AB_8x8\n\t" \
+ /*Stage A:*/ \
+ "paddw %%xmm5,%%xmm1\n\t" \
+ "paddw %%xmm6,%%xmm2\n\t" \
+ "paddw %%xmm5,%%xmm5\n\t" \
+ "paddw %%xmm6,%%xmm6\n\t" \
+ "psubw %%xmm1,%%xmm5\n\t" \
+ "psubw %%xmm2,%%xmm6\n\t" \
+ "paddw %%xmm7,%%xmm3\n\t" \
+ "paddw %%xmm4,%%xmm0\n\t" \
+ "paddw %%xmm7,%%xmm7\n\t" \
+ "paddw %%xmm4,%%xmm4\n\t" \
+ "psubw %%xmm3,%%xmm7\n\t" \
+ "psubw %%xmm0,%%xmm4\n\t" \
+ /*Stage B:*/ \
+ "paddw %%xmm2,%%xmm0\n\t" \
+ "paddw %%xmm3,%%xmm1\n\t" \
+ "paddw %%xmm6,%%xmm4\n\t" \
+ "paddw %%xmm7,%%xmm5\n\t" \
+ "paddw %%xmm2,%%xmm2\n\t" \
+ "paddw %%xmm3,%%xmm3\n\t" \
+ "paddw %%xmm6,%%xmm6\n\t" \
+ "paddw %%xmm7,%%xmm7\n\t" \
+ "psubw %%xmm0,%%xmm2\n\t" \
+ "psubw %%xmm1,%%xmm3\n\t" \
+ "psubw %%xmm4,%%xmm6\n\t" \
+ "psubw %%xmm5,%%xmm7\n\t" \
+
+/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
+  Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
+   place with no temporary registers).*/
+#define OC_HADAMARD_C_8x8 \
+ "#OC_HADAMARD_C_8x8\n\t" \
+ /*Stage C:*/ \
+ "paddw %%xmm1,%%xmm0\n\t" \
+ "paddw %%xmm3,%%xmm2\n\t" \
+ "paddw %%xmm5,%%xmm4\n\t" \
+ "paddw %%xmm7,%%xmm6\n\t" \
+ "paddw %%xmm1,%%xmm1\n\t" \
+ "paddw %%xmm3,%%xmm3\n\t" \
+ "paddw %%xmm5,%%xmm5\n\t" \
+ "paddw %%xmm7,%%xmm7\n\t" \
+ "psubw %%xmm0,%%xmm1\n\t" \
+ "psubw %%xmm2,%%xmm3\n\t" \
+ "psubw %%xmm4,%%xmm5\n\t" \
+ "psubw %%xmm6,%%xmm7\n\t" \
+
+/*Performs an 8-point 1-D Hadamard transform in place.
+  Outputs 1, 2, 4, and 7 are negated (which allows us to perform the transform
+   in place with no temporary registers).*/
+#define OC_HADAMARD_8x8 \
+ OC_HADAMARD_AB_8x8 \
+ OC_HADAMARD_C_8x8 \
+
+/*Performs the first part of the final stage of the Hadamard transform and
+   summing of absolute values.
+  At the end of this part, %%xmm1 will contain the DC coefficient of the
+   transform.*/
+#define OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
+ /*We use the fact that \
+     (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
+    to merge the final butterfly with the abs and the first stage of \
+    accumulation. \
+   Thus we can avoid using pabsw, which is not available until SSSE3. \
+   Emulating pabsw takes 3 instructions, so the straightforward SSE2 \
+    implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
+    registers). \
+   Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
+   This implementation is only 26 (+4 for spilling registers).*/ \
+ "#OC_HADAMARD_C_ABS_ACCUM_A_8x8\n\t" \
+ "movdqa %%xmm7,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+ "movdqa %%xmm6,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+ /*xmm7={0x7FFF}x8 \
+   xmm4=max(abs(xmm4),abs(xmm5))-0x7FFF*/ \
+ "pcmpeqb %%xmm7,%%xmm7\n\t" \
+ "movdqa %%xmm4,%%xmm6\n\t" \
+ "psrlw $1,%%xmm7\n\t" \
+ "paddw %%xmm5,%%xmm6\n\t" \
+ "pmaxsw %%xmm5,%%xmm4\n\t" \
+ "paddsw %%xmm7,%%xmm6\n\t" \
+ "psubw %%xmm6,%%xmm4\n\t" \
+ /*xmm2=max(abs(xmm2),abs(xmm3))-0x7FFF \
+   xmm0=max(abs(xmm0),abs(xmm1))-0x7FFF*/ \
+ "movdqa %%xmm2,%%xmm6\n\t" \
+ "movdqa %%xmm0,%%xmm5\n\t" \
+ "pmaxsw %%xmm3,%%xmm2\n\t" \
+ "pmaxsw %%xmm1,%%xmm0\n\t" \
+ "paddw %%xmm3,%%xmm6\n\t" \
+ "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm3\n\t" \
+ "paddw %%xmm5,%%xmm1\n\t" \
+ "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm5\n\t" \
+
+/*Performs the second part of the final stage of the Hadamard transform and
+   summing of absolute values.*/
+#define OC_HADAMARD_C_ABS_ACCUM_B_8x8 \
+ "#OC_HADAMARD_C_ABS_ACCUM_B_8x8\n\t" \
+ "paddsw %%xmm7,%%xmm6\n\t" \
+ "paddsw %%xmm7,%%xmm1\n\t" \
+ "psubw %%xmm6,%%xmm2\n\t" \
+ "psubw %%xmm1,%%xmm0\n\t" \
+ /*xmm7={1}x8 (needed for the horizontal add that follows) \
+   xmm0+=xmm2+xmm4+max(abs(xmm3),abs(xmm5))-0x7FFF*/ \
+ "movdqa %%xmm3,%%xmm6\n\t" \
+ "pmaxsw %%xmm5,%%xmm3\n\t" \
+ "paddw %%xmm2,%%xmm0\n\t" \
+ "paddw %%xmm5,%%xmm6\n\t" \
+ "paddw %%xmm4,%%xmm0\n\t" \
+ "paddsw %%xmm7,%%xmm6\n\t" \
+ "paddw %%xmm3,%%xmm0\n\t" \
+ "psrlw $14,%%xmm7\n\t" \
+ "psubw %%xmm6,%%xmm0\n\t" \
+
+/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
+   absolute value of each component, and accumulates everything into xmm0.*/
+#define OC_HADAMARD_C_ABS_ACCUM_8x8 \
+ OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
+ OC_HADAMARD_C_ABS_ACCUM_B_8x8 \
+
+/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
+   component, and accumulates everything into xmm0.
+  Note that xmm0 will have an extra 4 added to each column, and that after
+   removing this value, the remainder will be half the conventional value.*/
+#define OC_HADAMARD_ABS_ACCUM_8x8 \
+ OC_HADAMARD_AB_8x8 \
+ OC_HADAMARD_C_ABS_ACCUM_8x8
+
+static unsigned oc_int_frag_satd_sse2(int *_dc,
+ const unsigned char *_src,int _src_ystride,
+ const unsigned char *_ref,int _ref_ystride){
+  OC_ALIGN16(ogg_int16_t buf[16]);
+  unsigned ret;
+  unsigned ret2;
+  int      dc;
+  __asm__ __volatile__(
+    OC_LOAD_SUB_8x8
+    OC_HADAMARD_8x8
+    OC_TRANSPOSE_8x8
+    /*We split out the stages here so we can save the DC coefficient in the
+       middle.*/
+    OC_HADAMARD_AB_8x8
+    OC_HADAMARD_C_ABS_ACCUM_A_8x8
+    "movd %%xmm1,%[dc]\n\t"
+    OC_HADAMARD_C_ABS_ACCUM_B_8x8
+    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
+       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
+       for the factor of two we dropped + 3 for the vertical accumulation).
+      Now we finally have to promote things to dwords.
+      We break this part out of OC_HADAMARD_ABS_ACCUM_8x8 to hide the long
+       latency of pmaddwd by starting to compute abs(dc) here.*/
+    "pmaddwd %%xmm7,%%xmm0\n\t"
+    "movsx %w[dc],%[dc]\n\t"
+    "cdq\n\t"
+    "movdqa %%xmm0,%%xmm1\n\t"
+    "punpckhqdq %%xmm0,%%xmm0\n\t"
+    "paddd %%xmm1,%%xmm0\n\t"
+    "pshuflw $0xE,%%xmm0,%%xmm1\n\t"
+    "paddd %%xmm1,%%xmm0\n\t"
+    "movd %%xmm0,%[ret]\n\t"
+    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x8 each have an extra 4
+       added to them, a factor of two removed, and the DC value included;
+       correct the final sum here.*/
+    "lea -64(%[ret2],%[ret],2),%[ret]\n\t"
+    "xor %[dc],%[ret2]\n\t"
+    "sub %[ret2],%[ret]\n\t"
+    /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
+       and %[dc] with some of the inputs, since for once we don't write to
+       them until after we're done using everything but %[buf].*/
+    /*Note that _src_ystride and _ref_ystride must be given non-overlapping
+       constraints, otherwise if gcc can prove they're equal it will allocate
+       them to the same register (which is bad); _src and _ref face a similar
+       problem.
+      All four are destructively modified, but if we list them as output
+       constraints, gcc can't alias them with other outputs.*/
+    :[ret]"=r"(ret),[ret2]"=d"(ret2),[dc]"=a"(dc),
+     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
+    :[src]"S"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
+     [ref]"a"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
+    /*We have to use neg, so we actually clobber the condition codes for once
+       (not to mention sub, and add).*/
+    :"cc"
+  );
+  *_dc=dc;
+  return ret;
+}
+
+unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  return oc_int_frag_satd_sse2(_dc,_src,_ystride,_ref,_ystride);
+}
+
+unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
+  OC_ALIGN8(unsigned char ref[64]);
+  oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
+  return oc_int_frag_satd_sse2(_dc,_src,_ystride,ref,8);
+}
+
+unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
+ const unsigned char *_src,int _ystride){
+  OC_ALIGN16(ogg_int16_t buf[16]);
+  unsigned ret;
+  int      dc;
+  __asm__ __volatile__(
+    OC_LOAD_8x8
+    OC_HADAMARD_8x8
+    OC_TRANSPOSE_8x8
+    /*We split out the stages here so we can save the DC coefficient in the
+       middle.*/
+    OC_HADAMARD_AB_8x8
+    OC_HADAMARD_C_ABS_ACCUM_A_8x8
+    "movd %%xmm1,%[dc]\n\t"
+    OC_HADAMARD_C_ABS_ACCUM_B_8x8
+    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
+       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
+       for the factor of two we dropped + 3 for the vertical accumulation).
+      Now we finally have to promote things to dwords.*/
+    "pmaddwd %%xmm7,%%xmm0\n\t"
+    /*We assume that the DC coefficient is always positive (which is true,
+       because the input to the INTRA transform was not a difference).*/
+    "movzx %w[dc],%[dc]\n\t"
+    "movdqa %%xmm0,%%xmm1\n\t"
+    "punpckhqdq %%xmm0,%%xmm0\n\t"
+    "paddd %%xmm1,%%xmm0\n\t"
+    "pshuflw $0xE,%%xmm0,%%xmm1\n\t"
+    "paddd %%xmm1,%%xmm0\n\t"
+    "movd %%xmm0,%[ret]\n\t"
+    "lea -64(%[ret],%[ret]),%[ret]\n\t"
+    "sub %[dc],%[ret]\n\t"
+    /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
+       and %[dc] with some of the inputs, since for once we don't write to
+       them until after we're done using everything but %[buf].*/
+    :[ret]"=a"(ret),[dc]"=r"(dc),
+     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
+    :[src]"r"(_src),[src4]"r"(_src+4*_ystride),
+     [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
+    /*We have to use sub, so we actually clobber the condition codes for once.*/
+    :"cc"
+  );
+  *_dc=dc;
+  return ret;
+}
+
+#endif
diff --git a/thirdparty/libtheora/x86/sse2fdct.c b/thirdparty/libtheora/x86/sse2fdct.c
index 86c17d68b1..64c1d27372 100644
--- a/thirdparty/libtheora/x86/sse2fdct.c
+++ b/thirdparty/libtheora/x86/sse2fdct.c
@@ -13,12 +13,14 @@
 /*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
 #include <stddef.h>
 #include "x86enc.h"
+#include "x86zigzag.h"
+#include "sse2trans.h"
 
 #if defined(OC_X86_64_ASM)
 
-# define OC_FDCT8x8 \
+# define OC_FDCT_8x8 \
  /*Note: xmm15={0}x8 and xmm14={-1}x8.*/ \
- "#OC_FDCT8x8\n\t" \
+ "#OC_FDCT_8x8\n\t" \
  /*Stage 1:*/ \
  "movdqa %%xmm0,%%xmm11\n\t" \
  "movdqa %%xmm1,%%xmm10\n\t" \
@@ -349,81 +351,6 @@
  "psubw %%xmm14,%%xmm10\n\t" \
  "paddw %%xmm10,%%xmm7\n\t " \
 
-# define OC_TRANSPOSE8x8 \
- "#OC_TRANSPOSE8x8\n\t" \
- "movdqa %%xmm4,%%xmm8\n\t" \
- /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
- "punpcklwd %%xmm5,%%xmm4\n\t" \
- /*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
- "punpckhwd %%xmm5,%%xmm8\n\t" \
- /*xmm5 is free.*/ \
- "movdqa %%xmm0,%%xmm5\n\t" \
- /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
- "punpcklwd %%xmm1,%%xmm0\n\t" \
- /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
- "punpckhwd %%xmm1,%%xmm5\n\t" \
- /*xmm1 is free.*/ \
- "movdqa %%xmm6,%%xmm1\n\t" \
- /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
- "punpcklwd %%xmm7,%%xmm6\n\t" \
- /*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
- "punpckhwd %%xmm7,%%xmm1\n\t" \
- /*xmm7 is free.*/ \
- "movdqa %%xmm2,%%xmm7\n\t" \
- /*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
- "punpcklwd %%xmm3,%%xmm7\n\t" \
- /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
- "punpckhwd %%xmm3,%%xmm2\n\t" \
- /*xmm3 is free.*/ \
- "movdqa %%xmm0,%%xmm3\n\t" \
- /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
- "punpckldq %%xmm7,%%xmm0\n\t" \
- /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
- "punpckhdq %%xmm7,%%xmm3\n\t" \
- /*xmm7 is free.*/ \
- "movdqa %%xmm5,%%xmm7\n\t" \
- /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
- "punpckldq %%xmm2,%%xmm5\n\t" \
- /*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
- "punpckhdq %%xmm2,%%xmm7\n\t" \
- /*xmm2 is free.*/ \
- "movdqa %%xmm4,%%xmm2\n\t" \
- /*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
- "punpckldq %%xmm6,%%xmm2\n\t" \
- /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
- "punpckhdq %%xmm6,%%xmm4\n\t" \
- /*xmm6 is free.*/ \
- "movdqa %%xmm8,%%xmm6\n\t" \
- /*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
- "punpckldq %%xmm1,%%xmm6\n\t" \
- /*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
- "punpckhdq %%xmm1,%%xmm8\n\t" \
- /*xmm1 is free.*/ \
- "movdqa %%xmm0,%%xmm1\n\t" \
- /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
- "punpcklqdq %%xmm2,%%xmm0\n\t" \
- /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
- "punpckhqdq %%xmm2,%%xmm1\n\t" \
- /*xmm2 is free.*/ \
- "movdqa %%xmm3,%%xmm2\n\t" \
- /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
- "punpcklqdq %%xmm4,%%xmm2\n\t" \
- /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
- "punpckhqdq %%xmm4,%%xmm3\n\t" \
- /*xmm4 is free.*/ \
- "movdqa %%xmm5,%%xmm4\n\t" \
- /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
- "punpcklqdq %%xmm6,%%xmm4\n\t" \
- /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
- "punpckhqdq %%xmm6,%%xmm5\n\t" \
- /*xmm6 is free.*/ \
- "movdqa %%xmm7,%%xmm6\n\t" \
- /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
- "punpcklqdq %%xmm8,%%xmm6\n\t" \
- /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
- "punpckhqdq %%xmm8,%%xmm7\n\t" \
- /*xmm8 is free.*/ \
-
 /*SSE2 implementation of the fDCT for x86-64 only.
   Because of the 8 extra XMM registers on x86-64, this version can operate
    without any temporary stack access at all.*/
@@ -482,12 +409,10 @@ void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
     /*xmm1=_x[15...8]-{0,0,0,0,0,0,0,1}*/
     "psubw %%xmm9,%%xmm1\n\t"
     /*Transform columns.*/
-    OC_FDCT8x8
+    OC_FDCT_8x8
     /*Transform rows.*/
-    OC_TRANSPOSE8x8
-    OC_FDCT8x8
-    /*TODO: zig-zag ordering?*/
-    OC_TRANSPOSE8x8
+    OC_TRANSPOSE_8x8
+    OC_FDCT_8x8
     /*xmm14={-2,-2,-2,-2,-2,-2,-2,-2}*/
     "paddw %%xmm14,%%xmm14\n\t"
     "psubw %%xmm14,%%xmm0\n\t"
@@ -506,15 +431,19 @@ void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
     "psubw %%xmm14,%%xmm7\n\t"
     "psraw $2,%%xmm6\n\t"
     "psraw $2,%%xmm7\n\t"
-    /*Store the result.*/
-    "movdqa %%xmm0,0x00(%[y])\n\t"
-    "movdqa %%xmm1,0x10(%[y])\n\t"
-    "movdqa %%xmm2,0x20(%[y])\n\t"
-    "movdqa %%xmm3,0x30(%[y])\n\t"
-    "movdqa %%xmm4,0x40(%[y])\n\t"
-    "movdqa %%xmm5,0x50(%[y])\n\t"
-    "movdqa %%xmm6,0x60(%[y])\n\t"
-    "movdqa %%xmm7,0x70(%[y])\n\t"
+    /*Transpose, zig-zag, and store the result.*/
+    /*We could probably do better using SSSE3's palignr, but re-using the MMXEXT
+       version will do for now.*/
+#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
+    "movdq2q %%xmm"#_row","_reg"\n\t" \
+
+#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
+    "punpckhqdq %%xmm"#_row",%%xmm"#_row"\n\t" \
+    "movdq2q %%xmm"#_row","_reg"\n\t" \
+
+    OC_TRANSPOSE_ZIG_ZAG_MMXEXT
+#undef OC_ZZ_LOAD_ROW_LO
+#undef OC_ZZ_LOAD_ROW_HI
     :[a]"=&r"(a)
     :[y]"r"(_y),[x]"r"(_x)
     :"memory"
diff --git a/thirdparty/libtheora/x86/sse2idct.c b/thirdparty/libtheora/x86/sse2idct.c
new file mode 100644
index 0000000000..4597ab074f
--- /dev/null
+++ b/thirdparty/libtheora/x86/sse2idct.c
@@ -0,0 +1,456 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+/*SSE2 acceleration of Theora's iDCT.*/
+#include "x86int.h"
+#include "sse2trans.h"
+#include "../dct.h"
+
+#if defined(OC_X86_ASM)
+
+/*Constants used by the iDCT routines in this file, one 16-byte row per value.*/
+const unsigned short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={
+        8,      8,      8,      8,      8,      8,      8,      8,
+  OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,
+  OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,
+  OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,
+  OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,
+  OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,
+  OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,
+  OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1
+};
+
+
+/*Performs the first three stages of the iDCT.
+  xmm2, xmm6, xmm3, and xmm5 must contain the corresponding rows of the input
+   (accessed in that order); rows 0, 1, 4, and 7 are loaded from _x.
+  The first 32 bytes of buf are used as scratch space.
+  On output, xmm7 down to xmm4 contain rows 0 through 3, and xmm3 down to xmm0
+   contain rows 4 through 7.*/
+#define OC_IDCT_8x8_ABC(_x) \
+  "#OC_IDCT_8x8_ABC\n\t" \
+  /*Stage 1:*/ \
+  /*2-3 rotation by 6pi/16. \
+    xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \
+  "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm1\n\t" \
+  "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm4\n\t" \
+  "movdqa %%xmm1,%%xmm0\n\t" \
+  "pmulhw %%xmm2,%%xmm1\n\t" \
+  "movdqa %%xmm4,%%xmm7\n\t" \
+  "pmulhw %%xmm6,%%xmm0\n\t" \
+  "pmulhw %%xmm2,%%xmm7\n\t" \
+  "pmulhw %%xmm6,%%xmm4\n\t" \
+  "paddw %%xmm6,%%xmm0\n\t" \
+  "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm6\n\t" \
+  "paddw %%xmm1,%%xmm2\n\t" \
+  "psubw %%xmm0,%%xmm7\n\t" \
+  "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+  "paddw %%xmm4,%%xmm2\n\t" \
+  "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm4\n\t" \
+  "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+  /*5-6 rotation by 3pi/16. \
+    xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \
+  "movdqa %%xmm4,%%xmm2\n\t" \
+  "movdqa %%xmm6,%%xmm1\n\t" \
+  "pmulhw %%xmm3,%%xmm4\n\t" \
+  "pmulhw %%xmm5,%%xmm1\n\t" \
+  "pmulhw %%xmm3,%%xmm6\n\t" \
+  "pmulhw %%xmm5,%%xmm2\n\t" \
+  "paddw %%xmm3,%%xmm4\n\t" \
+  "paddw %%xmm5,%%xmm3\n\t" \
+  "paddw %%xmm6,%%xmm3\n\t" \
+  "movdqa "OC_MEM_OFFS(0x70,_x)",%%xmm6\n\t" \
+  "paddw %%xmm5,%%xmm1\n\t" \
+  "movdqa "OC_MEM_OFFS(0x10,_x)",%%xmm5\n\t" \
+  "paddw %%xmm3,%%xmm2\n\t" \
+  "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
+  "psubw %%xmm4,%%xmm1\n\t" \
+  "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm4\n\t" \
+  /*4-7 rotation by 7pi/16. \
+    xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \
+  "movdqa %%xmm3,%%xmm0\n\t" \
+  "movdqa %%xmm4,%%xmm7\n\t" \
+  "pmulhw %%xmm5,%%xmm3\n\t" \
+  "pmulhw %%xmm5,%%xmm7\n\t" \
+  "pmulhw %%xmm6,%%xmm4\n\t" \
+  "pmulhw %%xmm6,%%xmm0\n\t" \
+  "paddw %%xmm6,%%xmm4\n\t" \
+  "movdqa "OC_MEM_OFFS(0x40,_x)",%%xmm6\n\t" \
+  "paddw %%xmm5,%%xmm7\n\t" \
+  "psubw %%xmm4,%%xmm3\n\t" \
+  "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
+  "paddw %%xmm7,%%xmm0\n\t" \
+  "movdqa "OC_MEM_OFFS(0x00,_x)",%%xmm7\n\t" \
+  /*0-1 butterfly. \
+    xmm4=xmm5=C4, xmm7=X0, xmm6=X4.*/ \
+  "paddw %%xmm7,%%xmm6\n\t" \
+  "movdqa %%xmm4,%%xmm5\n\t" \
+  "pmulhw %%xmm6,%%xmm4\n\t" \
+  "paddw %%xmm7,%%xmm7\n\t" \
+  "psubw %%xmm6,%%xmm7\n\t" \
+  "paddw %%xmm6,%%xmm4\n\t" \
+  /*Stage 2:*/ \
+  /*4-5 butterfly: xmm3=t[4], xmm1=t[5] \
+    7-6 butterfly: xmm2=t[6], xmm0=t[7]*/ \
+  "movdqa %%xmm3,%%xmm6\n\t" \
+  "paddw %%xmm1,%%xmm3\n\t" \
+  "psubw %%xmm1,%%xmm6\n\t" \
+  "movdqa %%xmm5,%%xmm1\n\t" \
+  "pmulhw %%xmm7,%%xmm5\n\t" \
+  "paddw %%xmm7,%%xmm5\n\t" \
+  "movdqa %%xmm0,%%xmm7\n\t" \
+  "paddw %%xmm2,%%xmm0\n\t" \
+  "psubw %%xmm2,%%xmm7\n\t" \
+  "movdqa %%xmm1,%%xmm2\n\t" \
+  "pmulhw %%xmm6,%%xmm1\n\t" \
+  "pmulhw %%xmm7,%%xmm2\n\t" \
+  "paddw %%xmm6,%%xmm1\n\t" \
+  "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
+  "paddw %%xmm7,%%xmm2\n\t" \
+  "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
+  /*Stage 3: \
+    6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
+    0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
+    1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
+  "paddw %%xmm2,%%xmm1\n\t" \
+  "paddw %%xmm5,%%xmm6\n\t" \
+  "paddw %%xmm4,%%xmm7\n\t" \
+  "paddw %%xmm2,%%xmm2\n\t" \
+  "paddw %%xmm4,%%xmm4\n\t" \
+  "paddw %%xmm5,%%xmm5\n\t" \
+  "psubw %%xmm1,%%xmm2\n\t" \
+  "psubw %%xmm7,%%xmm4\n\t" \
+  "psubw %%xmm6,%%xmm5\n\t" \
+
+/*Performs the last stage of the iDCT.
+  On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm3 down to xmm0
+   contain rows 4 through 7.
+  On output, xmm0 through xmm7 contain the corresponding rows.*/
+#define OC_IDCT_8x8_D \
+  "#OC_IDCT_8x8_D\n\t" \
+  /*Stage 4: \
+    0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
+    1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
+    2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
+    3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
+  "psubw %%xmm0,%%xmm7\n\t" \
+  "psubw %%xmm1,%%xmm6\n\t" \
+  "psubw %%xmm2,%%xmm5\n\t" \
+  "psubw %%xmm3,%%xmm4\n\t" \
+  "paddw %%xmm0,%%xmm0\n\t" \
+  "paddw %%xmm1,%%xmm1\n\t" \
+  "paddw %%xmm2,%%xmm2\n\t" \
+  "paddw %%xmm3,%%xmm3\n\t" \
+  "paddw %%xmm7,%%xmm0\n\t" \
+  "paddw %%xmm6,%%xmm1\n\t" \
+  "paddw %%xmm5,%%xmm2\n\t" \
+  "paddw %%xmm4,%%xmm3\n\t" \
+
+/*Performs the last stage of the iDCT, then rounds, shifts, and stores it.
+  On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm3 down to xmm0
+   contain rows 4 through 7.
+  On output, each row plus the bias at c+0x00, shifted right by 4, is in y.*/
+#define OC_IDCT_8x8_D_STORE \
+  "#OC_IDCT_8x8_D_STORE\n\t" \
+  /*Stage 4: \
+    0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
+    1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
+    2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
+    3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
+  "psubw %%xmm3,%%xmm4\n\t" \
+  "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
+  "movdqa "OC_MEM_OFFS(0x00,c)",%%xmm4\n\t" \
+  "psubw %%xmm0,%%xmm7\n\t" \
+  "psubw %%xmm1,%%xmm6\n\t" \
+  "psubw %%xmm2,%%xmm5\n\t" \
+  "paddw %%xmm4,%%xmm7\n\t" \
+  "paddw %%xmm4,%%xmm6\n\t" \
+  "paddw %%xmm4,%%xmm5\n\t" \
+  "paddw "OC_MEM_OFFS(0x40,y)",%%xmm4\n\t" \
+  "paddw %%xmm0,%%xmm0\n\t" \
+  "paddw %%xmm1,%%xmm1\n\t" \
+  "paddw %%xmm2,%%xmm2\n\t" \
+  "paddw %%xmm3,%%xmm3\n\t" \
+  "paddw %%xmm7,%%xmm0\n\t" \
+  "paddw %%xmm6,%%xmm1\n\t" \
+  "psraw $4,%%xmm0\n\t" \
+  "paddw %%xmm5,%%xmm2\n\t" \
+  "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" \
+  "psraw $4,%%xmm1\n\t" \
+  "paddw %%xmm4,%%xmm3\n\t" \
+  "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" \
+  "psraw $4,%%xmm2\n\t" \
+  "movdqa %%xmm2,"OC_MEM_OFFS(0x20,y)"\n\t" \
+  "psraw $4,%%xmm3\n\t" \
+  "movdqa %%xmm3,"OC_MEM_OFFS(0x30,y)"\n\t" \
+  "psraw $4,%%xmm4\n\t" \
+  "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
+  "psraw $4,%%xmm5\n\t" \
+  "movdqa %%xmm5,"OC_MEM_OFFS(0x50,y)"\n\t" \
+  "psraw $4,%%xmm6\n\t" \
+  "movdqa %%xmm6,"OC_MEM_OFFS(0x60,y)"\n\t" \
+  "psraw $4,%%xmm7\n\t" \
+  "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \
+
+static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  OC_ALIGN16(ogg_int16_t buf[16]);
+  int i;
+  /*Full iDCT of the pre-transposed 8x8 block _x into _y; _x is zeroed after.*/
+  __asm__ __volatile__(
+    /*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/
+    "movdqa "OC_MEM_OFFS(0x20,x)",%%xmm2\n\t"
+    "movdqa "OC_MEM_OFFS(0x60,x)",%%xmm6\n\t"
+    "movdqa "OC_MEM_OFFS(0x30,x)",%%xmm3\n\t"
+    "movdqa "OC_MEM_OFFS(0x50,x)",%%xmm5\n\t"
+    OC_IDCT_8x8_ABC(x)
+    OC_IDCT_8x8_D
+    OC_TRANSPOSE_8x8
+    /*Spill rows 0, 1, 4, and 7 to y, where the second pass loads them from.*/
+    "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t"
+    "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t"
+    "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t"
+    "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t"
+    OC_IDCT_8x8_ABC(y)
+    OC_IDCT_8x8_D_STORE
+    :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)),
+     [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
+    :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
+     [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64))
+  );
+  /*Clear input for the next block (decoder only); re-zero xmm0 in each asm.*/
+  for(i=0;i<2;i++){
+    __asm__ __volatile__(
+      "pxor %%xmm0,%%xmm0\n\t"
+      "movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+      "movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+      "movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+      "movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+      :[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32))
+    );
+  }
+}
+
+/*For the first step of the 10-coefficient version of the 8x8 iDCT, we only
+   need four columns, and MMX is faster on processors with a 64-bit data path.
+  Input rows 0...3 are in mm0...mm3; output rows 0...7 are left in mm0...mm7.*/
+#define OC_IDCT_8x8_10_MMX \
+  "#OC_IDCT_8x8_10_MMX\n\t" \
+  /*Stage 1:*/ \
+  /*2-3 rotation by 6pi/16. \
+    mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \
+  "movq "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
+  "movq "OC_MEM_OFFS(0x20,c)",%%mm6\n\t" \
+  "pmulhw %%mm2,%%mm6\n\t" \
+  "pmulhw %%mm2,%%mm7\n\t" \
+  "movq "OC_MEM_OFFS(0x50,c)",%%mm5\n\t" \
+  "paddw %%mm6,%%mm2\n\t" \
+  "movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+  "movq "OC_MEM_OFFS(0x30,c)",%%mm2\n\t" \
+  "movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+  /*5-6 rotation by 3pi/16. \
+    mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \
+  "pmulhw %%mm3,%%mm5\n\t" \
+  "pmulhw %%mm3,%%mm2\n\t" \
+  "movq "OC_MEM_OFFS(0x10,c)",%%mm7\n\t" \
+  "paddw %%mm3,%%mm5\n\t" \
+  "paddw %%mm3,%%mm2\n\t" \
+  "movq "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
+  /*4-7 rotation by 7pi/16. \
+    mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \
+  "pmulhw %%mm1,%%mm3\n\t" \
+  "pmulhw %%mm1,%%mm7\n\t" \
+  "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
+  "movq %%mm3,%%mm6\n\t" \
+  "paddw %%mm1,%%mm7\n\t" \
+  /*0-1 butterfly. \
+    mm4=C4, mm0=X0, X4=0.*/ \
+  /*Stage 2:*/ \
+  /*4-5 butterfly: mm3=t[4], mm5=t[5] \
+    7-6 butterfly: mm2=t[6], mm7=t[7]*/ \
+  "psubw %%mm5,%%mm3\n\t" \
+  "paddw %%mm5,%%mm6\n\t" \
+  "movq %%mm4,%%mm1\n\t" \
+  "pmulhw %%mm0,%%mm4\n\t" \
+  "paddw %%mm0,%%mm4\n\t" \
+  "movq %%mm7,%%mm0\n\t" \
+  "movq %%mm4,%%mm5\n\t" \
+  "paddw %%mm2,%%mm0\n\t" \
+  "psubw %%mm2,%%mm7\n\t" \
+  "movq %%mm1,%%mm2\n\t" \
+  "pmulhw %%mm6,%%mm1\n\t" \
+  "pmulhw %%mm7,%%mm2\n\t" \
+  "paddw %%mm6,%%mm1\n\t" \
+  "movq "OC_MEM_OFFS(0x00,buf)",%%mm6\n\t" \
+  "paddw %%mm7,%%mm2\n\t" \
+  "movq "OC_MEM_OFFS(0x10,buf)",%%mm7\n\t" \
+  /*Stage 3: \
+    6-5 butterfly: mm1=t[5], mm2=t[6] -> mm1=t[6]+t[5], mm2=t[6]-t[5] \
+    0-3 butterfly: mm4=t[0], mm7=t[3] -> mm7=t[0]+t[3], mm4=t[0]-t[3] \
+    1-2 butterfly: mm5=t[1], mm6=t[2] -> mm6=t[1]+t[2], mm5=t[1]-t[2]*/ \
+  "paddw %%mm2,%%mm1\n\t" \
+  "paddw %%mm5,%%mm6\n\t" \
+  "paddw %%mm4,%%mm7\n\t" \
+  "paddw %%mm2,%%mm2\n\t" \
+  "paddw %%mm4,%%mm4\n\t" \
+  "paddw %%mm5,%%mm5\n\t" \
+  "psubw %%mm1,%%mm2\n\t" \
+  "psubw %%mm7,%%mm4\n\t" \
+  "psubw %%mm6,%%mm5\n\t" \
+  /*Stage 4: \
+    0-7 butterfly: mm7=t[0], mm0=t[7] -> mm0=t[0]+t[7], mm7=t[0]-t[7] \
+    1-6 butterfly: mm6=t[1], mm1=t[6] -> mm1=t[1]+t[6], mm6=t[1]-t[6] \
+    2-5 butterfly: mm5=t[2], mm2=t[5] -> mm2=t[2]+t[5], mm5=t[2]-t[5] \
+    3-4 butterfly: mm4=t[3], mm3=t[4] -> mm3=t[3]+t[4], mm4=t[3]-t[4]*/ \
+  "psubw %%mm0,%%mm7\n\t" \
+  "psubw %%mm1,%%mm6\n\t" \
+  "psubw %%mm2,%%mm5\n\t" \
+  "psubw %%mm3,%%mm4\n\t" \
+  "paddw %%mm0,%%mm0\n\t" \
+  "paddw %%mm1,%%mm1\n\t" \
+  "paddw %%mm2,%%mm2\n\t" \
+  "paddw %%mm3,%%mm3\n\t" \
+  "paddw %%mm7,%%mm0\n\t" \
+  "paddw %%mm6,%%mm1\n\t" \
+  "paddw %%mm5,%%mm2\n\t" \
+  "paddw %%mm4,%%mm3\n\t" \
+
+#define OC_IDCT_8x8_10_ABC \
+  "#OC_IDCT_8x8_10_ABC\n\t" \
+  /*Stage 1:*/ \
+  /*2-3 rotation by 6pi/16. \
+    xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \
+  "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm7\n\t" \
+  "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm6\n\t" \
+  "pmulhw %%xmm2,%%xmm6\n\t" \
+  "pmulhw %%xmm2,%%xmm7\n\t" \
+  "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm5\n\t" \
+  "paddw %%xmm6,%%xmm2\n\t" \
+  "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+  "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm2\n\t" \
+  "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+  /*5-6 rotation by 3pi/16. \
+    xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \
+  "pmulhw %%xmm3,%%xmm5\n\t" \
+  "pmulhw %%xmm3,%%xmm2\n\t" \
+  "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm7\n\t" \
+  "paddw %%xmm3,%%xmm5\n\t" \
+  "paddw %%xmm3,%%xmm2\n\t" \
+  "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
+  /*4-7 rotation by 7pi/16. \
+    xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \
+  "pmulhw %%xmm1,%%xmm3\n\t" \
+  "pmulhw %%xmm1,%%xmm7\n\t" \
+  "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
+  "movdqa %%xmm3,%%xmm6\n\t" \
+  "paddw %%xmm1,%%xmm7\n\t" \
+  /*0-1 butterfly. \
+    xmm4=C4, xmm0=X0, X4=0.*/ \
+  /*Stage 2:*/ \
+  /*4-5 butterfly: xmm3=t[4], xmm5=t[5] \
+    7-6 butterfly: xmm2=t[6], xmm7=t[7]*/ \
+  "psubw %%xmm5,%%xmm3\n\t" \
+  "paddw %%xmm5,%%xmm6\n\t" \
+  "movdqa %%xmm4,%%xmm1\n\t" \
+  "pmulhw %%xmm0,%%xmm4\n\t" \
+  "paddw %%xmm0,%%xmm4\n\t" \
+  "movdqa %%xmm7,%%xmm0\n\t" \
+  "movdqa %%xmm4,%%xmm5\n\t" \
+  "paddw %%xmm2,%%xmm0\n\t" \
+  "psubw %%xmm2,%%xmm7\n\t" \
+  "movdqa %%xmm1,%%xmm2\n\t" \
+  "pmulhw %%xmm6,%%xmm1\n\t" \
+  "pmulhw %%xmm7,%%xmm2\n\t" \
+  "paddw %%xmm6,%%xmm1\n\t" \
+  "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
+  "paddw %%xmm7,%%xmm2\n\t" \
+  "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
+  /*Stage 3: \
+    6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
+    0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
+    1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
+  "paddw %%xmm2,%%xmm1\n\t" \
+  "paddw %%xmm5,%%xmm6\n\t" \
+  "paddw %%xmm4,%%xmm7\n\t" \
+  "paddw %%xmm2,%%xmm2\n\t" \
+  "paddw %%xmm4,%%xmm4\n\t" \
+  "paddw %%xmm5,%%xmm5\n\t" \
+  "psubw %%xmm1,%%xmm2\n\t" \
+  "psubw %%xmm7,%%xmm4\n\t" \
+  "psubw %%xmm6,%%xmm5\n\t" \
+
+static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  OC_ALIGN16(ogg_int16_t buf[16]);
+  /*Accepts a pre-transposed 8x8 matrix; only 4 values of rows 0...3 are read.*/
+  __asm__ __volatile__(
+    "movq "OC_MEM_OFFS(0x20,x)",%%mm2\n\t"
+    "movq "OC_MEM_OFFS(0x30,x)",%%mm3\n\t"
+    "movq "OC_MEM_OFFS(0x10,x)",%%mm1\n\t"
+    "movq "OC_MEM_OFFS(0x00,x)",%%mm0\n\t"
+    OC_IDCT_8x8_10_MMX
+    OC_TRANSPOSE_8x4_MMX2SSE
+    OC_IDCT_8x8_10_ABC
+    OC_IDCT_8x8_D_STORE
+    :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)),
+     [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
+    :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
+     [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64))
+  );
+  /*Clear input data for next block (decoder only).*/
+  __asm__ __volatile__(
+    "pxor %%mm0,%%mm0\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+    :[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28))
+  );
+}
+
+/*Performs an inverse 8x8 Type-II DCT transform of _x into _y.
+  The input is assumed to be scaled by a factor of 4 relative to the
+   orthonormal version of the transform.*/
+void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
+  /*_last_zzi is subtly different from an actual count of the number of
+     coefficients we decoded for this block.
+    It contains the value of zzi BEFORE the final token in the block was
+     decoded.
+    In most cases this is an EOB token (the continuation of an EOB run from a
+     previous block counts), and so this is the same as the coefficient count.
+    However, in the case that the last token was NOT an EOB token, but filled
+     the block up with exactly 64 coefficients, _last_zzi will be less than 64.
+    Provided the last token was not a pure zero run, the minimum value it can
+     be is 46, and so that doesn't affect any of the cases in this routine.
+    However, if the last token WAS a pure zero run of length 63, then _last_zzi
+     will be 1 while the number of coefficients decoded is 64.
+    Thus, we will trigger the following special case, where the real
+     coefficient count would not.
+    Note also that a zero run of length 64 will give _last_zzi a value of 0,
+     but we still process the DC coefficient, which might have a non-zero value
+     due to DC prediction.
+    Although convoluted, this is arguably the correct behavior: it allows us to
+     use a smaller transform when the block ends with a long zero run instead
+     of a normal EOB token.
+    It could be smarter... multiple separate zero runs at the end of a block
+     will fool it, but an encoder that generates these really deserves what it
+     gets.
+    Needless to say we inherited this approach from VP3.*/
+  /*Then perform the iDCT, using the 10-coefficient version when we can.*/
+  if(_last_zzi<=10)oc_idct8x8_10_sse2(_y,_x);
+  else oc_idct8x8_slow_sse2(_y,_x);
+}
+
+#endif
diff --git a/thirdparty/libtheora/x86/sse2trans.h b/thirdparty/libtheora/x86/sse2trans.h
new file mode 100644
index 0000000000..e76da5140b
--- /dev/null
+++ b/thirdparty/libtheora/x86/sse2trans.h
@@ -0,0 +1,242 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_x86_sse2trans_H)
+# define _x86_sse2trans_H (1)
+# include "x86int.h"
+
+# if defined(OC_X86_64_ASM)
+/*On x86-64 we can transpose in-place without spilling registers.
+  By clever choices of the order to apply the butterflies and the order of
+   their outputs, we can take the rows in order and output the columns in order
+   without any extra operations and using just one temporary register.*/
+#  define OC_TRANSPOSE_8x8 \
+ "#OC_TRANSPOSE_8x8\n\t" \
+ "movdqa %%xmm4,%%xmm8\n\t" \
+ /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
+ "punpcklwd %%xmm5,%%xmm4\n\t" \
+ /*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
+ "punpckhwd %%xmm5,%%xmm8\n\t" \
+ /*xmm5 is free.*/ \
+ "movdqa %%xmm0,%%xmm5\n\t" \
+ /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
+ "punpcklwd %%xmm1,%%xmm0\n\t" \
+ /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
+ "punpckhwd %%xmm1,%%xmm5\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm6,%%xmm1\n\t" \
+ /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
+ "punpcklwd %%xmm7,%%xmm6\n\t" \
+ /*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
+ "punpckhwd %%xmm7,%%xmm1\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm2,%%xmm7\n\t" \
+ /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
+ "punpckhwd %%xmm3,%%xmm2\n\t" \
+ /*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
+ "punpcklwd %%xmm3,%%xmm7\n\t" \
+ /*xmm3 is free.*/ \
+ "movdqa %%xmm0,%%xmm3\n\t" \
+ /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
+ "punpckldq %%xmm7,%%xmm0\n\t" \
+ /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
+ "punpckhdq %%xmm7,%%xmm3\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm5,%%xmm7\n\t" \
+ /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
+ "punpckldq %%xmm2,%%xmm5\n\t" \
+ /*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
+ "punpckhdq %%xmm2,%%xmm7\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm4,%%xmm2\n\t" \
+ /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
+ "punpckhdq %%xmm6,%%xmm4\n\t" \
+ /*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
+ "punpckldq %%xmm6,%%xmm2\n\t" \
+ /*xmm6 is free.*/ \
+ "movdqa %%xmm8,%%xmm6\n\t" \
+ /*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
+ "punpckldq %%xmm1,%%xmm6\n\t" \
+ /*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
+ "punpckhdq %%xmm1,%%xmm8\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm0,%%xmm1\n\t" \
+ /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "punpcklqdq %%xmm2,%%xmm0\n\t" \
+ /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
+ "punpckhqdq %%xmm2,%%xmm1\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm3,%%xmm2\n\t" \
+ /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
+ "punpckhqdq %%xmm4,%%xmm3\n\t" \
+ /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
+ "punpcklqdq %%xmm4,%%xmm2\n\t" \
+ /*xmm4 is free.*/ \
+ "movdqa %%xmm5,%%xmm4\n\t" \
+ /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
+ "punpckhqdq %%xmm6,%%xmm5\n\t" \
+ /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
+ "punpcklqdq %%xmm6,%%xmm4\n\t" \
+ /*xmm6 is free.*/ \
+ "movdqa %%xmm7,%%xmm6\n\t" \
+ /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
+ "punpckhqdq %%xmm8,%%xmm7\n\t" \
+ /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
+ "punpcklqdq %%xmm8,%%xmm6\n\t" \
+ /*xmm8 is free.*/ \
+
+# else
+/*Otherwise, we need to spill some values to %[buf] temporarily.
+  Again, the butterflies are carefully arranged to get the columns to come out
+   in order, minimizing register spills and maximizing the delay between a load
+   and when the value loaded is actually used.*/
+#  define OC_TRANSPOSE_8x8 \
+ "#OC_TRANSPOSE_8x8\n\t" \
+ /*buf[0] = a7 a6 a5 a4 a3 a2 a1 a0*/ \
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+ /*xmm0 is free.*/ \
+ "movdqa %%xmm2,%%xmm0\n\t" \
+ /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
+ "punpckhwd %%xmm3,%%xmm2\n\t" \
+ /*xmm0 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
+ "punpcklwd %%xmm3,%%xmm0\n\t" \
+ /*xmm3 = a7 a6 a5 a4 a3 a2 a1 a0*/ \
+ "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm3\n\t" \
+ /*buf[1] = d7 c7 d6 c6 d5 c5 d4 c4*/ \
+ "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm6,%%xmm2\n\t" \
+ /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
+ "punpcklwd %%xmm7,%%xmm6\n\t" \
+ /*xmm2 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
+ "punpckhwd %%xmm7,%%xmm2\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm4,%%xmm7\n\t" \
+ /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
+ "punpcklwd %%xmm5,%%xmm4\n\t" \
+ /*xmm7 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
+ "punpckhwd %%xmm5,%%xmm7\n\t" \
+ /*xmm5 is free.*/ \
+ "movdqa %%xmm3,%%xmm5\n\t" \
+ /*xmm3 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
+ "punpcklwd %%xmm1,%%xmm3\n\t" \
+ /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
+ "punpckhwd %%xmm1,%%xmm5\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm7,%%xmm1\n\t" \
+ /*xmm7 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
+ "punpckldq %%xmm2,%%xmm7\n\t" \
+ /*xmm1 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
+ "punpckhdq %%xmm2,%%xmm1\n\t" \
+ /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
+ "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm2\n\t" \
+ /*buf[0] = h7 g7 f7 e7 h6 g6 f6 e6*/ \
+ "movdqa %%xmm1,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm3,%%xmm1\n\t" \
+ /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
+ "punpckhdq %%xmm0,%%xmm3\n\t" \
+ /*xmm1 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
+ "punpckldq %%xmm0,%%xmm1\n\t" \
+ /*xmm0 is free.*/ \
+ "movdqa %%xmm4,%%xmm0\n\t" \
+ /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
+ "punpckhdq %%xmm6,%%xmm4\n\t" \
+ /*xmm0 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
+ "punpckldq %%xmm6,%%xmm0\n\t" \
+ /*xmm6 is free.*/ \
+ "movdqa %%xmm5,%%xmm6\n\t" \
+ /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
+ "punpckldq %%xmm2,%%xmm5\n\t" \
+ /*xmm6 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
+ "punpckhdq %%xmm2,%%xmm6\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm1,%%xmm2\n\t" \
+ /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
+ "punpckhqdq %%xmm0,%%xmm1\n\t" \
+ /*xmm2 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "punpcklqdq %%xmm0,%%xmm2\n\t" \
+ /*xmm0 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
+ "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \
+ /*buf[1] = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm3,%%xmm2\n\t" \
+ /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
+ "punpckhqdq %%xmm4,%%xmm3\n\t" \
+ /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
+ "punpcklqdq %%xmm4,%%xmm2\n\t" \
+ /*xmm4 is free.*/ \
+ "movdqa %%xmm5,%%xmm4\n\t" \
+ /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
+ "punpckhqdq %%xmm7,%%xmm5\n\t" \
+ /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
+ "punpcklqdq %%xmm7,%%xmm4\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm6,%%xmm7\n\t" \
+ /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
+ "punpcklqdq %%xmm0,%%xmm6\n\t" \
+ /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
+ "punpckhqdq %%xmm0,%%xmm7\n\t" \
+ /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm0\n\t" \
+
+# endif
+
+/*Transpose 4 values in each of 8 MMX registers into 8 values in the first
+   four SSE registers.
+  No need to be clever here; we have plenty of room (xmm4...xmm7 are scratch).*/
+#  define OC_TRANSPOSE_8x4_MMX2SSE \
+ "#OC_TRANSPOSE_8x4_MMX2SSE\n\t" \
+ "movq2dq %%mm0,%%xmm0\n\t" \
+ "movq2dq %%mm1,%%xmm1\n\t" \
+ /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
+ "punpcklwd %%xmm1,%%xmm0\n\t" \
+ "movq2dq %%mm2,%%xmm3\n\t" \
+ "movq2dq %%mm3,%%xmm2\n\t" \
+ /*xmm3 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
+ "punpcklwd %%xmm2,%%xmm3\n\t" \
+ "movq2dq %%mm4,%%xmm4\n\t" \
+ "movq2dq %%mm5,%%xmm5\n\t" \
+ /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
+ "punpcklwd %%xmm5,%%xmm4\n\t" \
+ "movq2dq %%mm6,%%xmm7\n\t" \
+ "movq2dq %%mm7,%%xmm6\n\t" \
+ /*xmm7 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
+ "punpcklwd %%xmm6,%%xmm7\n\t" \
+ "movdqa %%xmm0,%%xmm2\n\t" \
+ /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
+ "punpckldq %%xmm3,%%xmm0\n\t" \
+ /*xmm2 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
+ "punpckhdq %%xmm3,%%xmm2\n\t" \
+ "movdqa %%xmm4,%%xmm5\n\t" \
+ /*xmm4 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
+ "punpckldq %%xmm7,%%xmm4\n\t" \
+ /*xmm5 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
+ "punpckhdq %%xmm7,%%xmm5\n\t" \
+ "movdqa %%xmm0,%%xmm1\n\t" \
+ /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "punpcklqdq %%xmm4,%%xmm0\n\t" \
+ /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
+ "punpckhqdq %%xmm4,%%xmm1\n\t" \
+ "movdqa %%xmm2,%%xmm3\n\t" \
+ /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
+ "punpcklqdq %%xmm5,%%xmm2\n\t" \
+ /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
+ "punpckhqdq %%xmm5,%%xmm3\n\t" \
+
+#endif
diff --git a/thirdparty/libtheora/x86/x86cpu.c b/thirdparty/libtheora/x86/x86cpu.c
new file mode 100644
index 0000000000..49fd76d0ac
--- /dev/null
+++ b/thirdparty/libtheora/x86/x86cpu.c
@@ -0,0 +1,182 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+ CPU capability detection for x86 processors.
+  Originally written by Rudolf Marek.
+
+ function:
+  last mod: $Id$
+
+ ********************************************************************/
+
+#include "x86cpu.h"
+
+#if !defined(OC_X86_ASM)
+ogg_uint32_t oc_cpu_flags_get(void){
+  return 0;
+}
+#else
+# if defined(__amd64__)||defined(__x86_64__)
+/*On x86-64, gcc seems to be able to figure out how to save %rbx for us when
+   compiling with -fPIC.*/
+#  define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+  __asm__ __volatile__( \
+   "cpuid\n\t" \
+   :[eax]"=a"(_eax),[ebx]"=b"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
+   :"a"(_op) \
+   :"cc" \
+  )
+# else
+/*On x86-32, not so much: swap %ebx out and back around cpuid by hand.*/
+#  define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+  __asm__ __volatile__( \
+   "xchgl %%ebx,%[ebx]\n\t" \
+   "cpuid\n\t" \
+   "xchgl %%ebx,%[ebx]\n\t" \
+   :[eax]"=a"(_eax),[ebx]"=r"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
+   :"a"(_op) \
+   :"cc" \
+  )
+# endif
+
+static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
+  ogg_uint32_t flags;
+  /*_edx/_ecx are cpuid leaf 1 outputs; without MMX (edx bit 23), give up.*/
+  if(!(_edx&0x00800000))return 0;
+  flags=OC_CPU_X86_MMX;
+  if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
+  if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2;
+  if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI;
+  if(_ecx&0x00000100)flags|=OC_CPU_X86_SSSE3;
+  if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1;
+  if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2;
+  return flags;
+}
+
+static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
+  ogg_uint32_t flags;
+  /*_edx/_ecx come from cpuid 0x80000001; without MMX (edx bit 23), give up.*/
+  if(!(_edx&0x00800000))return 0;
+  flags=OC_CPU_X86_MMX;
+  if(_edx&0x00400000)flags|=OC_CPU_X86_MMXEXT;
+  if(_edx&0x80000000)flags|=OC_CPU_X86_3DNOW;
+  if(_edx&0x40000000)flags|=OC_CPU_X86_3DNOWEXT;
+  if(_ecx&0x00000040)flags|=OC_CPU_X86_SSE4A;
+  if(_ecx&0x00000800)flags|=OC_CPU_X86_SSE5;
+  return flags;
+}
+
+ogg_uint32_t oc_cpu_flags_get(void){
+  ogg_uint32_t flags;
+  ogg_uint32_t eax;
+  ogg_uint32_t ebx;
+  ogg_uint32_t ecx;
+  ogg_uint32_t edx;
+# if !defined(__amd64__)&&!defined(__x86_64__)
+  /*Not all x86-32 chips support cpuid, so we have to check.*/
+  __asm__ __volatile__(
+   "pushfl\n\t"
+   "pushfl\n\t"
+   "popl %[a]\n\t"
+   "movl %[a],%[b]\n\t"
+   "xorl $0x200000,%[a]\n\t"
+   "pushl %[a]\n\t"
+   "popfl\n\t"
+   "pushfl\n\t"
+   "popl %[a]\n\t"
+   "popfl\n\t"
+   :[a]"=r"(eax),[b]"=r"(ebx)
+   :
+   :"cc"
+  );
+  /*The EFLAGS ID bit (0x200000) would not toggle: no cpuid.*/
+  if(eax==ebx)return 0;
+# endif
+  cpuid(0,eax,ebx,ecx,edx);
+  /*         l e t n          I e n i          u n e G*/
+  if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
+   /*      6 8 x M          T e n i          u n e G*/
+   ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
+    int family;
+    int model;
+    /*Intel, Transmeta (tested with Crusoe TM5800):*/
+    cpuid(1,eax,ebx,ecx,edx);
+    flags=oc_parse_intel_flags(edx,ecx);
+    family=(eax>>8)&0xF;
+    model=(eax>>4)&0xF;
+    /*The SSE unit on the Pentium M and Core Duo is much slower than the MMX
+       unit, so don't use it.*/
+    if(family==6&&(model==9||model==13||model==14)){
+      flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI);
+    }
+  }
+  /*              D M A c          i t n e          h t u A*/
+  else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
+   /*      C S N            y b   e          d o e G*/
+   ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){
+    /*AMD, Geode:*/
+    cpuid(0x80000000,eax,ebx,ecx,edx);
+    if(eax<0x80000001)flags=0;
+    else{
+      cpuid(0x80000001,eax,ebx,ecx,edx);
+      flags=oc_parse_amd_flags(edx,ecx);
+    }
+    /*Also check for SSE.*/
+    cpuid(1,eax,ebx,ecx,edx);
+    flags|=oc_parse_intel_flags(edx,ecx);
+  }
+  /*Technically some VIA chips can be configured in the BIOS to return any
+     string here the user wants.
+    There is a special detection method that can be used to identify such
+     processors, but in my opinion, if the user really wants to change it, they
+     deserve what they get.*/
+  /*              s l u a          H r u a          t n e C*/
+  else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){
+    /*VIA:*/
+    /*I only have documentation for the C7 (Esther) and Isaiah (forthcoming)
+       chips (thanks to the engineers from Centaur Technology who provided it).
+      These chips support Intel-like cpuid info.
+      The C3-2 (Nehemiah) cores appear to, as well.*/
+    cpuid(1,eax,ebx,ecx,edx);
+    flags=oc_parse_intel_flags(edx,ecx);
+    /*eax now holds the leaf 1 signature; fetch the max extended leaf instead.*/
+    cpuid(0x80000000,eax,ebx,ecx,edx);
+    if(eax>=0x80000001){
+      /*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
+        We need to check this even if the Intel test succeeds to pick up 3DNow!
+         support on these processors.
+        Unlike actual AMD processors, we cannot _rely_ on this info, since
+         some cores (e.g., the 693 stepping of the Nehemiah) claim to support
+         this function, yet return edx=0, despite the Intel test indicating
+         MMX support.
+        Therefore the features detected here are strictly added to those
+         detected by the Intel test.*/
+      /*TODO: How about earlier chips?*/
+      cpuid(0x80000001,eax,ebx,ecx,edx);
+      /*Note: As of the C7, this function returns Intel-style extended feature
+         flags, not AMD-style.
+        Currently, this only defines bits 11, 20, and 29 (0x20100800), which
+         do not conflict with any of the AMD flags we inspect.
+        For the remaining bits, Intel tells us, "Do not count on their value",
+         but VIA assures us that they will all be zero (at least on the C7 and
+         Isaiah chips).
+        In the (unlikely) event a future processor uses bits 18, 19, 30, or 31
+         (0xC0C00000) for something else, we will have to add code to detect
+         the model to decide when it is appropriate to inspect them.*/
+      flags|=oc_parse_amd_flags(edx,ecx);
+    }
+  }
+  /*Unrecognized vendor string: report no capabilities (implement me).*/
+  else flags=0;
+  return flags;
+}
+#endif
diff --git a/thirdparty/libtheora/cpu.h b/thirdparty/libtheora/x86/x86cpu.h
index a43c957a39..e0192d52e2 100644
--- a/thirdparty/libtheora/cpu.h
+++ b/thirdparty/libtheora/x86/x86cpu.h
@@ -10,13 +10,13 @@
  *                                                                  *
  ********************************************************************
  function:
-    last mod: $Id: cpu.h 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
-#if !defined(_x86_cpu_H)
-# define _x86_cpu_H (1)
-#include "internal.h"
+#if !defined(_x86_x86cpu_H)
+# define _x86_x86cpu_H (1)
+#include "../internal.h"
 
 #define OC_CPU_X86_MMX      (1<<0)
 #define OC_CPU_X86_3DNOW    (1<<1)
@@ -31,4 +31,6 @@
 #define OC_CPU_X86_SSE4A    (1<<10)
 #define OC_CPU_X86_SSE5     (1<<11)
 
+ogg_uint32_t oc_cpu_flags_get(void);
+
 #endif
diff --git a/thirdparty/libtheora/x86/x86enc.c b/thirdparty/libtheora/x86/x86enc.c
index 43b7be3ea3..ffa9c14a42 100644
--- a/thirdparty/libtheora/x86/x86enc.c
+++ b/thirdparty/libtheora/x86/x86enc.c
@@ -18,32 +18,46 @@
 
 #if defined(OC_X86_ASM)
 
-#include "../cpu.c"
-
-void oc_enc_vtable_init_x86(oc_enc_ctx *_enc){
+void oc_enc_accel_init_x86(oc_enc_ctx *_enc){
   ogg_uint32_t cpu_flags;
-  cpu_flags=oc_cpu_flags_get();
-  oc_enc_vtable_init_c(_enc);
+  cpu_flags=_enc->state.cpu_flags;
+  oc_enc_accel_init_c(_enc);
+# if defined(OC_ENC_USE_VTABLE)
   if(cpu_flags&OC_CPU_X86_MMX){
     _enc->opt_vtable.frag_sub=oc_enc_frag_sub_mmx;
     _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx;
     _enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
     _enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
-    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmx;
   }
   if(cpu_flags&OC_CPU_X86_MMXEXT){
     _enc->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext;
     _enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_mmxext;
     _enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_mmxext;
-    _enc->opt_vtable.frag_satd_thresh=oc_enc_frag_satd_thresh_mmxext;
-    _enc->opt_vtable.frag_satd2_thresh=oc_enc_frag_satd2_thresh_mmxext;
+    _enc->opt_vtable.frag_satd=oc_enc_frag_satd_mmxext;
+    _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_mmxext;
     _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_mmxext;
     _enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext;
+    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmxext;
   }
   if(cpu_flags&OC_CPU_X86_SSE2){
-# if defined(OC_X86_64_ASM)
-    /*_enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2;*/
+#  if defined(OC_X86_64_ASM)
+    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2;
+#  endif
+    _enc->opt_vtable.frag_ssd=oc_enc_frag_ssd_sse2;
+    _enc->opt_vtable.frag_border_ssd=oc_enc_frag_border_ssd_sse2;
+    _enc->opt_vtable.frag_satd=oc_enc_frag_satd_sse2;
+    _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_sse2;
+    _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_sse2;
+    _enc->opt_vtable.enquant_table_init=oc_enc_enquant_table_init_x86;
+    _enc->opt_vtable.enquant_table_fixup=oc_enc_enquant_table_fixup_x86;
+    _enc->opt_vtable.quantize=oc_enc_quantize_sse2;
+# else
+    (void) cpu_flags;
 # endif
+    _enc->opt_data.enquant_table_size=128*sizeof(ogg_uint16_t);
+    _enc->opt_data.enquant_table_alignment=16;
+# if defined(OC_ENC_USE_VTABLE)
   }
+# endif
 }
 #endif
diff --git a/thirdparty/libtheora/x86/x86enc.h b/thirdparty/libtheora/x86/x86enc.h
index 06c3908bcd..c258247d67 100644
--- a/thirdparty/libtheora/x86/x86enc.h
+++ b/thirdparty/libtheora/x86/x86enc.h
@@ -17,11 +17,62 @@
 
 #if !defined(_x86_x86enc_H)
 # define _x86_x86enc_H (1)
-# include "../encint.h"
 # include "x86int.h"
 
-void oc_enc_vtable_init_x86(oc_enc_ctx *_enc);
+# if defined(OC_X86_ASM)
+#  define oc_enc_accel_init oc_enc_accel_init_x86
+#  if defined(OC_X86_64_ASM)
+/*x86-64 guarantees SIMD support up through at least SSE2.
+  If the best routine we have available only needs SSE2 (which at the moment
+   covers all of them), then we can avoid runtime detection and the indirect
+   call.*/
+#   define oc_enc_frag_sub(_enc,_diff,_x,_y,_stride) \
+  oc_enc_frag_sub_mmx(_diff,_x,_y,_stride)
+#   define oc_enc_frag_sub_128(_enc,_diff,_x,_stride) \
+  oc_enc_frag_sub_128_mmx(_diff,_x,_stride)
+#   define oc_enc_frag_sad(_enc,_src,_ref,_ystride) \
+  oc_enc_frag_sad_mmxext(_src,_ref,_ystride)
+#   define oc_enc_frag_sad_thresh(_enc,_src,_ref,_ystride,_thresh) \
+  oc_enc_frag_sad_thresh_mmxext(_src,_ref,_ystride,_thresh)
+#   define oc_enc_frag_sad2_thresh(_enc,_src,_ref1,_ref2,_ystride,_thresh) \
+  oc_enc_frag_sad2_thresh_mmxext(_src,_ref1,_ref2,_ystride,_thresh)
+#   define oc_enc_frag_satd(_enc,_dc,_src,_ref,_ystride) \
+  oc_enc_frag_satd_sse2(_dc,_src,_ref,_ystride)
+#   define oc_enc_frag_satd2(_enc,_dc,_src,_ref1,_ref2,_ystride) \
+  oc_enc_frag_satd2_sse2(_dc,_src,_ref1,_ref2,_ystride)
+#   define oc_enc_frag_intra_satd(_enc,_dc,_src,_ystride) \
+  oc_enc_frag_intra_satd_sse2(_dc,_src,_ystride)
+#   define oc_enc_frag_ssd(_enc,_src,_ref,_ystride) \
+  oc_enc_frag_ssd_sse2(_src,_ref,_ystride)
+#   define oc_enc_frag_border_ssd(_enc,_src,_ref,_ystride,_mask) \
+  oc_enc_frag_border_ssd_sse2(_src,_ref,_ystride,_mask)
+#   define oc_enc_frag_copy2(_enc,_dst,_src1,_src2,_ystride) \
+  oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride)
+#   define oc_enc_enquant_table_init(_enc,_enquant,_dequant) \
+  oc_enc_enquant_table_init_x86(_enquant,_dequant)
+#   define oc_enc_enquant_table_fixup(_enc,_enquant,_nqis) \
+  oc_enc_enquant_table_fixup_x86(_enquant,_nqis)
+#  define oc_enc_quantize(_enc,_qdct,_dct,_dequant,_enquant) \
+  oc_enc_quantize_sse2(_qdct,_dct,_dequant,_enquant)
+#   define oc_enc_frag_recon_intra(_enc,_dst,_ystride,_residue) \
+  oc_frag_recon_intra_mmx(_dst,_ystride,_residue)
+#   define oc_enc_frag_recon_inter(_enc,_dst,_src,_ystride,_residue) \
+  oc_frag_recon_inter_mmx(_dst,_src,_ystride,_residue)
+#   define oc_enc_fdct8x8(_enc,_y,_x) \
+  oc_enc_fdct8x8_x86_64sse2(_y,_x)
+#  else
+#   define OC_ENC_USE_VTABLE (1)
+#  endif
+# endif
+
+# include "../encint.h"
 
+void oc_enc_accel_init_x86(oc_enc_ctx *_enc);
+
+void oc_enc_frag_sub_mmx(ogg_int16_t _diff[64],
+ const unsigned char *_x,const unsigned char *_y,int _stride);
+void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64],
+ const unsigned char *_x,int _stride);
 unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
  const unsigned char *_ref,int _ystride);
 unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
@@ -29,19 +80,35 @@ unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
 unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
  const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
  unsigned _thresh);
-unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src,
- const unsigned char *_ref,int _ystride,unsigned _thresh);
-unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src,
- const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
- unsigned _thresh);
-unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,int _ystride);
-void oc_enc_frag_sub_mmx(ogg_int16_t _diff[64],
- const unsigned char *_x,const unsigned char *_y,int _stride);
-void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64],
- const unsigned char *_x,int _stride);
+unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
+unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
+unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,
+ const unsigned char *_src,int _ystride);
+unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
+ const unsigned char *_src,int _ystride);
+unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,ogg_int64_t _mask);
+void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src1,const unsigned char *_src2,int _src_ystride);
 void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
  const unsigned char *_src1,const unsigned char *_src2,int _ystride);
-void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+void oc_enc_enquant_table_init_x86(void *_enquant,
+ const ogg_uint16_t _dequant[64]);
+void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis);
+int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+ const ogg_uint16_t _dequant[64],const void *_enquant);
+void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+
+# if defined(OC_X86_64_ASM)
 void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+# endif
 
 #endif
diff --git a/thirdparty/libtheora/x86/x86enquant.c b/thirdparty/libtheora/x86/x86enquant.c
new file mode 100644
index 0000000000..39477ecc21
--- /dev/null
+++ b/thirdparty/libtheora/x86/x86enquant.c
@@ -0,0 +1,149 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: mmxstate.c 17247 2010-05-28 05:35:32Z tterribe $
+
+ ********************************************************************/
+
+#include "x86enc.h"
+
+#if defined(OC_X86_ASM)
+
+
+
+/*The generic enquant table layout does not suit SIMD code, for two reasons.
+  First, the m and l values must be stored separately, so that a whole row of
+   m's or a whole row of l's can be fetched with a single load.
+  Second, x86 SIMD lacks an element-wise arithmetic right-shift, so one has to
+   be emulated with a multiply.
+  The shift count is therefore converted into a scale factor here.*/
+void oc_enc_enquant_table_init_x86(void *_enquant,
+ const ogg_uint16_t _dequant[64]){
+  ogg_int16_t *mults;
+  ogg_int16_t *scales;
+  int          zzi;
+  mults=(ogg_int16_t *)_enquant;
+  scales=mults+64;
+  for(zzi=0;zzi<64;zzi++){
+    oc_iquant iq;
+    oc_iquant_init(&iq,_dequant[zzi]);
+    mults[zzi]=iq.m;
+    /*iq.l has to be 2 or more for this to work; luckily, with all the scale
+       factors baked in, the smallest quantizer is far larger than that.*/
+    scales[zzi]=1<<(16-iq.l);
+  }
+}
+
+/*Copies elements 0 and 64 (m[0] and l[0] in the table layout used above) of
+   each qii=0 table into the tables of all other quantizer indices in use.*/
+void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis){
+  int pli,qii,qti;
+  for(pli=0;pli<3;pli++)for(qii=1;qii<_nqis;qii++)for(qti=0;qti<2;qti++){
+    const ogg_int16_t *src=(const ogg_int16_t *)_enquant[pli][0][qti];
+    ogg_int16_t       *dst=(ogg_int16_t *)_enquant[pli][qii][qti];
+    dst[0]=src[0];
+    dst[64]=src[64];
+  }
+}
+
+int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+ const ogg_uint16_t _dequant[64],const void *_enquant){
+  ptrdiff_t r;
+  __asm__ __volatile__(
+    "xor %[r],%[r]\n\t"
+    /*Loop through two rows (16 coefficients, 32 bytes) at a time.*/
+    ".p2align 4\n\t"
+    "0:\n\t"
+    /*Load the next two rows of the data and the quant matrices.*/
+    "movdqa 0x00(%[dct],%[r]),%%xmm0\n\t"
+    "movdqa 0x10(%[dct],%[r]),%%xmm1\n\t"
+    "movdqa 0x00(%[dq],%[r]),%%xmm2\n\t"
+    "movdqa 0x10(%[dq],%[r]),%%xmm3\n\t"
+    "movdqa 0x00(%[q],%[r]),%%xmm4\n\t"
+    "movdqa 0x10(%[q],%[r]),%%xmm5\n\t"
+    /*Double the input and propagate its sign to the rounding factor.
+      Using SSSE3's psignw would help here, but we need the mask later anyway.*/
+    "movdqa %%xmm0,%%xmm6\n\t"
+    "psraw $15,%%xmm0\n\t"
+    "movdqa %%xmm1,%%xmm7\n\t"
+    "paddw %%xmm6,%%xmm6\n\t"
+    "psraw $15,%%xmm1\n\t"
+    "paddw %%xmm7,%%xmm7\n\t"
+    "paddw %%xmm0,%%xmm2\n\t"
+    "paddw %%xmm1,%%xmm3\n\t"
+    "pxor %%xmm0,%%xmm2\n\t"
+    "pxor %%xmm1,%%xmm3\n\t"
+    /*Add the rounding factor and perform the first multiply.*/
+    "paddw %%xmm2,%%xmm6\n\t"
+    "paddw %%xmm3,%%xmm7\n\t"
+    "pmulhw %%xmm6,%%xmm4\n\t"
+    "pmulhw %%xmm7,%%xmm5\n\t"
+    "movdqa 0x80(%[q],%[r]),%%xmm2\n\t"
+    "movdqa 0x90(%[q],%[r]),%%xmm3\n\t"
+    "paddw %%xmm4,%%xmm6\n\t"
+    "paddw %%xmm5,%%xmm7\n\t"
+    /*Emulate an element-wise right-shift via a second multiply.*/
+    "pmulhw %%xmm2,%%xmm6\n\t"
+    "pmulhw %%xmm3,%%xmm7\n\t"
+    "add $32,%[r]\n\t"
+    "cmp $96,%[r]\n\t"
+    /*Correct for the sign.*/
+    "psubw %%xmm0,%%xmm6\n\t"
+    "psubw %%xmm1,%%xmm7\n\t"
+    /*Save the result.*/
+    "movdqa %%xmm6,-0x20(%[qdct],%[r])\n\t"
+    "movdqa %%xmm7,-0x10(%[qdct],%[r])\n\t"
+    "jle 0b\n\t"
+    /*Find the last non-zero value; xmm6/xmm7 still hold the last two rows.*/
+    "movdqa 0x50(%[qdct]),%%xmm5\n\t"
+    "movdqa 0x40(%[qdct]),%%xmm4\n\t"
+    "packsswb %%xmm7,%%xmm6\n\t"
+    "packsswb %%xmm5,%%xmm4\n\t"
+    "pxor %%xmm0,%%xmm0\n\t"
+    "mov $-1,%k[dq]\n\t"
+    "pcmpeqb %%xmm0,%%xmm6\n\t"
+    "pcmpeqb %%xmm0,%%xmm4\n\t"
+    "pmovmskb %%xmm6,%k[q]\n\t"
+    "pmovmskb %%xmm4,%k[r]\n\t"
+    "shl $16,%k[q]\n\t"
+    "or %k[r],%k[q]\n\t"
+    "mov $32,%[r]\n\t"
+    /*We have to use xor here instead of not in order to set the flags.*/
+    "xor %k[dq],%k[q]\n\t"
+    "jnz 1f\n\t"
+    "movdqa 0x30(%[qdct]),%%xmm7\n\t"
+    "movdqa 0x20(%[qdct]),%%xmm6\n\t"
+    "movdqa 0x10(%[qdct]),%%xmm5\n\t"
+    "movdqa 0x00(%[qdct]),%%xmm4\n\t"
+    "packsswb %%xmm7,%%xmm6\n\t"
+    "packsswb %%xmm5,%%xmm4\n\t"
+    "pcmpeqb %%xmm0,%%xmm6\n\t"
+    "pcmpeqb %%xmm0,%%xmm4\n\t"
+    "pmovmskb %%xmm6,%k[q]\n\t"
+    "pmovmskb %%xmm4,%k[r]\n\t"
+    "shl $16,%k[q]\n\t"
+    "or %k[r],%k[q]\n\t"
+    "xor %[r],%[r]\n\t"
+    "not %k[q]\n\t"
+    "or $1,%k[q]\n\t"
+    "1:\n\t"
+    "bsr %k[q],%k[q]\n\t"
+    "add %k[q],%k[r]\n\t"
+    :[r]"=&a"(r),[q]"+r"(_enquant),[dq]"+r"(_dequant)
+    :[dct]"r"(_dct),[qdct]"r"(_qdct)
+    :"cc","memory"
+  );
+  return (int)r;
+}
+
+#endif
diff --git a/thirdparty/libtheora/x86/x86int.h b/thirdparty/libtheora/x86/x86int.h
index ede724f5aa..ceb2dbb0ec 100644
--- a/thirdparty/libtheora/x86/x86int.h
+++ b/thirdparty/libtheora/x86/x86int.h
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: x86int.h 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
@@ -19,24 +19,104 @@
 # define _x86_x86int_H (1)
 # include "../internal.h"
 
-void oc_state_vtable_init_x86(oc_theora_state *_state);
+# if defined(OC_X86_ASM)
+#  define oc_state_accel_init oc_state_accel_init_x86
+#  if defined(OC_X86_64_ASM)
+/*x86-64 guarantees SIMD support up through at least SSE2.
+  If the best routine we have available only needs SSE2 (which at the moment
+   covers all of them), then we can avoid runtime detection and the indirect
+   call.*/
+#   define oc_frag_copy(_state,_dst,_src,_ystride) \
+  oc_frag_copy_mmx(_dst,_src,_ystride)
+#   define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs) \
+  oc_frag_copy_list_mmx(_dst_frame,_src_frame,_ystride, \
+   _fragis,_nfragis,_frag_buf_offs)
+#   define oc_frag_recon_intra(_state,_dst,_ystride,_residue) \
+  oc_frag_recon_intra_mmx(_dst,_ystride,_residue)
+#   define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
+  oc_frag_recon_inter_mmx(_dst,_src,_ystride,_residue)
+#   define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
+  oc_frag_recon_inter2_mmx(_dst,_src1,_src2,_ystride,_residue)
+#   define oc_idct8x8(_state,_y,_x,_last_zzi) \
+  oc_idct8x8_sse2(_y,_x,_last_zzi)
+#   define oc_state_frag_recon oc_state_frag_recon_mmx
+#   define oc_loop_filter_init(_state,_bv,_flimit) \
+  oc_loop_filter_init_mmxext(_bv,_flimit)
+#   define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_mmxext
+#   define oc_restore_fpu(_state) \
+  oc_restore_fpu_mmx()
+#  else
+#   define OC_STATE_USE_VTABLE (1)
+#  endif
+# endif
+
+# include "../state.h"
+# include "x86cpu.h"
+
+/*Converts the expression in the argument to a string.*/
+#define OC_M2STR(_s) #_s
+
+/*Memory operands do not always include an offset.
+  To avoid warnings, we force an offset with %H (which adds 8).*/
+# if __GNUC_PREREQ(4,0)
+#  define OC_MEM_OFFS(_offs,_name) \
+  OC_M2STR(_offs-8+%H[_name])
+# endif
+/*If your gcc version doesn't support %H, then you get to suffer the warnings.
+  Note that Apple's gas breaks on things like _offs+(%esp): it throws away the
+   whole offset, instead of substituting in 0 for the missing operand to +.*/
+# if !defined(OC_MEM_OFFS)
+#  define OC_MEM_OFFS(_offs,_name) \
+  OC_M2STR(_offs+%[_name])
+# endif
+
+/*Declare an array operand with an exact size.
+  This tells gcc we're going to clobber this memory region, without having to
+   clobber all of "memory" and lets us access local buffers directly using the
+   stack pointer, without allocating a separate register to point to them.*/
+#define OC_ARRAY_OPERAND(_type,_ptr,_size) \
+  (*({ \
+    struct{_type array_value__[(_size)];} *array_addr__=(void *)(_ptr); \
+    array_addr__; \
+  }))
+
+/*Declare a const (read-only) array operand with an exact size.
+  This tells gcc we're going to read this memory region, without having to
+   clobber all of "memory" and lets us access local buffers directly using the
+   stack pointer, without allocating a separate register to point to them.*/
+#define OC_CONST_ARRAY_OPERAND(_type,_ptr,_size) \
+  (*({ \
+    const struct{_type array_value__[(_size)];} *array_addr__= \
+     (const void *)(_ptr); \
+    array_addr__; \
+  }))
+
+extern const unsigned short __attribute__((aligned(16))) OC_IDCT_CONSTS[64];
+
+void oc_state_accel_init_x86(oc_theora_state *_state);
 
 void oc_frag_copy_mmx(unsigned char *_dst,
  const unsigned char *_src,int _ystride);
+void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
 void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
  const ogg_int16_t *_residue);
 void oc_frag_recon_inter_mmx(unsigned char *_dst,
  const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
 void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
  const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
-void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi);
+void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
-void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli);
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit);
+void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit);
 void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
 void oc_restore_fpu_mmx(void);
 
 #endif
diff --git a/thirdparty/libtheora/x86/x86state.c b/thirdparty/libtheora/x86/x86state.c
index a786bec284..9f8bceb534 100644
--- a/thirdparty/libtheora/x86/x86state.c
+++ b/thirdparty/libtheora/x86/x86state.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: x86state.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
@@ -19,8 +19,7 @@
 
 #if defined(OC_X86_ASM)
 
-#include "../cpu.c"
-
+#if defined(OC_STATE_USE_VTABLE)
 /*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
    each quadrant of the destination.*/
 static const unsigned char OC_FZIG_ZAG_MMX[128]={
@@ -39,24 +38,60 @@ static const unsigned char OC_FZIG_ZAG_MMX[128]={
   64,64,64,64,64,64,64,64,
   64,64,64,64,64,64,64,64,
   64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64
+};
+#endif
+
+/*This table has been modified from OC_FZIG_ZAG by baking an 8x8 transpose into
+   the destination.*/
+static const unsigned char OC_FZIG_ZAG_SSE2[128]={
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3, 4,11,18,25,32,40,
+  33,26,19,12, 5, 6,13,20,
+  27,34,41,48,56,49,42,35,
+  28,21,14, 7,15,22,29,36,
+  43,50,57,58,51,44,37,30,
+  23,31,38,45,52,59,60,53,
+  46,39,47,54,61,62,55,63,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
   64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64
 };
 
-void oc_state_vtable_init_x86(oc_theora_state *_state){
+void oc_state_accel_init_x86(oc_theora_state *_state){
+  oc_state_accel_init_c(_state);
   _state->cpu_flags=oc_cpu_flags_get();
+# if defined(OC_STATE_USE_VTABLE)
   if(_state->cpu_flags&OC_CPU_X86_MMX){
     _state->opt_vtable.frag_copy=oc_frag_copy_mmx;
+    _state->opt_vtable.frag_copy_list=oc_frag_copy_list_mmx;
     _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
     _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
     _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
     _state->opt_vtable.idct8x8=oc_idct8x8_mmx;
     _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
-    _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_mmx;
+    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmx;
     _state->opt_vtable.state_loop_filter_frag_rows=
      oc_state_loop_filter_frag_rows_mmx;
     _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
     _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
   }
-  else oc_state_vtable_init_c(_state);
+  if(_state->cpu_flags&OC_CPU_X86_MMXEXT){
+    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmxext;
+    _state->opt_vtable.state_loop_filter_frag_rows=
+     oc_state_loop_filter_frag_rows_mmxext;
+  }
+  if(_state->cpu_flags&OC_CPU_X86_SSE2){
+    _state->opt_vtable.idct8x8=oc_idct8x8_sse2;
+# endif
+    _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_SSE2;
+# if defined(OC_STATE_USE_VTABLE)
+  }
+# endif
 }
 #endif
diff --git a/thirdparty/libtheora/x86/x86zigzag.h b/thirdparty/libtheora/x86/x86zigzag.h
new file mode 100644
index 0000000000..fb21e0bb43
--- /dev/null
+++ b/thirdparty/libtheora/x86/x86zigzag.h
@@ -0,0 +1,244 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_x86_x86zigzag_H)
+# define _x86_x86zigzag_H (1)
+# include "x86enc.h"
+
+
+/*Converts DCT coefficients from transposed order into zig-zag scan order and
+   stores them in %[y].
+  This relies on two macros to load the contents of each row:
+   OC_ZZ_LOAD_ROW_LO(row,"reg") and OC_ZZ_LOAD_ROW_HI(row,"reg"), which load
+   the first four and second four entries of each row into the specified
+   register, respectively.
+  OC_ZZ_LOAD_ROW_LO must be called before OC_ZZ_LOAD_ROW_HI for the same row
+   (because when the rows are already in SSE2 registers, loading the high half
+   destructively modifies the register).
+  The index of each output element in the original 64-element array should wind
+   up in the following 8x8 matrix (the letters indicate the order we compute
+   each 4-tuple below):
+    A  0  8  1  2   9 16 24 17 B
+    C 10  3  4 11  18 25 32 40 E
+    F 33 26 19 12   5  6 13 20 D
+    G 27 34 41 48  56 49 42 35 I
+    L 28 21 14  7  15 22 29 36 M
+    H 43 50 57 58  51 44 37 30 O
+    N 23 31 38 45  52 59 60 53 J
+    P 46 39 47 54  61 62 55 63 K
+  The order of the coefficients within each tuple is reversed in the comments
+   below to reflect the usual MSB to LSB notation.*/
+#define OC_TRANSPOSE_ZIG_ZAG_MMXEXT \
+  OC_ZZ_LOAD_ROW_LO(0,"%%mm0")   /*mm0=03 02 01 00*/ \
+  OC_ZZ_LOAD_ROW_LO(1,"%%mm1")   /*mm1=11 10 09 08*/ \
+  OC_ZZ_LOAD_ROW_LO(2,"%%mm2")   /*mm2=19 18 17 16*/ \
+  OC_ZZ_LOAD_ROW_LO(3,"%%mm3")   /*mm3=27 26 25 24*/ \
+  OC_ZZ_LOAD_ROW_HI(0,"%%mm4")   /*mm4=07 06 05 04*/ \
+  OC_ZZ_LOAD_ROW_HI(1,"%%mm5")   /*mm5=15 14 13 12*/ \
+  OC_ZZ_LOAD_ROW_HI(2,"%%mm6")   /*mm6=23 22 21 20*/ \
+  "movq %%mm0,%%mm7\n\t"         /*mm7=03 02 01 00*/ \
+  "punpckhdq %%mm1,%%mm0\n\t"    /*mm0=11 10 03 02*/ \
+  "pshufw $0x39,%%mm4,%%mm4\n\t" /*mm4=04 07 06 05*/ \
+  "punpcklwd %%mm0,%%mm1\n\t"    /*mm1=03 09 02 08*/ \
+  "pshufw $0x39,%%mm5,%%mm5\n\t" /*mm5=12 15 14 13*/ \
+  "punpcklwd %%mm1,%%mm7\n\t"    /*mm7=02 01 08 00 *A*/ \
+  "movq %%mm7,0x00(%[y])\n\t" \
+  "punpckhwd %%mm4,%%mm1\n\t"    /*mm1=04 03 07 09*/ \
+  "movq %%mm2,%%mm7\n\t"         /*mm7=19 18 17 16*/ \
+  "punpckhdq %%mm1,%%mm0\n\t"    /*mm0=04 03 11 10*/ \
+  "punpckhwd %%mm5,%%mm7\n\t"    /*mm7=12 19 15 18*/ \
+  "punpcklwd %%mm3,%%mm1\n\t"    /*mm1=25 07 24 09*/ \
+  "punpcklwd %%mm6,%%mm5\n\t"    /*mm5=21 14 20 13*/ \
+  "punpcklwd %%mm2,%%mm1\n\t"    /*mm1=17 24 16 09 *B*/ \
+  OC_ZZ_LOAD_ROW_LO(4,"%%mm2")   /*mm2=35 34 33 32*/ \
+  "movq %%mm1,0x08(%[y])\n\t" \
+  OC_ZZ_LOAD_ROW_LO(5,"%%mm1")   /*mm1=43 42 41 40*/ \
+  "pshufw $0x78,%%mm0,%%mm0\n\t" /*mm0=11 04 03 10 *C*/ \
+  "movq %%mm0,0x10(%[y])\n\t" \
+  "punpckhdq %%mm4,%%mm6\n\t"    /*mm6=?? 07 23 22*/ \
+  "punpckldq %%mm5,%%mm4\n\t"    /*mm4=20 13 06 05 *D*/ \
+  "movq %%mm4,0x28(%[y])\n\t" \
+  "psrlq $16,%%mm3\n\t"          /*mm3=.. 27 26 25*/ \
+  "pshufw $0x0E,%%mm2,%%mm0\n\t" /*mm0=?? ?? 35 34*/ \
+  "movq %%mm7,%%mm4\n\t"         /*mm4=12 19 15 18*/ \
+  "punpcklwd %%mm3,%%mm2\n\t"    /*mm2=26 33 25 32*/ \
+  "punpcklwd %%mm1,%%mm4\n\t"    /*mm4=41 15 40 18*/ \
+  "punpckhwd %%mm1,%%mm3\n\t"    /*mm3=43 .. 42 27*/ \
+  "punpckldq %%mm2,%%mm4\n\t"    /*mm4=25 32 40 18*/ \
+  "punpcklwd %%mm0,%%mm3\n\t"    /*mm3=35 42 34 27*/ \
+  OC_ZZ_LOAD_ROW_LO(6,"%%mm0")   /*mm0=51 50 49 48*/ \
+  "pshufw $0x6C,%%mm4,%%mm4\n\t" /*mm4=40 32 25 18 *E*/ \
+  "movq %%mm4,0x18(%[y])\n\t" \
+  OC_ZZ_LOAD_ROW_LO(7,"%%mm4")   /*mm4=59 58 57 56*/ \
+  "punpckhdq %%mm7,%%mm2\n\t"    /*mm2=12 19 26 33 *F*/ \
+  "movq %%mm2,0x20(%[y])\n\t" \
+  "pshufw $0xD0,%%mm1,%%mm1\n\t" /*mm1=43 41 ?? ??*/ \
+  "pshufw $0x87,%%mm0,%%mm0\n\t" /*mm0=50 48 49 51*/ \
+  "movq %%mm3,%%mm2\n\t"         /*mm2=35 42 34 27*/ \
+  "punpckhwd %%mm0,%%mm1\n\t"    /*mm1=50 43 48 41*/ \
+  "pshufw $0x93,%%mm4,%%mm4\n\t" /*mm4=58 57 56 59*/ \
+  "punpckldq %%mm1,%%mm3\n\t"    /*mm3=48 41 34 27 *G*/ \
+  "movq %%mm3,0x30(%[y])\n\t" \
+  "punpckhdq %%mm4,%%mm1\n\t"    /*mm1=58 57 50 43 *H*/ \
+  "movq %%mm1,0x50(%[y])\n\t" \
+  OC_ZZ_LOAD_ROW_HI(7,"%%mm1")   /*mm1=63 62 61 60*/ \
+  "punpcklwd %%mm0,%%mm4\n\t"    /*mm4=49 56 51 59*/ \
+  OC_ZZ_LOAD_ROW_HI(6,"%%mm0")   /*mm0=55 54 53 52*/ \
+  "psllq $16,%%mm6\n\t"          /*mm6=07 23 22 ..*/ \
+  "movq %%mm4,%%mm3\n\t"         /*mm3=49 56 51 59*/ \
+  "punpckhdq %%mm2,%%mm4\n\t"    /*mm4=35 42 49 56 *I*/ \
+  OC_ZZ_LOAD_ROW_HI(3,"%%mm2")   /*mm2=31 30 29 28*/ \
+  "movq %%mm4,0x38(%[y])\n\t" \
+  "punpcklwd %%mm1,%%mm3\n\t"    /*mm3=61 51 60 59*/ \
+  "punpcklwd %%mm6,%%mm7\n\t"    /*mm7=22 15 .. ??*/ \
+  "movq %%mm3,%%mm4\n\t"         /*mm4=61 51 60 59*/ \
+  "punpcklwd %%mm0,%%mm3\n\t"    /*mm3=53 60 52 59*/ \
+  "punpckhwd %%mm0,%%mm4\n\t"    /*mm4=55 61 54 51*/ \
+  OC_ZZ_LOAD_ROW_HI(4,"%%mm0")   /*mm0=39 38 37 36*/ \
+  "pshufw $0xE1,%%mm3,%%mm3\n\t" /*mm3=53 60 59 52 *J*/ \
+  "movq %%mm3,0x68(%[y])\n\t" \
+  "movq %%mm4,%%mm3\n\t"         /*mm3=?? ?? 54 51*/ \
+  "pshufw $0x39,%%mm2,%%mm2\n\t" /*mm2=28 31 30 29*/ \
+  "punpckhwd %%mm1,%%mm4\n\t"    /*mm4=63 55 62 61 *K*/ \
+  OC_ZZ_LOAD_ROW_HI(5,"%%mm1")   /*mm1=47 46 45 44*/ \
+  "movq %%mm4,0x78(%[y])\n\t" \
+  "punpckhwd %%mm2,%%mm6\n\t"    /*mm6=28 07 31 23*/ \
+  "punpcklwd %%mm0,%%mm2\n\t"    /*mm2=37 30 36 29*/ \
+  "punpckhdq %%mm6,%%mm5\n\t"    /*mm5=28 07 21 14*/ \
+  "pshufw $0x4B,%%mm2,%%mm2\n\t" /*mm2=36 29 30 37*/ \
+  "pshufw $0x87,%%mm5,%%mm5\n\t" /*mm5=07 14 21 28 *L*/ \
+  "movq %%mm5,0x40(%[y])\n\t" \
+  "punpckhdq %%mm2,%%mm7\n\t"    /*mm7=36 29 22 15 *M*/ \
+  "movq %%mm7,0x48(%[y])\n\t" \
+  "pshufw $0x9C,%%mm1,%%mm1\n\t" /*mm1=46 45 47 44*/ \
+  "punpckhwd %%mm1,%%mm0\n\t"    /*mm0=46 39 45 38*/ \
+  "punpcklwd %%mm1,%%mm3\n\t"    /*mm3=47 54 44 51*/ \
+  "punpckldq %%mm0,%%mm6\n\t"    /*mm6=45 38 31 23 *N*/ \
+  "movq %%mm6,0x60(%[y])\n\t" \
+  "punpckhdq %%mm3,%%mm0\n\t"    /*mm0=47 54 46 39*/ \
+  "punpckldq %%mm2,%%mm3\n\t"    /*mm3=30 37 44 51 *O*/ \
+  "movq %%mm3,0x58(%[y])\n\t" \
+  "pshufw $0xB1,%%mm0,%%mm0\n\t" /*mm0=54 47 39 46 *P*/ \
+  "movq %%mm0,0x70(%[y])\n\t" \
+
+/*Converts DCT coefficients in %[dct] from natural order into zig-zag scan
+   order and stores them in %[qdct].
+  The index of each output element in the original 64-element array should wind
+   up in the following 8x8 matrix (the letters indicate the order we compute
+   each 4-tuple below):
+    A  0  1  8 16   9  2  3 10 B
+    D 17 24 32 25  18 11  4  5 C
+    E 12 19 26 33  40 48 41 34 I
+    H 27 20 13  6   7 14 21 28 G
+    K 35 42 49 56  57 50 43 36 J
+    F 29 22 15 23  30 37 44 51 M
+    P 58 59 52 45  38 31 39 46 L
+    N 53 60 61 54  47 55 62 63 O
+  The order of the coefficients within each tuple is reversed in the comments
+   below to reflect the usual MSB to LSB notation.*/
+#define OC_ZIG_ZAG_MMXEXT \
+  "movq 0x00(%[dct]),%%mm0\n\t"  /*mm0=03 02 01 00*/ \
+  "movq 0x08(%[dct]),%%mm1\n\t"  /*mm1=07 06 05 04*/ \
+  "movq 0x10(%[dct]),%%mm2\n\t"  /*mm2=11 10 09 08*/ \
+  "movq 0x20(%[dct]),%%mm3\n\t"  /*mm3=19 18 17 16*/ \
+  "movq 0x30(%[dct]),%%mm4\n\t"  /*mm4=27 26 25 24*/ \
+  "movq 0x40(%[dct]),%%mm5\n\t"  /*mm5=35 34 33 32*/ \
+  "movq %%mm2,%%mm7\n\t"         /*mm7=11 10 09 08*/ \
+  "punpcklwd %%mm3,%%mm2\n\t"    /*mm2=17 09 16 08*/ \
+  "movq %%mm0,%%mm6\n\t"         /*mm6=03 02 01 00*/ \
+  "punpckldq %%mm2,%%mm0\n\t"    /*mm0=16 08 01 00 *A*/ \
+  "movq %%mm0,0x00(%[qdct])\n\t" \
+  "movq 0x18(%[dct]),%%mm0\n\t"  /*mm0=15 14 13 12*/ \
+  "punpckhdq %%mm6,%%mm6\n\t"    /*mm6=03 02 03 02*/ \
+  "psrlq $16,%%mm7\n\t"          /*mm7=.. 11 10 09*/ \
+  "punpckldq %%mm7,%%mm6\n\t"    /*mm6=10 09 03 02*/ \
+  "punpckhwd %%mm7,%%mm3\n\t"    /*mm3=.. 19 11 18*/ \
+  "pshufw $0xD2,%%mm6,%%mm6\n\t" /*mm6=10 03 02 09 *B*/ \
+  "movq %%mm6,0x08(%[qdct])\n\t" \
+  "psrlq $48,%%mm2\n\t"          /*mm2=.. .. .. 17*/ \
+  "movq %%mm1,%%mm6\n\t"         /*mm6=07 06 05 04*/ \
+  "punpcklwd %%mm5,%%mm2\n\t"    /*mm2=33 .. 32 17*/ \
+  "movq %%mm3,%%mm7\n\t"         /*mm7=.. 19 11 18*/ \
+  "punpckldq %%mm1,%%mm3\n\t"    /*mm3=05 04 11 18 *C*/ \
+  "por %%mm2,%%mm7\n\t"          /*mm7=33 19 ?? ??*/ \
+  "punpcklwd %%mm4,%%mm2\n\t"    /*mm2=25 32 24 17 *D*/ \
+  "movq %%mm2,0x10(%[qdct])\n\t" \
+  "movq %%mm3,0x18(%[qdct])\n\t" \
+  "movq 0x28(%[dct]),%%mm2\n\t"  /*mm2=23 22 21 20*/ \
+  "movq 0x38(%[dct]),%%mm1\n\t"  /*mm1=31 30 29 28*/ \
+  "pshufw $0x9C,%%mm0,%%mm3\n\t" /*mm3=14 13 15 12*/ \
+  "punpckhdq %%mm7,%%mm7\n\t"    /*mm7=33 19 33 19*/ \
+  "punpckhwd %%mm3,%%mm6\n\t"    /*mm6=14 07 13 06*/ \
+  "punpckldq %%mm0,%%mm0\n\t"    /*mm0=13 12 13 12*/ \
+  "punpcklwd %%mm1,%%mm3\n\t"    /*mm3=29 15 28 12*/ \
+  "punpckhwd %%mm4,%%mm0\n\t"    /*mm0=27 13 26 12*/ \
+  "pshufw $0xB4,%%mm3,%%mm3\n\t" /*mm3=15 29 28 12*/ \
+  "psrlq $48,%%mm4\n\t"          /*mm4=.. .. .. 27*/ \
+  "punpcklwd %%mm7,%%mm0\n\t"    /*mm0=33 26 19 12 *E*/ \
+  "punpcklwd %%mm1,%%mm4\n\t"    /*mm4=29 .. 28 27*/ \
+  "punpckhwd %%mm2,%%mm3\n\t"    /*mm3=23 15 22 29 *F*/ \
+  "movq %%mm0,0x20(%[qdct])\n\t" \
+  "movq %%mm3,0x50(%[qdct])\n\t" \
+  "movq 0x60(%[dct]),%%mm3\n\t"  /*mm3=51 50 49 48*/ \
+  "movq 0x70(%[dct]),%%mm7\n\t"  /*mm7=59 58 57 56*/ \
+  "movq 0x50(%[dct]),%%mm0\n\t"  /*mm0=43 42 41 40*/ \
+  "punpcklwd %%mm4,%%mm2\n\t"    /*mm2=28 21 27 20*/ \
+  "psrlq $32,%%mm5\n\t"          /*mm5=.. .. 35 34*/ \
+  "movq %%mm2,%%mm4\n\t"         /*mm4=28 21 27 20*/ \
+  "punpckldq %%mm6,%%mm2\n\t"    /*mm2=13 06 27 20*/ \
+  "punpckhdq %%mm4,%%mm6\n\t"    /*mm6=28 21 14 07 *G*/ \
+  "movq %%mm3,%%mm4\n\t"         /*mm4=51 50 49 48*/ \
+  "pshufw $0xB1,%%mm2,%%mm2\n\t" /*mm2=06 13 20 27 *H*/ \
+  "movq %%mm2,0x30(%[qdct])\n\t" \
+  "movq %%mm6,0x38(%[qdct])\n\t" \
+  "movq 0x48(%[dct]),%%mm2\n\t"  /*mm2=39 38 37 36*/ \
+  "punpcklwd %%mm5,%%mm4\n\t"    /*mm4=35 49 34 48*/ \
+  "movq 0x58(%[dct]),%%mm5\n\t"  /*mm5=47 46 45 44*/ \
+  "punpckldq %%mm7,%%mm6\n\t"    /*mm6=57 56 14 07*/ \
+  "psrlq $32,%%mm3\n\t"          /*mm3=.. .. 51 50*/ \
+  "punpckhwd %%mm0,%%mm6\n\t"    /*mm6=43 57 42 56*/ \
+  "punpcklwd %%mm4,%%mm0\n\t"    /*mm0=34 41 48 40 *I*/ \
+  "pshufw $0x4E,%%mm6,%%mm6\n\t" /*mm6=42 56 43 57*/ \
+  "movq %%mm0,0x28(%[qdct])\n\t" \
+  "punpcklwd %%mm2,%%mm3\n\t"    /*mm3=37 51 36 50*/ \
+  "punpckhwd %%mm6,%%mm4\n\t"    /*mm4=42 35 56 49*/ \
+  "punpcklwd %%mm3,%%mm6\n\t"    /*mm6=36 43 50 57 *J*/ \
+  "pshufw $0x4E,%%mm4,%%mm4\n\t" /*mm4=56 49 42 35 *K*/ \
+  "movq %%mm4,0x40(%[qdct])\n\t" \
+  "movq %%mm6,0x48(%[qdct])\n\t" \
+  "movq 0x68(%[dct]),%%mm6\n\t"  /*mm6=55 54 53 52*/ \
+  "movq 0x78(%[dct]),%%mm0\n\t"  /*mm0=63 62 61 60*/ \
+  "psrlq $32,%%mm1\n\t"          /*mm1=.. .. 31 30*/ \
+  "pshufw $0xD8,%%mm5,%%mm5\n\t" /*mm5=47 45 46 44*/ \
+  "pshufw $0x0B,%%mm3,%%mm3\n\t" /*mm3=50 50 51 37*/ \
+  "punpcklwd %%mm5,%%mm1\n\t"    /*mm1=46 31 44 30*/ \
+  "pshufw $0xC9,%%mm6,%%mm6\n\t" /*mm6=55 52 54 53*/ \
+  "punpckhwd %%mm1,%%mm2\n\t"    /*mm2=46 39 31 38 *L*/ \
+  "punpcklwd %%mm3,%%mm1\n\t"    /*mm1=51 44 37 30 *M*/ \
+  "movq %%mm2,0x68(%[qdct])\n\t" \
+  "movq %%mm1,0x58(%[qdct])\n\t" \
+  "punpckhwd %%mm6,%%mm5\n\t"    /*mm5=55 47 52 45*/ \
+  "punpckldq %%mm0,%%mm6\n\t"    /*mm6=61 60 54 53*/ \
+  "pshufw $0x10,%%mm5,%%mm4\n\t" /*mm4=45 52 45 45*/ \
+  "pshufw $0x78,%%mm6,%%mm6\n\t" /*mm6=53 60 61 54 *N*/ \
+  "punpckhdq %%mm0,%%mm5\n\t"    /*mm5=63 62 55 47 *O*/ \
+  "punpckhdq %%mm4,%%mm7\n\t"    /*mm7=45 52 59 58 *P*/ \
+  "movq %%mm6,0x70(%[qdct])\n\t" \
+  "movq %%mm5,0x78(%[qdct])\n\t" \
+  "movq %%mm7,0x60(%[qdct])\n\t" \
+
+#endif
diff --git a/thirdparty/libtheora/x86_vc/mmxencfrag.c b/thirdparty/libtheora/x86_vc/mmxencfrag.c
index 94f1d06513..a6be819135 100644
--- a/thirdparty/libtheora/x86_vc/mmxencfrag.c
+++ b/thirdparty/libtheora/x86_vc/mmxencfrag.c
@@ -266,7 +266,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
 /*Performs the first two stages of an 8-point 1-D Hadamard transform.
   The transform is performed in place, except that outputs 0-3 are swapped with
    outputs 4-7.
-  Outputs 2, 3, 6 and 7 from the second stage are negated (which allows us to
+  Outputs 2, 3, 6, and 7 from the second stage are negated (which allows us to
    perform this stage in place with no temporary registers).*/
 #define OC_HADAMARD_AB_8x4 __asm{ \
   /*Stage A: \
@@ -299,7 +299,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
 }
 
 /*Performs the last stage of an 8-point 1-D Hadamard transform in place.
-  Ouputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
+  Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
    place with no temporary registers).*/
 #define OC_HADAMARD_C_8x4 __asm{ \
   /*Stage C:*/ \
@@ -468,12 +468,14 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
     mm7 = d3 c3 b3 a3*/ \
 }
 
-static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
- int _src_ystride,const unsigned char *_ref,int _ref_ystride,unsigned _thresh){
-  OC_ALIGN8(ogg_int16_t  buf[64]);
-  ogg_int16_t           *bufp;
-  unsigned               ret1;
-  unsigned               ret2;
+static unsigned oc_int_frag_satd_mmxext(int *_dc,
+ const unsigned char *_src,int _src_ystride,
+ const unsigned char *_ref,int _ref_ystride){
+  OC_ALIGN8(ogg_int16_t buf[64]);
+  ogg_int16_t *bufp;
+  unsigned     ret;
+  unsigned     ret2;
+  int          dc;
   bufp=buf;
   __asm{
 #define SRC esi
@@ -481,8 +483,10 @@ static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
 #define SRC_YSTRIDE ecx
 #define REF_YSTRIDE edx
 #define BUF edi
-#define RET eax
-#define RET2 edx
+#define RET edx
+#define RET2 ecx
+#define DC eax
+#define DC_WORD ax
     mov SRC,_src
     mov SRC_YSTRIDE,_src_ystride
     mov REF,_ref
@@ -508,14 +512,18 @@ static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
     movq mm2,[0x20+BUF]
     movq mm3,[0x30+BUF]
     movq mm0,[0x00+BUF]
-    OC_HADAMARD_ABS_ACCUM_8x4(0x28,0x38)
+    /*We split out the stages here so we can save the DC coefficient in the
+       middle.*/
+    OC_HADAMARD_AB_8x4
+    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
+    movd DC,mm1
+    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
     /*Up to this point, everything fit in 16 bits (8 input + 1 for the
        difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
        for the factor of two we dropped + 3 for the vertical accumulation).
       Now we finally have to promote things to dwords.
       We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
        latency of pmaddwd by starting the next series of loads now.*/
-    mov RET2,_thresh
     pmaddwd mm0,mm7
     movq mm1,[0x50+BUF]
     movq mm5,[0x58+BUF]
@@ -525,29 +533,28 @@ static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
     movq mm6,[0x68+BUF]
     paddd mm4,mm0
     movq mm3,[0x70+BUF]
-    movd RET,mm4
+    movd RET2,mm4
     movq mm7,[0x78+BUF]
-    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
-       added to them, and a factor of two removed; correct the final sum here.*/
-    lea RET,[RET+RET-32]
     movq mm0,[0x40+BUF]
-    cmp RET,RET2
     movq mm4,[0x48+BUF]
-    jae at_end
     OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
     pmaddwd mm0,mm7
-    /*There isn't much to stick in here to hide the latency this time, but the
-       alternative to pmaddwd is movq->punpcklwd->punpckhwd->paddd, whose
-       latency is even worse.*/
-    sub RET,32
+    /*Subtract abs(dc) from 2*ret2.*/
+    movsx DC,DC_WORD
+    cdq
+    lea RET2,[RET+RET2*2]
     movq mm4,mm0
     punpckhdq mm0,mm0
+    xor RET,DC
     paddd mm4,mm0
-    movd RET2,mm4
-    lea RET,[RET+RET2*2]
-    align 16
-at_end:
-    mov ret1,RET
+    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
+       added to them, a factor of two removed, and the DC value included;
+       correct the final sum here.*/
+    sub RET2,RET
+    movd RET,mm4
+    lea RET,[RET2+RET*2-64]
+    mov ret,RET
+    mov dc,DC
 #undef SRC
 #undef REF
 #undef SRC_YSTRIDE
@@ -555,18 +562,21 @@ at_end:
 #undef BUF
 #undef RET
 #undef RET2
+#undef DC
+#undef DC_WORD
   }
-  return ret1;
+  *_dc=dc;
+  return ret;
 }
 
-unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src,
- const unsigned char *_ref,int _ystride,unsigned _thresh){
-  return oc_int_frag_satd_thresh_mmxext(_src,_ystride,_ref,_ystride,_thresh);
+unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  return oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride);
 }
 
 
 /*Our internal implementation of frag_copy2 takes an extra stride parameter so
-   we can share code with oc_enc_frag_satd2_thresh_mmxext().*/
+   we can share code with oc_enc_frag_satd2_mmxext().*/
 static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
  const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
   __asm{
@@ -694,30 +704,31 @@ static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
   }
 }
 
-unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src,
- const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
- unsigned _thresh){
+unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
   OC_ALIGN8(unsigned char ref[64]);
   oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
-  return oc_int_frag_satd_thresh_mmxext(_src,_ystride,ref,8,_thresh);
+  return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8);
 }
 
-unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
+unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,const unsigned char *_src,
  int _ystride){
-  OC_ALIGN8(ogg_int16_t  buf[64]);
-  ogg_int16_t           *bufp;
-  unsigned               ret1;
-  unsigned               ret2;
+  OC_ALIGN8(ogg_int16_t buf[64]);
+  ogg_int16_t *bufp;
+  unsigned     ret1;
+  unsigned     ret2;
+  int          dc;
   bufp=buf;
   __asm{
 #define SRC eax
 #define SRC4 esi
 #define BUF edi
-#define RET eax
-#define RET_WORD ax
-#define RET2 ecx
 #define YSTRIDE edx
 #define YSTRIDE3 ecx
+#define RET eax
+#define RET2 ecx
+#define DC edx
+#define DC_WORD dx
     mov SRC,_src
     mov BUF,bufp
     mov YSTRIDE,_ystride
@@ -749,7 +760,7 @@ unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
       middle.*/
     OC_HADAMARD_AB_8x4
     OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
-    movd RET,mm1
+    movd DC,mm1
     OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
     /*Up to this point, everything fit in 16 bits (8 input + 1 for the
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
@@ -767,31 +778,34 @@ unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
     movq mm3,[0x70+BUF]
     paddd mm4,mm0
     movq mm7,[0x78+BUF]
-    movd RET2,mm4
+    movd RET,mm4
     movq mm0,[0x40+BUF]
     movq mm4,[0x48+BUF]
     OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
     pmaddwd mm0,mm7
     /*We assume that the DC coefficient is always positive (which is true,
     because the input to the INTRA transform was not a difference).*/
-    movzx RET,RET_WORD
-    add RET2,RET2
-    sub RET2,RET
+    movzx DC,DC_WORD
+    add RET,RET
+    sub RET,DC
     movq mm4,mm0
     punpckhdq mm0,mm0
     paddd mm4,mm0
-    movd RET,mm4
-    lea RET,[-64+RET2+RET*2]
+    movd RET2,mm4
+    lea RET,[-64+RET+RET2*2]
+    mov [dc],DC
     mov [ret1],RET
 #undef SRC
 #undef SRC4
 #undef BUF
-#undef RET
-#undef RET_WORD
-#undef RET2
 #undef YSTRIDE
 #undef YSTRIDE3
+#undef RET
+#undef RET2
+#undef DC
+#undef DC_WORD
   }
+  *_dc=dc;
   return ret1;
 }
 
diff --git a/thirdparty/libtheora/x86_vc/mmxfdct.c b/thirdparty/libtheora/x86_vc/mmxfdct.c
index d908ce2413..c9ee530ea2 100644
--- a/thirdparty/libtheora/x86_vc/mmxfdct.c
+++ b/thirdparty/libtheora/x86_vc/mmxfdct.c
@@ -12,6 +12,7 @@
  /*MMX fDCT implementation for x86_32*/
 /*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
 #include "x86enc.h"
+#include "x86zigzag.h"
 
 #if defined(OC_X86_ASM)
 
@@ -462,18 +463,22 @@
 }
 
 /*MMX implementation of the fDCT.*/
-void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
-  ptrdiff_t a;
+void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+  OC_ALIGN8(ogg_int16_t buf[64]);
+  ogg_int16_t *bufp;
+  bufp=buf;
   __asm{
+#define X edx
 #define Y eax
 #define A ecx
-#define X edx
+#define BUF esi
     /*Add two extra bits of working precision to improve accuracy; any more and
        we could overflow.*/
     /*We also add biases to correct for some systematic error that remains in
        the full fDCT->iDCT round trip.*/
     mov X, _x
     mov Y, _y
+	mov BUF, bufp
     movq mm0,[0x00+X]
     movq mm1,[0x10+X]
     movq mm2,[0x20+X]
@@ -591,79 +596,90 @@ void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
     movq mm3,[0x30+Y]
     OC_FDCT_STAGE1_8x4
     OC_FDCT8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38)
-    OC_TRANSPOSE8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38)
-    /*mm0={-2}x4*/
+    /*mm2={-2}x4*/
-    pcmpeqw mm0,mm0
-    paddw mm0,mm0
-    /*Round the results.*/
-    psubw mm1,mm0
-    psubw mm2,mm0
-    psraw mm1,2
-    psubw mm3,mm0
-    movq [0x18+Y],mm1
-    psraw mm2,2
-    psubw mm4,mm0
-    movq mm1,[0x08+Y]
-    psraw mm3,2
-    psubw mm5,mm0
+    pcmpeqw mm2,mm2
+    paddw mm2,mm2
+    /*Round and store the results (no transpose).*/
+    movq mm7,[Y+0x10]
+    psubw mm4,mm2
+    psubw mm6,mm2
     psraw mm4,2
-    psubw mm6,mm0
-    psraw mm5,2
-    psubw mm7,mm0
+    psubw mm0,mm2
+    movq [BUF+0x00],mm4
+    movq mm4,[Y+0x30]
     psraw mm6,2
-    psubw mm1,mm0
+    psubw mm5,mm2
+    movq [BUF+0x20],mm6
+    psraw mm0,2
+    psubw mm3,mm2
+    movq [BUF+0x40],mm0
+    psraw mm5,2
+    psubw mm1,mm2
+    movq [BUF+0x50],mm5
+    psraw mm3,2
+    psubw mm7,mm2
+    movq [BUF+0x60],mm3
+    psraw mm1,2
+    psubw mm4,mm2
+    movq [BUF+0x70],mm1
     psraw mm7,2
+    movq [BUF+0x10],mm7
+    psraw mm4,2
+    movq [BUF+0x30],mm4
+    /*Load the next block.*/
     movq mm0,[0x40+Y]
-    psraw mm1,2
-    movq [0x30+Y],mm7
     movq mm7,[0x78+Y]
-    movq [0x08+Y],mm1
     movq mm1,[0x50+Y]
-    movq [0x20+Y],mm6
     movq mm6,[0x68+Y]
-    movq [0x28+Y],mm2
     movq mm2,[0x60+Y]
-    movq [0x10+Y],mm5
     movq mm5,[0x58+Y]
-    movq [0x38+Y],mm3
     movq mm3,[0x70+Y]
-    movq [0x00+Y],mm4
     movq mm4,[0x48+Y]
     OC_FDCT_STAGE1_8x4
     OC_FDCT8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78)
-    OC_TRANSPOSE8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78)
-    /*mm0={-2}x4*/
+    /*mm2={-2}x4*/
-    pcmpeqw mm0,mm0
-    paddw mm0,mm0
-    /*Round the results.*/
-    psubw mm1,mm0
-    psubw mm2,mm0
-    psraw mm1,2
-    psubw mm3,mm0
-    movq [0x58+Y],mm1
-    psraw mm2,2
-    psubw mm4,mm0
-    movq mm1,[0x48+Y]
-    psraw mm3,2
-    psubw mm5,mm0
-    movq [0x68+Y],mm2
+    pcmpeqw mm2,mm2
+    paddw mm2,mm2
+    /*Round and store the results (no transpose).*/
+    movq mm7,[Y+0x50]
+    psubw mm4,mm2
+    psubw mm6,mm2
     psraw mm4,2
-    psubw mm6,mm0
-    movq [0x78+Y],mm3
-    psraw mm5,2
-    psubw mm7,mm0
-    movq [0x40+Y],mm4
+    psubw mm0,mm2
+    movq [BUF+0x08],mm4
+    movq mm4,[Y+0x70]
     psraw mm6,2
-    psubw mm1,mm0
-    movq [0x50+Y],mm5
-    psraw mm7,2
-    movq [0x60+Y],mm6
+    psubw mm5,mm2
+    movq [BUF+0x28],mm6
+    psraw mm0,2
+    psubw mm3,mm2
+    movq [BUF+0x48],mm0
+    psraw mm5,2
+    psubw mm1,mm2
+    movq [BUF+0x58],mm5
+    psraw mm3,2
+    psubw mm7,mm2
+    movq [BUF+0x68],mm3
     psraw mm1,2
-    movq [0x70+Y],mm7
-    movq [0x48+Y],mm1
+    psubw mm4,mm2
+    movq [BUF+0x78],mm1
+    psraw mm7,2
+    movq [BUF+0x18],mm7
+    psraw mm4,2
+    movq [BUF+0x38],mm4
+#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
+    __asm movq _reg,[BUF+16*(_row)] \
+
+#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
+    __asm movq _reg,[BUF+16*(_row)+8] \
+
+    OC_TRANSPOSE_ZIG_ZAG_MMXEXT
+#undef OC_ZZ_LOAD_ROW_LO
+#undef OC_ZZ_LOAD_ROW_HI
+#undef X
 #undef Y
 #undef A
-#undef X
+#undef BUF
   }
 }
 
diff --git a/thirdparty/libtheora/x86_vc/mmxfrag.c b/thirdparty/libtheora/x86_vc/mmxfrag.c
index 4eb2084dc6..248312ff90 100644
--- a/thirdparty/libtheora/x86_vc/mmxfrag.c
+++ b/thirdparty/libtheora/x86_vc/mmxfrag.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: mmxfrag.c 16578 2009-09-25 19:50:48Z cristianadam $
+    last mod: $Id$
 
  ********************************************************************/
 
@@ -22,12 +22,63 @@
   The iteration each instruction belongs to is marked in the comments as #i.*/
 #include <stddef.h>
 #include "x86int.h"
-#include "mmxfrag.h"
 
 #if defined(OC_X86_ASM)
 
 /*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
    between rows.*/
+# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
+  do{ \
+    const unsigned char *src; \
+    unsigned char       *dst; \
+    src=(_src); \
+    dst=(_dst); \
+    __asm  mov SRC,src \
+    __asm  mov DST,dst \
+    __asm  mov YSTRIDE,_ystride \
+    /*src+0*ystride*/ \
+    __asm  movq mm0,[SRC] \
+    /*src+1*ystride*/ \
+    __asm  movq mm1,[SRC+YSTRIDE] \
+    /*ystride3=ystride*3*/ \
+    __asm  lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
+    /*src+2*ystride*/ \
+    __asm  movq mm2,[SRC+YSTRIDE*2] \
+    /*src+3*ystride*/ \
+    __asm  movq mm3,[SRC+YSTRIDE3] \
+    /*dst+0*ystride*/ \
+    __asm  movq [DST],mm0 \
+    /*dst+1*ystride*/ \
+    __asm  movq [DST+YSTRIDE],mm1 \
+    /*Pointer to next 4.*/ \
+    __asm  lea SRC,[SRC+YSTRIDE*4] \
+    /*dst+2*ystride*/ \
+    __asm  movq [DST+YSTRIDE*2],mm2 \
+    /*dst+3*ystride*/ \
+    __asm  movq [DST+YSTRIDE3],mm3 \
+    /*Pointer to next 4.*/ \
+    __asm  lea DST,[DST+YSTRIDE*4] \
+    /*src+0*ystride*/ \
+    __asm  movq mm0,[SRC] \
+    /*src+1*ystride*/ \
+    __asm  movq mm1,[SRC+YSTRIDE] \
+    /*src+2*ystride*/ \
+    __asm  movq mm2,[SRC+YSTRIDE*2] \
+    /*src+3*ystride*/ \
+    __asm  movq mm3,[SRC+YSTRIDE3] \
+    /*dst+0*ystride*/ \
+    __asm  movq [DST],mm0 \
+    /*dst+1*ystride*/ \
+    __asm  movq [DST+YSTRIDE],mm1 \
+    /*dst+2*ystride*/ \
+    __asm  movq [DST+YSTRIDE*2],mm2 \
+    /*dst+3*ystride*/ \
+    __asm  movq [DST+YSTRIDE3],mm3 \
+  } \
+  while(0)
+
+/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
+   between rows.*/
 void oc_frag_copy_mmx(unsigned char *_dst,
  const unsigned char *_src,int _ystride){
 #define SRC edx
@@ -41,6 +92,34 @@ void oc_frag_copy_mmx(unsigned char *_dst,
 #undef YSTRIDE3
 }
 
+/*Copies the fragments specified by the lists of fragment indices from one
+   frame to another.
+  _dst_frame:     The reference frame to copy to.
+  _src_frame:     The reference frame to copy from.
+  _ystride:       The row stride of the reference frames.
+  _fragis:        A pointer to a list of fragment indices.
+  _nfragis:       The number of fragment indices to copy.
+  _frag_buf_offs: The offsets of fragments in the reference frames.*/
+void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
+  ptrdiff_t fragii;
+  for(fragii=0;fragii<_nfragis;fragii++){
+    ptrdiff_t frag_buf_off;
+    frag_buf_off=_frag_buf_offs[_fragis[fragii]];
+#define SRC edx
+#define DST eax
+#define YSTRIDE ecx
+#define YSTRIDE3 edi
+    OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
+     _src_frame+frag_buf_off,_ystride);
+#undef SRC
+#undef DST
+#undef YSTRIDE
+#undef YSTRIDE3
+  }
+}
+
 void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
  const ogg_int16_t *_residue){
   __asm{
diff --git a/thirdparty/libtheora/x86_vc/mmxfrag.h b/thirdparty/libtheora/x86_vc/mmxfrag.h
deleted file mode 100644
index 45ee93e777..0000000000
--- a/thirdparty/libtheora/x86_vc/mmxfrag.h
+++ /dev/null
@@ -1,61 +0,0 @@
-#if !defined(_x86_vc_mmxfrag_H)
-# define _x86_vc_mmxfrag_H (1)
-# include <stddef.h>
-# include "x86int.h"
-
-#if defined(OC_X86_ASM)
-
-/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
-   between rows.*/
-#define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
-  do{ \
-    const unsigned char *src; \
-    unsigned char       *dst; \
-    src=(_src); \
-    dst=(_dst); \
-    __asm  mov SRC,src \
-    __asm  mov DST,dst \
-    __asm  mov YSTRIDE,_ystride \
-    /*src+0*ystride*/ \
-    __asm  movq mm0,[SRC] \
-    /*src+1*ystride*/ \
-    __asm  movq mm1,[SRC+YSTRIDE] \
-    /*ystride3=ystride*3*/ \
-    __asm  lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
-    /*src+2*ystride*/ \
-    __asm  movq mm2,[SRC+YSTRIDE*2] \
-    /*src+3*ystride*/ \
-    __asm  movq mm3,[SRC+YSTRIDE3] \
-    /*dst+0*ystride*/ \
-    __asm  movq [DST],mm0 \
-    /*dst+1*ystride*/ \
-    __asm  movq [DST+YSTRIDE],mm1 \
-    /*Pointer to next 4.*/ \
-    __asm  lea SRC,[SRC+YSTRIDE*4] \
-    /*dst+2*ystride*/ \
-    __asm  movq [DST+YSTRIDE*2],mm2 \
-    /*dst+3*ystride*/ \
-    __asm  movq [DST+YSTRIDE3],mm3 \
-    /*Pointer to next 4.*/ \
-    __asm  lea DST,[DST+YSTRIDE*4] \
-    /*src+0*ystride*/ \
-    __asm  movq mm0,[SRC] \
-    /*src+1*ystride*/ \
-    __asm  movq mm1,[SRC+YSTRIDE] \
-    /*src+2*ystride*/ \
-    __asm  movq mm2,[SRC+YSTRIDE*2] \
-    /*src+3*ystride*/ \
-    __asm  movq mm3,[SRC+YSTRIDE3] \
-    /*dst+0*ystride*/ \
-    __asm  movq [DST],mm0 \
-    /*dst+1*ystride*/ \
-    __asm  movq [DST+YSTRIDE],mm1 \
-    /*dst+2*ystride*/ \
-    __asm  movq [DST+YSTRIDE*2],mm2 \
-    /*dst+3*ystride*/ \
-    __asm  movq [DST+YSTRIDE3],mm3 \
-  } \
-  while(0)
-
-# endif
-#endif
diff --git a/thirdparty/libtheora/x86_vc/mmxidct.c b/thirdparty/libtheora/x86_vc/mmxidct.c
index 8f5ff6803c..55e00aedcf 100644
--- a/thirdparty/libtheora/x86_vc/mmxidct.c
+++ b/thirdparty/libtheora/x86_vc/mmxidct.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
@@ -24,15 +24,15 @@
 
 /*These are offsets into the table of constants below.*/
 /*7 rows of cosines, in order: pi/16 * (1 ... 7).*/
-#define OC_COSINE_OFFSET (0)
+#define OC_COSINE_OFFSET (8)
 /*A row of 8's.*/
-#define OC_EIGHT_OFFSET  (56)
+#define OC_EIGHT_OFFSET  (0)
 
 
 
 /*A table of constants used by the MMX routines.*/
-static const __declspec(align(16))ogg_uint16_t
- OC_IDCT_CONSTS[(7+1)*4]={
+static const OC_ALIGN16(ogg_uint16_t) OC_IDCT_CONSTS[(1+7)*4]={
+      8,    8,    8,    8,
   (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
   (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
   (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
@@ -46,28 +46,27 @@ static const __declspec(align(16))ogg_uint16_t
   (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
   (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
   (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
-  (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
-      8,    8,    8,    8
+  (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1
 };
 
 /*38 cycles*/
-#define OC_IDCT_BEGIN __asm{ \
-  __asm movq mm2,OC_I(3) \
+#define OC_IDCT_BEGIN(_y,_x) __asm{ \
+  __asm movq mm2,OC_I(3,_x) \
   __asm movq mm6,OC_C(3) \
   __asm movq mm4,mm2 \
-  __asm movq mm7,OC_J(5) \
+  __asm movq mm7,OC_J(5,_x) \
   __asm pmulhw mm4,mm6 \
   __asm movq mm1,OC_C(5) \
   __asm pmulhw mm6,mm7 \
   __asm movq mm5,mm1 \
   __asm pmulhw mm1,mm2 \
-  __asm movq mm3,OC_I(1) \
+  __asm movq mm3,OC_I(1,_x) \
   __asm pmulhw mm5,mm7 \
   __asm movq mm0,OC_C(1) \
   __asm paddw mm4,mm2 \
   __asm paddw mm6,mm7 \
   __asm paddw mm2,mm1 \
-  __asm movq mm1,OC_J(7) \
+  __asm movq mm1,OC_J(7,_x) \
   __asm paddw mm7,mm5 \
   __asm movq mm5,mm0 \
   __asm pmulhw mm0,mm3 \
@@ -77,13 +76,13 @@ static const __declspec(align(16))ogg_uint16_t
   __asm psubw mm6,mm2 \
   __asm paddw mm0,mm3 \
   __asm pmulhw mm3,mm7 \
-  __asm movq mm2,OC_I(2) \
+  __asm movq mm2,OC_I(2,_x) \
   __asm pmulhw mm7,mm1 \
   __asm paddw mm5,mm1 \
   __asm movq mm1,mm2 \
   __asm pmulhw mm2,OC_C(2) \
   __asm psubw mm3,mm5 \
-  __asm movq mm5,OC_J(6) \
+  __asm movq mm5,OC_J(6,_x) \
   __asm paddw mm0,mm7 \
   __asm movq mm7,mm5 \
   __asm psubw mm0,mm4 \
@@ -97,18 +96,18 @@ static const __declspec(align(16))ogg_uint16_t
   __asm paddw mm6,mm6 \
   __asm pmulhw mm7,OC_C(6) \
   __asm paddw mm6,mm3 \
-  __asm movq OC_I(1),mm4 \
+  __asm movq OC_I(1,_y),mm4 \
   __asm psubw mm1,mm5 \
   __asm movq mm4,OC_C(4) \
   __asm movq mm5,mm3 \
   __asm pmulhw mm3,mm4 \
   __asm paddw mm7,mm2 \
-  __asm movq OC_I(2),mm6 \
+  __asm movq OC_I(2,_y),mm6 \
   __asm movq mm2,mm0 \
-  __asm movq mm6,OC_I(0) \
+  __asm movq mm6,OC_I(0,_x) \
   __asm pmulhw mm0,mm4 \
   __asm paddw mm5,mm3 \
-  __asm movq mm3,OC_J(4) \
+  __asm movq mm3,OC_J(4,_x) \
   __asm psubw mm5,mm1 \
   __asm paddw mm2,mm0 \
   __asm psubw mm6,mm3 \
@@ -122,17 +121,17 @@ static const __declspec(align(16))ogg_uint16_t
   __asm paddw mm6,mm0 \
   __asm psubw mm6,mm2 \
   __asm paddw mm2,mm2 \
-  __asm movq mm0,OC_I(1) \
+  __asm movq mm0,OC_I(1,_y) \
   __asm paddw mm2,mm6 \
   __asm paddw mm4,mm3 \
   __asm psubw mm2,mm1 \
 }
 
 /*38+8=46 cycles.*/
-#define OC_ROW_IDCT __asm{ \
-  OC_IDCT_BEGIN \
+#define OC_ROW_IDCT(_y,_x) __asm{ \
+  OC_IDCT_BEGIN(_y,_x) \
   /*r3=D'*/ \
-  __asm  movq mm3,OC_I(2) \
+  __asm  movq mm3,OC_I(2,_y) \
   /*r4=E'=E-G*/ \
   __asm  psubw mm4,mm7 \
   /*r1=H'+H'*/ \
@@ -157,7 +156,7 @@ static const __declspec(align(16))ogg_uint16_t
   __asm  psubw mm7,mm0 \
   __asm  paddw mm0,mm0 \
   /*Save R1.*/ \
-  __asm  movq OC_I(1),mm1 \
+  __asm  movq OC_I(1,_y),mm1 \
   /*r0=R0=G.+C.*/ \
   __asm  paddw mm0,mm7 \
 }
@@ -190,10 +189,10 @@ static const __declspec(align(16))ogg_uint16_t
 
   Since r1 is free at entry, we calculate the Js first.*/
 /*19 cycles.*/
-#define OC_TRANSPOSE __asm{ \
+#define OC_TRANSPOSE(_y) __asm{ \
   __asm movq mm1,mm4 \
   __asm punpcklwd mm4,mm5 \
-  __asm movq OC_I(0),mm0 \
+  __asm movq OC_I(0,_y),mm0 \
   __asm punpckhwd mm1,mm5 \
   __asm movq mm0,mm6 \
   __asm punpcklwd mm6,mm7 \
@@ -201,17 +200,17 @@ static const __declspec(align(16))ogg_uint16_t
   __asm punpckldq mm4,mm6 \
   __asm punpckhdq mm5,mm6 \
   __asm movq mm6,mm1 \
-  __asm movq OC_J(4),mm4 \
+  __asm movq OC_J(4,_y),mm4 \
   __asm punpckhwd mm0,mm7 \
-  __asm movq OC_J(5),mm5 \
+  __asm movq OC_J(5,_y),mm5 \
   __asm punpckhdq mm6,mm0 \
-  __asm movq mm4,OC_I(0) \
+  __asm movq mm4,OC_I(0,_y) \
   __asm punpckldq mm1,mm0 \
-  __asm movq mm5,OC_I(1) \
+  __asm movq mm5,OC_I(1,_y) \
   __asm movq mm0,mm4 \
-  __asm movq OC_J(7),mm6 \
+  __asm movq OC_J(7,_y),mm6 \
   __asm punpcklwd mm0,mm5 \
-  __asm movq OC_J(6),mm1 \
+  __asm movq OC_J(6,_y),mm1 \
   __asm punpckhwd mm4,mm5 \
   __asm movq mm5,mm2 \
   __asm punpcklwd mm2,mm3 \
@@ -219,18 +218,18 @@ static const __declspec(align(16))ogg_uint16_t
   __asm punpckldq mm0,mm2 \
   __asm punpckhdq mm1,mm2 \
   __asm movq mm2,mm4 \
-  __asm movq OC_I(0),mm0 \
+  __asm movq OC_I(0,_y),mm0 \
   __asm punpckhwd mm5,mm3 \
-  __asm movq OC_I(1),mm1 \
+  __asm movq OC_I(1,_y),mm1 \
   __asm punpckhdq mm4,mm5 \
   __asm punpckldq mm2,mm5 \
-  __asm movq OC_I(3),mm4 \
-  __asm movq OC_I(2),mm2 \
+  __asm movq OC_I(3,_y),mm4 \
+  __asm movq OC_I(2,_y),mm2 \
 }
 
 /*38+19=57 cycles.*/
-#define OC_COLUMN_IDCT __asm{ \
-  OC_IDCT_BEGIN \
+#define OC_COLUMN_IDCT(_y) __asm{ \
+  OC_IDCT_BEGIN(_y,_y) \
   __asm paddw mm2,OC_8 \
   /*r1=H'+H'*/ \
   __asm paddw mm1,mm1 \
@@ -243,15 +242,15 @@ static const __declspec(align(16))ogg_uint16_t
   /*r1=NR1*/ \
   __asm psraw mm1,4 \
   /*r3=D'*/ \
-  __asm movq mm3,OC_I(2) \
+  __asm movq mm3,OC_I(2,_y) \
   /*r7=G+G*/ \
   __asm paddw mm7,mm7 \
   /*Store NR2 at I(2).*/ \
-  __asm movq OC_I(2),mm2 \
+  __asm movq OC_I(2,_y),mm2 \
   /*r7=G'=E+G*/ \
   __asm paddw mm7,mm4 \
   /*Store NR1 at I(1).*/ \
-  __asm movq OC_I(1),mm1 \
+  __asm movq OC_I(1,_y),mm1 \
   /*r4=R4=E'-D'*/ \
   __asm psubw mm4,mm3 \
   __asm paddw mm4,OC_8 \
@@ -273,11 +272,11 @@ static const __declspec(align(16))ogg_uint16_t
   /*r6=NR6*/ \
   __asm psraw mm6,4 \
   /*Store NR4 at J(4).*/ \
-  __asm movq OC_J(4),mm4 \
+  __asm movq OC_J(4,_y),mm4 \
   /*r5=NR5*/ \
   __asm psraw mm5,4 \
   /*Store NR3 at I(3).*/ \
-  __asm movq OC_I(3),mm3 \
+  __asm movq OC_I(3,_y),mm3 \
   /*r7=R7=G'-C'*/ \
   __asm psubw mm7,mm0 \
   __asm paddw mm7,OC_8 \
@@ -288,71 +287,89 @@ static const __declspec(align(16))ogg_uint16_t
   /*r7=NR7*/ \
   __asm psraw mm7,4 \
   /*Store NR6 at J(6).*/ \
-  __asm movq OC_J(6),mm6 \
+  __asm movq OC_J(6,_y),mm6 \
   /*r0=NR0*/ \
   __asm psraw mm0,4 \
   /*Store NR5 at J(5).*/ \
-  __asm movq OC_J(5),mm5 \
+  __asm movq OC_J(5,_y),mm5 \
   /*Store NR7 at J(7).*/ \
-  __asm movq OC_J(7),mm7 \
+  __asm movq OC_J(7,_y),mm7 \
   /*Store NR0 at I(0).*/ \
-  __asm movq OC_I(0),mm0 \
+  __asm movq OC_I(0,_y),mm0 \
 }
 
 #define OC_MID(_m,_i) [CONSTS+_m+(_i)*8]
 #define OC_C(_i)      OC_MID(OC_COSINE_OFFSET,_i-1)
 #define OC_8          OC_MID(OC_EIGHT_OFFSET,0)
 
-static void oc_idct8x8_slow(ogg_int16_t _y[64]){
+static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  int i;
   /*This routine accepts an 8x8 matrix, but in partially transposed form.
     Every 4x4 block is transposed.*/
   __asm{
 #define CONSTS eax
 #define Y edx
+#define X ecx
     mov CONSTS,offset OC_IDCT_CONSTS
     mov Y,_y
-#define OC_I(_k)      [Y+_k*16]
-#define OC_J(_k)      [Y+(_k-4)*16+8]
-    OC_ROW_IDCT
-    OC_TRANSPOSE
+    mov X,_x
+#define OC_I(_k,_y)   [(_y)+(_k)*16]
+#define OC_J(_k,_y)   [(_y)+((_k)-4)*16+8]
+    OC_ROW_IDCT(Y,X)
+    OC_TRANSPOSE(Y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k)      [Y+(_k*16)+64]
-#define OC_J(_k)      [Y+(_k-4)*16+72]
-    OC_ROW_IDCT
-    OC_TRANSPOSE
+#define OC_I(_k,_y)   [(_y)+(_k)*16+64]
+#define OC_J(_k,_y)   [(_y)+((_k)-4)*16+72]
+    OC_ROW_IDCT(Y,X)
+    OC_TRANSPOSE(Y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k)      [Y+_k*16]
-#define OC_J(_k)      OC_I(_k)
-    OC_COLUMN_IDCT
+#define OC_I(_k,_y)   [(_y)+(_k)*16]
+#define OC_J(_k,_y)   OC_I(_k,_y)
+    OC_COLUMN_IDCT(Y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k)      [Y+_k*16+8]
-#define OC_J(_k)      OC_I(_k)
-    OC_COLUMN_IDCT
+#define OC_I(_k,_y)   [(_y)+(_k)*16+8]
+#define OC_J(_k,_y)   OC_I(_k,_y)
+    OC_COLUMN_IDCT(Y)
 #undef  OC_I
 #undef  OC_J
 #undef  CONSTS
 #undef  Y
+#undef  X
+  }
+  __asm pxor mm0,mm0;
+  for(i=0;i<4;i++){
+    ogg_int16_t *x;
+    x=_x+16*i;
+#define X ecx
+    __asm{
+      mov X,x
+      movq [X+0x00],mm0
+      movq [X+0x08],mm0
+      movq [X+0x10],mm0
+      movq [X+0x18],mm0
+    }
+#undef  X
   }
 }
 
 /*25 cycles.*/
-#define OC_IDCT_BEGIN_10 __asm{ \
-  __asm movq mm2,OC_I(3) \
+#define OC_IDCT_BEGIN_10(_y,_x) __asm{ \
+  __asm movq mm2,OC_I(3,_x) \
   __asm nop \
   __asm movq mm6,OC_C(3) \
   __asm movq mm4,mm2 \
   __asm movq mm1,OC_C(5) \
   __asm pmulhw mm4,mm6 \
-  __asm movq mm3,OC_I(1) \
+  __asm movq mm3,OC_I(1,_x) \
   __asm pmulhw mm1,mm2 \
   __asm movq mm0,OC_C(1) \
   __asm paddw mm4,mm2 \
   __asm pxor mm6,mm6 \
   __asm paddw mm2,mm1 \
-  __asm movq mm5,OC_I(2) \
+  __asm movq mm5,OC_I(2,_x) \
   __asm pmulhw mm0,mm3 \
   __asm movq mm1,mm5 \
   __asm paddw mm0,mm3 \
@@ -360,43 +377,43 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
   __asm psubw mm6,mm2 \
   __asm pmulhw mm5,OC_C(2) \
   __asm psubw mm0,mm4 \
-  __asm movq mm7,OC_I(2) \
+  __asm movq mm7,OC_I(2,_x) \
   __asm paddw mm4,mm4 \
   __asm paddw mm7,mm5 \
   __asm paddw mm4,mm0 \
   __asm pmulhw mm1,OC_C(6) \
   __asm psubw mm3,mm6 \
-  __asm movq OC_I(1),mm4 \
+  __asm movq OC_I(1,_y),mm4 \
   __asm paddw mm6,mm6 \
   __asm movq mm4,OC_C(4) \
   __asm paddw mm6,mm3 \
   __asm movq mm5,mm3 \
   __asm pmulhw mm3,mm4 \
-  __asm movq OC_I(2),mm6 \
+  __asm movq OC_I(2,_y),mm6 \
   __asm movq mm2,mm0 \
-  __asm movq mm6,OC_I(0) \
+  __asm movq mm6,OC_I(0,_x) \
   __asm pmulhw mm0,mm4 \
   __asm paddw mm5,mm3 \
   __asm paddw mm2,mm0 \
   __asm psubw mm5,mm1 \
   __asm pmulhw mm6,mm4 \
-  __asm paddw mm6,OC_I(0) \
+  __asm paddw mm6,OC_I(0,_x) \
   __asm paddw mm1,mm1 \
   __asm movq mm4,mm6 \
   __asm paddw mm1,mm5 \
   __asm psubw mm6,mm2 \
   __asm paddw mm2,mm2 \
-  __asm movq mm0,OC_I(1) \
+  __asm movq mm0,OC_I(1,_y) \
   __asm paddw mm2,mm6 \
   __asm psubw mm2,mm1 \
   __asm nop \
 }
 
 /*25+8=33 cycles.*/
-#define OC_ROW_IDCT_10 __asm{ \
-  OC_IDCT_BEGIN_10 \
+#define OC_ROW_IDCT_10(_y,_x) __asm{ \
+  OC_IDCT_BEGIN_10(_y,_x) \
   /*r3=D'*/ \
-   __asm movq mm3,OC_I(2) \
+   __asm movq mm3,OC_I(2,_y) \
   /*r4=E'=E-G*/ \
    __asm psubw mm4,mm7 \
   /*r1=H'+H'*/ \
@@ -421,14 +438,14 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
    __asm psubw mm7,mm0 \
    __asm paddw mm0,mm0 \
   /*Save R1.*/ \
-   __asm movq OC_I(1),mm1 \
+   __asm movq OC_I(1,_y),mm1 \
   /*r0=R0=G'+C'*/ \
    __asm paddw mm0,mm7 \
 }
 
 /*25+19=44 cycles'*/
-#define OC_COLUMN_IDCT_10 __asm{ \
-  OC_IDCT_BEGIN_10 \
+#define OC_COLUMN_IDCT_10(_y) __asm{ \
+  OC_IDCT_BEGIN_10(_y,_y) \
   __asm paddw mm2,OC_8 \
   /*r1=H'+H'*/ \
   __asm paddw mm1,mm1 \
@@ -441,15 +458,15 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
   /*r1=NR1*/ \
   __asm psraw mm1,4 \
   /*r3=D'*/ \
-  __asm movq mm3,OC_I(2) \
+  __asm movq mm3,OC_I(2,_y) \
   /*r7=G+G*/ \
   __asm paddw mm7,mm7 \
   /*Store NR2 at I(2).*/ \
-  __asm movq OC_I(2),mm2 \
+  __asm movq OC_I(2,_y),mm2 \
   /*r7=G'=E+G*/ \
   __asm paddw mm7,mm4 \
   /*Store NR1 at I(1).*/ \
-  __asm movq OC_I(1),mm1 \
+  __asm movq OC_I(1,_y),mm1 \
   /*r4=R4=E'-D'*/ \
   __asm psubw mm4,mm3 \
   __asm paddw mm4,OC_8 \
@@ -471,11 +488,11 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
   /*r6=NR6*/ \
   __asm psraw mm6,4 \
   /*Store NR4 at J(4).*/ \
-  __asm movq OC_J(4),mm4 \
+  __asm movq OC_J(4,_y),mm4 \
   /*r5=NR5*/ \
   __asm psraw mm5,4 \
   /*Store NR3 at I(3).*/ \
-  __asm movq OC_I(3),mm3 \
+  __asm movq OC_I(3,_y),mm3 \
   /*r7=R7=G'-C'*/ \
   __asm psubw mm7,mm0 \
   __asm paddw mm7,OC_8 \
@@ -486,50 +503,63 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
   /*r7=NR7*/ \
   __asm psraw mm7,4 \
   /*Store NR6 at J(6).*/ \
-  __asm movq OC_J(6),mm6 \
+  __asm movq OC_J(6,_y),mm6 \
   /*r0=NR0*/ \
   __asm psraw mm0,4 \
   /*Store NR5 at J(5).*/ \
-  __asm movq OC_J(5),mm5 \
+  __asm movq OC_J(5,_y),mm5 \
   /*Store NR7 at J(7).*/ \
-  __asm movq OC_J(7),mm7 \
+  __asm movq OC_J(7,_y),mm7 \
   /*Store NR0 at I(0).*/ \
-  __asm movq OC_I(0),mm0 \
+  __asm movq OC_I(0,_y),mm0 \
 }
 
-static void oc_idct8x8_10(ogg_int16_t _y[64]){
+static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){
   __asm{
 #define CONSTS eax
 #define Y edx
+#define X ecx
     mov CONSTS,offset OC_IDCT_CONSTS
     mov Y,_y
-#define OC_I(_k) [Y+_k*16]
-#define OC_J(_k) [Y+(_k-4)*16+8]
+    mov X,_x
+#define OC_I(_k,_y) [(_y)+(_k)*16]
+#define OC_J(_k,_y) [(_y)+((_k)-4)*16+8]
     /*Done with dequant, descramble, and partial transpose.
       Now do the iDCT itself.*/
-    OC_ROW_IDCT_10
-    OC_TRANSPOSE
+    OC_ROW_IDCT_10(Y,X)
+    OC_TRANSPOSE(Y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k) [Y+_k*16]
-#define OC_J(_k) OC_I(_k)
-    OC_COLUMN_IDCT_10
+#define OC_I(_k,_y) [(_y)+(_k)*16]
+#define OC_J(_k,_y) OC_I(_k,_y)
+    OC_COLUMN_IDCT_10(Y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k) [Y+_k*16+8]
-#define OC_J(_k) OC_I(_k)
-    OC_COLUMN_IDCT_10
+#define OC_I(_k,_y) [(_y)+(_k)*16+8]
+#define OC_J(_k,_y) OC_I(_k,_y)
+    OC_COLUMN_IDCT_10(Y)
 #undef  OC_I
 #undef  OC_J
 #undef  CONSTS
 #undef  Y
+#undef  X
+  }
+#define X ecx
+  __asm{
+    pxor mm0,mm0;
+    mov X,_x
+    movq [X+0x00],mm0
+    movq [X+0x10],mm0
+    movq [X+0x20],mm0
+    movq [X+0x30],mm0
   }
+#undef  X
 }
 
 /*Performs an inverse 8x8 Type-II DCT transform.
   The input is assumed to be scaled by a factor of 4 relative to orthonormal
    version of the transform.*/
-void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
+void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
   /*_last_zzi is subtly different from an actual count of the number of
      coefficients we decoded for this block.
     It contains the value of zzi BEFORE the final token in the block was
@@ -555,8 +585,8 @@ void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
      gets.
     Needless to say we inherited this approach from VP3.*/
   /*Perform the iDCT.*/
-  if(_last_zzi<10)oc_idct8x8_10(_y);
-  else oc_idct8x8_slow(_y);
+  if(_last_zzi<=10)oc_idct8x8_10(_y,_x);
+  else oc_idct8x8_slow(_y,_x);
 }
 
 #endif
diff --git a/thirdparty/libtheora/x86_vc/mmxstate.c b/thirdparty/libtheora/x86_vc/mmxstate.c
index 73bd1981cf..f532ee1b6f 100644
--- a/thirdparty/libtheora/x86_vc/mmxstate.c
+++ b/thirdparty/libtheora/x86_vc/mmxstate.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: mmxstate.c 16584 2009-09-26 19:35:55Z tterribe $
+    last mod: $Id$
 
  ********************************************************************/
 
@@ -19,17 +19,16 @@
   Originally written by Rudolf Marek.*/
 #include <string.h>
 #include "x86int.h"
-#include "mmxfrag.h"
 #include "mmxloop.h"
 
 #if defined(OC_X86_ASM)
 
 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
   unsigned char *dst;
   ptrdiff_t      frag_buf_off;
   int            ystride;
-  int            mb_mode;
+  int            refi;
   /*Apply the inverse transform.*/
   /*Special case only having a DC component.*/
   if(_last_zzi<2){
@@ -45,6 +44,7 @@ void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
 #define P ecx
       mov Y,_dct_coeffs
       movzx P,p
+      lea Y,[Y+128]
       /*mm0=0000 0000 0000 AAAA*/
       movd mm0,P
       /*mm0=0000 0000 AAAA AAAA*/
@@ -74,65 +74,32 @@ void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
   else{
     /*Dequantize the DC coefficient.*/
     _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
-    oc_idct8x8_mmx(_dct_coeffs,_last_zzi);
+    oc_idct8x8_mmx(_dct_coeffs+64,_dct_coeffs,_last_zzi);
   }
   /*Fill in the target buffer.*/
   frag_buf_off=_state->frag_buf_offs[_fragi];
-  mb_mode=_state->frags[_fragi].mb_mode;
+  refi=_state->frags[_fragi].refi;
   ystride=_state->ref_ystride[_pli];
-  dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
-  if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs);
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
   else{
     const unsigned char *ref;
     int                  mvoffsets[2];
-    ref=
-     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
-     +frag_buf_off;
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
-     _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
+     _state->frag_mvs[_fragi])>1){
       oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
-       _dct_coeffs);
+       _dct_coeffs+64);
     }
-    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs);
+    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
   }
 }
 
 /*We copy these entire function to inline the actual MMX routines so that we
    use only a single indirect call.*/
 
-/*Copies the fragments specified by the lists of fragment indices from one
-   frame to another.
-  _fragis:    A pointer to a list of fragment indices.
-  _nfragis:   The number of fragment indices to copy.
-  _dst_frame: The reference frame to copy to.
-  _src_frame: The reference frame to copy from.
-  _pli:       The color plane the fragments lie in.*/
-void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli){
-  const ptrdiff_t     *frag_buf_offs;
-  const unsigned char *src_frame_data;
-  unsigned char       *dst_frame_data;
-  ptrdiff_t            fragii;
-  int                  ystride;
-  dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
-  src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
-  ystride=_state->ref_ystride[_pli];
-  frag_buf_offs=_state->frag_buf_offs;
-  for(fragii=0;fragii<_nfragis;fragii++){
-    ptrdiff_t frag_buf_off;
-    frag_buf_off=frag_buf_offs[_fragis[fragii]];
-#define SRC edx
-#define DST eax
-#define YSTRIDE ecx
-#define YSTRIDE3 edi
-    OC_FRAG_COPY_MMX(dst_frame_data+frag_buf_off,
-     src_frame_data+frag_buf_off,ystride);
-#undef SRC
-#undef DST
-#undef YSTRIDE
-#undef YSTRIDE3
-  }
+void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
+  memset(_bv,~(_flimit<<1),8);
 }
 
 /*Apply the loop filter to a given set of fragment rows in the given plane.
@@ -144,8 +111,7 @@ void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
   _fragy0:    The Y coordinate of the first fragment row to filter.
   _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
 void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
-  OC_ALIGN8(unsigned char  ll[8]);
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
   const oc_fragment_plane *fplane;
   const oc_fragment       *frags;
   const ptrdiff_t         *frag_buf_offs;
@@ -156,13 +122,12 @@ void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
   ptrdiff_t                fragi0_end;
   int                      ystride;
   int                      nhfrags;
-  memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll));
   fplane=_state->fplanes+_pli;
   nhfrags=fplane->nhfrags;
   fragi_top=fplane->froffset;
   fragi_bot=fragi_top+fplane->nfrags;
   fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
-  fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
+  fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
   ystride=_state->ref_ystride[_pli];
   frags=_state->frags;
   frag_buf_offs=_state->frag_buf_offs;
@@ -187,13 +152,13 @@ void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
 #define LL edx
 #define D esi
 #define D_WORD si
-        if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,ll);
-        if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,ll);
+        if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,_bv);
+        if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,_bv);
         if(fragi+1<fragi_end&&!frags[fragi+1].coded){
-          OC_LOOP_FILTER_H_MMX(ref+8,ystride,ll);
+          OC_LOOP_FILTER_H_MMX(ref+8,ystride,_bv);
         }
         if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
-          OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,ll);
+          OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,_bv);
         }
 #undef PIX
 #undef YSTRIDE3
diff --git a/thirdparty/libtheora/cpu.c b/thirdparty/libtheora/x86_vc/x86cpu.c
index a863aad7f3..6a1d8d850c 100644
--- a/thirdparty/libtheora/cpu.c
+++ b/thirdparty/libtheora/x86_vc/x86cpu.c
@@ -14,41 +14,17 @@
   Originally written by Rudolf Marek.
 
  function:
-  last mod: $Id: cpu.c 16503 2009-08-22 18:14:02Z giles $
+  last mod: $Id$
 
  ********************************************************************/
 
-#include "cpu.h"
+#include "x86cpu.h"
 
 #if !defined(OC_X86_ASM)
-static ogg_uint32_t oc_cpu_flags_get(void){
+ogg_uint32_t oc_cpu_flags_get(void){
   return 0;
 }
 #else
-# if !defined(_MSC_VER)
-#  if defined(__amd64__)||defined(__x86_64__)
-/*On x86-64, gcc seems to be able to figure out how to save %rbx for us when
-   compiling with -fPIC.*/
-#   define cpuid(_op,_eax,_ebx,_ecx,_edx) \
-  __asm__ __volatile__( \
-   "cpuid\n\t" \
-   :[eax]"=a"(_eax),[ebx]"=b"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
-   :"a"(_op) \
-   :"cc" \
-  )
-#  else
-/*On x86-32, not so much.*/
-#   define cpuid(_op,_eax,_ebx,_ecx,_edx) \
-  __asm__ __volatile__( \
-   "xchgl %%ebx,%[ebx]\n\t" \
-   "cpuid\n\t" \
-   "xchgl %%ebx,%[ebx]\n\t" \
-   :[eax]"=a"(_eax),[ebx]"=r"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
-   :"a"(_op) \
-   :"cc" \
-  )
-#  endif
-# else
 /*Why does MSVC need this complicated rigamarole?
   At this point I honestly do not care.*/
 
@@ -95,7 +71,6 @@ static void oc_detect_cpuid_helper(ogg_uint32_t *_eax,ogg_uint32_t *_ebx){
     mov [ecx],ebx
   }
 }
-# endif
 
 static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
   ogg_uint32_t flags;
@@ -124,7 +99,7 @@ static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
   return flags;
 }
 
-static ogg_uint32_t oc_cpu_flags_get(void){
+ogg_uint32_t oc_cpu_flags_get(void){
   ogg_uint32_t flags;
   ogg_uint32_t eax;
   ogg_uint32_t ebx;
@@ -132,25 +107,7 @@ static ogg_uint32_t oc_cpu_flags_get(void){
   ogg_uint32_t edx;
 # if !defined(__amd64__)&&!defined(__x86_64__)
   /*Not all x86-32 chips support cpuid, so we have to check.*/
-#  if !defined(_MSC_VER)
-  __asm__ __volatile__(
-   "pushfl\n\t"
-   "pushfl\n\t"
-   "popl %[a]\n\t"
-   "movl %[a],%[b]\n\t"
-   "xorl $0x200000,%[a]\n\t"
-   "pushl %[a]\n\t"
-   "popfl\n\t"
-   "pushfl\n\t"
-   "popl %[a]\n\t"
-   "popfl\n\t"
-   :[a]"=r"(eax),[b]"=r"(ebx)
-   :
-   :"cc"
-  );
-#  else
   oc_detect_cpuid_helper(&eax,&ebx);
-#  endif
   /*No cpuid.*/
   if(eax==ebx)return 0;
 # endif
@@ -159,9 +116,18 @@ static ogg_uint32_t oc_cpu_flags_get(void){
   if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
    /*      6 8 x M          T e n i          u n e G*/
    ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
+    int family;
+    int model;
     /*Intel, Transmeta (tested with Crusoe TM5800):*/
     cpuid(1,eax,ebx,ecx,edx);
     flags=oc_parse_intel_flags(edx,ecx);
+    family=(eax>>8)&0xF;
+    model=(eax>>4)&0xF;
+    /*The SSE unit on the Pentium M and Core Duo is much slower than the MMX
+       unit, so don't use it.*/
+    if(family==6&&(model==9||model==13||model==14)){
+      flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI);
+    }
   }
   /*              D M A c          i t n e          h t u A*/
   else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
diff --git a/thirdparty/libtheora/x86_vc/x86cpu.h b/thirdparty/libtheora/x86_vc/x86cpu.h
new file mode 100644
index 0000000000..eea261d448
--- /dev/null
+++ b/thirdparty/libtheora/x86_vc/x86cpu.h
@@ -0,0 +1,36 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+ function:
+    last mod: $Id$
+
+ ********************************************************************/
+
+#if !defined(_x86_vc_x86cpu_H)
+# define _x86_vc_x86cpu_H (1)
+#include "../internal.h"
+
+#define OC_CPU_X86_MMX      (1<<0)
+#define OC_CPU_X86_3DNOW    (1<<1)
+#define OC_CPU_X86_3DNOWEXT (1<<2)
+#define OC_CPU_X86_MMXEXT   (1<<3)
+#define OC_CPU_X86_SSE      (1<<4)
+#define OC_CPU_X86_SSE2     (1<<5)
+#define OC_CPU_X86_PNI      (1<<6)
+#define OC_CPU_X86_SSSE3    (1<<7)
+#define OC_CPU_X86_SSE4_1   (1<<8)
+#define OC_CPU_X86_SSE4_2   (1<<9)
+#define OC_CPU_X86_SSE4A    (1<<10)
+#define OC_CPU_X86_SSE5     (1<<11)
+
+ogg_uint32_t oc_cpu_flags_get(void);
+
+#endif
diff --git a/thirdparty/libtheora/x86_vc/x86enc.c b/thirdparty/libtheora/x86_vc/x86enc.c
index e1960e1f0b..e9d59e85e3 100644
--- a/thirdparty/libtheora/x86_vc/x86enc.c
+++ b/thirdparty/libtheora/x86_vc/x86enc.c
@@ -18,27 +18,25 @@
 
 #if defined(OC_X86_ASM)
 
-#include "../cpu.c"
-
-void oc_enc_vtable_init_x86(oc_enc_ctx *_enc){
+void oc_enc_accel_init_x86(oc_enc_ctx *_enc){
   ogg_uint32_t cpu_flags;
-  cpu_flags=oc_cpu_flags_get();
-  oc_enc_vtable_init_c(_enc);
+  cpu_flags=_enc->state.cpu_flags;
+  oc_enc_accel_init_c(_enc);
   if(cpu_flags&OC_CPU_X86_MMX){
     _enc->opt_vtable.frag_sub=oc_enc_frag_sub_mmx;
     _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx;
     _enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
     _enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
-    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmx;
   }
   if(cpu_flags&OC_CPU_X86_MMXEXT){
     _enc->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext;
     _enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_mmxext;
     _enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_mmxext;
-    _enc->opt_vtable.frag_satd_thresh=oc_enc_frag_satd_thresh_mmxext;
-    _enc->opt_vtable.frag_satd2_thresh=oc_enc_frag_satd2_thresh_mmxext;
+    _enc->opt_vtable.frag_satd=oc_enc_frag_satd_mmxext;
+    _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_mmxext;
     _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_mmxext;
     _enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext;
+    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmxext;
   }
   if(cpu_flags&OC_CPU_X86_SSE2){
 # if defined(OC_X86_64_ASM)
diff --git a/thirdparty/libtheora/x86_vc/x86enc.h b/thirdparty/libtheora/x86_vc/x86enc.h
index 581484641f..885406a54d 100644
--- a/thirdparty/libtheora/x86_vc/x86enc.h
+++ b/thirdparty/libtheora/x86_vc/x86enc.h
@@ -17,10 +17,14 @@
 
 #if !defined(_x86_vc_x86enc_H)
 # define _x86_vc_x86enc_H (1)
-# include "../encint.h"
 # include "x86int.h"
+# if defined(OC_X86_ASM)
+#  define oc_enc_accel_init oc_enc_accel_init_x86
+#  define OC_ENC_USE_VTABLE (1)
+# endif
+# include "../encint.h"
 
-void oc_enc_vtable_init_x86(oc_enc_ctx *_enc);
+void oc_enc_accel_init_x86(oc_enc_ctx *_enc);
 
 unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
  const unsigned char *_ref,int _ystride);
@@ -29,19 +33,19 @@ unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
 unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
  const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
  unsigned _thresh);
-unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src,
- const unsigned char *_ref,int _ystride,unsigned _thresh);
-unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src,
- const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
- unsigned _thresh);
-unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,int _ystride);
+unsigned oc_enc_frag_satd_mmxext(unsigned *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_satd2_mmxext(unsigned *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
+unsigned oc_enc_frag_intra_satd_mmxext(unsigned *_dc,
+ const unsigned char *_src,int _ystride);
 void oc_enc_frag_sub_mmx(ogg_int16_t _diff[64],
  const unsigned char *_x,const unsigned char *_y,int _stride);
 void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64],
  const unsigned char *_x,int _stride);
 void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
  const unsigned char *_src1,const unsigned char *_src2,int _ystride);
-void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
 void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
 
 #endif
diff --git a/thirdparty/libtheora/x86_vc/x86int.h b/thirdparty/libtheora/x86_vc/x86int.h
index 4cca485311..318a09dca0 100644
--- a/thirdparty/libtheora/x86_vc/x86int.h
+++ b/thirdparty/libtheora/x86_vc/x86int.h
@@ -11,32 +11,39 @@
  ********************************************************************
 
   function:
-    last mod: $Id: x86int.h 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
 #if !defined(_x86_vc_x86int_H)
 # define _x86_vc_x86int_H (1)
 # include "../internal.h"
+# if defined(OC_X86_ASM)
+#  define oc_state_accel_init oc_state_accel_init_x86
+#  define OC_STATE_USE_VTABLE (1)
+# endif
+# include "../state.h"
+# include "x86cpu.h"
 
-void oc_state_vtable_init_x86(oc_theora_state *_state);
+void oc_state_accel_init_x86(oc_theora_state *_state);
 
 void oc_frag_copy_mmx(unsigned char *_dst,
  const unsigned char *_src,int _ystride);
+void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
 void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
  const ogg_int16_t *_residue);
 void oc_frag_recon_inter_mmx(unsigned char *_dst,
  const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
 void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
  const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
-void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi);
+void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
-void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli);
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit);
 void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
 void oc_restore_fpu_mmx(void);
 
 #endif
diff --git a/thirdparty/libtheora/x86_vc/x86state.c b/thirdparty/libtheora/x86_vc/x86state.c
index a786bec284..fa3a0d42fc 100644
--- a/thirdparty/libtheora/x86_vc/x86state.c
+++ b/thirdparty/libtheora/x86_vc/x86state.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: x86state.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
@@ -19,8 +19,6 @@
 
 #if defined(OC_X86_ASM)
 
-#include "../cpu.c"
-
 /*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
    each quadrant of the destination.*/
 static const unsigned char OC_FZIG_ZAG_MMX[128]={
@@ -42,21 +40,22 @@ static const unsigned char OC_FZIG_ZAG_MMX[128]={
   64,64,64,64,64,64,64,64,
 };
 
-void oc_state_vtable_init_x86(oc_theora_state *_state){
+void oc_state_accel_init_x86(oc_theora_state *_state){
   _state->cpu_flags=oc_cpu_flags_get();
   if(_state->cpu_flags&OC_CPU_X86_MMX){
     _state->opt_vtable.frag_copy=oc_frag_copy_mmx;
+    _state->opt_vtable.frag_copy_list=oc_frag_copy_list_mmx;
     _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
     _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
     _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
     _state->opt_vtable.idct8x8=oc_idct8x8_mmx;
     _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
-    _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_mmx;
+    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmx;
     _state->opt_vtable.state_loop_filter_frag_rows=
      oc_state_loop_filter_frag_rows_mmx;
     _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
     _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
   }
-  else oc_state_vtable_init_c(_state);
+  else oc_state_accel_init_c(_state); /*No MMX flag: defer to the C setup.*/
 }
 #endif
diff --git a/thirdparty/libtheora/x86_vc/x86zigzag.h b/thirdparty/libtheora/x86_vc/x86zigzag.h
new file mode 100644
index 0000000000..26f5ed2ea5
--- /dev/null
+++ b/thirdparty/libtheora/x86_vc/x86zigzag.h
@@ -0,0 +1,244 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_x86_vc_x86zigzag_H)
+# define _x86_vc_x86zigzag_H (1)
+# include "x86enc.h"
+
+
+/*Converts DCT coefficients from transposed order into zig-zag scan order and
+   stores them at Y, which must name an address operand at the expansion site.
+  This relies on two macros to load the contents of each row:
+   OC_ZZ_LOAD_ROW_LO(row,reg) and OC_ZZ_LOAD_ROW_HI(row,reg), which load the
+   first four and second four entries of each row into the specified register,
+   respectively.
+  OC_ZZ_LOAD_ROW_LO must be called before OC_ZZ_LOAD_ROW_HI for the same row
+   (because when the rows are already in SSE2 registers, loading the high half
+   destructively modifies the register).
+  The index of each output element in the original 64-element array should wind
+   up in the following 8x8 matrix (the letters indicate the order we compute
+   each 4-tuple below):
+    A  0  8  1  2   9 16 24 17 B
+    C 10  3  4 11  18 25 32 40 E
+    F 33 26 19 12   5  6 13 20 D
+    G 27 34 41 48  56 49 42 35 I
+    L 28 21 14  7  15 22 29 36 M
+    H 43 50 57 58  51 44 37 30 O
+    N 23 31 38 45  52 59 60 53 J
+    P 46 39 47 54  61 62 55 63 K
+  The order of the coefficients within each tuple is reversed in the comments
+   below to reflect the usual MSB to LSB notation.*/
+#define OC_TRANSPOSE_ZIG_ZAG_MMXEXT \
+  OC_ZZ_LOAD_ROW_LO(0,mm0)  /*mm0=03 02 01 00*/ \
+  OC_ZZ_LOAD_ROW_LO(1,mm1)  /*mm1=11 10 09 08*/ \
+  OC_ZZ_LOAD_ROW_LO(2,mm2)  /*mm2=19 18 17 16*/ \
+  OC_ZZ_LOAD_ROW_LO(3,mm3)  /*mm3=27 26 25 24*/ \
+  OC_ZZ_LOAD_ROW_HI(0,mm4)  /*mm4=07 06 05 04*/ \
+  OC_ZZ_LOAD_ROW_HI(1,mm5)  /*mm5=15 14 13 12*/ \
+  OC_ZZ_LOAD_ROW_HI(2,mm6)  /*mm6=23 22 21 20*/ \
+  __asm movq mm7,mm0        /*mm7=03 02 01 00*/ \
+  __asm punpckhdq mm0,mm1   /*mm0=11 10 03 02*/ \
+  __asm pshufw mm4,mm4,0x39 /*mm4=04 07 06 05*/ \
+  __asm punpcklwd mm1,mm0   /*mm1=03 09 02 08*/ \
+  __asm pshufw mm5,mm5,0x39 /*mm5=12 15 14 13*/ \
+  __asm punpcklwd mm7,mm1   /*mm7=02 01 08 00 *A*/ \
+  __asm movq [Y+0x00],mm7 \
+  __asm punpckhwd mm1,mm4   /*mm1=04 03 07 09*/ \
+  __asm movq mm7,mm2        /*mm7=19 18 17 16*/ \
+  __asm punpckhdq mm0,mm1   /*mm0=04 03 11 10*/ \
+  __asm punpckhwd mm7,mm5   /*mm7=12 19 15 18*/ \
+  __asm punpcklwd mm1,mm3   /*mm1=25 07 24 09*/ \
+  __asm punpcklwd mm5,mm6   /*mm5=21 14 20 13*/ \
+  __asm punpcklwd mm1,mm2   /*mm1=17 24 16 09 *B*/ \
+  OC_ZZ_LOAD_ROW_LO(4,mm2)  /*mm2=35 34 33 32*/ \
+  __asm movq [Y+0x08],mm1 \
+  OC_ZZ_LOAD_ROW_LO(5,mm1)  /*mm1=43 42 41 40*/ \
+  __asm pshufw mm0,mm0,0x78 /*mm0=11 04 03 10 *C*/ \
+  __asm movq [Y+0x10],mm0 \
+  __asm punpckhdq mm6,mm4   /*mm6=?? 07 23 22*/ \
+  __asm punpckldq mm4,mm5   /*mm4=20 13 06 05 *D*/ \
+  __asm movq [Y+0x28],mm4 \
+  __asm psrlq mm3,16        /*mm3=.. 27 26 25*/ \
+  __asm pshufw mm0,mm2,0x0E /*mm0=?? ?? 35 34*/ \
+  __asm movq mm4,mm7        /*mm4=12 19 15 18*/ \
+  __asm punpcklwd mm2,mm3   /*mm2=26 33 25 32*/ \
+  __asm punpcklwd mm4,mm1   /*mm4=41 15 40 18*/ \
+  __asm punpckhwd mm3,mm1   /*mm3=43 .. 42 27*/ \
+  __asm punpckldq mm4,mm2   /*mm4=25 32 40 18*/ \
+  __asm punpcklwd mm3,mm0   /*mm3=35 42 34 27*/ \
+  OC_ZZ_LOAD_ROW_LO(6,mm0)  /*mm0=51 50 49 48*/ \
+  __asm pshufw mm4,mm4,0x6C /*mm4=40 32 25 18 *E*/ \
+  __asm movq [Y+0x18],mm4 \
+  OC_ZZ_LOAD_ROW_LO(7,mm4)  /*mm4=59 58 57 56*/ \
+  __asm punpckhdq mm2,mm7   /*mm2=12 19 26 33 *F*/ \
+  __asm movq [Y+0x20],mm2 \
+  __asm pshufw mm1,mm1,0xD0 /*mm1=43 41 ?? ??*/ \
+  __asm pshufw mm0,mm0,0x87 /*mm0=50 48 49 51*/ \
+  __asm movq mm2,mm3        /*mm2=35 42 34 27*/ \
+  __asm punpckhwd mm1,mm0   /*mm1=50 43 48 41*/ \
+  __asm pshufw mm4,mm4,0x93 /*mm4=58 57 56 59*/ \
+  __asm punpckldq mm3,mm1   /*mm3=48 41 34 27 *G*/ \
+  __asm movq [Y+0x30],mm3 \
+  __asm punpckhdq mm1,mm4   /*mm1=58 57 50 43 *H*/ \
+  __asm movq [Y+0x50],mm1 \
+  OC_ZZ_LOAD_ROW_HI(7,mm1)  /*mm1=63 62 61 60*/ \
+  __asm punpcklwd mm4,mm0   /*mm4=49 56 51 59*/ \
+  OC_ZZ_LOAD_ROW_HI(6,mm0)  /*mm0=55 54 53 52*/ \
+  __asm psllq mm6,16        /*mm6=07 23 22 ..*/ \
+  __asm movq mm3,mm4        /*mm3=49 56 51 59*/ \
+  __asm punpckhdq mm4,mm2   /*mm4=35 42 49 56 *I*/ \
+  OC_ZZ_LOAD_ROW_HI(3,mm2)  /*mm2=31 30 29 28*/ \
+  __asm movq [Y+0x38],mm4 \
+  __asm punpcklwd mm3,mm1   /*mm3=61 51 60 59*/ \
+  __asm punpcklwd mm7,mm6   /*mm7=22 15 .. ??*/ \
+  __asm movq mm4,mm3        /*mm4=61 51 60 59*/ \
+  __asm punpcklwd mm3,mm0   /*mm3=53 60 52 59*/ \
+  __asm punpckhwd mm4,mm0   /*mm4=55 61 54 51*/ \
+  OC_ZZ_LOAD_ROW_HI(4,mm0)  /*mm0=39 38 37 36*/ \
+  __asm pshufw mm3,mm3,0xE1 /*mm3=53 60 59 52 *J*/ \
+  __asm movq [Y+0x68],mm3 \
+  __asm movq mm3,mm4        /*mm3=?? ?? 54 51*/ \
+  __asm pshufw mm2,mm2,0x39 /*mm2=28 31 30 29*/ \
+  __asm punpckhwd mm4,mm1   /*mm4=63 55 62 61 *K*/ \
+  OC_ZZ_LOAD_ROW_HI(5,mm1)  /*mm1=47 46 45 44*/ \
+  __asm movq [Y+0x78],mm4 \
+  __asm punpckhwd mm6,mm2   /*mm6=28 07 31 23*/ \
+  __asm punpcklwd mm2,mm0   /*mm2=37 30 36 29*/ \
+  __asm punpckhdq mm5,mm6   /*mm5=28 07 21 14*/ \
+  __asm pshufw mm2,mm2,0x4B /*mm2=36 29 30 37*/ \
+  __asm pshufw mm5,mm5,0x87 /*mm5=07 14 21 28 *L*/ \
+  __asm movq [Y+0x40],mm5 \
+  __asm punpckhdq mm7,mm2   /*mm7=36 29 22 15 *M*/ \
+  __asm movq [Y+0x48],mm7 \
+  __asm pshufw mm1,mm1,0x9C /*mm1=46 45 47 44*/ \
+  __asm punpckhwd mm0,mm1   /*mm0=46 39 45 38*/ \
+  __asm punpcklwd mm3,mm1   /*mm3=47 54 44 51*/ \
+  __asm punpckldq mm6,mm0   /*mm6=45 38 31 23 *N*/ \
+  __asm movq [Y+0x60],mm6 \
+  __asm punpckhdq mm0,mm3   /*mm0=47 54 46 39*/ \
+  __asm punpckldq mm3,mm2   /*mm3=30 37 44 51 *O*/ \
+  __asm movq [Y+0x58],mm3 \
+  __asm pshufw mm0,mm0,0xB1 /*mm0=54 47 39 46 *P*/ \
+  __asm movq [Y+0x70],mm0 \
+
+/*Converts DCT coefficients in %[dct] from natural order into zig-zag scan
+   order and stores them in %[qdct] (written as GCC extended-asm strings).
+  The index of each output element in the original 64-element array should wind
+   up in the following 8x8 matrix (the letters indicate the order we compute
+   each 4-tuple below):
+    A  0  1  8 16   9  2  3 10 B
+    C 17 24 32 25  18 11  4  5 D
+    E 12 19 26 33  40 48 41 34 I
+    H 27 20 13  6   7 14 21 28 G
+    K 35 42 49 56  57 50 43 36 J
+    F 29 22 15 23  30 37 44 51 M
+    P 58 59 52 45  38 31 39 46 L
+    N 53 60 61 54  47 55 62 63 O
+  The order of the coefficients within each tuple is reversed in the comments
+   below to reflect the usual MSB to LSB notation.*/
+#define OC_ZIG_ZAG_MMXEXT \
+  "movq 0x00(%[dct]),%%mm0\n\t"  /*mm0=03 02 01 00*/ \
+  "movq 0x08(%[dct]),%%mm1\n\t"  /*mm1=07 06 05 04*/ \
+  "movq 0x10(%[dct]),%%mm2\n\t"  /*mm2=11 10 09 08*/ \
+  "movq 0x20(%[dct]),%%mm3\n\t"  /*mm3=19 18 17 16*/ \
+  "movq 0x30(%[dct]),%%mm4\n\t"  /*mm4=27 26 25 24*/ \
+  "movq 0x40(%[dct]),%%mm5\n\t"  /*mm5=35 34 33 32*/ \
+  "movq %%mm2,%%mm7\n\t"         /*mm7=11 10 09 08*/ \
+  "punpcklwd %%mm3,%%mm2\n\t"    /*mm2=17 09 16 08*/ \
+  "movq %%mm0,%%mm6\n\t"         /*mm6=03 02 01 00*/ \
+  "punpckldq %%mm2,%%mm0\n\t"    /*mm0=16 08 01 00 *A*/ \
+  "movq %%mm0,0x00(%[qdct])\n\t" \
+  "movq 0x18(%[dct]),%%mm0\n\t"  /*mm0=15 14 13 12*/ \
+  "punpckhdq %%mm6,%%mm6\n\t"    /*mm6=03 02 03 02*/ \
+  "psrlq $16,%%mm7\n\t"          /*mm7=.. 11 10 09*/ \
+  "punpckldq %%mm7,%%mm6\n\t"    /*mm6=10 09 03 02*/ \
+  "punpckhwd %%mm7,%%mm3\n\t"    /*mm3=.. 19 11 18*/ \
+  "pshufw $0xD2,%%mm6,%%mm6\n\t" /*mm6=10 03 02 09 *B*/ \
+  "movq %%mm6,0x08(%[qdct])\n\t" \
+  "psrlq $48,%%mm2\n\t"          /*mm2=.. .. .. 17*/ \
+  "movq %%mm1,%%mm6\n\t"         /*mm6=07 06 05 04*/ \
+  "punpcklwd %%mm5,%%mm2\n\t"    /*mm2=33 .. 32 17*/ \
+  "movq %%mm3,%%mm7\n\t"         /*mm7=.. 19 11 18*/ \
+  "punpckldq %%mm1,%%mm3\n\t"    /*mm3=05 04 11 18 *D*/ \
+  "por %%mm2,%%mm7\n\t"          /*mm7=33 19 ?? ??*/ \
+  "punpcklwd %%mm4,%%mm2\n\t"    /*mm2=25 32 24 17 *C*/ \
+  "movq %%mm2,0x10(%[qdct])\n\t" \
+  "movq %%mm3,0x18(%[qdct])\n\t" \
+  "movq 0x28(%[dct]),%%mm2\n\t"  /*mm2=23 22 21 20*/ \
+  "movq 0x38(%[dct]),%%mm1\n\t"  /*mm1=31 30 29 28*/ \
+  "pshufw $0x9C,%%mm0,%%mm3\n\t" /*mm3=14 13 15 12*/ \
+  "punpckhdq %%mm7,%%mm7\n\t"    /*mm7=33 19 33 19*/ \
+  "punpckhwd %%mm3,%%mm6\n\t"    /*mm6=14 07 13 06*/ \
+  "punpckldq %%mm0,%%mm0\n\t"    /*mm0=13 12 13 12*/ \
+  "punpcklwd %%mm1,%%mm3\n\t"    /*mm3=29 15 28 12*/ \
+  "punpckhwd %%mm4,%%mm0\n\t"    /*mm0=27 13 26 12*/ \
+  "pshufw $0xB4,%%mm3,%%mm3\n\t" /*mm3=15 29 28 12*/ \
+  "psrlq $48,%%mm4\n\t"          /*mm4=.. .. .. 27*/ \
+  "punpcklwd %%mm7,%%mm0\n\t"    /*mm0=33 26 19 12 *E*/ \
+  "punpcklwd %%mm1,%%mm4\n\t"    /*mm4=29 .. 28 27*/ \
+  "punpckhwd %%mm2,%%mm3\n\t"    /*mm3=23 15 22 29 *F*/ \
+  "movq %%mm0,0x20(%[qdct])\n\t" \
+  "movq %%mm3,0x50(%[qdct])\n\t" \
+  "movq 0x60(%[dct]),%%mm3\n\t"  /*mm3=51 50 49 48*/ \
+  "movq 0x70(%[dct]),%%mm7\n\t"  /*mm7=59 58 57 56*/ \
+  "movq 0x50(%[dct]),%%mm0\n\t"  /*mm0=43 42 41 40*/ \
+  "punpcklwd %%mm4,%%mm2\n\t"    /*mm2=28 21 27 20*/ \
+  "psrlq $32,%%mm5\n\t"          /*mm5=.. .. 35 34*/ \
+  "movq %%mm2,%%mm4\n\t"         /*mm4=28 21 27 20*/ \
+  "punpckldq %%mm6,%%mm2\n\t"    /*mm2=13 06 27 20*/ \
+  "punpckhdq %%mm4,%%mm6\n\t"    /*mm6=28 21 14 07 *G*/ \
+  "movq %%mm3,%%mm4\n\t"         /*mm4=51 50 49 48*/ \
+  "pshufw $0xB1,%%mm2,%%mm2\n\t" /*mm2=06 13 20 27 *H*/ \
+  "movq %%mm2,0x30(%[qdct])\n\t" \
+  "movq %%mm6,0x38(%[qdct])\n\t" \
+  "movq 0x48(%[dct]),%%mm2\n\t"  /*mm2=39 38 37 36*/ \
+  "punpcklwd %%mm5,%%mm4\n\t"    /*mm4=35 49 34 48*/ \
+  "movq 0x58(%[dct]),%%mm5\n\t"  /*mm5=47 46 45 44*/ \
+  "punpckldq %%mm7,%%mm6\n\t"    /*mm6=57 56 14 07*/ \
+  "psrlq $32,%%mm3\n\t"          /*mm3=.. .. 51 50*/ \
+  "punpckhwd %%mm0,%%mm6\n\t"    /*mm6=43 57 42 56*/ \
+  "punpcklwd %%mm4,%%mm0\n\t"    /*mm0=34 41 48 40 *I*/ \
+  "pshufw $0x4E,%%mm6,%%mm6\n\t" /*mm6=42 56 43 57*/ \
+  "movq %%mm0,0x28(%[qdct])\n\t" \
+  "punpcklwd %%mm2,%%mm3\n\t"    /*mm3=37 51 36 50*/ \
+  "punpckhwd %%mm6,%%mm4\n\t"    /*mm4=42 35 56 49*/ \
+  "punpcklwd %%mm3,%%mm6\n\t"    /*mm6=36 43 50 57 *J*/ \
+  "pshufw $0x4E,%%mm4,%%mm4\n\t" /*mm4=56 49 42 35 *K*/ \
+  "movq %%mm4,0x40(%[qdct])\n\t" \
+  "movq %%mm6,0x48(%[qdct])\n\t" \
+  "movq 0x68(%[dct]),%%mm6\n\t"  /*mm6=55 54 53 52*/ \
+  "movq 0x78(%[dct]),%%mm0\n\t"  /*mm0=63 62 61 60*/ \
+  "psrlq $32,%%mm1\n\t"          /*mm1=.. .. 31 30*/ \
+  "pshufw $0xD8,%%mm5,%%mm5\n\t" /*mm5=47 45 46 44*/ \
+  "pshufw $0x0B,%%mm3,%%mm3\n\t" /*mm3=50 50 51 37*/ \
+  "punpcklwd %%mm5,%%mm1\n\t"    /*mm1=46 31 44 30*/ \
+  "pshufw $0xC9,%%mm6,%%mm6\n\t" /*mm6=55 52 54 53*/ \
+  "punpckhwd %%mm1,%%mm2\n\t"    /*mm2=46 39 31 38 *L*/ \
+  "punpcklwd %%mm3,%%mm1\n\t"    /*mm1=51 44 37 30 *M*/ \
+  "movq %%mm2,0x68(%[qdct])\n\t" \
+  "movq %%mm1,0x58(%[qdct])\n\t" \
+  "punpckhwd %%mm6,%%mm5\n\t"    /*mm5=55 47 52 45*/ \
+  "punpckldq %%mm0,%%mm6\n\t"    /*mm6=61 60 54 53*/ \
+  "pshufw $0x10,%%mm5,%%mm4\n\t" /*mm4=45 52 45 45*/ \
+  "pshufw $0x78,%%mm6,%%mm6\n\t" /*mm6=54 61 60 53 *N*/ \
+  "punpckhdq %%mm0,%%mm5\n\t"    /*mm5=63 62 55 47 *O*/ \
+  "punpckhdq %%mm4,%%mm7\n\t"    /*mm7=45 52 59 58 *P*/ \
+  "movq %%mm6,0x70(%[qdct])\n\t" \
+  "movq %%mm5,0x78(%[qdct])\n\t" \
+  "movq %%mm7,0x60(%[qdct])\n\t" \
+
+#endif