From ae74e78909ae0bc476112fb43b9580e969879dcd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= Date: Fri, 15 Oct 2021 12:05:32 +0200 Subject: Remove WebM support (and deps libvpx and opus) We've had many issues with WebM support and specifically the libvpx library over the years, mostly due to its poor integration in Godot's buildsystem, but without anyone really interested in improving this state. With the new GDExtensions in Godot 4.0, we intend to move video decoding to first-party extensions, and this would likely be done using something like libvlc to expose more codecs. Removing the `webm` module means we can remove libsimplewebm, libvpx and opus, which we were only used for that purpose. Both libvpx and opus were fairly complex pieces of the buildsystem, so this is a nice cleanup. This also removes the compile-time dependency on `yasm`. Fixes lots of compilation or non-working WebM issues which will be linked in the PR. --- .../libvpx/third_party/android/cpu-features.c | 1313 ---------------- .../libvpx/third_party/android/cpu-features.h | 323 ---- thirdparty/libvpx/third_party/x86inc/LICENSE | 18 - thirdparty/libvpx/third_party/x86inc/README.libvpx | 20 - thirdparty/libvpx/third_party/x86inc/x86inc.asm | 1649 -------------------- 5 files changed, 3323 deletions(-) delete mode 100644 thirdparty/libvpx/third_party/android/cpu-features.c delete mode 100644 thirdparty/libvpx/third_party/android/cpu-features.h delete mode 100644 thirdparty/libvpx/third_party/x86inc/LICENSE delete mode 100644 thirdparty/libvpx/third_party/x86inc/README.libvpx delete mode 100644 thirdparty/libvpx/third_party/x86inc/x86inc.asm (limited to 'thirdparty/libvpx/third_party') diff --git a/thirdparty/libvpx/third_party/android/cpu-features.c b/thirdparty/libvpx/third_party/android/cpu-features.c deleted file mode 100644 index e2bd749b01..0000000000 --- a/thirdparty/libvpx/third_party/android/cpu-features.c +++ /dev/null @@ -1,1313 +0,0 @@ -/* - * Copyright (C) 2010 The Android Open Source Project - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS - * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED - * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* ChangeLog for this library: - * - * NDK r10e?: Add MIPS MSA feature. - * - * NDK r10: Support for 64-bit CPUs (Intel, ARM & MIPS). - * - * NDK r8d: Add android_setCpu(). - * - * NDK r8c: Add new ARM CPU features: VFPv2, VFP_D32, VFP_FP16, - * VFP_FMA, NEON_FMA, IDIV_ARM, IDIV_THUMB2 and iWMMXt. - * - * Rewrite the code to parse /proc/self/auxv instead of - * the "Features" field in /proc/cpuinfo. - * - * Dynamically allocate the buffer that hold the content - * of /proc/cpuinfo to deal with newer hardware. - * - * NDK r7c: Fix CPU count computation. The old method only reported the - * number of _active_ CPUs when the library was initialized, - * which could be less than the real total. - * - * NDK r5: Handle buggy kernels which report a CPU Architecture number of 7 - * for an ARMv6 CPU (see below). - * - * Handle kernels that only report 'neon', and not 'vfpv3' - * (VFPv3 is mandated by the ARM architecture is Neon is implemented) - * - * Handle kernels that only report 'vfpv3d16', and not 'vfpv3' - * - * Fix x86 compilation. Report ANDROID_CPU_FAMILY_X86 in - * android_getCpuFamily(). - * - * NDK r4: Initial release - */ - -#include "cpu-features.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static pthread_once_t g_once; -static int g_inited; -static AndroidCpuFamily g_cpuFamily; -static uint64_t g_cpuFeatures; -static int g_cpuCount; - -#ifdef __arm__ -static uint32_t g_cpuIdArm; -#endif - -static const int android_cpufeatures_debug = 0; - -#define D(...) \ - do { \ - if (android_cpufeatures_debug) { \ - printf(__VA_ARGS__); fflush(stdout); \ - } \ - } while (0) - -#ifdef __i386__ -static __inline__ void x86_cpuid(int func, int values[4]) -{ - int a, b, c, d; - /* We need to preserve ebx since we're compiling PIC code */ - /* this means we can't use "=b" for the second output register */ - __asm__ __volatile__ ( \ - "push %%ebx\n" - "cpuid\n" \ - "mov %%ebx, %1\n" - "pop %%ebx\n" - : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \ - : "a" (func) \ - ); - values[0] = a; - values[1] = b; - values[2] = c; - values[3] = d; -} -#elif defined(__x86_64__) -static __inline__ void x86_cpuid(int func, int values[4]) -{ - int64_t a, b, c, d; - /* We need to preserve ebx since we're compiling PIC code */ - /* this means we can't use "=b" for the second output register */ - __asm__ __volatile__ ( \ - "push %%rbx\n" - "cpuid\n" \ - "mov %%rbx, %1\n" - "pop %%rbx\n" - : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \ - : "a" (func) \ - ); - values[0] = a; - values[1] = b; - values[2] = c; - values[3] = d; -} -#endif - -/* Get the size of a file by reading it until the end. This is needed - * because files under /proc do not always return a valid size when - * using fseek(0, SEEK_END) + ftell(). Nor can they be mmap()-ed. - */ -static int -get_file_size(const char* pathname) -{ - - int fd, result = 0; - char buffer[256]; - - fd = open(pathname, O_RDONLY); - if (fd < 0) { - D("Can't open %s: %s\n", pathname, strerror(errno)); - return -1; - } - - for (;;) { - int ret = read(fd, buffer, sizeof buffer); - if (ret < 0) { - if (errno == EINTR) - continue; - D("Error while reading %s: %s\n", pathname, strerror(errno)); - break; - } - if (ret == 0) - break; - - result += ret; - } - close(fd); - return result; -} - -/* Read the content of /proc/cpuinfo into a user-provided buffer. - * Return the length of the data, or -1 on error. Does *not* - * zero-terminate the content. Will not read more - * than 'buffsize' bytes. - */ -static int -read_file(const char* pathname, char* buffer, size_t buffsize) -{ - int fd, count; - - fd = open(pathname, O_RDONLY); - if (fd < 0) { - D("Could not open %s: %s\n", pathname, strerror(errno)); - return -1; - } - count = 0; - while (count < (int)buffsize) { - int ret = read(fd, buffer + count, buffsize - count); - if (ret < 0) { - if (errno == EINTR) - continue; - D("Error while reading from %s: %s\n", pathname, strerror(errno)); - if (count == 0) - count = -1; - break; - } - if (ret == 0) - break; - count += ret; - } - close(fd); - return count; -} - -#ifdef __arm__ -/* Extract the content of a the first occurence of a given field in - * the content of /proc/cpuinfo and return it as a heap-allocated - * string that must be freed by the caller. - * - * Return NULL if not found - */ -static char* -extract_cpuinfo_field(const char* buffer, int buflen, const char* field) -{ - int fieldlen = strlen(field); - const char* bufend = buffer + buflen; - char* result = NULL; - int len; - const char *p, *q; - - /* Look for first field occurence, and ensures it starts the line. */ - p = buffer; - for (;;) { - p = memmem(p, bufend-p, field, fieldlen); - if (p == NULL) - goto EXIT; - - if (p == buffer || p[-1] == '\n') - break; - - p += fieldlen; - } - - /* Skip to the first column followed by a space */ - p += fieldlen; - p = memchr(p, ':', bufend-p); - if (p == NULL || p[1] != ' ') - goto EXIT; - - /* Find the end of the line */ - p += 2; - q = memchr(p, '\n', bufend-p); - if (q == NULL) - q = bufend; - - /* Copy the line into a heap-allocated buffer */ - len = q-p; - result = malloc(len+1); - if (result == NULL) - goto EXIT; - - memcpy(result, p, len); - result[len] = '\0'; - -EXIT: - return result; -} - -/* Checks that a space-separated list of items contains one given 'item'. - * Returns 1 if found, 0 otherwise. - */ -static int -has_list_item(const char* list, const char* item) -{ - const char* p = list; - int itemlen = strlen(item); - - if (list == NULL) - return 0; - - while (*p) { - const char* q; - - /* skip spaces */ - while (*p == ' ' || *p == '\t') - p++; - - /* find end of current list item */ - q = p; - while (*q && *q != ' ' && *q != '\t') - q++; - - if (itemlen == q-p && !memcmp(p, item, itemlen)) - return 1; - - /* skip to next item */ - p = q; - } - return 0; -} -#endif /* __arm__ */ - -/* Parse a number starting from 'input', but not going further - * than 'limit'. Return the value into '*result'. - * - * NOTE: Does not skip over leading spaces, or deal with sign characters. - * NOTE: Ignores overflows. - * - * The function returns NULL in case of error (bad format), or the new - * position after the decimal number in case of success (which will always - * be <= 'limit'). - */ -static const char* -parse_number(const char* input, const char* limit, int base, int* result) -{ - const char* p = input; - int val = 0; - while (p < limit) { - int d = (*p - '0'); - if ((unsigned)d >= 10U) { - d = (*p - 'a'); - if ((unsigned)d >= 6U) - d = (*p - 'A'); - if ((unsigned)d >= 6U) - break; - d += 10; - } - if (d >= base) - break; - val = val*base + d; - p++; - } - if (p == input) - return NULL; - - *result = val; - return p; -} - -static const char* -parse_decimal(const char* input, const char* limit, int* result) -{ - return parse_number(input, limit, 10, result); -} - -#ifdef __arm__ -static const char* -parse_hexadecimal(const char* input, const char* limit, int* result) -{ - return parse_number(input, limit, 16, result); -} -#endif /* __arm__ */ - -/* This small data type is used to represent a CPU list / mask, as read - * from sysfs on Linux. See http://www.kernel.org/doc/Documentation/cputopology.txt - * - * For now, we don't expect more than 32 cores on mobile devices, so keep - * everything simple. - */ -typedef struct { - uint32_t mask; -} CpuList; - -static __inline__ void -cpulist_init(CpuList* list) { - list->mask = 0; -} - -static __inline__ void -cpulist_and(CpuList* list1, CpuList* list2) { - list1->mask &= list2->mask; -} - -static __inline__ void -cpulist_set(CpuList* list, int index) { - if ((unsigned)index < 32) { - list->mask |= (uint32_t)(1U << index); - } -} - -static __inline__ int -cpulist_count(CpuList* list) { - return __builtin_popcount(list->mask); -} - -/* Parse a textual list of cpus and store the result inside a CpuList object. - * Input format is the following: - * - comma-separated list of items (no spaces) - * - each item is either a single decimal number (cpu index), or a range made - * of two numbers separated by a single dash (-). Ranges are inclusive. - * - * Examples: 0 - * 2,4-127,128-143 - * 0-1 - */ -static void -cpulist_parse(CpuList* list, const char* line, int line_len) -{ - const char* p = line; - const char* end = p + line_len; - const char* q; - - /* NOTE: the input line coming from sysfs typically contains a - * trailing newline, so take care of it in the code below - */ - while (p < end && *p != '\n') - { - int val, start_value, end_value; - - /* Find the end of current item, and put it into 'q' */ - q = memchr(p, ',', end-p); - if (q == NULL) { - q = end; - } - - /* Get first value */ - p = parse_decimal(p, q, &start_value); - if (p == NULL) - goto BAD_FORMAT; - - end_value = start_value; - - /* If we're not at the end of the item, expect a dash and - * and integer; extract end value. - */ - if (p < q && *p == '-') { - p = parse_decimal(p+1, q, &end_value); - if (p == NULL) - goto BAD_FORMAT; - } - - /* Set bits CPU list bits */ - for (val = start_value; val <= end_value; val++) { - cpulist_set(list, val); - } - - /* Jump to next item */ - p = q; - if (p < end) - p++; - } - -BAD_FORMAT: - ; -} - -/* Read a CPU list from one sysfs file */ -static void -cpulist_read_from(CpuList* list, const char* filename) -{ - char file[64]; - int filelen; - - cpulist_init(list); - - filelen = read_file(filename, file, sizeof file); - if (filelen < 0) { - D("Could not read %s: %s\n", filename, strerror(errno)); - return; - } - - cpulist_parse(list, file, filelen); -} -#if defined(__aarch64__) -// see kernel header -#define HWCAP_FP (1 << 0) -#define HWCAP_ASIMD (1 << 1) -#define HWCAP_AES (1 << 3) -#define HWCAP_PMULL (1 << 4) -#define HWCAP_SHA1 (1 << 5) -#define HWCAP_SHA2 (1 << 6) -#define HWCAP_CRC32 (1 << 7) -#endif - -#if defined(__arm__) - -// See kernel header. -#define HWCAP_VFP (1 << 6) -#define HWCAP_IWMMXT (1 << 9) -#define HWCAP_NEON (1 << 12) -#define HWCAP_VFPv3 (1 << 13) -#define HWCAP_VFPv3D16 (1 << 14) -#define HWCAP_VFPv4 (1 << 16) -#define HWCAP_IDIVA (1 << 17) -#define HWCAP_IDIVT (1 << 18) - -// see kernel header -#define HWCAP2_AES (1 << 0) -#define HWCAP2_PMULL (1 << 1) -#define HWCAP2_SHA1 (1 << 2) -#define HWCAP2_SHA2 (1 << 3) -#define HWCAP2_CRC32 (1 << 4) - -// This is the list of 32-bit ARMv7 optional features that are _always_ -// supported by ARMv8 CPUs, as mandated by the ARM Architecture Reference -// Manual. -#define HWCAP_SET_FOR_ARMV8 \ - ( HWCAP_VFP | \ - HWCAP_NEON | \ - HWCAP_VFPv3 | \ - HWCAP_VFPv4 | \ - HWCAP_IDIVA | \ - HWCAP_IDIVT ) -#endif - -#if defined(__mips__) -// see kernel header -#define HWCAP_MIPS_R6 (1 << 0) -#define HWCAP_MIPS_MSA (1 << 1) -#endif - -#if defined(__arm__) || defined(__aarch64__) || defined(__mips__) - -#define AT_HWCAP 16 -#define AT_HWCAP2 26 - -// Probe the system's C library for a 'getauxval' function and call it if -// it exits, or return 0 for failure. This function is available since API -// level 20. -// -// This code does *NOT* check for '__ANDROID_API__ >= 20' to support the -// edge case where some NDK developers use headers for a platform that is -// newer than the one really targetted by their application. -// This is typically done to use newer native APIs only when running on more -// recent Android versions, and requires careful symbol management. -// -// Note that getauxval() can't really be re-implemented here, because -// its implementation does not parse /proc/self/auxv. Instead it depends -// on values that are passed by the kernel at process-init time to the -// C runtime initialization layer. -static uint32_t -get_elf_hwcap_from_getauxval(int hwcap_type) { - typedef unsigned long getauxval_func_t(unsigned long); - - dlerror(); - void* libc_handle = dlopen("libc.so", RTLD_NOW); - if (!libc_handle) { - D("Could not dlopen() C library: %s\n", dlerror()); - return 0; - } - - uint32_t ret = 0; - getauxval_func_t* func = (getauxval_func_t*) - dlsym(libc_handle, "getauxval"); - if (!func) { - D("Could not find getauxval() in C library\n"); - } else { - // Note: getauxval() returns 0 on failure. Doesn't touch errno. - ret = (uint32_t)(*func)(hwcap_type); - } - dlclose(libc_handle); - return ret; -} -#endif - -#if defined(__arm__) -// Parse /proc/self/auxv to extract the ELF HW capabilities bitmap for the -// current CPU. Note that this file is not accessible from regular -// application processes on some Android platform releases. -// On success, return new ELF hwcaps, or 0 on failure. -static uint32_t -get_elf_hwcap_from_proc_self_auxv(void) { - const char filepath[] = "/proc/self/auxv"; - int fd = TEMP_FAILURE_RETRY(open(filepath, O_RDONLY)); - if (fd < 0) { - D("Could not open %s: %s\n", filepath, strerror(errno)); - return 0; - } - - struct { uint32_t tag; uint32_t value; } entry; - - uint32_t result = 0; - for (;;) { - int ret = TEMP_FAILURE_RETRY(read(fd, (char*)&entry, sizeof entry)); - if (ret < 0) { - D("Error while reading %s: %s\n", filepath, strerror(errno)); - break; - } - // Detect end of list. - if (ret == 0 || (entry.tag == 0 && entry.value == 0)) - break; - if (entry.tag == AT_HWCAP) { - result = entry.value; - break; - } - } - close(fd); - return result; -} - -/* Compute the ELF HWCAP flags from the content of /proc/cpuinfo. - * This works by parsing the 'Features' line, which lists which optional - * features the device's CPU supports, on top of its reference - * architecture. - */ -static uint32_t -get_elf_hwcap_from_proc_cpuinfo(const char* cpuinfo, int cpuinfo_len) { - uint32_t hwcaps = 0; - long architecture = 0; - char* cpuArch = extract_cpuinfo_field(cpuinfo, cpuinfo_len, "CPU architecture"); - if (cpuArch) { - architecture = strtol(cpuArch, NULL, 10); - free(cpuArch); - - if (architecture >= 8L) { - // This is a 32-bit ARM binary running on a 64-bit ARM64 kernel. - // The 'Features' line only lists the optional features that the - // device's CPU supports, compared to its reference architecture - // which are of no use for this process. - D("Faking 32-bit ARM HWCaps on ARMv%ld CPU\n", architecture); - return HWCAP_SET_FOR_ARMV8; - } - } - - char* cpuFeatures = extract_cpuinfo_field(cpuinfo, cpuinfo_len, "Features"); - if (cpuFeatures != NULL) { - D("Found cpuFeatures = '%s'\n", cpuFeatures); - - if (has_list_item(cpuFeatures, "vfp")) - hwcaps |= HWCAP_VFP; - if (has_list_item(cpuFeatures, "vfpv3")) - hwcaps |= HWCAP_VFPv3; - if (has_list_item(cpuFeatures, "vfpv3d16")) - hwcaps |= HWCAP_VFPv3D16; - if (has_list_item(cpuFeatures, "vfpv4")) - hwcaps |= HWCAP_VFPv4; - if (has_list_item(cpuFeatures, "neon")) - hwcaps |= HWCAP_NEON; - if (has_list_item(cpuFeatures, "idiva")) - hwcaps |= HWCAP_IDIVA; - if (has_list_item(cpuFeatures, "idivt")) - hwcaps |= HWCAP_IDIVT; - if (has_list_item(cpuFeatures, "idiv")) - hwcaps |= HWCAP_IDIVA | HWCAP_IDIVT; - if (has_list_item(cpuFeatures, "iwmmxt")) - hwcaps |= HWCAP_IWMMXT; - - free(cpuFeatures); - } - return hwcaps; -} -#endif /* __arm__ */ - -/* Return the number of cpus present on a given device. - * - * To handle all weird kernel configurations, we need to compute the - * intersection of the 'present' and 'possible' CPU lists and count - * the result. - */ -static int -get_cpu_count(void) -{ - CpuList cpus_present[1]; - CpuList cpus_possible[1]; - - cpulist_read_from(cpus_present, "/sys/devices/system/cpu/present"); - cpulist_read_from(cpus_possible, "/sys/devices/system/cpu/possible"); - - /* Compute the intersection of both sets to get the actual number of - * CPU cores that can be used on this device by the kernel. - */ - cpulist_and(cpus_present, cpus_possible); - - return cpulist_count(cpus_present); -} - -static void -android_cpuInitFamily(void) -{ -#if defined(__arm__) - g_cpuFamily = ANDROID_CPU_FAMILY_ARM; -#elif defined(__i386__) - g_cpuFamily = ANDROID_CPU_FAMILY_X86; -#elif defined(__mips64) -/* Needs to be before __mips__ since the compiler defines both */ - g_cpuFamily = ANDROID_CPU_FAMILY_MIPS64; -#elif defined(__mips__) - g_cpuFamily = ANDROID_CPU_FAMILY_MIPS; -#elif defined(__aarch64__) - g_cpuFamily = ANDROID_CPU_FAMILY_ARM64; -#elif defined(__x86_64__) - g_cpuFamily = ANDROID_CPU_FAMILY_X86_64; -#else - g_cpuFamily = ANDROID_CPU_FAMILY_UNKNOWN; -#endif -} - -static void -android_cpuInit(void) -{ - char* cpuinfo = NULL; - int cpuinfo_len; - - android_cpuInitFamily(); - - g_cpuFeatures = 0; - g_cpuCount = 1; - g_inited = 1; - - cpuinfo_len = get_file_size("/proc/cpuinfo"); - if (cpuinfo_len < 0) { - D("cpuinfo_len cannot be computed!"); - return; - } - cpuinfo = malloc(cpuinfo_len); - if (cpuinfo == NULL) { - D("cpuinfo buffer could not be allocated"); - return; - } - cpuinfo_len = read_file("/proc/cpuinfo", cpuinfo, cpuinfo_len); - D("cpuinfo_len is (%d):\n%.*s\n", cpuinfo_len, - cpuinfo_len >= 0 ? cpuinfo_len : 0, cpuinfo); - - if (cpuinfo_len < 0) /* should not happen */ { - free(cpuinfo); - return; - } - - /* Count the CPU cores, the value may be 0 for single-core CPUs */ - g_cpuCount = get_cpu_count(); - if (g_cpuCount == 0) { - g_cpuCount = 1; - } - - D("found cpuCount = %d\n", g_cpuCount); - -#ifdef __arm__ - { - /* Extract architecture from the "CPU Architecture" field. - * The list is well-known, unlike the the output of - * the 'Processor' field which can vary greatly. - * - * See the definition of the 'proc_arch' array in - * $KERNEL/arch/arm/kernel/setup.c and the 'c_show' function in - * same file. - */ - char* cpuArch = extract_cpuinfo_field(cpuinfo, cpuinfo_len, "CPU architecture"); - - if (cpuArch != NULL) { - char* end; - long archNumber; - int hasARMv7 = 0; - - D("found cpuArch = '%s'\n", cpuArch); - - /* read the initial decimal number, ignore the rest */ - archNumber = strtol(cpuArch, &end, 10); - - /* Note that ARMv8 is upwards compatible with ARMv7. */ - if (end > cpuArch && archNumber >= 7) { - hasARMv7 = 1; - } - - /* Unfortunately, it seems that certain ARMv6-based CPUs - * report an incorrect architecture number of 7! - * - * See http://code.google.com/p/android/issues/detail?id=10812 - * - * We try to correct this by looking at the 'elf_format' - * field reported by the 'Processor' field, which is of the - * form of "(v7l)" for an ARMv7-based CPU, and "(v6l)" for - * an ARMv6-one. - */ - if (hasARMv7) { - char* cpuProc = extract_cpuinfo_field(cpuinfo, cpuinfo_len, - "Processor"); - if (cpuProc != NULL) { - D("found cpuProc = '%s'\n", cpuProc); - if (has_list_item(cpuProc, "(v6l)")) { - D("CPU processor and architecture mismatch!!\n"); - hasARMv7 = 0; - } - free(cpuProc); - } - } - - if (hasARMv7) { - g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_ARMv7; - } - - /* The LDREX / STREX instructions are available from ARMv6 */ - if (archNumber >= 6) { - g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_LDREX_STREX; - } - - free(cpuArch); - } - - /* Extract the list of CPU features from ELF hwcaps */ - uint32_t hwcaps = 0; - hwcaps = get_elf_hwcap_from_getauxval(AT_HWCAP); - if (!hwcaps) { - D("Parsing /proc/self/auxv to extract ELF hwcaps!\n"); - hwcaps = get_elf_hwcap_from_proc_self_auxv(); - } - if (!hwcaps) { - // Parsing /proc/self/auxv will fail from regular application - // processes on some Android platform versions, when this happens - // parse proc/cpuinfo instead. - D("Parsing /proc/cpuinfo to extract ELF hwcaps!\n"); - hwcaps = get_elf_hwcap_from_proc_cpuinfo(cpuinfo, cpuinfo_len); - } - - if (hwcaps != 0) { - int has_vfp = (hwcaps & HWCAP_VFP); - int has_vfpv3 = (hwcaps & HWCAP_VFPv3); - int has_vfpv3d16 = (hwcaps & HWCAP_VFPv3D16); - int has_vfpv4 = (hwcaps & HWCAP_VFPv4); - int has_neon = (hwcaps & HWCAP_NEON); - int has_idiva = (hwcaps & HWCAP_IDIVA); - int has_idivt = (hwcaps & HWCAP_IDIVT); - int has_iwmmxt = (hwcaps & HWCAP_IWMMXT); - - // The kernel does a poor job at ensuring consistency when - // describing CPU features. So lots of guessing is needed. - - // 'vfpv4' implies VFPv3|VFP_FMA|FP16 - if (has_vfpv4) - g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv3 | - ANDROID_CPU_ARM_FEATURE_VFP_FP16 | - ANDROID_CPU_ARM_FEATURE_VFP_FMA; - - // 'vfpv3' or 'vfpv3d16' imply VFPv3. Note that unlike GCC, - // a value of 'vfpv3' doesn't necessarily mean that the D32 - // feature is present, so be conservative. All CPUs in the - // field that support D32 also support NEON, so this should - // not be a problem in practice. - if (has_vfpv3 || has_vfpv3d16) - g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv3; - - // 'vfp' is super ambiguous. Depending on the kernel, it can - // either mean VFPv2 or VFPv3. Make it depend on ARMv7. - if (has_vfp) { - if (g_cpuFeatures & ANDROID_CPU_ARM_FEATURE_ARMv7) - g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv3; - else - g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv2; - } - - // Neon implies VFPv3|D32, and if vfpv4 is detected, NEON_FMA - if (has_neon) { - g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv3 | - ANDROID_CPU_ARM_FEATURE_NEON | - ANDROID_CPU_ARM_FEATURE_VFP_D32; - if (has_vfpv4) - g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_NEON_FMA; - } - - // VFPv3 implies VFPv2 and ARMv7 - if (g_cpuFeatures & ANDROID_CPU_ARM_FEATURE_VFPv3) - g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv2 | - ANDROID_CPU_ARM_FEATURE_ARMv7; - - if (has_idiva) - g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_IDIV_ARM; - if (has_idivt) - g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_IDIV_THUMB2; - - if (has_iwmmxt) - g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_iWMMXt; - } - - /* Extract the list of CPU features from ELF hwcaps2 */ - uint32_t hwcaps2 = 0; - hwcaps2 = get_elf_hwcap_from_getauxval(AT_HWCAP2); - if (hwcaps2 != 0) { - int has_aes = (hwcaps2 & HWCAP2_AES); - int has_pmull = (hwcaps2 & HWCAP2_PMULL); - int has_sha1 = (hwcaps2 & HWCAP2_SHA1); - int has_sha2 = (hwcaps2 & HWCAP2_SHA2); - int has_crc32 = (hwcaps2 & HWCAP2_CRC32); - - if (has_aes) - g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_AES; - if (has_pmull) - g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_PMULL; - if (has_sha1) - g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_SHA1; - if (has_sha2) - g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_SHA2; - if (has_crc32) - g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_CRC32; - } - /* Extract the cpuid value from various fields */ - // The CPUID value is broken up in several entries in /proc/cpuinfo. - // This table is used to rebuild it from the entries. - static const struct CpuIdEntry { - const char* field; - char format; - char bit_lshift; - char bit_length; - } cpu_id_entries[] = { - { "CPU implementer", 'x', 24, 8 }, - { "CPU variant", 'x', 20, 4 }, - { "CPU part", 'x', 4, 12 }, - { "CPU revision", 'd', 0, 4 }, - }; - size_t i; - D("Parsing /proc/cpuinfo to recover CPUID\n"); - for (i = 0; - i < sizeof(cpu_id_entries)/sizeof(cpu_id_entries[0]); - ++i) { - const struct CpuIdEntry* entry = &cpu_id_entries[i]; - char* value = extract_cpuinfo_field(cpuinfo, - cpuinfo_len, - entry->field); - if (value == NULL) - continue; - - D("field=%s value='%s'\n", entry->field, value); - char* value_end = value + strlen(value); - int val = 0; - const char* start = value; - const char* p; - if (value[0] == '0' && (value[1] == 'x' || value[1] == 'X')) { - start += 2; - p = parse_hexadecimal(start, value_end, &val); - } else if (entry->format == 'x') - p = parse_hexadecimal(value, value_end, &val); - else - p = parse_decimal(value, value_end, &val); - - if (p > (const char*)start) { - val &= ((1 << entry->bit_length)-1); - val <<= entry->bit_lshift; - g_cpuIdArm |= (uint32_t) val; - } - - free(value); - } - - // Handle kernel configuration bugs that prevent the correct - // reporting of CPU features. - static const struct CpuFix { - uint32_t cpuid; - uint64_t or_flags; - } cpu_fixes[] = { - /* The Nexus 4 (Qualcomm Krait) kernel configuration - * forgets to report IDIV support. */ - { 0x510006f2, ANDROID_CPU_ARM_FEATURE_IDIV_ARM | - ANDROID_CPU_ARM_FEATURE_IDIV_THUMB2 }, - { 0x510006f3, ANDROID_CPU_ARM_FEATURE_IDIV_ARM | - ANDROID_CPU_ARM_FEATURE_IDIV_THUMB2 }, - }; - size_t n; - for (n = 0; n < sizeof(cpu_fixes)/sizeof(cpu_fixes[0]); ++n) { - const struct CpuFix* entry = &cpu_fixes[n]; - - if (g_cpuIdArm == entry->cpuid) - g_cpuFeatures |= entry->or_flags; - } - - // Special case: The emulator-specific Android 4.2 kernel fails - // to report support for the 32-bit ARM IDIV instruction. - // Technically, this is a feature of the virtual CPU implemented - // by the emulator. Note that it could also support Thumb IDIV - // in the future, and this will have to be slightly updated. - char* hardware = extract_cpuinfo_field(cpuinfo, - cpuinfo_len, - "Hardware"); - if (hardware) { - if (!strcmp(hardware, "Goldfish") && - g_cpuIdArm == 0x4100c080 && - (g_cpuFamily & ANDROID_CPU_ARM_FEATURE_ARMv7) != 0) { - g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_IDIV_ARM; - } - free(hardware); - } - } -#endif /* __arm__ */ -#ifdef __aarch64__ - { - /* Extract the list of CPU features from ELF hwcaps */ - uint32_t hwcaps = 0; - hwcaps = get_elf_hwcap_from_getauxval(AT_HWCAP); - if (hwcaps != 0) { - int has_fp = (hwcaps & HWCAP_FP); - int has_asimd = (hwcaps & HWCAP_ASIMD); - int has_aes = (hwcaps & HWCAP_AES); - int has_pmull = (hwcaps & HWCAP_PMULL); - int has_sha1 = (hwcaps & HWCAP_SHA1); - int has_sha2 = (hwcaps & HWCAP_SHA2); - int has_crc32 = (hwcaps & HWCAP_CRC32); - - if(has_fp == 0) { - D("ERROR: Floating-point unit missing, but is required by Android on AArch64 CPUs\n"); - } - if(has_asimd == 0) { - D("ERROR: ASIMD unit missing, but is required by Android on AArch64 CPUs\n"); - } - - if (has_fp) - g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_FP; - if (has_asimd) - g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_ASIMD; - if (has_aes) - g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_AES; - if (has_pmull) - g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_PMULL; - if (has_sha1) - g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_SHA1; - if (has_sha2) - g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_SHA2; - if (has_crc32) - g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_CRC32; - } - } -#endif /* __aarch64__ */ - -#if defined(__i386__) || defined(__x86_64__) - int regs[4]; - -/* According to http://en.wikipedia.org/wiki/CPUID */ -#define VENDOR_INTEL_b 0x756e6547 -#define VENDOR_INTEL_c 0x6c65746e -#define VENDOR_INTEL_d 0x49656e69 - - x86_cpuid(0, regs); - int vendorIsIntel = (regs[1] == VENDOR_INTEL_b && - regs[2] == VENDOR_INTEL_c && - regs[3] == VENDOR_INTEL_d); - - x86_cpuid(1, regs); - if ((regs[2] & (1 << 9)) != 0) { - g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_SSSE3; - } - if ((regs[2] & (1 << 23)) != 0) { - g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_POPCNT; - } - if ((regs[2] & (1 << 19)) != 0) { - g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_SSE4_1; - } - if ((regs[2] & (1 << 20)) != 0) { - g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_SSE4_2; - } - if (vendorIsIntel && (regs[2] & (1 << 22)) != 0) { - g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_MOVBE; - } - if ((regs[2] & (1 << 25)) != 0) { - g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_AES_NI; - } - if ((regs[2] & (1 << 28)) != 0) { - g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_AVX; - } - if ((regs[2] & (1 << 30)) != 0) { - g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_RDRAND; - } - - x86_cpuid(7, regs); - if ((regs[1] & (1 << 5)) != 0) { - g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_AVX2; - } - if ((regs[1] & (1 << 29)) != 0) { - g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_SHA_NI; - } - - -#endif -#if defined( __mips__) - { /* MIPS and MIPS64 */ - /* Extract the list of CPU features from ELF hwcaps */ - uint32_t hwcaps = 0; - hwcaps = get_elf_hwcap_from_getauxval(AT_HWCAP); - if (hwcaps != 0) { - int has_r6 = (hwcaps & HWCAP_MIPS_R6); - int has_msa = (hwcaps & HWCAP_MIPS_MSA); - if (has_r6) - g_cpuFeatures |= ANDROID_CPU_MIPS_FEATURE_R6; - if (has_msa) - g_cpuFeatures |= ANDROID_CPU_MIPS_FEATURE_MSA; - } - } -#endif /* __mips__ */ - - free(cpuinfo); -} - - -AndroidCpuFamily -android_getCpuFamily(void) -{ - pthread_once(&g_once, android_cpuInit); - return g_cpuFamily; -} - - -uint64_t -android_getCpuFeatures(void) -{ - pthread_once(&g_once, android_cpuInit); - return g_cpuFeatures; -} - - -int -android_getCpuCount(void) -{ - pthread_once(&g_once, android_cpuInit); - return g_cpuCount; -} - -static void -android_cpuInitDummy(void) -{ - g_inited = 1; -} - -int -android_setCpu(int cpu_count, uint64_t cpu_features) -{ - /* Fail if the library was already initialized. */ - if (g_inited) - return 0; - - android_cpuInitFamily(); - g_cpuCount = (cpu_count <= 0 ? 1 : cpu_count); - g_cpuFeatures = cpu_features; - pthread_once(&g_once, android_cpuInitDummy); - - return 1; -} - -#ifdef __arm__ -uint32_t -android_getCpuIdArm(void) -{ - pthread_once(&g_once, android_cpuInit); - return g_cpuIdArm; -} - -int -android_setCpuArm(int cpu_count, uint64_t cpu_features, uint32_t cpu_id) -{ - if (!android_setCpu(cpu_count, cpu_features)) - return 0; - - g_cpuIdArm = cpu_id; - return 1; -} -#endif /* __arm__ */ - -/* - * Technical note: Making sense of ARM's FPU architecture versions. - * - * FPA was ARM's first attempt at an FPU architecture. There is no Android - * device that actually uses it since this technology was already obsolete - * when the project started. If you see references to FPA instructions - * somewhere, you can be sure that this doesn't apply to Android at all. - * - * FPA was followed by "VFP", soon renamed "VFPv1" due to the emergence of - * new versions / additions to it. ARM considers this obsolete right now, - * and no known Android device implements it either. - * - * VFPv2 added a few instructions to VFPv1, and is an *optional* extension - * supported by some ARMv5TE, ARMv6 and ARMv6T2 CPUs. Note that a device - * supporting the 'armeabi' ABI doesn't necessarily support these. - * - * VFPv3-D16 adds a few instructions on top of VFPv2 and is typically used - * on ARMv7-A CPUs which implement a FPU. Note that it is also mandated - * by the Android 'armeabi-v7a' ABI. The -D16 suffix in its name means - * that it provides 16 double-precision FPU registers (d0-d15) and 32 - * single-precision ones (s0-s31) which happen to be mapped to the same - * register banks. - * - * VFPv3-D32 is the name of an extension to VFPv3-D16 that provides 16 - * additional double precision registers (d16-d31). Note that there are - * still only 32 single precision registers. - * - * VFPv3xD is a *subset* of VFPv3-D16 that only provides single-precision - * registers. It is only used on ARMv7-M (i.e. on micro-controllers) which - * are not supported by Android. Note that it is not compatible with VFPv2. - * - * NOTE: The term 'VFPv3' usually designate either VFPv3-D16 or VFPv3-D32 - * depending on context. For example GCC uses it for VFPv3-D32, but - * the Linux kernel code uses it for VFPv3-D16 (especially in - * /proc/cpuinfo). Always try to use the full designation when - * possible. - * - * NEON, a.k.a. "ARM Advanced SIMD" is an extension that provides - * instructions to perform parallel computations on vectors of 8, 16, - * 32, 64 and 128 bit quantities. NEON requires VFPv32-D32 since all - * NEON registers are also mapped to the same register banks. - * - * VFPv4-D16, adds a few instructions on top of VFPv3-D16 in order to - * perform fused multiply-accumulate on VFP registers, as well as - * half-precision (16-bit) conversion operations. - * - * VFPv4-D32 is VFPv4-D16 with 32, instead of 16, FPU double precision - * registers. - * - * VPFv4-NEON is VFPv4-D32 with NEON instructions. It also adds fused - * multiply-accumulate instructions that work on the NEON registers. - * - * NOTE: Similarly, "VFPv4" might either reference VFPv4-D16 or VFPv4-D32 - * depending on context. - * - * The following information was determined by scanning the binutils-2.22 - * sources: - * - * Basic VFP instruction subsets: - * - * #define FPU_VFP_EXT_V1xD 0x08000000 // Base VFP instruction set. - * #define FPU_VFP_EXT_V1 0x04000000 // Double-precision insns. - * #define FPU_VFP_EXT_V2 0x02000000 // ARM10E VFPr1. - * #define FPU_VFP_EXT_V3xD 0x01000000 // VFPv3 single-precision. - * #define FPU_VFP_EXT_V3 0x00800000 // VFPv3 double-precision. - * #define FPU_NEON_EXT_V1 0x00400000 // Neon (SIMD) insns. - * #define FPU_VFP_EXT_D32 0x00200000 // Registers D16-D31. - * #define FPU_VFP_EXT_FP16 0x00100000 // Half-precision extensions. - * #define FPU_NEON_EXT_FMA 0x00080000 // Neon fused multiply-add - * #define FPU_VFP_EXT_FMA 0x00040000 // VFP fused multiply-add - * - * FPU types (excluding NEON) - * - * FPU_VFP_V1xD (EXT_V1xD) - * | - * +--------------------------+ - * | | - * FPU_VFP_V1 (+EXT_V1) FPU_VFP_V3xD (+EXT_V2+EXT_V3xD) - * | | - * | | - * FPU_VFP_V2 (+EXT_V2) FPU_VFP_V4_SP_D16 (+EXT_FP16+EXT_FMA) - * | - * FPU_VFP_V3D16 (+EXT_Vx3D+EXT_V3) - * | - * +--------------------------+ - * | | - * FPU_VFP_V3 (+EXT_D32) FPU_VFP_V4D16 (+EXT_FP16+EXT_FMA) - * | | - * | FPU_VFP_V4 (+EXT_D32) - * | - * FPU_VFP_HARD (+EXT_FMA+NEON_EXT_FMA) - * - * VFP architectures: - * - * ARCH_VFP_V1xD (EXT_V1xD) - * | - * +------------------+ - * | | - * | ARCH_VFP_V3xD (+EXT_V2+EXT_V3xD) - * | | - * | ARCH_VFP_V3xD_FP16 (+EXT_FP16) - * | | - * | ARCH_VFP_V4_SP_D16 (+EXT_FMA) - * | - * ARCH_VFP_V1 (+EXT_V1) - * | - * ARCH_VFP_V2 (+EXT_V2) - * | - * ARCH_VFP_V3D16 (+EXT_V3xD+EXT_V3) - * | - * +-------------------+ - * | | - * | ARCH_VFP_V3D16_FP16 (+EXT_FP16) - * | - * +-------------------+ - * | | - * | ARCH_VFP_V4_D16 (+EXT_FP16+EXT_FMA) - * | | - * | ARCH_VFP_V4 (+EXT_D32) - * | | - * | ARCH_NEON_VFP_V4 (+EXT_NEON+EXT_NEON_FMA) - * | - * ARCH_VFP_V3 (+EXT_D32) - * | - * +-------------------+ - * | | - * | ARCH_VFP_V3_FP16 (+EXT_FP16) - * | - * ARCH_VFP_V3_PLUS_NEON_V1 (+EXT_NEON) - * | - * ARCH_NEON_FP16 (+EXT_FP16) - * - * -fpu= values and their correspondance with FPU architectures above: - * - * {"vfp", FPU_ARCH_VFP_V2}, - * {"vfp9", FPU_ARCH_VFP_V2}, - * {"vfp3", FPU_ARCH_VFP_V3}, // For backwards compatbility. - * {"vfp10", FPU_ARCH_VFP_V2}, - * {"vfp10-r0", FPU_ARCH_VFP_V1}, - * {"vfpxd", FPU_ARCH_VFP_V1xD}, - * {"vfpv2", FPU_ARCH_VFP_V2}, - * {"vfpv3", FPU_ARCH_VFP_V3}, - * {"vfpv3-fp16", FPU_ARCH_VFP_V3_FP16}, - * {"vfpv3-d16", FPU_ARCH_VFP_V3D16}, - * {"vfpv3-d16-fp16", FPU_ARCH_VFP_V3D16_FP16}, - * {"vfpv3xd", FPU_ARCH_VFP_V3xD}, - * {"vfpv3xd-fp16", FPU_ARCH_VFP_V3xD_FP16}, - * {"neon", FPU_ARCH_VFP_V3_PLUS_NEON_V1}, - * {"neon-fp16", FPU_ARCH_NEON_FP16}, - * {"vfpv4", FPU_ARCH_VFP_V4}, - * {"vfpv4-d16", FPU_ARCH_VFP_V4D16}, - * {"fpv4-sp-d16", FPU_ARCH_VFP_V4_SP_D16}, - * {"neon-vfpv4", FPU_ARCH_NEON_VFP_V4}, - * - * - * Simplified diagram that only includes FPUs supported by Android: - * Only ARCH_VFP_V3D16 is actually mandated by the armeabi-v7a ABI, - * all others are optional and must be probed at runtime. - * - * ARCH_VFP_V3D16 (EXT_V1xD+EXT_V1+EXT_V2+EXT_V3xD+EXT_V3) - * | - * +-------------------+ - * | | - * | ARCH_VFP_V3D16_FP16 (+EXT_FP16) - * | - * +-------------------+ - * | | - * | ARCH_VFP_V4_D16 (+EXT_FP16+EXT_FMA) - * | | - * | ARCH_VFP_V4 (+EXT_D32) - * | | - * | ARCH_NEON_VFP_V4 (+EXT_NEON+EXT_NEON_FMA) - * | - * ARCH_VFP_V3 (+EXT_D32) - * | - * +-------------------+ - * | | - * | ARCH_VFP_V3_FP16 (+EXT_FP16) - * | - * ARCH_VFP_V3_PLUS_NEON_V1 (+EXT_NEON) - * | - * ARCH_NEON_FP16 (+EXT_FP16) - * - */ diff --git a/thirdparty/libvpx/third_party/android/cpu-features.h b/thirdparty/libvpx/third_party/android/cpu-features.h deleted file mode 100644 index 1e9724197a..0000000000 --- a/thirdparty/libvpx/third_party/android/cpu-features.h +++ /dev/null @@ -1,323 +0,0 @@ -/* - * Copyright (C) 2010 The Android Open Source Project - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS - * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED - * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ -#ifndef CPU_FEATURES_H -#define CPU_FEATURES_H - -#include -#include - -__BEGIN_DECLS - -/* A list of valid values returned by android_getCpuFamily(). - * They describe the CPU Architecture of the current process. - */ -typedef enum { - ANDROID_CPU_FAMILY_UNKNOWN = 0, - ANDROID_CPU_FAMILY_ARM, - ANDROID_CPU_FAMILY_X86, - ANDROID_CPU_FAMILY_MIPS, - ANDROID_CPU_FAMILY_ARM64, - ANDROID_CPU_FAMILY_X86_64, - ANDROID_CPU_FAMILY_MIPS64, - - ANDROID_CPU_FAMILY_MAX /* do not remove */ - -} AndroidCpuFamily; - -/* Return the CPU family of the current process. - * - * Note that this matches the bitness of the current process. I.e. when - * running a 32-bit binary on a 64-bit capable CPU, this will return the - * 32-bit CPU family value. - */ -extern AndroidCpuFamily android_getCpuFamily(void); - -/* Return a bitmap describing a set of optional CPU features that are - * supported by the current device's CPU. The exact bit-flags returned - * depend on the value returned by android_getCpuFamily(). See the - * documentation for the ANDROID_CPU_*_FEATURE_* flags below for details. - */ -extern uint64_t android_getCpuFeatures(void); - -/* The list of feature flags for ANDROID_CPU_FAMILY_ARM that can be - * recognized by the library (see note below for 64-bit ARM). Value details - * are: - * - * VFPv2: - * CPU supports the VFPv2 instruction set. Many, but not all, ARMv6 CPUs - * support these instructions. VFPv2 is a subset of VFPv3 so this will - * be set whenever VFPv3 is set too. - * - * ARMv7: - * CPU supports the ARMv7-A basic instruction set. - * This feature is mandated by the 'armeabi-v7a' ABI. - * - * VFPv3: - * CPU supports the VFPv3-D16 instruction set, providing hardware FPU - * support for single and double precision floating point registers. - * Note that only 16 FPU registers are available by default, unless - * the D32 bit is set too. This feature is also mandated by the - * 'armeabi-v7a' ABI. - * - * VFP_D32: - * CPU VFP optional extension that provides 32 FPU registers, - * instead of 16. Note that ARM mandates this feature is the 'NEON' - * feature is implemented by the CPU. - * - * NEON: - * CPU FPU supports "ARM Advanced SIMD" instructions, also known as - * NEON. Note that this mandates the VFP_D32 feature as well, per the - * ARM Architecture specification. - * - * VFP_FP16: - * Half-width floating precision VFP extension. If set, the CPU - * supports instructions to perform floating-point operations on - * 16-bit registers. This is part of the VFPv4 specification, but - * not mandated by any Android ABI. - * - * VFP_FMA: - * Fused multiply-accumulate VFP instructions extension. Also part of - * the VFPv4 specification, but not mandated by any Android ABI. - * - * NEON_FMA: - * Fused multiply-accumulate NEON instructions extension. Optional - * extension from the VFPv4 specification, but not mandated by any - * Android ABI. - * - * IDIV_ARM: - * Integer division available in ARM mode. Only available - * on recent CPUs (e.g. Cortex-A15). - * - * IDIV_THUMB2: - * Integer division available in Thumb-2 mode. Only available - * on recent CPUs (e.g. Cortex-A15). - * - * iWMMXt: - * Optional extension that adds MMX registers and operations to an - * ARM CPU. This is only available on a few XScale-based CPU designs - * sold by Marvell. Pretty rare in practice. - * - * AES: - * CPU supports AES instructions. These instructions are only - * available for 32-bit applications running on ARMv8 CPU. - * - * CRC32: - * CPU supports CRC32 instructions. These instructions are only - * available for 32-bit applications running on ARMv8 CPU. - * - * SHA2: - * CPU supports SHA2 instructions. These instructions are only - * available for 32-bit applications running on ARMv8 CPU. - * - * SHA1: - * CPU supports SHA1 instructions. These instructions are only - * available for 32-bit applications running on ARMv8 CPU. - * - * PMULL: - * CPU supports 64-bit PMULL and PMULL2 instructions. These - * instructions are only available for 32-bit applications - * running on ARMv8 CPU. - * - * If you want to tell the compiler to generate code that targets one of - * the feature set above, you should probably use one of the following - * flags (for more details, see technical note at the end of this file): - * - * -mfpu=vfp - * -mfpu=vfpv2 - * These are equivalent and tell GCC to use VFPv2 instructions for - * floating-point operations. Use this if you want your code to - * run on *some* ARMv6 devices, and any ARMv7-A device supported - * by Android. - * - * Generated code requires VFPv2 feature. - * - * -mfpu=vfpv3-d16 - * Tell GCC to use VFPv3 instructions (using only 16 FPU registers). - * This should be generic code that runs on any CPU that supports the - * 'armeabi-v7a' Android ABI. Note that no ARMv6 CPU supports this. - * - * Generated code requires VFPv3 feature. - * - * -mfpu=vfpv3 - * Tell GCC to use VFPv3 instructions with 32 FPU registers. - * Generated code requires VFPv3|VFP_D32 features. - * - * -mfpu=neon - * Tell GCC to use VFPv3 instructions with 32 FPU registers, and - * also support NEON intrinsics (see ). - * Generated code requires VFPv3|VFP_D32|NEON features. - * - * -mfpu=vfpv4-d16 - * Generated code requires VFPv3|VFP_FP16|VFP_FMA features. - * - * -mfpu=vfpv4 - * Generated code requires VFPv3|VFP_FP16|VFP_FMA|VFP_D32 features. - * - * -mfpu=neon-vfpv4 - * Generated code requires VFPv3|VFP_FP16|VFP_FMA|VFP_D32|NEON|NEON_FMA - * features. - * - * -mcpu=cortex-a7 - * -mcpu=cortex-a15 - * Generated code requires VFPv3|VFP_FP16|VFP_FMA|VFP_D32| - * NEON|NEON_FMA|IDIV_ARM|IDIV_THUMB2 - * This flag implies -mfpu=neon-vfpv4. - * - * -mcpu=iwmmxt - * Allows the use of iWMMXt instrinsics with GCC. - * - * IMPORTANT NOTE: These flags should only be tested when - * android_getCpuFamily() returns ANDROID_CPU_FAMILY_ARM, i.e. this is a - * 32-bit process. - * - * When running a 64-bit ARM process on an ARMv8 CPU, - * android_getCpuFeatures() will return a different set of bitflags - */ -enum { - ANDROID_CPU_ARM_FEATURE_ARMv7 = (1 << 0), - ANDROID_CPU_ARM_FEATURE_VFPv3 = (1 << 1), - ANDROID_CPU_ARM_FEATURE_NEON = (1 << 2), - ANDROID_CPU_ARM_FEATURE_LDREX_STREX = (1 << 3), - ANDROID_CPU_ARM_FEATURE_VFPv2 = (1 << 4), - ANDROID_CPU_ARM_FEATURE_VFP_D32 = (1 << 5), - ANDROID_CPU_ARM_FEATURE_VFP_FP16 = (1 << 6), - ANDROID_CPU_ARM_FEATURE_VFP_FMA = (1 << 7), - ANDROID_CPU_ARM_FEATURE_NEON_FMA = (1 << 8), - ANDROID_CPU_ARM_FEATURE_IDIV_ARM = (1 << 9), - ANDROID_CPU_ARM_FEATURE_IDIV_THUMB2 = (1 << 10), - ANDROID_CPU_ARM_FEATURE_iWMMXt = (1 << 11), - ANDROID_CPU_ARM_FEATURE_AES = (1 << 12), - ANDROID_CPU_ARM_FEATURE_PMULL = (1 << 13), - ANDROID_CPU_ARM_FEATURE_SHA1 = (1 << 14), - ANDROID_CPU_ARM_FEATURE_SHA2 = (1 << 15), - ANDROID_CPU_ARM_FEATURE_CRC32 = (1 << 16), -}; - -/* The bit flags corresponding to the output of android_getCpuFeatures() - * when android_getCpuFamily() returns ANDROID_CPU_FAMILY_ARM64. Value details - * are: - * - * FP: - * CPU has Floating-point unit. - * - * ASIMD: - * CPU has Advanced SIMD unit. - * - * AES: - * CPU supports AES instructions. - * - * CRC32: - * CPU supports CRC32 instructions. - * - * SHA2: - * CPU supports SHA2 instructions. - * - * SHA1: - * CPU supports SHA1 instructions. - * - * PMULL: - * CPU supports 64-bit PMULL and PMULL2 instructions. - */ -enum { - ANDROID_CPU_ARM64_FEATURE_FP = (1 << 0), - ANDROID_CPU_ARM64_FEATURE_ASIMD = (1 << 1), - ANDROID_CPU_ARM64_FEATURE_AES = (1 << 2), - ANDROID_CPU_ARM64_FEATURE_PMULL = (1 << 3), - ANDROID_CPU_ARM64_FEATURE_SHA1 = (1 << 4), - ANDROID_CPU_ARM64_FEATURE_SHA2 = (1 << 5), - ANDROID_CPU_ARM64_FEATURE_CRC32 = (1 << 6), -}; - -/* The bit flags corresponding to the output of android_getCpuFeatures() - * when android_getCpuFamily() returns ANDROID_CPU_FAMILY_X86 or - * ANDROID_CPU_FAMILY_X86_64. - */ -enum { - ANDROID_CPU_X86_FEATURE_SSSE3 = (1 << 0), - ANDROID_CPU_X86_FEATURE_POPCNT = (1 << 1), - ANDROID_CPU_X86_FEATURE_MOVBE = (1 << 2), - ANDROID_CPU_X86_FEATURE_SSE4_1 = (1 << 3), - ANDROID_CPU_X86_FEATURE_SSE4_2 = (1 << 4), - ANDROID_CPU_X86_FEATURE_AES_NI = (1 << 5), - ANDROID_CPU_X86_FEATURE_AVX = (1 << 6), - ANDROID_CPU_X86_FEATURE_RDRAND = (1 << 7), - ANDROID_CPU_X86_FEATURE_AVX2 = (1 << 8), - ANDROID_CPU_X86_FEATURE_SHA_NI = (1 << 9), -}; - -/* The bit flags corresponding to the output of android_getCpuFeatures() - * when android_getCpuFamily() returns ANDROID_CPU_FAMILY_MIPS - * or ANDROID_CPU_FAMILY_MIPS64. Values are: - * - * R6: - * CPU executes MIPS Release 6 instructions natively, and - * supports obsoleted R1..R5 instructions only via kernel traps. - * - * MSA: - * CPU supports Mips SIMD Architecture instructions. - */ -enum { - ANDROID_CPU_MIPS_FEATURE_R6 = (1 << 0), - ANDROID_CPU_MIPS_FEATURE_MSA = (1 << 1), -}; - - -/* Return the number of CPU cores detected on this device. */ -extern int android_getCpuCount(void); - -/* The following is used to force the CPU count and features - * mask in sandboxed processes. Under 4.1 and higher, these processes - * cannot access /proc, which is the only way to get information from - * the kernel about the current hardware (at least on ARM). - * - * It _must_ be called only once, and before any android_getCpuXXX - * function, any other case will fail. - * - * This function return 1 on success, and 0 on failure. - */ -extern int android_setCpu(int cpu_count, - uint64_t cpu_features); - -#ifdef __arm__ -/* Retrieve the ARM 32-bit CPUID value from the kernel. - * Note that this cannot work on sandboxed processes under 4.1 and - * higher, unless you called android_setCpuArm() before. - */ -extern uint32_t android_getCpuIdArm(void); - -/* An ARM-specific variant of android_setCpu() that also allows you - * to set the ARM CPUID field. - */ -extern int android_setCpuArm(int cpu_count, - uint64_t cpu_features, - uint32_t cpu_id); -#endif - -__END_DECLS - -#endif /* CPU_FEATURES_H */ diff --git a/thirdparty/libvpx/third_party/x86inc/LICENSE b/thirdparty/libvpx/third_party/x86inc/LICENSE deleted file mode 100644 index 7d07645a17..0000000000 --- a/thirdparty/libvpx/third_party/x86inc/LICENSE +++ /dev/null @@ -1,18 +0,0 @@ -Copyright (C) 2005-2012 x264 project - -Authors: Loren Merritt - Anton Mitrofanov - Jason Garrett-Glaser - Henrik Gramner - -Permission to use, copy, modify, and/or distribute this software for any -purpose with or without fee is hereby granted, provided that the above -copyright notice and this permission notice appear in all copies. - -THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. diff --git a/thirdparty/libvpx/third_party/x86inc/README.libvpx b/thirdparty/libvpx/third_party/x86inc/README.libvpx deleted file mode 100644 index 8d3cd966da..0000000000 --- a/thirdparty/libvpx/third_party/x86inc/README.libvpx +++ /dev/null @@ -1,20 +0,0 @@ -URL: https://git.videolan.org/git/x264.git -Version: d23d18655249944c1ca894b451e2c82c7a584c62 -License: ISC -License File: LICENSE - -Description: -x264/libav's framework for x86 assembly. Contains a variety of macros and -defines that help automatically allow assembly to work cross-platform. - -Local Modifications: -Get configuration from vpx_config.asm. -Prefix functions with vpx by default. -Manage name mangling (prefixing with '_') manually because 'PREFIX' does not - exist in libvpx. -Expand PIC default to macho64 and respect CONFIG_PIC from libvpx -Set 'private_extern' visibility for macho targets. -Copy PIC 'GLOBAL' macros from x86_abi_support.asm -Use .text instead of .rodata on macho to avoid broken tables in PIC mode. -Use .text with no alignment for aout -Only use 'hidden' visibility with Chromium diff --git a/thirdparty/libvpx/third_party/x86inc/x86inc.asm b/thirdparty/libvpx/third_party/x86inc/x86inc.asm deleted file mode 100644 index b647dff2f8..0000000000 --- a/thirdparty/libvpx/third_party/x86inc/x86inc.asm +++ /dev/null @@ -1,1649 +0,0 @@ -;***************************************************************************** -;* x86inc.asm: x264asm abstraction layer -;***************************************************************************** -;* Copyright (C) 2005-2016 x264 project -;* -;* Authors: Loren Merritt -;* Anton Mitrofanov -;* Fiona Glaser -;* Henrik Gramner -;* -;* Permission to use, copy, modify, and/or distribute this software for any -;* purpose with or without fee is hereby granted, provided that the above -;* copyright notice and this permission notice appear in all copies. -;* -;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -;***************************************************************************** - -; This is a header file for the x264ASM assembly language, which uses -; NASM/YASM syntax combined with a large number of macros to provide easy -; abstraction between different calling conventions (x86_32, win64, linux64). -; It also has various other useful features to simplify writing the kind of -; DSP functions that are most often used in x264. - -; Unlike the rest of x264, this file is available under an ISC license, as it -; has significant usefulness outside of x264 and we want it to be available -; to the largest audience possible. Of course, if you modify it for your own -; purposes to add a new feature, we strongly encourage contributing a patch -; as this feature might be useful for others as well. Send patches or ideas -; to x264-devel@videolan.org . - -%include "vpx_config.asm" - -%ifndef private_prefix - %define private_prefix vpx -%endif - -%ifndef public_prefix - %define public_prefix private_prefix -%endif - -%ifndef STACK_ALIGNMENT - %if ARCH_X86_64 - %define STACK_ALIGNMENT 16 - %else - %define STACK_ALIGNMENT 4 - %endif -%endif - -%define WIN64 0 -%define UNIX64 0 -%if ARCH_X86_64 - %ifidn __OUTPUT_FORMAT__,win32 - %define WIN64 1 - %elifidn __OUTPUT_FORMAT__,win64 - %define WIN64 1 - %elifidn __OUTPUT_FORMAT__,x64 - %define WIN64 1 - %else - %define UNIX64 1 - %endif -%endif - -%define FORMAT_ELF 0 -%ifidn __OUTPUT_FORMAT__,elf - %define FORMAT_ELF 1 -%elifidn __OUTPUT_FORMAT__,elf32 - %define FORMAT_ELF 1 -%elifidn __OUTPUT_FORMAT__,elf64 - %define FORMAT_ELF 1 -%endif - -%define FORMAT_MACHO 0 -%ifidn __OUTPUT_FORMAT__,macho32 - %define FORMAT_MACHO 1 -%elifidn __OUTPUT_FORMAT__,macho64 - %define FORMAT_MACHO 1 -%endif - -; Set PREFIX for libvpx builds. -%if FORMAT_ELF - %undef PREFIX -%elif WIN64 - %undef PREFIX -%else - %define PREFIX -%endif - -%ifdef PREFIX - %define mangle(x) _ %+ x -%else - %define mangle(x) x -%endif - -; In some instances macho32 tables get misaligned when using .rodata. -; When looking at the disassembly it appears that the offset is either -; correct or consistently off by 90. Placing them in the .text section -; works around the issue. It appears to be specific to the way libvpx -; handles the tables. -%macro SECTION_RODATA 0-1 16 - %ifidn __OUTPUT_FORMAT__,macho32 - SECTION .text align=%1 - fakegot: - %elifidn __OUTPUT_FORMAT__,aout - SECTION .text - %else - SECTION .rodata align=%1 - %endif -%endmacro - -; PIC macros are copied from vpx_ports/x86_abi_support.asm. The "define PIC" -; from original code is added in for 64bit. -%ifidn __OUTPUT_FORMAT__,elf32 -%define ABI_IS_32BIT 1 -%elifidn __OUTPUT_FORMAT__,macho32 -%define ABI_IS_32BIT 1 -%elifidn __OUTPUT_FORMAT__,win32 -%define ABI_IS_32BIT 1 -%elifidn __OUTPUT_FORMAT__,aout -%define ABI_IS_32BIT 1 -%else -%define ABI_IS_32BIT 0 -%endif - -%if ABI_IS_32BIT - %if CONFIG_PIC=1 - %ifidn __OUTPUT_FORMAT__,elf32 - %define GET_GOT_DEFINED 1 - %define WRT_PLT wrt ..plt - %macro GET_GOT 1 - extern _GLOBAL_OFFSET_TABLE_ - push %1 - call %%get_got - %%sub_offset: - jmp %%exitGG - %%get_got: - mov %1, [esp] - add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc - ret - %%exitGG: - %undef GLOBAL - %define GLOBAL(x) x + %1 wrt ..gotoff - %undef RESTORE_GOT - %define RESTORE_GOT pop %1 - %endmacro - %elifidn __OUTPUT_FORMAT__,macho32 - %define GET_GOT_DEFINED 1 - %macro GET_GOT 1 - push %1 - call %%get_got - %%get_got: - pop %1 - %undef GLOBAL - %define GLOBAL(x) x + %1 - %%get_got - %undef RESTORE_GOT - %define RESTORE_GOT pop %1 - %endmacro - %else - %define GET_GOT_DEFINED 0 - %endif - %endif - - %if ARCH_X86_64 == 0 - %undef PIC - %endif - -%else - %macro GET_GOT 1 - %endmacro - %define GLOBAL(x) rel x - %define WRT_PLT wrt ..plt - - %if WIN64 - %define PIC - %elifidn __OUTPUT_FORMAT__,macho64 - %define PIC - %elif CONFIG_PIC - %define PIC - %endif -%endif - -%ifnmacro GET_GOT - %macro GET_GOT 1 - %endmacro - %define GLOBAL(x) x -%endif -%ifndef RESTORE_GOT - %define RESTORE_GOT -%endif -%ifndef WRT_PLT - %define WRT_PLT -%endif - -%ifdef PIC - default rel -%endif - -%ifndef GET_GOT_DEFINED - %define GET_GOT_DEFINED 0 -%endif -; Done with PIC macros - -%ifdef __NASM_VER__ - %use smartalign -%endif - -; Macros to eliminate most code duplication between x86_32 and x86_64: -; Currently this works only for leaf functions which load all their arguments -; into registers at the start, and make no other use of the stack. Luckily that -; covers most of x264's asm. - -; PROLOGUE: -; %1 = number of arguments. loads them from stack if needed. -; %2 = number of registers used. pushes callee-saved regs if needed. -; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. -; %4 = (optional) stack size to be allocated. The stack will be aligned before -; allocating the specified stack size. If the required stack alignment is -; larger than the known stack alignment the stack will be manually aligned -; and an extra register will be allocated to hold the original stack -; pointer (to not invalidate r0m etc.). To prevent the use of an extra -; register as stack pointer, request a negative stack size. -; %4+/%5+ = list of names to define to registers -; PROLOGUE can also be invoked by adding the same options to cglobal - -; e.g. -; cglobal foo, 2,3,7,0x40, dst, src, tmp -; declares a function (foo) that automatically loads two arguments (dst and -; src) into registers, uses one additional register (tmp) plus 7 vector -; registers (m0-m6) and allocates 0x40 bytes of stack space. - -; TODO Some functions can use some args directly from the stack. If they're the -; last args then you can just not declare them, but if they're in the middle -; we need more flexible macro. - -; RET: -; Pops anything that was pushed by PROLOGUE, and returns. - -; REP_RET: -; Use this instead of RET if it's a branch target. - -; registers: -; rN and rNq are the native-size register holding function argument N -; rNd, rNw, rNb are dword, word, and byte size -; rNh is the high 8 bits of the word size -; rNm is the original location of arg N (a register or on the stack), dword -; rNmp is native size - -%macro DECLARE_REG 2-3 - %define r%1q %2 - %define r%1d %2d - %define r%1w %2w - %define r%1b %2b - %define r%1h %2h - %define %2q %2 - %if %0 == 2 - %define r%1m %2d - %define r%1mp %2 - %elif ARCH_X86_64 ; memory - %define r%1m [rstk + stack_offset + %3] - %define r%1mp qword r %+ %1 %+ m - %else - %define r%1m [rstk + stack_offset + %3] - %define r%1mp dword r %+ %1 %+ m - %endif - %define r%1 %2 -%endmacro - -%macro DECLARE_REG_SIZE 3 - %define r%1q r%1 - %define e%1q r%1 - %define r%1d e%1 - %define e%1d e%1 - %define r%1w %1 - %define e%1w %1 - %define r%1h %3 - %define e%1h %3 - %define r%1b %2 - %define e%1b %2 - %if ARCH_X86_64 == 0 - %define r%1 e%1 - %endif -%endmacro - -DECLARE_REG_SIZE ax, al, ah -DECLARE_REG_SIZE bx, bl, bh -DECLARE_REG_SIZE cx, cl, ch -DECLARE_REG_SIZE dx, dl, dh -DECLARE_REG_SIZE si, sil, null -DECLARE_REG_SIZE di, dil, null -DECLARE_REG_SIZE bp, bpl, null - -; t# defines for when per-arch register allocation is more complex than just function arguments - -%macro DECLARE_REG_TMP 1-* - %assign %%i 0 - %rep %0 - CAT_XDEFINE t, %%i, r%1 - %assign %%i %%i+1 - %rotate 1 - %endrep -%endmacro - -%macro DECLARE_REG_TMP_SIZE 0-* - %rep %0 - %define t%1q t%1 %+ q - %define t%1d t%1 %+ d - %define t%1w t%1 %+ w - %define t%1h t%1 %+ h - %define t%1b t%1 %+ b - %rotate 1 - %endrep -%endmacro - -DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 - -%if ARCH_X86_64 - %define gprsize 8 -%else - %define gprsize 4 -%endif - -%macro PUSH 1 - push %1 - %ifidn rstk, rsp - %assign stack_offset stack_offset+gprsize - %endif -%endmacro - -%macro POP 1 - pop %1 - %ifidn rstk, rsp - %assign stack_offset stack_offset-gprsize - %endif -%endmacro - -%macro PUSH_IF_USED 1-* - %rep %0 - %if %1 < regs_used - PUSH r%1 - %endif - %rotate 1 - %endrep -%endmacro - -%macro POP_IF_USED 1-* - %rep %0 - %if %1 < regs_used - pop r%1 - %endif - %rotate 1 - %endrep -%endmacro - -%macro LOAD_IF_USED 1-* - %rep %0 - %if %1 < num_args - mov r%1, r %+ %1 %+ mp - %endif - %rotate 1 - %endrep -%endmacro - -%macro SUB 2 - sub %1, %2 - %ifidn %1, rstk - %assign stack_offset stack_offset+(%2) - %endif -%endmacro - -%macro ADD 2 - add %1, %2 - %ifidn %1, rstk - %assign stack_offset stack_offset-(%2) - %endif -%endmacro - -%macro movifnidn 2 - %ifnidn %1, %2 - mov %1, %2 - %endif -%endmacro - -%macro movsxdifnidn 2 - %ifnidn %1, %2 - movsxd %1, %2 - %endif -%endmacro - -%macro ASSERT 1 - %if (%1) == 0 - %error assertion ``%1'' failed - %endif -%endmacro - -%macro DEFINE_ARGS 0-* - %ifdef n_arg_names - %assign %%i 0 - %rep n_arg_names - CAT_UNDEF arg_name %+ %%i, q - CAT_UNDEF arg_name %+ %%i, d - CAT_UNDEF arg_name %+ %%i, w - CAT_UNDEF arg_name %+ %%i, h - CAT_UNDEF arg_name %+ %%i, b - CAT_UNDEF arg_name %+ %%i, m - CAT_UNDEF arg_name %+ %%i, mp - CAT_UNDEF arg_name, %%i - %assign %%i %%i+1 - %endrep - %endif - - %xdefine %%stack_offset stack_offset - %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine - %assign %%i 0 - %rep %0 - %xdefine %1q r %+ %%i %+ q - %xdefine %1d r %+ %%i %+ d - %xdefine %1w r %+ %%i %+ w - %xdefine %1h r %+ %%i %+ h - %xdefine %1b r %+ %%i %+ b - %xdefine %1m r %+ %%i %+ m - %xdefine %1mp r %+ %%i %+ mp - CAT_XDEFINE arg_name, %%i, %1 - %assign %%i %%i+1 - %rotate 1 - %endrep - %xdefine stack_offset %%stack_offset - %assign n_arg_names %0 -%endmacro - -%define required_stack_alignment ((mmsize + 15) & ~15) - -%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) - %ifnum %1 - %if %1 != 0 - %assign %%pad 0 - %assign stack_size %1 - %if stack_size < 0 - %assign stack_size -stack_size - %endif - %if WIN64 - %assign %%pad %%pad + 32 ; shadow space - %if mmsize != 8 - %assign xmm_regs_used %2 - %if xmm_regs_used > 8 - %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers - %endif - %endif - %endif - %if required_stack_alignment <= STACK_ALIGNMENT - ; maintain the current stack alignment - %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) - SUB rsp, stack_size_padded - %else - %assign %%reg_num (regs_used - 1) - %xdefine rstk r %+ %%reg_num - ; align stack, and save original stack location directly above - ; it, i.e. in [rsp+stack_size_padded], so we can restore the - ; stack in a single instruction (i.e. mov rsp, rstk or mov - ; rsp, [rsp+stack_size_padded]) - %if %1 < 0 ; need to store rsp on stack - %xdefine rstkm [rsp + stack_size + %%pad] - %assign %%pad %%pad + gprsize - %else ; can keep rsp in rstk during whole function - %xdefine rstkm rstk - %endif - %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1)) - mov rstk, rsp - and rsp, ~(required_stack_alignment-1) - sub rsp, stack_size_padded - movifnidn rstkm, rstk - %endif - WIN64_PUSH_XMM - %endif - %endif -%endmacro - -%macro SETUP_STACK_POINTER 1 - %ifnum %1 - %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT - %if %1 > 0 - %assign regs_used (regs_used + 1) - %endif - %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3 - ; Ensure that we don't clobber any registers containing arguments - %assign regs_used 5 + UNIX64 * 3 - %endif - %endif - %endif -%endmacro - -%macro DEFINE_ARGS_INTERNAL 3+ - %ifnum %2 - DEFINE_ARGS %3 - %elif %1 == 4 - DEFINE_ARGS %2 - %elif %1 > 4 - DEFINE_ARGS %2, %3 - %endif -%endmacro - -%if WIN64 ; Windows x64 ;================================================= - -DECLARE_REG 0, rcx -DECLARE_REG 1, rdx -DECLARE_REG 2, R8 -DECLARE_REG 3, R9 -DECLARE_REG 4, R10, 40 -DECLARE_REG 5, R11, 48 -DECLARE_REG 6, rax, 56 -DECLARE_REG 7, rdi, 64 -DECLARE_REG 8, rsi, 72 -DECLARE_REG 9, rbx, 80 -DECLARE_REG 10, rbp, 88 -DECLARE_REG 11, R12, 96 -DECLARE_REG 12, R13, 104 -DECLARE_REG 13, R14, 112 -DECLARE_REG 14, R15, 120 - -%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... - %assign num_args %1 - %assign regs_used %2 - ASSERT regs_used >= num_args - SETUP_STACK_POINTER %4 - ASSERT regs_used <= 15 - PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 - ALLOC_STACK %4, %3 - %if mmsize != 8 && stack_size == 0 - WIN64_SPILL_XMM %3 - %endif - LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 - DEFINE_ARGS_INTERNAL %0, %4, %5 -%endmacro - -%macro WIN64_PUSH_XMM 0 - ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. - %if xmm_regs_used > 6 - movaps [rstk + stack_offset + 8], xmm6 - %endif - %if xmm_regs_used > 7 - movaps [rstk + stack_offset + 24], xmm7 - %endif - %if xmm_regs_used > 8 - %assign %%i 8 - %rep xmm_regs_used-8 - movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i - %assign %%i %%i+1 - %endrep - %endif -%endmacro - -%macro WIN64_SPILL_XMM 1 - %assign xmm_regs_used %1 - ASSERT xmm_regs_used <= 16 - %if xmm_regs_used > 8 - ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. - %assign %%pad (xmm_regs_used-8)*16 + 32 - %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) - SUB rsp, stack_size_padded - %endif - WIN64_PUSH_XMM -%endmacro - -%macro WIN64_RESTORE_XMM_INTERNAL 1 - %assign %%pad_size 0 - %if xmm_regs_used > 8 - %assign %%i xmm_regs_used - %rep xmm_regs_used-8 - %assign %%i %%i-1 - movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32] - %endrep - %endif - %if stack_size_padded > 0 - %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT - mov rsp, rstkm - %else - add %1, stack_size_padded - %assign %%pad_size stack_size_padded - %endif - %endif - %if xmm_regs_used > 7 - movaps xmm7, [%1 + stack_offset - %%pad_size + 24] - %endif - %if xmm_regs_used > 6 - movaps xmm6, [%1 + stack_offset - %%pad_size + 8] - %endif -%endmacro - -%macro WIN64_RESTORE_XMM 1 - WIN64_RESTORE_XMM_INTERNAL %1 - %assign stack_offset (stack_offset-stack_size_padded) - %assign xmm_regs_used 0 -%endmacro - -%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 - -%macro RET 0 - WIN64_RESTORE_XMM_INTERNAL rsp - POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 - %if mmsize == 32 - vzeroupper - %endif - AUTO_REP_RET -%endmacro - -%elif ARCH_X86_64 ; *nix x64 ;============================================= - -DECLARE_REG 0, rdi -DECLARE_REG 1, rsi -DECLARE_REG 2, rdx -DECLARE_REG 3, rcx -DECLARE_REG 4, R8 -DECLARE_REG 5, R9 -DECLARE_REG 6, rax, 8 -DECLARE_REG 7, R10, 16 -DECLARE_REG 8, R11, 24 -DECLARE_REG 9, rbx, 32 -DECLARE_REG 10, rbp, 40 -DECLARE_REG 11, R12, 48 -DECLARE_REG 12, R13, 56 -DECLARE_REG 13, R14, 64 -DECLARE_REG 14, R15, 72 - -%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... - %assign num_args %1 - %assign regs_used %2 - ASSERT regs_used >= num_args - SETUP_STACK_POINTER %4 - ASSERT regs_used <= 15 - PUSH_IF_USED 9, 10, 11, 12, 13, 14 - ALLOC_STACK %4 - LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 - DEFINE_ARGS_INTERNAL %0, %4, %5 -%endmacro - -%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 - -%macro RET 0 - %if stack_size_padded > 0 - %if required_stack_alignment > STACK_ALIGNMENT - mov rsp, rstkm - %else - add rsp, stack_size_padded - %endif - %endif - POP_IF_USED 14, 13, 12, 11, 10, 9 - %if mmsize == 32 - vzeroupper - %endif - AUTO_REP_RET -%endmacro - -%else ; X86_32 ;============================================================== - -DECLARE_REG 0, eax, 4 -DECLARE_REG 1, ecx, 8 -DECLARE_REG 2, edx, 12 -DECLARE_REG 3, ebx, 16 -DECLARE_REG 4, esi, 20 -DECLARE_REG 5, edi, 24 -DECLARE_REG 6, ebp, 28 -%define rsp esp - -%macro DECLARE_ARG 1-* - %rep %0 - %define r%1m [rstk + stack_offset + 4*%1 + 4] - %define r%1mp dword r%1m - %rotate 1 - %endrep -%endmacro - -DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 - -%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... - %assign num_args %1 - %assign regs_used %2 - ASSERT regs_used >= num_args - %if num_args > 7 - %assign num_args 7 - %endif - %if regs_used > 7 - %assign regs_used 7 - %endif - SETUP_STACK_POINTER %4 - ASSERT regs_used <= 7 - PUSH_IF_USED 3, 4, 5, 6 - ALLOC_STACK %4 - LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 - DEFINE_ARGS_INTERNAL %0, %4, %5 -%endmacro - -%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 - -%macro RET 0 - %if stack_size_padded > 0 - %if required_stack_alignment > STACK_ALIGNMENT - mov rsp, rstkm - %else - add rsp, stack_size_padded - %endif - %endif - POP_IF_USED 6, 5, 4, 3 - %if mmsize == 32 - vzeroupper - %endif - AUTO_REP_RET -%endmacro - -%endif ;====================================================================== - -%if WIN64 == 0 - %macro WIN64_SPILL_XMM 1 - %endmacro - %macro WIN64_RESTORE_XMM 1 - %endmacro - %macro WIN64_PUSH_XMM 0 - %endmacro -%endif - -; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either -; a branch or a branch target. So switch to a 2-byte form of ret in that case. -; We can automatically detect "follows a branch", but not a branch target. -; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) -%macro REP_RET 0 - %if has_epilogue - RET - %else - rep ret - %endif - annotate_function_size -%endmacro - -%define last_branch_adr $$ -%macro AUTO_REP_RET 0 - %if notcpuflag(ssse3) - times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr. - %endif - ret - annotate_function_size -%endmacro - -%macro BRANCH_INSTR 0-* - %rep %0 - %macro %1 1-2 %1 - %2 %1 - %if notcpuflag(ssse3) - %%branch_instr equ $ - %xdefine last_branch_adr %%branch_instr - %endif - %endmacro - %rotate 1 - %endrep -%endmacro - -BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp - -%macro TAIL_CALL 2 ; callee, is_nonadjacent - %if has_epilogue - call %1 - RET - %elif %2 - jmp %1 - %endif - annotate_function_size -%endmacro - -;============================================================================= -; arch-independent part -;============================================================================= - -%assign function_align 16 - -; Begin a function. -; Applies any symbol mangling needed for C linkage, and sets up a define such that -; subsequent uses of the function name automatically refer to the mangled version. -; Appends cpuflags to the function name if cpuflags has been specified. -; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX -; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). -%macro cglobal 1-2+ "" ; name, [PROLOGUE args] - cglobal_internal 1, %1 %+ SUFFIX, %2 -%endmacro -%macro cvisible 1-2+ "" ; name, [PROLOGUE args] - cglobal_internal 0, %1 %+ SUFFIX, %2 -%endmacro -%macro cglobal_internal 2-3+ - annotate_function_size - %if %1 - %xdefine %%FUNCTION_PREFIX private_prefix - ; libvpx explicitly sets visibility in shared object builds. Avoid - ; setting visibility to hidden as it may break builds that split - ; sources on e.g., directory boundaries. - %ifdef CHROMIUM - %xdefine %%VISIBILITY hidden - %else - %xdefine %%VISIBILITY - %endif - %else - %xdefine %%FUNCTION_PREFIX public_prefix - %xdefine %%VISIBILITY - %endif - %ifndef cglobaled_%2 - %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2) - %xdefine %2.skip_prologue %2 %+ .skip_prologue - CAT_XDEFINE cglobaled_, %2, 1 - %endif - %xdefine current_function %2 - %xdefine current_function_section __SECT__ - %if FORMAT_ELF - global %2:function %%VISIBILITY - %elif FORMAT_MACHO - %ifdef __NASM_VER__ - global %2 - %else - global %2:private_extern - %endif - %else - global %2 - %endif - align function_align - %2: - RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer - %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required - %assign stack_offset 0 ; stack pointer offset relative to the return address - %assign stack_size 0 ; amount of stack space that can be freely used inside a function - %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding - %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 - %ifnidn %3, "" - PROLOGUE %3 - %endif -%endmacro - -%macro cextern 1 - %xdefine %1 mangle(private_prefix %+ _ %+ %1) - CAT_XDEFINE cglobaled_, %1, 1 - extern %1 -%endmacro - -; like cextern, but without the prefix -%macro cextern_naked 1 - %ifdef PREFIX - %xdefine %1 mangle(%1) - %endif - CAT_XDEFINE cglobaled_, %1, 1 - extern %1 -%endmacro - -%macro const 1-2+ - %xdefine %1 mangle(private_prefix %+ _ %+ %1) - %if FORMAT_ELF - global %1:data hidden - %else - global %1 - %endif - %1: %2 -%endmacro - -; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default. -%if FORMAT_ELF - [SECTION .note.GNU-stack noalloc noexec nowrite progbits] -%endif - -; Tell debuggers how large the function was. -; This may be invoked multiple times per function; we rely on later instances overriding earlier ones. -; This is invoked by RET and similar macros, and also cglobal does it for the previous function, -; but if the last function in a source file doesn't use any of the standard macros for its epilogue, -; then its size might be unspecified. -%macro annotate_function_size 0 - %ifdef __YASM_VER__ - %ifdef current_function - %if FORMAT_ELF - current_function_section - %%ecf equ $ - size current_function %%ecf - current_function - __SECT__ - %endif - %endif - %endif -%endmacro - -; cpuflags - -%assign cpuflags_mmx (1<<0) -%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx -%assign cpuflags_3dnow (1<<2) | cpuflags_mmx -%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow -%assign cpuflags_sse (1<<4) | cpuflags_mmx2 -%assign cpuflags_sse2 (1<<5) | cpuflags_sse -%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 -%assign cpuflags_sse3 (1<<7) | cpuflags_sse2 -%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 -%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 -%assign cpuflags_sse42 (1<<10)| cpuflags_sse4 -%assign cpuflags_avx (1<<11)| cpuflags_sse42 -%assign cpuflags_xop (1<<12)| cpuflags_avx -%assign cpuflags_fma4 (1<<13)| cpuflags_avx -%assign cpuflags_fma3 (1<<14)| cpuflags_avx -%assign cpuflags_avx2 (1<<15)| cpuflags_fma3 - -%assign cpuflags_cache32 (1<<16) -%assign cpuflags_cache64 (1<<17) -%assign cpuflags_slowctz (1<<18) -%assign cpuflags_lzcnt (1<<19) -%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant -%assign cpuflags_atom (1<<21) -%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt -%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1 - -; Returns a boolean value expressing whether or not the specified cpuflag is enabled. -%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1) -%define notcpuflag(x) (cpuflag(x) ^ 1) - -; Takes an arbitrary number of cpuflags from the above list. -; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. -; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. -%macro INIT_CPUFLAGS 0-* - %xdefine SUFFIX - %undef cpuname - %assign cpuflags 0 - - %if %0 >= 1 - %rep %0 - %ifdef cpuname - %xdefine cpuname cpuname %+ _%1 - %else - %xdefine cpuname %1 - %endif - %assign cpuflags cpuflags | cpuflags_%1 - %rotate 1 - %endrep - %xdefine SUFFIX _ %+ cpuname - - %if cpuflag(avx) - %assign avx_enabled 1 - %endif - %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) - %define mova movaps - %define movu movups - %define movnta movntps - %endif - %if cpuflag(aligned) - %define movu mova - %elif cpuflag(sse3) && notcpuflag(ssse3) - %define movu lddqu - %endif - %endif - - %if ARCH_X86_64 || cpuflag(sse2) - %ifdef __NASM_VER__ - ALIGNMODE k8 - %else - CPU amdnop - %endif - %else - %ifdef __NASM_VER__ - ALIGNMODE nop - %else - CPU basicnop - %endif - %endif -%endmacro - -; Merge mmx and sse* -; m# is a simd register of the currently selected size -; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# -; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# -; (All 3 remain in sync through SWAP.) - -%macro CAT_XDEFINE 3 - %xdefine %1%2 %3 -%endmacro - -%macro CAT_UNDEF 2 - %undef %1%2 -%endmacro - -%macro INIT_MMX 0-1+ - %assign avx_enabled 0 - %define RESET_MM_PERMUTATION INIT_MMX %1 - %define mmsize 8 - %define num_mmregs 8 - %define mova movq - %define movu movq - %define movh movd - %define movnta movntq - %assign %%i 0 - %rep 8 - CAT_XDEFINE m, %%i, mm %+ %%i - CAT_XDEFINE nnmm, %%i, %%i - %assign %%i %%i+1 - %endrep - %rep 8 - CAT_UNDEF m, %%i - CAT_UNDEF nnmm, %%i - %assign %%i %%i+1 - %endrep - INIT_CPUFLAGS %1 -%endmacro - -%macro INIT_XMM 0-1+ - %assign avx_enabled 0 - %define RESET_MM_PERMUTATION INIT_XMM %1 - %define mmsize 16 - %define num_mmregs 8 - %if ARCH_X86_64 - %define num_mmregs 16 - %endif - %define mova movdqa - %define movu movdqu - %define movh movq - %define movnta movntdq - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, xmm %+ %%i - CAT_XDEFINE nnxmm, %%i, %%i - %assign %%i %%i+1 - %endrep - INIT_CPUFLAGS %1 -%endmacro - -%macro INIT_YMM 0-1+ - %assign avx_enabled 1 - %define RESET_MM_PERMUTATION INIT_YMM %1 - %define mmsize 32 - %define num_mmregs 8 - %if ARCH_X86_64 - %define num_mmregs 16 - %endif - %define mova movdqa - %define movu movdqu - %undef movh - %define movnta movntdq - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, ymm %+ %%i - CAT_XDEFINE nnymm, %%i, %%i - %assign %%i %%i+1 - %endrep - INIT_CPUFLAGS %1 -%endmacro - -INIT_XMM - -%macro DECLARE_MMCAST 1 - %define mmmm%1 mm%1 - %define mmxmm%1 mm%1 - %define mmymm%1 mm%1 - %define xmmmm%1 mm%1 - %define xmmxmm%1 xmm%1 - %define xmmymm%1 xmm%1 - %define ymmmm%1 mm%1 - %define ymmxmm%1 xmm%1 - %define ymmymm%1 ymm%1 - %define xm%1 xmm %+ m%1 - %define ym%1 ymm %+ m%1 -%endmacro - -%assign i 0 -%rep 16 - DECLARE_MMCAST i - %assign i i+1 -%endrep - -; I often want to use macros that permute their arguments. e.g. there's no -; efficient way to implement butterfly or transpose or dct without swapping some -; arguments. -; -; I would like to not have to manually keep track of the permutations: -; If I insert a permutation in the middle of a function, it should automatically -; change everything that follows. For more complex macros I may also have multiple -; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. -; -; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that -; permutes its arguments. It's equivalent to exchanging the contents of the -; registers, except that this way you exchange the register names instead, so it -; doesn't cost any cycles. - -%macro PERMUTE 2-* ; takes a list of pairs to swap - %rep %0/2 - %xdefine %%tmp%2 m%2 - %rotate 2 - %endrep - %rep %0/2 - %xdefine m%1 %%tmp%2 - CAT_XDEFINE nn, m%1, %1 - %rotate 2 - %endrep -%endmacro - -%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) - %ifnum %1 ; SWAP 0, 1, ... - SWAP_INTERNAL_NUM %1, %2 - %else ; SWAP m0, m1, ... - SWAP_INTERNAL_NAME %1, %2 - %endif -%endmacro - -%macro SWAP_INTERNAL_NUM 2-* - %rep %0-1 - %xdefine %%tmp m%1 - %xdefine m%1 m%2 - %xdefine m%2 %%tmp - CAT_XDEFINE nn, m%1, %1 - CAT_XDEFINE nn, m%2, %2 - %rotate 1 - %endrep -%endmacro - -%macro SWAP_INTERNAL_NAME 2-* - %xdefine %%args nn %+ %1 - %rep %0-1 - %xdefine %%args %%args, nn %+ %2 - %rotate 1 - %endrep - SWAP_INTERNAL_NUM %%args -%endmacro - -; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later -; calls to that function will automatically load the permutation, so values can -; be returned in mmregs. -%macro SAVE_MM_PERMUTATION 0-1 - %if %0 - %xdefine %%f %1_m - %else - %xdefine %%f current_function %+ _m - %endif - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE %%f, %%i, m %+ %%i - %assign %%i %%i+1 - %endrep -%endmacro - -%macro LOAD_MM_PERMUTATION 1 ; name to load from - %ifdef %1_m0 - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, %1_m %+ %%i - CAT_XDEFINE nn, m %+ %%i, %%i - %assign %%i %%i+1 - %endrep - %endif -%endmacro - -; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't -%macro call 1 - call_internal %1 %+ SUFFIX, %1 -%endmacro -%macro call_internal 2 - %xdefine %%i %2 - %ifndef cglobaled_%2 - %ifdef cglobaled_%1 - %xdefine %%i %1 - %endif - %endif - call %%i - LOAD_MM_PERMUTATION %%i -%endmacro - -; Substitutions that reduce instruction size but are functionally equivalent -%macro add 2 - %ifnum %2 - %if %2==128 - sub %1, -128 - %else - add %1, %2 - %endif - %else - add %1, %2 - %endif -%endmacro - -%macro sub 2 - %ifnum %2 - %if %2==128 - add %1, -128 - %else - sub %1, %2 - %endif - %else - sub %1, %2 - %endif -%endmacro - -;============================================================================= -; AVX abstraction layer -;============================================================================= - -%assign i 0 -%rep 16 - %if i < 8 - CAT_XDEFINE sizeofmm, i, 8 - %endif - CAT_XDEFINE sizeofxmm, i, 16 - CAT_XDEFINE sizeofymm, i, 32 - %assign i i+1 -%endrep -%undef i - -%macro CHECK_AVX_INSTR_EMU 3-* - %xdefine %%opcode %1 - %xdefine %%dst %2 - %rep %0-2 - %ifidn %%dst, %3 - %error non-avx emulation of ``%%opcode'' is not supported - %endif - %rotate 1 - %endrep -%endmacro - -;%1 == instruction -;%2 == minimal instruction set -;%3 == 1 if float, 0 if int -;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise -;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not -;%6+: operands -%macro RUN_AVX_INSTR 6-9+ - %ifnum sizeof%7 - %assign __sizeofreg sizeof%7 - %elifnum sizeof%6 - %assign __sizeofreg sizeof%6 - %else - %assign __sizeofreg mmsize - %endif - %assign __emulate_avx 0 - %if avx_enabled && __sizeofreg >= 16 - %xdefine __instr v%1 - %else - %xdefine __instr %1 - %if %0 >= 8+%4 - %assign __emulate_avx 1 - %endif - %endif - %ifnidn %2, fnord - %ifdef cpuname - %if notcpuflag(%2) - %error use of ``%1'' %2 instruction in cpuname function: current_function - %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8 - %error use of ``%1'' sse2 instruction in cpuname function: current_function - %endif - %endif - %endif - - %if __emulate_avx - %xdefine __src1 %7 - %xdefine __src2 %8 - %ifnidn %6, %7 - %if %0 >= 9 - CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, %8, %9 - %else - CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, %8 - %endif - %if %5 && %4 == 0 - %ifnid %8 - ; 3-operand AVX instructions with a memory arg can only have it in src2, - ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). - ; So, if the instruction is commutative with a memory arg, swap them. - %xdefine __src1 %8 - %xdefine __src2 %7 - %endif - %endif - %if __sizeofreg == 8 - MOVQ %6, __src1 - %elif %3 - MOVAPS %6, __src1 - %else - MOVDQA %6, __src1 - %endif - %endif - %if %0 >= 9 - %1 %6, __src2, %9 - %else - %1 %6, __src2 - %endif - %elif %0 >= 9 - __instr %6, %7, %8, %9 - %elif %0 == 8 - __instr %6, %7, %8 - %elif %0 == 7 - __instr %6, %7 - %else - __instr %6 - %endif -%endmacro - -;%1 == instruction -;%2 == minimal instruction set -;%3 == 1 if float, 0 if int -;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise -;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not -%macro AVX_INSTR 1-5 fnord, 0, 1, 0 - %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5 - %ifidn %2, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %10, %1 - %elifidn %3, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2 - %elifidn %4, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3 - %elifidn %5, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4 - %else - RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5 - %endif - %endmacro -%endmacro - -; Instructions with both VEX and non-VEX encodings -; Non-destructive instructions are written without parameters -AVX_INSTR addpd, sse2, 1, 0, 1 -AVX_INSTR addps, sse, 1, 0, 1 -AVX_INSTR addsd, sse2, 1, 0, 1 -AVX_INSTR addss, sse, 1, 0, 1 -AVX_INSTR addsubpd, sse3, 1, 0, 0 -AVX_INSTR addsubps, sse3, 1, 0, 0 -AVX_INSTR aesdec, fnord, 0, 0, 0 -AVX_INSTR aesdeclast, fnord, 0, 0, 0 -AVX_INSTR aesenc, fnord, 0, 0, 0 -AVX_INSTR aesenclast, fnord, 0, 0, 0 -AVX_INSTR aesimc -AVX_INSTR aeskeygenassist -AVX_INSTR andnpd, sse2, 1, 0, 0 -AVX_INSTR andnps, sse, 1, 0, 0 -AVX_INSTR andpd, sse2, 1, 0, 1 -AVX_INSTR andps, sse, 1, 0, 1 -AVX_INSTR blendpd, sse4, 1, 0, 0 -AVX_INSTR blendps, sse4, 1, 0, 0 -AVX_INSTR blendvpd, sse4, 1, 0, 0 -AVX_INSTR blendvps, sse4, 1, 0, 0 -AVX_INSTR cmppd, sse2, 1, 1, 0 -AVX_INSTR cmpps, sse, 1, 1, 0 -AVX_INSTR cmpsd, sse2, 1, 1, 0 -AVX_INSTR cmpss, sse, 1, 1, 0 -AVX_INSTR comisd, sse2 -AVX_INSTR comiss, sse -AVX_INSTR cvtdq2pd, sse2 -AVX_INSTR cvtdq2ps, sse2 -AVX_INSTR cvtpd2dq, sse2 -AVX_INSTR cvtpd2ps, sse2 -AVX_INSTR cvtps2dq, sse2 -AVX_INSTR cvtps2pd, sse2 -AVX_INSTR cvtsd2si, sse2 -AVX_INSTR cvtsd2ss, sse2 -AVX_INSTR cvtsi2sd, sse2 -AVX_INSTR cvtsi2ss, sse -AVX_INSTR cvtss2sd, sse2 -AVX_INSTR cvtss2si, sse -AVX_INSTR cvttpd2dq, sse2 -AVX_INSTR cvttps2dq, sse2 -AVX_INSTR cvttsd2si, sse2 -AVX_INSTR cvttss2si, sse -AVX_INSTR divpd, sse2, 1, 0, 0 -AVX_INSTR divps, sse, 1, 0, 0 -AVX_INSTR divsd, sse2, 1, 0, 0 -AVX_INSTR divss, sse, 1, 0, 0 -AVX_INSTR dppd, sse4, 1, 1, 0 -AVX_INSTR dpps, sse4, 1, 1, 0 -AVX_INSTR extractps, sse4 -AVX_INSTR haddpd, sse3, 1, 0, 0 -AVX_INSTR haddps, sse3, 1, 0, 0 -AVX_INSTR hsubpd, sse3, 1, 0, 0 -AVX_INSTR hsubps, sse3, 1, 0, 0 -AVX_INSTR insertps, sse4, 1, 1, 0 -AVX_INSTR lddqu, sse3 -AVX_INSTR ldmxcsr, sse -AVX_INSTR maskmovdqu, sse2 -AVX_INSTR maxpd, sse2, 1, 0, 1 -AVX_INSTR maxps, sse, 1, 0, 1 -AVX_INSTR maxsd, sse2, 1, 0, 1 -AVX_INSTR maxss, sse, 1, 0, 1 -AVX_INSTR minpd, sse2, 1, 0, 1 -AVX_INSTR minps, sse, 1, 0, 1 -AVX_INSTR minsd, sse2, 1, 0, 1 -AVX_INSTR minss, sse, 1, 0, 1 -AVX_INSTR movapd, sse2 -AVX_INSTR movaps, sse -AVX_INSTR movd, mmx -AVX_INSTR movddup, sse3 -AVX_INSTR movdqa, sse2 -AVX_INSTR movdqu, sse2 -AVX_INSTR movhlps, sse, 1, 0, 0 -AVX_INSTR movhpd, sse2, 1, 0, 0 -AVX_INSTR movhps, sse, 1, 0, 0 -AVX_INSTR movlhps, sse, 1, 0, 0 -AVX_INSTR movlpd, sse2, 1, 0, 0 -AVX_INSTR movlps, sse, 1, 0, 0 -AVX_INSTR movmskpd, sse2 -AVX_INSTR movmskps, sse -AVX_INSTR movntdq, sse2 -AVX_INSTR movntdqa, sse4 -AVX_INSTR movntpd, sse2 -AVX_INSTR movntps, sse -AVX_INSTR movq, mmx -AVX_INSTR movsd, sse2, 1, 0, 0 -AVX_INSTR movshdup, sse3 -AVX_INSTR movsldup, sse3 -AVX_INSTR movss, sse, 1, 0, 0 -AVX_INSTR movupd, sse2 -AVX_INSTR movups, sse -AVX_INSTR mpsadbw, sse4 -AVX_INSTR mulpd, sse2, 1, 0, 1 -AVX_INSTR mulps, sse, 1, 0, 1 -AVX_INSTR mulsd, sse2, 1, 0, 1 -AVX_INSTR mulss, sse, 1, 0, 1 -AVX_INSTR orpd, sse2, 1, 0, 1 -AVX_INSTR orps, sse, 1, 0, 1 -AVX_INSTR pabsb, ssse3 -AVX_INSTR pabsd, ssse3 -AVX_INSTR pabsw, ssse3 -AVX_INSTR packsswb, mmx, 0, 0, 0 -AVX_INSTR packssdw, mmx, 0, 0, 0 -AVX_INSTR packuswb, mmx, 0, 0, 0 -AVX_INSTR packusdw, sse4, 0, 0, 0 -AVX_INSTR paddb, mmx, 0, 0, 1 -AVX_INSTR paddw, mmx, 0, 0, 1 -AVX_INSTR paddd, mmx, 0, 0, 1 -AVX_INSTR paddq, sse2, 0, 0, 1 -AVX_INSTR paddsb, mmx, 0, 0, 1 -AVX_INSTR paddsw, mmx, 0, 0, 1 -AVX_INSTR paddusb, mmx, 0, 0, 1 -AVX_INSTR paddusw, mmx, 0, 0, 1 -AVX_INSTR palignr, ssse3 -AVX_INSTR pand, mmx, 0, 0, 1 -AVX_INSTR pandn, mmx, 0, 0, 0 -AVX_INSTR pavgb, mmx2, 0, 0, 1 -AVX_INSTR pavgw, mmx2, 0, 0, 1 -AVX_INSTR pblendvb, sse4, 0, 0, 0 -AVX_INSTR pblendw, sse4 -AVX_INSTR pclmulqdq -AVX_INSTR pcmpestri, sse42 -AVX_INSTR pcmpestrm, sse42 -AVX_INSTR pcmpistri, sse42 -AVX_INSTR pcmpistrm, sse42 -AVX_INSTR pcmpeqb, mmx, 0, 0, 1 -AVX_INSTR pcmpeqw, mmx, 0, 0, 1 -AVX_INSTR pcmpeqd, mmx, 0, 0, 1 -AVX_INSTR pcmpeqq, sse4, 0, 0, 1 -AVX_INSTR pcmpgtb, mmx, 0, 0, 0 -AVX_INSTR pcmpgtw, mmx, 0, 0, 0 -AVX_INSTR pcmpgtd, mmx, 0, 0, 0 -AVX_INSTR pcmpgtq, sse42, 0, 0, 0 -AVX_INSTR pextrb, sse4 -AVX_INSTR pextrd, sse4 -AVX_INSTR pextrq, sse4 -AVX_INSTR pextrw, mmx2 -AVX_INSTR phaddw, ssse3, 0, 0, 0 -AVX_INSTR phaddd, ssse3, 0, 0, 0 -AVX_INSTR phaddsw, ssse3, 0, 0, 0 -AVX_INSTR phminposuw, sse4 -AVX_INSTR phsubw, ssse3, 0, 0, 0 -AVX_INSTR phsubd, ssse3, 0, 0, 0 -AVX_INSTR phsubsw, ssse3, 0, 0, 0 -AVX_INSTR pinsrb, sse4 -AVX_INSTR pinsrd, sse4 -AVX_INSTR pinsrq, sse4 -AVX_INSTR pinsrw, mmx2 -AVX_INSTR pmaddwd, mmx, 0, 0, 1 -AVX_INSTR pmaddubsw, ssse3, 0, 0, 0 -AVX_INSTR pmaxsb, sse4, 0, 0, 1 -AVX_INSTR pmaxsw, mmx2, 0, 0, 1 -AVX_INSTR pmaxsd, sse4, 0, 0, 1 -AVX_INSTR pmaxub, mmx2, 0, 0, 1 -AVX_INSTR pmaxuw, sse4, 0, 0, 1 -AVX_INSTR pmaxud, sse4, 0, 0, 1 -AVX_INSTR pminsb, sse4, 0, 0, 1 -AVX_INSTR pminsw, mmx2, 0, 0, 1 -AVX_INSTR pminsd, sse4, 0, 0, 1 -AVX_INSTR pminub, mmx2, 0, 0, 1 -AVX_INSTR pminuw, sse4, 0, 0, 1 -AVX_INSTR pminud, sse4, 0, 0, 1 -AVX_INSTR pmovmskb, mmx2 -AVX_INSTR pmovsxbw, sse4 -AVX_INSTR pmovsxbd, sse4 -AVX_INSTR pmovsxbq, sse4 -AVX_INSTR pmovsxwd, sse4 -AVX_INSTR pmovsxwq, sse4 -AVX_INSTR pmovsxdq, sse4 -AVX_INSTR pmovzxbw, sse4 -AVX_INSTR pmovzxbd, sse4 -AVX_INSTR pmovzxbq, sse4 -AVX_INSTR pmovzxwd, sse4 -AVX_INSTR pmovzxwq, sse4 -AVX_INSTR pmovzxdq, sse4 -AVX_INSTR pmuldq, sse4, 0, 0, 1 -AVX_INSTR pmulhrsw, ssse3, 0, 0, 1 -AVX_INSTR pmulhuw, mmx2, 0, 0, 1 -AVX_INSTR pmulhw, mmx, 0, 0, 1 -AVX_INSTR pmullw, mmx, 0, 0, 1 -AVX_INSTR pmulld, sse4, 0, 0, 1 -AVX_INSTR pmuludq, sse2, 0, 0, 1 -AVX_INSTR por, mmx, 0, 0, 1 -AVX_INSTR psadbw, mmx2, 0, 0, 1 -AVX_INSTR pshufb, ssse3, 0, 0, 0 -AVX_INSTR pshufd, sse2 -AVX_INSTR pshufhw, sse2 -AVX_INSTR pshuflw, sse2 -AVX_INSTR psignb, ssse3, 0, 0, 0 -AVX_INSTR psignw, ssse3, 0, 0, 0 -AVX_INSTR psignd, ssse3, 0, 0, 0 -AVX_INSTR psllw, mmx, 0, 0, 0 -AVX_INSTR pslld, mmx, 0, 0, 0 -AVX_INSTR psllq, mmx, 0, 0, 0 -AVX_INSTR pslldq, sse2, 0, 0, 0 -AVX_INSTR psraw, mmx, 0, 0, 0 -AVX_INSTR psrad, mmx, 0, 0, 0 -AVX_INSTR psrlw, mmx, 0, 0, 0 -AVX_INSTR psrld, mmx, 0, 0, 0 -AVX_INSTR psrlq, mmx, 0, 0, 0 -AVX_INSTR psrldq, sse2, 0, 0, 0 -AVX_INSTR psubb, mmx, 0, 0, 0 -AVX_INSTR psubw, mmx, 0, 0, 0 -AVX_INSTR psubd, mmx, 0, 0, 0 -AVX_INSTR psubq, sse2, 0, 0, 0 -AVX_INSTR psubsb, mmx, 0, 0, 0 -AVX_INSTR psubsw, mmx, 0, 0, 0 -AVX_INSTR psubusb, mmx, 0, 0, 0 -AVX_INSTR psubusw, mmx, 0, 0, 0 -AVX_INSTR ptest, sse4 -AVX_INSTR punpckhbw, mmx, 0, 0, 0 -AVX_INSTR punpckhwd, mmx, 0, 0, 0 -AVX_INSTR punpckhdq, mmx, 0, 0, 0 -AVX_INSTR punpckhqdq, sse2, 0, 0, 0 -AVX_INSTR punpcklbw, mmx, 0, 0, 0 -AVX_INSTR punpcklwd, mmx, 0, 0, 0 -AVX_INSTR punpckldq, mmx, 0, 0, 0 -AVX_INSTR punpcklqdq, sse2, 0, 0, 0 -AVX_INSTR pxor, mmx, 0, 0, 1 -AVX_INSTR rcpps, sse, 1, 0, 0 -AVX_INSTR rcpss, sse, 1, 0, 0 -AVX_INSTR roundpd, sse4 -AVX_INSTR roundps, sse4 -AVX_INSTR roundsd, sse4 -AVX_INSTR roundss, sse4 -AVX_INSTR rsqrtps, sse, 1, 0, 0 -AVX_INSTR rsqrtss, sse, 1, 0, 0 -AVX_INSTR shufpd, sse2, 1, 1, 0 -AVX_INSTR shufps, sse, 1, 1, 0 -AVX_INSTR sqrtpd, sse2, 1, 0, 0 -AVX_INSTR sqrtps, sse, 1, 0, 0 -AVX_INSTR sqrtsd, sse2, 1, 0, 0 -AVX_INSTR sqrtss, sse, 1, 0, 0 -AVX_INSTR stmxcsr, sse -AVX_INSTR subpd, sse2, 1, 0, 0 -AVX_INSTR subps, sse, 1, 0, 0 -AVX_INSTR subsd, sse2, 1, 0, 0 -AVX_INSTR subss, sse, 1, 0, 0 -AVX_INSTR ucomisd, sse2 -AVX_INSTR ucomiss, sse -AVX_INSTR unpckhpd, sse2, 1, 0, 0 -AVX_INSTR unpckhps, sse, 1, 0, 0 -AVX_INSTR unpcklpd, sse2, 1, 0, 0 -AVX_INSTR unpcklps, sse, 1, 0, 0 -AVX_INSTR xorpd, sse2, 1, 0, 1 -AVX_INSTR xorps, sse, 1, 0, 1 - -; 3DNow instructions, for sharing code between AVX, SSE and 3DN -AVX_INSTR pfadd, 3dnow, 1, 0, 1 -AVX_INSTR pfsub, 3dnow, 1, 0, 0 -AVX_INSTR pfmul, 3dnow, 1, 0, 1 - -; base-4 constants for shuffles -%assign i 0 -%rep 256 - %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) - %if j < 10 - CAT_XDEFINE q000, j, i - %elif j < 100 - CAT_XDEFINE q00, j, i - %elif j < 1000 - CAT_XDEFINE q0, j, i - %else - CAT_XDEFINE q, j, i - %endif - %assign i i+1 -%endrep -%undef i -%undef j - -%macro FMA_INSTR 3 - %macro %1 4-7 %1, %2, %3 - %if cpuflag(xop) - v%5 %1, %2, %3, %4 - %elifnidn %1, %4 - %6 %1, %2, %3 - %7 %1, %4 - %else - %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported - %endif - %endmacro -%endmacro - -FMA_INSTR pmacsww, pmullw, paddw -FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation -FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation -FMA_INSTR pmadcswd, pmaddwd, paddd - -; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax. -; FMA3 is only possible if dst is the same as one of the src registers. -; Either src2 or src3 can be a memory operand. -%macro FMA4_INSTR 2-* - %push fma4_instr - %xdefine %$prefix %1 - %rep %0 - 1 - %macro %$prefix%2 4-6 %$prefix, %2 - %if notcpuflag(fma3) && notcpuflag(fma4) - %error use of ``%5%6'' fma instruction in cpuname function: current_function - %elif cpuflag(fma4) - v%5%6 %1, %2, %3, %4 - %elifidn %1, %2 - ; If %3 or %4 is a memory operand it needs to be encoded as the last operand. - %ifid %3 - v%{5}213%6 %2, %3, %4 - %else - v%{5}132%6 %2, %4, %3 - %endif - %elifidn %1, %3 - v%{5}213%6 %3, %2, %4 - %elifidn %1, %4 - v%{5}231%6 %4, %2, %3 - %else - %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported - %endif - %endmacro - %rotate 1 - %endrep - %pop -%endmacro - -FMA4_INSTR fmadd, pd, ps, sd, ss -FMA4_INSTR fmaddsub, pd, ps -FMA4_INSTR fmsub, pd, ps, sd, ss -FMA4_INSTR fmsubadd, pd, ps -FMA4_INSTR fnmadd, pd, ps, sd, ss -FMA4_INSTR fnmsub, pd, ps, sd, ss - -; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0) -%ifdef __YASM_VER__ - %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0 - %macro vpbroadcastq 2 - %if sizeof%1 == 16 - movddup %1, %2 - %else - vbroadcastsd %1, %2 - %endif - %endmacro - %endif -%endif -- cgit v1.2.3