From 1d686c3a23f3ae286ef964ab62199be96e4ad1dc Mon Sep 17 00:00:00 2001 From: Lexyan Date: Sat, 3 Sep 2016 15:38:08 +0200 Subject: Add aarch64 assembly optimization (ARMv8a 64 bits) The fixmuldiv functions don't need inline assembly to be fast on this architecture; the compilers (both Clang and GCC) figure out the optimal instructions on their own (a two-instruction sequence), and when the compiler emits the instructions itself instead of using inline assembly, it is able to interleave them with other instructions, improving scheduling and making the code even faster than with inline assembly. Overall, this gives about a 50% speedup. --- libFDK/include/FDK_archdef.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'libFDK/include/FDK_archdef.h') diff --git a/libFDK/include/FDK_archdef.h b/libFDK/include/FDK_archdef.h index a831727..3aede59 100644 --- a/libFDK/include/FDK_archdef.h +++ b/libFDK/include/FDK_archdef.h @@ -198,6 +198,14 @@ amm-info@iis.fraunhofer.de #undef POW2COEFF_16BIT #undef LDCOEFF_16BIT +#elif defined(__aarch64__) || defined(__AARCH64EL__) +#define ARCH_PREFER_MULT_32x32 +#define ARCH_PREFER_MULT_32x16 +#define SINETABLE_16BIT +#define POW2COEFF_16BIT +#define LDCOEFF_16BIT +#define WINDOWTABLE_16BIT + #elif defined(__x86__) /* cppp replaced: elif */ #define ARCH_PREFER_MULT_32x16 #define SINETABLE_16BIT -- cgit v1.2.3