diff options
author | Dave Burke <daveburke@google.com> | 2012-04-17 09:51:45 -0700 |
---|---|---|
committer | Dave Burke <daveburke@google.com> | 2012-04-17 23:04:43 -0700 |
commit | 9bf37cc9712506b2483650c82d3c41152337ef7e (patch) | |
tree | 77db44e2bae06e3d144b255628be2b7a55c581d3 /libFDK/src/arm | |
parent | a37315fe10ee143d6d0b28c19d41a476a23e63ea (diff) | |
download | fdk-aac-9bf37cc9712506b2483650c82d3c41152337ef7e.tar.gz fdk-aac-9bf37cc9712506b2483650c82d3c41152337ef7e.tar.bz2 fdk-aac-9bf37cc9712506b2483650c82d3c41152337ef7e.zip |
Fraunhofer AAC codec.
License boilerplate update to follow.
Change-Id: I2810460c11a58b6d148d84673cc031f3685e79b5
Diffstat (limited to 'libFDK/src/arm')
-rw-r--r-- | libFDK/src/arm/autocorr2nd.cpp | 33 | ||||
-rw-r--r-- | libFDK/src/arm/dct_arm.cpp | 395 | ||||
-rw-r--r-- | libFDK/src/arm/fft_rad2_arm.cpp | 259 | ||||
-rw-r--r-- | libFDK/src/arm/qmf_arm.cpp | 710 | ||||
-rw-r--r-- | libFDK/src/arm/scale_arm.cpp | 110 |
5 files changed, 1507 insertions, 0 deletions
diff --git a/libFDK/src/arm/autocorr2nd.cpp b/libFDK/src/arm/autocorr2nd.cpp new file mode 100644 index 0000000..85926af --- /dev/null +++ b/libFDK/src/arm/autocorr2nd.cpp @@ -0,0 +1,33 @@ +/**************************************************************************** + + (C) Copyright Fraunhofer IIS (2006) + All Rights Reserved + + Please be advised that this software and/or program delivery is + Confidential Information of Fraunhofer and subject to and covered by the + + Fraunhofer IIS Software Evaluation Agreement + between Google Inc. and Fraunhofer + effective and in full force since March 1, 2012. + + You may use this software and/or program only under the terms and + conditions described in the above mentioned Fraunhofer IIS Software + Evaluation Agreement. Any other and/or further use requires a separate agreement. + + + This software and/or program is protected by copyright law and international + treaties. Any reproduction or distribution of this software and/or program, + or any portion of it, may result in severe civil and criminal penalties, and + will be prosecuted to the maximum extent possible under law. + + $Id$ + +*******************************************************************************/ + +/*! + * + * \brief Calculate second order autocorrelation + * + */ + + diff --git a/libFDK/src/arm/dct_arm.cpp b/libFDK/src/arm/dct_arm.cpp new file mode 100644 index 0000000..dd0ca09 --- /dev/null +++ b/libFDK/src/arm/dct_arm.cpp @@ -0,0 +1,395 @@ +/**************************************************************************** + + (C) copyright Fraunhofer IIS (2004) + All Rights Reserved + + Please be advised that this software and/or program delivery is + Confidential Information of Fraunhofer and subject to and covered by the + + Fraunhofer IIS Software Evaluation Agreement + between Google Inc. and Fraunhofer + effective and in full force since March 1, 2012. 
+ + You may use this software and/or program only under the terms and + conditions described in the above mentioned Fraunhofer IIS Software + Evaluation Agreement. Any other and/or further use requires a separate agreement. + + + + $Id$ + +***************************************************************************/ + + + +#ifdef FUNCTION_dct_IV_func1 + +/* + Note: This assembler routine is here, because the ARM926 compiler does + not encode the inline assembler with optimal speed. + With this version, we save 2 cycles per loop iteration. +*/ + +__asm void dct_IV_func1( + int i, + const FIXP_SPK *twiddle, + FIXP_DBL *RESTRICT pDat_0, + FIXP_DBL *RESTRICT pDat_1) +{ + /* Register map: + r0 i + r1 twiddle + r2 pDat_0 + r3 pDat_1 + r4 accu1 + r5 accu2 + r6 accu3 + r7 accu4 + r8 val_tw + r9 accuX + */ + PUSH {r4-r9} + + /* 44 cycles for 2 iterations = 22 cycles/iteration. The body is written out twice below and r0 (i) is decremented once per pass, after both copies, so i counts double iterations and must be non-zero on entry. */ +dct_IV_loop1_start +/* First iteration */ + LDR r8, [r1], #4 // val_tw = *twiddle++; + LDR r5, [r2, #0] // accu2 = pDat_0[0] + LDR r4, [r3, #0] // accu1 = pDat_1[0] + + SMULWT r9, r5, r8 // accuX = accu2*val_tw.l + SMULWB r5, r5, r8 // accu2 = accu2*val_tw.h + RSB r9, r9, #0 // accuX =-accu2*val_tw.l + SMLAWT r5, r4, r8, r5 // accu2 = accu2*val_tw.h + accu1*val_tw.l + SMLAWB r4, r4, r8, r9 // accu1 = accu1*val_tw.h - accu2*val_tw.l + + LDR r8, [r1], #4 // val_tw = *twiddle++; + LDR r7, [r3, #-4] // accu4 = pDat_1[-1] + LDR r6, [r2, #4] // accu3 = pDat_0[1] + + SMULWB r9, r7, r8 // accuX = accu4*val_tw.h + SMULWT r7, r7, r8 // accu4 = accu4*val_tw.l + RSB r9, r9, #0 // accuX =-accu4*val_tw.h + SMLAWB r7, r6, r8, r7 // accu4 = accu4*val_tw.l+accu3*val_tw.h + SMLAWT r6, r6, r8, r9 // accu3 = accu3*val_tw.l-accu4*val_tw.h + + STR r5, [r2], #4 // *pDat_0++ = accu2 + STR r4, [r2], #4 // *pDat_0++ = accu1 + STR r6, [r3], #-4 // *pDat_1-- = accu3 + STR r7, [r3], #-4 // *pDat_1-- = accu4 + +/* Second iteration */ + LDR r8, [r1], #4 // val_tw = *twiddle++; + LDR r5, [r2, #0] // accu2 = pDat_0[0] + LDR r4,
[r3, #0] // accu1 = pDat_1[0] + + SMULWT r9, r5, r8 // accuX = accu2*val_tw.l + SMULWB r5, r5, r8 // accu2 = accu2*val_tw.h + RSB r9, r9, #0 // accuX =-accu2*val_tw.l + SMLAWT r5, r4, r8, r5 // accu2 = accu2*val_tw.h + accu1*val_tw.l + SMLAWB r4, r4, r8, r9 // accu1 = accu1*val_tw.h - accu2*val_tw.l + + LDR r8, [r1], #4 // val_tw = *twiddle++; + LDR r7, [r3, #-4] // accu4 = pDat_1[-1] + LDR r6, [r2, #4] // accu3 = pDat_0[1] + + SMULWB r9, r7, r8 // accuX = accu4*val_tw.h + SMULWT r7, r7, r8 // accu4 = accu4*val_tw.l + RSB r9, r9, #0 // accuX =-accu4*val_tw.h + SMLAWB r7, r6, r8, r7 // accu4 = accu4*val_tw.l+accu3*val_tw.h + SMLAWT r6, r6, r8, r9 // accu3 = accu3*val_tw.l-accu4*val_tw.h + + STR r5, [r2], #4 // *pDat_0++ = accu2 + STR r4, [r2], #4 // *pDat_0++ = accu1 + STR r6, [r3], #-4 // *pDat_1-- = accu3 + STR r7, [r3], #-4 // *pDat_1-- = accu4 + + SUBS r0, r0, #1 + BNE dct_IV_loop1_start + + POP {r4-r9} + + BX lr +} + +#endif /* FUNCTION_dct_IV_func1 */ + + +#ifdef FUNCTION_dct_IV_func2 + +FDK_INLINE +/* __attribute__((noinline)) */ +static void dct_IV_func2( + int i, + const FIXP_SPK *twiddle, + FIXP_DBL *pDat_0, + FIXP_DBL *pDat_1, + int inc) +{ + FIXP_DBL accu1, accu2, accu3, accu4, accuX; + LONG val_tw; + + accu1 = pDat_1[-2]; + accu2 = pDat_1[-1]; + + *--pDat_1 = -(pDat_0[1]>>1); + *pDat_0++ = (pDat_0[0]>>1); /* NOTE(review): pDat_0 is incremented and read in the same expression with no sequencing between the two (undefined before C++17); the intended effect appears to be pDat_0[0] >>= 1 followed by pDat_0++ - confirm. */ + + twiddle += inc; + +__asm + { + LDR val_tw, [twiddle], inc, LSL #2 // val_tw = *twiddle; twiddle += inc + B dct_IV_loop2_2nd_part + + /* 42 cycles for 2 iterations = 21 cycles/iteration */ +dct_IV_loop2: + SMULWT accuX, accu2, val_tw + SMULWB accu2, accu2, val_tw + RSB accuX, accuX, #0 + SMLAWB accuX, accu1, val_tw, accuX + SMLAWT accu2, accu1, val_tw, accu2 + STR accuX, [pDat_0], #4 + STR accu2, [pDat_1, #-4] !
+ + LDR accu4, [pDat_0, #4] + LDR accu3, [pDat_0] + SMULWB accuX, accu4, val_tw + SMULWT accu4, accu4, val_tw + RSB accuX, accuX, #0 + SMLAWT accuX, accu3, val_tw, accuX + SMLAWB accu4, accu3, val_tw, accu4 + + LDR accu1, [pDat_1, #-8] + LDR accu2, [pDat_1, #-4] + + LDR val_tw, [twiddle], inc, LSL #2 // val_tw = *twiddle; twiddle += inc + + STR accuX, [pDat_1, #-4] ! + STR accu4, [pDat_0], #4 + +dct_IV_loop2_2nd_part: + SMULWT accuX, accu2, val_tw + SMULWB accu2, accu2, val_tw + RSB accuX, accuX, #0 + SMLAWB accuX, accu1, val_tw, accuX + SMLAWT accu2, accu1, val_tw, accu2 + STR accuX, [pDat_0], #4 + STR accu2, [pDat_1, #-4] ! + + LDR accu4, [pDat_0, #4] + LDR accu3, [pDat_0] + SMULWB accuX, accu4, val_tw + SMULWT accu4, accu4, val_tw + RSB accuX, accuX, #0 + SMLAWT accuX, accu3, val_tw, accuX + SMLAWB accu4, accu3, val_tw, accu4 + + LDR accu1, [pDat_1, #-8] + LDR accu2, [pDat_1, #-4] + + STR accuX, [pDat_1, #-4] ! + STR accu4, [pDat_0], #4 + + LDR val_tw, [twiddle], inc, LSL #2 // val_tw = *twiddle; twiddle += inc + + SUBS i, i, #1 + BNE dct_IV_loop2 + } + + /* Last Sin and Cos value pair are the same */ + accu1 = fMultDiv2(accu1, WTC(0x5a82799a)); + accu2 = fMultDiv2(accu2, WTC(0x5a82799a)); + + *--pDat_1 = accu1 + accu2; + *pDat_0++ = accu1 - accu2; +} +#endif /* FUNCTION_dct_IV_func2 */ + + +#ifdef FUNCTION_dst_IV_func1 + +__asm void dst_IV_func1( + int i, + const FIXP_SPK *twiddle, + FIXP_DBL *pDat_0, + FIXP_DBL *pDat_1) +{ + /* Register map: + r0 i + r1 twiddle + r2 pDat_0 + r3 pDat_1 + r4 accu1 + r5 accu2 + r6 accu3 + r7 accu4 + r8 val_tw + r9 accuX. The loop body below is written out twice; i is reduced by 4 per pass, after the body, and the loop ends only when it reaches exactly zero. + */ + PUSH {r4-r9} + +dst_IV_loop1 + LDR r8, [r1], #4 // val_tw = *twiddle++ + LDR r5, [r2] // accu2 = pDat_0[0] + LDR r6, [r2, #4] // accu3 = pDat_0[1] + RSB r5, r5, #0 // accu2 = -accu2 + SMULWT r9, r5, r8 // accuX = (-accu2)*val_tw.l + LDR r4, [r3, #-4] // accu1 = pDat_1[-1] + RSB r9, r9, #0 // accuX = -(-accu2)*val_tw.l + SMLAWB r9, r4, r8, r9 // accuX = accu1*val_tw.h-(-accu2)*val_tw.l + SMULWT r4, r4, r8 //
accu1 = accu1*val_tw.l + LDR r7, [r3, #-8] // accu4 = pDat_1[-2] + SMLAWB r5, r5, r8, r4 // accu2 = (-accu2)*val_tw.h+accu1*val_tw.l + LDR r8, [r1], #4 // val_tw = *twiddle++ + STR r5, [r2], #4 // *pDat_0++ = accu2 + STR r9, [r2], #4 // *pDat_0++ = accu1 (accuX) + RSB r7, r7, #0 // accu4 = -accu4 + SMULWB r5, r7, r8 // accu2 = (-accu4)*val_tw.h + SMULWB r4, r6, r8 // accu1 = accu3*val_tw.h + RSB r5, r5, #0 // accu2 = -(-accu4)*val_tw.h + SMLAWT r6, r6, r8, r5 // accu3 = accu3*val_tw.l-(-accu4)*val_tw.h + SMLAWT r7, r7, r8, r4 // accu4 = (-accu4)*val_tw.l+accu3*val_tw.h + STR r6, [r3, #-4] ! // *--pDat_1 = accu3 + STR r7, [r3, #-4] ! // *--pDat_1 = accu4 + + LDR r8, [r1], #4 // val_tw = *twiddle++ + LDR r5, [r2] // accu2 = pDat_0[0] + LDR r6, [r2, #4] // accu3 = pDat_0[1] + RSB r5, r5, #0 // accu2 = -accu2 + SMULWT r9, r5, r8 // accuX = (-accu2)*val_tw.l + LDR r4, [r3, #-4] // accu1 = pDat_1[-1] + RSB r9, r9, #0 // accuX = -(-accu2)*val_tw.l + SMLAWB r9, r4, r8, r9 // accuX = accu1*val_tw.h-(-accu2)*val_tw.l + SMULWT r4, r4, r8 // accu1 = accu1*val_tw.l + LDR r7, [r3, #-8] // accu4 = pDat_1[-2] + SMLAWB r5, r5, r8, r4 // accu2 = (-accu2)*val_tw.h+accu1*val_tw.l + LDR r8, [r1], #4 // val_tw = *twiddle++ + STR r5, [r2], #4 // *pDat_0++ = accu2 + STR r9, [r2], #4 // *pDat_0++ = accu1 (accuX) + RSB r7, r7, #0 // accu4 = -accu4 + SMULWB r5, r7, r8 // accu2 = (-accu4)*val_tw.h + SMULWB r4, r6, r8 // accu1 = accu3*val_tw.h + RSB r5, r5, #0 // accu2 = -(-accu4)*val_tw.h + SMLAWT r6, r6, r8, r5 // accu3 = accu3*val_tw.l-(-accu4)*val_tw.h + SMLAWT r7, r7, r8, r4 // accu4 = (-accu4)*val_tw.l+accu3*val_tw.h + STR r6, [r3, #-4] ! // *--pDat_1 = accu3 + STR r7, [r3, #-4] !
// *--pDat_1 = accu4 + + SUBS r0, r0, #4 // i-= 4 + BNE dst_IV_loop1 + + POP {r4-r9} + BX lr +} +#endif /* FUNCTION_dst_IV_func1 */ + +#ifdef FUNCTION_dst_IV_func2 + +FDK_INLINE +/* __attribute__((noinline)) */ +static void dst_IV_func2( + int i, + const FIXP_SPK *twiddle, + FIXP_DBL *RESTRICT pDat_0, + FIXP_DBL *RESTRICT pDat_1, + int inc) +{ + FIXP_DBL accu1,accu2,accu3,accu4; + LONG val_tw; + + accu4 = pDat_0[0]; + accu3 = pDat_0[1]; + accu4 >>= 1; + accu3 >>= 1; + accu4 = -accu4; + + accu1 = pDat_1[-1]; + accu2 = pDat_1[0]; + + *pDat_0++ = accu3; + *pDat_1-- = accu4; + + + __asm + { + B dst_IV_loop2_2nd_part + + /* 50 cycles for 2 iterations = 25 cycles/iteration */ + +dst_IV_loop2: + + LDR val_tw, [twiddle], inc, LSL #2 // val_tw = *twiddle; twiddle += inc + + RSB accu2, accu2, #0 // accu2 = -accu2 + RSB accu1, accu1, #0 // accu1 = -accu1 + SMULWT accu3, accu2, val_tw // accu3 = (-accu2)*val_tw.l + SMULWT accu4, accu1, val_tw // accu4 = (-accu1)*val_tw.l + RSB accu3, accu3, #0 // accu3 = -accu2*val_tw.l + SMLAWB accu1, accu1, val_tw, accu3 // accu1 = -accu1*val_tw.h-(-accu2)*val_tw.l + SMLAWB accu2, accu2, val_tw, accu4 // accu2 = (-accu1)*val_tw.l+(-accu2)*val_tw.h + STR accu1, [pDat_1], #-4 // *pDat_1-- = accu1 + STR accu2, [pDat_0], #4 // *pDat_0++ = accu2 + + LDR accu4, [pDat_0] // accu4 = pDat_0[0] + LDR accu3, [pDat_0, #4] // accu3 = pDat_0[1] + + RSB accu4, accu4, #0 // accu4 = -accu4 + RSB accu3, accu3, #0 // accu3 = -accu3 + + SMULWB accu1, accu3, val_tw // accu1 = (-accu3)*val_tw.h + SMULWT accu2, accu3, val_tw // accu2 = (-accu3)*val_tw.l + RSB accu1, accu1, #0 // accu1 = -(-accu3)*val_tw.h + SMLAWT accu3, accu4, val_tw, accu1 // accu3 = (-accu4)*val_tw.l-(-accu3)*val_tw.h + SMLAWB accu4, accu4, val_tw, accu2 // accu4 = (-accu3)*val_tw.l+(-accu4)*val_tw.h + + LDR accu1, [pDat_1, #-4] // accu1 = pDat_1[-1] + LDR accu2, [pDat_1] // accu2 = pDat_1[0] + + STR accu3, [pDat_0], #4 // *pDat_0++ = accu3 + STR accu4, [pDat_1], #-4 // *pDat_1-- = accu4 +
+dst_IV_loop2_2nd_part: + + LDR val_tw, [twiddle], inc, LSL #2 // val_tw = *twiddle; twiddle += inc + + RSB accu2, accu2, #0 // accu2 = -accu2 + RSB accu1, accu1, #0 // accu1 = -accu1 + SMULWT accu3, accu2, val_tw // accu3 = (-accu2)*val_tw.l + SMULWT accu4, accu1, val_tw // accu4 = (-accu1)*val_tw.l + RSB accu3, accu3, #0 // accu3 = -accu2*val_tw.l + SMLAWB accu1, accu1, val_tw, accu3 // accu1 = -accu1*val_tw.h-(-accu2)*val_tw.l + SMLAWB accu2, accu2, val_tw, accu4 // accu2 = (-accu1)*val_tw.l+(-accu2)*val_tw.h + STR accu1, [pDat_1], #-4 // *pDat_1-- = accu1 + STR accu2, [pDat_0], #4 // *pDat_0++ = accu2 + + LDR accu4, [pDat_0] // accu4 = pDat_0[0] + LDR accu3, [pDat_0, #4] // accu3 = pDat_0[1] + + RSB accu4, accu4, #0 // accu4 = -accu4 + RSB accu3, accu3, #0 // accu3 = -accu3 + + SMULWB accu1, accu3, val_tw // accu1 = (-accu3)*val_tw.h + SMULWT accu2, accu3, val_tw // accu2 = (-accu3)*val_tw.l + RSB accu1, accu1, #0 // accu1 = -(-accu3)*val_tw.h + SMLAWT accu3, accu4, val_tw, accu1 // accu3 = (-accu4)*val_tw.l-(-accu3)*val_tw.h + SMLAWB accu4, accu4, val_tw, accu2 // accu4 = (-accu3)*val_tw.l+(-accu4)*val_tw.h + + LDR accu1, [pDat_1, #-4] // accu1 = pDat_1[-1] + LDR accu2, [pDat_1] // accu2 = pDat_1[0] + + STR accu3, [pDat_0], #4 // *pDat_0++ = accu3 + STR accu4, [pDat_1], #-4 // *pDat_1-- = accu4 + + SUBS i, i, #1 + BNE dst_IV_loop2 + } + + /* Last Sin and Cos value pair are the same */ + accu1 = fMultDiv2(-accu1, WTC(0x5a82799a)); + accu2 = fMultDiv2(-accu2, WTC(0x5a82799a)); + + *pDat_0 = accu1 + accu2; + *pDat_1 = accu1 - accu2; +} +#endif /* FUNCTION_dst_IV_func2 */ diff --git a/libFDK/src/arm/fft_rad2_arm.cpp b/libFDK/src/arm/fft_rad2_arm.cpp new file mode 100644 index 0000000..f40961a --- /dev/null +++ b/libFDK/src/arm/fft_rad2_arm.cpp @@ -0,0 +1,259 @@ +/*************************** Fraunhofer IIS FDK Tools ********************** + + (C) Copyright Fraunhofer IIS (2005) + All Rights Reserved + + Please be advised that this software and/or program delivery 
is + Confidential Information of Fraunhofer and subject to and covered by the + + Fraunhofer IIS Software Evaluation Agreement + between Google Inc. and Fraunhofer + effective and in full force since March 1, 2012. + + You may use this software and/or program only under the terms and + conditions described in the above mentioned Fraunhofer IIS Software + Evaluation Agreement. Any other and/or further use requires a separate agreement. + + + $Id$ + Author(s): + Description: dit_fft ARM assembler replacements. + + This software and/or program is protected by copyright law and international + treaties. Any reproduction or distribution of this software and/or program, + or any portion of it, may result in severe civil and criminal penalties, and + will be prosecuted to the maximum extent possible under law. + +******************************************************************************/ + +/* NEON optimized FFT currently builds only with RVCT toolchain */ + +#ifndef FUNCTION_dit_fft + +/* If dit_fft was not yet defined by ARM-Cortex ... 
*/ + + #if defined(SINETABLE_16BIT) + + #define FUNCTION_dit_fft + + /***************************************************************************** + + date: 28.07.2005 srl + + Contents/description: dit-tukey-FFT-algorithm + + ******************************************************************************/ + + #if defined(FUNCTION_dit_fft) + + + void dit_fft(FIXP_DBL *x, const INT ldn, const FIXP_STP *trigdata, const INT trigDataSize) + { + const INT n=1<<ldn; /* number of complex points; x is accessed below as 2*n interleaved Re/Im values and overwritten with the result */ + INT i; + + scramble(x,n); + /* + * 1+2 stage radix 4 + */ + + for (i=0;i<n*2;i+=8) + { + FIXP_DBL a00, a10, a20, a30; + a00 = (x[i + 0] + x[i + 2])>>1; /* Re A + Re B */ + a10 = (x[i + 4] + x[i + 6])>>1; /* Re C + Re D */ + a20 = (x[i + 1] + x[i + 3])>>1; /* Im A + Im B */ + a30 = (x[i + 5] + x[i + 7])>>1; /* Im C + Im D */ + + x[i + 0] = a00 + a10; /* Re A' = Re A + Re B + Re C + Re D */ + x[i + 4] = a00 - a10; /* Re C' = Re A + Re B - Re C - Re D */ + x[i + 1] = a20 + a30; /* Im A' = Im A + Im B + Im C + Im D */ + x[i + 5] = a20 - a30; /* Im C' = Im A + Im B - Im C - Im D */ + + a00 = a00 - x[i + 2]; /* Re A - Re B */ + a10 = a10 - x[i + 6]; /* Re C - Re D */ + a20 = a20 - x[i + 3]; /* Im A - Im B */ + a30 = a30 - x[i + 7]; /* Im C - Im D */ + + x[i + 2] = a00 + a30; /* Re B' = Re A - Re B + Im C - Im D */ + x[i + 6] = a00 - a30; /* Re D' = Re A - Re B - Im C + Im D */ + x[i + 3] = a20 - a10; /* Im B' = Im A - Im B - Re C + Re D */ + x[i + 7] = a20 + a10; /* Im D' = Im A - Im B + Re C - Re D */ + } + + INT mh = 1 << 1; + INT ldm = ldn - 2; /* passes of the do-while below; it tests --ldm != 0 only after the body, so ldn must be at least 3 */ + INT trigstep = trigDataSize; + + do + { + const FIXP_STP *pTrigData = trigdata; + INT j; + + mh <<= 1; + trigstep >>= 1; + + FDK_ASSERT(trigstep > 0); + + /* Do first iteration with c=1.0 and s=0.0 separately to avoid losing too much precision. + Beware: The impact on the overall FFT precision is rather large.
*/ + { + FIXP_DBL *xt1 = x; + int r = n; + + do { + FIXP_DBL *xt2 = xt1 + (mh<<1); + /* + FIXP_DBL *xt1 = x+ ((r)<<1); + FIXP_DBL *xt2 = xt1 + (mh<<1); + */ + FIXP_DBL vr,vi,ur,ui; + + //cplxMultDiv2(&vi, &vr, x[t2+1], x[t2], (FIXP_SGL)1.0, (FIXP_SGL)0.0); + vi = xt2[1]>>1; + vr = xt2[0]>>1; + + ur = xt1[0]>>1; + ui = xt1[1]>>1; + + xt1[0] = ur+vr; + xt1[1] = ui+vi; + + xt2[0] = ur-vr; + xt2[1] = ui-vi; + + xt1 += mh; + xt2 += mh; + + //cplxMultDiv2(&vr, &vi, x[t2+1], x[t2], (FIXP_SGL)1.0, (FIXP_SGL)0.0); + vr = xt2[1]>>1; + vi = xt2[0]>>1; + + ur = xt1[0]>>1; + ui = xt1[1]>>1; + + xt1[0] = ur+vr; + xt1[1] = ui-vi; + + xt2[0] = ur-vr; + xt2[1] = ui+vi; + + xt1 = xt2 + mh; + } while ((r=r-(mh<<1)) != 0); + } + for(j=4; j<mh; j+=4) + { + FIXP_DBL *xt1 = x + (j>>1); + FIXP_SPK cs; + int r = n; + + pTrigData += trigstep; + cs = *pTrigData; + + do + { + FIXP_DBL *xt2 = xt1 + (mh<<1); + FIXP_DBL vr,vi,ur,ui; + + cplxMultDiv2(&vi, &vr, xt2[1], xt2[0], cs); + + ur = xt1[0]>>1; + ui = xt1[1]>>1; + + xt1[0] = ur+vr; + xt1[1] = ui+vi; + + xt2[0] = ur-vr; + xt2[1] = ui-vi; + + xt1 += mh; + xt2 += mh; + + cplxMultDiv2(&vr, &vi, xt2[1], xt2[0], cs); + + ur = xt1[0]>>1; + ui = xt1[1]>>1; + + xt1[0] = ur+vr; + xt1[1] = ui-vi; + + xt2[0] = ur-vr; + xt2[1] = ui+vi; + + /* Same as above but for t1,t2 with j>mh/4 and thus cs swapped */ + xt1 = xt1 - (j); + xt2 = xt1 + (mh<<1); + + cplxMultDiv2(&vi, &vr, xt2[0], xt2[1], cs); + + ur = xt1[0]>>1; + ui = xt1[1]>>1; + + xt1[0] = ur+vr; + xt1[1] = ui-vi; + + xt2[0] = ur-vr; + xt2[1] = ui+vi; + + xt1 += mh; + xt2 += mh; + + cplxMultDiv2(&vr, &vi, xt2[0], xt2[1], cs); + + ur = xt1[0]>>1; + ui = xt1[1]>>1; + + xt1[0] = ur-vr; + xt1[1] = ui-vi; + + xt2[0] = ur+vr; + xt2[1] = ui+vi; + + xt1 = xt2 + (j); + } while ((r=r-(mh<<1)) != 0); + } + { + FIXP_DBL *xt1 = x + (mh>>1); + int r = n; + + do + { + FIXP_DBL *xt2 = xt1 + (mh<<1); + FIXP_DBL vr,vi,ur,ui; + + cplxMultDiv2(&vi, &vr, xt2[1], xt2[0], STC(0x5a82799a), STC(0x5a82799a)); + + ur =
xt1[0]>>1; + ui = xt1[1]>>1; + + xt1[0] = ur+vr; + xt1[1] = ui+vi; + + xt2[0] = ur-vr; + xt2[1] = ui-vi; + + xt1 += mh; + xt2 += mh; + + cplxMultDiv2(&vr, &vi, xt2[1], xt2[0], STC(0x5a82799a), STC(0x5a82799a)); + + ur = xt1[0]>>1; + ui = xt1[1]>>1; + + xt1[0] = ur+vr; + xt1[1] = ui-vi; + + xt2[0] = ur-vr; + xt2[1] = ui+vi; + + xt1 = xt2 + mh; + } while ((r=r-(mh<<1)) != 0); + } + } while (--ldm != 0); +} + +#endif /* if defined(FUNCTION_dit_fft) */ + +#endif /* if defined(SINETABLE_16BIT) */ + +#endif /* ifndef FUNCTION_dit_fft */ diff --git a/libFDK/src/arm/qmf_arm.cpp b/libFDK/src/arm/qmf_arm.cpp new file mode 100644 index 0000000..df538a4 --- /dev/null +++ b/libFDK/src/arm/qmf_arm.cpp @@ -0,0 +1,710 @@ +/**************************************************************************** + + (C) Copyright Fraunhofer IIS (2004) + All Rights Reserved + + Please be advised that this software and/or program delivery is + Confidential Information of Fraunhofer and subject to and covered by the + + Fraunhofer IIS Software Evaluation Agreement + between Google Inc. and Fraunhofer + effective and in full force since March 1, 2012. + + You may use this software and/or program only under the terms and + conditions described in the above mentioned Fraunhofer IIS Software + Evaluation Agreement. Any other and/or further use requires a separate agreement. + + + This software and/or program is protected by copyright law and international + treaties. Any reproduction or distribution of this software and/or program, + or any portion of it, may result in severe civil and criminal penalties, and + will be prosecuted to the maximum extent possible under law. + + $Id$ + + History: 04-NOV-2009 A.
Tritthart Optimized qmfSynPrototypeFirSlot1 + +****************************************************************************/ +#if (QMF_NO_POLY==5) + +#define FUNCTION_qmfForwardModulationLP_odd + +#ifdef FUNCTION_qmfForwardModulationLP_odd /* With L = anaQmf->no_channels, M = L/2 and shift = (L>>6)+1, the function below sets, for i = 0..M-1, rSubband[M+i] = (timeIn[L-1-i]>>1) - (timeIn[i]>>shift) and rSubband[M-1-i] = (timeIn[L+i]>>1) + (timeIn[2*L-1-i]>>shift), then hands rSubband and L to dct_IV(). */ +static void +qmfForwardModulationLP_odd( HANDLE_QMF_FILTER_BANK anaQmf, /*!< Handle of Qmf Analysis Bank */ + const FIXP_QMF *timeIn, /*!< Time Signal */ + FIXP_QMF *rSubband ) /*!< Real Output */ +{ + int i; + int L = anaQmf->no_channels; + int M = L>>1; + int shift = (anaQmf->no_channels>>6) + 1; + int rSubband_e = 0; /* passed by address to dct_IV(); its value is not read afterwards in this function */ + + FIXP_QMF *rSubbandPtr0 = &rSubband[M+0]; /* runs with increment */ + FIXP_QMF *rSubbandPtr1 = &rSubband[M-1]; /* runs with decrement */ + FIXP_QMF *timeIn0 = (FIXP_DBL *) &timeIn[0]; /* runs with increment */ + FIXP_QMF *timeIn1 = (FIXP_DBL *) &timeIn[L]; /* runs with increment */ + FIXP_QMF *timeIn2 = (FIXP_DBL *) &timeIn[L-1]; /* runs with decrement */ + FIXP_QMF *timeIn3 = (FIXP_DBL *) &timeIn[2*L-1]; /* runs with decrement */ + + for (i = 0; i < M; i++) + { + *rSubbandPtr0++ = (*timeIn2-- >> 1) - (*timeIn0++ >> shift); + *rSubbandPtr1-- = (*timeIn1++ >> 1) + (*timeIn3-- >> shift); + } + + dct_IV(rSubband,L, &rSubband_e); +} +#endif /* FUNCTION_qmfForwardModulationLP_odd */ + + +/* NEON optimized QMF currently builds only with RVCT toolchain */ + +#if defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_5TE__) + +#if (SAMPLE_BITS == 16) +#define FUNCTION_qmfAnaPrototypeFirSlot +#endif + +#ifdef FUNCTION_qmfAnaPrototypeFirSlot + +#if defined(__GNUC__) /* cppp replaced: elif */ + +inline INT SMULBB (const SHORT a, const LONG b) +{ + INT result ; + __asm__ ("smulbb %0, %1, %2" + : "=r" (result) + : "r" (a), "r" (b)) ; + return result ; +} +inline INT SMULBT (const SHORT a, const LONG b) +{ + INT result ; + __asm__ ("smulbt %0, %1, %2" + : "=r" (result) + : "r" (a), "r" (b)) ; + return result ; +} + +inline INT SMLABB(const LONG accu, const SHORT a, const LONG b) +{ + INT result ; + __asm__ ("smlabb %0, %1, %2,%3" +
+ : "=r" (result) + : "r" (a), "r" (b), "r" (accu)) ; + return result; +} +inline INT SMLABT(const LONG accu, const SHORT a, const LONG b) +{ + INT result ; + __asm__ ("smlabt %0, %1, %2,%3" + : "=r" (result) + : "r" (a), "r" (b), "r" (accu)) ; + return result; +} +#endif /* compiler selection */ + + +void qmfAnaPrototypeFirSlot( FIXP_QMF *analysisBuffer, /*!< Output buffer; written from both ends (index 0 upwards and index 2*no_channels-1 downwards) */ + int no_channels, /*!< Number channels of analysis filter */ + const FIXP_PFT *p_filter, /*!< Filter coefficients; read here through a LONG pointer, one LONG per pair of SMULBx/SMLABx operands */ + int p_stride, /*!< Stride of analysis filter */ + FIXP_QAS *RESTRICT pFilterStates /*!< Filter states; only read here, from index 0 upwards and from index 2*QMF_NO_POLY*no_channels-1 downwards */ + ) +{ + LONG *p_flt = (LONG *) p_filter; /* each LONG fetched is used twice: once by a ...BB helper and once by a ...BT helper */ + LONG flt; + FIXP_QMF *RESTRICT pData_0 = analysisBuffer + 2*no_channels - 1; + FIXP_QMF *RESTRICT pData_1 = analysisBuffer; + + FIXP_QAS *RESTRICT sta_0 = (FIXP_QAS *)pFilterStates; + FIXP_QAS *RESTRICT sta_1 = (FIXP_QAS *)pFilterStates + (2*QMF_NO_POLY*no_channels) - 1; + + FIXP_DBL accu0, accu1; + FIXP_QAS sta0, sta1; + + int staStep1 = no_channels<<1; + int staStep2 = (no_channels<<3) - 1; /* Rewind one less */ + + if (p_stride == 1) + { + /* FIR filter 0 */ + flt = *p_flt++; + sta1 = *sta_1; sta_1 -= staStep1; + accu1 = SMULBB( sta1, flt); + sta1 = *sta_1; sta_1 -= staStep1; + accu1 = SMLABT( accu1, sta1, flt); + + flt = *p_flt++; + sta1 = *sta_1; sta_1 -= staStep1; + accu1 = SMLABB( accu1, sta1, flt); + sta1 = *sta_1; sta_1 -= staStep1; + accu1 = SMLABT( accu1, sta1, flt); + + flt = *p_flt++; + sta1 = *sta_1; sta_1 += staStep2; + accu1 = SMLABB( accu1, sta1, flt); + *pData_1++ = FX_DBL2FX_QMF(accu1<<1); + + /* FIR filters 1..63 127..65 or 1..31 63..33 */ + no_channels >>= 1; + for (; --no_channels; ) + { + sta0 = *sta_0; sta_0 += staStep1; /* 1,3,5, ...
29/61 */ + sta1 = *sta_1; sta_1 -= staStep1; + accu0 = SMULBT( sta0, flt); + accu1 = SMULBT( sta1, flt); + + flt = *p_flt++; + sta0 = *sta_0; sta_0 += staStep1; + sta1 = *sta_1; sta_1 -= staStep1; + accu0 = SMLABB( accu0, sta0, flt); + accu1 = SMLABB( accu1, sta1, flt); + + sta0 = *sta_0; sta_0 += staStep1; + sta1 = *sta_1; sta_1 -= staStep1; + accu0 = SMLABT( accu0, sta0, flt); + accu1 = SMLABT( accu1, sta1, flt); + + flt = *p_flt++; + sta0 = *sta_0; sta_0 += staStep1; + sta1 = *sta_1; sta_1 -= staStep1; + accu0 = SMLABB( accu0, sta0, flt); + accu1 = SMLABB( accu1, sta1, flt); + + sta0 = *sta_0; sta_0 -= staStep2; + sta1 = *sta_1; sta_1 += staStep2; + accu0 = SMLABT( accu0, sta0, flt); + accu1 = SMLABT( accu1, sta1, flt); + + *pData_0-- = FX_DBL2FX_QMF(accu0<<1); + *pData_1++ = FX_DBL2FX_QMF(accu1<<1); + + /* Same sequence as above, but mix B=bottom with T=Top */ + + flt = *p_flt++; + sta0 = *sta_0; sta_0 += staStep1; /* 2,4,6, ... 30/62 */ + sta1 = *sta_1; sta_1 -= staStep1; + accu0 = SMULBB( sta0, flt); + accu1 = SMULBB( sta1, flt); + + sta0 = *sta_0; sta_0 += staStep1; + sta1 = *sta_1; sta_1 -= staStep1; + accu0 = SMLABT( accu0, sta0, flt); + accu1 = SMLABT( accu1, sta1, flt); + + flt = *p_flt++; + sta0 = *sta_0; sta_0 += staStep1; + sta1 = *sta_1; sta_1 -= staStep1; + accu0 = SMLABB( accu0, sta0, flt); + accu1 = SMLABB( accu1, sta1, flt); + + sta0 = *sta_0; sta_0 += staStep1; + sta1 = *sta_1; sta_1 -= staStep1; + accu0 = SMLABT( accu0, sta0, flt); + accu1 = SMLABT( accu1, sta1, flt); + + flt = *p_flt++; + sta0 = *sta_0; sta_0 -= staStep2; + sta1 = *sta_1; sta_1 += staStep2; + accu0 = SMLABB( accu0, sta0, flt); + accu1 = SMLABB( accu1, sta1, flt); + + *pData_0-- = FX_DBL2FX_QMF(accu0<<1); + *pData_1++ = FX_DBL2FX_QMF(accu1<<1); + } + + /* FIR filter 31/63 and 33/65 */ + sta0 = *sta_0; sta_0 += staStep1; + sta1 = *sta_1; sta_1 -= staStep1; + accu0 = SMULBT( sta0, flt); + accu1 = SMULBT( sta1, flt); + + flt = *p_flt++; + sta0 = *sta_0; sta_0 += staStep1; + sta1 = 
*sta_1; sta_1 -= staStep1; + accu0 = SMLABB( accu0, sta0, flt); + accu1 = SMLABB( accu1, sta1, flt); + + sta0 = *sta_0; sta_0 += staStep1; + sta1 = *sta_1; sta_1 -= staStep1; + accu0 = SMLABT( accu0, sta0, flt); + accu1 = SMLABT( accu1, sta1, flt); + + flt = *p_flt++; + sta0 = *sta_0; sta_0 += staStep1; + sta1 = *sta_1; sta_1 -= staStep1; + accu0 = SMLABB( accu0, sta0, flt); + accu1 = SMLABB( accu1, sta1, flt); + + sta0 = *sta_0; sta_0 -= staStep2; + sta1 = *sta_1; sta_1 += staStep2; + accu0 = SMLABT( accu0, sta0, flt); + accu1 = SMLABT( accu1, sta1, flt); + + *pData_0-- = FX_DBL2FX_QMF(accu0<<1); + *pData_1++ = FX_DBL2FX_QMF(accu1<<1); + + /* FIR filter 32/64 */ + flt = *p_flt++; + sta0 = *sta_0; sta_0 += staStep1; + sta1 = *sta_1; sta_1 -= staStep1; + accu0 = SMULBB( sta0, flt); + accu1 = SMULBB( sta1, flt); + + sta0 = *sta_0; sta_0 += staStep1; + sta1 = *sta_1; sta_1 -= staStep1; + accu0 = SMLABT( accu0, sta0, flt); + accu1 = SMLABT( accu1, sta1, flt); + + flt = *p_flt++; + sta0 = *sta_0; sta_0 += staStep1; + sta1 = *sta_1; sta_1 -= staStep1; + accu0 = SMLABB( accu0, sta0, flt); + accu1 = SMLABB( accu1, sta1, flt); + + sta0 = *sta_0; sta_0 += staStep1; + sta1 = *sta_1; sta_1 -= staStep1; + accu0 = SMLABT( accu0, sta0, flt); + accu1 = SMLABT( accu1, sta1, flt); + + flt = *p_flt; + sta0 = *sta_0; + sta1 = *sta_1; + accu0 = SMLABB( accu0, sta0, flt); + accu1 = SMLABB( accu1, sta1, flt); + + *pData_0-- = FX_DBL2FX_QMF(accu0<<1); + *pData_1++ = FX_DBL2FX_QMF(accu1<<1); + } + else + { + int pfltStep = QMF_NO_POLY * (p_stride-1); + + flt = p_flt[0]; + sta1 = *sta_1; sta_1 -= staStep1; + accu1 = SMULBB( sta1, flt); + sta1 = *sta_1; sta_1 -= staStep1; + accu1 = SMLABT( accu1, sta1, flt); + + flt = p_flt[1]; + sta1 = *sta_1; sta_1 -= staStep1; + accu1 = SMLABB( accu1, sta1, flt); + sta1 = *sta_1; sta_1 -= staStep1; + accu1 = SMLABT( accu1, sta1, flt); + + flt = p_flt[2]; p_flt += pfltStep; + sta1 = *sta_1; sta_1 += staStep2; + accu1 = SMLABB( accu1, sta1, flt); + 
*pData_1++ = FX_DBL2FX_QMF(accu1<<1); + + /* FIR filters 1..63 127..65 or 1..31 63..33 */ + for (; --no_channels; ) + { + flt = p_flt[0]; + sta0 = *sta_0; sta_0 += staStep1; + sta1 = *sta_1; sta_1 -= staStep1; + accu0 = SMULBB( sta0, flt); + accu1 = SMULBB( sta1, flt); + + sta0 = *sta_0; sta_0 += staStep1; + sta1 = *sta_1; sta_1 -= staStep1; + accu0 = SMLABT( accu0, sta0, flt); + accu1 = SMLABT( accu1, sta1, flt); + + flt = p_flt[1]; + sta0 = *sta_0; sta_0 += staStep1; + sta1 = *sta_1; sta_1 -= staStep1; + accu0 = SMLABB( accu0, sta0, flt); + accu1 = SMLABB( accu1, sta1, flt); + + sta0 = *sta_0; sta_0 += staStep1; + sta1 = *sta_1; sta_1 -= staStep1; + accu0 = SMLABT( accu0, sta0, flt); + accu1 = SMLABT( accu1, sta1, flt); + + flt = p_flt[2]; p_flt += pfltStep; + sta0 = *sta_0; sta_0 -= staStep2; + sta1 = *sta_1; sta_1 += staStep2; + accu0 = SMLABB( accu0, sta0, flt); + accu1 = SMLABB( accu1, sta1, flt); + + *pData_0-- = FX_DBL2FX_QMF(accu0<<1); + *pData_1++ = FX_DBL2FX_QMF(accu1<<1); + } + + /* FIR filter 32/64 */ + flt = p_flt[0]; + sta0 = *sta_0; sta_0 += staStep1; + accu0 = SMULBB( sta0, flt); + sta0 = *sta_0; sta_0 += staStep1; + accu0 = SMLABT( accu0, sta0, flt); + + flt = p_flt[1]; + sta0 = *sta_0; sta_0 += staStep1; + accu0 = SMLABB( accu0, sta0, flt); + sta0 = *sta_0; sta_0 += staStep1; + accu0 = SMLABT( accu0, sta0, flt); + + flt = p_flt[2]; + sta0 = *sta_0; + accu0 = SMLABB( accu0, sta0, flt); + *pData_0-- = FX_DBL2FX_QMF(accu0<<1); + } +} +#endif /* FUNCTION_qmfAnaPrototypeFirSlot */ +#endif /* defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_5TE__) */ + +#if ( defined(__ARM_ARCH_5TE__) && (SAMPLE_BITS == 16) ) && !defined(QMF_TABLE_FULL) + +#define FUNCTION_qmfSynPrototypeFirSlot + +#if defined(FUNCTION_qmfSynPrototypeFirSlot) + +#if defined(__GNUC__) /* cppp replaced: elif */ /* GCC inline-asm wrappers follow, each emitting the single ARM instruction it is named after */ + +inline INT SMULWB (const LONG a, const LONG b) +{ + INT result ; + __asm__ ("smulwb %0, %1, %2" + : "=r" (result) + : "r" (a), "r" (b)) ; + + return result ; +} +inline INT SMULWT
(const LONG a, const LONG b) +{ + INT result ; + __asm__ ("smulwt %0, %1, %2" + : "=r" (result) + : "r" (a), "r" (b)) ; + + return result ; +} + +inline INT SMLAWB(const LONG accu, const LONG a, const LONG b) +{ + INT result; + asm("smlawb %0, %1, %2, %3 " + : "=r" (result) + : "r" (a), "r" (b), "r" (accu) ); + return result ; +} + +inline INT SMLAWT(const LONG accu, const LONG a, const LONG b) +{ + INT result; + asm("smlawt %0, %1, %2, %3 " + : "=r" (result) + : "r" (a), "r" (b), "r" (accu) ); + return result ; +} + +#endif /* ARM compiler selector */ + + +static void qmfSynPrototypeFirSlot1_filter(FIXP_QMF *RESTRICT realSlot, + FIXP_QMF *RESTRICT imagSlot, + const FIXP_DBL *RESTRICT p_flt, + FIXP_QSS *RESTRICT sta, + FIXP_DBL *pMyTimeOut, + int no_channels) +{ + /* This code was the base for the above listed assembler sequence (NOTE(review): no assembler version of this routine appears above it in this file - presumably removed; confirm) */ + /* It can be used for debugging purpose or further optimizations. Each pass of the loop reads two samples from realSlot and imagSlot (backwards, via *--ptr), writes two values to pMyTimeOut, moves p_flt forward by 5 and p_fltm back by 5; no_channels is tested only after the body, so it counts passes and must be non-zero. NOTE(review): every `*sta++ = F(sta[1], ...)` below increments sta and reads it in one expression with no sequencing between the two (undefined before C++17); the intended reading appears to be sta[0] = F(sta[1], ...); sta++ - confirm the compiler in use evaluates it that way. */ + const FIXP_DBL *RESTRICT p_fltm = p_flt + 155; + + do + { + FIXP_DBL result; + FIXP_DBL A, B, real, imag, sta0; + + real = *--realSlot; + imag = *--imagSlot; + B = p_flt[4]; /* Bottom=[8] Top=[9] */ + A = p_fltm[3]; /* Bottom=[316] Top=[317] */ + sta0 = sta[0]; /* save state[0] */ + *sta++ = SMLAWT( sta[1], imag, B ); /* index=9...........319 */ + *sta++ = SMLAWB( sta[1], real, A ); /* index=316...........6 */ + *sta++ = SMLAWB( sta[1], imag, B ); /* index=8,18, ...318 */ + B = p_flt[3]; /* Bottom=[6] Top=[7] */ + *sta++ = SMLAWT( sta[1], real, A ); /* index=317...........7 */ + A = p_fltm[4]; /* Bottom=[318] Top=[319] */ + *sta++ = SMLAWT( sta[1], imag, B ); /* index=7...........317 */ + *sta++ = SMLAWB( sta[1], real, A ); /* index=318...........8 */ + *sta++ = SMLAWB( sta[1], imag, B ); /* index=6...........316 */ + B = p_flt[2]; /* Bottom=[X] Top=[5] */ + *sta++ = SMLAWT( sta[1], real, A ); /* index=9...........319 */ + A = p_fltm[2]; /* Bottom=[X] Top=[315] */ + *sta++ = SMULWT( imag, B ); /* index=5,15, ...
315 */ + result = SMLAWT( sta0, real, A ); /* index=315...........5 */ + + *pMyTimeOut++ = result; + + real = *--realSlot; + imag = *--imagSlot; + A = p_fltm[0]; /* Bottom=[310] Top=[311] */ + B = p_flt[7]; /* Bottom=[14] Top=[15] */ + result = SMLAWB( sta[0], real, A ); /* index=310...........0 */ + *sta++ = SMLAWB( sta[1], imag, B ); /* index=14..........324 */ + *pMyTimeOut++ = result; + B = p_flt[6]; /* Bottom=[12] Top=[13] */ + *sta++ = SMLAWT( sta[1], real, A ); /* index=311...........1 */ + A = p_fltm[1]; /* Bottom=[312] Top=[313] */ + *sta++ = SMLAWT( sta[1], imag, B ); /* index=13..........323 */ + *sta++ = SMLAWB( sta[1], real, A ); /* index=312...........2 */ + *sta++ = SMLAWB( sta[1], imag, B ); /* index=12..........322 */ + *sta++ = SMLAWT( sta[1], real, A ); /* index=313...........3 */ + A = p_fltm[2]; /* Bottom=[314] Top=[315] */ + B = p_flt[5]; /* Bottom=[10] Top=[11] */ + *sta++ = SMLAWT( sta[1], imag, B ); /* index=11..........321 */ + *sta++ = SMLAWB( sta[1], real, A ); /* index=314...........4 */ + *sta++ = SMULWB( imag, B ); /* index=10..........320 */ + + + p_flt += 5; + p_fltm -= 5; + } + while ((--no_channels) != 0); + +} + + + +INT qmfSynPrototypeFirSlot2( + HANDLE_QMF_FILTER_BANK qmf, + FIXP_QMF *RESTRICT realSlot, /*!< Input: Pointer to real Slot */ + FIXP_QMF *RESTRICT imagSlot, /*!< Input: Pointer to imag Slot */ + INT_PCM *RESTRICT timeOut, /*!< Time domain data */ + INT stride /*!< Time output buffer stride factor*/ + ) +{ + FIXP_QSS *RESTRICT sta = (FIXP_QSS*)qmf->FilterStates; + int no_channels = qmf->no_channels; + int scale = ((DFRACT_BITS-SAMPLE_BITS)-1-qmf->outScalefactor); + + /* We map an array of 16-bit values upon an array of 2*16-bit values to read 2 values in one shot */ + const FIXP_DBL *RESTRICT p_flt = (FIXP_DBL *) qmf->p_filter; /* low=[0], high=[1] */ + const FIXP_DBL *RESTRICT p_fltm = (FIXP_DBL *) qmf->p_filter + 155; /* low=[310], high=[311] */ + + FDK_ASSERT(SAMPLE_BITS-1-qmf->outScalefactor >= 0); //
(DFRACT_BITS-SAMPLE_BITS)-1-qmf->outScalefactor >= 0); + FDK_ASSERT(qmf->p_stride==2 && qmf->no_channels == 32); + + FDK_ASSERT((no_channels&3) == 0); /* should be a multiple of 4 */ + + realSlot += no_channels-1; // ~~"~~ + imagSlot += no_channels-1; // no_channels-1 .. 0 + + FIXP_DBL MyTimeOut[32]; + FIXP_DBL *pMyTimeOut = &MyTimeOut[0]; + + for (no_channels = no_channels; no_channels--;) + { + FIXP_DBL result; + FIXP_DBL A, B, real, imag; + + real = *realSlot--; + imag = *imagSlot--; + A = p_fltm[0]; /* Bottom=[310] Top=[311] */ + B = p_flt[7]; /* Bottom=[14] Top=[15] */ + result = SMLAWB( sta[0], real, A ); /* index=310...........0 */ + *sta++ = SMLAWB( sta[1], imag, B ); /* index=14..........324 */ + B = p_flt[6]; /* Bottom=[12] Top=[13] */ + *sta++ = SMLAWT( sta[1], real, A ); /* index=311...........1 */ + A = p_fltm[1]; /* Bottom=[312] Top=[313] */ + *sta++ = SMLAWT( sta[1], imag, B ); /* index=13..........323 */ + *sta++ = SMLAWB( sta[1], real, A ); /* index=312...........2 */ + *sta++ = SMLAWB( sta[1], imag, B ); /* index=12..........322 */ + *sta++ = SMLAWT( sta[1], real, A ); /* index=313...........3 */ + A = p_fltm[2]; /* Bottom=[314] Top=[315] */ + B = p_flt[5]; /* Bottom=[10] Top=[11] */ + *sta++ = SMLAWT( sta[1], imag, B ); /* index=11..........321 */ + *sta++ = SMLAWB( sta[1], real, A ); /* index=314...........4 */ + *sta++ = SMULWB( imag, B ); /* index=10..........320 */ + + *pMyTimeOut++ = result; + + p_fltm -= 5; + p_flt += 5; + } + + pMyTimeOut = &MyTimeOut[0]; +#if (SAMPLE_BITS == 16) + const FIXP_DBL max_pos = (FIXP_DBL) 0x00007FFF << scale; + const FIXP_DBL max_neg = (FIXP_DBL) 0xFFFF8001 << scale; +#else + scale = -scale; + const FIXP_DBL max_pos = (FIXP_DBL) 0x7FFFFFFF >> scale; + const FIXP_DBL max_neg = (FIXP_DBL) 0x80000001 >> scale; +#endif + const FIXP_DBL add_neg = (1 << scale) - 1; + + no_channels = qmf->no_channels; + + timeOut += no_channels*stride; + + FDK_ASSERT(scale >= 0); + + if (qmf->outGain != 0x80000000) + { + FIXP_DBL gain 
= qmf->outGain; + for (no_channels>>=2; no_channels--;) + { + FIXP_DBL result1, result2; + + result1 = *pMyTimeOut++; + result2 = *pMyTimeOut++; + + result1 = fMult(result1,gain); + timeOut -= stride; + if (result1 < 0) result1 += add_neg; + if (result1 < max_neg) result1 = max_neg; + if (result1 > max_pos) result1 = max_pos; +#if (SAMPLE_BITS == 16) + timeOut[0] = result1 >> scale; +#else + timeOut[0] = result1 << scale; +#endif + + result2 = fMult(result2,gain); + timeOut -= stride; + if (result2 < 0) result2 += add_neg; + if (result2 < max_neg) result2 = max_neg; + if (result2 > max_pos) result2 = max_pos; +#if (SAMPLE_BITS == 16) + timeOut[0] = result2 >> scale; +#else + timeOut[0] = result2 << scale; +#endif + + result1 = *pMyTimeOut++; + result2 = *pMyTimeOut++; + + result1 = fMult(result1,gain); + timeOut -= stride; + if (result1 < 0) result1 += add_neg; + if (result1 < max_neg) result1 = max_neg; + if (result1 > max_pos) result1 = max_pos; +#if (SAMPLE_BITS == 16) + timeOut[0] = result1 >> scale; +#else + timeOut[0] = result1 << scale; +#endif + + result2 = fMult(result2,gain); + timeOut -= stride; + if (result2 < 0) result2 += add_neg; + if (result2 < max_neg) result2 = max_neg; + if (result2 > max_pos) result2 = max_pos; +#if (SAMPLE_BITS == 16) + timeOut[0] = result2 >> scale; +#else + timeOut[0] = result2 << scale; +#endif + } + } + else + { + for (no_channels>>=2; no_channels--;) + { + FIXP_DBL result1, result2; + result1 = *pMyTimeOut++; + result2 = *pMyTimeOut++; + timeOut -= stride; + if (result1 < 0) result1 += add_neg; + if (result1 < max_neg) result1 = max_neg; + if (result1 > max_pos) result1 = max_pos; +#if (SAMPLE_BITS == 16) + timeOut[0] = result1 >> scale; +#else + timeOut[0] = result1 << scale; +#endif + + timeOut -= stride; + if (result2 < 0) result2 += add_neg; + if (result2 < max_neg) result2 = max_neg; + if (result2 > max_pos) result2 = max_pos; +#if (SAMPLE_BITS == 16) + timeOut[0] = result2 >> scale; +#else + timeOut[0] = result2 << 
scale; +#endif + + result1 = *pMyTimeOut++; + result2 = *pMyTimeOut++; + timeOut -= stride; + if (result1 < 0) result1 += add_neg; + if (result1 < max_neg) result1 = max_neg; + if (result1 > max_pos) result1 = max_pos; +#if (SAMPLE_BITS == 16) + timeOut[0] = result1 >> scale; +#else + timeOut[0] = result1 << scale; +#endif + + timeOut -= stride; + if (result2 < 0) result2 += add_neg; + if (result2 < max_neg) result2 = max_neg; + if (result2 > max_pos) result2 = max_pos; +#if (SAMPLE_BITS == 16) + timeOut[0] = result2 >> scale; +#else + timeOut[0] = result2 << scale; +#endif + } + } + return 0; +} + +static +void qmfSynPrototypeFirSlot_fallback( HANDLE_QMF_FILTER_BANK qmf, + FIXP_DBL *realSlot, /*!< Input: Pointer to real Slot */ + FIXP_DBL *imagSlot, /*!< Input: Pointer to imag Slot */ + INT_PCM *timeOut, /*!< Time domain data */ + const int stride + ); + +/*! + \brief Perform Synthesis Prototype Filtering on a single slot of input data. + + The filter takes 2 * #MAX_SYNTHESIS_CHANNELS of input data and + generates #MAX_SYNTHESIS_CHANNELS time domain output samples. 
+*/ + +static +void qmfSynPrototypeFirSlot( HANDLE_QMF_FILTER_BANK qmf, + FIXP_DBL *realSlot, /*!< Input: Pointer to real Slot */ + FIXP_DBL *imagSlot, /*!< Input: Pointer to imag Slot */ + INT_PCM *timeOut, /*!< Time domain data */ + const int stride + ) +{ + INT err = -1; + + switch (qmf->p_stride) { + case 2: + err = qmfSynPrototypeFirSlot2(qmf, realSlot, imagSlot, timeOut, stride); + break; + default: + err = -1; + } + + /* fallback if configuration not available or failed */ + if(err!=0) { + qmfSynPrototypeFirSlot_fallback(qmf, realSlot, imagSlot, timeOut, stride); + } +} +#endif /* FUNCTION_qmfSynPrototypeFirSlot */ + +#endif /* ( defined(__CC_ARM) && defined(__ARM_ARCH_5TE__) && (SAMPLE_BITS == 16) ) && !defined(QMF_TABLE_FULL) */ + + + +/* #####################################################################################*/ + + + +#endif /* (QMF_NO_POLY==5) */ + diff --git a/libFDK/src/arm/scale_arm.cpp b/libFDK/src/arm/scale_arm.cpp new file mode 100644 index 0000000..906766f --- /dev/null +++ b/libFDK/src/arm/scale_arm.cpp @@ -0,0 +1,110 @@ +/*************************** Fraunhofer IIS FDK Tools ********************** + + (C) Copyright Fraunhofer IIS (2005) + All Rights Reserved + + Please be advised that this software and/or program delivery is + Confidential Information of Fraunhofer and subject to and covered by the + + Fraunhofer IIS Software Evaluation Agreement + between Google Inc. and Fraunhofer + effective and in full force since March 1, 2012. + + You may use this software and/or program only under the terms and + conditions described in the above mentioned Fraunhofer IIS Software + Evaluation Agreement. Any other and/or further use requires a separate agreement. + + + $Id$ + Author(s): Arthur Tritthart + Description: Scaling operations for ARM + + This software and/or program is protected by copyright law and international + treaties. 
Any reproduction or distribution of this software and/or program, + or any portion of it, may result in severe civil and criminal penalties, and + will be prosecuted to the maximum extent possible under law. + +******************************************************************************/ +/* prevent multiple inclusion with re-definitions */ +#ifndef __INCLUDE_SCALE_ARM__ +#define __INCLUDE_SCALE_ARM__ + +#define FUNCTION_scaleValuesWithFactor_DBL + +SCALE_INLINE +void scaleValuesWithFactor( + FIXP_DBL *vector, + FIXP_DBL factor, + INT len, + INT scalefactor + ) +{ + /* This code combines the fMult with the scaling */ + /* It performs a fMultDiv2 and increments shift by 1 */ + int shift = scalefactor + 1; + FIXP_DBL *mySpec = vector; + + shift = fixmin_I(shift,(INT)DFRACT_BITS-1); + + if (shift >= 0) + { + for (int i=0; i<(len>>2); i++) + { + FIXP_DBL tmp0 = mySpec[0]; + FIXP_DBL tmp1 = mySpec[1]; + FIXP_DBL tmp2 = mySpec[2]; + FIXP_DBL tmp3 = mySpec[3]; + tmp0 = fMultDiv2(tmp0, factor); + tmp1 = fMultDiv2(tmp1, factor); + tmp2 = fMultDiv2(tmp2, factor); + tmp3 = fMultDiv2(tmp3, factor); + tmp0 <<= shift; + tmp1 <<= shift; + tmp2 <<= shift; + tmp3 <<= shift; + *mySpec++ = tmp0; + *mySpec++ = tmp1; + *mySpec++ = tmp2; + *mySpec++ = tmp3; + } + for (int i=len&3; i--;) + { + FIXP_DBL tmp0 = mySpec[0]; + tmp0 = fMultDiv2(tmp0, factor); + tmp0 <<= shift; + *mySpec++ = tmp0; + } + } + else + { + shift = -shift; + for (int i=0; i<(len>>2); i++) + { + FIXP_DBL tmp0 = mySpec[0]; + FIXP_DBL tmp1 = mySpec[1]; + FIXP_DBL tmp2 = mySpec[2]; + FIXP_DBL tmp3 = mySpec[3]; + tmp0 = fMultDiv2(tmp0, factor); + tmp1 = fMultDiv2(tmp1, factor); + tmp2 = fMultDiv2(tmp2, factor); + tmp3 = fMultDiv2(tmp3, factor); + tmp0 >>= shift; + tmp1 >>= shift; + tmp2 >>= shift; + tmp3 >>= shift; + *mySpec++ = tmp0; + *mySpec++ = tmp1; + *mySpec++ = tmp2; + *mySpec++ = tmp3; + } + for (int i=len&3; i--;) + { + FIXP_DBL tmp0 = mySpec[0]; + tmp0 = fMultDiv2(tmp0, factor); + tmp0 >>= shift; + *mySpec++ = 
tmp0; + } + } +} + +#endif /* #ifndef __INCLUDE_SCALE_ARM__ */ |