diff options
Diffstat (limited to 'libFDK/src/arm/dct_arm.cpp')
-rw-r--r-- | libFDK/src/arm/dct_arm.cpp | 678 |
1 files changed, 397 insertions, 281 deletions
diff --git a/libFDK/src/arm/dct_arm.cpp b/libFDK/src/arm/dct_arm.cpp index 59b773e..dd66109 100644 --- a/libFDK/src/arm/dct_arm.cpp +++ b/libFDK/src/arm/dct_arm.cpp @@ -1,74 +1,85 @@ - -/* ----------------------------------------------------------------------------------------------------------- +/* ----------------------------------------------------------------------------- Software License for The Fraunhofer FDK AAC Codec Library for Android -© Copyright 1995 - 2013 Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. - All rights reserved. +© Copyright 1995 - 2018 Fraunhofer-Gesellschaft zur Förderung der angewandten +Forschung e.V. All rights reserved. 1. INTRODUCTION -The Fraunhofer FDK AAC Codec Library for Android ("FDK AAC Codec") is software that implements -the MPEG Advanced Audio Coding ("AAC") encoding and decoding scheme for digital audio. -This FDK AAC Codec software is intended to be used on a wide variety of Android devices. - -AAC's HE-AAC and HE-AAC v2 versions are regarded as today's most efficient general perceptual -audio codecs. AAC-ELD is considered the best-performing full-bandwidth communications codec by -independent studies and is widely deployed. AAC has been standardized by ISO and IEC as part -of the MPEG specifications. - -Patent licenses for necessary patent claims for the FDK AAC Codec (including those of Fraunhofer) -may be obtained through Via Licensing (www.vialicensing.com) or through the respective patent owners -individually for the purpose of encoding or decoding bit streams in products that are compliant with -the ISO/IEC MPEG audio standards. Please note that most manufacturers of Android devices already license -these patent claims through Via Licensing or directly from the patent owners, and therefore FDK AAC Codec -software may already be covered under those patent licenses when it is used for those licensed purposes only. - -Commercially-licensed AAC software libraries, including floating-point versions with enhanced sound quality, -are also available from Fraunhofer. Users are encouraged to check the Fraunhofer website for additional -applications information and documentation. +The Fraunhofer FDK AAC Codec Library for Android ("FDK AAC Codec") is software +that implements the MPEG Advanced Audio Coding ("AAC") encoding and decoding +scheme for digital audio. This FDK AAC Codec software is intended to be used on +a wide variety of Android devices. + +AAC's HE-AAC and HE-AAC v2 versions are regarded as today's most efficient +general perceptual audio codecs. AAC-ELD is considered the best-performing +full-bandwidth communications codec by independent studies and is widely +deployed. AAC has been standardized by ISO and IEC as part of the MPEG +specifications. + +Patent licenses for necessary patent claims for the FDK AAC Codec (including +those of Fraunhofer) may be obtained through Via Licensing +(www.vialicensing.com) or through the respective patent owners individually for +the purpose of encoding or decoding bit streams in products that are compliant +with the ISO/IEC MPEG audio standards. Please note that most manufacturers of +Android devices already license these patent claims through Via Licensing or +directly from the patent owners, and therefore FDK AAC Codec software may +already be covered under those patent licenses when it is used for those +licensed purposes only. + +Commercially-licensed AAC software libraries, including floating-point versions +with enhanced sound quality, are also available from Fraunhofer. Users are +encouraged to check the Fraunhofer website for additional applications +information and documentation. 2. COPYRIGHT LICENSE -Redistribution and use in source and binary forms, with or without modification, are permitted without -payment of copyright license fees provided that you satisfy the following conditions: +Redistribution and use in source and binary forms, with or without modification, +are permitted without payment of copyright license fees provided that you +satisfy the following conditions: -You must retain the complete text of this software license in redistributions of the FDK AAC Codec or -your modifications thereto in source code form. +You must retain the complete text of this software license in redistributions of +the FDK AAC Codec or your modifications thereto in source code form. -You must retain the complete text of this software license in the documentation and/or other materials -provided with redistributions of the FDK AAC Codec or your modifications thereto in binary form. -You must make available free of charge copies of the complete source code of the FDK AAC Codec and your +You must retain the complete text of this software license in the documentation +and/or other materials provided with redistributions of the FDK AAC Codec or +your modifications thereto in binary form. You must make available free of +charge copies of the complete source code of the FDK AAC Codec and your modifications thereto to recipients of copies in binary form. -The name of Fraunhofer may not be used to endorse or promote products derived from this library without -prior written permission. +The name of Fraunhofer may not be used to endorse or promote products derived +from this library without prior written permission. -You may not charge copyright license fees for anyone to use, copy or distribute the FDK AAC Codec -software or your modifications thereto. +You may not charge copyright license fees for anyone to use, copy or distribute +the FDK AAC Codec software or your modifications thereto. -Your modified versions of the FDK AAC Codec must carry prominent notices stating that you changed the software -and the date of any change. For modified versions of the FDK AAC Codec, the term -"Fraunhofer FDK AAC Codec Library for Android" must be replaced by the term -"Third-Party Modified Version of the Fraunhofer FDK AAC Codec Library for Android." +Your modified versions of the FDK AAC Codec must carry prominent notices stating +that you changed the software and the date of any change. For modified versions +of the FDK AAC Codec, the term "Fraunhofer FDK AAC Codec Library for Android" +must be replaced by the term "Third-Party Modified Version of the Fraunhofer FDK +AAC Codec Library for Android." 3. NO PATENT LICENSE -NO EXPRESS OR IMPLIED LICENSES TO ANY PATENT CLAIMS, including without limitation the patents of Fraunhofer, -ARE GRANTED BY THIS SOFTWARE LICENSE. Fraunhofer provides no warranty of patent non-infringement with -respect to this software. +NO EXPRESS OR IMPLIED LICENSES TO ANY PATENT CLAIMS, including without +limitation the patents of Fraunhofer, ARE GRANTED BY THIS SOFTWARE LICENSE. +Fraunhofer provides no warranty of patent non-infringement with respect to this +software. -You may use this FDK AAC Codec software or modifications thereto only for purposes that are authorized -by appropriate patent licenses. +You may use this FDK AAC Codec software or modifications thereto only for +purposes that are authorized by appropriate patent licenses. 4. DISCLAIMER -This FDK AAC Codec software is provided by Fraunhofer on behalf of the copyright holders and contributors -"AS IS" and WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, including but not limited to the implied warranties -of merchantability and fitness for a particular purpose. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -CONTRIBUTORS BE LIABLE for any direct, indirect, incidental, special, exemplary, or consequential damages, -including but not limited to procurement of substitute goods or services; loss of use, data, or profits, -or business interruption, however caused and on any theory of liability, whether in contract, strict -liability, or tort (including negligence), arising in any way out of the use of this software, even if -advised of the possibility of such damage. +This FDK AAC Codec software is provided by Fraunhofer on behalf of the copyright +holders and contributors "AS IS" and WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, +including but not limited to the implied warranties of merchantability and +fitness for a particular purpose. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE for any direct, indirect, incidental, special, exemplary, +or consequential damages, including but not limited to procurement of substitute +goods or services; loss of use, data, or profits, or business interruption, +however caused and on any theory of liability, whether in contract, strict +liability, or tort (including negligence), arising in any way out of the use of +this software, even if advised of the possibility of such damage. 5. CONTACT INFORMATION @@ -79,9 +90,15 @@ Am Wolfsmantel 33 www.iis.fraunhofer.de/amm amm-info@iis.fraunhofer.de ------------------------------------------------------------------------------------------------------------ */ +----------------------------------------------------------------------------- */ + +/******************* Library for basic calculation routines ******************** + + Author(s): + Description: +*******************************************************************************/ #ifdef FUNCTION_dct_IV_func1 @@ -91,119 +108,168 @@ amm-info@iis.fraunhofer.de With this version, we save 2 cycles per loop iteration. */ -__asm void dct_IV_func1( - int i, - const FIXP_SPK *twiddle, - FIXP_DBL *RESTRICT pDat_0, - FIXP_DBL *RESTRICT pDat_1) -{ - /* Register map: - r0 i - r1 twiddle - r2 pDat_0 - r3 pDat_1 - r4 accu1 - r5 accu2 - r6 accu3 - r7 accu4 - r8 val_tw - r9 accuX - */ - PUSH {r4-r9} - - /* 44 cycles for 2 iterations = 22 cycles/iteration */ -dct_IV_loop1_start -/* First iteration */ - LDR r8, [r1], #4 // val_tw = *twiddle++; - LDR r5, [r2, #0] // accu2 = pDat_0[0] - LDR r4, [r3, #0] // accu1 = pDat_1[0] - - SMULWT r9, r5, r8 // accuX = accu2*val_tw.l - SMULWB r5, r5, r8 // accu2 = accu2*val_tw.h - RSB r9, r9, #0 // accuX =-accu2*val_tw.l - SMLAWT r5, r4, r8, r5 // accu2 = accu2*val_tw.h + accu1*val_tw.l - SMLAWB r4, r4, r8, r9 // accu1 = accu1*val_tw.h - accu2*val_tw.l - - LDR r8, [r1], #4 // val_tw = *twiddle++; - LDR r7, [r3, #-4] // accu4 = pDat_1[-1] - LDR r6, [r2, #4] // accu3 = pDat_0[1] - - SMULWB r9, r7, r8 // accuX = accu4*val_tw.h - SMULWT r7, r7, r8 // accu4 = accu4*val_tw.l - RSB r9, r9, #0 // accuX =-accu4*val_tw.h - SMLAWB r7, r6, r8, r7 // accu4 = accu4*val_tw.l+accu3*val_tw.h - SMLAWT r6, r6, r8, r9 // accu3 = accu3*val_tw.l-accu4*val_tw.h - - STR r5, [r2], #4 // *pDat_0++ = accu2 - STR r4, [r2], #4 // *pDat_0++ = accu1 - STR r6, [r3], #-4 // *pDat_1-- = accu3 - STR r7, [r3], #-4 // *pDat_1-- = accu4 - -/* Second iteration */ - LDR r8, [r1], #4 // val_tw = *twiddle++; - LDR r5, [r2, #0] // accu2 = pDat_0[0] - LDR r4, [r3, #0] // accu1 = pDat_1[0] - - SMULWT r9, r5, r8 // accuX = accu2*val_tw.l - SMULWB r5, r5, r8 // accu2 = accu2*val_tw.h - RSB r9, r9, #0 // accuX =-accu2*val_tw.l - SMLAWT r5, r4, r8, r5 // accu2 = accu2*val_tw.h + accu1*val_tw.l - SMLAWB r4, r4, r8, r9 // accu1 = accu1*val_tw.h - accu2*val_tw.l - - LDR r8, [r1], #4 // val_tw = *twiddle++; - LDR r7, [r3, #-4] // accu4 = pDat_1[-1] - LDR r6, [r2, #4] // accu3 = pDat_0[1] - - SMULWB r9, r7, r8 // accuX = accu4*val_tw.h - SMULWT r7, r7, r8 // accu4 = accu4*val_tw.l - RSB r9, r9, #0 // accuX =-accu4*val_tw.h - SMLAWB r7, r6, r8, r7 // accu4 = accu4*val_tw.l+accu3*val_tw.h - SMLAWT r6, r6, r8, r9 // accu3 = accu3*val_tw.l-accu4*val_tw.h - - STR r5, [r2], #4 // *pDat_0++ = accu2 - STR r4, [r2], #4 // *pDat_0++ = accu1 - STR r6, [r3], #-4 // *pDat_1-- = accu3 - STR r7, [r3], #-4 // *pDat_1-- = accu4 - - SUBS r0, r0, #1 - BNE dct_IV_loop1_start - - POP {r4-r9} - - BX lr +__asm void dct_IV_func1(int i, const FIXP_SPK *twiddle, + FIXP_DBL *RESTRICT pDat_0, FIXP_DBL *RESTRICT pDat_1) { + /* Register map: + r0 i + r1 twiddle + r2 pDat_0 + r3 pDat_1 + r4 accu1 + r5 accu2 + r6 accu3 + r7 accu4 + r8 val_tw + r9 accuX + */ + PUSH{r4 - r9} + + /* 44 cycles for 2 iterations = 22 cycles/iteration */ + dct_IV_loop1_start + /* First iteration */ + LDR r8, + [r1], +# 4 // val_tw = *twiddle++; + LDR r5, + [ r2, #0 ] // accu2 = pDat_0[0] + LDR r4, + [ r3, #0 ] // accu1 = pDat_1[0] + + SMULWT r9, + r5, + r8 // accuX = accu2*val_tw.l + SMULWB r5, + r5, + r8 // accu2 = accu2*val_tw.h + RSB r9, + r9, +# 0 // accuX =-accu2*val_tw.l + SMLAWT r5, r4, r8, + r5 // accu2 = accu2*val_tw.h + accu1*val_tw.l + SMLAWB r4, + r4, r8, + r9 // accu1 = accu1*val_tw.h - accu2*val_tw.l + + LDR r8, + [r1], +# 4 // val_tw = *twiddle++; + LDR r7, + [ r3, # - 4 ] // accu4 = pDat_1[-1] + LDR r6, + [ r2, #4 ] // accu3 = pDat_0[1] + + SMULWB r9, + r7, + r8 // accuX = accu4*val_tw.h + SMULWT r7, + r7, + r8 // accu4 = accu4*val_tw.l + RSB r9, + r9, +# 0 // accuX =-accu4*val_tw.h + SMLAWB r7, r6, r8, + r7 // accu4 = accu4*val_tw.l+accu3*val_tw.h + SMLAWT r6, + r6, r8, + r9 // accu3 = accu3*val_tw.l-accu4*val_tw.h + + STR r5, + [r2], +# 4 // *pDat_0++ = accu2 + STR r4, [r2], +# 4 // *pDat_0++ = accu1 + STR r6, [r3], +#- 4 // *pDat_1-- = accu3 + STR r7, [r3], +#- 4 // *pDat_1-- = accu4 + + /* Second iteration */ + LDR r8, [r1], +# 4 // val_tw = *twiddle++; + LDR r5, + [ r2, #0 ] // accu2 = pDat_0[0] + LDR r4, + [ r3, #0 ] // accu1 = pDat_1[0] + + SMULWT r9, + r5, + r8 // accuX = accu2*val_tw.l + SMULWB r5, + r5, + r8 // accu2 = accu2*val_tw.h + RSB r9, + r9, +# 0 // accuX =-accu2*val_tw.l + SMLAWT r5, r4, r8, + r5 // accu2 = accu2*val_tw.h + accu1*val_tw.l + SMLAWB r4, + r4, r8, + r9 // accu1 = accu1*val_tw.h - accu2*val_tw.l + + LDR r8, + [r1], +# 4 // val_tw = *twiddle++; + LDR r7, + [ r3, # - 4 ] // accu4 = pDat_1[-1] + LDR r6, + [ r2, #4 ] // accu3 = pDat_0[1] + + SMULWB r9, + r7, + r8 // accuX = accu4*val_tw.h + SMULWT r7, + r7, + r8 // accu4 = accu4*val_tw.l + RSB r9, + r9, +# 0 // accuX =-accu4*val_tw.h + SMLAWB r7, r6, r8, + r7 // accu4 = accu4*val_tw.l+accu3*val_tw.h + SMLAWT r6, + r6, r8, + r9 // accu3 = accu3*val_tw.l-accu4*val_tw.h + + STR r5, + [r2], +# 4 // *pDat_0++ = accu2 + STR r4, [r2], +# 4 // *pDat_0++ = accu1 + STR r6, [r3], +#- 4 // *pDat_1-- = accu3 + STR r7, [r3], +#- 4 // *pDat_1-- = accu4 + + SUBS r0, r0, +# 1 BNE dct_IV_loop1_start + + POP { r4 - r9 } + + BX lr } #endif /* FUNCTION_dct_IV_func1 */ - #ifdef FUNCTION_dct_IV_func2 -FDK_INLINE /* __attribute__((noinline)) */ -static void dct_IV_func2( - int i, - const FIXP_SPK *twiddle, - FIXP_DBL *pDat_0, - FIXP_DBL *pDat_1, - int inc) -{ +static inline void dct_IV_func2(int i, const FIXP_SPK *twiddle, + FIXP_DBL *pDat_0, FIXP_DBL *pDat_1, int inc) { FIXP_DBL accu1, accu2, accu3, accu4, accuX; LONG val_tw; accu1 = pDat_1[-2]; accu2 = pDat_1[-1]; - *--pDat_1 = -(pDat_0[1]>>1); - *pDat_0++ = (pDat_0[0]>>1); + *--pDat_1 = -(pDat_0[1] >> 1); + *pDat_0++ = (pDat_0[0] >> 1); twiddle += inc; -__asm - { - LDR val_tw, [twiddle], inc, LSL #2 // val_tw = *twiddle; twiddle += inc + __asm { + LDR val_tw, [twiddle], inc, LSL #2 // val_tw = *twiddle; twiddle += inc B dct_IV_loop2_2nd_part - /* 42 cycles for 2 iterations = 21 cycles/iteration */ + /* 42 cycles for 2 iterations = 21 cycles/iteration */ dct_IV_loop2: SMULWT accuX, accu2, val_tw SMULWB accu2, accu2, val_tw @@ -224,7 +290,7 @@ dct_IV_loop2: LDR accu1, [pDat_1, #-8] LDR accu2, [pDat_1, #-4] - LDR val_tw, [twiddle], inc, LSL #2 // val_tw = *twiddle; twiddle += inc + LDR val_tw, [twiddle], inc, LSL #2 // val_tw = *twiddle; twiddle += inc STR accuX, [pDat_1, #-4] ! STR accu4, [pDat_0], #4 @@ -252,7 +318,7 @@ dct_IV_loop2_2nd_part: STR accuX, [pDat_1, #-4] ! STR accu4, [pDat_0], #4 - LDR val_tw, [twiddle], inc, LSL #2 // val_tw = *twiddle; twiddle += inc + LDR val_tw, [twiddle], inc, LSL #2 // val_tw = *twiddle; twiddle += inc SUBS i, i, #1 BNE dct_IV_loop2 @@ -267,96 +333,148 @@ dct_IV_loop2_2nd_part: } #endif /* FUNCTION_dct_IV_func2 */ - #ifdef FUNCTION_dst_IV_func1 -__asm void dst_IV_func1( - int i, - const FIXP_SPK *twiddle, - FIXP_DBL *pDat_0, - FIXP_DBL *pDat_1) -{ - /* Register map: - r0 i - r1 twiddle - r2 pDat_0 - r3 pDat_1 - r4 accu1 - r5 accu2 - r6 accu3 - r7 accu4 - r8 val_tw - r9 accuX - */ - PUSH {r4-r9} - -dst_IV_loop1 - LDR r8, [r1], #4 // val_tw = *twiddle++ - LDR r5, [r2] // accu2 = pDat_0[0] - LDR r6, [r2, #4] // accu3 = pDat_0[1] - RSB r5, r5, #0 // accu2 = -accu2 - SMULWT r9, r5, r8 // accuX = (-accu2)*val_tw.l - LDR r4, [r3, #-4] // accu1 = pDat_1[-1] - RSB r9, r9, #0 // accuX = -(-accu2)*val_tw.l - SMLAWB r9, r4, r8, r9 // accuX = accu1*val_tw.h-(-accu2)*val_tw.l - SMULWT r4, r4, r8 // accu1 = accu1*val_tw.l - LDR r7, [r3, #-8] // accu4 = pDat_1[-2] - SMLAWB r5, r5, r8, r4 // accu2 = (-accu2)*val_tw.t+accu1*val_tw.l - LDR r8, [r1], #4 // val_tw = *twiddle++ - STR r5, [r2], #4 // *pDat_0++ = accu2 - STR r9, [r2], #4 // *pDat_0++ = accu1 (accuX) - RSB r7, r7, #0 // accu4 = -accu4 - SMULWB r5, r7, r8 // accu2 = (-accu4)*val_tw.h - SMULWB r4, r6, r8 // accu1 = (-accu4)*val_tw.l - RSB r5, r5, #0 // accu2 = -(-accu4)*val_tw.h - SMLAWT r6, r6, r8, r5 // accu3 = (-accu4)*val_tw.l-(-accu3)*val_tw.h - SMLAWT r7, r7, r8, r4 // accu4 = (-accu3)*val_tw.l+(-accu4)*val_tw.h - STR r6, [r3, #-4] ! // *--pDat_1 = accu3 - STR r7, [r3, #-4] ! // *--pDat_1 = accu4 - - LDR r8, [r1], #4 // val_tw = *twiddle++ - LDR r5, [r2] // accu2 = pDat_0[0] - LDR r6, [r2, #4] // accu3 = pDat_0[1] - RSB r5, r5, #0 // accu2 = -accu2 - SMULWT r9, r5, r8 // accuX = (-accu2)*val_tw.l - LDR r4, [r3, #-4] // accu1 = pDat_1[-1] - RSB r9, r9, #0 // accuX = -(-accu2)*val_tw.l - SMLAWB r9, r4, r8, r9 // accuX = accu1*val_tw.h-(-accu2)*val_tw.l - SMULWT r4, r4, r8 // accu1 = accu1*val_tw.l - LDR r7, [r3, #-8] // accu4 = pDat_1[-2] - SMLAWB r5, r5, r8, r4 // accu2 = (-accu2)*val_tw.t+accu1*val_tw.l - LDR r8, [r1], #4 // val_tw = *twiddle++ - STR r5, [r2], #4 // *pDat_0++ = accu2 - STR r9, [r2], #4 // *pDat_0++ = accu1 (accuX) - RSB r7, r7, #0 // accu4 = -accu4 - SMULWB r5, r7, r8 // accu2 = (-accu4)*val_tw.h - SMULWB r4, r6, r8 // accu1 = (-accu4)*val_tw.l - RSB r5, r5, #0 // accu2 = -(-accu4)*val_tw.h - SMLAWT r6, r6, r8, r5 // accu3 = (-accu4)*val_tw.l-(-accu3)*val_tw.h - SMLAWT r7, r7, r8, r4 // accu4 = (-accu3)*val_tw.l+(-accu4)*val_tw.h - STR r6, [r3, #-4] ! // *--pDat_1 = accu3 - STR r7, [r3, #-4] ! // *--pDat_1 = accu4 - - SUBS r0, r0, #4 // i-= 4 - BNE dst_IV_loop1 - - POP {r4-r9} - BX lr +__asm void dst_IV_func1(int i, const FIXP_SPK *twiddle, FIXP_DBL *pDat_0, + FIXP_DBL *pDat_1) { + /* Register map: + r0 i + r1 twiddle + r2 pDat_0 + r3 pDat_1 + r4 accu1 + r5 accu2 + r6 accu3 + r7 accu4 + r8 val_tw + r9 accuX + */ + PUSH{r4 - r9} + + dst_IV_loop1 LDR r8, + [r1], +# 4 // val_tw = *twiddle++ + LDR r5, + [r2] // accu2 = pDat_0[0] + LDR r6, + [ r2, #4 ] // accu3 = pDat_0[1] + RSB r5, + r5, +# 0 // accu2 = -accu2 + SMULWT r9, r5, + r8 // accuX = (-accu2)*val_tw.l + LDR r4, + [ r3, # - 4 ] // accu1 = pDat_1[-1] + RSB r9, + r9, +# 0 // accuX = -(-accu2)*val_tw.l + SMLAWB r9, r4, r8, + r9 // accuX = accu1*val_tw.h-(-accu2)*val_tw.l + SMULWT r4, + r4, + r8 // accu1 = accu1*val_tw.l + LDR r7, + [ r3, # - 8 ] // accu4 = pDat_1[-2] + SMLAWB r5, + r5, r8, + r4 // accu2 = (-accu2)*val_tw.t+accu1*val_tw.l + LDR r8, + [r1], +# 4 // val_tw = *twiddle++ + STR r5, [r2], +# 4 // *pDat_0++ = accu2 + STR r9, [r2], +# 4 // *pDat_0++ = accu1 (accuX) + RSB r7, r7, +# 0 // accu4 = -accu4 + SMULWB r5, r7, + r8 // accu2 = (-accu4)*val_tw.h + SMULWB r4, + r6, + r8 // accu1 = (-accu4)*val_tw.l + RSB r5, + r5, +# 0 // accu2 = -(-accu4)*val_tw.h + SMLAWT r6, r6, r8, + r5 // accu3 = (-accu4)*val_tw.l-(-accu3)*val_tw.h + SMLAWT r7, + r7, r8, + r4 // accu4 = (-accu3)*val_tw.l+(-accu4)*val_tw.h + STR r6, + [ r3, # - 4 ] ! // *--pDat_1 = accu3 + STR r7, + [ r3, # - 4 ] ! // *--pDat_1 = accu4 + + LDR r8, + [r1], +# 4 // val_tw = *twiddle++ + LDR r5, + [r2] // accu2 = pDat_0[0] + LDR r6, + [ r2, #4 ] // accu3 = pDat_0[1] + RSB r5, + r5, +# 0 // accu2 = -accu2 + SMULWT r9, r5, + r8 // accuX = (-accu2)*val_tw.l + LDR r4, + [ r3, # - 4 ] // accu1 = pDat_1[-1] + RSB r9, + r9, +# 0 // accuX = -(-accu2)*val_tw.l + SMLAWB r9, r4, r8, + r9 // accuX = accu1*val_tw.h-(-accu2)*val_tw.l + SMULWT r4, + r4, + r8 // accu1 = accu1*val_tw.l + LDR r7, + [ r3, # - 8 ] // accu4 = pDat_1[-2] + SMLAWB r5, + r5, r8, + r4 // accu2 = (-accu2)*val_tw.t+accu1*val_tw.l + LDR r8, + [r1], +# 4 // val_tw = *twiddle++ + STR r5, [r2], +# 4 // *pDat_0++ = accu2 + STR r9, [r2], +# 4 // *pDat_0++ = accu1 (accuX) + RSB r7, r7, +# 0 // accu4 = -accu4 + SMULWB r5, r7, + r8 // accu2 = (-accu4)*val_tw.h + SMULWB r4, + r6, + r8 // accu1 = (-accu4)*val_tw.l + RSB r5, + r5, +# 0 // accu2 = -(-accu4)*val_tw.h + SMLAWT r6, r6, r8, + r5 // accu3 = (-accu4)*val_tw.l-(-accu3)*val_tw.h + SMLAWT r7, + r7, r8, + r4 // accu4 = (-accu3)*val_tw.l+(-accu4)*val_tw.h + STR r6, + [ r3, # - 4 ] ! // *--pDat_1 = accu3 + STR r7, + [ r3, # - 4 ] ! // *--pDat_1 = accu4 + + SUBS r0, + r0, +# 4 // i-= 4 + BNE dst_IV_loop1 + + POP{r4 - r9} BX lr } #endif /* FUNCTION_dst_IV_func1 */ #ifdef FUNCTION_dst_IV_func2 -FDK_INLINE /* __attribute__((noinline)) */ -static void dst_IV_func2( - int i, - const FIXP_SPK *twiddle, - FIXP_DBL *RESTRICT pDat_0, - FIXP_DBL *RESTRICT pDat_1, - int inc) -{ - FIXP_DBL accu1,accu2,accu3,accu4; +static inline void dst_IV_func2(int i, const FIXP_SPK *twiddle, + FIXP_DBL *RESTRICT pDat_0, + FIXP_DBL *RESTRICT pDat_1, int inc) { + FIXP_DBL accu1, accu2, accu3, accu4; LONG val_tw; accu4 = pDat_0[0]; @@ -371,76 +489,74 @@ static void dst_IV_func2( *pDat_0++ = accu3; *pDat_1-- = accu4; - - __asm - { + __asm { B dst_IV_loop2_2nd_part - /* 50 cycles for 2 iterations = 25 cycles/iteration */ + /* 50 cycles for 2 iterations = 25 cycles/iteration */ dst_IV_loop2: - LDR val_tw, [twiddle], inc, LSL #2 // val_tw = *twiddle; twiddle += inc + LDR val_tw, [twiddle], inc, LSL #2 // val_tw = *twiddle; twiddle += inc - RSB accu2, accu2, #0 // accu2 = -accu2 - RSB accu1, accu1, #0 // accu1 = -accu1 - SMULWT accu3, accu2, val_tw // accu3 = (-accu2)*val_tw.l - SMULWT accu4, accu1, val_tw // accu4 = (-accu1)*val_tw.l - RSB accu3, accu3, #0 // accu3 = -accu2*val_tw.l - SMLAWB accu1, accu1, val_tw, accu3 // accu1 = -accu1*val_tw.h-(-accu2)*val_tw.l - SMLAWB accu2, accu2, val_tw, accu4 // accu2 = (-accu1)*val_tw.l+(-accu2)*val_tw.h - STR accu1, [pDat_1], #-4 // *pDat_1-- = accu1 - STR accu2, [pDat_0], #4 // *pDat_0++ = accu2 + RSB accu2, accu2, #0 // accu2 = -accu2 + RSB accu1, accu1, #0 // accu1 = -accu1 + SMULWT accu3, accu2, val_tw // accu3 = (-accu2)*val_tw.l + SMULWT accu4, accu1, val_tw // accu4 = (-accu1)*val_tw.l + RSB accu3, accu3, #0 // accu3 = -accu2*val_tw.l + SMLAWB accu1, accu1, val_tw, accu3 // accu1 = -accu1*val_tw.h-(-accu2)*val_tw.l + SMLAWB accu2, accu2, val_tw, accu4 // accu2 = (-accu1)*val_tw.l+(-accu2)*val_tw.h + STR accu1, [pDat_1], #-4 // *pDat_1-- = accu1 + STR accu2, [pDat_0], #4 // *pDat_0++ = accu2 - LDR accu4, [pDat_0] // accu4 = pDat_0[0] - LDR accu3, [pDat_0, #4] // accu3 = pDat_0[1] + LDR accu4, [pDat_0] // accu4 = pDat_0[0] + LDR accu3, [pDat_0, #4] // accu3 = pDat_0[1] - RSB accu4, accu4, #0 // accu4 = -accu4 - RSB accu3, accu3, #0 // accu3 = -accu3 + RSB accu4, accu4, #0 // accu4 = -accu4 + RSB accu3, accu3, #0 // accu3 = -accu3 - SMULWB accu1, accu3, val_tw // accu1 = (-accu3)*val_tw.h - SMULWT accu2, accu3, val_tw // accu2 = (-accu3)*val_tw.l - RSB accu1, accu1, #0 // accu1 = -(-accu3)*val_tw.h - SMLAWT accu3, accu4, val_tw, accu1 // accu3 = (-accu4)*val_tw.l-(-accu3)*val_tw.h - SMLAWB accu4, accu4, val_tw, accu2 // accu4 = (-accu3)*val_tw.l+(-accu4)*val_tw.h + SMULWB accu1, accu3, val_tw // accu1 = (-accu3)*val_tw.h + SMULWT accu2, accu3, val_tw // accu2 = (-accu3)*val_tw.l + RSB accu1, accu1, #0 // accu1 = -(-accu3)*val_tw.h + SMLAWT accu3, accu4, val_tw, accu1 // accu3 = (-accu4)*val_tw.l-(-accu3)*val_tw.h + SMLAWB accu4, accu4, val_tw, accu2 // accu4 = (-accu3)*val_tw.l+(-accu4)*val_tw.h - LDR accu1, [pDat_1, #-4] // accu1 = pDat_1[-1] - LDR accu2, [pDat_1] // accu2 = pDat_1[0] + LDR accu1, [pDat_1, #-4] // accu1 = pDat_1[-1] + LDR accu2, [pDat_1] // accu2 = pDat_1[0] - STR accu3, [pDat_0], #4 // *pDat_0++ = accu3 - STR accu4, [pDat_1], #-4 // *pDat_1-- = accu4 + STR accu3, [pDat_0], #4 // *pDat_0++ = accu3 + STR accu4, [pDat_1], #-4 // *pDat_1-- = accu4 dst_IV_loop2_2nd_part: - LDR val_tw, [twiddle], inc, LSL #2 // val_tw = *twiddle; twiddle += inc + LDR val_tw, [twiddle], inc, LSL #2 // val_tw = *twiddle; twiddle += inc - RSB accu2, accu2, #0 // accu2 = -accu2 - RSB accu1, accu1, #0 // accu1 = -accu1 - SMULWT accu3, accu2, val_tw // accu3 = (-accu2)*val_tw.l - SMULWT accu4, accu1, val_tw // accu4 = (-accu1)*val_tw.l - RSB accu3, accu3, #0 // accu3 = -accu2*val_tw.l - SMLAWB accu1, accu1, val_tw, accu3 // accu1 = -accu1*val_tw.h-(-accu2)*val_tw.l - SMLAWB accu2, accu2, val_tw, accu4 // accu2 = (-accu1)*val_tw.l+(-accu2)*val_tw.h - STR accu1, [pDat_1], #-4 // *pDat_1-- = accu1 - STR accu2, [pDat_0], #4 // *pDat_0++ = accu2 + RSB accu2, accu2, #0 // accu2 = -accu2 + RSB accu1, accu1, #0 // accu1 = -accu1 + SMULWT accu3, accu2, val_tw // accu3 = (-accu2)*val_tw.l + SMULWT accu4, accu1, val_tw // accu4 = (-accu1)*val_tw.l + RSB accu3, accu3, #0 // accu3 = -accu2*val_tw.l + SMLAWB accu1, accu1, val_tw, accu3 // accu1 = -accu1*val_tw.h-(-accu2)*val_tw.l + SMLAWB accu2, accu2, val_tw, accu4 // accu2 = (-accu1)*val_tw.l+(-accu2)*val_tw.h + STR accu1, [pDat_1], #-4 // *pDat_1-- = accu1 + STR accu2, [pDat_0], #4 // *pDat_0++ = accu2 - LDR accu4, [pDat_0] // accu4 = pDat_0[0] - LDR accu3, [pDat_0, #4] // accu3 = pDat_0[1] + LDR accu4, [pDat_0] // accu4 = pDat_0[0] + LDR accu3, [pDat_0, #4] // accu3 = pDat_0[1] - RSB accu4, accu4, #0 // accu4 = -accu4 - RSB accu3, accu3, #0 // accu3 = -accu3 + RSB accu4, accu4, #0 // accu4 = -accu4 + RSB accu3, accu3, #0 // accu3 = -accu3 - SMULWB accu1, accu3, val_tw // accu1 = (-accu3)*val_tw.h - SMULWT accu2, accu3, val_tw // accu2 = (-accu3)*val_tw.l - RSB accu1, accu1, #0 // accu1 = -(-accu3)*val_tw.h - SMLAWT accu3, accu4, val_tw, accu1 // accu3 = (-accu4)*val_tw.l-(-accu3)*val_tw.h - SMLAWB accu4, accu4, val_tw, accu2 // accu4 = (-accu3)*val_tw.l+(-accu4)*val_tw.h + SMULWB accu1, accu3, val_tw // accu1 = (-accu3)*val_tw.h + SMULWT accu2, accu3, val_tw // accu2 = (-accu3)*val_tw.l + RSB accu1, accu1, #0 // accu1 = -(-accu3)*val_tw.h + SMLAWT accu3, accu4, val_tw, accu1 // accu3 = (-accu4)*val_tw.l-(-accu3)*val_tw.h + SMLAWB accu4, accu4, val_tw, accu2 // accu4 = (-accu3)*val_tw.l+(-accu4)*val_tw.h - LDR accu1, [pDat_1, #-4] // accu1 = pDat_1[-1] - LDR accu2, [pDat_1] // accu2 = pDat_1[0] + LDR accu1, [pDat_1, #-4] // accu1 = pDat_1[-1] + LDR accu2, [pDat_1] // accu2 = pDat_1[0] - STR accu3, [pDat_0], #4 // *pDat_0++ = accu3 - STR accu4, [pDat_1], #-4 // *pDat_1-- = accu4 + STR accu3, [pDat_0], #4 // *pDat_0++ = accu3 + STR accu4, [pDat_1], #-4 // *pDat_1-- = accu4 SUBS i, i, #1 BNE dst_IV_loop2 |