aboutsummaryrefslogtreecommitdiffstats
path: root/libFDK/src/arm/dct_arm.cpp
diff options
context:
space:
mode:
author: Fraunhofer IIS FDK <audio-fdk@iis.fraunhofer.de> 2018-02-26 20:17:00 +0100
committer: Jean-Michel Trivi <jmtrivi@google.com> 2018-04-19 11:21:15 -0700
commit: 6cfabd35363c3ef5e3b209b867169a500b3ccc3c (patch)
tree: 01c0a19f2735e8b5d2407555fe992d4230d089eb /libFDK/src/arm/dct_arm.cpp
parent: 6288a1e34c4dede4c2806beb1736ece6580558c7 (diff)
download: fdk-aac-6cfabd35363c3ef5e3b209b867169a500b3ccc3c.tar.gz
fdk-aac-6cfabd35363c3ef5e3b209b867169a500b3ccc3c.tar.bz2
fdk-aac-6cfabd35363c3ef5e3b209b867169a500b3ccc3c.zip
Upgrade to FDKv2
Bug: 71430241 Test: CTS DecoderTest and DecoderTestAacDrc original-Change-Id: Iaa20f749b8a04d553b20247cfe1a8930ebbabe30 Apply clang-format also on header files. original-Change-Id: I14de1ef16bbc79ec0283e745f98356a10efeb2e4 Fixes for MPEG-D DRC original-Change-Id: If1de2d74bbbac84b3f67de3b88b83f6a23b8a15c Catch unsupported tw_mdct at an early stage original-Change-Id: Ied9dd00d754162a0e3ca1ae3e6b854315d818afe Fixing PVC transition frames original-Change-Id: Ib75725abe39252806c32d71176308f2c03547a4e Move qmf bands sanity check original-Change-Id: Iab540c3013c174d9490d2ae100a4576f51d8dbc4 Initialize scaling variable original-Change-Id: I3c4087101b70e998c71c1689b122b0d7762e0f9e Add 16 qmf band configuration to getSlotNrgHQ() original-Change-Id: I49a5d30f703a1b126ff163df9656db2540df21f1 Always apply byte alignment at the end of the AudioMuxElement original-Change-Id: I42d560287506d65d4c3de8bfe3eb9a4ebeb4efc7 Setup SBR element only if no parse error exists original-Change-Id: I1915b73704bc80ab882b9173d6bec59cbd073676 Additional array index check in HCR original-Change-Id: I18cc6e501ea683b5009f1bbee26de8ddd04d8267 Fix fade-in index selection in concealment module original-Change-Id: Ibf802ed6ed8c05e9257e1f3b6d0ac1162e9b81c1 Enable explicit backward compatible parser for AAC_LD original-Change-Id: I27e9c678dcb5d40ed760a6d1e06609563d02482d Skip spatial specific config in explicit backward compatible ASC original-Change-Id: Iff7cc365561319e886090cedf30533f562ea4d6e Update flags description in decoder API original-Change-Id: I9a5b4f8da76bb652f5580cbd3ba9760425c43830 Add QMF domain reset function original-Change-Id: I4f89a8a2c0277d18103380134e4ed86996e9d8d6 DRC upgrade v2.1.0 original-Change-Id: I5731c0540139dab220094cd978ef42099fc45b74 Fix integer overflow in sqrtFixp_lookup() original-Change-Id: I429a6f0d19aa2cc957e0f181066f0ca73968c914 Fix integer overflow in invSqrtNorm2() original-Change-Id: I84de5cbf9fb3adeb611db203fe492fabf4eb6155 Fix integer overflow in 
GenerateRandomVector() original-Change-Id: I3118a641008bd9484d479e5b0b1ee2b5d7d44d74 Fix integer overflow in adjustTimeSlot_EldGrid() original-Change-Id: I29d503c247c5c8282349b79df940416a512fb9d5 Fix integer overflow in FDKsbrEnc_codeEnvelope() original-Change-Id: I6b34b61ebb9d525b0c651ed08de2befc1f801449 Follow-up on: Fix integer overflow in adjustTimeSlot_EldGrid() original-Change-Id: I6f8f578cc7089e5eb7c7b93e580b72ca35ad689a Fix integer overflow in get_pk_v2() original-Change-Id: I63375bed40d45867f6eeaa72b20b1f33e815938c Fix integer overflow in Syn_filt_zero() original-Change-Id: Ie0c02fdfbe03988f9d3b20d10cd9fe4c002d1279 Fix integer overflow in CFac_CalcFacSignal() original-Change-Id: Id2d767c40066c591b51768e978eb8af3b803f0c5 Fix integer overflow in FDKaacEnc_FDKaacEnc_calcPeNoAH() original-Change-Id: Idcbd0f4a51ae2550ed106aa6f3d678d1f9724841 Fix integer overflow in sbrDecoder_calculateGainVec() original-Change-Id: I7081bcbe29c5cede9821b38d93de07c7add2d507 Fix integer overflow in CLpc_SynthesisLattice() original-Change-Id: I4a95ddc18de150102352d4a1845f06094764c881 Fix integer overflow in Pred_Lt4() original-Change-Id: I4dbd012b2de7d07c3e70a47b92e3bfae8dbc750a Fix integer overflow in FDKsbrEnc_InitSbrFastTransientDetector() original-Change-Id: I788cbec1a4a00f44c2f3a72ad7a4afa219807d04 Fix unsigned integer overflow in FDKaacEnc_WriteBitstream() original-Change-Id: I68fc75166e7d2cd5cd45b18dbe3d8c2a92f1822a Fix unsigned integer overflow in FDK_MetadataEnc_Init() original-Change-Id: Ie8d025f9bcdb2442c704bd196e61065c03c10af4 Fix overflow in pseudo random number generators original-Change-Id: I3e2551ee01356297ca14e3788436ede80bd5513c Fix unsigned integer overflow in sbrDecoder_Parse() original-Change-Id: I3f231b2f437e9c37db4d5b964164686710eee971 Fix unsigned integer overflow in longsub() original-Change-Id: I73c2bc50415cac26f1f5a29e125bbe75f9180a6e Fix unsigned integer overflow in CAacDecoder_DecodeFrame() original-Change-Id: Ifce2db4b1454b46fa5f887e9d383f1cc43b291e4 
Fix overflow at CLpdChannelStream_Read() original-Change-Id: Idb9d822ce3a4272e4794b643644f5434e2d4bf3f Fix unsigned integer overflow in Hcr_State_BODY_SIGN_ESC__ESC_WORD() original-Change-Id: I1ccf77c0015684b85534c5eb97162740a870b71c Fix unsigned integer overflow in UsacConfig_Parse() original-Change-Id: Ie6d27f84b6ae7eef092ecbff4447941c77864d9f Fix unsigned integer overflow in aacDecoder_drcParse() original-Change-Id: I713f28e883eea3d70b6fa56a7b8f8c22bcf66ca0 Fix unsigned integer overflow in aacDecoder_drcReadCompression() original-Change-Id: Ia34dfeb88c4705c558bce34314f584965cafcf7a Fix unsigned integer overflow in CDataStreamElement_Read() original-Change-Id: Iae896cc1d11f0a893d21be6aa90bd3e60a2c25f0 Fix unsigned integer overflow in transportDec_AdjustEndOfAccessUnit() original-Change-Id: I64cf29a153ee784bb4a16fdc088baabebc0007dc Fix unsigned integer overflow in transportDec_GetAuBitsRemaining() original-Change-Id: I975b3420faa9c16a041874ba0db82e92035962e4 Fix unsigned integer overflow in extractExtendedData() original-Change-Id: I2a59eb09e2053cfb58dfb75fcecfad6b85a80a8f Fix signed integer overflow in CAacDecoder_ExtPayloadParse() original-Change-Id: I4ad5ca4e3b83b5d964f1c2f8c5e7b17c477c7929 Fix unsigned integer overflow in CAacDecoder_DecodeFrame() original-Change-Id: I29a39df77d45c52a0c9c5c83c1ba81f8d0f25090 Follow-up on: Fix integer overflow in CLpc_SynthesisLattice() original-Change-Id: I8fb194ffc073a3432a380845be71036a272d388f Fix signed integer overflow in _interpolateDrcGain() original-Change-Id: I879ec9ab14005069a7c47faf80e8bc6e03d22e60 Fix unsigned integer overflow in FDKreadBits() original-Change-Id: I1f47a6a8037ff70375aa8844947d5681bb4287ad Fix unsigned integer overflow in FDKbyteAlign() original-Change-Id: Id5f3a11a0c9e50fc6f76ed6c572dbd4e9f2af766 Fix unsigned integer overflow in FDK_get32() original-Change-Id: I9d33b8e97e3d38cbb80629cb859266ca0acdce96 Fix unsigned integer overflow in FDK_pushBack() original-Change-Id: 
Ic87f899bc8c6acf7a377a8ca7f3ba74c3a1e1c19 Fix unsigned integer overflow in FDK_pushForward() original-Change-Id: I3b754382f6776a34be1602e66694ede8e0b8effc Fix unsigned integer overflow in ReadPsData() original-Change-Id: I25361664ba8139e32bbbef2ca8c106a606ce9c37 Fix signed integer overflow in E_UTIL_residu() original-Change-Id: I8c3abd1f437ee869caa8fb5903ce7d3d641b6aad REVERT: Follow-up on: Integer overflow in CLpc_SynthesisLattice(). original-Change-Id: I3d340099acb0414795c8dfbe6362bc0a8f045f9b Follow-up on: Fix integer overflow in CLpc_SynthesisLattice() original-Change-Id: I4aedb8b3a187064e9f4d985175aa55bb99cc7590 Follow-up on: Fix unsigned integer overflow in aacDecoder_drcParse() original-Change-Id: I2aa2e13916213bf52a67e8b0518e7bf7e57fb37d Fix integer overflow in acelp original-Change-Id: Ie6390c136d84055f8b728aefbe4ebef6e029dc77 Fix unsigned integer overflow in aacDecoder_UpdateBitStreamCounters() original-Change-Id: I391ffd97ddb0b2c184cba76139bfb356a3b4d2e2 Adjust concealment default settings original-Change-Id: I6a95db935a327c47df348030bcceafcb29f54b21 Saturate estimatedStartPos original-Change-Id: I27be2085e0ae83ec9501409f65e003f6bcba1ab6 Negative shift exponent in _interpolateDrcGain() original-Change-Id: I18edb26b26d002aafd5e633d4914960f7a359c29 Negative shift exponent in calculateICC() original-Change-Id: I3dcd2ae98d2eb70ee0d59750863cbb2a6f4f8aba Too large shift exponent in FDK_put() original-Change-Id: Ib7d9aaa434d2d8de4a13b720ca0464b31ca9b671 Too large shift exponent in CalcInvLdData() original-Change-Id: I43e6e78d4cd12daeb1dcd5d82d1798bdc2550262 Member access within null pointer of type SBR_CHANNEL original-Change-Id: Idc5e4ea8997810376d2f36bbdf628923b135b097 Member access within null pointer of type CpePersistentData original-Change-Id: Ib6c91cb0d37882768e5baf63324e429589de0d9d Member access within null pointer FDKaacEnc_psyMain() original-Change-Id: I7729b7f4479970531d9dc823abff63ca52e01997 Member access within null pointer FDKaacEnc_GetPnsParam() 
original-Change-Id: I9aa3b9f3456ae2e0f7483dbd5b3dde95fc62da39 Member access within null pointer FDKsbrEnc_EnvEncodeFrame() original-Change-Id: I67936f90ea714e90b3e81bc0dd1472cc713eb23a Add HCR sanity check original-Change-Id: I6c1d9732ebcf6af12f50b7641400752f74be39f7 Fix memory issue for HBE edge case with 8:3 SBR original-Change-Id: I11ea58a61e69fbe8bf75034b640baee3011e63e9 Additional SBR parametrization sanity check for ELD original-Change-Id: Ie26026fbfe174c2c7b3691f6218b5ce63e322140 Add MPEG-D DRC channel layout check original-Change-Id: Iea70a74f171b227cce636a9eac4ba662777a2f72 Additional out-of-bounds checks in MPEG-D DRC original-Change-Id: Ife4a8c3452c6fde8a0a09e941154a39a769777d4 Change-Id: Ic63cb2f628720f54fe9b572b0cb528e2599c624e
Diffstat (limited to 'libFDK/src/arm/dct_arm.cpp')
-rw-r--r--libFDK/src/arm/dct_arm.cpp678
1 files changed, 397 insertions, 281 deletions
diff --git a/libFDK/src/arm/dct_arm.cpp b/libFDK/src/arm/dct_arm.cpp
index 59b773e..dd66109 100644
--- a/libFDK/src/arm/dct_arm.cpp
+++ b/libFDK/src/arm/dct_arm.cpp
@@ -1,74 +1,85 @@
-
-/* -----------------------------------------------------------------------------------------------------------
+/* -----------------------------------------------------------------------------
Software License for The Fraunhofer FDK AAC Codec Library for Android
-© Copyright 1995 - 2013 Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V.
- All rights reserved.
+© Copyright 1995 - 2018 Fraunhofer-Gesellschaft zur Förderung der angewandten
+Forschung e.V. All rights reserved.
1. INTRODUCTION
-The Fraunhofer FDK AAC Codec Library for Android ("FDK AAC Codec") is software that implements
-the MPEG Advanced Audio Coding ("AAC") encoding and decoding scheme for digital audio.
-This FDK AAC Codec software is intended to be used on a wide variety of Android devices.
-
-AAC's HE-AAC and HE-AAC v2 versions are regarded as today's most efficient general perceptual
-audio codecs. AAC-ELD is considered the best-performing full-bandwidth communications codec by
-independent studies and is widely deployed. AAC has been standardized by ISO and IEC as part
-of the MPEG specifications.
-
-Patent licenses for necessary patent claims for the FDK AAC Codec (including those of Fraunhofer)
-may be obtained through Via Licensing (www.vialicensing.com) or through the respective patent owners
-individually for the purpose of encoding or decoding bit streams in products that are compliant with
-the ISO/IEC MPEG audio standards. Please note that most manufacturers of Android devices already license
-these patent claims through Via Licensing or directly from the patent owners, and therefore FDK AAC Codec
-software may already be covered under those patent licenses when it is used for those licensed purposes only.
-
-Commercially-licensed AAC software libraries, including floating-point versions with enhanced sound quality,
-are also available from Fraunhofer. Users are encouraged to check the Fraunhofer website for additional
-applications information and documentation.
+The Fraunhofer FDK AAC Codec Library for Android ("FDK AAC Codec") is software
+that implements the MPEG Advanced Audio Coding ("AAC") encoding and decoding
+scheme for digital audio. This FDK AAC Codec software is intended to be used on
+a wide variety of Android devices.
+
+AAC's HE-AAC and HE-AAC v2 versions are regarded as today's most efficient
+general perceptual audio codecs. AAC-ELD is considered the best-performing
+full-bandwidth communications codec by independent studies and is widely
+deployed. AAC has been standardized by ISO and IEC as part of the MPEG
+specifications.
+
+Patent licenses for necessary patent claims for the FDK AAC Codec (including
+those of Fraunhofer) may be obtained through Via Licensing
+(www.vialicensing.com) or through the respective patent owners individually for
+the purpose of encoding or decoding bit streams in products that are compliant
+with the ISO/IEC MPEG audio standards. Please note that most manufacturers of
+Android devices already license these patent claims through Via Licensing or
+directly from the patent owners, and therefore FDK AAC Codec software may
+already be covered under those patent licenses when it is used for those
+licensed purposes only.
+
+Commercially-licensed AAC software libraries, including floating-point versions
+with enhanced sound quality, are also available from Fraunhofer. Users are
+encouraged to check the Fraunhofer website for additional applications
+information and documentation.
2. COPYRIGHT LICENSE
-Redistribution and use in source and binary forms, with or without modification, are permitted without
-payment of copyright license fees provided that you satisfy the following conditions:
+Redistribution and use in source and binary forms, with or without modification,
+are permitted without payment of copyright license fees provided that you
+satisfy the following conditions:
-You must retain the complete text of this software license in redistributions of the FDK AAC Codec or
-your modifications thereto in source code form.
+You must retain the complete text of this software license in redistributions of
+the FDK AAC Codec or your modifications thereto in source code form.
-You must retain the complete text of this software license in the documentation and/or other materials
-provided with redistributions of the FDK AAC Codec or your modifications thereto in binary form.
-You must make available free of charge copies of the complete source code of the FDK AAC Codec and your
+You must retain the complete text of this software license in the documentation
+and/or other materials provided with redistributions of the FDK AAC Codec or
+your modifications thereto in binary form. You must make available free of
+charge copies of the complete source code of the FDK AAC Codec and your
modifications thereto to recipients of copies in binary form.
-The name of Fraunhofer may not be used to endorse or promote products derived from this library without
-prior written permission.
+The name of Fraunhofer may not be used to endorse or promote products derived
+from this library without prior written permission.
-You may not charge copyright license fees for anyone to use, copy or distribute the FDK AAC Codec
-software or your modifications thereto.
+You may not charge copyright license fees for anyone to use, copy or distribute
+the FDK AAC Codec software or your modifications thereto.
-Your modified versions of the FDK AAC Codec must carry prominent notices stating that you changed the software
-and the date of any change. For modified versions of the FDK AAC Codec, the term
-"Fraunhofer FDK AAC Codec Library for Android" must be replaced by the term
-"Third-Party Modified Version of the Fraunhofer FDK AAC Codec Library for Android."
+Your modified versions of the FDK AAC Codec must carry prominent notices stating
+that you changed the software and the date of any change. For modified versions
+of the FDK AAC Codec, the term "Fraunhofer FDK AAC Codec Library for Android"
+must be replaced by the term "Third-Party Modified Version of the Fraunhofer FDK
+AAC Codec Library for Android."
3. NO PATENT LICENSE
-NO EXPRESS OR IMPLIED LICENSES TO ANY PATENT CLAIMS, including without limitation the patents of Fraunhofer,
-ARE GRANTED BY THIS SOFTWARE LICENSE. Fraunhofer provides no warranty of patent non-infringement with
-respect to this software.
+NO EXPRESS OR IMPLIED LICENSES TO ANY PATENT CLAIMS, including without
+limitation the patents of Fraunhofer, ARE GRANTED BY THIS SOFTWARE LICENSE.
+Fraunhofer provides no warranty of patent non-infringement with respect to this
+software.
-You may use this FDK AAC Codec software or modifications thereto only for purposes that are authorized
-by appropriate patent licenses.
+You may use this FDK AAC Codec software or modifications thereto only for
+purposes that are authorized by appropriate patent licenses.
4. DISCLAIMER
-This FDK AAC Codec software is provided by Fraunhofer on behalf of the copyright holders and contributors
-"AS IS" and WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, including but not limited to the implied warranties
-of merchantability and fitness for a particular purpose. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
-CONTRIBUTORS BE LIABLE for any direct, indirect, incidental, special, exemplary, or consequential damages,
-including but not limited to procurement of substitute goods or services; loss of use, data, or profits,
-or business interruption, however caused and on any theory of liability, whether in contract, strict
-liability, or tort (including negligence), arising in any way out of the use of this software, even if
-advised of the possibility of such damage.
+This FDK AAC Codec software is provided by Fraunhofer on behalf of the copyright
+holders and contributors "AS IS" and WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES,
+including but not limited to the implied warranties of merchantability and
+fitness for a particular purpose. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE for any direct, indirect, incidental, special, exemplary,
+or consequential damages, including but not limited to procurement of substitute
+goods or services; loss of use, data, or profits, or business interruption,
+however caused and on any theory of liability, whether in contract, strict
+liability, or tort (including negligence), arising in any way out of the use of
+this software, even if advised of the possibility of such damage.
5. CONTACT INFORMATION
@@ -79,9 +90,15 @@ Am Wolfsmantel 33
www.iis.fraunhofer.de/amm
amm-info@iis.fraunhofer.de
------------------------------------------------------------------------------------------------------------ */
+----------------------------------------------------------------------------- */
+
+/******************* Library for basic calculation routines ********************
+
+ Author(s):
+ Description:
+*******************************************************************************/
#ifdef FUNCTION_dct_IV_func1
@@ -91,119 +108,168 @@ amm-info@iis.fraunhofer.de
With this version, we save 2 cycles per loop iteration.
*/
-__asm void dct_IV_func1(
- int i,
- const FIXP_SPK *twiddle,
- FIXP_DBL *RESTRICT pDat_0,
- FIXP_DBL *RESTRICT pDat_1)
-{
- /* Register map:
- r0 i
- r1 twiddle
- r2 pDat_0
- r3 pDat_1
- r4 accu1
- r5 accu2
- r6 accu3
- r7 accu4
- r8 val_tw
- r9 accuX
- */
- PUSH {r4-r9}
-
- /* 44 cycles for 2 iterations = 22 cycles/iteration */
-dct_IV_loop1_start
-/* First iteration */
- LDR r8, [r1], #4 // val_tw = *twiddle++;
- LDR r5, [r2, #0] // accu2 = pDat_0[0]
- LDR r4, [r3, #0] // accu1 = pDat_1[0]
-
- SMULWT r9, r5, r8 // accuX = accu2*val_tw.l
- SMULWB r5, r5, r8 // accu2 = accu2*val_tw.h
- RSB r9, r9, #0 // accuX =-accu2*val_tw.l
- SMLAWT r5, r4, r8, r5 // accu2 = accu2*val_tw.h + accu1*val_tw.l
- SMLAWB r4, r4, r8, r9 // accu1 = accu1*val_tw.h - accu2*val_tw.l
-
- LDR r8, [r1], #4 // val_tw = *twiddle++;
- LDR r7, [r3, #-4] // accu4 = pDat_1[-1]
- LDR r6, [r2, #4] // accu3 = pDat_0[1]
-
- SMULWB r9, r7, r8 // accuX = accu4*val_tw.h
- SMULWT r7, r7, r8 // accu4 = accu4*val_tw.l
- RSB r9, r9, #0 // accuX =-accu4*val_tw.h
- SMLAWB r7, r6, r8, r7 // accu4 = accu4*val_tw.l+accu3*val_tw.h
- SMLAWT r6, r6, r8, r9 // accu3 = accu3*val_tw.l-accu4*val_tw.h
-
- STR r5, [r2], #4 // *pDat_0++ = accu2
- STR r4, [r2], #4 // *pDat_0++ = accu1
- STR r6, [r3], #-4 // *pDat_1-- = accu3
- STR r7, [r3], #-4 // *pDat_1-- = accu4
-
-/* Second iteration */
- LDR r8, [r1], #4 // val_tw = *twiddle++;
- LDR r5, [r2, #0] // accu2 = pDat_0[0]
- LDR r4, [r3, #0] // accu1 = pDat_1[0]
-
- SMULWT r9, r5, r8 // accuX = accu2*val_tw.l
- SMULWB r5, r5, r8 // accu2 = accu2*val_tw.h
- RSB r9, r9, #0 // accuX =-accu2*val_tw.l
- SMLAWT r5, r4, r8, r5 // accu2 = accu2*val_tw.h + accu1*val_tw.l
- SMLAWB r4, r4, r8, r9 // accu1 = accu1*val_tw.h - accu2*val_tw.l
-
- LDR r8, [r1], #4 // val_tw = *twiddle++;
- LDR r7, [r3, #-4] // accu4 = pDat_1[-1]
- LDR r6, [r2, #4] // accu3 = pDat_0[1]
-
- SMULWB r9, r7, r8 // accuX = accu4*val_tw.h
- SMULWT r7, r7, r8 // accu4 = accu4*val_tw.l
- RSB r9, r9, #0 // accuX =-accu4*val_tw.h
- SMLAWB r7, r6, r8, r7 // accu4 = accu4*val_tw.l+accu3*val_tw.h
- SMLAWT r6, r6, r8, r9 // accu3 = accu3*val_tw.l-accu4*val_tw.h
-
- STR r5, [r2], #4 // *pDat_0++ = accu2
- STR r4, [r2], #4 // *pDat_0++ = accu1
- STR r6, [r3], #-4 // *pDat_1-- = accu3
- STR r7, [r3], #-4 // *pDat_1-- = accu4
-
- SUBS r0, r0, #1
- BNE dct_IV_loop1_start
-
- POP {r4-r9}
-
- BX lr
+__asm void dct_IV_func1(int i, const FIXP_SPK *twiddle,
+                        FIXP_DBL *RESTRICT pDat_0, FIXP_DBL *RESTRICT pDat_1) {
+  /* clang-format off */
+  /* The body below is embedded assembler, not C++: labels must start in the
+     first column and every instruction must stay on one line together with
+     its operands. Automatic formatting has to be kept away from it, because
+     splitting an operand such as "#4" onto its own line turns it into a
+     preprocessor directive.
+
+     Entry contract (as implied by the code):
+       - The loop is bottom-tested (SUBS/BNE), so i must be non-zero on entry.
+       - Each pass through the loop runs the unrolled body twice and
+         decrements i once. One pass reads four twiddle words, advances
+         pDat_0 by four words and moves pDat_1 back by four words.
+
+     Comment notation: val_tw.l is the halfword selected by the "T" forms
+     (SMULWT/SMLAWT), val_tw.h the halfword selected by the "B" forms
+     (SMULWB/SMLAWB).
+
+     Register map:
+       r0   i
+       r1   twiddle
+       r2   pDat_0
+       r3   pDat_1
+       r4   accu1
+       r5   accu2
+       r6   accu3
+       r7   accu4
+       r8   val_tw
+       r9   accuX
+  */
+    PUSH    {r4-r9}
+
+    /* 44 cycles for 2 iterations = 22 cycles/iteration */
+dct_IV_loop1_start
+    /* First iteration */
+    LDR     r8, [r1], #4            // val_tw = *twiddle++;
+    LDR     r5, [r2, #0]            // accu2 = pDat_0[0]
+    LDR     r4, [r3, #0]            // accu1 = pDat_1[0]
+
+    SMULWT  r9, r5, r8              // accuX = accu2*val_tw.l
+    SMULWB  r5, r5, r8              // accu2 = accu2*val_tw.h
+    RSB     r9, r9, #0              // accuX =-accu2*val_tw.l
+    SMLAWT  r5, r4, r8, r5          // accu2 = accu2*val_tw.h + accu1*val_tw.l
+    SMLAWB  r4, r4, r8, r9          // accu1 = accu1*val_tw.h - accu2*val_tw.l
+
+    LDR     r8, [r1], #4            // val_tw = *twiddle++;
+    LDR     r7, [r3, #-4]           // accu4 = pDat_1[-1]
+    LDR     r6, [r2, #4]            // accu3 = pDat_0[1]
+
+    SMULWB  r9, r7, r8              // accuX = accu4*val_tw.h
+    SMULWT  r7, r7, r8              // accu4 = accu4*val_tw.l
+    RSB     r9, r9, #0              // accuX =-accu4*val_tw.h
+    SMLAWB  r7, r6, r8, r7          // accu4 = accu4*val_tw.l+accu3*val_tw.h
+    SMLAWT  r6, r6, r8, r9          // accu3 = accu3*val_tw.l-accu4*val_tw.h
+
+    STR     r5, [r2], #4            // *pDat_0++ = accu2
+    STR     r4, [r2], #4            // *pDat_0++ = accu1
+    STR     r6, [r3], #-4           // *pDat_1-- = accu3
+    STR     r7, [r3], #-4           // *pDat_1-- = accu4
+
+    /* Second iteration */
+    LDR     r8, [r1], #4            // val_tw = *twiddle++;
+    LDR     r5, [r2, #0]            // accu2 = pDat_0[0]
+    LDR     r4, [r3, #0]            // accu1 = pDat_1[0]
+
+    SMULWT  r9, r5, r8              // accuX = accu2*val_tw.l
+    SMULWB  r5, r5, r8              // accu2 = accu2*val_tw.h
+    RSB     r9, r9, #0              // accuX =-accu2*val_tw.l
+    SMLAWT  r5, r4, r8, r5          // accu2 = accu2*val_tw.h + accu1*val_tw.l
+    SMLAWB  r4, r4, r8, r9          // accu1 = accu1*val_tw.h - accu2*val_tw.l
+
+    LDR     r8, [r1], #4            // val_tw = *twiddle++;
+    LDR     r7, [r3, #-4]           // accu4 = pDat_1[-1]
+    LDR     r6, [r2, #4]            // accu3 = pDat_0[1]
+
+    SMULWB  r9, r7, r8              // accuX = accu4*val_tw.h
+    SMULWT  r7, r7, r8              // accu4 = accu4*val_tw.l
+    RSB     r9, r9, #0              // accuX =-accu4*val_tw.h
+    SMLAWB  r7, r6, r8, r7          // accu4 = accu4*val_tw.l+accu3*val_tw.h
+    SMLAWT  r6, r6, r8, r9          // accu3 = accu3*val_tw.l-accu4*val_tw.h
+
+    STR     r5, [r2], #4            // *pDat_0++ = accu2
+    STR     r4, [r2], #4            // *pDat_0++ = accu1
+    STR     r6, [r3], #-4           // *pDat_1-- = accu3
+    STR     r7, [r3], #-4           // *pDat_1-- = accu4
+
+    SUBS    r0, r0, #1              // one loop pass done; sets Z when i hits 0
+    BNE     dct_IV_loop1_start
+
+    POP     {r4-r9}
+
+    BX      lr
+  /* clang-format on */
}
#endif /* FUNCTION_dct_IV_func1 */
-
#ifdef FUNCTION_dct_IV_func2
-FDK_INLINE
/* __attribute__((noinline)) */
-static void dct_IV_func2(
- int i,
- const FIXP_SPK *twiddle,
- FIXP_DBL *pDat_0,
- FIXP_DBL *pDat_1,
- int inc)
-{
+static inline void dct_IV_func2(int i, const FIXP_SPK *twiddle,
+ FIXP_DBL *pDat_0, FIXP_DBL *pDat_1, int inc) {
FIXP_DBL accu1, accu2, accu3, accu4, accuX;
LONG val_tw;
accu1 = pDat_1[-2];
accu2 = pDat_1[-1];
- *--pDat_1 = -(pDat_0[1]>>1);
- *pDat_0++ = (pDat_0[0]>>1);
+ *--pDat_1 = -(pDat_0[1] >> 1);
+ *pDat_0++ = (pDat_0[0] >> 1);
twiddle += inc;
-__asm
- {
- LDR val_tw, [twiddle], inc, LSL #2 // val_tw = *twiddle; twiddle += inc
+ __asm {
+ LDR val_tw, [twiddle], inc, LSL #2 // val_tw = *twiddle; twiddle += inc
B dct_IV_loop2_2nd_part
- /* 42 cycles for 2 iterations = 21 cycles/iteration */
+ /* 42 cycles for 2 iterations = 21 cycles/iteration */
dct_IV_loop2:
SMULWT accuX, accu2, val_tw
SMULWB accu2, accu2, val_tw
@@ -224,7 +290,7 @@ dct_IV_loop2:
LDR accu1, [pDat_1, #-8]
LDR accu2, [pDat_1, #-4]
- LDR val_tw, [twiddle], inc, LSL #2 // val_tw = *twiddle; twiddle += inc
+ LDR val_tw, [twiddle], inc, LSL #2 // val_tw = *twiddle; twiddle += inc
STR accuX, [pDat_1, #-4] !
STR accu4, [pDat_0], #4
@@ -252,7 +318,7 @@ dct_IV_loop2_2nd_part:
STR accuX, [pDat_1, #-4] !
STR accu4, [pDat_0], #4
- LDR val_tw, [twiddle], inc, LSL #2 // val_tw = *twiddle; twiddle += inc
+ LDR val_tw, [twiddle], inc, LSL #2 // val_tw = *twiddle; twiddle += inc
SUBS i, i, #1
BNE dct_IV_loop2
@@ -267,96 +333,148 @@ dct_IV_loop2_2nd_part:
}
#endif /* FUNCTION_dct_IV_func2 */
-
#ifdef FUNCTION_dst_IV_func1
-__asm void dst_IV_func1(
- int i,
- const FIXP_SPK *twiddle,
- FIXP_DBL *pDat_0,
- FIXP_DBL *pDat_1)
-{
- /* Register map:
- r0 i
- r1 twiddle
- r2 pDat_0
- r3 pDat_1
- r4 accu1
- r5 accu2
- r6 accu3
- r7 accu4
- r8 val_tw
- r9 accuX
- */
- PUSH {r4-r9}
-
-dst_IV_loop1
- LDR r8, [r1], #4 // val_tw = *twiddle++
- LDR r5, [r2] // accu2 = pDat_0[0]
- LDR r6, [r2, #4] // accu3 = pDat_0[1]
- RSB r5, r5, #0 // accu2 = -accu2
- SMULWT r9, r5, r8 // accuX = (-accu2)*val_tw.l
- LDR r4, [r3, #-4] // accu1 = pDat_1[-1]
- RSB r9, r9, #0 // accuX = -(-accu2)*val_tw.l
- SMLAWB r9, r4, r8, r9 // accuX = accu1*val_tw.h-(-accu2)*val_tw.l
- SMULWT r4, r4, r8 // accu1 = accu1*val_tw.l
- LDR r7, [r3, #-8] // accu4 = pDat_1[-2]
- SMLAWB r5, r5, r8, r4 // accu2 = (-accu2)*val_tw.t+accu1*val_tw.l
- LDR r8, [r1], #4 // val_tw = *twiddle++
- STR r5, [r2], #4 // *pDat_0++ = accu2
- STR r9, [r2], #4 // *pDat_0++ = accu1 (accuX)
- RSB r7, r7, #0 // accu4 = -accu4
- SMULWB r5, r7, r8 // accu2 = (-accu4)*val_tw.h
- SMULWB r4, r6, r8 // accu1 = (-accu4)*val_tw.l
- RSB r5, r5, #0 // accu2 = -(-accu4)*val_tw.h
- SMLAWT r6, r6, r8, r5 // accu3 = (-accu4)*val_tw.l-(-accu3)*val_tw.h
- SMLAWT r7, r7, r8, r4 // accu4 = (-accu3)*val_tw.l+(-accu4)*val_tw.h
- STR r6, [r3, #-4] ! // *--pDat_1 = accu3
- STR r7, [r3, #-4] ! // *--pDat_1 = accu4
-
- LDR r8, [r1], #4 // val_tw = *twiddle++
- LDR r5, [r2] // accu2 = pDat_0[0]
- LDR r6, [r2, #4] // accu3 = pDat_0[1]
- RSB r5, r5, #0 // accu2 = -accu2
- SMULWT r9, r5, r8 // accuX = (-accu2)*val_tw.l
- LDR r4, [r3, #-4] // accu1 = pDat_1[-1]
- RSB r9, r9, #0 // accuX = -(-accu2)*val_tw.l
- SMLAWB r9, r4, r8, r9 // accuX = accu1*val_tw.h-(-accu2)*val_tw.l
- SMULWT r4, r4, r8 // accu1 = accu1*val_tw.l
- LDR r7, [r3, #-8] // accu4 = pDat_1[-2]
- SMLAWB r5, r5, r8, r4 // accu2 = (-accu2)*val_tw.t+accu1*val_tw.l
- LDR r8, [r1], #4 // val_tw = *twiddle++
- STR r5, [r2], #4 // *pDat_0++ = accu2
- STR r9, [r2], #4 // *pDat_0++ = accu1 (accuX)
- RSB r7, r7, #0 // accu4 = -accu4
- SMULWB r5, r7, r8 // accu2 = (-accu4)*val_tw.h
- SMULWB r4, r6, r8 // accu1 = (-accu4)*val_tw.l
- RSB r5, r5, #0 // accu2 = -(-accu4)*val_tw.h
- SMLAWT r6, r6, r8, r5 // accu3 = (-accu4)*val_tw.l-(-accu3)*val_tw.h
- SMLAWT r7, r7, r8, r4 // accu4 = (-accu3)*val_tw.l+(-accu4)*val_tw.h
- STR r6, [r3, #-4] ! // *--pDat_1 = accu3
- STR r7, [r3, #-4] ! // *--pDat_1 = accu4
-
- SUBS r0, r0, #4 // i-= 4
- BNE dst_IV_loop1
-
- POP {r4-r9}
- BX lr
+__asm void dst_IV_func1(int i, const FIXP_SPK *twiddle, FIXP_DBL *pDat_0,
+                        FIXP_DBL *pDat_1) {
+  /* clang-format off */
+  /* The body below is embedded assembler, not C++: labels must start in the
+     first column and every instruction must stay on one line together with
+     its operands. Automatic formatting has to be kept away from it, because
+     splitting an operand such as "#4" onto its own line turns it into a
+     preprocessor directive.
+
+     Entry contract (as implied by the code):
+       - The loop is bottom-tested and exits only when i reaches exactly 0
+         after "i -= 4", so i must be a positive multiple of 4 on entry.
+       - One pass runs the unrolled body twice: it reads four twiddle words,
+         advances pDat_0 by four words and moves pDat_1 back by four words.
+
+     Comment notation: val_tw.l is the halfword selected by the "T" forms
+     (SMULWT/SMLAWT), val_tw.h the halfword selected by the "B" forms
+     (SMULWB/SMLAWB). Note that accu2 and accu4 are negated after loading,
+     accu1 and accu3 are not.
+
+     Register map:
+       r0   i
+       r1   twiddle
+       r2   pDat_0
+       r3   pDat_1
+       r4   accu1
+       r5   accu2
+       r6   accu3
+       r7   accu4
+       r8   val_tw
+       r9   accuX
+  */
+    PUSH    {r4-r9}
+
+dst_IV_loop1
+    /* First iteration */
+    LDR     r8, [r1], #4            // val_tw = *twiddle++
+    LDR     r5, [r2]                // accu2 = pDat_0[0]
+    LDR     r6, [r2, #4]            // accu3 = pDat_0[1]
+    RSB     r5, r5, #0              // accu2 = -accu2
+    SMULWT  r9, r5, r8              // accuX = (-accu2)*val_tw.l
+    LDR     r4, [r3, #-4]           // accu1 = pDat_1[-1]
+    RSB     r9, r9, #0              // accuX = -(-accu2)*val_tw.l
+    SMLAWB  r9, r4, r8, r9          // accuX = accu1*val_tw.h-(-accu2)*val_tw.l
+    SMULWT  r4, r4, r8              // accu1 = accu1*val_tw.l
+    LDR     r7, [r3, #-8]           // accu4 = pDat_1[-2]
+    SMLAWB  r5, r5, r8, r4          // accu2 = (-accu2)*val_tw.h+accu1*val_tw.l
+    LDR     r8, [r1], #4            // val_tw = *twiddle++
+    STR     r5, [r2], #4            // *pDat_0++ = accu2
+    STR     r9, [r2], #4            // *pDat_0++ = accu1 (accuX)
+    RSB     r7, r7, #0              // accu4 = -accu4
+    SMULWB  r5, r7, r8              // accu2 = (-accu4)*val_tw.h
+    SMULWB  r4, r6, r8              // accu1 = accu3*val_tw.h
+    RSB     r5, r5, #0              // accu2 = -(-accu4)*val_tw.h
+    SMLAWT  r6, r6, r8, r5          // accu3 = accu3*val_tw.l-(-accu4)*val_tw.h
+    SMLAWT  r7, r7, r8, r4          // accu4 = (-accu4)*val_tw.l+accu3*val_tw.h
+    STR     r6, [r3, #-4]!          // *--pDat_1 = accu3
+    STR     r7, [r3, #-4]!          // *--pDat_1 = accu4
+
+    /* Second iteration */
+    LDR     r8, [r1], #4            // val_tw = *twiddle++
+    LDR     r5, [r2]                // accu2 = pDat_0[0]
+    LDR     r6, [r2, #4]            // accu3 = pDat_0[1]
+    RSB     r5, r5, #0              // accu2 = -accu2
+    SMULWT  r9, r5, r8              // accuX = (-accu2)*val_tw.l
+    LDR     r4, [r3, #-4]           // accu1 = pDat_1[-1]
+    RSB     r9, r9, #0              // accuX = -(-accu2)*val_tw.l
+    SMLAWB  r9, r4, r8, r9          // accuX = accu1*val_tw.h-(-accu2)*val_tw.l
+    SMULWT  r4, r4, r8              // accu1 = accu1*val_tw.l
+    LDR     r7, [r3, #-8]           // accu4 = pDat_1[-2]
+    SMLAWB  r5, r5, r8, r4          // accu2 = (-accu2)*val_tw.h+accu1*val_tw.l
+    LDR     r8, [r1], #4            // val_tw = *twiddle++
+    STR     r5, [r2], #4            // *pDat_0++ = accu2
+    STR     r9, [r2], #4            // *pDat_0++ = accu1 (accuX)
+    RSB     r7, r7, #0              // accu4 = -accu4
+    SMULWB  r5, r7, r8              // accu2 = (-accu4)*val_tw.h
+    SMULWB  r4, r6, r8              // accu1 = accu3*val_tw.h
+    RSB     r5, r5, #0              // accu2 = -(-accu4)*val_tw.h
+    SMLAWT  r6, r6, r8, r5          // accu3 = accu3*val_tw.l-(-accu4)*val_tw.h
+    SMLAWT  r7, r7, r8, r4          // accu4 = (-accu4)*val_tw.l+accu3*val_tw.h
+    STR     r6, [r3, #-4]!          // *--pDat_1 = accu3
+    STR     r7, [r3, #-4]!          // *--pDat_1 = accu4
+
+    SUBS    r0, r0, #4              // i-= 4
+    BNE     dst_IV_loop1
+
+    POP     {r4-r9}
+    BX      lr
+  /* clang-format on */
}
#endif /* FUNCTION_dst_IV_func1 */
#ifdef FUNCTION_dst_IV_func2
-FDK_INLINE
/* __attribute__((noinline)) */
-static void dst_IV_func2(
- int i,
- const FIXP_SPK *twiddle,
- FIXP_DBL *RESTRICT pDat_0,
- FIXP_DBL *RESTRICT pDat_1,
- int inc)
-{
- FIXP_DBL accu1,accu2,accu3,accu4;
+static inline void dst_IV_func2(int i, const FIXP_SPK *twiddle,
+ FIXP_DBL *RESTRICT pDat_0,
+ FIXP_DBL *RESTRICT pDat_1, int inc) {
+ FIXP_DBL accu1, accu2, accu3, accu4;
LONG val_tw;
accu4 = pDat_0[0];
@@ -371,76 +489,74 @@ static void dst_IV_func2(
*pDat_0++ = accu3;
*pDat_1-- = accu4;
-
- __asm
- {
+ __asm {
B dst_IV_loop2_2nd_part
- /* 50 cycles for 2 iterations = 25 cycles/iteration */
+ /* 50 cycles for 2 iterations = 25 cycles/iteration */
dst_IV_loop2:
- LDR val_tw, [twiddle], inc, LSL #2 // val_tw = *twiddle; twiddle += inc
+ LDR val_tw, [twiddle], inc, LSL #2 // val_tw = *twiddle; twiddle += inc
- RSB accu2, accu2, #0 // accu2 = -accu2
- RSB accu1, accu1, #0 // accu1 = -accu1
- SMULWT accu3, accu2, val_tw // accu3 = (-accu2)*val_tw.l
- SMULWT accu4, accu1, val_tw // accu4 = (-accu1)*val_tw.l
- RSB accu3, accu3, #0 // accu3 = -accu2*val_tw.l
- SMLAWB accu1, accu1, val_tw, accu3 // accu1 = -accu1*val_tw.h-(-accu2)*val_tw.l
- SMLAWB accu2, accu2, val_tw, accu4 // accu2 = (-accu1)*val_tw.l+(-accu2)*val_tw.h
- STR accu1, [pDat_1], #-4 // *pDat_1-- = accu1
- STR accu2, [pDat_0], #4 // *pDat_0++ = accu2
+ RSB accu2, accu2, #0 // accu2 = -accu2
+ RSB accu1, accu1, #0 // accu1 = -accu1
+ SMULWT accu3, accu2, val_tw // accu3 = (-accu2)*val_tw.l
+ SMULWT accu4, accu1, val_tw // accu4 = (-accu1)*val_tw.l
+ RSB accu3, accu3, #0 // accu3 = -accu2*val_tw.l
+ SMLAWB accu1, accu1, val_tw, accu3 // accu1 = -accu1*val_tw.h-(-accu2)*val_tw.l
+ SMLAWB accu2, accu2, val_tw, accu4 // accu2 = (-accu1)*val_tw.l+(-accu2)*val_tw.h
+ STR accu1, [pDat_1], #-4 // *pDat_1-- = accu1
+ STR accu2, [pDat_0], #4 // *pDat_0++ = accu2
- LDR accu4, [pDat_0] // accu4 = pDat_0[0]
- LDR accu3, [pDat_0, #4] // accu3 = pDat_0[1]
+ LDR accu4, [pDat_0] // accu4 = pDat_0[0]
+ LDR accu3, [pDat_0, #4] // accu3 = pDat_0[1]
- RSB accu4, accu4, #0 // accu4 = -accu4
- RSB accu3, accu3, #0 // accu3 = -accu3
+ RSB accu4, accu4, #0 // accu4 = -accu4
+ RSB accu3, accu3, #0 // accu3 = -accu3
- SMULWB accu1, accu3, val_tw // accu1 = (-accu3)*val_tw.h
- SMULWT accu2, accu3, val_tw // accu2 = (-accu3)*val_tw.l
- RSB accu1, accu1, #0 // accu1 = -(-accu3)*val_tw.h
- SMLAWT accu3, accu4, val_tw, accu1 // accu3 = (-accu4)*val_tw.l-(-accu3)*val_tw.h
- SMLAWB accu4, accu4, val_tw, accu2 // accu4 = (-accu3)*val_tw.l+(-accu4)*val_tw.h
+ SMULWB accu1, accu3, val_tw // accu1 = (-accu3)*val_tw.h
+ SMULWT accu2, accu3, val_tw // accu2 = (-accu3)*val_tw.l
+ RSB accu1, accu1, #0 // accu1 = -(-accu3)*val_tw.h
+ SMLAWT accu3, accu4, val_tw, accu1 // accu3 = (-accu4)*val_tw.l-(-accu3)*val_tw.h
+ SMLAWB accu4, accu4, val_tw, accu2 // accu4 = (-accu3)*val_tw.l+(-accu4)*val_tw.h
- LDR accu1, [pDat_1, #-4] // accu1 = pDat_1[-1]
- LDR accu2, [pDat_1] // accu2 = pDat_1[0]
+ LDR accu1, [pDat_1, #-4] // accu1 = pDat_1[-1]
+ LDR accu2, [pDat_1] // accu2 = pDat_1[0]
- STR accu3, [pDat_0], #4 // *pDat_0++ = accu3
- STR accu4, [pDat_1], #-4 // *pDat_1-- = accu4
+ STR accu3, [pDat_0], #4 // *pDat_0++ = accu3
+ STR accu4, [pDat_1], #-4 // *pDat_1-- = accu4
dst_IV_loop2_2nd_part:
- LDR val_tw, [twiddle], inc, LSL #2 // val_tw = *twiddle; twiddle += inc
+ LDR val_tw, [twiddle], inc, LSL #2 // val_tw = *twiddle; twiddle += inc
- RSB accu2, accu2, #0 // accu2 = -accu2
- RSB accu1, accu1, #0 // accu1 = -accu1
- SMULWT accu3, accu2, val_tw // accu3 = (-accu2)*val_tw.l
- SMULWT accu4, accu1, val_tw // accu4 = (-accu1)*val_tw.l
- RSB accu3, accu3, #0 // accu3 = -accu2*val_tw.l
- SMLAWB accu1, accu1, val_tw, accu3 // accu1 = -accu1*val_tw.h-(-accu2)*val_tw.l
- SMLAWB accu2, accu2, val_tw, accu4 // accu2 = (-accu1)*val_tw.l+(-accu2)*val_tw.h
- STR accu1, [pDat_1], #-4 // *pDat_1-- = accu1
- STR accu2, [pDat_0], #4 // *pDat_0++ = accu2
+ RSB accu2, accu2, #0 // accu2 = -accu2
+ RSB accu1, accu1, #0 // accu1 = -accu1
+ SMULWT accu3, accu2, val_tw // accu3 = (-accu2)*val_tw.l
+ SMULWT accu4, accu1, val_tw // accu4 = (-accu1)*val_tw.l
+ RSB accu3, accu3, #0 // accu3 = -accu2*val_tw.l
+ SMLAWB accu1, accu1, val_tw, accu3 // accu1 = -accu1*val_tw.h-(-accu2)*val_tw.l
+ SMLAWB accu2, accu2, val_tw, accu4 // accu2 = (-accu1)*val_tw.l+(-accu2)*val_tw.h
+ STR accu1, [pDat_1], #-4 // *pDat_1-- = accu1
+ STR accu2, [pDat_0], #4 // *pDat_0++ = accu2
- LDR accu4, [pDat_0] // accu4 = pDat_0[0]
- LDR accu3, [pDat_0, #4] // accu3 = pDat_0[1]
+ LDR accu4, [pDat_0] // accu4 = pDat_0[0]
+ LDR accu3, [pDat_0, #4] // accu3 = pDat_0[1]
- RSB accu4, accu4, #0 // accu4 = -accu4
- RSB accu3, accu3, #0 // accu3 = -accu3
+ RSB accu4, accu4, #0 // accu4 = -accu4
+ RSB accu3, accu3, #0 // accu3 = -accu3
- SMULWB accu1, accu3, val_tw // accu1 = (-accu3)*val_tw.h
- SMULWT accu2, accu3, val_tw // accu2 = (-accu3)*val_tw.l
- RSB accu1, accu1, #0 // accu1 = -(-accu3)*val_tw.h
- SMLAWT accu3, accu4, val_tw, accu1 // accu3 = (-accu4)*val_tw.l-(-accu3)*val_tw.h
- SMLAWB accu4, accu4, val_tw, accu2 // accu4 = (-accu3)*val_tw.l+(-accu4)*val_tw.h
+ SMULWB accu1, accu3, val_tw // accu1 = (-accu3)*val_tw.h
+ SMULWT accu2, accu3, val_tw // accu2 = (-accu3)*val_tw.l
+ RSB accu1, accu1, #0 // accu1 = -(-accu3)*val_tw.h
+ SMLAWT accu3, accu4, val_tw, accu1 // accu3 = (-accu4)*val_tw.l-(-accu3)*val_tw.h
+ SMLAWB accu4, accu4, val_tw, accu2 // accu4 = (-accu3)*val_tw.l+(-accu4)*val_tw.h
- LDR accu1, [pDat_1, #-4] // accu1 = pDat_1[-1]
- LDR accu2, [pDat_1] // accu2 = pDat_1[0]
+ LDR accu1, [pDat_1, #-4] // accu1 = pDat_1[-1]
+ LDR accu2, [pDat_1] // accu2 = pDat_1[0]
- STR accu3, [pDat_0], #4 // *pDat_0++ = accu3
- STR accu4, [pDat_1], #-4 // *pDat_1-- = accu4
+ STR accu3, [pDat_0], #4 // *pDat_0++ = accu3
+ STR accu4, [pDat_1], #-4 // *pDat_1-- = accu4
SUBS i, i, #1
BNE dst_IV_loop2