From 9bf37cc9712506b2483650c82d3c41152337ef7e Mon Sep 17 00:00:00 2001
From: Dave Burke <daveburke@google.com>
Date: Tue, 17 Apr 2012 09:51:45 -0700
Subject: Fraunhofer AAC codec.

License boilerplate update to follow.

Change-Id: I2810460c11a58b6d148d84673cc031f3685e79b5
---
 libFDK/src/arm/autocorr2nd.cpp  |  33 ++
 libFDK/src/arm/dct_arm.cpp      | 395 ++++++++++++++++++++++
 libFDK/src/arm/fft_rad2_arm.cpp | 259 +++++++++++++++
 libFDK/src/arm/qmf_arm.cpp      | 710 ++++++++++++++++++++++++++++++++++++++++
 libFDK/src/arm/scale_arm.cpp    | 110 +++++++
 5 files changed, 1507 insertions(+)
 create mode 100644 libFDK/src/arm/autocorr2nd.cpp
 create mode 100644 libFDK/src/arm/dct_arm.cpp
 create mode 100644 libFDK/src/arm/fft_rad2_arm.cpp
 create mode 100644 libFDK/src/arm/qmf_arm.cpp
 create mode 100644 libFDK/src/arm/scale_arm.cpp

(limited to 'libFDK/src/arm')

diff --git a/libFDK/src/arm/autocorr2nd.cpp b/libFDK/src/arm/autocorr2nd.cpp
new file mode 100644
index 0000000..85926af
--- /dev/null
+++ b/libFDK/src/arm/autocorr2nd.cpp
@@ -0,0 +1,33 @@
+/****************************************************************************
+
+                     (C) Copyright Fraunhofer IIS (2006)
+                               All Rights Reserved
+
+    Please be advised that this software and/or program delivery is
+    Confidential Information of Fraunhofer and subject to and covered by the
+
+    Fraunhofer IIS Software Evaluation Agreement
+    between Google Inc. and  Fraunhofer
+    effective and in full force since March 1, 2012.
+
+    You may use this software and/or program only under the terms and
+    conditions described in the above mentioned Fraunhofer IIS Software
+    Evaluation Agreement. Any other and/or further use requires a separate agreement.
+
+
+   This software and/or program is protected by copyright law and international
+   treaties. Any reproduction or distribution of this software and/or program,
+   or any portion of it, may result in severe civil and criminal penalties, and
+   will be prosecuted to the maximum extent possible under law.
+
+ $Id$
+
+*******************************************************************************/
+
+/*!
+ *
+ * \brief Calculate second order autocorrelation
+ *
+ */
+
+
diff --git a/libFDK/src/arm/dct_arm.cpp b/libFDK/src/arm/dct_arm.cpp
new file mode 100644
index 0000000..dd0ca09
--- /dev/null
+++ b/libFDK/src/arm/dct_arm.cpp
@@ -0,0 +1,395 @@
+/****************************************************************************
+
+                       (C) copyright Fraunhofer IIS (2004)
+                               All Rights Reserved
+
+    Please be advised that this software and/or program delivery is
+    Confidential Information of Fraunhofer and subject to and covered by the
+
+    Fraunhofer IIS Software Evaluation Agreement
+    between Google Inc. and  Fraunhofer
+    effective and in full force since March 1, 2012.
+
+    You may use this software and/or program only under the terms and
+    conditions described in the above mentioned Fraunhofer IIS Software
+    Evaluation Agreement. Any other and/or further use requires a separate agreement.
+
+
+
+ $Id$
+
+***************************************************************************/
+
+
+
+#ifdef FUNCTION_dct_IV_func1
+
+/*
+   Note: This assembler routine is here, because the ARM926 compiler does
+         not encode the inline assembler with optimal speed.
+         With this version, we save 2 cycles per loop iteration.
+*/
+
+__asm  void dct_IV_func1(
+    int i,                      /* number of double passes, must be > 0 */
+    const FIXP_SPK *twiddle,    /* packed twiddle words, read in ascending order */
+    FIXP_DBL *RESTRICT pDat_0,  /* lower cursor, advanced by the loop */
+    FIXP_DBL *RESTRICT pDat_1)  /* upper cursor, moved downwards by the loop */
+{
+    /* Embedded-assembler helper loop, presumably called from dct_IV()
+       (confirm against the caller).
+       Each pass loads two packed twiddle words: the first rotates the pair
+       (pDat_1[0], pDat_0[0]), the second the pair (pDat_0[1], pDat_1[-1]).
+       The four results are written back to the same four locations while
+       pDat_0 steps up and pDat_1 steps down by two words.
+       The body is unrolled twice and i is only tested at the bottom, so i
+       counts double passes and must not be 0 on entry.
+       SMULWx/SMLAWx keep the upper 32 bits of the 32x16-bit product; the
+       comments name the halfword used by the T forms ".l", by the B forms ".h".
+       Register map: r0 i, r1 twiddle, r2 pDat_0, r3 pDat_1, r4 accu1,
+       r5 accu2, r6 accu3, r7 accu4, r8 val_tw, r9 accuX */
+    PUSH    {r4-r9}         // save the registers used as accumulators
+
+     /* 44 cycles for 2 iterations = 22 cycles/iteration (original author's figure) */
+dct_IV_loop1_start
+/*  First iteration */
+    LDR     r8, [r1], #4    // val_tw = *twiddle++;
+    LDR     r5, [r2, #0]    // accu2 = pDat_0[0]
+    LDR     r4, [r3, #0]    // accu1 = pDat_1[0]
+
+    SMULWT  r9, r5, r8      // accuX = accu2*val_tw.l
+    SMULWB  r5, r5, r8      // accu2 = accu2*val_tw.h
+    RSB     r9, r9, #0      // accuX =-accu2*val_tw.l
+    SMLAWT  r5, r4, r8, r5  // accu2 = accu2*val_tw.h + accu1*val_tw.l
+    SMLAWB  r4, r4, r8, r9  // accu1 = accu1*val_tw.h - accu2*val_tw.l
+
+    LDR     r8, [r1], #4    // val_tw = *twiddle++;
+    LDR     r7, [r3, #-4]   // accu4 = pDat_1[-1]
+    LDR     r6, [r2, #4]    // accu3 = pDat_0[1] (read before the stores below overwrite it)
+
+    SMULWB  r9, r7, r8      // accuX = accu4*val_tw.h
+    SMULWT  r7, r7, r8      // accu4 = accu4*val_tw.l
+    RSB     r9, r9, #0      // accuX =-accu4*val_tw.h
+    SMLAWB  r7, r6, r8, r7  // accu4 = accu4*val_tw.l+accu3*val_tw.h
+    SMLAWT  r6, r6, r8, r9  // accu3 = accu3*val_tw.l-accu4*val_tw.h
+
+    STR     r5, [r2], #4    // *pDat_0++ = accu2
+    STR     r4, [r2], #4    // *pDat_0++ = accu1
+    STR     r6, [r3], #-4   // *pDat_1-- = accu3
+    STR     r7, [r3], #-4   // *pDat_1-- = accu4
+
+/*  Second iteration (identical instruction sequence) */
+    LDR     r8, [r1], #4    // val_tw = *twiddle++;
+    LDR     r5, [r2, #0]    // accu2 = pDat_0[0]
+    LDR     r4, [r3, #0]    // accu1 = pDat_1[0]
+
+    SMULWT  r9, r5, r8      // accuX = accu2*val_tw.l
+    SMULWB  r5, r5, r8      // accu2 = accu2*val_tw.h
+    RSB     r9, r9, #0      // accuX =-accu2*val_tw.l
+    SMLAWT  r5, r4, r8, r5  // accu2 = accu2*val_tw.h + accu1*val_tw.l
+    SMLAWB  r4, r4, r8, r9  // accu1 = accu1*val_tw.h - accu2*val_tw.l
+
+    LDR     r8, [r1], #4    // val_tw = *twiddle++;
+    LDR     r7, [r3, #-4]   // accu4 = pDat_1[-1]
+    LDR     r6, [r2, #4]    // accu3 = pDat_0[1] (read before the stores below overwrite it)
+
+    SMULWB  r9, r7, r8      // accuX = accu4*val_tw.h
+    SMULWT  r7, r7, r8      // accu4 = accu4*val_tw.l
+    RSB     r9, r9, #0      // accuX =-accu4*val_tw.h
+    SMLAWB  r7, r6, r8, r7  // accu4 = accu4*val_tw.l+accu3*val_tw.h
+    SMLAWT  r6, r6, r8, r9  // accu3 = accu3*val_tw.l-accu4*val_tw.h
+
+    STR     r5, [r2], #4    // *pDat_0++ = accu2
+    STR     r4, [r2], #4    // *pDat_0++ = accu1
+    STR     r6, [r3], #-4   // *pDat_1-- = accu3
+    STR     r7, [r3], #-4   // *pDat_1-- = accu4
+
+    SUBS    r0, r0, #1      // i--, sets the flags for the branch
+    BNE     dct_IV_loop1_start   // repeat until i reaches 0
+
+    POP     {r4-r9}         // restore the saved registers
+
+    BX      lr              // return to the caller
+}
+
+#endif /* FUNCTION_dct_IV_func1 */
+
+
+#ifdef FUNCTION_dct_IV_func2
+
+/* RVCT inline-assembler helper for the DCT-IV: rotates value pairs taken from both ends of the buffer by every inc-th twiddle word. */
+FDK_INLINE
+static void dct_IV_func2(
+    int i,                    /* loop count, must be > 0 (only tested at the loop bottom) */
+    const FIXP_SPK *twiddle,  /* packed twiddle table, advanced by inc entries per load    */
+    FIXP_DBL *pDat_0,         /* lower cursor, moves towards higher addresses             */
+    FIXP_DBL *pDat_1,         /* upper cursor, moves towards lower addresses              */
+    int inc)                  /* twiddle stride in table entries (32-bit words)           */
+{
+  FIXP_DBL accu1, accu2, accu3, accu4, accuX;
+  LONG val_tw;
+  /* Fetch the last pair first: the store through --pDat_1 below overwrites pDat_1[-1]. */
+  accu1 = pDat_1[-2];
+  accu2 = pDat_1[-1];
+  /* The first pair needs no twiddle: both values are halved, the one moved to the back is negated. */
+  *--pDat_1 = -(pDat_0[1]>>1);
+  pDat_0[0] = (pDat_0[0]>>1);  /* store and increment kept apart: '*pDat_0++ = pDat_0[0]>>1' is unsequenced */
+  pDat_0++;
+  twiddle += inc;
+
+__asm
+  {
+    LDR     val_tw, [twiddle], inc, LSL #2    // val_tw = *twiddle; twiddle += inc
+    B       dct_IV_loop2_2nd_part
+
+    /* 42 cycles for 2 iterations = 21 cycles/iteration */
+dct_IV_loop2:
+    SMULWT  accuX, accu2, val_tw
+    SMULWB  accu2, accu2, val_tw
+    RSB     accuX, accuX, #0
+    SMLAWB  accuX, accu1, val_tw, accuX
+    SMLAWT  accu2, accu1, val_tw, accu2
+    STR     accuX, [pDat_0], #4
+    STR     accu2, [pDat_1, #-4] !
+    /* second rotation with the same twiddle word: pair (pDat_0[0], pDat_0[1]) */
+    LDR     accu4, [pDat_0, #4]
+    LDR     accu3, [pDat_0]
+    SMULWB  accuX, accu4, val_tw
+    SMULWT  accu4, accu4, val_tw
+    RSB     accuX, accuX, #0
+    SMLAWT  accuX, accu3, val_tw, accuX
+    SMLAWB  accu4, accu3, val_tw, accu4
+    /* fetch the next back pair before the pre-decrement store overwrites pDat_1[-1] */
+    LDR     accu1, [pDat_1, #-8]
+    LDR     accu2, [pDat_1, #-4]
+
+    LDR     val_tw, [twiddle], inc, LSL #2    // val_tw = *twiddle; twiddle += inc
+
+    STR     accuX, [pDat_1, #-4] !
+    STR     accu4, [pDat_0], #4
+
+dct_IV_loop2_2nd_part:
+    SMULWT  accuX, accu2, val_tw
+    SMULWB  accu2, accu2, val_tw
+    RSB     accuX, accuX, #0
+    SMLAWB  accuX, accu1, val_tw, accuX
+    SMLAWT  accu2, accu1, val_tw, accu2
+    STR     accuX, [pDat_0], #4
+    STR     accu2, [pDat_1, #-4] !
+    /* second rotation with the same twiddle word: pair (pDat_0[0], pDat_0[1]) */
+    LDR     accu4, [pDat_0, #4]
+    LDR     accu3, [pDat_0]
+    SMULWB  accuX, accu4, val_tw
+    SMULWT  accu4, accu4, val_tw
+    RSB     accuX, accuX, #0
+    SMLAWT  accuX, accu3, val_tw, accuX
+    SMLAWB  accu4, accu3, val_tw, accu4
+    /* fetch the next back pair before the pre-decrement store overwrites pDat_1[-1] */
+    LDR     accu1, [pDat_1, #-8]
+    LDR     accu2, [pDat_1, #-4]
+
+    STR     accuX, [pDat_1, #-4] !
+    STR     accu4, [pDat_0], #4
+
+    LDR     val_tw, [twiddle], inc, LSL #2    // val_tw = *twiddle; twiddle += inc
+
+    SUBS    i, i, #1
+    BNE     dct_IV_loop2
+  }
+
+  /* Last Sin and Cos value pair are the same, so one constant scales the pair left in accu1/accu2 */
+  accu1 = fMultDiv2(accu1, WTC(0x5a82799a));
+  accu2 = fMultDiv2(accu2, WTC(0x5a82799a));
+
+  *--pDat_1 = accu1 + accu2;
+  *pDat_0++ = accu1 - accu2;
+}
+#endif /* FUNCTION_dct_IV_func2 */
+
+
+#ifdef FUNCTION_dst_IV_func1
+
+__asm void dst_IV_func1(
+    int i,                      /* counter, decremented by 4 per loop; positive multiple of 4 */
+    const FIXP_SPK *twiddle,    /* packed twiddle words, read in ascending order */
+    FIXP_DBL *pDat_0,           /* lower cursor, advanced by the loop */
+    FIXP_DBL *pDat_1)           /* upper cursor, moved downwards by the loop */
+{
+    /* Embedded-assembler helper loop, presumably called from dst_IV()
+       (confirm against the caller).
+       Each pass negates pDat_0[0] and pDat_1[-2] after loading, rotates
+       (pDat_1[-1], -pDat_0[0]) by one twiddle word and
+       (pDat_0[1], -pDat_1[-2]) by the next, and writes the results to
+       pDat_0[0], pDat_0[1], pDat_1[-1] and pDat_1[-2]; afterwards pDat_0
+       points two words higher and pDat_1 two words lower.
+       The body is unrolled twice; i is decremented by 4 per loop and only
+       tested for zero at the bottom, so it must be a positive multiple of 4.
+       ".l" = halfword used by SMULWT/SMLAWT, ".h" = by SMULWB/SMLAWB.
+       Register map: r0 i, r1 twiddle, r2 pDat_0, r3 pDat_1, r4 accu1,
+       r5 accu2, r6 accu3, r7 accu4, r8 val_tw, r9 accuX */
+    PUSH    {r4-r9}                    // save the registers used as accumulators
+
+dst_IV_loop1
+    LDR     r8, [r1], #4               // val_tw = *twiddle++
+    LDR     r5, [r2]                   // accu2 = pDat_0[0]
+    LDR     r6, [r2, #4]               // accu3 = pDat_0[1]
+    RSB     r5, r5, #0                 // accu2 = -accu2
+    SMULWT  r9, r5, r8                 // accuX = (-accu2)*val_tw.l
+    LDR     r4, [r3, #-4]              // accu1 = pDat_1[-1]
+    RSB     r9, r9, #0                 // accuX = -(-accu2)*val_tw.l
+    SMLAWB  r9, r4, r8, r9             // accuX = accu1*val_tw.h-(-accu2)*val_tw.l
+    SMULWT  r4, r4, r8                 // accu1 = accu1*val_tw.l
+    LDR     r7, [r3, #-8]              // accu4 = pDat_1[-2]
+    SMLAWB  r5, r5, r8, r4             // accu2 = (-accu2)*val_tw.h+accu1*val_tw.l
+    LDR     r8, [r1], #4               // val_tw = *twiddle++
+    STR     r5, [r2], #4               // *pDat_0++ = accu2
+    STR     r9, [r2], #4               // *pDat_0++ = accuX
+    RSB     r7, r7, #0                 // accu4 = -accu4
+    SMULWB  r5, r7, r8                 // r5 (scratch) = (-accu4)*val_tw.h
+    SMULWB  r4, r6, r8                 // r4 (scratch) = accu3*val_tw.h
+    RSB     r5, r5, #0                 // r5 = -(-accu4)*val_tw.h
+    SMLAWT  r6, r6, r8, r5             // accu3 = accu3*val_tw.l-(-accu4)*val_tw.h
+    SMLAWT  r7, r7, r8, r4             // accu4 = (-accu4)*val_tw.l+accu3*val_tw.h
+    STR     r6, [r3, #-4] !            // *--pDat_1 = accu3
+    STR     r7, [r3, #-4] !            // *--pDat_1 = accu4
+
+    LDR     r8, [r1], #4               // val_tw = *twiddle++
+    LDR     r5, [r2]                   // accu2 = pDat_0[0]
+    LDR     r6, [r2, #4]               // accu3 = pDat_0[1]
+    RSB     r5, r5, #0                 // accu2 = -accu2
+    SMULWT  r9, r5, r8                 // accuX = (-accu2)*val_tw.l
+    LDR     r4, [r3, #-4]              // accu1 = pDat_1[-1]
+    RSB     r9, r9, #0                 // accuX = -(-accu2)*val_tw.l
+    SMLAWB  r9, r4, r8, r9             // accuX = accu1*val_tw.h-(-accu2)*val_tw.l
+    SMULWT  r4, r4, r8                 // accu1 = accu1*val_tw.l
+    LDR     r7, [r3, #-8]              // accu4 = pDat_1[-2]
+    SMLAWB  r5, r5, r8, r4             // accu2 = (-accu2)*val_tw.h+accu1*val_tw.l
+    LDR     r8, [r1], #4               // val_tw = *twiddle++
+    STR     r5, [r2], #4               // *pDat_0++ = accu2
+    STR     r9, [r2], #4               // *pDat_0++ = accuX
+    RSB     r7, r7, #0                 // accu4 = -accu4
+    SMULWB  r5, r7, r8                 // r5 (scratch) = (-accu4)*val_tw.h
+    SMULWB  r4, r6, r8                 // r4 (scratch) = accu3*val_tw.h
+    RSB     r5, r5, #0                 // r5 = -(-accu4)*val_tw.h
+    SMLAWT  r6, r6, r8, r5             // accu3 = accu3*val_tw.l-(-accu4)*val_tw.h
+    SMLAWT  r7, r7, r8, r4             // accu4 = (-accu4)*val_tw.l+accu3*val_tw.h
+    STR     r6, [r3, #-4] !            // *--pDat_1 = accu3
+    STR     r7, [r3, #-4] !            // *--pDat_1 = accu4
+
+    SUBS    r0, r0, #4                 // i-= 4
+    BNE     dst_IV_loop1               // repeat until i reaches exactly 0
+
+    POP     {r4-r9}                    // restore the saved registers
+    BX      lr                         // return to the caller
+}
+#endif /* FUNCTION_dst_IV_func1 */
+
+#ifdef FUNCTION_dst_IV_func2
+
+FDK_INLINE 
+/* RVCT inline-assembler helper for the DST-IV: rotates negated value pairs from both buffer ends by every inc-th twiddle word. */
+static void dst_IV_func2(
+    int i,                        /* loop count, must be > 0 (only tested at the loop bottom) */
+    const FIXP_SPK *twiddle,      /* packed twiddle table, advanced by inc entries per load    */
+    FIXP_DBL *RESTRICT pDat_0,    /* lower cursor, moves towards higher addresses             */
+    FIXP_DBL *RESTRICT pDat_1,    /* upper cursor, moves towards lower addresses              */
+    int inc)                      /* twiddle stride in table entries (32-bit words)           */
+{
+  FIXP_DBL accu1,accu2,accu3,accu4;
+  LONG val_tw;
+  /* The first pair needs no twiddle: pDat_0[1]/2 goes to the front, -(pDat_0[0]/2) to the back. */
+  accu4 = pDat_0[0];
+  accu3 = pDat_0[1];
+  accu4 >>= 1;
+  accu3 >>= 1;
+  accu4 = -accu4;
+  /* Fetch the back pair before the store through pDat_1 below overwrites pDat_1[0]. */
+  accu1 = pDat_1[-1];
+  accu2 = pDat_1[0];
+
+  *pDat_0++ = accu3;
+  *pDat_1-- = accu4;
+
+  /* The loop is entered in its second half, so the body is executed 2*i-1 times in total. */
+  __asm
+  {
+    B       dst_IV_loop2_2nd_part
+
+    /* 50 cycles for 2 iterations = 25 cycles/iteration */
+
+dst_IV_loop2:
+
+    LDR     val_tw, [twiddle], inc, LSL #2    // val_tw = *twiddle; twiddle += inc
+
+    RSB     accu2, accu2, #0                  // accu2 = -accu2
+    RSB     accu1, accu1, #0                  // accu1 = -accu1
+    SMULWT  accu3, accu2, val_tw              // accu3 = (-accu2)*val_tw.l
+    SMULWT  accu4, accu1, val_tw              // accu4 = (-accu1)*val_tw.l
+    RSB     accu3, accu3, #0                  // accu3 = -((-accu2)*val_tw.l)
+    SMLAWB  accu1, accu1, val_tw, accu3       // accu1 = (-accu1)*val_tw.h-(-accu2)*val_tw.l
+    SMLAWB  accu2, accu2, val_tw, accu4       // accu2 = (-accu1)*val_tw.l+(-accu2)*val_tw.h
+    STR     accu1, [pDat_1], #-4              // *pDat_1-- = accu1
+  	STR     accu2, [pDat_0], #4               // *pDat_0++ = accu2
+
+  	LDR     accu4, [pDat_0]                   // accu4 = pDat_0[0]
+  	LDR     accu3, [pDat_0, #4]               // accu3 = pDat_0[1]
+
+    RSB     accu4, accu4, #0                  // accu4 = -accu4
+    RSB     accu3, accu3, #0                  // accu3 = -accu3
+
+    SMULWB  accu1, accu3, val_tw              // accu1 = (-accu3)*val_tw.h
+    SMULWT  accu2, accu3, val_tw              // accu2 = (-accu3)*val_tw.l
+    RSB     accu1, accu1, #0                  // accu1 = -(-accu3)*val_tw.h
+    SMLAWT  accu3, accu4, val_tw, accu1       // accu3 = (-accu4)*val_tw.l-(-accu3)*val_tw.h
+    SMLAWB  accu4, accu4, val_tw, accu2       // accu4 = (-accu3)*val_tw.l+(-accu4)*val_tw.h
+
+    LDR     accu1, [pDat_1, #-4]              // accu1 = pDat_1[-1]
+    LDR     accu2, [pDat_1]                   // accu2 = pDat_1[0] (read before it is overwritten below)
+
+    STR     accu3, [pDat_0], #4               // *pDat_0++ = accu3
+    STR     accu4, [pDat_1], #-4              // *pDat_1-- = accu4
+
+dst_IV_loop2_2nd_part:
+
+    LDR     val_tw, [twiddle], inc, LSL #2    // val_tw = *twiddle; twiddle += inc
+
+    RSB     accu2, accu2, #0                  // accu2 = -accu2
+    RSB     accu1, accu1, #0                  // accu1 = -accu1
+    SMULWT  accu3, accu2, val_tw              // accu3 = (-accu2)*val_tw.l
+    SMULWT  accu4, accu1, val_tw              // accu4 = (-accu1)*val_tw.l
+    RSB     accu3, accu3, #0                  // accu3 = -((-accu2)*val_tw.l)
+    SMLAWB  accu1, accu1, val_tw, accu3       // accu1 = (-accu1)*val_tw.h-(-accu2)*val_tw.l
+    SMLAWB  accu2, accu2, val_tw, accu4       // accu2 = (-accu1)*val_tw.l+(-accu2)*val_tw.h
+    STR     accu1, [pDat_1], #-4              // *pDat_1-- = accu1
+  	STR     accu2, [pDat_0], #4               // *pDat_0++ = accu2
+
+  	LDR     accu4, [pDat_0]                   // accu4 = pDat_0[0]
+  	LDR     accu3, [pDat_0, #4]               // accu3 = pDat_0[1]
+
+    RSB     accu4, accu4, #0                  // accu4 = -accu4
+    RSB     accu3, accu3, #0                  // accu3 = -accu3
+
+    SMULWB  accu1, accu3, val_tw              // accu1 = (-accu3)*val_tw.h
+    SMULWT  accu2, accu3, val_tw              // accu2 = (-accu3)*val_tw.l
+    RSB     accu1, accu1, #0                  // accu1 = -(-accu3)*val_tw.h
+    SMLAWT  accu3, accu4, val_tw, accu1       // accu3 = (-accu4)*val_tw.l-(-accu3)*val_tw.h
+    SMLAWB  accu4, accu4, val_tw, accu2       // accu4 = (-accu3)*val_tw.l+(-accu4)*val_tw.h
+
+    LDR     accu1, [pDat_1, #-4]              // accu1 = pDat_1[-1]
+    LDR     accu2, [pDat_1]                   // accu2 = pDat_1[0] (read before it is overwritten below)
+
+    STR     accu3, [pDat_0], #4               // *pDat_0++ = accu3
+    STR     accu4, [pDat_1], #-4              // *pDat_1-- = accu4
+
+    SUBS    i, i, #1
+    BNE     dst_IV_loop2
+  }
+
+  /* Last Sin and Cos value pair are the same, so one constant scales the negated pair left in accu1/accu2 */
+  accu1 = fMultDiv2(-accu1, WTC(0x5a82799a));
+  accu2 = fMultDiv2(-accu2, WTC(0x5a82799a));
+
+  *pDat_0 = accu1 + accu2;
+  *pDat_1 = accu1 - accu2;
+}
+#endif /* FUNCTION_dst_IV_func2 */
diff --git a/libFDK/src/arm/fft_rad2_arm.cpp b/libFDK/src/arm/fft_rad2_arm.cpp
new file mode 100644
index 0000000..f40961a
--- /dev/null
+++ b/libFDK/src/arm/fft_rad2_arm.cpp
@@ -0,0 +1,259 @@
+/***************************  Fraunhofer IIS FDK Tools  **********************
+
+                        (C) Copyright Fraunhofer IIS (2005)
+                               All Rights Reserved
+
+    Please be advised that this software and/or program delivery is
+    Confidential Information of Fraunhofer and subject to and covered by the
+
+    Fraunhofer IIS Software Evaluation Agreement
+    between Google Inc. and  Fraunhofer
+    effective and in full force since March 1, 2012.
+
+    You may use this software and/or program only under the terms and
+    conditions described in the above mentioned Fraunhofer IIS Software
+    Evaluation Agreement. Any other and/or further use requires a separate agreement.
+
+
+   $Id$
+   Author(s):
+   Description: dit_fft ARM assembler replacements.
+
+   This software and/or program is protected by copyright law and international
+   treaties. Any reproduction or distribution of this software and/or program,
+   or any portion of it, may result in severe civil and criminal penalties, and
+   will be prosecuted to the maximum extent possible under law.
+
+******************************************************************************/
+
+/* NEON optimized FFT currently builds only with RVCT toolchain */
+
+#ifndef FUNCTION_dit_fft
+
+/* If dit_fft was not yet defined by ARM-Cortex ... */
+
+#if defined(SINETABLE_16BIT)
+
+#define FUNCTION_dit_fft
+
+/*****************************************************************************
+
+   date:   28.07.2005   srl
+
+   Contents/description: dit-tukey-FFT-algorithm
+
+******************************************************************************/
+
+#if defined(FUNCTION_dit_fft)
+
+
+/* In-place decimation-in-time FFT on n = 2^ldn complex values held in x as
+   interleaved re/im words (2*n words). Every stage halves its operands
+   (>>1 or cplxMultDiv2). trigdata is read with a stride that starts at
+   trigDataSize and is halved at the top of every stage.
+   NOTE(review): the do/while stage loop appears to require ldn >= 3 - confirm. */
+void dit_fft(FIXP_DBL *x, const INT ldn, const FIXP_STP *trigdata, const INT trigDataSize)
+{
+    const INT n=1<<ldn;
+    INT i;
+
+    scramble(x,n);
+    /*
+     * 1+2 stage radix 4: each group of 8 words holds the complex values A..D
+     */
+
+    for (i=0;i<n*2;i+=8)
+    {
+      FIXP_DBL a00, a10, a20, a30;
+      a00 = (x[i + 0] + x[i + 2])>>1;  /* Re A + Re B */
+      a10 = (x[i + 4] + x[i + 6])>>1;  /* Re C + Re D */
+      a20 = (x[i + 1] + x[i + 3])>>1;  /* Im A + Im B */
+      a30 = (x[i + 5] + x[i + 7])>>1;  /* Im C + Im D */
+
+      x[i + 0] = a00 + a10;       /* Re A' = Re A + Re B + Re C + Re D */
+      x[i + 4] = a00 - a10;       /* Re C' = Re A + Re B - Re C - Re D */
+      x[i + 1] = a20 + a30;       /* Im A' = Im A + Im B + Im C + Im D */
+      x[i + 5] = a20 - a30;       /* Im C' = Im A + Im B - Im C - Im D */
+
+      a00 = a00 - x[i + 2];       /* Re A - Re B */
+      a10 = a10 - x[i + 6];       /* Re C - Re D */
+      a20 = a20 - x[i + 3];       /* Im A - Im B */
+      a30 = a30 - x[i + 7];       /* Im C - Im D */
+
+      x[i + 2] = a00 + a30;       /* Re B' = Re A - Re B + Im C - Im D */
+      x[i + 6] = a00 - a30;       /* Re D' = Re A - Re B - Im C + Im D */
+      x[i + 3] = a20 - a10;       /* Im B' = Im A - Im B - Re C + Re D */
+      x[i + 7] = a20 + a10;       /* Im D' = Im A - Im B + Re C - Re D */
+    }
+
+    INT mh = 1 << 1;
+    INT ldm = ldn - 2;
+    INT trigstep = trigDataSize;
+
+    do
+    {
+        const FIXP_STP *pTrigData = trigdata;
+        INT j;
+
+        mh <<= 1;
+        trigstep >>= 1;
+
+        FDK_ASSERT(trigstep > 0);
+
+        /* Do first iteration with c=1.0 and s=0.0 separately to avoid losing too much precision.
+           Beware: The impact on the overall FFT precision is rather large. */
+        {
+            FIXP_DBL *xt1 = x;
+            int r = n;
+
+            do {
+                FIXP_DBL *xt2 = xt1 + (mh<<1);
+                FIXP_DBL vr,vi,ur,ui;
+
+                // twiddle c=1.0, s=0.0: the complex multiply reduces to halving both parts
+                vi = xt2[1]>>1;
+                vr = xt2[0]>>1;
+
+                ur = xt1[0]>>1;
+                ui = xt1[1]>>1;
+
+                xt1[0] = ur+vr;
+                xt1[1] = ui+vi;
+
+                xt2[0] = ur-vr;
+                xt2[1] = ui-vi;
+
+                xt1 += mh;
+                xt2 += mh;
+
+                // same halving with re/im swapped; the signs are applied in the butterfly below
+                vr = xt2[1]>>1;
+                vi = xt2[0]>>1;
+
+                ur = xt1[0]>>1;
+                ui = xt1[1]>>1;
+
+                xt1[0] = ur+vr;
+                xt1[1] = ui-vi;
+
+                xt2[0] = ur-vr;
+                xt2[1] = ui+vi;
+
+                xt1 = xt2 + mh;
+            } while ((r=r-(mh<<1)) != 0);
+        }
+        for(j=4; j<mh; j+=4)
+        {
+            FIXP_DBL *xt1 = x + (j>>1);
+            FIXP_SPK cs;
+            int r = n;
+
+            pTrigData += trigstep;
+            cs = *pTrigData;
+
+            do
+            {
+                FIXP_DBL *xt2 = xt1 + (mh<<1);
+                FIXP_DBL vr,vi,ur,ui;
+
+                cplxMultDiv2(&vi, &vr, xt2[1], xt2[0], cs);
+
+                ur = xt1[0]>>1;
+                ui = xt1[1]>>1;
+
+                xt1[0] = ur+vr;
+                xt1[1] = ui+vi;
+
+                xt2[0] = ur-vr;
+                xt2[1] = ui-vi;
+
+                xt1 += mh;
+                xt2 += mh;
+
+                cplxMultDiv2(&vr, &vi, xt2[1], xt2[0], cs);
+
+                ur = xt1[0]>>1;
+                ui = xt1[1]>>1;
+
+                xt1[0] = ur+vr;
+                xt1[1] = ui-vi;
+
+                xt2[0] = ur-vr;
+                xt2[1] = ui+vi;
+
+                /* Same as above but for t1,t2 with j>mh/4 and thus cs swapped */
+                xt1 = xt1 - (j);
+                xt2 = xt1 + (mh<<1);
+
+                cplxMultDiv2(&vi, &vr, xt2[0], xt2[1], cs);
+
+                ur = xt1[0]>>1;
+                ui = xt1[1]>>1;
+
+                xt1[0] = ur+vr;
+                xt1[1] = ui-vi;
+
+                xt2[0] = ur-vr;
+                xt2[1] = ui+vi;
+
+                xt1 += mh;
+                xt2 += mh;
+
+                cplxMultDiv2(&vr, &vi, xt2[0], xt2[1], cs);
+
+                ur = xt1[0]>>1;
+                ui = xt1[1]>>1;
+
+                xt1[0] = ur-vr;
+                xt1[1] = ui-vi;
+
+                xt2[0] = ur+vr;
+                xt2[1] = ui+vi;
+
+                xt1 = xt2 + (j);
+            }  while ((r=r-(mh<<1)) != 0);
+        }
+        {   /* Remaining butterflies at offset mh/2: both twiddle parts are the constant 0x5a82799a */
+            FIXP_DBL *xt1 = x + (mh>>1);
+            int r = n;
+
+            do
+            {
+                FIXP_DBL *xt2 = xt1 + (mh<<1);
+                FIXP_DBL vr,vi,ur,ui;
+
+                cplxMultDiv2(&vi, &vr, xt2[1], xt2[0], STC(0x5a82799a), STC(0x5a82799a));
+
+                ur = xt1[0]>>1;
+                ui = xt1[1]>>1;
+
+                xt1[0] = ur+vr;
+                xt1[1] = ui+vi;
+
+                xt2[0] = ur-vr;
+                xt2[1] = ui-vi;
+
+                xt1 += mh;
+                xt2 += mh;
+
+                cplxMultDiv2(&vr, &vi, xt2[1], xt2[0], STC(0x5a82799a), STC(0x5a82799a));
+
+                ur = xt1[0]>>1;
+                ui = xt1[1]>>1;
+
+                xt1[0] = ur+vr;
+                xt1[1] = ui-vi;
+
+                xt2[0] = ur-vr;
+                xt2[1] = ui+vi;
+
+                xt1 = xt2 + mh;
+            }  while ((r=r-(mh<<1)) != 0);
+        }
+    } while (--ldm != 0);
+}
+
+#endif /* if defined(FUNCTION_dit_fft)  */
+
+#endif /* if defined(SINETABLE_16BIT) */
+
+#endif /* ifndef FUNCTION_dit_fft */
diff --git a/libFDK/src/arm/qmf_arm.cpp b/libFDK/src/arm/qmf_arm.cpp
new file mode 100644
index 0000000..df538a4
--- /dev/null
+++ b/libFDK/src/arm/qmf_arm.cpp
@@ -0,0 +1,710 @@
+/****************************************************************************
+
+                     (C) Copyright Fraunhofer IIS (2004)
+                               All Rights Reserved
+
+    Please be advised that this software and/or program delivery is
+    Confidential Information of Fraunhofer and subject to and covered by the
+
+    Fraunhofer IIS Software Evaluation Agreement
+    between Google Inc. and  Fraunhofer
+    effective and in full force since March 1, 2012.
+
+    You may use this software and/or program only under the terms and
+    conditions described in the above mentioned Fraunhofer IIS Software
+    Evaluation Agreement. Any other and/or further use requires a separate agreement.
+
+
+   This software and/or program is protected by copyright law and international
+   treaties. Any reproduction or distribution of this software and/or program,
+   or any portion of it, may result in severe civil and criminal penalties, and
+   will be prosecuted to the maximum extent possible under law.
+
+ $Id$
+
+ History: 04-NOV-2009 A. Tritthart Optimized qmfSynPrototypeFirSlot1
+
+****************************************************************************/
+#if (QMF_NO_POLY==5)
+
+#define FUNCTION_qmfForwardModulationLP_odd
+
+#ifdef FUNCTION_qmfForwardModulationLP_odd
+/* Forward modulation, real-valued output: folds the 2*L samples of timeIn into
+   the L entries of rSubband, then transforms rSubband in place with dct_IV(). */
+static void
+qmfForwardModulationLP_odd( HANDLE_QMF_FILTER_BANK anaQmf, /*!< Handle of Qmf Analysis Bank  */
+                            const FIXP_QMF *timeIn,        /*!< Time Signal */
+                            FIXP_QMF *rSubband )           /*!< Real Output */
+{
+  const int L = anaQmf->no_channels;
+  const int M = L>>1;
+  const int shift = (L>>6) + 1;                  /* 1 below 64 channels, 2 for 64..127 */
+  int rSubband_e = 0;                            /* exponent argument for dct_IV(); not read afterwards */
+
+  FIXP_QMF *rSubbandPtr0 = &rSubband[M+0];       /* runs with increment */
+  FIXP_QMF *rSubbandPtr1 = &rSubband[M-1];       /* runs with decrement */
+  /* Read-only cursors keep timeIn's const qualifier and FIXP_QMF element type (no cast). */
+  const FIXP_QMF *timeIn0 = &timeIn[0];          /* runs with increment */
+  const FIXP_QMF *timeIn1 = &timeIn[L];          /* runs with increment */
+  const FIXP_QMF *timeIn2 = &timeIn[L-1];        /* runs with decrement */
+  const FIXP_QMF *timeIn3 = &timeIn[2*L-1];      /* runs with decrement */
+  for (int i = 0; i < M; i++)
+  {
+    *rSubbandPtr0++ = (*timeIn2-- >> 1) - (*timeIn0++ >> shift);
+    *rSubbandPtr1-- = (*timeIn1++ >> 1) + (*timeIn3-- >> shift);
+  }
+  dct_IV(rSubband,L, &rSubband_e);
+}
+#endif /* FUNCTION_qmfForwardModulationLP_odd */
+
+
+/* NEON optimized QMF currently builds only with RVCT toolchain */
+
+#if defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_5TE__)
+
+#if (SAMPLE_BITS == 16)
+#define FUNCTION_qmfAnaPrototypeFirSlot
+#endif
+
+#ifdef FUNCTION_qmfAnaPrototypeFirSlot
+
+#if defined(__GNUC__)	/* cppp replaced: elif */
+
+/* Wrapper for the ARM SMULBB instruction: signed 16x16 -> 32 bit product of
+   the bottom halfword of a and the bottom halfword of b. */
+inline INT SMULBB (const SHORT a, const LONG b)
+{
+  INT product;
+  __asm__ ("smulbb %0, %1, %2" : "=r" (product) : "r" (a), "r" (b));
+  return product;
+}
+/* Wrapper for the ARM SMULBT instruction: signed 16x16 -> 32 bit product of
+   the bottom halfword of a and the top halfword of b. */
+inline INT SMULBT (const SHORT a, const LONG b)
+{
+  INT product;
+  __asm__ ("smulbt %0, %1, %2" : "=r" (product) : "r" (a), "r" (b));
+  return product;
+}
+
+/* Wrapper for the ARM SMLABB instruction: returns accu plus the signed
+   product of the bottom halfword of a and the bottom halfword of b. */
+inline INT SMLABB(const LONG accu, const SHORT a, const LONG b)
+{
+  INT sum;
+  __asm__ ("smlabb %0, %1, %2,%3" : "=r" (sum) : "r" (a), "r" (b), "r" (accu));
+  return sum;
+}
+/* Wrapper for the ARM SMLABT instruction: returns accu plus the signed
+   product of the bottom halfword of a and the top halfword of b. */
+inline INT SMLABT(const LONG accu, const SHORT a, const LONG b)
+{
+  INT sum;
+  __asm__ ("smlabt %0, %1, %2,%3" : "=r" (sum) : "r" (a), "r" (b), "r" (accu));
+  return sum;
+}
+#endif /* compiler selection  */
+
+
+void qmfAnaPrototypeFirSlot( FIXP_QMF *analysisBuffer,
+                             int       no_channels,             /*!< Number channels of analysis filter */
+                             const FIXP_PFT *p_filter,
+                             int       p_stride,                /*!< Stide of analysis filter    */
+                             FIXP_QAS *RESTRICT pFilterStates
+                            )
+{
+  LONG *p_flt = (LONG *) p_filter;
+  LONG flt;
+  FIXP_QMF *RESTRICT pData_0 = analysisBuffer + 2*no_channels - 1;
+  FIXP_QMF *RESTRICT pData_1 = analysisBuffer;
+
+  FIXP_QAS *RESTRICT sta_0 = (FIXP_QAS *)pFilterStates;
+  FIXP_QAS *RESTRICT sta_1 = (FIXP_QAS *)pFilterStates + (2*QMF_NO_POLY*no_channels) - 1;
+
+  FIXP_DBL accu0, accu1;
+  FIXP_QAS sta0, sta1;
+
+  int staStep1 =  no_channels<<1;
+  int staStep2 = (no_channels<<3) - 1; /* Rewind one less */
+
+  if (p_stride == 1)
+  {
+    /* FIR filter 0 */
+    flt = *p_flt++;
+    sta1 = *sta_1;  sta_1 -= staStep1;
+    accu1 = SMULBB(        sta1, flt);
+    sta1 = *sta_1;  sta_1 -= staStep1;
+    accu1 = SMLABT( accu1, sta1, flt);
+
+    flt = *p_flt++;
+    sta1 = *sta_1;  sta_1 -= staStep1;
+    accu1 = SMLABB( accu1, sta1, flt);
+    sta1 = *sta_1;  sta_1 -= staStep1;
+    accu1 = SMLABT( accu1, sta1, flt);
+
+    flt = *p_flt++;
+    sta1 = *sta_1;  sta_1 += staStep2;
+    accu1 = SMLABB( accu1, sta1, flt);
+    *pData_1++ = FX_DBL2FX_QMF(accu1<<1);
+
+    /* FIR filters 1..63 127..65 or 1..31 63..33 */
+    no_channels >>= 1;
+    for (; --no_channels; )
+    {
+      sta0 = *sta_0; sta_0 += staStep1;  /* 1,3,5, ... 29/61 */
+      sta1 = *sta_1; sta_1 -= staStep1;
+      accu0 = SMULBT(        sta0, flt);
+      accu1 = SMULBT(        sta1, flt);
+
+      flt = *p_flt++;
+      sta0 = *sta_0; sta_0 += staStep1;
+      sta1 = *sta_1; sta_1 -= staStep1;
+      accu0 = SMLABB( accu0, sta0, flt);
+      accu1 = SMLABB( accu1, sta1, flt);
+
+      sta0 = *sta_0; sta_0 += staStep1;
+      sta1 = *sta_1; sta_1 -= staStep1;
+      accu0 = SMLABT( accu0, sta0, flt);
+      accu1 = SMLABT( accu1, sta1, flt);
+
+      flt = *p_flt++;
+      sta0 = *sta_0; sta_0 += staStep1;
+      sta1 = *sta_1; sta_1 -= staStep1;
+      accu0 = SMLABB( accu0, sta0, flt);
+      accu1 = SMLABB( accu1, sta1, flt);
+
+      sta0 = *sta_0; sta_0 -= staStep2;
+      sta1 = *sta_1; sta_1 += staStep2;
+      accu0 = SMLABT( accu0, sta0, flt);
+      accu1 = SMLABT( accu1, sta1, flt);
+
+      *pData_0-- = FX_DBL2FX_QMF(accu0<<1);
+      *pData_1++ = FX_DBL2FX_QMF(accu1<<1);
+
+      /* Same sequence as above, but mix B=bottom with T=Top */
+
+      flt = *p_flt++;
+      sta0 = *sta_0; sta_0 += staStep1;  /* 2,4,6, ... 30/62 */
+      sta1 = *sta_1; sta_1 -= staStep1;
+      accu0 = SMULBB(        sta0, flt);
+      accu1 = SMULBB(        sta1, flt);
+
+      sta0 = *sta_0; sta_0 += staStep1;
+      sta1 = *sta_1; sta_1 -= staStep1;
+      accu0 = SMLABT( accu0, sta0, flt);
+      accu1 = SMLABT( accu1, sta1, flt);
+
+      flt = *p_flt++;
+      sta0 = *sta_0; sta_0 += staStep1;
+      sta1 = *sta_1; sta_1 -= staStep1;
+      accu0 = SMLABB( accu0, sta0, flt);
+      accu1 = SMLABB( accu1, sta1, flt);
+
+      sta0 = *sta_0; sta_0 += staStep1;
+      sta1 = *sta_1; sta_1 -= staStep1;
+      accu0 = SMLABT( accu0, sta0, flt);
+      accu1 = SMLABT( accu1, sta1, flt);
+
+      flt = *p_flt++;
+      sta0 = *sta_0; sta_0 -= staStep2;
+      sta1 = *sta_1; sta_1 += staStep2;
+      accu0 = SMLABB( accu0, sta0, flt);
+      accu1 = SMLABB( accu1, sta1, flt);
+
+      *pData_0-- = FX_DBL2FX_QMF(accu0<<1);
+      *pData_1++ = FX_DBL2FX_QMF(accu1<<1);
+    }
+
+    /* FIR filter 31/63 and 33/65 */
+    sta0 = *sta_0; sta_0 += staStep1;
+    sta1 = *sta_1; sta_1 -= staStep1;
+    accu0 = SMULBT(        sta0, flt);
+    accu1 = SMULBT(        sta1, flt);
+
+    flt = *p_flt++;
+    sta0 = *sta_0; sta_0 += staStep1;
+    sta1 = *sta_1; sta_1 -= staStep1;
+    accu0 = SMLABB( accu0, sta0, flt);
+    accu1 = SMLABB( accu1, sta1, flt);
+
+    sta0 = *sta_0; sta_0 += staStep1;
+    sta1 = *sta_1; sta_1 -= staStep1;
+    accu0 = SMLABT( accu0, sta0, flt);
+    accu1 = SMLABT( accu1, sta1, flt);
+
+    flt = *p_flt++;
+    sta0 = *sta_0; sta_0 += staStep1;
+    sta1 = *sta_1; sta_1 -= staStep1;
+    accu0 = SMLABB( accu0, sta0, flt);
+    accu1 = SMLABB( accu1, sta1, flt);
+
+    sta0 = *sta_0; sta_0 -= staStep2;
+    sta1 = *sta_1; sta_1 += staStep2;
+    accu0 = SMLABT( accu0, sta0, flt);
+    accu1 = SMLABT( accu1, sta1, flt);
+
+    *pData_0-- = FX_DBL2FX_QMF(accu0<<1);
+    *pData_1++ = FX_DBL2FX_QMF(accu1<<1);
+
+    /* FIR filter 32/64 */
+    flt = *p_flt++;
+    sta0 = *sta_0; sta_0 += staStep1;
+    sta1 = *sta_1; sta_1 -= staStep1;
+    accu0 = SMULBB(        sta0, flt);
+    accu1 = SMULBB(        sta1, flt);
+
+    sta0 = *sta_0; sta_0 += staStep1;
+    sta1 = *sta_1; sta_1 -= staStep1;
+    accu0 = SMLABT( accu0, sta0, flt);
+    accu1 = SMLABT( accu1, sta1, flt);
+
+    flt = *p_flt++;
+    sta0 = *sta_0; sta_0 += staStep1;
+    sta1 = *sta_1; sta_1 -= staStep1;
+    accu0 = SMLABB( accu0, sta0, flt);
+    accu1 = SMLABB( accu1, sta1, flt);
+
+    sta0 = *sta_0; sta_0 += staStep1;
+    sta1 = *sta_1; sta_1 -= staStep1;
+    accu0 = SMLABT( accu0, sta0, flt);
+    accu1 = SMLABT( accu1, sta1, flt);
+
+    flt = *p_flt;
+    sta0 = *sta_0;
+    sta1 = *sta_1;
+    accu0 = SMLABB( accu0, sta0, flt);
+    accu1 = SMLABB( accu1, sta1, flt);
+
+    *pData_0-- = FX_DBL2FX_QMF(accu0<<1);
+    *pData_1++ = FX_DBL2FX_QMF(accu1<<1);
+  }
+  else
+  {
+    int pfltStep = QMF_NO_POLY * (p_stride-1);
+
+    flt = p_flt[0];
+    sta1 = *sta_1;  sta_1 -= staStep1;
+    accu1 = SMULBB(        sta1, flt);
+    sta1 = *sta_1;  sta_1 -= staStep1;
+    accu1 = SMLABT( accu1, sta1, flt);
+
+    flt = p_flt[1];
+    sta1 = *sta_1;  sta_1 -= staStep1;
+    accu1 = SMLABB( accu1, sta1, flt);
+    sta1 = *sta_1;  sta_1 -= staStep1;
+    accu1 = SMLABT( accu1, sta1, flt);
+
+    flt = p_flt[2]; p_flt += pfltStep;
+    sta1 = *sta_1;  sta_1 += staStep2;
+    accu1 = SMLABB( accu1, sta1, flt);
+    *pData_1++ = FX_DBL2FX_QMF(accu1<<1);
+
+    /* FIR filters 1..63 127..65 or 1..31 63..33 */
+    for (; --no_channels; )
+    {
+      flt = p_flt[0];
+      sta0 = *sta_0; sta_0 += staStep1;
+      sta1 = *sta_1; sta_1 -= staStep1;
+      accu0 = SMULBB(        sta0, flt);
+      accu1 = SMULBB(        sta1, flt);
+
+      sta0 = *sta_0; sta_0 += staStep1;
+      sta1 = *sta_1; sta_1 -= staStep1;
+      accu0 = SMLABT( accu0, sta0, flt);
+      accu1 = SMLABT( accu1, sta1, flt);
+
+      flt = p_flt[1];
+      sta0 = *sta_0; sta_0 += staStep1;
+      sta1 = *sta_1; sta_1 -= staStep1;
+      accu0 = SMLABB( accu0, sta0, flt);
+      accu1 = SMLABB( accu1, sta1, flt);
+
+      sta0 = *sta_0; sta_0 += staStep1;
+      sta1 = *sta_1; sta_1 -= staStep1;
+      accu0 = SMLABT( accu0, sta0, flt);
+      accu1 = SMLABT( accu1, sta1, flt);
+
+      flt = p_flt[2]; p_flt += pfltStep;
+      sta0 = *sta_0; sta_0 -= staStep2;
+      sta1 = *sta_1; sta_1 += staStep2;
+      accu0 = SMLABB( accu0, sta0, flt);
+      accu1 = SMLABB( accu1, sta1, flt);
+
+      *pData_0-- = FX_DBL2FX_QMF(accu0<<1);
+      *pData_1++ = FX_DBL2FX_QMF(accu1<<1);
+    }
+
+    /* FIR filter 32/64 */
+    flt = p_flt[0];
+    sta0 = *sta_0; sta_0 += staStep1;
+    accu0 = SMULBB(        sta0, flt);
+    sta0 = *sta_0; sta_0 += staStep1;
+    accu0 = SMLABT( accu0, sta0, flt);
+
+    flt = p_flt[1];
+    sta0 = *sta_0; sta_0 += staStep1;
+    accu0 = SMLABB( accu0, sta0, flt);
+    sta0 = *sta_0; sta_0 += staStep1;
+    accu0 = SMLABT( accu0, sta0, flt);
+
+    flt = p_flt[2];
+    sta0 = *sta_0;
+    accu0 = SMLABB( accu0, sta0, flt);
+    *pData_0-- = FX_DBL2FX_QMF(accu0<<1);
+  }
+}
+#endif /* FUNCTION_qmfAnaPrototypeFirSlot */
+#endif /* #if defined(__CC_ARM) && defined(__ARM_ARCH_6__) */
+
+#if ( defined(__ARM_ARCH_5TE__) && (SAMPLE_BITS == 16) ) && !defined(QMF_TABLE_FULL)
+
+#define FUNCTION_qmfSynPrototypeFirSlot
+
+#if defined(FUNCTION_qmfSynPrototypeFirSlot)
+
+#if defined(__GNUC__)	/* cppp replaced: elif */
+
+/* Wrapper around the ARM "smulwb" instruction: both operands are passed in
+   registers and the instruction's result register is returned. Per the ARM
+   definition of SMULWB, that is the upper 32 bits of the 48-bit signed
+   product of a and the bottom halfword of b. */
+inline INT SMULWB (const LONG a, const LONG b)
+{
+  INT product;
+
+  __asm__ ("smulwb %0, %1, %2" : "=r" (product) : "r" (a), "r" (b));
+  return product;
+}
+/* Wrapper around the ARM "smulwt" instruction: both operands are passed in
+   registers and the instruction's result register is returned. Per the ARM
+   definition of SMULWT, that is the upper 32 bits of the 48-bit signed
+   product of a and the top halfword of b. */
+inline INT SMULWT (const LONG a, const LONG b)
+{
+  INT product;
+
+  __asm__ ("smulwt %0, %1, %2" : "=r" (product) : "r" (a), "r" (b));
+  return product;
+}
+
+/* Wrapper around the ARM "smlawb" instruction. Operand order handed to the
+   instruction is (a, b, accu); per the ARM definition of SMLAWB the result
+   is accu plus the upper 32 bits of the 48-bit signed product of a and the
+   bottom halfword of b. */
+inline INT SMLAWB(const LONG accu, const LONG a, const LONG b)
+{
+  INT sum;
+
+  __asm__ ("smlawb %0, %1, %2, %3 " : "=r" (sum) : "r" (a), "r" (b), "r" (accu));
+  return sum;
+}
+
+/* Wrapper around the ARM "smlawt" instruction. Operand order handed to the
+   instruction is (a, b, accu); per the ARM definition of SMLAWT the result
+   is accu plus the upper 32 bits of the 48-bit signed product of a and the
+   top halfword of b. */
+inline INT SMLAWT(const LONG accu, const LONG a, const LONG b)
+{
+  INT sum;
+
+  __asm__ ("smlawt %0, %1, %2, %3 " : "=r" (sum) : "r" (a), "r" (b), "r" (accu));
+  return sum;
+}
+
+#endif /* ARM compiler selector */
+
+
+/*
+  Synthesis prototype FIR core, processing two input samples per loop pass.
+
+  realSlot, imagSlot  Point one element past the first sample to consume;
+                      both are read backwards (pre-decrement), two samples
+                      per pass.
+  p_flt               Filter coefficients viewed as 32-bit words holding two
+                      16-bit coefficients each (Bottom/Top in the comments
+                      below). Read forwards, 5 words per pass; the mirrored
+                      pointer p_flt + 155 is read backwards.
+  sta                 Filter states, updated in place; 18 states (2 x 9) are
+                      consumed and rewritten per pass.
+  pMyTimeOut          Receives two filtered results per pass.
+  no_channels         Number of loop passes; must be >= 1 because the loop
+                      is a do/while that tests after decrementing.
+
+  The state update is written with explicit indices (sta[k] = f(sta[k+1]))
+  followed by a single "sta += 9". The former "*sta++ = f(sta[1], ...)" form
+  modified and read sta in one expression without sequencing, which is
+  undefined behaviour before C++17.
+*/
+static void qmfSynPrototypeFirSlot1_filter(FIXP_QMF *RESTRICT realSlot,
+                                           FIXP_QMF *RESTRICT imagSlot,
+                                           const FIXP_DBL *RESTRICT p_flt,
+                                           FIXP_QSS *RESTRICT sta,
+                                           FIXP_DBL *pMyTimeOut,
+                                           int no_channels)
+{
+  /* This code was the base for the above listed assembler sequence */
+  /* It can be used for debugging purpose or further optimizations  */
+  const FIXP_DBL *RESTRICT p_fltm = p_flt + 155;
+
+  do
+  {
+     FIXP_DBL result;
+     FIXP_DBL A, B, real, imag, sta0;
+
+     /* First sample of the pair: result comes from the saved state[0]. */
+     real = *--realSlot;
+     imag = *--imagSlot;
+     B = p_flt[4];                        /* Bottom=[8] Top=[9]     */
+     A = p_fltm[3];                       /* Bottom=[316] Top=[317] */
+     sta0 = sta[0];                       /* save state[0]          */
+     sta[0] = SMLAWT( sta[1], imag, B );  /* index=9...........319  */
+     sta[1] = SMLAWB( sta[2], real, A );  /* index=316...........6  */
+     sta[2] = SMLAWB( sta[3], imag, B );  /* index=8,18,    ...318  */
+     B = p_flt[3];                        /* Bottom=[6] Top=[7]     */
+     sta[3] = SMLAWT( sta[4], real, A );  /* index=317...........7  */
+     A = p_fltm[4];                       /* Bottom=[318] Top=[319] */
+     sta[4] = SMLAWT( sta[5], imag, B );  /* index=7...........317  */
+     sta[5] = SMLAWB( sta[6], real, A );  /* index=318...........8  */
+     sta[6] = SMLAWB( sta[7], imag, B );  /* index=6...........316  */
+     B = p_flt[2];                        /* Bottom=[X] Top=[5]     */
+     sta[7] = SMLAWT( sta[8], real, A );  /* index=319...........9  */
+     A = p_fltm[2];                       /* Bottom=[X] Top=[315]   */
+     sta[8] =         SMULWT( imag, B );  /* index=5,15, ...   315  */
+     result = SMLAWT( sta0,   real, A );  /* index=315...........5  */
+     sta += 9;
+
+     *pMyTimeOut++ = result;
+
+     /* Second sample of the pair: result is taken from state[0] directly. */
+     real = *--realSlot;
+     imag = *--imagSlot;
+     A = p_fltm[0];                       /* Bottom=[310] Top=[311] */
+     B = p_flt[7];                        /* Bottom=[14]  Top=[15]  */
+     result = SMLAWB( sta[0], real, A );  /* index=310...........0  */
+     sta[0] = SMLAWB( sta[1], imag, B );  /* index=14..........324  */
+     *pMyTimeOut++ = result;
+     B = p_flt[6];                        /* Bottom=[12]  Top=[13]  */
+     sta[1] = SMLAWT( sta[2], real, A );  /* index=311...........1  */
+     A = p_fltm[1];                       /* Bottom=[312] Top=[313] */
+     sta[2] = SMLAWT( sta[3], imag, B );  /* index=13..........323  */
+     sta[3] = SMLAWB( sta[4], real, A );  /* index=312...........2  */
+     sta[4] = SMLAWB( sta[5], imag, B );  /* index=12..........322  */
+     sta[5] = SMLAWT( sta[6], real, A );  /* index=313...........3  */
+     A = p_fltm[2];                       /* Bottom=[314] Top=[315] */
+     B = p_flt[5];                        /* Bottom=[10]  Top=[11]  */
+     sta[6] = SMLAWT( sta[7], imag, B );  /* index=11..........321  */
+     sta[7] = SMLAWB( sta[8], real, A );  /* index=314...........4  */
+     sta[8] =         SMULWB( imag, B );  /* index=10..........320  */
+     sta += 9;
+
+     p_flt    += 5;
+     p_fltm   -= 5;
+  }
+  while ((--no_channels) != 0);
+
+}
+
+
+
+/*!
+  \brief Synthesis prototype FIR for one slot, specialised for the
+         configuration p_stride == 2 with 32 channels.
+
+  Stage 1 runs the filter: for each channel, walking the real/imag input
+  from index no_channels-1 down to 0, one raw result is formed from sta[0]
+  and the nine filter states of that channel are updated in place
+  (qmf->FilterStates). The raw results are collected in a local buffer.
+
+  Stage 2 optionally multiplies each raw result by qmf->outGain (skipped
+  when outGain == 0x80000000), adds a rounding offset to negative values,
+  saturates, and scales to INT_PCM. Samples are stored back to front: the
+  first raw result lands at timeOut[(no_channels-1)*stride], the last one
+  at timeOut[0].
+
+  \return 0 on success; -1 if the filter bank is not configured for
+          p_stride == 2 and 32 channels. In that case neither the filter
+          states nor timeOut have been touched, so the caller can run its
+          fallback implementation.
+*/
+INT qmfSynPrototypeFirSlot2(
+                             HANDLE_QMF_FILTER_BANK qmf,
+                             FIXP_QMF *RESTRICT realSlot,            /*!< Input: Pointer to real Slot */
+                             FIXP_QMF *RESTRICT imagSlot,            /*!< Input: Pointer to imag Slot */
+                             INT_PCM  *RESTRICT timeOut,             /*!< Time domain data */
+                             INT       stride                        /*!< Time output buffer stride factor*/
+                            )
+{
+  FIXP_QSS *RESTRICT sta = (FIXP_QSS*)qmf->FilterStates;
+  int no_channels = qmf->no_channels;
+  int scale = ((DFRACT_BITS-SAMPLE_BITS)-1-qmf->outScalefactor);
+  int ch;
+
+  /* The mirrored filter offset (155 words) and the 32-entry scratch buffer
+     below are only valid for this one configuration. Refuse anything else
+     at run time instead of relying on assertions alone, so that a build
+     with assertions disabled cannot overrun MyTimeOut[]. */
+  if ((qmf->p_stride != 2) || (no_channels != 32)) {
+    return -1;
+  }
+
+  /* We map an array of 16-bit values upon an array of 2*16-bit values to read 2 values in one shot */
+  const FIXP_DBL *RESTRICT p_flt  = (FIXP_DBL *) qmf->p_filter;           /* low=[0],   high=[1]   */
+  const FIXP_DBL *RESTRICT p_fltm = (FIXP_DBL *) qmf->p_filter + 155;     /* low=[310], high=[311] */
+
+  FDK_ASSERT(SAMPLE_BITS-1-qmf->outScalefactor >= 0); //   (DFRACT_BITS-SAMPLE_BITS)-1-qmf->outScalefactor >= 0);
+
+  realSlot += no_channels-1;    // ~~"~~
+  imagSlot += no_channels-1;    // no_channels-1 .. 0
+
+  FIXP_DBL MyTimeOut[32];
+  FIXP_DBL *pMyTimeOut = &MyTimeOut[0];
+
+  for (ch = no_channels; ch--;)
+  {
+     FIXP_DBL result;
+     FIXP_DBL A, B, real, imag;
+
+     real = *realSlot--;
+     imag = *imagSlot--;
+     A = p_fltm[0];                       /* Bottom=[310] Top=[311] */
+     B = p_flt[7];                        /* Bottom=[14]  Top=[15]  */
+     result = SMLAWB( sta[0], real, A );  /* index=310...........0  */
+
+     /* State update with explicit indices: each state takes the value of
+        its successor plus one product. Writing this as
+        "*sta++ = f(sta[1], ...)" would modify and read sta unsequenced. */
+     sta[0] = SMLAWB( sta[1], imag, B );  /* index=14..........324  */
+     B = p_flt[6];                        /* Bottom=[12]  Top=[13]  */
+     sta[1] = SMLAWT( sta[2], real, A );  /* index=311...........1  */
+     A = p_fltm[1];                       /* Bottom=[312] Top=[313] */
+     sta[2] = SMLAWT( sta[3], imag, B );  /* index=13..........323  */
+     sta[3] = SMLAWB( sta[4], real, A );  /* index=312...........2  */
+     sta[4] = SMLAWB( sta[5], imag, B );  /* index=12..........322  */
+     sta[5] = SMLAWT( sta[6], real, A );  /* index=313...........3  */
+     A = p_fltm[2];                       /* Bottom=[314] Top=[315] */
+     B = p_flt[5];                        /* Bottom=[10]  Top=[11]  */
+     sta[6] = SMLAWT( sta[7], imag, B );  /* index=11..........321  */
+     sta[7] = SMLAWB( sta[8], real, A );  /* index=314...........4  */
+     sta[8] =         SMULWB( imag, B );  /* index=10..........320  */
+     sta += 9;
+
+     *pMyTimeOut++ = result;
+
+     p_fltm   -= 5;
+     p_flt    += 5;
+  }
+
+  pMyTimeOut = &MyTimeOut[0];
+#if (SAMPLE_BITS == 16)
+  /* Saturation limits expressed in the unscaled domain */
+  const FIXP_DBL max_pos = (FIXP_DBL) 0x00007FFF << scale;
+  const FIXP_DBL max_neg = (FIXP_DBL) 0xFFFF8001 << scale;
+#else
+  scale = -scale;
+  const FIXP_DBL max_pos = (FIXP_DBL) 0x7FFFFFFF >> scale;
+  const FIXP_DBL max_neg = (FIXP_DBL) 0x80000001 >> scale;
+#endif
+  /* Offset added to negative values before the final shift */
+  const FIXP_DBL add_neg = (1 << scale) - 1;
+
+  /* Output is written from the end of the slot towards its start */
+  timeOut += no_channels*stride;
+
+  FDK_ASSERT(scale >= 0);
+
+  if (qmf->outGain != 0x80000000)
+  {
+    /* Gain path: every sample is multiplied by outGain first.
+       The loop is unrolled to four samples per pass. */
+    FIXP_DBL gain = qmf->outGain;
+    for (no_channels>>=2; no_channels--;)
+    {
+      FIXP_DBL result1, result2;
+
+      result1 = *pMyTimeOut++;
+      result2 = *pMyTimeOut++;
+
+      result1 = fMult(result1,gain);
+      timeOut -= stride;
+      if (result1 < 0)        result1 += add_neg;
+      if (result1 < max_neg)  result1 = max_neg;
+      if (result1 > max_pos)  result1 = max_pos;
+#if (SAMPLE_BITS == 16)
+      timeOut[0] = result1 >> scale;
+#else
+      timeOut[0] = result1 << scale;
+#endif
+
+      result2 = fMult(result2,gain);
+      timeOut -= stride;
+      if (result2 < 0)        result2 += add_neg;
+      if (result2 < max_neg)  result2 = max_neg;
+      if (result2 > max_pos)  result2 = max_pos;
+#if (SAMPLE_BITS == 16)
+      timeOut[0] = result2 >> scale;
+#else
+      timeOut[0] = result2 << scale;
+#endif
+
+      result1 = *pMyTimeOut++;
+      result2 = *pMyTimeOut++;
+
+      result1 = fMult(result1,gain);
+      timeOut -= stride;
+      if (result1 < 0)        result1 += add_neg;
+      if (result1 < max_neg)  result1 = max_neg;
+      if (result1 > max_pos)  result1 = max_pos;
+#if (SAMPLE_BITS == 16)
+      timeOut[0] = result1 >> scale;
+#else
+      timeOut[0] = result1 << scale;
+#endif
+
+      result2 = fMult(result2,gain);
+      timeOut -= stride;
+      if (result2 < 0)        result2 += add_neg;
+      if (result2 < max_neg)  result2 = max_neg;
+      if (result2 > max_pos)  result2 = max_pos;
+#if (SAMPLE_BITS == 16)
+      timeOut[0] = result2 >> scale;
+#else
+      timeOut[0] = result2 << scale;
+#endif
+    }
+  }
+  else
+  {
+    /* No-gain path: identical to the above without the multiply */
+    for (no_channels>>=2; no_channels--;)
+    {
+      FIXP_DBL result1, result2;
+      result1 = *pMyTimeOut++;
+      result2 = *pMyTimeOut++;
+      timeOut -= stride;
+      if (result1 < 0)        result1 += add_neg;
+      if (result1 < max_neg)  result1 = max_neg;
+      if (result1 > max_pos)  result1 = max_pos;
+#if (SAMPLE_BITS == 16)
+      timeOut[0] = result1 >> scale;
+#else
+      timeOut[0] = result1 << scale;
+#endif
+
+      timeOut -= stride;
+      if (result2 < 0)        result2 += add_neg;
+      if (result2 < max_neg)  result2 = max_neg;
+      if (result2 > max_pos)  result2 = max_pos;
+#if (SAMPLE_BITS == 16)
+      timeOut[0] = result2 >> scale;
+#else
+      timeOut[0] = result2 << scale;
+#endif
+
+      result1 = *pMyTimeOut++;
+      result2 = *pMyTimeOut++;
+      timeOut -= stride;
+      if (result1 < 0)        result1 += add_neg;
+      if (result1 < max_neg)  result1 = max_neg;
+      if (result1 > max_pos)  result1 = max_pos;
+#if (SAMPLE_BITS == 16)
+      timeOut[0] = result1 >> scale;
+#else
+      timeOut[0] = result1 << scale;
+#endif
+
+      timeOut -= stride;
+      if (result2 < 0)        result2 += add_neg;
+      if (result2 < max_neg)  result2 = max_neg;
+      if (result2 > max_pos)  result2 = max_pos;
+#if (SAMPLE_BITS == 16)
+      timeOut[0] = result2 >> scale;
+#else
+      timeOut[0] = result2 << scale;
+#endif
+    }
+  }
+  return 0;
+}
+
+/* Forward declaration only; the definition is not part of this section of
+   the file (presumably the generic, non-ARM-specific implementation - to be
+   confirmed against the including source). qmfSynPrototypeFirSlot() below
+   calls it with its own arguments whenever the optimised routine was not
+   used or returned non-zero. */
+static
+void qmfSynPrototypeFirSlot_fallback( HANDLE_QMF_FILTER_BANK qmf,
+                             FIXP_DBL *realSlot,      /*!< Input: Pointer to real Slot */
+                             FIXP_DBL *imagSlot,      /*!< Input: Pointer to imag Slot */
+                             INT_PCM  *timeOut,             /*!< Time domain data */
+                             const int       stride
+                            );
+
+/*!
+  \brief Perform Synthesis Prototype Filtering on a single slot of input data.
+
+  The filter takes 2 * #MAX_SYNTHESIS_CHANNELS of input data and
+  generates #MAX_SYNTHESIS_CHANNELS time domain output samples.
+
+  Dispatcher: for qmf->p_stride == 2 the optimised routine
+  qmfSynPrototypeFirSlot2() is tried first. For any other stride, or when
+  that routine returns non-zero, qmfSynPrototypeFirSlot_fallback() is called
+  with the same arguments.
+*/
+
+static
+void qmfSynPrototypeFirSlot( HANDLE_QMF_FILTER_BANK qmf,
+                             FIXP_DBL *realSlot,      /*!< Input: Pointer to real Slot */
+                             FIXP_DBL *imagSlot,      /*!< Input: Pointer to imag Slot */
+                             INT_PCM  *timeOut,             /*!< Time domain data */
+                             const int       stride
+                            )
+{
+    /* -1 marks "no optimised routine available for this configuration" */
+    const INT status = (qmf->p_stride == 2)
+                     ? qmfSynPrototypeFirSlot2(qmf, realSlot, imagSlot, timeOut, stride)
+                     : -1;
+
+    if (status == 0) {
+        return;   /* optimised path handled the slot */
+    }
+
+    /* fallback if configuration not available or failed */
+    qmfSynPrototypeFirSlot_fallback(qmf, realSlot, imagSlot, timeOut, stride);
+}
+#endif /* FUNCTION_qmfSynPrototypeFirSlot */
+
+#endif  /*  ( defined(__ARM_ARCH_5TE__) && (SAMPLE_BITS == 16) ) && !defined(QMF_TABLE_FULL) */
+
+
+
+/* #####################################################################################*/
+
+
+
+#endif  /* (QMF_NO_POLY==5) */
+
diff --git a/libFDK/src/arm/scale_arm.cpp b/libFDK/src/arm/scale_arm.cpp
new file mode 100644
index 0000000..906766f
--- /dev/null
+++ b/libFDK/src/arm/scale_arm.cpp
@@ -0,0 +1,110 @@
+/***************************  Fraunhofer IIS FDK Tools  **********************
+
+                        (C) Copyright Fraunhofer IIS (2005)
+                               All Rights Reserved
+
+    Please be advised that this software and/or program delivery is
+    Confidential Information of Fraunhofer and subject to and covered by the
+
+    Fraunhofer IIS Software Evaluation Agreement
+    between Google Inc. and  Fraunhofer
+    effective and in full force since March 1, 2012.
+
+    You may use this software and/or program only under the terms and
+    conditions described in the above mentioned Fraunhofer IIS Software
+    Evaluation Agreement. Any other and/or further use requires a separate agreement.
+
+
+   $Id$
+   Author(s): Arthur Tritthart
+   Description: Scaling operations for ARM
+
+   This software and/or program is protected by copyright law and international
+   treaties. Any reproduction or distribution of this software and/or program,
+   or any portion of it, may result in severe civil and criminal penalties, and
+   will be prosecuted to the maximum extent possible under law.
+
+******************************************************************************/
+/* prevent multiple inclusion with re-definitions */
+#ifndef __INCLUDE_SCALE_ARM__
+#define __INCLUDE_SCALE_ARM__
+
+#define FUNCTION_scaleValuesWithFactor_DBL
+
+/*
+  Multiply every element of a vector by a factor and scale it, in place.
+
+  vector       Data to process; each of the len elements is overwritten.
+  factor       Multiplier applied through fMultDiv2().
+  len          Number of elements. Processed four at a time, followed by
+               the remaining len & 3 elements.
+  scalefactor  Scaling in bits: after compensating fMultDiv2 (see below) a
+               non-negative amount is applied as a left shift, a negative
+               amount as a right shift.
+
+  The shift count is limited to DFRACT_BITS-1 in BOTH directions. Without
+  the limit on the right-shift side, a very negative scalefactor would
+  produce a shift count of at least the operand width, which is undefined.
+*/
+SCALE_INLINE
+void scaleValuesWithFactor(
+        FIXP_DBL *vector,
+        FIXP_DBL factor,
+        INT len,
+        INT scalefactor
+        )
+{
+  /* This code combines the fMult with the scaling             */
+  /* It performs a fMultDiv2 and increments shift by 1         */
+  int shift = scalefactor + 1;
+  FIXP_DBL *mySpec = vector;
+
+  if (shift >= 0)
+  {
+    /* Left shift, limited to the width of the data type minus one */
+    shift = fixmin_I(shift,(INT)DFRACT_BITS-1);
+
+    for (int i=0; i<(len>>2); i++)
+    {
+      FIXP_DBL tmp0 = mySpec[0];
+      FIXP_DBL tmp1 = mySpec[1];
+      FIXP_DBL tmp2 = mySpec[2];
+      FIXP_DBL tmp3 = mySpec[3];
+      tmp0 = fMultDiv2(tmp0, factor);
+      tmp1 = fMultDiv2(tmp1, factor);
+      tmp2 = fMultDiv2(tmp2, factor);
+      tmp3 = fMultDiv2(tmp3, factor);
+      tmp0 <<= shift;
+      tmp1 <<= shift;
+      tmp2 <<= shift;
+      tmp3 <<= shift;
+      *mySpec++ = tmp0;
+      *mySpec++ = tmp1;
+      *mySpec++ = tmp2;
+      *mySpec++ = tmp3;
+    }
+    for (int i=len&3; i--;)
+    {
+      FIXP_DBL tmp0 = mySpec[0];
+      tmp0 = fMultDiv2(tmp0, factor);
+      tmp0 <<= shift;
+      *mySpec++ = tmp0;
+    }
+  }
+  else
+  {
+    /* Right shift; clamp the negated count as well so it never reaches the
+       operand width */
+    shift = fixmin_I(-shift,(INT)DFRACT_BITS-1);
+
+    for (int i=0; i<(len>>2); i++)
+    {
+      FIXP_DBL tmp0 = mySpec[0];
+      FIXP_DBL tmp1 = mySpec[1];
+      FIXP_DBL tmp2 = mySpec[2];
+      FIXP_DBL tmp3 = mySpec[3];
+      tmp0 = fMultDiv2(tmp0, factor);
+      tmp1 = fMultDiv2(tmp1, factor);
+      tmp2 = fMultDiv2(tmp2, factor);
+      tmp3 = fMultDiv2(tmp3, factor);
+      tmp0 >>= shift;
+      tmp1 >>= shift;
+      tmp2 >>= shift;
+      tmp3 >>= shift;
+      *mySpec++ = tmp0;
+      *mySpec++ = tmp1;
+      *mySpec++ = tmp2;
+      *mySpec++ = tmp3;
+    }
+    for (int i=len&3; i--;)
+    {
+      FIXP_DBL tmp0 = mySpec[0];
+      tmp0 = fMultDiv2(tmp0, factor);
+      tmp0 >>= shift;
+      *mySpec++ = tmp0;
+    }
+  }
+}
+
+#endif /* #ifndef __INCLUDE_SCALE_ARM__ */
-- 
cgit v1.2.3