1 files changed, 513 insertions, 0 deletions
diff --git a/libFDK/src/dct.cpp b/libFDK/src/dct.cpp
new file mode 100644
index 0000000..d05a26f
--- /dev/null
+++ b/libFDK/src/dct.cpp
@@ -0,0 +1,513 @@
+/****************************************************************************
+
+                       (C) copyright Fraunhofer IIS (2004)
+                               All Rights Reserved
+
+    Please be advised that this software and/or program delivery is
+    Confidential Information of Fraunhofer and subject to and covered by the
+
+    Fraunhofer IIS Software Evaluation Agreement
+    between Google Inc. and  Fraunhofer
+    effective and in full force since March 1, 2012.
+
+    You may use this software and/or program only under the terms and
+    conditions described in the above mentioned Fraunhofer IIS Software
+    Evaluation Agreement. Any other and/or further use requires a separate agreement.
+
+
+
+ $Id$
+
+*******************************************************************************/
+/*!
+  \file   dct.cpp
+  \brief  DCT Implementations  $Revision: 36871 $
+  Library functions to calculate standard DCTs. This will most likely be replaced by hand-optimized
+  functions for the specific target processor.
+
+  Three different implementations of the dct type II and the dct type III transforms are provided.
+
+  By default implementations which are based on a single, standard complex FFT-kernel are used (dctII_f() and dctIII_f()).
+  These are specifically helpful in cases where optimized FFT libraries are already available. The FFT used in these
+  implementation is FFT rad2 from FDK_tools.
+
+  Of course, one might also use DCT-libraries should they be available. The DCT and DST
+  type IV implementations are only available in a version based on a complex FFT kernel.
+*/
+
+#include "dct.h"
+
+
+#include "FDK_tools_rom.h"
+#include "fft.h"
+
+
+#if defined(__arm__)
+#include "arm/dct_arm.cpp"
+#endif
+
+
+/*!
+ *
+ * \brief Perform dct type 3
+ * The dct 3 is calculated by a inverse real fft, with
+ * some pre twiddeling before the inverse real fft, as discribed by Takuya OOURA.
+ * (http://momonga.t.u-tokyo.ac.jp/~ooura/fftman/ftmn2_32.html#sec2_3_2)
+ * The real inverse fft is calculated by a inverse complex fft, as described
+ * in numerical recipes in C, Cambridge University press
+ * the auxiliary function is built by reversing the odd samples
+ *
+ * Instead of doing 2 times the conjugation to calculate the
+ * inverse cfft with the cfft, one can also swap elements before
+ * calling the cfft to get the same result:
+ * swap
+ * r(1),i(1) with r(n-1),i(n-1),
+ * r(2),i(2) with r(n-2),i(n-2), ...
+ * r(n/2-1),i(n/2-1) with r(n/2+1),i(n/2+1)
+ *
+ *
+ * Scaling 1 shift in pre twiddeling,
+ *         log2(L)-1 in cfft
+ */
+#if !defined(FUNCTION_dct_III)
+void dct_III(FIXP_DBL *pDat, /*!< pointer to input/output */
+             FIXP_DBL *tmp,  /*!< pointer to temporal working buffer */
+             int L,          /*!< lenght of transform */
+             int *pDat_e
+             )
+{
+  FDK_ASSERT(L == 64 || L == 32);
+  int  i;
+  FIXP_DBL xr, accu1, accu2;
+  int inc;
+  int M = L>>1;
+  int ld_M;
+
+  if (L == 64)  ld_M = 5;
+  else          ld_M = 4;
+
+  /* This loop performs multiplication for index i (i*inc) */
+  inc = (64/2) >> ld_M; /* 64/L */
+
+  FIXP_DBL *pTmp_0 = &tmp[2];
+  FIXP_DBL *pTmp_1 = &tmp[(M-1)*2];
+
+  for(i=1; i<M>>1; i++,pTmp_0+=2,pTmp_1-=2) {
+
+    FIXP_DBL accu3,accu4,accu5,accu6;
+
+    cplxMultDiv2(&accu2, &accu1, pDat[L - i], pDat[i], sin_twiddle_L64[i*inc]);
+    cplxMultDiv2(&accu4, &accu3, pDat[M+i], pDat[M-i], sin_twiddle_L64[(M-i)*inc]);
+    accu3 >>= 1; accu4 >>= 1;
+
+    /* This method is better for ARM926, that uses operand2 shifted right by 1 always */
+    cplxMultDiv2(&accu6, &accu5, (accu3 - (accu1>>1)), ((accu2>>1) + accu4), sin_twiddle_L64[(4*i)*inc]);
+    xr = (accu1>>1) + accu3;
+    pTmp_0[0] = (xr>>1) - accu5;
+    pTmp_1[0] = (xr>>1) + accu5;
+
+    xr = (accu2>>1) - accu4;
+    pTmp_0[1] =  (xr>>1) - accu6;
+    pTmp_1[1] = -((xr>>1) + accu6);
+
+  }
+
+  xr     = fMultDiv2(pDat[M], sin_twiddle_L64[64/2].v.re );/* cos((PI/(2*L))*M); */
+  tmp[0] = ((pDat[0]>>1) + xr)>>1;
+  tmp[1] = ((pDat[0]>>1) - xr)>>1;
+
+  cplxMultDiv2(&accu2, &accu1, pDat[L - (M/2)], pDat[M/2], sin_twiddle_L64[64/4]);
+  tmp[M]   = accu1>>1;
+  tmp[M+1] = accu2>>1;
+
+  /* dit_fft expects 1 bit scaled input values */
+  fft(M, tmp, pDat_e);
+
+  /* ARM926: 12 cycles per 2-iteration, no overhead code by compiler */
+  pTmp_1 = &tmp[L];
+  for (i = M>>1; i--;)
+  {
+    FIXP_DBL tmp1, tmp2, tmp3, tmp4;
+    tmp1 = *tmp++;
+    tmp2 = *tmp++;
+    tmp3 = *--pTmp_1;
+    tmp4 = *--pTmp_1;
+    *pDat++ = tmp1;
+    *pDat++ = tmp3;
+    *pDat++ = tmp2;
+    *pDat++ = tmp4;
+  }
+
+  *pDat_e += 2;
+}
+#endif
+
+/*!
+ *
+ * \brief Perform dct type 2
+ * The dct 2 is calculated by a real inverse fft, with
+ * some pre twiddeling after the  fft, as discribed by Takuya OOURA.
+ * (http://momonga.t.u-tokyo.ac.jp/~ooura/fftman/ftmn2_32.html#sec2_3_2)
+ * The real inverse fft is calculated by a inverse complex fft, as described
+ * in numerical recipes in C, Cambridge University press
+ * the auxilery function is build by reversing the odd samples
+ *
+ * Scaling 1 shift in pre twiddeling,
+ *         5 in cfft
+ */
+#if !defined(FUNCTION_dct_II)
+void dct_II(FIXP_DBL *pDat, /*!< pointer to input/output */
+            FIXP_DBL *tmp,  /*!< pointer to temporal working buffer */
+            int L,          /*!< lenght of transform */
+            int *pDat_e
+            )
+{
+    FDK_ASSERT(L == 64 || L == 32);
+    FIXP_DBL accu1,accu2;
+    FIXP_DBL *pTmp_0, *pTmp_1;
+
+    int i;
+    int inc;
+    int M =  L>>1;
+    int ld_M;
+
+    FDK_ASSERT(L == 64 || L == 32);
+    ld_M = 4 + (L >> 6);  /* L=64: 5,  L=32: 4 */
+
+    inc = (64/2) >> ld_M; /* L=64: 1,  L=32: 2 */
+
+    FIXP_DBL *pdat  = &pDat[0];
+    FIXP_DBL accu3, accu4;
+    pTmp_0 = &tmp[0];
+    pTmp_1 = &tmp[L-1];
+    for (i = M>>1; i--; )
+    {
+      accu1 = *pdat++;
+      accu2 = *pdat++;
+      accu3 = *pdat++;
+      accu4 = *pdat++;
+      accu1 >>= 1;
+      accu2 >>= 1;
+      accu3 >>= 1;
+      accu4 >>= 1;
+      *pTmp_0++ = accu1;
+      *pTmp_0++ = accu3;
+      *pTmp_1-- = accu2;
+      *pTmp_1-- = accu4;
+    }
+
+
+    fft(M, tmp, pDat_e);
+
+    pTmp_0 = &tmp[2];
+    pTmp_1 = &tmp[(M-1)*2];
+
+    for (i=1; i<M>>1; i++,pTmp_0+=2,pTmp_1-=2) {
+
+      FIXP_DBL a1,a2;
+      FIXP_DBL accu3, accu4;
+
+      a1 = ((pTmp_0[1]>>1) + (pTmp_1[1]>>1));
+      a2 = ((pTmp_1[0]>>1) - (pTmp_0[0]>>1));
+
+      cplxMultDiv2(&accu1, &accu2, a2, a1, sin_twiddle_L64[(4*i)*inc]);
+      accu1<<=1; accu2<<=1;
+
+      a1 = ((pTmp_0[0]>>1) + (pTmp_1[0]>>1));
+      a2 = ((pTmp_0[1]>>1) - (pTmp_1[1]>>1));
+
+      cplxMultDiv2(&accu3, &accu4, (a1 + accu2), -(accu1 + a2), sin_twiddle_L64[i*inc]);
+      pDat[L - i] = accu4;
+      pDat[i]     = accu3;
+
+      cplxMultDiv2(&accu3, &accu4, (a1 - accu2), -(accu1 - a2), sin_twiddle_L64[(M-i)*inc]);
+      pDat[M + i] = accu4;
+      pDat[M - i] = accu3;
+
+    }
+
+    cplxMultDiv2(&accu1, &accu2, tmp[M], tmp[M+1], sin_twiddle_L64[(M/2)*inc]);
+    pDat[L - (M/2)] = accu2;
+    pDat[M/2]       = accu1;
+
+    pDat[0] = (tmp[0]>>1)+(tmp[1]>>1);
+    pDat[M] =  fMult(((tmp[0]>>1)-(tmp[1]>>1)), sin_twiddle_L64[64/2].v.re);/* cos((PI/(2*L))*M); */
+
+    *pDat_e += 2;
+}
+#endif
+
+static
+void getTables(const FIXP_WTP **twiddle, const FIXP_STP **sin_twiddle, int *sin_step, int length)
+{
+  int ld2_length;
+
+ /* Get ld2 of length - 2 + 1
+     -2: because first table entry is window of size 4
+     +1: because we already include +1 because of ceil(log2(length)) */
+  ld2_length = DFRACT_BITS-1-fNormz((FIXP_DBL)length) - 1;
+
+  /* Extract sort of "eigenvalue" (the 4 left most bits) of length. */
+  switch ( (length) >> (ld2_length-1) ) {
+    case 0x4: /* radix 2 */
+      *sin_twiddle = SineTable512;
+      *sin_step = 1<<(9 - ld2_length);
+      *twiddle = windowSlopes[0][0][ld2_length-1];
+      break;
+    case 0x7: /* 10 ms */
+      *sin_twiddle = SineTable480;
+      *sin_step = 1<<(8 - ld2_length);
+      *twiddle = windowSlopes[0][1][ld2_length];
+      break;
+    default:
+      *sin_twiddle = NULL;
+      *sin_step = 0;
+      *twiddle = NULL;
+      break;
+  }
+
+  FDK_ASSERT(*twiddle != NULL);
+
+  FDK_ASSERT(*sin_step > 0);
+
+}
+
+#if !defined(FUNCTION_dct_IV)
+
+void dct_IV(FIXP_DBL *pDat,
+            int L,
+            int *pDat_e)
+{
+  int sin_step = 0;
+  int M = L >> 1;
+
+  const FIXP_WTP *twiddle;
+  const FIXP_STP *sin_twiddle;
+
+  FDK_ASSERT(L >= 4);
+
+  getTables(&twiddle, &sin_twiddle, &sin_step, L);
+
+#ifdef FUNCTION_dct_IV_func1
+  if (M>=4 && (M&3) == 0) {
+     /* ARM926: 44 cycles for 2 iterations = 22 cycles/iteration */
+    dct_IV_func1(M>>2, twiddle,  &pDat[0], &pDat[L-1]);
+  } else
+#endif /* FUNCTION_dct_IV_func1 */
+  {
+    FIXP_DBL *RESTRICT pDat_0 = &pDat[0];
+    FIXP_DBL *RESTRICT pDat_1 = &pDat[L - 2];
+    register int i;
+
+    /* 29 cycles on ARM926 */
+    for (i = 0; i < M-1; i+=2,pDat_0+=2,pDat_1-=2)
+    {
+      register FIXP_DBL accu1,accu2,accu3,accu4;
+
+      accu1 = pDat_1[1]; accu2 = pDat_0[0];
+      accu3 = pDat_0[1]; accu4 = pDat_1[0];
+
+      cplxMultDiv2(&accu1, &accu2, accu1, accu2, twiddle[i]);
+      cplxMultDiv2(&accu3, &accu4, accu4, accu3, twiddle[i+1]);
+
+      pDat_0[0] = accu2; pDat_0[1] = accu1;
+      pDat_1[0] = accu4; pDat_1[1] = -accu3;
+    }
+    if (M&1)
+    {
+      register FIXP_DBL accu1,accu2;
+
+      accu1 = pDat_1[1]; accu2 = pDat_0[0];
+
+      cplxMultDiv2(&accu1, &accu2, accu1, accu2, twiddle[i]);
+
+      pDat_0[0] = accu2; pDat_0[1] = accu1;
+    }
+  }
+
+  fft(M, pDat, pDat_e);
+
+#ifdef FUNCTION_dct_IV_func2
+  if (M>=4 && (M&3) == 0) {
+     /* ARM926: 42 cycles for 2 iterations = 21 cycles/iteration */
+    dct_IV_func2(M>>2, sin_twiddle, &pDat[0], &pDat[L], sin_step);
+  } else
+#endif /* FUNCTION_dct_IV_func2 */
+  {
+    FIXP_DBL *RESTRICT pDat_0 = &pDat[0];
+    FIXP_DBL *RESTRICT pDat_1 = &pDat[L - 2];
+    register FIXP_DBL accu1,accu2,accu3,accu4;
+    int idx, i;
+
+    /* Sin and Cos values are 0.0f and 1.0f */
+    accu1 = pDat_1[0];
+    accu2 = pDat_1[1];
+
+    pDat_1[1] = -(pDat_0[1]>>1);
+    pDat_0[0] = (pDat_0[0]>>1);
+
+
+    /* 28 cycles for ARM926 */  
+    for (idx = sin_step,i=1; i<(M+1)>>1; i++, idx+=sin_step)
+    {
+      FIXP_STP twd = sin_twiddle[idx];
+      cplxMultDiv2(&accu3, &accu4, accu1, accu2, twd);
+      pDat_0[1] =  accu3;
+      pDat_1[0] =  accu4;
+      
+      pDat_0+=2;
+      pDat_1-=2;
+
+      cplxMultDiv2(&accu3, &accu4, pDat_0[1], pDat_0[0], twd);
+
+      accu1 = pDat_1[0];
+      accu2 = pDat_1[1];
+
+      pDat_1[1] = -accu3;
+      pDat_0[0] =  accu4;
+    }
+
+    if ( (M&1) == 0 )
+    {
+      /* Last Sin and Cos value pair are the same */
+      accu1 = fMultDiv2(accu1, WTC(0x5a82799a));
+      accu2 = fMultDiv2(accu2, WTC(0x5a82799a));
+
+      pDat_1[0] = accu1 + accu2;
+      pDat_0[1] = accu1 - accu2;
+    }
+  }
+
+  /* Add twiddeling scale. */
+  *pDat_e += 2;
+}
+#endif /* defined (FUNCTION_dct_IV) */
+
+#if !defined(FUNCTION_dst_IV)
+void dst_IV(FIXP_DBL *pDat,
+            int L,
+            int *pDat_e )
+{
+  int sin_step = 0;
+  int M = L >> 1;
+
+  const FIXP_WTP *twiddle;
+  const FIXP_STP *sin_twiddle;
+
+#ifdef DSTIV2_ENABLE
+  if (L == 2) {
+    const FIXP_STP tab = STCP(0x7641AF3D, 0x30FB9452);
+    FIXP_DBL tmp1, tmp2;
+
+    cplxMultDiv2(&tmp2, &tmp1, pDat[0], pDat[1], tab);
+
+    pDat[0] = tmp1;
+    pDat[1] = tmp2;
+
+    *pDat_e += 1;
+
+    return;
+  }
+#else
+  FDK_ASSERT(L >= 4);
+#endif
+
+  getTables(&twiddle, &sin_twiddle, &sin_step, L);
+
+#ifdef FUNCTION_dst_IV_func1
+  if ( (M>=4) && ((M&3) == 0) ) {
+    dst_IV_func1(M, twiddle, &pDat[0], &pDat[L]);
+  } else 
+#endif
+  {
+    FIXP_DBL *RESTRICT pDat_0 = &pDat[0];
+    FIXP_DBL *RESTRICT pDat_1 = &pDat[L - 2];
+  
+    register int i;
+
+    /* 34 cycles on ARM926 */
+    for (i = 0; i < M-1; i+=2,pDat_0+=2,pDat_1-=2) 
+    {
+      register FIXP_DBL accu1,accu2,accu3,accu4;
+
+      accu1 =  pDat_1[1]; accu2 = -pDat_0[0];
+      accu3 =  pDat_0[1]; accu4 = -pDat_1[0];
+    
+      cplxMultDiv2(&accu1, &accu2, accu1, accu2, twiddle[i]);
+      cplxMultDiv2(&accu3, &accu4, accu4, accu3, twiddle[i+1]);
+
+      pDat_0[0] = accu2; pDat_0[1] = accu1;
+      pDat_1[0] = accu4; pDat_1[1] = -accu3;
+    }
+    if (M&1)
+    {
+      register FIXP_DBL accu1,accu2;
+
+      accu1 =  pDat_1[1]; accu2 = -pDat_0[0];
+    
+      cplxMultDiv2(&accu1, &accu2, accu1, accu2, twiddle[i]);
+
+      pDat_0[0] = accu2; pDat_0[1] = accu1;
+    }
+  }
+
+  fft(M, pDat, pDat_e);
+
+#ifdef FUNCTION_dst_IV_func2
+  if ( (M>=4) && ((M&3) == 0) ) {
+    dst_IV_func2(M>>2, sin_twiddle + sin_step, &pDat[0], &pDat[L - 1], sin_step);
+  } else
+#endif /* FUNCTION_dst_IV_func2 */
+  {
+    FIXP_DBL *RESTRICT pDat_0;
+    FIXP_DBL *RESTRICT pDat_1;
+    register FIXP_DBL accu1,accu2,accu3,accu4;
+    int idx, i;
+
+    pDat_0 = &pDat[0];
+    pDat_1 = &pDat[L - 2];
+
+    /* Sin and Cos values are 0.0f and 1.0f */
+    accu1 = pDat_1[0];
+    accu2 = pDat_1[1];
+
+    pDat_1[1] = -(pDat_0[0]>>1);
+    pDat_0[0] = (pDat_0[1]>>1);
+
+    for (idx = sin_step,i=1; i<(M+1)>>1; i++, idx+=sin_step)
+    {
+      FIXP_STP twd = sin_twiddle[idx];
+
+      cplxMultDiv2(&accu3, &accu4, accu1, accu2, twd);
+      pDat_1[0] =  -accu3;
+      pDat_0[1] =  -accu4;
+
+      pDat_0+=2;
+      pDat_1-=2;
+
+      cplxMultDiv2(&accu3, &accu4, pDat_0[1], pDat_0[0], twd);
+
+      accu1 = pDat_1[0];
+      accu2 = pDat_1[1];
+
+      pDat_0[0] =  accu3;
+      pDat_1[1] = -accu4;
+    }
+
+    if ( (M&1) == 0 )
+    {
+      /* Last Sin and Cos value pair are the same */
+      accu1 = fMultDiv2(accu1, WTC(0x5a82799a));
+      accu2 = fMultDiv2(accu2, WTC(0x5a82799a));
+
+      pDat_0[1] = - accu1 - accu2;
+      pDat_1[0] =   accu2 - accu1;
+    }
+  }
+
+  /* Add twiddeling scale. */
+  *pDat_e += 2;
+}
+#endif /* !defined(FUNCTION_dst_IV) */
+
+