#include "common.h"
#include "encoder.h"

void IDCT32 (double *, double *, int);

/***********************************************************************
 An implementation of a modified window subband as seen in Kumar & Zubair's
"A high performance software implentation of mpeg audio encoder"
I think from IEEE ASCAP 1996 proceedings
 
input: shift in 32*12 (384) new samples into a 864 point buffer.
ch - which channel we're looking at.
 
This routine basically does 12 calls to window subband all in one go.
Not yet called in code. here for testing only.
************************************************************************/
#define NEWWS
void window_subband12 (short **buffer, int ch)
{
  static double x[2][864];	/* 2 channels, 864 buffer for each */
  double *xk;
  double t[12];			/* a temp buffer for summing values */
  double y[12][64];		/* 12 output arrays of 64 values */
  int i, j, k, m;
  static double c[512];		/* enwindow array */
  static int init = 0;
  double c0;

  if (!init) {
    read_ana_window (c);
    printf ("done init\n");
    init++;
  }

  xk = x[ch];			/* an easier way of referencing the array */

  /* shift 384 new samples into the buffer */
  for (i = 863; i >= 384; i--)
    xk[i] = xk[i - 384];
  for (i = 383; i >= 0; i--)
    xk[i] = (double) *(*buffer)++ / SCALE;

  for (j = 0; j < 64; j++) {
    for (k = 0; k < 12; k++)
      t[k] = 0;
    for (i = 0; i < 8; i++) {
      m = i * 64 + j;
      c0 = c[m];
      t[0] += c0 * xk[m + 352];
      t[1] += c0 * xk[m + 320];
      t[2] += c0 * xk[m + 288];
      t[3] += c0 * xk[m + 256];
      t[4] += c0 * xk[m + 224];
      t[5] += c0 * xk[m + 192];
      t[6] += c0 * xk[m + 160];
      t[7] += c0 * xk[m + 128];
      t[8] += c0 * xk[m + 96];
      t[9] += c0 * xk[m + 64];
      t[10] += c0 * xk[m + 32];
      t[11] += c0 * xk[m];
    }
    for (i = 0; i < 12; i++) {
      y[i][j] = t[i];
    }
  }
#define DB1x
#ifdef DB1
  for (i = 0; i < 12; i++) {
    printf ("--%i--\n", i);
    for (j = 0; j < 64; j++) {
      printf ("%f\t", y[i][j]);
      if ((j + 1) % 4 == 0)
	printf ("\n");
    }
  }
  exit (0);
#endif
}


/************************************************************************/
/*
/* read_ana_window()
/*
/* PURPOSE:  Reads encoder window file "enwindow" into array #ana_win#
/*
/************************************************************************/

void read_ana_window (ana_win)
     double ana_win[HAN_SIZE];
{
  int i, j[4];
  FILE *fp;
  double f[4];
  char t[150];

  if (!(fp = OpenTableFile ("enwindow"))) {
    printf ("Please check analysis window table 'enwindow'\n");
    exit (1);
  }
  for (i = 0; i < 512; i += 4) {
    fgets (t, 150, fp);
    sscanf (t, "C[%d] = %lf C[%d] = %lf C[%d] = %lf C[%d] = %lf\n", j, f,
	    j + 1, f + 1, j + 2, f + 2, j + 3, f + 3);
    if (i == j[0]) {
      ana_win[i] = f[0];
      ana_win[i + 1] = f[1];
      ana_win[i + 2] = f[2];
      ana_win[i + 3] = f[3];
    } else {
      printf ("Check index in analysis window table\n");
      exit (1);
    }
    fgets (t, 150, fp);
  }
  fclose (fp);
}

/************************************************************************/
/*
/* window_subband()
/*
/* PURPOSE:  Overlapping window on PCM samples
/*
/* SEMANTICS:
/* 32 16-bit pcm samples are scaled to fractional 2's complement and
/* concatenated to the end of the window buffer #x#. The updated window
/* buffer #x# is then windowed by the analysis window #c# to produce the
/* windowed sample #z#
/*
/************************************************************************/
#ifdef COMBWS
void window_subband (short **buffer, double s[SBLIMIT], int k, int sblimit)
{
  typedef double XX[2][HAN_SIZE];
  static XX *x;
  double *xk;
  int i;
  static int off[2] = { 0, 0 };
  static char init = 0;
  double t;
  static double enwindow[512];
  double *ep0, *ep1, *ep2, *ep3, *ep4, *ep5, *ep6, *ep7;
  double z[64];
  double yprime[32];
  if (!init) {
    read_ana_window (enwindow);
    x = (XX *) mem_alloc (sizeof (XX), "x");
    memset (x, 0, 2 * HAN_SIZE * sizeof (double));
    init = 1;
  }
  xk = (*x)[k];

  /* replace 32 oldest samples with 32 new samples */
  for (i = 0; i < 32; i++)
    xk[31 - i + off[k]] = (double) *(*buffer)++ / SCALE;

  ep0 = &enwindow[0];
  ep1 = &enwindow[64];
  ep2 = &enwindow[128];
  ep3 = &enwindow[192];
  ep4 = &enwindow[256];
  ep5 = &enwindow[320];
  ep6 = &enwindow[384];
  ep7 = &enwindow[448];

  /* shift samples into proper window positions */
  for (i = 0; i < 64; i++) {
    t = xk[(i + off[k]) & (512 - 1)] * *ep0++;
    t += xk[(i + 64 + off[k]) & (512 - 1)] * *ep1++;
    t += xk[(i + 128 + off[k]) & (512 - 1)] * *ep2++;
    t += xk[(i + 192 + off[k]) & (512 - 1)] * *ep3++;
    t += xk[(i + 256 + off[k]) & (512 - 1)] * *ep4++;
    t += xk[(i + 320 + off[k]) & (512 - 1)] * *ep5++;
    t += xk[(i + 384 + off[k]) & (512 - 1)] * *ep6++;
    t += xk[(i + 448 + off[k]) & (512 - 1)] * *ep7++;
    z[i] = t;
  }

  off[k] += 480;		/*offset is modulo (HAN_SIZE-1) */
  off[k] &= HAN_SIZE - 1;

  yprime[0] = z[16];
  for (i = 1; i <= 16; i++)
    yprime[i] = z[i + 16] + z[16 - i];
  for (i = 17; i <= 31; i++)
    yprime[i] = z[i + 16] - z[80 - i];
  IDCT32 (yprime, s, sblimit);
  /*    filter_subband (z,s);  */
}
#else
void window_subband (short **buffer, double z[64], int k)
{
  typedef double XX[2][HAN_SIZE];
  static XX *x;
  double *xk;
  int i;
  static int off[2] = { 0, 0 };
  static char init = 0;
  double t;
  static double enwindow[512];
  double *ep0, *ep1, *ep2, *ep3, *ep4, *ep5, *ep6, *ep7;
  if (!init) {
    read_ana_window (enwindow);
    x = (XX *) mem_alloc (sizeof (XX), "x");
    memset (x, 0, 2 * HAN_SIZE * sizeof (double));
    init = 1;
  }
  xk = (*x)[k];

  /* replace 32 oldest samples with 32 new samples */
  /* PDS old code:   */
  /* for (i=0;i<32;i++)
     xk[31-i+off[k]] = (double) *(*buffer)++/SCALE;
   */
  {
    register double *xk_t = xk + off[k];
    for (i = 32; i--;)
      xk_t[i] = (double) *(*buffer)++;
  }

  ep0 = &enwindow[0];
  ep1 = &enwindow[64];
  ep2 = &enwindow[128];
  ep3 = &enwindow[192];
  ep4 = &enwindow[256];
  ep5 = &enwindow[320];
  ep6 = &enwindow[384];
  ep7 = &enwindow[448];

  /* shift samples into proper window positions */
  for (i = 0; i < 64; i++) {
    t = xk[(i + off[k]) & (512 - 1)] * *ep0++;
    t += xk[(i + 64 + off[k]) & (512 - 1)] * *ep1++;
    t += xk[(i + 128 + off[k]) & (512 - 1)] * *ep2++;
    t += xk[(i + 192 + off[k]) & (512 - 1)] * *ep3++;
    t += xk[(i + 256 + off[k]) & (512 - 1)] * *ep4++;
    t += xk[(i + 320 + off[k]) & (512 - 1)] * *ep5++;
    t += xk[(i + 384 + off[k]) & (512 - 1)] * *ep6++;
    t += xk[(i + 448 + off[k]) & (512 - 1)] * *ep7++;
    z[i] = t;
  }

  off[k] += 480;		/*offset is modulo (HAN_SIZE-1) */
  off[k] &= HAN_SIZE - 1;

}
#endif
/************************************************************************/
/*
/* create_ana_filter()
/*
/* PURPOSE:  Calculates the analysis filter bank coefficients
/*
/* SEMANTICS:
/* Calculates the analysis filterbank coefficients and rounds to the
/* 9th decimal place accuracy of the filterbank tables in the ISO
/* document.  The coefficients are stored in #filter#
/*
/************************************************************************/

void create_ana_filter (filter)
     double filter[SBLIMIT][64];
{
  register int i, k;

  for (i = 0; i < 32; i++)
    for (k = 0; k < 64; k++) {
      if ((filter[i][k] =
	   1e9 * cos ((double) ((2 * i + 1) * (16 - k) * PI64))) >= 0)
	modf (filter[i][k] + 0.5, &filter[i][k]);
      else
	modf (filter[i][k] - 0.5, &filter[i][k]);
      filter[i][k] *= 1e-9;
    }
}

/************************************************************************
*
* filter_subband()
*
* PURPOSE:  Calculates the analysis filter bank coefficients
*
* SEMANTICS:
*      The windowed samples #z# is filtered by the digital filter matrix #m#
* to produce the subband samples #s#. This done by first selectively
* picking out values from the windowed samples, and then multiplying
* them by the filter matrix, producing 32 subband samples.
*
************************************************************************/
void create_dct_matrix (filter)
     double filter[16][32];
{
  register int i, k;

  for (i = 0; i < 16; i++)
    for (k = 0; k < 32; k++) {
      if ((filter[i][k] = 1e9 * cos ((double) ((2 * i + 1) * k * PI64))) >= 0)
	modf (filter[i][k] + 0.5, &filter[i][k]);
      else
	modf (filter[i][k] - 0.5, &filter[i][k]);
      filter[i][k] *= 1e-9;
      filter[i][k] /= (double) SCALE;	/* PDS */
    }
  /* PDS this code could/should be replaced; use simple cos  */
  /* and don't do additional rounding ??? See LAME(e.g.3.34) */
}

void IDCT32 (xin, xout, sblimit)
     double *xin, *xout;
     int sblimit;
{
  int i, j;
  double s0, s1;
  typedef double MM[16][32];
  static MM *m = 0;
  if (m == 0) {
    m = (MM *) mem_alloc (sizeof (MM), "filter");
    create_dct_matrix (*m);
  }
  /* Only compute subband filter info for frequency ranges which */
  /* will be really needed/encoded later. Code is "general", but */
  /* only produces speed up in low qual/bps coding situations.   */
  /* Added/adapted by PDS Oct 1999.                              */
  for (i = ((sblimit > 16) ? SBLIMIT - sblimit : sblimit); i--;) {
    s0 = 0.0;
    for (j = 0; j < 32; j++) {
      s0 += (*m)[i][j] * xin[j + 0];
    }
    xout[i] = s0;
  }
  for (i = SBLIMIT - sblimit; i < 16; i++) {
    s0 = s1 = 0.0;
    for (j = 0; j < 32; j += 2) {
      s0 += (*m)[i][j] * xin[j];
      s1 += (*m)[i][j + 1] * xin[j + 1];
    }
    xout[i] = s0 + s1;
    xout[31 - i] = s0 - s1;
  }
  /* PDS TODO: use pointers instead of arrays ???  */
  /* PDS TODO: is '--' j loop faster then original */
  /* code:   ' for( j=0; j<32; j+=2 )  { ... } ' ? */
}

void filter_subband (z, s, sblimit)
     double z[HAN_SIZE], s[SBLIMIT];
     int sblimit;
{
  double yprime[32];
  int i, j;

  {
    yprime[0] = z[16];
    for (i = 1; i <= 16; i++)
      yprime[i] = z[i + 16] + z[16 - i];
    for (i = 17; i <= 31; i++)
      yprime[i] = z[i + 16] - z[80 - i];
    IDCT32 (yprime, s, sblimit);
  }
}