/*
Copyright (C) 2007, 2008, 2009, 2010, 2011 Her Majesty the Queen in
Right of Canada (Communications Research Center Canada)
Written by
2012, Matthias P. Braendli, matthias.braendli@mpb.li
This block implements a FIR filter. The real filter taps are given
as floats, and the block can take advantage of SSE.
For better performance, filtering is done in another thread, leading
to a pipeline delay of two calls to FIRFilter::process
*/
/*
This file is part of CRC-DADMOD.
CRC-DADMOD is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
CRC-DADMOD is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with CRC-DADMOD. If not, see .
*/
#include "FIRFilter.h"
#include "PcDebug.h"
#include
#include
#include
#include
#ifdef __SSE__
# include
#endif
#include
void FIRFilterWorker::process(struct FIRFilterWorkerData *fwd)
{
size_t i;
struct timespec time_start;
struct timespec time_end;
// This thread creates the dataOut buffer, and deletes
// the incoming buffer
while(running) {
Buffer* dataIn;
fwd->input_queue.wait_and_pop(dataIn);
Buffer* dataOut;
dataOut = new Buffer();
dataOut->setLength(dataIn->getLength());
PDEBUG("FIRFilterWorker: dataIn->getLength() %d\n", dataIn->getLength());
#if __SSE__
// The SSE accelerated version cannot work on the complex values,
// it is necessary to do the convolution on the real and imaginary
// parts separately. Thankfully, the taps are real, simplifying the
// procedure.
const float* in = reinterpret_cast(dataIn->getData());
float* out = reinterpret_cast(dataOut->getData());
size_t sizeIn = dataIn->getLength() / sizeof(float);
if ((uintptr_t)(&out[0]) % 16 != 0) {
fprintf(stderr, "FIRFilterWorker: out not aligned %p ", out);
throw std::runtime_error("FIRFilterWorker: out not aligned");
}
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_start);
__m128 SSEout;
__m128 SSEtaps;
__m128 SSEin;
for (i = 0; i < sizeIn - 2*fwd->n_taps; i += 4) {
SSEout = _mm_setr_ps(0,0,0,0);
for (int j = 0; j < fwd->n_taps; j++) {
if ((uintptr_t)(&in[i+2*j]) % 16 == 0) {
SSEin = _mm_load_ps(&in[i+2*j]); //faster when aligned
}
else {
SSEin = _mm_loadu_ps(&in[i+2*j]);
}
SSEtaps = _mm_load1_ps(&fwd->taps[j]);
SSEout = _mm_add_ps(SSEout, _mm_mul_ps(SSEin, SSEtaps));
}
_mm_store_ps(&out[i], SSEout);
}
for (; i < sizeIn; i++) {
out[i] = 0.0;
for (int j = 0; i+2*j < sizeIn; j++) {
out[i] += in[i+2*j] * fwd->taps[j];
}
}
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_end);
#else
// No SSE ? Loop unrolling should make this faster. As for the SSE,
// the real and imaginary parts are calculated separately.
const float* in = reinterpret_cast(dataIn->getData());
float* out = reinterpret_cast(dataOut->getData());
size_t sizeIn = dataIn->getLength() / sizeof(float);
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_start);
// Convolve by aligning both frame and taps at zero.
for (i = 0; i < sizeIn - 2*fwd->n_taps; i += 4) {
out[i] = 0.0;
out[i+1] = 0.0;
out[i+2] = 0.0;
out[i+3] = 0.0;
for (int j = 0; j < fwd->n_taps; j++) {
out[i] += in[i + 2*j] * fwd->taps[j];
out[i+1] += in[i+1 + 2*j] * fwd->taps[j];
out[i+2] += in[i+2 + 2*j] * fwd->taps[j];
out[i+3] += in[i+3 + 2*j] * fwd->taps[j];
}
}
// At the end of the frame, we cut the convolution off.
// The beginning of the next frame starts with a NULL symbol
// anyway.
for (; i < sizeIn; i++) {
out[i] = 0.0;
for (int j = 0; i+2*j < sizeIn; j++) {
out[i] += in[i+2*j] * fwd->taps[j];
}
}
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_end);
#endif
// The following implementations are for debugging only.
#if 0
// Same thing as above, without loop unrolling. For debugging.
const float* in = reinterpret_cast(dataIn->getData());
float* out = reinterpret_cast(dataOut->getData());
size_t sizeIn = dataIn->getLength() / sizeof(float);
for (i = 0; i < sizeIn - 2*fwd->n_taps; i += 1) {
out[i] = 0.0;
for (int j = 0; j < fwd->n_taps; j++) {
out[i] += in[i+2*j] * fwd->taps[j];
}
}
for (; i < sizeIn; i++) {
out[i] = 0.0;
for (int j = 0; i+2*j < sizeIn; j++) {
out[i] += in[i+2*j] * fwd->taps[j];
}
}
#elif 0
// An unrolled loop, but this time, the input data is cast to complex float.
// Makes indices more natural. For debugging.
const complexf* in = reinterpret_cast(dataIn->getData());
complexf* out = reinterpret_cast(dataOut->getData());
size_t sizeIn = dataIn->getLength() / sizeof(complexf);
for (i = 0; i < sizeIn - fwd->n_taps; i += 4) {
out[i] = 0.0;
out[i+1] = 0.0;
out[i+2] = 0.0;
out[i+3] = 0.0;
for (int j = 0; j < fwd->n_taps; j++) {
out[i] += in[i+j ] * fwd->taps[j];
out[i+1] += in[i+1+j] * fwd->taps[j];
out[i+2] += in[i+2+j] * fwd->taps[j];
out[i+3] += in[i+3+j] * fwd->taps[j];
}
}
for (; i < sizeIn; i++) {
out[i] = 0.0;
for (int j = 0; j+i < sizeIn; j++) {
out[i] += in[i+j] * fwd->taps[j];
}
}
#elif 0
// Simple implementation. Slow. For debugging.
const complexf* in = reinterpret_cast(dataIn->getData());
complexf* out = reinterpret_cast(dataOut->getData());
size_t sizeIn = dataIn->getLength() / sizeof(complexf);
for (i = 0; i < sizeIn - fwd->n_taps; i += 1) {
out[i] = 0.0;
for (int j = 0; j < fwd->n_taps; j++) {
out[i] += in[i+j ] * fwd->taps[j];
}
}
for (; i < sizeIn; i++) {
out[i] = 0.0;
for (int j = 0; j+i < sizeIn; j++) {
out[i] += in[i+j] * fwd->taps[j];
}
}
#endif
calculationTime += (time_end.tv_sec - time_start.tv_sec) * 1000000000L +
time_end.tv_nsec - time_start.tv_nsec;
fwd->output_queue.push(dataOut);
delete dataIn;
}
}
FIRFilter::FIRFilter(char* taps_file) :
ModCodec(ModFormat(sizeof(complexf)), ModFormat(sizeof(complexf)))
{
PDEBUG("FIRFilter::FIRFilter(%s) @ %p\n",
taps_file, this);
number_of_runs = 0;
std::ifstream taps_fstream(taps_file);
if(!taps_fstream) {
fprintf(stderr, "FIRFilter: file %s could not be opened !\n", taps_file);
throw std::runtime_error("FIRFilter: Could not open file with taps! ");
}
int n_taps;
taps_fstream >> n_taps;
my_Ntaps = n_taps;
fprintf(stderr, "FIRFilter: Reading %d taps...\n", my_Ntaps);
myFilter = new float[my_Ntaps];
int n;
for (n = 0; n < n_taps; n++) {
taps_fstream >> myFilter[n];
PDEBUG("FIRFilter: tap: %f\n", myFilter[n] );
if (taps_fstream.eof()) {
fprintf(stderr, "FIRFilter: file %s should contains %d taps, but EOF reached "\
"after %d taps !\n", taps_file, n_taps, n);
throw std::runtime_error("FIRFilter: filtertaps file invalid ! ");
}
}
firwd.taps = myFilter;
firwd.n_taps = my_Ntaps;
PDEBUG("FIRFilter: Starting worker\n" );
worker.start(&firwd);
}
FIRFilter::~FIRFilter()
{
PDEBUG("FIRFilter::~FIRFilter() @ %p\n", this);
worker.stop();
if (myFilter != NULL) {
delete[] myFilter;
}
}
int FIRFilter::process(Buffer* const dataIn, Buffer* dataOut)
{
PDEBUG("FIRFilter::process(dataIn: %p, dataOut: %p)\n",
dataIn, dataOut);
// This thread creates the dataIn buffer, and deletes
// the outgoing buffer
Buffer* inbuffer = new Buffer(dataIn->getLength(), dataIn->getData());
firwd.input_queue.push(inbuffer);
if (number_of_runs > 2) {
Buffer* outbuffer;
firwd.output_queue.wait_and_pop(outbuffer);
dataOut->setData(outbuffer->getData(), outbuffer->getLength());
delete outbuffer;
}
else {
dataOut->setLength(dataIn->getLength());
memset(dataOut->getData(), 0, dataOut->getLength());
number_of_runs++;
}
return dataOut->getLength();
}