aboutsummaryrefslogtreecommitdiffstats
path: root/src/FIRFilter.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/FIRFilter.cpp')
-rw-r--r--src/FIRFilter.cpp311
1 files changed, 311 insertions, 0 deletions
diff --git a/src/FIRFilter.cpp b/src/FIRFilter.cpp
new file mode 100644
index 0000000..2ba4294
--- /dev/null
+++ b/src/FIRFilter.cpp
@@ -0,0 +1,311 @@
+/*
+ Copyright (C) 2007, 2008, 2009, 2010, 2011 Her Majesty the Queen in
+ Right of Canada (Communications Research Center Canada)
+
+ Written by
+ 2012, Matthias P. Braendli, matthias.braendli@mpb.li
+
+ This block implements a FIR filter. The real filter taps are given
+ as floats, and the block can take advantage of SSE.
+ For better performance, filtering is done in another thread, leading
+ to a pipeline delay of two calls to FIRFilter::process
+ */
+/*
+ This file is part of CRC-DADMOD.
+
+ CRC-DADMOD is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ CRC-DADMOD is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with CRC-DADMOD. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "FIRFilter.h"
+#include "PcDebug.h"
+
+#include <stdio.h>
+#include <stdexcept>
+
+#include <iostream>
+#include <fstream>
+
+#ifdef __SSE__
+# include <xmmintrin.h>
+#endif
+
+
+#include <sys/time.h>
+
+void FIRFilterWorker::process(struct FIRFilterWorkerData *fwd)
+{
+ size_t i;
+ struct timespec time_start;
+ struct timespec time_end;
+
+ // This thread creates the dataOut buffer, and deletes
+ // the incoming buffer
+
+ while(running) {
+ Buffer* dataIn;
+ fwd->input_queue.wait_and_pop(dataIn);
+
+ Buffer* dataOut;
+ dataOut = new Buffer();
+ dataOut->setLength(dataIn->getLength());
+
+ PDEBUG("FIRFilterWorker: dataIn->getLength() %d\n", dataIn->getLength());
+
+#if __SSE__
+ // The SSE accelerated version cannot work on the complex values,
+ // it is necessary to do the convolution on the real and imaginary
+ // parts separately. Thankfully, the taps are real, simplifying the
+ // procedure.
+
+ const float* in = reinterpret_cast<const float*>(dataIn->getData());
+ float* out = reinterpret_cast<float*>(dataOut->getData());
+ size_t sizeIn = dataIn->getLength() / sizeof(float);
+
+ if ((uintptr_t)(&out[0]) % 16 != 0) {
+ fprintf(stderr, "FIRFilterWorker: out not aligned %p ", out);
+ throw std::runtime_error("FIRFilterWorker: out not aligned");
+ }
+
+ clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_start);
+
+ __m128 SSEout;
+ __m128 SSEtaps;
+ __m128 SSEin;
+ for (i = 0; i < sizeIn - 2*fwd->n_taps; i += 4) {
+ SSEout = _mm_setr_ps(0,0,0,0);
+
+ for (int j = 0; j < fwd->n_taps; j++) {
+ if ((uintptr_t)(&in[i+2*j]) % 16 == 0) {
+ SSEin = _mm_load_ps(&in[i+2*j]); //faster when aligned
+ }
+ else {
+ SSEin = _mm_loadu_ps(&in[i+2*j]);
+ }
+
+ SSEtaps = _mm_load1_ps(&fwd->taps[j]);
+
+ SSEout = _mm_add_ps(SSEout, _mm_mul_ps(SSEin, SSEtaps));
+ }
+ _mm_store_ps(&out[i], SSEout);
+ }
+
+ for (; i < sizeIn; i++) {
+ out[i] = 0.0;
+ for (int j = 0; i+2*j < sizeIn; j++) {
+ out[i] += in[i+2*j] * fwd->taps[j];
+ }
+ }
+ clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_end);
+
+#else
+ // No SSE ? Loop unrolling should make this faster. As for the SSE,
+ // the real and imaginary parts are calculated separately.
+ const float* in = reinterpret_cast<const float*>(dataIn->getData());
+ float* out = reinterpret_cast<float*>(dataOut->getData());
+ size_t sizeIn = dataIn->getLength() / sizeof(float);
+
+ clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_start);
+
+ // Convolve by aligning both frame and taps at zero.
+ for (i = 0; i < sizeIn - 2*fwd->n_taps; i += 4) {
+ out[i] = 0.0;
+ out[i+1] = 0.0;
+ out[i+2] = 0.0;
+ out[i+3] = 0.0;
+
+ for (int j = 0; j < fwd->n_taps; j++) {
+ out[i] += in[i + 2*j] * fwd->taps[j];
+ out[i+1] += in[i+1 + 2*j] * fwd->taps[j];
+ out[i+2] += in[i+2 + 2*j] * fwd->taps[j];
+ out[i+3] += in[i+3 + 2*j] * fwd->taps[j];
+ }
+ }
+
+ // At the end of the frame, we cut the convolution off.
+ // The beginning of the next frame starts with a NULL symbol
+ // anyway.
+ for (; i < sizeIn; i++) {
+ out[i] = 0.0;
+ for (int j = 0; i+2*j < sizeIn; j++) {
+ out[i] += in[i+2*j] * fwd->taps[j];
+ }
+ }
+
+ clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_end);
+
+
+#endif
+
+ // The following implementations are for debugging only.
+#if 0
+ // Same thing as above, without loop unrolling. For debugging.
+ const float* in = reinterpret_cast<const float*>(dataIn->getData());
+ float* out = reinterpret_cast<float*>(dataOut->getData());
+ size_t sizeIn = dataIn->getLength() / sizeof(float);
+
+ for (i = 0; i < sizeIn - 2*fwd->n_taps; i += 1) {
+ out[i] = 0.0;
+
+ for (int j = 0; j < fwd->n_taps; j++) {
+ out[i] += in[i+2*j] * fwd->taps[j];
+ }
+ }
+
+ for (; i < sizeIn; i++) {
+ out[i] = 0.0;
+ for (int j = 0; i+2*j < sizeIn; j++) {
+ out[i] += in[i+2*j] * fwd->taps[j];
+ }
+ }
+
+#elif 0
+ // An unrolled loop, but this time, the input data is cast to complex float.
+ // Makes indices more natural. For debugging.
+ const complexf* in = reinterpret_cast<const complexf*>(dataIn->getData());
+ complexf* out = reinterpret_cast<complexf*>(dataOut->getData());
+ size_t sizeIn = dataIn->getLength() / sizeof(complexf);
+
+ for (i = 0; i < sizeIn - fwd->n_taps; i += 4) {
+ out[i] = 0.0;
+ out[i+1] = 0.0;
+ out[i+2] = 0.0;
+ out[i+3] = 0.0;
+
+ for (int j = 0; j < fwd->n_taps; j++) {
+ out[i] += in[i+j ] * fwd->taps[j];
+ out[i+1] += in[i+1+j] * fwd->taps[j];
+ out[i+2] += in[i+2+j] * fwd->taps[j];
+ out[i+3] += in[i+3+j] * fwd->taps[j];
+ }
+ }
+
+ for (; i < sizeIn; i++) {
+ out[i] = 0.0;
+ for (int j = 0; j+i < sizeIn; j++) {
+ out[i] += in[i+j] * fwd->taps[j];
+ }
+ }
+
+#elif 0
+ // Simple implementation. Slow. For debugging.
+ const complexf* in = reinterpret_cast<const complexf*>(dataIn->getData());
+ complexf* out = reinterpret_cast<complexf*>(dataOut->getData());
+ size_t sizeIn = dataIn->getLength() / sizeof(complexf);
+
+ for (i = 0; i < sizeIn - fwd->n_taps; i += 1) {
+ out[i] = 0.0;
+
+ for (int j = 0; j < fwd->n_taps; j++) {
+ out[i] += in[i+j ] * fwd->taps[j];
+ }
+ }
+
+ for (; i < sizeIn; i++) {
+ out[i] = 0.0;
+ for (int j = 0; j+i < sizeIn; j++) {
+ out[i] += in[i+j] * fwd->taps[j];
+ }
+ }
+#endif
+
+ calculationTime += (time_end.tv_sec - time_start.tv_sec) * 1000000000L +
+ time_end.tv_nsec - time_start.tv_nsec;
+ fwd->output_queue.push(dataOut);
+ delete dataIn;
+ }
+}
+
+
+FIRFilter::FIRFilter(char* taps_file) :
+ ModCodec(ModFormat(sizeof(complexf)), ModFormat(sizeof(complexf)))
+{
+ PDEBUG("FIRFilter::FIRFilter(%s) @ %p\n",
+ taps_file, this);
+
+ number_of_runs = 0;
+
+ std::ifstream taps_fstream(taps_file);
+ if(!taps_fstream) {
+ fprintf(stderr, "FIRFilter: file %s could not be opened !\n", taps_file);
+ throw std::runtime_error("FIRFilter: Could not open file with taps! ");
+ }
+ int n_taps;
+ taps_fstream >> n_taps;
+
+ my_Ntaps = n_taps;
+
+ fprintf(stderr, "FIRFilter: Reading %d taps...\n", my_Ntaps);
+
+ myFilter = new float[my_Ntaps];
+
+ int n;
+ for (n = 0; n < n_taps; n++) {
+ taps_fstream >> myFilter[n];
+ PDEBUG("FIRFilter: tap: %f\n", myFilter[n] );
+ if (taps_fstream.eof()) {
+ fprintf(stderr, "FIRFilter: file %s should contains %d taps, but EOF reached "\
+ "after %d taps !\n", taps_file, n_taps, n);
+ throw std::runtime_error("FIRFilter: filtertaps file invalid ! ");
+ }
+ }
+
+ firwd.taps = myFilter;
+ firwd.n_taps = my_Ntaps;
+
+ PDEBUG("FIRFilter: Starting worker\n" );
+ worker.start(&firwd);
+}
+
+
+FIRFilter::~FIRFilter()
+{
+ PDEBUG("FIRFilter::~FIRFilter() @ %p\n", this);
+
+ worker.stop();
+
+ if (myFilter != NULL) {
+ delete[] myFilter;
+ }
+}
+
+
+int FIRFilter::process(Buffer* const dataIn, Buffer* dataOut)
+{
+ PDEBUG("FIRFilter::process(dataIn: %p, dataOut: %p)\n",
+ dataIn, dataOut);
+
+ // This thread creates the dataIn buffer, and deletes
+ // the outgoing buffer
+
+ Buffer* inbuffer = new Buffer(dataIn->getLength(), dataIn->getData());
+
+ firwd.input_queue.push(inbuffer);
+
+ if (number_of_runs > 2) {
+ Buffer* outbuffer;
+ firwd.output_queue.wait_and_pop(outbuffer);
+
+ dataOut->setData(outbuffer->getData(), outbuffer->getLength());
+
+ delete outbuffer;
+ }
+ else {
+ dataOut->setLength(dataIn->getLength());
+ memset(dataOut->getData(), 0, dataOut->getLength());
+ number_of_runs++;
+ }
+
+ return dataOut->getLength();
+
+}