diff options
author | Matthias (think) <matthias@mpb.li> | 2012-07-11 12:00:56 +0200 |
---|---|---|
committer | Matthias (think) <matthias@mpb.li> | 2012-07-11 12:00:56 +0200 |
commit | 4b814d92fe9787bf72ed3a9632e0866f4cedd27f (patch) | |
tree | feb407e9f2c0658dda896c332864fe6d265aaece /src/FIRFilter.cpp | |
parent | e92f9c408634810828e75d4ad6da408e1c142195 (diff) | |
download | dabmod-4b814d92fe9787bf72ed3a9632e0866f4cedd27f.tar.gz dabmod-4b814d92fe9787bf72ed3a9632e0866f4cedd27f.tar.bz2 dabmod-4b814d92fe9787bf72ed3a9632e0866f4cedd27f.zip |
added crc-dabmod patch
Diffstat (limited to 'src/FIRFilter.cpp')
-rw-r--r-- | src/FIRFilter.cpp | 311 |
1 files changed, 311 insertions, 0 deletions
diff --git a/src/FIRFilter.cpp b/src/FIRFilter.cpp new file mode 100644 index 0000000..2ba4294 --- /dev/null +++ b/src/FIRFilter.cpp @@ -0,0 +1,311 @@ +/* + Copyright (C) 2007, 2008, 2009, 2010, 2011 Her Majesty the Queen in + Right of Canada (Communications Research Center Canada) + + Written by + 2012, Matthias P. Braendli, matthias.braendli@mpb.li + + This block implements a FIR filter. The real filter taps are given + as floats, and the block can take advantage of SSE. + For better performance, filtering is done in another thread, leading + to a pipeline delay of two calls to FIRFilter::process + */ +/* + This file is part of CRC-DADMOD. + + CRC-DADMOD is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as + published by the Free Software Foundation, either version 3 of the + License, or (at your option) any later version. + + CRC-DADMOD is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with CRC-DADMOD. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "FIRFilter.h" +#include "PcDebug.h" + +#include <stdio.h> +#include <stdexcept> + +#include <iostream> +#include <fstream> + +#ifdef __SSE__ +# include <xmmintrin.h> +#endif + + +#include <sys/time.h> + +void FIRFilterWorker::process(struct FIRFilterWorkerData *fwd) +{ + size_t i; + struct timespec time_start; + struct timespec time_end; + + // This thread creates the dataOut buffer, and deletes + // the incoming buffer + + while(running) { + Buffer* dataIn; + fwd->input_queue.wait_and_pop(dataIn); + + Buffer* dataOut; + dataOut = new Buffer(); + dataOut->setLength(dataIn->getLength()); + + PDEBUG("FIRFilterWorker: dataIn->getLength() %d\n", dataIn->getLength()); + +#if __SSE__ + // The SSE accelerated version cannot work on the complex values, + // it is necessary to do the convolution on the real and imaginary + // parts separately. Thankfully, the taps are real, simplifying the + // procedure. + + const float* in = reinterpret_cast<const float*>(dataIn->getData()); + float* out = reinterpret_cast<float*>(dataOut->getData()); + size_t sizeIn = dataIn->getLength() / sizeof(float); + + if ((uintptr_t)(&out[0]) % 16 != 0) { + fprintf(stderr, "FIRFilterWorker: out not aligned %p ", out); + throw std::runtime_error("FIRFilterWorker: out not aligned"); + } + + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_start); + + __m128 SSEout; + __m128 SSEtaps; + __m128 SSEin; + for (i = 0; i < sizeIn - 2*fwd->n_taps; i += 4) { + SSEout = _mm_setr_ps(0,0,0,0); + + for (int j = 0; j < fwd->n_taps; j++) { + if ((uintptr_t)(&in[i+2*j]) % 16 == 0) { + SSEin = _mm_load_ps(&in[i+2*j]); //faster when aligned + } + else { + SSEin = _mm_loadu_ps(&in[i+2*j]); + } + + SSEtaps = _mm_load1_ps(&fwd->taps[j]); + + SSEout = _mm_add_ps(SSEout, _mm_mul_ps(SSEin, SSEtaps)); + } + _mm_store_ps(&out[i], SSEout); + } + + for (; i < sizeIn; i++) { + out[i] = 0.0; + for (int j = 0; i+2*j < sizeIn; j++) { + out[i] += in[i+2*j] * fwd->taps[j]; + } + } + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_end); + +#else + // No SSE ? Loop unrolling should make this faster. As for the SSE, + // the real and imaginary parts are calculated separately. + const float* in = reinterpret_cast<const float*>(dataIn->getData()); + float* out = reinterpret_cast<float*>(dataOut->getData()); + size_t sizeIn = dataIn->getLength() / sizeof(float); + + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_start); + + // Convolve by aligning both frame and taps at zero. + for (i = 0; i < sizeIn - 2*fwd->n_taps; i += 4) { + out[i] = 0.0; + out[i+1] = 0.0; + out[i+2] = 0.0; + out[i+3] = 0.0; + + for (int j = 0; j < fwd->n_taps; j++) { + out[i] += in[i + 2*j] * fwd->taps[j]; + out[i+1] += in[i+1 + 2*j] * fwd->taps[j]; + out[i+2] += in[i+2 + 2*j] * fwd->taps[j]; + out[i+3] += in[i+3 + 2*j] * fwd->taps[j]; + } + } + + // At the end of the frame, we cut the convolution off. + // The beginning of the next frame starts with a NULL symbol + // anyway. + for (; i < sizeIn; i++) { + out[i] = 0.0; + for (int j = 0; i+2*j < sizeIn; j++) { + out[i] += in[i+2*j] * fwd->taps[j]; + } + } + + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_end); + + +#endif + + // The following implementations are for debugging only. +#if 0 + // Same thing as above, without loop unrolling. For debugging. + const float* in = reinterpret_cast<const float*>(dataIn->getData()); + float* out = reinterpret_cast<float*>(dataOut->getData()); + size_t sizeIn = dataIn->getLength() / sizeof(float); + + for (i = 0; i < sizeIn - 2*fwd->n_taps; i += 1) { + out[i] = 0.0; + + for (int j = 0; j < fwd->n_taps; j++) { + out[i] += in[i+2*j] * fwd->taps[j]; + } + } + + for (; i < sizeIn; i++) { + out[i] = 0.0; + for (int j = 0; i+2*j < sizeIn; j++) { + out[i] += in[i+2*j] * fwd->taps[j]; + } + } + +#elif 0 + // An unrolled loop, but this time, the input data is cast to complex float. + // Makes indices more natural. For debugging. + const complexf* in = reinterpret_cast<const complexf*>(dataIn->getData()); + complexf* out = reinterpret_cast<complexf*>(dataOut->getData()); + size_t sizeIn = dataIn->getLength() / sizeof(complexf); + + for (i = 0; i < sizeIn - fwd->n_taps; i += 4) { + out[i] = 0.0; + out[i+1] = 0.0; + out[i+2] = 0.0; + out[i+3] = 0.0; + + for (int j = 0; j < fwd->n_taps; j++) { + out[i] += in[i+j ] * fwd->taps[j]; + out[i+1] += in[i+1+j] * fwd->taps[j]; + out[i+2] += in[i+2+j] * fwd->taps[j]; + out[i+3] += in[i+3+j] * fwd->taps[j]; + } + } + + for (; i < sizeIn; i++) { + out[i] = 0.0; + for (int j = 0; j+i < sizeIn; j++) { + out[i] += in[i+j] * fwd->taps[j]; + } + } + +#elif 0 + // Simple implementation. Slow. For debugging. + const complexf* in = reinterpret_cast<const complexf*>(dataIn->getData()); + complexf* out = reinterpret_cast<complexf*>(dataOut->getData()); + size_t sizeIn = dataIn->getLength() / sizeof(complexf); + + for (i = 0; i < sizeIn - fwd->n_taps; i += 1) { + out[i] = 0.0; + + for (int j = 0; j < fwd->n_taps; j++) { + out[i] += in[i+j ] * fwd->taps[j]; + } + } + + for (; i < sizeIn; i++) { + out[i] = 0.0; + for (int j = 0; j+i < sizeIn; j++) { + out[i] += in[i+j] * fwd->taps[j]; + } + } +#endif + + calculationTime += (time_end.tv_sec - time_start.tv_sec) * 1000000000L + + time_end.tv_nsec - time_start.tv_nsec; + fwd->output_queue.push(dataOut); + delete dataIn; + } +} + + +FIRFilter::FIRFilter(char* taps_file) : + ModCodec(ModFormat(sizeof(complexf)), ModFormat(sizeof(complexf))) +{ + PDEBUG("FIRFilter::FIRFilter(%s) @ %p\n", + taps_file, this); + + number_of_runs = 0; + + std::ifstream taps_fstream(taps_file); + if(!taps_fstream) { + fprintf(stderr, "FIRFilter: file %s could not be opened !\n", taps_file); + throw std::runtime_error("FIRFilter: Could not open file with taps! "); + } + int n_taps; + taps_fstream >> n_taps; + + my_Ntaps = n_taps; + + fprintf(stderr, "FIRFilter: Reading %d taps...\n", my_Ntaps); + + myFilter = new float[my_Ntaps]; + + int n; + for (n = 0; n < n_taps; n++) { + taps_fstream >> myFilter[n]; + PDEBUG("FIRFilter: tap: %f\n", myFilter[n] ); + if (taps_fstream.eof()) { + fprintf(stderr, "FIRFilter: file %s should contains %d taps, but EOF reached "\ + "after %d taps !\n", taps_file, n_taps, n); + throw std::runtime_error("FIRFilter: filtertaps file invalid ! "); + } + } + + firwd.taps = myFilter; + firwd.n_taps = my_Ntaps; + + PDEBUG("FIRFilter: Starting worker\n" ); + worker.start(&firwd); +} + + +FIRFilter::~FIRFilter() +{ + PDEBUG("FIRFilter::~FIRFilter() @ %p\n", this); + + worker.stop(); + + if (myFilter != NULL) { + delete[] myFilter; + } +} + + +int FIRFilter::process(Buffer* const dataIn, Buffer* dataOut) +{ + PDEBUG("FIRFilter::process(dataIn: %p, dataOut: %p)\n", + dataIn, dataOut); + + // This thread creates the dataIn buffer, and deletes + // the outgoing buffer + + Buffer* inbuffer = new Buffer(dataIn->getLength(), dataIn->getData()); + + firwd.input_queue.push(inbuffer); + + if (number_of_runs > 2) { + Buffer* outbuffer; + firwd.output_queue.wait_and_pop(outbuffer); + + dataOut->setData(outbuffer->getData(), outbuffer->getLength()); + + delete outbuffer; + } + else { + dataOut->setLength(dataIn->getLength()); + memset(dataOut->getData(), 0, dataOut->getLength()); + number_of_runs++; + } + + return dataOut->getLength(); + +} |