/*
Copyright (C) 2007, 2008, 2009, 2010, 2011 Her Majesty the Queen in
Right of Canada (Communications Research Center Canada)
Written by
2012, Matthias P. Braendli, matthias.braendli@mpb.li
This block implements a FIR filter. The real filter taps are given
as floats, and the block can take advantage of SSE.
For better performance, filtering is done in another thread, leading
to a pipeline delay of two calls to FIRFilter::process
*/
/*
This file is part of ODR-DabMod.
ODR-DabMod is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
ODR-DabMod is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with ODR-DabMod. If not, see .
*/
#include "FIRFilter.h"
#include "PcDebug.h"
#include
#include
#include
#include
#include
#ifdef __AVX__
# include
#else
# ifdef __SSE__
# include
# endif
#endif
using namespace std;
#include
void FIRFilterWorker::process(struct FIRFilterWorkerData *fwd)
{
size_t i;
struct timespec time_start;
struct timespec time_end;
// This thread creates the dataOut buffer, and deletes
// the incoming buffer
while(running) {
std::shared_ptr dataIn;
fwd->input_queue.wait_and_pop(dataIn);
std::shared_ptr dataOut = make_shared();
dataOut->setLength(dataIn->getLength());
PDEBUG("FIRFilterWorker: dataIn->getLength() %zu\n", dataIn->getLength());
#if __AVX__
#define _mm256_load1_ps(x) _mm256_set_ps(x, x, x, x, x, x, x, x)
#warning FIRFilter uses experimental AVX code
// The AVX accelerated version cannot work on the complex values,
// it is necessary to do the convolution on the real and imaginary
// parts separately. Thankfully, the taps are real, simplifying the
// procedure.
//
// The AVX version is not enabled by default, because the performance
// on my test machine (sandy bridge i7) is slightly worse with AVX than
// with SSE. TODO: Try with Ivy Bridge or newer.
//
// Interesting links:
// http://software.intel.com/en-us/forums/topic/283753
const float* in = reinterpret_cast(dataIn->getData());
float* out = reinterpret_cast(dataOut->getData());
size_t sizeIn = dataIn->getLength() / sizeof(float);
if ((uintptr_t)(&out[0]) % 32 != 0) {
fprintf(stderr, "FIRFilterWorker: out not aligned %p ", out);
throw std::runtime_error("FIRFilterWorker: out not aligned");
}
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_start);
__m256 AVXout;
__m256 AVXtaps;
__m256 AVXin;
{
boost::mutex::scoped_lock lock(fwd->taps_mutex);
for (i = 0; i < sizeIn - 2*fwd->taps.size(); i += 8) {
AVXout = _mm256_setr_ps(0,0,0,0,0,0,0,0);
for (size_t j = 0; j < fwd->taps.size; j++) {
if ((uintptr_t)(&in[i+2*j]) % 32 == 0) {
AVXin = _mm256_load_ps(&in[i+2*j]); //faster when aligned
}
else {
AVXin = _mm256_loadu_ps(&in[i+2*j]);
}
AVXtaps = _mm256_load1_ps(fwd->taps[j]);
AVXout = _mm256_add_ps(AVXout, _mm256_mul_ps(AVXin, AVXtaps));
}
_mm256_store_ps(&out[i], AVXout);
}
for (; i < sizeIn; i++) {
out[i] = 0.0;
for (int j = 0; i+2*j < sizeIn; j++) {
out[i] += in[i+2*j] * fwd->taps[j];
}
}
}
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_end);
#elif __SSE__
// The SSE accelerated version cannot work on the complex values,
// it is necessary to do the convolution on the real and imaginary
// parts separately. Thankfully, the taps are real, simplifying the
// procedure.
const float* in = reinterpret_cast(dataIn->getData());
float* out = reinterpret_cast(dataOut->getData());
size_t sizeIn = dataIn->getLength() / sizeof(float);
if ((uintptr_t)(&out[0]) % 16 != 0) {
fprintf(stderr, "FIRFilterWorker: out not aligned %p ", out);
throw std::runtime_error("FIRFilterWorker: out not aligned");
}
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_start);
__m128 SSEout;
__m128 SSEtaps;
__m128 SSEin;
{
boost::mutex::scoped_lock lock(fwd->taps_mutex);
for (i = 0; i < sizeIn - 2*fwd->taps.size(); i += 4) {
SSEout = _mm_setr_ps(0,0,0,0);
for (size_t j = 0; j < fwd->taps.size(); j++) {
if ((uintptr_t)(&in[i+2*j]) % 16 == 0) {
SSEin = _mm_load_ps(&in[i+2*j]); //faster when aligned
}
else {
SSEin = _mm_loadu_ps(&in[i+2*j]);
}
SSEtaps = _mm_load1_ps(&fwd->taps[j]);
SSEout = _mm_add_ps(SSEout, _mm_mul_ps(SSEin, SSEtaps));
}
_mm_store_ps(&out[i], SSEout);
}
for (; i < sizeIn; i++) {
out[i] = 0.0;
for (int j = 0; i+2*j < sizeIn; j++) {
out[i] += in[i+2*j] * fwd->taps[j];
}
}
}
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_end);
#else
// No SSE ? Loop unrolling should make this faster. As for the SSE,
// the real and imaginary parts are calculated separately.
const float* in = reinterpret_cast(dataIn->getData());
float* out = reinterpret_cast(dataOut->getData());
size_t sizeIn = dataIn->getLength() / sizeof(float);
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_start);
{
boost::mutex::scoped_lock lock(fwd->taps_mutex);
// Convolve by aligning both frame and taps at zero.
for (i = 0; i < sizeIn - 2*fwd->taps.size(); i += 4) {
out[i] = 0.0;
out[i+1] = 0.0;
out[i+2] = 0.0;
out[i+3] = 0.0;
for (size_t j = 0; j < fwd->taps.size(); j++) {
out[i] += in[i + 2*j] * fwd->taps[j];
out[i+1] += in[i+1 + 2*j] * fwd->taps[j];
out[i+2] += in[i+2 + 2*j] * fwd->taps[j];
out[i+3] += in[i+3 + 2*j] * fwd->taps[j];
}
}
// At the end of the frame, we cut the convolution off.
// The beginning of the next frame starts with a NULL symbol
// anyway.
for (; i < sizeIn; i++) {
out[i] = 0.0;
for (int j = 0; i+2*j < sizeIn; j++) {
out[i] += in[i+2*j] * fwd->taps[j];
}
}
}
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_end);
#endif
// The following implementations are for debugging only.
#if 0
// Same thing as above, without loop unrolling. For debugging.
const float* in = reinterpret_cast(dataIn->getData());
float* out = reinterpret_cast(dataOut->getData());
size_t sizeIn = dataIn->getLength() / sizeof(float);
for (i = 0; i < sizeIn - 2*fwd->taps.size(); i += 1) {
out[i] = 0.0;
for (size_t j = 0; j < fwd->taps.size(); j++) {
out[i] += in[i+2*j] * fwd->taps[j];
}
}
for (; i < sizeIn; i++) {
out[i] = 0.0;
for (int j = 0; i+2*j < sizeIn; j++) {
out[i] += in[i+2*j] * fwd->taps[j];
}
}
#elif 0
// An unrolled loop, but this time, the input data is cast to complex float.
// Makes indices more natural. For debugging.
const complexf* in = reinterpret_cast(dataIn->getData());
complexf* out = reinterpret_cast(dataOut->getData());
size_t sizeIn = dataIn->getLength() / sizeof(complexf);
for (i = 0; i < sizeIn - fwd->taps.size(); i += 4) {
out[i] = 0.0;
out[i+1] = 0.0;
out[i+2] = 0.0;
out[i+3] = 0.0;
for (size_t j = 0; j < fwd->taps.size(); j++) {
out[i] += in[i+j ] * fwd->taps[j];
out[i+1] += in[i+1+j] * fwd->taps[j];
out[i+2] += in[i+2+j] * fwd->taps[j];
out[i+3] += in[i+3+j] * fwd->taps[j];
}
}
for (; i < sizeIn; i++) {
out[i] = 0.0;
for (int j = 0; j+i < sizeIn; j++) {
out[i] += in[i+j] * fwd->taps[j];
}
}
#elif 0
// Simple implementation. Slow. For debugging.
const complexf* in = reinterpret_cast(dataIn->getData());
complexf* out = reinterpret_cast(dataOut->getData());
size_t sizeIn = dataIn->getLength() / sizeof(complexf);
for (i = 0; i < sizeIn - fwd->taps.size(); i += 1) {
out[i] = 0.0;
for (size_t j = 0; j < fwd->taps.size(); j++) {
out[i] += in[i+j ] * fwd->taps[j];
}
}
for (; i < sizeIn; i++) {
out[i] = 0.0;
for (int j = 0; j+i < sizeIn; j++) {
out[i] += in[i+j] * fwd->taps[j];
}
}
#endif
calculationTime += (time_end.tv_sec - time_start.tv_sec) * 1000000000L +
time_end.tv_nsec - time_start.tv_nsec;
fwd->output_queue.push(dataOut);
}
}
FIRFilter::FIRFilter(std::string& taps_file) :
ModCodec(ModFormat(sizeof(complexf)), ModFormat(sizeof(complexf))),
RemoteControllable("firfilter"),
myTapsFile(taps_file)
{
PDEBUG("FIRFilter::FIRFilter(%s) @ %p\n",
taps_file.c_str(), this);
RC_ADD_PARAMETER(ntaps, "(Read-only) number of filter taps.");
RC_ADD_PARAMETER(tapsfile, "Filename containing filter taps. When written to, the new file gets automatically loaded.");
number_of_runs = 0;
load_filter_taps(myTapsFile);
#if __AVX__
fprintf(stderr, "FIRFilter: WARNING: using experimental AVX code !\n");
#endif
PDEBUG("FIRFilter: Starting worker\n" );
worker.start(&firwd);
}
void FIRFilter::load_filter_taps(std::string tapsFile)
{
std::ifstream taps_fstream(tapsFile.c_str());
if(!taps_fstream) {
fprintf(stderr, "FIRFilter: file %s could not be opened !\n", tapsFile.c_str());
throw std::runtime_error("FIRFilter: Could not open file with taps! ");
}
int n_taps;
taps_fstream >> n_taps;
if (n_taps <= 0) {
fprintf(stderr, "FIRFilter: warning: taps file has invalid format\n");
throw std::runtime_error("FIRFilter: taps file has invalid format.");
}
if (n_taps > 100) {
fprintf(stderr, "FIRFilter: warning: taps file has more than 100 taps\n");
}
fprintf(stderr, "FIRFilter: Reading %d taps...\n", n_taps);
std::vector filter_taps(n_taps);
int n;
for (n = 0; n < n_taps; n++) {
taps_fstream >> filter_taps[n];
PDEBUG("FIRFilter: tap: %f\n", filter_taps[n] );
if (taps_fstream.eof()) {
fprintf(stderr, "FIRFilter: file %s should contains %d taps, but EOF reached "\
"after %d taps !\n", tapsFile.c_str(), n_taps, n);
throw std::runtime_error("FIRFilter: filtertaps file invalid ! ");
}
}
{
boost::mutex::scoped_lock lock(firwd.taps_mutex);
firwd.taps = filter_taps;
}
}
FIRFilter::~FIRFilter()
{
PDEBUG("FIRFilter::~FIRFilter() @ %p\n", this);
worker.stop();
}
int FIRFilter::process(Buffer* const dataIn, Buffer* dataOut)
{
PDEBUG("FIRFilter::process(dataIn: %p, dataOut: %p)\n",
dataIn, dataOut);
// This thread creates the dataIn buffer, and deletes
// the outgoing buffer
std::shared_ptr inbuffer =
make_shared(dataIn->getLength(), dataIn->getData());
firwd.input_queue.push(inbuffer);
if (number_of_runs > 2) {
std::shared_ptr outbuffer;
firwd.output_queue.wait_and_pop(outbuffer);
dataOut->setData(outbuffer->getData(), outbuffer->getLength());
}
else {
dataOut->setLength(dataIn->getLength());
memset(dataOut->getData(), 0, dataOut->getLength());
number_of_runs++;
}
return dataOut->getLength();
}
void FIRFilter::set_parameter(const string& parameter, const string& value)
{
stringstream ss(value);
ss.exceptions ( stringstream::failbit | stringstream::badbit );
if (parameter == "ntaps") {
throw ParameterError("Parameter 'ntaps' is read-only");
}
else if (parameter == "tapsfile") {
try {
load_filter_taps(value);
myTapsFile = value;
}
catch (std::runtime_error &e) {
throw ParameterError(e.what());
}
}
else {
stringstream ss;
ss << "Parameter '" << parameter << "' is not exported by controllable " << get_rc_name();
throw ParameterError(ss.str());
}
}
const string FIRFilter::get_parameter(const string& parameter) const
{
stringstream ss;
if (parameter == "ntaps") {
ss << firwd.taps.size();
}
else if (parameter == "tapsfile") {
ss << myTapsFile;
}
else {
ss << "Parameter '" << parameter << "' is not exported by controllable " << get_rc_name();
throw ParameterError(ss.str());
}
return ss.str();
}