/*
   Copyright (C) 2007, 2008, 2009, 2010, 2011 Her Majesty the Queen in
   Right of Canada (Communications Research Center Canada)

   Copyright (C) 2017
   Matthias P. Braendli, matthias.braendli@mpb.li

    http://opendigitalradio.org

   This block implements a FIR filter. The real filter taps are given
   as floats, and the block can take advantage of SSE.
   For better performance, filtering is done in another thread, leading
   to a pipeline delay of two calls to FIRFilter::process
 */
/*
   This file is part of ODR-DabMod.

   ODR-DabMod is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as
   published by the Free Software Foundation, either version 3 of the
   License, or (at your option) any later version.

   ODR-DabMod is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with ODR-DabMod.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "FIRFilter.h"
#include "PcDebug.h"
#include "Utils.h"

#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include <stdexcept>

#include <array>
#include <iostream>
#include <fstream>
#include <memory>
#include <mutex>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#ifdef __SSE__
#   include <xmmintrin.h>
#endif

using namespace std;

#include <sys/time.h>

/* This is the FIR Filter calculated with the doc/fir-filter/generate-filter.py script
 * with settings
 *   gain = 1
 *   sampling_freq = 2.048e6
 *   cutoff = 810e3
 *   transition_width = 250e3
 *
 * It is a good default filter for the common scenarios.
 */
static const std::array<float, 45> default_filter_taps({
        -0.00110450468492, 0.00120703084394, -0.000840645749122, -0.000187368263141,
        0.00184351124335, -0.00355578539893, 0.00419321097434, -0.00254214904271,
        -0.00183473504148, 0.00781436730176, -0.0125957569107, 0.0126200336963,
        -0.00537294941023, -0.00866683479398, 0.0249746385962, -0.0356550291181,
        0.0319730602205, -0.00795613788068, -0.0363943465054, 0.0938014090061,
        -0.151176810265, 0.193567320704, 0.791776955128, 0.193567320704,
        -0.151176810265, 0.0938014090061, -0.0363943465054, -0.00795613788068,
        0.0319730602205, -0.0356550291181, 0.0249746385962, -0.00866683479398,
        -0.00537294941023, 0.0126200336963, -0.0125957569107, 0.00781436730176,
        -0.00183473504148, -0.00254214904271, 0.00419321097434, -0.00355578539893,
        0.00184351124335, -0.000187368263141, -0.000840645749122, 0.00120703084394,
        -0.00110450468492});


/* Construct the filter, register its remote-control parameters and load the
 * initial taps.
 *
 * taps_file is either the literal "default" (use default_filter_taps) or the
 * path of a taps file, see load_filter_taps().
 * Throws std::runtime_error if the taps cannot be loaded.
 */
FIRFilter::FIRFilter(const std::string& taps_file) :
    PipelinedModCodec(),
    RemoteControllable("firfilter"),
    m_taps_file(taps_file)
{
    PDEBUG("FIRFilter::FIRFilter(%s) @ %p\n",
            taps_file.c_str(), this);

    RC_ADD_PARAMETER(ntaps, "(Read-only) number of filter taps.");
    RC_ADD_PARAMETER(tapsfile, "Filename containing filter taps. When written to, the new file gets automatically loaded.");

    load_filter_taps(m_taps_file);
}

/* Load a new set of taps and install it as the active filter.
 *
 * tapsFile == "default" selects the built-in default_filter_taps. Otherwise
 * tapsFile names a text file containing the number of taps followed by that
 * many whitespace-separated float values.
 *
 * The taps are parsed into a local vector first and only swapped into m_taps
 * (under m_taps_mutex) once the whole file was read successfully, so a bad
 * file leaves the currently active taps untouched.
 *
 * Throws std::runtime_error if the file cannot be opened, the tap count is
 * missing or not positive, or fewer valid taps than announced can be read.
 */
void FIRFilter::load_filter_taps(const std::string &tapsFile)
{
    std::vector<float> filter_taps;
    if (tapsFile == "default") {
        filter_taps.assign(default_filter_taps.begin(),
                default_filter_taps.end());
    }
    else {
        std::ifstream taps_fstream(tapsFile.c_str());
        if (!taps_fstream) {
            fprintf(stderr, "FIRFilter: file %s could not be opened !\n", tapsFile.c_str());
            throw std::runtime_error("FIRFilter: Could not open file with taps! ");
        }

        // Stays 0 if the extraction fails, which is rejected just below.
        int n_taps = 0;
        taps_fstream >> n_taps;

        if (n_taps <= 0) {
            fprintf(stderr, "FIRFilter: warning: taps file has invalid format\n");
            throw std::runtime_error("FIRFilter: taps file has invalid format.");
        }

        if (n_taps > 100) {
            fprintf(stderr, "FIRFilter: warning: taps file has more than 100 taps\n");
        }

        fprintf(stderr, "FIRFilter: Reading %d taps...\n", n_taps);

        filter_taps.resize(n_taps);

        for (int n = 0; n < n_taps; n++) {
            // Test the extraction itself rather than eof(): eofbit is also
            // set when the last tap is read successfully from a file without
            // trailing whitespace, and it is NOT set when a token cannot be
            // parsed as a number.
            if (!(taps_fstream >> filter_taps[n])) {
                fprintf(stderr, "FIRFilter: file %s should contains %d taps, but EOF reached "
                        "after %d taps !\n", tapsFile.c_str(), n_taps, n);
                throw std::runtime_error("FIRFilter: filtertaps file invalid ! ");
            }

            PDEBUG("FIRFilter: tap: %f\n", filter_taps[n]);
        }
    }

    {
        std::lock_guard<std::mutex> lock(m_taps_mutex);
        m_taps = std::move(filter_taps);
    }
}


/* Filter one frame from dataIn into dataOut and return dataOut->getLength().
 *
 * The buffers are accessed as flat float arrays. Because the taps are real,
 * the same taps are applied with a stride of two floats, i.e.
 *     out[i] = sum_j in[i + 2*j] * taps[j]
 * Output samples for which the filter would reach past the end of the frame
 * use a truncated sum.
 *
 * m_taps_mutex is held for the duration of the convolution so that the taps
 * cannot be replaced by load_filter_taps() half-way through a frame.
 *
 * Throws std::runtime_error (SSE build only) if the output buffer is not
 * 16-byte aligned.
 */
int FIRFilter::internal_process(Buffer* const dataIn, Buffer* dataOut)
{
    size_t i;

#if __SSE__
    // The SSE accelerated version cannot work on the complex values,
    // it is necessary to do the convolution on the real and imaginary
    // parts separately. Thankfully, the taps are real, simplifying the
    // procedure.

    const float* in = reinterpret_cast<const float*>(dataIn->getData());
    float* out      = reinterpret_cast<float*>(dataOut->getData());
    size_t sizeIn   = dataIn->getLength() / sizeof(float);

    // _mm_store_ps below requires a 16-byte aligned destination.
    if (reinterpret_cast<uintptr_t>(&out[0]) % 16 != 0) {
        fprintf(stderr, "FIRFilterWorker: out not aligned %p ",
                static_cast<void*>(out));
        throw std::runtime_error("FIRFilterWorker: out not aligned");
    }

    __m128 SSEout;
    __m128 SSEtaps;
    __m128 SSEin;
    {
        std::lock_guard<std::mutex> lock(m_taps_mutex);

        const size_t n_taps = m_taps.size();
        // Number of leading samples for which the complete filter fits into
        // the frame. Clamped to zero: the unsigned subtraction would wrap
        // around for a frame shorter than the filter span and send the loop
        // far out of bounds.
        // NOTE(review): like the original bound, this relies on sizeIn being
        // even (interleaved real/imaginary floats) -- confirm with callers.
        const size_t full_len =
            (sizeIn > 2*n_taps) ? (sizeIn - 2*n_taps) : 0;

        for (i = 0; i < full_len; i += 4) {
            SSEout = _mm_setr_ps(0,0,0,0);

            for (size_t j = 0; j < n_taps; j++) {
                if (reinterpret_cast<uintptr_t>(&in[i+2*j]) % 16 == 0) {
                    SSEin = _mm_load_ps(&in[i+2*j]); //faster when aligned
                }
                else {
                    SSEin = _mm_loadu_ps(&in[i+2*j]);
                }

                SSEtaps = _mm_load1_ps(&m_taps[j]);

                SSEout = _mm_add_ps(SSEout, _mm_mul_ps(SSEin, SSEtaps));
            }
            _mm_store_ps(&out[i], SSEout);
        }

        // Remaining samples: truncated convolution. j is bounded by the tap
        // count as well, which matters when the frame is shorter than the
        // filter span and the loop above did not run.
        for (; i < sizeIn; i++) {
            out[i] = 0.0;
            for (size_t j = 0; j < n_taps && i+2*j < sizeIn; j++) {
                out[i] += in[i+2*j] * m_taps[j];
            }
        }
    }

#else
    // No SSE ? Loop unrolling should make this faster. As for the SSE,
    // the real and imaginary parts are calculated separately.
    const float* in = reinterpret_cast<const float*>(dataIn->getData());
    float* out      = reinterpret_cast<float*>(dataOut->getData());
    size_t sizeIn   = dataIn->getLength() / sizeof(float);

    // NOTE(review): time_start and time_end are not declared in this file;
    // presumably they are FIRFilter members -- confirm, otherwise this
    // branch does not compile on non-SSE targets.
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_start);

    {
        std::lock_guard<std::mutex> lock(m_taps_mutex);

        const size_t n_taps = m_taps.size();
        // Clamped to zero to avoid unsigned wrap-around on short frames.
        const size_t full_len =
            (sizeIn > 2*n_taps) ? (sizeIn - 2*n_taps) : 0;

        // Convolve by aligning both frame and taps at zero.
        for (i = 0; i < full_len; i += 4) {
            out[i]    = 0.0;
            out[i+1]  = 0.0;
            out[i+2]  = 0.0;
            out[i+3]  = 0.0;

            for (size_t j = 0; j < n_taps; j++) {
                out[i]   += in[i   + 2*j] * m_taps[j];
                out[i+1] += in[i+1 + 2*j] * m_taps[j];
                out[i+2] += in[i+2 + 2*j] * m_taps[j];
                out[i+3] += in[i+3 + 2*j] * m_taps[j];
            }
        }

        // At the end of the frame, we cut the convolution off.
        // The beginning of the next frame starts with a NULL symbol
        // anyway.
        for (; i < sizeIn; i++) {
            out[i] = 0.0;
            for (size_t j = 0; j < n_taps && i+2*j < sizeIn; j++) {
                out[i] += in[i+2*j] * m_taps[j];
            }
        }
    }

    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_end);
#endif

    // The following implementations are for debugging only.
#if 0
    // Same thing as above, without loop unrolling. For debugging.
    const float* in = reinterpret_cast<const float*>(dataIn->getData());
    float* out      = reinterpret_cast<float*>(dataOut->getData());
    size_t sizeIn   = dataIn->getLength() / sizeof(float);

    std::lock_guard<std::mutex> lock(m_taps_mutex);
    const size_t n_taps = m_taps.size();
    const size_t full_len = (sizeIn > 2*n_taps) ? (sizeIn - 2*n_taps) : 0;

    for (i = 0; i < full_len; i += 1) {
        out[i]  = 0.0;

        for (size_t j = 0; j < n_taps; j++) {
            out[i]  += in[i+2*j] * m_taps[j];
        }
    }

    for (; i < sizeIn; i++) {
        out[i] = 0.0;
        for (size_t j = 0; j < n_taps && i+2*j < sizeIn; j++) {
            out[i] += in[i+2*j] * m_taps[j];
        }
    }

#elif 0
    // An unrolled loop, but this time, the input data is cast to complex float.
    // Makes indices more natural. For debugging.
    const complexf* in = reinterpret_cast<const complexf*>(dataIn->getData());
    complexf* out      = reinterpret_cast<complexf*>(dataOut->getData());
    size_t sizeIn      = dataIn->getLength() / sizeof(complexf);

    std::lock_guard<std::mutex> lock(m_taps_mutex);
    const size_t n_taps = m_taps.size();
    // The unrolled body touches in[i+3 + (n_taps-1)], so stop early enough
    // for that index to stay inside the frame.
    const size_t full_len = (sizeIn > n_taps + 2) ? (sizeIn - n_taps - 2) : 0;

    for (i = 0; i < full_len; i += 4) {
        out[i]   = 0.0;
        out[i+1] = 0.0;
        out[i+2] = 0.0;
        out[i+3] = 0.0;

        for (size_t j = 0; j < n_taps; j++) {
            out[i]   += in[i+j  ] * m_taps[j];
            out[i+1] += in[i+1+j] * m_taps[j];
            out[i+2] += in[i+2+j] * m_taps[j];
            out[i+3] += in[i+3+j] * m_taps[j];
        }
    }

    for (; i < sizeIn; i++) {
        out[i] = 0.0;
        for (size_t j = 0; j < n_taps && j+i < sizeIn; j++) {
            out[i] += in[i+j] * m_taps[j];
        }
    }

#elif 0
    // Simple implementation. Slow. For debugging.
    const complexf* in = reinterpret_cast<const complexf*>(dataIn->getData());
    complexf* out      = reinterpret_cast<complexf*>(dataOut->getData());
    size_t sizeIn      = dataIn->getLength() / sizeof(complexf);

    std::lock_guard<std::mutex> lock(m_taps_mutex);
    const size_t n_taps = m_taps.size();
    const size_t full_len = (sizeIn > n_taps) ? (sizeIn - n_taps) : 0;

    for (i = 0; i < full_len; i += 1) {
        out[i] = 0.0;

        for (size_t j = 0; j < n_taps; j++) {
            out[i] += in[i+j] * m_taps[j];
        }
    }

    for (; i < sizeIn; i++) {
        out[i] = 0.0;
        for (size_t j = 0; j < n_taps && j+i < sizeIn; j++) {
            out[i] += in[i+j] * m_taps[j];
        }
    }
#endif

    return dataOut->getLength();
}

/* Remote-control setter.
 *
 *   "ntaps"    is read-only and always rejected.
 *   "tapsfile" loads the taps from the file named by value (or the built-in
 *              taps for "default"); m_taps_file is updated only if loading
 *              succeeded.
 *
 * Throws ParameterError for read-only or unknown parameters, and when the
 * taps cannot be loaded.
 */
void FIRFilter::set_parameter(const string& parameter, const string& value)
{
    if (parameter == "ntaps") {
        throw ParameterError("Parameter 'ntaps' is read-only");
    }
    else if (parameter == "tapsfile") {
        try {
            load_filter_taps(value);
            m_taps_file = value;
        }
        catch (const std::runtime_error &e) {
            // Report load failures through the remote-control error type.
            throw ParameterError(e.what());
        }
    }
    else {
        stringstream ss;
        ss << "Parameter '" << parameter <<
            "' is not exported by controllable " << get_rc_name();
        throw ParameterError(ss.str());
    }
}

/* Remote-control getter.
 *
 * Returns the number of active taps for "ntaps" and the current taps file
 * name for "tapsfile". Throws ParameterError for any other parameter.
 */
const string FIRFilter::get_parameter(const string& parameter) const
{
    stringstream ss;
    if (parameter == "ntaps") {
        // NOTE(review): m_taps is read here without holding m_taps_mutex,
        // while load_filter_taps() replaces it under that mutex -- confirm
        // whether the mutex is declared mutable and should be taken here.
        ss << m_taps.size();
    }
    else if (parameter == "tapsfile") {
        ss << m_taps_file;
    }
    else {
        ss << "Parameter '" << parameter <<
            "' is not exported by controllable " << get_rc_name();
        throw ParameterError(ss.str());
    }
    return ss.str();
}