diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/FIRFilter.cpp | 74 |
1 files changed, 3 insertions, 71 deletions
diff --git a/src/FIRFilter.cpp b/src/FIRFilter.cpp index 77e2305..86f558b 100644 --- a/src/FIRFilter.cpp +++ b/src/FIRFilter.cpp @@ -38,12 +38,8 @@ #include <fstream> #include <memory> -#ifdef __AVX__ -# include <immintrin.h> -#else -# ifdef __SSE__ -# include <xmmintrin.h> -# endif +#ifdef __SSE__ +# include <xmmintrin.h> #endif using namespace std; @@ -71,67 +67,7 @@ void FIRFilterWorker::process(struct FIRFilterWorkerData *fwd) PDEBUG("FIRFilterWorker: dataIn->getLength() %zu\n", dataIn->getLength()); -#if __AVX__ -#define _mm256_load1_ps(x) _mm256_set_ps(x, x, x, x, x, x, x, x) -#warning FIRFilter uses experimental AVX code - - // The AVX accelerated version cannot work on the complex values, - // it is necessary to do the convolution on the real and imaginary - // parts separately. Thankfully, the taps are real, simplifying the - // procedure. - // - // The AVX version is not enabled by default, because the performance - // on my test machine (sandy bridge i7) is slightly worse with AVX than - // with SSE. TODO: Try with Ivy Bridge or newer. - // - // Interesting links: - // http://software.intel.com/en-us/forums/topic/283753 - - const float* in = reinterpret_cast<const float*>(dataIn->getData()); - float* out = reinterpret_cast<float*>(dataOut->getData()); - size_t sizeIn = dataIn->getLength() / sizeof(float); - - if ((uintptr_t)(&out[0]) % 32 != 0) { - fprintf(stderr, "FIRFilterWorker: out not aligned %p ", out); - throw std::runtime_error("FIRFilterWorker: out not aligned"); - } - - clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_start); - - __m256 AVXout; - __m256 AVXtaps; - __m256 AVXin; - { - boost::mutex::scoped_lock lock(fwd->taps_mutex); - - for (i = 0; i < sizeIn - 2*fwd->taps.size(); i += 8) { - AVXout = _mm256_setr_ps(0,0,0,0,0,0,0,0); - - for (size_t j = 0; j < fwd->taps.size; j++) { - if ((uintptr_t)(&in[i+2*j]) % 32 == 0) { - AVXin = _mm256_load_ps(&in[i+2*j]); //faster when aligned - } - else { - AVXin = _mm256_loadu_ps(&in[i+2*j]); - } - - AVXtaps = _mm256_load1_ps(fwd->taps[j]); - - AVXout = _mm256_add_ps(AVXout, _mm256_mul_ps(AVXin, AVXtaps)); - } - _mm256_store_ps(&out[i], AVXout); - } - - for (; i < sizeIn; i++) { - out[i] = 0.0; - for (int j = 0; i+2*j < sizeIn; j++) { - out[i] += in[i+2*j] * fwd->taps[j]; - } - } - } - clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_end); - -#elif __SSE__ +#if __SSE__ // The SSE accelerated version cannot work on the complex values, // it is necessary to do the convolution on the real and imaginary // parts separately. Thankfully, the taps are real, simplifying the @@ -317,10 +253,6 @@ FIRFilter::FIRFilter(std::string& taps_file) : load_filter_taps(myTapsFile); -#if __AVX__ - fprintf(stderr, "FIRFilter: WARNING: using experimental AVX code !\n"); -#endif - PDEBUG("FIRFilter: Starting worker\n" ); worker.start(&firwd); } |