diff options
author | Matthias P. Braendli (think) <matthias@mpb.li> | 2012-09-26 19:07:28 +0200 |
---|---|---|
committer | Matthias P. Braendli (think) <matthias@mpb.li> | 2012-09-26 19:07:28 +0200 |
commit | f2118c3b98b2e6cce2cb78ca41977a228a7e3b5f (patch) | |
tree | 03c49ff7cd732d430ee6c2b98e59e1b2ef743388 | |
parent | 7970dc6aedc3d9ac7bafc7a63d5ef954f241a378 (diff) | |
download | dabmod-f2118c3b98b2e6cce2cb78ca41977a228a7e3b5f.tar.gz dabmod-f2118c3b98b2e6cce2cb78ca41977a228a7e3b5f.tar.bz2 dabmod-f2118c3b98b2e6cce2cb78ca41977a228a7e3b5f.zip |
crc-dabmod: added AVX FIRFilter implementation
that is disabled by default because it's slower
than the SSE version.
-rw-r--r-- | src/Buffer.cpp | 6 | ||||
-rw-r--r-- | src/FIRFilter.cpp | 74 |
2 files changed, 75 insertions, 5 deletions
diff --git a/src/Buffer.cpp b/src/Buffer.cpp index cef4ad5..7fe0334 100644 --- a/src/Buffer.cpp +++ b/src/Buffer.cpp @@ -69,8 +69,10 @@ void Buffer::setLength(size_t len) { if (len > size) { void *tmp = data; - //data = _mm_malloc(len, 16); - data = memalign(16, len); + + /* Align to 32-byte boundary for AVX. */ + data = memalign(32, len); + memcpy(data, tmp, this->len); free(tmp); size = len; diff --git a/src/FIRFilter.cpp b/src/FIRFilter.cpp index 91a52ec..5d112d0 100644 --- a/src/FIRFilter.cpp +++ b/src/FIRFilter.cpp @@ -36,8 +36,12 @@ #include <iostream> #include <fstream> -#ifdef __SSE__ -# include <xmmintrin.h> +#ifdef __AVX__ +# include <immintrin.h> +#else +# ifdef __SSE__ +# include <xmmintrin.h> +# endif #endif @@ -62,7 +66,67 @@ void FIRFilterWorker::process(struct FIRFilterWorkerData *fwd) PDEBUG("FIRFilterWorker: dataIn->getLength() %d\n", dataIn->getLength()); -#if __SSE__ +#if __AVX__ +#define _mm256_load1_ps(x) _mm256_set_ps(x, x, x, x, x, x, x, x) +#warning FIRFilter uses experimental AVX code + + // The AVX accelerated version cannot work on the complex values, + // it is necessary to do the convolution on the real and imaginary + // parts separately. Thankfully, the taps are real, simplifying the + // procedure. + // + // The AVX version is not enabled by default, because the performance + // on my test machine (sandy bridge i7) is slightly worse with AVX than + // with SSE. TODO: Try with Ivy Bridge or newer. + // + // Interesting links: + // http://software.intel.com/en-us/forums/topic/283753 + + const float* in = reinterpret_cast<const float*>(dataIn->getData()); + float* out = reinterpret_cast<float*>(dataOut->getData()); + size_t sizeIn = dataIn->getLength() / sizeof(float); + + if ((uintptr_t)(&out[0]) % 32 != 0) { + fprintf(stderr, "FIRFilterWorker: out not aligned %p ", out); + throw std::runtime_error("FIRFilterWorker: out not aligned"); + } + + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_start); + + __m256 AVXout; + __m256 AVXtaps; + __m256 AVXin; + { + boost::mutex::scoped_lock lock(fwd->taps_mutex); + + for (i = 0; i < sizeIn - 2*fwd->n_taps; i += 8) { + AVXout = _mm256_setr_ps(0,0,0,0,0,0,0,0); + + for (int j = 0; j < fwd->n_taps; j++) { + if ((uintptr_t)(&in[i+2*j]) % 32 == 0) { + AVXin = _mm256_load_ps(&in[i+2*j]); //faster when aligned + } + else { + AVXin = _mm256_loadu_ps(&in[i+2*j]); + } + + AVXtaps = _mm256_load1_ps(fwd->taps[j]); + + AVXout = _mm256_add_ps(AVXout, _mm256_mul_ps(AVXin, AVXtaps)); + } + _mm256_store_ps(&out[i], AVXout); + } + + for (; i < sizeIn; i++) { + out[i] = 0.0; + for (int j = 0; i+2*j < sizeIn; j++) { + out[i] += in[i+2*j] * fwd->taps[j]; + } + } + } + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_end); + +#elif __SSE__ // The SSE accelerated version cannot work on the complex values, // it is necessary to do the convolution on the real and imaginary // parts separately. Thankfully, the taps are real, simplifying the @@ -251,6 +315,10 @@ FIRFilter::FIRFilter(std::string taps_file) : load_filter_taps(); +#if __AVX__ + fprintf(stderr, "FIRFilter: WARNING: using experimental AVX code !\n"); +#endif + PDEBUG("FIRFilter: Starting worker\n" ); worker.start(&firwd); } |