aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorMatthias P. Braendli (think) <matthias@mpb.li>2012-09-26 19:07:28 +0200
committerMatthias P. Braendli (think) <matthias@mpb.li>2012-09-26 19:07:28 +0200
commitf2118c3b98b2e6cce2cb78ca41977a228a7e3b5f (patch)
tree03c49ff7cd732d430ee6c2b98e59e1b2ef743388 /src
parent7970dc6aedc3d9ac7bafc7a63d5ef954f241a378 (diff)
downloaddabmod-f2118c3b98b2e6cce2cb78ca41977a228a7e3b5f.tar.gz
dabmod-f2118c3b98b2e6cce2cb78ca41977a228a7e3b5f.tar.bz2
dabmod-f2118c3b98b2e6cce2cb78ca41977a228a7e3b5f.zip
crc-dabmod: added AVX FIRFilter implementation
that is disabled by default because it's slower than the SSE version.
Diffstat (limited to 'src')
-rw-r--r--src/Buffer.cpp6
-rw-r--r--src/FIRFilter.cpp74
2 files changed, 75 insertions, 5 deletions
diff --git a/src/Buffer.cpp b/src/Buffer.cpp
index cef4ad5..7fe0334 100644
--- a/src/Buffer.cpp
+++ b/src/Buffer.cpp
@@ -69,8 +69,10 @@ void Buffer::setLength(size_t len)
{
if (len > size) {
void *tmp = data;
- //data = _mm_malloc(len, 16);
- data = memalign(16, len);
+
+ /* Align to 32-byte boundary for AVX. */
+ data = memalign(32, len);
+
memcpy(data, tmp, this->len);
free(tmp);
size = len;
diff --git a/src/FIRFilter.cpp b/src/FIRFilter.cpp
index 91a52ec..5d112d0 100644
--- a/src/FIRFilter.cpp
+++ b/src/FIRFilter.cpp
@@ -36,8 +36,12 @@
#include <iostream>
#include <fstream>
-#ifdef __SSE__
-# include <xmmintrin.h>
+#ifdef __AVX__
+# include <immintrin.h>
+#else
+# ifdef __SSE__
+# include <xmmintrin.h>
+# endif
#endif
@@ -62,7 +66,67 @@ void FIRFilterWorker::process(struct FIRFilterWorkerData *fwd)
PDEBUG("FIRFilterWorker: dataIn->getLength() %d\n", dataIn->getLength());
-#if __SSE__
+#if __AVX__
+#define _mm256_load1_ps(x) _mm256_set_ps(x, x, x, x, x, x, x, x)
+#warning FIRFilter uses experimental AVX code
+
+ // The AVX accelerated version cannot work on the complex values,
+ // it is necessary to do the convolution on the real and imaginary
+ // parts separately. Thankfully, the taps are real, simplifying the
+ // procedure.
+ //
+ // The AVX version is not enabled by default, because the performance
+ // on my test machine (sandy bridge i7) is slightly worse with AVX than
+ // with SSE. TODO: Try with Ivy Bridge or newer.
+ //
+ // Interesting links:
+ // http://software.intel.com/en-us/forums/topic/283753
+
+ const float* in = reinterpret_cast<const float*>(dataIn->getData());
+ float* out = reinterpret_cast<float*>(dataOut->getData());
+ size_t sizeIn = dataIn->getLength() / sizeof(float);
+
+ if ((uintptr_t)(&out[0]) % 32 != 0) {
+ fprintf(stderr, "FIRFilterWorker: out not aligned %p ", out);
+ throw std::runtime_error("FIRFilterWorker: out not aligned");
+ }
+
+ clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_start);
+
+ __m256 AVXout;
+ __m256 AVXtaps;
+ __m256 AVXin;
+ {
+ boost::mutex::scoped_lock lock(fwd->taps_mutex);
+
+ for (i = 0; i < sizeIn - 2*fwd->n_taps; i += 8) {
+ AVXout = _mm256_setr_ps(0,0,0,0,0,0,0,0);
+
+ for (int j = 0; j < fwd->n_taps; j++) {
+ if ((uintptr_t)(&in[i+2*j]) % 32 == 0) {
+ AVXin = _mm256_load_ps(&in[i+2*j]); //faster when aligned
+ }
+ else {
+ AVXin = _mm256_loadu_ps(&in[i+2*j]);
+ }
+
+ AVXtaps = _mm256_load1_ps(fwd->taps[j]);
+
+ AVXout = _mm256_add_ps(AVXout, _mm256_mul_ps(AVXin, AVXtaps));
+ }
+ _mm256_store_ps(&out[i], AVXout);
+ }
+
+ for (; i < sizeIn; i++) {
+ out[i] = 0.0;
+ for (int j = 0; i+2*j < sizeIn; j++) {
+ out[i] += in[i+2*j] * fwd->taps[j];
+ }
+ }
+ }
+ clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_end);
+
+#elif __SSE__
// The SSE accelerated version cannot work on the complex values,
// it is necessary to do the convolution on the real and imaginary
// parts separately. Thankfully, the taps are real, simplifying the
@@ -251,6 +315,10 @@ FIRFilter::FIRFilter(std::string taps_file) :
load_filter_taps();
+#if __AVX__
+ fprintf(stderr, "FIRFilter: WARNING: using experimental AVX code !\n");
+#endif
+
PDEBUG("FIRFilter: Starting worker\n" );
worker.start(&firwd);
}