From f2118c3b98b2e6cce2cb78ca41977a228a7e3b5f Mon Sep 17 00:00:00 2001
From: "Matthias P. Braendli (think)" <matthias@mpb.li>
Date: Wed, 26 Sep 2012 19:07:28 +0200
Subject: crc-dabmod: added AVX FIRFilter implementation that is disabled by
 default because it's slower than the SSE version.

---
 src/Buffer.cpp    |  6 +++--
 src/FIRFilter.cpp | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 75 insertions(+), 5 deletions(-)
diff --git a/src/Buffer.cpp b/src/Buffer.cpp
index cef4ad5..7fe0334 100644
--- a/src/Buffer.cpp
+++ b/src/Buffer.cpp
@@ -69,8 +69,10 @@ void Buffer::setLength(size_t len)
 {
     if (len > size) {
         void *tmp = data;
-        //data = _mm_malloc(len, 16);
-        data = memalign(16, len);
+
+        /* Align to 32-byte boundary for AVX. */
+        data = memalign(32, len);
+
         memcpy(data, tmp, this->len);
         free(tmp);
         size = len;
diff --git a/src/FIRFilter.cpp b/src/FIRFilter.cpp
index 91a52ec..5d112d0 100644
--- a/src/FIRFilter.cpp
+++ b/src/FIRFilter.cpp
@@ -36,8 +36,12 @@
 #include <iostream>
 #include <fstream>
 
-#ifdef __SSE__
-#   include <xmmintrin.h>
+#ifdef __AVX__
+#   include <immintrin.h>
+#else
+#    ifdef __SSE__
+#        include <xmmintrin.h>
+#    endif
 #endif
 
 
@@ -62,7 +66,67 @@ void FIRFilterWorker::process(struct FIRFilterWorkerData *fwd)
 
         PDEBUG("FIRFilterWorker: dataIn->getLength() %d\n", dataIn->getLength());
 
-#if __SSE__
+#if __AVX__
+#define _mm256_load1_ps(x) _mm256_set_ps(x, x, x, x, x, x, x, x)
+#warning FIRFilter uses experimental AVX code
+
+        // The AVX accelerated version cannot work on the complex values,
+        // it is necessary to do the convolution on the real and imaginary
+        // parts separately. Thankfully, the taps are real, simplifying the
+        // procedure.
+        //
+        // The AVX version is not enabled by default, because the performance
+        // on my test machine (sandy bridge i7) is slightly worse with AVX than
+        // with SSE. TODO: Try with Ivy Bridge or newer.
+        //
+        // Interesting links:
+        // http://software.intel.com/en-us/forums/topic/283753
+
+        const float* in = reinterpret_cast<const float*>(dataIn->getData());
+        float* out      = reinterpret_cast<float*>(dataOut->getData());
+        size_t sizeIn   = dataIn->getLength() / sizeof(float);
+
+        if ((uintptr_t)(&out[0]) % 32 != 0) {
+            fprintf(stderr, "FIRFilterWorker: out not aligned %p ", out);
+            throw std::runtime_error("FIRFilterWorker: out not aligned");
+        }
+            
+        clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_start);
+
+        __m256 AVXout;
+        __m256 AVXtaps;
+        __m256 AVXin;
+        {
+            boost::mutex::scoped_lock lock(fwd->taps_mutex);
+
+            for (i = 0; i < sizeIn - 2*fwd->n_taps; i += 8) {
+                AVXout = _mm256_setr_ps(0,0,0,0,0,0,0,0);
+
+                for (int j = 0; j < fwd->n_taps; j++) {
+                    if ((uintptr_t)(&in[i+2*j]) % 32 == 0) {
+                        AVXin = _mm256_load_ps(&in[i+2*j]); //faster when aligned
+                    }
+                    else {
+                        AVXin = _mm256_loadu_ps(&in[i+2*j]);
+                    }
+
+                    AVXtaps = _mm256_load1_ps(fwd->taps[j]);
+
+                    AVXout = _mm256_add_ps(AVXout, _mm256_mul_ps(AVXin, AVXtaps));
+                }
+                _mm256_store_ps(&out[i], AVXout);
+            }
+
+            for (; i < sizeIn; i++) {
+                out[i] = 0.0;
+                for (int j = 0; i+2*j < sizeIn; j++) {
+                    out[i] += in[i+2*j] * fwd->taps[j];
+                }
+            }
+        }
+        clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time_end);
+
+#elif __SSE__
         // The SSE accelerated version cannot work on the complex values,
         // it is necessary to do the convolution on the real and imaginary
         // parts separately. Thankfully, the taps are real, simplifying the
@@ -251,6 +315,10 @@ FIRFilter::FIRFilter(std::string taps_file) :
 
     load_filter_taps();
 
+#if __AVX__
+    fprintf(stderr, "FIRFilter: WARNING: using experimental AVX code !\n");
+#endif
+
     PDEBUG("FIRFilter: Starting worker\n" );
     worker.start(&firwd);
 }
-- 
cgit v1.2.3