convert: Add SSE implementation for sc12

Implementation uses SSSE3 instructions to perform 12-bit sample pack/unpack
operations to/from standard 16 and 32 bit host values. Input/output shuffle
orderings for a single 128-bit SSE register with 16-bit integers shown below.

    16-bit interleaved I/Q
   ---------------------------------------
  | Q3 | I3 | Q2 | I2 | Q1 | I1 | Q0 | I0 | Input
   ---------------------------------------
  | 127                                 0 |

      12-bit packed I/Q byteswapped
       -----------------------
      |   I0   |   Q0   |  I1 | 0
      |-----------------------|
      | I1 |  Q1  |  I2  | Q2 |             Output
      |-----------------------|
      | Q2  |   I3   |   Q3   |
      |-----------------------|
      |        Unused         | 3
       -----------------------
      31                     0

Fixes: #1740, #966
Related: #1739
author: Tom Tsou <tom.tsou@ettus.com> 2017-07-07 15:32:20 -0700
committer: Martin Braun <martin.braun@ettus.com> 2017-07-25 10:15:37 -0700
commit: 0e9f204029e5eac51d94f16ceb19f003e3faf7e8 (patch)
tree: ad71a66f4aa84fc17e965d8682741584f64aa18d
parent: 8223a289727bbda353bd7129512daf00d46d898c (diff)
download: uhd-0e9f204029e5eac51d94f16ceb19f003e3faf7e8.tar.gz
uhd-0e9f204029e5eac51d94f16ceb19f003e3faf7e8.tar.bz2
uhd-0e9f204029e5eac51d94f16ceb19f003e3faf7e8.zip
7 files changed, 719 insertions, 211 deletions
diff --git a/host/lib/convert/CMakeLists.txt b/host/lib/convert/CMakeLists.txt
index 10376ba9c..cfd3c7f34 100644
--- a/host/lib/convert/CMakeLists.txt
+++ b/host/lib/convert/CMakeLists.txt
@@ -26,6 +26,7 @@ MESSAGE(STATUS "")
 ########################################################################
 IF(CMAKE_COMPILER_IS_GNUCXX)
     SET(EMMINTRIN_FLAGS -msse2)
+    SET(TMMINTRIN_FLAGS -mssse3)  # GCC flag used when probing for and compiling the SSSE3 sources
 ELSEIF(MSVC)
     SET(EMMINTRIN_FLAGS /arch:SSE2)
 ENDIF()
@@ -34,6 +35,10 @@ SET(CMAKE_REQUIRED_FLAGS ${EMMINTRIN_FLAGS})
 CHECK_INCLUDE_FILE_CXX(emmintrin.h HAVE_EMMINTRIN_H)
 SET(CMAKE_REQUIRED_FLAGS)
 
+SET(CMAKE_REQUIRED_FLAGS ${TMMINTRIN_FLAGS})  # run the header probe with the SSSE3 flag applied
+CHECK_INCLUDE_FILE_CXX(tmmintrin.h HAVE_TMMINTRIN_H)  # result is stored in HAVE_TMMINTRIN_H
+SET(CMAKE_REQUIRED_FLAGS)  # clear again so the flag does not leak into later checks
+
 IF(HAVE_EMMINTRIN_H)
     SET(convert_with_sse2_sources
         ${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc16_to_sc16.cpp
@@ -53,6 +58,18 @@ IF(HAVE_EMMINTRIN_H)
     LIBUHD_APPEND_SOURCES(${convert_with_sse2_sources})
 ENDIF(HAVE_EMMINTRIN_H)
 
+IF(HAVE_TMMINTRIN_H)  # only build the SSSE3 converters when tmmintrin.h was found
+    SET(convert_with_ssse3_sources
+        ${CMAKE_CURRENT_SOURCE_DIR}/ssse3_pack_sc12.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/ssse3_unpack_sc12.cpp
+    )
+    SET_SOURCE_FILES_PROPERTIES(
+        ${convert_with_ssse3_sources}
+        PROPERTIES COMPILE_FLAGS "${TMMINTRIN_FLAGS}"  # flag is applied per file, not to the whole library
+    )
+    LIBUHD_APPEND_SOURCES(${convert_with_ssse3_sources})
+ENDIF(HAVE_TMMINTRIN_H)
+
 ########################################################################
 # Check for NEON SIMD headers
 ########################################################################
diff --git a/host/lib/convert/convert_pack_sc12.cpp b/host/lib/convert/convert_pack_sc12.cpp
index 2e45e19f5..85194dcdd 100644
--- a/host/lib/convert/convert_pack_sc12.cpp
+++ b/host/lib/convert/convert_pack_sc12.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright 2013 Ettus Research LLC
+// Copyright 2017 Ettus Research LLC
 //
 // This program is free software: you can redistribute it and/or modify
 // it under the terms of the GNU General Public License as published by
@@ -15,122 +15,10 @@
 // along with this program.  If not, see <http://www.gnu.org/licenses/>.
 //
 
-#include "convert_common.hpp"
-#include <uhd/utils/byteswap.hpp>
-#include <uhd/utils/log.hpp>
-#include <boost/math/special_functions/round.hpp>
-#include <vector>
-#include <type_traits>
+#include "convert_pack_sc12.hpp"
 
 using namespace uhd::convert;
 
-typedef uint32_t (*towire32_type)(uint32_t);
-
-/* C language specification requires this to be packed
- * (i.e., line0, line1, line2 will be in adjacent memory locations).
- * If this was not true, we'd need compiler flags here to specify
- * alignment/packing.
- */
-struct item32_sc12_3x
-{
-    item32_t line0;
-    item32_t line1;
-    item32_t line2;
-};
-
-enum item32_sc12_3x_enable {
-    CONVERT12_LINE0 = 0x01,
-    CONVERT12_LINE1 = 0x02,
-    CONVERT12_LINE2 = 0x04,
-    CONVERT12_LINE_ALL = 0x07,
-};
-
-/*
- * Packed 12-bit converter with selective line enable
- *
- * The converter operates on 4 complex inputs and selectively writes to one to
- * three 32-bit lines. Line selection allows for partial writes of less than
- * 4 complex samples, or a full 3 x 32-bit struct. Writes are always full 32-bit
- * lines, so in the case of partial writes, the number of bytes written will
- * exceed the the number of bytes filled by actual samples.
- *
- *  _ _ _ _ _ _ _ _
- * |_ _ _1_ _ _|_ _| 0
- * |_2_ _ _|_ _ _3_|
- * |_ _|_ _ _4_ _ _| 2
- * 31              0
- */
-template <towire32_type towire>
-inline void pack(item32_sc12_3x &output, int enable, const int32_t i[4], const int32_t q[4])
-{
-    if (enable & CONVERT12_LINE0)
-        output.line0 = towire(i[0] << 20 | q[0] <<  8 | i[1] >> 4);
-    if (enable & CONVERT12_LINE1)
-        output.line1 = towire(i[1] << 28 | q[1] << 16 | i[2] << 4 | q[2] >> 8);
-    if (enable & CONVERT12_LINE2)
-        output.line2 = towire(q[2] << 24 | i[3] << 12 | q[3]);
-}
-
-template <typename type, towire32_type towire>
-void convert_star_4_to_sc12_item32_3
-(
-    const std::complex<type> &in0,
-    const std::complex<type> &in1,
-    const std::complex<type> &in2,
-    const std::complex<type> &in3,
-    const int enable,
-    item32_sc12_3x &output,
-    const double scalar,
-    typename std::enable_if<std::is_floating_point<type>::value>::type* = NULL
-)
-{
-    int32_t i[4] {
-        int32_t(in0.real()*scalar) & 0xfff,
-        int32_t(in1.real()*scalar) & 0xfff,
-        int32_t(in2.real()*scalar) & 0xfff,
-        int32_t(in3.real()*scalar) & 0xfff,
-    };
-
-    int32_t q[4] {
-        int32_t(in0.imag()*scalar) & 0xfff,
-        int32_t(in1.imag()*scalar) & 0xfff,
-        int32_t(in2.imag()*scalar) & 0xfff,
-        int32_t(in3.imag()*scalar) & 0xfff,
-    };
-
-    pack<towire>(output, enable, i, q);
-}
-
-template <typename type, towire32_type towire>
-void convert_star_4_to_sc12_item32_3
-(
-    const std::complex<type> &in0,
-    const std::complex<type> &in1,
-    const std::complex<type> &in2,
-    const std::complex<type> &in3,
-    const int enable,
-    item32_sc12_3x &output,
-    const double,
-    typename std::enable_if<std::is_same<type, short>::value>::type* = NULL
-)
-{
-    int32_t i[4] {
-        int32_t(in0.real() >> 4) & 0xfff,
-        int32_t(in1.real() >> 4) & 0xfff,
-        int32_t(in2.real() >> 4) & 0xfff,
-        int32_t(in3.real() >> 4) & 0xfff,
-    };
-
-    int32_t q[4] {
-        int32_t(in0.imag() >> 4) & 0xfff,
-        int32_t(in1.imag() >> 4) & 0xfff,
-        int32_t(in2.imag() >> 4) & 0xfff,
-        int32_t(in3.imag() >> 4) & 0xfff,
-    };
-
-    pack<towire>(output, enable, i, q);
-}
-
 template <typename type, towire32_type towire>
 struct convert_star_1_to_sc12_item32_1 : public converter
 {
diff --git a/host/lib/convert/convert_pack_sc12.hpp b/host/lib/convert/convert_pack_sc12.hpp
new file mode 100644
index 000000000..754c47cd2
--- /dev/null
+++ b/host/lib/convert/convert_pack_sc12.hpp
@@ -0,0 +1,123 @@
+//
+// Copyright 2017 Ettus Research LLC
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+//
+
+#include <type_traits>
+#include <uhd/utils/byteswap.hpp>
+#include "convert_common.hpp"
+
+using namespace uhd::convert;
+
+typedef uint32_t (*towire32_type)(uint32_t);
+
+/* Three consecutive wire words that together carry four packed sc12 samples.
+ * The converters assume line0, line1, line2 sit in adjacent memory locations
+ * with no padding between them. If this was not true, we'd need compiler
+ * flags here to specify alignment/packing.
+ */
+struct item32_sc12_3x
+{
+    item32_t line0;  // first 32-bit line
+    item32_t line1;  // second 32-bit line
+    item32_t line2;  // third 32-bit line
+};
+
+enum item32_sc12_3x_enable {  // bit flags selecting which lines of item32_sc12_3x pack() writes
+    CONVERT12_LINE0 = 0x01,  // write line0
+    CONVERT12_LINE1 = 0x02,  // write line1
+    CONVERT12_LINE2 = 0x04,  // write line2
+    CONVERT12_LINE_ALL = 0x07,  // all three flags combined
+};
+
+/*
+ * Packed 12-bit converter with selective line enable
+ *
+ * The converter operates on 4 complex inputs and selectively writes to one to
+ * three 32-bit lines. Line selection allows for partial writes of less than
+ * 4 complex samples, or a full 3 x 32-bit struct. Writes are always full 32-bit
+ * lines, so in the case of partial writes, the number of bytes written will
+ * exceed the number of bytes filled by actual samples. iq[] holds the 12-bit
+ * values I0,Q0,..,I3,Q3 (callers mask them); enable is a CONVERT12_LINE* mask.
+ *  _ _ _ _ _ _ _ _
+ * |_ _ _1_ _ _|_ _| 0
+ * |_2_ _ _|_ _ _3_|
+ * |_ _|_ _ _4_ _ _| 2
+ * 31              0
+ */
+template <towire32_type towire>
+void pack(item32_sc12_3x &output, int enable, const int32_t iq[8])
+{   // Shift in uint32_t: iq[2] << 28 and iq[5] << 24 do not fit a signed 32-bit int.
+    if (enable & CONVERT12_LINE0)
+        output.line0 = towire(uint32_t(iq[0]) << 20 | uint32_t(iq[1]) <<  8 | uint32_t(iq[2]) >> 4);
+    if (enable & CONVERT12_LINE1)
+        output.line1 = towire(uint32_t(iq[2]) << 28 | uint32_t(iq[3]) << 16 | uint32_t(iq[4]) << 4 | uint32_t(iq[5]) >> 8);
+    if (enable & CONVERT12_LINE2)
+        output.line2 = towire(uint32_t(iq[5]) << 24 | uint32_t(iq[6]) << 12 | uint32_t(iq[7]) << 0);
+}
+
+template <typename type, towire32_type towire>  // floating-point overload: scales four complex samples and packs them as sc12
+void convert_star_4_to_sc12_item32_3
+(
+    const std::complex<type> &in0,  // the four input samples, in order
+    const std::complex<type> &in1,
+    const std::complex<type> &in2,
+    const std::complex<type> &in3,
+    const int enable,  // CONVERT12_LINE* mask forwarded to pack()
+    item32_sc12_3x &output,  // destination lines
+    const double scalar,  // multiplier applied to each component before integer conversion
+    typename std::enable_if<std::is_floating_point<type>::value>::type* = NULL  // enables this overload for floating-point 'type' only
+)
+{
+    int32_t iq[8] {  // interleaved I0,Q0,..,I3,Q3; each value is converted to int32_t and reduced to its low 12 bits
+        int32_t(in0.real()*scalar) & 0xfff,
+        int32_t(in0.imag()*scalar) & 0xfff,
+        int32_t(in1.real()*scalar) & 0xfff,
+        int32_t(in1.imag()*scalar) & 0xfff,
+
+        int32_t(in2.real()*scalar) & 0xfff,
+        int32_t(in2.imag()*scalar) & 0xfff,
+        int32_t(in3.real()*scalar) & 0xfff,
+        int32_t(in3.imag()*scalar) & 0xfff,
+    };
+    pack<towire>(output, enable, iq);  // write the enabled lines through towire
+}
+
+template <typename type, towire32_type towire>  // 16-bit integer overload: drops the low 4 bits of each component and packs as sc12
+void convert_star_4_to_sc12_item32_3
+(
+    const std::complex<type> &in0,  // the four input samples, in order
+    const std::complex<type> &in1,
+    const std::complex<type> &in2,
+    const std::complex<type> &in3,
+    const int enable,  // CONVERT12_LINE* mask forwarded to pack()
+    item32_sc12_3x &output,  // destination lines
+    const double,  // scalar is accepted for signature parity but not used here
+    typename std::enable_if<std::is_same<type, short>::value>::type* = NULL  // enables this overload for 'short' only
+)
+{
+    int32_t iq[8] {  // interleaved I0,Q0,..,I3,Q3; >> 4 keeps the upper 12 of 16 bits, & 0xfff strips sign extension
+        int32_t(in0.real() >> 4) & 0xfff,
+        int32_t(in0.imag() >> 4) & 0xfff,
+        int32_t(in1.real() >> 4) & 0xfff,
+        int32_t(in1.imag() >> 4) & 0xfff,
+
+        int32_t(in2.real() >> 4) & 0xfff,
+        int32_t(in2.imag() >> 4) & 0xfff,
+        int32_t(in3.real() >> 4) & 0xfff,
+        int32_t(in3.imag() >> 4) & 0xfff,
+    };
+    pack<towire>(output, enable, iq);  // write the enabled lines through towire
+}
diff --git a/host/lib/convert/convert_unpack_sc12.cpp b/host/lib/convert/convert_unpack_sc12.cpp
index 07f9cffa0..43c35ee3b 100644
--- a/host/lib/convert/convert_unpack_sc12.cpp
+++ b/host/lib/convert/convert_unpack_sc12.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright 2013 Ettus Research LLC
+// Copyright 2017 Ettus Research LLC
 //
 // This program is free software: you can redistribute it and/or modify
 // it under the terms of the GNU General Public License as published by
@@ -15,105 +15,10 @@
 // along with this program.  If not, see <http://www.gnu.org/licenses/>.
 //
 
-#include "convert_common.hpp"
-#include <uhd/utils/byteswap.hpp>
-#include <uhd/utils/log.hpp>
-#include <boost/math/special_functions/round.hpp>
-#include <vector>
-#include <type_traits>
+#include "convert_unpack_sc12.hpp"
 
 using namespace uhd::convert;
 
-typedef uint32_t (*tohost32_type)(uint32_t);
-
-/* C language specification requires this to be packed
- * (i.e., line0, line1, line2 will be in adjacent memory locations).
- * If this was not true, we'd need compiler flags here to specify
- * alignment/packing.
- */
-struct item32_sc12_3x
-{
-    item32_t line0;
-    item32_t line1;
-    item32_t line2;
-};
-
-/*
- * convert_sc12_item32_3_to_star_4 takes in 3 lines with 32 bit each
- * and converts them 4 samples of type 'std::complex<type>'.
- * The structure of the 3 lines is as follows:
- *  _ _ _ _ _ _ _ _
- * |_ _ _1_ _ _|_ _|
- * |_2_ _ _|_ _ _3_|
- * |_ _|_ _ _4_ _ _|
- *
- * The numbers mark the position of one complex sample.
- */
-template <typename type, tohost32_type tohost>
-void convert_sc12_item32_3_to_star_4
-(
-    const item32_sc12_3x &input,
-    std::complex<type> &out0,
-    std::complex<type> &out1,
-    std::complex<type> &out2,
-    std::complex<type> &out3,
-    const double scalar,
-    typename std::enable_if<std::is_floating_point<type>::value>::type* = NULL
-)
-{
-    //step 0: extract the lines from the input buffer
-    const item32_t line0 = tohost(input.line0);
-    const item32_t line1 = tohost(input.line1);
-    const item32_t line2 = tohost(input.line2);
-    const uint64_t line01 = (uint64_t(line0) << 32) | line1;
-    const uint64_t line12 = (uint64_t(line1) << 32) | line2;
-
-    //step 1: shift out and mask off the individual numbers
-    const type i0 = type(int16_t((line0 >> 16) & 0xfff0)*scalar);
-    const type q0 = type(int16_t((line0 >> 4) & 0xfff0)*scalar);
-
-    const type i1 = type(int16_t((line01 >> 24) & 0xfff0)*scalar);
-    const type q1 = type(int16_t((line1 >> 12) & 0xfff0)*scalar);
-
-    const type i2 = type(int16_t((line1 >> 0) & 0xfff0)*scalar);
-    const type q2 = type(int16_t((line12 >> 20) & 0xfff0)*scalar);
-
-    const type i3 = type(int16_t((line2 >> 8) & 0xfff0)*scalar);
-    const type q3 = type(int16_t((line2 << 4) & 0xfff0)*scalar);
-
-    //step 2: load the outputs
-    out0 = std::complex<type>(i0, q0);
-    out1 = std::complex<type>(i1, q1);
-    out2 = std::complex<type>(i2, q2);
-    out3 = std::complex<type>(i3, q3);
-}
-
-template <typename type, tohost32_type tohost>
-void convert_sc12_item32_3_to_star_4
-(
-    const item32_sc12_3x &input,
-    std::complex<type> &out0,
-    std::complex<type> &out1,
-    std::complex<type> &out2,
-    std::complex<type> &out3,
-    const double,
-    typename std::enable_if<std::is_integral<type>::value>::type* = NULL
-)
-{
-    //step 0: extract the lines from the input buffer
-    const item32_t line0 = tohost(input.line0);
-    const item32_t line1 = tohost(input.line1);
-    const item32_t line2 = tohost(input.line2);
-    const uint64_t line01 = (uint64_t(line0) << 32) | line1;
-    const uint64_t line12 = (uint64_t(line1) << 32) | line2;
-
-    //step 1: extract and load the outputs
-    out0 = std::complex<type>(line0  >> 16 & 0xfff0, line0  >>  4 & 0xfff0);
-    out1 = std::complex<type>(line01 >> 24 & 0xfff0, line1  >> 12 & 0xfff0);
-    out2 = std::complex<type>(line1  >>  0 & 0xfff0, line12 >> 20 & 0xfff0);
-    out3 = std::complex<type>(line2  >>  8 & 0xfff0, line2  <<  4 & 0xfff0);
-}
-
 template <typename type, tohost32_type tohost>
 struct convert_sc12_item32_1_to_star_1 : public converter
 {
diff --git a/host/lib/convert/convert_unpack_sc12.hpp b/host/lib/convert/convert_unpack_sc12.hpp
new file mode 100644
index 000000000..46e7d58fb
--- /dev/null
+++ b/host/lib/convert/convert_unpack_sc12.hpp
@@ -0,0 +1,112 @@
+//
+// Copyright 2017 Ettus Research LLC
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+//
+
+#include <type_traits>
+#include <uhd/utils/byteswap.hpp>
+#include "convert_common.hpp"
+
+using namespace uhd::convert;
+
+typedef uint32_t (*tohost32_type)(uint32_t);
+
+/* Three consecutive wire words that together carry four packed sc12 samples.
+ * The converters assume line0, line1, line2 sit in adjacent memory locations
+ * with no padding between them. If this was not true, we'd need compiler
+ * flags here to specify alignment/packing.
+ */
+struct item32_sc12_3x
+{
+    item32_t line0;  // first 32-bit line
+    item32_t line1;  // second 32-bit line
+    item32_t line2;  // third 32-bit line
+};
+
+/*
+ * convert_sc12_item32_3_to_star_4 takes in 3 lines with 32 bit each
+ * and converts them to 4 samples of type 'std::complex<type>'.
+ * The structure of the 3 lines is as follows:
+ *  _ _ _ _ _ _ _ _
+ * |_ _ _1_ _ _|_ _|
+ * |_2_ _ _|_ _ _3_|
+ * |_ _|_ _ _4_ _ _|
+ *
+ * The numbers mark the position of one complex sample.
+ */
+template <typename type, tohost32_type tohost>  // floating-point overload
+void convert_sc12_item32_3_to_star_4
+(
+    const item32_sc12_3x &input,  // three packed wire lines
+    std::complex<type> &out0,  // the four unpacked samples, in order
+    std::complex<type> &out1,
+    std::complex<type> &out2,
+    std::complex<type> &out3,
+    const double scalar,  // multiplier applied to each extracted 16-bit value
+    typename std::enable_if<std::is_floating_point<type>::value>::type* = NULL  // enables this overload for floating-point 'type' only
+)
+{
+    //step 0: extract the lines from the input buffer
+    const item32_t line0 = tohost(input.line0);
+    const item32_t line1 = tohost(input.line1);
+    const item32_t line2 = tohost(input.line2);
+    const uint64_t line01 = (uint64_t(line0) << 32) | line1;  // 64-bit view for the value straddling line0/line1
+    const uint64_t line12 = (uint64_t(line1) << 32) | line2;  // 64-bit view for the value straddling line1/line2
+
+    //step 1: shift out and mask off the individual numbers
+    const type i0 = type(int16_t((line0 >> 16) & 0xfff0)*scalar);  // 12 bits land in the top of an int16_t, low 4 bits zero
+    const type q0 = type(int16_t((line0 >> 4) & 0xfff0)*scalar);
+
+    const type i1 = type(int16_t((line01 >> 24) & 0xfff0)*scalar);
+    const type q1 = type(int16_t((line1 >> 12) & 0xfff0)*scalar);
+
+    const type i2 = type(int16_t((line1 >> 0) & 0xfff0)*scalar);
+    const type q2 = type(int16_t((line12 >> 20) & 0xfff0)*scalar);
+
+    const type i3 = type(int16_t((line2 >> 8) & 0xfff0)*scalar);
+    const type q3 = type(int16_t((line2 << 4) & 0xfff0)*scalar);  // lowest 12 bits of line2, shifted up instead of down
+
+    //step 2: load the outputs
+    out0 = std::complex<type>(i0, q0);
+    out1 = std::complex<type>(i1, q1);
+    out2 = std::complex<type>(i2, q2);
+    out3 = std::complex<type>(i3, q3);
+}
+
+template <typename type, tohost32_type tohost>  // integral overload: same bit extraction as the floating-point one, no scaling
+void convert_sc12_item32_3_to_star_4
+(
+    const item32_sc12_3x &input,  // three packed wire lines
+    std::complex<type> &out0,  // the four unpacked samples, in order
+    std::complex<type> &out1,
+    std::complex<type> &out2,
+    std::complex<type> &out3,
+    const double,  // scalar is accepted for signature parity but not used here
+    typename std::enable_if<std::is_integral<type>::value>::type* = NULL  // enables this overload for integral 'type' only
+)
+{
+    //step 0: extract the lines from the input buffer
+    const item32_t line0 = tohost(input.line0);
+    const item32_t line1 = tohost(input.line1);
+    const item32_t line2 = tohost(input.line2);
+    const uint64_t line01 = (uint64_t(line0) << 32) | line1;  // 64-bit view for the value straddling line0/line1
+    const uint64_t line12 = (uint64_t(line1) << 32) | line2;  // 64-bit view for the value straddling line1/line2
+
+    //step 1: extract and load the outputs
+    out0 = std::complex<type>(line0  >> 16 & 0xfff0, line0  >>  4 & 0xfff0);  // >> binds tighter than &, so each is (x >> n) & 0xfff0
+    out1 = std::complex<type>(line01 >> 24 & 0xfff0, line1  >> 12 & 0xfff0);
+    out2 = std::complex<type>(line1  >>  0 & 0xfff0, line12 >> 20 & 0xfff0);
+    out3 = std::complex<type>(line2  >>  8 & 0xfff0, line2  <<  4 & 0xfff0);  // masked values are narrowed to 'type' by the constructor
+}
diff --git a/host/lib/convert/ssse3_pack_sc12.cpp b/host/lib/convert/ssse3_pack_sc12.cpp
new file mode 100644
index 000000000..42c429b67
--- /dev/null
+++ b/host/lib/convert/ssse3_pack_sc12.cpp
@@ -0,0 +1,244 @@
+//
+// Copyright 2017 Ettus Research LLC
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+//
+
+#include <tmmintrin.h>
+#include "convert_pack_sc12.hpp"
+
+/*
+ * Shuffle Orderings - Single 128-bit SSE register
+ *
+ *   16-bit interleaved I/Q
+ *  ---------------------------------------
+ * | Q3 | I3 | Q2 | I2 | Q1 | I1 | Q0 | I0 | Input
+ *  ---------------------------------------
+ * | 127                                 0 |
+ *
+ *
+ *   12-bit deinterleaved unpacked I/Q
+ *  ---------------------------------------
+ * | I3 | I2 | I1 | I0 | Q3 | Q2 | Q1 | Q0 | Shuffle-1
+ *  ---------------------------------------
+ * | High bit aligned  |  4-bit >> offset  |
+ *
+ *
+ *   12-bit interleaved packed I/Q
+ *  ---------------------------------------
+ * |I0|Q0|I1|Q1|I2|Q2|I3|Q3|               | Shuffle-2 | Shuffle-3
+ *  ---------------------------------------
+ * | 127                32 | 31  Empty   0 |
+ *
+ *
+ *     12-bit packed I/Q byteswapped
+ *      -----------------------
+ *     |   I0   |   Q0   |  I1 | 0
+ *     |-----------------------|
+ *     | I1 |  Q1  |  I2  | Q2 |             Output
+ *     |-----------------------|
+ *     | Q2  |   I3   |   Q3   |
+ *     |-----------------------|
+ *     |        Unused         | 3
+ *      -----------------------
+ *     31                     0
+ */
+#define SC12_SHIFT_MASK      0xfff0fff0, 0xfff0fff0, 0x0fff0fff, 0x0fff0fff // _mm_set_epi32 args: upper 64 bits keep the top 12 of each 16-bit lane, lower 64 bits the bottom 12
+#define SC12_PACK_SHUFFLE1   13,12,9,8,5,4,1,0,15,14,11,10,7,6,3,2 // _mm_set_epi8 byte indices for the "Shuffle-1" step in the diagram above (sc16 path only)
+#define SC12_PACK_SHUFFLE2   9,8,0,11,10,2,13,12,4,15,14,6,0,0,0,0 // byte indices for the first pshufb of the packing step; its result is OR-ed with the SHUFFLE3 result
+#define SC12_PACK_SHUFFLE3   8,1,8,8,3,8,8,5,8,8,7,8,8,8,8,8 // byte indices for the second pshufb, applied to a register whose upper 64 bits were zeroed
+
+template <typename type>  // SSE path for float input: converts in[0..3] into one packed 12-bit group
+inline void convert_star_4_to_sc12_item32_3
+(
+    const std::complex<type> *in,  // at least four consecutive samples are read
+    item32_sc12_3x &output,  // NOTE: a full 16 bytes are stored here, see the final store
+    const double scalar,  // multiplier applied to every component
+    typename std::enable_if<std::is_same<type, float>::value>::type* = NULL  // enables this overload for 'float' only
+)
+{
+    __m128 m0, m1, m2;
+    m0 = _mm_set1_ps(scalar);  // scalar (narrowed to float) in all four lanes
+    m1 = _mm_loadu_ps((const float *) &in[0]);  // in[0], in[1] as four floats
+    m2 = _mm_loadu_ps((const float *) &in[2]);  // in[2], in[3] as four floats
+    m1 = _mm_mul_ps(m1, m0);
+    m2 = _mm_mul_ps(m2, m0);
+    m0 = _mm_shuffle_ps(m1, m2, _MM_SHUFFLE(2, 0, 2, 0));  // even elements: the four real parts
+    m1 = _mm_shuffle_ps(m1, m2, _MM_SHUFFLE(3, 1, 3, 1));  // odd elements: the four imaginary parts
+
+    __m128i m3, m4, m5, m6, m7;
+    m3 = _mm_set_epi32(SC12_SHIFT_MASK);
+    m4 = _mm_set_epi8(SC12_PACK_SHUFFLE2);
+    m5 = _mm_set_epi8(SC12_PACK_SHUFFLE3);
+
+    m6 = _mm_cvtps_epi32(m0);  // real parts as int32
+    m7 = _mm_cvtps_epi32(m1);  // imaginary parts as int32
+    m6 = _mm_slli_epi32(m6, 4);  // real parts shifted up 4 bits ("high bit aligned" in the diagram)
+    m6 = _mm_packs_epi32(m7, m6);  // saturating pack to 16 bit: imaginary in the low half, real in the high half
+    m6 = _mm_and_si128(m6, m3);  // keep 12 bits per lane
+    m7 = _mm_move_epi64(m6);  // copy of the low 64 bits, upper 64 bits zeroed
+
+    m6 = _mm_shuffle_epi8(m6, m4);
+    m7 = _mm_shuffle_epi8(m7, m5);
+    m6 = _mm_or_si128(m6, m7);  // merge both byte shuffles into the packed layout
+
+    m6 = _mm_shuffle_epi32(m6, _MM_SHUFFLE(0, 1, 2, 3));  // reverse the order of the four 32-bit words
+    _mm_storeu_si128((__m128i*) &output, m6);  // unaligned 16-byte store into the three-line struct
+}
+
+template <typename type>  // SSE path for 16-bit input: converts in[0..3] into one packed 12-bit group
+static void convert_star_4_to_sc12_item32_3
+(
+    const std::complex<type> *in,  // four consecutive samples (16 bytes) are read
+    item32_sc12_3x &output,  // NOTE: a full 16 bytes are stored here, see the final store
+    const double,  // scalar is accepted for signature parity but not used here
+    typename std::enable_if<std::is_same<type, short>::value>::type* = NULL  // enables this overload for 'short' only
+)
+{
+    __m128i m0, m1, m2, m3, m4, m5;
+    m0 = _mm_set_epi32(SC12_SHIFT_MASK);
+    m1 = _mm_set_epi8(SC12_PACK_SHUFFLE1);
+    m2 = _mm_set_epi8(SC12_PACK_SHUFFLE2);
+    m3 = _mm_set_epi8(SC12_PACK_SHUFFLE3);
+
+    m4 = _mm_loadu_si128((__m128i*) in);  // unaligned load of the four interleaved I/Q pairs
+    m4 = _mm_shuffle_epi8(m4, m1);  // "Shuffle-1" step from the diagram above
+    m5 = _mm_srli_epi16(m4, 4);  // every 16-bit lane shifted right by 4
+    m4 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(0, 0, 3, 2));  // bring the unshifted upper 64 bits down to the low half
+    m4 = _mm_unpacklo_epi64(m5, m4);  // low half: shifted lanes, high half: unshifted lanes
+
+    m4 = _mm_and_si128(m4, m0);  // keep 12 bits per lane
+    m5 = _mm_move_epi64(m4);  // copy of the low 64 bits, upper 64 bits zeroed
+    m4 = _mm_shuffle_epi8(m4, m2);
+    m5 = _mm_shuffle_epi8(m5, m3);
+    m3 = _mm_or_si128(m4, m5);  // merge both byte shuffles into the packed layout
+
+    m3 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(0, 1, 2, 3));  // reverse the order of the four 32-bit words
+    _mm_storeu_si128((__m128i*) &output, m3);  // unaligned 16-byte store into the three-line struct
+}
+
+template <typename type, towire32_type towire>  // converter: one stream of std::complex<type> -> packed sc12, SSE in the bulk loop
+struct convert_star_1_to_sc12_item32_2 : public converter
+{
+    convert_star_1_to_sc12_item32_2(void):_scalar(0.0)  // scalar stays 0.0 until set_scalar() is called
+    {
+    }
+
+    void set_scalar(const double scalar)  // stores the factor passed to every conversion call
+    {
+        _scalar = scalar;
+    }
+
+    void operator()(const input_type &inputs, const output_type &outputs, const size_t nsamps)  // converts nsamps samples from inputs[0] into outputs[0]
+    {
+        const std::complex<type> *input = reinterpret_cast<const std::complex<type> *>(inputs[0]);
+
+        const size_t head_samps = size_t(outputs[0]) & 0x3;  // low two address bits select the head case
+        int enable;
+        size_t rewind = 0;  // bytes to step back to the start of the 12-byte group
+        switch(head_samps)
+        {
+            case 0: break;
+            case 1: rewind = 9; break;
+            case 2: rewind = 6; break;
+            case 3: rewind = 3; break;
+        }
+        item32_sc12_3x *output = reinterpret_cast<item32_sc12_3x *>(size_t(outputs[0]) - rewind);
+
+        //helper variables
+        size_t i = 0, o = 0;  // i: input sample index, o: output group index
+
+        //handle the head case
+        switch (head_samps)
+        {
+        case 0:
+            break; //no head
+        case 1:
+            enable = CONVERT12_LINE2;  // leading slots are zero-filled; only the enabled lines are written
+            convert_star_4_to_sc12_item32_3<type, towire>(0, 0, 0, input[0], enable, output[o++], _scalar);
+            break;
+        case 2:
+            enable = CONVERT12_LINE2 | CONVERT12_LINE1;
+            convert_star_4_to_sc12_item32_3<type, towire>(0, 0, input[0], input[1], enable, output[o++], _scalar);
+            break;
+        case 3:
+            enable = CONVERT12_LINE2 | CONVERT12_LINE1 | CONVERT12_LINE0;
+            convert_star_4_to_sc12_item32_3<type, towire>(0, input[0], input[1], input[2], enable, output[o++], _scalar);
+            break;
+        }
+        i += head_samps;  // NOTE(review): nsamps < head_samps is not checked; nsamps - i below would wrap - confirm callers
+
+        // SSE packed write output is 16 bytes which overwrites the 12-bit
+        // packed struct by 4 bytes. There is no concern if there are
+        // subsequent samples to be converted (writes will simply happen
+        // twice). So set the conversion loop to force a tail case on the
+        // final 4 or fewer samples.
+        while (i+4 < nsamps)  // strict '<' leaves 1..4 samples for the scalar tail
+        {
+            convert_star_4_to_sc12_item32_3<type>(&input[i], output[o], _scalar);  // three-argument call picks the SSE overload
+            o++; i += 4;
+        }
+
+        //handle the tail case
+        const size_t tail_samps = nsamps - i;
+        switch (tail_samps)
+        {
+        case 0:
+            break; //no tail
+        case 1:
+            enable = CONVERT12_LINE0;  // trailing slots are zero-filled; only the enabled lines are written
+            convert_star_4_to_sc12_item32_3<type, towire>(input[i+0], 0, 0, 0, enable, output[o], _scalar);
+            break;
+        case 2:
+            enable = CONVERT12_LINE0 | CONVERT12_LINE1;
+            convert_star_4_to_sc12_item32_3<type, towire>(input[i+0], input[i+1], 0, 0, enable, output[o], _scalar);
+            break;
+        case 3:
+            enable = CONVERT12_LINE0 | CONVERT12_LINE1 | CONVERT12_LINE2;
+            convert_star_4_to_sc12_item32_3<type, towire>(input[i+0], input[i+1], input[i+2], 0, enable, output[o], _scalar);
+            break;
+        case 4:
+            enable = CONVERT12_LINE_ALL;  // full final group, done with the scalar path so exactly 12 bytes are written
+            convert_star_4_to_sc12_item32_3<type, towire>(input[i+0], input[i+1], input[i+2], input[i+3], enable, output[o], _scalar);
+            break;
+        }
+    }
+
+    double _scalar;  // factor handed to the conversion helpers
+};
+
+static converter::sptr make_convert_fc32_1_to_sc12_item32_le_1(void)  // factory: float input, uhd::wtohx as wire conversion
+{
+    return converter::sptr(new convert_star_1_to_sc12_item32_2<float, uhd::wtohx>());
+}
+
+static converter::sptr make_convert_sc16_1_to_sc12_item32_le_1(void)  // factory: short input, uhd::wtohx as wire conversion
+{
+    return converter::sptr(new convert_star_1_to_sc12_item32_2<short, uhd::wtohx>());
+}
+
+UHD_STATIC_BLOCK(register_sse_pack_sc12)  // registers both SSE pack converters with PRIORITY_SIMD
+{
+    uhd::convert::id_type id;
+    id.num_inputs = 1;  // single input stream
+    id.num_outputs = 1;  // single output stream
+
+    id.input_format = "fc32";
+    id.output_format = "sc12_item32_le";
+    uhd::convert::register_converter(id, &make_convert_fc32_1_to_sc12_item32_le_1, PRIORITY_SIMD);  // fc32 -> sc12_item32_le
+
+    id.input_format = "sc16";
+    id.output_format = "sc12_item32_le";
+    uhd::convert::register_converter(id, &make_convert_sc16_1_to_sc12_item32_le_1, PRIORITY_SIMD);  // sc16 -> sc12_item32_le
+}
diff --git a/host/lib/convert/ssse3_unpack_sc12.cpp b/host/lib/convert/ssse3_unpack_sc12.cpp
new file mode 100644
index 000000000..245e64ebc
--- /dev/null
+++ b/host/lib/convert/ssse3_unpack_sc12.cpp
@@ -0,0 +1,219 @@
+//
+// Copyright 2017 Ettus Research LLC
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+//
+
+#include "convert_unpack_sc12.hpp"
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+using namespace uhd::convert;
+
+/*
+ * Shuffle Orderings - Single 128-bit SSE register
+ *
+ *     12-bit packed I/Q byteswapped
+ *      -----------------------
+ *     |   I0   |   Q0   |  I1 | 0
+ *     |-----------------------|
+ *     | I1 |  Q1  |  I2  | Q2 |             Input
+ *     |-----------------------|
+ *     | Q2  |   I3   |   Q3   | 2
+ *      -----------------------
+ *     31                     0
+ *
+ *
+ *   12-bit interleaved packed I/Q
+ *  ---------------------------------------
+ * |I0|Q0|I1|Q1|I2|Q2|I3|Q3|               | Byteswap Removed
+ *  ---------------------------------------
+ * | 127                32 | 31  Empty   0 |
+ *
+ *
+ *           Packed   Unpacked
+ *  Sample    Index    Index   Offset
+ * =====================================
+ *    I0      15,14     0,1      0
+ *    Q0      14,13     8,9      4
+ *    I1      12,11     2,3      0
+ *    Q1      11,10    10,11     4           12-bit Indices
+ *    I2       9,8      4,5      0
+ *    Q2       8,7     12,13     4
+ *    I3       6,5      6,7      0
+ *    Q3       5,4     14,15     4
+ *
+ *
+ *   12-bit deinterleaved unpacked I/Q
+ *  ---------------------------------------
+ * | Q3 | Q2 | Q1 | Q0 | I3 | I2 | I1 | I0 | Shuffle-1
+ *  ---------------------------------------
+ * |  4-bit >> offset  | High bit aligned  |
+ *
+ *
+ *   16-bit interleaved I/Q
+ *  ---------------------------------------
+ * | Q3 | I3 | Q2 | I2 | Q1 | I1 | Q0 | I0 | Output (Shuffle-2)
+ *  ---------------------------------------
+ * | 127                                 0 |
+ *
+ */
+// Constant lists for the unpack routines below. They are expanded as the
+// argument lists of _mm_set_epi32 / _mm_set_epi8, which take the
+// most-significant element first (so the last value is dword/byte 0).
+// (The shuffle names say PACK, but in this file they drive the unpack path.)
+//
+// SC12_SHIFT_MASK: AND-ed with the result of shuffle-1. The two low dwords
+// keep the upper 12 bits of every 16-bit lane (0xfff0); the two high dwords
+// keep the lower 12 bits of every 16-bit lane (0x0fff).
+#define SC12_SHIFT_MASK      0x0fff0fff, 0x0fff0fff, 0xfff0fff0, 0xfff0fff0
+// SC12_PACK_SHUFFLE1: byte selectors for _mm_shuffle_epi8 that pull the two
+// bytes holding each 12-bit sample into its own 16-bit lane, following the
+// "Packed Index -> Unpacked Index" table above. Only source bytes 4..15
+// are referenced.
+#define SC12_PACK_SHUFFLE1   5,4,8,7,11,10,14,13,6,5,9,8,12,11,15,14
+// SC12_PACK_SHUFFLE2: byte selectors for _mm_shuffle_epi8 that interleave
+// 16-bit lanes 0..3 with lanes 4..7 (lane order 0,4,1,5,2,6,3,7).
+#define SC12_PACK_SHUFFLE2   15,14,7,6,13,12,5,4,11,10,3,2,9,8,1,0
+
+/*
+ * Load one packed item into an SSE register with its three 32-bit words in
+ * reversed order: dword 3 = first word, dword 2 = second word, dword 1 = third
+ * word. Dword 0 is filler; shuffle-1 never selects bytes 0..3.
+ *
+ * A single 16-byte _mm_loadu_si128 would read 4 bytes beyond the 12-byte item
+ * (and beyond the input buffer for its last item). Two overlapping 8-byte
+ * loads stay inside the item and produce the same bytes 4..15.
+ */
+static inline __m128i load_sc12_item32_3x_reversed(const item32_sc12_3x &input)
+{
+    static_assert(sizeof(item32_sc12_3x) == 12, "sc12 item must be three packed 32-bit words");
+
+    const char *bytes = reinterpret_cast<const char *>(&input);
+    // low 64 bits = words 0,1
+    const __m128i words01 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(bytes));
+    // low 64 bits = words 1,2
+    const __m128i words12 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(bytes + 4));
+    // dwords, low to high: word1, word2, word0, word1
+    const __m128i merged = _mm_unpacklo_epi64(words12, words01);
+    // dwords, low to high: word1 (filler), word2, word1, word0
+    return _mm_shuffle_epi32(merged, _MM_SHUFFLE(2, 0, 1, 0));
+}
+
+/*
+ * Unpack one 12-byte sc12 item (four complex 12-bit samples) into four
+ * std::complex<float> values.
+ *
+ * input:  the packed item to read
+ * out:    destination; out[0]..out[3] are written (unaligned stores)
+ * scalar: scale factor; each 12-bit value ends up high-aligned in a 32-bit
+ *         integer and is multiplied by scalar/65536 after conversion to float
+ *
+ * The tohost template parameter is not used by this overload. The trailing
+ * defaulted pointer parameter only selects this overload for type == float.
+ */
+template <typename type, tohost32_type tohost>
+inline void convert_sc12_item32_3_to_star_4
+(
+    const item32_sc12_3x &input,
+    std::complex<type> *out,
+    double scalar,
+    typename std::enable_if<std::is_same<type, float>::value>::type* = nullptr
+)
+{
+    const __m128i mask = _mm_set_epi32(SC12_SHIFT_MASK);
+    const __m128i shuffle1 = _mm_set_epi8(SC12_PACK_SHUFFLE1);
+
+    // One sample per 16-bit lane: I0..I3 in the low half (upper 12 bits of
+    // each lane), Q0..Q3 in the high half (lower 12 bits of each lane).
+    __m128i samples = load_sc12_item32_3x_reversed(input);
+    samples = _mm_shuffle_epi8(samples, shuffle1);
+    samples = _mm_and_si128(samples, mask);
+
+    // Widen to 32 bits by placing each lane in the upper half of a dword;
+    // Q needs 4 more bits of left shift to become high-aligned like I.
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i i32 = _mm_unpacklo_epi16(zero, samples);
+    const __m128i q32 = _mm_slli_epi32(_mm_unpackhi_epi16(zero, samples), 4);
+
+    // Interleave to I0,Q0,I1,Q1 and I2,Q2,I3,Q3.
+    const __m128i iq01 = _mm_unpacklo_epi32(i32, q32);
+    const __m128i iq23 = _mm_unpackhi_epi32(i32, q32);
+
+    const __m128 scale = _mm_set_ps1(scalar/(1 << 16));
+    const __m128 f01 = _mm_mul_ps(_mm_cvtepi32_ps(iq01), scale);
+    const __m128 f23 = _mm_mul_ps(_mm_cvtepi32_ps(iq23), scale);
+
+    _mm_storeu_ps(reinterpret_cast<float*>(&out[0]), f01);
+    _mm_storeu_ps(reinterpret_cast<float*>(&out[2]), f23);
+}
+
+/*
+ * Unpack one 12-byte sc12 item (four complex 12-bit samples) into four
+ * std::complex<short> values.
+ *
+ * input: the packed item to read
+ * out:   destination; 16 bytes (four complex shorts) are written with an
+ *        unaligned store. Each 12-bit value lands in the upper 12 bits of
+ *        its 16-bit slot.
+ *
+ * The scale argument is ignored and the tohost template parameter is not
+ * used by this overload. The trailing defaulted pointer parameter only
+ * selects this overload for type == short.
+ */
+template <typename type, tohost32_type tohost>
+inline void convert_sc12_item32_3_to_star_4
+(
+    const item32_sc12_3x &input,
+    std::complex<type> *out,
+    double,
+    typename std::enable_if<std::is_same<type, short>::value>::type* = nullptr
+)
+{
+    const __m128i mask = _mm_set_epi32(SC12_SHIFT_MASK);
+    const __m128i shuffle1 = _mm_set_epi8(SC12_PACK_SHUFFLE1);
+    const __m128i shuffle2 = _mm_set_epi8(SC12_PACK_SHUFFLE2);
+
+    // One sample per 16-bit lane: I0..I3 in the low half (already
+    // high-aligned), Q0..Q3 in the high half (low-aligned).
+    __m128i samples = load_sc12_item32_3x_reversed(input);
+    samples = _mm_shuffle_epi8(samples, shuffle1);
+    samples = _mm_and_si128(samples, mask);
+
+    // High-align Q by shifting every lane left 4 bits; only the high half of
+    // the shifted register is kept.
+    const __m128i shifted = _mm_slli_epi16(samples, 4);
+    // Move the untouched I half into the upper 64 bits so unpackhi can pick it.
+    const __m128i i_high = _mm_shuffle_epi32(samples, _MM_SHUFFLE(1, 0, 0, 0));
+    // low 64 bits = I0..I3, high 64 bits = Q0..Q3 (both high-aligned)
+    const __m128i deinterleaved = _mm_unpackhi_epi64(i_high, shifted);
+    // I0,Q0,I1,Q1,I2,Q2,I3,Q3
+    const __m128i interleaved = _mm_shuffle_epi8(deinterleaved, shuffle2);
+
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(out), interleaved);
+}
+
+/*
+ * Converter from packed sc12 wire items (one input) to std::complex<type>
+ * host samples (one output).
+ *
+ * Samples are handled in groups of four per item32_sc12_3x. Partial groups at
+ * the start (head) and end (tail) go through the overload that takes four
+ * individual output references, with unused slots pointed at a throw-away
+ * value; full groups go through the pointer-taking overload.
+ */
+template <typename type, tohost32_type tohost>
+struct convert_sc12_item32_1_to_star_2 : public converter
+{
+    convert_sc12_item32_1_to_star_2(void):_scalar(0.0)
+    {
+        //NOP
+    }
+
+    // Stores scalar/16; that stored value is what gets handed to the
+    // per-item conversion calls.
+    void set_scalar(const double scalar)
+    {
+        _scalar = scalar/16;
+    }
+
+    // Convert nsamps samples from inputs[0] into outputs[0].
+    // NOTE(review): if head_samps exceeds nsamps, the head step writes more
+    // than nsamps outputs and (nsamps - o) wraps around - confirm callers
+    // never pass such short buffers.
+    void operator()(const input_type &inputs, const output_type &outputs, const size_t nsamps)
+    {
+        // The low two address bits are taken as the number of head samples;
+        // presumably they encode where in a 12-byte group the buffer starts.
+        const size_t in_addr = size_t(inputs[0]);
+        const size_t head_samps = in_addr & 0x3;
+
+        // Step back to the start of the enclosing item: 9, 6 or 3 bytes for
+        // 1, 2 or 3 head samples, nothing when there is no head.
+        const size_t rewind = (head_samps == 0) ? 0 : 3*(4 - head_samps);
+
+        const item32_sc12_3x *input = reinterpret_cast<const item32_sc12_3x *>(in_addr - rewind);
+        std::complex<type> *output = reinterpret_cast<std::complex<type> *>(outputs[0]);
+        std::complex<type> dummy;
+        size_t i = 0, o = 0;
+
+        //convert the head: the leading slots of the first item are discarded
+        if (head_samps == 1)
+        {
+            convert_sc12_item32_3_to_star_4<type, tohost>(input[i], dummy, dummy, dummy, output[0], _scalar);
+            i++;
+        }
+        else if (head_samps == 2)
+        {
+            convert_sc12_item32_3_to_star_4<type, tohost>(input[i], dummy, dummy, output[0], output[1], _scalar);
+            i++;
+        }
+        else if (head_samps == 3)
+        {
+            convert_sc12_item32_3_to_star_4<type, tohost>(input[i], dummy, output[0], output[1], output[2], _scalar);
+            i++;
+        }
+        o += head_samps;
+
+        //convert the body: whole items, four samples at a time
+        for (; o+3 < nsamps; i += 1, o += 4)
+        {
+            convert_sc12_item32_3_to_star_4<type, tohost>(input[i], &output[o], _scalar);
+        }
+
+        //convert the tail: the trailing slots of the last item are discarded
+        const size_t tail_samps = nsamps - o;
+        if (tail_samps == 1)
+        {
+            convert_sc12_item32_3_to_star_4<type, tohost>(input[i], output[o+0], dummy, dummy, dummy, _scalar);
+        }
+        else if (tail_samps == 2)
+        {
+            convert_sc12_item32_3_to_star_4<type, tohost>(input[i], output[o+0], output[o+1], dummy, dummy, _scalar);
+        }
+        else if (tail_samps == 3)
+        {
+            convert_sc12_item32_3_to_star_4<type, tohost>(input[i], output[o+0], output[o+1], output[o+2], dummy, _scalar);
+        }
+    }
+
+    double _scalar;
+};
+
+// Factory: builds the unpacker instantiated for float host samples with uhd::wtohx.
+static converter::sptr make_convert_sc12_item32_le_1_to_fc32_1(void)
+{
+    typedef convert_sc12_item32_1_to_star_2<float, uhd::wtohx> unpacker_type;
+    converter::sptr unpacker(new unpacker_type());
+    return unpacker;
+}
+
+// Factory: builds the unpacker instantiated for short host samples with uhd::wtohx.
+static converter::sptr make_convert_sc12_item32_le_1_to_sc16_1(void)
+{
+    typedef convert_sc12_item32_1_to_star_2<short, uhd::wtohx> unpacker_type;
+    converter::sptr unpacker(new unpacker_type());
+    return unpacker;
+}
+
+// Registers the two unpack converters defined in this file at PRIORITY_SIMD.
+UHD_STATIC_BLOCK(register_sse_unpack_sc12)
+{
+    // Fields common to both registrations: one input, one output, same wire format.
+    uhd::convert::id_type id;
+    id.num_outputs = 1;
+    id.num_inputs = 1;
+    id.input_format = "sc12_item32_le";
+
+    // sc12 wire items -> fc32 host samples
+    id.output_format = "fc32";
+    uhd::convert::register_converter(id, &make_convert_sc12_item32_le_1_to_fc32_1, PRIORITY_SIMD);
+
+    // sc12 wire items -> sc16 host samples
+    id.output_format = "sc16";
+    uhd::convert::register_converter(id, &make_convert_sc12_item32_le_1_to_sc16_1, PRIORITY_SIMD);
+}
author	Tom Tsou <tom.tsou@ettus.com>	2017-07-07 15:32:20 -0700
committer	Martin Braun <martin.braun@ettus.com>	2017-07-25 10:15:37 -0700
commit	0e9f204029e5eac51d94f16ceb19f003e3faf7e8 (patch)
tree	ad71a66f4aa84fc17e965d8682741584f64aa18d
parent	8223a289727bbda353bd7129512daf00d46d898c (diff)
download	uhd-0e9f204029e5eac51d94f16ceb19f003e3faf7e8.tar.gz uhd-0e9f204029e5eac51d94f16ceb19f003e3faf7e8.tar.bz2 uhd-0e9f204029e5eac51d94f16ceb19f003e3faf7e8.zip