From 0e9f204029e5eac51d94f16ceb19f003e3faf7e8 Mon Sep 17 00:00:00 2001 From: Tom Tsou Date: Fri, 7 Jul 2017 15:32:20 -0700 Subject: convert: Add SSE implementation for sc12 Implementation uses SSSE3 instructions to perform 12-bit sample pack/unpack operations to/from standard 16 and 32 bit host values. Input/output shuffle orderings for a single 128-bit SSE register with 16-bit integers are shown below. 16-bit interleaved I/Q --------------------------------------- | Q3 | I3 | Q2 | I2 | Q1 | I1 | Q0 | I0 | Input --------------------------------------- | 127 0 | 12-bit packed I/Q byteswapped ----------------------- | I0 | Q0 | I1 | 0 |-----------------------| | I1 | Q1 | I2 | Q2 | Output |-----------------------| | Q2 | I3 | Q3 | |-----------------------| | Unused | 3 ----------------------- 31 0 Fixes: #1740, #966 Related: #1739 --- host/lib/convert/CMakeLists.txt | 17 +++ host/lib/convert/convert_pack_sc12.cpp | 116 +-------------- host/lib/convert/convert_pack_sc12.hpp | 123 ++++++++++++++++ host/lib/convert/convert_unpack_sc12.cpp | 99 +------------ host/lib/convert/convert_unpack_sc12.hpp | 112 ++++++++++++++ host/lib/convert/ssse3_pack_sc12.cpp | 244 +++++++++++++++++++++++++++++++ host/lib/convert/ssse3_unpack_sc12.cpp | 219 +++++++++++++++++++++++++++ 7 files changed, 719 insertions(+), 211 deletions(-) create mode 100644 host/lib/convert/convert_pack_sc12.hpp create mode 100644 host/lib/convert/convert_unpack_sc12.hpp create mode 100644 host/lib/convert/ssse3_pack_sc12.cpp create mode 100644 host/lib/convert/ssse3_unpack_sc12.cpp diff --git a/host/lib/convert/CMakeLists.txt b/host/lib/convert/CMakeLists.txt index 10376ba9c..cfd3c7f34 100644 --- a/host/lib/convert/CMakeLists.txt +++ b/host/lib/convert/CMakeLists.txt @@ -26,6 +26,7 @@ MESSAGE(STATUS "") ######################################################################## IF(CMAKE_COMPILER_IS_GNUCXX) SET(EMMINTRIN_FLAGS -msse2) + SET(TMMINTRIN_FLAGS -mssse3) ELSEIF(MSVC) SET(EMMINTRIN_FLAGS /arch:SSE2) ENDIF() 
@@ -34,6 +35,10 @@ SET(CMAKE_REQUIRED_FLAGS ${EMMINTRIN_FLAGS}) CHECK_INCLUDE_FILE_CXX(emmintrin.h HAVE_EMMINTRIN_H) SET(CMAKE_REQUIRED_FLAGS) +SET(CMAKE_REQUIRED_FLAGS ${TMMINTRIN_FLAGS}) +CHECK_INCLUDE_FILE_CXX(tmmintrin.h HAVE_TMMINTRIN_H) +SET(CMAKE_REQUIRED_FLAGS) + IF(HAVE_EMMINTRIN_H) SET(convert_with_sse2_sources ${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc16_to_sc16.cpp @@ -53,6 +58,18 @@ IF(HAVE_EMMINTRIN_H) LIBUHD_APPEND_SOURCES(${convert_with_sse2_sources}) ENDIF(HAVE_EMMINTRIN_H) +IF(HAVE_TMMINTRIN_H) + SET(convert_with_ssse3_sources + ${CMAKE_CURRENT_SOURCE_DIR}/ssse3_pack_sc12.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/ssse3_unpack_sc12.cpp + ) + SET_SOURCE_FILES_PROPERTIES( + ${convert_with_ssse3_sources} + PROPERTIES COMPILE_FLAGS "${TMMINTRIN_FLAGS}" + ) + LIBUHD_APPEND_SOURCES(${convert_with_ssse3_sources}) +ENDIF(HAVE_TMMINTRIN_H) + ######################################################################## # Check for NEON SIMD headers ######################################################################## diff --git a/host/lib/convert/convert_pack_sc12.cpp b/host/lib/convert/convert_pack_sc12.cpp index 2e45e19f5..85194dcdd 100644 --- a/host/lib/convert/convert_pack_sc12.cpp +++ b/host/lib/convert/convert_pack_sc12.cpp @@ -1,5 +1,5 @@ // -// Copyright 2013 Ettus Research LLC +// Copyright 2017 Ettus Research LLC // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by @@ -15,122 +15,10 @@ // along with this program. If not, see . // -#include "convert_common.hpp" -#include -#include -#include -#include -#include +#include "convert_pack_sc12.hpp" using namespace uhd::convert; -typedef uint32_t (*towire32_type)(uint32_t); - -/* C language specification requires this to be packed - * (i.e., line0, line1, line2 will be in adjacent memory locations). - * If this was not true, we'd need compiler flags here to specify - * alignment/packing. 
- */ -struct item32_sc12_3x -{ - item32_t line0; - item32_t line1; - item32_t line2; -}; - -enum item32_sc12_3x_enable { - CONVERT12_LINE0 = 0x01, - CONVERT12_LINE1 = 0x02, - CONVERT12_LINE2 = 0x04, - CONVERT12_LINE_ALL = 0x07, -}; - -/* - * Packed 12-bit converter with selective line enable - * - * The converter operates on 4 complex inputs and selectively writes to one to - * three 32-bit lines. Line selection allows for partial writes of less than - * 4 complex samples, or a full 3 x 32-bit struct. Writes are always full 32-bit - * lines, so in the case of partial writes, the number of bytes written will - * exceed the the number of bytes filled by actual samples. - * - * _ _ _ _ _ _ _ _ - * |_ _ _1_ _ _|_ _| 0 - * |_2_ _ _|_ _ _3_| - * |_ _|_ _ _4_ _ _| 2 - * 31 0 - */ -template -inline void pack(item32_sc12_3x &output, int enable, const int32_t i[4], const int32_t q[4]) -{ - if (enable & CONVERT12_LINE0) - output.line0 = towire(i[0] << 20 | q[0] << 8 | i[1] >> 4); - if (enable & CONVERT12_LINE1) - output.line1 = towire(i[1] << 28 | q[1] << 16 | i[2] << 4 | q[2] >> 8); - if (enable & CONVERT12_LINE2) - output.line2 = towire(q[2] << 24 | i[3] << 12 | q[3]); -} - -template -void convert_star_4_to_sc12_item32_3 -( - const std::complex &in0, - const std::complex &in1, - const std::complex &in2, - const std::complex &in3, - const int enable, - item32_sc12_3x &output, - const double scalar, - typename std::enable_if::value>::type* = NULL -) -{ - int32_t i[4] { - int32_t(in0.real()*scalar) & 0xfff, - int32_t(in1.real()*scalar) & 0xfff, - int32_t(in2.real()*scalar) & 0xfff, - int32_t(in3.real()*scalar) & 0xfff, - }; - - int32_t q[4] { - int32_t(in0.imag()*scalar) & 0xfff, - int32_t(in1.imag()*scalar) & 0xfff, - int32_t(in2.imag()*scalar) & 0xfff, - int32_t(in3.imag()*scalar) & 0xfff, - }; - - pack(output, enable, i, q); -} - -template -void convert_star_4_to_sc12_item32_3 -( - const std::complex &in0, - const std::complex &in1, - const std::complex &in2, - const 
std::complex &in3, - const int enable, - item32_sc12_3x &output, - const double, - typename std::enable_if::value>::type* = NULL -) -{ - int32_t i[4] { - int32_t(in0.real() >> 4) & 0xfff, - int32_t(in1.real() >> 4) & 0xfff, - int32_t(in2.real() >> 4) & 0xfff, - int32_t(in3.real() >> 4) & 0xfff, - }; - - int32_t q[4] { - int32_t(in0.imag() >> 4) & 0xfff, - int32_t(in1.imag() >> 4) & 0xfff, - int32_t(in2.imag() >> 4) & 0xfff, - int32_t(in3.imag() >> 4) & 0xfff, - }; - - pack(output, enable, i, q); -} - template struct convert_star_1_to_sc12_item32_1 : public converter { diff --git a/host/lib/convert/convert_pack_sc12.hpp b/host/lib/convert/convert_pack_sc12.hpp new file mode 100644 index 000000000..754c47cd2 --- /dev/null +++ b/host/lib/convert/convert_pack_sc12.hpp @@ -0,0 +1,123 @@ +// +// Copyright 2017 Ettus Research LLC +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . +// + +#include +#include +#include "convert_common.hpp" + +using namespace uhd::convert; + +typedef uint32_t (*towire32_type)(uint32_t); + +/* C language specification requires this to be packed + * (i.e., line0, line1, line2 will be in adjacent memory locations). + * If this was not true, we'd need compiler flags here to specify + * alignment/packing. 
+ */ +struct item32_sc12_3x +{ + item32_t line0; + item32_t line1; + item32_t line2; +}; + +enum item32_sc12_3x_enable { + CONVERT12_LINE0 = 0x01, + CONVERT12_LINE1 = 0x02, + CONVERT12_LINE2 = 0x04, + CONVERT12_LINE_ALL = 0x07, +}; + +/* + * Packed 12-bit converter with selective line enable + * + * The converter operates on 4 complex inputs and selectively writes to one to + * three 32-bit lines. Line selection allows for partial writes of less than + * 4 complex samples, or a full 3 x 32-bit struct. Writes are always full 32-bit + * lines, so in the case of partial writes, the number of bytes written will + * exceed the number of bytes filled by actual samples. + * + * _ _ _ _ _ _ _ _ + * |_ _ _1_ _ _|_ _| 0 + * |_2_ _ _|_ _ _3_| + * |_ _|_ _ _4_ _ _| 2 + * 31 0 + */ +template +void pack(item32_sc12_3x &output, int enable, const int32_t iq[8]) +{ + if (enable & CONVERT12_LINE0) + output.line0 = towire(iq[0] << 20 | iq[1] << 8 | iq[2] >> 4); + if (enable & CONVERT12_LINE1) + output.line1 = towire(iq[2] << 28 | iq[3] << 16 | iq[4] << 4 | iq[5] >> 8); + if (enable & CONVERT12_LINE2) + output.line2 = towire(iq[5] << 24 | iq[6] << 12 | iq[7] << 0); +} + +template +void convert_star_4_to_sc12_item32_3 +( + const std::complex &in0, + const std::complex &in1, + const std::complex &in2, + const std::complex &in3, + const int enable, + item32_sc12_3x &output, + const double scalar, + typename std::enable_if::value>::type* = NULL +) +{ + int32_t iq[8] { + int32_t(in0.real()*scalar) & 0xfff, + int32_t(in0.imag()*scalar) & 0xfff, + int32_t(in1.real()*scalar) & 0xfff, + int32_t(in1.imag()*scalar) & 0xfff, + + int32_t(in2.real()*scalar) & 0xfff, + int32_t(in2.imag()*scalar) & 0xfff, + int32_t(in3.real()*scalar) & 0xfff, + int32_t(in3.imag()*scalar) & 0xfff, + }; + pack(output, enable, iq); +} + +template +void convert_star_4_to_sc12_item32_3 +( + const std::complex &in0, + const std::complex &in1, + const std::complex &in2, + const std::complex &in3, + const int enable, + 
item32_sc12_3x &output, + const double, + typename std::enable_if::value>::type* = NULL +) +{ + int32_t iq[8] { + int32_t(in0.real() >> 4) & 0xfff, + int32_t(in0.imag() >> 4) & 0xfff, + int32_t(in1.real() >> 4) & 0xfff, + int32_t(in1.imag() >> 4) & 0xfff, + + int32_t(in2.real() >> 4) & 0xfff, + int32_t(in2.imag() >> 4) & 0xfff, + int32_t(in3.real() >> 4) & 0xfff, + int32_t(in3.imag() >> 4) & 0xfff, + }; + pack(output, enable, iq); +} diff --git a/host/lib/convert/convert_unpack_sc12.cpp b/host/lib/convert/convert_unpack_sc12.cpp index 07f9cffa0..43c35ee3b 100644 --- a/host/lib/convert/convert_unpack_sc12.cpp +++ b/host/lib/convert/convert_unpack_sc12.cpp @@ -1,5 +1,5 @@ // -// Copyright 2013 Ettus Research LLC +// Copyright 2017 Ettus Research LLC // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by @@ -15,105 +15,10 @@ // along with this program. If not, see . // -#include "convert_common.hpp" -#include -#include -#include -#include -#include +#include "convert_unpack_sc12.hpp" using namespace uhd::convert; -typedef uint32_t (*tohost32_type)(uint32_t); - -/* C language specification requires this to be packed - * (i.e., line0, line1, line2 will be in adjacent memory locations). - * If this was not true, we'd need compiler flags here to specify - * alignment/packing. - */ -struct item32_sc12_3x -{ - item32_t line0; - item32_t line1; - item32_t line2; -}; - -/* - * convert_sc12_item32_3_to_star_4 takes in 3 lines with 32 bit each - * and converts them 4 samples of type 'std::complex'. - * The structure of the 3 lines is as follows: - * _ _ _ _ _ _ _ _ - * |_ _ _1_ _ _|_ _| - * |_2_ _ _|_ _ _3_| - * |_ _|_ _ _4_ _ _| - * - * The numbers mark the position of one complex sample. 
- */ -template -void convert_sc12_item32_3_to_star_4 -( - const item32_sc12_3x &input, - std::complex &out0, - std::complex &out1, - std::complex &out2, - std::complex &out3, - const double scalar, - typename std::enable_if::value>::type* = NULL -) -{ - //step 0: extract the lines from the input buffer - const item32_t line0 = tohost(input.line0); - const item32_t line1 = tohost(input.line1); - const item32_t line2 = tohost(input.line2); - const uint64_t line01 = (uint64_t(line0) << 32) | line1; - const uint64_t line12 = (uint64_t(line1) << 32) | line2; - - //step 1: shift out and mask off the individual numbers - const type i0 = type(int16_t((line0 >> 16) & 0xfff0)*scalar); - const type q0 = type(int16_t((line0 >> 4) & 0xfff0)*scalar); - - const type i1 = type(int16_t((line01 >> 24) & 0xfff0)*scalar); - const type q1 = type(int16_t((line1 >> 12) & 0xfff0)*scalar); - - const type i2 = type(int16_t((line1 >> 0) & 0xfff0)*scalar); - const type q2 = type(int16_t((line12 >> 20) & 0xfff0)*scalar); - - const type i3 = type(int16_t((line2 >> 8) & 0xfff0)*scalar); - const type q3 = type(int16_t((line2 << 4) & 0xfff0)*scalar); - - //step 2: load the outputs - out0 = std::complex(i0, q0); - out1 = std::complex(i1, q1); - out2 = std::complex(i2, q2); - out3 = std::complex(i3, q3); -} - -template -void convert_sc12_item32_3_to_star_4 -( - const item32_sc12_3x &input, - std::complex &out0, - std::complex &out1, - std::complex &out2, - std::complex &out3, - const double, - typename std::enable_if::value>::type* = NULL -) -{ - //step 0: extract the lines from the input buffer - const item32_t line0 = tohost(input.line0); - const item32_t line1 = tohost(input.line1); - const item32_t line2 = tohost(input.line2); - const uint64_t line01 = (uint64_t(line0) << 32) | line1; - const uint64_t line12 = (uint64_t(line1) << 32) | line2; - - //step 1: extract and load the outputs - out0 = std::complex(line0 >> 16 & 0xfff0, line0 >> 4 & 0xfff0); - out1 = std::complex(line01 >> 24 & 0xfff0, 
line1 >> 12 & 0xfff0); - out2 = std::complex(line1 >> 0 & 0xfff0, line12 >> 20 & 0xfff0); - out3 = std::complex(line2 >> 8 & 0xfff0, line2 << 4 & 0xfff0); -} - template struct convert_sc12_item32_1_to_star_1 : public converter { diff --git a/host/lib/convert/convert_unpack_sc12.hpp b/host/lib/convert/convert_unpack_sc12.hpp new file mode 100644 index 000000000..46e7d58fb --- /dev/null +++ b/host/lib/convert/convert_unpack_sc12.hpp @@ -0,0 +1,112 @@ +// +// Copyright 2017 Ettus Research LLC +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . +// + +#include +#include +#include "convert_common.hpp" + +using namespace uhd::convert; + +typedef uint32_t (*tohost32_type)(uint32_t); + +/* C language specification requires this to be packed + * (i.e., line0, line1, line2 will be in adjacent memory locations). + * If this was not true, we'd need compiler flags here to specify + * alignment/packing. + */ +struct item32_sc12_3x +{ + item32_t line0; + item32_t line1; + item32_t line2; +}; + +/* + * convert_sc12_item32_3_to_star_4 takes in 3 lines with 32 bit each + * and converts them to 4 samples of type 'std::complex'. + * The structure of the 3 lines is as follows: + * _ _ _ _ _ _ _ _ + * |_ _ _1_ _ _|_ _| + * |_2_ _ _|_ _ _3_| + * |_ _|_ _ _4_ _ _| + * + * The numbers mark the position of one complex sample. 
+ */ +template +void convert_sc12_item32_3_to_star_4 +( + const item32_sc12_3x &input, + std::complex &out0, + std::complex &out1, + std::complex &out2, + std::complex &out3, + const double scalar, + typename std::enable_if::value>::type* = NULL +) +{ + //step 0: extract the lines from the input buffer + const item32_t line0 = tohost(input.line0); + const item32_t line1 = tohost(input.line1); + const item32_t line2 = tohost(input.line2); + const uint64_t line01 = (uint64_t(line0) << 32) | line1; + const uint64_t line12 = (uint64_t(line1) << 32) | line2; + + //step 1: shift out and mask off the individual numbers + const type i0 = type(int16_t((line0 >> 16) & 0xfff0)*scalar); + const type q0 = type(int16_t((line0 >> 4) & 0xfff0)*scalar); + + const type i1 = type(int16_t((line01 >> 24) & 0xfff0)*scalar); + const type q1 = type(int16_t((line1 >> 12) & 0xfff0)*scalar); + + const type i2 = type(int16_t((line1 >> 0) & 0xfff0)*scalar); + const type q2 = type(int16_t((line12 >> 20) & 0xfff0)*scalar); + + const type i3 = type(int16_t((line2 >> 8) & 0xfff0)*scalar); + const type q3 = type(int16_t((line2 << 4) & 0xfff0)*scalar); + + //step 2: load the outputs + out0 = std::complex(i0, q0); + out1 = std::complex(i1, q1); + out2 = std::complex(i2, q2); + out3 = std::complex(i3, q3); +} + +template +void convert_sc12_item32_3_to_star_4 +( + const item32_sc12_3x &input, + std::complex &out0, + std::complex &out1, + std::complex &out2, + std::complex &out3, + const double, + typename std::enable_if::value>::type* = NULL +) +{ + //step 0: extract the lines from the input buffer + const item32_t line0 = tohost(input.line0); + const item32_t line1 = tohost(input.line1); + const item32_t line2 = tohost(input.line2); + const uint64_t line01 = (uint64_t(line0) << 32) | line1; + const uint64_t line12 = (uint64_t(line1) << 32) | line2; + + //step 1: extract and load the outputs + out0 = std::complex(line0 >> 16 & 0xfff0, line0 >> 4 & 0xfff0); + out1 = std::complex(line01 >> 24 & 0xfff0, 
line1 >> 12 & 0xfff0); + out2 = std::complex(line1 >> 0 & 0xfff0, line12 >> 20 & 0xfff0); + out3 = std::complex(line2 >> 8 & 0xfff0, line2 << 4 & 0xfff0); +} diff --git a/host/lib/convert/ssse3_pack_sc12.cpp b/host/lib/convert/ssse3_pack_sc12.cpp new file mode 100644 index 000000000..42c429b67 --- /dev/null +++ b/host/lib/convert/ssse3_pack_sc12.cpp @@ -0,0 +1,244 @@ +// +// Copyright 2017 Ettus Research LLC +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . 
+// + +#include +#include "convert_pack_sc12.hpp" + +/* + * Shuffle Orderings - Single 128-bit SSE register + * + * 16-bit interleaved I/Q + * --------------------------------------- + * | Q3 | I3 | Q2 | I2 | Q1 | I1 | Q0 | I0 | Input + * --------------------------------------- + * | 127 0 | + * + * + * 12-bit deinterleaved unpacked I/Q + * --------------------------------------- + * | I3 | I2 | I1 | I0 | Q3 | Q2 | Q1 | Q0 | Shuffle-1 + * --------------------------------------- + * | High bit aligned | 4-bit >> offset | + * + * + * 12-bit interleaved packed I/Q + * --------------------------------------- + * |I0|Q0|I1|Q1|I2|Q2|I3|Q3| | Shuffle-2 | Shuffle-3 + * --------------------------------------- + * | 127 32 | 31 Empty 0 | + * + * + * 12-bit packed I/Q byteswapped + * ----------------------- + * | I0 | Q0 | I1 | 0 + * |-----------------------| + * | I1 | Q1 | I2 | Q2 | Output + * |-----------------------| + * | Q2 | I3 | Q3 | + * |-----------------------| + * | Unused | 3 + * ----------------------- + * 31 0 + */ +#define SC12_SHIFT_MASK 0xfff0fff0, 0xfff0fff0, 0x0fff0fff, 0x0fff0fff +#define SC12_PACK_SHUFFLE1 13,12,9,8,5,4,1,0,15,14,11,10,7,6,3,2 +#define SC12_PACK_SHUFFLE2 9,8,0,11,10,2,13,12,4,15,14,6,0,0,0,0 +#define SC12_PACK_SHUFFLE3 8,1,8,8,3,8,8,5,8,8,7,8,8,8,8,8 + +template +inline void convert_star_4_to_sc12_item32_3 +( + const std::complex *in, + item32_sc12_3x &output, + const double scalar, + typename std::enable_if::value>::type* = NULL +) +{ + __m128 m0, m1, m2; + m0 = _mm_set1_ps(scalar); + m1 = _mm_loadu_ps((const float *) &in[0]); + m2 = _mm_loadu_ps((const float *) &in[2]); + m1 = _mm_mul_ps(m1, m0); + m2 = _mm_mul_ps(m2, m0); + m0 = _mm_shuffle_ps(m1, m2, _MM_SHUFFLE(2, 0, 2, 0)); + m1 = _mm_shuffle_ps(m1, m2, _MM_SHUFFLE(3, 1, 3, 1)); + + __m128i m3, m4, m5, m6, m7; + m3 = _mm_set_epi32(SC12_SHIFT_MASK); + m4 = _mm_set_epi8(SC12_PACK_SHUFFLE2); + m5 = _mm_set_epi8(SC12_PACK_SHUFFLE3); + + m6 = _mm_cvtps_epi32(m0); + m7 = 
_mm_cvtps_epi32(m1); + m6 = _mm_slli_epi32(m6, 4); + m6 = _mm_packs_epi32(m7, m6); + m6 = _mm_and_si128(m6, m3); + m7 = _mm_move_epi64(m6); + + m6 = _mm_shuffle_epi8(m6, m4); + m7 = _mm_shuffle_epi8(m7, m5); + m6 = _mm_or_si128(m6, m7); + + m6 = _mm_shuffle_epi32(m6, _MM_SHUFFLE(0, 1, 2, 3)); + _mm_storeu_si128((__m128i*) &output, m6); +} + +template +static void convert_star_4_to_sc12_item32_3 +( + const std::complex *in, + item32_sc12_3x &output, + const double, + typename std::enable_if::value>::type* = NULL +) +{ + __m128i m0, m1, m2, m3, m4, m5; + m0 = _mm_set_epi32(SC12_SHIFT_MASK); + m1 = _mm_set_epi8(SC12_PACK_SHUFFLE1); + m2 = _mm_set_epi8(SC12_PACK_SHUFFLE2); + m3 = _mm_set_epi8(SC12_PACK_SHUFFLE3); + + m4 = _mm_loadu_si128((__m128i*) in); + m4 = _mm_shuffle_epi8(m4, m1); + m5 = _mm_srli_epi16(m4, 4); + m4 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(0, 0, 3, 2)); + m4 = _mm_unpacklo_epi64(m5, m4); + + m4 = _mm_and_si128(m4, m0); + m5 = _mm_move_epi64(m4); + m4 = _mm_shuffle_epi8(m4, m2); + m5 = _mm_shuffle_epi8(m5, m3); + m3 = _mm_or_si128(m4, m5); + + m3 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(0, 1, 2, 3)); + _mm_storeu_si128((__m128i*) &output, m3); +} + +template +struct convert_star_1_to_sc12_item32_2 : public converter +{ + convert_star_1_to_sc12_item32_2(void):_scalar(0.0) + { + } + + void set_scalar(const double scalar) + { + _scalar = scalar; + } + + void operator()(const input_type &inputs, const output_type &outputs, const size_t nsamps) + { + const std::complex *input = reinterpret_cast *>(inputs[0]); + + const size_t head_samps = size_t(outputs[0]) & 0x3; + int enable; + size_t rewind = 0; + switch(head_samps) + { + case 0: break; + case 1: rewind = 9; break; + case 2: rewind = 6; break; + case 3: rewind = 3; break; + } + item32_sc12_3x *output = reinterpret_cast(size_t(outputs[0]) - rewind); + + //helper variables + size_t i = 0, o = 0; + + //handle the head case + switch (head_samps) + { + case 0: + break; //no head + case 1: + enable = CONVERT12_LINE2; 
+ convert_star_4_to_sc12_item32_3(0, 0, 0, input[0], enable, output[o++], _scalar); + break; + case 2: + enable = CONVERT12_LINE2 | CONVERT12_LINE1; + convert_star_4_to_sc12_item32_3(0, 0, input[0], input[1], enable, output[o++], _scalar); + break; + case 3: + enable = CONVERT12_LINE2 | CONVERT12_LINE1 | CONVERT12_LINE0; + convert_star_4_to_sc12_item32_3(0, input[0], input[1], input[2], enable, output[o++], _scalar); + break; + } + i += head_samps; + + // SSE packed write output is 16 bytes which overwrites the 12-bit + // packed struct by 4 bytes. There is no concern if there are + // subsequent samples to be converted (writes will simply happen + // twice). So set the conversion loop to force a tail case on the + // final 4 or fewer samples. + while (i+4 < nsamps) + { + convert_star_4_to_sc12_item32_3(&input[i], output[o], _scalar); + o++; i += 4; + } + + //handle the tail case + const size_t tail_samps = nsamps - i; + switch (tail_samps) + { + case 0: + break; //no tail + case 1: + enable = CONVERT12_LINE0; + convert_star_4_to_sc12_item32_3(input[i+0], 0, 0, 0, enable, output[o], _scalar); + break; + case 2: + enable = CONVERT12_LINE0 | CONVERT12_LINE1; + convert_star_4_to_sc12_item32_3(input[i+0], input[i+1], 0, 0, enable, output[o], _scalar); + break; + case 3: + enable = CONVERT12_LINE0 | CONVERT12_LINE1 | CONVERT12_LINE2; + convert_star_4_to_sc12_item32_3(input[i+0], input[i+1], input[i+2], 0, enable, output[o], _scalar); + break; + case 4: + enable = CONVERT12_LINE_ALL; + convert_star_4_to_sc12_item32_3(input[i+0], input[i+1], input[i+2], input[i+3], enable, output[o], _scalar); + break; + } + } + + double _scalar; +}; + +static converter::sptr make_convert_fc32_1_to_sc12_item32_le_1(void) +{ + return converter::sptr(new convert_star_1_to_sc12_item32_2()); +} + +static converter::sptr make_convert_sc16_1_to_sc12_item32_le_1(void) +{ + return converter::sptr(new convert_star_1_to_sc12_item32_2()); +} + +UHD_STATIC_BLOCK(register_sse_pack_sc12) +{ + 
uhd::convert::id_type id; + id.num_inputs = 1; + id.num_outputs = 1; + + id.input_format = "fc32"; + id.output_format = "sc12_item32_le"; + uhd::convert::register_converter(id, &make_convert_fc32_1_to_sc12_item32_le_1, PRIORITY_SIMD); + + id.input_format = "sc16"; + id.output_format = "sc12_item32_le"; + uhd::convert::register_converter(id, &make_convert_sc16_1_to_sc12_item32_le_1, PRIORITY_SIMD); +} diff --git a/host/lib/convert/ssse3_unpack_sc12.cpp b/host/lib/convert/ssse3_unpack_sc12.cpp new file mode 100644 index 000000000..245e64ebc --- /dev/null +++ b/host/lib/convert/ssse3_unpack_sc12.cpp @@ -0,0 +1,219 @@ +// +// Copyright 2017 Ettus Research LLC +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . 
+// + +#include "convert_unpack_sc12.hpp" +#include +#include + +using namespace uhd::convert; + +/* + * Shuffle Orderings - Single 128-bit SSE register + * + * 12-bit packed I/Q byteswapped + * ----------------------- + * | I0 | Q0 | I1 | 0 + * |-----------------------| + * | I1 | Q1 | I2 | Q2 | Input + * |-----------------------| + * | Q2 | I3 | Q3 | 2 + * ----------------------- + * 31 0 + * + * + * 12-bit interleaved packed I/Q + * --------------------------------------- + * |I0|Q0|I1|Q1|I2|Q2|I3|Q3| | Byteswap Removed + * --------------------------------------- + * | 127 32 | 31 Empty 0 | + * + * + * Packed Unpacked + * Sample Index Index Offset + * ===================================== + * I0 15,14 0,1 0 + * Q0 14,13 8,9 4 + * I1 12,11 2,3 0 + * Q1 11,10 10,11 4 12-bit Indices + * I2 9,8 4,5 0 + * Q2 8,7 12,13 4 + * I3 6,5 6,7 0 + * Q3 5,4 14,15 4 + * + * + * 12-bit deinterleaved unpacked I/Q + * --------------------------------------- + * | Q3 | Q2 | Q1 | Q0 | I3 | I2 | I1 | I0 | Shuffle-1 + * --------------------------------------- + * | 4-bit >> offset | High bit aligned | + * + * + * 16-bit interleaved I/Q + * --------------------------------------- + * | Q3 | I3 | Q2 | I2 | Q1 | I1 | Q0 | I0 | Output (Shuffle-2) + * --------------------------------------- + * | 127 0 | + * + */ +#define SC12_SHIFT_MASK 0x0fff0fff, 0x0fff0fff, 0xfff0fff0, 0xfff0fff0 +#define SC12_PACK_SHUFFLE1 5,4,8,7,11,10,14,13,6,5,9,8,12,11,15,14 +#define SC12_PACK_SHUFFLE2 15,14,7,6,13,12,5,4,11,10,3,2,9,8,1,0 + +template +inline void convert_sc12_item32_3_to_star_4 +( + const item32_sc12_3x &input, + std::complex *out, + double scalar, + typename std::enable_if::value>::type* = NULL +) +{ + __m128i m0, m1, m2, m3, m4; + m0 = _mm_set_epi32(SC12_SHIFT_MASK); + m1 = _mm_set_epi8(SC12_PACK_SHUFFLE1); + m2 = _mm_loadu_si128((__m128i*) &input); + m2 = _mm_shuffle_epi32(m2, _MM_SHUFFLE(0, 1, 2, 3)); + m3 = _mm_shuffle_epi8(m2, m1); + m3 = _mm_and_si128(m3, m0); + + m4 = _mm_setzero_si128(); 
+ m1 = _mm_unpacklo_epi16(m4, m3); + m2 = _mm_unpackhi_epi16(m4, m3); + m2 = _mm_slli_epi32(m2, 4); + m3 = _mm_unpacklo_epi32(m1, m2); + m4 = _mm_unpackhi_epi32(m1, m2); + + __m128 m5, m6, m7; + m5 = _mm_set_ps1(scalar/(1 << 16)); + m6 = _mm_cvtepi32_ps(m3); + m7 = _mm_cvtepi32_ps(m4); + m6 = _mm_mul_ps(m6, m5); + m7 = _mm_mul_ps(m7, m5); + + _mm_storeu_ps(reinterpret_cast(&out[0]), m6); + _mm_storeu_ps(reinterpret_cast(&out[2]), m7); +} + +template +inline void convert_sc12_item32_3_to_star_4 +( + const item32_sc12_3x &input, + std::complex *out, + double, + typename std::enable_if::value>::type* = NULL +) +{ + __m128i m0, m1, m2, m3; + m0 = _mm_set_epi32(SC12_SHIFT_MASK); + m1 = _mm_set_epi8(SC12_PACK_SHUFFLE1); + m2 = _mm_set_epi8(SC12_PACK_SHUFFLE2); + + m3 = _mm_loadu_si128((__m128i*) &input); + m3 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(0, 1, 2, 3)); + m3 = _mm_shuffle_epi8(m3, m1); + m3 = _mm_and_si128(m3, m0); + + m0 = _mm_slli_epi16(m3, 4); + m1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(1, 0, 0, 0)); + m0 = _mm_unpackhi_epi64(m1, m0); + m1 = _mm_shuffle_epi8(m0, m2); + + _mm_storeu_si128((__m128i*) out, m1); +} + +template +struct convert_sc12_item32_1_to_star_2 : public converter +{ + convert_sc12_item32_1_to_star_2(void):_scalar(0.0) + { + //NOP + } + + void set_scalar(const double scalar) + { + const int unpack_growth = 16; + _scalar = scalar/unpack_growth; + } + + void operator()(const input_type &inputs, const output_type &outputs, const size_t nsamps) + { + const size_t head_samps = size_t(inputs[0]) & 0x3; + size_t rewind = 0; + switch(head_samps) + { + case 0: break; + case 1: rewind = 9; break; + case 2: rewind = 6; break; + case 3: rewind = 3; break; + } + + const item32_sc12_3x *input = reinterpret_cast(size_t(inputs[0]) - rewind); + std::complex *output = reinterpret_cast *>(outputs[0]); + std::complex dummy; + size_t i = 0, o = 0; + switch (head_samps) + { + case 0: break; //no head + case 1: convert_sc12_item32_3_to_star_4(input[i++], dummy, dummy, 
dummy, output[0], _scalar); break; + case 2: convert_sc12_item32_3_to_star_4(input[i++], dummy, dummy, output[0], output[1], _scalar); break; + case 3: convert_sc12_item32_3_to_star_4(input[i++], dummy, output[0], output[1], output[2], _scalar); break; + } + o += head_samps; + + //convert the body + while (o+3 < nsamps) + { + convert_sc12_item32_3_to_star_4(input[i], &output[o], _scalar); + i += 1; o += 4; + } + + const size_t tail_samps = nsamps - o; + switch (tail_samps) + { + case 0: break; //no tail + case 1: convert_sc12_item32_3_to_star_4(input[i], output[o+0], dummy, dummy, dummy, _scalar); break; + case 2: convert_sc12_item32_3_to_star_4(input[i], output[o+0], output[o+1], dummy, dummy, _scalar); break; + case 3: convert_sc12_item32_3_to_star_4(input[i], output[o+0], output[o+1], output[o+2], dummy, _scalar); break; + } + } + + double _scalar; +}; + +static converter::sptr make_convert_sc12_item32_le_1_to_fc32_1(void) +{ + return converter::sptr(new convert_sc12_item32_1_to_star_2()); +} + +static converter::sptr make_convert_sc12_item32_le_1_to_sc16_1(void) +{ + return converter::sptr(new convert_sc12_item32_1_to_star_2()); +} + +UHD_STATIC_BLOCK(register_sse_unpack_sc12) +{ + uhd::convert::id_type id; + id.num_inputs = 1; + id.num_outputs = 1; + id.output_format = "fc32"; + id.input_format = "sc12_item32_le"; + uhd::convert::register_converter(id, &make_convert_sc12_item32_le_1_to_fc32_1, PRIORITY_SIMD); + + id.output_format = "sc16"; + id.input_format = "sc12_item32_le"; + uhd::convert::register_converter(id, &make_convert_sc12_item32_le_1_to_sc16_1, PRIORITY_SIMD); +} -- cgit v1.2.3