diff options
author | Josh Blum <josh@joshknows.com> | 2012-04-27 17:23:18 -0700 |
---|---|---|
committer | Josh Blum <josh@joshknows.com> | 2012-05-09 23:54:03 -0700 |
commit | 4c244c78a75646ad4040fb328fa64bb752852512 (patch) | |
tree | 1261591e9c7cc112fcce73da697a236f1934127e | |
parent | 58bebf5511dffde44fb0e2a14edb3778e54aaf2b (diff) | |
download | uhd-4c244c78a75646ad4040fb328fa64bb752852512.tar.gz uhd-4c244c78a75646ad4040fb328fa64bb752852512.tar.bz2 uhd-4c244c78a75646ad4040fb328fa64bb752852512.zip |
convert: squashed converter and sse2 work
-rw-r--r-- | host/lib/convert/CMakeLists.txt | 13 | ||||
-rw-r--r-- | host/lib/convert/convert_common.hpp | 268 | ||||
-rw-r--r-- | host/lib/convert/convert_item32.cpp | 43 | ||||
-rw-r--r-- | host/lib/convert/convert_with_neon.cpp | 8 | ||||
-rw-r--r-- | host/lib/convert/gen_convert_general.py | 72 | ||||
-rw-r--r-- | host/lib/convert/sse2_fc32_to_sc16.cpp | 103 | ||||
-rw-r--r-- | host/lib/convert/sse2_fc32_to_sc8.cpp (renamed from host/lib/convert/convert_fc32_to_sc8_with_sse2.cpp) | 58 | ||||
-rw-r--r-- | host/lib/convert/sse2_fc64_to_sc16.cpp | 111 | ||||
-rw-r--r-- | host/lib/convert/sse2_fc64_to_sc8.cpp (renamed from host/lib/convert/convert_fc64_to_sc8_with_sse2.cpp) | 53 | ||||
-rw-r--r-- | host/lib/convert/sse2_sc16_to_fc32.cpp (renamed from host/lib/convert/convert_fc32_with_sse2.cpp) | 99 | ||||
-rw-r--r-- | host/lib/convert/sse2_sc16_to_fc64.cpp (renamed from host/lib/convert/convert_fc64_with_sse2.cpp) | 103 | ||||
-rw-r--r-- | host/lib/convert/sse2_sc8_to_fc32.cpp | 129 | ||||
-rw-r--r-- | host/lib/convert/sse2_sc8_to_fc64.cpp | 151 |
13 files changed, 714 insertions, 497 deletions
diff --git a/host/lib/convert/CMakeLists.txt b/host/lib/convert/CMakeLists.txt index c42a0a434..0d9d0983f 100644 --- a/host/lib/convert/CMakeLists.txt +++ b/host/lib/convert/CMakeLists.txt @@ -71,10 +71,14 @@ UNSET(CMAKE_REQUIRED_FLAGS) IF(HAVE_EMMINTRIN_H) SET(convert_with_sse2_sources - ${CMAKE_CURRENT_SOURCE_DIR}/convert_fc32_with_sse2.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/convert_fc64_with_sse2.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/convert_fc32_to_sc8_with_sse2.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/convert_fc64_to_sc8_with_sse2.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc16_to_fc64.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc16_to_fc32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc8_to_fc64.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc8_to_fc32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/sse2_fc64_to_sc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/sse2_fc32_to_sc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/sse2_fc64_to_sc8.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/sse2_fc32_to_sc8.cpp ) SET_SOURCE_FILES_PROPERTIES( ${convert_with_sse2_sources} @@ -117,4 +121,5 @@ LIBUHD_PYTHON_GEN_SOURCE( LIBUHD_APPEND_SOURCES( ${CMAKE_CURRENT_SOURCE_DIR}/convert_with_tables.cpp ${CMAKE_CURRENT_SOURCE_DIR}/convert_impl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/convert_item32.cpp ) diff --git a/host/lib/convert/convert_common.hpp b/host/lib/convert/convert_common.hpp index 7626e4d87..602e60230 100644 --- a/host/lib/convert/convert_common.hpp +++ b/host/lib/convert/convert_common.hpp @@ -53,12 +53,12 @@ static const int PRIORITY_EMPTY = -1; #ifdef __ARM_NEON__ static const int PRIORITY_LIBORC = 3; -static const int PRIORITY_SIMD = 1; //neon conversions could be implemented better, orc wins -static const int PRIORITY_TABLE = 2; //tables require large cache, so they are slower on arm +static const int PRIORITY_SIMD = 2; //neon conversions could be implemented better, orc wins +static const int PRIORITY_TABLE = 1; //tables require large cache, so they are slower on arm #else -static const int PRIORITY_LIBORC = 1; +static const int PRIORITY_LIBORC = 2; static const int PRIORITY_SIMD = 3; -static const int PRIORITY_TABLE = 2; +static const int PRIORITY_TABLE = 1; #endif /*********************************************************************** @@ -77,173 +77,173 @@ typedef boost::int8_t s8_t; typedef boost::uint32_t item32_t; -/*********************************************************************** - * Convert complex short buffer to items32 sc16 - **********************************************************************/ -static UHD_INLINE item32_t sc16_to_item32_sc16(sc16_t num, double){ - boost::uint16_t real = num.real(); - boost::uint16_t imag = num.imag(); - return (item32_t(real) << 16) | (item32_t(imag) << 0); -} - -/*********************************************************************** - * Convert items32 sc16 buffer to complex short - **********************************************************************/ -static UHD_INLINE sc16_t item32_sc16_to_sc16(item32_t item, double){ - return sc16_t( - boost::int16_t(item >> 16), - boost::int16_t(item >> 0) - ); -} +typedef item32_t (*xtox_t)(item32_t); /*********************************************************************** - * Convert complex float buffer to items32 sc16 + * Convert xx to items32 sc16 buffer **********************************************************************/ -static UHD_INLINE item32_t fc32_to_item32_sc16(fc32_t num, double scale_factor){ +template <typename T> UHD_INLINE item32_t xx_to_item32_sc16_x1( + const std::complex<T> &num, const double scale_factor +){ boost::uint16_t real = boost::int16_t(num.real()*float(scale_factor)); boost::uint16_t imag = boost::int16_t(num.imag()*float(scale_factor)); return (item32_t(real) << 16) | (item32_t(imag) << 0); } -/*********************************************************************** - * Convert items32 sc16 buffer to complex float - **********************************************************************/ -static UHD_INLINE fc32_t item32_sc16_to_fc32(item32_t item, double scale_factor){ - return fc32_t( - float(boost::int16_t(item >> 16)*float(scale_factor)), - float(boost::int16_t(item >> 0)*float(scale_factor)) - ); -} - -/*********************************************************************** - * Convert complex double buffer to items32 sc16 - **********************************************************************/ -static UHD_INLINE item32_t fc64_to_item32_sc16(fc64_t num, double scale_factor){ - boost::uint16_t real = boost::int16_t(num.real()*scale_factor); - boost::uint16_t imag = boost::int16_t(num.imag()*scale_factor); +template <> UHD_INLINE item32_t xx_to_item32_sc16_x1( + const sc16_t &num, const double +){ + boost::uint16_t real = boost::int16_t(num.real()); + boost::uint16_t imag = boost::int16_t(num.imag()); return (item32_t(real) << 16) | (item32_t(imag) << 0); } -/*********************************************************************** - * Convert items32 sc16 buffer to complex double - **********************************************************************/ -static UHD_INLINE fc64_t item32_sc16_to_fc64(item32_t item, double scale_factor){ - return fc64_t( - float(boost::int16_t(item >> 16)*scale_factor), - float(boost::int16_t(item >> 0)*scale_factor) - ); +template <xtox_t to_wire, typename T> +UHD_INLINE void xx_to_item32_sc16( + const std::complex<T> *input, + item32_t *output, + const size_t nsamps, + const double scale_factor +){ + for (size_t i = 0; i < nsamps; i++){ + const item32_t item = xx_to_item32_sc16_x1(input[i], scale_factor); + output[i] = to_wire(item); + } } /*********************************************************************** - * Convert items32 sc8 buffer to complex char + * Convert items32 sc16 buffer to xx **********************************************************************/ -static UHD_INLINE void item32_sc8_to_sc8(item32_t item, sc8_t &out0, sc8_t &out1, double){ - out0 = sc8_t( - boost::int8_t(item >> 8), - boost::int8_t(item >> 0) - ); - out1 = sc8_t( - boost::int8_t(item >> 24), - boost::int8_t(item >> 16) +template <typename T> UHD_INLINE std::complex<T> item32_sc16_x1_to_xx( + const item32_t item, const double scale_factor +){ + return std::complex<T>( + T(boost::int16_t(item >> 16)*float(scale_factor)), + T(boost::int16_t(item >> 0)*float(scale_factor)) ); } -/*********************************************************************** - * Convert items32 sc8 buffer to complex short - **********************************************************************/ -static UHD_INLINE void item32_sc8_to_sc16(item32_t item, sc16_t &out0, sc16_t &out1, double){ - out0 = sc16_t( - boost::int8_t(item >> 8), - boost::int8_t(item >> 0) - ); - out1 = sc16_t( - boost::int8_t(item >> 24), - boost::int8_t(item >> 16) - ); -} - -/*********************************************************************** - * Convert items32 sc8 buffer to complex float - **********************************************************************/ -static UHD_INLINE void item32_sc8_to_fc32(item32_t item, fc32_t &out0, fc32_t &out1, double scale_factor){ - out0 = fc32_t( - float(boost::int8_t(item >> 8)*float(scale_factor)), - float(boost::int8_t(item >> 0)*float(scale_factor)) - ); - out1 = fc32_t( - float(boost::int8_t(item >> 24)*float(scale_factor)), - float(boost::int8_t(item >> 16)*float(scale_factor)) +template <> UHD_INLINE sc16_t item32_sc16_x1_to_xx( + const item32_t item, const double +){ + return sc16_t( + boost::int16_t(item >> 16), boost::int16_t(item >> 0) ); } -/*********************************************************************** - * Convert items32 sc8 buffer to complex double - **********************************************************************/ -static UHD_INLINE void item32_sc8_to_fc64(item32_t item, fc64_t &out0, fc64_t &out1, double scale_factor){ - out0 = fc64_t( - float(boost::int8_t(item >> 8)*scale_factor), - float(boost::int8_t(item >> 0)*scale_factor) - ); - out1 = fc64_t( - float(boost::int8_t(item >> 24)*scale_factor), - float(boost::int8_t(item >> 16)*scale_factor) - ); +template <xtox_t to_host, typename T> +UHD_INLINE void item32_sc16_to_xx( + const item32_t *input, + std::complex<T> *output, + const size_t nsamps, + const double scale_factor +){ + for (size_t i = 0; i < nsamps; i++){ + const item32_t item_i = to_host(input[i]); + output[i] = item32_sc16_x1_to_xx<T>(item_i, scale_factor); + } } /*********************************************************************** - * Convert complex char to items32 sc8 buffer + * Convert xx to items32 sc8 buffer **********************************************************************/ -static UHD_INLINE item32_t sc8_to_item32_sc8(sc8_t in0, sc8_t in1, double){ - boost::uint8_t real0 = boost::int8_t(in0.real()); - boost::uint8_t imag0 = boost::int8_t(in0.imag()); - boost::uint8_t real1 = boost::int8_t(in1.real()); - boost::uint8_t imag1 = boost::int8_t(in1.imag()); +template <typename T> UHD_INLINE item32_t xx_to_item32_sc8_x1( + const std::complex<T> &in0, const std::complex<T> &in1, const double scale_factor +){ return - (item32_t(real0) << 8) | (item32_t(imag0) << 0) | - (item32_t(real1) << 24) | (item32_t(imag1) << 16) + (item32_t(boost::uint8_t(in0.real()*float(scale_factor))) << 8) | + (item32_t(boost::uint8_t(in0.imag()*float(scale_factor))) << 0) | + (item32_t(boost::uint8_t(in1.real()*float(scale_factor))) << 24) | + (item32_t(boost::uint8_t(in1.imag()*float(scale_factor))) << 16) ; } -/*********************************************************************** - * Convert complex short to items32 sc8 buffer - **********************************************************************/ -static UHD_INLINE item32_t sc16_to_item32_sc8(sc16_t in0, sc16_t in1, double){ - boost::uint8_t real0 = boost::int8_t(in0.real()); - boost::uint8_t imag0 = boost::int8_t(in0.imag()); - boost::uint8_t real1 = boost::int8_t(in1.real()); - boost::uint8_t imag1 = boost::int8_t(in1.imag()); +template <> UHD_INLINE item32_t xx_to_item32_sc8_x1( + const sc16_t &in0, const sc16_t &in1, const double +){ return - (item32_t(real0) << 8) | (item32_t(imag0) << 0) | - (item32_t(real1) << 24) | (item32_t(imag1) << 16) + (item32_t(boost::uint8_t(in0.real())) << 8) | + (item32_t(boost::uint8_t(in0.imag())) << 0) | + (item32_t(boost::uint8_t(in1.real())) << 24) | + (item32_t(boost::uint8_t(in1.imag())) << 16) ; } -/*********************************************************************** - * Convert complex float to items32 sc8 buffer - **********************************************************************/ -static UHD_INLINE item32_t fc32_to_item32_sc8(fc32_t in0, fc32_t in1, double scale_factor){ - boost::uint8_t real0 = boost::int8_t(in0.real()*float(scale_factor)); - boost::uint8_t imag0 = boost::int8_t(in0.imag()*float(scale_factor)); - boost::uint8_t real1 = boost::int8_t(in1.real()*float(scale_factor)); - boost::uint8_t imag1 = boost::int8_t(in1.imag()*float(scale_factor)); - return - (item32_t(real0) << 8) | (item32_t(imag0) << 0) | - (item32_t(real1) << 24) | (item32_t(imag1) << 16) - ; +template <xtox_t to_wire, typename T> +UHD_INLINE void xx_to_item32_sc8( + const std::complex<T> *input, + item32_t *output, + const size_t nsamps, + const double scale_factor +){ + const size_t num_pairs = nsamps/2; + for (size_t i = 0, j = 0; i < num_pairs; i++, j+=2){ + const item32_t item = xx_to_item32_sc8_x1(input[j], input[j+1], scale_factor); + output[i] = to_wire(item); + } + + if (nsamps != num_pairs*2){ + const item32_t item = xx_to_item32_sc8_x1(input[nsamps-1], std::complex<T>(0), scale_factor); + output[num_pairs] = to_wire(item); + } } /*********************************************************************** - * Convert complex double to items32 sc8 buffer + * Convert items32 sc8 buffer to xx **********************************************************************/ -static UHD_INLINE item32_t fc64_to_item32_sc8(fc64_t in0, fc64_t in1, double scale_factor){ - boost::uint8_t real0 = boost::int8_t(in0.real()*(scale_factor)); - boost::uint8_t imag0 = boost::int8_t(in0.imag()*(scale_factor)); - boost::uint8_t real1 = boost::int8_t(in1.real()*(scale_factor)); - boost::uint8_t imag1 = boost::int8_t(in1.imag()*(scale_factor)); - return - (item32_t(real0) << 8) | (item32_t(imag0) << 0) | - (item32_t(real1) << 24) | (item32_t(imag1) << 16) - ; +template <typename T> UHD_INLINE void item32_sc8_x1_to_xx( + const item32_t item, std::complex<T> &out0, std::complex<T> &out1, const double scale_factor +){ + out0 = std::complex<T>( + T(boost::int8_t(item >> 8)*float(scale_factor)), + T(boost::int8_t(item >> 0)*float(scale_factor)) + ); + out1 = std::complex<T>( + T(boost::int8_t(item >> 24)*float(scale_factor)), + T(boost::int8_t(item >> 16)*float(scale_factor)) + ); +} + +template <> UHD_INLINE void item32_sc8_x1_to_xx( + const item32_t item, sc16_t &out0, sc16_t &out1, const double +){ + out0 = sc16_t( + boost::int16_t(boost::int8_t(item >> 8)), + boost::int16_t(boost::int8_t(item >> 0)) + ); + out1 = sc16_t( + boost::int16_t(boost::int8_t(item >> 24)), + boost::int16_t(boost::int8_t(item >> 16)) + ); +} + +template <xtox_t to_host, typename T> +UHD_INLINE void item32_sc8_to_xx( + const item32_t *input, + std::complex<T> *output, + const size_t nsamps, + const double scale_factor +){ + input = reinterpret_cast<const item32_t *>(size_t(input) & ~0x3); + std::complex<T> dummy; + size_t num_samps = nsamps; + + if ((size_t(input) & 0x3) != 0){ + const item32_t item0 = to_host(*input++); + item32_sc8_x1_to_xx(item0, dummy, *output++, scale_factor); + num_samps--; + } + + const size_t num_pairs = num_samps/2; + for (size_t i = 0, j = 0; i < num_pairs; i++, j+=2){ + const item32_t item_i = to_host(input[i]); + item32_sc8_x1_to_xx(item_i, output[j], output[j+1], scale_factor); + } + + if (num_samps != num_pairs*2){ + const item32_t item_n = to_host(input[num_pairs]); + item32_sc8_x1_to_xx(item_n, output[num_samps-1], dummy, scale_factor); + } } #endif /* INCLUDED_LIBUHD_CONVERT_COMMON_HPP */ diff --git a/host/lib/convert/convert_item32.cpp b/host/lib/convert/convert_item32.cpp new file mode 100644 index 000000000..bcac74714 --- /dev/null +++ b/host/lib/convert/convert_item32.cpp @@ -0,0 +1,43 @@ +// +// Copyright 2012 Ettus Research LLC +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. +// + +#include "convert_common.hpp" +#include <uhd/utils/byteswap.hpp> + +#define __DECLARE_ITEM32_CONVERTER(cpu_type, wire_type, xe, htoxx, xxtoh) \ + DECLARE_CONVERTER(cpu_type, 1, wire_type ## _item32_ ## xe, 1, PRIORITY_GENERAL){ \ + const cpu_type ## _t *input = reinterpret_cast<const cpu_type ## _t *>(inputs[0]); \ + item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); \ + xx_to_item32_ ## wire_type<htoxx>(input, output, nsamps, scale_factor); \ + } \ + DECLARE_CONVERTER(wire_type ## _item32_ ## xe, 1, cpu_type, 1, PRIORITY_GENERAL){ \ + const item32_t *input = reinterpret_cast<const item32_t *>(inputs[0]); \ + cpu_type ## _t *output = reinterpret_cast<cpu_type ## _t *>(outputs[0]); \ + item32_ ## wire_type ## _to_xx<xxtoh>(input, output, nsamps, scale_factor); \ + } + +#define _DECLARE_ITEM32_CONVERTER(cpu_type, wire_type) \ + __DECLARE_ITEM32_CONVERTER(cpu_type, wire_type, be, uhd::htonx, uhd::ntohx) \ + __DECLARE_ITEM32_CONVERTER(cpu_type, wire_type, le, uhd::htowx, uhd::wtohx) + +#define DECLARE_ITEM32_CONVERTER(cpu_type) \ + _DECLARE_ITEM32_CONVERTER(cpu_type, sc8) \ + _DECLARE_ITEM32_CONVERTER(cpu_type, sc16) + +DECLARE_ITEM32_CONVERTER(sc16) +DECLARE_ITEM32_CONVERTER(fc32) +DECLARE_ITEM32_CONVERTER(fc64) diff --git a/host/lib/convert/convert_with_neon.cpp b/host/lib/convert/convert_with_neon.cpp index c7ad62104..ad184e1b6 100644 --- a/host/lib/convert/convert_with_neon.cpp +++ b/host/lib/convert/convert_with_neon.cpp @@ -1,5 +1,5 @@ // -// Copyright 2011-2011 Ettus Research LLC +// Copyright 2011-2012 Ettus Research LLC // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by @@ -36,8 +36,7 @@ DECLARE_CONVERTER(fc32, 1, sc16_item32_le, 1, PRIORITY_SIMD){ vst1_s16((reinterpret_cast<int16_t *>(&output[i])), D9); } - for (; i < nsamps; i++) - output[i] = fc32_to_item32_sc16(input[i], scale_factor); + xx_to_item32_sc16<uhd::htowx>(input+i, output+i, nsamps-i, scale_factor); } DECLARE_CONVERTER(sc16_item32_le, 1, fc32, 1, PRIORITY_SIMD){ @@ -56,6 +55,5 @@ DECLARE_CONVERTER(sc16_item32_le, 1, fc32, 1, PRIORITY_SIMD){ vst1q_f32((reinterpret_cast<float *>(&output[i])), Q4); } - for (; i < nsamps; i++) - output[i] = item32_sc16_to_fc32(input[i], scale_factor); + item32_sc16_to_xx<uhd::htowx>(input+i, output+i, nsamps-i, scale_factor); } diff --git a/host/lib/convert/gen_convert_general.py b/host/lib/convert/gen_convert_general.py index 364c4bd1a..b0790755a 100644 --- a/host/lib/convert/gen_convert_general.py +++ b/host/lib/convert/gen_convert_general.py @@ -48,68 +48,6 @@ DECLARE_CONVERTER(sc16_item32_$(end), 1, item32, 1, PRIORITY_GENERAL){ } """ -TMPL_CONV_GEN2_SC16 = """ -DECLARE_CONVERTER($(cpu_type), 1, sc16_item32_$(end), 1, PRIORITY_GENERAL){ - const $(cpu_type)_t *input = reinterpret_cast<const $(cpu_type)_t *>(inputs[0]); - item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); - - for (size_t i = 0; i < nsamps; i++){ - output[i] = $(to_wire)($(cpu_type)_to_item32_sc16(input[i], scale_factor)); - } -} - -DECLARE_CONVERTER(sc16_item32_$(end), 1, $(cpu_type), 1, PRIORITY_GENERAL){ - const item32_t *input = reinterpret_cast<const item32_t *>(inputs[0]); - $(cpu_type)_t *output = reinterpret_cast<$(cpu_type)_t *>(outputs[0]); - - for (size_t i = 0; i < nsamps; i++){ - output[i] = item32_sc16_to_$(cpu_type)($(to_host)(input[i]), scale_factor); - } -} -""" - -TMPL_CONV_GEN2_SC8 = """ -DECLARE_CONVERTER(sc8_item32_$(end), 1, $(cpu_type), 1, PRIORITY_GENERAL){ - const item32_t *input = reinterpret_cast<const item32_t *>(size_t(inputs[0]) & ~0x3); - $(cpu_type)_t *output = reinterpret_cast<$(cpu_type)_t *>(outputs[0]); - $(cpu_type)_t dummy; - size_t num_samps = nsamps; - - if ((size_t(inputs[0]) & 0x3) != 0){ - const item32_t item0 = $(to_host)(*input++); - item32_sc8_to_$(cpu_type)(item0, dummy, *output++, scale_factor); - num_samps--; - } - - const size_t num_pairs = num_samps/2; - for (size_t i = 0, j = 0; i < num_pairs; i++, j+=2){ - const item32_t item_i = $(to_host)(input[i]); - item32_sc8_to_$(cpu_type)(item_i, output[j], output[j+1], scale_factor); - } - - if (num_samps != num_pairs*2){ - const item32_t item_n = $(to_host)(input[num_pairs]); - item32_sc8_to_$(cpu_type)(item_n, output[num_samps-1], dummy, scale_factor); - } -} - -DECLARE_CONVERTER($(cpu_type), 1, sc8_item32_$(end), 1, PRIORITY_GENERAL){ - const $(cpu_type)_t *input = reinterpret_cast<const $(cpu_type)_t *>(inputs[0]); - item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); - - const size_t num_pairs = nsamps/2; - for (size_t i = 0, j = 0; i < num_pairs; i++, j+=2){ - const item32_t item = $(cpu_type)_to_item32_sc8(input[j], input[j+1], scale_factor); - output[i] = $(to_wire)(item); - } - - if (nsamps != num_pairs*2){ - const item32_t item = $(cpu_type)_to_item32_sc8(input[nsamps-1], 0, scale_factor); - output[num_pairs] = $(to_wire)(item); - } -} -""" - TMPL_CONV_USRP1_COMPLEX = """ DECLARE_CONVERTER($(cpu_type), $(width), sc16_item16_usrp1, 1, PRIORITY_GENERAL){ #for $w in range($width) @@ -176,16 +114,6 @@ if __name__ == '__main__': ('be', 'uhd::ntohx', 'uhd::htonx'), ('le', 'uhd::wtohx', 'uhd::htowx'), ): - for cpu_type in 'fc64', 'fc32', 'sc16': - output += parse_tmpl( - TMPL_CONV_GEN2_SC16, - end=end, to_host=to_host, to_wire=to_wire, cpu_type=cpu_type - ) - for cpu_type in 'fc64', 'fc32', 'sc16', 'sc8': - output += parse_tmpl( - TMPL_CONV_GEN2_SC8, - end=end, to_host=to_host, to_wire=to_wire, cpu_type=cpu_type - ) output += parse_tmpl( TMPL_CONV_GEN2_ITEM32, end=end, to_host=to_host, to_wire=to_wire diff --git a/host/lib/convert/sse2_fc32_to_sc16.cpp b/host/lib/convert/sse2_fc32_to_sc16.cpp new file mode 100644 index 000000000..90bf0ed04 --- /dev/null +++ b/host/lib/convert/sse2_fc32_to_sc16.cpp @@ -0,0 +1,103 @@ +// +// Copyright 2011-2012 Ettus Research LLC +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. +// + +#include "convert_common.hpp" +#include <uhd/utils/byteswap.hpp> +#include <emmintrin.h> + +using namespace uhd::convert; + +DECLARE_CONVERTER(fc32, 1, sc16_item32_le, 1, PRIORITY_SIMD){ + const fc32_t *input = reinterpret_cast<const fc32_t *>(inputs[0]); + item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); + + const __m128 scalar = _mm_set_ps1(float(scale_factor)); + + #define convert_fc32_1_to_item32_1_nswap_guts(_al_) \ + for (; i+3 < nsamps; i+=4){ \ + /* load from input */ \ + __m128 tmplo = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+0)); \ + __m128 tmphi = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+2)); \ + \ + /* convert and scale */ \ + __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar)); \ + __m128i tmpihi = _mm_cvtps_epi32(_mm_mul_ps(tmphi, scalar)); \ + \ + /* pack + swap 16-bit pairs */ \ + __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \ + tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + \ + /* store to output */ \ + _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi); \ + } \ + + size_t i = 0; + + //dispatch according to alignment + switch (size_t(input) & 0xf){ + case 0x8: + xx_to_item32_sc16<uhd::htowx>(input, output, 1, scale_factor); i++; + case 0x0: + convert_fc32_1_to_item32_1_nswap_guts(_) + break; + default: convert_fc32_1_to_item32_1_nswap_guts(u_) + } + + //convert remainder + xx_to_item32_sc16<uhd::htowx>(input+i, output+i, nsamps-i, scale_factor); +} + +DECLARE_CONVERTER(fc32, 1, sc16_item32_be, 1, PRIORITY_SIMD){ + const fc32_t *input = reinterpret_cast<const fc32_t *>(inputs[0]); + item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); + + const __m128 scalar = _mm_set_ps1(float(scale_factor)); + + #define convert_fc32_1_to_item32_1_bswap_guts(_al_) \ + for (; i+3 < nsamps; i+=4){ \ + /* load from input */ \ + __m128 tmplo = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+0)); \ + __m128 tmphi = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+2)); \ + \ + /* convert and scale */ \ + __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar)); \ + __m128i tmpihi = _mm_cvtps_epi32(_mm_mul_ps(tmphi, scalar)); \ + \ + /* pack + byteswap -> byteswap 16 bit words */ \ + __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \ + tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \ + \ + /* store to output */ \ + _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi); \ + } \ + + size_t i = 0; + + //dispatch according to alignment + switch (size_t(input) & 0xf){ + case 0x8: + xx_to_item32_sc16<uhd::htonx>(input, output, 1, scale_factor); i++; + case 0x0: + convert_fc32_1_to_item32_1_bswap_guts(_) + break; + default: convert_fc32_1_to_item32_1_bswap_guts(u_) + } + + //convert remainder + xx_to_item32_sc16<uhd::htonx>(input+i, output+i, nsamps-i, scale_factor); +} diff --git a/host/lib/convert/convert_fc32_to_sc8_with_sse2.cpp b/host/lib/convert/sse2_fc32_to_sc8.cpp index b633f487c..72bbc0da5 100644 --- a/host/lib/convert/convert_fc32_to_sc8_with_sse2.cpp +++ b/host/lib/convert/sse2_fc32_to_sc8.cpp @@ -21,41 +21,21 @@ using namespace uhd::convert; -UHD_INLINE __m128i pack_sc32_4x_be( +UHD_INLINE __m128i pack_sc32_4x( const __m128 &in0, const __m128 &in1, const __m128 &in2, const __m128 &in3, - const __m128 &scalar + const __m128 &scalar, const int shuf ){ __m128i tmpi0 = _mm_cvtps_epi32(_mm_mul_ps(in0, scalar)); - tmpi0 = _mm_shuffle_epi32(tmpi0, _MM_SHUFFLE(1, 0, 3, 2)); + tmpi0 = _mm_shuffle_epi32(tmpi0, shuf); __m128i tmpi1 = _mm_cvtps_epi32(_mm_mul_ps(in1, scalar)); - tmpi1 = _mm_shuffle_epi32(tmpi1, _MM_SHUFFLE(1, 0, 3, 2)); + tmpi1 = _mm_shuffle_epi32(tmpi1, shuf); const __m128i lo = _mm_packs_epi32(tmpi0, tmpi1); __m128i tmpi2 = _mm_cvtps_epi32(_mm_mul_ps(in2, scalar)); - tmpi2 = _mm_shuffle_epi32(tmpi2, _MM_SHUFFLE(1, 0, 3, 2)); + tmpi2 = _mm_shuffle_epi32(tmpi2, shuf); __m128i tmpi3 = _mm_cvtps_epi32(_mm_mul_ps(in3, scalar)); - tmpi3 = _mm_shuffle_epi32(tmpi3, _MM_SHUFFLE(1, 0, 3, 2)); - const __m128i hi = _mm_packs_epi32(tmpi2, tmpi3); - - return _mm_packs_epi16(lo, hi); -} - -UHD_INLINE __m128i pack_sc32_4x_le( - const __m128 &in0, const __m128 &in1, - const __m128 &in2, const __m128 &in3, - const __m128 &scalar -){ - __m128i tmpi0 = _mm_cvtps_epi32(_mm_mul_ps(in0, scalar)); - tmpi0 = _mm_shuffle_epi32(tmpi0, _MM_SHUFFLE(2, 3, 0, 1)); - __m128i tmpi1 = _mm_cvtps_epi32(_mm_mul_ps(in1, scalar)); - tmpi1 = _mm_shuffle_epi32(tmpi1, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i lo = _mm_packs_epi32(tmpi0, tmpi1); - - __m128i tmpi2 = _mm_cvtps_epi32(_mm_mul_ps(in2, scalar)); - tmpi2 = _mm_shuffle_epi32(tmpi2, _MM_SHUFFLE(2, 3, 0, 1)); - __m128i tmpi3 = _mm_cvtps_epi32(_mm_mul_ps(in3, scalar)); - tmpi3 = _mm_shuffle_epi32(tmpi3, _MM_SHUFFLE(2, 3, 0, 1)); + tmpi3 = _mm_shuffle_epi32(tmpi3, shuf); const __m128i hi = _mm_packs_epi32(tmpi2, tmpi3); return _mm_packs_epi16(lo, hi); @@ -76,7 +56,7 @@ DECLARE_CONVERTER(fc32, 1, sc8_item32_be, 1, PRIORITY_SIMD){ __m128 tmp3 = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+6)); \ \ /* convert */ \ - const __m128i tmpi = pack_sc32_4x_be(tmp0, tmp1, tmp2, tmp3, scalar); \ + const __m128i tmpi = pack_sc32_4x(tmp0, tmp1, tmp2, tmp3, scalar, _MM_SHUFFLE(1, 0, 3, 2)); \ \ /* store to output */ \ _mm_storeu_si128(reinterpret_cast<__m128i *>(output+j), tmpi); \ @@ -93,16 +73,7 @@ DECLARE_CONVERTER(fc32, 1, sc8_item32_be, 1, PRIORITY_SIMD){ } //convert remainder - const size_t num_pairs = nsamps/2; - for (size_t j = i/2; j < num_pairs; j++, i+=2){ - const item32_t item = fc32_to_item32_sc8(input[i], input[i+1], scale_factor); - output[j] = uhd::byteswap(item); - } - - if (nsamps != num_pairs*2){ - const item32_t item = fc32_to_item32_sc8(input[nsamps-1], 0, scale_factor); - output[num_pairs] = uhd::byteswap(item); - } + xx_to_item32_sc8<uhd::htonx>(input+i, output+(i/2), nsamps-i, scale_factor); } DECLARE_CONVERTER(fc32, 1, sc8_item32_le, 1, PRIORITY_SIMD){ @@ -120,7 +91,7 @@ DECLARE_CONVERTER(fc32, 1, sc8_item32_le, 1, PRIORITY_SIMD){ __m128 tmp3 = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+6)); \ \ /* convert */ \ - const __m128i tmpi = pack_sc32_4x_le(tmp0, tmp1, tmp2, tmp3, scalar); \ + const __m128i tmpi = pack_sc32_4x(tmp0, tmp1, tmp2, tmp3, scalar, _MM_SHUFFLE(2, 3, 0, 1)); \ \ /* store to output */ \ _mm_storeu_si128(reinterpret_cast<__m128i *>(output+j), tmpi); \ @@ -137,14 +108,5 @@ DECLARE_CONVERTER(fc32, 1, sc8_item32_le, 1, PRIORITY_SIMD){ } //convert remainder - const size_t num_pairs = nsamps/2; - for (size_t j = i/2; j < num_pairs; j++, i+=2){ - const item32_t item = fc32_to_item32_sc8(input[i], input[i+1], scale_factor); - output[j] = (item); - } - - if (nsamps != num_pairs*2){ - const item32_t item = fc32_to_item32_sc8(input[nsamps-1], 0, scale_factor); - output[num_pairs] = (item); - } + xx_to_item32_sc8<uhd::htowx>(input+i, output+(i/2), nsamps-i, scale_factor); } diff --git a/host/lib/convert/sse2_fc64_to_sc16.cpp b/host/lib/convert/sse2_fc64_to_sc16.cpp new file mode 100644 index 000000000..f030e9168 --- /dev/null +++ b/host/lib/convert/sse2_fc64_to_sc16.cpp @@ -0,0 +1,111 @@ +// +// Copyright 2011-2012 Ettus Research LLC +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. +// + +#include "convert_common.hpp" +#include <uhd/utils/byteswap.hpp> +#include <emmintrin.h> + +using namespace uhd::convert; + +DECLARE_CONVERTER(fc64, 1, sc16_item32_le, 1, PRIORITY_SIMD){ + const fc64_t *input = reinterpret_cast<const fc64_t *>(inputs[0]); + item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); + + const __m128d scalar = _mm_set1_pd(scale_factor); + + #define convert_fc64_1_to_item32_1_nswap_guts(_al_) \ + for (; i+3 < nsamps; i+=4){ \ + /* load from input */ \ + __m128d tmp0 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+0)); \ + __m128d tmp1 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+1)); \ + __m128d tmp2 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+2)); \ + __m128d tmp3 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+3)); \ + \ + /* convert and scale */ \ + __m128i tmpi0 = _mm_cvttpd_epi32(_mm_mul_pd(tmp0, scalar)); \ + __m128i tmpi1 = _mm_cvttpd_epi32(_mm_mul_pd(tmp1, scalar)); \ + __m128i tmpilo = _mm_unpacklo_epi64(tmpi0, tmpi1); \ + __m128i tmpi2 = _mm_cvttpd_epi32(_mm_mul_pd(tmp2, scalar)); \ + __m128i tmpi3 = _mm_cvttpd_epi32(_mm_mul_pd(tmp3, scalar)); \ + __m128i tmpihi = _mm_unpacklo_epi64(tmpi2, tmpi3); \ + \ + /* pack + swap 16-bit pairs */ \ + __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \ + tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + \ + /* store to output */ \ + _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi); \ + } \ + + size_t i = 0; + + //dispatch according to alignment + if ((size_t(input) & 0xf) == 0){ + convert_fc64_1_to_item32_1_nswap_guts(_) + } + else{ + convert_fc64_1_to_item32_1_nswap_guts(u_) + } + + //convert remainder + xx_to_item32_sc16<uhd::htowx>(input+i, output+i, nsamps-i, scale_factor); +} + +DECLARE_CONVERTER(fc64, 1, sc16_item32_be, 1, PRIORITY_SIMD){ + const fc64_t *input = reinterpret_cast<const fc64_t *>(inputs[0]); + item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); + + const __m128d scalar = _mm_set1_pd(scale_factor); + + #define convert_fc64_1_to_item32_1_bswap_guts(_al_) \ + for (; i+3 < nsamps; i+=4){ \ + /* load from input */ \ + __m128d tmp0 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+0)); \ + __m128d tmp1 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+1)); \ + __m128d tmp2 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+2)); \ + __m128d tmp3 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+3)); \ + \ + /* convert and scale */ \ + __m128i tmpi0 = _mm_cvttpd_epi32(_mm_mul_pd(tmp0, scalar)); \ + __m128i tmpi1 = _mm_cvttpd_epi32(_mm_mul_pd(tmp1, scalar)); \ + __m128i tmpilo = _mm_unpacklo_epi64(tmpi0, tmpi1); \ + __m128i tmpi2 = _mm_cvttpd_epi32(_mm_mul_pd(tmp2, scalar)); \ + __m128i tmpi3 = _mm_cvttpd_epi32(_mm_mul_pd(tmp3, scalar)); \ + __m128i tmpihi = _mm_unpacklo_epi64(tmpi2, tmpi3); \ + \ + /* pack + byteswap -> byteswap 16 bit words */ \ + __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \ + tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \ + \ + /* store to output */ \ + _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi); \ + } \ + + size_t i = 0; + + //dispatch according to alignment + if ((size_t(input) & 0xf) == 0){ + convert_fc64_1_to_item32_1_bswap_guts(_) + } + else{ + convert_fc64_1_to_item32_1_bswap_guts(u_) + } + + //convert remainder + xx_to_item32_sc16<uhd::htonx>(input+i, output+i, nsamps-i, scale_factor); +} diff --git a/host/lib/convert/convert_fc64_to_sc8_with_sse2.cpp b/host/lib/convert/sse2_fc64_to_sc8.cpp index 405850601..bf3719e13 100644 --- a/host/lib/convert/convert_fc64_to_sc8_with_sse2.cpp +++ b/host/lib/convert/sse2_fc64_to_sc8.cpp @@ -30,7 +30,7 @@ UHD_INLINE __m128i pack_sc8_item32_4x( return _mm_packs_epi16(lo, hi); } -UHD_INLINE __m128i pack_sc32_4x_be( +UHD_INLINE __m128i pack_sc32_4x( const __m128d &lo, const __m128d &hi, const __m128d &scalar ){ @@ -39,16 +39,6 @@ UHD_INLINE __m128i pack_sc32_4x_be( return _mm_unpacklo_epi64(tmpi_lo, tmpi_hi); } -UHD_INLINE __m128i pack_sc32_4x_le( - const __m128d &lo, const __m128d &hi, - const __m128d &scalar -){ - const __m128i tmpi_lo = _mm_cvttpd_epi32(_mm_mul_pd(lo, scalar)); - const __m128i tmpi_hi = _mm_cvttpd_epi32(_mm_mul_pd(hi, scalar)); - const __m128i tmpi = _mm_unpacklo_epi64(tmpi_lo, tmpi_hi); - return _mm_shuffle_epi32(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); -} - DECLARE_CONVERTER(fc64, 1, sc8_item32_be, 1, PRIORITY_SIMD){ const fc64_t *input = reinterpret_cast<const fc64_t *>(inputs[0]); item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); @@ -69,10 +59,10 @@ DECLARE_CONVERTER(fc64, 1, sc8_item32_be, 1, PRIORITY_SIMD){ \ /* interleave */ \ const __m128i tmpi = pack_sc8_item32_4x( \ - pack_sc32_4x_be(tmp0, tmp1, scalar), \ - pack_sc32_4x_be(tmp2, tmp3, scalar), \ - pack_sc32_4x_be(tmp4, tmp5, scalar), \ - pack_sc32_4x_be(tmp6, tmp7, scalar) \ + pack_sc32_4x(tmp0, tmp1, scalar), \ + pack_sc32_4x(tmp2, tmp3, scalar), \ + pack_sc32_4x(tmp4, tmp5, scalar), \ + pack_sc32_4x(tmp6, tmp7, scalar) \ ); \ \ /* store to output */ \ @@ -90,16 +80,7 @@ DECLARE_CONVERTER(fc64, 1, sc8_item32_be, 1, PRIORITY_SIMD){ } //convert remainder - const size_t num_pairs = nsamps/2; - for (size_t j = i/2; j < num_pairs; j++, i+=2){ - const item32_t item = fc64_to_item32_sc8(input[i], input[i+1], scale_factor); - output[j] = uhd::byteswap(item); - } - - if (nsamps != num_pairs*2){ - const item32_t item = fc64_to_item32_sc8(input[nsamps-1], 0, scale_factor); - output[num_pairs] = uhd::byteswap(item); - } + xx_to_item32_sc8<uhd::htonx>(input+i, output+(i/2), nsamps-i, scale_factor); } DECLARE_CONVERTER(fc64, 1, sc8_item32_le, 1, PRIORITY_SIMD){ @@ -121,12 +102,13 @@ DECLARE_CONVERTER(fc64, 1, sc8_item32_le, 1, PRIORITY_SIMD){ __m128d tmp7 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+7)); \ \ /* interleave */ \ - const __m128i tmpi = pack_sc8_item32_4x( \ - pack_sc32_4x_le(tmp0, tmp1, scalar), \ - pack_sc32_4x_le(tmp2, tmp3, scalar), \ - pack_sc32_4x_le(tmp4, tmp5, scalar), \ - pack_sc32_4x_le(tmp6, tmp7, scalar) \ + __m128i tmpi = pack_sc8_item32_4x( \ + pack_sc32_4x(tmp1, tmp0, scalar), \ + pack_sc32_4x(tmp3, tmp2, scalar), \ + pack_sc32_4x(tmp5, tmp4, scalar), \ + pack_sc32_4x(tmp7, tmp6, scalar) \ ); \ + tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); /*byteswap*/\ \ /* store to output */ \ _mm_storeu_si128(reinterpret_cast<__m128i *>(output+j), tmpi); \ @@ -143,14 +125,5 @@ DECLARE_CONVERTER(fc64, 1, sc8_item32_le, 1, PRIORITY_SIMD){ } //convert remainder - const size_t num_pairs = nsamps/2; - for (size_t j = i/2; j < num_pairs; j++, i+=2){ - const item32_t item = fc64_to_item32_sc8(input[i], input[i+1], scale_factor); - output[j] = (item); - } - - if (nsamps != num_pairs*2){ - const item32_t item = fc64_to_item32_sc8(input[nsamps-1], 0, scale_factor); - output[num_pairs] = (item); - } + xx_to_item32_sc8<uhd::htowx>(input+i, output+(i/2), nsamps-i, scale_factor); } diff --git a/host/lib/convert/convert_fc32_with_sse2.cpp b/host/lib/convert/sse2_sc16_to_fc32.cpp index 97a3e8cdc..c03e41585 100644 --- a/host/lib/convert/convert_fc32_with_sse2.cpp +++ b/host/lib/convert/sse2_sc16_to_fc32.cpp @@ -1,5 +1,5 @@ // -// Copyright 2011 Ettus Research LLC +// Copyright 2011-2012 Ettus Research LLC // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by @@ -21,91 +21,6 @@ using namespace uhd::convert; -DECLARE_CONVERTER(fc32, 1, sc16_item32_le, 1, PRIORITY_SIMD){ - const fc32_t *input = reinterpret_cast<const fc32_t *>(inputs[0]); - item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); - - const __m128 scalar = _mm_set_ps1(float(scale_factor)); - - #define convert_fc32_1_to_item32_1_nswap_guts(_al_) \ - for (; i+3 < nsamps; i+=4){ \ - /* load from input */ \ - __m128 tmplo = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+0)); \ - __m128 tmphi = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+2)); \ - \ - /* convert and scale */ \ - __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar)); \ - __m128i tmpihi = _mm_cvtps_epi32(_mm_mul_ps(tmphi, scalar)); \ - \ - /* pack + swap 16-bit pairs */ \ - __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \ - tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ - tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ - \ - /* store to output */ \ - _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi); \ - } \ - - size_t i = 0; - - //dispatch according to alignment - switch (size_t(input) & 0xf){ - case 0x8: - output[i] = fc32_to_item32_sc16(input[i], float(scale_factor)); i++; - case 0x0: - convert_fc32_1_to_item32_1_nswap_guts(_) - break; - default: convert_fc32_1_to_item32_1_nswap_guts(u_) - } - - //convert remainder - for (; i < nsamps; i++){ - output[i] = fc32_to_item32_sc16(input[i], float(scale_factor)); - } -} - -DECLARE_CONVERTER(fc32, 1, sc16_item32_be, 1, PRIORITY_SIMD){ - const fc32_t *input = reinterpret_cast<const fc32_t *>(inputs[0]); - item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); - - const __m128 scalar = _mm_set_ps1(float(scale_factor)); - - #define convert_fc32_1_to_item32_1_bswap_guts(_al_) \ - for (; i+3 < nsamps; i+=4){ \ - /* load from input */ \ - __m128 tmplo = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+0)); \ - __m128 tmphi = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+2)); \ - \ - /* convert and scale */ \ - __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar)); \ - __m128i tmpihi = _mm_cvtps_epi32(_mm_mul_ps(tmphi, scalar)); \ - \ - /* pack + byteswap -> byteswap 16 bit words */ \ - __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \ - tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \ - \ - /* store to output */ \ - _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi); \ - } \ - - size_t i = 0; - - //dispatch according to alignment - switch (size_t(input) & 0xf){ - case 0x8: - output[i] = uhd::byteswap(fc32_to_item32_sc16(input[i], float(scale_factor))); i++; - case 0x0: - convert_fc32_1_to_item32_1_bswap_guts(_) - break; - default: convert_fc32_1_to_item32_1_bswap_guts(u_) - } - - //convert remainder - for (; i < nsamps; i++){ - output[i] = uhd::byteswap(fc32_to_item32_sc16(input[i], float(scale_factor))); - } -} - DECLARE_CONVERTER(sc16_item32_le, 1, fc32, 1, PRIORITY_SIMD){ const item32_t *input = reinterpret_cast<const item32_t *>(inputs[0]); fc32_t *output = reinterpret_cast<fc32_t *>(outputs[0]); @@ -138,7 +53,7 @@ DECLARE_CONVERTER(sc16_item32_le, 1, fc32, 1, PRIORITY_SIMD){ //dispatch according to alignment switch (size_t(output) & 0xf){ case 0x8: - output[i] = item32_sc16_to_fc32(input[i], float(scale_factor)); i++; + item32_sc16_to_xx<uhd::htowx>(input, output, 1, scale_factor); i++; case 0x0: convert_item32_1_to_fc32_1_nswap_guts(_) break; @@ -146,9 +61,7 @@ DECLARE_CONVERTER(sc16_item32_le, 1, fc32, 1, PRIORITY_SIMD){ } //convert remainder - for (; i < nsamps; i++){ - output[i] = item32_sc16_to_fc32(input[i], float(scale_factor)); - } + item32_sc16_to_xx<uhd::htowx>(input+i, output+i, nsamps-i, scale_factor); } DECLARE_CONVERTER(sc16_item32_be, 1, fc32, 1, PRIORITY_SIMD){ @@ -182,7 +95,7 @@ DECLARE_CONVERTER(sc16_item32_be, 1, fc32, 1, PRIORITY_SIMD){ //dispatch according to alignment switch (size_t(output) & 0xf){ case 0x8: - output[i] = item32_sc16_to_fc32(uhd::byteswap(input[i]), float(scale_factor)); i++; + item32_sc16_to_xx<uhd::htonx>(input, output, 1, scale_factor); i++; case 0x0: convert_item32_1_to_fc32_1_bswap_guts(_) break; @@ -190,7 +103,5 @@ DECLARE_CONVERTER(sc16_item32_be, 1, fc32, 1, PRIORITY_SIMD){ } //convert remainder - for (; i < nsamps; i++){ - output[i] = item32_sc16_to_fc32(uhd::byteswap(input[i]), float(scale_factor)); - } + item32_sc16_to_xx<uhd::htonx>(input+i, output+i, nsamps-i, scale_factor); } diff --git a/host/lib/convert/convert_fc64_with_sse2.cpp b/host/lib/convert/sse2_sc16_to_fc64.cpp index 6e097e380..66068cad9 100644 --- a/host/lib/convert/convert_fc64_with_sse2.cpp +++ b/host/lib/convert/sse2_sc16_to_fc64.cpp @@ -1,5 +1,5 @@ // -// Copyright 2011 Ettus Research LLC +// Copyright 2011-2012 Ettus Research LLC // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by @@ -21,99 +21,6 @@ using namespace uhd::convert; -DECLARE_CONVERTER(fc64, 1, sc16_item32_le, 1, PRIORITY_SIMD){ - const fc64_t *input = reinterpret_cast<const fc64_t *>(inputs[0]); - item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); - - const __m128d scalar = _mm_set1_pd(scale_factor); - - #define convert_fc64_1_to_item32_1_nswap_guts(_al_) \ - for (; i+3 < nsamps; i+=4){ \ - /* load from input */ \ - __m128d tmp0 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+0)); \ - __m128d tmp1 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+1)); \ - __m128d tmp2 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+2)); \ - __m128d tmp3 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+3)); \ - \ - /* convert and scale */ \ - __m128i tmpi0 = _mm_cvttpd_epi32(_mm_mul_pd(tmp0, scalar)); \ - __m128i tmpi1 = _mm_cvttpd_epi32(_mm_mul_pd(tmp1, scalar)); \ - __m128i tmpilo = _mm_unpacklo_epi64(tmpi0, tmpi1); \ - __m128i tmpi2 = _mm_cvttpd_epi32(_mm_mul_pd(tmp2, scalar)); \ - __m128i tmpi3 = _mm_cvttpd_epi32(_mm_mul_pd(tmp3, scalar)); \ - __m128i tmpihi = _mm_unpacklo_epi64(tmpi2, tmpi3); \ - \ - /* pack + swap 16-bit pairs */ \ - __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \ - tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ - tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ - \ - /* store to output */ \ - _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi); \ - } \ - - size_t i = 0; - - //dispatch according to alignment - if ((size_t(input) & 0xf) == 0){ - convert_fc64_1_to_item32_1_nswap_guts(_) - } - else{ - convert_fc64_1_to_item32_1_nswap_guts(u_) - } - - //convert remainder - for (; i < nsamps; i++){ - output[i] = fc64_to_item32_sc16(input[i], scale_factor); - } -} - -DECLARE_CONVERTER(fc64, 1, sc16_item32_be, 1, PRIORITY_SIMD){ - const fc64_t *input = reinterpret_cast<const fc64_t *>(inputs[0]); - item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); - - const __m128d scalar = _mm_set1_pd(scale_factor); - - #define convert_fc64_1_to_item32_1_bswap_guts(_al_) \ - for (; i+3 < nsamps; i+=4){ \ - /* load from input */ \ - __m128d tmp0 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+0)); \ - __m128d tmp1 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+1)); \ - __m128d tmp2 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+2)); \ - __m128d tmp3 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+3)); \ - \ - /* convert and scale */ \ - __m128i tmpi0 = _mm_cvttpd_epi32(_mm_mul_pd(tmp0, scalar)); \ - __m128i tmpi1 = _mm_cvttpd_epi32(_mm_mul_pd(tmp1, scalar)); \ - __m128i tmpilo = _mm_unpacklo_epi64(tmpi0, tmpi1); \ - __m128i tmpi2 = _mm_cvttpd_epi32(_mm_mul_pd(tmp2, scalar)); \ - __m128i tmpi3 = _mm_cvttpd_epi32(_mm_mul_pd(tmp3, scalar)); \ - __m128i tmpihi = _mm_unpacklo_epi64(tmpi2, tmpi3); \ - \ - /* pack + byteswap -> byteswap 16 bit words */ \ - __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \ - tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \ - \ - /* store to output */ \ - _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi); \ - } \ - - size_t i = 0; - - //dispatch according to alignment - if ((size_t(input) & 0xf) == 0){ - convert_fc64_1_to_item32_1_bswap_guts(_) - } - else{ - convert_fc64_1_to_item32_1_bswap_guts(u_) - } - - //convert remainder - for (; i < nsamps; i++){ - output[i] = uhd::byteswap(fc64_to_item32_sc16(input[i], scale_factor)); - } -} - DECLARE_CONVERTER(sc16_item32_le, 1, fc64, 1, PRIORITY_SIMD){ const item32_t *input = reinterpret_cast<const item32_t *>(inputs[0]); fc64_t *output = reinterpret_cast<fc64_t *>(outputs[0]); @@ -158,9 +65,7 @@ DECLARE_CONVERTER(sc16_item32_le, 1, fc64, 1, PRIORITY_SIMD){ } //convert remainder - for (; i < nsamps; i++){ - output[i] = item32_sc16_to_fc64(input[i], scale_factor); - } + item32_sc16_to_xx<uhd::htowx>(input+i, output+i, nsamps-i, scale_factor); } DECLARE_CONVERTER(sc16_item32_be, 1, fc64, 1, PRIORITY_SIMD){ @@ -206,7 +111,5 @@ DECLARE_CONVERTER(sc16_item32_be, 1, fc64, 1, PRIORITY_SIMD){ } //convert remainder - for (; i < nsamps; i++){ - output[i] = item32_sc16_to_fc64(uhd::byteswap(input[i]), scale_factor); - } + item32_sc16_to_xx<uhd::htonx>(input+i, output+i, nsamps-i, scale_factor); } diff --git a/host/lib/convert/sse2_sc8_to_fc32.cpp b/host/lib/convert/sse2_sc8_to_fc32.cpp new file mode 100644 index 000000000..61ab7d26d --- /dev/null +++ b/host/lib/convert/sse2_sc8_to_fc32.cpp @@ -0,0 +1,129 @@ +// +// Copyright 2012 Ettus Research LLC +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. +// + +#include "convert_common.hpp" +#include <uhd/utils/byteswap.hpp> +#include <emmintrin.h> + +using namespace uhd::convert; + +static const __m128i zeroi = _mm_setzero_si128(); + +UHD_INLINE void unpack_sc32_4x( + const __m128i &in, + __m128 &out0, __m128 &out1, + __m128 &out2, __m128 &out3, + const __m128 &scalar, const int shuf +){ + const __m128i tmplo = _mm_unpacklo_epi8(zeroi, in); /* value in upper 8 bits */ + __m128i tmp0 = _mm_shuffle_epi32(_mm_unpacklo_epi16(zeroi, tmplo), shuf); /* value in upper 16 bits */ + __m128i tmp1 = _mm_shuffle_epi32(_mm_unpackhi_epi16(zeroi, tmplo), shuf); + out0 = _mm_mul_ps(_mm_cvtepi32_ps(tmp0), scalar); + out1 = _mm_mul_ps(_mm_cvtepi32_ps(tmp1), scalar); + + const __m128i tmphi = _mm_unpackhi_epi8(zeroi, in); + __m128i tmp2 = _mm_shuffle_epi32(_mm_unpacklo_epi16(zeroi, tmphi), shuf); + __m128i tmp3 = _mm_shuffle_epi32(_mm_unpackhi_epi16(zeroi, tmphi), shuf); + out2 = _mm_mul_ps(_mm_cvtepi32_ps(tmp2), scalar); + out3 = _mm_mul_ps(_mm_cvtepi32_ps(tmp3), scalar); +} + +DECLARE_CONVERTER(sc8_item32_be, 1, fc32, 1, PRIORITY_SIMD){ + const item32_t *input = reinterpret_cast<const item32_t *>(size_t(inputs[0]) & ~0x3); + fc32_t *output = reinterpret_cast<fc32_t *>(outputs[0]); + + const __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 24)); + + size_t i = 0, j = 0; + fc32_t dummy; + size_t num_samps = nsamps; + + if ((size_t(inputs[0]) & 0x3) != 0){ + item32_sc8_to_xx<uhd::ntohx>(input++, output++, 1, scale_factor); + num_samps--; + } + + #define convert_sc8_item32_1_to_fc32_1_bswap_guts(_al_) \ + for (; j+7 < num_samps; j+=8, i+=4){ \ + /* load from input */ \ + __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \ + \ + /* unpack + swap 8-bit pairs */ \ + __m128 tmp0, tmp1, tmp2, tmp3; \ + unpack_sc32_4x(tmpi, tmp0, tmp1, tmp2, tmp3, scalar, _MM_SHUFFLE(1, 0, 3, 2)); \ + \ + /* store to output */ \ + _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+0), tmp0); \ + _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+2), tmp1); \ + _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+4), tmp2); \ + _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+6), tmp3); \ + } + + //dispatch according to alignment + if ((size_t(output) & 0xf) == 0){ + convert_sc8_item32_1_to_fc32_1_bswap_guts(_) + } + else{ + convert_sc8_item32_1_to_fc32_1_bswap_guts(u_) + } + + //convert remainder + item32_sc8_to_xx<uhd::ntohx>(input+i, output+j, num_samps-j, scale_factor); +} + +DECLARE_CONVERTER(sc8_item32_le, 1, fc32, 1, PRIORITY_SIMD){ + const item32_t *input = reinterpret_cast<const item32_t *>(size_t(inputs[0]) & ~0x3); + fc32_t *output = reinterpret_cast<fc32_t *>(outputs[0]); + + const __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 24)); + + size_t i = 0, j = 0; + fc32_t dummy; + size_t num_samps = nsamps; + + if ((size_t(inputs[0]) & 0x3) != 0){ + item32_sc8_to_xx<uhd::wtohx>(input++, output++, 1, scale_factor); + num_samps--; + } + + #define convert_sc8_item32_1_to_fc32_1_nswap_guts(_al_) \ + for (; j+7 < num_samps; j+=8, i+=4){ \ + /* load from input */ \ + __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \ + \ + /* unpack + swap 8-bit pairs */ \ + __m128 tmp0, tmp1, tmp2, tmp3; \ + unpack_sc32_4x(tmpi, tmp0, tmp1, tmp2, tmp3, scalar, _MM_SHUFFLE(2, 3, 0, 1)); \ + \ + /* store to output */ \ + _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+0), tmp0); \ + _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+2), tmp1); \ + _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+4), tmp2); \ + _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+6), tmp3); \ + } + + //dispatch according to alignment + if ((size_t(output) & 0xf) == 0){ + convert_sc8_item32_1_to_fc32_1_nswap_guts(_) + } + else{ + convert_sc8_item32_1_to_fc32_1_nswap_guts(u_) + } + + //convert remainder + item32_sc8_to_xx<uhd::wtohx>(input+i, output+j, num_samps-j, scale_factor); +} diff --git a/host/lib/convert/sse2_sc8_to_fc64.cpp b/host/lib/convert/sse2_sc8_to_fc64.cpp new file mode 100644 index 000000000..aa2010d4e --- /dev/null +++ b/host/lib/convert/sse2_sc8_to_fc64.cpp @@ -0,0 +1,151 @@ +// +// Copyright 2012 Ettus Research LLC +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. +// + +#include "convert_common.hpp" +#include <uhd/utils/byteswap.hpp> +#include <emmintrin.h> + +using namespace uhd::convert; + +static const __m128i zeroi = _mm_setzero_si128(); + +UHD_INLINE void unpack_sc32_8x( + const __m128i &in, + __m128d &out0, __m128d &out1, + __m128d &out2, __m128d &out3, + __m128d &out4, __m128d &out5, + __m128d &out6, __m128d &out7, + const __m128d &scalar, + const int shuf = _MM_SHUFFLE(1, 0, 3, 2) +){ + __m128i tmp; + + const __m128i tmplo = _mm_unpacklo_epi8(zeroi, in); /* value in upper 8 bits */ + tmp = _mm_unpacklo_epi16(zeroi, tmplo); /* value in upper 16 bits */ + out0 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar); + tmp = _mm_shuffle_epi32(tmp, shuf); + out1 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar); + tmp = _mm_unpackhi_epi16(zeroi, tmplo); + out2 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar); + tmp = _mm_shuffle_epi32(tmp, shuf); + out3 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar); + + const __m128i tmphi = _mm_unpackhi_epi8(zeroi, in); + tmp = _mm_unpacklo_epi16(zeroi, tmphi); + out4 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar); + tmp = _mm_shuffle_epi32(tmp, shuf); + out5 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar); + tmp = _mm_unpackhi_epi16(zeroi, tmphi); + out6 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar); + tmp = _mm_shuffle_epi32(tmp, shuf); + out7 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar); +} + +DECLARE_CONVERTER(sc8_item32_be, 1, fc64, 1, PRIORITY_SIMD){ + const item32_t *input = reinterpret_cast<const item32_t *>(size_t(inputs[0]) & ~0x3); + fc64_t *output = reinterpret_cast<fc64_t *>(outputs[0]); + + const __m128d scalar = _mm_set1_pd(scale_factor/(1 << 24)); + + size_t i = 0, j = 0; + fc32_t dummy; + size_t num_samps = nsamps; + + if ((size_t(inputs[0]) & 0x3) != 0){ + item32_sc8_to_xx<uhd::ntohx>(input++, output++, 1, scale_factor); + num_samps--; + } + + #define convert_sc8_item32_1_to_fc64_1_bswap_guts(_al_) \ + for (; j+7 < num_samps; j+=8, i+=4){ \ + /* load from input */ \ + __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \ + \ + /* unpack */ \ + __m128d tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; \ + unpack_sc32_8x(tmpi, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, scalar); \ + \ + /* store to output */ \ + _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+0), tmp0); \ + _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+1), tmp1); \ + _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+2), tmp2); \ + _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+3), tmp3); \ + _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+4), tmp4); \ + _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+5), tmp5); \ + _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+6), tmp6); \ + _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+7), tmp7); \ + } + + //dispatch according to alignment + if ((size_t(output) & 0xf) == 0){ + convert_sc8_item32_1_to_fc64_1_bswap_guts(_) + } + else{ + convert_sc8_item32_1_to_fc64_1_bswap_guts(u_) + } + + //convert remainder + item32_sc8_to_xx<uhd::ntohx>(input+i, output+j, num_samps-j, scale_factor); +} + +DECLARE_CONVERTER(sc8_item32_le, 1, fc64, 1, PRIORITY_SIMD){ + const item32_t *input = reinterpret_cast<const item32_t *>(size_t(inputs[0]) & ~0x3); + fc64_t *output = reinterpret_cast<fc64_t *>(outputs[0]); + + const __m128d scalar = _mm_set1_pd(scale_factor/(1 << 24)); + + size_t i = 0, j = 0; + fc32_t dummy; + size_t num_samps = nsamps; + + if ((size_t(inputs[0]) & 0x3) != 0){ + item32_sc8_to_xx<uhd::wtohx>(input++, output++, 1, scale_factor); + num_samps--; + } + + #define convert_sc8_item32_1_to_fc64_1_nswap_guts(_al_) \ + for (; j+7 < num_samps; j+=8, i+=4){ \ + /* load from input */ \ + __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \ + \ + /* unpack */ \ + __m128d tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; \ + tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); /*byteswap*/\ + unpack_sc32_8x(tmpi, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, scalar); \ + \ + /* store to output */ \ + _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+0), tmp0); \ + _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+1), tmp1); \ + _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+2), tmp2); \ + _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+3), tmp3); \ + _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+4), tmp4); \ + _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+5), tmp5); \ + _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+6), tmp6); \ + _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+7), tmp7); \ + } + + //dispatch according to alignment + if ((size_t(output) & 0xf) == 0){ + convert_sc8_item32_1_to_fc64_1_nswap_guts(_) + } + else{ + convert_sc8_item32_1_to_fc64_1_nswap_guts(u_) + } + + //convert remainder + item32_sc8_to_xx<uhd::wtohx>(input+i, output+j, num_samps-j, scale_factor); +} |