aboutsummaryrefslogtreecommitdiffstats
path: root/host/lib/convert
diff options
context:
space:
mode:
Diffstat (limited to 'host/lib/convert')
-rw-r--r--host/lib/convert/sse2_fc32_to_sc16.cpp161
-rw-r--r--host/lib/convert/sse2_fc32_to_sc8.cpp127
-rw-r--r--host/lib/convert/sse2_fc64_to_sc16.cpp138
-rw-r--r--host/lib/convert/sse2_fc64_to_sc8.cpp153
-rw-r--r--host/lib/convert/sse2_sc16_to_fc32.cpp162
-rw-r--r--host/lib/convert/sse2_sc16_to_fc64.cpp140
-rw-r--r--host/lib/convert/sse2_sc16_to_sc16.cpp240
-rw-r--r--host/lib/convert/sse2_sc8_to_fc32.cpp132
-rw-r--r--host/lib/convert/sse2_sc8_to_fc64.cpp168
9 files changed, 742 insertions, 679 deletions
diff --git a/host/lib/convert/sse2_fc32_to_sc16.cpp b/host/lib/convert/sse2_fc32_to_sc16.cpp
index f562074c6..2d1f853b9 100644
--- a/host/lib/convert/sse2_fc32_to_sc16.cpp
+++ b/host/lib/convert/sse2_fc32_to_sc16.cpp
@@ -1,6 +1,7 @@
//
// Copyright 2011-2012 Ettus Research LLC
// Copyright 2018 Ettus Research, a National Instruments Company
+// Copyright 2019 Ettus Research, a National Instruments Brand
//
// SPDX-License-Identifier: GPL-3.0-or-later
//
@@ -11,101 +12,111 @@
using namespace uhd::convert;
-DECLARE_CONVERTER(fc32, 1, sc16_item32_le, 1, PRIORITY_SIMD){
- const fc32_t *input = reinterpret_cast<const fc32_t *>(inputs[0]);
- item32_t *output = reinterpret_cast<item32_t *>(outputs[0]);
+DECLARE_CONVERTER(fc32, 1, sc16_item32_le, 1, PRIORITY_SIMD)
+{
+ const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
+ item32_t* output = reinterpret_cast<item32_t*>(outputs[0]);
const __m128 scalar = _mm_set_ps1(float(scale_factor));
- // this macro converts values faster by using SSE intrinsics to convert 4 values at a time
- #define convert_fc32_1_to_item32_1_nswap_guts(_al_) \
- for (; i+3 < nsamps; i+=4){ \
- /* load from input */ \
- __m128 tmplo = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+0)); \
- __m128 tmphi = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+2)); \
- \
- /* convert and scale */ \
- __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar)); \
- __m128i tmpihi = _mm_cvtps_epi32(_mm_mul_ps(tmphi, scalar)); \
- \
- /* pack + swap 16-bit pairs */ \
- __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \
- tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \
- tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \
- \
- /* store to output */ \
- _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi); \
- } \
+// this macro converts values faster by using SSE intrinsics to convert 4 values at a time
+#define convert_fc32_1_to_item32_1_nswap_guts(_al_) \
+ for (; i + 3 < nsamps; i += 4) { \
+ /* load from input */ \
+ __m128 tmplo = \
+ _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 0)); \
+ __m128 tmphi = \
+ _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 2)); \
+ \
+ /* convert and scale */ \
+ __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar)); \
+ __m128i tmpihi = _mm_cvtps_epi32(_mm_mul_ps(tmphi, scalar)); \
+ \
+ /* pack + swap 16-bit pairs */ \
+ __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \
+ tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \
+ tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \
+ \
+ /* store to output */ \
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(output + i), tmpi); \
+ }
size_t i = 0;
// need to dispatch according to alignment for fastest conversion
- switch (size_t(input) & 0xf){
- case 0x0:
- // the data is 16-byte aligned, so do the fast processing of the bulk of the samples
- convert_fc32_1_to_item32_1_nswap_guts(_)
- break;
- case 0x8:
- // the first sample is 8-byte aligned - process it to align the remainder of the samples to 16-bytes
- xx_to_item32_sc16<uhd::htowx>(input, output, 1, scale_factor);
- i++;
- // do faster processing of the bulk of the samples now that we are 16-byte aligned
- convert_fc32_1_to_item32_1_nswap_guts(_)
- break;
- default:
- // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load
- convert_fc32_1_to_item32_1_nswap_guts(u_)
+ switch (size_t(input) & 0xf) {
+ case 0x0:
+ // the data is 16-byte aligned, so do the fast processing of the bulk of the
+ // samples
+ convert_fc32_1_to_item32_1_nswap_guts(_) break;
+ case 0x8:
+ // the first sample is 8-byte aligned - process it to align the remainder of
+ // the samples to 16-bytes
+ xx_to_item32_sc16<uhd::htowx>(input, output, 1, scale_factor);
+ i++;
+ // do faster processing of the bulk of the samples now that we are 16-byte
+ // aligned
+ convert_fc32_1_to_item32_1_nswap_guts(_) break;
+ default:
+ // we are not 8 or 16-byte aligned, so do fast processing with the unaligned
+ // load
+ convert_fc32_1_to_item32_1_nswap_guts(u_)
}
// convert any remaining samples
- xx_to_item32_sc16<uhd::htowx>(input+i, output+i, nsamps-i, scale_factor);
+ xx_to_item32_sc16<uhd::htowx>(input + i, output + i, nsamps - i, scale_factor);
}
-DECLARE_CONVERTER(fc32, 1, sc16_item32_be, 1, PRIORITY_SIMD){
- const fc32_t *input = reinterpret_cast<const fc32_t *>(inputs[0]);
- item32_t *output = reinterpret_cast<item32_t *>(outputs[0]);
+DECLARE_CONVERTER(fc32, 1, sc16_item32_be, 1, PRIORITY_SIMD)
+{
+ const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
+ item32_t* output = reinterpret_cast<item32_t*>(outputs[0]);
const __m128 scalar = _mm_set_ps1(float(scale_factor));
- // this macro converts values faster by using SSE intrinsics to convert 4 values at a time
- #define convert_fc32_1_to_item32_1_bswap_guts(_al_) \
- for (; i+3 < nsamps; i+=4){ \
- /* load from input */ \
- __m128 tmplo = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+0)); \
- __m128 tmphi = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+2)); \
- \
- /* convert and scale */ \
- __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar)); \
- __m128i tmpihi = _mm_cvtps_epi32(_mm_mul_ps(tmphi, scalar)); \
- \
- /* pack + byteswap -> byteswap 16 bit words */ \
- __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \
- tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \
- \
- /* store to output */ \
- _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi); \
- } \
+// this macro converts values faster by using SSE intrinsics to convert 4 values at a time
+#define convert_fc32_1_to_item32_1_bswap_guts(_al_) \
+ for (; i + 3 < nsamps; i += 4) { \
+ /* load from input */ \
+ __m128 tmplo = \
+ _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 0)); \
+ __m128 tmphi = \
+ _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 2)); \
+ \
+ /* convert and scale */ \
+ __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar)); \
+ __m128i tmpihi = _mm_cvtps_epi32(_mm_mul_ps(tmphi, scalar)); \
+ \
+ /* pack + byteswap -> byteswap 16 bit words */ \
+ __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \
+ tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \
+ \
+ /* store to output */ \
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(output + i), tmpi); \
+ }
size_t i = 0;
// need to dispatch according to alignment for fastest conversion
- switch (size_t(input) & 0xf){
- case 0x0:
- // the data is 16-byte aligned, so do the fast processing of the bulk of the samples
- convert_fc32_1_to_item32_1_bswap_guts(_)
- break;
- case 0x8:
- // the first value is 8-byte aligned - process it and prepare the bulk of the data for fast conversion
- xx_to_item32_sc16<uhd::htonx>(input, output, 1, scale_factor);
- i++;
- // do faster processing of the remaining samples now that we are 16-byte aligned
- convert_fc32_1_to_item32_1_bswap_guts(_)
- break;
- default:
- // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load
- convert_fc32_1_to_item32_1_bswap_guts(u_)
+ switch (size_t(input) & 0xf) {
+ case 0x0:
+ // the data is 16-byte aligned, so do the fast processing of the bulk of the
+ // samples
+ convert_fc32_1_to_item32_1_bswap_guts(_) break;
+ case 0x8:
+ // the first value is 8-byte aligned - process it and prepare the bulk of the
+ // data for fast conversion
+ xx_to_item32_sc16<uhd::htonx>(input, output, 1, scale_factor);
+ i++;
+ // do faster processing of the remaining samples now that we are 16-byte
+ // aligned
+ convert_fc32_1_to_item32_1_bswap_guts(_) break;
+ default:
+ // we are not 8 or 16-byte aligned, so do fast processing with the unaligned
+ // load
+ convert_fc32_1_to_item32_1_bswap_guts(u_)
}
// convert any remaining samples
- xx_to_item32_sc16<uhd::htonx>(input+i, output+i, nsamps-i, scale_factor);
+ xx_to_item32_sc16<uhd::htonx>(input + i, output + i, nsamps - i, scale_factor);
}
diff --git a/host/lib/convert/sse2_fc32_to_sc8.cpp b/host/lib/convert/sse2_fc32_to_sc8.cpp
index b3f96ea39..66faa82cc 100644
--- a/host/lib/convert/sse2_fc32_to_sc8.cpp
+++ b/host/lib/convert/sse2_fc32_to_sc8.cpp
@@ -12,94 +12,95 @@
using namespace uhd::convert;
template <const int shuf>
-UHD_INLINE __m128i pack_sc32_4x(
- const __m128 &in0, const __m128 &in1,
- const __m128 &in2, const __m128 &in3,
- const __m128 &scalar
-){
- __m128i tmpi0 = _mm_cvtps_epi32(_mm_mul_ps(in0, scalar));
- tmpi0 = _mm_shuffle_epi32(tmpi0, shuf);
- __m128i tmpi1 = _mm_cvtps_epi32(_mm_mul_ps(in1, scalar));
- tmpi1 = _mm_shuffle_epi32(tmpi1, shuf);
+UHD_INLINE __m128i pack_sc32_4x(const __m128& in0,
+ const __m128& in1,
+ const __m128& in2,
+ const __m128& in3,
+ const __m128& scalar)
+{
+ __m128i tmpi0 = _mm_cvtps_epi32(_mm_mul_ps(in0, scalar));
+ tmpi0 = _mm_shuffle_epi32(tmpi0, shuf);
+ __m128i tmpi1 = _mm_cvtps_epi32(_mm_mul_ps(in1, scalar));
+ tmpi1 = _mm_shuffle_epi32(tmpi1, shuf);
const __m128i lo = _mm_packs_epi32(tmpi0, tmpi1);
- __m128i tmpi2 = _mm_cvtps_epi32(_mm_mul_ps(in2, scalar));
- tmpi2 = _mm_shuffle_epi32(tmpi2, shuf);
- __m128i tmpi3 = _mm_cvtps_epi32(_mm_mul_ps(in3, scalar));
- tmpi3 = _mm_shuffle_epi32(tmpi3, shuf);
+ __m128i tmpi2 = _mm_cvtps_epi32(_mm_mul_ps(in2, scalar));
+ tmpi2 = _mm_shuffle_epi32(tmpi2, shuf);
+ __m128i tmpi3 = _mm_cvtps_epi32(_mm_mul_ps(in3, scalar));
+ tmpi3 = _mm_shuffle_epi32(tmpi3, shuf);
const __m128i hi = _mm_packs_epi32(tmpi2, tmpi3);
return _mm_packs_epi16(lo, hi);
}
-DECLARE_CONVERTER(fc32, 1, sc8_item32_be, 1, PRIORITY_SIMD){
- const fc32_t *input = reinterpret_cast<const fc32_t *>(inputs[0]);
- item32_t *output = reinterpret_cast<item32_t *>(outputs[0]);
+DECLARE_CONVERTER(fc32, 1, sc8_item32_be, 1, PRIORITY_SIMD)
+{
+ const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
+ item32_t* output = reinterpret_cast<item32_t*>(outputs[0]);
const __m128 scalar = _mm_set_ps1(float(scale_factor));
- const int shuf = _MM_SHUFFLE(3, 2, 1, 0);
-
- #define convert_fc32_1_to_sc8_item32_1_bswap_guts(_al_) \
- for (size_t j = 0; i+7 < nsamps; i+=8, j+=4){ \
- /* load from input */ \
- __m128 tmp0 = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+0)); \
- __m128 tmp1 = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+2)); \
- __m128 tmp2 = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+4)); \
- __m128 tmp3 = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+6)); \
- \
- /* convert */ \
- const __m128i tmpi = pack_sc32_4x<shuf>(tmp0, tmp1, tmp2, tmp3, scalar); \
- \
- /* store to output */ \
- _mm_storeu_si128(reinterpret_cast<__m128i *>(output+j), tmpi); \
- } \
+ const int shuf = _MM_SHUFFLE(3, 2, 1, 0);
+
+#define convert_fc32_1_to_sc8_item32_1_bswap_guts(_al_) \
+ for (size_t j = 0; i + 7 < nsamps; i += 8, j += 4) { \
+ /* load from input */ \
+ __m128 tmp0 = _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 0)); \
+ __m128 tmp1 = _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 2)); \
+ __m128 tmp2 = _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 4)); \
+ __m128 tmp3 = _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 6)); \
+ \
+ /* convert */ \
+ const __m128i tmpi = pack_sc32_4x<shuf>(tmp0, tmp1, tmp2, tmp3, scalar); \
+ \
+ /* store to output */ \
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(output + j), tmpi); \
+ }
size_t i = 0;
- //dispatch according to alignment
- if ((size_t(input) & 0xf) == 0){
+ // dispatch according to alignment
+ if ((size_t(input) & 0xf) == 0) {
convert_fc32_1_to_sc8_item32_1_bswap_guts(_)
- }
- else{
+ } else {
convert_fc32_1_to_sc8_item32_1_bswap_guts(u_)
}
- //convert remainder
- xx_to_item32_sc8<uhd::htonx>(input+i, output+(i/2), nsamps-i, scale_factor);
+ // convert remainder
+ xx_to_item32_sc8<uhd::htonx>(input + i, output + (i / 2), nsamps - i, scale_factor);
}
-DECLARE_CONVERTER(fc32, 1, sc8_item32_le, 1, PRIORITY_SIMD){
- const fc32_t *input = reinterpret_cast<const fc32_t *>(inputs[0]);
- item32_t *output = reinterpret_cast<item32_t *>(outputs[0]);
+DECLARE_CONVERTER(fc32, 1, sc8_item32_le, 1, PRIORITY_SIMD)
+{
+ const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
+ item32_t* output = reinterpret_cast<item32_t*>(outputs[0]);
const __m128 scalar = _mm_set_ps1(float(scale_factor));
- const int shuf = _MM_SHUFFLE(0, 1, 2, 3);
-
- #define convert_fc32_1_to_sc8_item32_1_nswap_guts(_al_) \
- for (size_t j = 0; i+7 < nsamps; i+=8, j+=4){ \
- /* load from input */ \
- __m128 tmp0 = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+0)); \
- __m128 tmp1 = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+2)); \
- __m128 tmp2 = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+4)); \
- __m128 tmp3 = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+6)); \
- \
- /* convert */ \
- const __m128i tmpi = pack_sc32_4x<shuf>(tmp0, tmp1, tmp2, tmp3, scalar); \
- \
- /* store to output */ \
- _mm_storeu_si128(reinterpret_cast<__m128i *>(output+j), tmpi); \
- } \
+ const int shuf = _MM_SHUFFLE(0, 1, 2, 3);
+
+#define convert_fc32_1_to_sc8_item32_1_nswap_guts(_al_) \
+ for (size_t j = 0; i + 7 < nsamps; i += 8, j += 4) { \
+ /* load from input */ \
+ __m128 tmp0 = _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 0)); \
+ __m128 tmp1 = _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 2)); \
+ __m128 tmp2 = _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 4)); \
+ __m128 tmp3 = _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 6)); \
+ \
+ /* convert */ \
+ const __m128i tmpi = pack_sc32_4x<shuf>(tmp0, tmp1, tmp2, tmp3, scalar); \
+ \
+ /* store to output */ \
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(output + j), tmpi); \
+ }
size_t i = 0;
- //dispatch according to alignment
- if ((size_t(input) & 0xf) == 0){
+ // dispatch according to alignment
+ if ((size_t(input) & 0xf) == 0) {
convert_fc32_1_to_sc8_item32_1_nswap_guts(_)
- }
- else{
+ } else {
convert_fc32_1_to_sc8_item32_1_nswap_guts(u_)
}
- //convert remainder
- xx_to_item32_sc8<uhd::htowx>(input+i, output+(i/2), nsamps-i, scale_factor);
+ // convert remainder
+ xx_to_item32_sc8<uhd::htowx>(input + i, output + (i / 2), nsamps - i, scale_factor);
}
diff --git a/host/lib/convert/sse2_fc64_to_sc16.cpp b/host/lib/convert/sse2_fc64_to_sc16.cpp
index 2004c1fd7..7c2ce1f8e 100644
--- a/host/lib/convert/sse2_fc64_to_sc16.cpp
+++ b/host/lib/convert/sse2_fc64_to_sc16.cpp
@@ -11,91 +11,99 @@
using namespace uhd::convert;
-DECLARE_CONVERTER(fc64, 1, sc16_item32_le, 1, PRIORITY_SIMD){
- const fc64_t *input = reinterpret_cast<const fc64_t *>(inputs[0]);
- item32_t *output = reinterpret_cast<item32_t *>(outputs[0]);
+DECLARE_CONVERTER(fc64, 1, sc16_item32_le, 1, PRIORITY_SIMD)
+{
+ const fc64_t* input = reinterpret_cast<const fc64_t*>(inputs[0]);
+ item32_t* output = reinterpret_cast<item32_t*>(outputs[0]);
const __m128d scalar = _mm_set1_pd(scale_factor);
- #define convert_fc64_1_to_item32_1_nswap_guts(_al_) \
- for (; i+3 < nsamps; i+=4){ \
- /* load from input */ \
- __m128d tmp0 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+0)); \
- __m128d tmp1 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+1)); \
- __m128d tmp2 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+2)); \
- __m128d tmp3 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+3)); \
- \
- /* convert and scale */ \
- __m128i tmpi0 = _mm_cvttpd_epi32(_mm_mul_pd(tmp0, scalar)); \
- __m128i tmpi1 = _mm_cvttpd_epi32(_mm_mul_pd(tmp1, scalar)); \
- __m128i tmpilo = _mm_unpacklo_epi64(tmpi0, tmpi1); \
- __m128i tmpi2 = _mm_cvttpd_epi32(_mm_mul_pd(tmp2, scalar)); \
- __m128i tmpi3 = _mm_cvttpd_epi32(_mm_mul_pd(tmp3, scalar)); \
- __m128i tmpihi = _mm_unpacklo_epi64(tmpi2, tmpi3); \
- \
- /* pack + swap 16-bit pairs */ \
- __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \
- tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \
- tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \
- \
- /* store to output */ \
- _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi); \
- } \
+#define convert_fc64_1_to_item32_1_nswap_guts(_al_) \
+ for (; i + 3 < nsamps; i += 4) { \
+ /* load from input */ \
+ __m128d tmp0 = \
+ _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 0)); \
+ __m128d tmp1 = \
+ _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 1)); \
+ __m128d tmp2 = \
+ _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 2)); \
+ __m128d tmp3 = \
+ _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 3)); \
+ \
+ /* convert and scale */ \
+ __m128i tmpi0 = _mm_cvttpd_epi32(_mm_mul_pd(tmp0, scalar)); \
+ __m128i tmpi1 = _mm_cvttpd_epi32(_mm_mul_pd(tmp1, scalar)); \
+ __m128i tmpilo = _mm_unpacklo_epi64(tmpi0, tmpi1); \
+ __m128i tmpi2 = _mm_cvttpd_epi32(_mm_mul_pd(tmp2, scalar)); \
+ __m128i tmpi3 = _mm_cvttpd_epi32(_mm_mul_pd(tmp3, scalar)); \
+ __m128i tmpihi = _mm_unpacklo_epi64(tmpi2, tmpi3); \
+ \
+ /* pack + swap 16-bit pairs */ \
+ __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \
+ tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \
+ tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \
+ \
+ /* store to output */ \
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(output + i), tmpi); \
+ }
size_t i = 0;
- //dispatch according to alignment
- if ((size_t(input) & 0xf) == 0){
+ // dispatch according to alignment
+ if ((size_t(input) & 0xf) == 0) {
convert_fc64_1_to_item32_1_nswap_guts(_)
- }
- else{
+ } else {
convert_fc64_1_to_item32_1_nswap_guts(u_)
}
- //convert remainder
- xx_to_item32_sc16<uhd::htowx>(input+i, output+i, nsamps-i, scale_factor);
+ // convert remainder
+ xx_to_item32_sc16<uhd::htowx>(input + i, output + i, nsamps - i, scale_factor);
}
-DECLARE_CONVERTER(fc64, 1, sc16_item32_be, 1, PRIORITY_SIMD){
- const fc64_t *input = reinterpret_cast<const fc64_t *>(inputs[0]);
- item32_t *output = reinterpret_cast<item32_t *>(outputs[0]);
+DECLARE_CONVERTER(fc64, 1, sc16_item32_be, 1, PRIORITY_SIMD)
+{
+ const fc64_t* input = reinterpret_cast<const fc64_t*>(inputs[0]);
+ item32_t* output = reinterpret_cast<item32_t*>(outputs[0]);
const __m128d scalar = _mm_set1_pd(scale_factor);
- #define convert_fc64_1_to_item32_1_bswap_guts(_al_) \
- for (; i+3 < nsamps; i+=4){ \
- /* load from input */ \
- __m128d tmp0 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+0)); \
- __m128d tmp1 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+1)); \
- __m128d tmp2 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+2)); \
- __m128d tmp3 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+3)); \
- \
- /* convert and scale */ \
- __m128i tmpi0 = _mm_cvttpd_epi32(_mm_mul_pd(tmp0, scalar)); \
- __m128i tmpi1 = _mm_cvttpd_epi32(_mm_mul_pd(tmp1, scalar)); \
- __m128i tmpilo = _mm_unpacklo_epi64(tmpi0, tmpi1); \
- __m128i tmpi2 = _mm_cvttpd_epi32(_mm_mul_pd(tmp2, scalar)); \
- __m128i tmpi3 = _mm_cvttpd_epi32(_mm_mul_pd(tmp3, scalar)); \
- __m128i tmpihi = _mm_unpacklo_epi64(tmpi2, tmpi3); \
- \
- /* pack + byteswap -> byteswap 16 bit words */ \
- __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \
- tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \
- \
- /* store to output */ \
- _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi); \
- } \
+#define convert_fc64_1_to_item32_1_bswap_guts(_al_) \
+ for (; i + 3 < nsamps; i += 4) { \
+ /* load from input */ \
+ __m128d tmp0 = \
+ _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 0)); \
+ __m128d tmp1 = \
+ _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 1)); \
+ __m128d tmp2 = \
+ _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 2)); \
+ __m128d tmp3 = \
+ _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 3)); \
+ \
+ /* convert and scale */ \
+ __m128i tmpi0 = _mm_cvttpd_epi32(_mm_mul_pd(tmp0, scalar)); \
+ __m128i tmpi1 = _mm_cvttpd_epi32(_mm_mul_pd(tmp1, scalar)); \
+ __m128i tmpilo = _mm_unpacklo_epi64(tmpi0, tmpi1); \
+ __m128i tmpi2 = _mm_cvttpd_epi32(_mm_mul_pd(tmp2, scalar)); \
+ __m128i tmpi3 = _mm_cvttpd_epi32(_mm_mul_pd(tmp3, scalar)); \
+ __m128i tmpihi = _mm_unpacklo_epi64(tmpi2, tmpi3); \
+ \
+ /* pack + byteswap -> byteswap 16 bit words */ \
+ __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \
+ tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \
+ \
+ /* store to output */ \
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(output + i), tmpi); \
+ }
size_t i = 0;
- //dispatch according to alignment
- if ((size_t(input) & 0xf) == 0){
+ // dispatch according to alignment
+ if ((size_t(input) & 0xf) == 0) {
convert_fc64_1_to_item32_1_bswap_guts(_)
- }
- else{
+ } else {
convert_fc64_1_to_item32_1_bswap_guts(u_)
}
- //convert remainder
- xx_to_item32_sc16<uhd::htonx>(input+i, output+i, nsamps-i, scale_factor);
+ // convert remainder
+ xx_to_item32_sc16<uhd::htonx>(input + i, output + i, nsamps - i, scale_factor);
}
diff --git a/host/lib/convert/sse2_fc64_to_sc8.cpp b/host/lib/convert/sse2_fc64_to_sc8.cpp
index 455ca95e3..95db4e927 100644
--- a/host/lib/convert/sse2_fc64_to_sc8.cpp
+++ b/host/lib/convert/sse2_fc64_to_sc8.cpp
@@ -12,108 +12,119 @@
using namespace uhd::convert;
UHD_INLINE __m128i pack_sc8_item32_4x(
- const __m128i &in0, const __m128i &in1,
- const __m128i &in2, const __m128i &in3
-){
+ const __m128i& in0, const __m128i& in1, const __m128i& in2, const __m128i& in3)
+{
const __m128i lo = _mm_packs_epi32(in0, in1);
const __m128i hi = _mm_packs_epi32(in2, in3);
return _mm_packs_epi16(lo, hi);
}
UHD_INLINE __m128i pack_sc32_4x(
- const __m128d &lo, const __m128d &hi,
- const __m128d &scalar
-){
+ const __m128d& lo, const __m128d& hi, const __m128d& scalar)
+{
const __m128i tmpi_lo = _mm_cvttpd_epi32(_mm_mul_pd(hi, scalar));
const __m128i tmpi_hi = _mm_cvttpd_epi32(_mm_mul_pd(lo, scalar));
return _mm_unpacklo_epi64(tmpi_lo, tmpi_hi);
}
-DECLARE_CONVERTER(fc64, 1, sc8_item32_be, 1, PRIORITY_SIMD){
- const fc64_t *input = reinterpret_cast<const fc64_t *>(inputs[0]);
- item32_t *output = reinterpret_cast<item32_t *>(outputs[0]);
+DECLARE_CONVERTER(fc64, 1, sc8_item32_be, 1, PRIORITY_SIMD)
+{
+ const fc64_t* input = reinterpret_cast<const fc64_t*>(inputs[0]);
+ item32_t* output = reinterpret_cast<item32_t*>(outputs[0]);
const __m128d scalar = _mm_set1_pd(scale_factor);
- #define convert_fc64_1_to_sc8_item32_1_bswap_guts(_al_) \
- for (size_t j = 0; i+7 < nsamps; i+=8, j+=4){ \
- /* load from input */ \
- __m128d tmp0 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+0)); \
- __m128d tmp1 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+1)); \
- __m128d tmp2 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+2)); \
- __m128d tmp3 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+3)); \
- __m128d tmp4 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+4)); \
- __m128d tmp5 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+5)); \
- __m128d tmp6 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+6)); \
- __m128d tmp7 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+7)); \
- \
- /* interleave */ \
- const __m128i tmpi = pack_sc8_item32_4x( \
- pack_sc32_4x(tmp1, tmp0, scalar), \
- pack_sc32_4x(tmp3, tmp2, scalar), \
- pack_sc32_4x(tmp5, tmp4, scalar), \
- pack_sc32_4x(tmp7, tmp6, scalar) \
- ); \
- \
- /* store to output */ \
- _mm_storeu_si128(reinterpret_cast<__m128i *>(output+j), tmpi); \
- } \
+#define convert_fc64_1_to_sc8_item32_1_bswap_guts(_al_) \
+ for (size_t j = 0; i + 7 < nsamps; i += 8, j += 4) { \
+ /* load from input */ \
+ __m128d tmp0 = \
+ _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 0)); \
+ __m128d tmp1 = \
+ _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 1)); \
+ __m128d tmp2 = \
+ _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 2)); \
+ __m128d tmp3 = \
+ _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 3)); \
+ __m128d tmp4 = \
+ _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 4)); \
+ __m128d tmp5 = \
+ _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 5)); \
+ __m128d tmp6 = \
+ _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 6)); \
+ __m128d tmp7 = \
+ _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 7)); \
+ \
+ /* interleave */ \
+ const __m128i tmpi = pack_sc8_item32_4x(pack_sc32_4x(tmp1, tmp0, scalar), \
+ pack_sc32_4x(tmp3, tmp2, scalar), \
+ pack_sc32_4x(tmp5, tmp4, scalar), \
+ pack_sc32_4x(tmp7, tmp6, scalar)); \
+ \
+ /* store to output */ \
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(output + j), tmpi); \
+ }
size_t i = 0;
- //dispatch according to alignment
- if ((size_t(input) & 0xf) == 0){
+ // dispatch according to alignment
+ if ((size_t(input) & 0xf) == 0) {
convert_fc64_1_to_sc8_item32_1_bswap_guts(_)
- }
- else{
+ } else {
convert_fc64_1_to_sc8_item32_1_bswap_guts(u_)
}
- //convert remainder
- xx_to_item32_sc8<uhd::htonx>(input+i, output+(i/2), nsamps-i, scale_factor);
+ // convert remainder
+ xx_to_item32_sc8<uhd::htonx>(input + i, output + (i / 2), nsamps - i, scale_factor);
}
-DECLARE_CONVERTER(fc64, 1, sc8_item32_le, 1, PRIORITY_SIMD){
- const fc64_t *input = reinterpret_cast<const fc64_t *>(inputs[0]);
- item32_t *output = reinterpret_cast<item32_t *>(outputs[0]);
+DECLARE_CONVERTER(fc64, 1, sc8_item32_le, 1, PRIORITY_SIMD)
+{
+ const fc64_t* input = reinterpret_cast<const fc64_t*>(inputs[0]);
+ item32_t* output = reinterpret_cast<item32_t*>(outputs[0]);
const __m128d scalar = _mm_set1_pd(scale_factor);
- #define convert_fc64_1_to_sc8_item32_1_nswap_guts(_al_) \
- for (size_t j = 0; i+7 < nsamps; i+=8, j+=4){ \
- /* load from input */ \
- __m128d tmp0 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+0)); \
- __m128d tmp1 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+1)); \
- __m128d tmp2 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+2)); \
- __m128d tmp3 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+3)); \
- __m128d tmp4 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+4)); \
- __m128d tmp5 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+5)); \
- __m128d tmp6 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+6)); \
- __m128d tmp7 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+7)); \
- \
- /* interleave */ \
- __m128i tmpi = pack_sc8_item32_4x( \
- pack_sc32_4x(tmp0, tmp1, scalar), \
- pack_sc32_4x(tmp2, tmp3, scalar), \
- pack_sc32_4x(tmp4, tmp5, scalar), \
- pack_sc32_4x(tmp6, tmp7, scalar) \
- ); \
- tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); /*byteswap*/\
- \
- /* store to output */ \
- _mm_storeu_si128(reinterpret_cast<__m128i *>(output+j), tmpi); \
- } \
+#define convert_fc64_1_to_sc8_item32_1_nswap_guts(_al_) \
+ for (size_t j = 0; i + 7 < nsamps; i += 8, j += 4) { \
+ /* load from input */ \
+ __m128d tmp0 = \
+ _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 0)); \
+ __m128d tmp1 = \
+ _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 1)); \
+ __m128d tmp2 = \
+ _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 2)); \
+ __m128d tmp3 = \
+ _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 3)); \
+ __m128d tmp4 = \
+ _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 4)); \
+ __m128d tmp5 = \
+ _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 5)); \
+ __m128d tmp6 = \
+ _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 6)); \
+ __m128d tmp7 = \
+ _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 7)); \
+ \
+ /* interleave */ \
+ __m128i tmpi = pack_sc8_item32_4x(pack_sc32_4x(tmp0, tmp1, scalar), \
+ pack_sc32_4x(tmp2, tmp3, scalar), \
+ pack_sc32_4x(tmp4, tmp5, scalar), \
+ pack_sc32_4x(tmp6, tmp7, scalar)); \
+ tmpi = \
+ _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); /*byteswap*/ \
+ \
+ /* store to output */ \
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(output + j), tmpi); \
+ }
size_t i = 0;
- //dispatch according to alignment
- if ((size_t(input) & 0xf) == 0){
+ // dispatch according to alignment
+ if ((size_t(input) & 0xf) == 0) {
convert_fc64_1_to_sc8_item32_1_nswap_guts(_)
- }
- else{
+ } else {
convert_fc64_1_to_sc8_item32_1_nswap_guts(u_)
}
- //convert remainder
- xx_to_item32_sc8<uhd::htowx>(input+i, output+(i/2), nsamps-i, scale_factor);
+ // convert remainder
+ xx_to_item32_sc8<uhd::htowx>(input + i, output + (i / 2), nsamps - i, scale_factor);
}
diff --git a/host/lib/convert/sse2_sc16_to_fc32.cpp b/host/lib/convert/sse2_sc16_to_fc32.cpp
index d75c4a2a7..a16ef30d4 100644
--- a/host/lib/convert/sse2_sc16_to_fc32.cpp
+++ b/host/lib/convert/sse2_sc16_to_fc32.cpp
@@ -11,105 +11,111 @@
using namespace uhd::convert;
-DECLARE_CONVERTER(sc16_item32_le, 1, fc32, 1, PRIORITY_SIMD){
- const item32_t *input = reinterpret_cast<const item32_t *>(inputs[0]);
- fc32_t *output = reinterpret_cast<fc32_t *>(outputs[0]);
+DECLARE_CONVERTER(sc16_item32_le, 1, fc32, 1, PRIORITY_SIMD)
+{
+ const item32_t* input = reinterpret_cast<const item32_t*>(inputs[0]);
+ fc32_t* output = reinterpret_cast<fc32_t*>(outputs[0]);
- const __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 16));
+ const __m128 scalar = _mm_set_ps1(float(scale_factor) / (1 << 16));
const __m128i zeroi = _mm_setzero_si128();
- // this macro converts values faster by using SSE intrinsics to convert 4 values at a time
- #define convert_item32_1_to_fc32_1_nswap_guts(_al_) \
- for (; i+3 < nsamps; i+=4){ \
- /* load from input */ \
- __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \
- \
- /* unpack + swap 16-bit pairs */ \
- tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \
- tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \
+// this macro converts values faster by using SSE intrinsics to convert 4 values at a time
+#define convert_item32_1_to_fc32_1_nswap_guts(_al_) \
+ for (; i + 3 < nsamps; i += 4) { \
+ /* load from input */ \
+ __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input + i)); \
+ \
+ /* unpack + swap 16-bit pairs */ \
+ tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \
+ tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \
__m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \
- __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi); \
- \
- /* convert and scale */ \
- __m128 tmplo = _mm_mul_ps(_mm_cvtepi32_ps(tmpilo), scalar); \
- __m128 tmphi = _mm_mul_ps(_mm_cvtepi32_ps(tmpihi), scalar); \
- \
- /* store to output */ \
- _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+i+0), tmplo); \
- _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+i+2), tmphi); \
- } \
+ __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi); \
+ \
+ /* convert and scale */ \
+ __m128 tmplo = _mm_mul_ps(_mm_cvtepi32_ps(tmpilo), scalar); \
+ __m128 tmphi = _mm_mul_ps(_mm_cvtepi32_ps(tmpihi), scalar); \
+ \
+ /* store to output */ \
+ _mm_store##_al_##ps(reinterpret_cast<float*>(output + i + 0), tmplo); \
+ _mm_store##_al_##ps(reinterpret_cast<float*>(output + i + 2), tmphi); \
+ }
size_t i = 0;
// need to dispatch according to alignment for fastest conversion
- switch (size_t(output) & 0xf){
- case 0x0:
- // the data is 16-byte aligned, so do the fast processing of the bulk of the samples
- convert_item32_1_to_fc32_1_nswap_guts(_)
- break;
- case 0x8:
- // the first sample is 8-byte aligned - process it to align the remainder of the samples to 16-bytes
- item32_sc16_to_xx<uhd::htowx>(input, output, 1, scale_factor);
- i++;
- // do faster processing of the bulk of the samples now that we are 16-byte aligned
- convert_item32_1_to_fc32_1_nswap_guts(_)
- break;
- default:
- // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load and store
- convert_item32_1_to_fc32_1_nswap_guts(u_)
+ switch (size_t(output) & 0xf) {
+ case 0x0:
+ // the data is 16-byte aligned, so do the fast processing of the bulk of the
+ // samples
+ convert_item32_1_to_fc32_1_nswap_guts(_) break;
+ case 0x8:
+ // the first sample is 8-byte aligned - process it to align the remainder of
+ // the samples to 16-bytes
+ item32_sc16_to_xx<uhd::htowx>(input, output, 1, scale_factor);
+ i++;
+ // do faster processing of the bulk of the samples now that we are 16-byte
+ // aligned
+ convert_item32_1_to_fc32_1_nswap_guts(_) break;
+ default:
+ // we are not 8 or 16-byte aligned, so do fast processing with the unaligned
+ // load and store
+ convert_item32_1_to_fc32_1_nswap_guts(u_)
}
// convert any remaining samples
- item32_sc16_to_xx<uhd::htowx>(input+i, output+i, nsamps-i, scale_factor);
+ item32_sc16_to_xx<uhd::htowx>(input + i, output + i, nsamps - i, scale_factor);
}
-DECLARE_CONVERTER(sc16_item32_be, 1, fc32, 1, PRIORITY_SIMD){
- const item32_t *input = reinterpret_cast<const item32_t *>(inputs[0]);
- fc32_t *output = reinterpret_cast<fc32_t *>(outputs[0]);
+DECLARE_CONVERTER(sc16_item32_be, 1, fc32, 1, PRIORITY_SIMD)
+{
+ const item32_t* input = reinterpret_cast<const item32_t*>(inputs[0]);
+ fc32_t* output = reinterpret_cast<fc32_t*>(outputs[0]);
- const __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 16));
+ const __m128 scalar = _mm_set_ps1(float(scale_factor) / (1 << 16));
const __m128i zeroi = _mm_setzero_si128();
- // this macro converts values faster by using SSE intrinsics to convert 4 values at a time
- #define convert_item32_1_to_fc32_1_bswap_guts(_al_) \
- for (; i+3 < nsamps; i+=4){ \
- /* load from input */ \
- __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \
- \
- /* byteswap + unpack -> byteswap 16 bit words */ \
- tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \
- __m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \
- __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi); \
- \
- /* convert and scale */ \
- __m128 tmplo = _mm_mul_ps(_mm_cvtepi32_ps(tmpilo), scalar); \
- __m128 tmphi = _mm_mul_ps(_mm_cvtepi32_ps(tmpihi), scalar); \
- \
- /* store to output */ \
- _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+i+0), tmplo); \
- _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+i+2), tmphi); \
- } \
+// this macro converts values faster by using SSE intrinsics to convert 4 values at a time
+#define convert_item32_1_to_fc32_1_bswap_guts(_al_) \
+ for (; i + 3 < nsamps; i += 4) { \
+ /* load from input */ \
+ __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input + i)); \
+ \
+ /* byteswap + unpack -> byteswap 16 bit words */ \
+ tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \
+ __m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \
+ __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi); \
+ \
+ /* convert and scale */ \
+ __m128 tmplo = _mm_mul_ps(_mm_cvtepi32_ps(tmpilo), scalar); \
+ __m128 tmphi = _mm_mul_ps(_mm_cvtepi32_ps(tmpihi), scalar); \
+ \
+ /* store to output */ \
+ _mm_store##_al_##ps(reinterpret_cast<float*>(output + i + 0), tmplo); \
+ _mm_store##_al_##ps(reinterpret_cast<float*>(output + i + 2), tmphi); \
+ }
size_t i = 0;
// need to dispatch according to alignment for fastest conversion
- switch (size_t(output) & 0xf){
- case 0x0:
- // the data is 16-byte aligned, so do the fast processing of the bulk of the samples
- convert_item32_1_to_fc32_1_bswap_guts(_)
- break;
- case 0x8:
- // the first sample is 8-byte aligned - process it to align the remainder of the samples to 16-bytes
- item32_sc16_to_xx<uhd::htonx>(input, output, 1, scale_factor);
- i++;
- // do faster processing of the bulk of the samples now that we are 16-byte aligned
- convert_item32_1_to_fc32_1_bswap_guts(_)
- break;
- default:
- // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load and store
- convert_item32_1_to_fc32_1_bswap_guts(u_)
+ switch (size_t(output) & 0xf) {
+ case 0x0:
+ // the data is 16-byte aligned, so do the fast processing of the bulk of the
+ // samples
+ convert_item32_1_to_fc32_1_bswap_guts(_) break;
+ case 0x8:
+ // the first sample is 8-byte aligned - process it to align the remainder of
+ // the samples to 16-bytes
+ item32_sc16_to_xx<uhd::htonx>(input, output, 1, scale_factor);
+ i++;
+ // do faster processing of the bulk of the samples now that we are 16-byte
+ // aligned
+ convert_item32_1_to_fc32_1_bswap_guts(_) break;
+ default:
+ // we are not 8 or 16-byte aligned, so do fast processing with the unaligned
+ // load and store
+ convert_item32_1_to_fc32_1_bswap_guts(u_)
}
// convert any remaining samples
- item32_sc16_to_xx<uhd::htonx>(input+i, output+i, nsamps-i, scale_factor);
+ item32_sc16_to_xx<uhd::htonx>(input + i, output + i, nsamps - i, scale_factor);
}
diff --git a/host/lib/convert/sse2_sc16_to_fc64.cpp b/host/lib/convert/sse2_sc16_to_fc64.cpp
index 7f22fd07f..45821ac9f 100644
--- a/host/lib/convert/sse2_sc16_to_fc64.cpp
+++ b/host/lib/convert/sse2_sc16_to_fc64.cpp
@@ -11,95 +11,95 @@
using namespace uhd::convert;
-DECLARE_CONVERTER(sc16_item32_le, 1, fc64, 1, PRIORITY_SIMD){
- const item32_t *input = reinterpret_cast<const item32_t *>(inputs[0]);
- fc64_t *output = reinterpret_cast<fc64_t *>(outputs[0]);
+DECLARE_CONVERTER(sc16_item32_le, 1, fc64, 1, PRIORITY_SIMD)
+{
+ const item32_t* input = reinterpret_cast<const item32_t*>(inputs[0]);
+ fc64_t* output = reinterpret_cast<fc64_t*>(outputs[0]);
- const __m128d scalar = _mm_set1_pd(scale_factor/(1 << 16));
- const __m128i zeroi = _mm_setzero_si128();
+ const __m128d scalar = _mm_set1_pd(scale_factor / (1 << 16));
+ const __m128i zeroi = _mm_setzero_si128();
- #define convert_item32_1_to_fc64_1_nswap_guts(_al_) \
- for (; i+3 < nsamps; i+=4){ \
- /* load from input */ \
- __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \
- \
- /* unpack + swap 16-bit pairs */ \
- tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \
- tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \
+#define convert_item32_1_to_fc64_1_nswap_guts(_al_) \
+ for (; i + 3 < nsamps; i += 4) { \
+ /* load from input */ \
+ __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input + i)); \
+ \
+ /* unpack + swap 16-bit pairs */ \
+ tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \
+ tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \
__m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \
- __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi); \
- \
- /* convert and scale */ \
- __m128d tmp0 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar); \
- tmpilo = _mm_unpackhi_epi64(tmpilo, zeroi); \
- __m128d tmp1 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar); \
- __m128d tmp2 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar); \
- tmpihi = _mm_unpackhi_epi64(tmpihi, zeroi); \
- __m128d tmp3 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar); \
- \
- /* store to output */ \
- _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+i+0), tmp0); \
- _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+i+1), tmp1); \
- _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+i+2), tmp2); \
- _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+i+3), tmp3); \
- } \
+ __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi); \
+ \
+ /* convert and scale */ \
+ __m128d tmp0 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar); \
+ tmpilo = _mm_unpackhi_epi64(tmpilo, zeroi); \
+ __m128d tmp1 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar); \
+ __m128d tmp2 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar); \
+ tmpihi = _mm_unpackhi_epi64(tmpihi, zeroi); \
+ __m128d tmp3 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar); \
+ \
+ /* store to output */ \
+ _mm_store##_al_##pd(reinterpret_cast<double*>(output + i + 0), tmp0); \
+ _mm_store##_al_##pd(reinterpret_cast<double*>(output + i + 1), tmp1); \
+ _mm_store##_al_##pd(reinterpret_cast<double*>(output + i + 2), tmp2); \
+ _mm_store##_al_##pd(reinterpret_cast<double*>(output + i + 3), tmp3); \
+ }
size_t i = 0;
- //dispatch according to alignment
- if ((size_t(output) & 0xf) == 0){
+ // dispatch according to alignment
+ if ((size_t(output) & 0xf) == 0) {
convert_item32_1_to_fc64_1_nswap_guts(_)
- }
- else{
+ } else {
convert_item32_1_to_fc64_1_nswap_guts(u_)
}
- //convert remainder
- item32_sc16_to_xx<uhd::htowx>(input+i, output+i, nsamps-i, scale_factor);
+ // convert remainder
+ item32_sc16_to_xx<uhd::htowx>(input + i, output + i, nsamps - i, scale_factor);
}
-DECLARE_CONVERTER(sc16_item32_be, 1, fc64, 1, PRIORITY_SIMD){
- const item32_t *input = reinterpret_cast<const item32_t *>(inputs[0]);
- fc64_t *output = reinterpret_cast<fc64_t *>(outputs[0]);
+DECLARE_CONVERTER(sc16_item32_be, 1, fc64, 1, PRIORITY_SIMD)
+{
+ const item32_t* input = reinterpret_cast<const item32_t*>(inputs[0]);
+ fc64_t* output = reinterpret_cast<fc64_t*>(outputs[0]);
- const __m128d scalar = _mm_set1_pd(scale_factor/(1 << 16));
- const __m128i zeroi = _mm_setzero_si128();
+ const __m128d scalar = _mm_set1_pd(scale_factor / (1 << 16));
+ const __m128i zeroi = _mm_setzero_si128();
- #define convert_item32_1_to_fc64_1_bswap_guts(_al_) \
- for (; i+3 < nsamps; i+=4){ \
- /* load from input */ \
- __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \
- \
- /* byteswap + unpack -> byteswap 16 bit words */ \
- tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \
- __m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \
- __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi); \
- \
- /* convert and scale */ \
- __m128d tmp0 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar); \
- tmpilo = _mm_unpackhi_epi64(tmpilo, zeroi); \
- __m128d tmp1 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar); \
- __m128d tmp2 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar); \
- tmpihi = _mm_unpackhi_epi64(tmpihi, zeroi); \
- __m128d tmp3 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar); \
- \
- /* store to output */ \
- _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+i+0), tmp0); \
- _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+i+1), tmp1); \
- _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+i+2), tmp2); \
- _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+i+3), tmp3); \
- } \
+#define convert_item32_1_to_fc64_1_bswap_guts(_al_) \
+ for (; i + 3 < nsamps; i += 4) { \
+ /* load from input */ \
+ __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input + i)); \
+ \
+ /* byteswap + unpack -> byteswap 16 bit words */ \
+ tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \
+ __m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \
+ __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi); \
+ \
+ /* convert and scale */ \
+ __m128d tmp0 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar); \
+ tmpilo = _mm_unpackhi_epi64(tmpilo, zeroi); \
+ __m128d tmp1 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar); \
+ __m128d tmp2 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar); \
+ tmpihi = _mm_unpackhi_epi64(tmpihi, zeroi); \
+ __m128d tmp3 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar); \
+ \
+ /* store to output */ \
+ _mm_store##_al_##pd(reinterpret_cast<double*>(output + i + 0), tmp0); \
+ _mm_store##_al_##pd(reinterpret_cast<double*>(output + i + 1), tmp1); \
+ _mm_store##_al_##pd(reinterpret_cast<double*>(output + i + 2), tmp2); \
+ _mm_store##_al_##pd(reinterpret_cast<double*>(output + i + 3), tmp3); \
+ }
size_t i = 0;
- //dispatch according to alignment
- if ((size_t(output) & 0xf) == 0){
+ // dispatch according to alignment
+ if ((size_t(output) & 0xf) == 0) {
convert_item32_1_to_fc64_1_bswap_guts(_)
- }
- else{
+ } else {
convert_item32_1_to_fc64_1_bswap_guts(u_)
}
- //convert remainder
- item32_sc16_to_xx<uhd::htonx>(input+i, output+i, nsamps-i, scale_factor);
+ // convert remainder
+ item32_sc16_to_xx<uhd::htonx>(input + i, output + i, nsamps - i, scale_factor);
}
diff --git a/host/lib/convert/sse2_sc16_to_sc16.cpp b/host/lib/convert/sse2_sc16_to_sc16.cpp
index 5c81f357b..e484bee31 100644
--- a/host/lib/convert/sse2_sc16_to_sc16.cpp
+++ b/host/lib/convert/sse2_sc16_to_sc16.cpp
@@ -25,20 +25,20 @@ using namespace uhd::convert;
// | C | D | A | B | Output
// -----------------
//
-#define CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(_ialign_,_oalign_) \
- for (; i+3 < nsamps; i+=4) { \
- __m128i m0; \
- \
- /* load from input */ \
- m0 = _mm_load ## _ialign_ ## si128((const __m128i *) (input+i));\
- \
- /* swap 16-bit pairs */ \
- m0 = _mm_shufflelo_epi16(m0, _MM_SHUFFLE(2, 3, 0, 1)); \
- m0 = _mm_shufflehi_epi16(m0, _MM_SHUFFLE(2, 3, 0, 1)); \
- \
- /* store to output */ \
- _mm_store ## _oalign_ ## si128((__m128i *) (output+i), m0); \
- } \
+#define CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(_ialign_, _oalign_) \
+ for (; i + 3 < nsamps; i += 4) { \
+ __m128i m0; \
+ \
+ /* load from input */ \
+ m0 = _mm_load##_ialign_##si128((const __m128i*)(input + i)); \
+ \
+ /* swap 16-bit pairs */ \
+ m0 = _mm_shufflelo_epi16(m0, _MM_SHUFFLE(2, 3, 0, 1)); \
+ m0 = _mm_shufflehi_epi16(m0, _MM_SHUFFLE(2, 3, 0, 1)); \
+ \
+ /* store to output */ \
+ _mm_store##_oalign_##si128((__m128i*)(output + i), m0); \
+ }
//
// SSE byte swap
@@ -54,138 +54,158 @@ using namespace uhd::convert;
// | B | A | D | C | Output
// -----------------
//
-#define CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(_ialign_,_oalign_) \
- for (; i+3 < nsamps; i+=4) { \
- __m128i m0, m1, m2; \
- \
- /* load from input */ \
- m0 = _mm_load ## _ialign_ ## si128((const __m128i *) (input+i));\
- \
- /* byteswap 16 bit words */ \
- m1 = _mm_srli_epi16(m0, 8); \
- m2 = _mm_slli_epi16(m0, 8); \
- m0 = _mm_or_si128(m1, m2); \
- \
- /* store to output */ \
- _mm_store ## _oalign_ ## si128((__m128i *) (output+i), m0); \
- } \
-
-DECLARE_CONVERTER(sc16, 1, sc16_item32_le, 1, PRIORITY_SIMD){
- const sc16_t *input = reinterpret_cast<const sc16_t *>(inputs[0]);
- item32_t *output = reinterpret_cast<item32_t *>(outputs[0]);
+#define CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(_ialign_, _oalign_) \
+ for (; i + 3 < nsamps; i += 4) { \
+ __m128i m0, m1, m2; \
+ \
+ /* load from input */ \
+ m0 = _mm_load##_ialign_##si128((const __m128i*)(input + i)); \
+ \
+ /* byteswap 16 bit words */ \
+ m1 = _mm_srli_epi16(m0, 8); \
+ m2 = _mm_slli_epi16(m0, 8); \
+ m0 = _mm_or_si128(m1, m2); \
+ \
+ /* store to output */ \
+ _mm_store##_oalign_##si128((__m128i*)(output + i), m0); \
+ }
+
+DECLARE_CONVERTER(sc16, 1, sc16_item32_le, 1, PRIORITY_SIMD)
+{
+ const sc16_t* input = reinterpret_cast<const sc16_t*>(inputs[0]);
+ item32_t* output = reinterpret_cast<item32_t*>(outputs[0]);
size_t i = 0;
// need to dispatch according to alignment for fastest conversion
- switch (size_t(input) & 0xf){
- case 0x0:
- // the data is 16-byte aligned, so do the fast processing of the bulk of the samples
- CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(_,u_)
- break;
- case 0x8:
- if (nsamps < 2)
+ switch (size_t(input) & 0xf) {
+ case 0x0:
+ // the data is 16-byte aligned, so do the fast processing of the bulk of the
+ // samples
+ CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(_, u_)
+ break;
+ case 0x8:
+ if (nsamps < 2)
+ break;
+ // the first sample is 8-byte aligned - process it to align the remainder of
+ // the samples to 16-bytes
+ xx_to_item32_sc16<uhd::htowx>(input, output, 2, 1.0);
+ i += 2;
+ CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(_, u_)
+ // do faster processing of the bulk of the samples now that we are 16-byte
+ // aligned
break;
- // the first sample is 8-byte aligned - process it to align the remainder of the samples to 16-bytes
- xx_to_item32_sc16<uhd::htowx>(input, output, 2, 1.0);
- i += 2;
- CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(_,u_)
- // do faster processing of the bulk of the samples now that we are 16-byte aligned
- break;
- default:
- // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load
- CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_,u_)
+ default:
+ // we are not 8 or 16-byte aligned, so do fast processing with the unaligned
+ // load
+ CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_, u_)
}
// convert any remaining samples
- xx_to_item32_sc16<uhd::htowx>(input+i, output+i, nsamps-i, 1.0);
+ xx_to_item32_sc16<uhd::htowx>(input + i, output + i, nsamps - i, 1.0);
}
-DECLARE_CONVERTER(sc16, 1, sc16_item32_be, 1, PRIORITY_SIMD){
- const sc16_t *input = reinterpret_cast<const sc16_t *>(inputs[0]);
- item32_t *output = reinterpret_cast<item32_t *>(outputs[0]);
+DECLARE_CONVERTER(sc16, 1, sc16_item32_be, 1, PRIORITY_SIMD)
+{
+ const sc16_t* input = reinterpret_cast<const sc16_t*>(inputs[0]);
+ item32_t* output = reinterpret_cast<item32_t*>(outputs[0]);
size_t i = 0;
// need to dispatch according to alignment for fastest conversion
- switch (size_t(input) & 0xf){
- case 0x0:
- // the data is 16-byte aligned, so do the fast processing of the bulk of the samples
- CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(_,u_)
- break;
- case 0x8:
- if (nsamps < 2)
+ switch (size_t(input) & 0xf) {
+ case 0x0:
+ // the data is 16-byte aligned, so do the fast processing of the bulk of the
+ // samples
+ CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(_, u_)
break;
- // the first value is 8-byte aligned - process it and prepare the bulk of the data for fast conversion
- xx_to_item32_sc16<uhd::htonx>(input, output, 2, 1.0);
- i += 2;
- // do faster processing of the remaining samples now that we are 16-byte aligned
- CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(_,u_)
- break;
- default:
- // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load
- CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_,u_)
+ case 0x8:
+ if (nsamps < 2)
+ break;
+ // the first value is 8-byte aligned - process it and prepare the bulk of the
+ // data for fast conversion
+ xx_to_item32_sc16<uhd::htonx>(input, output, 2, 1.0);
+ i += 2;
+ // do faster processing of the remaining samples now that we are 16-byte
+ // aligned
+ CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(_, u_)
+ break;
+ default:
+ // we are not 8 or 16-byte aligned, so do fast processing with the unaligned
+ // load
+ CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_, u_)
}
// convert any remaining samples
- xx_to_item32_sc16<uhd::htonx>(input+i, output+i, nsamps-i, 1.0);
+ xx_to_item32_sc16<uhd::htonx>(input + i, output + i, nsamps - i, 1.0);
}
-DECLARE_CONVERTER(sc16_item32_le, 1, sc16, 1, PRIORITY_SIMD){
- const item32_t *input = reinterpret_cast<const item32_t *>(inputs[0]);
- sc16_t *output = reinterpret_cast<sc16_t *>(outputs[0]);
+DECLARE_CONVERTER(sc16_item32_le, 1, sc16, 1, PRIORITY_SIMD)
+{
+ const item32_t* input = reinterpret_cast<const item32_t*>(inputs[0]);
+ sc16_t* output = reinterpret_cast<sc16_t*>(outputs[0]);
size_t i = 0;
// need to dispatch according to alignment for fastest conversion
- switch (size_t(output) & 0xf){
- case 0x0:
- // the data is 16-byte aligned, so do the fast processing of the bulk of the samples
- CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_,_)
- break;
- case 0x8:
- if (nsamps < 2)
+ switch (size_t(output) & 0xf) {
+ case 0x0:
+ // the data is 16-byte aligned, so do the fast processing of the bulk of the
+ // samples
+ CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_, _)
+ break;
+ case 0x8:
+ if (nsamps < 2)
+ break;
+ // the first sample is 8-byte aligned - process it to align the remainder of
+ // the samples to 16-bytes
+ item32_sc16_to_xx<uhd::htowx>(input, output, 2, 1.0);
+ i += 2;
+ // do faster processing of the bulk of the samples now that we are 16-byte
+ // aligned
+ CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_, _)
break;
- // the first sample is 8-byte aligned - process it to align the remainder of the samples to 16-bytes
- item32_sc16_to_xx<uhd::htowx>(input, output, 2, 1.0);
- i += 2;
- // do faster processing of the bulk of the samples now that we are 16-byte aligned
- CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_,_)
- break;
- default:
- // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load and store
- CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_,u_)
+ default:
+ // we are not 8 or 16-byte aligned, so do fast processing with the unaligned
+ // load and store
+ CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_, u_)
}
// convert any remaining samples
- item32_sc16_to_xx<uhd::htowx>(input+i, output+i, nsamps-i, 1.0);
+ item32_sc16_to_xx<uhd::htowx>(input + i, output + i, nsamps - i, 1.0);
}
-DECLARE_CONVERTER(sc16_item32_be, 1, sc16, 1, PRIORITY_SIMD){
- const item32_t *input = reinterpret_cast<const item32_t *>(inputs[0]);
- sc16_t *output = reinterpret_cast<sc16_t *>(outputs[0]);
+DECLARE_CONVERTER(sc16_item32_be, 1, sc16, 1, PRIORITY_SIMD)
+{
+ const item32_t* input = reinterpret_cast<const item32_t*>(inputs[0]);
+ sc16_t* output = reinterpret_cast<sc16_t*>(outputs[0]);
size_t i = 0;
// need to dispatch according to alignment for fastest conversion
- switch (size_t(output) & 0xf){
- case 0x0:
- // the data is 16-byte aligned, so do the fast processing of the bulk of the samples
- CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_,_)
- break;
- case 0x8:
- if (nsamps < 2)
+ switch (size_t(output) & 0xf) {
+ case 0x0:
+ // the data is 16-byte aligned, so do the fast processing of the bulk of the
+ // samples
+ CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_, _)
+ break;
+ case 0x8:
+ if (nsamps < 2)
+ break;
+ // the first sample is 8-byte aligned - process it to align the remainder of
+ // the samples to 16-bytes
+ item32_sc16_to_xx<uhd::htonx>(input, output, 2, 1.0);
+ i += 2;
+ // do faster processing of the bulk of the samples now that we are 16-byte
+ // aligned
+ CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_, _)
break;
- // the first sample is 8-byte aligned - process it to align the remainder of the samples to 16-bytes
- item32_sc16_to_xx<uhd::htonx>(input, output, 2, 1.0);
- i += 2;
- // do faster processing of the bulk of the samples now that we are 16-byte aligned
- CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_,_)
- break;
- default:
- // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load and store
- CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_,u_)
+ default:
+ // we are not 8 or 16-byte aligned, so do fast processing with the unaligned
+ // load and store
+ CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_, u_)
}
// convert any remaining samples
- item32_sc16_to_xx<uhd::htonx>(input+i, output+i, nsamps-i, 1.0);
+ item32_sc16_to_xx<uhd::htonx>(input + i, output + i, nsamps - i, 1.0);
}
diff --git a/host/lib/convert/sse2_sc8_to_fc32.cpp b/host/lib/convert/sse2_sc8_to_fc32.cpp
index 6d68850bf..aefda2b13 100644
--- a/host/lib/convert/sse2_sc8_to_fc32.cpp
+++ b/host/lib/convert/sse2_sc8_to_fc32.cpp
@@ -14,109 +14,111 @@ using namespace uhd::convert;
static const __m128i zeroi = _mm_setzero_si128();
template <const int shuf>
-UHD_INLINE void unpack_sc32_4x(
- const __m128i &in,
- __m128 &out0, __m128 &out1,
- __m128 &out2, __m128 &out3,
- const __m128 &scalar
-){
+UHD_INLINE void unpack_sc32_4x(const __m128i& in,
+ __m128& out0,
+ __m128& out1,
+ __m128& out2,
+ __m128& out3,
+ const __m128& scalar)
+{
const __m128i tmplo = _mm_unpacklo_epi8(zeroi, in); /* value in upper 8 bits */
- __m128i tmp0 = _mm_shuffle_epi32(_mm_unpacklo_epi16(zeroi, tmplo), shuf); /* value in upper 16 bits */
+ __m128i tmp0 = _mm_shuffle_epi32(
+ _mm_unpacklo_epi16(zeroi, tmplo), shuf); /* value in upper 16 bits */
__m128i tmp1 = _mm_shuffle_epi32(_mm_unpackhi_epi16(zeroi, tmplo), shuf);
- out0 = _mm_mul_ps(_mm_cvtepi32_ps(tmp0), scalar);
- out1 = _mm_mul_ps(_mm_cvtepi32_ps(tmp1), scalar);
+ out0 = _mm_mul_ps(_mm_cvtepi32_ps(tmp0), scalar);
+ out1 = _mm_mul_ps(_mm_cvtepi32_ps(tmp1), scalar);
const __m128i tmphi = _mm_unpackhi_epi8(zeroi, in);
- __m128i tmp2 = _mm_shuffle_epi32(_mm_unpacklo_epi16(zeroi, tmphi), shuf);
- __m128i tmp3 = _mm_shuffle_epi32(_mm_unpackhi_epi16(zeroi, tmphi), shuf);
- out2 = _mm_mul_ps(_mm_cvtepi32_ps(tmp2), scalar);
- out3 = _mm_mul_ps(_mm_cvtepi32_ps(tmp3), scalar);
+ __m128i tmp2 = _mm_shuffle_epi32(_mm_unpacklo_epi16(zeroi, tmphi), shuf);
+ __m128i tmp3 = _mm_shuffle_epi32(_mm_unpackhi_epi16(zeroi, tmphi), shuf);
+ out2 = _mm_mul_ps(_mm_cvtepi32_ps(tmp2), scalar);
+ out3 = _mm_mul_ps(_mm_cvtepi32_ps(tmp3), scalar);
}
-DECLARE_CONVERTER(sc8_item32_be, 1, fc32, 1, PRIORITY_SIMD){
- const item32_t *input = reinterpret_cast<const item32_t *>(size_t(inputs[0]) & ~0x3);
- fc32_t *output = reinterpret_cast<fc32_t *>(outputs[0]);
+DECLARE_CONVERTER(sc8_item32_be, 1, fc32, 1, PRIORITY_SIMD)
+{
+ const item32_t* input = reinterpret_cast<const item32_t*>(size_t(inputs[0]) & ~0x3);
+ fc32_t* output = reinterpret_cast<fc32_t*>(outputs[0]);
- const __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 24));
- const int shuf = _MM_SHUFFLE(3, 2, 1, 0);
+ const __m128 scalar = _mm_set_ps1(float(scale_factor) / (1 << 24));
+ const int shuf = _MM_SHUFFLE(3, 2, 1, 0);
size_t i = 0, j = 0;
fc32_t dummy;
size_t num_samps = nsamps;
- if ((size_t(inputs[0]) & 0x3) != 0){
+ if ((size_t(inputs[0]) & 0x3) != 0) {
item32_sc8_to_xx<uhd::ntohx>(input++, output++, 1, scale_factor);
num_samps--;
}
- #define convert_sc8_item32_1_to_fc32_1_bswap_guts(_al_) \
- for (; j+7 < num_samps; j+=8, i+=4){ \
- /* load from input */ \
- __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \
- \
- /* unpack + swap 8-bit pairs */ \
- __m128 tmp0, tmp1, tmp2, tmp3; \
- unpack_sc32_4x<shuf>(tmpi, tmp0, tmp1, tmp2, tmp3, scalar); \
- \
- /* store to output */ \
- _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+0), tmp0); \
- _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+2), tmp1); \
- _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+4), tmp2); \
- _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+6), tmp3); \
+#define convert_sc8_item32_1_to_fc32_1_bswap_guts(_al_) \
+ for (; j + 7 < num_samps; j += 8, i += 4) { \
+ /* load from input */ \
+ __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input + i)); \
+ \
+ /* unpack + swap 8-bit pairs */ \
+ __m128 tmp0, tmp1, tmp2, tmp3; \
+ unpack_sc32_4x<shuf>(tmpi, tmp0, tmp1, tmp2, tmp3, scalar); \
+ \
+ /* store to output */ \
+ _mm_store##_al_##ps(reinterpret_cast<float*>(output + j + 0), tmp0); \
+ _mm_store##_al_##ps(reinterpret_cast<float*>(output + j + 2), tmp1); \
+ _mm_store##_al_##ps(reinterpret_cast<float*>(output + j + 4), tmp2); \
+ _mm_store##_al_##ps(reinterpret_cast<float*>(output + j + 6), tmp3); \
}
- //dispatch according to alignment
- if ((size_t(output) & 0xf) == 0){
+ // dispatch according to alignment
+ if ((size_t(output) & 0xf) == 0) {
convert_sc8_item32_1_to_fc32_1_bswap_guts(_)
- }
- else{
+ } else {
convert_sc8_item32_1_to_fc32_1_bswap_guts(u_)
}
- //convert remainder
- item32_sc8_to_xx<uhd::ntohx>(input+i, output+j, num_samps-j, scale_factor);
+ // convert remainder
+ item32_sc8_to_xx<uhd::ntohx>(input + i, output + j, num_samps - j, scale_factor);
}
-DECLARE_CONVERTER(sc8_item32_le, 1, fc32, 1, PRIORITY_SIMD){
- const item32_t *input = reinterpret_cast<const item32_t *>(size_t(inputs[0]) & ~0x3);
- fc32_t *output = reinterpret_cast<fc32_t *>(outputs[0]);
+DECLARE_CONVERTER(sc8_item32_le, 1, fc32, 1, PRIORITY_SIMD)
+{
+ const item32_t* input = reinterpret_cast<const item32_t*>(size_t(inputs[0]) & ~0x3);
+ fc32_t* output = reinterpret_cast<fc32_t*>(outputs[0]);
- const __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 24));
- const int shuf = _MM_SHUFFLE(0, 1, 2, 3);
+ const __m128 scalar = _mm_set_ps1(float(scale_factor) / (1 << 24));
+ const int shuf = _MM_SHUFFLE(0, 1, 2, 3);
size_t i = 0, j = 0;
fc32_t dummy;
size_t num_samps = nsamps;
- if ((size_t(inputs[0]) & 0x3) != 0){
+ if ((size_t(inputs[0]) & 0x3) != 0) {
item32_sc8_to_xx<uhd::wtohx>(input++, output++, 1, scale_factor);
num_samps--;
}
- #define convert_sc8_item32_1_to_fc32_1_nswap_guts(_al_) \
- for (; j+7 < num_samps; j+=8, i+=4){ \
- /* load from input */ \
- __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \
- \
- /* unpack + swap 8-bit pairs */ \
- __m128 tmp0, tmp1, tmp2, tmp3; \
- unpack_sc32_4x<shuf>(tmpi, tmp0, tmp1, tmp2, tmp3, scalar); \
- \
- /* store to output */ \
- _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+0), tmp0); \
- _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+2), tmp1); \
- _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+4), tmp2); \
- _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+6), tmp3); \
+#define convert_sc8_item32_1_to_fc32_1_nswap_guts(_al_) \
+ for (; j + 7 < num_samps; j += 8, i += 4) { \
+ /* load from input */ \
+ __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input + i)); \
+ \
+ /* unpack + swap 8-bit pairs */ \
+ __m128 tmp0, tmp1, tmp2, tmp3; \
+ unpack_sc32_4x<shuf>(tmpi, tmp0, tmp1, tmp2, tmp3, scalar); \
+ \
+ /* store to output */ \
+ _mm_store##_al_##ps(reinterpret_cast<float*>(output + j + 0), tmp0); \
+ _mm_store##_al_##ps(reinterpret_cast<float*>(output + j + 2), tmp1); \
+ _mm_store##_al_##ps(reinterpret_cast<float*>(output + j + 4), tmp2); \
+ _mm_store##_al_##ps(reinterpret_cast<float*>(output + j + 6), tmp3); \
}
- //dispatch according to alignment
- if ((size_t(output) & 0xf) == 0){
+ // dispatch according to alignment
+ if ((size_t(output) & 0xf) == 0) {
convert_sc8_item32_1_to_fc32_1_nswap_guts(_)
- }
- else{
+ } else {
convert_sc8_item32_1_to_fc32_1_nswap_guts(u_)
}
- //convert remainder
- item32_sc8_to_xx<uhd::wtohx>(input+i, output+j, num_samps-j, scale_factor);
+ // convert remainder
+ item32_sc8_to_xx<uhd::wtohx>(input + i, output + j, num_samps - j, scale_factor);
}
diff --git a/host/lib/convert/sse2_sc8_to_fc64.cpp b/host/lib/convert/sse2_sc8_to_fc64.cpp
index f5b406152..3cc2fefd0 100644
--- a/host/lib/convert/sse2_sc8_to_fc64.cpp
+++ b/host/lib/convert/sse2_sc8_to_fc64.cpp
@@ -13,129 +13,133 @@ using namespace uhd::convert;
static const __m128i zeroi = _mm_setzero_si128();
-UHD_INLINE void unpack_sc32_8x(
- const __m128i &in,
- __m128d &out0, __m128d &out1,
- __m128d &out2, __m128d &out3,
- __m128d &out4, __m128d &out5,
- __m128d &out6, __m128d &out7,
- const __m128d &scalar
-){
+UHD_INLINE void unpack_sc32_8x(const __m128i& in,
+ __m128d& out0,
+ __m128d& out1,
+ __m128d& out2,
+ __m128d& out3,
+ __m128d& out4,
+ __m128d& out5,
+ __m128d& out6,
+ __m128d& out7,
+ const __m128d& scalar)
+{
const int shuf = _MM_SHUFFLE(1, 0, 3, 2);
__m128i tmp;
const __m128i tmplo = _mm_unpacklo_epi8(zeroi, in); /* value in upper 8 bits */
- tmp = _mm_unpacklo_epi16(zeroi, tmplo); /* value in upper 16 bits */
- out0 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
- tmp = _mm_shuffle_epi32(tmp, shuf);
- out1 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
- tmp = _mm_unpackhi_epi16(zeroi, tmplo);
- out2 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
- tmp = _mm_shuffle_epi32(tmp, shuf);
- out3 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
+ tmp = _mm_unpacklo_epi16(zeroi, tmplo); /* value in upper 16 bits */
+ out0 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
+ tmp = _mm_shuffle_epi32(tmp, shuf);
+ out1 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
+ tmp = _mm_unpackhi_epi16(zeroi, tmplo);
+ out2 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
+ tmp = _mm_shuffle_epi32(tmp, shuf);
+ out3 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
const __m128i tmphi = _mm_unpackhi_epi8(zeroi, in);
- tmp = _mm_unpacklo_epi16(zeroi, tmphi);
- out4 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
- tmp = _mm_shuffle_epi32(tmp, shuf);
- out5 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
- tmp = _mm_unpackhi_epi16(zeroi, tmphi);
- out6 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
- tmp = _mm_shuffle_epi32(tmp, shuf);
- out7 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
+ tmp = _mm_unpacklo_epi16(zeroi, tmphi);
+ out4 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
+ tmp = _mm_shuffle_epi32(tmp, shuf);
+ out5 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
+ tmp = _mm_unpackhi_epi16(zeroi, tmphi);
+ out6 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
+ tmp = _mm_shuffle_epi32(tmp, shuf);
+ out7 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar);
}
-DECLARE_CONVERTER(sc8_item32_be, 1, fc64, 1, PRIORITY_SIMD){
- const item32_t *input = reinterpret_cast<const item32_t *>(size_t(inputs[0]) & ~0x3);
- fc64_t *output = reinterpret_cast<fc64_t *>(outputs[0]);
+DECLARE_CONVERTER(sc8_item32_be, 1, fc64, 1, PRIORITY_SIMD)
+{
+ const item32_t* input = reinterpret_cast<const item32_t*>(size_t(inputs[0]) & ~0x3);
+ fc64_t* output = reinterpret_cast<fc64_t*>(outputs[0]);
- const __m128d scalar = _mm_set1_pd(scale_factor/(1 << 24));
+ const __m128d scalar = _mm_set1_pd(scale_factor / (1 << 24));
size_t i = 0, j = 0;
fc32_t dummy;
size_t num_samps = nsamps;
- if ((size_t(inputs[0]) & 0x3) != 0){
+ if ((size_t(inputs[0]) & 0x3) != 0) {
item32_sc8_to_xx<uhd::ntohx>(input++, output++, 1, scale_factor);
num_samps--;
}
- #define convert_sc8_item32_1_to_fc64_1_bswap_guts(_al_) \
- for (; j+7 < num_samps; j+=8, i+=4){ \
- /* load from input */ \
- __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \
- \
- /* unpack */ \
- __m128d tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; \
+#define convert_sc8_item32_1_to_fc64_1_bswap_guts(_al_) \
+ for (; j + 7 < num_samps; j += 8, i += 4) { \
+ /* load from input */ \
+ __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input + i)); \
+ \
+ /* unpack */ \
+ __m128d tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; \
unpack_sc32_8x(tmpi, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, scalar); \
- \
- /* store to output */ \
- _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+0), tmp0); \
- _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+1), tmp1); \
- _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+2), tmp2); \
- _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+3), tmp3); \
- _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+4), tmp4); \
- _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+5), tmp5); \
- _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+6), tmp6); \
- _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+7), tmp7); \
+ \
+ /* store to output */ \
+ _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 0), tmp0); \
+ _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 1), tmp1); \
+ _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 2), tmp2); \
+ _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 3), tmp3); \
+ _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 4), tmp4); \
+ _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 5), tmp5); \
+ _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 6), tmp6); \
+ _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 7), tmp7); \
}
- //dispatch according to alignment
- if ((size_t(output) & 0xf) == 0){
+ // dispatch according to alignment
+ if ((size_t(output) & 0xf) == 0) {
convert_sc8_item32_1_to_fc64_1_bswap_guts(_)
- }
- else{
+ } else {
convert_sc8_item32_1_to_fc64_1_bswap_guts(u_)
}
- //convert remainder
- item32_sc8_to_xx<uhd::ntohx>(input+i, output+j, num_samps-j, scale_factor);
+ // convert remainder
+ item32_sc8_to_xx<uhd::ntohx>(input + i, output + j, num_samps - j, scale_factor);
}
-DECLARE_CONVERTER(sc8_item32_le, 1, fc64, 1, PRIORITY_SIMD){
- const item32_t *input = reinterpret_cast<const item32_t *>(size_t(inputs[0]) & ~0x3);
- fc64_t *output = reinterpret_cast<fc64_t *>(outputs[0]);
+DECLARE_CONVERTER(sc8_item32_le, 1, fc64, 1, PRIORITY_SIMD)
+{
+ const item32_t* input = reinterpret_cast<const item32_t*>(size_t(inputs[0]) & ~0x3);
+ fc64_t* output = reinterpret_cast<fc64_t*>(outputs[0]);
- const __m128d scalar = _mm_set1_pd(scale_factor/(1 << 24));
+ const __m128d scalar = _mm_set1_pd(scale_factor / (1 << 24));
size_t i = 0, j = 0;
fc32_t dummy;
size_t num_samps = nsamps;
- if ((size_t(inputs[0]) & 0x3) != 0){
+ if ((size_t(inputs[0]) & 0x3) != 0) {
item32_sc8_to_xx<uhd::wtohx>(input++, output++, 1, scale_factor);
num_samps--;
}
- #define convert_sc8_item32_1_to_fc64_1_nswap_guts(_al_) \
- for (; j+7 < num_samps; j+=8, i+=4){ \
- /* load from input */ \
- __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \
- \
- /* unpack */ \
- __m128d tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; \
- tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); /*byteswap*/\
- unpack_sc32_8x(tmpi, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, scalar); \
- \
- /* store to output */ \
- _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+0), tmp0); \
- _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+1), tmp1); \
- _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+2), tmp2); \
- _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+3), tmp3); \
- _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+4), tmp4); \
- _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+5), tmp5); \
- _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+6), tmp6); \
- _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+7), tmp7); \
+#define convert_sc8_item32_1_to_fc64_1_nswap_guts(_al_) \
+ for (; j + 7 < num_samps; j += 8, i += 4) { \
+ /* load from input */ \
+ __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input + i)); \
+ \
+ /* unpack */ \
+ __m128d tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; \
+ tmpi = \
+ _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); /*byteswap*/ \
+ unpack_sc32_8x(tmpi, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, scalar); \
+ \
+ /* store to output */ \
+ _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 0), tmp0); \
+ _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 1), tmp1); \
+ _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 2), tmp2); \
+ _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 3), tmp3); \
+ _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 4), tmp4); \
+ _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 5), tmp5); \
+ _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 6), tmp6); \
+ _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 7), tmp7); \
}
- //dispatch according to alignment
- if ((size_t(output) & 0xf) == 0){
+ // dispatch according to alignment
+ if ((size_t(output) & 0xf) == 0) {
convert_sc8_item32_1_to_fc64_1_nswap_guts(_)
- }
- else{
+ } else {
convert_sc8_item32_1_to_fc64_1_nswap_guts(u_)
}
- //convert remainder
- item32_sc8_to_xx<uhd::wtohx>(input+i, output+j, num_samps-j, scale_factor);
+ // convert remainder
+ item32_sc8_to_xx<uhd::wtohx>(input + i, output + j, num_samps - j, scale_factor);
}