diff options
-rw-r--r-- | host/lib/convert/sse2_fc32_to_sc16.cpp | 161 | ||||
-rw-r--r-- | host/lib/convert/sse2_fc32_to_sc8.cpp | 127 | ||||
-rw-r--r-- | host/lib/convert/sse2_fc64_to_sc16.cpp | 138 | ||||
-rw-r--r-- | host/lib/convert/sse2_fc64_to_sc8.cpp | 153 | ||||
-rw-r--r-- | host/lib/convert/sse2_sc16_to_fc32.cpp | 162 | ||||
-rw-r--r-- | host/lib/convert/sse2_sc16_to_fc64.cpp | 140 | ||||
-rw-r--r-- | host/lib/convert/sse2_sc16_to_sc16.cpp | 240 | ||||
-rw-r--r-- | host/lib/convert/sse2_sc8_to_fc32.cpp | 132 | ||||
-rw-r--r-- | host/lib/convert/sse2_sc8_to_fc64.cpp | 168 |
9 files changed, 742 insertions, 679 deletions
diff --git a/host/lib/convert/sse2_fc32_to_sc16.cpp b/host/lib/convert/sse2_fc32_to_sc16.cpp index f562074c6..2d1f853b9 100644 --- a/host/lib/convert/sse2_fc32_to_sc16.cpp +++ b/host/lib/convert/sse2_fc32_to_sc16.cpp @@ -1,6 +1,7 @@ // // Copyright 2011-2012 Ettus Research LLC // Copyright 2018 Ettus Research, a National Instruments Company +// Copyright 2019 Ettus Research, a National Instruments Brand // // SPDX-License-Identifier: GPL-3.0-or-later // @@ -11,101 +12,111 @@ using namespace uhd::convert; -DECLARE_CONVERTER(fc32, 1, sc16_item32_le, 1, PRIORITY_SIMD){ - const fc32_t *input = reinterpret_cast<const fc32_t *>(inputs[0]); - item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); +DECLARE_CONVERTER(fc32, 1, sc16_item32_le, 1, PRIORITY_SIMD) +{ + const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]); + item32_t* output = reinterpret_cast<item32_t*>(outputs[0]); const __m128 scalar = _mm_set_ps1(float(scale_factor)); - // this macro converts values faster by using SSE intrinsics to convert 4 values at a time - #define convert_fc32_1_to_item32_1_nswap_guts(_al_) \ - for (; i+3 < nsamps; i+=4){ \ - /* load from input */ \ - __m128 tmplo = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+0)); \ - __m128 tmphi = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+2)); \ - \ - /* convert and scale */ \ - __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar)); \ - __m128i tmpihi = _mm_cvtps_epi32(_mm_mul_ps(tmphi, scalar)); \ - \ - /* pack + swap 16-bit pairs */ \ - __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \ - tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ - tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ - \ - /* store to output */ \ - _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi); \ - } \ +// this macro converts values faster by using SSE intrinsics to convert 4 values at a time +#define convert_fc32_1_to_item32_1_nswap_guts(_al_) \ + for (; i + 3 < nsamps; i += 4) { \ + /* load from input */ \ + __m128 tmplo = \ + _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 0)); \ + __m128 tmphi = \ + _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 2)); \ + \ + /* convert and scale */ \ + __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar)); \ + __m128i tmpihi = _mm_cvtps_epi32(_mm_mul_ps(tmphi, scalar)); \ + \ + /* pack + swap 16-bit pairs */ \ + __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \ + tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + \ + /* store to output */ \ + _mm_storeu_si128(reinterpret_cast<__m128i*>(output + i), tmpi); \ + } size_t i = 0; // need to dispatch according to alignment for fastest conversion - switch (size_t(input) & 0xf){ - case 0x0: - // the data is 16-byte aligned, so do the fast processing of the bulk of the samples - convert_fc32_1_to_item32_1_nswap_guts(_) - break; - case 0x8: - // the first sample is 8-byte aligned - process it to align the remainder of the samples to 16-bytes - xx_to_item32_sc16<uhd::htowx>(input, output, 1, scale_factor); - i++; - // do faster processing of the bulk of the samples now that we are 16-byte aligned - convert_fc32_1_to_item32_1_nswap_guts(_) - break; - default: - // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load - convert_fc32_1_to_item32_1_nswap_guts(u_) + switch (size_t(input) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + convert_fc32_1_to_item32_1_nswap_guts(_) break; + case 0x8: + // the first sample is 8-byte aligned - process it to align the remainder of + // the samples to 16-bytes + xx_to_item32_sc16<uhd::htowx>(input, output, 1, scale_factor); + i++; + // do faster processing of the bulk of the samples now that we are 16-byte + // aligned + convert_fc32_1_to_item32_1_nswap_guts(_) break; + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned + // load + convert_fc32_1_to_item32_1_nswap_guts(u_) } // convert any remaining samples - xx_to_item32_sc16<uhd::htowx>(input+i, output+i, nsamps-i, scale_factor); + xx_to_item32_sc16<uhd::htowx>(input + i, output + i, nsamps - i, scale_factor); } -DECLARE_CONVERTER(fc32, 1, sc16_item32_be, 1, PRIORITY_SIMD){ - const fc32_t *input = reinterpret_cast<const fc32_t *>(inputs[0]); - item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); +DECLARE_CONVERTER(fc32, 1, sc16_item32_be, 1, PRIORITY_SIMD) +{ + const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]); + item32_t* output = reinterpret_cast<item32_t*>(outputs[0]); const __m128 scalar = _mm_set_ps1(float(scale_factor)); - // this macro converts values faster by using SSE intrinsics to convert 4 values at a time - #define convert_fc32_1_to_item32_1_bswap_guts(_al_) \ - for (; i+3 < nsamps; i+=4){ \ - /* load from input */ \ - __m128 tmplo = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+0)); \ - __m128 tmphi = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+2)); \ - \ - /* convert and scale */ \ - __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar)); \ - __m128i tmpihi = _mm_cvtps_epi32(_mm_mul_ps(tmphi, scalar)); \ - \ - /* pack + byteswap -> byteswap 16 bit words */ \ - __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \ - tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \ - \ - /* store to output */ \ - _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi); \ - } \ +// this macro converts values faster by using SSE intrinsics to convert 4 values at a time +#define convert_fc32_1_to_item32_1_bswap_guts(_al_) \ + for (; i + 3 < nsamps; i += 4) { \ + /* load from input */ \ + __m128 tmplo = \ + _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 0)); \ + __m128 tmphi = \ + _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 2)); \ + \ + /* convert and scale */ \ + __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar)); \ + __m128i tmpihi = _mm_cvtps_epi32(_mm_mul_ps(tmphi, scalar)); \ + \ + /* pack + byteswap -> byteswap 16 bit words */ \ + __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \ + tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \ + \ + /* store to output */ \ + _mm_storeu_si128(reinterpret_cast<__m128i*>(output + i), tmpi); \ + } size_t i = 0; // need to dispatch according to alignment for fastest conversion - switch (size_t(input) & 0xf){ - case 0x0: - // the data is 16-byte aligned, so do the fast processing of the bulk of the samples - convert_fc32_1_to_item32_1_bswap_guts(_) - break; - case 0x8: - // the first value is 8-byte aligned - process it and prepare the bulk of the data for fast conversion - xx_to_item32_sc16<uhd::htonx>(input, output, 1, scale_factor); - i++; - // do faster processing of the remaining samples now that we are 16-byte aligned - convert_fc32_1_to_item32_1_bswap_guts(_) - break; - default: - // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load - convert_fc32_1_to_item32_1_bswap_guts(u_) + switch (size_t(input) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + convert_fc32_1_to_item32_1_bswap_guts(_) break; + case 0x8: + // the first value is 8-byte aligned - process it and prepare the bulk of the + // data for fast conversion + xx_to_item32_sc16<uhd::htonx>(input, output, 1, scale_factor); + i++; + // do faster processing of the remaining samples now that we are 16-byte + // aligned + convert_fc32_1_to_item32_1_bswap_guts(_) break; + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned + // load + convert_fc32_1_to_item32_1_bswap_guts(u_) } // convert any remaining samples - xx_to_item32_sc16<uhd::htonx>(input+i, output+i, nsamps-i, scale_factor); + xx_to_item32_sc16<uhd::htonx>(input + i, output + i, nsamps - i, scale_factor); } diff --git a/host/lib/convert/sse2_fc32_to_sc8.cpp b/host/lib/convert/sse2_fc32_to_sc8.cpp index b3f96ea39..66faa82cc 100644 --- a/host/lib/convert/sse2_fc32_to_sc8.cpp +++ b/host/lib/convert/sse2_fc32_to_sc8.cpp @@ -12,94 +12,95 @@ using namespace uhd::convert; template <const int shuf> -UHD_INLINE __m128i pack_sc32_4x( - const __m128 &in0, const __m128 &in1, - const __m128 &in2, const __m128 &in3, - const __m128 &scalar -){ - __m128i tmpi0 = _mm_cvtps_epi32(_mm_mul_ps(in0, scalar)); - tmpi0 = _mm_shuffle_epi32(tmpi0, shuf); - __m128i tmpi1 = _mm_cvtps_epi32(_mm_mul_ps(in1, scalar)); - tmpi1 = _mm_shuffle_epi32(tmpi1, shuf); +UHD_INLINE __m128i pack_sc32_4x(const __m128& in0, + const __m128& in1, + const __m128& in2, + const __m128& in3, + const __m128& scalar) +{ + __m128i tmpi0 = _mm_cvtps_epi32(_mm_mul_ps(in0, scalar)); + tmpi0 = _mm_shuffle_epi32(tmpi0, shuf); + __m128i tmpi1 = _mm_cvtps_epi32(_mm_mul_ps(in1, scalar)); + tmpi1 = _mm_shuffle_epi32(tmpi1, shuf); const __m128i lo = _mm_packs_epi32(tmpi0, tmpi1); - __m128i tmpi2 = _mm_cvtps_epi32(_mm_mul_ps(in2, scalar)); - tmpi2 = _mm_shuffle_epi32(tmpi2, shuf); - __m128i tmpi3 = _mm_cvtps_epi32(_mm_mul_ps(in3, scalar)); - tmpi3 = _mm_shuffle_epi32(tmpi3, shuf); + __m128i tmpi2 = _mm_cvtps_epi32(_mm_mul_ps(in2, scalar)); + tmpi2 = _mm_shuffle_epi32(tmpi2, shuf); + __m128i tmpi3 = _mm_cvtps_epi32(_mm_mul_ps(in3, scalar)); + tmpi3 = _mm_shuffle_epi32(tmpi3, shuf); const __m128i hi = _mm_packs_epi32(tmpi2, tmpi3); return _mm_packs_epi16(lo, hi); } -DECLARE_CONVERTER(fc32, 1, sc8_item32_be, 1, PRIORITY_SIMD){ - const fc32_t *input = reinterpret_cast<const fc32_t *>(inputs[0]); - item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); +DECLARE_CONVERTER(fc32, 1, sc8_item32_be, 1, PRIORITY_SIMD) +{ + const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]); + item32_t* output = reinterpret_cast<item32_t*>(outputs[0]); const __m128 scalar = _mm_set_ps1(float(scale_factor)); - const int shuf = _MM_SHUFFLE(3, 2, 1, 0); - - #define convert_fc32_1_to_sc8_item32_1_bswap_guts(_al_) \ - for (size_t j = 0; i+7 < nsamps; i+=8, j+=4){ \ - /* load from input */ \ - __m128 tmp0 = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+0)); \ - __m128 tmp1 = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+2)); \ - __m128 tmp2 = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+4)); \ - __m128 tmp3 = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+6)); \ - \ - /* convert */ \ - const __m128i tmpi = pack_sc32_4x<shuf>(tmp0, tmp1, tmp2, tmp3, scalar); \ - \ - /* store to output */ \ - _mm_storeu_si128(reinterpret_cast<__m128i *>(output+j), tmpi); \ - } \ + const int shuf = _MM_SHUFFLE(3, 2, 1, 0); + +#define convert_fc32_1_to_sc8_item32_1_bswap_guts(_al_) \ + for (size_t j = 0; i + 7 < nsamps; i += 8, j += 4) { \ + /* load from input */ \ + __m128 tmp0 = _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 0)); \ + __m128 tmp1 = _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 2)); \ + __m128 tmp2 = _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 4)); \ + __m128 tmp3 = _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 6)); \ + \ + /* convert */ \ + const __m128i tmpi = pack_sc32_4x<shuf>(tmp0, tmp1, tmp2, tmp3, scalar); \ + \ + /* store to output */ \ + _mm_storeu_si128(reinterpret_cast<__m128i*>(output + j), tmpi); \ + } size_t i = 0; - //dispatch according to alignment - if ((size_t(input) & 0xf) == 0){ + // dispatch according to alignment + if ((size_t(input) & 0xf) == 0) { convert_fc32_1_to_sc8_item32_1_bswap_guts(_) - } - else{ + } else { convert_fc32_1_to_sc8_item32_1_bswap_guts(u_) } - //convert remainder - xx_to_item32_sc8<uhd::htonx>(input+i, output+(i/2), nsamps-i, scale_factor); + // convert remainder + xx_to_item32_sc8<uhd::htonx>(input + i, output + (i / 2), nsamps - i, scale_factor); } -DECLARE_CONVERTER(fc32, 1, sc8_item32_le, 1, PRIORITY_SIMD){ - const fc32_t *input = reinterpret_cast<const fc32_t *>(inputs[0]); - item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); +DECLARE_CONVERTER(fc32, 1, sc8_item32_le, 1, PRIORITY_SIMD) +{ + const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]); + item32_t* output = reinterpret_cast<item32_t*>(outputs[0]); const __m128 scalar = _mm_set_ps1(float(scale_factor)); - const int shuf = _MM_SHUFFLE(0, 1, 2, 3); - - #define convert_fc32_1_to_sc8_item32_1_nswap_guts(_al_) \ - for (size_t j = 0; i+7 < nsamps; i+=8, j+=4){ \ - /* load from input */ \ - __m128 tmp0 = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+0)); \ - __m128 tmp1 = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+2)); \ - __m128 tmp2 = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+4)); \ - __m128 tmp3 = _mm_load ## _al_ ## ps(reinterpret_cast<const float *>(input+i+6)); \ - \ - /* convert */ \ - const __m128i tmpi = pack_sc32_4x<shuf>(tmp0, tmp1, tmp2, tmp3, scalar); \ - \ - /* store to output */ \ - _mm_storeu_si128(reinterpret_cast<__m128i *>(output+j), tmpi); \ - } \ + const int shuf = _MM_SHUFFLE(0, 1, 2, 3); + +#define convert_fc32_1_to_sc8_item32_1_nswap_guts(_al_) \ + for (size_t j = 0; i + 7 < nsamps; i += 8, j += 4) { \ + /* load from input */ \ + __m128 tmp0 = _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 0)); \ + __m128 tmp1 = _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 2)); \ + __m128 tmp2 = _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 4)); \ + __m128 tmp3 = _mm_load##_al_##ps(reinterpret_cast<const float*>(input + i + 6)); \ + \ + /* convert */ \ + const __m128i tmpi = pack_sc32_4x<shuf>(tmp0, tmp1, tmp2, tmp3, scalar); \ + \ + /* store to output */ \ + _mm_storeu_si128(reinterpret_cast<__m128i*>(output + j), tmpi); \ + } size_t i = 0; - //dispatch according to alignment - if ((size_t(input) & 0xf) == 0){ + // dispatch according to alignment + if ((size_t(input) & 0xf) == 0) { convert_fc32_1_to_sc8_item32_1_nswap_guts(_) - } - else{ + } else { convert_fc32_1_to_sc8_item32_1_nswap_guts(u_) } - //convert remainder - xx_to_item32_sc8<uhd::htowx>(input+i, output+(i/2), nsamps-i, scale_factor); + // convert remainder + xx_to_item32_sc8<uhd::htowx>(input + i, output + (i / 2), nsamps - i, scale_factor); } diff --git a/host/lib/convert/sse2_fc64_to_sc16.cpp b/host/lib/convert/sse2_fc64_to_sc16.cpp index 2004c1fd7..7c2ce1f8e 100644 --- a/host/lib/convert/sse2_fc64_to_sc16.cpp +++ b/host/lib/convert/sse2_fc64_to_sc16.cpp @@ -11,91 +11,99 @@ using namespace uhd::convert; -DECLARE_CONVERTER(fc64, 1, sc16_item32_le, 1, PRIORITY_SIMD){ - const fc64_t *input = reinterpret_cast<const fc64_t *>(inputs[0]); - item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); +DECLARE_CONVERTER(fc64, 1, sc16_item32_le, 1, PRIORITY_SIMD) +{ + const fc64_t* input = reinterpret_cast<const fc64_t*>(inputs[0]); + item32_t* output = reinterpret_cast<item32_t*>(outputs[0]); const __m128d scalar = _mm_set1_pd(scale_factor); - #define convert_fc64_1_to_item32_1_nswap_guts(_al_) \ - for (; i+3 < nsamps; i+=4){ \ - /* load from input */ \ - __m128d tmp0 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+0)); \ - __m128d tmp1 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+1)); \ - __m128d tmp2 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+2)); \ - __m128d tmp3 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+3)); \ - \ - /* convert and scale */ \ - __m128i tmpi0 = _mm_cvttpd_epi32(_mm_mul_pd(tmp0, scalar)); \ - __m128i tmpi1 = _mm_cvttpd_epi32(_mm_mul_pd(tmp1, scalar)); \ - __m128i tmpilo = _mm_unpacklo_epi64(tmpi0, tmpi1); \ - __m128i tmpi2 = _mm_cvttpd_epi32(_mm_mul_pd(tmp2, scalar)); \ - __m128i tmpi3 = _mm_cvttpd_epi32(_mm_mul_pd(tmp3, scalar)); \ - __m128i tmpihi = _mm_unpacklo_epi64(tmpi2, tmpi3); \ - \ - /* pack + swap 16-bit pairs */ \ - __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \ - tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ - tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ - \ - /* store to output */ \ - _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi); \ - } \ +#define convert_fc64_1_to_item32_1_nswap_guts(_al_) \ + for (; i + 3 < nsamps; i += 4) { \ + /* load from input */ \ + __m128d tmp0 = \ + _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 0)); \ + __m128d tmp1 = \ + _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 1)); \ + __m128d tmp2 = \ + _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 2)); \ + __m128d tmp3 = \ + _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 3)); \ + \ + /* convert and scale */ \ + __m128i tmpi0 = _mm_cvttpd_epi32(_mm_mul_pd(tmp0, scalar)); \ + __m128i tmpi1 = _mm_cvttpd_epi32(_mm_mul_pd(tmp1, scalar)); \ + __m128i tmpilo = _mm_unpacklo_epi64(tmpi0, tmpi1); \ + __m128i tmpi2 = _mm_cvttpd_epi32(_mm_mul_pd(tmp2, scalar)); \ + __m128i tmpi3 = _mm_cvttpd_epi32(_mm_mul_pd(tmp3, scalar)); \ + __m128i tmpihi = _mm_unpacklo_epi64(tmpi2, tmpi3); \ + \ + /* pack + swap 16-bit pairs */ \ + __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \ + tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + \ + /* store to output */ \ + _mm_storeu_si128(reinterpret_cast<__m128i*>(output + i), tmpi); \ + } size_t i = 0; - //dispatch according to alignment - if ((size_t(input) & 0xf) == 0){ + // dispatch according to alignment + if ((size_t(input) & 0xf) == 0) { convert_fc64_1_to_item32_1_nswap_guts(_) - } - else{ + } else { convert_fc64_1_to_item32_1_nswap_guts(u_) } - //convert remainder - xx_to_item32_sc16<uhd::htowx>(input+i, output+i, nsamps-i, scale_factor); + // convert remainder + xx_to_item32_sc16<uhd::htowx>(input + i, output + i, nsamps - i, scale_factor); } -DECLARE_CONVERTER(fc64, 1, sc16_item32_be, 1, PRIORITY_SIMD){ - const fc64_t *input = reinterpret_cast<const fc64_t *>(inputs[0]); - item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); +DECLARE_CONVERTER(fc64, 1, sc16_item32_be, 1, PRIORITY_SIMD) +{ + const fc64_t* input = reinterpret_cast<const fc64_t*>(inputs[0]); + item32_t* output = reinterpret_cast<item32_t*>(outputs[0]); const __m128d scalar = _mm_set1_pd(scale_factor); - #define convert_fc64_1_to_item32_1_bswap_guts(_al_) \ - for (; i+3 < nsamps; i+=4){ \ - /* load from input */ \ - __m128d tmp0 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+0)); \ - __m128d tmp1 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+1)); \ - __m128d tmp2 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+2)); \ - __m128d tmp3 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+3)); \ - \ - /* convert and scale */ \ - __m128i tmpi0 = _mm_cvttpd_epi32(_mm_mul_pd(tmp0, scalar)); \ - __m128i tmpi1 = _mm_cvttpd_epi32(_mm_mul_pd(tmp1, scalar)); \ - __m128i tmpilo = _mm_unpacklo_epi64(tmpi0, tmpi1); \ - __m128i tmpi2 = _mm_cvttpd_epi32(_mm_mul_pd(tmp2, scalar)); \ - __m128i tmpi3 = _mm_cvttpd_epi32(_mm_mul_pd(tmp3, scalar)); \ - __m128i tmpihi = _mm_unpacklo_epi64(tmpi2, tmpi3); \ - \ - /* pack + byteswap -> byteswap 16 bit words */ \ - __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \ - tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \ - \ - /* store to output */ \ - _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi); \ - } \ +#define convert_fc64_1_to_item32_1_bswap_guts(_al_) \ + for (; i + 3 < nsamps; i += 4) { \ + /* load from input */ \ + __m128d tmp0 = \ + _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 0)); \ + __m128d tmp1 = \ + _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 1)); \ + __m128d tmp2 = \ + _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 2)); \ + __m128d tmp3 = \ + _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 3)); \ + \ + /* convert and scale */ \ + __m128i tmpi0 = _mm_cvttpd_epi32(_mm_mul_pd(tmp0, scalar)); \ + __m128i tmpi1 = _mm_cvttpd_epi32(_mm_mul_pd(tmp1, scalar)); \ + __m128i tmpilo = _mm_unpacklo_epi64(tmpi0, tmpi1); \ + __m128i tmpi2 = _mm_cvttpd_epi32(_mm_mul_pd(tmp2, scalar)); \ + __m128i tmpi3 = _mm_cvttpd_epi32(_mm_mul_pd(tmp3, scalar)); \ + __m128i tmpihi = _mm_unpacklo_epi64(tmpi2, tmpi3); \ + \ + /* pack + byteswap -> byteswap 16 bit words */ \ + __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \ + tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \ + \ + /* store to output */ \ + _mm_storeu_si128(reinterpret_cast<__m128i*>(output + i), tmpi); \ + } size_t i = 0; - //dispatch according to alignment - if ((size_t(input) & 0xf) == 0){ + // dispatch according to alignment + if ((size_t(input) & 0xf) == 0) { convert_fc64_1_to_item32_1_bswap_guts(_) - } - else{ + } else { convert_fc64_1_to_item32_1_bswap_guts(u_) } - //convert remainder - xx_to_item32_sc16<uhd::htonx>(input+i, output+i, nsamps-i, scale_factor); + // convert remainder + xx_to_item32_sc16<uhd::htonx>(input + i, output + i, nsamps - i, scale_factor); } diff --git a/host/lib/convert/sse2_fc64_to_sc8.cpp b/host/lib/convert/sse2_fc64_to_sc8.cpp index 455ca95e3..95db4e927 100644 --- a/host/lib/convert/sse2_fc64_to_sc8.cpp +++ b/host/lib/convert/sse2_fc64_to_sc8.cpp @@ -12,108 +12,119 @@ using namespace uhd::convert; UHD_INLINE __m128i pack_sc8_item32_4x( - const __m128i &in0, const __m128i &in1, - const __m128i &in2, const __m128i &in3 -){ + const __m128i& in0, const __m128i& in1, const __m128i& in2, const __m128i& in3) +{ const __m128i lo = _mm_packs_epi32(in0, in1); const __m128i hi = _mm_packs_epi32(in2, in3); return _mm_packs_epi16(lo, hi); } UHD_INLINE __m128i pack_sc32_4x( - const __m128d &lo, const __m128d &hi, - const __m128d &scalar -){ + const __m128d& lo, const __m128d& hi, const __m128d& scalar) +{ const __m128i tmpi_lo = _mm_cvttpd_epi32(_mm_mul_pd(hi, scalar)); const __m128i tmpi_hi = _mm_cvttpd_epi32(_mm_mul_pd(lo, scalar)); return _mm_unpacklo_epi64(tmpi_lo, tmpi_hi); } -DECLARE_CONVERTER(fc64, 1, sc8_item32_be, 1, PRIORITY_SIMD){ - const fc64_t *input = reinterpret_cast<const fc64_t *>(inputs[0]); - item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); +DECLARE_CONVERTER(fc64, 1, sc8_item32_be, 1, PRIORITY_SIMD) +{ + const fc64_t* input = reinterpret_cast<const fc64_t*>(inputs[0]); + item32_t* output = reinterpret_cast<item32_t*>(outputs[0]); const __m128d scalar = _mm_set1_pd(scale_factor); - #define convert_fc64_1_to_sc8_item32_1_bswap_guts(_al_) \ - for (size_t j = 0; i+7 < nsamps; i+=8, j+=4){ \ - /* load from input */ \ - __m128d tmp0 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+0)); \ - __m128d tmp1 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+1)); \ - __m128d tmp2 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+2)); \ - __m128d tmp3 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+3)); \ - __m128d tmp4 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+4)); \ - __m128d tmp5 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+5)); \ - __m128d tmp6 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+6)); \ - __m128d tmp7 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+7)); \ - \ - /* interleave */ \ - const __m128i tmpi = pack_sc8_item32_4x( \ - pack_sc32_4x(tmp1, tmp0, scalar), \ - pack_sc32_4x(tmp3, tmp2, scalar), \ - pack_sc32_4x(tmp5, tmp4, scalar), \ - pack_sc32_4x(tmp7, tmp6, scalar) \ - ); \ - \ - /* store to output */ \ - _mm_storeu_si128(reinterpret_cast<__m128i *>(output+j), tmpi); \ - } \ +#define convert_fc64_1_to_sc8_item32_1_bswap_guts(_al_) \ + for (size_t j = 0; i + 7 < nsamps; i += 8, j += 4) { \ + /* load from input */ \ + __m128d tmp0 = \ + _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 0)); \ + __m128d tmp1 = \ + _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 1)); \ + __m128d tmp2 = \ + _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 2)); \ + __m128d tmp3 = \ + _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 3)); \ + __m128d tmp4 = \ + _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 4)); \ + __m128d tmp5 = \ + _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 5)); \ + __m128d tmp6 = \ + _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 6)); \ + __m128d tmp7 = \ + _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 7)); \ + \ + /* interleave */ \ + const __m128i tmpi = pack_sc8_item32_4x(pack_sc32_4x(tmp1, tmp0, scalar), \ + pack_sc32_4x(tmp3, tmp2, scalar), \ + pack_sc32_4x(tmp5, tmp4, scalar), \ + pack_sc32_4x(tmp7, tmp6, scalar)); \ + \ + /* store to output */ \ + _mm_storeu_si128(reinterpret_cast<__m128i*>(output + j), tmpi); \ + } size_t i = 0; - //dispatch according to alignment - if ((size_t(input) & 0xf) == 0){ + // dispatch according to alignment + if ((size_t(input) & 0xf) == 0) { convert_fc64_1_to_sc8_item32_1_bswap_guts(_) - } - else{ + } else { convert_fc64_1_to_sc8_item32_1_bswap_guts(u_) } - //convert remainder - xx_to_item32_sc8<uhd::htonx>(input+i, output+(i/2), nsamps-i, scale_factor); + // convert remainder + xx_to_item32_sc8<uhd::htonx>(input + i, output + (i / 2), nsamps - i, scale_factor); } -DECLARE_CONVERTER(fc64, 1, sc8_item32_le, 1, PRIORITY_SIMD){ - const fc64_t *input = reinterpret_cast<const fc64_t *>(inputs[0]); - item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); +DECLARE_CONVERTER(fc64, 1, sc8_item32_le, 1, PRIORITY_SIMD) +{ + const fc64_t* input = reinterpret_cast<const fc64_t*>(inputs[0]); + item32_t* output = reinterpret_cast<item32_t*>(outputs[0]); const __m128d scalar = _mm_set1_pd(scale_factor); - #define convert_fc64_1_to_sc8_item32_1_nswap_guts(_al_) \ - for (size_t j = 0; i+7 < nsamps; i+=8, j+=4){ \ - /* load from input */ \ - __m128d tmp0 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+0)); \ - __m128d tmp1 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+1)); \ - __m128d tmp2 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+2)); \ - __m128d tmp3 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+3)); \ - __m128d tmp4 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+4)); \ - __m128d tmp5 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+5)); \ - __m128d tmp6 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+6)); \ - __m128d tmp7 = _mm_load ## _al_ ## pd(reinterpret_cast<const double *>(input+i+7)); \ - \ - /* interleave */ \ - __m128i tmpi = pack_sc8_item32_4x( \ - pack_sc32_4x(tmp0, tmp1, scalar), \ - pack_sc32_4x(tmp2, tmp3, scalar), \ - pack_sc32_4x(tmp4, tmp5, scalar), \ - pack_sc32_4x(tmp6, tmp7, scalar) \ - ); \ - tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); /*byteswap*/\ - \ - /* store to output */ \ - _mm_storeu_si128(reinterpret_cast<__m128i *>(output+j), tmpi); \ - } \ +#define convert_fc64_1_to_sc8_item32_1_nswap_guts(_al_) \ + for (size_t j = 0; i + 7 < nsamps; i += 8, j += 4) { \ + /* load from input */ \ + __m128d tmp0 = \ + _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 0)); \ + __m128d tmp1 = \ + _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 1)); \ + __m128d tmp2 = \ + _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 2)); \ + __m128d tmp3 = \ + _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 3)); \ + __m128d tmp4 = \ + _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 4)); \ + __m128d tmp5 = \ + _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 5)); \ + __m128d tmp6 = \ + _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 6)); \ + __m128d tmp7 = \ + _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 7)); \ + \ + /* interleave */ \ + __m128i tmpi = pack_sc8_item32_4x(pack_sc32_4x(tmp0, tmp1, scalar), \ + pack_sc32_4x(tmp2, tmp3, scalar), \ + pack_sc32_4x(tmp4, tmp5, scalar), \ + pack_sc32_4x(tmp6, tmp7, scalar)); \ + tmpi = \ + _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); /*byteswap*/ \ + \ + /* store to output */ \ + _mm_storeu_si128(reinterpret_cast<__m128i*>(output + j), tmpi); \ + } size_t i = 0; - //dispatch according to alignment - if ((size_t(input) & 0xf) == 0){ + // dispatch according to alignment + if ((size_t(input) & 0xf) == 0) { convert_fc64_1_to_sc8_item32_1_nswap_guts(_) - } - else{ + } else { convert_fc64_1_to_sc8_item32_1_nswap_guts(u_) } - //convert remainder - xx_to_item32_sc8<uhd::htowx>(input+i, output+(i/2), nsamps-i, scale_factor); + // convert remainder + xx_to_item32_sc8<uhd::htowx>(input + i, output + (i / 2), nsamps - i, scale_factor); } diff --git a/host/lib/convert/sse2_sc16_to_fc32.cpp b/host/lib/convert/sse2_sc16_to_fc32.cpp index d75c4a2a7..a16ef30d4 100644 --- a/host/lib/convert/sse2_sc16_to_fc32.cpp +++ b/host/lib/convert/sse2_sc16_to_fc32.cpp @@ -11,105 +11,111 @@ using namespace uhd::convert; -DECLARE_CONVERTER(sc16_item32_le, 1, fc32, 1, PRIORITY_SIMD){ - const item32_t *input = reinterpret_cast<const item32_t *>(inputs[0]); - fc32_t *output = reinterpret_cast<fc32_t *>(outputs[0]); +DECLARE_CONVERTER(sc16_item32_le, 1, fc32, 1, PRIORITY_SIMD) +{ + const item32_t* input = reinterpret_cast<const item32_t*>(inputs[0]); + fc32_t* output = reinterpret_cast<fc32_t*>(outputs[0]); - const __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 16)); + const __m128 scalar = _mm_set_ps1(float(scale_factor) / (1 << 16)); const __m128i zeroi = _mm_setzero_si128(); - // this macro converts values faster by using SSE intrinsics to convert 4 values at a time - #define convert_item32_1_to_fc32_1_nswap_guts(_al_) \ - for (; i+3 < nsamps; i+=4){ \ - /* load from input */ \ - __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \ - \ - /* unpack + swap 16-bit pairs */ \ - tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ - tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ +// this macro converts values faster by using SSE intrinsics to convert 4 values at a time +#define convert_item32_1_to_fc32_1_nswap_guts(_al_) \ + for (; i + 3 < nsamps; i += 4) { \ + /* load from input */ \ + __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input + i)); \ + \ + /* unpack + swap 16-bit pairs */ \ + tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ __m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \ - __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi); \ - \ - /* convert and scale */ \ - __m128 tmplo = _mm_mul_ps(_mm_cvtepi32_ps(tmpilo), scalar); \ - __m128 tmphi = _mm_mul_ps(_mm_cvtepi32_ps(tmpihi), scalar); \ - \ - /* store to output */ \ - _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+i+0), tmplo); \ - _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+i+2), tmphi); \ - } \ + __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi); \ + \ + /* convert and scale */ \ + __m128 tmplo = _mm_mul_ps(_mm_cvtepi32_ps(tmpilo), scalar); \ + __m128 tmphi = _mm_mul_ps(_mm_cvtepi32_ps(tmpihi), scalar); \ + \ + /* store to output */ \ + _mm_store##_al_##ps(reinterpret_cast<float*>(output + i + 0), tmplo); \ + _mm_store##_al_##ps(reinterpret_cast<float*>(output + i + 2), tmphi); \ + } size_t i = 0; // need to dispatch according to alignment for fastest conversion - switch (size_t(output) & 0xf){ - case 0x0: - // the data is 16-byte aligned, so do the fast processing of the bulk of the samples - convert_item32_1_to_fc32_1_nswap_guts(_) - break; - case 0x8: - // the first sample is 8-byte aligned - process it to align the remainder of the samples to 16-bytes - item32_sc16_to_xx<uhd::htowx>(input, output, 1, scale_factor); - i++; - // do faster processing of the bulk of the samples now that we are 16-byte aligned - convert_item32_1_to_fc32_1_nswap_guts(_) - break; - default: - // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load and store - convert_item32_1_to_fc32_1_nswap_guts(u_) + switch (size_t(output) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + convert_item32_1_to_fc32_1_nswap_guts(_) break; + case 0x8: + // the first sample is 8-byte aligned - process it to align the remainder of + // the samples to 16-bytes + item32_sc16_to_xx<uhd::htowx>(input, output, 1, scale_factor); + i++; + // do faster processing of the bulk of the samples now that we are 16-byte + // aligned + convert_item32_1_to_fc32_1_nswap_guts(_) break; + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned + // load and store + convert_item32_1_to_fc32_1_nswap_guts(u_) } // convert any remaining samples - item32_sc16_to_xx<uhd::htowx>(input+i, output+i, nsamps-i, scale_factor); + item32_sc16_to_xx<uhd::htowx>(input + i, output + i, nsamps - i, scale_factor); } -DECLARE_CONVERTER(sc16_item32_be, 1, fc32, 1, PRIORITY_SIMD){ - const item32_t *input = reinterpret_cast<const item32_t *>(inputs[0]); - fc32_t *output = reinterpret_cast<fc32_t *>(outputs[0]); +DECLARE_CONVERTER(sc16_item32_be, 1, fc32, 1, PRIORITY_SIMD) +{ + const item32_t* input = reinterpret_cast<const item32_t*>(inputs[0]); + fc32_t* output = reinterpret_cast<fc32_t*>(outputs[0]); - const __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 16)); + const __m128 scalar = _mm_set_ps1(float(scale_factor) / (1 << 16)); const __m128i zeroi = _mm_setzero_si128(); - // this macro converts values faster by using SSE intrinsics to convert 4 values at a time - #define convert_item32_1_to_fc32_1_bswap_guts(_al_) \ - for (; i+3 < nsamps; i+=4){ \ - /* load from input */ \ - __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \ - \ - /* byteswap + unpack -> byteswap 16 bit words */ \ - tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \ - __m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \ - __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi); \ - \ - /* convert and scale */ \ - __m128 tmplo = _mm_mul_ps(_mm_cvtepi32_ps(tmpilo), scalar); \ - __m128 tmphi = _mm_mul_ps(_mm_cvtepi32_ps(tmpihi), scalar); \ - \ - /* store to output */ \ - _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+i+0), tmplo); \ - _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+i+2), tmphi); \ - } \ +// this macro converts values faster by using SSE intrinsics to convert 4 values at a time +#define convert_item32_1_to_fc32_1_bswap_guts(_al_) \ + for (; i + 3 < nsamps; i += 4) { \ + /* load from input */ \ + __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input + i)); \ + \ + /* byteswap + unpack -> byteswap 16 bit words */ \ + tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \ + __m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \ + __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi); \ + \ + /* convert and scale */ \ + __m128 tmplo = _mm_mul_ps(_mm_cvtepi32_ps(tmpilo), scalar); \ + __m128 tmphi = _mm_mul_ps(_mm_cvtepi32_ps(tmpihi), scalar); \ + \ + /* store to output */ \ + _mm_store##_al_##ps(reinterpret_cast<float*>(output + i + 0), tmplo); \ + _mm_store##_al_##ps(reinterpret_cast<float*>(output + i + 2), tmphi); \ + } size_t i = 0; // need to dispatch according to alignment for fastest conversion - switch (size_t(output) & 0xf){ - case 0x0: - // the data is 16-byte aligned, so do the fast processing of the bulk of the samples - convert_item32_1_to_fc32_1_bswap_guts(_) - break; - case 0x8: - // the first sample is 8-byte aligned - process it to align the remainder of the samples to 16-bytes - item32_sc16_to_xx<uhd::htonx>(input, output, 1, scale_factor); - i++; - // do faster processing of the bulk of the samples now that we are 16-byte aligned - convert_item32_1_to_fc32_1_bswap_guts(_) - break; - default: - // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load and store - convert_item32_1_to_fc32_1_bswap_guts(u_) + switch (size_t(output) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + convert_item32_1_to_fc32_1_bswap_guts(_) break; + case 0x8: + // the first sample is 8-byte aligned - process it to align the remainder of + // the samples to 16-bytes + item32_sc16_to_xx<uhd::htonx>(input, output, 1, scale_factor); + i++; + // do faster processing of the bulk of the samples now that we are 16-byte + // aligned + convert_item32_1_to_fc32_1_bswap_guts(_) break; + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned + // load and store + convert_item32_1_to_fc32_1_bswap_guts(u_) } // convert any remaining samples - item32_sc16_to_xx<uhd::htonx>(input+i, output+i, nsamps-i, scale_factor); + item32_sc16_to_xx<uhd::htonx>(input + i, output + i, nsamps - i, scale_factor); } diff --git a/host/lib/convert/sse2_sc16_to_fc64.cpp b/host/lib/convert/sse2_sc16_to_fc64.cpp index 7f22fd07f..45821ac9f 100644 --- a/host/lib/convert/sse2_sc16_to_fc64.cpp +++ b/host/lib/convert/sse2_sc16_to_fc64.cpp @@ -11,95 +11,95 @@ using namespace uhd::convert; -DECLARE_CONVERTER(sc16_item32_le, 1, fc64, 1, PRIORITY_SIMD){ - const item32_t *input = reinterpret_cast<const item32_t *>(inputs[0]); - fc64_t *output = reinterpret_cast<fc64_t *>(outputs[0]); +DECLARE_CONVERTER(sc16_item32_le, 1, fc64, 1, PRIORITY_SIMD) +{ + const item32_t* input = reinterpret_cast<const item32_t*>(inputs[0]); + fc64_t* output = reinterpret_cast<fc64_t*>(outputs[0]); - const __m128d scalar = _mm_set1_pd(scale_factor/(1 << 16)); - const __m128i zeroi = _mm_setzero_si128(); + const __m128d scalar = _mm_set1_pd(scale_factor / (1 << 16)); + const __m128i zeroi = _mm_setzero_si128(); - #define convert_item32_1_to_fc64_1_nswap_guts(_al_) \ - for (; i+3 < nsamps; i+=4){ \ - /* load from input */ \ - __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \ - \ - /* unpack + swap 16-bit pairs */ \ - tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ - tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ +#define convert_item32_1_to_fc64_1_nswap_guts(_al_) \ + for (; i + 3 < nsamps; i += 4) { \ + /* load from input */ \ + __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input + i)); \ + \ + /* unpack + swap 16-bit pairs */ \ + tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ __m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \ - __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi); \ - \ - /* convert and scale */ \ - __m128d tmp0 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar); \ - tmpilo = _mm_unpackhi_epi64(tmpilo, zeroi); \ - __m128d tmp1 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar); \ - __m128d tmp2 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar); \ - tmpihi = _mm_unpackhi_epi64(tmpihi, zeroi); \ - __m128d tmp3 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar); \ - \ - /* store to output */ \ - _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+i+0), tmp0); \ - _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+i+1), tmp1); \ - _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+i+2), tmp2); \ - _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+i+3), tmp3); \ - } \ + __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi); \ + \ + /* convert and scale */ \ + __m128d tmp0 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar); \ + tmpilo = _mm_unpackhi_epi64(tmpilo, zeroi); \ + __m128d tmp1 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar); \ + __m128d tmp2 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar); \ + tmpihi = _mm_unpackhi_epi64(tmpihi, zeroi); \ + __m128d tmp3 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar); \ + \ + /* store to output */ \ + _mm_store##_al_##pd(reinterpret_cast<double*>(output + i + 0), tmp0); \ + _mm_store##_al_##pd(reinterpret_cast<double*>(output + i + 1), tmp1); \ + _mm_store##_al_##pd(reinterpret_cast<double*>(output + i + 2), tmp2); \ + _mm_store##_al_##pd(reinterpret_cast<double*>(output + i + 3), tmp3); \ + } size_t i = 0; - //dispatch according to alignment - if ((size_t(output) & 0xf) == 0){ + // dispatch according to alignment + if ((size_t(output) & 0xf) == 0) { convert_item32_1_to_fc64_1_nswap_guts(_) - } - else{ + } else { convert_item32_1_to_fc64_1_nswap_guts(u_) } - //convert remainder - item32_sc16_to_xx<uhd::htowx>(input+i, output+i, nsamps-i, scale_factor); + // convert remainder + item32_sc16_to_xx<uhd::htowx>(input + i, output + i, nsamps - i, scale_factor); } -DECLARE_CONVERTER(sc16_item32_be, 1, fc64, 1, PRIORITY_SIMD){ - const item32_t *input = reinterpret_cast<const item32_t *>(inputs[0]); - fc64_t *output = reinterpret_cast<fc64_t *>(outputs[0]); +DECLARE_CONVERTER(sc16_item32_be, 1, fc64, 1, PRIORITY_SIMD) +{ + const item32_t* input = reinterpret_cast<const item32_t*>(inputs[0]); + fc64_t* output = reinterpret_cast<fc64_t*>(outputs[0]); - const __m128d scalar = _mm_set1_pd(scale_factor/(1 << 16)); - const __m128i zeroi = _mm_setzero_si128(); + const __m128d scalar = _mm_set1_pd(scale_factor / (1 << 16)); + const __m128i zeroi = _mm_setzero_si128(); - #define convert_item32_1_to_fc64_1_bswap_guts(_al_) \ - for (; i+3 < nsamps; i+=4){ \ - /* load from input */ \ - __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \ - \ - /* byteswap + unpack -> byteswap 16 bit words */ \ - tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \ - __m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \ - __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi); \ - \ - /* convert and scale */ \ - __m128d tmp0 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar); \ - tmpilo = _mm_unpackhi_epi64(tmpilo, zeroi); \ - __m128d tmp1 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar); \ - __m128d tmp2 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar); \ - tmpihi = _mm_unpackhi_epi64(tmpihi, zeroi); \ - __m128d tmp3 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar); \ - \ - /* store to output */ \ - _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+i+0), tmp0); \ - _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+i+1), tmp1); \ - _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+i+2), tmp2); \ - _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+i+3), tmp3); \ - } \ +#define convert_item32_1_to_fc64_1_bswap_guts(_al_) \ + for (; i + 3 < nsamps; i += 4) { \ + /* load from input */ \ + __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input + i)); \ + \ + /* byteswap + unpack -> byteswap 16 bit words */ \ + tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \ + __m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \ + __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi); \ + \ + /* convert and scale */ \ + __m128d tmp0 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar); \ + tmpilo = _mm_unpackhi_epi64(tmpilo, zeroi); \ + __m128d tmp1 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar); \ + __m128d tmp2 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar); \ + tmpihi = _mm_unpackhi_epi64(tmpihi, zeroi); \ + __m128d tmp3 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar); \ + \ + /* store to output */ \ + _mm_store##_al_##pd(reinterpret_cast<double*>(output + i + 0), tmp0); \ + _mm_store##_al_##pd(reinterpret_cast<double*>(output + i + 1), tmp1); \ + _mm_store##_al_##pd(reinterpret_cast<double*>(output + i + 2), tmp2); \ + _mm_store##_al_##pd(reinterpret_cast<double*>(output + i + 3), tmp3); \ + } size_t i = 0; - //dispatch according to alignment - if ((size_t(output) & 0xf) == 0){ + // dispatch according to alignment + if ((size_t(output) & 0xf) == 0) { convert_item32_1_to_fc64_1_bswap_guts(_) - } - else{ + } else { convert_item32_1_to_fc64_1_bswap_guts(u_) } - //convert remainder - item32_sc16_to_xx<uhd::htonx>(input+i, output+i, nsamps-i, scale_factor); + // convert remainder + item32_sc16_to_xx<uhd::htonx>(input + i, output + i, nsamps - i, scale_factor); } diff --git a/host/lib/convert/sse2_sc16_to_sc16.cpp b/host/lib/convert/sse2_sc16_to_sc16.cpp index 5c81f357b..e484bee31 100644 --- a/host/lib/convert/sse2_sc16_to_sc16.cpp +++ b/host/lib/convert/sse2_sc16_to_sc16.cpp @@ -25,20 +25,20 @@ using namespace uhd::convert; // | C | D | A | B | Output // ----------------- // -#define CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(_ialign_,_oalign_) \ - for (; i+3 < nsamps; i+=4) { \ - __m128i m0; \ - \ - /* load from input */ \ - m0 = _mm_load ## _ialign_ ## si128((const __m128i *) (input+i));\ - \ - /* swap 16-bit pairs */ \ - m0 = _mm_shufflelo_epi16(m0, _MM_SHUFFLE(2, 3, 0, 1)); \ - m0 = _mm_shufflehi_epi16(m0, _MM_SHUFFLE(2, 3, 0, 1)); \ - \ - /* store to output */ \ - _mm_store ## _oalign_ ## si128((__m128i *) (output+i), m0); \ - } \ +#define CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(_ialign_, _oalign_) \ + for (; i + 3 < nsamps; i += 4) { \ + __m128i m0; \ + \ + /* load from input */ \ + m0 = _mm_load##_ialign_##si128((const __m128i*)(input + i)); \ + \ + /* swap 16-bit pairs */ \ + m0 = _mm_shufflelo_epi16(m0, _MM_SHUFFLE(2, 3, 0, 1)); \ + m0 = _mm_shufflehi_epi16(m0, _MM_SHUFFLE(2, 3, 0, 1)); \ + \ + /* store to output */ \ + _mm_store##_oalign_##si128((__m128i*)(output + i), m0); \ + } // // SSE byte swap @@ -54,138 +54,158 @@ using namespace uhd::convert; // | B | A | D | C | Output // ----------------- // -#define CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(_ialign_,_oalign_) \ - for (; i+3 < nsamps; i+=4) { \ - __m128i m0, m1, m2; \ - \ - /* load from input */ \ - m0 = _mm_load ## _ialign_ ## si128((const __m128i *) (input+i));\ - \ - /* byteswap 16 bit words */ \ - m1 = _mm_srli_epi16(m0, 8); \ - m2 = _mm_slli_epi16(m0, 8); \ - m0 = _mm_or_si128(m1, m2); \ - \ - /* store to output */ \ - _mm_store ## _oalign_ ## si128((__m128i *) (output+i), m0); \ - } \ - -DECLARE_CONVERTER(sc16, 1, sc16_item32_le, 1, PRIORITY_SIMD){ - const sc16_t *input = reinterpret_cast<const sc16_t *>(inputs[0]); - item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); +#define CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(_ialign_, _oalign_) \ + for (; i + 3 < nsamps; i += 4) { \ + __m128i m0, m1, m2; \ + \ + /* load from input */ \ + m0 = _mm_load##_ialign_##si128((const __m128i*)(input + i)); \ + \ + /* byteswap 16 bit words */ \ + m1 = _mm_srli_epi16(m0, 8); \ + m2 = _mm_slli_epi16(m0, 8); \ + m0 = _mm_or_si128(m1, m2); \ + \ + /* store to output */ \ + _mm_store##_oalign_##si128((__m128i*)(output + i), m0); \ + } + +DECLARE_CONVERTER(sc16, 1, sc16_item32_le, 1, PRIORITY_SIMD) +{ + const sc16_t* input = reinterpret_cast<const sc16_t*>(inputs[0]); + item32_t* output = reinterpret_cast<item32_t*>(outputs[0]); size_t i = 0; // need to dispatch according to alignment for fastest conversion - switch (size_t(input) & 0xf){ - case 0x0: - // the data is 16-byte aligned, so do the fast processing of the bulk of the samples - CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(_,u_) - break; - case 0x8: - if (nsamps < 2) + switch (size_t(input) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(_, u_) + break; + case 0x8: + if (nsamps < 2) + break; + // the first sample is 8-byte aligned - process it to align the remainder of + // the samples to 16-bytes + xx_to_item32_sc16<uhd::htowx>(input, output, 2, 1.0); + i += 2; + CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(_, u_) + // do faster processing of the bulk of the samples now that we are 16-byte + // aligned break; - // the first sample is 8-byte aligned - process it to align the remainder of the samples to 16-bytes - xx_to_item32_sc16<uhd::htowx>(input, output, 2, 1.0); - i += 2; - CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(_,u_) - // do faster processing of the bulk of the samples now that we are 16-byte aligned - break; - default: - // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load - CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_,u_) + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned + // load + CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_, u_) } // convert any remaining samples - xx_to_item32_sc16<uhd::htowx>(input+i, output+i, nsamps-i, 1.0); + xx_to_item32_sc16<uhd::htowx>(input + i, output + i, nsamps - i, 1.0); } -DECLARE_CONVERTER(sc16, 1, sc16_item32_be, 1, PRIORITY_SIMD){ - const sc16_t *input = reinterpret_cast<const sc16_t *>(inputs[0]); - item32_t *output = reinterpret_cast<item32_t *>(outputs[0]); +DECLARE_CONVERTER(sc16, 1, sc16_item32_be, 1, PRIORITY_SIMD) +{ + const sc16_t* input = reinterpret_cast<const sc16_t*>(inputs[0]); + item32_t* output = reinterpret_cast<item32_t*>(outputs[0]); size_t i = 0; // need to dispatch according to alignment for fastest conversion - switch (size_t(input) & 0xf){ - case 0x0: - // the data is 16-byte aligned, so do the fast processing of the bulk of the samples - CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(_,u_) - break; - case 0x8: - if (nsamps < 2) + switch (size_t(input) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(_, u_) break; - // the first value is 8-byte aligned - process it and prepare the bulk of the data for fast conversion - xx_to_item32_sc16<uhd::htonx>(input, output, 2, 1.0); - i += 2; - // do faster processing of the remaining samples now that we are 16-byte aligned - CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(_,u_) - break; - default: - // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load - CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_,u_) + case 0x8: + if (nsamps < 2) + break; + // the first value is 8-byte aligned - process it and prepare the bulk of the + // data for fast conversion + xx_to_item32_sc16<uhd::htonx>(input, output, 2, 1.0); + i += 2; + // do faster processing of the remaining samples now that we are 16-byte + // aligned + CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(_, u_) + break; + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned + // load + CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_, u_) } // convert any remaining samples - xx_to_item32_sc16<uhd::htonx>(input+i, output+i, nsamps-i, 1.0); + xx_to_item32_sc16<uhd::htonx>(input + i, output + i, nsamps - i, 1.0); } -DECLARE_CONVERTER(sc16_item32_le, 1, sc16, 1, PRIORITY_SIMD){ - const item32_t *input = reinterpret_cast<const item32_t *>(inputs[0]); - sc16_t *output = reinterpret_cast<sc16_t *>(outputs[0]); +DECLARE_CONVERTER(sc16_item32_le, 1, sc16, 1, PRIORITY_SIMD) +{ + const item32_t* input = reinterpret_cast<const item32_t*>(inputs[0]); + sc16_t* output = reinterpret_cast<sc16_t*>(outputs[0]); size_t i = 0; // need to dispatch according to alignment for fastest conversion - switch (size_t(output) & 0xf){ - case 0x0: - // the data is 16-byte aligned, so do the fast processing of the bulk of the samples - CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_,_) - break; - case 0x8: - if (nsamps < 2) + switch (size_t(output) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_, _) + break; + case 0x8: + if (nsamps < 2) + break; + // the first sample is 8-byte aligned - process it to align the remainder of + // the samples to 16-bytes + item32_sc16_to_xx<uhd::htowx>(input, output, 2, 1.0); + i += 2; + // do faster processing of the bulk of the samples now that we are 16-byte + // aligned + CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_, _) break; - // the first sample is 8-byte aligned - process it to align the remainder of the samples to 16-bytes - item32_sc16_to_xx<uhd::htowx>(input, output, 2, 1.0); - i += 2; - // do faster processing of the bulk of the samples now that we are 16-byte aligned - CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_,_) - break; - default: - // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load and store - CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_,u_) + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned + // load and store + CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_, u_) } // convert any remaining samples - item32_sc16_to_xx<uhd::htowx>(input+i, output+i, nsamps-i, 1.0); + item32_sc16_to_xx<uhd::htowx>(input + i, output + i, nsamps - i, 1.0); } -DECLARE_CONVERTER(sc16_item32_be, 1, sc16, 1, PRIORITY_SIMD){ - const item32_t *input = reinterpret_cast<const item32_t *>(inputs[0]); - sc16_t *output = reinterpret_cast<sc16_t *>(outputs[0]); +DECLARE_CONVERTER(sc16_item32_be, 1, sc16, 1, PRIORITY_SIMD) +{ + const item32_t* input = reinterpret_cast<const item32_t*>(inputs[0]); + sc16_t* output = reinterpret_cast<sc16_t*>(outputs[0]); size_t i = 0; // need to dispatch according to alignment for fastest conversion - switch (size_t(output) & 0xf){ - case 0x0: - // the data is 16-byte aligned, so do the fast processing of the bulk of the samples - CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_,_) - break; - case 0x8: - if (nsamps < 2) + switch (size_t(output) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_, _) + break; + case 0x8: + if (nsamps < 2) + break; + // the first sample is 8-byte aligned - process it to align the remainder of + // the samples to 16-bytes + item32_sc16_to_xx<uhd::htonx>(input, output, 2, 1.0); + i += 2; + // do faster processing of the bulk of the samples now that we are 16-byte + // aligned + CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_, _) break; - // the first sample is 8-byte aligned - process it to align the remainder of the samples to 16-bytes - item32_sc16_to_xx<uhd::htonx>(input, output, 2, 1.0); - i += 2; - // do faster processing of the bulk of the samples now that we are 16-byte aligned - CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_,_) - break; - default: - // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load and store - CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_,u_) + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned + // load and store + CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_, u_) } // convert any remaining samples - item32_sc16_to_xx<uhd::htonx>(input+i, output+i, nsamps-i, 1.0); + item32_sc16_to_xx<uhd::htonx>(input + i, output + i, nsamps - i, 1.0); } diff --git a/host/lib/convert/sse2_sc8_to_fc32.cpp b/host/lib/convert/sse2_sc8_to_fc32.cpp index 6d68850bf..aefda2b13 100644 --- a/host/lib/convert/sse2_sc8_to_fc32.cpp +++ b/host/lib/convert/sse2_sc8_to_fc32.cpp @@ -14,109 +14,111 @@ using namespace uhd::convert; static const __m128i zeroi = _mm_setzero_si128(); template <const int shuf> -UHD_INLINE void unpack_sc32_4x( - const __m128i &in, - __m128 &out0, __m128 &out1, - __m128 &out2, __m128 &out3, - const __m128 &scalar -){ +UHD_INLINE void unpack_sc32_4x(const __m128i& in, + __m128& out0, + __m128& out1, + __m128& out2, + __m128& out3, + const __m128& scalar) +{ const __m128i tmplo = _mm_unpacklo_epi8(zeroi, in); /* value in upper 8 bits */ - __m128i tmp0 = _mm_shuffle_epi32(_mm_unpacklo_epi16(zeroi, tmplo), shuf); /* value in upper 16 bits */ + __m128i tmp0 = _mm_shuffle_epi32( + _mm_unpacklo_epi16(zeroi, tmplo), shuf); /* value in upper 16 bits */ __m128i tmp1 = _mm_shuffle_epi32(_mm_unpackhi_epi16(zeroi, tmplo), shuf); - out0 = _mm_mul_ps(_mm_cvtepi32_ps(tmp0), scalar); - out1 = _mm_mul_ps(_mm_cvtepi32_ps(tmp1), scalar); + out0 = _mm_mul_ps(_mm_cvtepi32_ps(tmp0), scalar); + out1 = _mm_mul_ps(_mm_cvtepi32_ps(tmp1), scalar); const __m128i tmphi = _mm_unpackhi_epi8(zeroi, in); - __m128i tmp2 = _mm_shuffle_epi32(_mm_unpacklo_epi16(zeroi, tmphi), shuf); - __m128i tmp3 = _mm_shuffle_epi32(_mm_unpackhi_epi16(zeroi, tmphi), shuf); - out2 = _mm_mul_ps(_mm_cvtepi32_ps(tmp2), scalar); - out3 = _mm_mul_ps(_mm_cvtepi32_ps(tmp3), scalar); + __m128i tmp2 = _mm_shuffle_epi32(_mm_unpacklo_epi16(zeroi, tmphi), shuf); + __m128i tmp3 = _mm_shuffle_epi32(_mm_unpackhi_epi16(zeroi, tmphi), shuf); + out2 = _mm_mul_ps(_mm_cvtepi32_ps(tmp2), scalar); + out3 = _mm_mul_ps(_mm_cvtepi32_ps(tmp3), scalar); } -DECLARE_CONVERTER(sc8_item32_be, 1, fc32, 1, PRIORITY_SIMD){ - const item32_t *input = reinterpret_cast<const item32_t *>(size_t(inputs[0]) & ~0x3); - fc32_t *output = reinterpret_cast<fc32_t *>(outputs[0]); +DECLARE_CONVERTER(sc8_item32_be, 1, fc32, 1, PRIORITY_SIMD) +{ + const item32_t* input = reinterpret_cast<const item32_t*>(size_t(inputs[0]) & ~0x3); + fc32_t* output = reinterpret_cast<fc32_t*>(outputs[0]); - const __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 24)); - const int shuf = _MM_SHUFFLE(3, 2, 1, 0); + const __m128 scalar = _mm_set_ps1(float(scale_factor) / (1 << 24)); + const int shuf = _MM_SHUFFLE(3, 2, 1, 0); size_t i = 0, j = 0; fc32_t dummy; size_t num_samps = nsamps; - if ((size_t(inputs[0]) & 0x3) != 0){ + if ((size_t(inputs[0]) & 0x3) != 0) { item32_sc8_to_xx<uhd::ntohx>(input++, output++, 1, scale_factor); num_samps--; } - #define convert_sc8_item32_1_to_fc32_1_bswap_guts(_al_) \ - for (; j+7 < num_samps; j+=8, i+=4){ \ - /* load from input */ \ - __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \ - \ - /* unpack + swap 8-bit pairs */ \ - __m128 tmp0, tmp1, tmp2, tmp3; \ - unpack_sc32_4x<shuf>(tmpi, tmp0, tmp1, tmp2, tmp3, scalar); \ - \ - /* store to output */ \ - _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+0), tmp0); \ - _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+2), tmp1); \ - _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+4), tmp2); \ - _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+6), tmp3); \ +#define convert_sc8_item32_1_to_fc32_1_bswap_guts(_al_) \ + for (; j + 7 < num_samps; j += 8, i += 4) { \ + /* load from input */ \ + __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input + i)); \ + \ + /* unpack + swap 8-bit pairs */ \ + __m128 tmp0, tmp1, tmp2, tmp3; \ + unpack_sc32_4x<shuf>(tmpi, tmp0, tmp1, tmp2, tmp3, scalar); \ + \ + /* store to output */ \ + _mm_store##_al_##ps(reinterpret_cast<float*>(output + j + 0), tmp0); \ + _mm_store##_al_##ps(reinterpret_cast<float*>(output + j + 2), tmp1); \ + _mm_store##_al_##ps(reinterpret_cast<float*>(output + j + 4), tmp2); \ + _mm_store##_al_##ps(reinterpret_cast<float*>(output + j + 6), tmp3); \ } - //dispatch according to alignment - if ((size_t(output) & 0xf) == 0){ + // dispatch according to alignment + if ((size_t(output) & 0xf) == 0) { convert_sc8_item32_1_to_fc32_1_bswap_guts(_) - } - else{ + } else { convert_sc8_item32_1_to_fc32_1_bswap_guts(u_) } - //convert remainder - item32_sc8_to_xx<uhd::ntohx>(input+i, output+j, num_samps-j, scale_factor); + // convert remainder + item32_sc8_to_xx<uhd::ntohx>(input + i, output + j, num_samps - j, scale_factor); } -DECLARE_CONVERTER(sc8_item32_le, 1, fc32, 1, PRIORITY_SIMD){ - const item32_t *input = reinterpret_cast<const item32_t *>(size_t(inputs[0]) & ~0x3); - fc32_t *output = reinterpret_cast<fc32_t *>(outputs[0]); +DECLARE_CONVERTER(sc8_item32_le, 1, fc32, 1, PRIORITY_SIMD) +{ + const item32_t* input = reinterpret_cast<const item32_t*>(size_t(inputs[0]) & ~0x3); + fc32_t* output = reinterpret_cast<fc32_t*>(outputs[0]); - const __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 24)); - const int shuf = _MM_SHUFFLE(0, 1, 2, 3); + const __m128 scalar = _mm_set_ps1(float(scale_factor) / (1 << 24)); + const int shuf = _MM_SHUFFLE(0, 1, 2, 3); size_t i = 0, j = 0; fc32_t dummy; size_t num_samps = nsamps; - if ((size_t(inputs[0]) & 0x3) != 0){ + if ((size_t(inputs[0]) & 0x3) != 0) { item32_sc8_to_xx<uhd::wtohx>(input++, output++, 1, scale_factor); num_samps--; } - #define convert_sc8_item32_1_to_fc32_1_nswap_guts(_al_) \ - for (; j+7 < num_samps; j+=8, i+=4){ \ - /* load from input */ \ - __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \ - \ - /* unpack + swap 8-bit pairs */ \ - __m128 tmp0, tmp1, tmp2, tmp3; \ - unpack_sc32_4x<shuf>(tmpi, tmp0, tmp1, tmp2, tmp3, scalar); \ - \ - /* store to output */ \ - _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+0), tmp0); \ - _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+2), tmp1); \ - _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+4), tmp2); \ - _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+6), tmp3); \ +#define convert_sc8_item32_1_to_fc32_1_nswap_guts(_al_) \ + for (; j + 7 < num_samps; j += 8, i += 4) { \ + /* load from input */ \ + __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input + i)); \ + \ + /* unpack + swap 8-bit pairs */ \ + __m128 tmp0, tmp1, tmp2, tmp3; \ + unpack_sc32_4x<shuf>(tmpi, tmp0, tmp1, tmp2, tmp3, scalar); \ + \ + /* store to output */ \ + _mm_store##_al_##ps(reinterpret_cast<float*>(output + j + 0), tmp0); \ + _mm_store##_al_##ps(reinterpret_cast<float*>(output + j + 2), tmp1); \ + _mm_store##_al_##ps(reinterpret_cast<float*>(output + j + 4), tmp2); \ + _mm_store##_al_##ps(reinterpret_cast<float*>(output + j + 6), tmp3); \ } - //dispatch according to alignment - if ((size_t(output) & 0xf) == 0){ + // dispatch according to alignment + if ((size_t(output) & 0xf) == 0) { convert_sc8_item32_1_to_fc32_1_nswap_guts(_) - } - else{ + } else { convert_sc8_item32_1_to_fc32_1_nswap_guts(u_) } - //convert remainder - item32_sc8_to_xx<uhd::wtohx>(input+i, output+j, num_samps-j, scale_factor); + // convert remainder + item32_sc8_to_xx<uhd::wtohx>(input + i, output + j, num_samps - j, scale_factor); } diff --git a/host/lib/convert/sse2_sc8_to_fc64.cpp b/host/lib/convert/sse2_sc8_to_fc64.cpp index f5b406152..3cc2fefd0 100644 --- a/host/lib/convert/sse2_sc8_to_fc64.cpp +++ b/host/lib/convert/sse2_sc8_to_fc64.cpp @@ -13,129 +13,133 @@ using namespace uhd::convert; static const __m128i zeroi = _mm_setzero_si128(); -UHD_INLINE void unpack_sc32_8x( - const __m128i &in, - __m128d &out0, __m128d &out1, - __m128d &out2, __m128d &out3, - __m128d &out4, __m128d &out5, - __m128d &out6, __m128d &out7, - const __m128d &scalar -){ +UHD_INLINE void unpack_sc32_8x(const __m128i& in, + __m128d& out0, + __m128d& out1, + __m128d& out2, + __m128d& out3, + __m128d& out4, + __m128d& out5, + __m128d& out6, + __m128d& out7, + const __m128d& scalar) +{ const int shuf = _MM_SHUFFLE(1, 0, 3, 2); __m128i tmp; const __m128i tmplo = _mm_unpacklo_epi8(zeroi, in); /* value in upper 8 bits */ - tmp = _mm_unpacklo_epi16(zeroi, tmplo); /* value in upper 16 bits */ - out0 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar); - tmp = _mm_shuffle_epi32(tmp, shuf); - out1 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar); - tmp = _mm_unpackhi_epi16(zeroi, tmplo); - out2 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar); - tmp = _mm_shuffle_epi32(tmp, shuf); - out3 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar); + tmp = _mm_unpacklo_epi16(zeroi, tmplo); /* value in upper 16 bits */ + out0 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar); + tmp = _mm_shuffle_epi32(tmp, shuf); + out1 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar); + tmp = _mm_unpackhi_epi16(zeroi, tmplo); + out2 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar); + tmp = _mm_shuffle_epi32(tmp, shuf); + out3 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar); const __m128i tmphi = _mm_unpackhi_epi8(zeroi, in); - tmp = _mm_unpacklo_epi16(zeroi, tmphi); - out4 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar); - tmp = _mm_shuffle_epi32(tmp, shuf); - out5 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar); - tmp = _mm_unpackhi_epi16(zeroi, tmphi); - out6 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar); - tmp = _mm_shuffle_epi32(tmp, shuf); - out7 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar); + tmp = _mm_unpacklo_epi16(zeroi, tmphi); + out4 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar); + tmp = _mm_shuffle_epi32(tmp, shuf); + out5 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar); + tmp = _mm_unpackhi_epi16(zeroi, tmphi); + out6 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar); + tmp = _mm_shuffle_epi32(tmp, shuf); + out7 = _mm_mul_pd(_mm_cvtepi32_pd(tmp), scalar); } -DECLARE_CONVERTER(sc8_item32_be, 1, fc64, 1, PRIORITY_SIMD){ - const item32_t *input = reinterpret_cast<const item32_t *>(size_t(inputs[0]) & ~0x3); - fc64_t *output = reinterpret_cast<fc64_t *>(outputs[0]); +DECLARE_CONVERTER(sc8_item32_be, 1, fc64, 1, PRIORITY_SIMD) +{ + const item32_t* input = reinterpret_cast<const item32_t*>(size_t(inputs[0]) & ~0x3); + fc64_t* output = reinterpret_cast<fc64_t*>(outputs[0]); - const __m128d scalar = _mm_set1_pd(scale_factor/(1 << 24)); + const __m128d scalar = _mm_set1_pd(scale_factor / (1 << 24)); size_t i = 0, j = 0; fc32_t dummy; size_t num_samps = nsamps; - if ((size_t(inputs[0]) & 0x3) != 0){ + if ((size_t(inputs[0]) & 0x3) != 0) { item32_sc8_to_xx<uhd::ntohx>(input++, output++, 1, scale_factor); num_samps--; } - #define convert_sc8_item32_1_to_fc64_1_bswap_guts(_al_) \ - for (; j+7 < num_samps; j+=8, i+=4){ \ - /* load from input */ \ - __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \ - \ - /* unpack */ \ - __m128d tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; \ +#define convert_sc8_item32_1_to_fc64_1_bswap_guts(_al_) \ + for (; j + 7 < num_samps; j += 8, i += 4) { \ + /* load from input */ \ + __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input + i)); \ + \ + /* unpack */ \ + __m128d tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; \ unpack_sc32_8x(tmpi, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, scalar); \ - \ - /* store to output */ \ - _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+0), tmp0); \ - _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+1), tmp1); \ - _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+2), tmp2); \ - _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+3), tmp3); \ - _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+4), tmp4); \ - _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+5), tmp5); \ - _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+6), tmp6); \ - _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+7), tmp7); \ + \ + /* store to output */ \ + _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 0), tmp0); \ + _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 1), tmp1); \ + _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 2), tmp2); \ + _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 3), tmp3); \ + _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 4), tmp4); \ + _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 5), tmp5); \ + _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 6), tmp6); \ + _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 7), tmp7); \ } - //dispatch according to alignment - if ((size_t(output) & 0xf) == 0){ + // dispatch according to alignment + if ((size_t(output) & 0xf) == 0) { convert_sc8_item32_1_to_fc64_1_bswap_guts(_) - } - else{ + } else { convert_sc8_item32_1_to_fc64_1_bswap_guts(u_) } - //convert remainder - item32_sc8_to_xx<uhd::ntohx>(input+i, output+j, num_samps-j, scale_factor); + // convert remainder + item32_sc8_to_xx<uhd::ntohx>(input + i, output + j, num_samps - j, scale_factor); } -DECLARE_CONVERTER(sc8_item32_le, 1, fc64, 1, PRIORITY_SIMD){ - const item32_t *input = reinterpret_cast<const item32_t *>(size_t(inputs[0]) & ~0x3); - fc64_t *output = reinterpret_cast<fc64_t *>(outputs[0]); +DECLARE_CONVERTER(sc8_item32_le, 1, fc64, 1, PRIORITY_SIMD) +{ + const item32_t* input = reinterpret_cast<const item32_t*>(size_t(inputs[0]) & ~0x3); + fc64_t* output = reinterpret_cast<fc64_t*>(outputs[0]); - const __m128d scalar = _mm_set1_pd(scale_factor/(1 << 24)); + const __m128d scalar = _mm_set1_pd(scale_factor / (1 << 24)); size_t i = 0, j = 0; fc32_t dummy; size_t num_samps = nsamps; - if ((size_t(inputs[0]) & 0x3) != 0){ + if ((size_t(inputs[0]) & 0x3) != 0) { item32_sc8_to_xx<uhd::wtohx>(input++, output++, 1, scale_factor); num_samps--; } - #define convert_sc8_item32_1_to_fc64_1_nswap_guts(_al_) \ - for (; j+7 < num_samps; j+=8, i+=4){ \ - /* load from input */ \ - __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \ - \ - /* unpack */ \ - __m128d tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; \ - tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); /*byteswap*/\ - unpack_sc32_8x(tmpi, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, scalar); \ - \ - /* store to output */ \ - _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+0), tmp0); \ - _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+1), tmp1); \ - _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+2), tmp2); \ - _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+3), tmp3); \ - _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+4), tmp4); \ - _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+5), tmp5); \ - _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+6), tmp6); \ - _mm_store ## _al_ ## pd(reinterpret_cast<double *>(output+j+7), tmp7); \ +#define convert_sc8_item32_1_to_fc64_1_nswap_guts(_al_) \ + for (; j + 7 < num_samps; j += 8, i += 4) { \ + /* load from input */ \ + __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input + i)); \ + \ + /* unpack */ \ + __m128d tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; \ + tmpi = \ + _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); /*byteswap*/ \ + unpack_sc32_8x(tmpi, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, scalar); \ + \ + /* store to output */ \ + _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 0), tmp0); \ + _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 1), tmp1); \ + _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 2), tmp2); \ + _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 3), tmp3); \ + _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 4), tmp4); \ + _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 5), tmp5); \ + _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 6), tmp6); \ + _mm_store##_al_##pd(reinterpret_cast<double*>(output + j + 7), tmp7); \ } - //dispatch according to alignment - if ((size_t(output) & 0xf) == 0){ + // dispatch according to alignment + if ((size_t(output) & 0xf) == 0) { convert_sc8_item32_1_to_fc64_1_nswap_guts(_) - } - else{ + } else { convert_sc8_item32_1_to_fc64_1_nswap_guts(u_) } - //convert remainder - item32_sc8_to_xx<uhd::wtohx>(input+i, output+j, num_samps-j, scale_factor); + // convert remainder + item32_sc8_to_xx<uhd::wtohx>(input + i, output + j, num_samps - j, scale_factor); } |