//
// Copyright 2015 Ettus Research LLC
// Copyright 2018 Ettus Research, a National Instruments Company
//
// SPDX-License-Identifier: GPL-3.0-or-later
//

#include "convert_common.hpp"
#include <uhd/utils/byteswap.hpp>
#include <emmintrin.h>

using namespace uhd::convert;

//
// SSE 16-bit pair swap
//
// Valid alignment macro arguments are 'u_' and '_' for unaligned and aligned
// access respectively. Macro operates on 4 complex 16-bit integers at a time.
//
//  -----------------
// | A | B | C | D |   Input
//  -----------------
//   0   1   2   3     Address
//  -----------------
// | C | D | A | B |   Output
//  -----------------
//
#define CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(_ialign_,_oalign_)           \
    for (; i+3 < nsamps; i+=4) {                                         \
        __m128i m0;                                                      \
                                                                         \
        /* load from input */                                            \
        m0 = _mm_load ## _ialign_ ## si128((const __m128i *) (input+i));\
                                                                         \
        /* swap 16-bit pairs */                                          \
        m0 = _mm_shufflelo_epi16(m0, _MM_SHUFFLE(2, 3, 0, 1));           \
        m0 = _mm_shufflehi_epi16(m0, _MM_SHUFFLE(2, 3, 0, 1));           \
                                                                         \
        /* store to output */                                            \
        _mm_store ## _oalign_ ## si128((__m128i *) (output+i), m0);      \
    }

//
// SSE byte swap
//
// Valid alignment macro arguments are 'u_' and '_' for unaligned and aligned
// access respectively. Macro operates on 4 complex 16-bit integers at a time.
//
//  -----------------
// | A | B | C | D |   Input
//  -----------------
//   0   1   2   3     Address
//  -----------------
// | B | A | D | C |   Output
//  -----------------
//
#define CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(_ialign_,_oalign_)           \
    for (; i+3 < nsamps; i+=4) {                                         \
        __m128i m0, m1, m2;                                              \
                                                                         \
        /* load from input */                                            \
        m0 = _mm_load ## _ialign_ ## si128((const __m128i *) (input+i));\
                                                                         \
        /* byteswap 16-bit words */                                      \
        m1 = _mm_srli_epi16(m0, 8);                                      \
        m2 = _mm_slli_epi16(m0, 8);                                      \
        m0 = _mm_or_si128(m1, m2);                                       \
                                                                         \
        /* store to output */                                            \
        _mm_store ## _oalign_ ## si128((__m128i *) (output+i), m0);      \
    }
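//
// For reference, a scalar sketch of what each macro computes per 32-bit item
// (illustrative only; the helper names nswap_ref/bswap_ref are hypothetical
// and kept in this comment so they do not affect the build):
//
//   // NSWAP: exchange the two 16-bit halves of the item
//   static inline item32_t nswap_ref(item32_t x) {
//       return (x >> 16) | (x << 16);
//   }
//
//   // BSWAP: swap the two bytes within each 16-bit half
//   static inline item32_t bswap_ref(item32_t x) {
//       return ((x >> 8) & 0x00ff00ff) | ((x << 8) & 0xff00ff00);
//   }
//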
DECLARE_CONVERTER(sc16, 1, sc16_item32_le, 1, PRIORITY_SIMD){
    const sc16_t *input = reinterpret_cast<const sc16_t *>(inputs[0]);
    item32_t *output = reinterpret_cast<item32_t *>(outputs[0]);

    size_t i = 0;

    // need to dispatch according to alignment for fastest conversion
    switch (size_t(input) & 0xf){
    case 0x0:
        // the data is 16-byte aligned, so do the fast processing of the bulk of the samples
        CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(_,u_)
        break;
    case 0x8:
        if (nsamps < 2)
            break;
        // the first sample is 8-byte aligned - process it to align the remainder of the samples to 16-bytes
        xx_to_item32_sc16(input, output, 2, 1.0);
        i += 2;
        // do faster processing of the bulk of the samples now that we are 16-byte aligned
        CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(_,u_)
        break;
    default:
        // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load
        CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_,u_)
    }

    // convert any remaining samples
    xx_to_item32_sc16(input+i, output+i, nsamps-i, 1.0);
}

DECLARE_CONVERTER(sc16, 1, sc16_item32_be, 1, PRIORITY_SIMD){
    const sc16_t *input = reinterpret_cast<const sc16_t *>(inputs[0]);
    item32_t *output = reinterpret_cast<item32_t *>(outputs[0]);

    size_t i = 0;

    // need to dispatch according to alignment for fastest conversion
    switch (size_t(input) & 0xf){
    case 0x0:
        // the data is 16-byte aligned, so do the fast processing of the bulk of the samples
        CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(_,u_)
        break;
    case 0x8:
        if (nsamps < 2)
            break;
        // the first value is 8-byte aligned - process it and prepare the bulk of the data for fast conversion
        xx_to_item32_sc16(input, output, 2, 1.0);
        i += 2;
        // do faster processing of the remaining samples now that we are 16-byte aligned
        CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(_,u_)
        break;
    default:
        // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load
        CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_,u_)
    }

    // convert any remaining samples
    xx_to_item32_sc16(input+i, output+i, nsamps-i, 1.0);
}

DECLARE_CONVERTER(sc16_item32_le, 1, sc16, 1, PRIORITY_SIMD){
    const item32_t *input = reinterpret_cast<const item32_t *>(inputs[0]);
    sc16_t *output = reinterpret_cast<sc16_t *>(outputs[0]);

    size_t i = 0;

    // need to dispatch according to alignment for fastest conversion
    switch (size_t(output) & 0xf){
    case 0x0:
        // the data is 16-byte aligned, so do the fast processing of the bulk of the samples
        CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_,_)
        break;
    case 0x8:
        if (nsamps < 2)
            break;
        // the first sample is 8-byte aligned - process it to align the remainder of the samples to 16-bytes
        item32_sc16_to_xx(input, output, 2, 1.0);
        i += 2;
        // do faster processing of the bulk of the samples now that we are 16-byte aligned
        CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_,_)
        break;
    default:
        // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load and store
        CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_,u_)
    }

    // convert any remaining samples
    item32_sc16_to_xx(input+i, output+i, nsamps-i, 1.0);
}

DECLARE_CONVERTER(sc16_item32_be, 1, sc16, 1, PRIORITY_SIMD){
    const item32_t *input = reinterpret_cast<const item32_t *>(inputs[0]);
    sc16_t *output = reinterpret_cast<sc16_t *>(outputs[0]);

    size_t i = 0;

    // need to dispatch according to alignment for fastest conversion
    switch (size_t(output) & 0xf){
    case 0x0:
        // the data is 16-byte aligned, so do the fast processing of the bulk of the samples
        CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_,_)
        break;
    case 0x8:
        if (nsamps < 2)
            break;
        // the first sample is 8-byte aligned - process it to align the remainder of the samples to 16-bytes
        item32_sc16_to_xx(input, output, 2, 1.0);
        i += 2;
        // do faster processing of the bulk of the samples now that we are 16-byte aligned
        CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_,_)
        break;
    default:
        // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load and store
        CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_,u_)
    }

    // convert any remaining samples
    item32_sc16_to_xx(input+i, output+i, nsamps-i, 1.0);
}
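//
// A minimal standalone check of the SSE2 pair swap used above, assuming an
// SSE2-capable host. This is an illustrative sketch, kept as a comment so it
// does not affect the build:
//
//   #include <emmintrin.h>
//   #include <cassert>
//   #include <cstdint>
//
//   int main() {
//       alignas(16) uint16_t in[8] = {0, 1, 2, 3, 4, 5, 6, 7};
//       alignas(16) uint16_t out[8];
//       __m128i m = _mm_load_si128(reinterpret_cast<const __m128i *>(in));
//       m = _mm_shufflelo_epi16(m, _MM_SHUFFLE(2, 3, 0, 1));
//       m = _mm_shufflehi_epi16(m, _MM_SHUFFLE(2, 3, 0, 1));
//       _mm_store_si128(reinterpret_cast<__m128i *>(out), m);
//       // every adjacent 16-bit pair is exchanged: {1, 0, 3, 2, 5, 4, 7, 6}
//       for (int k = 0; k < 8; k += 2) {
//           assert(out[k] == in[k+1] && out[k+1] == in[k]);
//       }
//       return 0;
//   }
//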