From 45dc078069798d14f33518033e58d9aa7ace99f1 Mon Sep 17 00:00:00 2001
From: Josh Blum
Date: Tue, 22 Jun 2010 15:41:59 -0700
Subject: uhd: implemented complex float <-> item32 conversion with sse2

---
 Review notes: the SSE2 loops now cover every full group of four samples
 (the bound was nsamps/4 with a stride of 4), and item32_to_fc32_bswap
 unpacks against zero instead of duplicating the 16-bit value. The header
 names of the bare include lines in the first gen_convert_types.py hunk
 were not recoverable and are left as found; the blob id on that file's
 index line predates these changes.

 host/lib/transport/CMakeLists.txt       | 10 +++++
 host/lib/transport/gen_convert_types.py | 90 +++++++++++++++++++++++++++++++++
 2 files changed, 100 insertions(+)

(limited to 'host')

diff --git a/host/lib/transport/CMakeLists.txt b/host/lib/transport/CMakeLists.txt
index 872865d6c..a2bd17b01 100644
--- a/host/lib/transport/CMakeLists.txt
+++ b/host/lib/transport/CMakeLists.txt
@@ -17,6 +17,16 @@
 
 #This file will be included by cmake, use absolute paths!
 
+########################################################################
+# Check for SIMD headers
+########################################################################
+INCLUDE(CheckIncludeFileCXX)
+CHECK_INCLUDE_FILE_CXX(emmintrin.h HAVE_EMMINTRIN_H)
+
+IF(HAVE_EMMINTRIN_H)
+    ADD_DEFINITIONS(-DHAVE_EMMINTRIN_H)
+ENDIF(HAVE_EMMINTRIN_H)
+
 ########################################################################
 # Setup defines for interface address discovery
 ########################################################################
diff --git a/host/lib/transport/gen_convert_types.py b/host/lib/transport/gen_convert_types.py
index e81bf7330..6b87bf134 100755
--- a/host/lib/transport/gen_convert_types.py
+++ b/host/lib/transport/gen_convert_types.py
@@ -30,6 +30,9 @@ TMPL_TEXT = """
 \#include
 \#include
 \#include
+\#include
+
+\#define USE_EMMINTRIN_H true
 
 \#ifdef BOOST_BIG_ENDIAN
     static const bool is_big_endian = true;
@@ -101,6 +104,46 @@ static UHD_INLINE void fc32_to_item32_nswap(
     }
 }
 
+\#if defined(HAVE_EMMINTRIN_H) && USE_EMMINTRIN_H
+\#include <emmintrin.h>
+
+/*!
+ * SSE2 version: convert complex floats to byteswapped item32 words.
+ * Every float is scaled by shorts_per_float, converted to int32 and
+ * packed to int16 with signed saturation; the two bytes of each int16
+ * are then swapped. Unaligned buffers are fine (loadu/storeu); the
+ * scalar loop at the end converts up to three leftover samples.
+ */
+static UHD_INLINE void fc32_to_item32_bswap(
+    const fc32_t *input, item32_t *output, size_t nsamps
+){
+    __m128 scalar = _mm_set_ps1(shorts_per_float);
+
+    //convert four samples (eight floats) per iteration with intrinsics
+    size_t i = 0; for (; i < (nsamps & ~size_t(3)); i+=4){
+        //load from input
+        __m128 tmplo = _mm_loadu_ps(reinterpret_cast<const float *>(input+i+0));
+        __m128 tmphi = _mm_loadu_ps(reinterpret_cast<const float *>(input+i+2));
+
+        //convert and scale
+        __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar));
+        __m128i tmpihi = _mm_cvtps_epi32(_mm_mul_ps(tmphi, scalar));
+
+        //pack + byteswap -> byteswap 32 bit words
+        __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi);
+        tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8));
+
+        //store to output
+        _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi);
+    }
+
+    //convert remainder
+    for (; i < nsamps; i++){
+        output[i] = uhd::byteswap(fc32_to_item32(input[i]));
+    }
+}
+
+\#else
 static UHD_INLINE void fc32_to_item32_bswap(
     const fc32_t *input, item32_t *output, size_t nsamps
 ){
@@ -109,6 +152,8 @@ static UHD_INLINE void fc32_to_item32_bswap(
     }
 }
 
+\#endif
+
 /***********************************************************************
  * Convert items32 buffer to complex float
  **********************************************************************/
@@ -129,6 +174,49 @@ static UHD_INLINE void item32_to_fc32_nswap(
     }
 }
 
+\#if defined(HAVE_EMMINTRIN_H) && USE_EMMINTRIN_H
+\#include <emmintrin.h>
+
+/*!
+ * SSE2 version: convert byteswapped item32 words to complex floats.
+ * The two bytes of each int16 are swapped, then every int16 is placed
+ * in the upper half of a zeroed int32 lane, so the lane holds exactly
+ * value * 65536 with the sign intact. Scaling by floats_per_short/65536
+ * undoes that shift. Unaligned buffers are fine (loadu/storeu); the
+ * scalar loop at the end converts up to three leftover samples.
+ */
+static UHD_INLINE void item32_to_fc32_bswap(
+    const item32_t *input, fc32_t *output, size_t nsamps
+){
+    __m128 scalar = _mm_set_ps1(floats_per_short/(1 << 16));
+    __m128i zero = _mm_setzero_si128();
+
+    //convert four samples (eight floats) per iteration with intrinsics
+    size_t i = 0; for (; i < (nsamps & ~size_t(3)); i+=4){
+        //load from input
+        __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i));
+
+        //byteswap + unpack -> byteswap 32 bit words
+        tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8));
+        __m128i tmpilo = _mm_unpacklo_epi16(zero, tmpi); //value in upper 16 bits
+        __m128i tmpihi = _mm_unpackhi_epi16(zero, tmpi);
+
+        //convert and scale
+        __m128 tmplo = _mm_mul_ps(_mm_cvtepi32_ps(tmpilo), scalar);
+        __m128 tmphi = _mm_mul_ps(_mm_cvtepi32_ps(tmpihi), scalar);
+
+        //store to output
+        _mm_storeu_ps(reinterpret_cast<float *>(output+i+0), tmplo);
+        _mm_storeu_ps(reinterpret_cast<float *>(output+i+2), tmphi);
+    }
+
+    //convert remainder
+    for (; i < nsamps; i++){
+        output[i] = item32_to_fc32(uhd::byteswap(input[i]));
+    }
+}
+
+\#else
 static UHD_INLINE void item32_to_fc32_bswap(
     const item32_t *input, fc32_t *output, size_t nsamps
 ){
@@ -137,6 +225,8 @@ static UHD_INLINE void item32_to_fc32_bswap(
     }
 }
 
+\#endif
+
 /***********************************************************************
  * Sample-buffer converters
  **********************************************************************/
-- 
cgit v1.2.3