diff options
| -rw-r--r-- | host/lib/convert/sse2_fc32_to_sc16.cpp | 34 | ||||
| -rw-r--r-- | host/lib/convert/sse2_sc16_to_fc32.cpp | 34 | 
2 files changed, 48 insertions, 20 deletions
| diff --git a/host/lib/convert/sse2_fc32_to_sc16.cpp b/host/lib/convert/sse2_fc32_to_sc16.cpp index f5a2b7610..a83e9b46c 100644 --- a/host/lib/convert/sse2_fc32_to_sc16.cpp +++ b/host/lib/convert/sse2_fc32_to_sc16.cpp @@ -27,6 +27,7 @@ DECLARE_CONVERTER(fc32, 1, sc16_item32_le, 1, PRIORITY_SIMD){      const __m128 scalar = _mm_set_ps1(float(scale_factor)); +    // this macro converts values faster by using SSE intrinsics to convert 4 values at a time      #define convert_fc32_1_to_item32_1_nswap_guts(_al_)                 \      for (; i+3 < nsamps; i+=4){                                         \          /* load from input */                                           \ @@ -48,19 +49,25 @@ DECLARE_CONVERTER(fc32, 1, sc16_item32_le, 1, PRIORITY_SIMD){      size_t i = 0; -    //dispatch according to alignment +    // need to dispatch according to alignment for fastest conversion      switch (size_t(input) & 0xf){ +    case 0x0: +        // the data is 16-byte aligned, so do the fast processing of the bulk of the samples +        convert_fc32_1_to_item32_1_nswap_guts(_) +        break;      case 0x8: +        // the first sample is 8-byte aligned - process it to align the remainder of the samples to 16-bytes          xx_to_item32_sc16<uhd::htowx>(input, output, 1, scale_factor);          i++; -        break; -    case 0x0: +        // do faster processing of the bulk of the samples now that we are 16-byte aligned          convert_fc32_1_to_item32_1_nswap_guts(_)          break; -    default: convert_fc32_1_to_item32_1_nswap_guts(u_) +    default: +        // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load +        convert_fc32_1_to_item32_1_nswap_guts(u_)      } -    //convert remainder +    // convert any remaining samples      xx_to_item32_sc16<uhd::htowx>(input+i, output+i, nsamps-i, scale_factor);  } @@ -70,6 +77,7 @@ DECLARE_CONVERTER(fc32, 1, sc16_item32_be, 1, PRIORITY_SIMD){      const __m128 scalar = 
_mm_set_ps1(float(scale_factor)); +    // this macro converts values faster by using SSE intrinsics to convert 4 values at a time      #define convert_fc32_1_to_item32_1_bswap_guts(_al_)                 \      for (; i+3 < nsamps; i+=4){                                         \          /* load from input */                                           \ @@ -90,18 +98,24 @@ DECLARE_CONVERTER(fc32, 1, sc16_item32_be, 1, PRIORITY_SIMD){      size_t i = 0; -    //dispatch according to alignment +    // need to dispatch according to alignment for fastest conversion      switch (size_t(input) & 0xf){ +    case 0x0: +        // the data is 16-byte aligned, so do the fast processing of the bulk of the samples +        convert_fc32_1_to_item32_1_bswap_guts(_) +        break;      case 0x8: +        // the first value is 8-byte aligned - process it and prepare the bulk of the data for fast conversion          xx_to_item32_sc16<uhd::htonx>(input, output, 1, scale_factor);          i++; -        break; -    case 0x0: +        // do faster processing of the remaining samples now that we are 16-byte aligned          convert_fc32_1_to_item32_1_bswap_guts(_)          break; -    default: convert_fc32_1_to_item32_1_bswap_guts(u_) +    default: +        // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load +        convert_fc32_1_to_item32_1_bswap_guts(u_)      } -    //convert remainder +    // convert any remaining samples      xx_to_item32_sc16<uhd::htonx>(input+i, output+i, nsamps-i, scale_factor);  } diff --git a/host/lib/convert/sse2_sc16_to_fc32.cpp b/host/lib/convert/sse2_sc16_to_fc32.cpp index 7a9860970..0ac7f1798 100644 --- a/host/lib/convert/sse2_sc16_to_fc32.cpp +++ b/host/lib/convert/sse2_sc16_to_fc32.cpp @@ -28,6 +28,7 @@ DECLARE_CONVERTER(sc16_item32_le, 1, fc32, 1, PRIORITY_SIMD){      const __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 16));      const __m128i zeroi = _mm_setzero_si128(); +    // this macro converts values faster 
by using SSE intrinsics to convert 4 values at a time      #define convert_item32_1_to_fc32_1_nswap_guts(_al_)                 \      for (; i+3 < nsamps; i+=4){                                         \          /* load from input */                                           \ @@ -50,19 +51,25 @@ DECLARE_CONVERTER(sc16_item32_le, 1, fc32, 1, PRIORITY_SIMD){      size_t i = 0; -    //dispatch according to alignment +    // need to dispatch according to alignment for fastest conversion      switch (size_t(output) & 0xf){ +    case 0x0: +        // the data is 16-byte aligned, so do the fast processing of the bulk of the samples +        convert_item32_1_to_fc32_1_nswap_guts(_) +        break;      case 0x8: +        // the first sample is 8-byte aligned - process it to align the remainder of the samples to 16-bytes          item32_sc16_to_xx<uhd::htowx>(input, output, 1, scale_factor);          i++; -        break; -    case 0x0: +        // do faster processing of the bulk of the samples now that we are 16-byte aligned          convert_item32_1_to_fc32_1_nswap_guts(_)          break; -    default: convert_item32_1_to_fc32_1_nswap_guts(u_) +    default: +        // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load and store +        convert_item32_1_to_fc32_1_nswap_guts(u_)      } -    //convert remainder +    // convert any remaining samples      item32_sc16_to_xx<uhd::htowx>(input+i, output+i, nsamps-i, scale_factor);  } @@ -73,6 +80,7 @@ DECLARE_CONVERTER(sc16_item32_be, 1, fc32, 1, PRIORITY_SIMD){      const __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 16));      const __m128i zeroi = _mm_setzero_si128(); +    // this macro converts values faster by using SSE intrinsics to convert 4 values at a time      #define convert_item32_1_to_fc32_1_bswap_guts(_al_)                 \      for (; i+3 < nsamps; i+=4){                                         \          /* load from input */                                           \ @@ 
-94,18 +102,24 @@ DECLARE_CONVERTER(sc16_item32_be, 1, fc32, 1, PRIORITY_SIMD){      size_t i = 0; -    //dispatch according to alignment +    // need to dispatch according to alignment for fastest conversion      switch (size_t(output) & 0xf){ +    case 0x0: +        // the data is 16-byte aligned, so do the fast processing of the bulk of the samples +        convert_item32_1_to_fc32_1_bswap_guts(_) +        break;      case 0x8: +        // the first sample is 8-byte aligned - process it to align the remainder of the samples to 16-bytes          item32_sc16_to_xx<uhd::htonx>(input, output, 1, scale_factor);          i++; -        break; -    case 0x0: +        // do faster processing of the bulk of the samples now that we are 16-byte aligned          convert_item32_1_to_fc32_1_bswap_guts(_)          break; -    default: convert_item32_1_to_fc32_1_bswap_guts(u_) +    default: +        // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load and store +        convert_item32_1_to_fc32_1_bswap_guts(u_)      } -    //convert remainder +    // convert any remaining samples      item32_sc16_to_xx<uhd::htonx>(input+i, output+i, nsamps-i, scale_factor);  } | 
