From a0bae5347bd542b6f84601f8f0c8c70137ea44d1 Mon Sep 17 00:00:00 2001 From: Michael West Date: Wed, 20 Nov 2013 11:31:37 -0800 Subject: Fixed converters and added comments for better readability --- host/lib/convert/sse2_fc32_to_sc16.cpp | 34 ++++++++++++++++++++++++---------- host/lib/convert/sse2_sc16_to_fc32.cpp | 34 ++++++++++++++++++++++++---------- 2 files changed, 48 insertions(+), 20 deletions(-) diff --git a/host/lib/convert/sse2_fc32_to_sc16.cpp b/host/lib/convert/sse2_fc32_to_sc16.cpp index f5a2b7610..a83e9b46c 100644 --- a/host/lib/convert/sse2_fc32_to_sc16.cpp +++ b/host/lib/convert/sse2_fc32_to_sc16.cpp @@ -27,6 +27,7 @@ DECLARE_CONVERTER(fc32, 1, sc16_item32_le, 1, PRIORITY_SIMD){ const __m128 scalar = _mm_set_ps1(float(scale_factor)); + // this macro converts values faster by using SSE intrinsics to convert 4 values at a time #define convert_fc32_1_to_item32_1_nswap_guts(_al_) \ for (; i+3 < nsamps; i+=4){ \ /* load from input */ \ @@ -48,19 +49,25 @@ DECLARE_CONVERTER(fc32, 1, sc16_item32_le, 1, PRIORITY_SIMD){ size_t i = 0; - //dispatch according to alignment + // need to dispatch according to alignment for fastest conversion switch (size_t(input) & 0xf){ + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the samples + convert_fc32_1_to_item32_1_nswap_guts(_) + break; case 0x8: + // the first sample is 8-byte aligned - process it to align the remainder of the samples to 16-bytes xx_to_item32_sc16(input, output, 1, scale_factor); i++; - break; - case 0x0: + // do faster processing of the bulk of the samples now that we are 16-byte aligned convert_fc32_1_to_item32_1_nswap_guts(_) break; - default: convert_fc32_1_to_item32_1_nswap_guts(u_) + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load + convert_fc32_1_to_item32_1_nswap_guts(u_) } - //convert remainder + // convert any remaining samples xx_to_item32_sc16(input+i, output+i, nsamps-i, scale_factor); } @@ -70,6 +77,7 @@ DECLARE_CONVERTER(fc32, 1, sc16_item32_be, 1, PRIORITY_SIMD){ const __m128 scalar = _mm_set_ps1(float(scale_factor)); + // this macro converts values faster by using SSE intrinsics to convert 4 values at a time #define convert_fc32_1_to_item32_1_bswap_guts(_al_) \ for (; i+3 < nsamps; i+=4){ \ /* load from input */ \ @@ -90,18 +98,24 @@ DECLARE_CONVERTER(fc32, 1, sc16_item32_be, 1, PRIORITY_SIMD){ size_t i = 0; - //dispatch according to alignment + // need to dispatch according to alignment for fastest conversion switch (size_t(input) & 0xf){ + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the samples + convert_fc32_1_to_item32_1_bswap_guts(_) case 0x8: + // the first value is 8-byte aligned - process it and prepare the bulk of the data for fast conversion xx_to_item32_sc16(input, output, 1, scale_factor); i++; - break; - case 0x0: + // do faster processing of the remaining samples now that we are 16-byte aligned convert_fc32_1_to_item32_1_bswap_guts(_) break; - default: convert_fc32_1_to_item32_1_bswap_guts(u_) + break; + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load + convert_fc32_1_to_item32_1_bswap_guts(u_) } - //convert remainder + // convert any remaining samples xx_to_item32_sc16(input+i, output+i, nsamps-i, scale_factor); } diff --git a/host/lib/convert/sse2_sc16_to_fc32.cpp b/host/lib/convert/sse2_sc16_to_fc32.cpp index 7a9860970..0ac7f1798 100644 --- a/host/lib/convert/sse2_sc16_to_fc32.cpp +++ b/host/lib/convert/sse2_sc16_to_fc32.cpp @@ -28,6 +28,7 @@ DECLARE_CONVERTER(sc16_item32_le, 1, fc32, 1, PRIORITY_SIMD){ const __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 16)); const __m128i zeroi = _mm_setzero_si128(); + // this macro converts values faster by using SSE intrinsics to convert 4 values at a time #define convert_item32_1_to_fc32_1_nswap_guts(_al_) \ for (; i+3 < nsamps; i+=4){ \ /* load from input */ \ @@ -50,19 +51,25 @@ DECLARE_CONVERTER(sc16_item32_le, 1, fc32, 1, PRIORITY_SIMD){ size_t i = 0; - //dispatch according to alignment + // need to dispatch according to alignment for fastest conversion switch (size_t(output) & 0xf){ + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the samples + convert_item32_1_to_fc32_1_nswap_guts(_) + break; case 0x8: + // the first sample is 8-byte aligned - process it to align the remainder of the samples to 16-bytes item32_sc16_to_xx(input, output, 1, scale_factor); i++; - break; - case 0x0: + // do faster processing of the bulk of the samples now that we are 16-byte aligned convert_item32_1_to_fc32_1_nswap_guts(_) break; - default: convert_item32_1_to_fc32_1_nswap_guts(u_) + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load and store + convert_item32_1_to_fc32_1_nswap_guts(u_) } - //convert remainder + // convert any remaining samples item32_sc16_to_xx(input+i, output+i, nsamps-i, scale_factor); } @@ -73,6 +80,7 @@ DECLARE_CONVERTER(sc16_item32_be, 1, fc32, 1, PRIORITY_SIMD){ const __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 16)); const __m128i zeroi = _mm_setzero_si128(); + // this macro converts values faster by using SSE intrinsics to convert 4 values at a time #define convert_item32_1_to_fc32_1_bswap_guts(_al_) \ for (; i+3 < nsamps; i+=4){ \ /* load from input */ \ @@ -94,18 +102,24 @@ DECLARE_CONVERTER(sc16_item32_be, 1, fc32, 1, PRIORITY_SIMD){ size_t i = 0; - //dispatch according to alignment + // need to dispatch according to alignment for fastest conversion switch (size_t(output) & 0xf){ + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the samples + convert_item32_1_to_fc32_1_bswap_guts(_) + break; case 0x8: + // the first sample is 8-byte aligned - process it to align the remainder of the samples to 16-bytes item32_sc16_to_xx(input, output, 1, scale_factor); i++; - break; - case 0x0: + // do faster processing of the bulk of the samples now that we are 16-byte aligned convert_item32_1_to_fc32_1_bswap_guts(_) break; - default: convert_item32_1_to_fc32_1_bswap_guts(u_) + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned load and store + convert_item32_1_to_fc32_1_bswap_guts(u_) } - //convert remainder + // convert any remaining samples item32_sc16_to_xx(input+i, output+i, nsamps-i, scale_factor); } -- cgit v1.2.3