aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPhilip Balister <philip@opensdr.com>2016-01-28 17:00:03 +0100
committerMartin Braun <martin.braun@ettus.com>2016-03-23 10:36:42 -0700
commit47aa63a871ab37f3a4c475ed905de178d1a55639 (patch)
treee07192ec2e83b0a2fb6c2518f985912d9e46b1d0
parent1a51d7e050d70fae1c4b750892102a5540a0587e (diff)
downloaduhd-47aa63a871ab37f3a4c475ed905de178d1a55639.tar.gz
uhd-47aa63a871ab37f3a4c475ed905de178d1a55639.tar.bz2
uhd-47aa63a871ab37f3a4c475ed905de178d1a55639.zip
Unroll the loops in the NEON float to/from integer converters.
Unrolling the RX loop showed marked improvement with perf; the TX path was only slightly better. Checked signal correctness with shinysdr for the received (RX) signal, and by feeding tx_waveforms into a spectrum analyzer for TX. Signed-off-by: Philip Balister <philip@opensdr.com>
-rw-r--r--host/lib/convert/convert_with_neon.cpp48
1 file changed, 46 insertions, 2 deletions
diff --git a/host/lib/convert/convert_with_neon.cpp b/host/lib/convert/convert_with_neon.cpp
index f1c7773ec..a172afb54 100644
--- a/host/lib/convert/convert_with_neon.cpp
+++ b/host/lib/convert/convert_with_neon.cpp
@@ -34,13 +34,35 @@ DECLARE_CONVERTER(fc32, 1, sc16_item32_le, 1, PRIORITY_SIMD){
size_t i;
float32x4_t Q0 = vdupq_n_f32(float(scale_factor));
- for (i=0; i < (nsamps & ~0x03); i+=2) {
+ for (i=0; i < (nsamps & ~0x0f); i+=8) {
float32x4_t Q1 = vld1q_f32(reinterpret_cast<const float *>(&input[i]));
+ float32x4_t Q4 = vld1q_f32(reinterpret_cast<const float *>(&input[i+2]));
+ float32x4_t Q7 = vld1q_f32(reinterpret_cast<const float *>(&input[i+4]));
+ float32x4_t Q10 = vld1q_f32(reinterpret_cast<const float *>(&input[i+6]));
+
float32x4_t Q2 = vmulq_f32(Q1, Q0);
int32x4_t Q3 = vcvtq_s32_f32(Q2);
int16x4_t D8 = vmovn_s32(Q3);
int16x4_t D9 = vrev32_s16(D8);
vst1_s16((reinterpret_cast<int16_t *>(&output[i])), D9);
+
+ float32x4_t Q5 = vmulq_f32(Q4, Q0);
+ int32x4_t Q6 = vcvtq_s32_f32(Q5);
+ int16x4_t D10 = vmovn_s32(Q6);
+ int16x4_t D11 = vrev32_s16(D10);
+ vst1_s16((reinterpret_cast<int16_t *>(&output[i+2])), D11);
+
+ float32x4_t Q8 = vmulq_f32(Q7, Q0);
+ int32x4_t Q9 = vcvtq_s32_f32(Q8);
+ int16x4_t D12 = vmovn_s32(Q9);
+ int16x4_t D13 = vrev32_s16(D12);
+ vst1_s16((reinterpret_cast<int16_t *>(&output[i+4])), D13);
+
+ float32x4_t Q11 = vmulq_f32(Q10, Q0);
+ int32x4_t Q13 = vcvtq_s32_f32(Q11);
+ int16x4_t D14 = vmovn_s32(Q13);
+ int16x4_t D15 = vrev32_s16(D14);
+ vst1_s16((reinterpret_cast<int16_t *>(&output[i+6])), D15);
}
xx_to_item32_sc16<uhd::htowx>(input+i, output+i, nsamps-i, scale_factor);
@@ -53,13 +75,35 @@ DECLARE_CONVERTER(sc16_item32_le, 1, fc32, 1, PRIORITY_SIMD){
size_t i;
float32x4_t Q1 = vdupq_n_f32(float(scale_factor));
- for (i=0; i < (nsamps & ~0x03); i+=2) {
+ for (i=0; i < (nsamps & ~0xf); i+=8) {
int16x4_t D0 = vld1_s16(reinterpret_cast<const int16_t *>(&input[i]));
+ int16x4_t D2 = vld1_s16(reinterpret_cast<const int16_t *>(&input[i+2]));
+ int16x4_t D4 = vld1_s16(reinterpret_cast<const int16_t *>(&input[i+4]));
+ int16x4_t D6 = vld1_s16(reinterpret_cast<const int16_t *>(&input[i+6]));
+
int16x4_t D1 = vrev32_s16(D0);
int32x4_t Q2 = vmovl_s16(D1);
float32x4_t Q3 = vcvtq_f32_s32(Q2);
float32x4_t Q4 = vmulq_f32(Q3, Q1);
vst1q_f32((reinterpret_cast<float *>(&output[i])), Q4);
+
+ int16x4_t D3 = vrev32_s16(D2);
+ int32x4_t Q5 = vmovl_s16(D3);
+ float32x4_t Q6 = vcvtq_f32_s32(Q5);
+ float32x4_t Q7 = vmulq_f32(Q6, Q1);
+ vst1q_f32((reinterpret_cast<float *>(&output[i+2])), Q7);
+
+ int16x4_t D5 = vrev32_s16(D4);
+ int32x4_t Q8 = vmovl_s16(D5);
+ float32x4_t Q9 = vcvtq_f32_s32(Q8);
+ float32x4_t Q10 = vmulq_f32(Q9, Q1);
+ vst1q_f32((reinterpret_cast<float *>(&output[i+4])), Q10);
+
+ int16x4_t D7 = vrev32_s16(D6);
+ int32x4_t Q11 = vmovl_s16(D7);
+ float32x4_t Q12 = vcvtq_f32_s32(Q11);
+ float32x4_t Q13 = vmulq_f32(Q12, Q1);
+ vst1q_f32((reinterpret_cast<float *>(&output[i+6])), Q13);
}
item32_sc16_to_xx<uhd::htowx>(input+i, output+i, nsamps-i, scale_factor);