From 1a51d7e050d70fae1c4b750892102a5540a0587e Mon Sep 17 00:00:00 2001 From: Nicholas Corgan Date: Tue, 22 Mar 2016 09:16:20 -0700 Subject: Added missing stdint.h include --- host/lib/usrp/common/max287x.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'host/lib') diff --git a/host/lib/usrp/common/max287x.hpp b/host/lib/usrp/common/max287x.hpp index 644ec726e..596d992e0 100644 --- a/host/lib/usrp/common/max287x.hpp +++ b/host/lib/usrp/common/max287x.hpp @@ -1,5 +1,5 @@ // -// Copyright 2015 Ettus Research LLC +// Copyright 2015-2016 Ettus Research LLC // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by @@ -27,6 +27,7 @@ #include #include #include +#include #include #include "max2870_regs.hpp" #include "max2871_regs.hpp" -- cgit v1.2.3 From 47aa63a871ab37f3a4c475ed905de178d1a55639 Mon Sep 17 00:00:00 2001 From: Philip Balister Date: Thu, 28 Jan 2016 17:00:03 +0100 Subject: Unroll the loops in the NEON float to/from integer converters. Unrolling the RX loop showed marked improvement with perf. The TX path was only slightly better. Checked signal correctness with shinysdr to verify received signal and tx_waveforms into a spectrum analyzer for TX. Signed-off-by: Philip Balister --- host/lib/convert/convert_with_neon.cpp | 48 ++++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) (limited to 'host/lib') diff --git a/host/lib/convert/convert_with_neon.cpp b/host/lib/convert/convert_with_neon.cpp index f1c7773ec..a172afb54 100644 --- a/host/lib/convert/convert_with_neon.cpp +++ b/host/lib/convert/convert_with_neon.cpp @@ -34,13 +34,35 @@ DECLARE_CONVERTER(fc32, 1, sc16_item32_le, 1, PRIORITY_SIMD){ size_t i; float32x4_t Q0 = vdupq_n_f32(float(scale_factor)); - for (i=0; i < (nsamps & ~0x03); i+=2) { + for (i=0; i < (nsamps & ~0x0f); i+=8) { float32x4_t Q1 = vld1q_f32(reinterpret_cast(&input[i])); + float32x4_t Q4 = vld1q_f32(reinterpret_cast(&input[i+2])); + float32x4_t Q7 = vld1q_f32(reinterpret_cast(&input[i+4])); + float32x4_t Q10 = vld1q_f32(reinterpret_cast(&input[i+6])); + float32x4_t Q2 = vmulq_f32(Q1, Q0); int32x4_t Q3 = vcvtq_s32_f32(Q2); int16x4_t D8 = vmovn_s32(Q3); int16x4_t D9 = vrev32_s16(D8); vst1_s16((reinterpret_cast(&output[i])), D9); + + float32x4_t Q5 = vmulq_f32(Q4, Q0); + int32x4_t Q6 = vcvtq_s32_f32(Q5); + int16x4_t D10 = vmovn_s32(Q6); + int16x4_t D11 = vrev32_s16(D10); + vst1_s16((reinterpret_cast(&output[i+2])), D11); + + float32x4_t Q8 = vmulq_f32(Q7, Q0); + int32x4_t Q9 = vcvtq_s32_f32(Q8); + int16x4_t D12 = vmovn_s32(Q9); + int16x4_t D13 = vrev32_s16(D12); + vst1_s16((reinterpret_cast(&output[i+4])), D13); + + float32x4_t Q11 = vmulq_f32(Q10, Q0); + int32x4_t Q13 = vcvtq_s32_f32(Q11); + int16x4_t D14 = vmovn_s32(Q13); + int16x4_t D15 = vrev32_s16(D14); + vst1_s16((reinterpret_cast(&output[i+6])), D15); } xx_to_item32_sc16(input+i, output+i, nsamps-i, scale_factor); @@ -53,13 +75,35 @@ DECLARE_CONVERTER(sc16_item32_le, 1, fc32, 1, PRIORITY_SIMD){ size_t i; float32x4_t Q1 = vdupq_n_f32(float(scale_factor)); - for (i=0; i < (nsamps & ~0x03); i+=2) { + for (i=0; i < (nsamps & ~0xf); i+=8) { int16x4_t D0 = vld1_s16(reinterpret_cast(&input[i])); + int16x4_t D2 = vld1_s16(reinterpret_cast(&input[i+2])); + int16x4_t D4 = vld1_s16(reinterpret_cast(&input[i+4])); + int16x4_t D6 = vld1_s16(reinterpret_cast(&input[i+6])); + int16x4_t D1 = vrev32_s16(D0); int32x4_t Q2 = vmovl_s16(D1); float32x4_t Q3 = vcvtq_f32_s32(Q2); float32x4_t Q4 = vmulq_f32(Q3, Q1); vst1q_f32((reinterpret_cast(&output[i])), Q4); + + int16x4_t D3 = vrev32_s16(D2); + int32x4_t Q5 = vmovl_s16(D3); + float32x4_t Q6 = vcvtq_f32_s32(Q5); + float32x4_t Q7 = vmulq_f32(Q6, Q1); + vst1q_f32((reinterpret_cast(&output[i+2])), Q7); + + int16x4_t D5 = vrev32_s16(D4); + int32x4_t Q8 = vmovl_s16(D5); + float32x4_t Q9 = vcvtq_f32_s32(Q8); + float32x4_t Q10 = vmulq_f32(Q9, Q1); + vst1q_f32((reinterpret_cast(&output[i+4])), Q10); + + int16x4_t D7 = vrev32_s16(D6); + int32x4_t Q11 = vmovl_s16(D7); + float32x4_t Q12 = vcvtq_f32_s32(Q11); + float32x4_t Q13 = vmulq_f32(Q12, Q1); + vst1q_f32((reinterpret_cast(&output[i+6])), Q13); } item32_sc16_to_xx(input+i, output+i, nsamps-i, scale_factor); -- cgit v1.2.3