From 767d69622770a4bb886f527eaa2e1e2a15a71309 Mon Sep 17 00:00:00 2001 From: "Matthias P. Braendli" Date: Mon, 11 Nov 2024 09:42:47 +0100 Subject: Use ARM NEON in FormatConverter --- src/Buffer.h | 2 +- src/CicEqualizer.h | 3 --- src/FormatConverter.cpp | 43 ++++++++++++++++++++++++++++++++++--------- src/Utils.cpp | 3 +++ 4 files changed, 38 insertions(+), 13 deletions(-) diff --git a/src/Buffer.h b/src/Buffer.h index d5aa802..2c2a65e 100644 --- a/src/Buffer.h +++ b/src/Buffer.h @@ -43,7 +43,7 @@ typedef std::complex complexfix; typedef std::complex complexfix_wide; /* Buffer is a container for a byte array, which is memory-aligned - * to 32 bytes for SSE performance. + * to 32 bytes for SIMD performance. * * The allocation/freeing of the data is handled internally. */ diff --git a/src/CicEqualizer.h b/src/CicEqualizer.h index 70c3ae9..4510d0c 100644 --- a/src/CicEqualizer.h +++ b/src/CicEqualizer.h @@ -29,9 +29,6 @@ #include #include -#ifdef __SSE__ -# include -#endif class CicEqualizer : public ModCodec { diff --git a/src/FormatConverter.cpp b/src/FormatConverter.cpp index 0821191..94dfa2c 100644 --- a/src/FormatConverter.cpp +++ b/src/FormatConverter.cpp @@ -28,11 +28,15 @@ #include "FormatConverter.h" #include "PcDebug.h" +#include "Log.h" -#include -#include #include +#include #include +#include +#if defined(__ARM_NEON) +#include +#endif FormatConverter::FormatConverter(bool input_is_complexfix_wide, const std::string& format_out) : ModCodec(), @@ -42,9 +46,16 @@ FormatConverter::FormatConverter(bool input_is_complexfix_wide, const std::strin FormatConverter::~FormatConverter() { - etiLog.level(debug) << "FormatConverter: " - << m_num_clipped_samples.load() << - " clipped samples"; + if ( +#if defined(__ARM_NEON) + not m_input_complexfix_wide +#else + true +#endif + ) { + etiLog.level(debug) << "FormatConverter: " << + m_num_clipped_samples.load() << " clipped"; + } } @@ -56,16 +67,29 @@ int FormatConverter::process(Buffer* const dataIn, Buffer* dataOut) size_t num_clipped_samples = 0; - if (m_input_complexfix_wide) { size_t sizeIn = dataIn->getLength() / sizeof(int32_t); - int32_t* in = reinterpret_cast(dataIn->getData()); if (m_format_out == "s16") { dataOut->setLength(sizeIn * sizeof(int16_t)); + const int32_t *in = reinterpret_cast(dataIn->getData()); int16_t* out = reinterpret_cast(dataOut->getData()); + constexpr int shift = 7; + +#if defined(__ARM_NEON) + if (sizeIn % 4 != 0) { + throw std::logic_error("Unexpected length not multiple of 4"); + } + + for (size_t i = 0; i < sizeIn; i += 4) { + int32x4_t input_vec = vld1q_s32(&in[i]); + // Apply shift right, saturate on conversion to int16_t + int16x4_t output_vec = vqshrn_n_s32(input_vec, shift); + vst1_s16(&out[i], output_vec); + } +#else for (size_t i = 0; i < sizeIn; i++) { - const int32_t val = in[i] >> 7; + const int32_t val = in[i] >> shift; if (val < INT16_MIN) { out[i] = INT16_MIN; num_clipped_samples++; @@ -78,6 +102,7 @@ int FormatConverter::process(Buffer* const dataIn, Buffer* dataOut) out[i] = val; } } +#endif } else { throw std::runtime_error("FormatConverter: Invalid fix format " + m_format_out); @@ -85,7 +110,7 @@ int FormatConverter::process(Buffer* const dataIn, Buffer* dataOut) } else { size_t sizeIn = dataIn->getLength() / sizeof(float); - float* in = reinterpret_cast(dataIn->getData()); + const float* in = reinterpret_cast(dataIn->getData()); if (m_format_out == "s16") { dataOut->setLength(sizeIn * sizeof(int16_t)); diff --git a/src/Utils.cpp b/src/Utils.cpp index 0065bc1..f54122c 100644 --- a/src/Utils.cpp +++ b/src/Utils.cpp @@ -62,6 +62,9 @@ static void printHeader() #endif #if defined(__SSE__) "SSE " << +#endif +#if defined(__ARM_NEON) + "NEON " << #endif "\n"; } -- cgit v1.2.3