68 files changed, 5370 insertions, 747 deletions
diff --git a/Makefile.am b/Makefile.am
index d29b530..87d553a 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -35,10 +35,11 @@ endif
 
 bin_PROGRAMS = odr-dabmod
 
-odr_dabmod_CFLAGS   = -Wall -Isrc -Ilib \
-					  $(GITVERSION_FLAGS)
-odr_dabmod_CXXFLAGS = -Wall -Isrc -Ilib \
-					  $(GITVERSION_FLAGS) $(BOOST_CPPFLAGS)
+KISS_FLAGS=-DFIXED_POINT=16
+odr_dabmod_CFLAGS   = -Wall -Isrc -Ilib -Ikiss \
+					  $(GITVERSION_FLAGS) $(KISS_FLAGS)
+odr_dabmod_CXXFLAGS = -Wall -Isrc -Ilib -Ikiss \
+					  $(GITVERSION_FLAGS) $(BOOST_CPPFLAGS) $(KISS_FLAGS)
 odr_dabmod_LDADD    =  $(BOOST_LDFLAGS) $(BOOST_THREAD_LIB) $(UHD_LIBS) $(LIMESDR_LIBS) $(ADDITIONAL_UHD_LIBS)
 odr_dabmod_SOURCES  = src/DabMod.cpp \
 					  src/PcDebug.h \
@@ -175,7 +176,17 @@ odr_dabmod_SOURCES  = src/DabMod.cpp \
 					  src/PAPRStats.cpp \
 					  src/PAPRStats.h \
 					  src/TII.cpp \
-					  src/TII.h
+					  src/TII.h \
+					  kiss/kfc.h \
+					  kiss/kfc.c \
+					  kiss/kiss_fft.c \
+					  kiss/kiss_fft.h \
+					  kiss/kiss_fftnd.c \
+					  kiss/kiss_fftnd.h \
+					  kiss/kiss_fftndr.c \
+					  kiss/kiss_fftndr.h \
+					  kiss/kiss_fftr.c \
+					  kiss/kiss_fftr.h
 
 
 man_MANS = man/odr-dabmod.1
diff --git a/doc/example.ini b/doc/example.ini
index eda50a5..0d0f8e3 100644
--- a/doc/example.ini
+++ b/doc/example.ini
@@ -103,6 +103,8 @@ gainmode=var
 ; If not defined, use Transmission Mode 1
 ;mode=1
 
+fixed_point=1
+
 ; The digital gain is a value that is multiplied to each sample. It is used
 ; to tune the chain to make sure that no non-linearities appear up to the
 ; USRP daughterboard programmable gain amplifier (PGA).
diff --git a/fpm/LICENSE b/fpm/LICENSE
new file mode 100644
index 0000000..bb86b71
--- /dev/null
+++ b/fpm/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2019 Mike Lankamp
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/fpm/README.md b/fpm/README.md
new file mode 100644
index 0000000..38ee444
--- /dev/null
+++ b/fpm/README.md
@@ -0,0 +1,48 @@
+# fpm
+A C++ header-only fixed-point math library. "fpm" stands for "fixed-point math".
+
+It is designed to serve as a drop-in replacement for floating-point types and aims to provide as much of the standard library's functionality as possible with exclusively integers. `fpm` requires C++11 or higher.
+
+[![Build Status](https://travis-ci.org/MikeLankamp/fpm.svg?branch=master)](https://travis-ci.org/MikeLankamp/fpm)
+[![Build status](https://ci.appveyor.com/api/projects/status/0velpwqk38spu412?svg=true)](https://ci.appveyor.com/project/MikeLankamp/fpm)
+
+`fpm` is designed to guard against accidental conversion to and from floats and supports many of the standard C++ maths functions, including trigonometry, power and logarithmic functions, with performance and accuracy generally comparable to alternative libraries.
+
+## Why use fixed-point math?
+There are several reasons why you cannot, or choose not to, use floating-point math but still want a similar type:
+* Your target platform lacks an FPU, does not support floating-point operations or its floating-point operations are
+  considerably slower than fixed-point integer operations.
+* You require deterministic calculations.
+
+If any of these reasons apply to you, and your problem domain has a clearly defined range and required resolution,
+then fixed-point numbers might be a solution for you.
+
+## Quick Start
+To use `fpm`, include its header `<fpm/fixed.hpp>` and use the `fpm::fixed_16_16`, `fpm::fixed_24_8` or `fpm::fixed_8_24`
+types as if they were native floating-point types:
+```c++
+#include <fpm/fixed.hpp>  // For fpm::fixed_16_16
+#include <fpm/math.hpp>   // For fpm::cos
+#include <fpm/ios.hpp>    // For fpm::operator<<
+#include <iostream>       // For std::cin, std::cout
+
+int main() {
+    std::cout << "Please input a number: ";
+    fpm::fixed_16_16 x;
+    std::cin >> x;
+    std::cout << "The cosine of " << x << " radians is: " << cos(x) << std::endl;
+    return 0;
+}
+```
+
+To use the fixed-point equivalents of the `<math.h>` functions such as `sqrt`, `sin` and `log`, include the header `<fpm/math.hpp>`.
+To stream fixed-point values to or from streams, include the header `<fpm/ios.hpp>`.
+
+## Documentation
+Please refer to the [documentation](docs/index.md) for detailed information on how to use `fpm`, or skip straight to the [performance](docs/performance.md) or [accuracy](docs/accuracy.md) results.
+
+## Contributions
+This library is a work-in-progress. We welcome any contributions that improve the functional coverage or the performance or accuracy of the mathematical functions.
+
+## License
+See the [LICENSE](LICENSE) file.
diff --git a/fpm/fixed.hpp b/fpm/fixed.hpp
new file mode 100644
index 0000000..e2e71bf
--- /dev/null
+++ b/fpm/fixed.hpp
@@ -0,0 +1,490 @@
+#ifndef FPM_FIXED_HPP
+#define FPM_FIXED_HPP
+
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <functional>
+#include <limits>
+#include <type_traits>
+
+namespace fpm
+{
+
+//! Fixed-point number type, stored in an integer scaled by 2**FractionBits.
+//! \tparam BaseType         the base integer type used to store the fixed-point number. This can be a signed or unsigned type.
+//! \tparam IntermediateType the integer type used for intermediate results; must be larger than BaseType and of the same signedness.
+//! \tparam FractionBits     the number of bits of the BaseType used to store the fraction (at least 1, leaving at least one integral bit)
+//! \tparam EnableRounding   round the LSB instead of truncating it in multiplication, division, and type conversion
+template <typename BaseType, typename IntermediateType, unsigned int FractionBits, bool EnableRounding = true>
+class fixed
+{
+    static_assert(std::is_integral<BaseType>::value, "BaseType must be an integral type");
+    static_assert(FractionBits > 0, "FractionBits must be greater than zero");
+    static_assert(FractionBits <= sizeof(BaseType) * 8 - 1, "BaseType must at least be able to contain entire fraction, with space for at least one integral bit");
+    static_assert(sizeof(IntermediateType) > sizeof(BaseType), "IntermediateType must be larger than BaseType");
+    static_assert(std::is_signed<IntermediateType>::value == std::is_signed<BaseType>::value, "IntermediateType must have same signedness as BaseType");
+
+    // Although this value fits in the BaseType in terms of bits, if there's only one integral bit, this value
+    // is incorrect (flips from positive to negative), so we must extend the size to IntermediateType.
+    static constexpr IntermediateType FRACTION_MULT = IntermediateType(1) << FractionBits;
+    // Private tag-dispatched constructor: stores val as the raw value, without any scaling.
+    struct raw_construct_tag {};
+    constexpr inline fixed(BaseType val, raw_construct_tag) noexcept : m_value(val) {}
+
+public:
+    inline fixed() noexcept = default;  // m_value has no initializer: default-initialization leaves it indeterminate
+
+    // Converts an integral number to the fixed-point type.
+    // Like static_cast, this truncates bits that don't fit.
+    template <typename T, typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
+    constexpr inline explicit fixed(T val) noexcept
+        : m_value(static_cast<BaseType>(val * FRACTION_MULT))
+    {}
+
+    // Converts a floating-point number to the fixed-point type.
+    // With EnableRounding the scaled value is rounded half away from zero; otherwise it is truncated like static_cast.
+    template <typename T, typename std::enable_if<std::is_floating_point<T>::value>::type* = nullptr>
+    constexpr inline explicit fixed(T val) noexcept
+        : m_value(static_cast<BaseType>((EnableRounding) ?
+		       (val >= 0.0) ? (val * FRACTION_MULT + T{0.5}) : (val * FRACTION_MULT - T{0.5})
+		      : (val * FRACTION_MULT)))
+    {}
+
+    // Constructs from another fixed-point type with possibly different underlying representation.
+    // Delegates to from_fixed_point<F>, so dropped fraction bits are rounded when EnableRounding is set.
+    template <typename B, typename I, unsigned int F, bool R>
+    constexpr inline explicit fixed(fixed<B,I,F,R> val) noexcept
+        : m_value(from_fixed_point<F>(val.raw_value()).raw_value())
+    {}
+
+    // Explicit conversion to a floating-point type
+    template <typename T, typename std::enable_if<std::is_floating_point<T>::value>::type* = nullptr>
+    constexpr inline explicit operator T() const noexcept
+    {
+        return static_cast<T>(m_value) / FRACTION_MULT;
+    }
+
+    // Explicit conversion to an integral type
+    template <typename T, typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
+    constexpr inline explicit operator T() const noexcept
+    {
+        return static_cast<T>(m_value / FRACTION_MULT);
+    }
+
+    // Returns the raw underlying value of this type.
+    // Do not use this unless you know what you're doing.
+    constexpr inline BaseType raw_value() const noexcept
+    {
+        return m_value;
+    }
+
+    //! Constructs a fixed-point number from the raw value of a fixed-point number with more fraction bits.
+    //! \tparam NumFractionBits the number of bits used by the fraction in \a value.
+    //! \param value the raw integer representation of the source fixed-point number
+    template <unsigned int NumFractionBits, typename T, typename std::enable_if<(NumFractionBits > FractionBits)>::type* = nullptr>
+    static constexpr inline fixed from_fixed_point(T value) noexcept
+    {
+	// To correctly round the last bit in the result, we need one more bit of information.
+	// We get it by dividing by half the divisor and adding the LSB of that quotient to the truncated result.
+	return (EnableRounding) ? fixed(static_cast<BaseType>(
+             value / (T(1) << (NumFractionBits - FractionBits)) +
+            (value / (T(1) << (NumFractionBits - FractionBits - 1)) % 2)),
+	    raw_construct_tag{}) :
+	    fixed(static_cast<BaseType>(value / (T(1) << (NumFractionBits - FractionBits))),
+	     raw_construct_tag{});
+    }
+    // Overload for sources with at most FractionBits fraction bits: the value is only scaled up, so nothing is rounded.
+    template <unsigned int NumFractionBits, typename T, typename std::enable_if<(NumFractionBits <= FractionBits)>::type* = nullptr>
+    static constexpr inline fixed from_fixed_point(T value) noexcept
+    {
+        return fixed(static_cast<BaseType>(
+            value * (T(1) << (FractionBits - NumFractionBits))),
+            raw_construct_tag{});
+    }
+
+    // Constructs a fixed-point number from its raw underlying value.
+    // Do not use this unless you know what you're doing.
+    static constexpr inline fixed from_raw_value(BaseType value) noexcept
+    {
+        return fixed(value, raw_construct_tag{});
+    }
+
+    //
+    // Constants, converted from integer literals that carry 60 to 62 fraction bits
+    // (half_pi and two_pi reuse the pi literal, read with one more / one fewer fraction bit)
+    static constexpr fixed e() { return from_fixed_point<61>(6267931151224907085ll); }
+    static constexpr fixed pi() { return from_fixed_point<61>(7244019458077122842ll); }
+    static constexpr fixed half_pi() { return from_fixed_point<62>(7244019458077122842ll); }
+    static constexpr fixed two_pi() { return from_fixed_point<60>(7244019458077122842ll); }
+
+    //
+    // Arithmetic member operators
+    //
+
+    constexpr inline fixed operator-() const noexcept
+    {
+        return fixed::from_raw_value(-m_value);
+    }
+
+    inline fixed& operator+=(const fixed& y) noexcept
+    {
+        m_value += y.m_value;
+        return *this;
+    }
+    // Integer operand: scaled by 2**FractionBits before being added.
+    template <typename I, typename std::enable_if<std::is_integral<I>::value>::type* = nullptr>
+    inline fixed& operator+=(I y) noexcept
+    {
+        m_value += y * FRACTION_MULT;
+        return *this;
+    }
+
+    inline fixed& operator-=(const fixed& y) noexcept
+    {
+        m_value -= y.m_value;
+        return *this;
+    }
+    // Integer operand: scaled by 2**FractionBits before being subtracted.
+    template <typename I, typename std::enable_if<std::is_integral<I>::value>::type* = nullptr>
+    inline fixed& operator-=(I y) noexcept
+    {
+        m_value -= y * FRACTION_MULT;
+        return *this;
+    }
+
+    inline fixed& operator*=(const fixed& y) noexcept
+    {
+	if (EnableRounding){
+	    // Normal fixed-point multiplication is: x * y / 2**FractionBits.
+	    // To correctly round the last bit in the result, we need one more bit of information.
+	    // We do this by dividing by half of 2**FractionBits and adding the LSB of that to the halved result.
+	    auto value = (static_cast<IntermediateType>(m_value) * y.m_value) / (FRACTION_MULT / 2);
+	    m_value = static_cast<BaseType>((value / 2) + (value % 2));
+	} else {
+	    auto value = (static_cast<IntermediateType>(m_value) * y.m_value) / FRACTION_MULT;
+	    m_value = static_cast<BaseType>(value);
+	}
+	return *this;
+    }
+    // Integer operand: the raw value is multiplied directly, no rescaling is needed.
+    template <typename I, typename std::enable_if<std::is_integral<I>::value>::type* = nullptr>
+    inline fixed& operator*=(I y) noexcept
+    {
+        m_value *= y;
+        return *this;
+    }
+
+    inline fixed& operator/=(const fixed& y) noexcept
+    {
+        assert(y.m_value != 0);
+	if (EnableRounding){
+	    // Normal fixed-point division is: x * 2**FractionBits / y.
+	    // To correctly round the last bit in the result, we need one more bit of information.
+	    // We do this by multiplying by two before dividing and adding the LSB to the real result.
+	    auto value = (static_cast<IntermediateType>(m_value) * FRACTION_MULT * 2) / y.m_value;
+	    m_value = static_cast<BaseType>((value / 2) + (value % 2));
+	} else {
+	    auto value = (static_cast<IntermediateType>(m_value) * FRACTION_MULT) / y.m_value;
+	    m_value = static_cast<BaseType>(value);
+	}
+        return *this;
+    }
+    // Integer operand: the raw value is divided directly, no rescaling is needed (y is not checked for zero here).
+    template <typename I, typename std::enable_if<std::is_integral<I>::value>::type* = nullptr>
+    inline fixed& operator/=(I y) noexcept
+    {
+        m_value /= y;
+        return *this;
+    }
+
+private:
+    BaseType m_value;
+};
+
+//
+// Convenience aliases, named fixed_<integral bits incl. sign>_<fraction bits>: 32-bit storage, 64-bit intermediates, rounding on
+//
+
+using fixed_16_16 = fixed<std::int32_t, std::int64_t, 16>;
+using fixed_24_8 = fixed<std::int32_t, std::int64_t, 8>;
+using fixed_8_24 = fixed<std::int32_t, std::int64_t, 24>;
+
+//
+// Addition: fixed + fixed, fixed + integer and integer + fixed; each copies the fixed operand and applies operator+=
+//
+
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline fixed<B, I, F, R> operator+(const fixed<B, I, F, R>& x, const fixed<B, I, F, R>& y) noexcept
+{
+    return fixed<B, I, F, R>(x) += y;
+}
+
+template <typename B, typename I, unsigned int F, bool R, typename T, typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
+constexpr inline fixed<B, I, F, R> operator+(const fixed<B, I, F, R>& x, T y) noexcept
+{
+    return fixed<B, I, F, R>(x) += y;
+}
+
+template <typename B, typename I, unsigned int F, bool R, typename T, typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
+constexpr inline fixed<B, I, F, R> operator+(T x, const fixed<B, I, F, R>& y) noexcept
+{
+    return fixed<B, I, F, R>(y) += x;
+}
+
+//
+// Subtraction: applies operator-= to a copy of the left operand; an integer left operand is first converted to fixed
+//
+
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline fixed<B, I, F, R> operator-(const fixed<B, I, F, R>& x, const fixed<B, I, F, R>& y) noexcept
+{
+    return fixed<B, I, F, R>(x) -= y;
+}
+
+template <typename B, typename I, unsigned int F, bool R, typename T, typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
+constexpr inline fixed<B, I, F, R> operator-(const fixed<B, I, F, R>& x, T y) noexcept
+{
+    return fixed<B, I, F, R>(x) -= y;
+}
+
+template <typename B, typename I, unsigned int F, bool R, typename T, typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
+constexpr inline fixed<B, I, F, R> operator-(T x, const fixed<B, I, F, R>& y) noexcept
+{
+    return fixed<B, I, F, R>(x) -= y;
+}
+
+//
+// Multiplication: applies operator*= to a copy of the fixed operand; integer * fixed swaps the operands to do so
+//
+
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline fixed<B, I, F, R> operator*(const fixed<B, I, F, R>& x, const fixed<B, I, F, R>& y) noexcept
+{
+    return fixed<B, I, F, R>(x) *= y;
+}
+
+template <typename B, typename I, unsigned int F, bool R, typename T, typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
+constexpr inline fixed<B, I, F, R> operator*(const fixed<B, I, F, R>& x, T y) noexcept
+{
+    return fixed<B, I, F, R>(x) *= y;
+}
+
+template <typename B, typename I, unsigned int F, bool R, typename T, typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
+constexpr inline fixed<B, I, F, R> operator*(T x, const fixed<B, I, F, R>& y) noexcept
+{
+    return fixed<B, I, F, R>(y) *= x;
+}
+
+//
+// Division: applies operator/= to a copy of the left operand; an integer left operand is first converted to fixed
+//
+
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline fixed<B, I, F, R> operator/(const fixed<B, I, F, R>& x, const fixed<B, I, F, R>& y) noexcept
+{
+    return fixed<B, I, F, R>(x) /= y;
+}
+
+template <typename B, typename I, unsigned int F, typename T, bool R, typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
+constexpr inline fixed<B, I, F, R> operator/(const fixed<B, I, F, R>& x, T y) noexcept
+{
+    return fixed<B, I, F, R>(x) /= y;
+}
+
+template <typename B, typename I, unsigned int F, typename T, bool R, typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
+constexpr inline fixed<B, I, F, R> operator/(T x, const fixed<B, I, F, R>& y) noexcept
+{
+    return fixed<B, I, F, R>(x) /= y;
+}
+
+//
+// Comparison operators: compare the raw underlying values; both operands must be of the same fixed type
+//
+
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline bool operator==(const fixed<B, I, F, R>& x, const fixed<B, I, F, R>& y) noexcept
+{
+    return x.raw_value() == y.raw_value();
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline bool operator!=(const fixed<B, I, F, R>& x, const fixed<B, I, F, R>& y) noexcept
+{
+    return x.raw_value() != y.raw_value();
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline bool operator<(const fixed<B, I, F, R>& x, const fixed<B, I, F, R>& y) noexcept
+{
+    return x.raw_value() < y.raw_value();
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline bool operator>(const fixed<B, I, F, R>& x, const fixed<B, I, F, R>& y) noexcept
+{
+    return x.raw_value() > y.raw_value();
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline bool operator<=(const fixed<B, I, F, R>& x, const fixed<B, I, F, R>& y) noexcept
+{
+    return x.raw_value() <= y.raw_value();
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline bool operator>=(const fixed<B, I, F, R>& x, const fixed<B, I, F, R>& y) noexcept
+{
+    return x.raw_value() >= y.raw_value();
+}
+
+namespace detail
+{
+// Number of base-10 digits required to fully represent a number of bits
+static constexpr int max_digits10(int bits)
+{
+    // 8.24 fixed-point equivalent of (int)ceil(bits * std::log10(2)): 5050445 ~= log10(2) * 2**24, and adding 2**24 - 1 rounds up
+    using T = long long;
+    return static_cast<int>((T{bits} * 5050445 + (T{1} << 24) - 1) >> 24);
+}
+
+// Number of base-10 digits that can be fully represented by a number of bits
+static constexpr int digits10(int bits)
+{
+    // 8.24 fixed-point equivalent of (int)(bits * std::log10(2)): 5050445 ~= log10(2) * 2**24, the shift discards the fraction
+    using T = long long;
+    return static_cast<int>((T{bits} * 5050445) >> 24);
+}
+
+} // namespace detail
+} // namespace fpm
+
+// Specializations for customization points
+namespace std
+{
+
+template <typename B, typename I, unsigned int F, bool R>
+struct hash<fpm::fixed<B,I,F,R>>
+{
+    using argument_type = fpm::fixed<B, I, F, R>;
+    using result_type = std::size_t;
+    // Hashes the raw underlying integer with std::hash<B>; noexcept exactly when that hasher is.
+    result_type operator()(argument_type arg) const noexcept(noexcept(std::declval<std::hash<B>>()(arg.raw_value()))) {
+        return m_hash(arg.raw_value());
+    }
+
+private:
+    std::hash<B> m_hash;
+};
+
+template <typename B, typename I, unsigned int F, bool R>
+struct numeric_limits<fpm::fixed<B,I,F,R>>
+{
+    static constexpr bool is_specialized = true;
+    static constexpr bool is_signed = std::numeric_limits<B>::is_signed;
+    static constexpr bool is_integer = false;
+    static constexpr bool is_exact = true;
+    static constexpr bool has_infinity = false;
+    static constexpr bool has_quiet_NaN = false;
+    static constexpr bool has_signaling_NaN = false;
+    static constexpr std::float_denorm_style has_denorm = std::denorm_absent;
+    static constexpr bool has_denorm_loss = false;
+    static constexpr std::float_round_style round_style = std::round_to_nearest;
+    static constexpr bool is_iec_559 = false;
+    static constexpr bool is_bounded = true;
+    static constexpr bool is_modulo = std::numeric_limits<B>::is_modulo;
+    static constexpr int digits = std::numeric_limits<B>::digits;
+
+    // Any number with `digits10` significant base-10 digits (that fits in
+    // the range of the type) is guaranteed to be convertible from text and
+    // back without change. Worst case, this is 0.000...001, so we can only
+    // guarantee this case. Nothing more.
+    static constexpr int digits10 = 1;
+
+    // This is equal to max_digits10 for the integer and fractional part together.
+    static constexpr int max_digits10 =
+        fpm::detail::max_digits10(std::numeric_limits<B>::digits - F) + fpm::detail::max_digits10(F);
+
+    static constexpr int radix = 2;
+    static constexpr int min_exponent = 1 - F;
+    static constexpr int min_exponent10 = -fpm::detail::digits10(F);
+    static constexpr int max_exponent = std::numeric_limits<B>::digits - F;
+    static constexpr int max_exponent10 = fpm::detail::digits10(std::numeric_limits<B>::digits - F);
+    static constexpr bool traps = true;
+    static constexpr bool tinyness_before = false;
+
+    static constexpr fpm::fixed<B,I,F,R> lowest() noexcept {
+        return fpm::fixed<B,I,F,R>::from_raw_value(std::numeric_limits<B>::lowest());
+    };
+    // NOTE: min() equals lowest(), as for integer types; it is not the smallest positive value as for floating-point types.
+    static constexpr fpm::fixed<B,I,F,R> min() noexcept {
+        return lowest();
+    }
+
+    static constexpr fpm::fixed<B,I,F,R> max() noexcept {
+        return fpm::fixed<B,I,F,R>::from_raw_value(std::numeric_limits<B>::max());
+    };
+    // The smallest representable increment: raw value 1, i.e. 2**-F.
+    static constexpr fpm::fixed<B,I,F,R> epsilon() noexcept {
+        return fpm::fixed<B,I,F,R>::from_raw_value(1);
+    };
+    // One half: the integer 1 converted to fixed, then divided by 2.
+    static constexpr fpm::fixed<B,I,F,R> round_error() noexcept {
+        return fpm::fixed<B,I,F,R>(1) / 2;
+    };
+    // Forwards to min(), which in turn returns lowest().
+    static constexpr fpm::fixed<B,I,F,R> denorm_min() noexcept {
+        return min();
+    }
+};
+// Out-of-class definitions of the static constexpr data members above (needed before C++17 when a member is odr-used).
+template <typename B, typename I, unsigned int F, bool R>
+constexpr bool numeric_limits<fpm::fixed<B,I,F,R>>::is_specialized;
+template <typename B, typename I, unsigned int F, bool R>
+constexpr bool numeric_limits<fpm::fixed<B,I,F,R>>::is_signed;
+template <typename B, typename I, unsigned int F, bool R>
+constexpr bool numeric_limits<fpm::fixed<B,I,F,R>>::is_integer;
+template <typename B, typename I, unsigned int F, bool R>
+constexpr bool numeric_limits<fpm::fixed<B,I,F,R>>::is_exact;
+template <typename B, typename I, unsigned int F, bool R>
+constexpr bool numeric_limits<fpm::fixed<B,I,F,R>>::has_infinity;
+template <typename B, typename I, unsigned int F, bool R>
+constexpr bool numeric_limits<fpm::fixed<B,I,F,R>>::has_quiet_NaN;
+template <typename B, typename I, unsigned int F, bool R>
+constexpr bool numeric_limits<fpm::fixed<B,I,F,R>>::has_signaling_NaN;
+template <typename B, typename I, unsigned int F, bool R>
+constexpr std::float_denorm_style numeric_limits<fpm::fixed<B,I,F,R>>::has_denorm;
+template <typename B, typename I, unsigned int F, bool R>
+constexpr bool numeric_limits<fpm::fixed<B,I,F,R>>::has_denorm_loss;
+template <typename B, typename I, unsigned int F, bool R>
+constexpr std::float_round_style numeric_limits<fpm::fixed<B,I,F,R>>::round_style;
+template <typename B, typename I, unsigned int F, bool R>
+constexpr bool numeric_limits<fpm::fixed<B,I,F,R>>::is_iec_559;
+template <typename B, typename I, unsigned int F, bool R>
+constexpr bool numeric_limits<fpm::fixed<B,I,F,R>>::is_bounded;
+template <typename B, typename I, unsigned int F, bool R>
+constexpr bool numeric_limits<fpm::fixed<B,I,F,R>>::is_modulo;
+template <typename B, typename I, unsigned int F, bool R>
+constexpr int numeric_limits<fpm::fixed<B,I,F,R>>::digits;
+template <typename B, typename I, unsigned int F, bool R>
+constexpr int numeric_limits<fpm::fixed<B,I,F,R>>::digits10;
+template <typename B, typename I, unsigned int F, bool R>
+constexpr int numeric_limits<fpm::fixed<B,I,F,R>>::max_digits10;
+template <typename B, typename I, unsigned int F, bool R>
+constexpr int numeric_limits<fpm::fixed<B,I,F,R>>::radix;
+template <typename B, typename I, unsigned int F, bool R>
+constexpr int numeric_limits<fpm::fixed<B,I,F,R>>::min_exponent;
+template <typename B, typename I, unsigned int F, bool R>
+constexpr int numeric_limits<fpm::fixed<B,I,F,R>>::min_exponent10;
+template <typename B, typename I, unsigned int F, bool R>
+constexpr int numeric_limits<fpm::fixed<B,I,F,R>>::max_exponent;
+template <typename B, typename I, unsigned int F, bool R>
+constexpr int numeric_limits<fpm::fixed<B,I,F,R>>::max_exponent10;
+template <typename B, typename I, unsigned int F, bool R>
+constexpr bool numeric_limits<fpm::fixed<B,I,F,R>>::traps;
+template <typename B, typename I, unsigned int F, bool R>
+constexpr bool numeric_limits<fpm::fixed<B,I,F,R>>::tinyness_before;
+
+}
+
+#endif
diff --git a/fpm/ios.hpp b/fpm/ios.hpp
new file mode 100644
index 0000000..69581fb
--- /dev/null
+++ b/fpm/ios.hpp
@@ -0,0 +1,740 @@
+#ifndef FPM_IOS_HPP
+#define FPM_IOS_HPP
+
+#include "fixed.hpp"
+#include "math.hpp"
+#include <array>
+#include <algorithm>
+#include <cctype>
+#include <climits>
+#include <limits>
+#include <ios>
+#include <vector>
+
+namespace fpm
+{
+
+template <typename CharT, typename B, typename I, unsigned int F, bool R>
+std::basic_ostream<CharT>& operator<<(std::basic_ostream<CharT>& os, fixed<B, I, F, R> x) noexcept
+{ // Formats `x` into a stack buffer (honouring floatfield, precision, showpos/showpoint/uppercase and locale punctuation), then writes it padded to os.width().
+    const auto uppercase = ((os.flags() & std::ios_base::uppercase) != 0);
+    const auto showpoint = ((os.flags() & std::ios_base::showpoint) != 0);
+    const auto adjustfield = (os.flags() & std::ios_base::adjustfield);
+    const auto width = os.width();
+    const auto& ctype = std::use_facet<std::ctype<CharT>>(os.getloc());
+    const auto& numpunct = std::use_facet<std::numpunct<CharT>>(os.getloc());
+
+    auto floatfield = (os.flags() & std::ios_base::floatfield);
+    auto precision = os.precision();
+    auto show_trailing_zeros = true;
+    auto use_significant_digits = false;
+
+    // Invalid precision? Reset to the default
+    if (precision < 0)
+    {
+        precision = 6;
+    }
+
+    // Output buffer. Needs to be big enough for the formatted number without padding.
+    // Optional prefixes (i.e. "+"/"-", decimal separator, exponent "e+/-" and/or "0x").
+    constexpr auto worst_case_constant_size = 6;
+    // Maximum number of digits from the base type (covers integral + fractional digits)
+    constexpr auto worst_case_digit_count = std::numeric_limits<B>::digits10 + 2;
+    // Exponent suffixes (i.e. maximum digits based on log of the base type size).
+    // Needs a log10, but that isn't constexpr, so we're over-allocating on the stack. Can't hurt.
+    constexpr auto worst_case_suffix_size = std::numeric_limits<B>::digits;
+    // Double the digit count: in the worst case the thousands grouping adds a character per digit.
+    using buffer_t = std::array<CharT, worst_case_constant_size + worst_case_digit_count * 2 + worst_case_suffix_size>;
+    buffer_t buffer;
+
+    // Output cursor: one past the last character written into `buffer` so far
+    auto end = buffer.begin();
+
+    // Keep track of the start of "internal" padding (buffer.end() means "not decided yet")
+    typename buffer_t::iterator internal_pad = buffer.end();
+
+    // Representation of a number.
+    // The value of the number is: raw / divisor * (10|2) ^ exponent
+    // The base of the exponent is 2 in hexfloat mode, or 10 otherwise.
+    struct number_t {
+        I raw;          // raw fixed-point value
+        I divisor;      // the divisor indicating the place of the decimal point
+        int exponent;   // the exponent applied
+    };
+
+    // Convert a value without exponent to scientific representation
+    // where the part before the decimal point is less than 10.
+    const auto as_scientific = [](number_t value) {
+        assert(value.exponent == 0);
+        if (value.raw > 0)
+        {
+            while (value.raw / 10 >= value.divisor) {
+                value.divisor *= 10;
+                ++value.exponent;
+            }
+            while (value.raw < value.divisor) {
+                value.raw *= 10;
+                --value.exponent;
+            }
+        }
+        return value;
+    };
+
+    number_t value = { x.raw_value(), I{1} << F, 0};
+
+    auto base = B{10};
+
+    // First write the sign
+    if (value.raw < 0)
+    {
+        *end++ = ctype.widen('-');
+        value.raw = -value.raw;
+        internal_pad = end;
+    }
+    else if (os.flags() & std::ios_base::showpos)
+    {
+        *end++ = ctype.widen('+');
+        internal_pad = end;
+    }
+    assert(value.raw >= 0);
+
+    switch (floatfield)
+    {
+    case std::ios_base::fixed | std::ios_base::scientific:
+        // Hexadecimal mode: figure out the hexadecimal exponent and write "0x"
+        if (value.raw > 0)
+        {
+            auto bit  = detail::find_highest_bit(value.raw);
+            value.exponent = bit - F;    // exponent is applied to base 2
+            value.divisor = I{1} << bit; // divisor is at the highest bit, ensuring it starts with "1."
+            precision = (bit + 3) / 4;   // precision is number of nibbles, so we show all of them
+        }
+        base = 16;
+        show_trailing_zeros = false; // Always strip trailing zeros in hexfloat mode
+
+        *end++ = ctype.widen('0');
+        *end++ = ctype.widen(uppercase ? 'X' : 'x');
+        break;
+
+    case std::ios_base::scientific:
+        // Scientific mode, normalize value to scientific notation
+        value = as_scientific(value);
+        break;
+
+    case std::ios_base::fixed:
+        // Fixed mode. Nothing to do.
+        break;
+
+    default:
+    {
+        // "auto" mode: figure out the exponent
+        const number_t sci_value = as_scientific(value);
+
+        // Now `precision` indicates the number of *significant digits* (not fractional digits).
+        use_significant_digits = true;
+        precision = std::max<std::streamsize>(precision, 1);
+
+        if (sci_value.exponent >= precision || sci_value.exponent < -4) {
+            // Display as scientific format
+            floatfield = std::ios_base::scientific;
+            value = sci_value;
+        } else {
+            // Display as fixed format.
+            // "showpoint" indicates whether or not we show trailing zeros
+            floatfield = std::ios_base::fixed;
+            show_trailing_zeros = showpoint;
+        }
+        break;
+    }
+    };
+
+    // If we didn't write a sign, any internal padding starts here
+    // (after a potential "0x" for hexfloats).
+    if (internal_pad == buffer.end()) {
+        internal_pad = end;
+    }
+
+    // Separate out the integral part of the number; `value.raw` keeps only the fraction
+    I integral = value.raw / value.divisor;
+    value.raw %= value.divisor;
+
+    // Here we start printing the number itself
+    const char* const digits = uppercase ? "0123456789ABCDEF" : "0123456789abcdef";
+    const auto digits_start = end;
+
+    // Are we already printing significant digits? (yes if we're not counting significant digits)
+    bool significant_digits = !use_significant_digits;
+
+    // Print the integral part (digits are produced least-significant first, then reversed)
+    int last_digit = 0;
+    if (integral == 0) {
+        *end++ = ctype.widen('0');
+        if (value.raw == 0) {
+            // If the fraction is zero too, all zeros including the integral count
+            // as significant digits.
+            significant_digits = true;
+        }
+    } else {
+        while (integral > 0) {
+            last_digit = integral % base;
+            *end++ = ctype.widen(digits[last_digit]);
+            integral /= base;
+        }
+        std::reverse(digits_start, end);
+        significant_digits = true;
+    }
+
+    if (use_significant_digits && significant_digits)
+    {
+        // Apparently the integral part was significant; subtract its
+        // length from the remaining significant digits.
+        precision -= (end - digits_start);
+    }
+
+    // At this point, `value` contains only the fraction and
+    // `precision` holds the number of digits to print.
+    assert(value.raw < value.divisor);
+    assert(precision >= 0);
+
+    // Location of decimal point (buffer.end() means "no decimal point written")
+    typename buffer_t::iterator point = buffer.end();
+
+    // Start (and length) of the trailing zeros to insert while printing
+    // By tracking this to print them later instead of actually printing them now,
+    // we can support large precisions with a small printing buffer.
+    typename buffer_t::iterator trailing_zeros_start = buffer.end();
+    std::streamsize trailing_zeros_count = 0;
+
+    if (precision > 0)
+    {
+        // Print the fractional part
+        *(point = end++) = numpunct.decimal_point();
+
+        for (int i = 0; i < precision; ++i)
+        {
+            if (value.raw == 0)
+            {
+                // The rest of the digits are all zeros, mark them
+                // to be printed in this spot.
+                trailing_zeros_start = end;
+                trailing_zeros_count = precision - i;
+                break;
+            }
+
+            // Shift the divisor if we can to avoid overflow on the value
+            if (value.divisor % base == 0) {
+                value.divisor /= base;
+            } else {
+                value.raw *= base;
+            }
+            assert(value.divisor > 0);
+            assert(value.raw >= 0);
+            last_digit = (value.raw / value.divisor) % base;
+            value.raw %= value.divisor;
+            *end++ = ctype.widen(digits[last_digit]);
+
+            if (!significant_digits) {
+                // We're still finding the first significant digit
+                if (last_digit != 0) {
+                    // Found it
+                    significant_digits = true;
+                } else {
+                    // Not yet; increment number of digits to print
+                    ++precision;
+                }
+            }
+        }
+    }
+    else if (showpoint)
+    {
+        // No fractional part to print, but we still want the point
+        *(point = end++) = numpunct.decimal_point();
+    }
+
+    // Insert `ch` into the output at `position`, updating all references accordingly
+    const auto insert_character = [&](typename buffer_t::iterator position, CharT ch) {
+        assert(position >= buffer.begin() && position < end);
+        std::move_backward(position, end, end + 1);
+        if (point != buffer.end() && position < point) {
+            ++point;
+        }
+        if (trailing_zeros_start != buffer.end() && position < trailing_zeros_start) {
+            ++trailing_zeros_start;
+        }
+        ++end;
+        *position = ch;
+    };
+
+    // Round the number: round to nearest, based on the fraction left over after the last printed digit
+    bool increment = false;
+    if (value.raw > value.divisor / 2) {
+        // Round up
+        increment = true;
+    } else if (value.raw == value.divisor / 2) {
+        // It's a tie (i.e. "xyzw.5"): round to even
+        increment = ((last_digit % 2) == 1);
+    }
+
+    if (increment)
+    {
+        auto p = end - 1;
+        // Increment all digits backwards while we see "9"
+        while (p >= digits_start) {
+            if (p == point) {
+                // Skip over the decimal point
+                --p;
+            }
+            if ((*p)++ != ctype.widen('9')) {
+                break;
+            }
+            *p-- = ctype.widen('0');
+        }
+
+        if (p < digits_start) {
+            // We've incremented all the way to the start (all 9's), we need to insert the
+            // carried-over 1 from incrementing the last 9.
+            assert(p == digits_start - 1);
+            insert_character(++p, ctype.widen('1'));
+
+            if (floatfield == std::ios::scientific)
+            {
+                // We just made the integral part equal to 10, so we shift the decimal point
+                // back one place (if any) and tweak the exponent, so that we keep the integer part
+                // less than 10.
+                if (point != buffer.end()) {
+                    assert(p + 2 == point);
+                    std::swap(*(point - 1), *point);
+                    --point;
+                }
+                ++value.exponent;
+
+                // We've introduced an extra digit so we need to strip the last digit
+                // to maintain the same precision
+                --end;
+            }
+        }
+
+        if (use_significant_digits && *p == ctype.widen('1') && point != buffer.end()) {
+            // We've converted a leading zero to a 1 so we need to strip the last digit
+            // (behind the decimal point) to maintain the same significant digit count.
+            --end;
+        }
+    }
+
+    if (point != buffer.end())
+    {
+        if (!show_trailing_zeros)
+        {
+            // Remove trailing zeros
+            while (*(end - 1) == ctype.widen('0')) {
+                --end;
+            }
+
+            // Also clear the "trailing zeros to append during printing" range
+            trailing_zeros_start = buffer.end();
+            trailing_zeros_count = 0;
+        }
+
+        if (end - 1 == point && trailing_zeros_count == 0 && !showpoint) {
+            // Remove the decimal point, too
+            --end;
+        }
+    }
+
+    // Apply thousands grouping
+    const auto& grouping = numpunct.grouping();
+    if (!grouping.empty())
+    {
+        // Step backwards from the end or decimal point, inserting the
+        // thousands separator at every group interval.
+        const CharT thousands_sep = numpunct.thousands_sep(); // already a CharT; widen() would narrow it to char first
+        std::size_t group = 0;
+        auto p = point != buffer.end() ? point : end;
+        auto size = static_cast<int>(grouping[group]);
+        while (size > 0 && size < CHAR_MAX && p - digits_start > size) {
+            p -= size;
+            insert_character(p, thousands_sep);
+            if (group < grouping.size() - 1) {
+                size = static_cast<int>(grouping[++group]);
+            }
+        }
+    }
+
+    // Print the exponent if required
+    assert(floatfield != 0);
+    if (floatfield & std::ios_base::scientific)
+    {
+        // Hexadecimal (%a/%A) or decimal (%e/%E) scientific notation
+        if (floatfield & std::ios_base::fixed) {
+            *end++ = ctype.widen(uppercase ? 'P' : 'p');
+        } else {
+            *end++ = ctype.widen(uppercase ? 'E' : 'e');
+        }
+
+        if (value.exponent < 0) {
+            *end++ = ctype.widen('-');
+            value.exponent = -value.exponent;
+        } else {
+            *end++ = ctype.widen('+');
+        }
+
+        if (floatfield == std::ios_base::scientific) {
+            // In decimal scientific notation (%e/%E), the exponent is at least two digits
+            if (value.exponent < 10) {
+                *end++ = ctype.widen('0');
+            }
+        }
+
+        const auto exponent_start = end;
+        if (value.exponent == 0) {
+            *end++ = ctype.widen('0');
+        } else while (value.exponent > 0) {
+            *end++ = ctype.widen(digits[value.exponent % 10]);
+            value.exponent /= 10;
+        }
+        std::reverse(exponent_start, end);
+    }
+
+    // Write character `ch` `count` times to the stream
+    const auto sputcn = [&](CharT ch, std::streamsize count){
+        // Fill a buffer to output larger chunks
+        constexpr std::streamsize chunk_size = 64;
+        std::array<CharT, chunk_size> fill_buffer;
+        std::fill_n(fill_buffer.begin(), std::min(count, chunk_size), ch);
+
+        for (std::streamsize size, left = count; left > 0; left -= size) {
+            size = std::min(chunk_size, left);
+            os.rdbuf()->sputn(&fill_buffer[0], size);
+        }
+    };
+
+    // Outputs a range of characters, making sure to output the trailing zeros range
+    // if it lies in the specified range
+    const auto put_range = [&](typename buffer_t::const_iterator begin, typename buffer_t::const_iterator end) {
+        assert(end >= begin);
+        if (trailing_zeros_start >= begin && trailing_zeros_start <= end) {
+            // Print range with trailing zeros range in the middle
+            assert(trailing_zeros_count > 0);
+            os.rdbuf()->sputn(&*begin, trailing_zeros_start - begin);
+            sputcn(ctype.widen('0'), trailing_zeros_count);
+            os.rdbuf()->sputn(&*trailing_zeros_start, end - trailing_zeros_start);
+        } else {
+            // Print range as-is
+            os.rdbuf()->sputn(&*begin, end - begin);
+        }
+    };
+
+    // Pad the buffer if necessary.
+    // Note that the length of trailing zeros is counted towards the length of the content.
+    const auto content_size = end - buffer.begin() + trailing_zeros_count;
+    if (content_size >= width)
+    {
+        // Buffer needs no padding, output as-is
+        put_range(buffer.begin(), end);
+    }
+    else
+    {
+        const auto pad_size = width - content_size;
+        switch (adjustfield)
+        {
+        case std::ios_base::left:
+            // Content is left-aligned, so output the buffer, followed by the padding
+            put_range(buffer.begin(), end);
+            sputcn(os.fill(), pad_size);
+            break;
+        case std::ios_base::internal:
+            // Content is internally aligned, so output the buffer up to the "internal pad"
+            // point, followed by the padding, followed by the remainder of the buffer.
+            put_range(buffer.begin(), internal_pad);
+            sputcn(os.fill(), pad_size);
+            put_range(internal_pad, end);
+            break;
+        default:
+            // Content is right-aligned, so output the padding, followed by the buffer
+            sputcn(os.fill(), pad_size);
+            put_range(buffer.begin(), end);
+            break;
+        }
+    }
+
+    // Width is reset after every write
+    os.width(0);
+
+    return os;
+}
+
+
+template <typename CharT, class Traits, typename B, typename I, unsigned int F, bool R>
+std::basic_istream<CharT, Traits>& operator>>(std::basic_istream<CharT, Traits>& is, fixed<B, I, F, R>& x)
+{ // Parses a decimal or hexfloat literal (optional sign, "inf"/"infinity", optional exponent) from `is` into `x`; sets failbit on malformed input and saturates on overflow.
+    typename std::basic_istream<CharT, Traits>::sentry sentry(is);
+    if (!sentry)
+    {
+        return is;
+    }
+
+    const auto& ctype = std::use_facet<std::ctype<CharT>>(is.getloc());
+    const auto& numpunct = std::use_facet<std::numpunct<CharT>>(is.getloc());
+
+    bool thousands_separator_allowed = false;
+    const bool supports_thousands_separators = !numpunct.grouping().empty();
+
+    const auto& is_valid_character = [](char ch) {
+        // Note: allowing ['p', 'i', 'n', 't', 'y'] is technically in violation of the spec (we are emulating std::num_get),
+        // but otherwise we cannot parse hexfloats and "infinity". This is a known issue with the spec (LWG #2381).
+        return std::isxdigit(static_cast<unsigned char>(ch)) || // the cast keeps negative chars out of <cctype> (undefined otherwise)
+            ch == 'x' || ch == 'X' || ch == 'p' || ch == 'P' ||
+            ch == 'i' || ch == 'I' || ch == 'n' || ch == 'N' ||
+            ch == 't' || ch == 'T' || ch == 'y' || ch == 'Y' ||
+            ch == '-' || ch == '+';
+    };
+
+    const auto& peek = [&]() { // next usable character narrowed to char, without consuming it; '\0' means "stop parsing"
+        for(;;) {
+            auto ch = is.rdbuf()->sgetc();
+            if (ch == Traits::eof()) {
+                is.setstate(std::ios::eofbit);
+                return '\0';
+            }
+            if (ch == numpunct.decimal_point()) {
+                return '.';
+            }
+            if (ch == numpunct.thousands_sep())
+            {
+                if (!supports_thousands_separators || !thousands_separator_allowed) {
+                    return '\0';
+                }
+                // Ignore valid thousands separators
+                is.rdbuf()->sbumpc();
+                continue;
+            }
+            auto res = ctype.narrow(ch, 0);
+            if (!is_valid_character(res)) {
+                // Invalid character: end input
+                return '\0';
+            }
+            return res;
+        }
+    };
+
+    const auto& bump = [&]() { // consume the current character
+        is.rdbuf()->sbumpc();
+    };
+
+    const auto& next = [&]() { // consume the current character and peek at the following one
+        bump();
+        return peek();
+    };
+
+    bool negate = false;
+    auto ch = peek();
+    if (ch == '-') {
+        negate = true;
+        ch = next();
+    } else if (ch == '+') {
+        ch = next();
+    }
+
+    const char infinity[] = "infinity";
+    // Must be "inf" or "infinity" (matched case-sensitively, in lower case)
+    int i = 0;
+    while (i < 8 && ch == infinity[i]) {
+        ++i;
+        ch = next();
+    }
+
+    if (i > 0) {
+        if (i == 3 || i == 8) {
+            x = negate ? std::numeric_limits<fixed<B, I, F, R>>::min() : std::numeric_limits<fixed<B, I, F, R>>::max();
+        } else {
+            is.setstate(std::ios::failbit);
+        }
+        return is;
+    }
+
+    char exponent_char = 'e';
+    int base = 10;
+
+    constexpr auto NoFraction = std::numeric_limits<std::size_t>::max();
+    std::size_t fraction_start = NoFraction; // index into `significand` of the first fractional digit
+    std::vector<unsigned char> significand;  // digit values (not characters), most significant first
+
+    if (ch == '0') {
+        ch = next();
+        if (ch == 'x' || ch == 'X') {
+            // Hexfloat
+            exponent_char = 'p';
+            base = 16;
+            ch = next();
+        } else {
+            significand.push_back(0);
+        }
+    }
+
+    // Parse the significand
+    thousands_separator_allowed = true;
+    for (;; ch = next()) {
+        if (ch == '.') {
+            if (fraction_start != NoFraction) {
+                // Double decimal point. Stop parsing.
+                break;
+            }
+            fraction_start = significand.size();
+            thousands_separator_allowed = false;
+        } else {
+            unsigned char val = base; // stays at `base` (an invalid digit) unless `ch` is a digit character
+            if (ch >= '0' && ch <= '9') {
+                val = ch - '0';
+            } else if (ch >= 'a' && ch <= 'f') {
+                val = ch - 'a' + 10;
+            } else if (ch >= 'A' && ch <= 'F') {
+                val = ch - 'A' + 10;
+            }
+            if (val >= base) {
+                break;
+            }
+            significand.push_back(val);
+        }
+    }
+    if (significand.empty()) {
+        // We need a significand
+        is.setstate(std::ios::failbit);
+        return is;
+    }
+    thousands_separator_allowed = false;
+
+    if (fraction_start == NoFraction) {
+        // If we haven't seen a fraction yet, place it at the end of the significand
+        fraction_start = significand.size();
+    }
+
+    // Parse the exponent
+    bool exponent_overflow = false;
+    std::size_t exponent = 0;
+    bool exponent_negate = false;
+    if (std::tolower(static_cast<unsigned char>(ch)) == exponent_char)
+    {
+        ch = next();
+        if (ch == '-') {
+            exponent_negate = true;
+            ch = next();
+        } else if (ch == '+') {
+            ch = next();
+        }
+
+        bool parsed = false;
+        while (std::isdigit(static_cast<unsigned char>(ch))) {
+            if (exponent <= std::numeric_limits<int>::max() / 10) {
+                exponent = exponent * 10 + (ch - '0');
+            } else {
+                exponent_overflow = true;
+            }
+            parsed = true;
+            ch = next();
+        }
+        if (!parsed) {
+            // If the exponent character is given, the exponent value may not be empty
+            is.setstate(std::ios::failbit);
+            return is;
+        }
+    }
+
+    // We've parsed all we need. Construct the value.
+    if (exponent_overflow) {
+        // Absolute exponent is too large
+        if (std::all_of(significand.begin(), significand.end(), [](unsigned char x){ return x == 0; })) {
+            // Significand is zero. Exponent doesn't matter.
+            x = fixed<B, I, F, R>(0);
+        } else if (exponent_negate) {
+            // A huge negative exponent approaches 0.
+            x = fixed<B, I, F, R>::from_raw_value(0);
+        } else {
+            // A huge positive exponent approaches infinity.
+            x = std::numeric_limits<fixed<B, I, F, R>>::max();
+        }
+        return is;
+    }
+
+    // Shift the fraction offset according to exponent (whole digits only; the rest stays in `exponent`)
+    {
+        const auto exponent_mult = (base == 10) ? 1: 4;
+        if (exponent_negate) {
+            const auto adjust = std::min(exponent / exponent_mult, fraction_start);
+            fraction_start -= adjust;
+            exponent -= adjust * exponent_mult;
+        } else {
+            const auto adjust = std::min(exponent / exponent_mult, significand.size() - fraction_start);
+            fraction_start += adjust;
+            exponent -= adjust * exponent_mult;
+        }
+    }
+
+    constexpr auto IsSigned = std::is_signed<B>::value;
+    constexpr auto IntBits = sizeof(B) * 8 - F - (IsSigned ? 1 : 0);
+    constexpr auto MaxInt = (I{1} << IntBits) - 1;
+    constexpr auto MaxFraction = (I{1} << F) - 1;
+    constexpr auto MaxValue = (I{1} << sizeof(B) * 8) - 1;
+
+    // Parse the integer part
+    I integer = 0;
+    for (std::size_t i = 0; i < fraction_start; ++i) {
+        if (integer > MaxInt / base) {
+            // Overflow
+            x = negate ? std::numeric_limits<fixed<B, I, F, R>>::min() : std::numeric_limits<fixed<B, I, F, R>>::max();
+            return is;
+        }
+        assert(significand[i] < base);
+        integer = integer * base + significand[i];
+    }
+
+    // Parse the fractional part
+    I fraction = 0;
+    I divisor = 1;
+    for (std::size_t i = fraction_start; i < significand.size(); ++i) {
+        assert(significand[i] < base);
+        if (divisor > MaxFraction / base) {
+            // We're done: further digits are below the resolution of F fractional bits
+            break;
+        }
+        fraction = fraction * base + significand[i];
+        divisor *= base;
+    }
+
+    // Construct the value from the parsed parts
+    I raw_value = (integer << F) + (fraction << F) / divisor;
+
+    // Apply remaining exponent
+    if (exponent_char == 'p') { // Base-2 exponent
+        const std::size_t shift = std::min<std::size_t>(exponent, sizeof(I) * 8 - 1); // shifting by the full width of I is undefined
+        if (!exponent_negate && raw_value > (MaxValue >> shift)) {
+            x = negate ? std::numeric_limits<fixed<B, I, F, R>>::min() : std::numeric_limits<fixed<B, I, F, R>>::max(); // Overflow
+            return is;
+        }
+        raw_value = exponent_negate ? (raw_value >> shift) : (raw_value << shift);
+    } else {
+        // Base-10 exponent
+        if (exponent_negate) {
+            I remainder = 0;
+            for (std::size_t e = 0; e < exponent && (raw_value != 0 || remainder != 0); ++e) { // stop once further divisions cannot change anything
+                remainder = raw_value % 10;
+                raw_value /= 10;
+            }
+            raw_value += remainder / 5; // round on the last digit that was divided away
+        } else {
+            for (std::size_t e = 0; e < exponent && raw_value != 0; ++e) { // zero stays zero; skip the pointless iterations
+                if (raw_value > MaxValue / 10) {
+                    // Overflow
+                    x = negate ? std::numeric_limits<fixed<B, I, F, R>>::min() : std::numeric_limits<fixed<B, I, F, R>>::max();
+                    return is;
+                }
+                raw_value *= 10;
+            }
+        }
+    }
+    x = fixed<B, I, F, R>::from_raw_value(static_cast<B>(negate ? -raw_value : raw_value));
+    return is;
+}
+
+}
+
+#endif
diff --git a/fpm/math.hpp b/fpm/math.hpp
new file mode 100644
index 0000000..7a76349
--- /dev/null
+++ b/fpm/math.hpp
@@ -0,0 +1,684 @@
+#ifndef FPM_MATH_HPP
+#define FPM_MATH_HPP
+
+#include "fixed.hpp"
+#include <cmath>
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+namespace fpm
+{
+
+//
+// Helper functions
+//
+namespace detail
+{
+
+// Returns the zero-based index of the most-significant set bit of `value`, which must be non-zero
+inline long find_highest_bit(unsigned long long value) noexcept
+{
+    assert(value != 0);
+#if defined(_MSC_VER)
+    unsigned long index;
+#if defined(_WIN64)
+    _BitScanReverse64(&index, value);
+#else
+    if (_BitScanReverse(&index, static_cast<unsigned long>(value >> 32)) != 0) { // upper 32 bits first
+        index += 32;
+    } else {
+        _BitScanReverse(&index, static_cast<unsigned long>(value & 0xfffffffflu)); // upper half was zero: scan the lower 32 bits
+    }
+#endif
+    return index;
+#elif defined(__GNUC__) || defined(__clang__)
+    return sizeof(value) * 8 - 1 - __builtin_clzll(value); // (bit width - 1) minus the number of leading zero bits
+#else
+#   error "your platform does not support find_highest_bit()"
+#endif
+}
+
+}
+
+//
+// Classification methods
+//
+
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline int fpclassify(fixed<B, I, F, R> x) noexcept
+{
+    return (x.raw_value() == 0) ? FP_ZERO : FP_NORMAL; // only two categories are ever reported: zero or normal
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline bool isfinite(fixed<B, I, F, R>) noexcept
+{
+    return true; // unconditional: the argument is not inspected
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline bool isinf(fixed<B, I, F, R>) noexcept
+{
+    return false; // unconditional: the argument is not inspected
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline bool isnan(fixed<B, I, F, R>) noexcept
+{
+    return false; // unconditional: the argument is not inspected
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline bool isnormal(fixed<B, I, F, R> x) noexcept
+{
+    return x.raw_value() != 0; // every non-zero value counts as normal
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline bool signbit(fixed<B, I, F, R> x) noexcept
+{
+    return x.raw_value() < 0; // true exactly when the raw value is negative (zero reports false)
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline bool isgreater(fixed<B, I, F, R> x, fixed<B, I, F, R> y) noexcept
+{
+    return x > y; // forwards to operator> of fixed
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline bool isgreaterequal(fixed<B, I, F, R> x, fixed<B, I, F, R> y) noexcept
+{
+    return x >= y; // forwards to operator>= of fixed
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline bool isless(fixed<B, I, F, R> x, fixed<B, I, F, R> y) noexcept
+{
+    return x < y; // forwards to operator< of fixed
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline bool islessequal(fixed<B, I, F, R> x, fixed<B, I, F, R> y) noexcept
+{
+    return x <= y; // forwards to operator<= of fixed
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline bool islessgreater(fixed<B, I, F, R> x, fixed<B, I, F, R> y) noexcept
+{
+    return x != y; // "less or greater" is implemented as plain inequality
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline bool isunordered(fixed<B, I, F, R> x, fixed<B, I, F, R> y) noexcept
+{
+    return false; // unconditional: neither argument is inspected
+}
+
+//
+// Nearest integer operations
+//
+template <typename B, typename I, unsigned int F, bool R>
+inline fixed<B, I, F, R> ceil(fixed<B, I, F, R> x) noexcept
+{
+    constexpr auto one = B(1) << F; // 1 shifted up by the F fractional bits
+    auto raw = x.raw_value();
+    raw += (raw > 0) ? (one - 1) : 0; // bias positive values so the truncating division below rounds them up
+    return fixed<B, I, F, R>::from_raw_value(raw / one * one);
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+inline fixed<B, I, F, R> floor(fixed<B, I, F, R> x) noexcept
+{
+    constexpr auto one = B(1) << F; // 1 shifted up by the F fractional bits
+    auto raw = x.raw_value();
+    raw -= (raw < 0) ? (one - 1) : 0; // bias negative values so the truncating division below rounds them down
+    return fixed<B, I, F, R>::from_raw_value(raw / one * one);
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+inline fixed<B, I, F, R> trunc(fixed<B, I, F, R> x) noexcept
+{
+    constexpr auto one = B(1) << F; // 1 shifted up by the F fractional bits
+    return fixed<B, I, F, R>::from_raw_value(x.raw_value() - x.raw_value() % one); // drop the remainder, i.e. round towards zero
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+inline fixed<B, I, F, R> round(fixed<B, I, F, R> x) noexcept
+{
+    constexpr auto one = B(1) << F; // 1 shifted up by the F fractional bits
+    const auto halves = x.raw_value() / (one / 2); // the value counted in steps of one half, truncated towards zero
+    return fixed<B, I, F, R>::from_raw_value((halves / 2 + halves % 2) * one); // an odd count of halves rounds away from zero
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+fixed<B, I, F, R> nearbyint(fixed<B, I, F, R> x) noexcept
+{
+    // Round to nearest, ties to even (rounding mode is assumed to be FE_TONEAREST)
+    constexpr auto one = B(1) << F;
+    auto raw = x.raw_value();
+    const bool tie = std::abs(raw % one) == one / 2; // fractional part is exactly one half
+    raw /= one / 2;                                  // the value counted in halves, truncated towards zero
+    raw = (raw / 2) + (raw % 2);                     // nearest integer, ties away from zero
+    if (tie) raw -= raw % 2;                         // on an exact tie step back to the even neighbour
+    return fixed<B, I, F, R>::from_raw_value(raw * one);
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline fixed<B, I, F, R> rint(fixed<B, I, F, R> x) noexcept
+{
+    // Rounding mode is assumed to be FE_TONEAREST, so this simply forwards to nearbyint()
+    return nearbyint(x);
+}
+
+//
+// Mathematical functions
+//
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline fixed<B, I, F, R> abs(fixed<B, I, F, R> x) noexcept
+{
+    return (x >= fixed<B, I, F, R>{0}) ? x : -x; // negate only when x compares below zero
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline fixed<B, I, F, R> fmod(fixed<B, I, F, R> x, fixed<B, I, F, R> y) noexcept
+{
+    return // comma expression: the assert runs first, then the value is produced
+        assert(y.raw_value() != 0),
+        fixed<B, I, F, R>::from_raw_value(x.raw_value() % y.raw_value()); // built-in % on the raw values
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline fixed<B, I, F, R> remainder(fixed<B, I, F, R> x, fixed<B, I, F, R> y) noexcept
+{
+    return // comma expression: the assert runs first, then the value is produced
+        assert(y.raw_value() != 0),
+        x - nearbyint(x / y) * y; // x minus y times the quotient rounded by nearbyint()
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+inline fixed<B, I, F, R> remquo(fixed<B, I, F, R> x, fixed<B, I, F, R> y, int* quo) noexcept
+{
+    assert(y.raw_value() != 0);
+    assert(quo != nullptr);
+    *quo = x.raw_value() / y.raw_value(); // NOTE(review): quotient is truncated towards zero, whereas std::remquo rounds it to nearest - confirm this is intended
+    return fixed<B, I, F, R>::from_raw_value(x.raw_value() % y.raw_value()); // remainder matching the truncated quotient stored above
+}
+
+//
+// Manipulation functions
+//
+
+template <typename B, typename I, unsigned int F, bool R, typename C, typename J, unsigned int G, bool S>
+constexpr inline fixed<B, I, F, R> copysign(fixed<B, I, F, R> x, fixed<C, J, G, S> y) noexcept
+{
+    return // comma expression: first strip the sign of x, then apply the sign of y
+        x = abs(x),
+        (y >= fixed<C, J, G, S>{0}) ? x : -x; // y may be a different fixed type; only its comparison with zero is used
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline fixed<B, I, F, R> nextafter(fixed<B, I, F, R> from, fixed<B, I, F, R> to) noexcept
+{
+    // Moves 'from' by one raw unit (the smallest representable step) towards 'to'; equal inputs are returned as is.
+    return fixed<B, I, F, R>::from_raw_value(
+        from.raw_value() + (from < to ? 1 : (to < from ? -1 : 0)));
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+constexpr inline fixed<B, I, F, R> nexttoward(fixed<B, I, F, R> from, fixed<B, I, F, R> to) noexcept
+{
+    return nextafter(from, to);  // 'to' has the same type as 'from' here, so this is exactly nextafter()
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+inline fixed<B, I, F, R> modf(fixed<B, I, F, R> x, fixed<B, I, F, R>* iptr) noexcept
+{
+    constexpr auto ONE = B{1} << F;             // raw representation of 1.0
+    const auto frac_raw = x.raw_value() % ONE;  // fractional part; keeps the sign of x
+    *iptr = fixed<B, I, F, R>::from_raw_value(x.raw_value() - frac_raw);  // integral part, truncated toward zero
+    return fixed<B, I, F, R>::from_raw_value(frac_raw);
+}
+
+
+//
+// Power functions
+//
+
+template <typename B, typename I, unsigned int F, bool R, typename T, typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
+fixed<B, I, F, R> pow(fixed<B, I, F, R> base, T exp) noexcept
+{
+    using Fixed = fixed<B, I, F, R>;
+
+    // Zero raised to a positive power is zero; zero to the power of zero or
+    // to a negative power is rejected as a precondition violation.
+    if (base == Fixed(0)) {
+        assert(exp > 0);
+        return Fixed(0);
+    }
+
+    // Exponentiation by squaring. 'square' holds base^(2^k) on the k-th pass
+    // and every non-zero binary digit of the exponent folds the current
+    // square into the accumulator: by multiplication for a positive exponent,
+    // by division for a negative one. Halving truncates toward zero, so a
+    // negative exponent also walks to zero, with remainders of 0 or -1.
+    const bool invert = exp < 0;
+    Fixed acc {1};
+    Fixed square = base;
+    while (exp != 0)
+    {
+        if ((exp % 2) != 0)
+        {
+            if (invert) { acc /= square; }
+            else        { acc *= square; }
+        }
+        exp /= 2;
+        square *= square;
+    }
+
+    return acc;
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+fixed<B, I, F, R> pow(fixed<B, I, F, R> base, fixed<B, I, F, R> exp) noexcept
+{
+    using Fixed = fixed<B, I, F, R>;
+    constexpr auto ONE = B(1) << F;
+
+    // Zero raised to a positive power is zero; anything else is rejected.
+    if (base == Fixed(0)) {
+        assert(exp > Fixed(0));
+        return Fixed(0);
+    }
+
+    // base^(-e) == 1 / base^e, so only non-negative exponents remain below.
+    if (exp < Fixed(0)) {
+        return 1 / pow(base, -exp);
+    }
+
+    // An exponent without fractional bits is handed to the integer overload.
+    const auto exp_raw = exp.raw_value();
+    if (exp_raw % ONE == 0) {
+        return pow(base, exp_raw / ONE);
+    }
+
+    // Fractional exponents use base^exp == 2^(log2(base) * exp), which only
+    // works for a strictly positive base; negative bases are not supported.
+    assert(base > Fixed(0));
+    return exp2(log2(base) * exp);
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+fixed<B, I, F, R> exp(fixed<B, I, F, R> x) noexcept
+{
+    using Fixed = fixed<B, I, F, R>;
+    if (x < Fixed(0)) {
+        return 1 / exp(-x);  // e^x == 1 / e^(-x): everything below only sees x >= 0
+    }
+    constexpr auto FRAC = B(1) << F;       // raw representation of 1.0
+    const B x_int = x.raw_value() / FRAC;  // integral part of x
+    x -= x_int;                            // x is reduced to its fractional part
+    assert(x >= Fixed(0) && x < Fixed(1));
+    // Coefficients of the fifth-degree polynomial that is evaluated on the fractional part.
+    constexpr auto fA = Fixed::template from_fixed_point<63>( 128239257017632854ll); // 1.3903728105644451e-2
+    constexpr auto fB = Fixed::template from_fixed_point<63>( 320978614890280666ll); // 3.4800571158543038e-2
+    constexpr auto fC = Fixed::template from_fixed_point<63>(1571680799599592947ll); // 1.7040197373796334e-1
+    constexpr auto fD = Fixed::template from_fixed_point<63>(4603349000587966862ll); // 4.9909609871464493e-1
+    constexpr auto fE = Fixed::template from_fixed_point<62>(4612052447974689712ll); // 1.0000794567422495
+    constexpr auto fF = Fixed::template from_fixed_point<63>(9223361618412247875ll); // 9.9999887043019773e-1
+    return pow(Fixed::e(), x_int) * (((((fA * x + fB) * x + fC) * x + fD) * x + fE) * x + fF);  // e^x_int (integer pow) times the polynomial in Horner form
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+fixed<B, I, F, R> exp2(fixed<B, I, F, R> x) noexcept
+{
+    using Fixed = fixed<B, I, F, R>;
+    if (x < Fixed(0)) {
+        return 1 / exp2(-x);  // 2^x == 1 / 2^(-x): everything below only sees x >= 0
+    }
+    constexpr auto FRAC = B(1) << F;       // raw representation of 1.0
+    const B x_int = x.raw_value() / FRAC;  // integral part of x
+    x -= x_int;                            // x is reduced to its fractional part
+    assert(x >= Fixed(0) && x < Fixed(1));
+    // 2^x == 2^x_int * 2^fraction: a fifth-degree polynomial covers the fraction, a shift covers the integral part.
+    constexpr auto fA = Fixed::template from_fixed_point<63>(  17491766697771214ll); // 1.8964611454333148e-3
+    constexpr auto fB = Fixed::template from_fixed_point<63>(  82483038782406547ll); // 8.9428289841091295e-3
+    constexpr auto fC = Fixed::template from_fixed_point<63>( 515275173969157690ll); // 5.5866246304520701e-2
+    constexpr auto fD = Fixed::template from_fixed_point<63>(2214897896212987987ll); // 2.4013971109076949e-1
+    constexpr auto fE = Fixed::template from_fixed_point<63>(6393224161192452326ll); // 6.9315475247516736e-1
+    constexpr auto fF = Fixed::template from_fixed_point<63>(9223371050976163566ll); // 9.9999989311082668e-1
+    return Fixed(B(1) << x_int) * (((((fA * x + fB) * x + fC) * x + fD) * x + fE) * x + fF);  // shift a B, not an int: '1 << x_int' overflows int once x_int >= 31 even when B is wider
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+fixed<B, I, F, R> expm1(fixed<B, I, F, R> x) noexcept
+{
+    return exp(x) - 1;  // computed literally as exp(x) - 1; no special handling for x close to zero
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+fixed<B, I, F, R> log2(fixed<B, I, F, R> x) noexcept
+{
+    using Fixed = fixed<B, I, F, R>;
+    assert(x > Fixed(0));
+    // Normalize input to the [1:2) domain by moving its highest set bit to bit position F.
+    B value = x.raw_value();
+    const long highest = detail::find_highest_bit(value);
+    const long shift = highest - static_cast<long>(F);  // kept signed: 'long - unsigned int' is evaluated unsigned where long is 32 bits wide and would wrap for inputs below 1
+    if (shift >= 0) {
+        value >>= shift;
+    } else {
+        value <<= -shift;
+    }
+    x = Fixed::from_raw_value(value);
+    assert(x >= Fixed(1) && x < Fixed(2));
+
+    constexpr auto fA = Fixed::template from_fixed_point<63>(  413886001457275979ll); //  4.4873610194131727e-2
+    constexpr auto fB = Fixed::template from_fixed_point<63>(-3842121857793256941ll); // -4.1656368651734915e-1
+    constexpr auto fC = Fixed::template from_fixed_point<62>( 7522345947206307744ll); //  1.6311487636297217
+    constexpr auto fD = Fixed::template from_fixed_point<61>(-8187571043052183818ll); // -3.5507929249026341
+    constexpr auto fE = Fixed::template from_fixed_point<60>( 5870342889289496598ll); //  5.0917108110420042
+    constexpr auto fF = Fixed::template from_fixed_point<61>(-6457199832668582866ll); // -2.8003640347009253
+    return Fixed(shift) + (((((fA * x + fB) * x + fC) * x + fD) * x + fE) * x + fF);  // integral part (the shift) plus a fifth-degree polynomial on the normalized value
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+fixed<B, I, F, R> log(fixed<B, I, F, R> x) noexcept
+{
+    using Fixed = fixed<B, I, F, R>;
+    return log2(x) / log2(Fixed::e());  // change of base: ln(x) == log2(x) / log2(e); log2(e) is recomputed on every call
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+fixed<B, I, F, R> log10(fixed<B, I, F, R> x) noexcept
+{
+    using Fixed = fixed<B, I, F, R>;
+    return log2(x) / log2(Fixed(10));  // change of base: log10(x) == log2(x) / log2(10); log2(10) is recomputed on every call
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+fixed<B, I, F, R> log1p(fixed<B, I, F, R> x) noexcept
+{
+    return log(1 + x);  // computed literally as log(1 + x); no special handling for x close to zero
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+fixed<B, I, F, R> cbrt(fixed<B, I, F, R> x) noexcept
+{
+    using Fixed = fixed<B, I, F, R>;
+
+    if (x == Fixed(0))
+    {
+        return x;
+    }
+    if (x < Fixed(0))
+    {
+        return -cbrt(-x);  // the cube root is odd: cbrt(-x) == -cbrt(x)
+    }
+    assert(x >= Fixed(0));
+
+    // Finding the cube root of an integer, taken from Hacker's Delight,
+    // based on the square root algorithm.
+
+    // We start at the greatest power of eight that's less than the argument.
+    int ofs = ((detail::find_highest_bit(x.raw_value()) + 2*F) / 3 * 3);
+    I num = I{x.raw_value()};  // what is left of the radicand, held in the wider intermediate type
+    I res = 0;                 // root, built up one binary digit per loop pass
+
+    const auto do_round = [&]
+    {
+        for (; ofs >= 0; ofs -= 3)
+        {
+            res += res;  // make room for the next binary digit
+            const I val = (3*res*(res + 1) + 1) << ofs;  // (res + 1)^3 - res^3, moved to the current digit position
+            if (num >= val)
+            {
+                num -= val;
+                res++;  // the new digit is a one
+            }
+        }
+    };
+
+    // We should shift by 2*F (since there are two multiplications), but that
+    // could overflow even the intermediate type, so we have to split the
+    // algorithm up in two rounds of F bits each. Each round will deplete
+    // 'num' digit by digit, so after a round we can shift it again.
+    num <<= F;
+    ofs -= F;
+    do_round();
+
+    num <<= F;
+    ofs += F;
+    do_round();
+
+    return Fixed::from_raw_value(static_cast<B>(res));
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+fixed<B, I, F, R> sqrt(fixed<B, I, F, R> x) noexcept
+{
+    using Fixed = fixed<B, I, F, R>;
+
+    assert(x >= Fixed(0));  // negative input is a precondition violation
+    if (x == Fixed(0))
+    {
+        return x;
+    }
+
+    // Finding the square root of an integer in base-2, from:
+    // https://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Binary_numeral_system_.28base_2.29
+
+    // Shift by F first because it's fixed-point: the integer root of (raw << F) is the raw value of the result.
+    I num = I{x.raw_value()} << F;
+    I res = 0;
+
+    // "bit" starts at the greatest power of four that's less than the argument.
+    for (I bit = I{1} << ((detail::find_highest_bit(x.raw_value()) + F) / 2 * 2); bit != 0; bit >>= 2)
+    {
+        const I val = res + bit;  // trial subtrahend for the current digit
+        res >>= 1;
+        if (num >= val)
+        {
+            num -= val;
+            res += bit;  // the current digit of the root is a one
+        }
+    }
+
+    // Round the last digit up if necessary: a remainder above res means the exact root is closer to res + 1
+    if (num > res)
+    {
+        res++;
+    }
+
+    return Fixed::from_raw_value(static_cast<B>(res));
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+fixed<B, I, F, R> hypot(fixed<B, I, F, R> x, fixed<B, I, F, R> y) noexcept
+{
+    // Length of the vector (x, y). hypot(0, 0) is well defined and sqrt() returns 0 for a zero argument, so no input is rejected.
+    return sqrt(x*x + y*y);  // NOTE: x*x + y*y has to be representable itself; large inputs overflow before the root is taken
+}
+
+//
+// Trigonometry functions
+//
+
+template <typename B, typename I, unsigned int F, bool R>
+fixed<B, I, F, R> sin(fixed<B, I, F, R> x) noexcept
+{
+    // This sine uses a fifth-order curve-fitting approximation originally
+    // described by Jasper Vijn on coranac.com which has a worst-case
+    // relative error of 0.07% (over [-pi:pi]).
+    using Fixed = fixed<B, I, F, R>;
+
+    // Take x modulo one rotation (fmod keeps the sign of x), then rescale so that a quarter turn is 1: [-4..+4].
+    x = fmod(x, Fixed::two_pi());
+    x = x / Fixed::half_pi();
+
+    // Move negative angles up by one full rotation, so [0..4].
+    if (x < Fixed(0)) {
+        x += Fixed(4);
+    }
+
+    int sign = +1;
+    if (x > Fixed(2)) {
+        // Second half of the rotation: sin(x + PI) == -sin(x). Reduce domain to [0..2] and flip the sign.
+        sign = -1;
+        x -= Fixed(2);
+    }
+
+    if (x > Fixed(1)) {
+        // Mirror around the quarter turn: sin(PI - x) == sin(x). Reduce domain to [0..1].
+        x = Fixed(2) - x;
+    }
+
+    const Fixed x2 = x*x;  // the polynomial below is odd in x, so it is written in terms of x^2
+    return sign * x * (Fixed::pi() - x2*(Fixed::two_pi() - 5 - x2*(Fixed::pi() - 3)))/2;
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+inline fixed<B, I, F, R> cos(fixed<B, I, F, R> x) noexcept
+{
+    using Fixed = fixed<B, I, F, R>;
+    // cos(x) == sin(x + PI/2) == sin(x - 3*PI/2). Positive arguments take the
+    // subtracting form, which prevents an overflow due to the addition of PI/2.
+    const Fixed shifted = (x > Fixed(0)) ? x - (Fixed::two_pi() - Fixed::half_pi())
+                                         : Fixed::half_pi() + x;
+    return sin(shifted);
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+inline fixed<B, I, F, R> tan(fixed<B, I, F, R> x) noexcept
+{
+    // tan(x) == sin(x) / cos(x). Tangent goes to infinity at 90 and -90
+    // degrees, which fixed-point maths cannot represent, so a cosine whose
+    // raw magnitude is 0 or 1 is rejected as a precondition violation.
+    const auto cosine = cos(x);
+    assert(abs(cosine).raw_value() > 1);
+    const auto sine = sin(x);
+    return sine / cosine;
+}
+
+namespace detail {
+
+// Calculates atan(x) assuming that x is in the range [0,1]
+template <typename B, typename I, unsigned int F, bool R>
+fixed<B, I, F, R> atan_sanitized(fixed<B, I, F, R> x) noexcept
+{
+    using Fixed = fixed<B, I, F, R>;
+    assert(x >= Fixed(0) && x <= Fixed(1));
+    // Odd polynomial fA*x^5 + fB*x^3 + fC*x; fC is chosen so that the value at x == 1 is PI/4.
+    constexpr auto fA = Fixed::template from_fixed_point<63>(  716203666280654660ll); //  0.0776509570923569
+    constexpr auto fB = Fixed::template from_fixed_point<63>(-2651115102768076601ll); // -0.287434475393028
+    constexpr auto fC = Fixed::template from_fixed_point<63>( 9178930894564541004ll); //  0.995181681698119  (PI/4 - A - B)
+
+    const auto xx = x * x;
+    return ((fA*xx + fB)*xx + fC)*x;  // Horner form in x^2, multiplied by x once at the end
+}
+
+// Calculate atan(y / x), assuming x != 0.
+//
+// If x is very, very small, y/x can easily overflow the fixed-point range.
+// If q = y/x and q > 1, atan(q) would calculate atan(1/q) as intermediate step
+// anyway. We can shortcut that here and avoid the loss of information, thus
+// improving the accuracy of atan(y/x) for very small x.
+template <typename B, typename I, unsigned int F, bool R>
+fixed<B, I, F, R> atan_div(fixed<B, I, F, R> y, fixed<B, I, F, R> x) noexcept
+{
+    using Fixed = fixed<B, I, F, R>;
+    assert(x != Fixed(0));
+
+    // atan is odd, so every negative operand is replaced by its magnitude and
+    // flips the sign of the result once; two flips cancel each other out.
+    bool negate = false;
+    if (y < Fixed(0)) {
+        y = -y;
+        negate = !negate;
+    }
+    if (x < Fixed(0)) {
+        x = -x;
+        negate = !negate;
+    }
+    assert(y >= Fixed(0));
+    assert(x >  Fixed(0));
+
+    // A quotient above one is evaluated through its reciprocal,
+    // atan(q) == PI/2 - atan(1/q), so the division never exceeds 1.
+    const Fixed angle = (y > x) ? Fixed::half_pi() - detail::atan_sanitized(x / y)
+                                : detail::atan_sanitized(y / x);
+    return negate ? -angle : angle;
+}
+
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+fixed<B, I, F, R> atan(fixed<B, I, F, R> x) noexcept
+{
+    using Fixed = fixed<B, I, F, R>;
+
+    // atan is odd: evaluate it for |x| and restore the sign afterwards.
+    const bool negative = x < Fixed(0);
+    if (negative) {
+        x = -x;
+    }
+
+    // The polynomial only covers [0,1]; above that atan(x) == PI/2 - atan(1/x).
+    const Fixed angle = (x > Fixed(1)) ? Fixed::half_pi() - detail::atan_sanitized(Fixed(1) / x)
+                                       : detail::atan_sanitized(x);
+    return negative ? -angle : angle;
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+fixed<B, I, F, R> asin(fixed<B, I, F, R> x) noexcept
+{
+    using Fixed = fixed<B, I, F, R>;
+    assert(x >= Fixed(-1) && x <= Fixed(+1));
+
+    // asin(x) == atan(x / sqrt(1 - x^2)). When 1 - x^2 is zero the quotient
+    // does not exist and the result is PI/2 carrying the sign of x.
+    const auto one_minus_xx = Fixed(1) - x * x;
+
+    return (one_minus_xx == Fixed(0)) ? copysign(Fixed::half_pi(), x)
+                                      : detail::atan_div(x, sqrt(one_minus_xx));
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+fixed<B, I, F, R> acos(fixed<B, I, F, R> x) noexcept
+{
+    using Fixed = fixed<B, I, F, R>;
+    assert(x >= Fixed(-1) && x <= Fixed(+1));
+
+    // Half-angle form: acos(x) == 2 * atan(sqrt(1 - x^2) / (1 + x)).
+    // At x == -1 the denominator is zero, so that point is answered directly.
+    if (x == Fixed(-1)) {
+        return Fixed::pi();
+    }
+    return Fixed(2)*detail::atan_div(sqrt(Fixed(1) - x * x), Fixed(1) + x);
+}
+
+template <typename B, typename I, unsigned int F, bool R>
+fixed<B, I, F, R> atan2(fixed<B, I, F, R> y, fixed<B, I, F, R> x) noexcept
+{
+    using Fixed = fixed<B, I, F, R>;
+
+    // On the y axis the quotient y/x does not exist: the angle is +/-PI/2.
+    // The origin itself has no angle and is rejected.
+    if (x == Fixed(0)) {
+        assert(y != Fixed(0));
+        return (y > Fixed(0)) ? Fixed::half_pi() : -Fixed::half_pi();
+    }
+
+    // atan_div() yields atan(y/x); in the left half-plane (x < 0) that value
+    // is moved by PI, upwards for y >= 0 and downwards for y < 0.
+    const auto principal = detail::atan_div(y, x);
+    if (x > Fixed(0)) { return principal; }
+    return (y >= Fixed(0)) ? principal + Fixed::pi() : principal - Fixed::pi();
+}
+
+}
+
+#endif
diff --git a/kiss/CHANGELOG b/kiss/CHANGELOG
new file mode 100644
index 0000000..2dd3603
--- /dev/null
+++ b/kiss/CHANGELOG
@@ -0,0 +1,123 @@
+1.3.0 2012-07-18
+  removed non-standard malloc.h from kiss_fft.h
+
+  moved -lm to end of link line
+
+  checked various return values
+
+  converted python Numeric code to NumPy
+ 
+  fixed test of int32_t on 64 bit OS
+
+  added padding in a couple of places to allow SIMD alignment of structs
+
+1.2.9 2010-05-27
+  threadsafe ( including OpenMP )
+
+  first edition of kissfft.hh the C++ template fft engine
+ 
+1.2.8 
+  Changed memory.h to string.h -- apparently more standard
+  
+  Added openmp extensions.  This can have fairly linear speedups for larger FFT sizes.
+
+1.2.7 
+  Shrank the real-fft memory footprint. Thanks to Galen Seitz.
+
+1.2.6 (Nov 14, 2006) The "thanks to GenArts" release.
+  Added multi-dimensional real-optimized FFT, see tools/kiss_fftndr
+  Thanks go to GenArts, Inc. for sponsoring the development.
+
+1.2.5 (June 27, 2006) The "release for no good reason" release.
+   Changed some harmless code to make some compilers' warnings go away.
+   Added some more digits to pi -- why not.
+   Added kiss_fft_next_fast_size() function to help people decide how much to pad.
+   Changed multidimensional test from 8 dimensions to only 3 to avoid testing 
+   problems with fixed point (sorry Buckaroo Banzai).
+
+1.2.4 (Oct 27, 2005)   The "oops, inverse fixed point real fft was borked" release. 
+   Fixed scaling bug for inverse fixed point real fft -- also fixed test code that should've been failing.
+    Thanks to Jean-Marc Valin for bug report.
+
+   Use sys/types.h for more portable types than short,int,long => int16_t,int32_t,int64_t
+   If your system does not have these, you may need to define them -- but at least it breaks in a 
+   loud and easily fixable way -- unlike silently using the wrong size type.
+
+   Hopefully tools/psdpng.c is fixed -- thanks to Steve Kellog for pointing out the weirdness.
+
+1.2.3 (June 25, 2005)   The "you want to use WHAT as a sample" release.
+    Added ability to use 32 bit fixed point samples -- requires a 64 bit intermediate result, a la 'long long'
+
+    Added ability to do 4 FFTs in parallel by using SSE SIMD instructions. This is accomplished by
+    using the __m128 (vector of 4 floats) as kiss_fft_scalar.  Define USE_SIMD to use this.
+    
+    I know, I know ...  this is drifting a bit from the "kiss" principle, but the speed advantages 
+    make it worth it for some.  Also recent gcc makes it SOO easy to use vectors of 4 floats like a POD type.
+
+1.2.2 (May 6, 2005)   The Matthew release
+    Replaced fixed point division with multiply&shift.  Thanks to Jean-Marc Valin for 
+    discussions regarding.  Considerable speedup for fixed-point.
+
+    Corrected overflow protection in real fft routines  when using fixed point.
+    Finder's Credit goes to Robert Oschler of robodance for pointing me at the bug.
+    This also led to the CHECK_OVERFLOW_OP macro.
+
+1.2.1 (April 4, 2004) 
+    compiles cleanly with just about every -W warning flag under the sun
+
+    reorganized kiss_fft_state so it could be read-only/const. This may be useful for embedded systems
+    that are willing to predeclare twiddle factors, factorization.
+
+    Fixed C_MUL,S_MUL on 16-bit platforms.
+
+    tmpbuf will only be allocated if input & output buffers are same
+    scratchbuf will only be allocated for ffts that are not multiples of 2,3,5
+ 
+    NOTE: The tmpbuf,scratchbuf changes may require synchronization code for multi-threaded apps.
+
+
+1.2 (Feb 23, 2004)
+    interface change -- cfg object is forward declaration of struct instead of void*
+    This maintains type safety and lets the compiler warn/error about stupid mistakes.
+            (prompted by suggestion from Erik de Castro Lopo)
+
+    small speed improvements
+
+    added psdpng.c -- sample utility that will create png spectrum "waterfalls" from an input file
+        ( not terribly useful yet)
+
+1.1.1 (Feb 1, 2004 )
+    minor bug fix -- only affects odd rank, in-place, multi-dimensional FFTs
+
+1.1 : (Jan 30,2004)
+    split sample_code/ into test/ and tools/
+
+    Removed 2-D fft and added N-D fft (arbitrary)
+
+    modified fftutil.c to allow multi-d FFTs
+
+    Modified core fft routine to allow an input stride via kiss_fft_stride()
+    (eased support of multi-D ffts)
+
+    Added fast convolution filtering (FIR filtering using overlap-scrap method, with tail scrap)
+
+    Add kfc.[ch]: the KISS FFT Cache. It takes care of allocs for you ( suggested by Oscar Lesta ).
+
+1.0.1 (Dec 15, 2003)
+    fixed bug that occurred when nfft==1. Thanks to Steven Johnson.
+    
+1.0 : (Dec 14, 2003)
+    changed kiss_fft function from using a single buffer, to two buffers.
+    If the same buffer pointer is supplied for both in and out, kiss will
+    manage the buffer copies.
+
+    added kiss_fft2d and kiss_fftr as separate source files (declarations in kiss_fft.h )
+
+0.4 :(Nov 4,2003) optimized for radix 2,3,4,5
+
+0.3 :(Oct 28, 2003) woops, version 2 didn't actually factor out any radices other than 2.
+        Thanks to Steven Johnson for finding this one.
+
+0.2 :(Oct 27, 2003) added mixed radix, only radix 2,4 optimized versions
+
+0.1 :(May 19 2003)  initial release, radix 2 only
diff --git a/kiss/COPYING b/kiss/COPYING
new file mode 100644
index 0000000..6b4b622
--- /dev/null
+++ b/kiss/COPYING
@@ -0,0 +1,11 @@
+Copyright (c) 2003-2010 Mark Borgerding . All rights reserved.
+
+KISS FFT is provided under:
+
+  SPDX-License-Identifier: BSD-3-Clause
+
+Being under the terms of the BSD 3-clause "New" or "Revised" License,
+according with:
+
+  LICENSES/BSD-3-Clause
+
diff --git a/kiss/README.md b/kiss/README.md
new file mode 100644
index 0000000..1138a0c
--- /dev/null
+++ b/kiss/README.md
@@ -0,0 +1,245 @@
+# KISS FFT [![Build Status](https://travis-ci.com/mborgerding/kissfft.svg?branch=master)](https://travis-ci.com/mborgerding/kissfft)
+
+KISS FFT - A mixed-radix Fast Fourier Transform based upon the principle,
+"Keep It Simple, Stupid."
+
+There are many great fft libraries already around.  Kiss FFT is not trying
+to be better than any of them.  It only attempts to be a reasonably efficient, 
+moderately useful FFT that can use fixed or floating data types and can be 
+incorporated into someone's C program in a few minutes with trivial licensing.
+
+## USAGE:
+
+The basic usage for 1-d complex FFT is:
+
+```c
+    #include "kiss_fft.h"
+    kiss_fft_cfg cfg = kiss_fft_alloc( nfft ,is_inverse_fft ,0,0 );
+    while ...
+    
+        ... // put kth sample in cx_in[k].r and cx_in[k].i
+        
+        kiss_fft( cfg , cx_in , cx_out );
+        
+        ... // transformed. DC is in cx_out[0].r and cx_out[0].i 
+        
+    kiss_fft_free(cfg);
+```
+ - **Note**: frequency-domain data is stored from dc up to 2pi.
+    so cx_out[0] is the dc bin of the FFT
+    and cx_out[nfft/2] is the Nyquist bin (if exists)
+
+Declarations are in "kiss_fft.h", along with a brief description of the 
+functions you'll need to use. 
+
+Code definitions for 1d complex FFTs are in kiss_fft.c.
+
+You can do other cool stuff with the extras you'll find in tools/
+> - multi-dimensional FFTs 
+> - real-optimized FFTs  (returns the positive half-spectrum: 
+    (nfft/2+1) complex frequency bins)
+> - fast convolution FIR filtering (not available for fixed point)
+> - spectrum image creation
+
+The core fft and most tools/ code can be compiled to use float, double,
+ Q15 short or Q31 samples. The default is float.
+
+## BUILDING:
+
+There are two functionally-equivalent build systems supported by kissfft:
+
+ - Make (traditional Makefiles for Unix / Linux systems)
+ - CMake (more modern and feature-rich build system developed by Kitware)
+
+To build kissfft, the following build environment can be used:
+
+ - GNU build environment with GCC, Clang and GNU Make or CMake (>= 3.6)
+ - Microsoft Visual C++ (MSVC) with CMake (>= 3.6)
+
+Additional libraries required to build and test kissfft include:
+
+ - libpng for psdpng tool,
+ - libfftw3 to validate kissfft results against it,
+ - python 2/3 with Numpy to validate kissfft results against it.
+ - OpenMP supported by GCC, Clang or MSVC for multi-core FFT transformations
+
+Environments like Cygwin and MinGW can most likely be used to build kissfft
+targeting the Windows platform, but no tests have been performed to date.
+
+Both Make and CMake builds are easily configurable:
+
+ - `KISSFFT_DATATYPE=<datatype>` (for Make) or `-DKISSFFT_DATATYPE=<datatype>`
+   (for CMake) denote the principal datatype used by kissfft. It can be one
+   of the following:
+
+   - float (default)
+   - double
+   - int16_t
+   - int32_t
+   - SIMD (requires SSE instruction set support on target CPU)
+
+ - `KISSFFT_OPENMP=1` (for Make) or `-DKISSFFT_OPENMP=ON` (for CMake) builds kissfft
+   with OpenMP support. Please note that a supported compiler is required and this
+   option is turned off by default.
+
+ - `KISSFFT_STATIC=1` (for Make) or `-DKISSFFT_STATIC=ON` (for CMake) instructs
+   the builder to create static library ('.lib' for Windows / '.a' for Unix or Linux).
+   By default, this option is turned off and the shared library is created
+   ('.dll' for Windows, '.so' for Linux or Unix, '.dylib' for Mac OSX)
+
+ - `-DKISSFFT_TEST=OFF` (for CMake) disables building tests for kissfft. On Make,
+   building tests is done separately by 'make testall' or 'make testsingle', so
+   no specific setting is required.
+
+ - `KISSFFT_TOOLS=0` (for Make) or `-DKISSFFT_TOOLS=OFF` (for CMake) builds kissfft
+   without command-line tools like 'fastconv'. By default the tools are built.
+
+ - `KISSFFT_USE_ALLOCA=1` (for Make) or `-DKISSFFT_USE_ALLOCA=ON` (for CMake)
+   builds kissfft with 'alloca' usage instead of 'malloc' / 'free'.
+
+ - `PREFIX=/full/path/to/installation/prefix/directory` (for Make) or
+   `-DCMAKE_INSTALL_PREFIX=/full/path/to/installation/prefix/directory` (for CMake)
+   specifies the prefix directory to install kissfft into.
+
+For example, to build kissfft as a static library with 'int16_t' datatype and
+OpenMP support using Make, run the command from kissfft source tree:
+
+```
+make KISSFFT_DATATYPE=int16_t KISSFFT_STATIC=1 KISSFFT_OPENMP=1 all
+```
+
+The same configuration for CMake is:
+
+```
+mkdir build && cd build
+cmake -DKISSFFT_DATATYPE=int16_t -DKISSFFT_STATIC=ON -DKISSFFT_OPENMP=ON ..
+make all
+```
+
+To specify '/tmp/1234' as installation prefix directory, run:
+
+
+```
+make PREFIX=/tmp/1234 KISSFFT_DATATYPE=int16_t KISSFFT_STATIC=1 KISSFFT_OPENMP=1 install
+```
+
+or
+
+```
+mkdir build && cd build
+cmake -DCMAKE_INSTALL_PREFIX=/tmp/1234 -DKISSFFT_DATATYPE=int16_t -DKISSFFT_STATIC=ON -DKISSFFT_OPENMP=ON ..
+make all
+make install
+```
+
+## TESTING:
+
+To validate the build configured as an example above, run the following command from
+kissfft source tree:
+
+```
+make KISSFFT_DATATYPE=int16_t KISSFFT_STATIC=1 KISSFFT_OPENMP=1 testsingle
+```
+
+if using Make, or:
+
+```
+make test
+```
+
+if using CMake.
+
+To test all possible build configurations, please run an extended testsuite from
+kissfft source tree:
+
+```
+sh test/kissfft-testsuite.sh
+```
+
+Please note that the extended testsuite takes around 20-40 minutes depending on device
+it runs on. This testsuite is useful for reporting bugs or testing the pull requests.
+
+## BACKGROUND
+
+I started coding this because I couldn't find a fixed point FFT that didn't 
+use assembly code.  I started with floating point numbers so I could get the 
+theory straight before working on fixed point issues.  In the end, I had a 
+little bit of code that could be recompiled easily to do ffts with short, float
+or double (other types should be easy too).  
+
+Once I got my FFT working, I was curious about the speed compared to
+a well respected and highly optimized fft library.  I don't want to criticize 
+this great library, so let's call it FFT_BRANDX.
+During this process, I learned:
+
+> 1. FFT_BRANDX has more than 100K lines of code. The core of kiss_fft is about 500 lines (cpx 1-d).
+> 2. It took me an embarrassingly long time to get FFT_BRANDX working.
+> 3. A simple program using FFT_BRANDX is 522KB. A similar program using kiss_fft is 18KB (without optimizing for size).
+> 4. FFT_BRANDX is roughly twice as fast as KISS FFT in default mode.
+
+It is wonderful that free, highly optimized libraries like FFT_BRANDX exist.
+But such libraries carry a huge burden of complexity necessary to extract every 
+last bit of performance.
+
+**Sometimes simpler is better, even if it's not better.**
+
+## FREQUENTLY ASKED QUESTIONS:
+> Q: Can I use kissfft in a project with a ___ license?<br>
+> A: Yes.  See LICENSE below.
+
+> Q: Why don't I get the output I expect?<br>
+> A: The two most common causes of this are
+> 	1) scaling : is there a constant multiplier between what you got and what you want?
+> 	2) mixed build environment -- all code must be compiled with same preprocessor 
+> 	definitions for FIXED_POINT and kiss_fft_scalar
+
+> Q: Will you write/debug my code for me?<br>
+> A: Probably not unless you pay me.  I am happy to answer pointed and topical questions, but 
+> I may refer you to a book, a forum, or some other resource.
+
+
+## PERFORMANCE
+    (on Athlon XP 2100+, with gcc 2.96, float data type)
+
+Kiss performed 10000 1024-pt cpx ffts in .63 s of cpu time.
+For comparison, it took md5sum twice as long to process the same amount of data.
+Transforming 5 minutes of CD quality audio takes less than a second (nfft=1024). 
+
+**DO NOT:**
+- use Kiss if you need the Fastest Fourier Transform in the World
+- ask me to add features that will bloat the code
+
+## UNDER THE HOOD
+
+Kiss FFT uses a time decimation, mixed-radix, out-of-place FFT. If you give it an input buffer  
+and output buffer that are the same, a temporary buffer will be created to hold the data.
+
+No static data is used.  The core routines of kiss_fft are thread-safe (but not all of the tools directory).
+
+No scaling is done for the floating point version (for speed).  
+Scaling is done both ways for the fixed-point version (for overflow prevention).
+
+Optimized butterflies are used for factors 2,3,4, and 5. 
+
+The real (i.e. not complex) optimization code only works for even length ffts.  It does two half-length
+FFTs in parallel (packed into real&imag), and then combines them via twiddling.  The result is 
+nfft/2+1 complex frequency bins from DC to Nyquist.  If you don't know what this means, search the web.
+
+The fast convolution filtering uses the overlap-scrap method, slightly 
+modified to put the scrap at the tail.
+
+## LICENSE
+    Revised BSD License, see COPYING for verbiage. 
+    Basically, "free to use&change, give credit where due, no guarantees"
+    Note this license is compatible with GPL at one end of the spectrum and closed, commercial software at 
+    the other end.  See http://www.fsf.org/licensing/licenses
+  
+## TODO
+ - Add real optimization for odd length FFTs 
+ - Document/revisit the input/output fft scaling
+ - Make doc describing the overlap (tail) scrap fast convolution filtering in kiss_fastfir.c
+ - Test all the ./tools/ code with fixed point (kiss_fastfir.c doesn't work, maybe others)
+
+## AUTHOR
+    Mark Borgerding
+    Mark@Borgerding.net
diff --git a/kiss/_kiss_fft_guts.h b/kiss/_kiss_fft_guts.h
new file mode 100644
index 0000000..4bd8d1c
--- /dev/null
+++ b/kiss/_kiss_fft_guts.h
@@ -0,0 +1,167 @@
+/*
+ *  Copyright (c) 2003-2010, Mark Borgerding. All rights reserved.
+ *  This file is part of KISS FFT - https://github.com/mborgerding/kissfft
+ *
+ *  SPDX-License-Identifier: BSD-3-Clause
+ *  See COPYING file for more information.
+ */
+
+/* kiss_fft.h
+   defines kiss_fft_scalar as either short or a float type
+   and defines
+   typedef struct { kiss_fft_scalar r; kiss_fft_scalar i; }kiss_fft_cpx; */
+
+#ifndef _kiss_fft_guts_h
+#define _kiss_fft_guts_h
+
+#include "kiss_fft.h"
+#include "kiss_fft_log.h"
+#include <limits.h>
+
+#define MAXFACTORS 32
+/* e.g. an fft of length 128 has 4 factors
+ as far as kissfft is concerned
+ 4*4*4*2
+ */
+
+struct kiss_fft_state{
+    int nfft;                  /* transform length, stored by kiss_fft_alloc */
+    int inverse;               /* nonzero selects the inverse transform */
+    int factors[2*MAXFACTORS]; /* (radix, remaining length) pairs written by kf_factor */
+    kiss_fft_cpx twiddles[1];  /* first of nfft twiddles; kiss_fft_alloc sizes the allocation for nfft-1 more */
+};
+
+/*
+  Explanation of macros dealing with complex math:
+
+   C_MUL(m,a,b)         : m = a*b
+   C_FIXDIV( c , div )  : if a fixed point impl., c /= div. noop otherwise
+   C_SUB( res, a,b)     : res = a - b
+   C_SUBFROM( res , a)  : res -= a
+   C_ADDTO( res , a)    : res += a
+ * */
+#ifdef FIXED_POINT
+#include <stdint.h>
+#if (FIXED_POINT==32)
+# define FRACBITS 31
+# define SAMPPROD int64_t
+#define SAMP_MAX INT32_MAX
+#define SAMP_MIN INT32_MIN
+#else
+# define FRACBITS 15
+# define SAMPPROD int32_t
+#define SAMP_MAX INT16_MAX
+#define SAMP_MIN INT16_MIN
+#endif
+
+#if defined(CHECK_OVERFLOW)
+#  define CHECK_OVERFLOW_OP(a,op,b)  \
+    if ( (SAMPPROD)(a) op (SAMPPROD)(b) > SAMP_MAX || (SAMPPROD)(a) op (SAMPPROD)(b) < SAMP_MIN ) { \
+        KISS_FFT_WARNING("overflow (%d " #op" %d) = %ld", (a),(b),(SAMPPROD)(a) op (SAMPPROD)(b)); }
+#endif
+
+
+#   define smul(a,b) ( (SAMPPROD)(a)*(b) )
+#   define sround( x )  (kiss_fft_scalar)( ( (x) + (1<<(FRACBITS-1)) ) >> FRACBITS )
+
+#   define S_MUL(a,b) sround( smul(a,b) )
+
+#   define C_MUL(m,a,b) \
+      do{ (m).r = sround( smul((a).r,(b).r) - smul((a).i,(b).i) ); \
+          (m).i = sround( smul((a).r,(b).i) + smul((a).i,(b).r) ); }while(0)
+
+#   define DIVSCALAR(x,k) /* x = x/k, done as a rounded multiply by SAMP_MAX/k */ \
+    (x) = sround( smul(  (x), SAMP_MAX/(k) ) )
+/* C_FIXDIV: apply DIVSCALAR to both the real and imaginary part of c */
+#   define C_FIXDIV(c,div) \
+    do {    DIVSCALAR( (c).r , div);  \
+        DIVSCALAR( (c).i  , div); }while (0)
+
+#   define C_MULBYSCALAR( c, s ) \
+    do{ (c).r =  sround( smul( (c).r , s ) ) ;\
+        (c).i =  sround( smul( (c).i , s ) ) ; }while(0)
+
+#else  /* not FIXED_POINT*/
+
+#   define S_MUL(a,b) ( (a)*(b) )
+#define C_MUL(m,a,b) \
+    do{ (m).r = (a).r*(b).r - (a).i*(b).i;\
+        (m).i = (a).r*(b).i + (a).i*(b).r; }while(0)
+#   define C_FIXDIV(c,div) /* NOOP */
+#   define C_MULBYSCALAR( c, s ) \
+    do{ (c).r *= (s);\
+        (c).i *= (s); }while(0)
+#endif
+
+#ifndef CHECK_OVERFLOW_OP
+#  define CHECK_OVERFLOW_OP(a,op,b) /* noop */
+#endif
+
+#define  C_ADD( res, a,b)\
+    do { \
+        CHECK_OVERFLOW_OP((a).r,+,(b).r)\
+        CHECK_OVERFLOW_OP((a).i,+,(b).i)\
+        (res).r=(a).r+(b).r;  (res).i=(a).i+(b).i; \
+    }while(0)
+#define  C_SUB( res, a,b)\
+    do { \
+        CHECK_OVERFLOW_OP((a).r,-,(b).r)\
+        CHECK_OVERFLOW_OP((a).i,-,(b).i)\
+        (res).r=(a).r-(b).r;  (res).i=(a).i-(b).i; \
+    }while(0)
+#define C_ADDTO( res , a)\
+    do { \
+        CHECK_OVERFLOW_OP((res).r,+,(a).r)\
+        CHECK_OVERFLOW_OP((res).i,+,(a).i)\
+        (res).r += (a).r;  (res).i += (a).i;\
+    }while(0)
+
+#define C_SUBFROM( res , a)\
+    do {\
+        CHECK_OVERFLOW_OP((res).r,-,(a).r)\
+        CHECK_OVERFLOW_OP((res).i,-,(a).i)\
+        (res).r -= (a).r;  (res).i -= (a).i; \
+    }while(0)
+
+
+#ifdef FIXED_POINT
+#  define KISS_FFT_COS(phase)  floor(.5+SAMP_MAX * cos (phase))
+#  define KISS_FFT_SIN(phase)  floor(.5+SAMP_MAX * sin (phase))
+#  define HALF_OF(x) ((x)>>1)
+#elif defined(USE_SIMD)
+#  define KISS_FFT_COS(phase) _mm_set1_ps( cos(phase) )
+#  define KISS_FFT_SIN(phase) _mm_set1_ps( sin(phase) )
+#  define HALF_OF(x) ((x)*_mm_set1_ps(.5))
+#else
+#  define KISS_FFT_COS(phase) (kiss_fft_scalar) cos(phase)
+#  define KISS_FFT_SIN(phase) (kiss_fft_scalar) sin(phase)
+#  define HALF_OF(x) ((x)*((kiss_fft_scalar).5))
+#endif
+
+#define  kf_cexp(x,phase) \
+    do{ \
+        (x)->r = KISS_FFT_COS(phase);\
+        (x)->i = KISS_FFT_SIN(phase);\
+    }while(0)
+
+
+/* a debugging function */
+#define pcpx(c)\
+    KISS_FFT_DEBUG("%g + %gi\n",(double)((c)->r),(double)((c)->i))
+
+
+#ifdef KISS_FFT_USE_ALLOCA
+// define this to allow use of alloca instead of malloc for temporary buffers
+// Temporary buffers are used in two cases:
+// 1. FFT sizes that have "bad" factors. i.e. not 2,3 and 5
+// 2. "in-place" FFTs.  Notice the quotes, since kissfft does not really do an in-place transform.
+#include <alloca.h>
+#define  KISS_FFT_TMP_ALLOC(nbytes) alloca(nbytes)
+#define  KISS_FFT_TMP_FREE(ptr)
+#else
+#define  KISS_FFT_TMP_ALLOC(nbytes) KISS_FFT_MALLOC(nbytes)
+#define  KISS_FFT_TMP_FREE(ptr) KISS_FFT_FREE(ptr)
+#endif
+
+#endif /* _kiss_fft_guts_h */
+
diff --git a/kiss/kfc.c b/kiss/kfc.c
new file mode 100644
index 0000000..a405d9b
--- /dev/null
+++ b/kiss/kfc.c
@@ -0,0 +1,109 @@
+/*
+ *  Copyright (c) 2003-2004, Mark Borgerding. All rights reserved.
+ *  This file is part of KISS FFT - https://github.com/mborgerding/kissfft
+ *
+ *  SPDX-License-Identifier: BSD-3-Clause
+ *  See COPYING file for more information.
+ */
+
+#include "kfc.h"
+
+typedef struct cached_fft *kfc_cfg;
+
+struct cached_fft
+{
+    int nfft;         /* FFT length this entry was created for */
+    int inverse;      /* direction flag this entry was created for */
+    kiss_fft_cfg cfg; /* points into the same allocation, after this header */
+    kfc_cfg next;     /* next entry in the list, NULL at the tail */
+};
+
+static kfc_cfg cache_root=NULL; /* head of the singly linked cache list */
+static int ncached=0;           /* number of entries currently in the list */
+
+static kiss_fft_cfg find_cached_fft(int nfft,int inverse) /* returns the cached cfg for (nfft,inverse), creating it on a miss; NULL if allocation fails */
+{
+    size_t len;
+    kfc_cfg  cur=cache_root;
+    kfc_cfg  prev=NULL;
+    while ( cur ) { /* linear scan; prev ends at the tail when nothing matches */
+        if ( cur->nfft == nfft && inverse == cur->inverse )
+            break;/*found the right node*/
+        prev = cur;
+        cur = prev->next;
+    }
+    if (cur== NULL) {
+        /* no cached node found, need to create a new one*/
+        kiss_fft_alloc(nfft,inverse,0,&len); /* sizing call: mem is 0, so only len is written */
+#ifdef USE_SIMD
+        int padding = (16-sizeof(struct cached_fft)) & 15;
+        // make sure the cfg aligns on a 16 byte boundary
+        len += padding;
+#endif
+        cur = (kfc_cfg)KISS_FFT_MALLOC((sizeof(struct cached_fft) + len )); /* header and cfg share one allocation */
+        if (cur == NULL)
+            return NULL;
+        cur->cfg = (kiss_fft_cfg)(cur+1); /* cfg storage starts right after the header */
+#ifdef USE_SIMD
+        cur->cfg = (kiss_fft_cfg) ((char*)(cur+1)+padding);
+#endif
+        kiss_fft_alloc(nfft,inverse,cur->cfg,&len); /* initialises the cfg in place */
+        cur->nfft=nfft;
+        cur->inverse=inverse;
+        cur->next = NULL;
+        if ( prev )
+            prev->next = cur; /* append after the current tail */
+        else
+            cache_root = cur; /* list was empty */
+        ++ncached;
+    }
+    return cur->cfg;
+}
+
+void kfc_cleanup(void) /* releases every cached cfg and resets the cache to empty */
+{
+    kfc_cfg  cur=cache_root;
+    kfc_cfg  next=NULL;
+    while (cur){
+        next = cur->next; /* read the link before the node is released */
+        KISS_FFT_FREE(cur); /* nodes come from KISS_FFT_MALLOC, so release with the matching macro */
+        cur=next;
+    }
+    ncached=0;
+    cache_root = NULL;
+}
+void kfc_fft(int nfft, const kiss_fft_cpx * fin,kiss_fft_cpx * fout) /* forward FFT of length nfft using a cached cfg */
+{
+    kiss_fft( find_cached_fft(nfft,0),fin,fout ); /* NOTE(review): find_cached_fft returns NULL on allocation failure and that is passed on unchecked */
+}
+
+void kfc_ifft(int nfft, const kiss_fft_cpx * fin,kiss_fft_cpx * fout) /* inverse FFT of length nfft using a cached cfg */
+{
+    kiss_fft( find_cached_fft(nfft,1),fin,fout ); /* NOTE(review): find_cached_fft returns NULL on allocation failure and that is passed on unchecked */
+}
+
+#ifdef KFC_TEST
+static void check(int nc) /* exits with status 1 unless the cache holds exactly nc entries */
+{
+    if (ncached != nc) {
+        fprintf(stderr,"ncached should be %d,but it is %d\n",nc,ncached);
+        exit(1);
+    }
+}
+
+int main(void) /* self-test of the cache bookkeeping, built only when KFC_TEST is defined */
+{
+    kiss_fft_cpx buf1[1024],buf2[1024];
+    memset(buf1,0,sizeof(buf1));
+    check(0);
+    kfc_fft(512,buf1,buf2);
+    check(1); /* first forward 512 creates one entry */
+    kfc_fft(512,buf1,buf2);
+    check(1); /* the repeat call reuses it */
+    kfc_ifft(512,buf1,buf2);
+    check(2); /* inverse 512 is a separate entry */
+    kfc_cleanup();
+    check(0);
+    return 0;
+}
+#endif
diff --git a/kiss/kfc.h b/kiss/kfc.h
new file mode 100644
index 0000000..d7d8c1b
--- /dev/null
+++ b/kiss/kfc.h
@@ -0,0 +1,54 @@
+/*
+ *  Copyright (c) 2003-2004, Mark Borgerding. All rights reserved.
+ *  This file is part of KISS FFT - https://github.com/mborgerding/kissfft
+ *
+ *  SPDX-License-Identifier: BSD-3-Clause
+ *  See COPYING file for more information.
+ */
+
+#ifndef KFC_H
+#define KFC_H
+#include "kiss_fft.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+KFC -- Kiss FFT Cache
+
+Not needing to deal with kiss_fft_alloc and a config 
+object may be handy for a lot of programs.
+
+KFC uses the underlying KISS FFT functions, but caches the config object. 
+The first time kfc_fft or kfc_ifft is called for a given FFT size, the cfg 
+object is created for it.  All subsequent calls use the cached 
+configuration object.
+
+NOTE:
+You should probably not use this if your program will be using a lot 
+of various sizes of FFTs.  There is a linear search through the
+cached objects.  If you are only using one or two FFT sizes, this
+will be negligible. Otherwise, you may want to use another method 
+of managing the cfg objects.
+ 
+ There is no automated cleanup of the cached objects.  This could lead 
+to large memory usage in a program that uses a lot of *DIFFERENT* 
+sized FFTs.  If you want to force all cached cfg objects to be freed,
+call kfc_cleanup.
+ 
+ */
+
+/*forward complex FFT */
+void KISS_FFT_API kfc_fft(int nfft, const kiss_fft_cpx * fin,kiss_fft_cpx * fout);
+/*reverse complex FFT */
+void KISS_FFT_API kfc_ifft(int nfft, const kiss_fft_cpx * fin,kiss_fft_cpx * fout);
+
+/*free all cached objects*/
+void KISS_FFT_API kfc_cleanup(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/kiss/kiss_fft.c b/kiss/kiss_fft.c
new file mode 100644
index 0000000..58c24a0
--- /dev/null
+++ b/kiss/kiss_fft.c
@@ -0,0 +1,420 @@
+/*
+ *  Copyright (c) 2003-2010, Mark Borgerding. All rights reserved.
+ *  This file is part of KISS FFT - https://github.com/mborgerding/kissfft
+ *
+ *  SPDX-License-Identifier: BSD-3-Clause
+ *  See COPYING file for more information.
+ */
+
+
+#include "_kiss_fft_guts.h"
+/* The guts header contains all the multiplication and addition macros that are defined for
+ fixed or floating point complex numbers.  It also declares the kf_ internal functions.
+ */
+
+static void kf_bfly2( /* radix-2 butterfly: combines m pairs (Fout[k], Fout[k+m]) in place */
+        kiss_fft_cpx * Fout,
+        const size_t fstride,
+        const kiss_fft_cfg st,
+        int m
+        )
+{
+    kiss_fft_cpx * Fout2;
+    kiss_fft_cpx * tw1 = st->twiddles;
+    kiss_fft_cpx t;
+    Fout2 = Fout + m;
+    do{
+        C_FIXDIV(*Fout,2); C_FIXDIV(*Fout2,2); /* scales by 1/2 in fixed-point builds; expands to nothing otherwise */
+
+        C_MUL (t,  *Fout2 , *tw1); /* t = Fout2 * twiddle */
+        tw1 += fstride; /* the twiddle table is walked in steps of fstride */
+        C_SUB( *Fout2 ,  *Fout , t ); /* Fout2 = Fout - t */
+        C_ADDTO( *Fout ,  t ); /* Fout = Fout + t */
+        ++Fout2;
+        ++Fout;
+    }while (--m); /* do/while: the body runs before m is tested, so m must be >= 1 on entry */
+}
+
+static void kf_bfly4( /* radix-4 butterfly over m groups (Fout[k], Fout[k+m], Fout[k+2m], Fout[k+3m]) */
+        kiss_fft_cpx * Fout,
+        const size_t fstride,
+        const kiss_fft_cfg st,
+        const size_t m
+        )
+{
+    kiss_fft_cpx *tw1,*tw2,*tw3;
+    kiss_fft_cpx scratch[6];
+    size_t k=m;
+    const size_t m2=2*m;
+    const size_t m3=3*m;
+
+
+    tw3 = tw2 = tw1 = st->twiddles;
+
+    do {
+        C_FIXDIV(*Fout,4); C_FIXDIV(Fout[m],4); C_FIXDIV(Fout[m2],4); C_FIXDIV(Fout[m3],4);
+        /* multiply the three upper inputs by their twiddles */
+        C_MUL(scratch[0],Fout[m] , *tw1 );
+        C_MUL(scratch[1],Fout[m2] , *tw2 );
+        C_MUL(scratch[2],Fout[m3] , *tw3 );
+
+        C_SUB( scratch[5] , *Fout, scratch[1] );
+        C_ADDTO(*Fout, scratch[1]);
+        C_ADD( scratch[3] , scratch[0] , scratch[2] );
+        C_SUB( scratch[4] , scratch[0] , scratch[2] );
+        C_SUB( Fout[m2], *Fout, scratch[3] );
+        tw1 += fstride; /* the three twiddle pointers advance at 1x, 2x and 3x fstride */
+        tw2 += fstride*2;
+        tw3 += fstride*3;
+        C_ADDTO( *Fout , scratch[3] );
+
+        if(st->inverse) { /* the two branches differ only in the sign applied to scratch[4] */
+            Fout[m].r = scratch[5].r - scratch[4].i;
+            Fout[m].i = scratch[5].i + scratch[4].r;
+            Fout[m3].r = scratch[5].r + scratch[4].i;
+            Fout[m3].i = scratch[5].i - scratch[4].r;
+        }else{
+            Fout[m].r = scratch[5].r + scratch[4].i;
+            Fout[m].i = scratch[5].i - scratch[4].r;
+            Fout[m3].r = scratch[5].r - scratch[4].i;
+            Fout[m3].i = scratch[5].i + scratch[4].r;
+        }
+        ++Fout;
+    }while(--k);
+}
+
+static void kf_bfly3( /* radix-3 butterfly over m triples (Fout[k], Fout[k+m], Fout[k+2m]) */
+         kiss_fft_cpx * Fout,
+         const size_t fstride,
+         const kiss_fft_cfg st,
+         size_t m
+         )
+{
+     size_t k=m;
+     const size_t m2 = 2*m;
+     kiss_fft_cpx *tw1,*tw2;
+     kiss_fft_cpx scratch[5];
+     kiss_fft_cpx epi3;
+     epi3 = st->twiddles[fstride*m]; /* only epi3.i is used below */
+
+     tw1=tw2=st->twiddles;
+
+     do{
+         C_FIXDIV(*Fout,3); C_FIXDIV(Fout[m],3); C_FIXDIV(Fout[m2],3);
+
+         C_MUL(scratch[1],Fout[m] , *tw1);
+         C_MUL(scratch[2],Fout[m2] , *tw2);
+
+         C_ADD(scratch[3],scratch[1],scratch[2]); /* sum of the two twiddled inputs */
+         C_SUB(scratch[0],scratch[1],scratch[2]); /* difference of the two twiddled inputs */
+         tw1 += fstride;
+         tw2 += fstride*2;
+
+         Fout[m].r = Fout->r - HALF_OF(scratch[3].r);
+         Fout[m].i = Fout->i - HALF_OF(scratch[3].i);
+
+         C_MULBYSCALAR( scratch[0] , epi3.i );
+
+         C_ADDTO(*Fout,scratch[3]);
+
+         Fout[m2].r = Fout[m].r + scratch[0].i;
+         Fout[m2].i = Fout[m].i - scratch[0].r;
+
+         Fout[m].r -= scratch[0].i;
+         Fout[m].i += scratch[0].r;
+
+         ++Fout;
+     }while(--k);
+}
+
+static void kf_bfly5( /* radix-5 butterfly over m groups of five outputs spaced m apart */
+        kiss_fft_cpx * Fout,
+        const size_t fstride,
+        const kiss_fft_cfg st,
+        int m
+        )
+{
+    kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
+    int u;
+    kiss_fft_cpx scratch[13];
+    kiss_fft_cpx * twiddles = st->twiddles;
+    kiss_fft_cpx *tw;
+    kiss_fft_cpx ya,yb;
+    ya = twiddles[fstride*m]; /* ya and yb are read from the twiddle table at fstride*m and fstride*2*m */
+    yb = twiddles[fstride*2*m];
+
+    Fout0=Fout;
+    Fout1=Fout0+m;
+    Fout2=Fout0+2*m;
+    Fout3=Fout0+3*m;
+    Fout4=Fout0+4*m;
+
+    tw=st->twiddles;
+    for ( u=0; u<m; ++u ) {
+        C_FIXDIV( *Fout0,5); C_FIXDIV( *Fout1,5); C_FIXDIV( *Fout2,5); C_FIXDIV( *Fout3,5); C_FIXDIV( *Fout4,5);
+        scratch[0] = *Fout0; /* keep the unmodified first input; Fout0 is accumulated into below */
+
+        C_MUL(scratch[1] ,*Fout1, tw[u*fstride]);
+        C_MUL(scratch[2] ,*Fout2, tw[2*u*fstride]);
+        C_MUL(scratch[3] ,*Fout3, tw[3*u*fstride]);
+        C_MUL(scratch[4] ,*Fout4, tw[4*u*fstride]);
+
+        C_ADD( scratch[7],scratch[1],scratch[4]);
+        C_SUB( scratch[10],scratch[1],scratch[4]);
+        C_ADD( scratch[8],scratch[2],scratch[3]);
+        C_SUB( scratch[9],scratch[2],scratch[3]);
+
+        Fout0->r += scratch[7].r + scratch[8].r;
+        Fout0->i += scratch[7].i + scratch[8].i;
+
+        scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r);
+        scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r);
+
+        scratch[6].r =  S_MUL(scratch[10].i,ya.i) + S_MUL(scratch[9].i,yb.i);
+        scratch[6].i = -S_MUL(scratch[10].r,ya.i) - S_MUL(scratch[9].r,yb.i);
+
+        C_SUB(*Fout1,scratch[5],scratch[6]);
+        C_ADD(*Fout4,scratch[5],scratch[6]);
+
+        scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r);
+        scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r);
+        scratch[12].r = - S_MUL(scratch[10].i,yb.i) + S_MUL(scratch[9].i,ya.i);
+        scratch[12].i = S_MUL(scratch[10].r,yb.i) - S_MUL(scratch[9].r,ya.i);
+
+        C_ADD(*Fout2,scratch[11],scratch[12]);
+        C_SUB(*Fout3,scratch[11],scratch[12]);
+
+        ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
+    }
+}
+
+/* perform the butterfly for one stage of a mixed radix FFT; used by kf_work for any radix p other than 2,3,4,5 */
+static void kf_bfly_generic(
+        kiss_fft_cpx * Fout,
+        const size_t fstride,
+        const kiss_fft_cfg st,
+        int m,
+        int p
+        )
+{
+    int u,k,q1,q;
+    kiss_fft_cpx * twiddles = st->twiddles;
+    kiss_fft_cpx t;
+    int Norig = st->nfft;
+
+    kiss_fft_cpx * scratch = (kiss_fft_cpx*)KISS_FFT_TMP_ALLOC(sizeof(kiss_fft_cpx)*p);
+    if (scratch == NULL){
+        KISS_FFT_ERROR("Memory allocation failed.");
+        return; /* Fout is left untouched for this stage */
+    }
+
+    for ( u=0; u<m; ++u ) {
+        k=u;
+        for ( q1=0 ; q1<p ; ++q1 ) { /* gather the p inputs Fout[u], Fout[u+m], ... into scratch */
+            scratch[q1] = Fout[ k  ];
+            C_FIXDIV(scratch[q1],p);
+            k += m;
+        }
+
+        k=u;
+        for ( q1=0 ; q1<p ; ++q1 ) { /* each output is scratch[0] plus the twiddled remaining inputs */
+            int twidx=0;
+            Fout[ k ] = scratch[0];
+            for (q=1;q<p;++q ) {
+                twidx += fstride * k;
+                if (twidx>=Norig) twidx-=Norig; /* wrap the twiddle index back into the table */
+                C_MUL(t,scratch[q] , twiddles[twidx] );
+                C_ADDTO( Fout[ k ] ,t);
+            }
+            k += m;
+        }
+    }
+    KISS_FFT_TMP_FREE(scratch);
+}
+
+static
+void kf_work( /* recursive worker: computes the p*m point DFT described by factors[0..1] into Fout */
+        kiss_fft_cpx * Fout,
+        const kiss_fft_cpx * f,
+        const size_t fstride,
+        int in_stride,
+        int * factors,
+        const kiss_fft_cfg st
+        )
+{
+    kiss_fft_cpx * Fout_beg=Fout;
+    const int p=*factors++; /* the radix  */
+    const int m=*factors++; /* stage's fft length/p */
+    const kiss_fft_cpx * Fout_end = Fout + p*m;
+
+#ifdef _OPENMP
+    // use openmp extensions at the
+    // top-level (not recursive)
+    if (fstride==1 && p<=5 && m!=1)
+    {
+        int k;
+
+        // execute the p different work units in different threads
+#       pragma omp parallel for
+        for (k=0;k<p;++k)
+            kf_work( Fout +k*m, f+ fstride*in_stride*k,fstride*p,in_stride,factors,st);
+        // all threads have joined by this point
+
+        switch (p) {
+            case 2: kf_bfly2(Fout,fstride,st,m); break;
+            case 3: kf_bfly3(Fout,fstride,st,m); break;
+            case 4: kf_bfly4(Fout,fstride,st,m); break;
+            case 5: kf_bfly5(Fout,fstride,st,m); break;
+            default: kf_bfly_generic(Fout,fstride,st,m,p); break;
+        }
+        return;
+    }
+#endif
+
+    if (m==1) { /* last stage: copy p input samples, spaced fstride*in_stride apart, into Fout */
+        do{
+            *Fout = *f;
+            f += fstride*in_stride;
+        }while(++Fout != Fout_end );
+    }else{
+        do{
+            // recursive call:
+            // DFT of size m*p performed by doing
+            // p instances of smaller DFTs of size m,
+            // each one takes a decimated version of the input
+            kf_work( Fout , f, fstride*p, in_stride, factors,st);
+            f += fstride*in_stride;
+        }while( (Fout += m) != Fout_end );
+    }
+
+    Fout=Fout_beg; /* rewind: the loops above advanced Fout to Fout_end */
+
+    // recombine the p smaller DFTs
+    switch (p) {
+        case 2: kf_bfly2(Fout,fstride,st,m); break;
+        case 3: kf_bfly3(Fout,fstride,st,m); break;
+        case 4: kf_bfly4(Fout,fstride,st,m); break;
+        case 5: kf_bfly5(Fout,fstride,st,m); break;
+        default: kf_bfly_generic(Fout,fstride,st,m,p); break;
+    }
+}
+
+/*  facbuf is populated by p1,m1,p2,m2, ...
+    where
+    p[i] * m[i] = m[i-1]
+    m0 = n                  */
+static
+void kf_factor(int n,int * facbuf) /* writes two ints per factor; no bound check against the size of facbuf is done here */
+{
+    int p=4;
+    double floor_sqrt;
+    floor_sqrt = floor( sqrt((double)n) ); /* computed once from the original n */
+
+    /*factor out powers of 4, powers of 2, then any remaining primes */
+    do {
+        while (n % p) { /* p does not divide n: move on to the next candidate */
+            switch (p) {
+                case 4: p = 2; break;
+                case 2: p = 3; break;
+                default: p += 2; break; /* 3, 5, 7, ... odd candidates */
+            }
+            if (p > floor_sqrt)
+                p = n;          /* no more factors, skip to end */
+        }
+        n /= p;
+        *facbuf++ = p; /* the radix */
+        *facbuf++ = n; /* the length that remains after dividing out p */
+    } while (n > 1);
+}
+
+/*
+ *
+ * User-callable function to allocate all necessary storage space for the fft.
+ *
+ * The return value is a contiguous block of memory, allocated with malloc.  As such,
+ * It can be freed with free(), rather than a kiss_fft-specific function.
+ * */
+kiss_fft_cfg kiss_fft_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem )
+{
+    KISS_FFT_ALIGN_CHECK(mem)
+
+    kiss_fft_cfg st=NULL;
+    size_t memneeded = KISS_FFT_ALIGN_SIZE_UP(sizeof(struct kiss_fft_state)
+        + sizeof(kiss_fft_cpx)*(nfft-1)); /* twiddle factors*/
+
+    if ( lenmem==NULL ) { /* no size pointer: allocate the cfg here */
+        st = ( kiss_fft_cfg)KISS_FFT_MALLOC( memneeded );
+    }else{
+        if (mem != NULL && *lenmem >= memneeded) /* use the caller's buffer only if it is big enough */
+            st = (kiss_fft_cfg)mem;
+        *lenmem = memneeded; /* always report the required size */
+    }
+    if (st) {
+        int i;
+        st->nfft=nfft;
+        st->inverse = inverse_fft;
+
+        for (i=0;i<nfft;++i) { /* twiddles[i] holds cos/sin of phase via kf_cexp */
+            const double pi=3.141592653589793238462643383279502884197169399375105820974944;
+            double phase = -2*pi*i / nfft;
+            if (st->inverse)
+                phase *= -1; /* the inverse transform uses the opposite phase sign */
+            kf_cexp(st->twiddles+i, phase );
+        }
+
+        kf_factor(nfft,st->factors);
+    }
+    return st; /* NULL when allocation failed or the supplied buffer was missing or too small */
+}
+
+
+void kiss_fft_stride(kiss_fft_cfg st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout,int in_stride) /* transform reading every in_stride-th input sample */
+{
+    if (fin == fout) {
+        //NOTE: this is not really an in-place FFT algorithm.
+        //It just performs an out-of-place FFT into a temp buffer
+        if (fout == NULL){
+            KISS_FFT_ERROR("fout buffer NULL.");
+        return;
+        }
+
+        kiss_fft_cpx * tmpbuf = (kiss_fft_cpx*)KISS_FFT_TMP_ALLOC( sizeof(kiss_fft_cpx)*st->nfft);
+        if (tmpbuf == NULL){
+            KISS_FFT_ERROR("Memory allocation error.");
+        return;
+        }
+
+
+
+        kf_work(tmpbuf,fin,1,in_stride, st->factors,st);
+        memcpy(fout,tmpbuf,sizeof(kiss_fft_cpx)*st->nfft); /* copy the result back over the shared buffer */
+        KISS_FFT_TMP_FREE(tmpbuf);
+    }else{
+        kf_work( fout, fin, 1,in_stride, st->factors,st ); /* distinct buffers: write straight into fout */
+    }
+}
+
+void kiss_fft(kiss_fft_cfg cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout) /* transform of contiguous input */
+{
+    kiss_fft_stride(cfg,fin,fout,1); /* input stride of 1 */
+}
+
+
+void kiss_fft_cleanup(void) /* intentionally empty */
+{
+    // nothing needed any more
+}
+
+int kiss_fft_next_fast_size(int n) /* smallest k>=n whose only prime factors are 2,3,5; n<=0 is returned unchanged */
+{
+    while (n > 0) { /* guard: for n==0 the divide-out loops below would spin forever on m==0 */
+        int m=n;
+        while ( (m%2) == 0 ) m/=2;
+        while ( (m%3) == 0 ) m/=3;
+        while ( (m%5) == 0 ) m/=5;
+        if (m<=1)
+            break; /* n is completely factorable by twos, threes, and fives */
+        n++;
+    }
+    return n;
+}
diff --git a/kiss/kiss_fft.h b/kiss/kiss_fft.h
new file mode 100644
index 0000000..dce1034
--- /dev/null
+++ b/kiss/kiss_fft.h
@@ -0,0 +1,160 @@
+/*
+ *  Copyright (c) 2003-2010, Mark Borgerding. All rights reserved.
+ *  This file is part of KISS FFT - https://github.com/mborgerding/kissfft
+ *
+ *  SPDX-License-Identifier: BSD-3-Clause
+ *  See COPYING file for more information.
+ */
+
+#ifndef KISS_FFT_H
+#define KISS_FFT_H
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <string.h>
+
+// Define KISS_FFT_SHARED macro to properly export symbols
+#ifdef KISS_FFT_SHARED
+# ifdef _WIN32
+#  ifdef KISS_FFT_BUILD
+#   define KISS_FFT_API __declspec(dllexport)
+#  else
+#   define KISS_FFT_API __declspec(dllimport)
+#  endif
+# else
+#  define KISS_FFT_API __attribute__ ((visibility ("default")))
+# endif
+#else
+# define KISS_FFT_API
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ ATTENTION!
+ If you would like:
+ -- a utility that will handle the caching of fft objects
+ -- real-only (no imaginary time component ) FFT
+ -- a multi-dimensional FFT
+ -- a command-line utility to perform ffts
+ -- a command-line utility to perform fast-convolution filtering
+
+ Then see kfc.h kiss_fftr.h kiss_fftnd.h fftutil.c kiss_fastfir.c
+  in the tools/ directory.
+*/
+
+/* User may override KISS_FFT_MALLOC and/or KISS_FFT_FREE. */
+#ifdef USE_SIMD
+# include <xmmintrin.h>
+# define kiss_fft_scalar __m128
+# ifndef KISS_FFT_MALLOC /* NOTE(review): the two ALIGN macros below are only defined when KISS_FFT_MALLOC is not overridden */
+#  define KISS_FFT_MALLOC(nbytes) _mm_malloc(nbytes,16)
+#  define KISS_FFT_ALIGN_CHECK(ptr) /* expands to nothing */
+#  define KISS_FFT_ALIGN_SIZE_UP(size) ((size + 15UL) & ~0xFUL) /* rounds size up to a multiple of 16 */
+# endif
+# ifndef KISS_FFT_FREE
+#  define KISS_FFT_FREE _mm_free
+# endif
+#else
+# define KISS_FFT_ALIGN_CHECK(ptr)
+# define KISS_FFT_ALIGN_SIZE_UP(size) (size)
+# ifndef KISS_FFT_MALLOC
+#  define KISS_FFT_MALLOC malloc
+# endif
+# ifndef KISS_FFT_FREE
+#  define KISS_FFT_FREE free
+# endif
+#endif
+
+
+#ifdef FIXED_POINT
+#include <stdint.h>
+# if (FIXED_POINT == 32)
+#  define kiss_fft_scalar int32_t
+# else	
+#  define kiss_fft_scalar int16_t
+# endif
+#else
+# ifndef kiss_fft_scalar
+/*  default is float */
+#   define kiss_fft_scalar float
+# endif
+#endif
+
+typedef struct {
+    kiss_fft_scalar r;
+    kiss_fft_scalar i;
+}kiss_fft_cpx;
+
+typedef struct kiss_fft_state* kiss_fft_cfg;
+
+/* 
+ *  kiss_fft_alloc
+ *  
+ *  Initialize a FFT (or IFFT) algorithm's cfg/state buffer.
+ *
+ *  typical usage:      kiss_fft_cfg mycfg=kiss_fft_alloc(1024,0,NULL,NULL);
+ *
+ *  The return value from kiss_fft_alloc is a cfg buffer used internally
+ *  by the fft routine or NULL.
+ *
+ *  If lenmem is NULL, then kiss_fft_alloc will allocate a cfg buffer using malloc.
+ *  The returned value should be free()d when done to avoid memory leaks.
+ *  
+ *  The state can be placed in a user supplied buffer 'mem':
+ *  If lenmem is not NULL and mem is not NULL and *lenmem is large enough,
+ *      then the function places the cfg in mem and the size used in *lenmem
+ *      and returns mem.
+ *  
+ *  If lenmem is not NULL and ( mem is NULL or *lenmem is not large enough),
+ *      then the function returns NULL and places the minimum cfg 
+ *      buffer size in *lenmem.
+ * */
+
+kiss_fft_cfg KISS_FFT_API kiss_fft_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem);
+
+/*
+ * kiss_fft(cfg,fin,fout)
+ *
+ * Perform an FFT on a complex input buffer.
+ * for a forward FFT,
+ * fin should be  f[0] , f[1] , ... ,f[nfft-1]
+ * fout will be   F[0] , F[1] , ... ,F[nfft-1]
+ * Note that each element is complex and can be accessed like
+    f[k].r and f[k].i
+ * */
+void KISS_FFT_API kiss_fft(kiss_fft_cfg cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout);
+
+/*
+ A more generic version of the above function. It reads its input from every Nth sample.
+ * */
+void KISS_FFT_API kiss_fft_stride(kiss_fft_cfg cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout,int fin_stride);
+
+/* If kiss_fft_alloc allocated a buffer, it is one contiguous 
+   buffer and can be simply free()d when no longer needed*/
+#define kiss_fft_free KISS_FFT_FREE
+
+/*
+ Cleans up some memory that gets managed internally. Not necessary to call, but it might clean up 
+ your compiler output to call this before you exit.
+*/
+void KISS_FFT_API kiss_fft_cleanup(void);
+	
+
+/*
+ * Returns the smallest integer k, such that k>=n and k has only "fast" factors (2,3,5)
+ */
+int KISS_FFT_API kiss_fft_next_fast_size(int n);
+
+/* for real ffts, we need an even size */
+#define kiss_fftr_next_fast_size_real(n) \
+        (kiss_fft_next_fast_size( ((n)+1)>>1)<<1)
+
+#ifdef __cplusplus
+} 
+#endif
+
+#endif
diff --git a/kiss/kiss_fft_log.h b/kiss/kiss_fft_log.h
new file mode 100644
index 0000000..b5b631a
--- /dev/null
+++ b/kiss/kiss_fft_log.h
@@ -0,0 +1,36 @@
+/*
+ *  Copyright (c) 2003-2010, Mark Borgerding. All rights reserved.
+ *  This file is part of KISS FFT - https://github.com/mborgerding/kissfft
+ *
+ *  SPDX-License-Identifier: BSD-3-Clause
+ *  See COPYING file for more information.
+ */
+
+#ifndef kiss_fft_log_h
+#define kiss_fft_log_h
+
+#define ERROR 1
+#define WARNING 2
+#define INFO 3
+#define DEBUG 4
+
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+/* KISS_FFT_LOG_MSG prints "[severity] file:line message" and a newline to stderr; no-op under NDEBUG */
+#if defined(NDEBUG)
+# define KISS_FFT_LOG_MSG(severity, ...) ((void)0)
+#else
+# define KISS_FFT_LOG_MSG(severity, ...) do { /* one statement, so it is safe under an unbraced if/else */ \
+    fprintf(stderr, "[" #severity "] " __FILE__ ":" TOSTRING(__LINE__) " "); \
+    fprintf(stderr, __VA_ARGS__); \
+    fprintf(stderr, "\n"); } while (0)
+#endif
+
+#define KISS_FFT_ERROR(...) KISS_FFT_LOG_MSG(ERROR, __VA_ARGS__)
+#define KISS_FFT_WARNING(...) KISS_FFT_LOG_MSG(WARNING, __VA_ARGS__)
+#define KISS_FFT_INFO(...) KISS_FFT_LOG_MSG(INFO, __VA_ARGS__)
+#define KISS_FFT_DEBUG(...) KISS_FFT_LOG_MSG(DEBUG, __VA_ARGS__)
+
+
+
+#endif /* kiss_fft_log_h */
+\ No newline at end of file
diff --git a/kiss/kiss_fftnd.c b/kiss/kiss_fftnd.c
new file mode 100644
index 0000000..5d5b089
--- /dev/null
+++ b/kiss/kiss_fftnd.c
@@ -0,0 +1,188 @@
+/*
+ *  Copyright (c) 2003-2004, Mark Borgerding. All rights reserved.
+ *  This file is part of KISS FFT - https://github.com/mborgerding/kissfft
+ *
+ *  SPDX-License-Identifier: BSD-3-Clause
+ *  See COPYING file for more information.
+ */
+
+#include "kiss_fftnd.h"
+#include "_kiss_fft_guts.h"
+
+struct kiss_fftnd_state{
+    int dimprod; /* product of all dimensions, i.e. the total element count */
+    int ndims; /* number of dimensions */
+    int *dims; /* copy of the caller's dimension list */
+    kiss_fft_cfg *states; /* cfg states for each dimension */
+    kiss_fft_cpx * tmpbuf; /* buffer capable of holding the entire input */
+};
+
+kiss_fftnd_cfg kiss_fftnd_alloc(const int *dims,int ndims,int inverse_fft,void*mem,size_t*lenmem) /* builds an N-d cfg in one contiguous block; same mem/lenmem contract as kiss_fft_alloc */
+{
+    KISS_FFT_ALIGN_CHECK(mem)
+
+    kiss_fftnd_cfg st = NULL;
+    int i;
+    int dimprod=1;
+    size_t memneeded = KISS_FFT_ALIGN_SIZE_UP(sizeof(struct kiss_fftnd_state));
+    char * ptr = NULL;
+
+    for (i=0;i<ndims;++i) { /* sizing pass: ask kiss_fft_alloc how much each 1-d cfg needs */
+        size_t sublen=0;
+        kiss_fft_alloc (dims[i], inverse_fft, NULL, &sublen);
+        memneeded += sublen;   /* st->states[i] */
+        dimprod *= dims[i];
+    }
+    memneeded += KISS_FFT_ALIGN_SIZE_UP(sizeof(int) * ndims);/*  st->dims */
+    memneeded += KISS_FFT_ALIGN_SIZE_UP(sizeof(void*) * ndims);/* st->states  */
+    memneeded += KISS_FFT_ALIGN_SIZE_UP(sizeof(kiss_fft_cpx) * dimprod); /* st->tmpbuf */
+
+    if (lenmem == NULL) {/* allocate for the caller*/
+        ptr = (char *) KISS_FFT_MALLOC(memneeded); /* same allocator macro as kiss_fft_alloc, so alignment and overrides apply here too */
+    } else { /* initialize supplied buffer if big enough */
+        if (*lenmem >= memneeded)
+            ptr = (char *) mem;
+        *lenmem = memneeded; /*tell caller how big struct is (or would be) */
+    }
+    if (!ptr)
+        return NULL; /*malloc failed or buffer too small */
+
+    st = (kiss_fftnd_cfg) ptr;
+    st->dimprod = dimprod;
+    st->ndims = ndims;
+    ptr += KISS_FFT_ALIGN_SIZE_UP(sizeof(struct kiss_fftnd_state));
+
+    st->states = (kiss_fft_cfg *)ptr;
+    ptr += KISS_FFT_ALIGN_SIZE_UP(sizeof(void*) * ndims);
+
+    st->dims = (int*)ptr;
+    ptr += KISS_FFT_ALIGN_SIZE_UP(sizeof(int) * ndims);
+
+    st->tmpbuf = (kiss_fft_cpx*)ptr;
+    ptr += KISS_FFT_ALIGN_SIZE_UP(sizeof(kiss_fft_cpx) * dimprod);
+
+    for (i=0;i<ndims;++i) { /* carve each 1-d cfg out of the remaining space */
+        size_t len;
+        st->dims[i] = dims[i];
+        kiss_fft_alloc (st->dims[i], inverse_fft, NULL, &len);
+        st->states[i] = kiss_fft_alloc (st->dims[i], inverse_fft, ptr,&len);
+        ptr += len;
+    }
+    /*
+Hi there!
+
+If you're looking at this particular code, it probably means you've got a brain-dead bounds checker 
+that thinks the above code overwrites the end of the array.
+
+It doesn't.
+
+-- Mark 
+
+P.S.
+The below code might give you some warm fuzzies and help convince you.
+       */
+    if ( (size_t)(ptr - (char*)st) != memneeded ) { /* compared as size_t so large sizes are not truncated */
+        fprintf(stderr,
+                "################################################################################\n"
+                "Internal error! Memory allocation miscalculation\n"
+                "################################################################################\n"
+               );
+    }
+    return st;
+}
+
+/*
+ This works by tackling one dimension at a time.
+
+ In effect,
+ Each stage starts out by reshaping the matrix into a DixSi 2d matrix.
+ A Di-sized fft is taken of each column, transposing the matrix as it goes.
+
+Here's a 3-d example:
+Take a 2x3x4 matrix, laid out in memory as a contiguous buffer
+ [ [ [ a b c d ] [ e f g h ] [ i j k l ] ]
+   [ [ m n o p ] [ q r s t ] [ u v w x ] ] ]
+
+Stage 0 ( D=2): treat the buffer as a 2x12 matrix
+   [ [a b ... k l]
+     [m n ... w x] ]
+
+   FFT each column with size 2.
+   Transpose the matrix at the same time using kiss_fft_stride.
+
+   [ [ a+m a-m ]
+     [ b+n b-n]
+     ...
+     [ k+w k-w ]
+     [ l+x l-x ] ]
+
+   Note fft([x y]) == [x+y x-y]
+
+Stage 1 ( D=3) treats the buffer (the output of stage D=2) as a 3x8 matrix,
+   [ [ a+m a-m b+n b-n c+o c-o d+p d-p ] 
+     [ e+q e-q f+r f-r g+s g-s h+t h-t ]
+     [ i+u i-u j+v j-v k+w k-w l+x l-x ] ]
+
+   And perform FFTs (size=3) on each of the columns as above, transposing 
+   the matrix as it goes.  The output of stage 1 is 
+       (Legend: ap = [ a+m e+q i+u ]
+                am = [ a-m e-q i-u ] )
+   
+   [ [ sum(ap) fft(ap)[0] fft(ap)[1] ]
+     [ sum(am) fft(am)[0] fft(am)[1] ]
+     [ sum(bp) fft(bp)[0] fft(bp)[1] ]
+     [ sum(bm) fft(bm)[0] fft(bm)[1] ]
+     [ sum(cp) fft(cp)[0] fft(cp)[1] ]
+     [ sum(cm) fft(cm)[0] fft(cm)[1] ]
+     [ sum(dp) fft(dp)[0] fft(dp)[1] ]
+     [ sum(dm) fft(dm)[0] fft(dm)[1] ]  ]
+
+Stage 2 ( D=4) treats this buffer as a 4*6 matrix,
+   [ [ sum(ap) fft(ap)[0] fft(ap)[1] sum(am) fft(am)[0] fft(am)[1] ]
+     [ sum(bp) fft(bp)[0] fft(bp)[1] sum(bm) fft(bm)[0] fft(bm)[1] ]
+     [ sum(cp) fft(cp)[0] fft(cp)[1] sum(cm) fft(cm)[0] fft(cm)[1] ]
+     [ sum(dp) fft(dp)[0] fft(dp)[1] sum(dm) fft(dm)[0] fft(dm)[1] ]  ]
+
+   Then FFTs each column, transposing as it goes.
+
+   The resulting matrix is the 3d FFT of the 2x3x4 input matrix.
+
+   Note as a sanity check that the first element of the final 
+   stage's output (DC term) is 
+   sum( [ sum(ap) sum(bp) sum(cp) sum(dp) ] )
+   , i.e. the summation of all 24 input elements. 
+
+*/
+void kiss_fftnd(kiss_fftnd_cfg st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout) /* N-d FFT, one dimension per pass, alternating between fout and st->tmpbuf */
+{
+    int i,k;
+    const kiss_fft_cpx * bufin=fin;
+    kiss_fft_cpx * bufout;
+
+    /*arrange it so the last bufout == fout*/
+    if ( st->ndims & 1 ) { /* odd number of passes: the first pass must write to fout */
+        bufout = fout;
+        if (fin==fout) { /* in-place request: read from a copy so the first pass does not overwrite its input */
+            memcpy( st->tmpbuf, fin, sizeof(kiss_fft_cpx) * st->dimprod );
+            bufin = st->tmpbuf;
+        }
+    }else
+        bufout = st->tmpbuf;
+
+    for ( k=0; k < st->ndims; ++k) {
+        int curdim = st->dims[k];
+        int stride = st->dimprod / curdim;
+
+        for ( i=0 ; i<stride ; ++i ) /* one strided 1-d FFT per column, written out contiguously */
+            kiss_fft_stride( st->states[k], bufin+i , bufout+i*curdim, stride );
+
+        /*toggle back and forth between the two buffers*/
+        if (bufout == st->tmpbuf){
+            bufout = fout;
+            bufin = st->tmpbuf;
+        }else{
+            bufout = st->tmpbuf;
+            bufin = fout;
+        }
+    }
+}
diff --git a/kiss/kiss_fftnd.h b/kiss/kiss_fftnd.h
new file mode 100644
index 0000000..956ba94
--- /dev/null
+++ b/kiss/kiss_fftnd.h
@@ -0,0 +1,26 @@
+/*
+ *  Copyright (c) 2003-2004, Mark Borgerding. All rights reserved.
+ *  This file is part of KISS FFT - https://github.com/mborgerding/kissfft
+ *
+ *  SPDX-License-Identifier: BSD-3-Clause
+ *  See COPYING file for more information.
+ */
+
+#ifndef KISS_FFTND_H
+#define KISS_FFTND_H
+
+#include "kiss_fft.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct kiss_fftnd_state * kiss_fftnd_cfg;
+    
+kiss_fftnd_cfg KISS_FFT_API kiss_fftnd_alloc(const int *dims,int ndims,int inverse_fft,void*mem,size_t*lenmem);
+void KISS_FFT_API kiss_fftnd(kiss_fftnd_cfg  cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/kiss/kiss_fftndr.c b/kiss/kiss_fftndr.c
new file mode 100644
index 0000000..e979d03
--- /dev/null
+++ b/kiss/kiss_fftndr.c
@@ -0,0 +1,120 @@
+/*
+ *  Copyright (c) 2003-2004, Mark Borgerding. All rights reserved.
+ *  This file is part of KISS FFT - https://github.com/mborgerding/kissfft
+ *
+ *  SPDX-License-Identifier: BSD-3-Clause
+ *  See COPYING file for more information.
+ */
+
+#include "kiss_fftndr.h"
+#include "_kiss_fft_guts.h"
+#define MAX(x,y) ( ( (x)<(y) )?(y):(x) )
+
+struct kiss_fftndr_state /* state for the N-d real FFT; filled in by kiss_fftndr_alloc */
+{
+    int dimReal; /* dims[ndims-1]: length handed to the real FFT */
+    int dimOther; /* product of dims[0] .. dims[ndims-2] */
+    kiss_fftr_cfg cfg_r; /* real FFT config for length dimReal */
+    kiss_fftnd_cfg cfg_nd; /* complex N-d FFT config over the first ndims-1 dimensions */
+    void * tmpbuf; /* scratch area located after both sub-configs in the same allocation */
+};
+
+static int prod(const int *dims, int ndims) /* product of the first ndims entries of dims; 1 when ndims is 0 */
+{
+    int x=1;
+    while (ndims--) 
+        x *= *dims++; /* plain int multiply, no overflow check */
+    return x;
+}
+
+kiss_fftndr_cfg kiss_fftndr_alloc(const int *dims,int ndims,int inverse_fft,void*mem,size_t*lenmem) /* builds the N-d real FFT state in one allocation: malloc when lenmem is NULL, otherwise the caller's mem if *lenmem is large enough */
+{
+    KISS_FFT_ALIGN_CHECK(mem)
+
+    kiss_fftndr_cfg st = NULL;
+    size_t nr=0 , nd=0,ntmp=0;
+    int dimReal = dims[ndims-1]; /* the last dimension is the one transformed by the real FFT */
+    int dimOther = prod(dims,ndims-1); /* product of all remaining dimensions */
+    size_t memneeded;
+    char * ptr = NULL;
+
+    (void)kiss_fftr_alloc(dimReal,inverse_fft,NULL,&nr); /* sizing call: result discarded, nr receives the size; NOTE(review): for odd dimReal kiss_fftr_alloc returns early and nr stays 0 */
+    (void)kiss_fftnd_alloc(dims,ndims-1,inverse_fft,NULL,&nd); /* result discarded; presumably fills nd with the required size the same way */
+    ntmp =
+        MAX( 2*dimOther , dimReal+2) * sizeof(kiss_fft_scalar)  // freq buffer for one pass
+        + dimOther*(dimReal+2) * sizeof(kiss_fft_scalar);  // large enough to hold entire input in case of in-place
+
+    memneeded = KISS_FFT_ALIGN_SIZE_UP(sizeof( struct kiss_fftndr_state )) + KISS_FFT_ALIGN_SIZE_UP(nr) + KISS_FFT_ALIGN_SIZE_UP(nd) + KISS_FFT_ALIGN_SIZE_UP(ntmp);
+
+    if (lenmem==NULL) {
+        ptr = (char*) malloc(memneeded);
+    }else{
+        if (*lenmem >= memneeded) /* caller buffer is large enough to be used */
+            ptr = (char *)mem;
+        *lenmem = memneeded; /* the required size is reported back in every case */
+    }
+    if (ptr==NULL) /* malloc failed, or the caller buffer was too small (or this was a sizing call) */
+        return NULL;
+    
+    st = (kiss_fftndr_cfg) ptr;
+    memset( st , 0 , memneeded);
+    ptr += KISS_FFT_ALIGN_SIZE_UP(sizeof(struct kiss_fftndr_state)); /* the sub-configs and scratch area follow the state struct */
+    
+    st->dimReal = dimReal;
+    st->dimOther = dimOther;
+    st->cfg_r = kiss_fftr_alloc( dimReal,inverse_fft,ptr,&nr); /* NOTE(review): result is not checked; kiss_fftr_alloc returns NULL for odd dimReal */
+    ptr += KISS_FFT_ALIGN_SIZE_UP(nr);
+    st->cfg_nd = kiss_fftnd_alloc(dims,ndims-1,inverse_fft, ptr,&nd); /* NOTE(review): result is not checked either */
+    ptr += KISS_FFT_ALIGN_SIZE_UP(nd);
+    st->tmpbuf = ptr; /* whatever remains of the allocation is the ntmp-byte scratch area */
+
+    return st;
+}
+
+void kiss_fftndr(kiss_fftndr_cfg st,const kiss_fft_scalar *timedata,kiss_fft_cpx *freqdata) /* forward transform: kiss_fftr over each run of dimReal samples, then kiss_fftnd across the other dimensions */
+{
+    int k1,k2;
+    int dimReal = st->dimReal;
+    int dimOther = st->dimOther;
+    int nrbins = dimReal/2+1; /* complex bins kept per real FFT */
+
+    kiss_fft_cpx * tmp1 = (kiss_fft_cpx*)st->tmpbuf; 
+    kiss_fft_cpx * tmp2 = tmp1 + MAX(nrbins,dimOther); /* tmp1 holds either nrbins or dimOther points, so tmp2 starts past the larger of the two */
+
+    // timedata is N0 x N1 x ... x Nk real
+
+    // take a real chunk of data, fft it and place the output at correct intervals
+    for (k1=0;k1<dimOther;++k1) {
+        kiss_fftr( st->cfg_r, timedata + k1*dimReal , tmp1 ); // tmp1 now holds nrbins complex points
+        for (k2=0;k2<nrbins;++k2)
+           tmp2[ k2*dimOther+k1 ] = tmp1[k2]; /* stored transposed: the dimOther values of bin k2 end up contiguous */
+    }
+
+    for (k2=0;k2<nrbins;++k2) {
+        kiss_fftnd(st->cfg_nd, tmp2+k2*dimOther, tmp1);  // tmp1 now holds dimOther complex points
+        for (k1=0;k1<dimOther;++k1) 
+            freqdata[ k1*(nrbins) + k2] = tmp1[k1]; /* transposed back: freqdata is dimOther rows of nrbins bins */
+    }
+}
+
+void kiss_fftndri(kiss_fftndr_cfg st,const kiss_fft_cpx *freqdata,kiss_fft_scalar *timedata) /* inverse transform: kiss_fftnd across the other dimensions first, then kiss_fftri per run of dimReal samples */
+{
+    int k1,k2;
+    int dimReal = st->dimReal;
+    int dimOther = st->dimOther;
+    int nrbins = dimReal/2+1; /* complex bins per row of freqdata */
+    kiss_fft_cpx * tmp1 = (kiss_fft_cpx*)st->tmpbuf; 
+    kiss_fft_cpx * tmp2 = tmp1 + MAX(nrbins,dimOther); /* tmp2 starts past the largest amount ever placed in tmp1 */
+
+    for (k2=0;k2<nrbins;++k2) {
+        for (k1=0;k1<dimOther;++k1) 
+            tmp1[k1] = freqdata[ k1*(nrbins) + k2 ]; /* gather bin k2 from every row */
+        kiss_fftnd(st->cfg_nd, tmp1, tmp2+k2*dimOther); /* result for bin k2 occupies its own dimOther-point run of tmp2 */
+    }
+
+    for (k1=0;k1<dimOther;++k1) {
+        for (k2=0;k2<nrbins;++k2)
+            tmp1[k2] = tmp2[ k2*dimOther+k1 ]; /* gather the nrbins bins that belong to row k1 */
+        kiss_fftri( st->cfg_r,tmp1,timedata + k1*dimReal);
+    }
+}
diff --git a/kiss/kiss_fftndr.h b/kiss/kiss_fftndr.h
new file mode 100644
index 0000000..0d56a1f
--- /dev/null
+++ b/kiss/kiss_fftndr.h
@@ -0,0 +1,55 @@
+/*
+ *  Copyright (c) 2003-2004, Mark Borgerding. All rights reserved.
+ *  This file is part of KISS FFT - https://github.com/mborgerding/kissfft
+ *
+ *  SPDX-License-Identifier: BSD-3-Clause
+ *  See COPYING file for more information.
+ */
+
+#ifndef KISS_NDR_H
+#define KISS_NDR_H
+
+#include "kiss_fft.h"
+#include "kiss_fftr.h"
+#include "kiss_fftnd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+    
+typedef struct kiss_fftndr_state *kiss_fftndr_cfg;
+
+
+kiss_fftndr_cfg KISS_FFT_API kiss_fftndr_alloc(const int *dims,int ndims,int inverse_fft,void*mem,size_t*lenmem);
+/*
+ dims[ndims-1] (the last dimension, the one given to the real FFT) must be even
+
+ If you don't care to allocate space, use mem = lenmem = NULL 
+*/
+
+
+void KISS_FFT_API kiss_fftndr(
+        kiss_fftndr_cfg cfg,
+        const kiss_fft_scalar *timedata,
+        kiss_fft_cpx *freqdata);
+/*
+ input timedata has dims[0] X dims[1] X ... X  dims[ndims-1] scalar points
+ output freqdata has dims[0] X dims[1] X ... X  dims[ndims-1]/2+1 complex points
+*/
+
+void KISS_FFT_API kiss_fftndri(
+        kiss_fftndr_cfg cfg,
+        const kiss_fft_cpx *freqdata,
+        kiss_fft_scalar *timedata);
+/*
+ input and output dimensions are the exact opposite of kiss_fftndr
+*/
+
+
+#define kiss_fftndr_free free
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/kiss/kiss_fftr.c b/kiss/kiss_fftr.c
new file mode 100644
index 0000000..778a9a6
--- /dev/null
+++ b/kiss/kiss_fftr.c
@@ -0,0 +1,155 @@
+/*
+ *  Copyright (c) 2003-2004, Mark Borgerding. All rights reserved.
+ *  This file is part of KISS FFT - https://github.com/mborgerding/kissfft
+ *
+ *  SPDX-License-Identifier: BSD-3-Clause
+ *  See COPYING file for more information.
+ */
+
+#include "kiss_fftr.h"
+#include "_kiss_fft_guts.h"
+
+struct kiss_fftr_state{ /* state for the real FFT; laid out by kiss_fftr_alloc in a single memory block */
+    kiss_fft_cfg substate; /* complex FFT config of half the real length, placed directly after this struct */
+    kiss_fft_cpx * tmpbuf; /* scratch for the half-length complex FFT, placed after substate */
+    kiss_fft_cpx * super_twiddles; /* twiddle factors computed in kiss_fftr_alloc, placed after tmpbuf */
+#ifdef USE_SIMD
+    void * pad;
+#endif
+};
+
+kiss_fftr_cfg kiss_fftr_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem) /* builds a real-FFT config for even nfft: KISS_FFT_MALLOC when lenmem is NULL, otherwise the caller's mem if *lenmem is large enough */
+{
+	KISS_FFT_ALIGN_CHECK(mem)
+
+    int i;
+    kiss_fftr_cfg st = NULL;
+    size_t subsize = 0, memneeded;
+
+    if (nfft & 1) { /* odd lengths are rejected before *lenmem is touched */
+        KISS_FFT_ERROR("Real FFT optimization must be even.");
+        return NULL;
+    }
+    nfft >>= 1; /* from here on nfft is the length of the complex sub-FFT */
+
+    kiss_fft_alloc (nfft, inverse_fft, NULL, &subsize); /* return value unused; subsize is read on the next line */
+    memneeded = sizeof(struct kiss_fftr_state) + subsize + sizeof(kiss_fft_cpx) * ( nfft * 3 / 2); /* struct + sub-FFT state + nfft scratch points + nfft/2 twiddles */
+
+    if (lenmem == NULL) {
+        st = (kiss_fftr_cfg) KISS_FFT_MALLOC (memneeded);
+    } else {
+        if (*lenmem >= memneeded) /* caller buffer is large enough to be used */
+            st = (kiss_fftr_cfg) mem;
+        *lenmem = memneeded; /* the required size is reported back in every case */
+    }
+    if (!st) /* allocation failed, or the caller buffer was too small (or this was a sizing call) */
+        return NULL;
+
+    st->substate = (kiss_fft_cfg) (st + 1); /*just beyond kiss_fftr_state struct */
+    st->tmpbuf = (kiss_fft_cpx *) (((char *) st->substate) + subsize);
+    st->super_twiddles = st->tmpbuf + nfft; /* twiddles start after the nfft scratch points */
+    kiss_fft_alloc(nfft, inverse_fft, st->substate, &subsize); /* sets up the sub-FFT inside this block */
+
+    for (i = 0; i < nfft/2; ++i) { /* entry i is the one read as super_twiddles[k-1] with k = i+1 in kiss_fftr/kiss_fftri */
+        double phase =
+            -3.14159265358979323846264338327 * ((double) (i+1) / nfft + .5);
+        if (inverse_fft) /* the inverse config uses the negated phase */
+            phase *= -1;
+        kf_cexp (st->super_twiddles+i,phase);
+    }
+    return st;
+}
+
+void kiss_fftr(kiss_fftr_cfg st,const kiss_fft_scalar *timedata,kiss_fft_cpx *freqdata) /* forward real FFT: writes freqdata[0] .. freqdata[ncfft], ncfft being the sub-FFT length */
+{
+    /* input buffer timedata is stored row-wise */
+    int k,ncfft;
+    kiss_fft_cpx fpnk,fpk,f1k,f2k,tw,tdc;
+
+    if ( st->substate->inverse) { /* this config was built for the inverse direction */
+        KISS_FFT_ERROR("kiss fft usage error: improper alloc");
+        return;/* The caller did not call the correct function */
+    }
+
+    ncfft = st->substate->nfft; /* length of the complex sub-FFT */
+
+    /*perform the parallel fft of two real signals packed in real,imag*/
+    kiss_fft( st->substate , (const kiss_fft_cpx*)timedata, st->tmpbuf );
+    /* The real part of the DC element of the frequency spectrum in st->tmpbuf
+     * contains the sum of the even-numbered elements of the input time sequence
+     * The imag part is the sum of the odd-numbered elements
+     *
+     * The sum of tdc.r and tdc.i is the sum of the input time sequence.
+     *      yielding DC of input time sequence
+     * The difference of tdc.r - tdc.i is the sum of the input (dot product) [1,-1,1,-1...
+     *      yielding Nyquist bin of input time sequence
+     */
+
+    tdc.r = st->tmpbuf[0].r;
+    tdc.i = st->tmpbuf[0].i;
+    C_FIXDIV(tdc,2);
+    CHECK_OVERFLOW_OP(tdc.r ,+, tdc.i);
+    CHECK_OVERFLOW_OP(tdc.r ,-, tdc.i);
+    freqdata[0].r = tdc.r + tdc.i;
+    freqdata[ncfft].r = tdc.r - tdc.i;
+#ifdef USE_SIMD
+    freqdata[ncfft].i = freqdata[0].i = _mm_set1_ps(0);
+#else
+    freqdata[ncfft].i = freqdata[0].i = 0; /* both of these bins get a zero imaginary part */
+#endif
+
+    for ( k=1;k <= ncfft/2 ; ++k ) { /* each iteration fills bins k and ncfft-k */
+        fpk    = st->tmpbuf[k];
+        fpnk.r =   st->tmpbuf[ncfft-k].r; /* fpnk is the conjugate of tmpbuf[ncfft-k] */
+        fpnk.i = - st->tmpbuf[ncfft-k].i;
+        C_FIXDIV(fpk,2);
+        C_FIXDIV(fpnk,2);
+
+        C_ADD( f1k, fpk , fpnk );
+        C_SUB( f2k, fpk , fpnk );
+        C_MUL( tw , f2k , st->super_twiddles[k-1]);
+
+        freqdata[k].r = HALF_OF(f1k.r + tw.r);
+        freqdata[k].i = HALF_OF(f1k.i + tw.i);
+        freqdata[ncfft-k].r = HALF_OF(f1k.r - tw.r);
+        freqdata[ncfft-k].i = HALF_OF(tw.i - f1k.i);
+    }
+}
+
+void kiss_fftri(kiss_fftr_cfg st,const kiss_fft_cpx *freqdata,kiss_fft_scalar *timedata) /* inverse real FFT: reads freqdata[0] .. freqdata[ncfft] and writes the result into timedata */
+{
+    /* timedata is the output buffer here; freqdata is the input spectrum */
+    int k, ncfft;
+
+    if (st->substate->inverse == 0) { /* this config was built for the forward direction */
+        KISS_FFT_ERROR("kiss fft usage error: improper alloc");
+        return;/* The caller did not call the correct function */
+    }
+
+    ncfft = st->substate->nfft; /* length of the complex sub-FFT */
+
+    st->tmpbuf[0].r = freqdata[0].r + freqdata[ncfft].r; /* packed first element is rebuilt from bins 0 and ncfft; their imaginary parts are not read */
+    st->tmpbuf[0].i = freqdata[0].r - freqdata[ncfft].r;
+    C_FIXDIV(st->tmpbuf[0],2);
+
+    for (k = 1; k <= ncfft / 2; ++k) { /* each iteration fills tmpbuf[k] and tmpbuf[ncfft-k] */
+        kiss_fft_cpx fk, fnkc, fek, fok, tmp;
+        fk = freqdata[k];
+        fnkc.r = freqdata[ncfft - k].r; /* fnkc is the conjugate of freqdata[ncfft-k] */
+        fnkc.i = -freqdata[ncfft - k].i;
+        C_FIXDIV( fk , 2 );
+        C_FIXDIV( fnkc , 2 );
+
+        C_ADD (fek, fk, fnkc);
+        C_SUB (tmp, fk, fnkc);
+        C_MUL (fok, tmp, st->super_twiddles[k-1]);
+        C_ADD (st->tmpbuf[k],     fek, fok);
+        C_SUB (st->tmpbuf[ncfft - k], fek, fok);
+#ifdef USE_SIMD
+        st->tmpbuf[ncfft - k].i *= _mm_set1_ps(-1.0);
+#else
+        st->tmpbuf[ncfft - k].i *= -1; /* negate the imaginary part of the value just stored */
+#endif
+    }
+    kiss_fft (st->substate, st->tmpbuf, (kiss_fft_cpx *) timedata); /* the complex output is written directly over timedata, viewed as kiss_fft_cpx */
+}
diff --git a/kiss/kiss_fftr.h b/kiss/kiss_fftr.h
new file mode 100644
index 0000000..7fd73d2
--- /dev/null
+++ b/kiss/kiss_fftr.h
@@ -0,0 +1,54 @@
+/*
+ *  Copyright (c) 2003-2004, Mark Borgerding. All rights reserved.
+ *  This file is part of KISS FFT - https://github.com/mborgerding/kissfft
+ *
+ *  SPDX-License-Identifier: BSD-3-Clause
+ *  See COPYING file for more information.
+ */
+
+#ifndef KISS_FTR_H
+#define KISS_FTR_H
+
+#include "kiss_fft.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+    
+/* 
+ 
+ Real optimized version can save about 45% cpu time vs. complex fft of a real seq.
+
+ 
+ 
+ */
+
+typedef struct kiss_fftr_state *kiss_fftr_cfg;
+
+
+kiss_fftr_cfg KISS_FFT_API kiss_fftr_alloc(int nfft,int inverse_fft,void * mem, size_t * lenmem);
+/*
+ nfft must be even
+
+ If you don't care to allocate space, use mem = lenmem = NULL 
+*/
+
+
+void KISS_FFT_API kiss_fftr(kiss_fftr_cfg cfg,const kiss_fft_scalar *timedata,kiss_fft_cpx *freqdata);
+/*
+ input timedata has nfft scalar points
+ output freqdata has nfft/2+1 complex points
+*/
+
+void KISS_FFT_API kiss_fftri(kiss_fftr_cfg cfg,const kiss_fft_cpx *freqdata,kiss_fft_scalar *timedata);
+/*
+ input freqdata has  nfft/2+1 complex points
+ output timedata has nfft scalar points
+*/
+
+#define kiss_fftr_free KISS_FFT_FREE
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/m4/ax_cxx_compile_stdcxx.m4 b/m4/ax_cxx_compile_stdcxx.m4
index 43087b2..8edf515 100644
--- a/m4/ax_cxx_compile_stdcxx.m4
+++ b/m4/ax_cxx_compile_stdcxx.m4
@@ -10,13 +10,13 @@
 #
 #   Check for baseline language coverage in the compiler for the specified
 #   version of the C++ standard.  If necessary, add switches to CXX and
-#   CXXCPP to enable support.  VERSION may be '11' (for the C++11 standard)
-#   or '14' (for the C++14 standard).
+#   CXXCPP to enable support.  VERSION may be '11', '14', '17', or '20' for
+#   the respective C++ standard version.
 #
 #   The second argument, if specified, indicates whether you insist on an
 #   extended mode (e.g. -std=gnu++11) or a strict conformance mode (e.g.
 #   -std=c++11).  If neither is specified, you get whatever works, with
-#   preference for an extended mode.
+#   preference for no added switch, and then for an extended mode.
 #
 #   The third argument, if specified 'mandatory' or if left unspecified,
 #   indicates that baseline support for the specified C++ standard is
@@ -35,13 +35,15 @@
 #   Copyright (c) 2015 Moritz Klammler <moritz@klammler.eu>
 #   Copyright (c) 2016, 2018 Krzesimir Nowak <qdlacz@gmail.com>
 #   Copyright (c) 2019 Enji Cooper <yaneurabeya@gmail.com>
+#   Copyright (c) 2020 Jason Merrill <jason@redhat.com>
+#   Copyright (c) 2021 Jörn Heusipp <osmanx@problemloesungsmaschine.de>
 #
 #   Copying and distribution of this file, with or without modification, are
 #   permitted in any medium without royalty provided the copyright notice
 #   and this notice are preserved.  This file is offered as-is, without any
 #   warranty.
 
-#serial 11
+#serial 18
 
 dnl  This macro is based on the code from the AX_CXX_COMPILE_STDCXX_11 macro
 dnl  (serial version number 13).
@@ -50,6 +52,7 @@ AC_DEFUN([AX_CXX_COMPILE_STDCXX], [dnl
   m4_if([$1], [11], [ax_cxx_compile_alternatives="11 0x"],
         [$1], [14], [ax_cxx_compile_alternatives="14 1y"],
         [$1], [17], [ax_cxx_compile_alternatives="17 1z"],
+        [$1], [20], [ax_cxx_compile_alternatives="20"],
         [m4_fatal([invalid first argument `$1' to AX_CXX_COMPILE_STDCXX])])dnl
   m4_if([$2], [], [],
         [$2], [ext], [],
@@ -62,6 +65,16 @@ AC_DEFUN([AX_CXX_COMPILE_STDCXX], [dnl
   AC_LANG_PUSH([C++])dnl
   ac_success=no
 
+  m4_if([$2], [], [dnl
+    AC_CACHE_CHECK(whether $CXX supports C++$1 features by default,
+		   ax_cv_cxx_compile_cxx$1,
+      [AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])],
+        [ax_cv_cxx_compile_cxx$1=yes],
+        [ax_cv_cxx_compile_cxx$1=no])])
+    if test x$ax_cv_cxx_compile_cxx$1 = xyes; then
+      ac_success=yes
+    fi])
+
   m4_if([$2], [noext], [], [dnl
   if test x$ac_success = xno; then
     for alternative in ${ax_cxx_compile_alternatives}; do
@@ -91,9 +104,18 @@ AC_DEFUN([AX_CXX_COMPILE_STDCXX], [dnl
     dnl HP's aCC needs +std=c++11 according to:
     dnl http://h21007.www2.hp.com/portal/download/files/unprot/aCxx/PDF_Release_Notes/769149-001.pdf
     dnl Cray's crayCC needs "-h std=c++11"
+    dnl MSVC needs -std:c++NN for C++17 and later (default is C++14)
     for alternative in ${ax_cxx_compile_alternatives}; do
-      for switch in -std=c++${alternative} +std=c++${alternative} "-h std=c++${alternative}"; do
-        cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_$switch])
+      for switch in -std=c++${alternative} +std=c++${alternative} "-h std=c++${alternative}" MSVC; do
+        if test x"$switch" = xMSVC; then
+          dnl AS_TR_SH maps both `:` and `=` to `_` so -std:c++17 would collide
+          dnl with -std=c++17.  We suffix the cache variable name with _MSVC to
+          dnl avoid this.
+          switch=-std:c++${alternative}
+          cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_${switch}_MSVC])
+        else
+          cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_$switch])
+        fi
         AC_CACHE_CHECK(whether $CXX supports C++$1 features with $switch,
                        $cachevar,
           [ac_save_CXX="$CXX"
@@ -140,7 +162,6 @@ m4_define([_AX_CXX_COMPILE_STDCXX_testbody_11],
   _AX_CXX_COMPILE_STDCXX_testbody_new_in_11
 )
 
-
 dnl  Test body for checking C++14 support
 
 m4_define([_AX_CXX_COMPILE_STDCXX_testbody_14],
@@ -148,12 +169,24 @@ m4_define([_AX_CXX_COMPILE_STDCXX_testbody_14],
   _AX_CXX_COMPILE_STDCXX_testbody_new_in_14
 )
 
+dnl  Test body for checking C++17 support
+
 m4_define([_AX_CXX_COMPILE_STDCXX_testbody_17],
   _AX_CXX_COMPILE_STDCXX_testbody_new_in_11
   _AX_CXX_COMPILE_STDCXX_testbody_new_in_14
   _AX_CXX_COMPILE_STDCXX_testbody_new_in_17
 )
 
+dnl  Test body for checking C++20 support
+
+m4_define([_AX_CXX_COMPILE_STDCXX_testbody_20],
+  _AX_CXX_COMPILE_STDCXX_testbody_new_in_11
+  _AX_CXX_COMPILE_STDCXX_testbody_new_in_14
+  _AX_CXX_COMPILE_STDCXX_testbody_new_in_17
+  _AX_CXX_COMPILE_STDCXX_testbody_new_in_20
+)
+
+
 dnl  Tests for new features in C++11
 
 m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_11], [[
@@ -165,7 +198,11 @@ m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_11], [[
 
 #error "This is not a C++ compiler"
 
-#elif __cplusplus < 201103L
+// MSVC always sets __cplusplus to 199711L in older versions; newer versions
+// only set it correctly if /Zc:__cplusplus is specified as well as a
+// /std:c++NN switch:
+// https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus/
+#elif __cplusplus < 201103L && !defined _MSC_VER
 
 #error "This is not a C++11 compiler"
 
@@ -456,7 +493,7 @@ m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_14], [[
 
 #error "This is not a C++ compiler"
 
-#elif __cplusplus < 201402L
+#elif __cplusplus < 201402L && !defined _MSC_VER
 
 #error "This is not a C++14 compiler"
 
@@ -580,7 +617,7 @@ m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_17], [[
 
 #error "This is not a C++ compiler"
 
-#elif __cplusplus < 201703L
+#elif __cplusplus < 201703L && !defined _MSC_VER
 
 #error "This is not a C++17 compiler"
 
@@ -946,6 +983,36 @@ namespace cxx17
 
 }  // namespace cxx17
 
-#endif  // __cplusplus < 201703L
+#endif  // __cplusplus < 201703L && !defined _MSC_VER
+
+]])
+
+
+dnl  Tests for new features in C++20
+
+m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_20], [[
+
+#ifndef __cplusplus
+
+#error "This is not a C++ compiler"
+
+#elif __cplusplus < 202002L && !defined _MSC_VER
+
+#error "This is not a C++20 compiler"
+
+#else
+
+#include <version>
+
+namespace cxx20
+{
+
+// As C++20 supports feature test macros in the standard, there is no
+// immediate need to actually test for feature availability on the
+// Autoconf side.
+
+}  // namespace cxx20
+
+#endif  // __cplusplus < 202002L && !defined _MSC_VER
 
 ]])
diff --git a/src/Buffer.h b/src/Buffer.h
index af52e93..2c2a65e 100644
--- a/src/Buffer.h
+++ b/src/Buffer.h
@@ -33,9 +33,17 @@
 
 #include <vector>
 #include <memory>
+#include <complex>
+#include "fpm/fixed.hpp"
+
+typedef std::complex<float> complexf;
+
+using fixed_16 = fpm::fixed<std::int16_t, std::int32_t, 14>;
+typedef std::complex<fixed_16> complexfix;
+typedef std::complex<fpm::fixed_16_16> complexfix_wide;
 
 /* Buffer is a container for a byte array, which is memory-aligned
- * to 32 bytes for SSE performance.
+ * to 32 bytes for SIMD performance.
  *
  * The allocation/freeing of the data is handled internally.
  */
diff --git a/src/CicEqualizer.h b/src/CicEqualizer.h
index 792da02..4510d0c 100644
--- a/src/CicEqualizer.h
+++ b/src/CicEqualizer.h
@@ -25,18 +25,10 @@
 #   include <config.h>
 #endif
 
-
 #include "ModPlugin.h"
 
 #include <vector>
 #include <sys/types.h>
-#include <complex>
-#ifdef __SSE__
-#   include <xmmintrin.h>
-#endif
-
-
-typedef std::complex<float> complexf;
 
 class CicEqualizer : public ModCodec
 {
diff --git a/src/ConfigParser.cpp b/src/ConfigParser.cpp
index fb2c1a1..c92a520 100644
--- a/src/ConfigParser.cpp
+++ b/src/ConfigParser.cpp
@@ -63,6 +63,27 @@ static GainMode parse_gainmode(const std::string &gainMode_setting)
     throw std::runtime_error("Configuration error");
 }
 
+static FFTEngine parse_fft_engine(const std::string &fft_engine_setting) // case-insensitive mapping of the setting string to an FFTEngine value
+{
+    string fft_engine_minuscule(fft_engine_setting); // lower-cased working copy; the original is kept for the error message
+    std::transform(fft_engine_minuscule.begin(), fft_engine_minuscule.end(),
+            fft_engine_minuscule.begin(), ::tolower);
+
+    if (fft_engine_minuscule == "fftw") {
+        return FFTEngine::FFTW;
+    }
+    else if (fft_engine_minuscule == "kiss") {
+        return FFTEngine::KISS;
+    }
+    else if (fft_engine_minuscule == "dexter") {
+        return FFTEngine::DEXTER;
+    }
+
+    cerr << "Modulator fft_engine setting '" << fft_engine_setting <<
+        "' not recognised." << endl;
+    throw std::runtime_error("Configuration error"); // any other value is reported on stderr and rejected
+}
+
 static void parse_configfile(
         const std::string& configuration_file,
         mod_settings_t& mod_settings)
@@ -156,6 +177,9 @@ static void parse_configfile(
             mod_settings.showProcessTime);
 
     // modulator parameters:
+    const string fft_engine_setting = pt.Get("modulator.fft_engine", "fftw");
+    mod_settings.fftEngine = parse_fft_engine(fft_engine_setting);
+
     const string gainMode_setting = pt.Get("modulator.gainmode", "var");
     mod_settings.gainMode = parse_gainmode(gainMode_setting);
     mod_settings.gainmodeVariance = pt.GetReal("modulator.normalise_variance",
diff --git a/src/ConfigParser.h b/src/ConfigParser.h
index ae76dee..3bacfdd 100644
--- a/src/ConfigParser.h
+++ b/src/ConfigParser.h
@@ -36,6 +36,12 @@
 #include "TII.h"
 #include "output/SDRDevice.h"
 
+enum class FFTEngine { // FFT implementation selected for the modulator; stored in mod_settings_t::fftEngine
+    FFTW, // floating point in software
+    KISS, // fixed-point in software
+    DEXTER // fixed-point in FPGA
+};
+
 struct mod_settings_t {
     std::string startupCheck;
 
@@ -51,6 +57,8 @@ struct mod_settings_t {
     bool useLimeOutput = false;
     bool useBladeRFOutput = false;
 
+    FFTEngine fftEngine = FFTEngine::FFTW;
+
     size_t outputRate = 2048000;
     size_t clockRate = 0;
     unsigned dabMode = 1;
diff --git a/src/DabMod.cpp b/src/DabMod.cpp
index 3b072c1..7866818 100644
--- a/src/DabMod.cpp
+++ b/src/DabMod.cpp
@@ -31,10 +31,8 @@
 #endif
 
 #include <memory>
-#include <complex>
 #include <string>
 #include <iostream>
-#include <iomanip>
 #include <cstdlib>
 #include <stdexcept>
 #include <cstdio>
@@ -51,7 +49,6 @@
 #include "Utils.h"
 #include "Log.h"
 #include "DabModulator.h"
-#include "InputMemory.h"
 #include "OutputFile.h"
 #include "FormatConverter.h"
 #include "FrameMultiplexer.h"
@@ -75,16 +72,16 @@
  * samples can have peaks up to about 48000. The value of 50000
  * should guarantee that with a digital gain of 1.0, UHD never clips
  * our samples.
+ *
+ * This only applies when fixed_point == false.
  */
 static const float normalise_factor = 50000.0f;
 
-//Empirical normalisation factors used to normalise the samples to amplitude 1.
+// Empirical normalisation factors used to normalise the samples to amplitude 1.
 static const float normalise_factor_file_fix = 81000.0f;
 static const float normalise_factor_file_var = 46000.0f;
 static const float normalise_factor_file_max = 46000.0f;
 
-typedef std::complex<float> complexf;
-
 using namespace std;
 
 volatile sig_atomic_t running = 1;
@@ -255,7 +252,11 @@ static shared_ptr<ModOutput> prepare_output(mod_settings_t& s)
     shared_ptr<ModOutput> output;
 
     if (s.useFileOutput) {
-        if (s.fileOutputFormat == "complexf") {
+        if (s.fftEngine != FFTEngine::FFTW) {
+            // Intentionally ignore fileOutputFormat, it is always sc16
+            output = make_shared<OutputFile>(s.outputName, s.fileOutputShowMetadata);
+        }
+        else if (s.fileOutputFormat == "complexf") {
             output = make_shared<OutputFile>(s.outputName, s.fileOutputShowMetadata);
         }
         else if (s.fileOutputFormat == "complexf_normalised") {
@@ -291,6 +292,7 @@ static shared_ptr<ModOutput> prepare_output(mod_settings_t& s)
     else if (s.useUHDOutput) {
         s.normalise = 1.0f / normalise_factor;
         s.sdr_device_config.sampleRate = s.outputRate;
+        s.sdr_device_config.fixedPoint = (s.fftEngine != FFTEngine::FFTW);
         auto uhddevice = make_shared<Output::UHD>(s.sdr_device_config);
         output = make_shared<Output::SDR>(s.sdr_device_config, uhddevice);
         rcs.enrol((Output::SDR*)output.get());
@@ -301,6 +303,7 @@ static shared_ptr<ModOutput> prepare_output(mod_settings_t& s)
         /* We normalise the same way as for the UHD output */
         s.normalise = 1.0f / normalise_factor;
         s.sdr_device_config.sampleRate = s.outputRate;
+        if (s.fftEngine != FFTEngine::FFTW) throw runtime_error("soapy fixed_point unsupported");
         auto soapydevice = make_shared<Output::Soapy>(s.sdr_device_config);
         output = make_shared<Output::SDR>(s.sdr_device_config, soapydevice);
         rcs.enrol((Output::SDR*)output.get());
@@ -320,6 +323,7 @@ static shared_ptr<ModOutput> prepare_output(mod_settings_t& s)
     else if (s.useLimeOutput) {
         /* We normalise the same way as for the UHD output */
         s.normalise = 1.0f / normalise_factor;
+        if (s.fftEngine != FFTEngine::FFTW) throw runtime_error("limesdr fixed_point unsupported");
         s.sdr_device_config.sampleRate = s.outputRate;
         auto limedevice = make_shared<Output::Lime>(s.sdr_device_config);
         output = make_shared<Output::SDR>(s.sdr_device_config, limedevice);
@@ -330,6 +334,7 @@ static shared_ptr<ModOutput> prepare_output(mod_settings_t& s)
     else if (s.useBladeRFOutput) {
         /* We normalise specifically for the BladeRF output : range [-2048; 2047] */
         s.normalise = 2047.0f / normalise_factor;
+        if (s.fftEngine != FFTEngine::FFTW) throw runtime_error("bladerf fixed_point unsupported");
         s.sdr_device_config.sampleRate = s.outputRate;
         auto bladerfdevice = make_shared<Output::BladeRF>(s.sdr_device_config);
         output = make_shared<Output::SDR>(s.sdr_device_config, bladerfdevice);
@@ -420,7 +425,8 @@ int launch_modulator(int argc, char* argv[])
     ModulatorData m;
     rcs.enrol(&m);
 
-    {
+    // Neither KISS FFT used for fixedpoint nor the FFT Accelerator used for DEXTER need planning.
+    if (mod_settings.fftEngine == FFTEngine::FFTW) {
         // This is mostly useful on ARM systems where FFTW planning takes some time. If we do it here
         // it will be done before the modulator starts up
         etiLog.level(debug) << "Running FFTW planning...";
@@ -442,7 +448,14 @@ int launch_modulator(int argc, char* argv[])
     }
 
     std::string output_format;
-    if (mod_settings.useFileOutput and
+    if (mod_settings.fftEngine == FFTEngine::KISS) {
+        output_format = ""; //fixed point is native sc16, no converter needed
+    }
+    else if (mod_settings.fftEngine == FFTEngine::DEXTER) {
+        output_format = "s16"; // FPGA FFT Engine outputs s32
+    }
+    // else FFTW, i.e. floating point
+    else if (mod_settings.useFileOutput and
             (mod_settings.fileOutputFormat == "s8" or
              mod_settings.fileOutputFormat == "u8" or
              mod_settings.fileOutputFormat == "s16")) {
diff --git a/src/DabModulator.cpp b/src/DabModulator.cpp
index 4a29132..5f7aaf6 100644
--- a/src/DabModulator.cpp
+++ b/src/DabModulator.cpp
@@ -3,7 +3,7 @@
    Her Majesty the Queen in Right of Canada (Communications Research
    Center Canada)
 
-   Copyright (C) 2023
+   Copyright (C) 2024
    Matthias P. Braendli, matthias.braendli@mpb.li
 
     http://opendigitalradio.org
@@ -54,7 +54,6 @@
 #include "SignalMultiplexer.h"
 #include "TII.h"
 #include "TimeInterleaver.h"
-#include "TimestampDecoder.h"
 
 using namespace std;
 
@@ -142,14 +141,15 @@ int DabModulator::process(Buffer* dataOut)
         auto cifMux = make_shared<FrameMultiplexer>(m_etiSource);
         auto cifPart = make_shared<BlockPartitioner>(mode);
 
-        auto cifMap = make_shared<QpskSymbolMapper>(m_nbCarriers);
-        auto cifRef = make_shared<PhaseReference>(mode);
-        auto cifFreq = make_shared<FrequencyInterleaver>(mode);
-        auto cifDiff = make_shared<DifferentialModulator>(m_nbCarriers);
+        const bool fixedPoint = m_settings.fftEngine != FFTEngine::FFTW;
+        auto cifMap = make_shared<QpskSymbolMapper>(m_nbCarriers, fixedPoint);
+        auto cifRef = make_shared<PhaseReference>(mode, fixedPoint);
+        auto cifFreq = make_shared<FrequencyInterleaver>(mode, fixedPoint);
+        auto cifDiff = make_shared<DifferentialModulator>(m_nbCarriers, fixedPoint);
 
-        auto cifNull = make_shared<NullSymbol>(m_nbCarriers);
-        auto cifSig = make_shared<SignalMultiplexer>(
-                (1 + m_nbSymbols) * m_nbCarriers * sizeof(complexf));
+        auto cifNull = make_shared<NullSymbol>(m_nbCarriers,
+                fixedPoint ? sizeof(complexfix) : sizeof(complexf));
+        auto cifSig = make_shared<SignalMultiplexer>();
 
         // TODO this needs a review
         bool useCicEq = false;
@@ -180,46 +180,79 @@ int DabModulator::process(Buffer* dataOut)
         try {
             tii = make_shared<TII>(
                     m_settings.dabMode,
-                    m_settings.tiiConfig);
+                    m_settings.tiiConfig,
+                    fixedPoint);
             rcs.enrol(tii.get());
-            tiiRef = make_shared<PhaseReference>(mode);
+            tiiRef = make_shared<PhaseReference>(mode, fixedPoint);
         }
         catch (const TIIError& e) {
             etiLog.level(error) << "Could not initialise TII: " << e.what();
         }
 
-        auto cifOfdm = make_shared<OfdmGenerator>(
-                (1 + m_nbSymbols),
-                m_nbCarriers,
-                m_spacing,
-                m_settings.enableCfr,
-                m_settings.cfrClip,
-                m_settings.cfrErrorClip);
+        shared_ptr<ModPlugin> cifOfdm;
+
+        switch (m_settings.fftEngine) {
+            case FFTEngine::FFTW:
+                {
+                    auto ofdm = make_shared<OfdmGeneratorCF32>(
+                            (1 + m_nbSymbols),
+                            m_nbCarriers,
+                            m_spacing,
+                            m_settings.enableCfr,
+                            m_settings.cfrClip,
+                            m_settings.cfrErrorClip);
+                    rcs.enrol(ofdm.get());
+                    cifOfdm = ofdm;
+                }
+                break;
+            case FFTEngine::KISS:
+                cifOfdm = make_shared<OfdmGeneratorFixed>(
+                        (1 + m_nbSymbols),
+                        m_nbCarriers,
+                        m_spacing);
+                break;
+            case FFTEngine::DEXTER:
+#if defined(HAVE_DEXTER)
+                cifOfdm = make_shared<OfdmGeneratorDEXTER>(
+                        (1 + m_nbSymbols),
+                        m_nbCarriers,
+                        m_spacing);
+#else
+                throw std::runtime_error("Cannot use DEXTER fft engine without --enable-dexter");
+#endif
+                break;
+        }
 
-        rcs.enrol(cifOfdm.get());
+        shared_ptr<GainControl> cifGain;
 
-        auto cifGain = make_shared<GainControl>(
-                m_spacing,
-                m_settings.gainMode,
-                m_settings.digitalgain,
-                m_settings.normalise,
-                m_settings.gainmodeVariance);
+        if (not fixedPoint) {
+            cifGain = make_shared<GainControl>(
+                    m_spacing,
+                    m_settings.gainMode,
+                    m_settings.digitalgain,
+                    m_settings.normalise,
+                    m_settings.gainmodeVariance);
 
-        rcs.enrol(cifGain.get());
+            rcs.enrol(cifGain.get());
+        }
 
         auto cifGuard = make_shared<GuardIntervalInserter>(
                 m_nbSymbols, m_spacing, m_nullSize, m_symSize,
-                m_settings.ofdmWindowOverlap);
+                m_settings.ofdmWindowOverlap, m_settings.fftEngine);
         rcs.enrol(cifGuard.get());
 
         shared_ptr<FIRFilter> cifFilter;
         if (not m_settings.filterTapsFilename.empty()) {
+            if (fixedPoint) throw std::runtime_error("fixed point doesn't support fir filter");
+
             cifFilter = make_shared<FIRFilter>(m_settings.filterTapsFilename);
             rcs.enrol(cifFilter.get());
         }
 
         shared_ptr<MemlessPoly> cifPoly;
         if (not m_settings.polyCoefFilename.empty()) {
+            if (fixedPoint) throw std::runtime_error("fixed point doesn't support predistortion");
+
             cifPoly = make_shared<MemlessPoly>(m_settings.polyCoefFilename,
                                                m_settings.polyNumThreads);
             rcs.enrol(cifPoly.get());
@@ -227,15 +260,21 @@ int DabModulator::process(Buffer* dataOut)
 
         shared_ptr<Resampler> cifRes;
         if (m_settings.outputRate != 2048000) {
+            if (fixedPoint) throw std::runtime_error("fixed point doesn't support resampler");
+
             cifRes = make_shared<Resampler>(
                     2048000,
                     m_settings.outputRate,
                     m_spacing);
         }
 
-        if (not m_format.empty()) {
-            m_formatConverter = make_shared<FormatConverter>(m_format);
+        if (m_settings.fftEngine == FFTEngine::FFTW and not m_format.empty()) {
+            m_formatConverter = make_shared<FormatConverter>(false, m_format);
+        }
+        else if (m_settings.fftEngine == FFTEngine::DEXTER) {
+            m_formatConverter = make_shared<FormatConverter>(true, m_format);
         }
+        // KISS is already in s16
 
         m_output = make_shared<OutputMemory>(dataOut);
 
diff --git a/src/DabModulator.h b/src/DabModulator.h
index 093a782..82782cd 100644
--- a/src/DabModulator.h
+++ b/src/DabModulator.h
@@ -40,12 +40,8 @@
 #include "EtiReader.h"
 #include "Flowgraph.h"
 #include "FormatConverter.h"
-#include "GainControl.h"
 #include "OutputMemory.h"
 #include "RemoteControl.h"
-#include "Log.h"
-#include "TII.h"
-
 
 class DabModulator : public ModInput, public ModMetadata, public RemoteControllable
 {
diff --git a/src/DifferentialModulator.cpp b/src/DifferentialModulator.cpp
index 97a7998..21b4c3e 100644
--- a/src/DifferentialModulator.cpp
+++ b/src/DifferentialModulator.cpp
@@ -22,17 +22,14 @@
 #include "DifferentialModulator.h"
 #include "PcDebug.h"
 
-#include <stdio.h>
+#include <cstdio>
 #include <stdexcept>
-#include <complex>
-#include <string.h>
+#include <cstring>
 
-typedef std::complex<float> complexf;
-
-
-DifferentialModulator::DifferentialModulator(size_t carriers) :
+DifferentialModulator::DifferentialModulator(size_t carriers, bool fixedPoint) :
     ModMux(),
-    d_carriers(carriers)
+    m_carriers(carriers),
+    m_fixedPoint(fixedPoint)
 {
     PDEBUG("DifferentialModulator::DifferentialModulator(%zu)\n", carriers);
 
@@ -42,10 +39,42 @@ DifferentialModulator::DifferentialModulator(size_t carriers) :
 DifferentialModulator::~DifferentialModulator()
 {
     PDEBUG("DifferentialModulator::~DifferentialModulator()\n");
-
 }
 
 
+template<typename T> // T is the sample type; instantiated with complexfix and complexf by DifferentialModulator::process
+void do_process(size_t carriers, const std::vector<Buffer*>& dataIn, Buffer* dataOut)
+{
+    size_t phaseSize = dataIn[0]->getLength() / sizeof(T); // number of T samples in the phase reference (dataIn[0])
+    size_t dataSize = dataIn[1]->getLength() / sizeof(T); // number of T samples in the data symbols (dataIn[1])
+    dataOut->setLength((phaseSize + dataSize) * sizeof(T)); // output = phase reference followed by all modulated data symbols; note this resize happens before the size checks below
+
+    const T* phase = reinterpret_cast<const T*>(dataIn[0]->getData());
+    const T* in = reinterpret_cast<const T*>(dataIn[1]->getData());
+    T* out = reinterpret_cast<T*>(dataOut->getData());
+
+    if (phaseSize != carriers) { // the phase reference must be exactly one symbol of `carriers` samples
+        throw std::runtime_error(
+                "DifferentialModulator::process input phase size not valid!");
+    }
+    if (dataSize % carriers != 0) { // the data input must consist of whole symbols
+        throw std::runtime_error(
+                "DifferentialModulator::process input data size not valid!");
+    }
+
+    memcpy(dataOut->getData(), phase, phaseSize * sizeof(T)); // the first output symbol is a copy of the phase reference
+    for (size_t i = 0; i < dataSize; i += carriers) { // one pass per data symbol
+        for (size_t j = 0; j < carriers; j += 4) { // manually unrolled by 4; NOTE(review): touches samples past the symbol if carriers is not a multiple of 4 — confirm callers guarantee this
+            out[carriers + j] = out[j] * in[j]; // next output symbol = current output symbol * data symbol, carrier by carrier
+            out[carriers + j + 1] = out[j + 1] * in[j + 1];
+            out[carriers + j + 2] = out[j + 2] * in[j + 2];
+            out[carriers + j + 3] = out[j + 3] * in[j + 3];
+        }
+        in += carriers; // advance to the next data symbol
+        out += carriers; // the symbol just written becomes the reference for the next pass
+    }
+}
+
 // dataIn[0] -> phase reference
 // dataIn[1] -> data symbols
 int DifferentialModulator::process(std::vector<Buffer*> dataIn, Buffer* dataOut)
@@ -67,33 +96,11 @@ int DifferentialModulator::process(std::vector<Buffer*> dataIn, Buffer* dataOut)
                 "DifferentialModulator::process nb of input streams not 2!");
     }
 
-    size_t phaseSize = dataIn[0]->getLength() / sizeof(complexf);
-    size_t dataSize = dataIn[1]->getLength() / sizeof(complexf);
-    dataOut->setLength((phaseSize + dataSize) * sizeof(complexf));
-
-    const complexf* phase = reinterpret_cast<const complexf*>(dataIn[0]->getData());
-    const complexf* in = reinterpret_cast<const complexf*>(dataIn[1]->getData());
-    complexf* out = reinterpret_cast<complexf*>(dataOut->getData());
-
-    if (phaseSize != d_carriers) {
-        throw std::runtime_error(
-                "DifferentialModulator::process input phase size not valid!");
-    }
-    if (dataSize % d_carriers != 0) {
-        throw std::runtime_error(
-                "DifferentialModulator::process input data size not valid!");
+    if (m_fixedPoint) {
+        do_process<complexfix>(m_carriers, dataIn, dataOut);
     }
-
-    memcpy(dataOut->getData(), phase, phaseSize * sizeof(complexf));
-    for (size_t i = 0; i < dataSize; i += d_carriers) {
-        for (size_t j = 0; j < d_carriers; j += 4) {
-            out[d_carriers + j] = out[j] * in[j];
-            out[d_carriers + j + 1] = out[j + 1] * in[j + 1];
-            out[d_carriers + j + 2] = out[j + 2] * in[j + 2];
-            out[d_carriers + j + 3] = out[j + 3] * in[j + 3];
-        }
-        in += d_carriers;
-        out += d_carriers;
+    else {
+        do_process<complexf>(m_carriers, dataIn, dataOut);
     }
 
     return dataOut->getLength();
diff --git a/src/DifferentialModulator.h b/src/DifferentialModulator.h
index b26ea8b..9cc5081 100644
--- a/src/DifferentialModulator.h
+++ b/src/DifferentialModulator.h
@@ -35,7 +35,7 @@
 class DifferentialModulator : public ModMux
 {
 public:
-    DifferentialModulator(size_t carriers);
+    DifferentialModulator(size_t carriers, bool fixedPoint);
     virtual ~DifferentialModulator();
     DifferentialModulator(const DifferentialModulator&);
     DifferentialModulator& operator=(const DifferentialModulator&);
@@ -45,6 +45,7 @@ public:
     const char* name() { return "DifferentialModulator"; }
 
 protected:
-    size_t d_carriers;
+    size_t m_carriers;
+    bool m_fixedPoint; // true: process samples as complexfix, false: as complexf (set from the constructor's bool argument)
 };
 
diff --git a/src/FIRFilter.h b/src/FIRFilter.h
index a4effa1..2d8fba9 100644
--- a/src/FIRFilter.h
+++ b/src/FIRFilter.h
@@ -33,21 +33,14 @@
 
 #include "RemoteControl.h"
 #include "ModPlugin.h"
-#include "PcDebug.h"
 
 #include <sys/types.h>
-#include <complex>
-#include <thread>
 #include <vector>
-#include <time.h>
 #include <cstdio>
 #include <string>
-#include <memory>
 
 #define FIRFILTER_PIPELINE_DELAY 1
 
-typedef std::complex<float> complexf;
-
 class FIRFilter : public PipelinedModCodec, public RemoteControllable
 {
 public:
diff --git a/src/Flowgraph.cpp b/src/Flowgraph.cpp
index 3d4cdcc..339e326 100644
--- a/src/Flowgraph.cpp
+++ b/src/Flowgraph.cpp
@@ -27,12 +27,10 @@
 #include "Flowgraph.h"
 #include "PcDebug.h"
 #include "Log.h"
-#include <string>
 #include <memory>
 #include <algorithm>
 #include <sstream>
 #include <sys/types.h>
-#include <stdexcept>
 #include <assert.h>
 #include <sys/time.h>
 
@@ -254,15 +252,15 @@ Flowgraph::~Flowgraph()
         char node_time_sz[1024] = {};
 
         for (const auto &node : nodes) {
-            snprintf(node_time_sz, 1023, "  %30s: %10lu us (%2.2f %%)\n",
+            snprintf(node_time_sz, 1023, "  %30s: %10lld us (%2.2f %%)\n",
                     node->plugin()->name(),
-                    node->processTime(),
+                    (long long)node->processTime(),
                     node->processTime() * 100.0 / myProcessTime);
             ss << node_time_sz;
         }
 
-        snprintf(node_time_sz, 1023, "  %30s: %10lu us (100.00 %%)\n", "total",
-                myProcessTime);
+        snprintf(node_time_sz, 1023, "  %30s: %10lld us (100.00 %%)\n", "total",
+                (long long)myProcessTime);
         ss << node_time_sz;
 
         etiLog.level(debug) << ss.str();
diff --git a/src/FormatConverter.cpp b/src/FormatConverter.cpp
index e8e76ed..517f26e 100644
--- a/src/FormatConverter.cpp
+++ b/src/FormatConverter.cpp
@@ -28,17 +28,37 @@
 
 #include "FormatConverter.h"
 #include "PcDebug.h"
+#include "Log.h"
 
-#include <sys/types.h>
-#include <string.h>
 #include <stdexcept>
+#include <cstring>
 #include <assert.h>
+#include <sys/types.h>
+#if defined(__ARM_NEON)
+#include <arm_neon.h>
+#endif
 
-FormatConverter::FormatConverter(const std::string& format) :
+FormatConverter::FormatConverter(bool input_is_complexfix_wide, const std::string& format_out) :
     ModCodec(),
-    m_format(format)
+    m_input_complexfix_wide(input_is_complexfix_wide),
+    m_format_out(format_out)
 { }
 
+FormatConverter::~FormatConverter() // On destruction, logs the clipped-sample counter at debug level
+{
+    if ( // the condition is chosen at compile time: only log when the counter is actually maintained
+#if defined(__ARM_NEON)
+    not m_input_complexfix_wide // the NEON complexfix_wide path in process() saturates without counting clipped samples, so its counter carries no information
+#else
+    true // without NEON every conversion path in process() counts clipped samples
+#endif
+    ) {
+        etiLog.level(debug) << "FormatConverter: " <<
+            m_num_clipped_samples.load() << " clipped"; // value stored by the most recent process() call (process() stores, it does not accumulate)
+    }
+}
+
+
 /* Expect the input samples to be in the correct range for the required format */
 int FormatConverter::process(Buffer* const dataIn, Buffer* dataOut)
 {
@@ -47,71 +67,113 @@ int FormatConverter::process(Buffer* const dataIn, Buffer* dataOut)
 
     size_t num_clipped_samples = 0;
 
-    size_t sizeIn = dataIn->getLength() / sizeof(float);
-    float* in = reinterpret_cast<float*>(dataIn->getData());
+    if (m_input_complexfix_wide) {
+        size_t sizeIn = dataIn->getLength() / sizeof(int32_t);
+        if (m_format_out == "s16") {
+            dataOut->setLength(sizeIn * sizeof(int16_t));
+            const int32_t *in = reinterpret_cast<int32_t*>(dataIn->getData());
+            int16_t* out = reinterpret_cast<int16_t*>(dataOut->getData());
 
-    if (m_format == "s16") {
-        dataOut->setLength(sizeIn * sizeof(int16_t));
-        int16_t* out = reinterpret_cast<int16_t*>(dataOut->getData());
+            constexpr int shift = 6;
 
-        for (size_t i = 0; i < sizeIn; i++) {
-            if (in[i] < INT16_MIN) {
-                out[i] = INT16_MIN;
-                num_clipped_samples++;
+#if defined(__ARM_NEON)
+            if (sizeIn % 4 != 0) {
+                throw std::logic_error("Unexpected length not multiple of 4");
             }
-            else if (in[i] > INT16_MAX) {
-                out[i] = INT16_MAX;
-                num_clipped_samples++;
+
+            for (size_t i = 0; i < sizeIn; i += 4) {
+                int32x4_t input_vec = vld1q_s32(&in[i]);
+                // Apply shift right, saturate on conversion to int16_t
+                int16x4_t output_vec = vqshrn_n_s32(input_vec, shift);
+                vst1_s16(&out[i], output_vec);
             }
-            else {
-                out[i] = in[i];
+#else
+            for (size_t i = 0; i < sizeIn; i++) {
+                const int32_t val = in[i] >> shift;
+                if (val < INT16_MIN) {
+                    out[i] = INT16_MIN;
+                    num_clipped_samples++;
+                }
+                else if (val > INT16_MAX) {
+                    out[i] = INT16_MAX;
+                    num_clipped_samples++;
+                }
+                else {
+                    out[i] = val;
+                }
             }
+#endif
         }
-    }
-    else if (m_format == "u8") {
-        dataOut->setLength(sizeIn * sizeof(int8_t));
-        uint8_t* out = reinterpret_cast<uint8_t*>(dataOut->getData());
-
-        for (size_t i = 0; i < sizeIn; i++) {
-            const auto samp = in[i] + 128.0f;
-            if (samp < 0) {
-                out[i] = 0;
-                num_clipped_samples++;
-            }
-            else if (samp > UINT8_MAX) {
-                out[i] = UINT8_MAX;
-                num_clipped_samples++;
-            }
-            else {
-                out[i] = samp;
-            }
-
+        else {
+            throw std::runtime_error("FormatConverter: Invalid fix format " + m_format_out);
         }
     }
-    else if (m_format == "s8") {
-        dataOut->setLength(sizeIn * sizeof(int8_t));
-        int8_t* out = reinterpret_cast<int8_t*>(dataOut->getData());
-
-        for (size_t i = 0; i < sizeIn; i++) {
-            if (in[i] < INT8_MIN) {
-                out[i] = INT8_MIN;
-                num_clipped_samples++;
+    else {
+        size_t sizeIn = dataIn->getLength() / sizeof(float);
+        const float* in = reinterpret_cast<float*>(dataIn->getData());
+
+        if (m_format_out == "s16") {
+            dataOut->setLength(sizeIn * sizeof(int16_t));
+            int16_t* out = reinterpret_cast<int16_t*>(dataOut->getData());
+
+            for (size_t i = 0; i < sizeIn; i++) {
+                if (in[i] < INT16_MIN) {
+                    out[i] = INT16_MIN;
+                    num_clipped_samples++;
+                }
+                else if (in[i] > INT16_MAX) {
+                    out[i] = INT16_MAX;
+                    num_clipped_samples++;
+                }
+                else {
+                    out[i] = in[i];
+                }
             }
-            else if (in[i] > INT8_MAX) {
-                out[i] = INT8_MAX;
-                num_clipped_samples++;
+        }
+        else if (m_format_out == "u8") {
+            dataOut->setLength(sizeIn * sizeof(int8_t));
+            uint8_t* out = reinterpret_cast<uint8_t*>(dataOut->getData());
+
+            for (size_t i = 0; i < sizeIn; i++) {
+                const auto samp = in[i] + 128.0f;
+                if (samp < 0) {
+                    out[i] = 0;
+                    num_clipped_samples++;
+                }
+                else if (samp > UINT8_MAX) {
+                    out[i] = UINT8_MAX;
+                    num_clipped_samples++;
+                }
+                else {
+                    out[i] = samp;
+                }
+
             }
-            else {
-                out[i] = in[i];
+        }
+        else if (m_format_out == "s8") {
+            dataOut->setLength(sizeIn * sizeof(int8_t));
+            int8_t* out = reinterpret_cast<int8_t*>(dataOut->getData());
+
+            for (size_t i = 0; i < sizeIn; i++) {
+                if (in[i] < INT8_MIN) {
+                    out[i] = INT8_MIN;
+                    num_clipped_samples++;
+                }
+                else if (in[i] > INT8_MAX) {
+                    out[i] = INT8_MAX;
+                    num_clipped_samples++;
+                }
+                else {
+                    out[i] = in[i];
+                }
             }
         }
-    }
-    else {
-        throw std::runtime_error("FormatConverter: Invalid format " + m_format);
+        else {
+            throw std::runtime_error("FormatConverter: Invalid format " + m_format_out);
+        }
     }
 
     m_num_clipped_samples.store(num_clipped_samples);
-
     return dataOut->getLength();
 }
 
diff --git a/src/FormatConverter.h b/src/FormatConverter.h
index 05511c0..1ed2283 100644
--- a/src/FormatConverter.h
+++ b/src/FormatConverter.h
@@ -33,18 +33,19 @@
 #endif
 
 #include "ModPlugin.h"
-#include <complex>
 #include <atomic>
 #include <string>
-#include <cstdint>
 
 class FormatConverter : public ModCodec
 {
     public:
         static size_t get_format_size(const std::string& format);
 
-        // Allowed formats: s8, u8 and s16
-        FormatConverter(const std::string& format);
+        // floating-point input allows output formats: s8, u8 and s16
+        // complexfix_wide input allows output formats: s16
+        // complexfix input is already in s16, and needs no converter
+        FormatConverter(bool input_is_complexfix_wide, const std::string& format_out);
+        virtual ~FormatConverter();
 
         int process(Buffer* const dataIn, Buffer* dataOut);
         const char* name();
@@ -52,7 +53,8 @@ class FormatConverter : public ModCodec
         size_t get_num_clipped_samples() const;
 
     private:
-        std::string m_format;
+        bool m_input_complexfix_wide;
+        std::string m_format_out;
 
         std::atomic<size_t> m_num_clipped_samples = 0;
 };
diff --git a/src/FrameMultiplexer.cpp b/src/FrameMultiplexer.cpp
index e893120..ebd8b76 100644
--- a/src/FrameMultiplexer.cpp
+++ b/src/FrameMultiplexer.cpp
@@ -25,17 +25,11 @@
  */
 
 #include "FrameMultiplexer.h"
-#include "PcDebug.h"
 
-#include <stdio.h>
 #include <string>
-#include <stdexcept>
-#include <complex>
-#include <memory>
-#include <assert.h>
-#include <string.h>
-
-typedef std::complex<float> complexf;
+#include <cstdio>
+#include <cassert>
+#include <cstring>
 
 FrameMultiplexer::FrameMultiplexer(
         const EtiSource& etiSource) :
diff --git a/src/FrequencyInterleaver.cpp b/src/FrequencyInterleaver.cpp
index e76d525..6f36dcb 100644
--- a/src/FrequencyInterleaver.cpp
+++ b/src/FrequencyInterleaver.cpp
@@ -22,17 +22,15 @@
 #include "FrequencyInterleaver.h"
 #include "PcDebug.h"
 
-#include <stdio.h>
 #include <stdexcept>
 #include <string>
-#include <stdlib.h>
-#include <complex>
+#include <cstdio>
+#include <cstdlib>
 
-typedef std::complex<float> complexf;
 
-
-FrequencyInterleaver::FrequencyInterleaver(size_t mode) :
-    ModCodec()
+FrequencyInterleaver::FrequencyInterleaver(size_t mode, bool fixedPoint) :
+    ModCodec(),
+    m_fixedPoint(fixedPoint)
 {
     PDEBUG("FrequencyInterleaver::FrequencyInterleaver(%zu) @ %p\n",
             mode, this);
@@ -42,54 +40,53 @@ FrequencyInterleaver::FrequencyInterleaver(size_t mode) :
     size_t beta;
     switch (mode) {
     case 1:
-        d_carriers = 1536;
+        m_carriers = 1536;
         num = 2048;
         beta = 511;
         break;
     case 2:
-        d_carriers = 384;
+        m_carriers = 384;
         num = 512;
         beta = 127;
         break;
     case 3:
-        d_carriers = 192;
+        m_carriers = 192;
         num = 256;
         beta = 63;
         break;
     case 0:
     case 4:
-        d_carriers = 768;
+        m_carriers = 768;
         num = 1024;
         beta = 255;
         break;
     default:
-        PDEBUG("Carriers: %zu\n", (d_carriers >> 1) << 1);
-        throw std::runtime_error("FrequencyInterleaver::FrequencyInterleaver "
-                "nb of carriers invalid!");
-        break;
+        PDEBUG("Mode: %zu\n", mode); // m_carriers is never assigned on this path, so report the rejected mode instead of reading an indeterminate value
+        throw std::runtime_error("FrequencyInterleaver: invalid dab mode");
     }
 
-    const int ret = posix_memalign((void**)(&d_indexes), 16, d_carriers * sizeof(size_t));
+    const int ret = posix_memalign((void**)(&m_indices), 16, m_carriers * sizeof(size_t));
     if (ret != 0) {
         throw std::runtime_error("memory allocation failed: " + std::to_string(ret));
     }
 
-    size_t* index = d_indexes;
+    size_t *index = m_indices;
     size_t perm = 0;
     PDEBUG("i: %4u, R: %4u\n", 0, 0);
     for (size_t j = 1; j < num; ++j) {
         perm = (alpha * perm + beta) & (num - 1);
-        if (perm >= ((num - d_carriers) / 2)
-                && perm <= (num - (num - d_carriers) / 2)
+        if (perm >= ((num - m_carriers) / 2)
+                && perm <= (num - (num - m_carriers) / 2)
                 && perm != (num / 2)) {
             PDEBUG("i: %4zu, R: %4zu, d: %4zu, n: %4zu, k: %5zi, index: %zu\n",
-                    j, perm, perm, index - d_indexes, perm - num / 2,
+                    j, perm, perm, index - m_indices, perm - num / 2,
                     perm > num / 2
                     ?  perm - (1 + (num / 2))
-                    : perm + (d_carriers - (num / 2)));
+                    : perm + (m_carriers - (num / 2)));
             *(index++) = perm > num / 2 ?
-                perm - (1 + (num / 2)) : perm + (d_carriers - (num / 2));
-        } else {
+                perm - (1 + (num / 2)) : perm + (m_carriers - (num / 2));
+        }
+        else {
             PDEBUG("i: %4zu, R: %4zu\n", j, perm);
         }
     }
@@ -100,9 +97,33 @@ FrequencyInterleaver::~FrequencyInterleaver()
 {
     PDEBUG("FrequencyInterleaver::~FrequencyInterleaver() @ %p\n", this);
 
-    free(d_indexes);
+    free(m_indices);
 }
 
+template<typename T> // T is the sample type; instantiated with complexfix and complexf by FrequencyInterleaver::process
+void do_process(Buffer* const dataIn, Buffer* dataOut, // NOTE: dataOut is not resized here; FrequencyInterleaver::process sets its length before calling
+        size_t carriers, const size_t * const indices) // indices: `carriers` output positions, one per input sample of a symbol
+{
+    const T* in = reinterpret_cast<const T*>(dataIn->getData());
+    T* out = reinterpret_cast<T*>(dataOut->getData());
+    size_t sizeIn = dataIn->getLength() / sizeof(T); // number of T samples in the input
+
+    if (sizeIn % carriers != 0) { // the input must consist of whole symbols of `carriers` samples
+        throw std::runtime_error(
+                "FrequencyInterleaver::process input size not valid!");
+    }
+
+    for (size_t i = 0; i < sizeIn;) { // i is advanced by the inner loop, one symbol per outer pass
+//      memset(out, 0, carriers * sizeof(T));
+        for (size_t j = 0; j < carriers; i += 4, j += 4) { // manually unrolled by 4; NOTE(review): assumes carriers is a multiple of 4 — confirm
+            out[indices[j]] = in[i]; // scatter: sample j of the current symbol lands at output position indices[j]
+            out[indices[j + 1]] = in[i + 1];
+            out[indices[j + 2]] = in[i + 2];
+            out[indices[j + 3]] = in[i + 3];
+        }
+        out += carriers; // move the output window to the next symbol
+    }
+}
 
 int FrequencyInterleaver::process(Buffer* const dataIn, Buffer* dataOut)
 {
@@ -112,24 +133,11 @@ int FrequencyInterleaver::process(Buffer* const dataIn, Buffer* dataOut)
 
     dataOut->setLength(dataIn->getLength());
 
-    const complexf* in = reinterpret_cast<const complexf*>(dataIn->getData());
-    complexf* out = reinterpret_cast<complexf*>(dataOut->getData());
-    size_t sizeIn = dataIn->getLength() / sizeof(complexf);
-
-    if (sizeIn % d_carriers != 0) {
-        throw std::runtime_error(
-                "FrequencyInterleaver::process input size not valid!");
+    if (m_fixedPoint) {
+        do_process<complexfix>(dataIn, dataOut, m_carriers, m_indices);
     }
-
-    for (size_t i = 0; i < sizeIn;) {
-//        memset(out, 0, d_carriers * sizeof(complexf));
-        for (size_t j = 0; j < d_carriers; i += 4, j += 4) {
-            out[d_indexes[j]] = in[i];
-            out[d_indexes[j + 1]] = in[i + 1];
-            out[d_indexes[j + 2]] = in[i + 2];
-            out[d_indexes[j + 3]] = in[i + 3];
-        }
-        out += d_carriers;
+    else {
+        do_process<complexf>(dataIn, dataOut, m_carriers, m_indices);
     }
 
     return 1;
diff --git a/src/FrequencyInterleaver.h b/src/FrequencyInterleaver.h
index 43ca21a..b31b968 100644
--- a/src/FrequencyInterleaver.h
+++ b/src/FrequencyInterleaver.h
@@ -25,16 +25,14 @@
 #   include <config.h>
 #endif
 
-
 #include "ModPlugin.h"
 
 #include <sys/types.h>
 
-
 class FrequencyInterleaver : public ModCodec
 {
 public:
-    FrequencyInterleaver(size_t mode);
+    FrequencyInterleaver(size_t mode, bool fixedPoint);
     virtual ~FrequencyInterleaver();
     FrequencyInterleaver(const FrequencyInterleaver&) = delete;
     FrequencyInterleaver& operator=(const FrequencyInterleaver&) = delete;
@@ -43,7 +41,8 @@ public:
     const char* name() override { return "FrequencyInterleaver"; }
 
 protected:
-    size_t d_carriers;
-    size_t* d_indexes;
+    bool m_fixedPoint;
+    size_t m_carriers;
+    size_t *m_indices;
 };
 
diff --git a/src/GainControl.h b/src/GainControl.h
index 04f6b58..d40a7d7 100644
--- a/src/GainControl.h
+++ b/src/GainControl.h
@@ -35,7 +35,6 @@
 #include "RemoteControl.h"
 
 #include <sys/types.h>
-#include <complex>
 #include <string>
 #include <mutex>
 
@@ -43,9 +42,6 @@
 #   include <xmmintrin.h>
 #endif
 
-
-typedef std::complex<float> complexf;
-
 enum class GainMode { GAIN_FIX = 0, GAIN_MAX = 1, GAIN_VAR = 2 };
 
 class GainControl : public PipelinedModCodec, public RemoteControllable
diff --git a/src/GuardIntervalInserter.cpp b/src/GuardIntervalInserter.cpp
index 3c2db14..26d4fd1 100644
--- a/src/GuardIntervalInserter.cpp
+++ b/src/GuardIntervalInserter.cpp
@@ -29,39 +29,47 @@
 #include <cstring>
 #include <cassert>
 #include <stdexcept>
-#include <complex>
 #include <mutex>
 
-typedef std::complex<float> complexf;
+GuardIntervalInserter::Params::Params( // Bundles the guard-interval settings so the templated do_process can receive them as a single argument
+        size_t nbSymbols,
+        size_t spacing,
+        size_t nullSize,
+        size_t symSize,
+        size_t& windowOverlap) : // taken by non-const reference; NOTE(review): presumably the member is a reference bound to the caller's setting — confirm in GuardIntervalInserter.h
+    nbSymbols(nbSymbols),
+    spacing(spacing),
+    nullSize(nullSize),
+    symSize(symSize),
+    windowOverlap(windowOverlap) {} // the window coefficients themselves are filled in later by update_window()
 
 GuardIntervalInserter::GuardIntervalInserter(
         size_t nbSymbols,
         size_t spacing,
         size_t nullSize,
         size_t symSize,
-        size_t& windowOverlap) :
+        size_t& windowOverlap,
+        FFTEngine fftEngine) :
     ModCodec(),
     RemoteControllable("guardinterval"),
-    d_nbSymbols(nbSymbols),
-    d_spacing(spacing),
-    d_nullSize(nullSize),
-    d_symSize(symSize),
-    d_windowOverlap(windowOverlap)
+    m_fftEngine(fftEngine),
+    m_params(nbSymbols, spacing, nullSize, symSize, windowOverlap)
 {
-    if (d_nullSize == 0) {
+    if (nullSize == 0) {
         throw std::logic_error("NULL symbol must be present");
     }
 
+
     RC_ADD_PARAMETER(windowlen, "Window length for OFDM windowng [0 to disable]");
 
     /* We use a raised-cosine window for the OFDM windowing.
-     * Each symbol is extended on both sides by d_windowOverlap samples.
+     * Each symbol is extended on both sides by windowOverlap samples.
      *
      *
      * Sym n             |####################|
      * Sym n+1                                 |####################|
      *
-     * We now extend the symbols by d_windowOverlap (one dash)
+     * We now extend the symbols by windowOverlap (one dash)
      *
      * Sym n extended   -|####################|-
      * Sym n+1 extended                       -|####################|-
@@ -75,7 +83,7 @@ GuardIntervalInserter::GuardIntervalInserter(
      *                                         /                    \
      *                    ... ________________/                      \__ ...
      *
-     * The window length is 2*d_windowOverlap.
+     * The window length is 2*windowOverlap.
      */
 
     update_window(windowOverlap);
@@ -87,44 +95,43 @@ GuardIntervalInserter::GuardIntervalInserter(
 
 void GuardIntervalInserter::update_window(size_t new_window_overlap)
 {
-    std::lock_guard<std::mutex> lock(d_windowMutex);
+    std::lock_guard<std::mutex> lock(m_params.windowMutex); // same mutex that do_process locks before reading the window
 
-    d_windowOverlap = new_window_overlap;
+    m_params.windowOverlap = new_window_overlap; // NOTE(review): Params received windowOverlap by reference, so this may also update the caller's setting — confirm
 
-    // d_window only contains the rising window edge.
-    d_window.resize(2*d_windowOverlap);
-    for (size_t i = 0; i < 2*d_windowOverlap; i++) {
-        d_window[i] = (float)(0.5 * (1.0 - cos(M_PI * i / (2*d_windowOverlap - 1))));
+    // m_params.window only contains the rising window edge.
+    m_params.window.resize(2*m_params.windowOverlap); // 2*windowOverlap taps; empty when the overlap is 0
+    for (size_t i = 0; i < 2*m_params.windowOverlap; i++) {
+        m_params.window[i] = (float)(0.5 * (1.0 - cos(M_PI * i / (2*m_params.windowOverlap - 1)))); // raised cosine: 0 at i = 0, rising to 1 at the last tap
     }
 }
 
-int GuardIntervalInserter::process(Buffer* const dataIn, Buffer* dataOut)
+template<typename T>
+int do_process(const GuardIntervalInserter::Params& p, Buffer* const dataIn, Buffer* dataOut)
 {
-    PDEBUG("GuardIntervalInserter::process(dataIn: %p, dataOut: %p)\n",
+    PDEBUG("GuardIntervalInserter do_process(dataIn: %p, dataOut: %p)\n",
             dataIn, dataOut);
 
-    std::lock_guard<std::mutex> lock(d_windowMutex);
-
-    // Every symbol overlaps over a length of d_windowOverlap with
+    // Every symbol overlaps over a length of windowOverlap with
     // the previous symbol, and with the next symbol. First symbol
     // receives no prefix window, because we don't remember the
     // last symbol from the previous TF (yet). Last symbol also
     // receives no suffix window, for the same reason.
     // Overall output buffer length must stay independent of the windowing.
-    dataOut->setLength((d_nullSize + (d_nbSymbols * d_symSize)) * sizeof(complexf));
+    dataOut->setLength((p.nullSize + (p.nbSymbols * p.symSize)) * sizeof(T));
 
-    const complexf* in = reinterpret_cast<const complexf*>(dataIn->getData());
-    complexf* out = reinterpret_cast<complexf*>(dataOut->getData());
-    size_t sizeIn = dataIn->getLength() / sizeof(complexf);
+    const T* in = reinterpret_cast<const T*>(dataIn->getData());
+    T* out = reinterpret_cast<T*>(dataOut->getData());
+    size_t sizeIn = dataIn->getLength() / sizeof(T);
 
-    const size_t num_symbols = d_nbSymbols + 1;
-    if (sizeIn != num_symbols * d_spacing)
+    const size_t num_symbols = p.nbSymbols + 1;
+    if (sizeIn != num_symbols * p.spacing)
     {
-        PDEBUG("Nb symbols: %zu\n", d_nbSymbols);
-        PDEBUG("Spacing: %zu\n", d_spacing);
-        PDEBUG("Null size: %zu\n", d_nullSize);
-        PDEBUG("Sym size: %zu\n", d_symSize);
-        PDEBUG("\n%zu != %zu\n", sizeIn, (d_nbSymbols + 1) * d_spacing);
+        PDEBUG("Nb symbols: %zu\n", p.nbSymbols);
+        PDEBUG("Spacing: %zu\n", p.spacing);
+        PDEBUG("Null size: %zu\n", p.nullSize);
+        PDEBUG("Sym size: %zu\n", p.symSize);
+        PDEBUG("\n%zu != %zu\n", sizeIn, (p.nbSymbols + 1) * p.spacing);
         throw std::runtime_error(
                 "GuardIntervalInserter::process input size not valid!");
     }
@@ -132,139 +139,162 @@ int GuardIntervalInserter::process(Buffer* const dataIn, Buffer* dataOut)
     // TODO remember the end of the last TF so that we can do some
     //      windowing too.
 
-    if (d_windowOverlap) {
-        {
-            // Handle Null symbol separately because it is longer
-            const size_t prefixlength = d_nullSize - d_spacing;
-
-            // end = spacing
-            memcpy(out, &in[d_spacing - prefixlength],
-                    prefixlength * sizeof(complexf));
-
-            memcpy(&out[prefixlength], in, (d_spacing - d_windowOverlap) * sizeof(complexf));
+    std::lock_guard<std::mutex> lock(p.windowMutex);
+    if (p.windowOverlap) {
+        if constexpr (std::is_same_v<complexf, T>) {
+            {
+                // Handle Null symbol separately because it is longer
+                const size_t prefixlength = p.nullSize - p.spacing;
+
+                // end = spacing
+                memcpy(out, &in[p.spacing - prefixlength],
+                        prefixlength * sizeof(T));
+
+                memcpy(&out[prefixlength], in, (p.spacing - p.windowOverlap) * sizeof(T));
+
+                // The remaining part of the symbol must have half of the window applied,
+                // sloping down from 1 to 0.5
+                for (size_t i = 0; i < p.windowOverlap; i++) {
+                    const size_t out_ix = prefixlength + p.spacing - p.windowOverlap + i;
+                    const size_t in_ix = p.spacing - p.windowOverlap + i;
+                    out[out_ix] = in[in_ix] * p.window[2*p.windowOverlap - (i+1)];
+                }
 
-            // The remaining part of the symbol must have half of the window applied,
-            // sloping down from 1 to 0.5
-            for (size_t i = 0; i < d_windowOverlap; i++) {
-                const size_t out_ix = prefixlength + d_spacing - d_windowOverlap + i;
-                const size_t in_ix = d_spacing - d_windowOverlap + i;
-                out[out_ix] = in[in_ix] * d_window[2*d_windowOverlap - (i+1)];
-            }
+                // Suffix is taken from the beginning of the symbol, and sees the other
+                // half of the window applied.
+                for (size_t i = 0; i < p.windowOverlap; i++) {
+                    const size_t out_ix = prefixlength + p.spacing + i;
+                    out[out_ix] = in[i] * p.window[p.windowOverlap - (i+1)];
+                }
 
-            // Suffix is taken from the beginning of the symbol, and sees the other
-            // half of the window applied.
-            for (size_t i = 0; i < d_windowOverlap; i++) {
-                const size_t out_ix = prefixlength + d_spacing + i;
-                out[out_ix] = in[i] * d_window[d_windowOverlap - (i+1)];
+                in += p.spacing;
+                out += p.nullSize;
+                // out is now pointing to the proper end of symbol. There are
+                // windowOverlap samples ahead that were already written.
             }
 
-            in += d_spacing;
-            out += d_nullSize;
-            // out is now pointing to the proper end of symbol. There are
-            // d_windowOverlap samples ahead that were already written.
-        }
-
-        // Data symbols
-        for (size_t sym_ix = 0; sym_ix < d_nbSymbols; sym_ix++) {
-            /* _ix variables are indices into in[], _ox variables are
-             * indices for out[] */
-            const ssize_t start_rise_ox = -d_windowOverlap;
-            const size_t start_rise_ix = 2 * d_spacing - d_symSize - d_windowOverlap;
-            /*
-            const size_t start_real_symbol_ox = 0;
-            const size_t start_real_symbol_ix = 2 * d_spacing - d_symSize;
-            */
-            const ssize_t end_rise_ox = d_windowOverlap;
-            const size_t end_rise_ix = 2 * d_spacing - d_symSize + d_windowOverlap;
-            const ssize_t end_cyclic_prefix_ox = d_symSize - d_spacing;
-            /* end_cyclic_prefix_ix = end of symbol
-            const size_t begin_fall_ox = d_symSize - d_windowOverlap;
-            const size_t begin_fall_ix = d_spacing - d_windowOverlap;
-            const size_t end_real_symbol_ox = d_symSize;
-             end_real_symbol_ix = end of symbol
-            const size_t end_fall_ox = d_symSize + d_windowOverlap;
-            const size_t end_fall_ix = d_spacing + d_windowOverlap;
-            */
-
-            ssize_t ox = start_rise_ox;
-            size_t ix = start_rise_ix;
-
-            for (size_t i = 0; ix < end_rise_ix; i++) {
-                out[ox] += in[ix] * d_window.at(i);
-                ix++;
-                ox++;
-            }
-            assert(ox == end_rise_ox);
-
-            const size_t remaining_prefix_length = end_cyclic_prefix_ox - end_rise_ox;
-            memcpy( &out[ox], &in[ix],
-                    remaining_prefix_length * sizeof(complexf));
-            ox += remaining_prefix_length;
-            assert(ox == end_cyclic_prefix_ox);
-            ix = 0;
-
-            const bool last_symbol = (sym_ix + 1 >= d_nbSymbols);
-            if (last_symbol) {
-                // No windowing at all at end
-                memcpy(&out[ox], &in[ix], d_spacing * sizeof(complexf));
-                ox += d_spacing;
-            }
-            else {
-                // Copy the middle part of the symbol, d_windowOverlap samples
-                // short of the end.
-                memcpy( &out[ox],
-                        &in[ix],
-                        (d_spacing - d_windowOverlap) * sizeof(complexf));
-                ox += d_spacing - d_windowOverlap;
-                ix += d_spacing - d_windowOverlap;
-                assert(ox == (ssize_t)(d_symSize - d_windowOverlap));
-
-                // Apply window from 1 to 0.5 for the end of the symbol
-                for (size_t i = 0; ox < (ssize_t)d_symSize; i++) {
-                    out[ox] = in[ix] * d_window[2*d_windowOverlap - (i+1)];
-                    ox++;
+            // Data symbols
+            for (size_t sym_ix = 0; sym_ix < p.nbSymbols; sym_ix++) {
+                /* _ix variables are indices into in[], _ox variables are
+                 * indices for out[] */
+                const ssize_t start_rise_ox = -p.windowOverlap;
+                const size_t start_rise_ix = 2 * p.spacing - p.symSize - p.windowOverlap;
+                /*
+                   const size_t start_real_symbol_ox = 0;
+                   const size_t start_real_symbol_ix = 2 * p.spacing - p.symSize;
+                   */
+                const ssize_t end_rise_ox = p.windowOverlap;
+                const size_t end_rise_ix = 2 * p.spacing - p.symSize + p.windowOverlap;
+                const ssize_t end_cyclic_prefix_ox = p.symSize - p.spacing;
+                /* end_cyclic_prefix_ix = end of symbol
+                   const size_t begin_fall_ox = p.symSize - p.windowOverlap;
+                   const size_t begin_fall_ix = p.spacing - p.windowOverlap;
+                   const size_t end_real_symbol_ox = p.symSize;
+                   end_real_symbol_ix = end of symbol
+                   const size_t end_fall_ox = p.symSize + p.windowOverlap;
+                   const size_t end_fall_ix = p.spacing + p.windowOverlap;
+                   */
+
+                ssize_t ox = start_rise_ox;
+                size_t ix = start_rise_ix;
+
+                for (size_t i = 0; ix < end_rise_ix; i++) {
+                    out[ox] += in[ix] * p.window.at(i);
                     ix++;
+                    ox++;
                 }
-                assert(ix == d_spacing);
+                assert(ox == end_rise_ox);
 
+                const size_t remaining_prefix_length = end_cyclic_prefix_ox - end_rise_ox;
+                memcpy( &out[ox], &in[ix],
+                        remaining_prefix_length * sizeof(T));
+                ox += remaining_prefix_length;
+                assert(ox == end_cyclic_prefix_ox);
                 ix = 0;
-                // Cyclic suffix, with window from 0.5 to 0
-                for (size_t i = 0; ox < (ssize_t)(d_symSize + d_windowOverlap); i++) {
-                    out[ox] = in[ix] * d_window[d_windowOverlap - (i+1)];
-                    ox++;
-                    ix++;
+
+                const bool last_symbol = (sym_ix + 1 >= p.nbSymbols);
+                if (last_symbol) {
+                    // No windowing at all at end
+                    memcpy(&out[ox], &in[ix], p.spacing * sizeof(T));
+                    ox += p.spacing;
+                }
+                else {
+                    // Copy the middle part of the symbol, p.windowOverlap samples
+                    // short of the end.
+                    memcpy( &out[ox],
+                            &in[ix],
+                            (p.spacing - p.windowOverlap) * sizeof(T));
+                    ox += p.spacing - p.windowOverlap;
+                    ix += p.spacing - p.windowOverlap;
+                    assert(ox == (ssize_t)(p.symSize - p.windowOverlap));
+
+                    // Apply window from 1 to 0.5 for the end of the symbol
+                    for (size_t i = 0; ox < (ssize_t)p.symSize; i++) {
+                        out[ox] = in[ix] * p.window[2*p.windowOverlap - (i+1)];
+                        ox++;
+                        ix++;
+                    }
+                    assert(ix == p.spacing);
+
+                    ix = 0;
+                    // Cyclic suffix, with window from 0.5 to 0
+                    for (size_t i = 0; ox < (ssize_t)(p.symSize + p.windowOverlap); i++) {
+                        out[ox] = in[ix] * p.window[p.windowOverlap - (i+1)];
+                        ox++;
+                        ix++;
+                    }
+
+                    assert(ix == p.windowOverlap);
                 }
 
-                assert(ix == d_windowOverlap);
+                out += p.symSize;
+                in += p.spacing;
+                // out is now pointing to the proper end of symbol. There are
+                // windowOverlap samples ahead that were already written.
             }
-
-            out += d_symSize;
-            in += d_spacing;
-            // out is now pointing to the proper end of symbol. There are
-            // d_windowOverlap samples ahead that were already written.
+        }
+        else {
+            throw std::runtime_error("fixed-point doesn't support window overlap");
         }
     }
     else {
         // Handle Null symbol separately because it is longer
         // end - (nullSize - spacing) = 2 * spacing - nullSize
-        memcpy(out, &in[2 * d_spacing - d_nullSize],
-                (d_nullSize - d_spacing) * sizeof(complexf));
-        memcpy(&out[d_nullSize - d_spacing], in, d_spacing * sizeof(complexf));
-        in += d_spacing;
-        out += d_nullSize;
+        memcpy(out, &in[2 * p.spacing - p.nullSize],
+                (p.nullSize - p.spacing) * sizeof(T));
+        memcpy(&out[p.nullSize - p.spacing], in, p.spacing * sizeof(T));
+        in += p.spacing;
+        out += p.nullSize;
 
         // Data symbols
-        for (size_t i = 0; i < d_nbSymbols; ++i) {
+        for (size_t i = 0; i < p.nbSymbols; ++i) {
             // end - (symSize - spacing) = 2 * spacing - symSize
-            memcpy(out, &in[2 * d_spacing - d_symSize],
-                    (d_symSize - d_spacing) * sizeof(complexf));
-            memcpy(&out[d_symSize - d_spacing], in, d_spacing * sizeof(complexf));
-            in += d_spacing;
-            out += d_symSize;
+            memcpy(out, &in[2 * p.spacing - p.symSize],
+                    (p.symSize - p.spacing) * sizeof(T));
+            memcpy(&out[p.symSize - p.spacing], in, p.spacing * sizeof(T));
+            in += p.spacing;
+            out += p.symSize;
         }
     }
 
-    return sizeIn;
+    const auto sizeOut = dataOut->getLength();
+    return sizeOut;
+}
+
+int GuardIntervalInserter::process(Buffer* const dataIn, Buffer* dataOut)
+{   // Forwards to the do_process<T> instantiation whose sample type T matches m_fftEngine.
+    switch (m_fftEngine) {
+        case FFTEngine::FFTW: // complexf samples
+            return do_process<complexf>(m_params, dataIn, dataOut);
+        case FFTEngine::KISS: // complexfix samples
+            if (m_params.windowOverlap) { // non-zero overlap requests windowing, which is refused for this engine
+                throw std::runtime_error("fixed point and ofdm windowing not supported");
+            }
+            return do_process<complexfix>(m_params, dataIn, dataOut);
+        case FFTEngine::DEXTER: // complexfix_wide samples
+            return do_process<complexfix_wide>(m_params, dataIn, dataOut);
+    }
+    throw std::logic_error("Unhandled fftEngine variant"); // only reached when no case above returned or threw
 }
 
 void GuardIntervalInserter::set_parameter(
@@ -293,7 +323,7 @@ const std::string GuardIntervalInserter::get_parameter(const std::string& parame
     using namespace std;
     stringstream ss;
     if (parameter == "windowlen") {
-        ss << d_windowOverlap;
+        ss << m_params.windowOverlap;
     }
     else {
         ss << "Parameter '" << parameter <<
@@ -306,6 +336,6 @@ const std::string GuardIntervalInserter::get_parameter(const std::string& parame
 const json::map_t GuardIntervalInserter::get_all_values() const
 {
     json::map_t map;
-    map["windowlen"].v = d_windowOverlap;
+    map["windowlen"].v = m_params.windowOverlap;
     return map;
 }
diff --git a/src/GuardIntervalInserter.h b/src/GuardIntervalInserter.h
index f78ac91..8d329ff 100644
--- a/src/GuardIntervalInserter.h
+++ b/src/GuardIntervalInserter.h
@@ -30,6 +30,7 @@
 #   include <config.h>
 #endif
 
+#include "ConfigParser.h"
 #include "ModPlugin.h"
 #include "RemoteControl.h"
 #include <stdint.h>
@@ -50,7 +51,8 @@ class GuardIntervalInserter : public ModCodec, public RemoteControllable
                 size_t spacing,
                 size_t nullSize,
                 size_t symSize,
-                size_t& windowOverlap);
+                size_t& windowOverlap,
+                FFTEngine fftEngine);
 
         virtual ~GuardIntervalInserter() {}
 
@@ -62,16 +64,30 @@ class GuardIntervalInserter : public ModCodec, public RemoteControllable
         virtual const std::string get_parameter(const std::string& parameter) const override;
         virtual const json::map_t get_all_values() const override;
 
+        struct Params { // Values handed to the templated do_process (as m_params)
+            Params(
+                size_t nbSymbols,
+                size_t spacing,
+                size_t nullSize,
+                size_t symSize,
+                size_t& windowOverlap);
+
+            size_t nbSymbols; // data symbols per call; the input must hold (nbSymbols + 1) * spacing samples
+            size_t spacing; // length of each input symbol, in samples
+            size_t nullSize; // output length of the first (null) symbol, in samples
+            size_t symSize; // output length of every following data symbol, in samples
+            size_t& windowOverlap; // reference bound at construction; zero selects the plain-copy path in do_process
+
+            mutable std::mutex windowMutex; // locked by do_process before it reads windowOverlap and window
+            std::vector<float> window; // window coefficients; do_process indexes up to 2*windowOverlap - 1
+        };
+
     protected:
         void update_window(size_t new_window_overlap);
 
-        size_t d_nbSymbols;
-        size_t d_spacing;
-        size_t d_nullSize;
-        size_t d_symSize;
+        FFTEngine m_fftEngine;
+
+        Params m_params;
 
-        mutable std::mutex d_windowMutex;
-        size_t& d_windowOverlap;
-        std::vector<float> d_window;
 };
 
diff --git a/src/MemlessPoly.h b/src/MemlessPoly.h
index 91e6860..72de62c 100644
--- a/src/MemlessPoly.h
+++ b/src/MemlessPoly.h
@@ -32,13 +32,10 @@
 
 #include "RemoteControl.h"
 #include "ModPlugin.h"
-#include "PcDebug.h"
 #include "ThreadsafeQueue.h"
 
 #include <sys/types.h>
 #include <array>
-#include <complex>
-#include <memory>
 #include <string>
 #include <thread>
 #include <vector>
@@ -47,8 +44,6 @@
 
 #define MEMLESSPOLY_PIPELINE_DELAY 1
 
-typedef std::complex<float> complexf;
-
 enum class dpd_type_t {
     odd_only_poly,
     lookup_table
diff --git a/src/ModPlugin.h b/src/ModPlugin.h
index 470508f..bb3ee2c 100644
--- a/src/ModPlugin.h
+++ b/src/ModPlugin.h
@@ -33,9 +33,7 @@
 #include "Buffer.h"
 #include "ThreadsafeQueue.h"
 #include "TimestampDecoder.h"
-#include <cstddef>
 #include <vector>
-#include <memory>
 #include <thread>
 #include <atomic>
 
diff --git a/src/NullSymbol.cpp b/src/NullSymbol.cpp
index 4684dfe..526e662 100644
--- a/src/NullSymbol.cpp
+++ b/src/NullSymbol.cpp
@@ -27,18 +27,16 @@
 #include "NullSymbol.h"
 #include "PcDebug.h"
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <complex>
-#include <string.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
 
-typedef std::complex<float> complexf;
-
-NullSymbol::NullSymbol(size_t nbCarriers) :
+NullSymbol::NullSymbol(size_t numCarriers, size_t typeSize) :
     ModInput(),
-    myNbCarriers(nbCarriers)
+    m_numCarriers(numCarriers), // number of carriers to emit per process() call
+    m_typeSize(typeSize) // bytes per carrier; process() sets the output length to m_numCarriers * m_typeSize
 {
-    PDEBUG("NullSymbol::NullSymbol(%zu) @ %p\n", nbCarriers, this);
+    PDEBUG("NullSymbol::NullSymbol(%zu) @ %p\n", numCarriers, this);
 }
 
 
@@ -52,7 +50,7 @@ int NullSymbol::process(Buffer* dataOut)
 {
     PDEBUG("NullSymbol::process(dataOut: %p)\n", dataOut);
 
-    dataOut->setLength(myNbCarriers * 2 * sizeof(float));
+    dataOut->setLength(m_numCarriers * m_typeSize);
     memset(dataOut->getData(), 0, dataOut->getLength());
 
     return dataOut->getLength();
diff --git a/src/NullSymbol.h b/src/NullSymbol.h
index 814e434..6ba9e63 100644
--- a/src/NullSymbol.h
+++ b/src/NullSymbol.h
@@ -39,14 +39,14 @@
 class NullSymbol : public ModInput
 {
 public:
-    NullSymbol(size_t nbCarriers);
+    NullSymbol(size_t numCarriers, size_t typeSize); // typeSize: size in bytes of one carrier sample
     virtual ~NullSymbol();
 
     int process(Buffer* dataOut);
     const char* name() { return "NullSymbol"; }
 
 private:
-    size_t myNbCarriers;
-
+    size_t m_numCarriers; // carriers written per process() call
+    size_t m_typeSize; // bytes per carrier; output length is m_numCarriers * m_typeSize
 };
 
diff --git a/src/OfdmGenerator.cpp b/src/OfdmGenerator.cpp
index cb799d3..38648c9 100644
--- a/src/OfdmGenerator.cpp
+++ b/src/OfdmGenerator.cpp
@@ -2,7 +2,7 @@
    Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 Her Majesty
    the Queen in Right of Canada (Communications Research Center Canada)
 
-   Copyright (C) 2023
+   Copyright (C) 2024
    Matthias P. Braendli, matthias.braendli@mpb.li
 
     http://opendigitalradio.org
@@ -27,17 +27,19 @@
 #include "OfdmGenerator.h"
 #include "PcDebug.h"
 
-#define FFT_TYPE fftwf_complex
-
-#include <string.h>
 #include <stdexcept>
 #include <assert.h>
 #include <string>
 #include <numeric>
+#include <vector>
+#include <cstring>
+#include <complex>
 
 static const size_t MAX_CLIP_STATS = 10;
 
-OfdmGenerator::OfdmGenerator(size_t nbSymbols,
+using FFTW_TYPE = fftwf_complex;
+
+OfdmGeneratorCF32::OfdmGeneratorCF32(size_t nbSymbols,
                              size_t nbCarriers,
                              size_t spacing,
                              bool& enableCfr,
@@ -62,8 +64,7 @@ OfdmGenerator::OfdmGenerator(size_t nbSymbols,
             nbSymbols, nbCarriers, spacing, inverse ? "true" : "false", this);
 
     if (nbCarriers > spacing) {
-        throw std::runtime_error(
-                "OfdmGenerator::OfdmGenerator nbCarriers > spacing!");
+        throw std::runtime_error("OfdmGenerator nbCarriers > spacing!");
     }
 
     /* register the parameters that can be remote controlled */
@@ -102,29 +103,29 @@ OfdmGenerator::OfdmGenerator(size_t nbSymbols,
     PDEBUG("  myZeroSize: %u\n", myZeroSize);
 
     const int N = mySpacing; // The size of the FFT
-    myFftIn = (FFT_TYPE*)fftwf_malloc(sizeof(FFT_TYPE) * N);
-    myFftOut = (FFT_TYPE*)fftwf_malloc(sizeof(FFT_TYPE) * N);
+    myFftIn = (FFTW_TYPE*)fftwf_malloc(sizeof(FFTW_TYPE) * N);
+    myFftOut = (FFTW_TYPE*)fftwf_malloc(sizeof(FFTW_TYPE) * N);
     fftwf_set_timelimit(2);
     myFftPlan = fftwf_plan_dft_1d(N,
             myFftIn, myFftOut,
             FFTW_BACKWARD, FFTW_MEASURE);
 
-    myCfrPostClip = (FFT_TYPE*)fftwf_malloc(sizeof(FFT_TYPE) * N);
-    myCfrPostFft = (FFT_TYPE*)fftwf_malloc(sizeof(FFT_TYPE) * N);
+    myCfrPostClip = (FFTW_TYPE*)fftwf_malloc(sizeof(FFTW_TYPE) * N);
+    myCfrPostFft = (FFTW_TYPE*)fftwf_malloc(sizeof(FFTW_TYPE) * N);
     myCfrFft = fftwf_plan_dft_1d(N,
             myCfrPostClip, myCfrPostFft,
             FFTW_FORWARD, FFTW_MEASURE);
 
-    if (sizeof(complexf) != sizeof(FFT_TYPE)) {
+    if (sizeof(complexf) != sizeof(FFTW_TYPE)) {
         printf("sizeof(complexf) %zu\n", sizeof(complexf));
-        printf("sizeof(FFT_TYPE) %zu\n", sizeof(FFT_TYPE));
+        printf("sizeof(FFT_TYPE) %zu\n", sizeof(FFTW_TYPE));
         throw std::runtime_error(
                 "OfdmGenerator::process complexf size is not FFT_TYPE size!");
     }
 }
 
 
-OfdmGenerator::~OfdmGenerator()
+OfdmGeneratorCF32::~OfdmGeneratorCF32()
 {
     PDEBUG("OfdmGenerator::~OfdmGenerator() @ %p\n", this);
 
@@ -153,15 +154,15 @@ OfdmGenerator::~OfdmGenerator()
     }
 }
 
-int OfdmGenerator::process(Buffer* const dataIn, Buffer* dataOut)
+int OfdmGeneratorCF32::process(Buffer* const dataIn, Buffer* dataOut)
 {
     PDEBUG("OfdmGenerator::process(dataIn: %p, dataOut: %p)\n",
             dataIn, dataOut);
 
     dataOut->setLength(myNbSymbols * mySpacing * sizeof(complexf));
 
-    FFT_TYPE* in = reinterpret_cast<FFT_TYPE*>(dataIn->getData());
-    FFT_TYPE* out = reinterpret_cast<FFT_TYPE*>(dataOut->getData());
+    FFTW_TYPE *in = reinterpret_cast<FFTW_TYPE*>(dataIn->getData());
+    FFTW_TYPE *out = reinterpret_cast<FFTW_TYPE*>(dataOut->getData());
 
     size_t sizeIn = dataIn->getLength() / sizeof(complexf);
     size_t sizeOut = dataOut->getLength() / sizeof(complexf);
@@ -203,7 +204,7 @@ int OfdmGenerator::process(Buffer* const dataIn, Buffer* dataOut)
         myPaprAfterCFR.clear();
     }
 
-    for (size_t i = 0; i < myNbSymbols; ++i) {
+    for (size_t i = 0; i < myNbSymbols; i++) {
         myFftIn[0][0] = 0;
         myFftIn[0][1] = 0;
 
@@ -212,22 +213,20 @@ int OfdmGenerator::process(Buffer* const dataIn, Buffer* dataOut)
          * PosSrc=0 PosDst=1 PosSize=768
          * NegSrc=768 NegDst=1280 NegSize=768
          */
-        memset(&myFftIn[myZeroDst], 0, myZeroSize * sizeof(FFT_TYPE));
+        memset(&myFftIn[myZeroDst], 0, myZeroSize * sizeof(FFTW_TYPE));
         memcpy(&myFftIn[myPosDst], &in[myPosSrc],
-                myPosSize * sizeof(FFT_TYPE));
+                myPosSize * sizeof(FFTW_TYPE));
         memcpy(&myFftIn[myNegDst], &in[myNegSrc],
-                myNegSize * sizeof(FFT_TYPE));
-
+                myNegSize * sizeof(FFTW_TYPE));
 
         if (myCfr) {
             reference.resize(mySpacing);
             memcpy(reinterpret_cast<fftwf_complex*>(reference.data()),
-                    myFftIn, mySpacing * sizeof(FFT_TYPE));
+                    myFftIn, mySpacing * sizeof(FFTW_TYPE));
         }
 
         fftwf_execute(myFftPlan); // IFFT from myFftIn to myFftOut
 
-        
         if (myCfr) {
             complexf *symbol = reinterpret_cast<complexf*>(myFftOut);
             myPaprBeforeCFR.process_block(symbol, mySpacing);
@@ -235,7 +234,7 @@ int OfdmGenerator::process(Buffer* const dataIn, Buffer* dataOut)
             if (myMERCalcIndex == i) {
                 before_cfr.resize(mySpacing);
                 memcpy(reinterpret_cast<fftwf_complex*>(before_cfr.data()),
-                        myFftOut, mySpacing * sizeof(FFT_TYPE));
+                        myFftOut, mySpacing * sizeof(FFTW_TYPE));
             }
 
             /* cfr_one_iteration runs the myFftPlan again at the end, and
@@ -277,7 +276,7 @@ int OfdmGenerator::process(Buffer* const dataIn, Buffer* dataOut)
             num_error_clip += stat.errclip_count;
         }
 
-        memcpy(out, myFftOut, mySpacing * sizeof(FFT_TYPE));
+        memcpy(out, myFftOut, mySpacing * sizeof(FFTW_TYPE));
 
         in += myNbCarriers;
         out += mySpacing;
@@ -308,14 +307,14 @@ int OfdmGenerator::process(Buffer* const dataIn, Buffer* dataOut)
     return sizeOut;
 }
 
-OfdmGenerator::cfr_iter_stat_t OfdmGenerator::cfr_one_iteration(
+OfdmGeneratorCF32::cfr_iter_stat_t OfdmGeneratorCF32::cfr_one_iteration(
         complexf *symbol, const complexf *reference)
 {
     // use std::norm instead of std::abs to avoid calculating the
     // square roots
     const float clip_squared = myCfrClip * myCfrClip;
 
-    OfdmGenerator::cfr_iter_stat_t ret;
+    OfdmGeneratorCF32::cfr_iter_stat_t ret;
 
     // Clip
     for (size_t i = 0; i < mySpacing; i++) {
@@ -331,7 +330,7 @@ OfdmGenerator::cfr_iter_stat_t OfdmGenerator::cfr_one_iteration(
     }
 
     // Take FFT of our clipped signal
-    memcpy(myCfrPostClip, symbol, mySpacing * sizeof(FFT_TYPE));
+    memcpy(myCfrPostClip, symbol, mySpacing * sizeof(FFTW_TYPE));
     fftwf_execute(myCfrFft); // FFT from myCfrPostClip to myCfrPostFft
 
     // Calculate the error in frequency domain by subtracting our reference
@@ -374,7 +373,7 @@ OfdmGenerator::cfr_iter_stat_t OfdmGenerator::cfr_one_iteration(
 }
 
 
-void OfdmGenerator::set_parameter(const std::string& parameter,
+void OfdmGeneratorCF32::set_parameter(const std::string& parameter,
                                   const std::string& value)
 {
     using namespace std;
@@ -404,7 +403,7 @@ void OfdmGenerator::set_parameter(const std::string& parameter,
     }
 }
 
-const std::string OfdmGenerator::get_parameter(const std::string& parameter) const
+const std::string OfdmGeneratorCF32::get_parameter(const std::string& parameter) const
 {
     using namespace std;
     stringstream ss;
@@ -458,9 +457,333 @@ const std::string OfdmGenerator::get_parameter(const std::string& parameter) con
     return ss.str();
 }
 
-const json::map_t OfdmGenerator::get_all_values() const
+const json::map_t OfdmGeneratorCF32::get_all_values() const
 {
     json::map_t map;
     // TODO needs rework of the values
     return map;
 }
+
+OfdmGeneratorFixed::OfdmGeneratorFixed(size_t nbSymbols,
+                             size_t nbCarriers,
+                             size_t spacing,
+                             bool inverse) :
+    ModCodec(),
+    myNbSymbols(nbSymbols),
+    myNbCarriers(nbCarriers),
+    mySpacing(spacing)
+{   // Sets up the carrier-to-bin mapping and the KISS FFT buffers/config for a transform of size `spacing`.
+    PDEBUG("OfdmGeneratorFixed::OfdmGeneratorFixed(%zu, %zu, %zu, %s) @ %p\n",
+            nbSymbols, nbCarriers, spacing, inverse ? "true" : "false", this);
+
+    etiLog.level(info) << "Using KISS FFT by Mark Borgerding for fixed-point transform";
+
+    if (nbCarriers > spacing) {
+        throw std::runtime_error("OfdmGenerator nbCarriers > spacing!");
+    }
+    // "Pos" carriers land at the low bins (from myPosDst), "Neg" carriers at the top bins; `inverse` swaps which input half feeds which.
+    if (inverse) {
+        myPosDst = (nbCarriers & 1 ? 0 : 1);
+        myPosSrc = 0;
+        myPosSize = (nbCarriers + 1) / 2;
+        myNegDst = spacing - (nbCarriers / 2);
+        myNegSrc = (nbCarriers + 1) / 2;
+        myNegSize = nbCarriers / 2;
+    }
+    else {
+        myPosDst = (nbCarriers & 1 ? 0 : 1);
+        myPosSrc = nbCarriers / 2;
+        myPosSize = (nbCarriers + 1) / 2;
+        myNegDst = spacing - (nbCarriers / 2);
+        myNegSrc = 0;
+        myNegSize = nbCarriers / 2;
+    }
+    myZeroDst = myPosDst + myPosSize; // bins between the two carrier groups are zero-filled by process()
+    myZeroSize = myNegDst - myZeroDst;
+
+    PDEBUG("  myPosDst: %u\n", myPosDst);
+    PDEBUG("  myPosSrc: %u\n", myPosSrc);
+    PDEBUG("  myPosSize: %u\n", myPosSize);
+    PDEBUG("  myNegDst: %u\n", myNegDst);
+    PDEBUG("  myNegSrc: %u\n", myNegSrc);
+    PDEBUG("  myNegSize: %u\n", myNegSize);
+    PDEBUG("  myZeroDst: %u\n", myZeroDst);
+    PDEBUG("  myZeroSize: %u\n", myZeroSize);
+
+    const int N = mySpacing; // The size of the FFT
+
+    const size_t nbytes = N * sizeof(kiss_fft_cpx);
+    myFftIn = (kiss_fft_cpx*)KISS_FFT_MALLOC(nbytes);
+    myFftOut = (kiss_fft_cpx*)KISS_FFT_MALLOC(nbytes);
+    memset(myFftIn, 0, nbytes);
+    // NOTE(review): `inverse` also picks the KISS transform direction here, while the FFTW path always plans FFTW_BACKWARD -- confirm intended.
+    myKissCfg = kiss_fft_alloc(N, inverse, nullptr, nullptr);
+}
+
+OfdmGeneratorFixed::~OfdmGeneratorFixed()
+{   // Frees what the constructor allocated; each pointer is skipped when null.
+    if (myKissCfg) KISS_FFT_FREE(myKissCfg); // configuration returned by kiss_fft_alloc
+    if (myFftIn) KISS_FFT_FREE(myFftIn); // FFT input work buffer
+    if (myFftOut) KISS_FFT_FREE(myFftOut); // FFT output work buffer
+}
+
+int OfdmGeneratorFixed::process(Buffer* const dataIn, Buffer* dataOut)
+{   // Transforms myNbSymbols symbols of myNbCarriers samples each into myNbSymbols blocks of mySpacing samples.
+    dataOut->setLength(myNbSymbols * mySpacing * sizeof(kiss_fft_cpx));
+
+    kiss_fft_cpx *in = reinterpret_cast<kiss_fft_cpx*>(dataIn->getData());
+    kiss_fft_cpx *out = reinterpret_cast<kiss_fft_cpx*>(dataOut->getData());
+
+    size_t sizeIn = dataIn->getLength() / sizeof(kiss_fft_cpx); // lengths in samples, not bytes
+    size_t sizeOut = dataOut->getLength() / sizeof(kiss_fft_cpx);
+
+    if (sizeIn != myNbSymbols * myNbCarriers) {
+        PDEBUG("Nb symbols: %zu\n", myNbSymbols);
+        PDEBUG("Nb carriers: %zu\n", myNbCarriers);
+        PDEBUG("Spacing: %zu\n", mySpacing);
+        PDEBUG("\n%zu != %zu\n", sizeIn, myNbSymbols * myNbCarriers);
+        throw std::runtime_error(
+                "OfdmGenerator::process input size not valid!");
+    }
+    if (sizeOut != myNbSymbols * mySpacing) {
+        PDEBUG("Nb symbols: %zu\n", myNbSymbols);
+        PDEBUG("Nb carriers: %zu\n", myNbCarriers);
+        PDEBUG("Spacing: %zu\n", mySpacing);
+        PDEBUG("\n%zu != %zu\n", sizeOut, myNbSymbols * mySpacing);
+        throw std::runtime_error(
+                "OfdmGenerator::process output size not valid!");
+    }
+    // Per symbol: place the carriers into the zero-padded FFT input, run the transform, copy mySpacing samples out.
+    for (size_t i = 0; i < myNbSymbols; i++) {
+        myFftIn[0].r = 0;
+        myFftIn[0].i = 0;
+
+        /* For TM I this is:
+         * ZeroDst=769 ZeroSize=511
+         * PosSrc=0 PosDst=1 PosSize=768
+         * NegSrc=768 NegDst=1280 NegSize=768
+         */
+        memset(&myFftIn[myZeroDst], 0, myZeroSize * sizeof(kiss_fft_cpx));
+        memcpy(&myFftIn[myPosDst], &in[myPosSrc], myPosSize * sizeof(kiss_fft_cpx));
+        memcpy(&myFftIn[myNegDst], &in[myNegSrc], myNegSize * sizeof(kiss_fft_cpx));
+
+        kiss_fft(myKissCfg, myFftIn, myFftOut);
+
+        memcpy(out, myFftOut, mySpacing * sizeof(kiss_fft_cpx));
+
+        in += myNbCarriers; // advance to the next symbol's carriers
+        out += mySpacing; // advance to the next output block
+    }
+
+    return sizeOut; // number of output samples
+}
+
+#ifdef HAVE_DEXTER
+OfdmGeneratorDEXTER::OfdmGeneratorDEXTER(size_t nbSymbols,
+                             size_t nbCarriers,
+                             size_t spacing) :
+    ModCodec(),
+    myNbSymbols(nbSymbols),
+    myNbCarriers(nbCarriers),
+    mySpacing(spacing)
+{   // Computes the carrier-to-bin mapping and opens the IIO devices, channels and buffers of the FFT accelerator.
+    PDEBUG("OfdmGeneratorDEXTER::OfdmGeneratorDEXTER(%zu, %zu, %zu) @ %p\n",
+            nbSymbols, nbCarriers, spacing, this);
+
+    etiLog.level(info) << "Using DEXTER FFT Accelerator for fixed-point transform";
+
+    if (nbCarriers > spacing) {
+        throw std::runtime_error("OfdmGenerator nbCarriers > spacing!");
+    }
+
+    myPosDst = (nbCarriers & 1 ? 0 : 1); // same mapping that OfdmGeneratorFixed uses when inverse is true
+    myPosSrc = 0;
+    myPosSize = (nbCarriers + 1) / 2;
+    myNegDst = spacing - (nbCarriers / 2);
+    myNegSrc = (nbCarriers + 1) / 2;
+    myNegSize = nbCarriers / 2;
+
+    myZeroDst = myPosDst + myPosSize;
+    myZeroSize = myNegDst - myZeroDst;
+
+    PDEBUG("  myPosDst: %u\n", myPosDst);
+    PDEBUG("  myPosSrc: %u\n", myPosSrc);
+    PDEBUG("  myPosSize: %u\n", myPosSize);
+    PDEBUG("  myNegDst: %u\n", myNegDst);
+    PDEBUG("  myNegSrc: %u\n", myNegSrc);
+    PDEBUG("  myNegSize: %u\n", myNegSize);
+    PDEBUG("  myZeroDst: %u\n", myZeroDst);
+    PDEBUG("  myZeroSize: %u\n", myZeroSize);
+
+    const size_t nbytes_in = mySpacing * sizeof(complexfix);
+    const size_t nbytes_out = mySpacing * sizeof(complexfix_wide);
+    // IIO_ENSURE logs `err` with file and line, then throws, whenever `expr` evaluates to false.
+#define IIO_ENSURE(expr, err) { \
+    if (!(expr)) { \
+        etiLog.log(error, "%s (%s:%d)\n", err, __FILE__, __LINE__); \
+        throw std::runtime_error("Failed to set FFT for OfdmGeneratorDEXTER"); \
+    } \
+}
+    IIO_ENSURE((m_ctx = iio_create_default_context()), "No context");
+    IIO_ENSURE(m_dev_in = iio_context_find_device(m_ctx, "fft-accelerator-in"), "no dev");
+    IIO_ENSURE(m_dev_out = iio_context_find_device(m_ctx, "fft-accelerator-out"), "no dev");
+    IIO_ENSURE(m_channel_in = iio_device_find_channel(m_dev_in, "voltage0", true), "no channel");
+    IIO_ENSURE(m_channel_out = iio_device_find_channel(m_dev_out, "voltage0", false), "no channel");
+
+    iio_channel_enable(m_channel_in);
+    iio_channel_enable(m_channel_out);
+    // NOTE(review): libiio documents the 2nd argument as a sample count; a byte count is passed -- presumably 1-byte samples, confirm.
+    m_buf_in = iio_device_create_buffer(m_dev_in, nbytes_in, false);
+    if (!m_buf_in) {
+        throw std::runtime_error("OfdmGeneratorDEXTER could not create in buffer");
+    }
+    // NOTE(review): a throw in this constructor skips the destructor, so m_ctx (and m_buf_in below) look unreleased on failure -- confirm.
+    m_buf_out = iio_device_create_buffer(m_dev_out, nbytes_out, false);
+    if (!m_buf_out) {
+        throw std::runtime_error("OfdmGeneratorDEXTER could not create out buffer");
+    }
+}
+
+OfdmGeneratorDEXTER::~OfdmGeneratorDEXTER()
+{
+    // Release in the reverse order of acquisition: the two buffers first,
+    // then the two channels, and the context everything came from last.
+    struct iio_buffer **buffers[] = { &m_buf_in, &m_buf_out };
+    for (struct iio_buffer **buf : buffers) {
+        if (*buf != nullptr) {
+            iio_buffer_destroy(*buf);
+            *buf = nullptr;
+        }
+    }
+
+    struct iio_channel **channels[] = { &m_channel_in, &m_channel_out };
+    for (struct iio_channel **chan : channels) {
+        if (*chan != nullptr) {
+            iio_channel_disable(*chan);
+            *chan = nullptr;
+        }
+    }
+
+    // Every handle is reset to nullptr once released, so a handle that was
+    // never acquired is simply skipped.
+    if (m_ctx != nullptr) {
+        iio_context_destroy(m_ctx);
+        m_ctx = nullptr;
+    }
+}
+
+/* Run the inverse FFT of every OFDM symbol on the DEXTER FPGA accelerator.
+ *
+ * dataIn must hold myNbSymbols * myNbCarriers complexfix values. dataOut is
+ * resized to myNbSymbols * mySpacing complexfix_wide values and filled with
+ * the accelerator output, one block of mySpacing samples per symbol.
+ *
+ * Returns the number of complexfix_wide samples written to dataOut.
+ * Throws std::runtime_error if a buffer size is inconsistent or if an IIO
+ * push/refill fails.
+ */
+int OfdmGeneratorDEXTER::process(Buffer* const dataIn, Buffer* dataOut)
+{
+    dataOut->setLength(myNbSymbols * mySpacing * sizeof(complexfix_wide));
+
+    complexfix *in = reinterpret_cast<complexfix*>(dataIn->getData());
+    complexfix_wide *out = reinterpret_cast<complexfix_wide*>(dataOut->getData());
+
+    size_t sizeIn = dataIn->getLength() / sizeof(complexfix);
+    size_t sizeOut = dataOut->getLength() / sizeof(complexfix_wide);
+
+    if (sizeIn != myNbSymbols * myNbCarriers) {
+        PDEBUG("Nb symbols: %zu\n", myNbSymbols);
+        PDEBUG("Nb carriers: %zu\n", myNbCarriers);
+        PDEBUG("Spacing: %zu\n", mySpacing);
+        PDEBUG("\n%zu != %zu\n", sizeIn, myNbSymbols * myNbCarriers);
+        throw std::runtime_error(
+                "OfdmGenerator::process input size not valid!");
+    }
+    if (sizeOut != myNbSymbols * mySpacing) {
+        PDEBUG("Nb symbols: %zu\n", myNbSymbols);
+        PDEBUG("Nb carriers: %zu\n", myNbCarriers);
+        PDEBUG("Spacing: %zu\n", mySpacing);
+        // Report the value that actually failed the check (sizeOut, not sizeIn).
+        PDEBUG("\n%zu != %zu\n", sizeOut, myNbSymbols * mySpacing);
+        throw std::runtime_error("OfdmGenerator::process output size not valid!");
+    }
+
+    // The IIO input buffer must hold exactly one symbol of mySpacing bins.
+    ptrdiff_t iio_buf_size = (uint8_t*)iio_buffer_end(m_buf_in) - (uint8_t*)iio_buffer_start(m_buf_in);
+    if (iio_buf_size != (ssize_t)(mySpacing * sizeof(complexfix))) {
+        throw std::runtime_error("OfdmGenerator::process incorrect iio buffer size!");
+    }
+
+    // Fetch one transformed symbol from the accelerator and append it to the
+    // output. Shared by the pipelined loop and by the final drain below.
+    const auto receive_symbol = [&]() {
+        ssize_t nbytes_rx = iio_buffer_refill(m_buf_out);
+        if (nbytes_rx < 0) {
+            throw std::runtime_error("OfdmGenerator::process error refilling IIO buffer!");
+        }
+
+        ptrdiff_t p_inc = iio_buffer_step(m_buf_out);
+        if (p_inc != 1) {
+            throw std::runtime_error("OfdmGenerator::process Wrong p_inc");
+        }
+
+        // The FFT Accelerator takes 16-bit I + 16-bit Q, and outputs 32-bit I and 32-bit Q.
+        // The formatconvert will take care of this
+        const uint8_t *fft_out = (const uint8_t*)iio_buffer_first(m_buf_out, m_channel_out);
+        const uint8_t *fft_out_end = (const uint8_t*)iio_buffer_end(m_buf_out);
+        constexpr size_t sizeof_out_iq = sizeof(complexfix_wide);
+        if ((fft_out_end - fft_out) != (ssize_t)(mySpacing * sizeof_out_iq)) {
+            // %p requires void pointers and the pointer difference is a
+            // ptrdiff_t, hence the casts and the %td conversion.
+            fprintf(stderr, "FFT_OUT: %p %p %td %zu\n",
+                    (const void*)fft_out, (const void*)fft_out_end,
+                    (fft_out_end - fft_out), mySpacing * sizeof_out_iq);
+            throw std::runtime_error("OfdmGenerator::process fft_out length invalid!");
+        }
+
+        memcpy(out, fft_out, mySpacing * sizeof_out_iq);
+        out += mySpacing;
+    };
+
+    for (size_t i = 0; i < myNbSymbols; i++) {
+        complexfix *fft_in = reinterpret_cast<complexfix*>(iio_buffer_start(m_buf_in));
+
+        /* For TM I this is:
+         * ZeroDst=769 ZeroSize=511
+         * PosSrc=0 PosDst=1 PosSize=768
+         * NegSrc=768 NegDst=1280 NegSize=768
+         */
+
+        // Element 0 and the gap [myZeroDst, myZeroDst + myZeroSize) are
+        // zeroed before the carriers are copied in.
+        fft_in[0] = static_cast<complexfix::value_type>(0);
+        for (size_t k = 0; k < myZeroSize; k++) {
+            fft_in[myZeroDst + k] = static_cast<complexfix::value_type>(0);
+        }
+
+        memcpy(&fft_in[myPosDst], &in[myPosSrc], myPosSize * sizeof(complexfix));
+        memcpy(&fft_in[myNegDst], &in[myNegSrc], myNegSize * sizeof(complexfix));
+
+        ssize_t nbytes_tx = iio_buffer_push(m_buf_in);
+        if (nbytes_tx < 0) {
+            throw std::runtime_error("OfdmGenerator::process error pushing IIO buffer!");
+        }
+
+        in += myNbCarriers;
+
+        // Keep one buffer in flight while we're doing shuffling data around here,
+        // this improves performance.
+        // I believe that, by default, IIO allocates four buffers in total.
+        if (i > 0) {
+            receive_symbol();
+        }
+    }
+
+    // Collect the result of the last pushed symbol, which is still in flight.
+    receive_symbol();
+
+    return sizeOut;
+}
+
+#endif // HAVE_DEXTER
diff --git a/src/OfdmGenerator.h b/src/OfdmGenerator.h
index dc1ad46..475b2a4 100644
--- a/src/OfdmGenerator.h
+++ b/src/OfdmGenerator.h
@@ -2,7 +2,7 @@
    Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 Her Majesty
    the Queen in Right of Canada (Communications Research Center Canada)
 
-   Copyright (C) 2023
+   Copyright (C) 2024
    Matthias P. Braendli, matthias.braendli@mpb.li
 
     http://opendigitalradio.org
@@ -33,27 +33,30 @@
 #include "ModPlugin.h"
 #include "RemoteControl.h"
 #include "PAPRStats.h"
-#include "fftw3.h"
+#include "kiss_fft.h"
+
 #include <cstddef>
-#include <vector>
-#include <complex>
 #include <atomic>
+#include <fftw3.h>
 
-typedef std::complex<float> complexf;
+#ifdef HAVE_DEXTER
+#   include <iio.h>
+#endif
 
-class OfdmGenerator : public ModCodec, public RemoteControllable
+// Complex Float uses FFTW
+class OfdmGeneratorCF32 : public ModCodec, public RemoteControllable
 {
     public:
-        OfdmGenerator(size_t nbSymbols,
+        OfdmGeneratorCF32(size_t nbSymbols,
                       size_t nbCarriers,
                       size_t spacing,
                       bool& enableCfr,
                       float& cfrClip,
                       float& cfrErrorClip,
                       bool inverse = true);
-        virtual ~OfdmGenerator();
-        OfdmGenerator(const OfdmGenerator&) = delete;
-        OfdmGenerator& operator=(const OfdmGenerator&) = delete;
+        virtual ~OfdmGeneratorCF32();
+        OfdmGeneratorCF32(const OfdmGeneratorCF32&) = delete;
+        OfdmGeneratorCF32& operator=(const OfdmGeneratorCF32&) = delete;
 
         int process(Buffer* const dataIn, Buffer* dataOut) override;
         const char* name() override { return "OfdmGenerator"; }
@@ -107,4 +110,76 @@ class OfdmGenerator : public ModCodec, public RemoteControllable
         std::deque<double> myMERs;
 };
 
+// Fixed point implementation uses KISS FFT with -DFIXED_POINT=16 (KISS_FLAGS in Makefile.am)
+class OfdmGeneratorFixed : public ModCodec
+{
+    public:
+        OfdmGeneratorFixed(size_t nbSymbols,
+                      size_t nbCarriers,
+                      size_t spacing,
+                      bool inverse = true);
+        virtual ~OfdmGeneratorFixed();
+        OfdmGeneratorFixed(const OfdmGeneratorFixed&) = delete; // non-copyable
+        OfdmGeneratorFixed& operator=(const OfdmGeneratorFixed&) = delete;
+
+        int process(Buffer* const dataIn, Buffer* dataOut) override;
+        const char* name() override { return "OfdmGenerator"; }
+
+    private:
+        kiss_fft_cfg myKissCfg = nullptr;
+        kiss_fft_cpx *myFftIn = nullptr, *myFftOut = nullptr; // never left indeterminate
+
+        const size_t myNbSymbols;
+        const size_t myNbCarriers;
+        const size_t mySpacing;
+        // NOTE(review): the fields below presumably describe the same carrier
+        // layout as in OfdmGeneratorDEXTER; confirm against the constructor.
+        unsigned myPosSrc;
+        unsigned myPosDst;
+        unsigned myPosSize;
+        unsigned myNegSrc;
+        unsigned myNegDst;
+        unsigned myNegSize;
+        unsigned myZeroDst;
+        unsigned myZeroSize;
+};
+
+#ifdef HAVE_DEXTER
+// The PrecisionWave DEXTER device contains an FFT accelerator in FPGA
+// It only does inverse FFTs
+class OfdmGeneratorDEXTER : public ModCodec
+{
+    public:
+        OfdmGeneratorDEXTER(size_t nbSymbols,
+                      size_t nbCarriers,
+                      size_t spacing);
+        virtual ~OfdmGeneratorDEXTER();
+        OfdmGeneratorDEXTER(const OfdmGeneratorDEXTER&) = delete;
+        OfdmGeneratorDEXTER& operator=(const OfdmGeneratorDEXTER&) = delete;
+        // Throws std::runtime_error on a size mismatch or a failed IIO transfer
+        int process(Buffer* const dataIn, Buffer* dataOut) override;
+        const char* name() override { return "OfdmGenerator"; }
+
+    private:
+        struct iio_context *m_ctx = nullptr; // from iio_create_default_context()
 
+        // "in" and "out" are from the point of view of the FFT Accelerator block
+        struct iio_device *m_dev_in = nullptr; // device "fft-accelerator-in"
+        struct iio_channel *m_channel_in = nullptr; // its "voltage0" channel
+        struct iio_buffer *m_buf_in = nullptr; // pushed once per symbol in process()
+
+        struct iio_device *m_dev_out = nullptr; // device "fft-accelerator-out"
+        struct iio_channel *m_channel_out = nullptr; // its "voltage0" channel
+        struct iio_buffer *m_buf_out = nullptr; // refilled once per symbol in process()
+        // Frame geometry, then the carrier layout computed by the constructor
+        const size_t myNbSymbols;
+        const size_t myNbCarriers;
+        const size_t mySpacing;
+        unsigned myPosSrc; // first input index of the block copied to myPosDst
+        unsigned myPosDst; // 0 for an odd carrier count, 1 for an even one
+        unsigned myPosSize; // (nbCarriers + 1) / 2
+        unsigned myNegSrc; // (nbCarriers + 1) / 2
+        unsigned myNegDst; // spacing - nbCarriers / 2
+        unsigned myNegSize; // nbCarriers / 2
+        unsigned myZeroDst; // myPosDst + myPosSize, start of the zeroed gap
+        unsigned myZeroSize; // myNegDst - myZeroDst, length of the zeroed gap
+};
+#endif // HAVE_DEXTER
diff --git a/src/OutputMemory.cpp b/src/OutputMemory.cpp
index d6ef917..f673555 100644
--- a/src/OutputMemory.cpp
+++ b/src/OutputMemory.cpp
@@ -26,20 +26,14 @@
 
 #include "OutputMemory.h"
 #include "PcDebug.h"
-#include "Log.h"
-#include "TimestampDecoder.h"
-
-#include <stdexcept>
-#include <string.h>
-#include <math.h>
-
+#include <cmath>
 
 OutputMemory::OutputMemory(Buffer* dataOut)
     : ModOutput()
 {
     PDEBUG("OutputMemory::OutputMemory(%p) @ %p\n", dataOut, this);
 
-    setOutput(dataOut);
+    m_dataOut = dataOut;
 
 #if OUTPUT_MEM_HISTOGRAM
     myMax = 0.0f;
@@ -49,7 +43,6 @@ OutputMemory::OutputMemory(Buffer* dataOut)
 #endif
 }
 
-
 OutputMemory::~OutputMemory()
 {
 #if OUTPUT_MEM_HISTOGRAM
@@ -66,19 +59,12 @@ OutputMemory::~OutputMemory()
     PDEBUG("OutputMemory::~OutputMemory() @ %p\n", this);
 }
 
-
-void OutputMemory::setOutput(Buffer* dataOut)
-{
-    myDataOut = dataOut;
-}
-
-
 int OutputMemory::process(Buffer* dataIn)
 {
     PDEBUG("OutputMemory::process(dataIn: %p)\n",
             dataIn);
 
-    *myDataOut = *dataIn;
+    *m_dataOut = *dataIn;
 
 #if OUTPUT_MEM_HISTOGRAM
     const float* in = (const float*)dataIn->getData();
@@ -93,17 +79,17 @@ int OutputMemory::process(Buffer* dataIn)
     }
 #endif
 
-    return myDataOut->getLength();
+    return m_dataOut->getLength();
 }
 
 meta_vec_t OutputMemory::process_metadata(const meta_vec_t& metadataIn)
 {
-    myMetadata = metadataIn;
+    m_metadata = metadataIn;
     return {};
 }
 
 meta_vec_t OutputMemory::get_latest_metadata()
 {
-    return myMetadata;
+    return m_metadata;
 }
 
diff --git a/src/OutputMemory.h b/src/OutputMemory.h
index f0a5fbb..299d31d 100644
--- a/src/OutputMemory.h
+++ b/src/OutputMemory.h
@@ -61,11 +61,9 @@ public:
 
     meta_vec_t get_latest_metadata(void);
 
-    void setOutput(Buffer* dataOut);
-
 protected:
-    Buffer* myDataOut;
-    meta_vec_t myMetadata;
+    Buffer* m_dataOut;
+    meta_vec_t m_metadata;
 
 #if OUTPUT_MEM_HISTOGRAM
     // keep track of max value
diff --git a/src/PAPRStats.cpp b/src/PAPRStats.cpp
index 0c9764a..103f02f 100644
--- a/src/PAPRStats.cpp
+++ b/src/PAPRStats.cpp
@@ -33,7 +33,6 @@
 #  include <iostream>
 #endif
 
-
 PAPRStats::PAPRStats(size_t num_blocks_to_accumulate) :
     m_num_blocks_to_accumulate(num_blocks_to_accumulate)
 {
diff --git a/src/PAPRStats.h b/src/PAPRStats.h
index 86ad8b0..a4ded86 100644
--- a/src/PAPRStats.h
+++ b/src/PAPRStats.h
@@ -31,12 +31,9 @@
 #endif
 
 #include <cstddef>
-#include <vector>
 #include <deque>
 #include <complex>
 
-typedef std::complex<float> complexf;
-
 /* Helper class to calculate Peak-to-average-power ratio.
  * Definition of PAPR:
  *
@@ -53,6 +50,8 @@ typedef std::complex<float> complexf;
  */
 class PAPRStats
 {
+    typedef std::complex<float> complexf;
+
     public:
         PAPRStats(size_t num_blocks_to_accumulate);
 
diff --git a/src/PhaseReference.cpp b/src/PhaseReference.cpp
index 568e15e..71dec87 100644
--- a/src/PhaseReference.cpp
+++ b/src/PhaseReference.cpp
@@ -29,12 +29,10 @@
 
 #include <stdexcept>
 
-using complexf = std::complex<float>;
-
 /* ETSI EN 300 401 Table 43 (Clause 14.3.2)
  * Contains h_{i,k} values
  */
-const uint8_t PhaseReference::d_h[4][32] = {
+static const uint8_t d_h[4][32] = {
     /* h0 */ { 0, 2, 0, 0, 0, 0, 1, 1, 2, 0, 0, 0, 2, 2, 1, 1,
         0, 2, 0, 0, 0, 0, 1, 1, 2, 0, 0, 0, 2, 2, 1, 1 },
     /* h1 */ { 0, 3, 2, 3, 0, 1, 3, 0, 2, 1, 2, 3, 2, 3, 3, 0,
@@ -54,41 +52,80 @@ const uint8_t PhaseReference::d_h[4][32] = {
  * Tables 44 to 47 describe the frequency interleaving done in
  * FrequencyInterleaver.
  */
-PhaseReference::PhaseReference(unsigned int dabmode) :
+PhaseReference::PhaseReference(unsigned int dabmode, bool fixedPoint) :
     ModInput(),
-    d_dabmode(dabmode)
+    d_dabmode(dabmode),
+    d_fixedPoint(fixedPoint)
 {
     PDEBUG("PhaseReference::PhaseReference(%u) @ %p\n", dabmode, this);
 
     switch (d_dabmode) {
         case 1:
             d_carriers = 1536;
-            d_num = 2048;
             break;
         case 2:
             d_carriers = 384;
-            d_num = 512;
             break;
         case 3:
             d_carriers = 192;
-            d_num = 256;
             break;
         case 4:
             d_dabmode = 0;
         case 0:
             d_carriers = 768;
-            d_num = 1024;
             break;
         default:
             throw std::runtime_error(
                     "PhaseReference::PhaseReference DAB mode not valid!");
     }
-    d_dataIn.resize(d_carriers);
-    fillData();
+
+    if (d_fixedPoint) {
+        d_phaseRefFixed.fillData(d_dabmode, d_carriers);
+    }
+    else {
+        d_phaseRefCF32.fillData(d_dabmode, d_carriers);
+    }
 }
 
 
-complexf convert(uint8_t data) {
+static const int table[][48][2] = {
+    { // Mode 0/4
+        // Positive part
+        { 0, 0 }, { 3, 1 }, { 2, 0 }, { 1, 2 }, { 0, 0 }, { 3, 1 },
+        { 2, 2 }, { 1, 2 }, { 0, 2 }, { 3, 1 }, { 2, 3 }, { 1, 0 },
+        // Negative part
+        { 0, 0 }, { 1, 1 }, { 2, 1 }, { 3, 2 }, { 0, 2 }, { 1, 2 },
+        { 2, 0 }, { 3, 3 }, { 0, 3 }, { 1, 1 }, { 2, 3 }, { 3, 2 },
+    },
+    { // Mode 1
+        // Positive part
+        { 0, 3 }, { 3, 1 }, { 2, 1 }, { 1, 1 }, { 0, 2 }, { 3, 2 },
+        { 2, 1 }, { 1, 0 }, { 0, 2 }, { 3, 2 }, { 2, 3 }, { 1, 3 },
+        { 0, 0 }, { 3, 2 }, { 2, 1 }, { 1, 3 }, { 0, 3 }, { 3, 3 },
+        { 2, 3 }, { 1, 0 }, { 0, 3 }, { 3, 0 }, { 2, 1 }, { 1, 1 },
+        // Negative part
+        { 0, 1 }, { 1, 2 }, { 2, 0 }, { 3, 1 }, { 0, 3 }, { 1, 2 },
+        { 2, 2 }, { 3, 3 }, { 0, 2 }, { 1, 1 }, { 2, 2 }, { 3, 3 },
+        { 0, 1 }, { 1, 2 }, { 2, 3 }, { 3, 3 }, { 0, 2 }, { 1, 2 },
+        { 2, 2 }, { 3, 1 }, { 0, 1 }, { 1, 3 }, { 2, 1 }, { 3, 2 },
+    },
+    { // Mode 2
+        // Positive part
+        { 2, 0 }, { 1, 2 }, { 0, 2 }, { 3, 1 }, { 2, 0 }, { 1, 3 },
+        // Negative part
+        { 0, 2 }, { 1, 3 }, { 2, 2 }, { 3, 2 }, { 0, 1 }, { 1, 2 },
+    },
+    { // Mode 3
+        // Positive part
+        { 3, 2 }, { 2, 2 }, { 1, 2 },
+        // Negative part
+        { 0, 2 }, { 1, 3 }, { 2, 0 },
+    },
+};
+
+
+template <>
+complexf PhaseRefGen<complexf>::convert(uint8_t data) {
     const complexf value[] = {
         complexf(1, 0),
         complexf(0, 1),
@@ -98,62 +135,37 @@ complexf convert(uint8_t data) {
     return value[data % 4];
 }
 
+template <>
+complexfix PhaseRefGen<complexfix>::convert(uint8_t data) {
+    constexpr auto one = fixed_16{1};
+    constexpr auto zero = fixed_16{0};
 
-void PhaseReference::fillData()
-{
-    const int table[][48][2] = {
-        { // Mode 0/4
-            // Positive part
-            { 0, 0 }, { 3, 1 }, { 2, 0 }, { 1, 2 }, { 0, 0 }, { 3, 1 },
-            { 2, 2 }, { 1, 2 }, { 0, 2 }, { 3, 1 }, { 2, 3 }, { 1, 0 },
-            // Negative part
-            { 0, 0 }, { 1, 1 }, { 2, 1 }, { 3, 2 }, { 0, 2 }, { 1, 2 },
-            { 2, 0 }, { 3, 3 }, { 0, 3 }, { 1, 1 }, { 2, 3 }, { 3, 2 },
-        },
-        { // Mode 1
-            // Positive part
-            { 0, 3 }, { 3, 1 }, { 2, 1 }, { 1, 1 }, { 0, 2 }, { 3, 2 },
-            { 2, 1 }, { 1, 0 }, { 0, 2 }, { 3, 2 }, { 2, 3 }, { 1, 3 },
-            { 0, 0 }, { 3, 2 }, { 2, 1 }, { 1, 3 }, { 0, 3 }, { 3, 3 },
-            { 2, 3 }, { 1, 0 }, { 0, 3 }, { 3, 0 }, { 2, 1 }, { 1, 1 },
-            // Negative part
-            { 0, 1 }, { 1, 2 }, { 2, 0 }, { 3, 1 }, { 0, 3 }, { 1, 2 },
-            { 2, 2 }, { 3, 3 }, { 0, 2 }, { 1, 1 }, { 2, 2 }, { 3, 3 },
-            { 0, 1 }, { 1, 2 }, { 2, 3 }, { 3, 3 }, { 0, 2 }, { 1, 2 },
-            { 2, 2 }, { 3, 1 }, { 0, 1 }, { 1, 3 }, { 2, 1 }, { 3, 2 },
-        },
-        { // Mode 2
-            // Positive part
-            { 2, 0 }, { 1, 2 }, { 0, 2 }, { 3, 1 }, { 2, 0 }, { 1, 3 },
-            // Negative part
-            { 0, 2 }, { 1, 3 }, { 2, 2 }, { 3, 2 }, { 0, 1 }, { 1, 2 },
-        },
-        { // Mode 3
-            // Positive part
-            { 3, 2 }, { 2, 2 }, { 1, 2 },
-            // Negative part
-            { 0, 2 }, { 1, 3 }, { 2, 0 },
-        },
+    const complexfix value[] = {
+        complexfix(one, zero),
+        complexfix(zero, one),
+        complexfix(-one, zero),
+        complexfix(zero, -one),
     };
+    return value[data % 4];
+}
 
-    if (d_dabmode > 3) {
-        throw std::runtime_error(
-                "PhaseReference::fillData invalid DAB mode!");
-    }
-
-    if (d_dataIn.size() != d_carriers) {
+template <typename T>
+void PhaseRefGen<T>::fillData(unsigned int dabmode, size_t carriers)
+{
+    dataIn.resize(carriers);
+    if (dataIn.size() != carriers) {
         throw std::runtime_error(
-                "PhaseReference::fillData d_dataIn has incorrect size!");
+                "PhaseReference::fillData dataIn has incorrect size!");
     }
 
     for (size_t index = 0,
                 offset = 0;
-                index < d_dataIn.size();
+                index < dataIn.size();
                 ++offset) {
         for (size_t k = 0; k < 32; ++k) {
-            d_dataIn[index++] = convert(
-                    d_h[ table[d_dabmode][offset][0] ][k] +
-                    table[d_dabmode][offset][1] );
+            dataIn[index++] = convert(
+                    d_h[ table[dabmode][offset][0] ][k] +
+                    table[dabmode][offset][1] );
         }
     }
 }
@@ -163,7 +175,12 @@ int PhaseReference::process(Buffer* dataOut)
 {
     PDEBUG("PhaseReference::process(dataOut: %p)\n", dataOut);
 
-    dataOut->setData(&d_dataIn[0], d_carriers * sizeof(complexf));
+    if (d_fixedPoint) {
+        dataOut->setData(d_phaseRefFixed.dataIn.data(), d_carriers * sizeof(complexfix));
+    }
+    else {
+        dataOut->setData(d_phaseRefCF32.dataIn.data(), d_carriers * sizeof(complexf));
+    }
 
     return 1;
 }
diff --git a/src/PhaseReference.h b/src/PhaseReference.h
index 6ecdc4e..735009c 100644
--- a/src/PhaseReference.h
+++ b/src/PhaseReference.h
@@ -32,25 +32,33 @@
 
 #include "ModPlugin.h"
 
-#include <cstddef>
-#include <complex>
 #include <vector>
+#include <cstddef>
+
+template <typename T>
+struct PhaseRefGen { // phase reference values for sample type T
+    std::vector<T> dataIn; // resized to `carriers` entries and filled by fillData()
+    void fillData(unsigned int dabmode, size_t carriers);
+    // convert() is specialised per sample type in PhaseReference.cpp
+    private:
+    T convert(uint8_t data); // returns one of four fixed values selected by data % 4
+};
+
 
 class PhaseReference : public ModInput
 {
     public:
-        PhaseReference(unsigned int dabmode);
+        PhaseReference(unsigned int dabmode, bool fixedPoint);
 
         int process(Buffer* dataOut) override;
         const char* name() override { return "PhaseReference"; }
 
     protected:
         unsigned int d_dabmode;
+        bool d_fixedPoint;
         size_t d_carriers;
-        size_t d_num;
-        const static uint8_t d_h[4][32];
-        std::vector<std::complex<float> > d_dataIn;
 
-        void fillData();
+        PhaseRefGen<complexf> d_phaseRefCF32;
+        PhaseRefGen<complexfix> d_phaseRefFixed;
 };
 
diff --git a/src/QpskSymbolMapper.cpp b/src/QpskSymbolMapper.cpp
index e26853a..c12ad80 100644
--- a/src/QpskSymbolMapper.cpp
+++ b/src/QpskSymbolMapper.cpp
@@ -23,7 +23,6 @@
 #include <cstdio>
 #include <cstring>
 #include <stdexcept>
-#include <complex>
 #include <cmath>
 #ifdef __SSE__
 #   include <xmmintrin.h>
@@ -32,12 +31,10 @@
 #include "QpskSymbolMapper.h"
 #include "PcDebug.h"
 
-
-typedef std::complex<float> complexf;
-
-QpskSymbolMapper::QpskSymbolMapper(size_t carriers) :
+QpskSymbolMapper::QpskSymbolMapper(size_t carriers, bool fixedPoint) :
     ModCodec(),
-    d_carriers(carriers) { }
+    m_fixedPoint(fixedPoint),
+    m_carriers(carriers) { }
 
 int QpskSymbolMapper::process(Buffer* const dataIn, Buffer* dataOut)
 {
@@ -45,112 +42,172 @@ int QpskSymbolMapper::process(Buffer* const dataIn, Buffer* dataOut)
             "(dataIn: %p, dataOut: %p)\n",
             dataIn, dataOut);
 
-    dataOut->setLength(dataIn->getLength() * 4 * 2 * sizeof(float));   // 4 output complex symbols per input byte
-#ifdef __SSE__
-    const uint8_t* in = reinterpret_cast<const uint8_t*>(dataIn->getData());
-    __m128* out = reinterpret_cast<__m128*>(dataOut->getData());
-
-    if (dataIn->getLength() % (d_carriers / 4) != 0) {
-        throw std::runtime_error(
-                "QpskSymbolMapper::process input size not valid: " +
-                std::to_string(dataIn->getLength()) +
-                "(input size) % (" + std::to_string(d_carriers) +
-                " (carriers) / 4) != 0");
-    }
+    // 4 output complex symbols per input byte
+
+    if (m_fixedPoint) {
+        dataOut->setLength(dataIn->getLength() * 4 * sizeof(complexfix));
+
+        using fixed_t = complexfix::value_type;
 
-    const static __m128 symbols[16] = {
-        _mm_setr_ps( M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2),
-        _mm_setr_ps( M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2),
-        _mm_setr_ps( M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2),
-        _mm_setr_ps( M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2),
-        _mm_setr_ps( M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2),
-        _mm_setr_ps( M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2),
-        _mm_setr_ps( M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2),
-        _mm_setr_ps( M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2),
-        _mm_setr_ps(-M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2),
-        _mm_setr_ps(-M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2),
-        _mm_setr_ps(-M_SQRT1_2,- M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2),
-        _mm_setr_ps(-M_SQRT1_2,- M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2),
-        _mm_setr_ps(-M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2),
-        _mm_setr_ps(-M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2),
-        _mm_setr_ps(-M_SQRT1_2,- M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2),
-        _mm_setr_ps(-M_SQRT1_2,- M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2)
-    };
-    size_t inOffset = 0;
-    size_t outOffset = 0;
-    uint8_t tmp = 0;
-    for (size_t i = 0; i < dataIn->getLength(); i += d_carriers / 4) {
-        for (size_t j = 0; j < d_carriers / 8; ++j) {
-            tmp =  (in[inOffset] & 0xc0) >> 4;
-            tmp |= (in[inOffset + (d_carriers / 8)] & 0xc0) >> 6;
-            out[outOffset] = symbols[tmp];
-            tmp =  (in[inOffset] & 0x30) >> 2;
-            tmp |= (in[inOffset + (d_carriers / 8)] & 0x30) >> 4;
-            out[outOffset + 1] = symbols[tmp];
-            tmp =  (in[inOffset] & 0x0c);
-            tmp |= (in[inOffset + (d_carriers / 8)] & 0x0c) >> 2;
-            out[outOffset + 2] = symbols[tmp];
-            tmp =  (in[inOffset] & 0x03) << 2;
-            tmp |= (in[inOffset + (d_carriers / 8)] & 0x03);
-            out[outOffset + 3] = symbols[tmp];
-            ++inOffset;
-            outOffset += 4;
+        const uint8_t* in = reinterpret_cast<const uint8_t*>(dataIn->getData());
+        fixed_t* out = reinterpret_cast<fixed_t*>(dataOut->getData());
+
+        if (dataIn->getLength() % (m_carriers / 4) != 0) {
+            throw std::runtime_error(
+                    "QpskSymbolMapper::process input size not valid!");
+        }
+
+        constexpr fixed_t v = static_cast<fixed_t>(M_SQRT1_2);
+
+        const static fixed_t symbols[16][4] = {
+            { v,  v,  v,  v},
+            { v,  v,  v, -v},
+            { v, -v,  v,  v},
+            { v, -v,  v, -v},
+            { v,  v, -v,  v},
+            { v,  v, -v, -v},
+            { v, -v, -v,  v},
+            { v, -v, -v, -v},
+            {-v,  v,  v,  v},
+            {-v,  v,  v, -v},
+            {-v, -v,  v,  v},
+            {-v, -v,  v, -v},
+            {-v,  v, -v,  v},
+            {-v,  v, -v, -v},
+            {-v, -v, -v,  v},
+            {-v, -v, -v, -v}
+        };
+        size_t inOffset = 0;
+        size_t outOffset = 0;
+        uint8_t tmp;
+        for (size_t i = 0; i < dataIn->getLength(); i += m_carriers / 4) {
+            for (size_t j = 0; j < m_carriers / 8; ++j) {
+                tmp =  (in[inOffset] & 0xc0) >> 4;
+                tmp |= (in[inOffset + (m_carriers / 8)] & 0xc0) >> 6;
+                memcpy(&out[outOffset], symbols[tmp], sizeof(fixed_t) * 4);
+                tmp =  (in[inOffset] & 0x30) >> 2;
+                tmp |= (in[inOffset + (m_carriers / 8)] & 0x30) >> 4;
+                memcpy(&out[outOffset + 4], symbols[tmp], sizeof(fixed_t) * 4);
+                tmp =  (in[inOffset] & 0x0c);
+                tmp |= (in[inOffset + (m_carriers / 8)] & 0x0c) >> 2;
+                memcpy(&out[outOffset + 8], symbols[tmp], sizeof(fixed_t) * 4);
+                tmp =  (in[inOffset] & 0x03) << 2;
+                tmp |= (in[inOffset + (m_carriers / 8)] & 0x03);
+                memcpy(&out[outOffset + 12], symbols[tmp], sizeof(fixed_t) * 4);
+                ++inOffset;
+                outOffset += 4*4;
+            }
+            inOffset += m_carriers / 8;
         }
-        inOffset += d_carriers / 8;
     }
+    else {
+        dataOut->setLength(dataIn->getLength() * 4 * sizeof(complexf));
+#ifdef __SSE__
+        const uint8_t* in = reinterpret_cast<const uint8_t*>(dataIn->getData());
+        __m128* out = reinterpret_cast<__m128*>(dataOut->getData());
+
+        if (dataIn->getLength() % (m_carriers / 4) != 0) {
+            throw std::runtime_error(
+                    "QpskSymbolMapper::process input size not valid: " +
+                    std::to_string(dataIn->getLength()) +
+                    "(input size) % (" + std::to_string(m_carriers) +
+                    " (carriers) / 4) != 0");
+        }
+
+        const static __m128 symbols[16] = {
+            _mm_setr_ps( M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2),
+            _mm_setr_ps( M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2),
+            _mm_setr_ps( M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2),
+            _mm_setr_ps( M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2),
+            _mm_setr_ps( M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2),
+            _mm_setr_ps( M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2),
+            _mm_setr_ps( M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2),
+            _mm_setr_ps( M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2),
+            _mm_setr_ps(-M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2),
+            _mm_setr_ps(-M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2),
+            _mm_setr_ps(-M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2),
+            _mm_setr_ps(-M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2),
+            _mm_setr_ps(-M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2),
+            _mm_setr_ps(-M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2),
+            _mm_setr_ps(-M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2),
+            _mm_setr_ps(-M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2)
+        };
+        size_t inOffset = 0;
+        size_t outOffset = 0;
+        uint8_t tmp = 0;
+        for (size_t i = 0; i < dataIn->getLength(); i += m_carriers / 4) {
+            for (size_t j = 0; j < m_carriers / 8; ++j) {
+                tmp =  (in[inOffset] & 0xc0) >> 4;
+                tmp |= (in[inOffset + (m_carriers / 8)] & 0xc0) >> 6;
+                out[outOffset] = symbols[tmp];
+                tmp =  (in[inOffset] & 0x30) >> 2;
+                tmp |= (in[inOffset + (m_carriers / 8)] & 0x30) >> 4;
+                out[outOffset + 1] = symbols[tmp];
+                tmp =  (in[inOffset] & 0x0c);
+                tmp |= (in[inOffset + (m_carriers / 8)] & 0x0c) >> 2;
+                out[outOffset + 2] = symbols[tmp];
+                tmp =  (in[inOffset] & 0x03) << 2;
+                tmp |= (in[inOffset + (m_carriers / 8)] & 0x03);
+                out[outOffset + 3] = symbols[tmp];
+                ++inOffset;
+                outOffset += 4;
+            }
+            inOffset += m_carriers / 8;
+        }
 #else // !__SSE__
-    const uint8_t* in = reinterpret_cast<const uint8_t*>(dataIn->getData());
-    float* out = reinterpret_cast<float*>(dataOut->getData());
-    if (dataIn->getLength() % (d_carriers / 4) != 0) {
-        throw std::runtime_error(
-                "QpskSymbolMapper::process input size not valid!");
-    }
-    if (dataOut->getLength() / sizeof(float) != dataIn->getLength() * 4 * 2) {    // 4 output complex symbols per input byte
-        throw std::runtime_error(
-                "QpskSymbolMapper::process output size not valid!");
-    }
+        const uint8_t* in = reinterpret_cast<const uint8_t*>(dataIn->getData());
+        float* out = reinterpret_cast<float*>(dataOut->getData());
+        if (dataIn->getLength() % (m_carriers / 4) != 0) {
+            throw std::runtime_error(
+                    "QpskSymbolMapper::process input size not valid!");
+        }
+        if (dataOut->getLength() / sizeof(float) != dataIn->getLength() * 4 * 2) {    // 4 output complex symbols per input byte
+            throw std::runtime_error(
+                    "QpskSymbolMapper::process output size not valid!");
+        }
 
-    const static float symbols[16][4] = {
-        { M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2},
-        { M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2},
-        { M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2},
-        { M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2},
-        { M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2},
-        { M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2},
-        { M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2},
-        { M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2},
-        {-M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2},
-        {-M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2},
-        {-M_SQRT1_2,- M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2},
-        {-M_SQRT1_2,- M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2},
-        {-M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2},
-        {-M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2},
-        {-M_SQRT1_2,- M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2},
-        {-M_SQRT1_2,- M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2}
-    };
-    size_t inOffset = 0;
-    size_t outOffset = 0;
-    uint8_t tmp;
-    for (size_t i = 0; i < dataIn->getLength(); i += d_carriers / 4) {
-        for (size_t j = 0; j < d_carriers / 8; ++j) {
-            tmp =  (in[inOffset] & 0xc0) >> 4;
-            tmp |= (in[inOffset + (d_carriers / 8)] & 0xc0) >> 6;
-            memcpy(&out[outOffset], symbols[tmp], sizeof(float) * 4);
-            tmp =  (in[inOffset] & 0x30) >> 2;
-            tmp |= (in[inOffset + (d_carriers / 8)] & 0x30) >> 4;
-            memcpy(&out[outOffset + 4], symbols[tmp], sizeof(float) * 4);
-            tmp =  (in[inOffset] & 0x0c);
-            tmp |= (in[inOffset + (d_carriers / 8)] & 0x0c) >> 2;
-            memcpy(&out[outOffset + 8], symbols[tmp], sizeof(float) * 4);
-            tmp =  (in[inOffset] & 0x03) << 2;
-            tmp |= (in[inOffset + (d_carriers / 8)] & 0x03);
-            memcpy(&out[outOffset + 12], symbols[tmp], sizeof(float) * 4);
-            ++inOffset;
-            outOffset += 4*4;
+        const static float symbols[16][4] = {
+            { M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2},
+            { M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2},
+            { M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2},
+            { M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2},
+            { M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2},
+            { M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2},
+            { M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2},
+            { M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2},
+            {-M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2},
+            {-M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2},
+            {-M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2},
+            {-M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2},
+            {-M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2},
+            {-M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2},
+            {-M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2},
+            {-M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2}
+        };
+        size_t inOffset = 0;
+        size_t outOffset = 0;
+        uint8_t tmp;
+        for (size_t i = 0; i < dataIn->getLength(); i += m_carriers / 4) {
+            for (size_t j = 0; j < m_carriers / 8; ++j) {
+                tmp =  (in[inOffset] & 0xc0) >> 4;
+                tmp |= (in[inOffset + (m_carriers / 8)] & 0xc0) >> 6;
+                memcpy(&out[outOffset], symbols[tmp], sizeof(float) * 4);
+                tmp =  (in[inOffset] & 0x30) >> 2;
+                tmp |= (in[inOffset + (m_carriers / 8)] & 0x30) >> 4;
+                memcpy(&out[outOffset + 4], symbols[tmp], sizeof(float) * 4);
+                tmp =  (in[inOffset] & 0x0c);
+                tmp |= (in[inOffset + (m_carriers / 8)] & 0x0c) >> 2;
+                memcpy(&out[outOffset + 8], symbols[tmp], sizeof(float) * 4);
+                tmp =  (in[inOffset] & 0x03) << 2;
+                tmp |= (in[inOffset + (m_carriers / 8)] & 0x03);
+                memcpy(&out[outOffset + 12], symbols[tmp], sizeof(float) * 4);
+                ++inOffset;
+                outOffset += 4*4;
+            }
+            inOffset += m_carriers / 8;
         }
-        inOffset += d_carriers / 8;
-    }
 #endif // __SSE__
+    }
 
     return 1;
 }
diff --git a/src/QpskSymbolMapper.h b/src/QpskSymbolMapper.h
index dbcf4dd..6cf7a2e 100644
--- a/src/QpskSymbolMapper.h
+++ b/src/QpskSymbolMapper.h
@@ -31,12 +31,13 @@
 class QpskSymbolMapper : public ModCodec
 {
 public:
-    QpskSymbolMapper(size_t carriers);
+    QpskSymbolMapper(size_t carriers, bool fixedPoint);
 
     int process(Buffer* const dataIn, Buffer* dataOut);
     const char* name() { return "QpskSymbolMapper"; }
 
 protected:
-    size_t d_carriers;
+    bool m_fixedPoint;
+    size_t m_carriers;
 };
 
diff --git a/src/Resampler.h b/src/Resampler.h
index d1a9f7a..2c810f6 100644
--- a/src/Resampler.h
+++ b/src/Resampler.h
@@ -37,9 +37,6 @@
 #define FFT_TYPE fftwf_complex
 #define FFT_PLAN fftwf_plan
 
-#include <complex>
-typedef std::complex<float> complexf;
-
 
 class Resampler : public ModCodec
 {
diff --git a/src/SignalMultiplexer.cpp b/src/SignalMultiplexer.cpp
index 1d95bdd..d4955d0 100644
--- a/src/SignalMultiplexer.cpp
+++ b/src/SignalMultiplexer.cpp
@@ -22,25 +22,20 @@
 #include "SignalMultiplexer.h"
 #include "PcDebug.h"
 
-#include <stdio.h>
-#include <stdexcept>
+#include <cstdio>
 #include <assert.h>
-#include <string.h>
 
 
-SignalMultiplexer::SignalMultiplexer(size_t framesize) :
-    ModMux(),
-    d_frameSize(framesize)
+SignalMultiplexer::SignalMultiplexer() :
+    ModMux()
 {
-    PDEBUG("SignalMultiplexer::SignalMultiplexer(%zu) @ %p\n", framesize, this);
-
+    PDEBUG("SignalMultiplexer::SignalMultiplexer() @ %p\n", this);
 }
 
 
 SignalMultiplexer::~SignalMultiplexer()
 {
     PDEBUG("SignalMultiplexer::~SignalMultiplexer() @ %p\n", this);
-
 }
 
 
diff --git a/src/SignalMultiplexer.h b/src/SignalMultiplexer.h
index 5186a8d..1f6bc12 100644
--- a/src/SignalMultiplexer.h
+++ b/src/SignalMultiplexer.h
@@ -36,7 +36,7 @@
 class SignalMultiplexer : public ModMux
 {
 public:
-    SignalMultiplexer(size_t frameSize);
+    SignalMultiplexer();
     virtual ~SignalMultiplexer();
     SignalMultiplexer(const SignalMultiplexer&);
     SignalMultiplexer& operator=(const SignalMultiplexer&);
@@ -44,8 +44,5 @@ public:
 
     int process(std::vector<Buffer*> dataIn, Buffer* dataOut);
     const char* name() { return "SignalMultiplexer"; }
-
-protected:
-    size_t d_frameSize;
 };
 
diff --git a/src/TII.cpp b/src/TII.cpp
index 2656cbf..bce15aa 100644
--- a/src/TII.cpp
+++ b/src/TII.cpp
@@ -2,7 +2,7 @@
    Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 Her Majesty
    the Queen in Right of Canada (Communications Research Center Canada)
 
-   Copyright (C) 2023
+   Copyright (C) 2024
    Matthias P. Braendli, matthias.braendli@mpb.li
 
     http://opendigitalradio.org
@@ -27,11 +27,8 @@
 #include "TII.h"
 #include "PcDebug.h"
 
-#include <stdio.h>
-#include <stdexcept>
-#include <string.h>
-
-typedef std::complex<float> complexf;
+#include <cstdio>
+#include <cstring>
 
 /* TII pattern for TM I, II, IV */
 const int pattern_tm1_2_4[][8] = { // {{{
@@ -106,11 +103,12 @@ const int pattern_tm1_2_4[][8] = { // {{{
     {1,1,1,0,1,0,0,0},
     {1,1,1,1,0,0,0,0} }; // }}}
 
-TII::TII(unsigned int dabmode, tii_config_t& tii_config) :
+TII::TII(unsigned int dabmode, tii_config_t& tii_config, bool fixedPoint) :
     ModCodec(),
     RemoteControllable("tii"),
     m_dabmode(dabmode),
-    m_conf(tii_config)
+    m_conf(tii_config),
+    m_fixedPoint(fixedPoint)
 {
     PDEBUG("TII::TII(%u) @ %p\n", dabmode, this);
 
@@ -171,56 +169,72 @@ const char* TII::name()
     return m_name.c_str();
 }
 
+/* Copy the TII-enabled carriers from dataIn to dataOut as samples of type T.
+ * Both buffers are treated as arrays of `carriers` samples; accesses beyond
+ * that count are skipped. Carriers that are not enabled in Acp are left
+ * untouched in dataOut. */
+template<typename T>
+void do_process(size_t carriers, bool old_variant, const std::vector<bool>& Acp, Buffer* dataIn, Buffer* dataOut)
+{
+    const T* in = reinterpret_cast<const T*>(dataIn->getData());
+    T* out = reinterpret_cast<T*>(dataOut->getData());
+
+    /* Normalise the TII carrier power according to ETSI TR 101 496-3
+     * Clause 5.4.2.2 Paragraph 7:
+     *
+     * > The ratio of carriers in a TII symbol to a normal DAB symbol
+     * > is 1:48 for all Modes, so that the signal power in a TII symbol is
+     * > 16 dB below the signal power of the other symbols.
+     *
+     * This is because we only enable 32 out of 1536 carriers, not because
+     * every carrier is lower power.
+     */
+    for (size_t i = 0; i < Acp.size() and i < carriers; i++) {
+        /* See header file for an explanation of the old variant.
+         *
+         * A_{c,p}(k) and A_{c,p}(k-1) are never both simultaneously true,
+         * so instead of doing the sum inside z_{m,0,k}, we could do
+         * (considering only the new variant)
+         *   if (Acp[i]) out[i] = in[i];
+         *   if (Acp[i-1]) out[i] = in[i-1];
+         * Substituting j = i-1 in the second line gives
+         *   if (Acp[j]) out[j+1] = in[j];
+         * and the two conditionals are fused together below. The second
+         * write is skipped when i+1 would fall outside the buffers. */
+        if (Acp[i]) {
+            out[i] = in[i];
+            if (i + 1 < carriers) {
+                out[i+1] = (old_variant ? in[i+1] : in[i]);
+            }
+        }
+    }
+}
 
 int TII::process(Buffer* dataIn, Buffer* dataOut)
 {
+    const size_t sizeof_samples = m_fixedPoint ? sizeof(complexfix) : sizeof(complexf);
+
     PDEBUG("TII::process(dataOut: %p)\n",
             dataOut);
     if (    (dataIn == NULL) or
-            (dataIn->getLength() != m_carriers * sizeof(complexf))) {
+            (dataIn->getLength() != m_carriers * sizeof_samples)) {
         throw TIIError("TII::process input size not valid!");
     }
 
-    dataOut->setLength(m_carriers * sizeof(complexf));
-    memset(dataOut->getData(), 0,  dataOut->getLength());
+    dataOut->setLength(m_carriers * sizeof_samples);
+    memset(dataOut->getData(), 0, dataOut->getLength());
 
     if (m_conf.enable and m_insert) {
         std::lock_guard<std::mutex> lock(m_enabled_carriers_mutex);
-        complexf* in = reinterpret_cast<complexf*>(dataIn->getData());
-        complexf* out = reinterpret_cast<complexf*>(dataOut->getData());
-
-        /* Normalise the TII carrier power according to ETSI TR 101 496-3
-         * Clause 5.4.2.2 Paragraph 7:
-         *
-         * > The ratio of carriers in a TII symbol to a normal DAB symbol
-         * > is 1:48 for all Modes, so that the signal power in a TII symbol is
-         * > 16 dB below the signal power of the other symbols.
-         *
-         * This is because we only enable 32 out of 1536 carriers, not because
-         * every carrier is lower power.
-         */
-        for (size_t i = 0; i < m_Acp.size(); i++) {
-            /* See header file for an explanation of the old variant.
-             *
-             * A_{c,p}(k) and A_{c,p}(k-1) are never both simultaneously true,
-             * so instead of doing the sum inside z_{m,0,k}, we could do
-             *
-             * if (m_Acp[i]) out[i] = in[i];
-             * if (m_Acp[i-1]) out[i] = in[i-1]
-             *
-             * (Considering only the new variant)
-             *
-             * To avoid messing with indices, we substitute j = i-1
-             *
-             * if (m_Acp[i]) out[i] = in[i];
-             * if (m_Acp[j]) out[j+1] = in[j]
-             *
-             * and fuse the two conditionals together:
-             */
-            if (m_Acp[i]) {
-                out[i] = in[i];
-                out[i+1] = (m_conf.old_variant ? in[i+1] : in[i]);
-            }
+        if (m_fixedPoint) {
+            do_process<complexfix>(
+                    m_carriers, m_conf.old_variant, m_Acp,
+                    dataIn, dataOut);
+        }
+        else {
+            do_process<complexf>(
+                    m_carriers, m_conf.old_variant, m_Acp,
+                    dataIn, dataOut);
         }
     }
 
diff --git a/src/TII.h b/src/TII.h
index f6de70b..6fe4d4f 100644
--- a/src/TII.h
+++ b/src/TII.h
@@ -2,7 +2,7 @@
    Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 Her Majesty
    the Queen in Right of Canada (Communications Research Center Canada)
 
-   Copyright (C) 2023
+   Copyright (C) 2024
    Matthias P. Braendli, matthias.braendli@mpb.li
 
     http://opendigitalradio.org
@@ -36,8 +36,6 @@
 #include "RemoteControl.h"
 
 #include <cstddef>
-#include <thread>
-#include <complex>
 #include <vector>
 #include <string>
 
@@ -81,7 +79,7 @@ class TIIError : public std::runtime_error {
 class TII : public ModCodec, public RemoteControllable
 {
     public:
-        TII(unsigned int dabmode, tii_config_t& tii_config);
+        TII(unsigned int dabmode, tii_config_t& tii_config, bool fixedPoint);
         virtual ~TII() {}
 
         int process(Buffer* dataIn, Buffer* dataOut) override;
@@ -106,6 +104,8 @@ class TII : public ModCodec, public RemoteControllable
         // Remote-controllable settings
         tii_config_t& m_conf;
 
+        bool m_fixedPoint = false;
+
         // Internal flag when to insert TII
         bool m_insert = true;
 
diff --git a/src/Utils.cpp b/src/Utils.cpp
index fa2fd5d..f947acd 100644
--- a/src/Utils.cpp
+++ b/src/Utils.cpp
@@ -66,6 +66,9 @@ static void printHeader()
 #if defined(__SSE__)
         "SSE " <<
 #endif
+#if defined(__ARM_NEON)
+        "NEON " <<
+#endif
         "\n";
 }
 
diff --git a/src/output/Dexter.h b/src/output/Dexter.h
index d4f425f..f8a17ba 100644
--- a/src/output/Dexter.h
+++ b/src/output/Dexter.h
@@ -98,16 +98,16 @@ class Dexter : public Output::SDRDevice
 
         SDRDeviceConfig& m_conf;
 
-        struct iio_context* m_ctx = nullptr;
-        struct iio_device* m_dexter_dsp_tx = nullptr;
+        struct iio_context *m_ctx = nullptr;
+        struct iio_device *m_dexter_dsp_tx = nullptr;
 
-        struct iio_device* m_ad9957 = nullptr;
-        struct iio_device* m_ad9957_tx0 = nullptr;
-        struct iio_channel* m_tx_channel = nullptr;
+        struct iio_device *m_ad9957 = nullptr;
+        struct iio_device *m_ad9957_tx0 = nullptr;
+        struct iio_channel *m_tx_channel = nullptr;
         struct iio_buffer *m_buffer = nullptr;
 
         /* Underflows are counted in a separate thread */
-        struct iio_context* m_underflow_ctx = nullptr;
+        struct iio_context *m_underflow_ctx = nullptr;
         std::atomic<bool> m_running = ATOMIC_VAR_INIT(false);
         std::thread m_underflow_read_thread;
         void underflow_read_process();
diff --git a/src/output/SDR.cpp b/src/output/SDR.cpp
index 594171f..22398c7 100644
--- a/src/output/SDR.cpp
+++ b/src/output/SDR.cpp
@@ -34,6 +34,7 @@
 #include "RemoteControl.h"
 #include "Utils.h"
 
+#include <chrono>
 #include <cmath>
 #include <iostream>
 #include <assert.h>
diff --git a/src/output/SDR.h b/src/output/SDR.h
index 960de0c..86bf295 100644
--- a/src/output/SDR.h
+++ b/src/output/SDR.h
@@ -34,16 +34,12 @@ DESCRIPTION:
 #   include <config.h>
 #endif
 
-#include <chrono>
 #include "ModPlugin.h"
-#include "EtiReader.h"
 #include "output/SDRDevice.h"
 #include "output/Feedback.h"
 
 namespace Output {
 
-using complexf = std::complex<float>;
-
 class SDR : public ModOutput, public ModMetadata, public RemoteControllable {
     public:
         SDR(SDRDeviceConfig& config, std::shared_ptr<SDRDevice> device);
diff --git a/src/output/SDRDevice.h b/src/output/SDRDevice.h
index 378829c..ec9373d 100644
--- a/src/output/SDRDevice.h
+++ b/src/output/SDRDevice.h
@@ -38,9 +38,7 @@ DESCRIPTION:
 #include <string>
 #include <vector>
 #include <complex>
-#include <variant>
 #include <optional>
-#include <unordered_map>
 
 #include "TimestampDecoder.h"
 
@@ -59,6 +57,8 @@ struct SDRDeviceConfig {
     std::string tx_antenna;
     std::string rx_antenna;
 
+    bool fixedPoint = false;
+
     long masterClockRate = 32768000;
     unsigned sampleRate = 2048000;
     double frequency = 0.0;
diff --git a/src/output/UHD.cpp b/src/output/UHD.cpp
index e097692..b30f9e1 100644
--- a/src/output/UHD.cpp
+++ b/src/output/UHD.cpp
@@ -31,10 +31,7 @@
 //#define MDEBUG(fmt, args...) fprintf(LOG, fmt , ## args)
 #define MDEBUG(fmt, args...)
 
-#include "PcDebug.h"
 #include "Log.h"
-#include "RemoteControl.h"
-#include "Utils.h"
 
 #include <thread>
 #include <iomanip>
@@ -52,14 +49,12 @@
 # include <uhd/utils/thread_priority.hpp>
 #endif
 
-
-#include <cmath>
 #include <iostream>
-#include <assert.h>
+#include <cmath>
+#include <cassert>
 #include <stdexcept>
-#include <stdio.h>
+#include <cstdio>
 #include <time.h>
-#include <errno.h>
 #include <unistd.h>
 #include <pthread.h>
 
@@ -235,7 +230,8 @@ UHD::UHD(SDRDeviceConfig& config) :
     m_usrp->set_rx_gain(m_conf.rxgain);
     etiLog.log(debug, "OutputUHD:Actual RX Gain: %f", m_usrp->get_rx_gain());
 
-    const uhd::stream_args_t stream_args("fc32"); //complex floats
+    const uhd::stream_args_t stream_args(
+            m_conf.fixedPoint ? "sc16" : "fc32");
     m_rx_stream = m_usrp->get_rx_stream(stream_args);
     m_tx_stream = m_usrp->get_tx_stream(stream_args);
 
@@ -319,8 +315,9 @@ double UHD::get_bandwidth(void) const
 void UHD::transmit_frame(struct FrameData&& frame)
 {
     const double tx_timeout = 20.0;
-    const size_t sizeIn = frame.buf.size() / sizeof(complexf);
-    const complexf* in_data = reinterpret_cast<const complexf*>(&frame.buf[0]);
+
+    const size_t sample_size = m_conf.fixedPoint ? (2 * sizeof(int16_t)) : sizeof(complexf);
+    const size_t sizeIn = frame.buf.size() / sample_size;
 
     uhd::tx_metadata_t md_tx;
 
@@ -353,9 +350,9 @@ void UHD::transmit_frame(struct FrameData&& frame)
                 samps_to_send <= usrp_max_num_samps );
         m_require_timestamp_refresh = false;
 
-        //send a single packet
+        // send a single packet
         size_t num_tx_samps = m_tx_stream->send(
-                &in_data[num_acc_samps],
+                frame.buf.data() + sample_size * num_acc_samps,
                 samps_to_send, md_tx, tx_timeout);
         etiLog.log(trace, "UHD,sent %zu of %zu", num_tx_samps, samps_to_send);
 
diff --git a/src/output/UHD.h b/src/output/UHD.h
index 9891c7a..c4f1a45 100644
--- a/src/output/UHD.h
+++ b/src/output/UHD.h
@@ -45,12 +45,9 @@ DESCRIPTION:
 #include <atomic>
 #include <thread>
 
-#include "Log.h"
 #include "output/SDR.h"
 #include "output/USRPTime.h"
 #include "TimestampDecoder.h"
-#include "RemoteControl.h"
-#include "ThreadsafeQueue.h"
 
 #include <stdio.h>
 #include <sys/types.h>