aboutsummaryrefslogtreecommitdiffstats
path: root/host/lib/convert/sse2_fc64_to_sc8.cpp
blob: 95db4e92706e45befac3a3db6acd02f8e7c37630 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
//
// Copyright 2012-2013 Ettus Research LLC
// Copyright 2018 Ettus Research, a National Instruments Company
//
// SPDX-License-Identifier: GPL-3.0-or-later
//

#include "convert_common.hpp"
#include <uhd/utils/byteswap.hpp>
#include <emmintrin.h>

using namespace uhd::convert;

// Narrow four vectors of 32-bit integers into a single vector of sixteen
// 8-bit integers. Both narrowing steps use signed saturation.
//
// Lane order is kept: the four values of in0 end up in bytes 0-3 of the
// result, in1 in bytes 4-7, in2 in bytes 8-11 and in3 in bytes 12-15.
UHD_INLINE __m128i pack_sc8_item32_4x(
    const __m128i& in0, const __m128i& in1, const __m128i& in2, const __m128i& in3)
{
    // inner calls: 32 -> 16 bit, outer call: 16 -> 8 bit
    return _mm_packs_epi16(_mm_packs_epi32(in0, in1), _mm_packs_epi32(in2, in3));
}

// Multiply two vectors of two doubles each by `scalar` and convert the
// products, by truncation, into one vector of four 32-bit integers.
//
// Mind the ordering: the values derived from `hi` occupy the two LOW 32-bit
// lanes of the result, the values derived from `lo` the two HIGH lanes, i.e.
// result = { hi[0], hi[1], lo[0], lo[1] }.
UHD_INLINE __m128i pack_sc32_4x(
    const __m128d& lo, const __m128d& hi, const __m128d& scalar)
{
    // each conversion yields two int32 values in the lower 64 bits
    const __m128i from_hi = _mm_cvttpd_epi32(_mm_mul_pd(hi, scalar));
    const __m128i from_lo = _mm_cvttpd_epi32(_mm_mul_pd(lo, scalar));

    // concatenate the lower 64 bits of both conversions
    return _mm_unpacklo_epi64(from_hi, from_lo);
}

// fc64 -> sc8_item32_be converter, declared with PRIORITY_SIMD.
//
// inputs, outputs, nsamps and scale_factor are not declared in this file;
// presumably they are provided by the DECLARE_CONVERTER macro (TODO confirm
// against convert_common.hpp).
//
// The SSE2 loop consumes 8 input elements and issues one 16-byte store per
// iteration. The remaining (fewer than 8) elements are handed to
// xx_to_item32_sc8<uhd::htonx>.
DECLARE_CONVERTER(fc64, 1, sc8_item32_be, 1, PRIORITY_SIMD)
{
    const fc64_t* input = reinterpret_cast<const fc64_t*>(inputs[0]);
    item32_t* output    = reinterpret_cast<item32_t*>(outputs[0]);

    // broadcast the scale factor into both double lanes
    const __m128d scalar = _mm_set1_pd(scale_factor);

// Loop body shared by the aligned and unaligned paths. _al_ is token-pasted
// into the load intrinsic name: `_` selects _mm_load_pd (needs a 16-byte
// aligned address), `u_` selects _mm_loadu_pd (any alignment). The macro reads
// and advances the enclosing variable `i`; `j` indexes the output and moves by
// 4 per 16-byte store (assumes item32_t is 4 bytes wide - TODO confirm).
#define convert_fc64_1_to_sc8_item32_1_bswap_guts(_al_)                           \
    for (size_t j = 0; i + 7 < nsamps; i += 8, j += 4) {                          \
        /* load from input: two doubles per element */                            \
        __m128d tmp0 =                                                            \
            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 0));   \
        __m128d tmp1 =                                                            \
            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 1));   \
        __m128d tmp2 =                                                            \
            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 2));   \
        __m128d tmp3 =                                                            \
            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 3));   \
        __m128d tmp4 =                                                            \
            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 4));   \
        __m128d tmp5 =                                                            \
            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 5));   \
        __m128d tmp6 =                                                            \
            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 6));   \
        __m128d tmp7 =                                                            \
            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 7));   \
                                                                                  \
        /* interleave: pack_sc32_4x puts its second argument in the low lanes, */ \
        /* so passing (odd, even) keeps elements 0..7 in ascending byte order */  \
        const __m128i tmpi = pack_sc8_item32_4x(pack_sc32_4x(tmp1, tmp0, scalar), \
            pack_sc32_4x(tmp3, tmp2, scalar),                                     \
            pack_sc32_4x(tmp5, tmp4, scalar),                                     \
            pack_sc32_4x(tmp7, tmp6, scalar));                                    \
                                                                                  \
        /* store to output (unaligned store, output alignment is not checked) */  \
        _mm_storeu_si128(reinterpret_cast<__m128i*>(output + j), tmpi);           \
    }

    size_t i = 0;

    // dispatch according to alignment of the input pointer (16-byte boundary)
    if ((reinterpret_cast<size_t>(input) & 0xf) == 0) {
        convert_fc64_1_to_sc8_item32_1_bswap_guts(_)
    } else {
        convert_fc64_1_to_sc8_item32_1_bswap_guts(u_)
    }

// the helper macro is only meaningful inside this function; do not let it
// leak into the rest of the translation unit
#undef convert_fc64_1_to_sc8_item32_1_bswap_guts

    // convert remainder; i is a multiple of 8 here, and i / 2 is the matching
    // output offset because the loop advanced the output by 4 per 8 inputs
    xx_to_item32_sc8<uhd::htonx>(input + i, output + (i / 2), nsamps - i, scale_factor);
}

// fc64 -> sc8_item32_le converter, declared with PRIORITY_SIMD.
//
// inputs, outputs, nsamps and scale_factor are not declared in this file;
// presumably they are provided by the DECLARE_CONVERTER macro (TODO confirm
// against convert_common.hpp).
//
// The SSE2 loop consumes 8 input elements and issues one 16-byte store per
// iteration. The remaining (fewer than 8) elements are handed to
// xx_to_item32_sc8<uhd::htowx>.
DECLARE_CONVERTER(fc64, 1, sc8_item32_le, 1, PRIORITY_SIMD)
{
    const fc64_t* input = reinterpret_cast<const fc64_t*>(inputs[0]);
    item32_t* output    = reinterpret_cast<item32_t*>(outputs[0]);

    // broadcast the scale factor into both double lanes
    const __m128d scalar = _mm_set1_pd(scale_factor);

// Loop body shared by the aligned and unaligned paths. _al_ is token-pasted
// into the load intrinsic name: `_` selects _mm_load_pd (needs a 16-byte
// aligned address), `u_` selects _mm_loadu_pd (any alignment). The macro reads
// and advances the enclosing variable `i`; `j` indexes the output and moves by
// 4 per 16-byte store (assumes item32_t is 4 bytes wide - TODO confirm).
#define convert_fc64_1_to_sc8_item32_1_nswap_guts(_al_)                                  \
    for (size_t j = 0; i + 7 < nsamps; i += 8, j += 4) {                                 \
        /* load from input: two doubles per element */                                   \
        __m128d tmp0 =                                                                   \
            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 0));          \
        __m128d tmp1 =                                                                   \
            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 1));          \
        __m128d tmp2 =                                                                   \
            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 2));          \
        __m128d tmp3 =                                                                   \
            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 3));          \
        __m128d tmp4 =                                                                   \
            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 4));          \
        __m128d tmp5 =                                                                   \
            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 5));          \
        __m128d tmp6 =                                                                   \
            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 6));          \
        __m128d tmp7 =                                                                   \
            _mm_load##_al_##pd(reinterpret_cast<const double*>(input + i + 7));          \
                                                                                         \
        /* interleave: pack_sc32_4x puts its second argument in the low lanes, */        \
        /* so passing (even, odd) places the odd element first within each pair */       \
        __m128i tmpi = pack_sc8_item32_4x(pack_sc32_4x(tmp0, tmp1, scalar),              \
            pack_sc32_4x(tmp2, tmp3, scalar),                                            \
            pack_sc32_4x(tmp4, tmp5, scalar),                                            \
            pack_sc32_4x(tmp6, tmp7, scalar));                                           \
        /* swap the two bytes inside every 16-bit lane; together with the pair */        \
        /* ordering above this reverses the bytes of every 4-byte group */               \
        tmpi =                                                                           \
            _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); /*byteswap*/ \
                                                                                         \
        /* store to output (unaligned store, output alignment is not checked) */         \
        _mm_storeu_si128(reinterpret_cast<__m128i*>(output + j), tmpi);                  \
    }

    size_t i = 0;

    // dispatch according to alignment of the input pointer (16-byte boundary)
    if ((reinterpret_cast<size_t>(input) & 0xf) == 0) {
        convert_fc64_1_to_sc8_item32_1_nswap_guts(_)
    } else {
        convert_fc64_1_to_sc8_item32_1_nswap_guts(u_)
    }

// the helper macro is only meaningful inside this function; do not let it
// leak into the rest of the translation unit
#undef convert_fc64_1_to_sc8_item32_1_nswap_guts

    // convert remainder; i is a multiple of 8 here, and i / 2 is the matching
    // output offset because the loop advanced the output by 4 per 8 inputs
    xx_to_item32_sc8<uhd::htowx>(input + i, output + (i / 2), nsamps - i, scale_factor);
}