diff options
author | Matthias P. Braendli <matthias.braendli@mpb.li> | 2014-01-02 21:55:13 +0100 |
---|---|---|
committer | Matthias P. Braendli <matthias.braendli@mpb.li> | 2014-01-02 21:55:13 +0100 |
commit | a31630e0d5b9880c716d9004ef4154396ba41ebc (patch) | |
tree | aebbd3b132e5f2dd31bc34750ccded2378fc687a /dotprod_sse2_assist.s | |
parent | 9aaac5be9db5e1537badc65242412ef14c5096e3 (diff) | |
download | ka9q-fec-a31630e0d5b9880c716d9004ef4154396ba41ebc.tar.gz ka9q-fec-a31630e0d5b9880c716d9004ef4154396ba41ebc.tar.bz2 ka9q-fec-a31630e0d5b9880c716d9004ef4154396ba41ebc.zip |
Extract fec-3.0.1
Diffstat (limited to 'dotprod_sse2_assist.s')
-rw-r--r-- | dotprod_sse2_assist.s | 85 |
1 files changed, 85 insertions, 0 deletions
# SIMD SSE2 dot product of two vectors of signed 16-bit words
# (GNU as, AT&T syntax, 32-bit x86 ELF)
#
# Equivalent to the following C code:
#   long dotprod(signed short *a, signed short *b, int cnt)
#   {
#       long sum = 0;
#       cnt *= 8;               /* cnt counts 128-bit groups of 8 words */
#       while (cnt--)
#           sum += *a++ * *b++;
#       return sum;
#   }
#
# In:     8(%ebp)  = a    (must be 128-bit aligned: read with movdqa)
#         12(%ebp) = b    (must be 128-bit aligned: memory operand of pmaddwd)
#         16(%ebp) = cnt  (number of 8-word groups; cnt <= 0 returns 0)
# Out:    %eax = sum of products
# Clobb:  %eax, %xmm0, %xmm1, flags (%esi, %edi, %ecx are saved and restored)
#
# Partial sums are kept in four 32-bit lanes of %xmm0 and combined with
# paddd, so the arithmetic wraps modulo 2^32.
#
# Copyright 2001, Phil Karn KA9Q
# May be used under the terms of the GNU Lesser General Public License (LGPL)

        .text
        .global dotprod_sse2_assist
        .type   dotprod_sse2_assist,@function
dotprod_sse2_assist:
        pushl   %ebp
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        pushl   %ecx
        movl    8(%ebp),%esi            # esi = a
        movl    12(%ebp),%edi           # edi = b
        movl    16(%ebp),%ecx           # ecx = cnt (8-word groups remaining)
        pxor    %xmm0,%xmm0             # clear running sum (four 32-bit lanes)

# SSE2 dot product loop unrolled 4 times, crunching 32 terms per loop
        .p2align 4
.Lquad:
        subl    $4,%ecx                 # need at least 4 groups for this pass
        jl      .Lquad_done

        movdqa  (%esi),%xmm1
        pmaddwd (%edi),%xmm1            # 8 products, pairwise added to 4 dwords
        paddd   %xmm1,%xmm0

        movdqa  16(%esi),%xmm1
        pmaddwd 16(%edi),%xmm1
        paddd   %xmm1,%xmm0

        movdqa  32(%esi),%xmm1
        pmaddwd 32(%edi),%xmm1
        paddd   %xmm1,%xmm0

        movdqa  48(%esi),%xmm1
        addl    $64,%esi                # advance a by 4 groups (64 bytes)
        pmaddwd 48(%edi),%xmm1
        addl    $64,%edi                # advance b by 4 groups (64 bytes)
        paddd   %xmm1,%xmm0

        jmp     .Lquad
.Lquad_done:

        addl    $4,%ecx                 # undo the failed subtract: leftover groups

# SSE2 dot product loop, not unrolled, crunching 8 terms per loop
# This could be redone as Duff's Device on the unrolled loop above
.Lsingle:
        subl    $1,%ecx
        jl      .Lsingle_done

        movdqa  (%esi),%xmm1
        addl    $16,%esi
        pmaddwd (%edi),%xmm1
        addl    $16,%edi
        paddd   %xmm1,%xmm0
        jmp     .Lsingle
.Lsingle_done:

# Horizontal add of the four 32-bit lanes s3:s2:s1:s0 held in xmm0.
# Done entirely in SSE registers so no callee-saved integer register
# is needed as scratch.
        movdqa  %xmm0,%xmm1
        psrldq  $8,%xmm1                # xmm1 lanes 1:0 = s3:s2
        paddd   %xmm1,%xmm0             # xmm0 lanes 1:0 = s1+s3 : s0+s2
        movdqa  %xmm0,%xmm1
        psrldq  $4,%xmm1                # xmm1 lane 0 = s1+s3
        paddd   %xmm1,%xmm0             # xmm0 lane 0 = s0+s1+s2+s3
        movd    %xmm0,%eax              # return value

        popl    %ecx
        popl    %edi
        popl    %esi
        movl    %ebp,%esp
        popl    %ebp
        ret
        .size   dotprod_sse2_assist, .-dotprod_sse2_assist

# Declare that this object does not need an executable stack.
        .section .note.GNU-stack,"",@progbits