Diffstat (limited to 'dotprod_mmx_assist.s')
-rw-r--r-- | dotprod_mmx_assist.s | 83 |
1 files changed, 83 insertions, 0 deletions
diff --git a/dotprod_mmx_assist.s b/dotprod_mmx_assist.s
new file mode 100644
index 0000000..25deffd
--- /dev/null
+++ b/dotprod_mmx_assist.s
@@ -0,0 +1,83 @@
+# SIMD MMX dot product
+# Equivalent to the following C code:
+# long dotprod(signed short *a,signed short *b,int cnt)
+# {
+#	long sum = 0;
+#	cnt *= 4;
+#	while(cnt--)
+#		sum += *a++ * *b++;
+#	return sum;
+# }
+# a and b should also be 64-bit aligned, or speed will suffer greatly
+# Copyright 1999, Phil Karn KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+
+	.text
+	.global dotprod_mmx_assist
+	.type dotprod_mmx_assist,@function
+dotprod_mmx_assist:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %edi
+	pushl %ecx
+	pushl %ebx
+	movl 8(%ebp),%esi	# a
+	movl 12(%ebp),%edi	# b
+	movl 16(%ebp),%ecx	# cnt
+	pxor %mm0,%mm0		# clear running sum (in two 32-bit halves)
+
+# MMX dot product loop unrolled 4 times, crunching 16 terms per loop
+	.align 16
+.Loop1:	subl $4,%ecx
+	jl .Loop1Done
+
+	movq (%esi),%mm1	# mm1 = a[3],a[2],a[1],a[0]
+	pmaddwd (%edi),%mm1	# mm1 = b[3]*a[3]+b[2]*a[2],b[1]*a[1]+b[0]*a[0]
+	paddd %mm1,%mm0
+
+	movq 8(%esi),%mm1
+	pmaddwd 8(%edi),%mm1
+	paddd %mm1,%mm0
+
+	movq 16(%esi),%mm1
+	pmaddwd 16(%edi),%mm1
+	paddd %mm1,%mm0
+
+	movq 24(%esi),%mm1
+	addl $32,%esi
+	pmaddwd 24(%edi),%mm1
+	addl $32,%edi
+	paddd %mm1,%mm0
+
+	jmp .Loop1
+.Loop1Done:
+
+	addl $4,%ecx
+
+# MMX dot product loop, not unrolled, crunching 4 terms per loop
+# This could be redone as Duff's Device on the unrolled loop above
.Loop2:	subl $1,%ecx
+	jl .Loop2Done
+
+	movq (%esi),%mm1
+	addl $8,%esi
+	pmaddwd (%edi),%mm1
+	addl $8,%edi
+	paddd %mm1,%mm0
+	jmp .Loop2
+.Loop2Done:
+
+	movd %mm0,%ebx		# right-hand word to ebx
+	punpckhdq %mm0,%mm0	# left-hand word to right side of %mm0
+	movd %mm0,%eax
+	addl %ebx,%eax		# running sum now in %eax
+	emms			# done with MMX
+
+	popl %ebx
+	popl %ecx
+	popl %edi
+	popl %esi
+	movl %ebp,%esp
+	popl %ebp
+	ret
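For reference, here is a minimal scalar C sketch of the computation the routine performs, following the signature quoted in the header comment. The function name dotprod_ref and its standalone form are illustrative assumptions, not part of this file; note that pmaddwd multiplies paired 16-bit terms before accumulating, so the scalar equivalent is a multiply-accumulate, and each unit of cnt covers one 64-bit quadword (four terms) per operand.

/* Plain-C sketch of what dotprod_mmx_assist() computes (name and wrapper
 * are hypothetical; signature taken from the header comment). */
long dotprod_ref(signed short *a, signed short *b, int cnt)
{
	long sum = 0;

	cnt *= 4;			/* four signed 16-bit terms per quadword */
	while (cnt--)
		sum += (long)*a++ * *b++;	/* what pmaddwd + paddd accumulate */
	return sum;
}

The assembly keeps the running sum as two 32-bit halves in %mm0 and folds them into %eax with movd/punpckhdq/addl before returning; on the 32-bit x86 targets this code is written for, that matches the 32-bit long accumulator in the sketch above.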