Diffstat (limited to 'dotprod_mmx_assist.s')
-rw-r--r-- | dotprod_mmx_assist.s | 83 |
1 files changed, 83 insertions, 0 deletions
diff --git a/dotprod_mmx_assist.s b/dotprod_mmx_assist.s
new file mode 100644
index 0000000..25deffd
--- /dev/null
+++ b/dotprod_mmx_assist.s
@@ -0,0 +1,83 @@
+# SIMD MMX dot product
+# Equivalent to the following C code:
+# long dotprod(signed short *a,signed short *b,int cnt)
+# {
+#	long sum = 0;
+#	cnt *= 4;
+#	while(cnt--)
+#		sum += *a++ * *b++;
+#	return sum;
+# }
+# a and b should also be 64-bit aligned, or speed will suffer greatly
+# Copyright 1999, Phil Karn KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+
+	.text
+	.global dotprod_mmx_assist
+	.type dotprod_mmx_assist,@function
+dotprod_mmx_assist:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %edi
+	pushl %ecx
+	pushl %ebx
+	movl 8(%ebp),%esi	# a
+	movl 12(%ebp),%edi	# b
+	movl 16(%ebp),%ecx	# cnt
+	pxor %mm0,%mm0		# clear running sum (in two 32-bit halves)
+
+# MMX dot product loop unrolled 4 times, crunching 16 terms per loop
+	.align 16
+.Loop1:	subl $4,%ecx
+	jl .Loop1Done
+
+	movq (%esi),%mm1	# mm1 = a[3],a[2],a[1],a[0]
+	pmaddwd (%edi),%mm1	# mm1 = b[3]*a[3]+b[2]*a[2],b[1]*a[1]+b[0]*a[0]
+	paddd %mm1,%mm0
+
+	movq 8(%esi),%mm1
+	pmaddwd 8(%edi),%mm1
+	paddd %mm1,%mm0
+
+	movq 16(%esi),%mm1
+	pmaddwd 16(%edi),%mm1
+	paddd %mm1,%mm0
+
+	movq 24(%esi),%mm1
+	addl $32,%esi
+	pmaddwd 24(%edi),%mm1
+	addl $32,%edi
+	paddd %mm1,%mm0
+
+	jmp .Loop1
+.Loop1Done:
+
+	addl $4,%ecx
+
+# MMX dot product loop, not unrolled, crunching 4 terms per loop
+# This could be redone as Duff's Device on the unrolled loop above
.Loop2:	subl $1,%ecx
+	jl .Loop2Done
+
+	movq (%esi),%mm1
+	addl $8,%esi
+	pmaddwd (%edi),%mm1
+	addl $8,%edi
+	paddd %mm1,%mm0
+	jmp .Loop2
+.Loop2Done:
+
+	movd %mm0,%ebx		# right-hand word to ebx
+	punpckhdq %mm0,%mm0	# left-hand word to right side of %mm0
+	movd %mm0,%eax
+	addl %ebx,%eax		# running sum now in %eax
+	emms			# done with MMX
+
+	popl %ebx
+	popl %ecx
+	popl %edi
+	popl %esi
+	movl %ebp,%esp
+	popl %ebp
+	ret
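For reference, here is a minimal scalar C sketch of the computation the routine performs, following the signature quoted in the header comment. The function name dotprod_ref and its standalone form are illustrative assumptions, not part of this file; note that pmaddwd multiplies paired 16-bit terms before accumulating, so the scalar equivalent is a multiply-accumulate, and each unit of cnt covers one 64-bit quadword (four terms) per operand.

/* Plain-C sketch of what dotprod_mmx_assist() computes (name and wrapper
 * are hypothetical; signature taken from the header comment). */
long dotprod_ref(signed short *a, signed short *b, int cnt)
{
	long sum = 0;

	cnt *= 4;			/* four signed 16-bit terms per quadword */
	while (cnt--)
		sum += (long)*a++ * *b++;	/* what pmaddwd + paddd accumulate */
	return sum;
}

The assembly keeps the running sum as two 32-bit halves in %mm0 and folds them into %eax with movd/punpckhdq/addl before returning; on the 32-bit x86 targets this code is written for, that matches the 32-bit long accumulator in the sketch above.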