Diffstat (limited to 'sumsq_mmx_assist.s')
-rw-r--r--  sumsq_mmx_assist.s  83
1 file changed, 83 insertions(+), 0 deletions(-)
diff --git a/sumsq_mmx_assist.s b/sumsq_mmx_assist.s
new file mode 100644
index 0000000..b3bac66
--- /dev/null
+++ b/sumsq_mmx_assist.s
@@ -0,0 +1,83 @@
+# MMX assist routines for sumsq
+# Copyright 2001 Phil Karn, KA9Q
+# May be used under the terms of the GNU General Public License (GPL)
+
+ .text
+
+# Evaluate sum of squares of signed 16-bit input samples
+# long long sumsq_mmx_assist(signed short *in,int cnt);
+# Samples are processed in blocks of 8; any remainder (cnt % 8) is
+# left for the caller to handle
+ .global sumsq_mmx_assist
+ .type sumsq_mmx_assist,@function
+ .align 16
+sumsq_mmx_assist:
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %esi
+ pushl %ecx
+ pushl %ebx
+
+ movl 8(%ebp),%esi # in: pointer to input samples
+ movl 12(%ebp),%ecx # cnt: number of samples
+ xorl %eax,%eax # zero the 64-bit accumulator edx:eax
+ xorl %edx,%edx
+
+ # Since 4 * 32767**2 < 2**32, we can accumulate two at a time
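+ # (32767**2 = 1073676289, and 4 * 32767**2 = 4294705156 < 2**32 =
+ # 4294967296, so the two pmaddwd dwords can be summed without
+ # unsigned 32-bit overflow)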
+1: subl $8,%ecx # at least 8 samples remaining?
+ jl 2f # no - finish up
+ movq (%esi),%mm0 # S0 S1 S2 S3
+ pmaddwd %mm0,%mm0 # (S0^2+S1^2) (S2^2+S3^2)
+ movq 8(%esi),%mm6 # S4 S5 S6 S7
+ pmaddwd %mm6,%mm6 # (S4^2+S5^2) (S6^2+S7^2)
+ paddd %mm6,%mm0 # (S0^2+S1^2+S4^2+S5^2)(S2^2+S3^2+S6^2+S7^2)
+ movd %mm0,%ebx # extract low dword (even-pair sums)
+ addl %ebx,%eax # add into 64-bit total edx:eax
+ adcl $0,%edx # propagate the carry
+ psrlq $32,%mm0 # bring down high dword (odd-pair sums)
+ movd %mm0,%ebx
+ addl %ebx,%eax
+ adcl $0,%edx
+ addl $16,%esi # advance past the 8 samples just read
+ jmp 1b
+
+2: emms # clear MMX state; 64-bit result in edx:eax
+ popl %ebx
+ popl %ecx
+ popl %esi
+ popl %ebp
+ ret
+
+# Evaluate sum of squares of signed 16-bit input samples
+# long sumsq_wd_mmx_assist(signed short *in,int cnt);
+# Quick version that accumulates in two 32-bit dword lanes with no
+# carry propagation, so it is only safe when the total is known to
+# stay below 2**31 (small numbers of small input values)
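+# (e.g., with samples bounded by |S| <= 1024 = 2**10, each square is
+# at most 2**20, so a lane can absorb 2**31 / 2**20 = 2048 squares
+# before it may overflow - about 4096 samples in all, since each
+# lane receives half of them)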
+ .global sumsq_wd_mmx_assist
+ .type sumsq_wd_mmx_assist,@function
+ .align 16
+sumsq_wd_mmx_assist:
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %esi
+
+ movl 8(%ebp),%esi # in: pointer to input samples
+ movl 12(%ebp),%ecx # cnt: number of samples
+ pxor %mm2,%mm2 # zero the two dword accumulators
+
+1: subl $8,%ecx # at least 8 samples remaining?
+ jl 2f # no - finish up
+ movq (%esi),%mm0 # S0 S1 S2 S3
+ pmaddwd %mm0,%mm0 # (S0*S0+S1*S1) (S2*S2+S3*S3)
+ movq 8(%esi),%mm1 # S4 S5 S6 S7
+ pmaddwd %mm1,%mm1 # (S4*S4+S5*S5) (S6*S6+S7*S7)
+ paddd %mm1,%mm2 # accumulate both dword lanes
+ paddd %mm0,%mm2
+
+ addl $16,%esi
+ jmp 1b
+
+2: movd %mm2,%eax # sum from even-numbered pairs
+ psrlq $32,%mm2
+ movd %mm2,%edx # sum from odd-numbered pairs
+ addl %edx,%eax # fold lanes; 32-bit result in eax
+ emms
+ popl %esi
+ popl %ebp
+ ret
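
For reference, here is a scalar C sketch of what the first routine computes, together with a hypothetical wrapper showing how a caller might finish the cnt % 8 tail that the MMX loops leave unprocessed. The wrapper name and the tail-handling strategy are illustrative assumptions, not part of the original source.

/* Scalar reference for sumsq_mmx_assist; a sketch, not original source */
long long sumsq_ref(signed short *in, int cnt)
{
	long long sum = 0;
	int i;

	for (i = 0; i < cnt; i++)
		sum += (long long)in[i] * in[i];
	return sum;
}

/* The assembly routine defined above */
long long sumsq_mmx_assist(signed short *in, int cnt);

/* Hypothetical wrapper: let the MMX routine do whole blocks of 8,
 * then finish the cnt % 8 remainder in C. */
long long sumsq(signed short *in, int cnt)
{
	int head = cnt & ~7;	/* largest multiple of 8 <= cnt */
	long long sum = sumsq_mmx_assist(in, head);
	int i;

	for (i = head; i < cnt; i++)
		sum += (long long)in[i] * in[i];
	return sum;
}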