aboutsummaryrefslogtreecommitdiffstats
path: root/dotprod_mmx.c
diff options
context:
space:
mode:
Diffstat (limited to 'dotprod_mmx.c')
-rw-r--r--dotprod_mmx.c81
1 files changed, 81 insertions, 0 deletions
diff --git a/dotprod_mmx.c b/dotprod_mmx.c
new file mode 100644
index 0000000..c516afe
--- /dev/null
+++ b/dotprod_mmx.c
@@ -0,0 +1,81 @@
+/* 16-bit signed integer dot product
+ * MMX assisted version; also for SSE
+ *
+ * Copyright 2004 Phil Karn
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdlib.h>
+#include "fec.h"
+
+struct dotprod {
+ int len; /* Number of coefficients, as passed to initdp_mmx() */
+
+ /* On a MMX or SSE machine, these hold 4 zero-padded copies of the
+ * coefficients: copy i is preceded by i zero words, one copy for each
+ * possible input data alignment (see Intel ap559 on MMX dot products).
+ */
+ signed short *coeffs[4];
+};
+long dotprod_mmx_assist(signed short *a,signed short *b,int cnt); /* assembler routine; cnt = number of 4-word blocks */
+
+/* Create and return a descriptor for use with the dot product function.
+ *
+ * coeffs[] supplies len coefficients; they are copied into the descriptor,
+ * so the caller's array is not referenced after this call returns.
+ *
+ * Returns NULL if coeffs is NULL, if len is not positive, or if memory
+ * cannot be allocated. A non-NULL result owns heap memory and must be
+ * released with freedp_mmx().
+ */
+void *initdp_mmx(signed short coeffs[],int len){
+  struct dotprod *dp;
+  int i,j;
+
+  /* Nothing to build for an empty, negative-length or missing set */
+  if(coeffs == NULL || len <= 0)
+    return NULL;
+
+  dp = (struct dotprod *)calloc(1,sizeof(struct dotprod));
+  if(dp == NULL)
+    return NULL;
+  dp->len = len;
+
+  /* Make 4 copies of coefficients, one for each data alignment.
+   * Copy i is preceded by i zero words and is sized to a whole number
+   * of 4-word blocks; calloc() supplies the zero padding.
+   */
+  for(i=0;i<4;i++){
+    dp->coeffs[i] = (signed short *)calloc(1+(len+i-1)/4,
+                                           4*sizeof(signed short));
+    if(dp->coeffs[i] == NULL){
+      /* Out of memory: release the copies made so far, then the
+       * descriptor itself, so nothing leaks
+       */
+      while(i-- > 0)
+        free(dp->coeffs[i]);
+      free(dp);
+      return NULL;
+    }
+    for(j=0;j<len;j++)
+      dp->coeffs[i][j+i] = coeffs[j];
+  }
+  return (void *)dp;
+}
+
+
+/* Free a dot product descriptor created earlier by initdp_mmx().
+ *
+ * p may be NULL (initdp_mmx() returns NULL when given no coefficients),
+ * in which case nothing is done. After this call the descriptor must not
+ * be used again.
+ */
+void freedp_mmx(void *p){
+  struct dotprod *dp = (struct dotprod *)p;
+  int i;
+
+  if(dp == NULL)
+    return;
+
+  /* free(NULL) is a no-op, so unset slots need no separate check */
+  for(i=0;i<4;i++)
+    free(dp->coeffs[i]);
+  free(dp);
+}
+
+/* Compute a dot product given a descriptor and an input array.
+ * The length is taken from the descriptor.
+ *
+ * p is a descriptor returned by initdp_mmx(); a[] supplies the input
+ * samples. Returns the value computed by dotprod_mmx_assist(), or 0 when
+ * p is NULL (no coefficients, hence an empty sum).
+ */
+long dotprod_mmx(void *p,signed short a[]){
+  struct dotprod *dp = (struct dotprod *)p;
+  int al;
+  signed short *ar;
+
+  if(dp == NULL)
+    return 0;
+
+  /* Round input data address down to 8 byte boundary
+   * NB: depending on the alignment of a[], memory
+   * before a[] will be accessed. The contents don't matter since they'll
+   * be multiplied by zero coefficients. I can't conceive of any
+   * situation where this could cause a segfault since memory protection
+   * in the x86 machines is done on much larger boundaries
+   *
+   * Only the low three address bits are examined, and that byte offset
+   * is subtracted from the pointer itself, so the address is never
+   * squeezed through an int (which would truncate it wherever pointers
+   * are wider than int).
+   */
+  ar = (signed short *)((char *)a - ((size_t)a & 7));
+
+  /* Choose one of 4 sets of pre-shifted coefficients. al is both the
+   * index into dp->coeffs[] and the number of 0 words padded onto
+   * that coefficients array for alignment purposes
+   */
+  al = (int)(a - ar);
+
+  /* Call assembler routine to do the work, passing number of 4-word blocks */
+  return dotprod_mmx_assist(ar,dp->coeffs[al],(dp->len+al-1)/4+1);
+}
+