diff options
author | Matthias P. Braendli <matthias.braendli@mpb.li> | 2014-01-02 21:55:13 +0100 |
---|---|---|
committer | Matthias P. Braendli <matthias.braendli@mpb.li> | 2014-01-02 21:55:13 +0100 |
commit | a31630e0d5b9880c716d9004ef4154396ba41ebc (patch) | |
tree | aebbd3b132e5f2dd31bc34750ccded2378fc687a /dotprod_av.c | |
parent | 9aaac5be9db5e1537badc65242412ef14c5096e3 (diff) | |
download | ka9q-fec-a31630e0d5b9880c716d9004ef4154396ba41ebc.tar.gz ka9q-fec-a31630e0d5b9880c716d9004ef4154396ba41ebc.tar.bz2 ka9q-fec-a31630e0d5b9880c716d9004ef4154396ba41ebc.zip |
Extract fec-3.0.1
Diffstat (limited to 'dotprod_av.c')
-rw-r--r-- | dotprod_av.c | 93 |
1 files changed, 93 insertions, 0 deletions
diff --git a/dotprod_av.c b/dotprod_av.c new file mode 100644 index 0000000..1f70471 --- /dev/null +++ b/dotprod_av.c @@ -0,0 +1,93 @@ +/* 16-bit signed integer dot product + * Altivec-assisted version + * Copyright 2004 Phil Karn + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <stdlib.h> +#include "fec.h" + +struct dotprod { + int len; /* Number of coefficients */ + + /* On an Altivec machine, these hold 8 copies of the coefficients, + * preshifted by 0,1,..7 words to meet all possible input data + */ + signed short *coeffs[8]; +}; + +/* Create and return a descriptor for use with the dot product function */ +void *initdp_av(signed short coeffs[],int len){ + struct dotprod *dp; + int i,j; + + if(len == 0) + return NULL; + + dp = (struct dotprod *)calloc(1,sizeof(struct dotprod)); + dp->len = len; + + /* Make 8 copies of coefficients, one for each data alignment, + * each aligned to 16-byte boundary + */ + for(i=0;i<8;i++){ + dp->coeffs[i] = calloc(1+(len+i-1)/8,sizeof(vector signed short)); + for(j=0;j<len;j++) + dp->coeffs[i][j+i] = coeffs[j]; + } + return (void *)dp; +} + + +/* Free a dot product descriptor created earlier */ +void freedp_av(void *p){ + struct dotprod *dp = (struct dotprod *)p; + int i; + + for(i=0;i<8;i++) + if(dp->coeffs[i] != NULL) + free(dp->coeffs[i]); + free(dp); +} + +/* Compute a dot product given a descriptor and an input array + * The length is taken from the descriptor + */ +long dotprod_av(void *p,signed short a[]){ + struct dotprod *dp = (struct dotprod *)p; + int al; + vector signed short *ar,*d; + vector signed int sums0,sums1,sums2,sums3; + union { vector signed int v; signed int w[4];} s; + int nblocks; + + /* round ar down to beginning of 16-byte block containing 0th element of + * input buffer. Then set d to one of 8 sets of shifted coefficients + */ + ar = (vector signed short *)((int)a & ~15); + al = ((int)a & 15)/sizeof(signed short); + d = (vector signed short *)dp->coeffs[al]; + + nblocks = (dp->len+al-1)/8+1; + + /* Sum into four vectors each holding four 32-bit partial sums */ + sums3 = sums2 = sums1 = sums0 = (vector signed int)(0); + while(nblocks >= 4){ + sums0 = vec_msums(ar[nblocks-1],d[nblocks-1],sums0); + sums1 = vec_msums(ar[nblocks-2],d[nblocks-2],sums1); + sums2 = vec_msums(ar[nblocks-3],d[nblocks-3],sums2); + sums3 = vec_msums(ar[nblocks-4],d[nblocks-4],sums3); + nblocks -= 4; + } + sums0 = vec_adds(sums0,sums1); + sums2 = vec_adds(sums2,sums3); + sums0 = vec_adds(sums0,sums2); + while(nblocks-- > 0){ + sums0 = vec_msums(ar[nblocks],d[nblocks],sums0); + } + /* Sum 4 partial sums into final result */ + s.v = vec_sums(sums0,(vector signed int)(0)); + + return s.w[3]; +} + + |