diff options
-rw-r--r-- | INSTALL | 39 | ||||
-rw-r--r-- | README | 120 | ||||
-rw-r--r-- | README.md | 4 | ||||
-rw-r--r-- | ccsds.h | 5 | ||||
-rw-r--r-- | char.h | 24 | ||||
-rw-r--r-- | configure.in | 83 | ||||
-rw-r--r-- | cpu_features.s | 15 | ||||
-rw-r--r-- | cpu_mode_ppc.c | 40 | ||||
-rw-r--r-- | cpu_mode_x86.c | 33 | ||||
-rw-r--r-- | decode_rs.c | 262 | ||||
-rw-r--r-- | decode_rs.h | 298 | ||||
-rw-r--r-- | decode_rs_8.c | 24 | ||||
-rw-r--r-- | decode_rs_ccsds.c | 26 | ||||
-rw-r--r-- | decode_rs_char.c | 22 | ||||
-rw-r--r-- | decode_rs_int.c | 22 | ||||
-rw-r--r-- | dotprod.c | 94 | ||||
-rw-r--r-- | dotprod.h | 15 | ||||
-rw-r--r-- | dotprod_av.c | 93 | ||||
-rw-r--r-- | dotprod_mmx.c | 81 | ||||
-rw-r--r-- | dotprod_mmx_assist.s | 83 | ||||
-rw-r--r-- | dotprod_port.c | 58 | ||||
-rw-r--r-- | dotprod_sse2.c | 72 | ||||
-rw-r--r-- | dotprod_sse2_assist.s | 85 | ||||
-rw-r--r-- | dsp.3 | 63 | ||||
-rw-r--r-- | dtest.c | 99 | ||||
-rw-r--r-- | encode_rs.c | 52 | ||||
-rw-r--r-- | encode_rs.h | 58 | ||||
-rw-r--r-- | encode_rs_8.c | 109 | ||||
-rw-r--r-- | encode_rs_av.c | 61 | ||||
-rw-r--r-- | encode_rs_ccsds.c | 24 | ||||
-rw-r--r-- | encode_rs_char.c | 15 | ||||
-rw-r--r-- | encode_rs_int.c | 15 | ||||
-rw-r--r-- | exercise.c | 122 | ||||
-rw-r--r-- | fec.c | 66 | ||||
-rw-r--r-- | fec.h | 347 | ||||
-rw-r--r-- | fixed.h | 33 | ||||
-rw-r--r-- | gen_ccsds.c | 39 | ||||
-rw-r--r-- | gen_ccsds_tal.c | 53 | ||||
-rw-r--r-- | init_rs.c | 39 | ||||
-rw-r--r-- | init_rs.h | 106 | ||||
-rw-r--r-- | init_rs_char.c | 35 | ||||
-rw-r--r-- | init_rs_int.c | 35 | ||||
-rw-r--r-- | int.h | 22 | ||||
-rw-r--r-- | lesser.txt | 504 | ||||
-rw-r--r-- | makefile.in | 242 | ||||
-rw-r--r-- | mmxbfly27.s | 148 | ||||
-rw-r--r-- | mmxbfly29.s | 161 | ||||
-rw-r--r-- | peak_mmx_assist.s | 70 | ||||
-rw-r--r-- | peak_sse2_assist.s | 51 | ||||
-rw-r--r-- | peak_sse_assist.s | 49 | ||||
-rw-r--r-- | peaktest.c | 38 | ||||
-rw-r--r-- | peakval.c | 39 | ||||
-rw-r--r-- | peakval_av.c | 61 | ||||
-rw-r--r-- | peakval_mmx.c | 34 | ||||
-rw-r--r-- | peakval_mmx_assist.s | 70 | ||||
-rw-r--r-- | peakval_port.c | 16 | ||||
-rw-r--r-- | peakval_sse.c | 35 | ||||
-rw-r--r-- | peakval_sse2.c | 34 | ||||
-rw-r--r-- | peakval_sse2_assist.s | 51 | ||||
-rw-r--r-- | peakval_sse_assist.s | 49 | ||||
-rw-r--r-- | rs-common.h | 26 | ||||
-rw-r--r-- | rs.3 | 198 | ||||
-rw-r--r-- | rs_speedtest.c | 54 | ||||
-rw-r--r-- | rstest.c | 296 | ||||
-rw-r--r-- | sim.c | 43 | ||||
-rw-r--r-- | simd-viterbi.3 | 247 | ||||
-rw-r--r-- | sqtest.c | 42 | ||||
-rw-r--r-- | sse2bfly27.s | 202 | ||||
-rw-r--r-- | sse2bfly29.s | 245 | ||||
-rw-r--r-- | ssebfly27.s | 205 | ||||
-rw-r--r-- | ssebfly29.s | 271 | ||||
-rw-r--r-- | sumsq.c | 40 | ||||
-rw-r--r-- | sumsq_av.c | 78 | ||||
-rw-r--r-- | sumsq_mmx.c | 35 | ||||
-rw-r--r-- | sumsq_mmx_assist.s | 83 | ||||
-rw-r--r-- | sumsq_port.c | 16 | ||||
-rw-r--r-- | sumsq_sse2.c | 33 | ||||
-rw-r--r-- | sumsq_sse2_assist.s | 49 | ||||
-rw-r--r-- | sumsq_test.c | 101 | ||||
-rw-r--r-- | viterbi27.c | 161 | ||||
-rw-r--r-- | viterbi27_av.c | 210 | ||||
-rw-r--r-- | viterbi27_mmx.c | 115 | ||||
-rw-r--r-- | viterbi27_port.c | 191 | ||||
-rw-r--r-- | viterbi27_sse.c | 113 | ||||
-rw-r--r-- | viterbi27_sse2.c | 180 | ||||
-rw-r--r-- | viterbi29.c | 152 | ||||
-rw-r--r-- | viterbi29_av.c | 190 | ||||
-rw-r--r-- | viterbi29_mmx.c | 118 | ||||
-rw-r--r-- | viterbi29_port.c | 166 | ||||
-rw-r--r-- | viterbi29_sse.c | 114 | ||||
-rw-r--r-- | viterbi29_sse2.c | 119 | ||||
-rw-r--r-- | viterbi39.c | 153 | ||||
-rw-r--r-- | viterbi39_av.c | 251 | ||||
-rw-r--r-- | viterbi39_mmx.c | 185 | ||||
-rw-r--r-- | viterbi39_port.c | 168 | ||||
-rw-r--r-- | viterbi39_sse.c | 201 | ||||
-rw-r--r-- | viterbi39_sse2.c | 200 | ||||
-rw-r--r-- | viterbi615.c | 155 | ||||
-rw-r--r-- | viterbi615_av.c | 257 | ||||
-rw-r--r-- | viterbi615_mmx.c | 183 | ||||
-rw-r--r-- | viterbi615_port.c | 156 | ||||
-rw-r--r-- | viterbi615_sse.c | 201 | ||||
-rw-r--r-- | viterbi615_sse2.c | 204 | ||||
-rw-r--r-- | vtest27.c | 184 | ||||
-rw-r--r-- | vtest29.c | 185 | ||||
-rw-r--r-- | vtest39.c | 186 | ||||
-rw-r--r-- | vtest615.c | 191 |
107 files changed, 11792 insertions, 4 deletions
@@ -0,0 +1,39 @@ +INSTALLATION INSTRUCTIONS + +To build and install the libfec libraries, simply say + +./configure +make +make test (optional) +make install (as root) + +By default, "make install" puts the libfec libraries in +/usr/local/lib, the include files in /usr/local/include, and the +manual page in /usr/local/man. + +You may have an old version of the GNU assembler that cannot handle +the relatively new SSE2 mnemonics. Update your version of the GNU +"binutils" package. + +You may obtain the latest binutils package through your normal +distribution channels or from: + +http://sources.redhat.com/binutils/ + +TESTING THE FEC LIBRARY + +After running the ./configure script, optional tests can be built and +run as follows: + +make test + +"make test" tests each routine, using the SIMD versions as +appropriate, verifying correct operation and estimating Viterbi +decoding speeds. These tests should always succeed unless something is +broken. + +28 Mar 2004 +Phil Karn, karn@ka9q.net + + + @@ -0,0 +1,120 @@ +COPYRIGHT + +This package is copyright 2006 by Phil Karn, KA9Q. It may be used +under the terms of the GNU Lesser General Public License (LGPL). See +the file "lesser.txt" in this package for license details. + +INTRODUCTION + +This package provides a set of functions that implement several +popular forward error correction (FEC) algorithms and several low-level routines +useful in modems implemented with digital signal processing (DSP). + +The following routines are provided: + +1. Viterbi decoders for the following convolutional codes: + +r=1/2 k=7 ("Voyager" code, now a widely used industry standard) +r=1/2 k=9 (Used on the IS-95 CDMA forward link) +r=1/6 k=15 ("Cassini" code, used by several NASA/JPL deep space missions) + +2. Reed-Solomon encoders and decoders for any user-specified code. + +3. Optimized encoder and decoder for the CCSDS-standard (255,223) +Reed-Solomon code, with and without the CCSDS-standard "dual basis" +symbol representation. + +4. 
Compute dot product between a 16-bit buffer and a set of 16-bit +coefficients. This is the basic DSP primitive for digital filtering +and correlation. + +5. Compute sum of squares of a buffer of 16-bit signed integers. This is +useful in DSP for finding the total energy in a signal. + +6. Find peak value in a buffer of 16-bit signed integers, useful for +scaling a signal to prevent overflow. + +SIMD SUPPORT + +This package automatically makes use of various SIMD (Single +Instruction stream, Multiple Data stream) instruction sets, when +available: MMX, SSE and SSE2 on the IA-32 (Intel) architecture, and +Altivec on the PowerPC G4 and G5 used by Power Macintoshes. + +"Altivec" is a Motorola trademark; Apple calls it "Velocity Engine", +and IBM calls it "VMX". Altivec is roughly comparable to SSE2 on the +IA-32. + +Many of the SIMD versions run more than an order of +magnitude faster than their portable C versions. The available SIMD +instruction sets, if any, are determined at run time and the proper +version of each routine is automatically selected. If no SIMD +instructions are available, the portable C version is invoked by +default. On targets other than IA-32 and PPC, only the portable C +version is built. + +The SIMD-assisted versions generally produce the same results as the C +versions, with a few minor exceptions. The Viterbi decoders in C have +a very slightly greater Eb/No performance due to their use of 32-bit +path metrics. On the other hand, the SIMD versions use the +"saturating" arithmetic available in these instructions to avoid the +integer wraparounds that can occur in C when argument ranges are not +properly constrained. This applies primarily to the "dotprod" (dot +product) function. + +The MMX (MultiMedia eXtensions) instruction set was introduced on +later Pentium CPUs; it is also implemented on the Pentium II and most +AMD CPUs starting with the K6. SSE (Streaming SIMD Extensions) was +introduced in the Pentium III; AMD calls it "3D Now! 
Professional". +Intel introduced SSE2 on the Pentium 4, and it has been picked up by +later AMD CPUs. SSE support implies MMX support, while SSE2 support +implies both SSE and MMX support. + +The latest IA-32 SIMD instruction set, SSE3 (also known as "Prescott +New Instructions") was introduced in early 2004 with the latest +("Prescott") revision of the Pentium 4. Relatively little was +introduced with SSE3, and this library currently makes no use of it. + +See the various manual pages for details on how to use the library +routines. + +Copyright 2006, Phil Karn, KA9Q +karn@ka9q.net +http://www.ka9q.net/ + +This software may be used under the terms of the GNU Lesser General +Public License (LGPL); see the file lesser.txt for details. + +Revision history: +Version 1.0 released 29 May 2001 + +Version 2.0 released 3 Dec 2001: +Restructured to add support for shared libraries. + +Version 2.0.1 released 8 Dec 2001: +Includes autoconf/configure script + +Version 2.0.2 released 4 Feb 2002: +Add SIMD version override options +Test for lack of SSE2 mnemonic support in 'as' +Build only selected version + +Version 2.0.3 released 6 Feb 2002: +Fix to parityb function in parity.h + +feclib version 1.0 released November 2003 +Merged SIMD-Viterbi, RS and DSP libraries +Changed SIMD Viterbi decoder to detect SSE2/SSE/MMX at runtime rather than build time + +feclib version 2.0 (unreleased) Mar 2004 +General speedups and cleanups +Switch from 4 to 8-bit input symbols on all Viterbi decoders +Support for Altivec on PowerPC +Support for k=15 r=1/6 Cassini/Mars Pathfinder/Mars Exploration Rover/STEREO code +Changed license to GNU Lesser General Public License (LGPL) + +feclib version 2.1 June 5 2006 +Added error checking, fixed alignment bug in SSE2 versions of Viterbi decoders causing segfaults + +feclib version 2.1.1 June 6 2006 +Fix test/benchmark time measurement on Linux diff --git a/README.md b/README.md deleted file mode 100644 index fdafed0..0000000 --- a/README.md +++ /dev/null 
@@ -1,4 +0,0 @@ -ka9q-fec -======== - -This is a fork of KA9Q's FEC library @@ -0,0 +1,5 @@ +typedef unsigned char data_t; +extern unsigned char Taltab[],Tal1tab[]; +#define NN 255 +#define NROOTS 32 + @@ -0,0 +1,24 @@ +/* Stuff specific to the 8-bit symbol version of the general purpose RS codecs + * + * Copyright 2003, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +typedef unsigned char data_t; + +#define MODNN(x) modnn(rs,x) + +#define MM (rs->mm) +#define NN (rs->nn) +#define ALPHA_TO (rs->alpha_to) +#define INDEX_OF (rs->index_of) +#define GENPOLY (rs->genpoly) +#define NROOTS (rs->nroots) +#define FCR (rs->fcr) +#define PRIM (rs->prim) +#define IPRIM (rs->iprim) +#define PAD (rs->pad) +#define A0 (NN) + + + + diff --git a/configure.in b/configure.in new file mode 100644 index 0000000..4e4110b --- /dev/null +++ b/configure.in @@ -0,0 +1,83 @@ +dnl Process this file with autoconf to produce a configure script. +AC_INIT(viterbi27.c) +AC_CONFIG_HEADER(config.h) +SO_NAME=3 +VERSION=3.0.0 +AC_SUBST(SO_NAME) +AC_SUBST(VERSION) + +dnl Checks for programs. +AC_PROG_CC +if test $GCC != "yes" +then + AC_MSG_ERROR([Need GNU C compiler]) +fi +dnl Checks for libraries. +AC_CHECK_LIB(c, malloc) + +dnl Checks for header files. 
+AC_CHECK_HEADERS(getopt.h stdio.h stdlib.h memory.h string.h) +if test "$ac_cv_header_stdio_h" != yes +then + AC_MSG_ERROR([Need stdio.h!]) +fi +if test "$ac_cv_header_stdlib_h" != yes +then + AC_MSG_ERROR([Need stdlib.h!]) +fi +if test "$ac_cv_header_memory_h" != yes +then + AC_MSG_ERROR([Need memory.h!]) +fi +if test "$ac_cv_header_string_h" != yes +then + AC_MSG_ERROR([Need string.h]) +fi +dnl AC_CHECK_HEADERS caches each result as ac_cv_header_NAME_h (yes or no); the tests above read those. +AC_CANONICAL_SYSTEM +case $target_cpu in +i386|i486|i586|i686) + ARCH_OPTION="-march=$target_cpu" + MLIBS="viterbi27_mmx.o mmxbfly27.o viterbi27_sse.o ssebfly27.o viterbi27_sse2.o sse2bfly27.o \ + viterbi29_mmx.o mmxbfly29.o viterbi29_sse.o ssebfly29.o viterbi29_sse2.o sse2bfly29.o \ + viterbi39_sse2.o viterbi39_sse.o viterbi39_mmx.o \ + viterbi615_mmx.o viterbi615_sse.o viterbi615_sse2.o \ + dotprod_mmx.o dotprod_mmx_assist.o \ + dotprod_sse2.o dotprod_sse2_assist.o \ + peakval_mmx.o peakval_mmx_assist.o \ + peakval_sse.o peakval_sse_assist.o \ + peakval_sse2.o peakval_sse2_assist.o \ + sumsq.o sumsq_port.o \ + sumsq_sse2.o sumsq_sse2_assist.o \ + sumsq_mmx.o sumsq_mmx_assist.o \ + cpu_features.o cpu_mode_x86.o" + ;; +powerpc*) + ARCH_OPTION="-fno-common -faltivec" + MLIBS="viterbi27_av.o viterbi29_av.o viterbi39_av.o viterbi615_av.o \ + encode_rs_av.o \ + dotprod_av.o sumsq_av.o peakval_av.o cpu_mode_ppc.o" + ;; +*) + MLIBS= +esac +case $target_os in +darwin*) + SH_LIB=libfec.dylib + REBIND="" + ;; +*) + SH_LIB=libfec.so + REBIND=ldconfig + ;; +esac +AC_SUBST(SH_LIB) +AC_SUBST(REBIND) +AC_SUBST(MLIBS) +AC_SUBST(ARCH_OPTION) + + +dnl Checks for library functions. +AC_CHECK_FUNCS(getopt_long memset memmove) + +AC_OUTPUT(makefile) diff --git a/cpu_features.s b/cpu_features.s new file mode 100644 index 0000000..ef4ba4e --- /dev/null +++ b/cpu_features.s @@ -0,0 +1,15 @@ +.text +.global cpu_features + .type cpu_features,@function +cpu_features: + pushl %ebx + pushl %ecx + pushl %edx + movl $1,%eax + cpuid + movl %edx,%eax + popl %edx + popl %ecx + popl %ebx + ret +
\ No newline at end of file diff --git a/cpu_mode_ppc.c b/cpu_mode_ppc.c new file mode 100644 index 0000000..0071558 --- /dev/null +++ b/cpu_mode_ppc.c @@ -0,0 +1,40 @@ +/* Determine CPU support for SIMD on Power PC + * Copyright 2004 Phil Karn, KA9Q + */ +#include <stdio.h> +#include "fec.h" +#ifdef __VEC__ +#include <sys/sysctl.h> +#endif + +/* Various SIMD instruction set names */ +char *Cpu_modes[] = {"Unknown","Portable C","x86 Multi Media Extensions (MMX)", + "x86 Streaming SIMD Extensions (SSE)", + "x86 Streaming SIMD Extensions 2 (SSE2)", + "PowerPC G4/G5 Altivec/Velocity Engine"}; + +enum cpu_mode Cpu_mode; + +void find_cpu_mode(void){ + + if(Cpu_mode != UNKNOWN) + return; + +#ifdef __VEC__ + { + /* Ask the OS if we have Altivec support */ + int selectors[2] = { CTL_HW, HW_VECTORUNIT }; + int hasVectorUnit = 0; + size_t length = sizeof(hasVectorUnit); + int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0); + if(0 == error && hasVectorUnit) + Cpu_mode = ALTIVEC; + else + Cpu_mode = PORT; + } +#else + Cpu_mode = PORT; +#endif + + fprintf(stderr,"SIMD CPU detect: %s\n",Cpu_modes[Cpu_mode]); +} diff --git a/cpu_mode_x86.c b/cpu_mode_x86.c new file mode 100644 index 0000000..322018e --- /dev/null +++ b/cpu_mode_x86.c @@ -0,0 +1,33 @@ +/* Determine CPU support for SIMD + * Copyright 2004 Phil Karn, KA9Q + */ +#include <stdio.h> +#include "fec.h" + +/* Various SIMD instruction set names */ +char *Cpu_modes[] = {"Unknown","Portable C","x86 Multi Media Extensions (MMX)", + "x86 Streaming SIMD Extensions (SSE)", + "x86 Streaming SIMD Extensions 2 (SSE2)", + "PowerPC G4/G5 Altivec/Velocity Engine"}; + +enum cpu_mode Cpu_mode; + +void find_cpu_mode(void){ + + int f; + if(Cpu_mode != UNKNOWN) + return; + + /* Figure out what kind of CPU we have */ + f = cpu_features(); + if(f & (1<<26)){ /* SSE2 is present */ + Cpu_mode = SSE2; + } else if(f & (1<<25)){ /* SSE is present */ + Cpu_mode = SSE; + } else if(f & (1<<23)){ /* MMX is present */ + Cpu_mode = MMX; 
+ } else { /* No SIMD at all */ + Cpu_mode = PORT; + } + fprintf(stderr,"SIMD CPU detect: %s\n",Cpu_modes[Cpu_mode]); +} diff --git a/decode_rs.c b/decode_rs.c new file mode 100644 index 0000000..d7f97b3 --- /dev/null +++ b/decode_rs.c @@ -0,0 +1,262 @@ +/* Reed-Solomon decoder + * Copyright 2002 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +#ifdef DEBUG +#include <stdio.h> +#endif + +#include <string.h> + +#define NULL ((void *)0) +#define min(a,b) ((a) < (b) ? (a) : (b)) + +#ifdef FIXED +#include "fixed.h" +#elif defined(BIGSYM) +#include "int.h" +#else +#include "char.h" +#endif + +int DECODE_RS( +#ifdef FIXED +data_t *data, int *eras_pos, int no_eras,int pad){ +#else +void *p,data_t *data, int *eras_pos, int no_eras){ + struct rs *rs = (struct rs *)p; +#endif + int deg_lambda, el, deg_omega; + int i, j, r,k; + data_t u,q,tmp,num1,num2,den,discr_r; + data_t lambda[NROOTS+1], s[NROOTS]; /* Err+Eras Locator poly + * and syndrome poly */ + data_t b[NROOTS+1], t[NROOTS+1], omega[NROOTS+1]; + data_t root[NROOTS], reg[NROOTS+1], loc[NROOTS]; + int syn_error, count; + +#ifdef FIXED + /* Check pad parameter for validity */ + if(pad < 0 || pad >= NN) + return -1; +#endif + + /* form the syndromes; i.e., evaluate data(x) at roots of g(x) */ + for(i=0;i<NROOTS;i++) + s[i] = data[0]; + + for(j=1;j<NN-PAD;j++){ + for(i=0;i<NROOTS;i++){ + if(s[i] == 0){ + s[i] = data[j]; + } else { + s[i] = data[j] ^ ALPHA_TO[MODNN(INDEX_OF[s[i]] + (FCR+i)*PRIM)]; + } + } + } + + /* Convert syndromes to index form, checking for nonzero condition */ + syn_error = 0; + for(i=0;i<NROOTS;i++){ + syn_error |= s[i]; + s[i] = INDEX_OF[s[i]]; + } + + if (!syn_error) { + /* if syndrome is zero, data[] is a codeword and there are no + * errors to correct. 
So return data[] unmodified + */ + count = 0; + goto finish; + } + memset(&lambda[1],0,NROOTS*sizeof(lambda[0])); + lambda[0] = 1; + + if (no_eras > 0) { + /* Init lambda to be the erasure locator polynomial */ + lambda[1] = ALPHA_TO[MODNN(PRIM*(NN-1-eras_pos[0]))]; + for (i = 1; i < no_eras; i++) { + u = MODNN(PRIM*(NN-1-eras_pos[i])); + for (j = i+1; j > 0; j--) { + tmp = INDEX_OF[lambda[j - 1]]; + if(tmp != A0) + lambda[j] ^= ALPHA_TO[MODNN(u + tmp)]; + } + } + +#if DEBUG >= 1 + /* Test code that verifies the erasure locator polynomial just constructed + Needed only for decoder debugging. */ + + /* find roots of the erasure location polynomial */ + for(i=1;i<=no_eras;i++) + reg[i] = INDEX_OF[lambda[i]]; + + count = 0; + for (i = 1,k=IPRIM-1; i <= NN; i++,k = MODNN(k+IPRIM)) { + q = 1; + for (j = 1; j <= no_eras; j++) + if (reg[j] != A0) { + reg[j] = MODNN(reg[j] + j); + q ^= ALPHA_TO[reg[j]]; + } + if (q != 0) + continue; + /* store root and error location number indices */ + root[count] = i; + loc[count] = k; + count++; + } + if (count != no_eras) { + printf("count = %d no_eras = %d\n lambda(x) is WRONG\n",count,no_eras); + count = -1; + goto finish; + } +#if DEBUG >= 2 + printf("\n Erasure positions as determined by roots of Eras Loc Poly:\n"); + for (i = 0; i < count; i++) + printf("%d ", loc[i]); + printf("\n"); +#endif +#endif + } + for(i=0;i<NROOTS+1;i++) + b[i] = INDEX_OF[lambda[i]]; + + /* + * Begin Berlekamp-Massey algorithm to determine error+erasure + * locator polynomial + */ + r = no_eras; + el = no_eras; + while (++r <= NROOTS) { /* r is the step number */ + /* Compute discrepancy at the r-th step in poly-form */ + discr_r = 0; + for (i = 0; i < r; i++){ + if ((lambda[i] != 0) && (s[r-i-1] != A0)) { + discr_r ^= ALPHA_TO[MODNN(INDEX_OF[lambda[i]] + s[r-i-1])]; + } + } + discr_r = INDEX_OF[discr_r]; /* Index form */ + if (discr_r == A0) { + /* 2 lines below: B(x) <-- x*B(x) */ + memmove(&b[1],b,NROOTS*sizeof(b[0])); + b[0] = A0; + } else { + /* 7 
lines below: T(x) <-- lambda(x) - discr_r*x*b(x) */ + t[0] = lambda[0]; + for (i = 0 ; i < NROOTS; i++) { + if(b[i] != A0) + t[i+1] = lambda[i+1] ^ ALPHA_TO[MODNN(discr_r + b[i])]; + else + t[i+1] = lambda[i+1]; + } + if (2 * el <= r + no_eras - 1) { + el = r + no_eras - el; + /* + * 2 lines below: B(x) <-- inv(discr_r) * + * lambda(x) + */ + for (i = 0; i <= NROOTS; i++) + b[i] = (lambda[i] == 0) ? A0 : MODNN(INDEX_OF[lambda[i]] - discr_r + NN); + } else { + /* 2 lines below: B(x) <-- x*B(x) */ + memmove(&b[1],b,NROOTS*sizeof(b[0])); + b[0] = A0; + } + memcpy(lambda,t,(NROOTS+1)*sizeof(t[0])); + } + } + + /* Convert lambda to index form and compute deg(lambda(x)) */ + deg_lambda = 0; + for(i=0;i<NROOTS+1;i++){ + lambda[i] = INDEX_OF[lambda[i]]; + if(lambda[i] != A0) + deg_lambda = i; + } + /* Find roots of the error+erasure locator polynomial by Chien search */ + memcpy(&reg[1],&lambda[1],NROOTS*sizeof(reg[0])); + count = 0; /* Number of roots of lambda(x) */ + for (i = 1,k=IPRIM-1; i <= NN; i++,k = MODNN(k+IPRIM)) { + q = 1; /* lambda[0] is always 0 */ + for (j = deg_lambda; j > 0; j--){ + if (reg[j] != A0) { + reg[j] = MODNN(reg[j] + j); + q ^= ALPHA_TO[reg[j]]; + } + } + if (q != 0) + continue; /* Not a root */ + /* store root (index-form) and error location number */ +#if DEBUG>=2 + printf("count %d root %d loc %d\n",count,i,k); +#endif + root[count] = i; + loc[count] = k; + /* If we've already found max possible roots, + * abort the search to save time + */ + if(++count == deg_lambda) + break; + } + if (deg_lambda != count) { + /* + * deg(lambda) unequal to number of roots => uncorrectable + * error detected + */ + count = -1; + goto finish; + } + /* + * Compute err+eras evaluator poly omega(x) = s(x)*lambda(x) (modulo + * x**NROOTS). in index form. Also find deg(omega). 
+ */ + deg_omega = deg_lambda-1; + for (i = 0; i <= deg_omega;i++){ + tmp = 0; + for(j=i;j >= 0; j--){ + if ((s[i - j] != A0) && (lambda[j] != A0)) + tmp ^= ALPHA_TO[MODNN(s[i - j] + lambda[j])]; + } + omega[i] = INDEX_OF[tmp]; + } + + /* + * Compute error values in poly-form. num1 = omega(inv(X(l))), num2 = + * inv(X(l))**(FCR-1) and den = lambda_pr(inv(X(l))) all in poly-form + */ + for (j = count-1; j >=0; j--) { + num1 = 0; + for (i = deg_omega; i >= 0; i--) { + if (omega[i] != A0) + num1 ^= ALPHA_TO[MODNN(omega[i] + i * root[j])]; + } + num2 = ALPHA_TO[MODNN(root[j] * (FCR - 1) + NN)]; + den = 0; + + /* lambda[i+1] for i even is the formal derivative lambda_pr of lambda[i] */ + for (i = min(deg_lambda,NROOTS-1) & ~1; i >= 0; i -=2) { + if(lambda[i+1] != A0) + den ^= ALPHA_TO[MODNN(lambda[i+1] + i * root[j])]; + } +#if DEBUG >= 1 + if (den == 0) { + printf("\n ERROR: denominator = 0\n"); + count = -1; + goto finish; + } +#endif + /* Apply error to data */ + if (num1 != 0 && loc[j] >= PAD) { + data[loc[j]-PAD] ^= ALPHA_TO[MODNN(INDEX_OF[num1] + INDEX_OF[num2] + NN - INDEX_OF[den])]; + } + } + finish: + if(eras_pos != NULL){ + for(i=0;i<count;i++) + eras_pos[i] = loc[i]; + } + return count; +} diff --git a/decode_rs.h b/decode_rs.h new file mode 100644 index 0000000..c165cf3 --- /dev/null +++ b/decode_rs.h @@ -0,0 +1,298 @@ +/* The guts of the Reed-Solomon decoder, meant to be #included + * into a function body with the following typedefs, macros and variables supplied + * according to the code parameters: + + * data_t - a typedef for the data symbol + * data_t data[] - array of NN data and parity symbols to be corrected in place + * retval - an integer lvalue into which the decoder's return code is written + * NROOTS - the number of roots in the RS code generator polynomial, + * which is the same as the number of parity symbols in a block. + Integer variable or literal. + * NN - the total number of symbols in a RS block. Integer variable or literal. 
+ * PAD - the number of pad symbols in a block. Integer variable or literal. + * ALPHA_TO - The address of an array of NN elements to convert Galois field + * elements in index (log) form to polynomial form. Read only. + * INDEX_OF - The address of an array of NN elements to convert Galois field + * elements in polynomial form to index (log) form. Read only. + * MODNN - a function to reduce its argument modulo NN. May be inline or a macro. + * FCR - An integer literal or variable specifying the first consecutive root of the + * Reed-Solomon generator polynomial. Integer variable or literal. + * PRIM - The primitive root of the generator poly. Integer variable or literal. + * DEBUG - If set to 1 or more, do various internal consistency checking. Leave this + * undefined for production code + + * The memset(), memmove(), and memcpy() functions are used. The appropriate header + * file declaring these functions (usually <string.h>) must be included by the calling + * program. + */ + + +#if !defined(NROOTS) +#error "NROOTS not defined" +#endif + +#if !defined(NN) +#error "NN not defined" +#endif + +#if !defined(PAD) +#error "PAD not defined" +#endif + +#if !defined(ALPHA_TO) +#error "ALPHA_TO not defined" +#endif + +#if !defined(INDEX_OF) +#error "INDEX_OF not defined" +#endif + +#if !defined(MODNN) +#error "MODNN not defined" +#endif + +#if !defined(FCR) +#error "FCR not defined" +#endif + +#if !defined(PRIM) +#error "PRIM not defined" +#endif + +#if !defined(NULL) +#define NULL ((void *)0) +#endif + +#undef MIN +#define MIN(a,b) ((a) < (b) ? 
(a) : (b)) +#undef A0 +#define A0 (NN) + +{ + int deg_lambda, el, deg_omega; + int i, j, r,k; + data_t u,q,tmp,num1,num2,den,discr_r; + data_t lambda[NROOTS+1], s[NROOTS]; /* Err+Eras Locator poly + * and syndrome poly */ + data_t b[NROOTS+1], t[NROOTS+1], omega[NROOTS+1]; + data_t root[NROOTS], reg[NROOTS+1], loc[NROOTS]; + int syn_error, count; + + /* form the syndromes; i.e., evaluate data(x) at roots of g(x) */ + for(i=0;i<NROOTS;i++) + s[i] = data[0]; + + for(j=1;j<NN-PAD;j++){ + for(i=0;i<NROOTS;i++){ + if(s[i] == 0){ + s[i] = data[j]; + } else { + s[i] = data[j] ^ ALPHA_TO[MODNN(INDEX_OF[s[i]] + (FCR+i)*PRIM)]; + } + } + } + + /* Convert syndromes to index form, checking for nonzero condition */ + syn_error = 0; + for(i=0;i<NROOTS;i++){ + syn_error |= s[i]; + s[i] = INDEX_OF[s[i]]; + } + + if (!syn_error) { + /* if syndrome is zero, data[] is a codeword and there are no + * errors to correct. So return data[] unmodified + */ + count = 0; + goto finish; + } + memset(&lambda[1],0,NROOTS*sizeof(lambda[0])); + lambda[0] = 1; + + if (no_eras > 0) { + /* Init lambda to be the erasure locator polynomial */ + lambda[1] = ALPHA_TO[MODNN(PRIM*(NN-1-eras_pos[0]))]; + for (i = 1; i < no_eras; i++) { + u = MODNN(PRIM*(NN-1-eras_pos[i])); + for (j = i+1; j > 0; j--) { + tmp = INDEX_OF[lambda[j - 1]]; + if(tmp != A0) + lambda[j] ^= ALPHA_TO[MODNN(u + tmp)]; + } + } + +#if DEBUG >= 1 + /* Test code that verifies the erasure locator polynomial just constructed + Needed only for decoder debugging. 
*/ + + /* find roots of the erasure location polynomial */ + for(i=1;i<=no_eras;i++) + reg[i] = INDEX_OF[lambda[i]]; + + count = 0; + for (i = 1,k=IPRIM-1; i <= NN; i++,k = MODNN(k+IPRIM)) { + q = 1; + for (j = 1; j <= no_eras; j++) + if (reg[j] != A0) { + reg[j] = MODNN(reg[j] + j); + q ^= ALPHA_TO[reg[j]]; + } + if (q != 0) + continue; + /* store root and error location number indices */ + root[count] = i; + loc[count] = k; + count++; + } + if (count != no_eras) { + printf("count = %d no_eras = %d\n lambda(x) is WRONG\n",count,no_eras); + count = -1; + goto finish; + } +#if DEBUG >= 2 + printf("\n Erasure positions as determined by roots of Eras Loc Poly:\n"); + for (i = 0; i < count; i++) + printf("%d ", loc[i]); + printf("\n"); +#endif +#endif + } + for(i=0;i<NROOTS+1;i++) + b[i] = INDEX_OF[lambda[i]]; + + /* + * Begin Berlekamp-Massey algorithm to determine error+erasure + * locator polynomial + */ + r = no_eras; + el = no_eras; + while (++r <= NROOTS) { /* r is the step number */ + /* Compute discrepancy at the r-th step in poly-form */ + discr_r = 0; + for (i = 0; i < r; i++){ + if ((lambda[i] != 0) && (s[r-i-1] != A0)) { + discr_r ^= ALPHA_TO[MODNN(INDEX_OF[lambda[i]] + s[r-i-1])]; + } + } + discr_r = INDEX_OF[discr_r]; /* Index form */ + if (discr_r == A0) { + /* 2 lines below: B(x) <-- x*B(x) */ + memmove(&b[1],b,NROOTS*sizeof(b[0])); + b[0] = A0; + } else { + /* 7 lines below: T(x) <-- lambda(x) - discr_r*x*b(x) */ + t[0] = lambda[0]; + for (i = 0 ; i < NROOTS; i++) { + if(b[i] != A0) + t[i+1] = lambda[i+1] ^ ALPHA_TO[MODNN(discr_r + b[i])]; + else + t[i+1] = lambda[i+1]; + } + if (2 * el <= r + no_eras - 1) { + el = r + no_eras - el; + /* + * 2 lines below: B(x) <-- inv(discr_r) * + * lambda(x) + */ + for (i = 0; i <= NROOTS; i++) + b[i] = (lambda[i] == 0) ? 
A0 : MODNN(INDEX_OF[lambda[i]] - discr_r + NN); + } else { + /* 2 lines below: B(x) <-- x*B(x) */ + memmove(&b[1],b,NROOTS*sizeof(b[0])); + b[0] = A0; + } + memcpy(lambda,t,(NROOTS+1)*sizeof(t[0])); + } + } + + /* Convert lambda to index form and compute deg(lambda(x)) */ + deg_lambda = 0; + for(i=0;i<NROOTS+1;i++){ + lambda[i] = INDEX_OF[lambda[i]]; + if(lambda[i] != A0) + deg_lambda = i; + } + /* Find roots of the error+erasure locator polynomial by Chien search */ + memcpy(&reg[1],&lambda[1],NROOTS*sizeof(reg[0])); + count = 0; /* Number of roots of lambda(x) */ + for (i = 1,k=IPRIM-1; i <= NN; i++,k = MODNN(k+IPRIM)) { + q = 1; /* lambda[0] is always 0 */ + for (j = deg_lambda; j > 0; j--){ + if (reg[j] != A0) { + reg[j] = MODNN(reg[j] + j); + q ^= ALPHA_TO[reg[j]]; + } + } + if (q != 0) + continue; /* Not a root */ + /* store root (index-form) and error location number */ +#if DEBUG>=2 + printf("count %d root %d loc %d\n",count,i,k); +#endif + root[count] = i; + loc[count] = k; + /* If we've already found max possible roots, + * abort the search to save time + */ + if(++count == deg_lambda) + break; + } + if (deg_lambda != count) { + /* + * deg(lambda) unequal to number of roots => uncorrectable + * error detected + */ + count = -1; + goto finish; + } + /* + * Compute err+eras evaluator poly omega(x) = s(x)*lambda(x) (modulo + * x**NROOTS). in index form. Also find deg(omega). + */ + deg_omega = deg_lambda-1; + for (i = 0; i <= deg_omega;i++){ + tmp = 0; + for(j=i;j >= 0; j--){ + if ((s[i - j] != A0) && (lambda[j] != A0)) + tmp ^= ALPHA_TO[MODNN(s[i - j] + lambda[j])]; + } + omega[i] = INDEX_OF[tmp]; + } + + /* + * Compute error values in poly-form. 
num1 = omega(inv(X(l))), num2 = + * inv(X(l))**(FCR-1) and den = lambda_pr(inv(X(l))) all in poly-form + */ + for (j = count-1; j >=0; j--) { + num1 = 0; + for (i = deg_omega; i >= 0; i--) { + if (omega[i] != A0) + num1 ^= ALPHA_TO[MODNN(omega[i] + i * root[j])]; + } + num2 = ALPHA_TO[MODNN(root[j] * (FCR - 1) + NN)]; + den = 0; + + /* lambda[i+1] for i even is the formal derivative lambda_pr of lambda[i] */ + for (i = MIN(deg_lambda,NROOTS-1) & ~1; i >= 0; i -=2) { + if(lambda[i+1] != A0) + den ^= ALPHA_TO[MODNN(lambda[i+1] + i * root[j])]; + } +#if DEBUG >= 1 + if (den == 0) { + printf("\n ERROR: denominator = 0\n"); + count = -1; + goto finish; + } +#endif + /* Apply error to data */ + if (num1 != 0 && loc[j] >= PAD) { + data[loc[j]-PAD] ^= ALPHA_TO[MODNN(INDEX_OF[num1] + INDEX_OF[num2] + NN - INDEX_OF[den])]; + } + } + finish: + if(eras_pos != NULL){ + for(i=0;i<count;i++) + eras_pos[i] = loc[i]; + } + retval = count; +} diff --git a/decode_rs_8.c b/decode_rs_8.c new file mode 100644 index 0000000..995b0d9 --- /dev/null +++ b/decode_rs_8.c @@ -0,0 +1,24 @@ +/* General purpose Reed-Solomon decoder for 8-bit symbols or less + * Copyright 2003 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +#ifdef DEBUG +#include <stdio.h> +#endif + +#include <string.h> + +#include "fixed.h" + +int decode_rs_8(data_t *data, int *eras_pos, int no_eras, int pad){ + int retval; + + if(pad < 0 || pad > 222){ + return -1; + } + +#include "decode_rs.h" + + return retval; +} diff --git a/decode_rs_ccsds.c b/decode_rs_ccsds.c new file mode 100644 index 0000000..0e246b4 --- /dev/null +++ b/decode_rs_ccsds.c @@ -0,0 +1,26 @@ +/* This function wraps around the fixed 8-bit decoder, performing the + * basis transformations necessary to meet the CCSDS standard + * + * Copyright 2002, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include "ccsds.h" +#include "fec.h" + +int 
decode_rs_ccsds(data_t *data,int *eras_pos,int no_eras,int pad){ + int i,r; + data_t cdata[NN]; + if(pad < 0 || pad >= NN) return -1; /* bad pad: a negative value would run the copy loops past cdata[NN] */ + /* Convert data from dual basis to conventional */ + for(i=0;i<NN-pad;i++) + cdata[i] = Tal1tab[data[i]]; + + r = decode_rs_8(cdata,eras_pos,no_eras,pad); + + if(r > 0){ + /* Convert from conventional to dual basis */ + for(i=0;i<NN-pad;i++) + data[i] = Taltab[cdata[i]]; + } + return r; +} diff --git a/decode_rs_char.c b/decode_rs_char.c new file mode 100644 index 0000000..7105233 --- /dev/null +++ b/decode_rs_char.c @@ -0,0 +1,22 @@ +/* General purpose Reed-Solomon decoder for 8-bit symbols or less + * Copyright 2003 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +#ifdef DEBUG +#include <stdio.h> +#endif + +#include <string.h> + +#include "char.h" +#include "rs-common.h" + +int decode_rs_char(void *p, data_t *data, int *eras_pos, int no_eras){ + int retval; + struct rs *rs = (struct rs *)p; + +#include "decode_rs.h" + + return retval; +} diff --git a/decode_rs_int.c b/decode_rs_int.c new file mode 100644 index 0000000..1ef1a1f --- /dev/null +++ b/decode_rs_int.c @@ -0,0 +1,22 @@ +/* General purpose Reed-Solomon decoder + * Copyright 2003 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +#ifdef DEBUG +#include <stdio.h> +#endif + +#include <string.h> + +#include "int.h" +#include "rs-common.h" + +int decode_rs_int(void *p, data_t *data, int *eras_pos, int no_eras){ + int retval; + struct rs *rs = (struct rs *)p; + +#include "decode_rs.h" + + return retval; +} diff --git a/dotprod.c b/dotprod.c new file mode 100644 index 0000000..b3be913 --- /dev/null +++ b/dotprod.c @@ -0,0 +1,94 @@ +/* 16-bit signed integer dot product + * Switch to appropriate versions + * Copyright 2004 Phil Karn + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <stdlib.h> +#include "fec.h" + +void *initdp_port(signed short coeffs[],int len); 
+long dotprod_port(void *p,signed short *b); +void freedp_port(void *p); + +#ifdef __i386__ +void *initdp_mmx(signed short coeffs[],int len); +void *initdp_sse2(signed short coeffs[],int len); +long dotprod_mmx(void *p,signed short *b); +long dotprod_sse2(void *p,signed short *b); +void freedp_mmx(void *p); +void freedp_sse2(void *p); +#endif + +#ifdef __VEC__ +void *initdp_av(signed short coeffs[],int len); +long dotprod_av(void *p,signed short *b); +void freedp_av(void *p); +#endif + +/* Create and return a descriptor for use with the dot product function */ +void *initdp(signed short coeffs[],int len){ + find_cpu_mode(); + + switch(Cpu_mode){ + case PORT: + default: + return initdp_port(coeffs,len); +#ifdef __i386__ + case MMX: + case SSE: + return initdp_mmx(coeffs,len); + case SSE2: + return initdp_sse2(coeffs,len); +#endif + +#ifdef __VEC__ + case ALTIVEC: + return initdp_av(coeffs,len); +#endif + } +} + + +/* Free a dot product descriptor created earlier */ +void freedp(void *p){ + switch(Cpu_mode){ + case PORT: + default: +#ifdef __i386__ + case MMX: + case SSE: + return freedp_mmx(p); + case SSE2: + return freedp_sse2(p); +#endif +#ifdef __VEC__ + case ALTIVEC: + return freedp_av(p); +#endif + } +} + +/* Compute a dot product given a descriptor and an input array + * The length is taken from the descriptor + */ +long dotprod(void *p,signed short a[]){ + switch(Cpu_mode){ + case PORT: + default: + return dotprod_port(p,a); +#ifdef __i386__ + case MMX: + case SSE: + return dotprod_mmx(p,a); + case SSE2: + return dotprod_sse2(p,a); +#endif + +#ifdef __VEC__ + case ALTIVEC: + return dotprod_av(p,a); +#endif + } +} + + diff --git a/dotprod.h b/dotprod.h new file mode 100644 index 0000000..6b62b70 --- /dev/null +++ b/dotprod.h @@ -0,0 +1,15 @@ +/* Internal definitions for dotproduct function */ + +struct dotprod { + int len; /* Number of coefficients */ + + /* On a MMX or SSE machine, these hold 4 copies of the coefficients, + * preshifted by 0,1,2,3 words to 
meet all possible input data + * alignments (see Intel ap559 on MMX dot products). + * + * SSE2 is similar, but with 8 words at a time + * + * On a non-MMX machine, only one copy is present + */ + signed short *coeffs[8]; +}; diff --git a/dotprod_av.c b/dotprod_av.c new file mode 100644 index 0000000..1f70471 --- /dev/null +++ b/dotprod_av.c @@ -0,0 +1,93 @@ +/* 16-bit signed integer dot product + * Altivec-assisted version + * Copyright 2004 Phil Karn + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <stdlib.h> +#include "fec.h" + +struct dotprod { + int len; /* Number of coefficients */ + + /* On an Altivec machine, these hold 8 copies of the coefficients, + * preshifted by 0,1,..7 words to meet all possible input data + */ + signed short *coeffs[8]; +}; + +/* Create and return a descriptor for use with the dot product function */ +void *initdp_av(signed short coeffs[],int len){ + struct dotprod *dp; + int i,j; + + if(len == 0) + return NULL; + + dp = (struct dotprod *)calloc(1,sizeof(struct dotprod)); + dp->len = len; + + /* Make 8 copies of coefficients, one for each data alignment, + * each aligned to 16-byte boundary + */ + for(i=0;i<8;i++){ + dp->coeffs[i] = calloc(1+(len+i-1)/8,sizeof(vector signed short)); + for(j=0;j<len;j++) + dp->coeffs[i][j+i] = coeffs[j]; + } + return (void *)dp; +} + + +/* Free a dot product descriptor created earlier */ +void freedp_av(void *p){ + struct dotprod *dp = (struct dotprod *)p; + int i; + + for(i=0;i<8;i++) + if(dp->coeffs[i] != NULL) + free(dp->coeffs[i]); + free(dp); +} + +/* Compute a dot product given a descriptor and an input array + * The length is taken from the descriptor + */ +long dotprod_av(void *p,signed short a[]){ + struct dotprod *dp = (struct dotprod *)p; + int al; + vector signed short *ar,*d; + vector signed int sums0,sums1,sums2,sums3; + union { vector signed int v; signed int w[4];} s; + int nblocks; + + /* round ar down to beginning of 16-byte block 
containing 0th element of + * input buffer. Then set d to one of 8 sets of shifted coefficients + */ + ar = (vector signed short *)((int)a & ~15); + al = ((int)a & 15)/sizeof(signed short); + d = (vector signed short *)dp->coeffs[al]; + + nblocks = (dp->len+al-1)/8+1; + + /* Sum into four vectors each holding four 32-bit partial sums */ + sums3 = sums2 = sums1 = sums0 = (vector signed int)(0); + while(nblocks >= 4){ + sums0 = vec_msums(ar[nblocks-1],d[nblocks-1],sums0); + sums1 = vec_msums(ar[nblocks-2],d[nblocks-2],sums1); + sums2 = vec_msums(ar[nblocks-3],d[nblocks-3],sums2); + sums3 = vec_msums(ar[nblocks-4],d[nblocks-4],sums3); + nblocks -= 4; + } + sums0 = vec_adds(sums0,sums1); + sums2 = vec_adds(sums2,sums3); + sums0 = vec_adds(sums0,sums2); + while(nblocks-- > 0){ + sums0 = vec_msums(ar[nblocks],d[nblocks],sums0); + } + /* Sum 4 partial sums into final result */ + s.v = vec_sums(sums0,(vector signed int)(0)); + + return s.w[3]; +} + + diff --git a/dotprod_mmx.c b/dotprod_mmx.c new file mode 100644 index 0000000..c516afe --- /dev/null +++ b/dotprod_mmx.c @@ -0,0 +1,81 @@ +/* 16-bit signed integer dot product + * MMX assisted version; also for SSE + * + * Copyright 2004 Phil Karn + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <stdlib.h> +#include "fec.h" + +struct dotprod { + int len; /* Number of coefficients */ + + /* On a MMX or SSE machine, these hold 4 copies of the coefficients, + * preshifted by 0,1,2,3 words to meet all possible input data + * alignments (see Intel ap559 on MMX dot products). 
+ */ + signed short *coeffs[4]; +}; +long dotprod_mmx_assist(signed short *a,signed short *b,int cnt); + +/* Create and return a descriptor for use with the dot product function */ +void *initdp_mmx(signed short coeffs[],int len){ + struct dotprod *dp; + int i,j; + + + if(len == 0) + return NULL; + + dp = (struct dotprod *)calloc(1,sizeof(struct dotprod)); + dp->len = len; + + /* Make 4 copies of coefficients, one for each data alignment */ + for(i=0;i<4;i++){ + dp->coeffs[i] = (signed short *)calloc(1+(len+i-1)/4, + 4*sizeof(signed short)); + for(j=0;j<len;j++) + dp->coeffs[i][j+i] = coeffs[j]; + } + return (void *)dp; +} + + +/* Free a dot product descriptor created earlier */ +void freedp_mmx(void *p){ + struct dotprod *dp = (struct dotprod *)p; + int i; + + for(i=0;i<4;i++) + if(dp->coeffs[i] != NULL) + free(dp->coeffs[i]); + free(dp); +} + +/* Compute a dot product given a descriptor and an input array + * The length is taken from the descriptor + */ +long dotprod_mmx(void *p,signed short a[]){ + struct dotprod *dp = (struct dotprod *)p; + int al; + signed short *ar; + + /* Round input data address down to 8 byte boundary + * NB: depending on the alignment of a[], memory + * before a[] will be accessed. The contents don't matter since they'll + * be multiplied by zero coefficients. I can't conceive of any + * situation where this could cause a segfault since memory protection + * in the x86 machines is done on much larger boundaries + */ + ar = (signed short *)((int)a & ~7); + + /* Choose one of 4 sets of pre-shifted coefficients. 
al is both the + * index into dp->coeffs[] and the number of 0 words padded onto + * that coefficients array for alignment purposes + */ + al = a - ar; + + /* Call assembler routine to do the work, passing number of 4-word blocks */ + return dotprod_mmx_assist(ar,dp->coeffs[al],(dp->len+al-1)/4+1); +} + diff --git a/dotprod_mmx_assist.s b/dotprod_mmx_assist.s new file mode 100644 index 0000000..25deffd --- /dev/null +++ b/dotprod_mmx_assist.s @@ -0,0 +1,83 @@ +# SIMD MMX dot product +# Equivalent to the following C code: +# long dotprod(signed short *a,signed short *b,int cnt) +# { +# long sum = 0; +# cnt *= 4; +# while(cnt--) +# sum += *a++ * *b++; +# return sum; +# } +# a and b should also be 64-bit aligned, or speed will suffer greatly +# Copyright 1999, Phil Karn KA9Q +# May be used under the terms of the GNU Lesser General Public License (LGPL) + + .text + .global dotprod_mmx_assist + .type dotprod_mmx_assist,@function +dotprod_mmx_assist: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %edi + pushl %ecx + pushl %ebx + movl 8(%ebp),%esi # a + movl 12(%ebp),%edi # b + movl 16(%ebp),%ecx # cnt + pxor %mm0,%mm0 # clear running sum (in two 32-bit halves) + +# MMX dot product loop unrolled 4 times, crunching 16 terms per loop + .align 16 +.Loop1: subl $4,%ecx + jl .Loop1Done + + movq (%esi),%mm1 # mm1 = a[3],a[2],a[1],a[0] + pmaddwd (%edi),%mm1 # mm1 = b[3]*a[3]+b[2]*a[2],b[1]*a[1]+b[0]*a[0] + paddd %mm1,%mm0 + + movq 8(%esi),%mm1 + pmaddwd 8(%edi),%mm1 + paddd %mm1,%mm0 + + movq 16(%esi),%mm1 + pmaddwd 16(%edi),%mm1 + paddd %mm1,%mm0 + + movq 24(%esi),%mm1 + addl $32,%esi + pmaddwd 24(%edi),%mm1 + addl $32,%edi + paddd %mm1,%mm0 + + jmp .Loop1 +.Loop1Done: + + addl $4,%ecx + +# MMX dot product loop, not unrolled, crunching 4 terms per loop +# This could be redone as Duff's Device on the unrolled loop above +.Loop2: subl $1,%ecx + jl .Loop2Done + + movq (%esi),%mm1 + addl $8,%esi + pmaddwd (%edi),%mm1 + addl $8,%edi + paddd %mm1,%mm0 + jmp .Loop2 +.Loop2Done: + 
+ movd %mm0,%ebx # right-hand word to ebx + punpckhdq %mm0,%mm0 # left-hand word to right side of %mm0 + movd %mm0,%eax + addl %ebx,%eax # running sum now in %eax + emms # done with MMX + + popl %ebx + popl %ecx + popl %edi + popl %esi + movl %ebp,%esp + popl %ebp + ret diff --git a/dotprod_port.c b/dotprod_port.c new file mode 100644 index 0000000..ef635ec --- /dev/null +++ b/dotprod_port.c @@ -0,0 +1,58 @@ +/* 16-bit signed integer dot product + * Portable C version + * Copyright 2004 Phil Karn + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <stdlib.h> +#include "fec.h" + +struct dotprod { + int len; /* Number of coefficients */ + + signed short *coeffs; +}; + +/* Create and return a descriptor for use with the dot product function */ +void *initdp_port(signed short coeffs[],int len){ + struct dotprod *dp; + int j; + + if(len == 0) + return NULL; + + dp = (struct dotprod *)calloc(1,sizeof(struct dotprod)); + dp->len = len; + + /* Just one copy of the coefficients for the C version */ + dp->coeffs = (signed short *)calloc(len,sizeof(signed short)); + for(j=0;j<len;j++) + dp->coeffs[j] = coeffs[j]; + return (void *)dp; +} + + +/* Free a dot product descriptor created earlier */ +void freedp_port(void *p){ + struct dotprod *dp = (struct dotprod *)p; + + if(dp->coeffs != NULL) + free(dp->coeffs); + free(dp); +} + +/* Compute a dot product given a descriptor and an input array + * The length is taken from the descriptor + */ +long dotprod_port(void *p,signed short a[]){ + struct dotprod *dp = (struct dotprod *)p; + long corr; + int i; + + corr = 0; + for(i=0;i<dp->len;i++){ + corr += (long)a[i] * dp->coeffs[i]; + } + return corr; +} + + diff --git a/dotprod_sse2.c b/dotprod_sse2.c new file mode 100644 index 0000000..1fddd18 --- /dev/null +++ b/dotprod_sse2.c @@ -0,0 +1,72 @@ +/* 16-bit signed integer dot product + * SSE2 version + * Copyright 2004 Phil Karn + * May be used under the terms of the GNU Lesser General Public 
License (LGPL) + */ +#define _XOPEN_SOURCE 600 +#include <stdlib.h> +#include <memory.h> +#include "fec.h" + +struct dotprod { + int len; /* Number of coefficients */ + + /* On a SSE2 machine, these hold 8 copies of the coefficients, + * preshifted by 0,1,..7 words to meet all possible input data + * alignments (see Intel ap559 on MMX dot products). + */ + signed short *coeffs[8]; +}; + +long dotprod_sse2_assist(signed short *a,signed short *b,int cnt); + +/* Create and return a descriptor for use with the dot product function */ +void *initdp_sse2(signed short coeffs[],int len){ + struct dotprod *dp; + int i,j,blksize; + + if(len == 0) + return NULL; + + dp = (struct dotprod *)calloc(1,sizeof(struct dotprod)); + dp->len = len; + + /* Make 8 copies of coefficients, one for each data alignment, + * each aligned to 16-byte boundary + */ + for(i=0;i<8;i++){ + blksize = (1+(len+i-1)/8) * 8*sizeof(signed short); + posix_memalign((void **)&dp->coeffs[i],16,blksize); + memset(dp->coeffs[i],0,blksize); + for(j=0;j<len;j++) + dp->coeffs[i][j+i] = coeffs[j]; + } + return (void *)dp; +} + + +/* Free a dot product descriptor created earlier */ +void freedp_sse2(void *p){ + struct dotprod *dp = (struct dotprod *)p; + int i; + + for(i=0;i<8;i++) + if(dp->coeffs[i] != NULL) + free(dp->coeffs[i]); + free(dp); +} + +/* Compute a dot product given a descriptor and an input array + * The length is taken from the descriptor + */ +long dotprod_sse2(void *p,signed short a[]){ + struct dotprod *dp = (struct dotprod *)p; + int al; + signed short *ar; + + ar = (signed short *)((int)a & ~15); + al = a - ar; + + /* Call assembler routine to do the work, passing number of 8-word blocks */ + return dotprod_sse2_assist(ar,dp->coeffs[al],(dp->len+al-1)/8+1); +} diff --git a/dotprod_sse2_assist.s b/dotprod_sse2_assist.s new file mode 100644 index 0000000..47348fa --- /dev/null +++ b/dotprod_sse2_assist.s @@ -0,0 +1,85 @@ +# SIMD SSE2 dot product +# Equivalent to the following C code: +# long 
dotprod(signed short *a,signed short *b,int cnt) +# { +# long sum = 0; +# cnt *= 8; +# while(cnt--) +# sum += *a++ * *b++; +# return sum; +# } +# a and b must be 128-bit aligned +# Copyright 2001, Phil Karn KA9Q +# May be used under the terms of the GNU Lesser General Public License (LGPL) + + .text + .global dotprod_sse2_assist + .type dotprod_sse2_assist,@function +dotprod_sse2_assist: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %edi + pushl %ecx + pushl %ebx + movl 8(%ebp),%esi # a + movl 12(%ebp),%edi # b + movl 16(%ebp),%ecx # cnt + pxor %xmm0,%xmm0 # clear running sum (in four 32-bit parts) + +# SSE2 dot product loop unrolled 4 times, crunching 32 terms per loop + .align 16 +.Loop1: subl $4,%ecx + jl .Loop1Done + + movdqa (%esi),%xmm1 + pmaddwd (%edi),%xmm1 + paddd %xmm1,%xmm0 + + movdqa 16(%esi),%xmm1 + pmaddwd 16(%edi),%xmm1 + paddd %xmm1,%xmm0 + + movdqa 32(%esi),%xmm1 + pmaddwd 32(%edi),%xmm1 + paddd %xmm1,%xmm0 + + movdqa 48(%esi),%xmm1 + addl $64,%esi + pmaddwd 48(%edi),%xmm1 + addl $64,%edi + paddd %xmm1,%xmm0 + + jmp .Loop1 +.Loop1Done: + + addl $4,%ecx + +# SSE2 dot product loop, not unrolled, crunching 8 terms per loop +# This could be redone as Duff's Device on the unrolled loop above +.Loop2: subl $1,%ecx + jl .Loop2Done + + movdqa (%esi),%xmm1 + addl $16,%esi + pmaddwd (%edi),%xmm1 + addl $16,%edi + paddd %xmm1,%xmm0 + jmp .Loop2 +.Loop2Done: + + movdqa %xmm0,%xmm1 + psrldq $8,%xmm0 + paddd %xmm1,%xmm0 + movd %xmm0,%eax # right-hand word to eax + psrldq $4,%xmm0 + movd %xmm0,%ebx + addl %ebx,%eax + + popl %ebx + popl %ecx + popl %edi + popl %esi + movl %ebp,%esp + popl %ebp + ret @@ -0,0 +1,63 @@ +.TH DSP 3 +.SH NAME +initdp, freedp, dotprod, sumsq, peakval -\ SIMD-assisted +digital signal processing primitives +.SH SYNOPSIS +.nf +.ft +#include "fec.h" + +void *initdp(signed short *coeffs,int len); +long dotprod(void *p,signed short *a); +void freedp(void *p); + +unsigned long long sumsq(signed short *in,int cnt); + +int peakval(signed 
short *b,int cnt); + +.SH DESCRIPTION +These functions provide several basic primitives useful in digital +signal processing (DSP), especially in modems. The \fBinitdp\fR, +\fBdotprod\fR and \fBfreedp\fR functions implement an integer dot +product useful in correlation and filtering operations on signed +16-bit integers. \fBsumsq\fR computes the sum +of the squares of an array of signed 16-bit integers, +useful for measuring the energy of a signal. \fBpeakval\fR returns the +absolute value of the largest magnitude element in the input array, +useful for scaling a signal's amplitude. + +Each function uses IA32 or PowerPC Altivec instructions when +available; otherwise, a portable C version is used. + +.SH USAGE +To create a FIR filter or correlator, call \fBinitdp\fR with the +coefficients in \fBcoeffs\fR and their number in \fBlen\fR. This +creates the appropriate data structures and returns a handle. + +To compute a dot product, pass the handle from \fBinitdp\fR and the +input array to \fBdotprod\fR. No length field is needed as the number +of samples will be taken from the \fBlen\fR parameter originally given +to \fBinitdp\fR. There must be at least as many samples in the input +array as there were coefficients passed to \fBinitdp\fR. + +When the filter or correlator is no longer needed, the data structures +may be freed by passing the handle to \fBfreedp\fR. + +The user is responsible for scaling the inputs to \fBinitdp\fR and +\fBdotprod\fR, as the 32-bit result from \fBdotprod\fR will silently +wrap around in the event of overflow. + +To compute the sum of the squares of an array of signed 16-bit +integers, use \fBsumsq\fR. This returns a 64-bit sum. + +\fBpeakval\fR computes the absolute value of each 16-bit element in +the input array and returns the largest. + +.SH RETURN VALUES + +\fBinitdp\fR returns a handle that points to a control block, or NULL in +the event of an error (such as a memory allocation failure). 
\fBsumsq\fR +and \fBpeakval\fR have no error returns. + +.SH AUTHOR and COPYRIGHT +Phil Karn, KA9Q (karn@ka9q.net) @@ -0,0 +1,99 @@ +/* Test dot-product function */ + +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include <math.h> +#include "config.h" +#ifdef HAVE_GETOPT_H +#include <getopt.h> +#endif +#include "fec.h" + +#if HAVE_GETOPT_LONG +struct option Options[] = { + {"force-altivec",0,NULL,'a'}, + {"force-port",0,NULL,'p'}, + {"force-mmx",0,NULL,'m'}, + {"force-sse",0,NULL,'s'}, + {"force-sse2",0,NULL,'t'}, + {"trials",0,NULL,'n'}, + {NULL}, +}; +#endif + +int main(int argc,char *argv[]){ + short coeffs[512]; + short input[2048]; + int trials=1000,d; + int errors = 0; + +#if HAVE_GETOPT_LONG + while((d = getopt_long(argc,argv,"apmstn:",Options,NULL)) != EOF){ +#else + while((d = getopt(argc,argv,"apmstn:")) != EOF){ +#endif + switch(d){ + case 'a': + Cpu_mode = ALTIVEC; + break; + case 'p': + Cpu_mode = PORT; + break; + case 'm': + Cpu_mode = MMX; + break; + case 's': + Cpu_mode = SSE; + break; + case 't': + Cpu_mode = SSE2; + break; + case 'n': + trials = atoi(optarg); + break; + } + } + + while(trials--){ + long port_result; + long simd_result; + int ntaps; + int i; + int csum = 0; + int offset; + void *dp_simd,*dp_port; + + /* Generate set of coefficients + * limit sum of absolute values to 32767 to avoid overflow + */ + memset(coeffs,0,sizeof(coeffs)); + for(i=0;i<512;i++){ + double gv; + + gv = normal_rand(0.,100.); + if(csum + fabs(gv) > 32767) + break; + coeffs[i] = gv; + csum += fabs(gv); + } + ntaps = i; + + /* Compare results to portable C version for a bunch of random data buffers and offsets */ + dp_simd = initdp(coeffs,ntaps); + dp_port = initdp_port(coeffs,ntaps); + + for(i=0;i<2048;i++) + input[i] = random(); + + offset = random() & 511; + + simd_result = dotprod(dp_simd,input+offset); + port_result = dotprod_port(dp_port,input+offset); + if(simd_result != port_result){ + errors++; + } + } + printf("dtest: %d 
errors\n",errors); + exit(0); +} diff --git a/encode_rs.c b/encode_rs.c new file mode 100644 index 0000000..0649094 --- /dev/null +++ b/encode_rs.c @@ -0,0 +1,52 @@ +/* Reed-Solomon encoder + * Copyright 2002, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <string.h> + +#ifdef FIXED +#include "fixed.h" +#elif defined(BIGSYM) +#include "int.h" +#else +#include "char.h" +#endif + +void ENCODE_RS( +#ifdef FIXED +data_t *data, data_t *bb,int pad){ +#else +void *p,data_t *data, data_t *bb){ + struct rs *rs = (struct rs *)p; +#endif + int i, j; + data_t feedback; + +#ifdef FIXED + /* Check pad parameter for validity */ + if(pad < 0 || pad >= NN) + return; +#endif + + memset(bb,0,NROOTS*sizeof(data_t)); + + for(i=0;i<NN-NROOTS-PAD;i++){ + feedback = INDEX_OF[data[i] ^ bb[0]]; + if(feedback != A0){ /* feedback term is non-zero */ +#ifdef UNNORMALIZED + /* This line is unnecessary when GENPOLY[NROOTS] is unity, as it must + * always be for the polynomials constructed by init_rs() + */ + feedback = MODNN(NN - GENPOLY[NROOTS] + feedback); +#endif + for(j=1;j<NROOTS;j++) + bb[j] ^= ALPHA_TO[MODNN(feedback + GENPOLY[NROOTS-j])]; + } + /* Shift */ + memmove(&bb[0],&bb[1],sizeof(data_t)*(NROOTS-1)); + if(feedback != A0) + bb[NROOTS-1] = ALPHA_TO[MODNN(feedback + GENPOLY[0])]; + else + bb[NROOTS-1] = 0; + } +} diff --git a/encode_rs.h b/encode_rs.h new file mode 100644 index 0000000..2c157f9 --- /dev/null +++ b/encode_rs.h @@ -0,0 +1,58 @@ +/* The guts of the Reed-Solomon encoder, meant to be #included + * into a function body with the following typedefs, macros and variables supplied + * according to the code parameters: + + * data_t - a typedef for the data symbol + * data_t data[] - array of NN-NROOTS-PAD and type data_t to be encoded + * data_t parity[] - an array of NROOTS and type data_t to be written with parity symbols + * NROOTS - the number of roots in the RS code generator polynomial, + * which is the same 
as the number of parity symbols in a block. + Integer variable or literal. + * + * NN - the total number of symbols in a RS block. Integer variable or literal. + * PAD - the number of pad symbols in a block. Integer variable or literal. + * ALPHA_TO - The address of an array of NN elements to convert Galois field + * elements in index (log) form to polynomial form. Read only. + * INDEX_OF - The address of an array of NN elements to convert Galois field + * elements in polynomial form to index (log) form. Read only. + * MODNN - a function to reduce its argument modulo NN. May be inline or a macro. + * GENPOLY - an array of NROOTS+1 elements containing the generator polynomial in index form + + * The memset() and memmove() functions are used. The appropriate header + * file declaring these functions (usually <string.h>) must be included by the calling + * program. + + * Copyright 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + + +#undef A0 +#define A0 (NN) /* Special reserved value encoding zero in index form */ + +{ + int i, j; + data_t feedback; + + memset(parity,0,NROOTS*sizeof(data_t)); + + for(i=0;i<NN-NROOTS-PAD;i++){ + feedback = INDEX_OF[data[i] ^ parity[0]]; + if(feedback != A0){ /* feedback term is non-zero */ +#ifdef UNNORMALIZED + /* This line is unnecessary when GENPOLY[NROOTS] is unity, as it must + * always be for the polynomials constructed by init_rs() + */ + feedback = MODNN(NN - GENPOLY[NROOTS] + feedback); +#endif + for(j=1;j<NROOTS;j++) + parity[j] ^= ALPHA_TO[MODNN(feedback + GENPOLY[NROOTS-j])]; + } + /* Shift */ + memmove(&parity[0],&parity[1],sizeof(data_t)*(NROOTS-1)); + if(feedback != A0) + parity[NROOTS-1] = ALPHA_TO[MODNN(feedback + GENPOLY[0])]; + else + parity[NROOTS-1] = 0; + } +} diff --git a/encode_rs_8.c b/encode_rs_8.c new file mode 100644 index 0000000..5aaecca --- /dev/null +++ b/encode_rs_8.c @@ -0,0 +1,109 @@ +/* Reed-Solomon encoder + * Copyright 2004, Phil Karn, 
KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <string.h> +#include "fixed.h" +#ifdef __VEC__ +#include <sys/sysctl.h> +#endif + + +static enum {UNKNOWN=0,MMX,SSE,SSE2,ALTIVEC,PORT} cpu_mode; + +static void encode_rs_8_c(data_t *data, data_t *parity,int pad); +#if __vec__ +static void encode_rs_8_av(data_t *data, data_t *parity,int pad); +#endif +#if __i386__ +int cpu_features(void); +#endif + +void encode_rs_8(data_t *data, data_t *parity,int pad){ + if(cpu_mode == UNKNOWN){ +#ifdef __i386__ + int f; + /* Figure out what kind of CPU we have */ + f = cpu_features(); + if(f & (1<<26)){ /* SSE2 is present */ + cpu_mode = SSE2; + } else if(f & (1<<25)){ /* SSE is present */ + cpu_mode = SSE; + } else if(f & (1<<23)){ /* MMX is present */ + cpu_mode = MMX; + } else { /* No SIMD at all */ + cpu_mode = PORT; + } +#elif __VEC__ + /* Ask the OS if we have Altivec support */ + int selectors[2] = { CTL_HW, HW_VECTORUNIT }; + int hasVectorUnit = 0; + size_t length = sizeof(hasVectorUnit); + int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0); + if(0 == error && hasVectorUnit) + cpu_mode = ALTIVEC; + else + cpu_mode = PORT; +#else + cpu_mode = PORT; +#endif + } + switch(cpu_mode){ +#if __vec__ + case ALTIVEC: + encode_rs_8_av(data,parity,pad); + return; +#endif +#if __i386__ + case MMX: + case SSE: + case SSE2: +#endif + default: + encode_rs_8_c(data,parity,pad); + return; + } +} + +#if __vec__ /* PowerPC G4/G5 Altivec instructions are available */ + +static vector unsigned char reverse = (vector unsigned char)(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1); +static vector unsigned char shift_right = (vector unsigned char)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30); + +/* Lookup table for feedback multiplications + * These are the low half of the coefficients. 
Since the generator polynomial is + * palindromic, we form the other half by reversing this one + */ +extern static union { vector unsigned char v; unsigned char c[16]; } table[256]; + +static void encode_rs_8_av(data_t *data, data_t *parity,int pad){ + union { vector unsigned char v[2]; unsigned char c[32]; } shift_register; + int i; + + shift_register.v[0] = (vector unsigned char)(0); + shift_register.v[1] = (vector unsigned char)(0); + + for(i=0;i<NN-NROOTS-pad;i++){ + vector unsigned char feedback0,feedback1; + unsigned char f; + + f = data[i] ^ shift_register.c[31]; + feedback1 = table[f].v; + feedback0 = vec_perm(feedback1,feedback1,reverse); + + /* Shift right one byte */ + shift_register.v[1] = vec_perm(shift_register.v[0],shift_register.v[1],shift_right) ^ feedback1; + shift_register.v[0] = vec_sro(shift_register.v[0],(vector unsigned char)(8)) ^ feedback0; + shift_register.c[0] = f; + } + for(i=0;i<NROOTS;i++) + parity[NROOTS-i-1] = shift_register.c[i]; +} +#endif + +/* Portable C version */ +static void encode_rs_8_c(data_t *data, data_t *parity,int pad){ + +#include "encode_rs.h" + +} diff --git a/encode_rs_av.c b/encode_rs_av.c new file mode 100644 index 0000000..32e528f --- /dev/null +++ b/encode_rs_av.c @@ -0,0 +1,61 @@ +/* Fast Reed-Solomon encoder for (255,223) CCSDS code on PowerPC G4/G5 using Altivec instructions + * Copyright 2004, Phil Karn KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <stdio.h> +#include <string.h> +#include "fixed.h" + +/* Lookup table for feedback multiplications + * These are the low half of the coefficients. 
Since the generator polynomial is + * palindromic, we form it by reversing these on the fly + */ +static union { vector unsigned char v; unsigned char c[16]; } table[256]; + +static vector unsigned char reverse = (vector unsigned char)(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1); +static vector unsigned char shift_right = (vector unsigned char)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30); + +extern data_t CCSDS_alpha_to[]; +extern data_t CCSDS_index_of[]; +extern data_t CCSDS_poly[]; + +void rs_init_av(){ + int i,j; + + /* The PowerPC is big-endian, so the low-order byte of each vector contains the highest order term in the polynomial */ + for(j=0;j<16;j++){ + table[0].c[j] = 0; + for(i=1;i<256;i++){ + table[i].c[16-j-1] = CCSDS_alpha_to[MODNN(CCSDS_poly[j+1] + CCSDS_index_of[i])]; + } + } +#if 0 + for(i=0;i<256;i++){ + printf("table[%3d] = %3vu\n",i,table[i].v); + } +#endif +} + +void encode_rs_av(unsigned char *data,unsigned char *parity,int pad){ + union { vector unsigned char v[2]; unsigned char c[32]; } shift_register; + int i; + + shift_register.v[0] = (vector unsigned char)(0); + shift_register.v[1] = (vector unsigned char)(0); + + for(i=0;i<NN-NROOTS-pad;i++){ + vector unsigned char feedback0,feedback1; + unsigned char f; + + f = data[i] ^ shift_register.c[31]; + feedback1 = table[f].v; + feedback0 = vec_perm(feedback1,feedback1,reverse); + + /* Shift right one byte */ + shift_register.v[1] = vec_perm(shift_register.v[0],shift_register.v[1],shift_right) ^ feedback1; + shift_register.v[0] = vec_sro(shift_register.v[0],(vector unsigned char)(8)) ^ feedback0; + shift_register.c[0] = f; + } + for(i=0;i<NROOTS;i++) + parity[NROOTS-i-1] = shift_register.c[i]; +} diff --git a/encode_rs_ccsds.c b/encode_rs_ccsds.c new file mode 100644 index 0000000..5a2ec70 --- /dev/null +++ b/encode_rs_ccsds.c @@ -0,0 +1,24 @@ +/* This function wraps around the fixed 8-bit encoder, performing the + * basis transformations necessary to meet the CCSDS standard + * + * Copyright 
2002, Phil Karn, KA9Q + * fixed bug Aug 2007 + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include "ccsds.h" +#include "fec.h" + +void encode_rs_ccsds(data_t *data,data_t *parity,int pad){ + int i; + data_t cdata[NN-NROOTS]; + + /* Convert data from dual basis to conventional */ + for(i=0;i<NN-NROOTS-pad;i++) + cdata[i] = Tal1tab[data[i]]; + + encode_rs_8(cdata,parity,pad); + + /* Convert parity from conventional to dual basis */ + for(i=0;i<NROOTS;i++) + parity[i] = Taltab[parity[i]]; +} diff --git a/encode_rs_char.c b/encode_rs_char.c new file mode 100644 index 0000000..a9bf2b8 --- /dev/null +++ b/encode_rs_char.c @@ -0,0 +1,15 @@ +/* Reed-Solomon encoder + * Copyright 2002, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <string.h> + +#include "char.h" +#include "rs-common.h" + +void encode_rs_char(void *p,data_t *data, data_t *parity){ + struct rs *rs = (struct rs *)p; + +#include "encode_rs.h" + +} diff --git a/encode_rs_int.c b/encode_rs_int.c new file mode 100644 index 0000000..3c9ce78 --- /dev/null +++ b/encode_rs_int.c @@ -0,0 +1,15 @@ +/* Reed-Solomon encoder + * Copyright 2003, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <string.h> + +#include "int.h" +#include "rs-common.h" + +void encode_rs_int(void *p,data_t *data, data_t *parity){ + struct rs *rs = (struct rs *)p; + +#include "encode_rs.h" + +} diff --git a/exercise.c b/exercise.c new file mode 100644 index 0000000..8ae008c --- /dev/null +++ b/exercise.c @@ -0,0 +1,122 @@ +/* Exercise an RS codec a specified number of times using random + * data and error patterns + * + * Copyright 2002 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#define FLAG_ERASURE 1 /* Randomly flag 50% of errors as erasures */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#ifdef FIXED 
+#include "fixed.h" +#define EXERCISE exercise_8 +#elif defined(CCSDS) +#include "fixed.h" +#include "ccsds.h" +#define EXERCISE exercise_ccsds +#elif defined(BIGSYM) +#include "int.h" +#define EXERCISE exercise_int +#else +#include "char.h" +#define EXERCISE exercise_char +#endif + +#ifdef FIXED +#define PRINTPARM printf("(255,223):"); +#elif defined(CCSDS) +#define PRINTPARM printf("CCSDS (255,223):"); +#else +#define PRINTPARM printf("(%d,%d):",rs->nn,rs->nn-rs->nroots); +#endif + +/* Exercise the RS codec passed as an argument */ +int EXERCISE( +#if !defined(CCSDS) && !defined(FIXED) +void *p, +#endif +int trials){ +#if !defined(CCSDS) && !defined(FIXED) + struct rs *rs = (struct rs *)p; +#endif + data_t block[NN],tblock[NN]; + int i; + int errors; + int errlocs[NN]; + int derrlocs[NROOTS]; + int derrors; + int errval,errloc; + int erasures; + int decoder_errors = 0; + + while(trials-- != 0){ + /* Test up to the error correction capacity of the code */ + for(errors=0;errors <= NROOTS/2;errors++){ + + /* Load block with random data and encode */ + for(i=0;i<NN-NROOTS;i++) + block[i] = random() & NN; + +#if defined(CCSDS) || defined(FIXED) + ENCODE_RS(&block[0],&block[NN-NROOTS],0); +#else + ENCODE_RS(rs,&block[0],&block[NN-NROOTS]); +#endif + + /* Make temp copy, seed with errors */ + memcpy(tblock,block,sizeof(tblock)); + memset(errlocs,0,sizeof(errlocs)); + memset(derrlocs,0,sizeof(derrlocs)); + erasures=0; + for(i=0;i<errors;i++){ + do { + errval = random() & NN; + } while(errval == 0); /* Error value must be nonzero */ + + do { + errloc = random() % NN; + } while(errlocs[errloc] != 0); /* Must not choose the same location twice */ + + errlocs[errloc] = 1; + +#if FLAG_ERASURE + if(random() & 1) /* 50-50 chance */ + derrlocs[erasures++] = errloc; +#endif + tblock[errloc] ^= errval; + } + + /* Decode the errored block */ +#if defined(CCSDS) || defined(FIXED) + derrors = DECODE_RS(tblock,derrlocs,erasures,0); +#else + derrors = 
DECODE_RS(rs,tblock,derrlocs,erasures); +#endif + + if(derrors != errors){ + PRINTPARM + printf(" decoder says %d errors, true number is %d\n",derrors,errors); + decoder_errors++; + } + for(i=0;i<derrors;i++){ + if(errlocs[derrlocs[i]] == 0){ + PRINTPARM + printf(" decoder indicates error in location %d without error\n",derrlocs[i]); + decoder_errors++; + } + } + if(memcmp(tblock,block,sizeof(tblock)) != 0){ + PRINTPARM + printf(" uncorrected errors! output ^ input:"); + decoder_errors++; + for(i=0;i<NN;i++) + printf(" %02x",tblock[i] ^ block[i]); + printf("\n"); + } + } + } + return decoder_errors; +} @@ -0,0 +1,66 @@ +/* Utility routines for FEC support + * Copyright 2004, Phil Karn, KA9Q + */ + +#include <stdio.h> +#include "fec.h" + +unsigned char Partab[256]; +int P_init; + +/* Create 256-entry odd-parity lookup table + * Needed only on non-ia32 machines + */ +void partab_init(void){ + int i,cnt,ti; + + /* Initialize parity lookup table */ + for(i=0;i<256;i++){ + cnt = 0; + ti = i; + while(ti){ + if(ti & 1) + cnt++; + ti >>= 1; + } + Partab[i] = cnt & 1; + } + P_init=1; +} + +/* Lookup table giving count of 1 bits for integers 0-255 */ +int Bitcnt[] = { + 0, 1, 1, 2, 1, 2, 2, 3, + 1, 2, 2, 3, 2, 3, 3, 4, + 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, + 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, + 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, + 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, + 5, 6, 6, 7, 6, 
7, 7, 8, +}; + @@ -0,0 +1,347 @@ +/* User include file for libfec + * Copyright 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +#ifndef _FEC_H_ +#define _FEC_H_ + +/* r=1/2 k=7 convolutional encoder polynomials + * The NASA-DSN convention is to use V27POLYA inverted, then V27POLYB + * The CCSDS/NASA-GSFC convention is to use V27POLYB, then V27POLYA inverted + */ +#define V27POLYA 0x6d +#define V27POLYB 0x4f + +void *create_viterbi27(int len); +void set_viterbi27_polynomial(int polys[2]); +int init_viterbi27(void *vp,int starting_state); +int update_viterbi27_blk(void *vp,unsigned char sym[],int npairs); +int chainback_viterbi27(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi27(void *vp); + +#ifdef __VEC__ +void *create_viterbi27_av(int len); +void set_viterbi27_polynomial_av(int polys[2]); +int init_viterbi27_av(void *p,int starting_state); +int chainback_viterbi27_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi27_av(void *p); +int update_viterbi27_blk_av(void *p,unsigned char *syms,int nbits); +#endif + +#ifdef __i386__ +void *create_viterbi27_mmx(int len); +void set_viterbi27_polynomial_mmx(int polys[2]); +int init_viterbi27_mmx(void *p,int starting_state); +int chainback_viterbi27_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi27_mmx(void *p); +int update_viterbi27_blk_mmx(void *p,unsigned char *syms,int nbits); + +void *create_viterbi27_sse(int len); +void set_viterbi27_polynomial_sse(int polys[2]); +int init_viterbi27_sse(void *p,int starting_state); +int chainback_viterbi27_sse(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi27_sse(void *p); +int update_viterbi27_blk_sse(void *p,unsigned char *syms,int nbits); + +void *create_viterbi27_sse2(int len); +void set_viterbi27_polynomial_sse2(int polys[2]); +int 
init_viterbi27_sse2(void *p,int starting_state); +int chainback_viterbi27_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi27_sse2(void *p); +int update_viterbi27_blk_sse2(void *p,unsigned char *syms,int nbits); +#endif + +void *create_viterbi27_port(int len); +void set_viterbi27_polynomial_port(int polys[2]); +int init_viterbi27_port(void *p,int starting_state); +int chainback_viterbi27_port(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi27_port(void *p); +int update_viterbi27_blk_port(void *p,unsigned char *syms,int nbits); + +/* r=1/2 k=9 convolutional encoder polynomials */ +#define V29POLYA 0x1af +#define V29POLYB 0x11d + +void *create_viterbi29(int len); +void set_viterbi29_polynomial(int polys[2]); +int init_viterbi29(void *vp,int starting_state); +int update_viterbi29_blk(void *vp,unsigned char syms[],int nbits); +int chainback_viterbi29(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi29(void *vp); + +#ifdef __VEC__ +void *create_viterbi29_av(int len); +void set_viterbi29_polynomial_av(int polys[2]); +int init_viterbi29_av(void *p,int starting_state); +int chainback_viterbi29_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi29_av(void *p); +int update_viterbi29_blk_av(void *p,unsigned char *syms,int nbits); +#endif + +#ifdef __i386__ +void *create_viterbi29_mmx(int len); +void set_viterbi29_polynomial_mmx(int polys[2]); +int init_viterbi29_mmx(void *p,int starting_state); +int chainback_viterbi29_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi29_mmx(void *p); +int update_viterbi29_blk_mmx(void *p,unsigned char *syms,int nbits); + +void *create_viterbi29_sse(int len); +void set_viterbi29_polynomial_sse(int polys[2]); +int init_viterbi29_sse(void *p,int starting_state); +int chainback_viterbi29_sse(void *p,unsigned char *data,unsigned 
int nbits,unsigned int endstate); +void delete_viterbi29_sse(void *p); +int update_viterbi29_blk_sse(void *p,unsigned char *syms,int nbits); + +void *create_viterbi29_sse2(int len); +void set_viterbi29_polynomial_sse2(int polys[2]); +int init_viterbi29_sse2(void *p,int starting_state); +int chainback_viterbi29_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi29_sse2(void *p); +int update_viterbi29_blk_sse2(void *p,unsigned char *syms,int nbits); +#endif + +void *create_viterbi29_port(int len); +void set_viterbi29_polynomial_port(int polys[2]); +int init_viterbi29_port(void *p,int starting_state); +int chainback_viterbi29_port(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi29_port(void *p); +int update_viterbi29_blk_port(void *p,unsigned char *syms,int nbits); + +/* r=1/3 k=9 convolutional encoder polynomials */ +#define V39POLYA 0x1ed +#define V39POLYB 0x19b +#define V39POLYC 0x127 + +void *create_viterbi39(int len); +void set_viterbi39_polynomial(int polys[3]); +int init_viterbi39(void *vp,int starting_state); +int update_viterbi39_blk(void *vp,unsigned char syms[],int nbits); +int chainback_viterbi39(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi39(void *vp); + +#ifdef __VEC__ +void *create_viterbi39_av(int len); +void set_viterbi39_polynomial_av(int polys[3]); +int init_viterbi39_av(void *p,int starting_state); +int chainback_viterbi39_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi39_av(void *p); +int update_viterbi39_blk_av(void *p,unsigned char *syms,int nbits); +#endif + +#ifdef __i386__ +void *create_viterbi39_mmx(int len); +void set_viterbi39_polynomial_mmx(int polys[3]); +int init_viterbi39_mmx(void *p,int starting_state); +int chainback_viterbi39_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi39_mmx(void *p); +int 
update_viterbi39_blk_mmx(void *p,unsigned char *syms,int nbits); + +void *create_viterbi39_sse(int len); +void set_viterbi39_polynomial_sse(int polys[3]); +int init_viterbi39_sse(void *p,int starting_state); +int chainback_viterbi39_sse(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi39_sse(void *p); +int update_viterbi39_blk_sse(void *p,unsigned char *syms,int nbits); + +void *create_viterbi39_sse2(int len); +void set_viterbi39_polynomial_sse2(int polys[3]); +int init_viterbi39_sse2(void *p,int starting_state); +int chainback_viterbi39_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi39_sse2(void *p); +int update_viterbi39_blk_sse2(void *p,unsigned char *syms,int nbits); +#endif + +void *create_viterbi39_port(int len); +void set_viterbi39_polynomial_port(int polys[3]); +int init_viterbi39_port(void *p,int starting_state); +int chainback_viterbi39_port(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi39_port(void *p); +int update_viterbi39_blk_port(void *p,unsigned char *syms,int nbits); + + +/* r=1/6 k=15 Cassini convolutional encoder polynomials without symbol inversion + * dfree = 56 + * These bits may be left-right flipped from some textbook representations; + * here I have the bits entering the shift register from the right (low) end + * + * Some other spacecraft use the same code, but with the polynomials in a different order. + * E.g., Mars Pathfinder and STEREO swap POLYC and POLYD. All use alternate symbol inversion, + * so use set_viterbi615_polynomial() as appropriate. 
+ */ +#define V615POLYA 042631 +#define V615POLYB 047245 +#define V615POLYC 056507 +#define V615POLYD 073363 +#define V615POLYE 077267 +#define V615POLYF 064537 + +void *create_viterbi615(int len); +void set_viterbi615_polynomial(int polys[6]); +int init_viterbi615(void *vp,int starting_state); +int update_viterbi615_blk(void *vp,unsigned char *syms,int nbits); +int chainback_viterbi615(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi615(void *vp); + +#ifdef __VEC__ +void *create_viterbi615_av(int len); +void set_viterbi615_polynomial_av(int polys[6]); +int init_viterbi615_av(void *p,int starting_state); +int chainback_viterbi615_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi615_av(void *p); +int update_viterbi615_blk_av(void *p,unsigned char *syms,int nbits); +#endif + +#ifdef __i386__ +void *create_viterbi615_mmx(int len); +void set_viterbi615_polynomial_mmx(int polys[6]); +int init_viterbi615_mmx(void *p,int starting_state); +int chainback_viterbi615_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi615_mmx(void *p); +int update_viterbi615_blk_mmx(void *p,unsigned char *syms,int nbits); + +void *create_viterbi615_sse(int len); +void set_viterbi615_polynomial_sse(int polys[6]); +int init_viterbi615_sse(void *p,int starting_state); +int chainback_viterbi615_sse(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi615_sse(void *p); +int update_viterbi615_blk_sse(void *p,unsigned char *syms,int nbits); + +void *create_viterbi615_sse2(int len); +void set_viterbi615_polynomial_sse2(int polys[6]); +int init_viterbi615_sse2(void *p,int starting_state); +int chainback_viterbi615_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi615_sse2(void *p); +int update_viterbi615_blk_sse2(void *p,unsigned char *syms,int nbits); + +#endif + +void 
*create_viterbi615_port(int len); +void set_viterbi615_polynomial_port(int polys[6]); +int init_viterbi615_port(void *p,int starting_state); +int chainback_viterbi615_port(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi615_port(void *p); +int update_viterbi615_blk_port(void *p,unsigned char *syms,int nbits); + + +/* General purpose RS codec, 8-bit symbols */ +void encode_rs_char(void *rs,unsigned char *data,unsigned char *parity); +int decode_rs_char(void *rs,unsigned char *data,int *eras_pos, + int no_eras); +void *init_rs_char(int symsize,int gfpoly, + int fcr,int prim,int nroots, + int pad); +void free_rs_char(void *rs); + +/* General purpose RS codec, integer symbols */ +void encode_rs_int(void *rs,int *data,int *parity); +int decode_rs_int(void *rs,int *data,int *eras_pos,int no_eras); +void *init_rs_int(int symsize,int gfpoly,int fcr, + int prim,int nroots,int pad); +void free_rs_int(void *rs); + +/* CCSDS standard (255,223) RS codec with conventional (*not* dual-basis) + * symbol representation + */ +void encode_rs_8(unsigned char *data,unsigned char *parity,int pad); +int decode_rs_8(unsigned char *data,int *eras_pos,int no_eras,int pad); + +/* CCSDS standard (255,223) RS codec with dual-basis symbol representation */ +void encode_rs_ccsds(unsigned char *data,unsigned char *parity,int pad); +int decode_rs_ccsds(unsigned char *data,int *eras_pos,int no_eras,int pad); + +/* Tables to map from conventional->dual (Taltab) and + * dual->conventional (Tal1tab) bases + */ +extern unsigned char Taltab[],Tal1tab[]; + + +/* CPU SIMD instruction set available */ +extern enum cpu_mode {UNKNOWN=0,PORT,MMX,SSE,SSE2,ALTIVEC} Cpu_mode; +void find_cpu_mode(void); /* Call this once at startup to set Cpu_mode */ + +/* Determine parity of argument: 1 = odd, 0 = even */ +#ifdef __i386__ +static inline int parityb(unsigned char x){ + __asm__ __volatile__ ("test %1,%1;setpo %0" : "=g"(x) : "r" (x)); + return x; +} +#else +void 
partab_init(); + +static inline int parityb(unsigned char x){ + extern unsigned char Partab[256]; + extern int P_init; + if(!P_init){ + partab_init(); + } + return Partab[x]; +} +#endif + + +static inline int parity(int x){ + /* Fold down to one byte */ + x ^= (x >> 16); + x ^= (x >> 8); + return parityb(x); +} + +/* Useful utilities for simulation */ +double normal_rand(double mean, double std_dev); +unsigned char addnoise(int sym,double amp,double gain,double offset,int clip); + +extern int Bitcnt[]; + +/* Dot product functions */ +void *initdp(signed short coeffs[],int len); +void freedp(void *dp); +long dotprod(void *dp,signed short a[]); + +void *initdp_port(signed short coeffs[],int len); +void freedp_port(void *dp); +long dotprod_port(void *dp,signed short a[]); + +#ifdef __i386__ +void *initdp_mmx(signed short coeffs[],int len); +void freedp_mmx(void *dp); +long dotprod_mmx(void *dp,signed short a[]); + +void *initdp_sse(signed short coeffs[],int len); +void freedp_sse(void *dp); +long dotprod_sse(void *dp,signed short a[]); + +void *initdp_sse2(signed short coeffs[],int len); +void freedp_sse2(void *dp); +long dotprod_sse2(void *dp,signed short a[]); +#endif + +#ifdef __VEC__ +void *initdp_av(signed short coeffs[],int len); +void freedp_av(void *dp); +long dotprod_av(void *dp,signed short a[]); +#endif + +/* Sum of squares - accepts signed shorts, produces unsigned long long */ +unsigned long long sumsq(signed short *in,int cnt); +unsigned long long sumsq_port(signed short *in,int cnt); + +#ifdef __i386__ +unsigned long long sumsq_mmx(signed short *in,int cnt); +unsigned long long sumsq_sse(signed short *in,int cnt); +unsigned long long sumsq_sse2(signed short *in,int cnt); +#endif +#ifdef __VEC__ +unsigned long long sumsq_av(signed short *in,int cnt); +#endif + + +/* Low-level data structures and routines */ + +int cpu_features(void); + +#endif /* _FEC_H_ */ + + + @@ -0,0 +1,33 @@ +/* Stuff specific to the CCSDS (255,223) RS codec + * (255,223) code over 
GF(256). Note: the conventional basis is still + * used; the dual-basis mappings are performed in [en|de]code_rs_ccsds.c + * + * Copyright 2003 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +typedef unsigned char data_t; + +static inline int mod255(int x){ + while (x >= 255) { + x -= 255; + x = (x >> 8) + (x & 255); + } + return x; +} +#define MODNN(x) mod255(x) + +extern data_t CCSDS_alpha_to[]; +extern data_t CCSDS_index_of[]; +extern data_t CCSDS_poly[]; + +#define MM 8 +#define NN 255 +#define ALPHA_TO CCSDS_alpha_to +#define INDEX_OF CCSDS_index_of +#define GENPOLY CCSDS_poly +#define NROOTS 32 +#define FCR 112 +#define PRIM 11 +#define IPRIM 116 +#define PAD pad + + diff --git a/gen_ccsds.c b/gen_ccsds.c new file mode 100644 index 0000000..e1e2e26 --- /dev/null +++ b/gen_ccsds.c @@ -0,0 +1,39 @@ +/* Generate tables for CCSDS code + * Copyright 2002 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include "char.h" +#include "rs-common.h" +#include "fec.h" + +/* Write C definitions of the CCSDS Galois-field tables to stdout: + * CCSDS_alpha_to[] and CCSDS_index_of[] (256 entries each) and + * CCSDS_poly[] (nroots+1 generator polynomial entries). + * The arrays are emitted as unsigned char so that the generated + * definitions agree with the "extern data_t" declarations used by the + * fixed CCSDS codec (data_t is unsigned char there) and so that table + * values above 127 fit the element type. + */ +int main(){ + struct rs *rs; + int i; + + rs = init_rs_char(8,0x187,112,11,32,0); /* CCSDS standard */ + assert(rs != NULL); + printf("unsigned char CCSDS_alpha_to[] = {"); + for(i=0;i<256;i++){ + if((i % 16) == 0) + printf("\n"); + printf("0x%02x,",rs->alpha_to[i]); + } + printf("\n};\n\nunsigned char CCSDS_index_of[] = {"); + for(i=0;i<256;i++){ + if((i % 16) == 0) + printf("\n"); + printf("%3d,",rs->index_of[i]); + } + printf("\n};\n\nunsigned char CCSDS_poly[] = {"); + for(i=0;i<=rs->nroots;i++){ /* genpoly has nroots+1 entries */ + if((i % 16) == 0) + printf("\n"); + + printf("%3d,",rs->genpoly[i]); + } + printf("\n};\n"); + exit(0); +} diff --git a/gen_ccsds_tal.c b/gen_ccsds_tal.c new file mode 100644 index 0000000..fc75503 --- /dev/null +++ b/gen_ccsds_tal.c @@ -0,0 +1,53 @@ +/* Conversion lookup tables from conventional alpha to Berlekamp's + * dual-basis representation. 
Used in the CCSDS version only. + * taltab[] -- convert conventional to dual basis + * tal1tab[] -- convert dual basis to conventional + + * Note: the actual RS encoder/decoder works with the conventional basis. + * So data is converted from dual to conventional basis before either + * encoding or decoding and then converted back. + * + * Copyright 2002 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <stdio.h> +#include <stdlib.h> + +#define DTYPE unsigned char +DTYPE Taltab[256],Tal1tab[256]; + +static DTYPE tal[] = { 0x8d, 0xef, 0xec, 0x86, 0xfa, 0x99, 0xaf, 0x7b }; + +/* Generate conversion lookup tables between conventional alpha representation + * (@**7, @**6, ...@**0) + * and Berlekamp's dual basis representation + * (l0, l1, ...l7) + */ +int main(){ + int i,j,k; + + for(i=0;i<256;i++){/* For each value of input */ + Taltab[i] = 0; + for(j=0;j<8;j++) /* for each column of matrix */ + for(k=0;k<8;k++){ /* for each row of matrix */ + if(i & (1<<k)) + Taltab[i] ^= tal[7-k] & (1<<j); + } + Tal1tab[Taltab[i]] = i; + } + printf("unsigned char Taltab[] = {\n"); + for(i=0;i<256;i++){ + if((i % 16) == 0) + printf("\n"); + printf("0x%02x,",Taltab[i]); + } + printf("\n};\n\nunsigned char Tal1tab[] = {"); + for(i=0;i<256;i++){ + if((i % 16) == 0) + printf("\n"); + printf("0x%02x,",Tal1tab[i]); + } + printf("\n};\n"); + exit(0); +} + diff --git a/init_rs.c b/init_rs.c new file mode 100644 index 0000000..ef1cf47 --- /dev/null +++ b/init_rs.c @@ -0,0 +1,39 @@ +/* Initialize a RS codec + * + * Copyright 2002 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <stdlib.h> +#include "fec.h" + +#if !defined(NULL) +#define NULL ((void *)0) +#endif + +#include "rs-common.h" + +/* Release a codec control block together with its alpha_to, index_of + * and genpoly tables. + * p = control block pointer; a NULL pointer is ignored, like free(NULL) + */ +void free_rs(void *p){ + struct rs *rs = (struct rs *)p; + + if(rs == NULL) + return; /* nothing to release */ + + free(rs->alpha_to); + free(rs->index_of); + free(rs->genpoly); + free(rs); +} + +/* Initialize a Reed-Solomon codec + * 
symsize = symbol size, bits + * gfpoly = Field generator polynomial coefficients + * fcr = first root of RS code generator polynomial, index form + * prim = primitive element to generate polynomial roots + * nroots = RS code generator polynomial degree (number of roots) + * pad = padding bytes at front of shortened block + */ +void *init_rs_common(int symsize,int gfpoly,int fcr,int prim, + int nroots,int pad){ + struct rs *rs; + +#include "init_rs.h" + + return rs; +} diff --git a/init_rs.h b/init_rs.h new file mode 100644 index 0000000..2b2ae98 --- /dev/null +++ b/init_rs.h @@ -0,0 +1,106 @@ +/* Common code for initializing a Reed-Solomon control block (char or int symbols) + * Copyright 2004 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#undef NULL +#define NULL ((void *)0) + +{ + int i, j, sr,root,iprim; + + rs = NULL; + /* Check parameter ranges */ + if(symsize < 0 || symsize > 8*sizeof(data_t)){ + goto done; + } + + if(fcr < 0 || fcr >= (1<<symsize)) + goto done; + if(prim <= 0 || prim >= (1<<symsize)) + goto done; + if(nroots < 0 || nroots >= (1<<symsize)) + goto done; /* Can't have more roots than symbol values! 
*/ + if(pad < 0 || pad >= ((1<<symsize) -1 - nroots)) + goto done; /* Too much padding */ + + rs = (struct rs *)calloc(1,sizeof(struct rs)); + if(rs == NULL) + goto done; + + rs->mm = symsize; + rs->nn = (1<<symsize)-1; + rs->pad = pad; + + rs->alpha_to = (data_t *)malloc(sizeof(data_t)*(rs->nn+1)); + if(rs->alpha_to == NULL){ + free(rs); + rs = NULL; + goto done; + } + rs->index_of = (data_t *)malloc(sizeof(data_t)*(rs->nn+1)); + if(rs->index_of == NULL){ + free(rs->alpha_to); + free(rs); + rs = NULL; + goto done; + } + + /* Generate Galois field lookup tables */ + rs->index_of[0] = A0; /* log(zero) = -inf */ + rs->alpha_to[A0] = 0; /* alpha**-inf = 0 */ + sr = 1; + for(i=0;i<rs->nn;i++){ + rs->index_of[sr] = i; + rs->alpha_to[i] = sr; + sr <<= 1; + if(sr & (1<<symsize)) + sr ^= gfpoly; + sr &= rs->nn; + } + if(sr != 1){ + /* field generator polynomial is not primitive! */ + free(rs->alpha_to); + free(rs->index_of); + free(rs); + rs = NULL; + goto done; + } + + /* Form RS code generator polynomial from its roots */ + rs->genpoly = (data_t *)malloc(sizeof(data_t)*(nroots+1)); + if(rs->genpoly == NULL){ + free(rs->alpha_to); + free(rs->index_of); + free(rs); + rs = NULL; + goto done; + } + rs->fcr = fcr; + rs->prim = prim; + rs->nroots = nroots; + + /* Find prim-th root of 1, used in decoding */ + for(iprim=1;(iprim % prim) != 0;iprim += rs->nn) + ; + rs->iprim = iprim / prim; + + rs->genpoly[0] = 1; + for (i = 0,root=fcr*prim; i < nroots; i++,root += prim) { + rs->genpoly[i+1] = 1; + + /* Multiply rs->genpoly[] by @**(root + x) */ + for (j = i; j > 0; j--){ + if (rs->genpoly[j] != 0) + rs->genpoly[j] = rs->genpoly[j-1] ^ rs->alpha_to[modnn(rs,rs->index_of[rs->genpoly[j]] + root)]; + else + rs->genpoly[j] = rs->genpoly[j-1]; + } + /* rs->genpoly[0] can never be zero */ + rs->genpoly[0] = rs->alpha_to[modnn(rs,rs->index_of[rs->genpoly[0]] + root)]; + } + /* convert rs->genpoly[] to index form for quicker encoding */ + for (i = 0; i <= nroots; i++) + rs->genpoly[i] = 
rs->index_of[rs->genpoly[i]]; + done:; + +} diff --git a/init_rs_char.c b/init_rs_char.c new file mode 100644 index 0000000..a51099a --- /dev/null +++ b/init_rs_char.c @@ -0,0 +1,35 @@ +/* Initialize a RS codec + * + * Copyright 2002 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <stdlib.h> + +#include "char.h" +#include "rs-common.h" + +/* Release a codec control block together with its alpha_to, index_of + * and genpoly tables. + * p = control block pointer; a NULL pointer is ignored, like free(NULL) + */ +void free_rs_char(void *p){ + struct rs *rs = (struct rs *)p; + + if(rs == NULL) + return; /* nothing to release */ + + free(rs->alpha_to); + free(rs->index_of); + free(rs->genpoly); + free(rs); +} + +/* Initialize a Reed-Solomon codec + * symsize = symbol size, bits + * gfpoly = Field generator polynomial coefficients + * fcr = first root of RS code generator polynomial, index form + * prim = primitive element to generate polynomial roots + * nroots = RS code generator polynomial degree (number of roots) + * pad = padding bytes at front of shortened block + */ +void *init_rs_char(int symsize,int gfpoly,int fcr,int prim, + int nroots,int pad){ + struct rs *rs; + +#include "init_rs.h" + + return rs; +} diff --git a/init_rs_int.c b/init_rs_int.c new file mode 100644 index 0000000..a6036c2 --- /dev/null +++ b/init_rs_int.c @@ -0,0 +1,35 @@ +/* Initialize a RS codec + * + * Copyright 2002 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <stdlib.h> + +#include "int.h" +#include "rs-common.h" + +/* Release a codec control block together with its alpha_to, index_of + * and genpoly tables. + * p = control block pointer; a NULL pointer is ignored, like free(NULL) + */ +void free_rs_int(void *p){ + struct rs *rs = (struct rs *)p; + + if(rs == NULL) + return; /* nothing to release */ + + free(rs->alpha_to); + free(rs->index_of); + free(rs->genpoly); + free(rs); +} + +/* Initialize a Reed-Solomon codec + * symsize = symbol size, bits + * gfpoly = Field generator polynomial coefficients + * fcr = first root of RS code generator polynomial, index form + * prim = primitive element to generate polynomial roots + * nroots = RS code generator polynomial degree (number of roots) + * pad = padding bytes at front of shortened block + */ +void *init_rs_int(int symsize,int gfpoly,int fcr,int prim, + int 
nroots,int pad){ + struct rs *rs; + +#include "init_rs.h" + + return rs; +} @@ -0,0 +1,22 @@ +/* Stuff specific to the general (integer) version of the Reed-Solomon codecs + * + * Copyright 2003, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +typedef unsigned int data_t; + +#define MODNN(x) modnn(rs,x) + +#define MM (rs->mm) +#define NN (rs->nn) +#define ALPHA_TO (rs->alpha_to) +#define INDEX_OF (rs->index_of) +#define GENPOLY (rs->genpoly) +#define NROOTS (rs->nroots) +#define FCR (rs->fcr) +#define PRIM (rs->prim) +#define IPRIM (rs->iprim) +#define PAD (rs->pad) +#define A0 (NN) + + diff --git a/lesser.txt b/lesser.txt new file mode 100644 index 0000000..b1e3f5a --- /dev/null +++ b/lesser.txt @@ -0,0 +1,504 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. 
+ + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. 
We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. 
In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. 
A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. 
You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. 
But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. 
You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. 
+ + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. 
(It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. 
You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. 
Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. 
Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. 
Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms of the +ordinary General Public License). + + To apply these terms, attach the following notices to the library. 
It is +safest to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + <one line to give the library's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James Random Hacker. + + <signature of Ty Coon>, 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! 
+ + diff --git a/makefile.in b/makefile.in new file mode 100644 index 0000000..53fdfcb --- /dev/null +++ b/makefile.in @@ -0,0 +1,242 @@ +# Makefile prototype for configure +# Copyright 2004 Phil Karn, KA9Q +# May be used under the terms of the GNU Lesser General Public License (LGPL) + +# @configure_input@ +srcdir = @srcdir@ +prefix = @prefix@ +exec_prefix=@exec_prefix@ +VPATH = @srcdir@ +CC=@CC@ +LIBS=@MLIBS@ fec.o sim.o viterbi27.o viterbi27_port.o viterbi29.o viterbi29_port.o \ + viterbi39.o viterbi39_port.o \ + viterbi615.o viterbi615_port.o encode_rs_char.o encode_rs_int.o encode_rs_8.o \ + decode_rs_char.o decode_rs_int.o decode_rs_8.o \ + init_rs_char.o init_rs_int.o ccsds_tab.o \ + encode_rs_ccsds.o decode_rs_ccsds.o ccsds_tal.o \ + dotprod.o dotprod_port.o \ + peakval.o peakval_port.o \ + sumsq.o sumsq_port.o + +CFLAGS=@CFLAGS@ -I. -Wall @ARCH_OPTION@ + +SHARED_LIB=@SH_LIB@ + +all: libfec.a $(SHARED_LIB) + +test: vtest27 vtest29 vtest39 vtest615 rstest dtest sumsq_test peaktest + @echo "Correctness tests:" + ./vtest27 -e 3.0 -n 1000 -v + ./vtest29 -e 2.5 -n 1000 -v + ./vtest39 -e 2.5 -n 1000 -v + ./vtest615 -e 1.0 -n 100 -v + ./rstest + ./dtest + ./sumsq_test + ./peaktest + @echo "Speed tests:" + ./vtest27 + ./vtest29 + ./vtest39 + ./vtest615 + +install: all + mkdir -p @libdir@ + install -m 644 -p $(SHARED_LIB) libfec.a @libdir@ +# (cd @libdir@;ln -f -s $(SHARED_LIB) libfec.so) + @REBIND@ + mkdir -p @includedir@ + install -m 644 -p fec.h @includedir@ + mkdir -m 0755 -p @mandir@/man3 + install -m 644 -p simd-viterbi.3 rs.3 dsp.3 @mandir@/man3 + +peaktest: peaktest.o libfec.a + gcc -g -o $@ $^ + +sumsq_test: sumsq_test.o libfec.a + gcc -g -o $@ $^ + +dtest: dtest.o libfec.a + gcc -g -o $@ $^ -lm + +vtest27: vtest27.o libfec.a + gcc -g -o $@ $^ -lm + +vtest29: vtest29.o libfec.a + gcc -g -o $@ $^ -lm + +vtest39: vtest39.o libfec.a + gcc -g -o $@ $^ -lm + +vtest615: vtest615.o libfec.a + gcc -g -o $@ $^ -lm + +rstest: rstest.o libfec.a + gcc -g -o $@ $^ + 
+rs_speedtest: rs_speedtest.o libfec.a + gcc -g -o $@ $^ + +# for some reason, the test programs without args segfault on the PPC with -O2 optimization. Dunno why - compiler bug? +vtest27.o: vtest27.c fec.h + gcc -g -c $< + +vtest29.o: vtest29.c fec.h + gcc -g -c $< + +vtest39.o: vtest39.c fec.h + gcc -g -c $< + +vtest615.o: vtest615.c fec.h + gcc -g -c $< + +libfec.a: $(LIBS) + ar rv $@ $^ + ranlib libfec.a + +# for Darwin +libfec.dylib: $(LIBS) + $(CC) -dynamiclib -install_name $@ -o $@ $^ + +# for Linux et al +libfec.so: $(LIBS) + gcc -shared -Xlinker -soname=$@ -o $@ -Wl,-whole-archive $^ -Wl,-no-whole-archive -lc + +dotprod.o: dotprod.c fec.h + +dotprod_port.o: dotprod_port.c fec.h + +viterbi27.o: viterbi27.c fec.h + +viterbi27_port.o: viterbi27_port.c fec.h + +viterbi29.o: viterbi29.c fec.h + +viterbi39.o: viterbi39.c fec.h + +viterbi39_port.o: viterbi39_port.c fec.h + +viterbi39_sse2.o: viterbi39_sse2.c fec.h + +viterbi39_sse.o: viterbi39_sse.c fec.h + +viterbi39_mmx.o: viterbi39_mmx.c fec.h + +encode_rs_char.o: encode_rs_char.c char.h rs-common.h + +encode_rs_int.o: encode_rs_int.c int.h rs-common.h + +encode_rs_8.o: encode_rs_8.c fixed.h + +encode_rs_av.o: encode_rs_av.c fixed.h + +decode_rs_char.o: decode_rs_char.c char.h rs-common.h + +decode_rs_int.o: decode_rs_int.c int.h rs-common.h + +decode_rs_8.o: decode_rs_8.c fixed.h + +init_rs_char.o: init_rs_char.c char.h rs-common.h + +init_rs_int.o: init_rs_int.c int.h rs-common.h + +ccsds_tab.o: ccsds_tab.c + +ccsds_tab.c: gen_ccsds + ./gen_ccsds > ccsds_tab.c + +gen_ccsds: gen_ccsds.o init_rs_char.o + gcc -o $@ $^ + +gen_ccsds.o: gen_ccsds.c + gcc $(CFLAGS) -c -o $@ $< + +ccsds_tal.o: ccsds_tal.c + +ccsds_tal.c: gen_ccsds_tal + ./gen_ccsds_tal > ccsds_tal.c + +exercise_char.o: exercise.c + gcc $(CFLAGS) -c -o $@ $< + +exercise_int.o: exercise.c + gcc -DBIGSYM=1 $(CFLAGS) -c -o $@ $< + +exercise_8.o: exercise.c + gcc -DFIXED=1 $(CFLAGS) -c -o $@ $< + +exercise_ccsds.o: exercise.c + gcc -DCCSDS=1 $(CFLAGS) -c 
-o $@ $< + +viterbi27.o: viterbi27.c fec.h + +viterbi27_port.o: viterbi27_port.c fec.h + +viterbi27_av.o: viterbi27_av.c fec.h + +viterbi27_mmx.o: viterbi27_mmx.c fec.h + gcc $(CFLAGS) -mmmx -c -o $@ $< + +viterbi27_sse.o: viterbi27_sse.c fec.h + gcc $(CFLAGS) -msse -c -o $@ $< + +viterbi27_sse2.o: viterbi27_sse2.c fec.h + gcc $(CFLAGS) -msse2 -c -o $@ $< + +viterbi29.o: viterbi29.c fec.h + +viterbi29_port.o: viterbi29_port.c fec.h + +viterbi29_av.o: viterbi29_av.c fec.h + +viterbi29_mmx.o: viterbi29_mmx.c fec.h + gcc $(CFLAGS) -mmmx -c -o $@ $< + +viterbi29_sse.o: viterbi29_sse.c fec.h + gcc $(CFLAGS) -msse -c -o $@ $< + +viterbi29_sse2.o: viterbi29_sse2.c fec.h + gcc $(CFLAGS) -msse2 -c -o $@ $< + +viterbi39.o: viterbi39.c fec.h + +viterbi39_port.o: viterbi39_port.c fec.h + +viterbi39_av.o: viterbi39_av.c fec.h + +viterbi39_mmx.o: viterbi39_mmx.c fec.h + gcc $(CFLAGS) -mmmx -c -o $@ $< + +viterbi39_sse.o: viterbi39_sse.c fec.h + gcc $(CFLAGS) -msse -c -o $@ $< + +viterbi39_sse2.o: viterbi39_sse2.c fec.h + gcc $(CFLAGS) -msse2 -c -o $@ $< + +viterbi615.o: viterbi615.c fec.h + +viterbi615_port.o: viterbi615_port.c fec.h + +viterbi615_av.o: viterbi615_av.c fec.h + +viterbi615_mmx.o: viterbi615_mmx.c fec.h + gcc $(CFLAGS) -mmmx -c -o $@ $< + +viterbi615_sse.o: viterbi615_sse.c fec.h + gcc $(CFLAGS) -msse -c -o $@ $< + +viterbi615_sse2.o: viterbi615_sse2.c fec.h + gcc $(CFLAGS) -msse2 -c -o $@ $< + +cpu_mode_x86.o: cpu_mode_x86.c fec.h + +cpu_mode_ppc.o: cpu_mode_ppc.c fec.h + + +clean: + rm -f *.o $(SHARED_LIB) *.a rs_speedtest peaktest sumsq_test dtest vtest27 vtest29 vtest39 vtest615 rstest ccsds_tab.c ccsds_tal.c gen_ccsds gen_ccsds_tal core + rm -rf autom4te.cache + +distclean: clean + rm -f config.log config.cache config.status config.h makefile + diff --git a/mmxbfly27.s b/mmxbfly27.s new file mode 100644 index 0000000..4abbf48 --- /dev/null +++ b/mmxbfly27.s @@ -0,0 +1,148 @@ +/* Intel SIMD MMX implementation of Viterbi ACS butterflies + for 64-state (k=7) 
convolutional code + Copyright 2004 Phil Karn, KA9Q + This code may be used under the terms of the GNU Lesser General Public License (LGPL) + + int update_viterbi27_blk_mmx(struct v27 *vp,unsigned char *syms,int nbits) ; +*/ + # MMX (64-bit SIMD) version + # requires Pentium-MMX, Pentium-II or better + + # These are offsets into struct v27, defined in viterbi27_mmx.c + .set DP,128 + .set OLDMETRICS,132 + .set NEWMETRICS,136 + .text + .global update_viterbi27_blk_mmx,Mettab27_1,Mettab27_2 + .type update_viterbi27_blk_mmx,@function + .align 16 + +update_viterbi27_blk_mmx: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %edi + pushl %edx + pushl %ebx + + movl 8(%ebp),%edx # edx = vp + testl %edx,%edx + jnz 0f + movl $-1,%eax # vp == NULL: return -1 (immediate operand, not a load from address -1) + jmp err +0: movl OLDMETRICS(%edx),%esi # esi -> old metrics + movl NEWMETRICS(%edx),%edi # edi -> new metrics + movl DP(%edx),%edx # edx -> decisions + +1: movl 16(%ebp),%eax # eax = nbits + decl %eax + jl 2f # passed zero, we're done + movl %eax,16(%ebp) + + movl 12(%ebp),%ebx # ebx = syms + movw (%ebx),%ax # ax = second symbol : first symbol + addl $2,%ebx + movl %ebx,12(%ebp) + + movb %ah,%bl + andl $255,%eax + andl $255,%ebx + + # shift into first array index dimension slot + shll $5,%eax + shll $5,%ebx + + # each invocation of this macro will do 8 butterflies in parallel + .MACRO butterfly GROUP + # Compute branch metrics + movq (Mettab27_1+8*\GROUP)(%eax),%mm3 + movq fifteens,%mm0 + + paddb (Mettab27_2+8*\GROUP)(%ebx),%mm3 + paddb ones,%mm3 # emulate pavgb - this may not be necessary + psrlq $1,%mm3 + pand %mm0,%mm3 + + movq (8*\GROUP)(%esi),%mm6 # Incoming path metric, high bit = 0 + movq ((8*\GROUP)+32)(%esi),%mm2 # Incoming path metric, high bit = 1 + movq %mm6,%mm1 + movq %mm2,%mm7 + + paddb %mm3,%mm6 + paddb %mm3,%mm2 + pxor %mm0,%mm3 # invert branch metric + paddb %mm3,%mm7 # path metric for inverted symbols + paddb %mm3,%mm1 + + # live registers 1 2 6 7 + # Compare mm6 and mm7; mm1 and mm2 + pxor %mm3,%mm3 + movq %mm6,%mm4 + movq 
%mm1,%mm5 + psubb %mm7,%mm4 # mm4 = mm6 - mm7 + psubb %mm2,%mm5 # mm5 = mm1 - mm2 + pcmpgtb %mm3,%mm4 # mm4 = first set of decisions (ff = 1 better) + pcmpgtb %mm3,%mm5 # mm5 = second set of decisions + + # live registers 1 2 4 5 6 7 + # select survivors + movq %mm4,%mm0 + pand %mm4,%mm7 + movq %mm5,%mm3 + pand %mm5,%mm2 + pandn %mm6,%mm0 + pandn %mm1,%mm3 + por %mm0,%mm7 # mm7 = first set of survivors + por %mm3,%mm2 # mm2 = second set of survivors + + # live registers 2 4 5 7 + # interleave & store decisions in mm4, mm5 + # interleave & store new branch metrics in mm2, mm7 + movq %mm4,%mm3 + movq %mm7,%mm0 + punpckhbw %mm5,%mm4 + punpcklbw %mm5,%mm3 + punpcklbw %mm2,%mm7 # interleave second 8 new metrics + punpckhbw %mm2,%mm0 # interleave first 8 new metrics + movq %mm4,(16*\GROUP+8)(%edx) + movq %mm3,(16*\GROUP)(%edx) + movq %mm7,(16*\GROUP)(%edi) + movq %mm0,(16*\GROUP+8)(%edi) + + .endm + +# invoke macro 4 times for a total of 32 butterflies + butterfly GROUP=0 + butterfly GROUP=1 + butterfly GROUP=2 + butterfly GROUP=3 + + addl $64,%edx # bump decision pointer + + # swap metrics + movl %esi,%eax + movl %edi,%esi + movl %eax,%edi + jmp 1b + +2: emms + movl 8(%ebp),%ebx # ebx = vp + # stash metric pointers + movl %esi,OLDMETRICS(%ebx) + movl %edi,NEWMETRICS(%ebx) + movl %edx,DP(%ebx) # stash incremented value of vp->dp + xorl %eax,%eax +err: popl %ebx + popl %edx + popl %edi + popl %esi + popl %ebp + ret + + .data + .align 8 +fifteens: + .byte 15,15,15,15,15,15,15,15 + + .align 8 +ones: .byte 1,1,1,1,1,1,1,1 diff --git a/mmxbfly29.s b/mmxbfly29.s new file mode 100644 index 0000000..e37cab8 --- /dev/null +++ b/mmxbfly29.s @@ -0,0 +1,161 @@ +/* Intel SIMD MMX implementation of Viterbi ACS butterflies + for 256-state (k=9) convolutional code + Copyright 2004 Phil Karn, KA9Q + This code may be used under the terms of the GNU Lesser General Public License (LGPL) + + void update_viterbi29_blk_mmx(struct v29 *vp,unsigned char *syms,int nbits); +*/ + + # These are 
offsets into struct v29, defined in viterbi29.h + .set DP,512 + .set OLDMETRICS,516 + .set NEWMETRICS,520 + .text + .global update_viterbi29_blk_mmx,Mettab29_1,Mettab29_2 + .type update_viterbi29_blk_mmx,@function + .align 16 + + # MMX (64-bit SIMD) version + # requires Pentium-MMX, Pentium-II or better + +update_viterbi29_blk_mmx: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %edi + pushl %edx + pushl %ebx + + movl 8(%ebp),%edx # edx = vp + movl 8(%ebp),%edx # edx = vp (redundant reload of the same argument; harmless) + testl %edx,%edx + jnz 0f + movl $-1,%eax # vp == NULL: return -1 (immediate operand, not a load from address -1) + jmp err +0: movl OLDMETRICS(%edx),%esi # esi -> old metrics + movl NEWMETRICS(%edx),%edi # edi -> new metrics + movl DP(%edx),%edx # edx -> decisions + +1: movl 16(%ebp),%eax # eax = nbits + decl %eax + jl 2f # passed zero, we're done + movl %eax,16(%ebp) + + movl 12(%ebp),%ebx # ebx = syms + movw (%ebx),%ax # ax = second symbol : first symbol + addl $2,%ebx + movl %ebx,12(%ebp) + + movb %ah,%bl + andl $255,%eax + andl $255,%ebx + + # shift into first array index dimension slot + shll $7,%eax + shll $7,%ebx + + # each invocation of this macro will do 8 butterflies in parallel + .MACRO butterfly GROUP + # Compute branch metrics + movq (Mettab29_1+8*\GROUP)(%eax),%mm3 + movq fifteens,%mm0 + paddb (Mettab29_2+8*\GROUP)(%ebx),%mm3 + paddb ones,%mm3 # emulate pavgb - this may not be necessary + psrlq $1,%mm3 + pand %mm0,%mm3 + + movq (8*\GROUP)(%esi),%mm6 # Incoming path metric, high bit = 0 + movq ((8*\GROUP)+128)(%esi),%mm2 # Incoming path metric, high bit = 1 + movq %mm6,%mm1 + movq %mm2,%mm7 + + paddb %mm3,%mm6 + paddb %mm3,%mm2 + pxor %mm0,%mm3 # invert branch metric + paddb %mm3,%mm7 # path metric for inverted symbols + paddb %mm3,%mm1 + + # live registers 1 2 6 7 + # Compare mm6 and mm7; mm1 and mm2 + pxor %mm3,%mm3 + movq %mm6,%mm4 + movq %mm1,%mm5 + psubb %mm7,%mm4 # mm4 = mm6 - mm7 + psubb %mm2,%mm5 # mm5 = mm1 - mm2 + pcmpgtb %mm3,%mm4 # mm4 = first set of decisions (ff = 1 better) + pcmpgtb %mm3,%mm5 # mm5 = second set of decisions + + # live 
registers 1 2 4 5 6 7 + # select survivors + movq %mm4,%mm0 + pand %mm4,%mm7 + movq %mm5,%mm3 + pand %mm5,%mm2 + pandn %mm6,%mm0 + pandn %mm1,%mm3 + por %mm0,%mm7 # mm7 = first set of survivors + por %mm3,%mm2 # mm2 = second set of survivors + + # live registers 2 4 5 7 + # interleave & store decisions in mm4, mm5 + # interleave & store new branch metrics in mm2, mm7 + movq %mm4,%mm3 + movq %mm7,%mm0 + punpckhbw %mm5,%mm4 + punpcklbw %mm5,%mm3 + punpcklbw %mm2,%mm7 # interleave second 8 new metrics + punpckhbw %mm2,%mm0 # interleave first 8 new metrics + movq %mm4,(16*\GROUP+8)(%edx) + movq %mm3,(16*\GROUP)(%edx) + movq %mm7,(16*\GROUP)(%edi) + movq %mm0,(16*\GROUP+8)(%edi) + + .endm + +# invoke macro 16 times for a total of 128 butterflies + butterfly GROUP=0 + butterfly GROUP=1 + butterfly GROUP=2 + butterfly GROUP=3 + butterfly GROUP=4 + butterfly GROUP=5 + butterfly GROUP=6 + butterfly GROUP=7 + butterfly GROUP=8 + butterfly GROUP=9 + butterfly GROUP=10 + butterfly GROUP=11 + butterfly GROUP=12 + butterfly GROUP=13 + butterfly GROUP=14 + butterfly GROUP=15 + + addl $256,%edx # bump decision pointer + + # swap metrics + movl %esi,%eax + movl %edi,%esi + movl %eax,%edi + jmp 1b + +2: emms + movl 8(%ebp),%ebx # ebx = vp + # stash metric pointers + movl %esi,OLDMETRICS(%ebx) + movl %edi,NEWMETRICS(%ebx) + movl %edx,DP(%ebx) # stash incremented value of vp->dp + xorl %eax,%eax +err: popl %ebx + popl %edx + popl %edi + popl %esi + popl %ebp + ret + + .data + .align 8 +fifteens: + .byte 15,15,15,15,15,15,15,15 + + .align 8 +ones: .byte 1,1,1,1,1,1,1,1 diff --git a/peak_mmx_assist.s b/peak_mmx_assist.s new file mode 100644 index 0000000..dae831f --- /dev/null +++ b/peak_mmx_assist.s @@ -0,0 +1,70 @@ +# MMX assist routines for peakval +# Copyright 2001 Phil Karn, KA9Q +# May be used under the terms of the GNU Lesser General Public License (LGPL) + + .text + +# Find peak value in signed 16-bit input samples +# int peakval_mmx(signed short *in,int cnt); + .global 
peakval_mmx + .type peakval_mmx,@function + .align 16 +peakval_mmx: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %ecx + pushl %ebx + + movl 8(%ebp),%esi + movl 12(%ebp),%ecx + + pxor %mm7,%mm7 # clear peak + +1: subl $4,%ecx + jl 2f + movq (%esi),%mm0 + movq %mm0,%mm1 + psraw $15,%mm1 # mm1 = 1's if negative, 0's if positive + pxor %mm1,%mm0 # complement negatives + psubw %mm1,%mm0 # add 1 to negatives + movq %mm7,%mm6 # copy previous peak + pcmpgtw %mm0,%mm6 # ff == old peak greater + pand %mm6,%mm7 # select old peaks that are greater + pandn %mm0,%mm6 # select new values that are greater + por %mm6,%mm7 + + addl $8,%esi + jmp 1b + +2: movd %mm7,%eax + psrlq $16,%mm7 + andl $0xffff,%eax + + movd %mm7,%edx + psrlq $16,%mm7 + andl $0xffff,%edx + cmpl %edx,%eax + jnl 3f + movl %edx,%eax +3: + movd %mm7,%edx + psrlq $16,%mm7 + andl $0xffff,%edx + cmpl %edx,%eax + jnl 4f + movl %edx,%eax +4: + movd %mm7,%edx + andl $0xffff,%edx + cmpl %edx,%eax + jnl 5f + movl %edx,%eax +5: + emms + popl %ebx + popl %ecx + popl %esi + popl %ebp + ret + diff --git a/peak_sse2_assist.s b/peak_sse2_assist.s new file mode 100644 index 0000000..1dee3a8 --- /dev/null +++ b/peak_sse2_assist.s @@ -0,0 +1,51 @@ +# SSE2 assist routines for peakval +# Copyright 2001 Phil Karn, KA9Q +# May be used under the terms of the GNU Public License (GPL) + + .text + +# Find peak absolute value in signed 16-bit input samples +# int peakval_sse2(signed short *in,int cnt); + .global peakval_sse2 + .type peakval_sse2,@function + .align 16 +peakval_sse2: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %ecx + + movl 8(%ebp),%esi + movl 12(%ebp),%ecx + + pxor %xmm7,%xmm7 # clear peak + +1: subl $8,%ecx + jl 2f + movaps (%esi),%xmm0 + movaps %xmm0,%xmm1 + psraw $15,%xmm1 # xmm1 = 1's if negative, 0's if positive + pxor %xmm1,%xmm0 # complement negatives + psubw %xmm1,%xmm0 # add 1 to negatives + pmaxsw %xmm0,%xmm7 # store peak + + addl $16,%esi + jmp 1b + +2: movaps %xmm7,%xmm0 + psrldq $8,%xmm0 + pmaxsw 
%xmm0,%xmm7 + movaps %xmm7,%xmm0 + psrlq $32,%xmm0 + pmaxsw %xmm0,%xmm7 + movaps %xmm7,%xmm0 + psrlq $16,%xmm0 + pmaxsw %xmm0,%xmm7 # max value in low word of %xmm7 + + movd %xmm7,%eax + andl $0xffff,%eax + + popl %ecx + popl %esi + popl %ebp + ret diff --git a/peak_sse_assist.s b/peak_sse_assist.s new file mode 100644 index 0000000..ea6fce8 --- /dev/null +++ b/peak_sse_assist.s @@ -0,0 +1,49 @@ +# SSE assist routines for peakval +# Copyright 2001 Phil Karn, KA9Q +# May be used under the terms of the GNU Lesser General Public License (LGPL) + + .text + +# Find peak absolute value in signed 16-bit input samples +# int peakval_sse(signed short *in,int cnt); + .global peakval_sse + .type peakval_sse,@function + .align 16 +peakval_sse: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %ecx + + movl 8(%ebp),%esi + movl 12(%ebp),%ecx + + pxor %mm7,%mm7 # clear peak + +1: subl $4,%ecx + jl 2f + movq (%esi),%mm0 + movq %mm0,%mm1 + psraw $15,%mm1 # mm1 = 1's if negative, 0's if positive + pxor %mm1,%mm0 # complement negatives + psubw %mm1,%mm0 # add 1 to negatives + pmaxsw %mm0,%mm7 # store peak + + addl $8,%esi + jmp 1b + +2: movq %mm7,%mm0 + psrlq $32,%mm0 + pmaxsw %mm0,%mm7 + movq %mm7,%mm0 + psrlq $16,%mm0 + pmaxsw %mm0,%mm7 # max value in low word of %mm7 + + movd %mm7,%eax + andl $0xffff,%eax + + emms + popl %ecx + popl %esi + popl %ebp + ret diff --git a/peaktest.c b/peaktest.c new file mode 100644 index 0000000..fa4b280 --- /dev/null +++ b/peaktest.c @@ -0,0 +1,38 @@ +/* Verify correctness of the peak routine + * Copyright 2004 Phil Karn, KA9Q + */ +#include <stdio.h> +#include <stdlib.h> +#include <time.h> + +/* These values should trigger leading/trailing array fragment handling */ +#define NSAMP 200002 +#define OFFSET 1 + +int peakval(signed short *,int); +int peakval_port(signed short *,int); + +int main(){ + int i,s; + int result,rresult; + signed short samples[NSAMP]; + + srandom(time(NULL)); + + for(i=0;i<NSAMP;i++){ + do { + s = random() & 0xffff; /* full 16-bit range so negative samples are exercised; 0x8000 is rejected below */ + }
while(s == 0x8000); + samples[i] = s; + } + samples[5] = 25000; + + rresult = peakval_port(&samples[OFFSET],NSAMP-OFFSET); + result = peakval(&samples[OFFSET],NSAMP-OFFSET); + if(result == rresult){ + printf("OK\n"); + } else { + printf("peak mismatch: %d != %d\n",result,rresult); + } + exit(0); +} diff --git a/peakval.c b/peakval.c new file mode 100644 index 0000000..811a3a9 --- /dev/null +++ b/peakval.c @@ -0,0 +1,39 @@ +/* Switch to appropriate version of peakval routine + * Copyright 2004, Phil Karn, KA9Q + */ + +#include <stdlib.h> +#include "fec.h" + +int peakval_port(signed short *b,int cnt); +#ifdef __i386__ +int peakval_mmx(signed short *b,int cnt); +int peakval_sse(signed short *b,int cnt); +int peakval_sse2(signed short *b,int cnt); +#endif + +#ifdef __VEC__ +int peakval_av(signed short *b,int cnt); +#endif + +int peakval(signed short *b,int cnt){ + find_cpu_mode(); + + switch(Cpu_mode){ + case PORT: + default: + return peakval_port(b,cnt); +#ifdef __i386__ + case MMX: + return peakval_mmx(b,cnt); + case SSE: + return peakval_sse(b,cnt); + case SSE2: + return peakval_sse2(b,cnt); +#endif +#ifdef __VEC__ + case ALTIVEC: + return peakval_av(b,cnt); +#endif + } +} diff --git a/peakval_av.c b/peakval_av.c new file mode 100644 index 0000000..ae54c10 --- /dev/null +++ b/peakval_av.c @@ -0,0 +1,61 @@ +/* Return the largest absolute value of a vector of signed shorts + + * This is the Altivec SIMD version. 
+ + * Copyright 2004 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +#include "fec.h" + +signed short peakval_av(signed short *in,int cnt){ + vector signed short x; + int pad; + union { vector signed char cv; vector signed short hv; signed short s[8]; signed char c[16];} s; + vector signed short smallest,largest; + + smallest = (vector signed short)(0); + largest = (vector signed short)(0); + if((pad = (int)in & 15)!=0){ + /* Load unaligned leading word */ + x = vec_perm(vec_ld(0,in),(vector signed short)(0),vec_lvsl(0,in)); + if(cnt < 8){ /* Shift right to chop stuff beyond end of short block */ + s.c[15] = (8-cnt)<<4; + x = vec_sro(x,s.cv); + } + smallest = vec_min(smallest,x); + largest = vec_max(largest,x); + in += 8-pad/2; + cnt -= 8-pad/2; + } + /* Everything is now aligned, rip through most of the block */ + while(cnt >= 8){ + x = vec_ld(0,in); + smallest = vec_min(smallest,x); + largest = vec_max(largest,x); + in += 8; + cnt -= 8; + } + /* Handle trailing fragment, if any */ + if(cnt > 0){ + x = vec_ld(0,in); + s.c[15] = (8-cnt)<<4; + x = vec_sro(x,s.cv); + smallest = vec_min(smallest,x); + largest = vec_max(largest,x); + } + /* Combine and extract result */ + largest = vec_max(largest,vec_abs(smallest)); + + s.c[15] = 64; /* Shift right four 16-bit words */ + largest = vec_max(largest,vec_sro(largest,s.cv)); + + s.c[15] = 32; /* Shift right two 16-bit words */ + largest = vec_max(largest,vec_sro(largest,s.cv)); + + s.c[15] = 16; /* Shift right one 16-bit word */ + largest = vec_max(largest,vec_sro(largest,s.cv)); + + s.hv = largest; + return s.s[7]; +} diff --git a/peakval_mmx.c b/peakval_mmx.c new file mode 100644 index 0000000..436fe88 --- /dev/null +++ b/peakval_mmx.c @@ -0,0 +1,34 @@ +/* Wrapper for the MMX version of peakval + * Copyright 2004 Phil Karn, KA9Q + */ + +#include <stdlib.h> + +int peakval_mmx_assist(signed short *,int); + +int peakval_mmx(signed short *b,int cnt){ + int peak = 0; + 
int a; + + while(((int)b & 7) != 0 && cnt != 0){ + a = abs(*b); + if(a > peak) + peak = a; + b++; + cnt--; + } + a = peakval_mmx_assist(b,cnt); + if(a > peak) + peak = a; + b += cnt & ~3; + cnt &= 3; + + while(cnt != 0){ + a = abs(*b); + if(a > peak) + peak = a; + b++; + cnt--; + } + return peak; +} diff --git a/peakval_mmx_assist.s b/peakval_mmx_assist.s new file mode 100644 index 0000000..553cb79 --- /dev/null +++ b/peakval_mmx_assist.s @@ -0,0 +1,70 @@ +# MMX assist routines for peakval +# Copyright 2001 Phil Karn, KA9Q +# May be used under the terms of the GNU Lesser General Public License (LGPL) + + .text + +# Find peak value in signed 16-bit input samples +# int peakval_mmx_assist(signed short *in,int cnt); + .global peakval_mmx_assist + .type peakval_mmx_assist,@function + .align 16 +peakval_mmx_assist: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %ecx + pushl %ebx + + movl 8(%ebp),%esi + movl 12(%ebp),%ecx + + pxor %mm7,%mm7 # clear peak + +1: subl $4,%ecx + jl 2f + movq (%esi),%mm0 + movq %mm0,%mm1 + psraw $15,%mm1 # mm1 = 1's if negative, 0's if positive + pxor %mm1,%mm0 # complement negatives + psubw %mm1,%mm0 # add 1 to negatives + movq %mm7,%mm6 # copy previous peak + pcmpgtw %mm0,%mm6 # ff == old peak greater + pand %mm6,%mm7 # select old peaks that are greater + pandn %mm0,%mm6 # select new values that are greater + por %mm6,%mm7 + + addl $8,%esi + jmp 1b + +2: movd %mm7,%eax + psrlq $16,%mm7 + andl $0xffff,%eax + + movd %mm7,%edx + psrlq $16,%mm7 + andl $0xffff,%edx + cmpl %edx,%eax + jnl 3f + movl %edx,%eax +3: + movd %mm7,%edx + psrlq $16,%mm7 + andl $0xffff,%edx + cmpl %edx,%eax + jnl 4f + movl %edx,%eax +4: + movd %mm7,%edx + andl $0xffff,%edx + cmpl %edx,%eax + jnl 5f + movl %edx,%eax +5: + emms + popl %ebx + popl %ecx + popl %esi + popl %ebp + ret + diff --git a/peakval_port.c b/peakval_port.c new file mode 100644 index 0000000..07ab316 --- /dev/null +++ b/peakval_port.c @@ -0,0 +1,16 @@ +/* Portable C version of peakval + * Copyright 
2004 Phil Karn, KA9Q + */ +#include <stdlib.h> +#include "fec.h" +int peakval_port(signed short *b,int len){ + int peak = 0; + int a,i; + + for(i=0;i<len;i++){ + a = abs(b[i]); + if(a > peak) + peak = a; + } + return peak; +} diff --git a/peakval_sse.c b/peakval_sse.c new file mode 100644 index 0000000..9868b7f --- /dev/null +++ b/peakval_sse.c @@ -0,0 +1,35 @@ +/* IA-32 SSE version of peakval + * Copyright 2004 Phil Karn, KA9Q + */ + +#include <stdlib.h> +#include "fec.h" + +int peakval_sse_assist(signed short *,int); + +int peakval_sse(signed short *b,int cnt){ + int peak = 0; + int a; + + while(((int)b & 7) != 0 && cnt != 0){ + a = abs(*b); + if(a > peak) + peak = a; + b++; + cnt--; + } + a = peakval_sse_assist(b,cnt); + if(a > peak) + peak = a; + b += cnt & ~3; + cnt &= 3; + + while(cnt != 0){ + a = abs(*b); + if(a > peak) + peak = a; + b++; + cnt--; + } + return peak; +} diff --git a/peakval_sse2.c b/peakval_sse2.c new file mode 100644 index 0000000..79d9059 --- /dev/null +++ b/peakval_sse2.c @@ -0,0 +1,34 @@ +/* IA-32 SSE2 version of peakval + * Copyright 2004 Phil Karn, KA9Q + */ +#include <stdlib.h> +#include "fec.h" + +int peakval_sse2_assist(signed short *,int); + +int peakval_sse2(signed short *b,int cnt){ + int peak = 0; + int a; + + while(((int)b & 15) != 0 && cnt != 0){ /* scalar pass until b is 16-byte aligned */ + a = abs(*b); + if(a > peak) + peak = a; + b++; + cnt--; + } + a = peakval_sse2_assist(b,cnt); /* assist consumes whole groups of 8 samples */ + if(a > peak) + peak = a; + b += cnt & ~7; /* step past the samples the assist routine handled */ + cnt &= 7; /* fewer than 8 samples remain */ + + while(cnt != 0){ /* scalar pass over the trailing fragment */ + a = abs(*b); + if(a > peak) + peak = a; + b++; + cnt--; + } + return peak; +} diff --git a/peakval_sse2_assist.s b/peakval_sse2_assist.s new file mode 100644 index 0000000..c7a58e7 --- /dev/null +++ b/peakval_sse2_assist.s @@ -0,0 +1,51 @@ +# SSE2 assist routines for peakval +# Copyright 2001 Phil Karn, KA9Q +# May be used under the terms of the GNU Lesser General Public License (LGPL) + + .text + +# Find peak absolute value in signed 16-bit input samples +# int peakval_sse2_assist(signed short *in,int cnt); +
.global peakval_sse2_assist + .type peakval_sse2_assist,@function + .align 16 +peakval_sse2_assist: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %ecx + + movl 8(%ebp),%esi + movl 12(%ebp),%ecx + + pxor %xmm7,%xmm7 # clear peak + +1: subl $8,%ecx + jl 2f + movaps (%esi),%xmm0 + movaps %xmm0,%xmm1 + psraw $15,%xmm1 # xmm1 = 1's if negative, 0's if positive + pxor %xmm1,%xmm0 # complement negatives + psubw %xmm1,%xmm0 # add 1 to negatives + pmaxsw %xmm0,%xmm7 # store peak + + addl $16,%esi + jmp 1b + +2: movaps %xmm7,%xmm0 + psrldq $8,%xmm0 + pmaxsw %xmm0,%xmm7 + movaps %xmm7,%xmm0 + psrlq $32,%xmm0 + pmaxsw %xmm0,%xmm7 + movaps %xmm7,%xmm0 + psrlq $16,%xmm0 + pmaxsw %xmm0,%xmm7 # max value in low word of %xmm7 + + movd %xmm7,%eax + andl $0xffff,%eax + + popl %ecx + popl %esi + popl %ebp + ret diff --git a/peakval_sse_assist.s b/peakval_sse_assist.s new file mode 100644 index 0000000..827c800 --- /dev/null +++ b/peakval_sse_assist.s @@ -0,0 +1,49 @@ +# SSE assist routines for peakval +# Copyright 2001 Phil Karn, KA9Q +# May be used under the terms of the GNU Lesser General Public License (LGPL) + + .text + +# Find peak absolute value in signed 16-bit input samples +# int peakval_sse_assist(signed short *in,int cnt); + .global peakval_sse_assist + .type peakval_sse_assist,@function + .align 16 +peakval_sse_assist: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %ecx + + movl 8(%ebp),%esi + movl 12(%ebp),%ecx + + pxor %mm7,%mm7 # clear peak + +1: subl $4,%ecx + jl 2f + movq (%esi),%mm0 + movq %mm0,%mm1 + psraw $15,%mm1 # mm1 = 1's if negative, 0's if positive + pxor %mm1,%mm0 # complement negatives + psubw %mm1,%mm0 # add 1 to negatives + pmaxsw %mm0,%mm7 # store peak + + addl $8,%esi + jmp 1b + +2: movq %mm7,%mm0 + psrlq $32,%mm0 + pmaxsw %mm0,%mm7 + movq %mm7,%mm0 + psrlq $16,%mm0 + pmaxsw %mm0,%mm7 # max value in low word of %mm7 + + movd %mm7,%eax + andl $0xffff,%eax + + emms + popl %ecx + popl %esi + popl %ebp + ret diff --git a/rs-common.h b/rs-common.h
new file mode 100644 index 0000000..e64eb39 --- /dev/null +++ b/rs-common.h @@ -0,0 +1,26 @@ +/* Stuff common to all the general-purpose Reed-Solomon codecs + * Copyright 2004 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +/* Reed-Solomon codec control block */ +struct rs { + int mm; /* Bits per symbol */ + int nn; /* Symbols per block (= (1<<mm)-1) */ + data_t *alpha_to; /* log lookup table */ + data_t *index_of; /* Antilog lookup table */ + data_t *genpoly; /* Generator polynomial */ + int nroots; /* Number of generator roots = number of parity symbols */ + int fcr; /* First consecutive root, index form */ + int prim; /* Primitive element, index form */ + int iprim; /* prim-th root of 1, index form */ + int pad; /* Padding bytes in shortened block */ +}; + +static inline int modnn(struct rs *rs,int x){ + while (x >= rs->nn) { + x -= rs->nn; + x = (x >> rs->mm) + (x & rs->nn); + } + return x; +} @@ -0,0 +1,198 @@ +.TH REED-SOLOMON 3 +.SH NAME +init_rs_int, encode_rs_int, decode_rs_int, free_rs_int, +init_rs_char, encode_rs_char, decode_rs_char, free_rs_char, +encode_rs_8, decode_rs_8, encode_rs_ccsds, decode_rs_ccsds +\- Reed-Solomon encoding/decoding +.SH SYNOPSIS +.nf +.ft B +#include "fec.h" + +void *init_rs_int(int symsize,int gfpoly,int fcr,int prim, + int nroots,int pad); + +void encode_rs_int(void *rs,int *data,int *parity); + +int decode_rs_int(void *rs,int *data,int *eras_pos,int no_eras); + +void free_rs_int(void *rs); + + +void *init_rs_char(int symsize,int gfpoly,int fcr,int prim, + int nroots,int pad); + +void encode_rs_char(void *rs,unsigned char *data, + unsigned char *parity); + +int decode_rs_char(void *rs,unsigned char *data,int *eras_pos, + int no_eras); + +void free_rs_char(void *rs); + + +void encode_rs_8(unsigned char *data,unsigned char *parity, + int pad); + +int decode_rs_8(unsigned char *data,int *eras_pos,int no_eras, + int pad); + + +void encode_rs_ccsds(unsigned char 
*data,unsigned char *parity, + int pad); + +int decode_rs_ccsds(unsigned char *data,int *eras_pos,int no_eras, + int pad); + +unsigned char Taltab[256]; +unsigned char Tal1tab[256]; + +.fi + +.SH DESCRIPTION +These functions implement Reed-Solomon error control encoding and +decoding. For optimal performance in a variety of applications, three +sets of functions are supplied. To access these functions, add "-lfec" +to your linker command line. + +The functions with names ending in \fB_int\fR handle data in integer arrays, +permitting arbitrarily large codewords limited only by machine +resources. + +The functions with names ending in \fB_char\fR take unsigned char arrays and can +handle codes with symbols of 8 bits or less (i.e., with codewords of +255 symbols or less). + +\fBencode_rs_8\fR and \fBdecode_rs_8\fR implement a specific +(255,223) code with 8-bit symbols specified by the CCSDS: +a field generator of 1 + X + X^2 + X^7 + X^8 and a code +generator with first consecutive root = 112 and a primitive element of +11. These functions use the conventional +polynomial form, \fInot\fR the dual-basis specified in +the CCSDS standard, to represent symbols. This code may be +shortened by giving a non-zero \fBpad\fR value to produce a +(255-\fBpad\fR,223-\fBpad\fR) code. The padding will consist of the +specified number of zeroes at the front of the full codeword. + +For full CCSDS compatibility, \fBencode_rs_ccsds\fR and +\fBdecode_rs_ccsds\fR are provided. These functions use two lookup +tables, \fBTaltab\fR to convert from conventional to dual-basis, and +\fBTal1tab\fR to perform the inverse mapping from dual-basis to +conventional form, before and after calls to \fBencode_rs_8\fR +and \fBdecode_rs_8\fR. + +The \fB_8\fR and \fB_ccsds\fR functions do not require initialization. + +To use the general purpose RS encoder or decoder (i.e., +the \fB_char\fR or \fB_int\fR versions), the user must first +call \fBinit_rs_int\fR or \fBinit_rs_char\fR as appropriate. 
The +arguments are as follows: + +\fBsymsize\fR gives the symbol size in bits, up to 8 for \fBinit_rs_char\fR +or 32 for \fBinit_rs_int\fR on a machine with 32-bit ints (though such a +huge code would exhaust memory limits on a 32-bit machine). The resulting +Reed-Solomon code word will have 2^\fBsymsize\fR - 1 symbols, +each containing \fBsymsize\fR bits. The codeword may be shortened with the +\fBpad\fR parameter described below. + +\fBgfpoly\fR gives the extended Galois field generator polynomial coefficients, +with the 0th coefficient in the low order bit. The polynomial +\fImust\fR be primitive; if not, the call will fail and NULL will be +returned. + +\fBfcr\fR gives, in index form, the first consecutive root of the +Reed Solomon code generator polynomial. + +\fBprim\fR gives, in index form, the primitive element in the Galois field +used to generate the Reed Solomon code generator polynomial. + +\fBnroots\fR gives the number of roots in the Reed Solomon code +generator polynomial. This equals the number of parity symbols +per code block. + +\fBpad\fR gives the number of leading symbols in the codeword +that are implicitly padded to zero in a shortened code block. + +The resulting Reed-Solomon code has parameters (N,K), where +N = 2^\fBsymsize\fR - \fBpad\fR - 1 and K = N-\fBnroots\fR. + +The \fBencode_rs_char\fR and \fBencode_rs_int\fR functions accept +the pointer returned by \fBinit_rs_char\fR or +\fBinit_rs_int\fR, respectively, to +encode a block of data using the specified code. +The input data array is expected to +contain K symbols (of \fBsymsize\fR bits each, right justified +in each char or int) and \fBnroots\fR parity symbols will be placed +into the \fBparity\fR array, right justified. + +The \fBdecode_\fR functions correct +the errors in a Reed-Solomon codeword of N symbols up to the capability of the code. 
+An optional list of "erased" symbol indices may be given in the \fBeras_pos\fR +array to assist the decoder; this parameter may be NULL if no erasures +are given. The number of erased symbols must be given in the \fBno_eras\fR +parameter. + +To maximize performance, the encode and decode functions perform no +"sanity checking" of their inputs. Decoder failure may result if +\fBeras_pos\fR contains duplicate entries, and both encoder and +decoder will fail if an input symbol exceeds its allowable range. +(Symbol range overflow cannot occur with the \fB_8\fR or +\fB_ccsds\fR functions, +or with the \fB_char\fR functions when 8-bit symbols are specified.) + +The decoder corrects the symbols "in place", returning the number +of symbols in error. If the codeword is uncorrectable, -1 is returned +and the data block is unchanged. If \fBeras_pos\fR is non-null, it is +used to return a list of corrected symbol positions, in no particular +order. This means that the +array passed through this parameter \fImust\fR have at least \fBnroots\fR +elements to prevent a possible buffer overflow. + +The \fBfree_rs_int\fR and \fBfree_rs_char\fR functions free the internal +space allocated by the \fBinit_rs_int\fR and \fBinit_rs_char\fR functions, +respectively. + +The functions \fBencode_rs_8\fR and \fBdecode_rs_8\fR do not have +corresponding \fBinit\fR and \fBfree\fR, nor do they take the +\fBrs\fR argument accepted by the other functions as their parameters +are statically compiled. These functions implement a code +equivalent to calling + +\fBinit_rs_char\fR(8,0x187,112,11,32,pad); + +and using the resulting pointer with \fBencode_rs_char\fR and +\fBdecode_rs_char\fR. + +.SH RETURN VALUES +\fBinit_rs_int\fR and \fBinit_rs_char\fR return a pointer to an internal +control structure that must be passed to the corresponding encode, decode +and free functions. These functions return NULL on error.
+ +The \fBdecode_\fR functions return a count of corrected +symbols, or -1 if the block was uncorrectable. + +.SH AUTHOR +Phil Karn, KA9Q (karn@ka9q.net), based heavily on earlier work by Robert +Morelos-Zaragoza (robert@spectra.eng.hawaii.edu) and Hari Thirumoorthy +(harit@spectra.eng.hawaii.edu). Extra improvements suggested by Detmar +Welz (dwelz@web.de). + +.SH COPYRIGHT +Copyright 2004, Phil Karn, KA9Q. May be used under the terms of the +GNU Lesser General Public License (LGPL). + +.SH SEE ALSO +CCSDS 101.0-B-6: Telemetry Channel Coding. +http://www.ccsds.org/documents/101x0b6.pdf + +.SH NOTE +CCSDS chose the "dual basis" symbol representation because it +simplified the implementation of a Reed-Solomon encoder in dedicated +hardware. However, this approach holds no advantages for a software +implementation on a general purpose computer, so use of the dual basis +is recommended only if compatibility with the CCSDS standard is needed, +e.g., to decode data from an existing spacecraft using the CCSDS +standard. If you just want a fast (255,223) RS codec without needing +to interoperate with a CCSDS standard code, use \fBencode_rs_8\fR +and \fBdecode_rs_8\fR.
+ diff --git a/rs_speedtest.c b/rs_speedtest.c new file mode 100644 index 0000000..225f160 --- /dev/null +++ b/rs_speedtest.c @@ -0,0 +1,54 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <sys/time.h> +#include <sys/resource.h> +#include "fec.h" + +int main(){ + unsigned char block[255]; + int i; + void *rs; + struct rusage start,finish; + double extime; + int trials = 10000; + + for(i=0;i<223;i++) + block[i] = 0x01; + + rs = init_rs_char(8,0x187,112,11,32,0); + encode_rs_char(rs,block,&block[223]); + + getrusage(RUSAGE_SELF,&start); + for(i=0;i<trials;i++){ +#if 0 + block[0] ^= 0xff; /* Introduce an error */ + block[2] ^= 0xff; /* Introduce an error */ +#endif + decode_rs_char(rs,block,NULL,0); + } + getrusage(RUSAGE_SELF,&finish); + extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec); + + printf("Execution time for %d Reed-Solomon blocks using general decoder: %.2f sec\n",trials,extime); + printf("decoder speed: %g bits/s\n",trials*223*8/extime); + + + encode_rs_8(block,&block[223],0); + getrusage(RUSAGE_SELF,&start); + for(i=0;i<trials;i++){ +#if 0 + block[0] ^= 0xff; /* Introduce an error */ + block[2] ^= 0xff; /* Introduce an error */ +#endif + decode_rs_8(block,NULL,0,0); + } + getrusage(RUSAGE_SELF,&finish); + extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec); + printf("Execution time for %d Reed-Solomon blocks using CCSDS decoder: %.2f sec\n",trials,extime); + printf("decoder speed: %g bits/s\n",trials*223*8/extime); + + exit(0); +} + diff --git a/rstest.c b/rstest.c new file mode 100644 index 0000000..539b40a --- /dev/null +++ b/rstest.c @@ -0,0 +1,296 @@ +/* Test the Reed-Solomon codecs + * for various block sizes and with random data and random error patterns + * + * Copyright 2002 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + 
+#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include <time.h> +#include "fec.h" + + +struct etab { + int symsize; + int genpoly; + int fcs; + int prim; + int nroots; + int ntrials; +} Tab[] = { + {2, 0x7, 1, 1, 1, 10 }, + {3, 0xb, 1, 1, 2, 10 }, + {4, 0x13, 1, 1, 4, 10 }, + {5, 0x25, 1, 1, 6, 10 }, + {6, 0x43, 1, 1, 8, 10 }, + {7, 0x89, 1, 1, 10, 10 }, + {8, 0x11d, 1, 1, 32, 10 }, + {8, 0x187, 112,11, 32, 10 }, /* Duplicates CCSDS codec */ + {9, 0x211, 1, 1, 32, 10 }, + {10,0x409, 1, 1, 32, 10 }, + {11,0x805, 1, 1, 32, 10 }, + {12,0x1053, 1, 1, 32, 5 }, + {13,0x201b, 1, 1, 32, 2 }, + {14,0x4443, 1, 1, 32, 1 }, + {15,0x8003, 1, 1, 32, 1 }, + {16,0x1100b, 1, 1, 32, 1 }, + {0, 0, 0, 0, 0}, +}; + +int exercise_char(struct etab *e); +int exercise_int(struct etab *e); +int exercise_8(void); + +int main(){ + int i; + + srandom(time(NULL)); + + printf("Testing fixed CCSDS encoder...\n"); + exercise_8(); + for(i=0;Tab[i].symsize != 0;i++){ + int nn,kk; + + nn = (1<<Tab[i].symsize) - 1; + kk = nn - Tab[i].nroots; + printf("Testing (%d,%d) code...\n",nn,kk); + if(Tab[i].symsize <= 8) + exercise_char(&Tab[i]); + else + exercise_int(&Tab[i]); + } + exit(0); +} + +int exercise_8(void){ + int nn = 255; + unsigned char block[nn],tblock[nn]; + int errlocs[nn],derrlocs[nn]; + int i; + int errors; + int derrors,kk; + int errval,errloc; + int erasures; + int decoder_errors = 0; + + /* Compute code parameters */ + kk = 223; + + + /* Test up to the error correction capacity of the code */ + for(errors=0;errors<=(nn-kk)/2;errors++){ + + /* Load block with random data and encode */ + for(i=0;i<kk;i++) + block[i] = random() & nn; + memcpy(tblock,block,sizeof(block)); + encode_rs_8(block,&block[kk],0); + + /* Make temp copy, seed with errors */ + memcpy(tblock,block,sizeof(block)); + memset(errlocs,0,sizeof(errlocs)); + memset(derrlocs,0,sizeof(derrlocs)); + erasures=0; + for(i=0;i<errors;i++){ + do { + errval = random() & nn; + } while(errval == 0); /* Error value must 
be nonzero */ + + do { + errloc = random() % nn; + } while(errlocs[errloc] != 0); /* Must not choose the same location twice */ + + errlocs[errloc] = 1; + +#if FLAG_ERASURE + if(random() & 1) /* 50-50 chance */ + derrlocs[erasures++] = errloc; +#endif + tblock[errloc] ^= errval; + } + + /* Decode the errored block */ + derrors = decode_rs_8(tblock,derrlocs,erasures,0); + + if(derrors != errors){ + printf("(%d,%d) decoder says %d errors, true number is %d\n",nn,kk,derrors,errors); + decoder_errors++; + } + for(i=0;i<derrors;i++){ + if(errlocs[derrlocs[i]] == 0){ + printf("(%d,%d) decoder indicates error in location %d without error\n",nn,kk,derrlocs[i]); + decoder_errors++; + } + } + if(memcmp(tblock,block,sizeof(tblock)) != 0){ + printf("(%d,%d) decoder uncorrected errors! output ^ input:",nn,kk); + decoder_errors++; + for(i=0;i<nn;i++) + printf(" %02x",tblock[i] ^ block[i]); + printf("\n"); + } + } + return decoder_errors; +} + + +int exercise_char(struct etab *e){ + int nn = (1<<e->symsize) - 1; + unsigned char block[nn],tblock[nn]; + int errlocs[nn],derrlocs[nn]; + int i; + int errors; + int derrors,kk; + int errval,errloc; + int erasures; + int decoder_errors = 0; + void *rs; + + if(e->symsize > 8) + return -1; + + /* Compute code parameters */ + kk = nn - e->nroots; + + rs = init_rs_char(e->symsize,e->genpoly,e->fcs,e->prim,e->nroots,0); + if(rs == NULL){ + printf("init_rs_char failed!\n"); + return -1; + } + /* Test up to the error correction capacity of the code */ + for(errors=0;errors <= e->nroots/2;errors++){ + + /* Load block with random data and encode */ + for(i=0;i<kk;i++) + block[i] = random() & nn; + memcpy(tblock,block,sizeof(block)); + encode_rs_char(rs,block,&block[kk]); + + /* Make temp copy, seed with errors */ + memcpy(tblock,block,sizeof(block)); + memset(errlocs,0,sizeof(errlocs)); + memset(derrlocs,0,sizeof(derrlocs)); + erasures=0; + for(i=0;i<errors;i++){ + do { + errval = random() & nn; + } while(errval == 0); /* Error value must be 
nonzero */ + + do { + errloc = random() % nn; + } while(errlocs[errloc] != 0); /* Must not choose the same location twice */ + + errlocs[errloc] = 1; + +#if FLAG_ERASURE + if(random() & 1) /* 50-50 chance */ + derrlocs[erasures++] = errloc; +#endif + tblock[errloc] ^= errval; + } + + /* Decode the errored block */ + derrors = decode_rs_char(rs,tblock,derrlocs,erasures); + + if(derrors != errors){ + printf("(%d,%d) decoder says %d errors, true number is %d\n",nn,kk,derrors,errors); + decoder_errors++; + } + for(i=0;i<derrors;i++){ + if(errlocs[derrlocs[i]] == 0){ + printf("(%d,%d) decoder indicates error in location %d without error\n",nn,kk,derrlocs[i]); + decoder_errors++; + } + } + if(memcmp(tblock,block,sizeof(tblock)) != 0){ + printf("(%d,%d) decoder uncorrected errors! output ^ input:",nn,kk); + decoder_errors++; + for(i=0;i<nn;i++) + printf(" %02x",tblock[i] ^ block[i]); + printf("\n"); + } + } + + free_rs_char(rs); + return 0; +} + +int exercise_int(struct etab *e){ + int nn = (1<<e->symsize) - 1; + int block[nn],tblock[nn]; + int errlocs[nn],derrlocs[nn]; + int i; + int errors; + int derrors,kk; + int errval,errloc; + int erasures; + int decoder_errors = 0; + void *rs; + + /* Compute code parameters */ + kk = nn - e->nroots; + + rs = init_rs_int(e->symsize,e->genpoly,e->fcs,e->prim,e->nroots,0); + if(rs == NULL){ + printf("init_rs_int failed!\n"); + return -1; + } + /* Test up to the error correction capacity of the code */ + for(errors=0;errors <= e->nroots/2;errors++){ + + /* Load block with random data and encode */ + for(i=0;i<kk;i++) + block[i] = random() & nn; + memcpy(tblock,block,sizeof(block)); + encode_rs_int(rs,block,&block[kk]); + + /* Make temp copy, seed with errors */ + memcpy(tblock,block,sizeof(block)); + memset(errlocs,0,sizeof(errlocs)); + memset(derrlocs,0,sizeof(derrlocs)); + erasures=0; + for(i=0;i<errors;i++){ + do { + errval = random() & nn; + } while(errval == 0); /* Error value must be nonzero */ + + do { + errloc = random() % nn; 
+ } while(errlocs[errloc] != 0); /* Must not choose the same location twice */ + + errlocs[errloc] = 1; + +#if FLAG_ERASURE + if(random() & 1) /* 50-50 chance */ + derrlocs[erasures++] = errloc; +#endif + tblock[errloc] ^= errval; + } + + /* Decode the errored block */ + derrors = decode_rs_int(rs,tblock,derrlocs,erasures); + + if(derrors != errors){ + printf("(%d,%d) decoder says %d errors, true number is %d\n",nn,kk,derrors,errors); + decoder_errors++; + } + for(i=0;i<derrors;i++){ + if(errlocs[derrlocs[i]] == 0){ + printf("(%d,%d) decoder indicates error in location %d without error\n",nn,kk,derrlocs[i]); + decoder_errors++; + } + } + if(memcmp(tblock,block,sizeof(tblock)) != 0){ + printf("(%d,%d) decoder uncorrected errors! output ^ input:",nn,kk); + decoder_errors++; + for(i=0;i<nn;i++) + printf(" %02x",tblock[i] ^ block[i]); + printf("\n"); + } + } + + free_rs_int(rs); + return 0; +} @@ -0,0 +1,43 @@ +#include <math.h> +#include <stdlib.h> +#include "fec.h" + +#define MAX_RANDOM 0x7fffffff + +/* Generate gaussian random double with specified mean and std_dev */ +double normal_rand(double mean, double std_dev) +{ + double fac,rsq,v1,v2; + static double gset; + static int iset; + + if(iset){ + /* Already got one */ + iset = 0; + return mean + std_dev*gset; + } + /* Generate two evenly distributed numbers between -1 and +1 + * that are inside the unit circle + */ + do { + v1 = 2.0 * (double)random() / MAX_RANDOM - 1; + v2 = 2.0 * (double)random() / MAX_RANDOM - 1; + rsq = v1*v1 + v2*v2; + } while(rsq >= 1.0 || rsq == 0.0); + fac = sqrt(-2.0*log(rsq)/rsq); + gset = v1*fac; + iset++; + return mean + std_dev*v2*fac; +} + +unsigned char addnoise(int sym,double amp,double gain,double offset,int clip){ + int sample; + + sample = offset + gain*normal_rand(sym?amp:-amp,1.0); + /* Clip to 8-bit offset range */ + if(sample < 0) + sample = 0; + else if(sample > clip) + sample = clip; + return sample; +} diff --git a/simd-viterbi.3 b/simd-viterbi.3 new file mode 100644 
index 0000000..4c67593 --- /dev/null +++ b/simd-viterbi.3 @@ -0,0 +1,247 @@ +.TH SIMD-VITERBI 3 +.SH NAME +create_viterbi27, set_viterbi27_polynomial, init_viterbi27, update_viterbi27_blk, +chainback_viterbi27, delete_viterbi27, +create_viterbi29, set_viterbi29_polynomial, init_viterbi29, update_viterbi29_blk, +chainback_viterbi29, delete_viterbi29, +create_viterbi39, set_viterbi39_polynomial, init_viterbi39, update_viterbi39_blk, +chainback_viterbi39, delete_viterbi39, +create_viterbi615, set_viterbi615_polynomial, init_viterbi615, update_viterbi615_blk, +chainback_viterbi615, delete_viterbi615 \- IA32 SIMD-assisted Viterbi decoders +.SH SYNOPSIS +.nf +.ft B +#include "fec.h" +void *create_viterbi27(int blocklen); +void set_viterbi27_polynomial(int polys[2]); +int init_viterbi27(void *vp,int starting_state); +int update_viterbi27_blk(void *vp,unsigned char syms[],int nbits); +int chainback_viterbi27(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi27(void *vp); +.fi +.sp +.nf +.ft B +void *create_viterbi29(int blocklen); +void set_viterbi29_polynomial(int polys[2]); +int init_viterbi29(void *vp,int starting_state); +int update_viterbi29_blk(void *vp,unsigned char syms[],int nbits); +int chainback_viterbi29(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi29(void *vp); +.fi +.sp +.nf +.ft B +void *create_viterbi39(int blocklen); +void set_viterbi39_polynomial(int polys[3]); +int init_viterbi39(void *vp,int starting_state); +int update_viterbi39_blk(void *vp,unsigned char syms[],int nbits); +int chainback_viterbi39(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi39(void *vp); +.fi +.sp +.nf +.ft B +void *create_viterbi615(int blocklen); +void set_viterbi615_polynomial(int polys[6]); +int init_viterbi615(void *vp,int starting_state); +int update_viterbi615_blk(void *vp,unsigned char syms[],int nbits); +int chainback_viterbi615(void 
*vp, unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi615(void *vp); +.fi +.SH DESCRIPTION +These functions implement high performance Viterbi decoders for four +convolutional codes: a rate 1/2 constraint length 7 (k=7) code +("viterbi27"), a rate 1/2 k=9 code ("viterbi29"), +a rate 1/3 k=9 code ("viterbi39") and a rate 1/6 k=15 code ("viterbi615"). +The decoders use the Intel IA32 or PowerPC SIMD instruction sets, if available, to improve +decoding speed. + +On the IA32 there are three different SIMD instruction sets. The first +and most common is MMX, introduced on later Intel Pentiums and then on +the Intel Pentium II and most Intel clones (AMD K6, Transmeta Crusoe, +etc). SSE was introduced on the Pentium III and later implemented in +the AMD Athlon 4 (AMD calls it "3D Now! Professional"). Most +recently, SSE2 was introduced in the Intel Pentium 4, and has been +adopted by more recent AMD CPUs. The presence of SSE2 implies the +existence of SSE, which in turn implies MMX. + +Altivec is the PowerPC SIMD instruction set. It is roughly comparable +to SSE2. Altivec was introduced to the general public in the Apple +Macintosh G4; it is also present in the G5. Altivec is actually a +Motorola trademark; Apple calls it "Velocity Engine" and IBM calls it +"VMX". All refer to the same thing. + +When built for the IA32 or PPC architectures, the functions +automatically use the most powerful SIMD instruction set available. If +no SIMD instructions are available, or if the library is built for a +non-IA32, non-PPC machine, a portable C version is executed +instead. + +.SH USAGE +Four versions of each function are provided, one for each code. +In the following discussion, change "viterbi" to "viterbi27", "viterbi29", "viterbi39" +or "viterbi615" as desired. + +Before Viterbi decoding can begin, an instance must first be created with +\fBcreate_viterbi()\fR. 
This function creates and returns a pointer to +an internal control structure +containing the path metrics and the branch +decisions. \fBcreate_viterbi()\fR takes one argument that gives the +length of the data block in bits. You \fImust not\fR attempt to +decode a block longer than the length given to \fBcreate_viterbi()\fR. + +Before decoding a new frame, +\fBinit_viterbi()\fR must be called to reset the decoder state. +It accepts the instance pointer returned by +\fBcreate_viterbi()\fR and the initial starting state of the +convolutional encoder (usually 0). If the initial starting state is unknown or +incorrect, the decoder will still function but the decoded data may be +incorrect at the start of the block. + +Blocks of received symbols are processed with calls to +\fBupdate_viterbi_blk()\fR. The \fBnbits\fR parameter specifies the +number of \fIdata bits\fR (not channel symbols) represented by the +\fBsyms\fR buffer. (For rate 1/2 codes, the number of symbols in +\fBsyms\fR is twice \fInbits\fR, and so on.) +Each symbol is expected to range +from 0 through 255, with 0 corresponding to a "strong 0" and 255 +corresponding to a "strong 1". The caller is responsible for +determining the proper pairing of input symbols (commonly known as +decoder symbol phasing). + +At the end of the block, the data is recovered with a call to +\fBchainback_viterbi()\fR. The arguments are the pointer to the +decoder instance, a pointer to a user-supplied buffer into which the +decoded data is to be written, the number of data bits (not bytes) +that are to be decoded, and the terminal state of the convolutional +encoder at the end of the frame (usually 0). If the terminal state is +incorrect or unknown, the decoded data bits at the end of the frame +may be unreliable. The decoded data is written in big-endian order, +i.e., the first bit in the frame is written into the high order bit of +the first byte in the buffer. 
If the frame is not an integral number +of bytes long, the low order bits of the last byte in the frame will +be unused. + +Note that the decoders assume the use of a tail, i.e., the encoding +and transmission of a sufficient number of padding bits beyond the end +of the user data to force the convolutional encoder into the known +terminal state given to \fBchainback_viterbi()\fR. The tail is +always one bit less than the constraint length of the code, so the k=7 +code uses 6 tail bits (12 tail symbols), the k=9 codes use 8 tail bits +(16 tail symbols at rate 1/2, 24 at rate 1/3) and the k=15 code uses 14 tail bits (84 tail +symbols). + +The tail bits are not included in the length arguments to +\fBcreate_viterbi()\fR and \fBchainback_viterbi()\fR. For example, if +the block contains 1000 user bits, then this would be the length +parameter given to \fBcreate_viterbi27()\fR and +\fBchainback_viterbi27()\fR, and \fBupdate_viterbi27_blk()\fR would be called +with a total of 2012 symbols - the last 12 encoded symbols +representing the tail bits. + +After the call to \fBchainback_viterbi()\fR, the decoder may be reset +with a call to \fBinit_viterbi()\fR and another block can be decoded. +Alternatively, \fBdelete_viterbi()\fR can be called to free all resources +used by the Viterbi decoder. + +The \fBset_viterbi_polynomial()\fR function allows use of other than the default +code generator polynomials. Although only one set of polynomials is generally +used with each code, there are different conventions as to their order and +symbol polarity, and this function simplifies their use. + +The default polynomials for the viterbi27 routines +are those of the NASA-JPL convention \fIwithout\fR symbol inversion. +The NASA-JPL convention normally inverts the first symbol. +The CCSDS/NASA-GSFC convention swaps the two symbols and inverts the second. 
+.sp +To set the NASA-JPL convention with symbol inversion: +.sp +.nf +.ft B +int polys[2] = { -V27POLYA,V27POLYB }; +set_viterbi27_polynomial(polys); +.ft R +.fi +.sp +and to set the CCSDS convention with symbol inversion: +.sp +.nf +.ft B +int polys[2] = { V27POLYB,-V27POLYA }; +set_viterbi27_polynomial(polys); +.ft R +.fi +.sp +The default polynomials for the viterbi615 routines +are those used by the Cassini spacecraft \fIwithout\fR +symbol inversion. Mars Pathfinder (MPF) and STEREO +swap the third and fourth polynomials. +Both conventions invert the +first, third and fifth symbols. Refer to fec.h for the polynomial constant definitions. +.sp +To set the Cassini convention with symbol inversion, do the following: + +.nf +.ft B +int polys[6] = { -V615POLYA,V615POLYB,-V615POLYC,V615POLYD,-V615POLYE,V615POLYF }; +set_viterbi615_polynomial(polys); +.ft R +.fi +.sp +and to set the MPF/STEREO convention with symbol inversion: +.sp +.nf +.ft B +int polys[6] = { -V615POLYA,V615POLYB,-V615POLYD,V615POLYC,-V615POLYE,V615POLYF }; +set_viterbi615_polynomial(polys); +.ft R +.fi + +For performance reasons, calling this function changes the code +generator polynomials for \fIall\fR instances of corresponding Viterbi decoder, +including those already created. + +.SH ERROR PERFORMANCE +These decoders have all been extensively tested and found to provide +performance consistent with that expected for soft-decision Viterbi +decoding with 8-bit symbols. + +Due to internal differences, the implementations +vary slightly in error performance. In +general, the portable C versions exhibit the best error performance +because they use full-sized branch metrics, and the MMX versions +exhibit the worst because they use 8-bit branch metrics with modulo +comparisons. The SSE, SSE2 and Altivec implementations of the r=1/2 k=7 and +r=1/2 k=9 codes use unsigned +8-bit branch metrics, and are almost as good as the C versions. 
The +r=1/3 k=9 and r=1/6 k=15 codes are implemented with 16-bit path metrics in all SIMD +versions. + +.SH DIRECT ACCESS TO SPECIFIC FUNCTION VERSIONS +Calling the functions listed above automatically calls the appropriate +version of the function depending on the CPU type and available SIMD +instructions. A particular version can also be called directly by +appending the appropriate suffix to the function name. The available +suffixes are "_mmx", "_sse", "_sse2", "_av" and "_port", for the MMX, +SSE, SSE2, Altivec and portable versions, respectively. For example, +the SSE2 version of the update_viterbi27_blk() function can be invoked +as update_viterbi27_blk_sse2(). + +Naturally, the _av functions are only available on the PowerPC and the +_mmx, _sse and _sse2 versions are only available on IA-32. Calling +a SIMD-enabled function on a CPU that doesn't support the appropriate +set of instructions will result in an illegal instruction exception. + +.SH RETURN VALUES +\fBcreate_viterbi\fR returns a pointer to the structure containing +the decoder state. +The other functions declared to return int return -1 on error, 0 otherwise. + +.SH AUTHOR & COPYRIGHT +Phil Karn, KA9Q (karn@ka9q.net) + +.SH LICENSE +This software may be used under the terms of the GNU Lesser General Public License (LGPL). 
+ + diff --git a/sqtest.c b/sqtest.c new file mode 100644 index 0000000..b2abb09 --- /dev/null +++ b/sqtest.c @@ -0,0 +1,42 @@ +/* Verify correctness of the sum-of-square routines against a plain C reference */ +#include <stdio.h> +#include <stdlib.h> +#include <time.h> + +/* These values should trigger leading/trailing array fragment handling */ +#define NSAMP 200002 +#define OFFSET 1 + +long long sumsq_wq(signed short *in,int cnt); +long long sumsq_wq_ref(signed short *in,int cnt); + +int main(){ + int i; + long long result,rresult; + signed short samples[NSAMP]; + + srandom(time(NULL)); + + for(i=0;i<NSAMP;i++) + samples[i] = random() & 0xffff; + + rresult = sumsq_wq_ref(&samples[OFFSET],NSAMP-OFFSET); /* reference value; previously called sumsq_wq, so the check compared the routine with itself */ + result = sumsq_wq(&samples[OFFSET],NSAMP-OFFSET); /* routine under test */ + if(result == rresult){ + printf("OK\n"); + } else { + printf("sum mismatch: %lld != %lld\n",result,rresult); + } + exit(0); +} + +/* Reference implementation: straightforward sum of squares of cnt samples */ long long sumsq_wq_ref(signed short *in,int cnt){ + long long sum = 0; + int i; + + for(i=0;i<cnt;i++){ + sum += (long)in[i] * in[i]; + } + return sum; +} + diff --git a/sse2bfly27.s b/sse2bfly27.s new file mode 100644 index 0000000..27422a2 --- /dev/null +++ b/sse2bfly27.s @@ -0,0 +1,202 @@ +/* Intel SIMD (SSE2) implementations of Viterbi ACS butterflies + for 64-state (k=7) convolutional code + Copyright 2003 Phil Karn, KA9Q + This code may be used under the terms of the GNU Lesser General Public License (LGPL) + + int update_viterbi27_blk_sse2(struct v27 *vp,unsigned char syms[],int nbits) ; +*/ + # SSE2 (128-bit integer SIMD) version + # Requires Pentium 4 or better + + # These are offsets into struct v27, defined in viterbi27.h + .set DP,128 + .set OLDMETRICS,132 + .set NEWMETRICS,136 + .text + .global update_viterbi27_blk_sse2,Branchtab27_sse2 + .type update_viterbi27_blk_sse2,@function + .align 16 + +update_viterbi27_blk_sse2: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %edi + pushl %edx + pushl %ebx + + movl 8(%ebp),%edx # edx = vp + testl %edx,%edx + jnz 0f + movl $-1,%eax # vp == NULL: return -1 (immediate; without $ this is a load from address -1) + jmp err +0: movl 
OLDMETRICS(%edx),%esi # esi -> old metrics + movl NEWMETRICS(%edx),%edi # edi -> new metrics + movl DP(%edx),%edx # edx -> decisions + +1: movl 16(%ebp),%eax # eax = nbits + decl %eax + jl 2f # passed zero, we're done + movl %eax,16(%ebp) + + xorl %eax,%eax + movl 12(%ebp),%ebx # ebx = syms + movb (%ebx),%al + movd %eax,%xmm6 # xmm6[0] = first symbol + movb 1(%ebx),%al + movd %eax,%xmm5 # xmm5[0] = second symbol + addl $2,%ebx + movl %ebx,12(%ebp) + + punpcklbw %xmm6,%xmm6 # xmm6[1] = xmm6[0] + punpcklbw %xmm5,%xmm5 + pshuflw $0,%xmm6,%xmm6 # copy low word to low 3 + pshuflw $0,%xmm5,%xmm5 + punpcklqdq %xmm6,%xmm6 # propagate to all 16 + punpcklqdq %xmm5,%xmm5 + # xmm6 now contains first symbol in each byte, xmm5 the second + + movdqa thirtyones,%xmm7 + + # each invocation of this macro does 16 butterflies in parallel + .MACRO butterfly GROUP + # compute branch metrics + movdqa Branchtab27_sse2+(16*\GROUP),%xmm4 + movdqa Branchtab27_sse2+32+(16*\GROUP),%xmm3 + pxor %xmm6,%xmm4 + pxor %xmm5,%xmm3 + + # compute 5-bit branch metric in xmm4 by adding the individual symbol metrics + # This is okay for this + # code because the worst-case metric spread (at high Eb/No) is only 120, + # well within the range of our unsigned 8-bit path metrics, and even within + # the range of signed 8-bit path metrics + pavgb %xmm3,%xmm4 + psrlw $3,%xmm4 + + pand %xmm7,%xmm4 + + movdqa (16*\GROUP)(%esi),%xmm0 # Incoming path metric, high bit = 0 + movdqa ((16*\GROUP)+32)(%esi),%xmm3 # Incoming path metric, high bit = 1 + movdqa %xmm0,%xmm2 + movdqa %xmm3,%xmm1 + paddusb %xmm4,%xmm0 # note use of saturating arithmetic + paddusb %xmm4,%xmm3 # this shouldn't be necessary, but why not? 
+ + # negate branch metrics + pxor %xmm7,%xmm4 + paddusb %xmm4,%xmm1 + paddusb %xmm4,%xmm2 + + # Find survivors, leave in mm0,2 + pminub %xmm1,%xmm0 + pminub %xmm3,%xmm2 + # get decisions, leave in mm1,3 + pcmpeqb %xmm0,%xmm1 + pcmpeqb %xmm2,%xmm3 + + # interleave and store new branch metrics in mm0,2 + movdqa %xmm0,%xmm4 + punpckhbw %xmm2,%xmm0 # interleave second 16 new metrics + punpcklbw %xmm2,%xmm4 # interleave first 16 new metrics + movdqa %xmm0,(32*\GROUP+16)(%edi) + movdqa %xmm4,(32*\GROUP)(%edi) + + # interleave decisions & store + movdqa %xmm1,%xmm4 + punpckhbw %xmm3,%xmm1 + punpcklbw %xmm3,%xmm4 + # work around bug in gas due to Intel doc error + .byte 0x66,0x0f,0xd7,0xd9 # pmovmskb %xmm1,%ebx + shll $16,%ebx + .byte 0x66,0x0f,0xd7,0xc4 # pmovmskb %xmm4,%eax + orl %eax,%ebx + movl %ebx,(4*\GROUP)(%edx) + .endm + + # invoke macro 2 times for a total of 32 butterflies + butterfly GROUP=0 + butterfly GROUP=1 + + addl $8,%edx # bump decision pointer + + # See if we have to normalize. This requires an explanation. We don't want + # our path metrics to exceed 255 on the *next* iteration. Since the + # largest branch metric is 30, that means we don't want any to exceed 225 + # on *this* iteration. Rather than look them all, we just pick an arbitrary one + # (the first) and see if it exceeds 225-120=105, where 120 is the experimentally- + # determined worst-case metric spread for this code and branch metrics in the range 0-30. + + # This is extremely conservative, and empirical testing at a variety of Eb/Nos might + # show that a higher threshold could be used without affecting BER performance + movl (%edi),%eax # extract first output metric + andl $255,%eax + cmp $105,%eax + jle done # No, no need to normalize + + # Normalize by finding smallest metric and subtracting it + # from all metrics. We can't just pick an arbitrary small constant because + # the minimum metric might be zero! 
+ movdqa (%edi),%xmm0 + movdqa %xmm0,%xmm4 + movdqa 16(%edi),%xmm1 + pminub %xmm1,%xmm4 + movdqa 32(%edi),%xmm2 + pminub %xmm2,%xmm4 + movdqa 48(%edi),%xmm3 + pminub %xmm3,%xmm4 + + # crunch down to single lowest metric + movdqa %xmm4,%xmm5 + psrldq $8,%xmm5 # the count to psrldq is bytes, not bits! + pminub %xmm5,%xmm4 + movdqa %xmm4,%xmm5 + psrlq $32,%xmm5 + pminub %xmm5,%xmm4 + movdqa %xmm4,%xmm5 + psrlq $16,%xmm5 + pminub %xmm5,%xmm4 + movdqa %xmm4,%xmm5 + psrlq $8,%xmm5 + pminub %xmm5,%xmm4 # now in lowest byte of %xmm4 + + punpcklbw %xmm4,%xmm4 # lowest 2 bytes + pshuflw $0,%xmm4,%xmm4 # lowest 8 bytes + punpcklqdq %xmm4,%xmm4 # all 16 bytes + + # xmm4 now contains lowest metric in all 16 bytes + # subtract it from every output metric + psubusb %xmm4,%xmm0 + psubusb %xmm4,%xmm1 + psubusb %xmm4,%xmm2 + psubusb %xmm4,%xmm3 + movdqa %xmm0,(%edi) + movdqa %xmm1,16(%edi) + movdqa %xmm2,32(%edi) + movdqa %xmm3,48(%edi) + +done: + # swap metrics + movl %esi,%eax + movl %edi,%esi + movl %eax,%edi + jmp 1b + +2: movl 8(%ebp),%ebx # ebx = vp + # stash metric pointers + movl %esi,OLDMETRICS(%ebx) + movl %edi,NEWMETRICS(%ebx) + movl %edx,DP(%ebx) # stash incremented value of vp->dp + xorl %eax,%eax +err: popl %ebx + popl %edx + popl %edi + popl %esi + popl %ebp + ret + + .data + .align 16 + +thirtyones: + .byte 31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31 diff --git a/sse2bfly29.s b/sse2bfly29.s new file mode 100644 index 0000000..0fa1742 --- /dev/null +++ b/sse2bfly29.s @@ -0,0 +1,245 @@ +/* Intel SIMD SSE2 implementation of Viterbi ACS butterflies + for 256-state (k=9) convolutional code + Copyright 2004 Phil Karn, KA9Q + This code may be used under the terms of the GNU Lesser General Public License (LGPL) + + void update_viterbi29_blk_sse2(struct v29 *vp,unsigned char *syms,int nbits) ; +*/ + + # SSE2 (128-bit integer SIMD) version + # Requires Pentium 4 or better + # These are offsets into struct v29, defined in viterbi29.h + .set DP,512 + .set OLDMETRICS,516 + 
.set NEWMETRICS,520 + + .text + .global update_viterbi29_blk_sse2,Branchtab29_sse2 + .type update_viterbi29_blk_sse2,@function + .align 16 + +update_viterbi29_blk_sse2: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %edi + pushl %edx + pushl %ebx + + movl 8(%ebp),%edx # edx = vp + testl %edx,%edx + jnz 0f + movl $-1,%eax # vp == NULL: return -1 (immediate; without $ this is a load from address -1) + jmp err +0: movl OLDMETRICS(%edx),%esi # esi -> old metrics + movl NEWMETRICS(%edx),%edi # edi -> new metrics + movl DP(%edx),%edx # edx -> decisions + +1: movl 16(%ebp),%eax # eax = nbits + decl %eax + jl 2f # passed zero, we're done + movl %eax,16(%ebp) + + xorl %eax,%eax + movl 12(%ebp),%ebx # ebx = syms + movb (%ebx),%al + movd %eax,%xmm6 # xmm6[0] = first symbol + movb 1(%ebx),%al + movd %eax,%xmm5 # xmm5[0] = second symbol + addl $2,%ebx + movl %ebx,12(%ebp) + + punpcklbw %xmm6,%xmm6 # xmm6[1] = xmm6[0] + punpcklbw %xmm5,%xmm5 + movdqa thirtyones,%xmm7 + pshuflw $0,%xmm6,%xmm6 # copy low word to low 3 + pshuflw $0,%xmm5,%xmm5 + punpcklqdq %xmm6,%xmm6 # propagate to all 16 + punpcklqdq %xmm5,%xmm5 + # xmm6 now contains first symbol in each byte, xmm5 the second + + movdqa thirtyones,%xmm7 + + # each invocation of this macro does 16 butterflies in parallel + .MACRO butterfly GROUP + # compute branch metrics + movdqa Branchtab29_sse2+(16*\GROUP),%xmm4 + movdqa Branchtab29_sse2+128+(16*\GROUP),%xmm3 + pxor %xmm6,%xmm4 + pxor %xmm5,%xmm3 + pavgb %xmm3,%xmm4 + psrlw $3,%xmm4 + + pand %xmm7,%xmm4 # xmm4 contains branch metrics + + movdqa (16*\GROUP)(%esi),%xmm0 # Incoming path metric, high bit = 0 + movdqa ((16*\GROUP)+128)(%esi),%xmm3 # Incoming path metric, high bit = 1 + movdqa %xmm0,%xmm2 + movdqa %xmm3,%xmm1 + paddusb %xmm4,%xmm0 + paddusb %xmm4,%xmm3 + + # invert branch metrics + pxor %xmm7,%xmm4 + + paddusb %xmm4,%xmm1 + paddusb %xmm4,%xmm2 + + # Find survivors, leave in mm0,2 + pminub %xmm1,%xmm0 + pminub %xmm3,%xmm2 + # get decisions, leave in mm1,3 + pcmpeqb %xmm0,%xmm1 + pcmpeqb %xmm2,%xmm3 + + # interleave and store new branch 
metrics in mm0,2 + movdqa %xmm0,%xmm4 + punpckhbw %xmm2,%xmm0 # interleave second 16 new metrics + punpcklbw %xmm2,%xmm4 # interleave first 16 new metrics + movdqa %xmm0,(32*\GROUP+16)(%edi) + movdqa %xmm4,(32*\GROUP)(%edi) + + # interleave decisions & store + movdqa %xmm1,%xmm4 + punpckhbw %xmm3,%xmm1 + punpcklbw %xmm3,%xmm4 + # work around bug in gas due to Intel doc error + .byte 0x66,0x0f,0xd7,0xd9 # pmovmskb %xmm1,%ebx + shll $16,%ebx + .byte 0x66,0x0f,0xd7,0xc4 # pmovmskb %xmm4,%eax + orl %eax,%ebx + movl %ebx,(4*\GROUP)(%edx) + .endm + + # invoke macro 8 times for a total of 128 butterflies + butterfly GROUP=0 + butterfly GROUP=1 + butterfly GROUP=2 + butterfly GROUP=3 + butterfly GROUP=4 + butterfly GROUP=5 + butterfly GROUP=6 + butterfly GROUP=7 + + addl $32,%edx # bump decision pointer + + # see if we have to normalize + movl (%edi),%eax # extract first output metric + andl $255,%eax + cmp $50,%eax # is it greater than 50? + movl $0,%eax + jle done # No, no need to normalize + + # Normalize by finding smallest metric and subtracting it + # from all metrics + movdqa (%edi),%xmm0 + pminub 16(%edi),%xmm0 + pminub 32(%edi),%xmm0 + pminub 48(%edi),%xmm0 + pminub 64(%edi),%xmm0 + pminub 80(%edi),%xmm0 + pminub 96(%edi),%xmm0 + pminub 112(%edi),%xmm0 + pminub 128(%edi),%xmm0 + pminub 144(%edi),%xmm0 + pminub 160(%edi),%xmm0 + pminub 176(%edi),%xmm0 + pminub 192(%edi),%xmm0 + pminub 208(%edi),%xmm0 + pminub 224(%edi),%xmm0 + pminub 240(%edi),%xmm0 + + # crunch down to single lowest metric + movdqa %xmm0,%xmm1 + psrldq $8,%xmm0 # the count to psrldq is bytes, not bits! 
+ pminub %xmm1,%xmm0 + movdqa %xmm0,%xmm1 + psrlq $32,%xmm0 + pminub %xmm1,%xmm0 + movdqa %xmm0,%xmm1 + psrlq $16,%xmm0 + pminub %xmm1,%xmm0 + movdqa %xmm0,%xmm1 + psrlq $8,%xmm0 + pminub %xmm1,%xmm0 + + punpcklbw %xmm0,%xmm0 # lowest 2 bytes + pshuflw $0,%xmm0,%xmm0 # lowest 8 bytes + punpcklqdq %xmm0,%xmm0 # all 16 bytes + + # xmm0 now contains lowest metric in all 16 bytes + # subtract it from every output metric + movdqa (%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,(%edi) + movdqa 16(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,16(%edi) + movdqa 32(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,32(%edi) + movdqa 48(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,48(%edi) + movdqa 64(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,64(%edi) + movdqa 80(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,80(%edi) + movdqa 96(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,96(%edi) + movdqa 112(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,112(%edi) + movdqa 128(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,128(%edi) + movdqa 144(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,144(%edi) + movdqa 160(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,160(%edi) + movdqa 176(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,176(%edi) + movdqa 192(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,192(%edi) + movdqa 208(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,208(%edi) + movdqa 224(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,224(%edi) + movdqa 240(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,240(%edi) + +done: + # swap metrics + movl %esi,%eax + movl %edi,%esi + movl %eax,%edi + jmp 1b + +2: movl 8(%ebp),%ebx # ebx = vp + # stash metric pointers + movl %esi,OLDMETRICS(%ebx) + movl %edi,NEWMETRICS(%ebx) + movl %edx,DP(%ebx) # stash incremented value of vp->dp + xorl %eax,%eax +err: popl %ebx + popl %edx + popl %edi + popl %esi + popl %ebp + ret + + .data + .align 16 +thirtyones: + .byte 31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31 + 
diff --git a/ssebfly27.s b/ssebfly27.s new file mode 100644 index 0000000..7f445da --- /dev/null +++ b/ssebfly27.s @@ -0,0 +1,205 @@ +/* Intel SIMD (SSE) implementation of Viterbi ACS butterflies + for 64-state (k=7) convolutional code + Copyright 2001 Phil Karn, KA9Q + This code may be used under the terms of the GNU Lesser General Public License (LGPL) + + int update_viterbi27_blk_sse(struct v27 *vp,unsigned char syms[],int nbits) ; +*/ + + # SSE (64-bit integer SIMD) version + # Requires Pentium III or better + + # These are offsets into struct v27, defined in viterbi27.h + .set DP,128 + .set OLDMETRICS,132 + .set NEWMETRICS,136 +.text +.global update_viterbi27_blk_sse,Branchtab27_sse + .type update_viterbi27_blk_sse,@function + .align 16 + +update_viterbi27_blk_sse: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %edi + pushl %edx + pushl %ebx + + movl 8(%ebp),%edx # edx = vp + testl %edx,%edx + jnz 0f + movl $-1,%eax # vp == NULL: return -1 (immediate; without $ this is a load from address -1) + jmp err +0: movl OLDMETRICS(%edx),%esi # esi -> old metrics + movl NEWMETRICS(%edx),%edi # edi -> new metrics + movl DP(%edx),%edx # edx -> decisions + +1: movl 16(%ebp),%eax # eax = nbits + decl %eax + jl 2f # passed zero, we're done + movl %eax,16(%ebp) + + xorl %eax,%eax + movl 12(%ebp),%ebx # %ebx = syms + movb (%ebx),%al + movd %eax,%mm6 # mm6[0] = first symbol + movb 1(%ebx),%al + movd %eax,%mm5 # mm5[0] = second symbol + addl $2,%ebx + movl %ebx,12(%ebp) + + punpcklbw %mm6,%mm6 # mm6[1] = mm6[0] + punpcklbw %mm5,%mm5 + movq thirtyones,%mm7 + + pshufw $0,%mm6,%mm6 # copy low word to upper 3 + pshufw $0,%mm5,%mm5 + # mm6 now contains first symbol in each byte, mm5 the second + + # each invocation of this macro does 8 butterflies in parallel + .MACRO butterfly GROUP + # compute branch metrics + movq Branchtab27_sse+(8*\GROUP),%mm4 + movq Branchtab27_sse+32+(8*\GROUP),%mm3 + pxor %mm6,%mm4 + pxor %mm5,%mm3 + pavgb %mm3,%mm4 # mm4 contains branch metrics + psrlw $3,%mm4 + pand %mm7,%mm4 + + movq (8*\GROUP)(%esi),%mm0 # Incoming path metric, 
high bit = 0 + movq ((8*\GROUP)+32)(%esi),%mm3 # Incoming path metric, high bit = 1 + movq %mm0,%mm2 + movq %mm3,%mm1 + paddusb %mm4,%mm0 + paddusb %mm4,%mm3 + + # invert branch metrics. This works only because they're 5 bits + pxor %mm7,%mm4 + + paddusb %mm4,%mm1 + paddusb %mm4,%mm2 + + # Find survivors, leave in mm0,2 + pminub %mm1,%mm0 + pminub %mm3,%mm2 + # get decisions, leave in mm1,3 + pcmpeqb %mm0,%mm1 + pcmpeqb %mm2,%mm3 + + # interleave and store new branch metrics in mm0,2 + movq %mm0,%mm4 + punpckhbw %mm2,%mm0 # interleave second 8 new metrics + punpcklbw %mm2,%mm4 # interleave first 8 new metrics + movq %mm0,(16*\GROUP+8)(%edi) + movq %mm4,(16*\GROUP)(%edi) + + # interleave decisions, accumulate into %ebx + movq %mm1,%mm4 + punpckhbw %mm3,%mm1 + punpcklbw %mm3,%mm4 + # Due to an error in the Intel instruction set ref (the register + # fields are swapped), gas assembles pmovmskb incorrectly + # See http://mail.gnu.org/pipermail/bug-gnu-utils/2000-August/002341.html + .byte 0x0f,0xd7,0xc1 # pmovmskb %mm1,%eax + shll $((16*\GROUP+8)&31),%eax + orl %eax,%ebx + .byte 0x0f,0xd7,0xc4 # pmovmskb %mm4,%eax + shll $((16*\GROUP)&31),%eax + orl %eax,%ebx + .endm + + # invoke macro 4 times for a total of 32 butterflies + xorl %ebx,%ebx # clear decisions + butterfly GROUP=0 + butterfly GROUP=1 + movl %ebx,(%edx) # stash first 32 decisions + xorl %ebx,%ebx + butterfly GROUP=2 + butterfly GROUP=3 + movl %ebx,4(%edx) # stash second 32 decisions + + addl $8,%edx # bump decision pointer + + # see if we have to normalize + movl (%edi),%eax # extract first output metric + andl $255,%eax + cmpl $150,%eax # is it greater than 150? 
+ movl $0,%eax + jle done # No, no need to normalize + + # Normalize by finding smallest metric and subtracting it + # from all metrics + movq (%edi),%mm0 + pminub 8(%edi),%mm0 + pminub 16(%edi),%mm0 + pminub 24(%edi),%mm0 + pminub 32(%edi),%mm0 + pminub 40(%edi),%mm0 + pminub 48(%edi),%mm0 + pminub 56(%edi),%mm0 + # mm0 contains 8 smallest metrics + # crunch down to single lowest metric + movq %mm0,%mm1 + psrlq $32,%mm0 + pminub %mm1,%mm0 + movq %mm0,%mm1 + psrlq $16,%mm0 + pminub %mm1,%mm0 + movq %mm0,%mm1 + psrlq $8,%mm0 + pminub %mm1,%mm0 + punpcklbw %mm0,%mm0 # expand to all 8 bytes + pshufw $0,%mm0,%mm0 + + # mm0 now contains lowest metric in all 8 bytes + # subtract it from every output metric + # Trashes %mm7 + .macro PSUBUSBM REG,MEM + movq \MEM,%mm7 + psubusb \REG,%mm7 + movq %mm7,\MEM + .endm + + PSUBUSBM %mm0,(%edi) + PSUBUSBM %mm0,8(%edi) + PSUBUSBM %mm0,16(%edi) + PSUBUSBM %mm0,24(%edi) + PSUBUSBM %mm0,32(%edi) + PSUBUSBM %mm0,40(%edi) + PSUBUSBM %mm0,48(%edi) + PSUBUSBM %mm0,56(%edi) + + movd %mm0,%eax + and $0xff,%eax + +done: # swap metrics + movl %esi,%eax + movl %edi,%esi + movl %eax,%edi + jmp 1b + +2: emms + movl 8(%ebp),%ebx # ebx = vp + # stash metric pointers + movl %esi,OLDMETRICS(%ebx) + movl %edi,NEWMETRICS(%ebx) + movl %edx,DP(%ebx) # stash incremented value of vp->dp + xorl %eax,%eax +err: popl %ebx + popl %edx + popl %edi + popl %esi + popl %ebp + + ret + + .data + + .align 16 +thirtyones: + .byte 31,31,31,31,31,31,31,31 + + + diff --git a/ssebfly29.s b/ssebfly29.s new file mode 100644 index 0000000..d7d2149 --- /dev/null +++ b/ssebfly29.s @@ -0,0 +1,271 @@ +/* Intel SIMD SSE implementation of Viterbi ACS butterflies + for 256-state (k=9) convolutional code + Copyright 2004 Phil Karn, KA9Q + This code may be used under the terms of the GNU Lesser General Public License (LGPL) + + void update_viterbi29_blk_sse(struct v29 *vp,unsigned char syms[],int nbits); +*/ + # SSE (64-bit integer SIMD) version + # Requires Pentium III or better + # 
These are offsets into struct v29, defined in viterbi29.h + .set DP,512 + .set OLDMETRICS,516 + .set NEWMETRICS,520 + .text + .global update_viterbi29_blk_sse,Branchtab29_sse + .type update_viterbi29_blk_sse,@function + .align 16 + +update_viterbi29_blk_sse: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %edi + pushl %edx + pushl %ebx + + movl 8(%ebp),%edx # edx = vp + testl %edx,%edx + jnz 0f + movl -1,%eax + jmp err +0: movl OLDMETRICS(%edx),%esi # esi -> old metrics + movl NEWMETRICS(%edx),%edi # edi -> new metrics + movl DP(%edx),%edx # edx -> decisions + +1: movl 16(%ebp),%eax # eax = nbits + decl %eax + jl 2f # passed zero, we're done + movl %eax,16(%ebp) + + xorl %eax,%eax + movl 12(%ebp),%ebx # ebx = syms + movb (%ebx),%al + movd %eax,%mm6 # mm6[0] = first symbol + movb 1(%ebx),%al + movd %eax,%mm5 # mm5[0] = second symbol + addl $2,%ebx + movl %ebx,12(%ebp) + + punpcklbw %mm6,%mm6 # mm6[1] = mm6[0] + punpcklbw %mm5,%mm5 + + movq thirtyones,%mm7 + pshufw $0,%mm6,%mm6 # copy low word to upper 3 + pshufw $0,%mm5,%mm5 + # mm6 now contains first symbol in each byte, mm5 the second + + # each invocation of this macro does 8 butterflies in parallel + .MACRO butterfly GROUP + # compute branch metrics + movq Branchtab29_sse+(8*\GROUP),%mm4 + movq Branchtab29_sse+128+(8*\GROUP),%mm3 + pxor %mm6,%mm4 + pxor %mm5,%mm3 + pavgb %mm3,%mm4 # mm4 contains branch metrics + psrlw $3,%mm4 + pand %mm7,%mm4 + + movq (8*\GROUP)(%esi),%mm0 # Incoming path metric, high bit = 0 + movq ((8*\GROUP)+128)(%esi),%mm3 # Incoming path metric, high bit = 1 + movq %mm0,%mm2 + movq %mm3,%mm1 + paddusb %mm4,%mm0 + paddusb %mm4,%mm3 + + # invert branch metrics. 
This works only because they're 5 bits + pxor %mm7,%mm4 + + paddusb %mm4,%mm1 + paddusb %mm4,%mm2 + + # Find survivors, leave in mm0,2 + pminub %mm1,%mm0 + pminub %mm3,%mm2 + # get decisions, leave in mm1,3 + pcmpeqb %mm0,%mm1 + pcmpeqb %mm2,%mm3 + + # interleave and store new branch metrics in mm0,2 + movq %mm0,%mm4 + punpckhbw %mm2,%mm0 # interleave second 8 new metrics + punpcklbw %mm2,%mm4 # interleave first 8 new metrics + movq %mm0,(16*\GROUP+8)(%edi) + movq %mm4,(16*\GROUP)(%edi) + + # interleave decisions, accumulate into %ebx + movq %mm1,%mm4 + punpckhbw %mm3,%mm1 + punpcklbw %mm3,%mm4 + # Due to an error in the Intel instruction set ref (the register + # fields are swapped), gas assembles pmovmskb incorrectly + # See http://mail.gnu.org/pipermail/bug-gnu-utils/2000-August/002341.html + .byte 0x0f,0xd7,0xc1 # pmovmskb %mm1,%eax + shll $((16*\GROUP+8)&31),%eax + orl %eax,%ebx + .byte 0x0f,0xd7,0xc4 # pmovmskb %mm4,%eax + shll $((16*\GROUP)&31),%eax + orl %eax,%ebx + .endm + + # invoke macro 16 times for a total of 128 butterflies + xorl %ebx,%ebx # clear decisions + butterfly GROUP=0 + butterfly GROUP=1 + movl %ebx,(%edx) # stash first 32 decisions + xorl %ebx,%ebx + butterfly GROUP=2 + butterfly GROUP=3 + movl %ebx,4(%edx) # stash second 32 decisions + xorl %ebx,%ebx # clear decisions + butterfly GROUP=4 + butterfly GROUP=5 + movl %ebx,8(%edx) # stash first 32 decisions + xorl %ebx,%ebx + butterfly GROUP=6 + butterfly GROUP=7 + movl %ebx,12(%edx) # stash second 32 decisions + xorl %ebx,%ebx # clear decisions + butterfly GROUP=8 + butterfly GROUP=9 + movl %ebx,16(%edx) # stash first 32 decisions + xorl %ebx,%ebx + butterfly GROUP=10 + butterfly GROUP=11 + movl %ebx,20(%edx) # stash second 32 decisions + xorl %ebx,%ebx # clear decisions + butterfly GROUP=12 + butterfly GROUP=13 + movl %ebx,24(%edx) # stash first 32 decisions + xorl %ebx,%ebx + butterfly GROUP=14 + butterfly GROUP=15 + movl %ebx,28(%edx) # stash second 32 decisions + + addl $32,%edx # bump 
decision pointer + + # see if we have to normalize + movl (%edi),%eax # extract first output metric + andl $255,%eax + cmp $50,%eax # is it greater than 50? + movl $0,%eax + jle done # No, no need to normalize + + # Normalize by finding smallest metric and subtracting it + # from all metrics + movq (%edi),%mm0 + pminub 8(%edi),%mm0 + pminub 16(%edi),%mm0 + pminub 24(%edi),%mm0 + pminub 32(%edi),%mm0 + pminub 40(%edi),%mm0 + pminub 48(%edi),%mm0 + pminub 56(%edi),%mm0 + pminub 64(%edi),%mm0 + pminub 72(%edi),%mm0 + pminub 80(%edi),%mm0 + pminub 88(%edi),%mm0 + pminub 96(%edi),%mm0 + pminub 104(%edi),%mm0 + pminub 112(%edi),%mm0 + pminub 120(%edi),%mm0 + pminub 128(%edi),%mm0 + pminub 136(%edi),%mm0 + pminub 144(%edi),%mm0 + pminub 152(%edi),%mm0 + pminub 160(%edi),%mm0 + pminub 168(%edi),%mm0 + pminub 176(%edi),%mm0 + pminub 184(%edi),%mm0 + pminub 192(%edi),%mm0 + pminub 200(%edi),%mm0 + pminub 208(%edi),%mm0 + pminub 216(%edi),%mm0 + pminub 224(%edi),%mm0 + pminub 232(%edi),%mm0 + pminub 240(%edi),%mm0 + pminub 248(%edi),%mm0 + # mm0 contains 8 smallest metrics + # crunch down to single lowest metric + movq %mm0,%mm1 + psrlq $32,%mm0 + pminub %mm1,%mm0 + movq %mm0,%mm1 + psrlq $16,%mm0 + pminub %mm1,%mm0 + movq %mm0,%mm1 + psrlq $8,%mm0 + pminub %mm1,%mm0 + movq 8(%edi),%mm1 # reload + punpcklbw %mm0,%mm0 # expand to all 8 bytes + pshufw $0,%mm0,%mm0 + + # mm0 now contains lowest metric in all 8 bytes + # subtract it from every output metric + # Trashes %mm7 + .macro PSUBUSBM REG,MEM + movq \MEM,%mm7 + psubusb \REG,%mm7 + movq %mm7,\MEM + .endm + + PSUBUSBM %mm0,(%edi) + PSUBUSBM %mm0,8(%edi) + PSUBUSBM %mm0,16(%edi) + PSUBUSBM %mm0,24(%edi) + PSUBUSBM %mm0,32(%edi) + PSUBUSBM %mm0,40(%edi) + PSUBUSBM %mm0,48(%edi) + PSUBUSBM %mm0,56(%edi) + PSUBUSBM %mm0,64(%edi) + PSUBUSBM %mm0,72(%edi) + PSUBUSBM %mm0,80(%edi) + PSUBUSBM %mm0,88(%edi) + PSUBUSBM %mm0,96(%edi) + PSUBUSBM %mm0,104(%edi) + PSUBUSBM %mm0,112(%edi) + PSUBUSBM %mm0,120(%edi) + PSUBUSBM %mm0,128(%edi) 
+ PSUBUSBM %mm0,136(%edi) + PSUBUSBM %mm0,144(%edi) + PSUBUSBM %mm0,152(%edi) + PSUBUSBM %mm0,160(%edi) + PSUBUSBM %mm0,168(%edi) + PSUBUSBM %mm0,176(%edi) + PSUBUSBM %mm0,184(%edi) + PSUBUSBM %mm0,192(%edi) + PSUBUSBM %mm0,200(%edi) + PSUBUSBM %mm0,208(%edi) + PSUBUSBM %mm0,216(%edi) + PSUBUSBM %mm0,224(%edi) + PSUBUSBM %mm0,232(%edi) + PSUBUSBM %mm0,240(%edi) + PSUBUSBM %mm0,248(%edi) + +done: + # swap metrics + movl %esi,%eax + movl %edi,%esi + movl %eax,%edi + jmp 1b + +2: emms + movl 8(%ebp),%ebx # ebx = vp + # stash metric pointers + movl %esi,OLDMETRICS(%ebx) + movl %edi,NEWMETRICS(%ebx) + movl %edx,DP(%ebx) # stash incremented value of vp->dp + xorl %eax,%eax +err: popl %ebx + popl %edx + popl %edi + popl %esi + popl %ebp + ret + + .data + .align 8 +thirtyones: + .byte 31,31,31,31,31,31,31,31 + + @@ -0,0 +1,40 @@ +/* Compute the sum of the squares of a vector of signed shorts + + * Copyright 2004 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +#include <stdlib.h> +#include "fec.h" + +unsigned long long sumsq_port(signed short *,int); + +#ifdef __i386__ +unsigned long long sumsq_mmx(signed short *,int); +unsigned long long sumsq_sse(signed short *,int); +unsigned long long sumsq_sse2(signed short *,int); +#endif + +#ifdef __VEC__ +unsigned long long sumsq_av(signed short *,int); +#endif + +unsigned long long sumsq(signed short *in,int cnt){ + switch(Cpu_mode){ + case PORT: + default: + return sumsq_port(in,cnt); +#ifdef __i386__ + case SSE: + case MMX: + return sumsq_mmx(in,cnt); + case SSE2: + return sumsq_sse2(in,cnt); +#endif + +#ifdef __VEC__ + case ALTIVEC: + return sumsq_av(in,cnt); +#endif + } +} diff --git a/sumsq_av.c b/sumsq_av.c new file mode 100644 index 0000000..53c6acf --- /dev/null +++ b/sumsq_av.c @@ -0,0 +1,78 @@ +/* Compute the sum of the squares of a vector of signed shorts + + * This is the Altivec SIMD version. 
It's a little hairy because Altivec + * does not do 64-bit operations directly, so we have to accumulate separate + * 32-bit sums and carries + + * Copyright 2004 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +#include "fec.h" + +unsigned long long sumsq_av(signed short *in,int cnt){ + long long sum; + vector signed short x; + vector unsigned int sums,carries,s1,s2; + int pad; + union { vector unsigned char cv; vector unsigned int iv; unsigned int w[4]; unsigned char c[16];} s; + + carries = sums = (vector unsigned int)(0); + if((pad = (int)in & 15)!=0){ + /* Load unaligned leading word */ + x = vec_perm(vec_ld(0,in),(vector signed short)(0),vec_lvsl(0,in)); + if(cnt < 8){ /* Shift right to chop stuff beyond end of short block */ + s.c[15] = (8-cnt)<<4; + x = vec_sro(x,s.cv); + } + sums = (vector unsigned int)vec_msum(x,x,(vector signed int)(0)); + in += 8-pad/2; + cnt -= 8-pad/2; + } + /* Everything is now aligned, rip through most of the block */ + while(cnt >= 8){ + x = vec_ld(0,in); + /* A single vec_msum cannot overflow, but we have to sum it with + * the earlier terms separately to handle the carries + * The cast to unsigned is OK because squares are always positive + */ + s1 = (vector unsigned int)vec_msum(x,x,(vector signed int)(0)); + carries = vec_add(carries,vec_addc(sums,s1)); + sums = vec_add(sums,s1); + in += 8; + cnt -= 8; + } + /* Handle trailing fragment, if any */ + if(cnt > 0){ + x = vec_ld(0,in); + s.c[15] = (8-cnt)<<4; + x = vec_sro(x,s.cv); + s1 = (vector unsigned int)vec_msum(x,x,(vector signed int)(0)); + carries = vec_add(carries,vec_addc(sums,s1)); + sums = vec_add(sums,s1); + } + /* Combine 4 sub-sums and carries */ + s.c[15] = 64; /* Shift right two 32-bit words */ + s1 = vec_sro(sums,s.cv); + s2 = vec_sro(carries,s.cv); + carries = vec_add(carries,vec_addc(sums,s1)); + sums = vec_add(sums,s1); + carries = vec_add(carries,s2); + + s.c[15] = 32; /* Shift right one 32-bit word */ + 
s1 = vec_sro(sums,s.cv); + s2 = vec_sro(carries,s.cv); + carries = vec_add(carries,vec_addc(sums,s1)); + sums = vec_add(sums,s1); + carries = vec_add(carries,s2); + + /* Extract sum and carries from right-hand words and combine into result */ + s.iv = sums; + sum = s.w[3]; + + s.iv = carries; + sum += (long long)s.w[3] << 32; + + return sum; +} + diff --git a/sumsq_mmx.c b/sumsq_mmx.c new file mode 100644 index 0000000..e766831 --- /dev/null +++ b/sumsq_mmx.c @@ -0,0 +1,35 @@ +/* Compute the sum of the squares of a vector of signed shorts + + * MMX-assisted version (also used on SSE) + + * The SSE2 and MMX assist routines both operate on multiples of + * 8 words; they differ only in their alignment requirements (8 bytes + * for MMX, 16 bytes for SSE2) + + * Copyright 2004 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser Public License (LGPL) + */ + +long long sumsq_mmx_assist(signed short *,int); + +long long sumsq_mmx(signed short *in,int cnt){ + long long sum = 0; + + /* Handle stuff before the next 8-byte boundary */ + while(((int)in & 7) != 0 && cnt != 0){ + sum += (long)in[0] * in[0]; + in++; + cnt--; + } + sum += sumsq_mmx_assist(in,cnt); + in += cnt & ~7; + cnt &= 7; + + /* Handle up to 7 words at end */ + while(cnt != 0){ + sum += (long)in[0] * in[0]; + in++; + cnt--; + } + return sum; +} diff --git a/sumsq_mmx_assist.s b/sumsq_mmx_assist.s new file mode 100644 index 0000000..b3bac66 --- /dev/null +++ b/sumsq_mmx_assist.s @@ -0,0 +1,83 @@ +# MMX assist routines for sumsq +# Copyright 2001 Phil Karn, KA9Q +# May be used under the terms of the GNU Public License (GPL) + + .text + +# Evaluate sum of squares of signed 16-bit input samples +# long long sumsq_mmx_assist(signed short *in,int cnt); + .global sumsq_mmx_assist + .type sumsq_mmx_assist,@function + .align 16 +sumsq_mmx_assist: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %ecx + pushl %ebx + + movl 8(%ebp),%esi + movl 12(%ebp),%ecx + xor %eax,%eax + xor %edx,%edx + + # Since 4 * 
32767**2 < 2**32, we can accumulate two at a time +1: subl $8,%ecx + jl 2f + movq (%esi),%mm0 # S0 S1 S2 S3 + pmaddwd %mm0,%mm0 # (S0^2+S1^2) (S2^2+S3^2) + movq 8(%esi),%mm6 # S4 S5 S6 S7 + pmaddwd %mm6,%mm6 # (S4^2+S5^2) (S6^2+S7^2) + paddd %mm6,%mm0 # (S0^2+S1^2+S4^2+S5^2)(S2^2+S3^2+S6^2+S7^2) + movd %mm0,%ebx + addl %ebx,%eax + adcl $0,%edx + psrlq $32,%mm0 + movd %mm0,%ebx + addl %ebx,%eax + adcl $0,%edx + addl $16,%esi + jmp 1b + +2: emms + popl %ebx + popl %ecx + popl %esi + popl %ebp + ret + +# Evaluate sum of squares of signed 16-bit input samples +# long sumsq_wd_mmx_assist(signed short *in,int cnt); +# Quick version, only safe for small numbers of small input values... + .global sumsq_wd_mmx_assist + .type sumsq_wd_mmx_assist,@function + .align 16 +sumsq_wd_mmx_assist: + pushl %ebp + movl %esp,%ebp + pushl %esi + + movl 8(%ebp),%esi + movl 12(%ebp),%ecx + pxor %mm2,%mm2 # zero sum + +1: subl $8,%ecx + jl 2f + movq (%esi),%mm0 # S0 S1 S2 S3 + pmaddwd %mm0,%mm0 # (S0*S0+S1*S1) (S2*S2+S3*S3) + movq 8(%esi),%mm1 + pmaddwd %mm1,%mm1 + paddd %mm1,%mm2 + paddd %mm0,%mm2 # accumulate + + addl $16,%esi + jmp 1b + +2: movd %mm2,%eax # even sum + psrlq $32,%mm2 + movd %mm2,%edx # odd sum + addl %edx,%eax + emms + popl %esi + popl %ebp + ret diff --git a/sumsq_port.c b/sumsq_port.c new file mode 100644 index 0000000..6d0b4c1 --- /dev/null +++ b/sumsq_port.c @@ -0,0 +1,16 @@ +/* Compute the sum of the squares of a vector of signed shorts + + * Portable C version + * Copyright 2004 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +unsigned long long sumsq_port(signed short *in,int cnt){ + long long sum = 0; + int i; + + for(i=0;i<cnt;i++){ + sum += (int)in[i] * (int)in[i]; + } + return sum; +} diff --git a/sumsq_sse2.c b/sumsq_sse2.c new file mode 100644 index 0000000..b05d2e9 --- /dev/null +++ b/sumsq_sse2.c @@ -0,0 +1,33 @@ +/* Compute the sum of the squares of a vector of signed shorts + + * The SSE2 and MMX 
assist routines both operate on multiples of + * 8 words; they differ only in their alignment requirements (8 bytes + * for MMX, 16 bytes for SSE2) + + * Copyright 2004 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser Public License (LGPL) + */ + +long long sumsq_sse2_assist(signed short *,int); + +long long sumsq_sse2(signed short *in,int cnt){ + long long sum = 0; + + /* Handle stuff before the next 16-byte boundary */ + while(((int)in & 15) != 0 && cnt != 0){ + sum += (long)in[0] * in[0]; + in++; + cnt--; + } + sum += sumsq_sse2_assist(in,cnt); + in += cnt & ~7; + cnt &= 7; + + /* Handle up to 7 trailing words */ + while(cnt != 0){ + sum += (long)in[0] * in[0]; + in++; + cnt--; + } + return sum; +} diff --git a/sumsq_sse2_assist.s b/sumsq_sse2_assist.s new file mode 100644 index 0000000..d1c4ee7 --- /dev/null +++ b/sumsq_sse2_assist.s @@ -0,0 +1,49 @@ +# SSE2 assist routines for sumsq +# Copyright 2001 Phil Karn, KA9Q +# May be used under the terms of the GNU Public License (GPL) + + .text +# Evaluate sum of squares of signed 16-bit input samples +# long long sumsq_sse2_assist(signed short *in,int cnt); + .global sumsq_sse2_assist + .type sumsq_sse2_assist,@function + .align 16 +sumsq_sse2_assist: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %ecx + + movl 8(%ebp),%esi + movl 12(%ebp),%ecx + pxor %xmm2,%xmm2 # zero sum + movaps low,%xmm3 # load mask + +1: subl $8,%ecx + jl 2f + movaps (%esi),%xmm0 # S0 S1 S2 S3 S4 S5 S6 S7 + pmaddwd %xmm0,%xmm0 # (S0*S0+S1*S1) (S2*S2+S3*S3) (S4*S4+S5*S5) (S6*S6+S7*S7) + movaps %xmm0,%xmm1 + pand %xmm3,%xmm1 # (S0*S0+S1*S1) 0 (S4*S4+S5*S5) 0 + paddq %xmm1,%xmm2 # sum even-numbered dwords + psrlq $32,%xmm0 # (S2*S2+S3*S3) 0 (S6*S6+S7*S7) 0 + paddq %xmm0,%xmm2 # sum odd-numbered dwords + addl $16,%esi + jmp 1b + +2: movaps %xmm2,%xmm0 + psrldq $8,%xmm0 + paddq %xmm2,%xmm0 # combine 64-bit sums + + movd %xmm0,%eax # low 32 bits of sum + psrldq $4,%xmm0 + movd %xmm0,%edx # high 32 bits of sum + + popl %ecx + popl 
%esi + popl %ebp + ret + + .data + .align 16 +low: .byte 255,255,255,255,0,0,0,0,255,255,255,255,0,0,0,0 diff --git a/sumsq_test.c b/sumsq_test.c new file mode 100644 index 0000000..4debd47 --- /dev/null +++ b/sumsq_test.c @@ -0,0 +1,101 @@ +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include <time.h> +#include "config.h" +#ifdef HAVE_GETOPT_H +#include <getopt.h> +#endif +#include "fec.h" + +#if HAVE_GETOPT_LONG +struct option Options[] = { + {"frame-length",1,NULL,'l'}, + {"frame-count",1,NULL,'n'}, + {"verbose",0,NULL,'v'}, + {"force-altivec",0,NULL,'a'}, + {"force-port",0,NULL,'p'}, + {"force-mmx",0,NULL,'m'}, + {"force-sse",0,NULL,'s'}, + {"force-sse2",0,NULL,'t'}, + {NULL}, +}; +#endif + +int Verbose = 0; + +int main(int argc,char *argv[]){ + signed short *buf; + int i,d,trial,trials=10000; + int bufsize = 2048; + long long port_sum,simd_sum; + time_t t; + int timetrials=0; + + find_cpu_mode(); + time(&t); + srandom(t); + +#if HAVE_GETOPT_LONG + while((d = getopt_long(argc,argv,"vapmstl:n:T",Options,NULL)) != EOF){ +#else + while((d = getopt(argc,argv,"vapmstl:n:T")) != EOF){ +#endif + switch(d){ + case 'a': + Cpu_mode = ALTIVEC; + break; + case 'p': + Cpu_mode = PORT; + break; + case 'm': + Cpu_mode = MMX; + break; + case 's': + Cpu_mode = SSE; + break; + case 't': + Cpu_mode = SSE2; + break; + case 'l': + bufsize = atoi(optarg); + break; + case 'n': + trials = atoi(optarg); + break; + case 'v': + Verbose++; + break; + case 'T': + timetrials++; + break; + } + } + + buf = (signed short *)calloc(bufsize,sizeof(signed short)); + if(timetrials){ + for(trial=0;trial<trials;trial++){ + (void)sumsq(buf,bufsize); + } + } else { + for(trial=0;trial<trials;trial++){ + int length,offset; + + offset = random() & 7; + length = (random() % bufsize) - offset; + if(length <= 0) + continue; + for(i=0;i<bufsize;i++) + buf[i] = random(); + + port_sum = sumsq_port(buf+offset,length); + simd_sum = sumsq(buf+offset,length); + if(port_sum != simd_sum){ + 
printf("offset %d len %d port_sum = %lld simd_sum = %lld ",offset,length,port_sum,simd_sum); + + printf("ERROR! diff = %lld\n",simd_sum-port_sum); + } + } + } + exit(0); +} diff --git a/viterbi27.c b/viterbi27.c new file mode 100644 index 0000000..554da92 --- /dev/null +++ b/viterbi27.c @@ -0,0 +1,161 @@ +/* K=7 r=1/2 Viterbi decoder with optional Intel or PowerPC SIMD + * Copyright Feb 2004, Phil Karn, KA9Q + */ +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include "fec.h" + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi27(int len){ + find_cpu_mode(); + + switch(Cpu_mode){ + case PORT: + default: + return create_viterbi27_port(len); +#ifdef __VEC__ + case ALTIVEC: + return create_viterbi27_av(len); +#endif +#ifdef __i386__ + case MMX: + return create_viterbi27_mmx(len); + case SSE: + return create_viterbi27_sse(len); + case SSE2: + return create_viterbi27_sse2(len); +#endif + } +} + +void set_viterbi27_polynomial(int polys[2]){ + switch(Cpu_mode){ + case PORT: + default: + set_viterbi27_polynomial_port(polys); + break; +#ifdef __VEC__ + case ALTIVEC: + set_viterbi27_polynomial_av(polys); + break; +#endif +#ifdef __i386__ + case MMX: + set_viterbi27_polynomial_mmx(polys); + break; + case SSE: + set_viterbi27_polynomial_sse(polys); + break; + case SSE2: + set_viterbi27_polynomial_sse2(polys); + break; +#endif + } +} + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi27(void *p,int starting_state){ + switch(Cpu_mode){ + case PORT: + default: + return init_viterbi27_port(p,starting_state); +#ifdef __VEC__ + case ALTIVEC: + return init_viterbi27_av(p,starting_state); +#endif +#ifdef __i386__ + case MMX: + return init_viterbi27_mmx(p,starting_state); + case SSE: + return init_viterbi27_sse(p,starting_state); + case SSE2: + return init_viterbi27_sse2(p,starting_state); +#endif + } +} + +/* Viterbi chainback */ +int chainback_viterbi27( + void *p, + unsigned char *data, /* Decoded output data */ + 
unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + + switch(Cpu_mode){ + case PORT: + default: + return chainback_viterbi27_port(p,data,nbits,endstate); +#ifdef __VEC__ + case ALTIVEC: + return chainback_viterbi27_av(p,data,nbits,endstate); +#endif +#ifdef __i386__ + case MMX: + return chainback_viterbi27_mmx(p,data,nbits,endstate); + case SSE: + return chainback_viterbi27_sse(p,data,nbits,endstate); + case SSE2: + return chainback_viterbi27_sse2(p,data,nbits,endstate); +#endif + } +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi27(void *p){ + switch(Cpu_mode){ + case PORT: + default: + delete_viterbi27_port(p); + break; +#ifdef __VEC__ + case ALTIVEC: + delete_viterbi27_av(p); + break; +#endif +#ifdef __i386__ + case MMX: + delete_viterbi27_mmx(p); + break; + case SSE: + delete_viterbi27_sse(p); + break; + case SSE2: + delete_viterbi27_sse2(p); + break; +#endif + } +} + +/* Update decoder with a block of demodulated symbols + * Note that nbits is the number of decoded data bits, not the number + * of symbols! 
+ */ +int update_viterbi27_blk(void *p,unsigned char syms[],int nbits){ + if(p == NULL) + return -1; + + switch(Cpu_mode){ + case PORT: + default: + update_viterbi27_blk_port(p,syms,nbits); + break; +#ifdef __VEC__ + case ALTIVEC: + update_viterbi27_blk_av(p,syms,nbits); + break; +#endif +#ifdef __i386__ + case MMX: + update_viterbi27_blk_mmx(p,syms,nbits); + break; + case SSE: + update_viterbi27_blk_sse(p,syms,nbits); + break; + case SSE2: + update_viterbi27_blk_sse2(p,syms,nbits); + break; +#endif + } + return 0; +} diff --git a/viterbi27_av.c b/viterbi27_av.c new file mode 100644 index 0000000..98d7344 --- /dev/null +++ b/viterbi27_av.c @@ -0,0 +1,210 @@ +/* K=7 r=1/2 Viterbi decoder for PowerPC G4/G5 Altivec instructions + * Feb 2004, Phil Karn, KA9Q + */ +#include <stdio.h> +#include <memory.h> +#include <stdlib.h> +#include "fec.h" + +typedef union { long long p; unsigned char c[64]; vector bool char v[4]; } decision_t; +typedef union { long long p; unsigned char c[64]; vector unsigned char v[4]; } metric_t; + +static union branchtab27 { unsigned char c[32]; vector unsigned char v[2];} Branchtab27[2]; +static int Init = 0; + +/* State info for instance of Viterbi decoder + * Don't change this without also changing references in [mmx|sse|sse2]bfly29.s! 
+ */ +struct v27 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi27_av(void *p,int starting_state){ + struct v27 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<4;i++) + vp->metrics1.v[i] = (vector unsigned char)(63); + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->c[starting_state & 63] = 0; /* Bias known start state */ + return 0; +} + +void set_viterbi27_polynomial_av(int polys[2]){ + int state; + + for(state=0;state < 32;state++){ + Branchtab27[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; + Branchtab27[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 
255 : 0; + } + Init++; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi27_av(int len){ + struct v27 *vp; + + if(!Init){ + int polys[2] = { V27POLYA,V27POLYB }; + set_viterbi27_polynomial_av(polys); + } + if((vp = (struct v27 *)malloc(sizeof(struct v27))) == NULL) + return NULL; + if((vp->decisions = (decision_t *)malloc((len+6)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi27_av(vp,0); + return vp; +} + +/* Viterbi chainback: trace the stored decisions backwards from endstate and + * write nbits decoded bits into data[]. Returns 0 on success, -1 if p is NULL + */ +int chainback_viterbi27_av( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v27 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + d = (decision_t *)vp->decisions; /* only dereference vp after the NULL check */ + + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 64; + endstate <<= 2; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 6; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = d[nbits].c[endstate>>2] & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi27_av(void *p){ + struct v27 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + +/* Process received symbols */ +int update_viterbi27_blk_av(void *p,unsigned char *syms,int nbits){ + struct v27 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + d = (decision_t *)vp->dp; + while(nbits--){ + vector unsigned char survivor0,survivor1,sym0v,sym1v; + vector bool char decision0,decision1; + vector unsigned char metric,m_metric,m0,m1,m2,m3; + void *tmp; + + /* sym0v.0 = syms[0]; sym0v.1 = syms[1] */ + sym0v = vec_perm(vec_ld(0,syms),vec_ld(1,syms),vec_lvsl(0,syms)); + + sym1v = vec_splat(sym0v,1); /* Splat syms[1] across sym1v */ 
+ sym0v = vec_splat(sym0v,0); /* Splat syms[0] across sym0v */ + syms += 2; + + /* Do the 32 butterflies as two interleaved groups of 16 each to keep the pipes full */ + + /* Form first set of 16 branch metrics */ + metric = vec_avg(vec_xor(Branchtab27[0].v[0],sym0v),vec_xor(Branchtab27[1].v[0],sym1v)); + metric = vec_sr(metric,(vector unsigned char)(3)); + m_metric = vec_sub((vector unsigned char)(31),metric); + + /* Form first set of path metrics */ + m0 = vec_adds(vp->old_metrics->v[0],metric); + m3 = vec_adds(vp->old_metrics->v[2],metric); + m1 = vec_adds(vp->old_metrics->v[2],m_metric); + m2 = vec_adds(vp->old_metrics->v[0],m_metric); + + /* Form second set of 16 branch metrics */ + metric = vec_avg(vec_xor(Branchtab27[0].v[1],sym0v),vec_xor(Branchtab27[1].v[1],sym1v)); + metric = vec_sr(metric,(vector unsigned char)(3)); + m_metric = vec_sub((vector unsigned char)(31),metric); + + /* Compare and select first set */ + decision0 = vec_cmpgt(m0,m1); + decision1 = vec_cmpgt(m2,m3); + survivor0 = vec_min(m0,m1); + survivor1 = vec_min(m2,m3); + + /* Compute second set of path metrics */ + m0 = vec_adds(vp->old_metrics->v[1],metric); + m3 = vec_adds(vp->old_metrics->v[3],metric); + m1 = vec_adds(vp->old_metrics->v[3],m_metric); + m2 = vec_adds(vp->old_metrics->v[1],m_metric); + + /* Interleave and store first decisions and survivors */ + d->v[0] = vec_mergeh(decision0,decision1); + d->v[1] = vec_mergel(decision0,decision1); + vp->new_metrics->v[0] = vec_mergeh(survivor0,survivor1); + vp->new_metrics->v[1] = vec_mergel(survivor0,survivor1); + + /* Compare and select second set */ + decision0 = vec_cmpgt(m0,m1); + decision1 = vec_cmpgt(m2,m3); + survivor0 = vec_min(m0,m1); + survivor1 = vec_min(m2,m3); + + /* Interleave and store second set of decisions and survivors */ + d->v[2] = vec_mergeh(decision0,decision1); + d->v[3] = vec_mergel(decision0,decision1); + vp->new_metrics->v[2] = vec_mergeh(survivor0,survivor1); + vp->new_metrics->v[3] = 
vec_mergel(survivor0,survivor1); + + /* renormalize if necessary */ + if(vp->new_metrics->c[0] >= 105){ + vector unsigned char scale0,scale1; + + /* Find smallest metric and splat */ + scale0 = vec_min(vp->new_metrics->v[0],vp->new_metrics->v[1]); + scale1 = vec_min(vp->new_metrics->v[2],vp->new_metrics->v[3]); + scale0 = vec_min(scale0,scale1); + scale0 = vec_min(scale0,vec_sld(scale0,scale0,8)); + scale0 = vec_min(scale0,vec_sld(scale0,scale0,4)); + scale0 = vec_min(scale0,vec_sld(scale0,scale0,2)); + scale0 = vec_min(scale0,vec_sld(scale0,scale0,1)); + + /* Now subtract from all metrics */ + vp->new_metrics->v[0] = vec_subs(vp->new_metrics->v[0],scale0); + vp->new_metrics->v[1] = vec_subs(vp->new_metrics->v[1],scale0); + vp->new_metrics->v[2] = vec_subs(vp->new_metrics->v[2],scale0); + vp->new_metrics->v[3] = vec_subs(vp->new_metrics->v[3],scale0); + } + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + + return 0; +} + diff --git a/viterbi27_mmx.c b/viterbi27_mmx.c new file mode 100644 index 0000000..a6d5125 --- /dev/null +++ b/viterbi27_mmx.c @@ -0,0 +1,115 @@ +/* K=7 r=1/2 Viterbi decoder for MMX + * Copyright Feb 2004, Phil Karn, KA9Q + */ +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include <mmintrin.h> +#include "fec.h" + +typedef union { char c[64]; __m64 v[8];} decision_t; +typedef union { unsigned char c[64]; __m64 v[8];} metric_t; + +unsigned char Mettab27_1[256][32] __attribute__ ((aligned(16))); +unsigned char Mettab27_2[256][32] __attribute__ ((aligned(16))); +static int Init = 0; + +/* State info for instance of Viterbi decoder + * Don't change this without also changing references in mmxbfly27.s! 
+ */ +struct v27 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi27_mmx(void *p,int starting_state){ + struct v27 *vp = (struct v27 *)p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<64;i++) + vp->metrics1.c[i] = 63; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->c[starting_state & 63] = 0; /* Bias known start state */ + return 0; +} + +void set_viterbi27_polynomial_mmx(int polys[2]){ + int state; + + for(state=0;state < 32;state++){ + int symbol; + for(symbol = 0;symbol < 256;symbol++){ + int sym; + + sym = parity((2*state) & abs(polys[0])) ^ (polys[0] < 0); + Mettab27_1[symbol][state] = (sym ? (255-symbol):symbol) / 16; + + sym = parity((2*state) & abs(polys[1])) ^ (polys[1] < 0); + Mettab27_2[symbol][state] = (sym ? 
(255-symbol):symbol) / 16; + } + } + Init++; +} + + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi27_mmx(int len){ + struct v27 *vp; + int polys[2] = { V27POLYA, V27POLYB }; + + if(Init == 0){ + set_viterbi27_polynomial_mmx(polys); + } + if((vp = (struct v27 *)malloc(sizeof(struct v27))) == NULL) + return NULL; + + if((vp->decisions = (decision_t *)malloc((len+6)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi27_mmx(vp,0); + return vp; +} + +/* Viterbi chainback: trace the stored decisions backwards from endstate and + * write nbits decoded bits into data[]. Returns 0 on success, -1 if p is NULL + */ +int chainback_viterbi27_mmx( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + + struct v27 *vp = (struct v27 *)p; + decision_t *d; + + if(p == NULL) + return -1; + d = (decision_t *)vp->decisions; + /* Keep the 6-bit state in the top of an 8-bit register so a full byte + * of decoded data accumulates below it; the loop reads the state back + * as endstate>>2, so the initial value must be shifted up to match + */ + endstate &= 63; + endstate <<= 2; + d += 6; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = d[nbits].c[endstate>>2] & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi27_mmx(void *p){ + struct v27 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} diff --git a/viterbi27_port.c b/viterbi27_port.c new file mode 100644 index 0000000..7cac2b3 --- /dev/null +++ b/viterbi27_port.c @@ -0,0 +1,191 @@ +/* K=7 r=1/2 Viterbi decoder in portable C + * Copyright Feb 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include <limits.h> +#include "fec.h" + + +typedef union { unsigned int w[64]; } metric_t; +typedef union { unsigned long w[2];} decision_t; +static union branchtab27 { unsigned char c[32]; } Branchtab27[2] __attribute__ ((aligned(16))); +static int Init = 0; + +/* State info for instance of Viterbi decoder + * Don't change this without also changing references in [mmx|sse|sse2]bfly29.s! 
+ */ +struct v27 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi27_port(void *p,int starting_state){ + struct v27 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<64;i++) + vp->metrics1.w[i] = 63; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->w[starting_state & 63] = 0; /* Bias known start state */ + return 0; +} + +void set_viterbi27_polynomial_port(int polys[2]){ + int state; + + for(state=0;state < 32;state++){ + Branchtab27[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; + Branchtab27[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; + } + Init++; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi27_port(int len){ + struct v27 *vp; + + if(!Init){ + int polys[2] = { V27POLYA, V27POLYB }; + set_viterbi27_polynomial_port(polys); + } + if((vp = malloc(sizeof(struct v27))) == NULL) + return NULL; + if((vp->decisions = malloc((len+6)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi27_port(vp,0); + + return vp; +} + +/* Viterbi chainback */ +int chainback_viterbi27_port( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v27 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + d = vp->decisions; + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 64; + endstate <<= 2; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 6; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].w[(endstate>>2)/32] >> ((endstate>>2)%32)) & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi27_port(void *p){ + struct v27 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + +/* C-language butterfly */ +#define BFLY(i) {\ +unsigned int metric,m0,m1,decision;\ + metric = (Branchtab27[0].c[i] ^ sym0) + (Branchtab27[1].c[i] ^ sym1);\ + m0 = vp->old_metrics->w[i] + metric;\ + m1 = vp->old_metrics->w[i+32] + (510 - metric);\ + decision = (signed int)(m0-m1) > 0;\ + vp->new_metrics->w[2*i] = decision ? m1 : m0;\ + d->w[i/16] |= decision << ((2*i)&31);\ + m0 -= (metric+metric-510);\ + m1 += (metric+metric-510);\ + decision = (signed int)(m0-m1) > 0;\ + vp->new_metrics->w[2*i+1] = decision ? m1 : m0;\ + d->w[i/16] |= decision << ((2*i+1)&31);\ +} + +/* Update decoder with a block of demodulated symbols + * Note that nbits is the number of decoded data bits, not the number + * of symbols! 
+ */ +int update_viterbi27_blk_port(void *p,unsigned char *syms,int nbits){ + struct v27 *vp = p; + void *tmp; + decision_t *d; + + if(p == NULL) + return -1; + d = (decision_t *)vp->dp; + while(nbits--){ + unsigned char sym0,sym1; + + d->w[0] = d->w[1] = 0; + sym0 = *syms++; + sym1 = *syms++; + + BFLY(0); + BFLY(1); + BFLY(2); + BFLY(3); + BFLY(4); + BFLY(5); + BFLY(6); + BFLY(7); + BFLY(8); + BFLY(9); + BFLY(10); + BFLY(11); + BFLY(12); + BFLY(13); + BFLY(14); + BFLY(15); + BFLY(16); + BFLY(17); + BFLY(18); + BFLY(19); + BFLY(20); + BFLY(21); + BFLY(22); + BFLY(23); + BFLY(24); + BFLY(25); + BFLY(26); + BFLY(27); + BFLY(28); + BFLY(29); + BFLY(30); + BFLY(31); + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + return 0; +} diff --git a/viterbi27_sse.c b/viterbi27_sse.c new file mode 100644 index 0000000..cd1f287 --- /dev/null +++ b/viterbi27_sse.c @@ -0,0 +1,113 @@ +/* K=7 r=1/2 Viterbi decoder for SSE + * Feb 2004, Phil Karn, KA9Q + */ +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include <xmmintrin.h> +#include "fec.h" + +typedef union { unsigned char c[64]; } metric_t; +typedef union { unsigned long w[2]; unsigned char c[8]; __m64 v[1];} decision_t; +union branchtab27 { unsigned char c[32]; __m64 v[4];} Branchtab27_sse[2]; +static int Init = 0; + +/* State info for instance of Viterbi decoder + * Don't change this without also changing references in ssebfly27.s! 
+ */ +struct v27 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi27_sse(int len){ + struct v27 *vp; + + if(!Init){ + int polys[2] = { V27POLYA, V27POLYB }; + + set_viterbi27_polynomial_sse(polys); + } + if((vp = malloc(sizeof(struct v27))) == NULL) + return NULL; + if((vp->decisions = malloc((len+6)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi27(vp,0); + return vp; +} + +void set_viterbi27_polynomial_sse(int polys[2]){ + int state; + + for(state=0;state < 32;state++){ + Branchtab27_sse[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; + Branchtab27_sse[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; + } + Init++; +} + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi27_sse(void *p,int starting_state){ + struct v27 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<64;i++) + vp->metrics1.c[i] = 63; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->c[starting_state & 63] = 0; /* Bias known start state */ + return 0; +} + +/* Viterbi chainback */ +int chainback_viterbi27_sse( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v27 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + + d = vp->decisions; + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 64; + endstate <<= 2; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 6; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].c[(endstate>>2)/8] >> ((endstate>>2)%8)) & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi27_sse(void *p){ + struct v27 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} diff --git a/viterbi27_sse2.c b/viterbi27_sse2.c new file mode 100644 index 0000000..bc01710 --- /dev/null +++ b/viterbi27_sse2.c @@ -0,0 +1,180 @@ +/* K=7 r=1/2 Viterbi decoder for SSE2 + * Feb 2004, Phil Karn, KA9Q + */ +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include <xmmintrin.h> +#include "fec.h" + +typedef union { unsigned char c[64]; __m128i v[4]; } metric_t; +typedef union { unsigned long w[2]; unsigned char c[8]; unsigned short s[4]; __m64 v[1];} decision_t; +union branchtab27 { unsigned char c[32]; __m128i v[2];} Branchtab27_sse2[2]; +static int Init = 0; + +/* State info for instance of Viterbi decoder + * Don't change this without also changing references in sse2bfly27.s! 
+ */ +struct v27 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi27_sse2(void *p,int starting_state){ + struct v27 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<64;i++) + vp->metrics1.c[i] = 63; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->c[starting_state & 63] = 0; /* Bias known start state */ + return 0; +} + +void set_viterbi27_polynomial_sse2(int polys[2]){ + int state; + + for(state=0;state < 32;state++){ + Branchtab27_sse2[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; + Branchtab27_sse2[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 
255 : 0; + } + Init++; +} + + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi27_sse2(int len){ + void *p; + struct v27 *vp; + + if(!Init){ + int polys[2] = { V27POLYA, V27POLYB }; + set_viterbi27_polynomial_sse2(polys); + } + /* Ordinary malloc() only returns 8-byte alignment, we need 16 */ + if(posix_memalign(&p, sizeof(__m128i),sizeof(struct v27))) + return NULL; + vp = (struct v27 *)p; + + if((p = malloc((len+6)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + vp->decisions = (decision_t *)p; + init_viterbi27_sse2(vp,0); + + return vp; +} + +/* Viterbi chainback */ +int chainback_viterbi27_sse2( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v27 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + d = vp->decisions; + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 64; + endstate <<= 2; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 6; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].c[(endstate>>2)/8] >> ((endstate>>2)%8)) & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi27_sse2(void *p){ + struct v27 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + + +#if 0 +/* This code is turned off because it's slower than my hand-crafted assembler in sse2bfly27.s. But it does work. 
*/ +void update_viterbi27_blk_sse2(void *p,unsigned char *syms,int nbits){ + struct v27 *vp = p; + decision_t *d; + + if(p == NULL) + return; + d = (decision_t *)vp->dp; + while(nbits--){ + __m128i sym0v,sym1v; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + sym0v = _mm_set1_epi8(syms[0]); + sym1v = _mm_set1_epi8(syms[1]); + syms += 2; + + for(i=0;i<2;i++){ + __m128i decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; + + /* Form branch metrics */ + metric = _mm_avg_epu8(_mm_xor_si128(Branchtab27_sse2[0].v[i],sym0v),_mm_xor_si128(Branchtab27_sse2[1].v[i],sym1v)); + /* There's no packed bytes right shift in SSE2, so we use the word version and mask + * (I'm *really* starting to like Altivec...) + */ + metric = _mm_srli_epi16(metric,3); + metric = _mm_and_si128(metric,_mm_set1_epi8(31)); + m_metric = _mm_sub_epi8(_mm_set1_epi8(31),metric); + + /* Add branch metrics to path metrics */ + m0 = _mm_add_epi8(vp->old_metrics->v[i],metric); + m3 = _mm_add_epi8(vp->old_metrics->v[2+i],metric); + m1 = _mm_add_epi8(vp->old_metrics->v[2+i],m_metric); + m2 = _mm_add_epi8(vp->old_metrics->v[i],m_metric); + + /* Compare and select, using modulo arithmetic */ + decision0 = _mm_cmpgt_epi8(_mm_sub_epi8(m0,m1),_mm_setzero_si128()); + decision1 = _mm_cmpgt_epi8(_mm_sub_epi8(m2,m3),_mm_setzero_si128()); + survivor0 = _mm_or_si128(_mm_and_si128(decision0,m1),_mm_andnot_si128(decision0,m0)); + survivor1 = _mm_or_si128(_mm_and_si128(decision1,m3),_mm_andnot_si128(decision1,m2)); + + /* Pack each set of decisions into 16 bits */ + d->s[2*i] = _mm_movemask_epi8(_mm_unpacklo_epi8(decision0,decision1)); + d->s[2*i+1] = _mm_movemask_epi8(_mm_unpackhi_epi8(decision0,decision1)); + + /* Store surviving metrics */ + vp->new_metrics->v[2*i] = _mm_unpacklo_epi8(survivor0,survivor1); + vp->new_metrics->v[2*i+1] = _mm_unpackhi_epi8(survivor0,survivor1); + } + d++; + /* Swap pointers to old and new metrics */ + tmp = 
vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; +} +#endif diff --git a/viterbi29.c b/viterbi29.c new file mode 100644 index 0000000..80cbb33 --- /dev/null +++ b/viterbi29.c @@ -0,0 +1,152 @@ +/* Switch to K=9 r=1/2 Viterbi decoder with optional Intel or PowerPC SIMD + * Copyright Feb 2004, Phil Karn, KA9Q + */ +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include "fec.h" + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi29(int len){ + find_cpu_mode(); + + switch(Cpu_mode){ + case PORT: + default: + return create_viterbi29_port(len); +#ifdef __VEC__ + case ALTIVEC: + return create_viterbi29_av(len); +#endif +#ifdef __i386__ + case MMX: + return create_viterbi29_mmx(len); + case SSE: + return create_viterbi29_sse(len); + case SSE2: + return create_viterbi29_sse2(len); +#endif + } +} + +void set_viterbi29_polynomial(int polys[2]){ + switch(Cpu_mode){ + case PORT: + default: + set_viterbi29_polynomial_port(polys); + break; +#ifdef __VEC__ + case ALTIVEC: + set_viterbi29_polynomial_av(polys); + break; +#endif +#ifdef __i386__ + case MMX: + set_viterbi29_polynomial_mmx(polys); + break; + case SSE: + set_viterbi29_polynomial_sse(polys); + break; + case SSE2: + set_viterbi29_polynomial_sse2(polys); + break; +#endif + } +} + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi29(void *p,int starting_state){ + switch(Cpu_mode){ + case PORT: + default: + return init_viterbi29_port(p,starting_state); +#ifdef __VEC__ + case ALTIVEC: + return init_viterbi29_av(p,starting_state); +#endif +#ifdef __i386__ + case MMX: + return init_viterbi29_mmx(p,starting_state); + case SSE: + return init_viterbi29_sse(p,starting_state); + case SSE2: + return init_viterbi29_sse2(p,starting_state); +#endif + } +} + +/* Viterbi chainback */ +int chainback_viterbi29( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int 
endstate){ /* Terminal encoder state */ + + switch(Cpu_mode){ + case PORT: + default: + return chainback_viterbi29_port(p,data,nbits,endstate); +#ifdef __VEC__ + case ALTIVEC: + return chainback_viterbi29_av(p,data,nbits,endstate); +#endif +#ifdef __i386__ + case MMX: + return chainback_viterbi29_mmx(p,data,nbits,endstate); + case SSE: + return chainback_viterbi29_sse(p,data,nbits,endstate); + case SSE2: + return chainback_viterbi29_sse2(p,data,nbits,endstate); +#endif + } +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi29(void *p){ + switch(Cpu_mode){ + case PORT: + default: + delete_viterbi29_port(p); + break; +#ifdef __VEC__ + case ALTIVEC: + delete_viterbi29_av(p); + break; +#endif +#ifdef __i386__ + case MMX: + delete_viterbi29_mmx(p); + break; + case SSE: + delete_viterbi29_sse(p); + break; + case SSE2: + delete_viterbi29_sse2(p); + break; +#endif + } +} + +/* Update decoder with a block of demodulated symbols + * Note that nbits is the number of decoded data bits, not the number + * of symbols! 
+ */ +int update_viterbi29_blk(void *p,unsigned char syms[],int nbits){ + switch(Cpu_mode){ + case PORT: + default: + return update_viterbi29_blk_port(p,syms,nbits); +#ifdef __VEC__ + case ALTIVEC: + return update_viterbi29_blk_av(p,syms,nbits); +#endif +#ifdef __i386__ + case MMX: + return update_viterbi29_blk_mmx(p,syms,nbits); + case SSE: + return update_viterbi29_blk_sse(p,syms,nbits); + case SSE2: + return update_viterbi29_blk_sse2(p,syms,nbits); +#endif + } +} diff --git a/viterbi29_av.c b/viterbi29_av.c new file mode 100644 index 0000000..31c8d27 --- /dev/null +++ b/viterbi29_av.c @@ -0,0 +1,190 @@ +/* K=9 r=1/2 Viterbi decoder for PowerPC G4/G5 Altivec + * Copyright Feb 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include <sys/sysctl.h> +#include "fec.h" + +typedef union { unsigned char c[256]; vector bool char v[16]; } decision_t; +typedef union { unsigned char c[256]; vector unsigned char v[16]; } metric_t; + +static union branchtab29 { unsigned char c[128]; vector unsigned char v[8]; } Branchtab29[2]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v29 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi29_av(void *p,int starting_state){ + struct v29 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<16;i++) + vp->metrics1.v[i] = (vector unsigned char)(63); + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->c[starting_state & 255] = 0; /* Bias known start state */ + return 0; +} + +void 
set_viterbi29_polynomial_av(int polys[2]){ + int state; + + for(state=0;state < 128;state++){ + Branchtab29[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; + Branchtab29[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; + } + Init++; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi29_av(int len){ + struct v29 *vp; + + if(!Init){ + int polys[2] = { V29POLYA,V29POLYB }; + set_viterbi29_polynomial_av(polys); + } + if((vp = (struct v29 *)malloc(sizeof(struct v29))) == NULL) + return NULL; + if((vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi29_av(vp,0); + return vp; +} + +/* Viterbi chainback */ +int chainback_viterbi29_av( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v29 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + d = (decision_t *)vp->decisions; + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 256; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = d[nbits].c[endstate] & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi29_av(void *p){ + struct v29 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + + +int update_viterbi29_blk_av(void *p,unsigned char *syms,int nbits){ + struct v29 *vp = p; + decision_t *d; + int i; + + if(p == NULL) + return -1; + d = (decision_t *)vp->dp; + + while(nbits--){ + vector unsigned char sym1v,sym2v; + void *tmp; + + /* All this seems necessary just to load a byte into all elements of a vector! */ + sym1v = vec_perm(vec_ld(0,syms),vec_ld(1,syms),vec_lvsl(0,syms)); /* sym1v.0 = syms[0]; sym1v.1 = syms[1] */ + sym2v = vec_splat(sym1v,1); /* Splat syms[1] across sym2v */ + sym1v = vec_splat(sym1v,0); /* Splat syms[0] across sym1v */ + syms += 2; + + for(i=0;i<8;i++){ + vector bool char decision0,decision1; + vector unsigned char metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; + + /* Form branch metrics */ + metric = vec_avg(vec_xor(Branchtab29[0].v[i],sym1v),vec_xor(Branchtab29[1].v[i],sym2v)); + metric = vec_sr(metric,(vector unsigned char)(3)); + m_metric = (vector unsigned char)(31) - metric; + + /* Add branch metrics to path metrics */ + m0 = vec_adds(vp->old_metrics->v[i],metric); + m3 = vec_adds(vp->old_metrics->v[8+i],metric); + m1 = vec_adds(vp->old_metrics->v[8+i],m_metric); + m2 = vec_adds(vp->old_metrics->v[i],m_metric); + + /* Compare and select first set */ + decision0 = vec_cmpgt(m0,m1); + decision1 = vec_cmpgt(m2,m3); + survivor0 = vec_min(m0,m1); + survivor1 = vec_min(m2,m3); + + /* Interleave and store decisions and survivors */ + d->v[2*i] = vec_mergeh(decision0,decision1); + d->v[2*i+1] = vec_mergel(decision0,decision1); + vp->new_metrics->v[2*i] = vec_mergeh(survivor0,survivor1); 
+ vp->new_metrics->v[2*i+1] = vec_mergel(survivor0,survivor1); + } + d++; + /* renormalize if necessary */ + if(vp->new_metrics->c[0] >= 50){ + int i; + vector unsigned char scale0,scale1; + + /* Find smallest metric and splat */ + scale0 = vp->new_metrics->v[0]; + scale1 = vp->new_metrics->v[1]; + for(i=2;i<16;i+=2){ + scale0 = vec_min(scale0,vp->new_metrics->v[i]); + scale1 = vec_min(scale1,vp->new_metrics->v[i+1]); + } + scale0 = vec_min(scale0,scale1); + scale0 = vec_min(scale0,vec_sld(scale0,scale0,8)); + scale0 = vec_min(scale0,vec_sld(scale0,scale0,4)); + scale0 = vec_min(scale0,vec_sld(scale0,scale0,2)); + scale0 = vec_min(scale0,vec_sld(scale0,scale0,1)); + + /* Now subtract from all metrics */ + for(i=0;i<16;i++) + vp->new_metrics->v[i] = vec_subs(vp->new_metrics->v[i],scale0); + } + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + return 0; +} diff --git a/viterbi29_mmx.c b/viterbi29_mmx.c new file mode 100644 index 0000000..563f40a --- /dev/null +++ b/viterbi29_mmx.c @@ -0,0 +1,118 @@ +/* K=9 r=1/2 Viterbi decoder for MMX + * Copyright Feb 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include <mmintrin.h> +#include "fec.h" + +typedef union { char c[256]; __m64 v[32];} decision_t; +typedef union { unsigned char c[256]; __m64 v[32];} metric_t; + +unsigned char Mettab29_1[256][128] __attribute__ ((aligned(8))); +unsigned char Mettab29_2[256][128] __attribute__ ((aligned(8))); +static int Init = 0; + +/* State info for instance of Viterbi decoder + * Don't change this without also changing references in mmxbfly29.s! 
+ */ +struct v29 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi29_mmx(int len){ + struct v29 *vp; + + if(Init == 0){ + int polys[2] = {V29POLYA,V29POLYB}; + + set_viterbi29_polynomial_mmx(polys); + } + if((vp = (struct v29 *)malloc(sizeof(struct v29))) == NULL) + return NULL; + + if((vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi29(vp,0); + return vp; +} + +void set_viterbi29_polynomial_mmx(int polys[2]){ + int state; + + for(state=0;state < 128;state++){ + int symbol; + + for(symbol = 0;symbol < 256;symbol++){ + int sym; + + sym = parity((2*state) & abs(polys[0])) ^ (polys[0] < 0); + Mettab29_1[symbol][state] = (sym ? (255-symbol):symbol) / 16; + + sym = parity((2*state) & abs(polys[1])) ^ (polys[1] < 0); + Mettab29_2[symbol][state] = (sym ? 
(255-symbol):symbol) / 16; + } + } + Init++; +} + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi29_mmx(void *p,int starting_state){ + struct v29 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<256;i++) + vp->metrics1.c[i] = 63; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->c[starting_state & 255] = 0; /* Bias known start state */ + return 0; +} + +/* Viterbi chainback */ +int chainback_viterbi29_mmx( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + + struct v29 *vp = (struct v29 *)p; + decision_t *d; + + if(p == NULL) + return -1; + + d = (decision_t *)vp->decisions; + endstate &= 255; + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = d[nbits].c[endstate] & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi29_mmx(void *p){ + struct v29 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} diff --git a/viterbi29_port.c b/viterbi29_port.c new file mode 100644 index 0000000..292dce8 --- /dev/null +++ b/viterbi29_port.c @@ -0,0 +1,166 @@ +/* K=9 r=1/2 Viterbi decoder in portable C + * Copyright Feb 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include "fec.h" + +typedef union { unsigned int w[256]; } metric_t; +typedef union { unsigned long w[8];} decision_t; + +static union { unsigned char c[128]; } Branchtab29[2]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v29 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* 
Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi29_port(void *p,int starting_state){ + struct v29 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<256;i++) + vp->metrics1.w[i] = 63; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->w[starting_state & 255] = 0; /* Bias known start state */ + return 0; +} + +void set_viterbi29_polynomial_port(int polys[2]){ + int state; + + for(state=0;state < 128;state++){ + Branchtab29[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; + Branchtab29[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; + } + Init++; +} + + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi29_port(int len){ + struct v29 *vp; + + if(!Init){ + int polys[2] = {V29POLYA,V29POLYB}; + set_viterbi29_polynomial_port(polys); + } + if((vp = (struct v29 *)malloc(sizeof(struct v29))) == NULL) + return NULL; + + if((vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi29_port(vp,0); + + return vp; +} + + +/* Viterbi chainback */ +int chainback_viterbi29_port( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v29 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + + d = vp->decisions; + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 256; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].w[(endstate)/32] >> (endstate%32)) & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi29_port(void *p){ + struct v29 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + +/* C-language butterfly */ +#define BFLY(i) {\ +unsigned int metric,m0,m1,decision;\ + metric = (Branchtab29[0].c[i] ^ sym0) + (Branchtab29[1].c[i] ^ sym1);\ + m0 = vp->old_metrics->w[i] + metric;\ + m1 = vp->old_metrics->w[i+128] + (510 - metric);\ + decision = (signed int)(m0-m1) > 0;\ + vp->new_metrics->w[2*i] = decision ? m1 : m0;\ + d->w[i/16] |= decision << ((2*i)&31);\ + m0 -= (metric+metric-510);\ + m1 += (metric+metric-510);\ + decision = (signed int)(m0-m1) > 0;\ + vp->new_metrics->w[2*i+1] = decision ? m1 : m0;\ + d->w[i/16] |= decision << ((2*i+1)&31);\ +} + +/* Update decoder with a block of demodulated symbols + * Note that nbits is the number of decoded data bits, not the number + * of symbols! 
+ */ + +int update_viterbi29_blk_port(void *p,unsigned char *syms,int nbits){ + struct v29 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + + d = (decision_t *)vp->dp; + while(nbits--){ + void *tmp; + unsigned char sym0,sym1; + int i; + + for(i=0;i<8;i++) + d->w[i] = 0; + sym0 = *syms++; + sym1 = *syms++; + + for(i=0;i<128;i++) + BFLY(i); + + d++; + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + return 0; +} diff --git a/viterbi29_sse.c b/viterbi29_sse.c new file mode 100644 index 0000000..4a92e5f --- /dev/null +++ b/viterbi29_sse.c @@ -0,0 +1,114 @@ +/* K=9 r=1/2 Viterbi decoder for SSE + * Copyright Feb 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include <xmmintrin.h> +#include "fec.h" + +typedef union { unsigned char w[256]; __m64 v[32];} metric_t; +typedef union { unsigned long w[8]; unsigned char c[32]; __m64 v[4];} decision_t; + +union branchtab29 { unsigned char c[128]; } Branchtab29_sse[2]; +static int Init = 0; + +/* State info for instance of Viterbi decoder + * Don't change this without also changing references in [mmx|sse|sse2]bfly29.s! 
+ */ +struct v29 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi29_sse(int len){ + struct v29 *vp; + + if(!Init){ + int polys[2] = { V29POLYA,V29POLYB }; + + set_viterbi29_polynomial_sse(polys); + } + if((vp = (struct v29 *)malloc(sizeof(struct v29))) == NULL) + return NULL; + if((vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi29(vp,0); + return vp; +} + +void set_viterbi29_polynomial_sse(int polys[2]){ + int state; + + for(state=0;state < 128;state++){ + Branchtab29_sse[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; + Branchtab29_sse[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; + } + Init++; +} + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi29_sse(void *p,int starting_state){ + struct v29 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<256;i++) + vp->metrics1.w[i] = 200; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->w[starting_state & 255] = 0; /* Bias known start state */ + return 0; +} + +/* Viterbi chainback */ +int chainback_viterbi29_sse( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v29 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + d = vp->decisions; + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 256; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].c[endstate/8] >> (endstate%8)) & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi29_sse(void *p){ + struct v29 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} diff --git a/viterbi29_sse2.c b/viterbi29_sse2.c new file mode 100644 index 0000000..4c7336c --- /dev/null +++ b/viterbi29_sse2.c @@ -0,0 +1,119 @@ +/* K=9 r=1/2 Viterbi decoder for SSE2 + * Copyright Feb 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include <emmintrin.h> +#include "fec.h" + +typedef union { unsigned char c[256]; __m128i v[16];} metric_t; +typedef union { unsigned long w[8]; unsigned char c[32];} decision_t; + +union branchtab29 { unsigned char c[128]; } Branchtab29_sse2[2]; +static int Init = 0; + +/* State info for instance of Viterbi decoder + * Don't change this without also changing references in sse2bfly29.s! 
+ */ +struct v29 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi29_sse2(void *p,int starting_state){ + struct v29 *vp = p; + int i; + + for(i=0;i<256;i++) + vp->metrics1.c[i] = 63; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->c[starting_state & 255] = 0; /* Bias known start state */ + return 0; +} + +void set_viterbi29_polynomial_sse2(int polys[2]){ + int state; + + for(state=0;state < 128;state++){ + Branchtab29_sse2[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; + Branchtab29_sse2[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; + } + Init++; +} + + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi29_sse2(int len){ + void *p; + struct v29 *vp; + + if(!Init){ + int polys[2] = {V29POLYA,V29POLYB}; + + set_viterbi29_polynomial(polys); + } + /* Ordinary malloc() only returns 8-byte alignment, we need 16 */ + if(posix_memalign(&p, sizeof(__m128i),sizeof(struct v29))) + return NULL; + vp = (struct v29 *)p; + if((p = malloc((len+8)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + vp->decisions = (decision_t *)p; + init_viterbi29_sse2(vp,0); + return vp; +} + + +/* Viterbi chainback */ +int chainback_viterbi29_sse2( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v29 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + d = vp->decisions; + + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ 
+ endstate %= 256; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].c[endstate/8] >> (endstate%8)) & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi29_sse2(void *p){ + struct v29 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} diff --git a/viterbi39.c b/viterbi39.c new file mode 100644 index 0000000..ac28c2c --- /dev/null +++ b/viterbi39.c @@ -0,0 +1,153 @@ +/* Switch to K=9 r=1/3 Viterbi decoder with optional Intel or PowerPC SIMD + * Copyright Aug 2006, Phil Karn, KA9Q + */ +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include "fec.h" + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi39(int len){ + find_cpu_mode(); + + switch(Cpu_mode){ + case PORT: + default: + return create_viterbi39_port(len); +#ifdef __VEC__ + case ALTIVEC: + return create_viterbi39_av(len); +#endif +#ifdef __i386__ + case MMX: + return create_viterbi39_mmx(len); + case SSE: + return create_viterbi39_sse(len); + case SSE2: + return create_viterbi39_sse2(len); +#endif + } +} + +void set_viterbi39_polynomial(int polys[3]){ + switch(Cpu_mode){ + case PORT: + default: + set_viterbi39_polynomial_port(polys); + break; +#ifdef __VEC__ + case ALTIVEC: + set_viterbi39_polynomial_av(polys); + break; +#endif +#ifdef __i386__ + case MMX: + set_viterbi39_polynomial_mmx(polys); + break; + case SSE: + set_viterbi39_polynomial_sse(polys); + break; + case SSE2: + set_viterbi39_polynomial_sse2(polys); + break; +#endif + } +} + + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi39(void *p,int starting_state){ + switch(Cpu_mode){ + case PORT: + default: + return init_viterbi39_port(p,starting_state); +#ifdef __VEC__ + case ALTIVEC: + 
return init_viterbi39_av(p,starting_state); +#endif +#ifdef __i386__ + case MMX: + return init_viterbi39_mmx(p,starting_state); + case SSE: + return init_viterbi39_sse(p,starting_state); + case SSE2: + return init_viterbi39_sse2(p,starting_state); +#endif + } +} + +/* Viterbi chainback */ +int chainback_viterbi39( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + + switch(Cpu_mode){ + case PORT: + default: + return chainback_viterbi39_port(p,data,nbits,endstate); +#ifdef __VEC__ + case ALTIVEC: + return chainback_viterbi39_av(p,data,nbits,endstate); +#endif +#ifdef __i386__ + case MMX: + return chainback_viterbi39_mmx(p,data,nbits,endstate); + case SSE: + return chainback_viterbi39_sse(p,data,nbits,endstate); + case SSE2: + return chainback_viterbi39_sse2(p,data,nbits,endstate); +#endif + } +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi39(void *p){ + switch(Cpu_mode){ + case PORT: + default: + delete_viterbi39_port(p); + break; +#ifdef __VEC__ + case ALTIVEC: + delete_viterbi39_av(p); + break; +#endif +#ifdef __i386__ + case MMX: + delete_viterbi39_mmx(p); + break; + case SSE: + delete_viterbi39_sse(p); + break; + case SSE2: + delete_viterbi39_sse2(p); + break; +#endif + } +} + +/* Update decoder with a block of demodulated symbols + * Note that nbits is the number of decoded data bits, not the number + * of symbols! 
+ */ +int update_viterbi39_blk(void *p,unsigned char syms[],int nbits){ + switch(Cpu_mode){ + case PORT: + default: + return update_viterbi39_blk_port(p,syms,nbits); +#ifdef __VEC__ + case ALTIVEC: + return update_viterbi39_blk_av(p,syms,nbits); +#endif +#ifdef __i386__ + case MMX: + return update_viterbi39_blk_mmx(p,syms,nbits); + case SSE: + return update_viterbi39_blk_sse(p,syms,nbits); + case SSE2: + return update_viterbi39_blk_sse2(p,syms,nbits); +#endif + } +} diff --git a/viterbi39_av.c b/viterbi39_av.c new file mode 100644 index 0000000..2deed51 --- /dev/null +++ b/viterbi39_av.c @@ -0,0 +1,251 @@ +/* K=9 r=1/3 Viterbi decoder for PowerPC G4/G5 Altivec vector instructions + * 8-bit offset-binary soft decision samples + * Copyright Aug 2006, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include <limits.h> +#include "fec.h" + +typedef union { unsigned char c[2][16]; vector unsigned char v[2]; } decision_t; +typedef union { unsigned short s[256]; vector unsigned short v[32]; } metric_t; + +static union branchtab39 { unsigned short s[128]; vector unsigned short v[16];} Branchtab39[3]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v39 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + void *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + void *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi39_av(void *p,int starting_state){ + struct v39 *vp = p; + int i; + + for(i=0;i<32;i++) + vp->metrics1.v[i] = (vector unsigned short)(1000); + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->s[starting_state & 255] = 0; /* Bias known start state */ 
+ return 0; +} + +void set_viterbi39_polynomial_av(int polys[3]){ + int state; + + for(state=0;state < 128;state++){ + Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; + Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; + Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2*state) & abs(polys[2])) ? 255 : 0; + } + Init++; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi39_av(int len){ + struct v39 *vp; + + if(!Init){ + int polys[3] = { V39POLYA, V39POLYB, V39POLYC }; + + set_viterbi39_polynomial_av(polys); + } + vp = (struct v39 *)malloc(sizeof(struct v39)); + vp->decisions = malloc(sizeof(decision_t)*(len+8)); + init_viterbi39_av(vp,0); + return vp; +} + +/* Viterbi chainback */ +int chainback_viterbi39_av( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v39 *vp = p; + decision_t *d = (decision_t *)vp->decisions; + int path_metric; + + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 256; + + path_metric = vp->old_metrics->s[endstate]; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].c[endstate >> 7][endstate & 15] & (0x80 >> ((endstate>>4)&7)) ) ? 
1 : 0; + endstate = (k << 7) | (endstate >> 1); + data[nbits>>3] = endstate; + } + return path_metric; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi39_av(void *p){ + struct v39 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + +int update_viterbi39_blk_av(void *p,unsigned char *syms,int nbits){ + struct v39 *vp = p; + decision_t *d = (decision_t *)vp->dp; + int path_metric = 0; + vector unsigned char decisions = (vector unsigned char)(0); + + while(nbits--){ + vector unsigned short symv,sym0v,sym1v,sym2v; + vector unsigned char s; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + s = (vector unsigned char)vec_perm(vec_ld(0,syms),vec_ld(5,syms),vec_lvsl(0,syms)); + + symv = (vector unsigned short)vec_mergeh((vector unsigned char)(0),s); /* Unsigned byte->word unpack */ + sym0v = vec_splat(symv,0); + sym1v = vec_splat(symv,1); + sym2v = vec_splat(symv,2); + syms += 3; + + for(i=0;i<16;i++){ + vector bool short decision0,decision1; + vector unsigned short metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; + + /* Form branch metrics + * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, + * the XOR operations constitute conditional negation. 
+ * the metrics are in the range 0-765 + */ + m0 = vec_add(vec_xor(Branchtab39[0].v[i],sym0v),vec_xor(Branchtab39[1].v[i],sym1v)); + m1 = vec_xor(Branchtab39[2].v[i],sym2v); + metric = vec_add(m0,m1); + m_metric = vec_sub((vector unsigned short)(765),metric); + + /* Add branch metrics to path metrics */ + m0 = vec_adds(vp->old_metrics->v[i],metric); + m3 = vec_adds(vp->old_metrics->v[16+i],metric); + m1 = vec_adds(vp->old_metrics->v[16+i],m_metric); + m2 = vec_adds(vp->old_metrics->v[i],m_metric); + + /* Compare and select */ + decision0 = vec_cmpgt(m0,m1); + decision1 = vec_cmpgt(m2,m3); + survivor0 = vec_min(m0,m1); + survivor1 = vec_min(m2,m3); + + /* Store decisions and survivors. + * To save space without SSE2's handy PMOVMSKB instruction, we pack and store them in + * a funny interleaved fashion that we undo in the chainback function. + */ + decisions = vec_add(decisions,decisions); /* Shift each byte 1 bit to the left */ + + /* Booleans are either 0xff or 0x00. Subtracting 0x00 leaves the lsb zero; subtracting + * 0xff is equivalent to adding 1, which sets the lsb. 
+ */ + decisions = vec_sub(decisions,(vector unsigned char)vec_pack(vec_mergeh(decision0,decision1),vec_mergel(decision0,decision1))); + + vp->new_metrics->v[2*i] = vec_mergeh(survivor0,survivor1); + vp->new_metrics->v[2*i+1] = vec_mergel(survivor0,survivor1); + + if((i % 8) == 7){ + /* We've accumulated a total of 128 decisions, stash and start again */ + d->v[i>>3] = decisions; /* No need to clear, the new bits will replace the old */ + } + } +#if 0 + /* Experimentally determine metric spread + * The results are fixed for a given code and input symbol size + */ + { + int i; + vector unsigned short min_metric; + vector unsigned short max_metric; + union { vector unsigned short v; unsigned short s[8];} t; + int minimum,maximum; + static int max_spread = 0; + + min_metric = max_metric = vp->new_metrics->v[0]; + for(i=1;i<32;i++){ + min_metric = vec_min(min_metric,vp->new_metrics->v[i]); + max_metric = vec_max(max_metric,vp->new_metrics->v[i]); + } + min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,8)); + max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,8)); + min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,4)); + max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,4)); + min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,2)); + max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,2)); + + t.v = min_metric; + minimum = t.s[0]; + t.v = max_metric; + maximum = t.s[0]; + if(maximum-minimum > max_spread){ + max_spread = maximum-minimum; + printf("metric spread = %d\n",max_spread); + } + } +#endif + + /* Renormalize if necessary. This deserves some explanation. + * The maximum possible spread, found by experiment, for 8 bit symbols is about 3825 + * So by looking at one arbitrary metric we can tell if any of them have possibly saturated. + * However, this is very conservative. 
Large spreads occur only at very high Eb/No, where + * saturating a bad path metric doesn't do much to increase its chances of being erroneously chosen as a survivor. + + * At more interesting (low) Eb/No ratios, the spreads are much smaller so our chances of saturating a metric + * by not not normalizing when we should are extremely low. So either way, the risk to performance is small. + + * All this is borne out by experiment. + */ + if(vp->new_metrics->s[0] >= USHRT_MAX-5000){ + vector unsigned short scale; + union { vector unsigned short v; unsigned short s[8];} t; + + /* Find smallest metric and splat */ + scale = vp->new_metrics->v[0]; + for(i=1;i<32;i++) + scale = vec_min(scale,vp->new_metrics->v[i]); + + scale = vec_min(scale,vec_sld(scale,scale,8)); + scale = vec_min(scale,vec_sld(scale,scale,4)); + scale = vec_min(scale,vec_sld(scale,scale,2)); + + /* Subtract it from all metrics + * Work backwards to try to improve the cache hit ratio, assuming LRU + */ + for(i=31;i>=0;i--) + vp->new_metrics->v[i] = vec_subs(vp->new_metrics->v[i],scale); + t.v = scale; + path_metric += t.s[0]; + } + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + return path_metric; +} diff --git a/viterbi39_mmx.c b/viterbi39_mmx.c new file mode 100644 index 0000000..875391a --- /dev/null +++ b/viterbi39_mmx.c @@ -0,0 +1,185 @@ +/* K=9 r=1/3 Viterbi decoder for x86 MMX + * Aug 2006, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <mmintrin.h> +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include "fec.h" + +typedef union { unsigned char c[256]; __m64 v[32];} decision_t; +typedef union { unsigned short s[256]; __m64 v[64];} metric_t; + +static union branchtab39 { unsigned short s[128]; __m64 v[32];} Branchtab39[3]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v39 { + 
metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + void *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + void *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi39_mmx(void *p,int starting_state){ + struct v39 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<256;i++) + vp->metrics1.s[i] = 1000; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->s[starting_state & 255] = 0; /* Bias known start state */ + return 0; +} + +void set_viterbi39_polynomial_mmx(int polys[3]){ /* Fill the branch tables; a negative polynomial inverts that output, its magnitude gives the taps */ + int state; + + for(state=0;state < 128;state++){ + Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255:0; + Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255:0; + Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2*state) & abs(polys[2])) ? 255:0; + } + Init++; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi39_mmx(int len){ + struct v39 *vp; + + if(!Init){ + int polys[3] = { V39POLYA,V39POLYB,V39POLYC }; + set_viterbi39_polynomial_mmx(polys); + } + if((vp = (struct v39 *)malloc(sizeof(struct v39))) == NULL) + return NULL; + if((vp->decisions = malloc((len+8)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi39_mmx(vp,0); + return vp; +} + + + +/* Viterbi chainback */ +int chainback_viterbi39_mmx( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v39 *vp = p; + decision_t *d; + int path_metric; + + if(p == NULL) + return -1; + + d = (decision_t *)vp->decisions; + + endstate %= 256; + + path_metric = vp->old_metrics->s[endstate]; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = d[nbits].c[endstate] & 1; + endstate = (k << 7) | (endstate >> 1); + data[nbits>>3] = endstate; + } + return path_metric; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi39_mmx(void *p){ + struct v39 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + + +int update_viterbi39_blk_mmx(void *p,unsigned char *syms,int nbits){ + struct v39 *vp = p; + decision_t *d; + int path_metric = 0; + + if(p == NULL) + return -1; + + d = (decision_t *)vp->dp; + + while(nbits--){ + __m64 sym0v,sym1v,sym2v; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + sym0v = _mm_set1_pi16(syms[0]); + sym1v = _mm_set1_pi16(syms[1]); + sym2v = _mm_set1_pi16(syms[2]); + syms += 3; + + for(i=0;i<32;i++){ + __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; + + /* Form branch metrics + * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, + * the XOR operations constitute conditional negation. 
+ * metric and m_metric (-metric) are in the range 0-765 + */ + m0 = _mm_add_pi16(_mm_xor_si64(Branchtab39[0].v[i],sym0v),_mm_xor_si64(Branchtab39[1].v[i],sym1v)); + metric = _mm_add_pi16(_mm_xor_si64(Branchtab39[2].v[i],sym2v),m0); + m_metric = _mm_sub_pi16(_mm_set1_pi16(765),metric); + + /* Add branch metrics to path metrics */ + m0 = _mm_add_pi16(vp->old_metrics->v[i],metric); + m3 = _mm_add_pi16(vp->old_metrics->v[32+i],metric); + m1 = _mm_add_pi16(vp->old_metrics->v[32+i],m_metric); + m2 = _mm_add_pi16(vp->old_metrics->v[i],m_metric); + + /* Compare and select + * There's no packed min instruction in MMX, so we use modulo arithmetic + * to form the decisions and then do the select the hard way + */ + decision0 = _mm_cmpgt_pi16(_mm_sub_pi16(m0,m1),_mm_setzero_si64()); + decision1 = _mm_cmpgt_pi16(_mm_sub_pi16(m2,m3),_mm_setzero_si64()); + survivor0 = _mm_or_si64(_mm_and_si64(decision0,m1),_mm_andnot_si64(decision0,m0)); + survivor1 = _mm_or_si64(_mm_and_si64(decision1,m3),_mm_andnot_si64(decision1,m2)); + + /* Merge decisions and store as bytes */ + d->v[i] = _mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64())); + + /* Store surviving metrics */ + vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1); + vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1); + } + if(vp->new_metrics->s[0] < vp->old_metrics->s[0]) + path_metric += 65536; /* Hack: wraparound probably occurred */ + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + _mm_empty(); + return path_metric; +} diff --git a/viterbi39_port.c b/viterbi39_port.c new file mode 100644 index 0000000..5685c90 --- /dev/null +++ b/viterbi39_port.c @@ -0,0 +1,168 @@ +/* K=9 r=1/3 Viterbi decoder in portable C + * Copyright Aug 2006, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ 
+#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include "fec.h" + +typedef union { unsigned int w[256]; } metric_t; +typedef union { unsigned long w[8];} decision_t; + +static union { unsigned char c[128]; } Branchtab39[3]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v39 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi39_port(void *p,int starting_state){ + struct v39 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<256;i++) + vp->metrics1.w[i] = 63; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->w[starting_state & 255] = 0; /* Bias known start state */ + return 0; +} + +void set_viterbi39_polynomial_port(int polys[3]){ + int state; + + for(state=0;state < 128;state++){ + Branchtab39[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; + Branchtab39[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; + Branchtab39[2].c[state] = (polys[2] < 0) ^ parity((2*state) & abs(polys[2])) ? 
255 : 0; + } + Init++; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi39_port(int len){ + struct v39 *vp; + + if(!Init){ + int polys[3] = {V39POLYA,V39POLYB,V39POLYC}; + set_viterbi39_polynomial_port(polys); + } + if((vp = (struct v39 *)malloc(sizeof(struct v39))) == NULL) + return NULL; + + if((vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi39_port(vp,0); + + return vp; +} + + +/* Viterbi chainback */ +int chainback_viterbi39_port( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v39 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + + d = vp->decisions; + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 256; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].w[(endstate)/32] >> (endstate%32)) & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi39_port(void *p){ + struct v39 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + +/* C-language butterfly */ +#define BFLY(i) {\ +unsigned int metric,m0,m1,decision;\ + metric = (Branchtab39[0].c[i] ^ sym0) + (Branchtab39[1].c[i] ^ sym1) + \ + (Branchtab39[2].c[i] ^ sym2);\ + m0 = vp->old_metrics->w[i] + metric;\ + m1 = vp->old_metrics->w[i+128] + (765 - metric);\ + decision = (signed int)(m0-m1) > 0;\ + vp->new_metrics->w[2*i] = decision ? 
m1 : m0;\ + d->w[i/16] |= decision << ((2*i)&31);\ + m0 -= (metric+metric-765);\ + m1 += (metric+metric-765);\ + decision = (signed int)(m0-m1) > 0;\ + vp->new_metrics->w[2*i+1] = decision ? m1 : m0;\ + d->w[i/16] |= decision << ((2*i+1)&31);\ +} + +/* Update decoder with a block of demodulated symbols + * Note that nbits is the number of decoded data bits, not the number + * of symbols! + */ + +int update_viterbi39_blk_port(void *p,unsigned char *syms,int nbits){ + struct v39 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + + d = (decision_t *)vp->dp; + while(nbits--){ + void *tmp; + unsigned char sym0,sym1,sym2; + int i; + + for(i=0;i<8;i++) + d->w[i] = 0; + sym0 = *syms++; + sym1 = *syms++; + sym2 = *syms++; + + for(i=0;i<128;i++) + BFLY(i); + + d++; + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + return 0; +} diff --git a/viterbi39_sse.c b/viterbi39_sse.c new file mode 100644 index 0000000..c2f2865 --- /dev/null +++ b/viterbi39_sse.c @@ -0,0 +1,201 @@ +/* K=9 r=1/3 Viterbi decoder for x86 SSE + * Copyright Aug 2006, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <xmmintrin.h> +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include <limits.h> +#include "fec.h" + +typedef union { unsigned long w[8]; unsigned char c[32];} decision_t; +typedef union { signed short s[256]; __m64 v[64];} metric_t; + +static union branchtab39 { unsigned short s[128]; __m64 v[32];} Branchtab39[3]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v39 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + void *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + void *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int 
init_viterbi39_sse(void *p,int starting_state){ + struct v39 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<256;i++) + vp->metrics1.s[i] = (SHRT_MIN+1000); + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->s[starting_state & 255] = SHRT_MIN; /* Bias known start state */ + return 0; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi39_sse(int len){ + struct v39 *vp; + + if(!Init){ + int polys[3] = { V39POLYA, V39POLYB, V39POLYC }; + + set_viterbi39_polynomial_sse(polys); + } + if((vp = (struct v39 *)malloc(sizeof(struct v39))) == NULL){ + return NULL; + } + if((vp->decisions = malloc((len+8)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi39_sse(vp,0); + return vp; +} + +void set_viterbi39_polynomial_sse(int polys[3]){ /* Fill the branch tables; a negative polynomial inverts that output, its magnitude gives the taps */ + int state; + + for(state=0;state < 128;state++){ + Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255:0; + Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255:0; + Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2*state) & abs(polys[2])) ? 255:0; + } + Init++; +} + +/* Viterbi chainback */ +int chainback_viterbi39_sse( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v39 *vp = p; + decision_t *d; + int path_metric; + + if(p == NULL) + return -1; + d = (decision_t *)vp->decisions; + endstate %= 256; + + path_metric = vp->old_metrics->s[endstate]; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + /* k = (d[nbits].w[endstate/32] >> (endstate%32)) & 1;*/ + k = (d[nbits].c[endstate/8] >> (endstate%8)) & 1; + endstate = (k << 7) | (endstate >> 1); + data[nbits>>3] = endstate; + } + return path_metric - SHRT_MIN; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi39_sse(void *p){ + struct v39 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + + +int update_viterbi39_blk_sse(void *p,unsigned char *syms,int nbits){ + struct v39 *vp = p; + decision_t *d; + int path_metric = 0; + + if(p == NULL) + return -1; + d = (decision_t *)vp->dp; + while(nbits--){ + __m64 sym0v,sym1v,sym2v; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + sym0v = _mm_set1_pi16(syms[0]); + sym1v = _mm_set1_pi16(syms[1]); + sym2v = _mm_set1_pi16(syms[2]); + syms += 3; + + for(i=0;i<32;i++){ + __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; + + /* Form branch metrics + * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, + * the XOR operations constitute conditional negation. 
+ * metric and m_metric (-metric) are in the range 0-765 + */ + m0 = _mm_add_pi16(_mm_xor_si64(Branchtab39[0].v[i],sym0v),_mm_xor_si64(Branchtab39[1].v[i],sym1v)); + metric = _mm_add_pi16(_mm_xor_si64(Branchtab39[2].v[i],sym2v),m0); + m_metric = _mm_sub_pi16(_mm_set1_pi16(765),metric); + + /* Add branch metrics to path metrics */ + m0 = _mm_adds_pi16(vp->old_metrics->v[i],metric); + m3 = _mm_adds_pi16(vp->old_metrics->v[32+i],metric); + m1 = _mm_adds_pi16(vp->old_metrics->v[32+i],m_metric); + m2 = _mm_adds_pi16(vp->old_metrics->v[i],m_metric); + + /* Compare and select */ + survivor0 = _mm_min_pi16(m0,m1); + survivor1 = _mm_min_pi16(m2,m3); + decision0 = _mm_cmpeq_pi16(survivor0,m1); + decision1 = _mm_cmpeq_pi16(survivor1,m3); + + /* Pack decisions into 8 bits and store */ + d->c[i] = _mm_movemask_pi8(_mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64()))); + + /* Store surviving metrics */ + vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1); + vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1); + } + /* See if we need to renormalize + * Max metric spread for this code with 0-255 branch metrics is 12750 + */ + if(vp->new_metrics->s[0] >= SHRT_MAX-5000){ + int i,adjust; + __m64 adjustv; + union { __m64 v; signed short w[4]; } t; + + /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */ + adjustv = vp->new_metrics->v[0]; + for(i=1;i<64;i++) + adjustv = _mm_min_pi16(adjustv,vp->new_metrics->v[i]); + + adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,32)); + adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,16)); + t.v = adjustv; + adjust = t.w[0] - SHRT_MIN; + path_metric += adjust; + adjustv = _mm_set1_pi16(adjust); + + for(i=0;i<64;i++) + vp->new_metrics->v[i] = _mm_sub_pi16(vp->new_metrics->v[i],adjustv); + } + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp 
= d; + _mm_empty(); + return path_metric; +} diff --git a/viterbi39_sse2.c b/viterbi39_sse2.c new file mode 100644 index 0000000..f13794e --- /dev/null +++ b/viterbi39_sse2.c @@ -0,0 +1,200 @@ +/* K=9 r=1/3 Viterbi decoder for x86 SSE2 + * Copyright Mar 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <emmintrin.h> +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include <limits.h> +#include "fec.h" + +typedef union { unsigned long w[8]; unsigned short s[16];} decision_t; +typedef union { signed short s[256]; __m128i v[32];} metric_t; + +static union branchtab39 { unsigned short s[128]; __m128i v[16];} Branchtab39[3]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v39 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + void *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + void *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi39_sse2(void *p,int starting_state){ + struct v39 *vp = p; + int i; + + for(i=0;i<256;i++) + vp->metrics1.s[i] = (SHRT_MIN+1000); + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->s[starting_state & 255] = SHRT_MIN; /* Bias known start state */ + return 0; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi39_sse2(int len){ + void *p; + struct v39 *vp; + + if(!Init){ + int polys[3] = { V39POLYA, V39POLYB, V39POLYC }; + + set_viterbi39_polynomial_sse2(polys); + } + /* Ordinary malloc() only returns 8-byte alignment, we need 16 */ + if(posix_memalign(&p, sizeof(__m128i),sizeof(struct v39))) + return NULL; + + vp = (struct v39 *)p; + if((p = malloc((len+8)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + 
vp->decisions = (decision_t *)p; + init_viterbi39_sse2(vp,0); + return vp; +} + +void set_viterbi39_polynomial_sse2(int polys[3]){ /* Fill the branch tables; a negative polynomial inverts that output, its magnitude gives the taps */ + int state; + + for(state=0;state < 128;state++){ + Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255:0; + Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255:0; + Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2*state) & abs(polys[2])) ? 255:0; + } + Init++; +} + +/* Viterbi chainback */ +int chainback_viterbi39_sse2( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v39 *vp = p; + decision_t *d = (decision_t *)vp->decisions; + int path_metric; + + endstate %= 256; + + path_metric = vp->old_metrics->s[endstate]; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].s[endstate/16] >> (endstate%16)) & 1; /* Decisions are stored 16 per s[] entry; indexing s[] keeps the lookup independent of sizeof(long) */ + endstate = (k << 7) | (endstate >> 1); + data[nbits>>3] = endstate; + } + return path_metric; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi39_sse2(void *p){ + struct v39 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + + +int update_viterbi39_blk_sse2(void *p,unsigned char *syms,int nbits){ + struct v39 *vp = p; + decision_t *d = (decision_t *)vp->dp; + int path_metric = 0; + + while(nbits--){ + __m128i sym0v,sym1v,sym2v; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + sym0v = _mm_set1_epi16(syms[0]); + sym1v = _mm_set1_epi16(syms[1]); + sym2v = _mm_set1_epi16(syms[2]); + syms += 3; + + /* SSE2 has no packed min on unsigned shorts (only on signed ones), so we have to use signed shorts */ + for(i=0;i<16;i++){ + __m128i 
decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; + + /* Form branch metrics + * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, + * the XOR operations constitute conditional negation. + * metric and m_metric (-metric) are in the range 0-765 + */ + m0 = _mm_add_epi16(_mm_xor_si128(Branchtab39[0].v[i],sym0v),_mm_xor_si128(Branchtab39[1].v[i],sym1v)); + metric = _mm_add_epi16(_mm_xor_si128(Branchtab39[2].v[i],sym2v),m0); + m_metric = _mm_sub_epi16(_mm_set1_epi16(765),metric); + + /* Add branch metrics to path metrics */ + m0 = _mm_adds_epi16(vp->old_metrics->v[i],metric); + m3 = _mm_adds_epi16(vp->old_metrics->v[16+i],metric); + m1 = _mm_adds_epi16(vp->old_metrics->v[16+i],m_metric); + m2 = _mm_adds_epi16(vp->old_metrics->v[i],m_metric); + + /* Compare and select */ + survivor0 = _mm_min_epi16(m0,m1); + survivor1 = _mm_min_epi16(m2,m3); + decision0 = _mm_cmpeq_epi16(survivor0,m1); + decision1 = _mm_cmpeq_epi16(survivor1,m3); + + /* Pack each set of decisions into 8 8-bit bytes, then interleave them and compress into 16 bits */ + d->s[i] = _mm_movemask_epi8(_mm_unpacklo_epi8(_mm_packs_epi16(decision0,_mm_setzero_si128()),_mm_packs_epi16(decision1,_mm_setzero_si128()))); + + /* Store surviving metrics */ + vp->new_metrics->v[2*i] = _mm_unpacklo_epi16(survivor0,survivor1); + vp->new_metrics->v[2*i+1] = _mm_unpackhi_epi16(survivor0,survivor1); + } + /* See if we need to renormalize */ + if(vp->new_metrics->s[0] >= SHRT_MAX-5000){ + int i,adjust; + __m128i adjustv; + union { __m128i v; signed short w[8]; } t; + + /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */ + adjustv = vp->new_metrics->v[0]; + for(i=1;i<32;i++) + adjustv = _mm_min_epi16(adjustv,vp->new_metrics->v[i]); + + adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,8)); + adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,4)); + adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,2)); + t.v = adjustv; 
+ adjust = t.w[0] - SHRT_MIN; + path_metric += adjust; + adjustv = _mm_set1_epi16(adjust); + + /* We cannot use a saturated subtract, because we often have to adjust by more than SHRT_MAX + * This is okay since it can't overflow anyway + */ + for(i=0;i<32;i++) + vp->new_metrics->v[i] = _mm_sub_epi16(vp->new_metrics->v[i],adjustv); + } + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + return path_metric; +} + + diff --git a/viterbi615.c b/viterbi615.c new file mode 100644 index 0000000..6dda51f --- /dev/null +++ b/viterbi615.c @@ -0,0 +1,155 @@ +/* K=15 r=1/6 Viterbi decoder with optional Intel or PowerPC SIMD + * Copyright Feb 2004, Phil Karn, KA9Q + */ +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include "fec.h" + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi615(int len){ + + find_cpu_mode(); + + switch(Cpu_mode){ + case PORT: + default: + return create_viterbi615_port(len); +#ifdef __VEC__ + case ALTIVEC: + return create_viterbi615_av(len); +#endif +#ifdef __i386__ + case MMX: + return create_viterbi615_mmx(len); + case SSE: + return create_viterbi615_sse(len); + case SSE2: + return create_viterbi615_sse2(len); +#endif + } +} + +void set_viterbi615_polynomial(int polys[6]){ + + switch(Cpu_mode){ + case PORT: + default: + set_viterbi615_polynomial_port(polys); + break; +#ifdef __VEC__ + case ALTIVEC: + set_viterbi615_polynomial_av(polys); + break; +#endif +#ifdef __i386__ + case MMX: + set_viterbi615_polynomial_mmx(polys); + break; + case SSE: + set_viterbi615_polynomial_sse(polys); + break; + case SSE2: + set_viterbi615_polynomial_sse2(polys); + break; +#endif + } +} + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi615(void *p,int starting_state){ + switch(Cpu_mode){ + case PORT: + default: + return init_viterbi615_port(p,starting_state); +#ifdef __VEC__ + case ALTIVEC: + return 
init_viterbi615_av(p,starting_state); +#endif +#ifdef __i386__ + case MMX: + return init_viterbi615_mmx(p,starting_state); + case SSE: + return init_viterbi615_sse(p,starting_state); + case SSE2: + return init_viterbi615_sse2(p,starting_state); +#endif + } +} + +/* Viterbi chainback */ +int chainback_viterbi615( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + + switch(Cpu_mode){ + case PORT: + default: + return chainback_viterbi615_port(p,data,nbits,endstate); +#ifdef __VEC__ + case ALTIVEC: + return chainback_viterbi615_av(p,data,nbits,endstate); +#endif +#ifdef __i386__ + case MMX: + return chainback_viterbi615_mmx(p,data,nbits,endstate); + case SSE: + return chainback_viterbi615_sse(p,data,nbits,endstate); + case SSE2: + return chainback_viterbi615_sse2(p,data,nbits,endstate); +#endif + } +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi615(void *p){ + switch(Cpu_mode){ + case PORT: + default: + delete_viterbi615_port(p); + break; +#ifdef __VEC__ + case ALTIVEC: + delete_viterbi615_av(p); + break; +#endif +#ifdef __i386__ + case MMX: + delete_viterbi615_mmx(p); + break; + case SSE: + delete_viterbi615_sse(p); + break; + case SSE2: + delete_viterbi615_sse2(p); + break; +#endif + } +} + +/* Update decoder with a block of demodulated symbols + * Note that nbits is the number of decoded data bits, not the number + * of symbols! 
+ */ +int update_viterbi615_blk(void *p,unsigned char syms[],int nbits){ + switch(Cpu_mode){ + case PORT: + default: + return update_viterbi615_blk_port(p,syms,nbits); +#ifdef __VEC__ + case ALTIVEC: + return update_viterbi615_blk_av(p,syms,nbits); +#endif +#ifdef __i386__ + case MMX: + return update_viterbi615_blk_mmx(p,syms,nbits); + case SSE: + return update_viterbi615_blk_sse(p,syms,nbits); + case SSE2: + return update_viterbi615_blk_sse2(p,syms,nbits); +#endif + } +} + diff --git a/viterbi615_av.c b/viterbi615_av.c new file mode 100644 index 0000000..4a6ce9c --- /dev/null +++ b/viterbi615_av.c @@ -0,0 +1,257 @@ +/* K=15 r=1/6 Viterbi decoder for PowerPC G4/G5 Altivec vector instructions + * 8-bit offset-binary soft decision samples + * Copyright Mar 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include <limits.h> +#include "fec.h" + +typedef union { unsigned char c[128][16]; vector unsigned char v[128]; } decision_t; +typedef union { unsigned short s[16384]; vector unsigned short v[2048]; } metric_t; + +static union branchtab615 { unsigned short s[8192]; vector unsigned short v[1024];} Branchtab615[6]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v615 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + void *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + void *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi615_av(void *p,int starting_state){ + struct v615 *vp = p; + int i; + + if(p == NULL) + return -1; + + for(i=0;i<2048;i++) + vp->metrics1.v[i] = (vector unsigned short)(5000); + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + 
vp->old_metrics->s[starting_state & 16383] = 0; /* Bias known start state */ + return 0; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi615_av(int len){ + struct v615 *vp; + + if(!Init){ + int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF }; + set_viterbi615_polynomial_av(polys); + } + vp = (struct v615 *)malloc(sizeof(struct v615)); + vp->decisions = malloc(sizeof(decision_t)*(len+14)); + init_viterbi615_av(vp,0); + return vp; +} + +void set_viterbi615_polynomial_av(int polys[6]){ + int state; + int i; + + for(state=0;state < 8192;state++){ + for(i=0;i<6;i++) + Branchtab615[i].s[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 255 : 0; + } + Init++; +} + + +/* Viterbi chainback */ +int chainback_viterbi615_av( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v615 *vp = p; + decision_t *d = (decision_t *)vp->decisions; + int path_metric; + + endstate %= 16384; + + path_metric = vp->old_metrics->s[endstate]; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 14; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].c[endstate >> 7][endstate & 15] & (0x80 >> ((endstate>>4)&7)) ) ? 
1 : 0; + endstate = (k << 13) | (endstate >> 1); + data[nbits>>3] = endstate >> 6; + } + return path_metric; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi615_av(void *p){ + struct v615 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + +int update_viterbi615_blk_av(void *p,unsigned char *syms,int nbits){ + struct v615 *vp = p; + decision_t *d = (decision_t *)vp->dp; + int path_metric = 0; + vector unsigned char decisions = (vector unsigned char)(0); + + while(nbits--){ + vector unsigned short symv,sym0v,sym1v,sym2v,sym3v,sym4v,sym5v; + vector unsigned char s; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + s = (vector unsigned char)vec_perm(vec_ld(0,syms),vec_ld(5,syms),vec_lvsl(0,syms)); + + symv = (vector unsigned short)vec_mergeh((vector unsigned char)(0),s); /* Unsigned byte->word unpack */ + sym0v = vec_splat(symv,0); + sym1v = vec_splat(symv,1); + sym2v = vec_splat(symv,2); + sym3v = vec_splat(symv,3); + sym4v = vec_splat(symv,4); + sym5v = vec_splat(symv,5); + syms += 6; + + for(i=0;i<1024;i++){ + vector bool short decision0,decision1; + vector unsigned short metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; + + /* Form branch metrics + * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, + * the XOR operations constitute conditional negation. 
+ * metric and m_metric (-metric) are in the range 0-1530 + */ + m0 = vec_add(vec_xor(Branchtab615[0].v[i],sym0v),vec_xor(Branchtab615[1].v[i],sym1v)); + m1 = vec_add(vec_xor(Branchtab615[2].v[i],sym2v),vec_xor(Branchtab615[3].v[i],sym3v)); + m2 = vec_add(vec_xor(Branchtab615[4].v[i],sym4v),vec_xor(Branchtab615[5].v[i],sym5v)); + metric = vec_add(m0,m1); + metric = vec_add(metric,m2); + m_metric = vec_sub((vector unsigned short)(1530),metric); + + /* Add branch metrics to path metrics */ + m0 = vec_adds(vp->old_metrics->v[i],metric); + m3 = vec_adds(vp->old_metrics->v[1024+i],metric); + m1 = vec_adds(vp->old_metrics->v[1024+i],m_metric); + m2 = vec_adds(vp->old_metrics->v[i],m_metric); + + /* Compare and select */ + decision0 = vec_cmpgt(m0,m1); + decision1 = vec_cmpgt(m2,m3); + survivor0 = vec_min(m0,m1); + survivor1 = vec_min(m2,m3); + + /* Store decisions and survivors. + * To save space without SSE2's handy PMOVMSKB instruction, we pack and store them in + * a funny interleaved fashion that we undo in the chainback function. + */ + decisions = vec_add(decisions,decisions); /* Shift each byte 1 bit to the left */ + + /* Booleans are either 0xff or 0x00. Subtracting 0x00 leaves the lsb zero; subtracting + * 0xff is equivalent to adding 1, which sets the lsb. 
+ */ + decisions = vec_sub(decisions,(vector unsigned char)vec_pack(vec_mergeh(decision0,decision1),vec_mergel(decision0,decision1))); + + vp->new_metrics->v[2*i] = vec_mergeh(survivor0,survivor1); + vp->new_metrics->v[2*i+1] = vec_mergel(survivor0,survivor1); + + if((i % 8) == 7){ + /* We've accumulated a total of 128 decisions, stash and start again */ + d->v[i>>3] = decisions; /* No need to clear, the new bits will replace the old */ + } + } +#if 0 + /* Experimentally determine metric spread + * The results are fixed for a given code and input symbol size + */ + { + int i; + vector unsigned short min_metric; + vector unsigned short max_metric; + union { vector unsigned short v; unsigned short s[8];} t; + int minimum,maximum; + static int max_spread = 0; + + min_metric = max_metric = vp->new_metrics->v[0]; + for(i=1;i<2048;i++){ + min_metric = vec_min(min_metric,vp->new_metrics->v[i]); + max_metric = vec_max(max_metric,vp->new_metrics->v[i]); + } + min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,8)); + max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,8)); + min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,4)); + max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,4)); + min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,2)); + max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,2)); + + t.v = min_metric; + minimum = t.s[0]; + t.v = max_metric; + maximum = t.s[0]; + if(maximum-minimum > max_spread){ + max_spread = maximum-minimum; + printf("metric spread = %d\n",max_spread); + } + } +#endif + + /* Renormalize if necessary. This deserves some explanation. + + * The maximum possible spread, found by experiment, for 4-bit symbols is 405; for 8 bit symbols, it's 12750. + * So by looking at one arbitrary metric we can tell if any of them have possibly saturated. + * However, this is very conservative. 
Large spreads occur only at very high Eb/No, where + * saturating a bad path metric doesn't do much to increase its chances of being erroneously chosen as a survivor. + + * At more interesting (low) Eb/No ratios, the spreads are much smaller so our chances of saturating a metric + * by not normalizing when we should are extremely low. So either way, the risk to performance is small. + + * All this is borne out by experiment. + */ + if(vp->new_metrics->s[0] >= USHRT_MAX-12750){ + vector unsigned short scale; + union { vector unsigned short v; unsigned short s[8];} t; + + /* Find smallest metric and splat */ + scale = vp->new_metrics->v[0]; + for(i=1;i<2048;i++) + scale = vec_min(scale,vp->new_metrics->v[i]); + + scale = vec_min(scale,vec_sld(scale,scale,8)); + scale = vec_min(scale,vec_sld(scale,scale,4)); + scale = vec_min(scale,vec_sld(scale,scale,2)); + + /* Subtract it from all metrics + * Work backwards to try to improve the cache hit ratio, assuming LRU + */ + for(i=2047;i>=0;i--) + vp->new_metrics->v[i] = vec_subs(vp->new_metrics->v[i],scale); + t.v = scale; + path_metric += t.s[0]; + } + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + return path_metric; +} diff --git a/viterbi615_mmx.c b/viterbi615_mmx.c new file mode 100644 index 0000000..89a56f7 --- /dev/null +++ b/viterbi615_mmx.c @@ -0,0 +1,183 @@ +/* K=15 r=1/6 Viterbi decoder for x86 MMX + * Mar 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <mmintrin.h> +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include "fec.h" + +typedef union { unsigned char c[16384]; __m64 v[2048];} decision_t; +typedef union { unsigned short s[16384]; __m64 v[4096];} metric_t; + +static union branchtab615 { unsigned short s[8192]; __m64 v[2048];} Branchtab615[6]; +static int Init = 0; + +/* State info for instance of Viterbi
decoder */ +struct v615 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + void *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + void *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi615_mmx(void *p,int starting_state){ + struct v615 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<16384;i++) + vp->metrics1.s[i] = 5000; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->s[starting_state & 16383] = 0; /* Bias known start state */ + return 0; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi615_mmx(int len){ + struct v615 *vp; + + if(!Init){ + int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF }; + set_viterbi615_polynomial_mmx(polys); + } + + if((vp = (struct v615 *)malloc(sizeof(struct v615))) == NULL) + return NULL; + if((vp->decisions = malloc((len+14)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi615_mmx(vp,0); + return vp; +} + +void set_viterbi615_polynomial_mmx(int polys[6]){ + int state; + int i; + + for(state=0;state < 8192;state++){ + for(i=0;i<6;i++) + Branchtab615[i].s[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 255 : 0; + } + Init++; +} + +/* Viterbi chainback */ +int chainback_viterbi615_mmx( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v615 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + + d = (decision_t *)vp->decisions; + + endstate %= 16384; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 14; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = d[nbits].c[endstate] & 1; + endstate = (k << 13) | (endstate >> 1); + data[nbits>>3] = endstate >> 6; + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi615_mmx(void *p){ + struct v615 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + + +int update_viterbi615_blk_mmx(void *p,unsigned char *syms,int nbits){ + struct v615 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + + d = (decision_t *)vp->dp; + + while(nbits--){ + __m64 sym0v,sym1v,sym2v,sym3v,sym4v,sym5v; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + sym0v = _mm_set1_pi16(syms[0]); + sym1v = _mm_set1_pi16(syms[1]); + sym2v = _mm_set1_pi16(syms[2]); + sym3v = _mm_set1_pi16(syms[3]); + sym4v = _mm_set1_pi16(syms[4]); + sym5v = _mm_set1_pi16(syms[5]); + syms += 6; + + for(i=0;i<2048;i++){ + __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; + + /* Form branch metrics + * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, + * the XOR operations constitute conditional negation. 
+ * metric and m_metric (-metric) are in the range 0-1530 + */ + m0 = _mm_add_pi16(_mm_xor_si64(Branchtab615[0].v[i],sym0v),_mm_xor_si64(Branchtab615[1].v[i],sym1v)); + m1 = _mm_add_pi16(_mm_xor_si64(Branchtab615[2].v[i],sym2v),_mm_xor_si64(Branchtab615[3].v[i],sym3v)); + m2 = _mm_add_pi16(_mm_xor_si64(Branchtab615[4].v[i],sym4v),_mm_xor_si64(Branchtab615[5].v[i],sym5v)); + metric = _mm_add_pi16(m0,_mm_add_pi16(m1,m2)); + m_metric = _mm_sub_pi16(_mm_set1_pi16(1530),metric); + + /* Add branch metrics to path metrics */ + m0 = _mm_add_pi16(vp->old_metrics->v[i],metric); + m3 = _mm_add_pi16(vp->old_metrics->v[2048+i],metric); + m1 = _mm_add_pi16(vp->old_metrics->v[2048+i],m_metric); + m2 = _mm_add_pi16(vp->old_metrics->v[i],m_metric); + + /* Compare and select + * There's no packed min instruction in MMX, so we use modulo arithmetic + * to form the decisions and then do the select the hard way + */ + decision0 = _mm_cmpgt_pi16(_mm_sub_pi16(m0,m1),_mm_setzero_si64()); + decision1 = _mm_cmpgt_pi16(_mm_sub_pi16(m2,m3),_mm_setzero_si64()); + survivor0 = _mm_or_si64(_mm_and_si64(decision0,m1),_mm_andnot_si64(decision0,m0)); + survivor1 = _mm_or_si64(_mm_and_si64(decision1,m3),_mm_andnot_si64(decision1,m2)); + + /* Merge decisions and store as bytes */ + d->v[i] = _mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64())); + + /* Store surviving metrics */ + vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1); + vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1); + } + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + _mm_empty(); + return 0; +} diff --git a/viterbi615_port.c b/viterbi615_port.c new file mode 100644 index 0000000..89bdd80 --- /dev/null +++ b/viterbi615_port.c @@ -0,0 +1,156 @@ +/* K=15 r=1/6 Viterbi decoder in portable C + * Copyright Mar 2004, Phil Karn, KA9Q + * May be used 
under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include <limits.h> +#include "fec.h" + +typedef union { unsigned long w[512]; unsigned char c[2048];} decision_t; +typedef union { unsigned long w[16384]; } metric_t; + +static union branchtab615 { unsigned long w[8192]; } Branchtab615[6] __attribute__ ((aligned(16))); +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v615 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi615_port(int len){ + struct v615 *vp; + + if(!Init){ + int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF }; + set_viterbi615_polynomial_port(polys); + } + if((vp = (struct v615 *)malloc(sizeof(struct v615))) == NULL) + return NULL; + if((vp->decisions = malloc((len+14)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi615(vp,0); + return vp; +} + +void set_viterbi615_polynomial_port(int polys[6]){ + int state; + int i; + + for(state=0;state < 8192;state++){ + for(i=0;i<6;i++) + Branchtab615[i].w[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 
255 : 0; + } + Init++; +} + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi615_port(void *p,int starting_state){ + struct v615 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<16384;i++) + vp->metrics1.w[i] = 1000; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->w[starting_state & 16383] = 0; /* Bias known start state */ + return 0; +} + +/* Viterbi chainback */ +int chainback_viterbi615_port( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v615 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + d = (decision_t *)vp->decisions; + endstate %= 16384; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 14; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].c[endstate/8] >> (endstate%8)) & 1; + endstate = (k << 13) | (endstate >> 1); + data[nbits>>3] = endstate >> 6; + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi615_port(void *p){ + struct v615 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + +/* C-language butterfly */ +#define BFLY(i) {\ +unsigned long metric,m0,m1,m2,m3,decision0,decision1;\ + metric = ((Branchtab615[0].w[i] ^ syms[0]) + (Branchtab615[1].w[i] ^ syms[1])\ + +(Branchtab615[2].w[i] ^ syms[2]) + (Branchtab615[3].w[i] ^ syms[3])\ + +(Branchtab615[4].w[i] ^ syms[4]) + (Branchtab615[5].w[i] ^ syms[5]));\ + m0 = vp->old_metrics->w[i] + metric;\ + m1 = vp->old_metrics->w[i+8192] + (1530 - metric);\ + m2 = vp->old_metrics->w[i] + (1530-metric);\ + m3 = vp->old_metrics->w[i+8192] + metric;\ + decision0 = (signed long)(m0-m1) >= 0;\ + decision1 = (signed long)(m2-m3) >= 0;\ + vp->new_metrics->w[2*i] = decision0 ? 
m1 : m0;\ + vp->new_metrics->w[2*i+1] = decision1 ? m3 : m2;\ + d->c[i/4] |= ((decision0|(decision1<<1)) << ((2*i)&7));\ +} +/* Update decoder with a block of demodulated symbols + * Note that nbits is the number of decoded data bits, not the number + * of symbols! + */ + +int update_viterbi615_blk_port(void *p,unsigned char *syms,int nbits){ + struct v615 *vp = p; + void *tmp; + decision_t *d; + int i; + + if(p == NULL) + return -1; + d = (decision_t *)vp->dp; + while(nbits--){ + memset(d,0,sizeof(decision_t)); + for(i=0;i<8192;i++) + BFLY(i); + + syms += 6; + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + return 0; +} + diff --git a/viterbi615_sse.c b/viterbi615_sse.c new file mode 100644 index 0000000..de0f8af --- /dev/null +++ b/viterbi615_sse.c @@ -0,0 +1,201 @@ +/* K=15 r=1/6 Viterbi decoder for x86 SSE + * Copyright Mar 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include <xmmintrin.h> +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include <limits.h> +#include "fec.h" + +typedef union { unsigned long w[512]; unsigned char c[2048];} decision_t; +typedef union { signed short s[16384]; __m64 v[4096];} metric_t; + +static union branchtab615 { unsigned short s[8192]; __m64 v[2048];} Branchtab615[6]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v615 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + void *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + void *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi615_sse(void *p,int starting_state){ + struct v615 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<16384;i++) + 
vp->metrics1.s[i] = (SHRT_MIN+5000); + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->s[starting_state & 16383] = SHRT_MIN; /* Bias known start state */ + return 0; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi615_sse(int len){ + struct v615 *vp; + + if(!Init){ + int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF }; + set_viterbi615_polynomial_sse(polys); + } + + if((vp = (struct v615 *)malloc(sizeof(struct v615))) == NULL){ + return NULL; + } + if((vp->decisions = malloc((len+14)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi615_sse(vp,0); + return vp; +} + +void set_viterbi615_polynomial_sse(int polys[6]){ + int state; + int i; + + for(state=0;state < 8192;state++){ + for(i=0;i<6;i++) + Branchtab615[i].s[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 255 : 0; + } + Init++; +} + +/* Viterbi chainback */ +int chainback_viterbi615_sse( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v615 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + d = (decision_t *)vp->decisions; + endstate %= 16384; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 14; /* Look past tail */ + while(nbits-- != 0){ + int k; + + /* k = (d[nbits].w[endstate/32] >> (endstate%32)) & 1;*/ + k = (d[nbits].c[endstate/8] >> (endstate%8)) & 1; + endstate = (k << 13) | (endstate >> 1); + data[nbits>>3] = endstate >> 6; + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi615_sse(void *p){ + struct v615 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + + +int update_viterbi615_blk_sse(void *p,unsigned char *syms,int nbits){ + struct v615 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + d = (decision_t *)vp->dp; + while(nbits--){ + __m64 sym0v,sym1v,sym2v,sym3v,sym4v,sym5v; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + sym0v = _mm_set1_pi16(syms[0]); + sym1v = _mm_set1_pi16(syms[1]); + sym2v = _mm_set1_pi16(syms[2]); + sym3v = _mm_set1_pi16(syms[3]); + sym4v = _mm_set1_pi16(syms[4]); + sym5v = _mm_set1_pi16(syms[5]); + syms += 6; + + for(i=0;i<2048;i++){ + __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; + + /* Form branch metrics + * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, + * the XOR operations constitute conditional negation. 
+ * metric and m_metric (-metric) are in the range 0-1530 + */ + m0 = _mm_add_pi16(_mm_xor_si64(Branchtab615[0].v[i],sym0v),_mm_xor_si64(Branchtab615[1].v[i],sym1v)); + m1 = _mm_add_pi16(_mm_xor_si64(Branchtab615[2].v[i],sym2v),_mm_xor_si64(Branchtab615[3].v[i],sym3v)); + m2 = _mm_add_pi16(_mm_xor_si64(Branchtab615[4].v[i],sym4v),_mm_xor_si64(Branchtab615[5].v[i],sym5v)); + metric = _mm_add_pi16(m0,_mm_add_pi16(m1,m2)); + m_metric = _mm_sub_pi16(_mm_set1_pi16(1530),metric); + + /* Add branch metrics to path metrics */ + m0 = _mm_adds_pi16(vp->old_metrics->v[i],metric); + m3 = _mm_adds_pi16(vp->old_metrics->v[2048+i],metric); + m1 = _mm_adds_pi16(vp->old_metrics->v[2048+i],m_metric); + m2 = _mm_adds_pi16(vp->old_metrics->v[i],m_metric); + + /* Compare and select */ + survivor0 = _mm_min_pi16(m0,m1); + survivor1 = _mm_min_pi16(m2,m3); + decision0 = _mm_cmpeq_pi16(survivor0,m1); + decision1 = _mm_cmpeq_pi16(survivor1,m3); + + /* Pack decisions into 8 bits and store */ + d->c[i] = _mm_movemask_pi8(_mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64()))); + + /* Store surviving metrics */ + vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1); + vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1); + } + /* See if we need to renormalize + * Max metric spread for this code with 0-255 branch metrics is 12750 + */ + if(vp->new_metrics->s[0] >= SHRT_MAX-12750){ + int i,adjust; + __m64 adjustv; + union { __m64 v; signed short w[4]; } t; + + /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */ + adjustv = vp->new_metrics->v[0]; + for(i=1;i<4096;i++) + adjustv = _mm_min_pi16(adjustv,vp->new_metrics->v[i]); + + adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,32)); + adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,16)); + t.v = adjustv; + adjust = t.w[0] - SHRT_MIN; + adjustv = _mm_set1_pi16(adjust); + + for(i=0;i<4096;i++) + vp->new_metrics->v[i] = 
_mm_sub_pi16(vp->new_metrics->v[i],adjustv);
+    }
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d;
+  _mm_empty();
+  return 0;
+}
diff --git a/viterbi615_sse2.c b/viterbi615_sse2.c
new file mode 100644
index 0000000..7f711e5
--- /dev/null
+++ b/viterbi615_sse2.c
@@ -0,0 +1,204 @@
+/* K=15 r=1/6 Viterbi decoder for x86 SSE2
+ * Copyright Mar 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <emmintrin.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <limits.h>
+#include "fec.h"
+/* decision_t: the 16384 decision bits of one bit time, stored 16 at a time via s[] and read back via w[] as 32-bit words (w[state/32] >> state%32), so w[] must not be a 64-bit unsigned long */
+typedef union { unsigned int w[512]; unsigned short s[1024];} decision_t;
+typedef union { signed short s[16384]; __m128i v[2048];} metric_t;
+
+static union branchtab615 { unsigned short s[8192]; __m128i v[1024];} Branchtab615[6];
+static int Init = 0;
+
+/* State info for instance of Viterbi decoder */
+struct v615 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  void *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  void *decisions;   /* Beginning of decisions for block */
+};
+
+/* Initialize Viterbi decoder for start of new frame: every path metric becomes SHRT_MIN+5000 and starting_state (mod 16384) is favoured with SHRT_MIN. Returns 0, or -1 if p is NULL */
+int init_viterbi615_sse2(void *p,int starting_state){
+  struct v615 *vp = p;
+  int i;
+
+  if(p == NULL)
+    return -1;
+  for(i=0;i<16384;i++)
+    vp->metrics1.s[i] = (SHRT_MIN+5000);
+
+  vp->old_metrics = &vp->metrics1;
+  vp->new_metrics = &vp->metrics2;
+  vp->dp = vp->decisions; /* rewind to the first decision record of the block */
+  vp->old_metrics->s[starting_state & 16383] = SHRT_MIN; /* Bias known start state */
+  return 0;
+}
+
+/* Create a new instance of a Viterbi decoder with room for len data bits plus 14 tail bits; returns NULL if len is negative or memory cannot be obtained. Free with delete_viterbi615_sse2() */
+void *create_viterbi615_sse2(int len){
+  void *p;
+  struct v615 *vp;
+  if(len < 0) return NULL; /* a negative frame length would corrupt the size computed below */
+  if(!Init){
+    int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF };
+    set_viterbi615_polynomial_sse2(polys);
+  }
+
+  /* Ordinary malloc() only returns 8-byte alignment, we need 16 */
+  if(posix_memalign(&p, sizeof(__m128i),sizeof(struct v615)))
+    return NULL;
+
+  vp = (struct v615 *)p;
+  if((p = malloc(((size_t)len+14)*sizeof(decision_t))) == NULL){ /* size computed in size_t, not int */
+    free(vp);
+    return NULL;
+  }
+  vp->decisions = (decision_t *)p;
+  init_viterbi615_sse2(vp,0);
+  return vp;
+}
+/* Fill the six branch tables (entries 0 or 255) from the generator polynomials; a negative polynomial inverts its symbol. Also marks the tables as initialized */
+void set_viterbi615_polynomial_sse2(int polys[6]){
+  int state;
+  int i;
+
+  for(state=0;state < 8192;state++){
+    for(i=0;i<6;i++)
+      Branchtab615[i].s[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 255 : 0;
+  }
+  Init++;
+}
+
+/* Viterbi chainback: walk the stored decisions backwards from endstate and write the decoded bytes into data[]. Returns 0, or -1 if there is no decoder instance */
+int chainback_viterbi615_sse2(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v615 *vp = p;
+  decision_t *d = (p == NULL) ? NULL : (decision_t *)vp->decisions;
+  if(d == NULL) return -1; /* same NULL contract as init_viterbi615_sse2() */
+  endstate %= 16384;
+
+  /* The store into data[] only needs to be done every 8 bits.
+   * But this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  d += 14; /* Look past tail */
+  while(nbits-- != 0){
+    int k;
+
+    k = (d[nbits].w[endstate/32] >> (endstate%32)) & 1; /* decision bit recorded for this state */
+    endstate = (k << 13) | (endstate >> 1);
+    data[nbits>>3] = endstate >> 6;
+  }
+  return 0;
+}
+
+/* Delete instance of a Viterbi decoder */
+void delete_viterbi615_sse2(void *p){
+  struct v615 *vp = p;
+
+  if(vp != NULL){
+    free(vp->decisions);
+    free(vp);
+  }
+}
+
+/* Advance the decoder by nbits bit times; syms supplies 6 symbols per bit (offset binary, 0-255). Returns 0, or -1 if there is no decoder instance */
+int update_viterbi615_blk_sse2(void *p,unsigned char *syms,int nbits){
+  struct v615 *vp = p;
+  decision_t *d = (p == NULL) ? NULL : (decision_t *)vp->dp;
+  if(d == NULL) return -1; /* same NULL contract as init_viterbi615_sse2() */
+  while(nbits-- > 0){ /* "> 0" makes a negative count a no-op instead of a runaway loop */
+    __m128i sym0v,sym1v,sym2v,sym3v,sym4v,sym5v;
+    void *tmp;
+    int i;
+
+    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+    sym0v = _mm_set1_epi16(syms[0]);
+    sym1v = _mm_set1_epi16(syms[1]);
+    sym2v = _mm_set1_epi16(syms[2]);
+    sym3v = _mm_set1_epi16(syms[3]);
+    sym4v = _mm_set1_epi16(syms[4]);
+    sym5v = _mm_set1_epi16(syms[5]);
+    syms += 6;
+
+    /* SSE2 doesn't support saturated adds on unsigned shorts, so we have to use signed shorts */
+    for(i=0;i<1024;i++){
+      __m128i decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+
+      /* Form branch metrics
+       * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
+       * the XOR operations constitute conditional negation.
+       * metric and m_metric (-metric) are in the range 0-1530
+       */
+      m0 = _mm_add_epi16(_mm_xor_si128(Branchtab615[0].v[i],sym0v),_mm_xor_si128(Branchtab615[1].v[i],sym1v));
+      m1 = _mm_add_epi16(_mm_xor_si128(Branchtab615[2].v[i],sym2v),_mm_xor_si128(Branchtab615[3].v[i],sym3v));
+      m2 = _mm_add_epi16(_mm_xor_si128(Branchtab615[4].v[i],sym4v),_mm_xor_si128(Branchtab615[5].v[i],sym5v));
+      metric = _mm_add_epi16(m0,_mm_add_epi16(m1,m2));
+      m_metric = _mm_sub_epi16(_mm_set1_epi16(1530),metric);
+
+      /* Add branch metrics to path metrics */
+      m0 = _mm_adds_epi16(vp->old_metrics->v[i],metric);
+      m3 = _mm_adds_epi16(vp->old_metrics->v[1024+i],metric);
+      m1 = _mm_adds_epi16(vp->old_metrics->v[1024+i],m_metric);
+      m2 = _mm_adds_epi16(vp->old_metrics->v[i],m_metric);
+
+      /* Compare and select */
+      survivor0 = _mm_min_epi16(m0,m1);
+      survivor1 = _mm_min_epi16(m2,m3);
+      decision0 = _mm_cmpeq_epi16(survivor0,m1);
+      decision1 = _mm_cmpeq_epi16(survivor1,m3);
+
+      /* Pack each set of decisions into 8 8-bit bytes, then interleave them and compress into 16 bits */
+      d->s[i] = _mm_movemask_epi8(_mm_unpacklo_epi8(_mm_packs_epi16(decision0,_mm_setzero_si128()),_mm_packs_epi16(decision1,_mm_setzero_si128())));
+
+      /* Store surviving metrics */
+      vp->new_metrics->v[2*i] = _mm_unpacklo_epi16(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = _mm_unpackhi_epi16(survivor0,survivor1);
+    }
+    /* See if we need to renormalize (only metric 0 is sampled against the threshold)
+     * NOTE(review): the original remark here quoted 0-90 branch metrics and a spread of 405; this file forms 0-1530 metrics, so confirm the margin
+     */
+    if(vp->new_metrics->s[0] >= SHRT_MAX-12750){
+      int i,adjust;
+      __m128i adjustv;
+      union { __m128i v; signed short w[8]; } t;
+
+      /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */
+      adjustv = vp->new_metrics->v[0];
+      for(i=1;i<2048;i++)
+        adjustv = _mm_min_epi16(adjustv,vp->new_metrics->v[i]);
+
+      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,8)); /* fold 8 lanes to 4 ... */
+      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,4)); /* ... to 2 ... */
+      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,2)); /* ... to 1: lane 0 now holds the overall minimum */
+      t.v = adjustv;
+      adjust = t.w[0] - SHRT_MIN;
+      adjustv = _mm_set1_epi16(adjust);
+
+      /* We cannot use a saturated subtract, because we often have to adjust by more than SHRT_MAX
+       * This is okay since it can't overflow anyway
+       */
+      for(i=0;i<2048;i++)
+        vp->new_metrics->v[i] = _mm_sub_epi16(vp->new_metrics->v[i],adjustv);
+    }
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d;
+  return 0;
+}
+
+
diff --git a/vtest27.c b/vtest27.c
new file mode 100644
index 0000000..7256483
--- /dev/null
+++ b/vtest27.c
@@ -0,0 +1,184 @@
+/* Test viterbi decoder speeds */
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <math.h>
+#include <memory.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#endif
+#include "fec.h"
+
+#if HAVE_GETOPT_LONG
+struct option Options[] = {
+  {"frame-length",1,NULL,'l'},
+  {"frame-count",1,NULL,'n'},
+  {"ebn0",1,NULL,'e'},
+  {"gain",1,NULL,'g'},
+  {"verbose",0,NULL,'v'},
+  {"force-altivec",0,NULL,'a'},
+  {"force-port",0,NULL,'p'},
+  {"force-mmx",0,NULL,'m'},
+  {"force-sse",0,NULL,'s'},
+  {"force-sse2",0,NULL,'t'},
+  {NULL},
+};
+#endif
+
+#define RATE (1./2.)
+#define MAXBYTES 10000
+
+double Gain = 32.0;
+int Verbose = 0;
+/* With -e <Eb/N0 in dB>: encode random frames, add noise, decode and report BER/FER. Without it: time the decoder on constant input */
+int main(int argc,char *argv[]){
+  int i,d,tr;
+  int sr=0,trials = 10000,errcnt,framebits=2048;
+  long long int tot_errs=0;
+  unsigned char bits[MAXBYTES];
+  unsigned char data[MAXBYTES];
+  unsigned char xordata[MAXBYTES];
+  unsigned char symbols[8*2*(MAXBYTES+6)];
+  void *vp;
+  extern char *optarg;
+  struct rusage start,finish;
+  double extime;
+  double gain,esn0,ebn0;
+  time_t t;
+  int badframes=0;
+
+  time(&t);
+  srandom(t);
+  ebn0 = -100; /* sentinel: "no -e option given", selects the timing run below */
+#if HAVE_GETOPT_LONG
+  while((d = getopt_long(argc,argv,"l:n:te:g:vapmst",Options,NULL)) != EOF){
+#else
+  while((d = getopt(argc,argv,"l:n:te:g:vapmst")) != EOF){
+#endif
+    switch(d){
+    case 'a':
+      Cpu_mode = ALTIVEC;
+      break;
+    case 'p':
+      Cpu_mode = PORT;
+      break;
+    case 'm':
+      Cpu_mode = MMX;
+      break;
+    case 's':
+      Cpu_mode = SSE;
+      break;
+    case 't':
+      Cpu_mode = SSE2;
+      break;
+    case 'l':
+      framebits = atoi(optarg);
+      break;
+    case 'n':
+      trials = atoi(optarg);
+      break;
+    case 'e':
+      ebn0 = atof(optarg);
+      break;
+    case 'g':
+      Gain = atof(optarg);
+      break;
+    case 'v':
+      Verbose++;
+      break;
+    }
+  }
+  if(framebits > 8*MAXBYTES){
+    fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8);
+    framebits = MAXBYTES*8;
+  }
+  if((vp = create_viterbi27(framebits)) == NULL){
+    printf("create_viterbi27 failed\n");
+    exit(1);
+  }
+  if(ebn0 != -100){
+    esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */
+    /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing
+     * only half the noise power, and the sqrt() converts power to
+     * voltage.
+     */
+    gain = 1./sqrt(0.5/pow(10.,esn0/10.));
+
+    printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+
+    for(tr=0;tr<trials;tr++){
+      /* Encode a frame of random data */
+      for(i=0;i<framebits+6;i++){
+        int bit = (i < framebits) ? (random() & 1) : 0;
+
+        sr = ((sr << 1) | bit) & 0x3fffffff; /* keep sr bounded: an ever-growing signed left shift is undefined behaviour */
+        if(i < framebits) bits[i/8] = sr & 0xff; /* tail bits are not data; storing them could write past bits[] at the maximum frame size */
+        symbols[2*i+0] = addnoise(parity(sr & V27POLYA),gain,Gain,127.5,255);
+        symbols[2*i+1] = addnoise(parity(sr & V27POLYB),gain,Gain,127.5,255);
+      }
+      /* Decode it and make sure we get the right answer */
+      /* Initialize Viterbi decoder */
+      init_viterbi27(vp,0);
+
+      /* Decode block */
+      update_viterbi27_blk(vp,symbols,framebits+6);
+
+      /* Do Viterbi chainback */
+      chainback_viterbi27(vp,data,framebits,0);
+      errcnt = 0;
+      for(i=0;i<framebits/8;i++){
+        int e = Bitcnt[xordata[i] = data[i] ^ bits[i]];
+        errcnt += e;
+        tot_errs += e;
+      }
+      if(errcnt != 0)
+        badframes++;
+      if(Verbose > 1 && errcnt != 0){
+        printf("frame %d, %d errors: ",tr,errcnt);
+        for(i=0;i<framebits/8;i++){
+          printf("%02x",xordata[i]);
+        }
+        printf("\n");
+      }
+      if(Verbose)
+        printf("BER %lld/%lld (%10.3g) FER %d/%d (%10.3g)\r",
+               tot_errs,(long long)framebits*(tr+1),tot_errs/((double)framebits*(tr+1)),
+               badframes,tr+1,(double)badframes/(tr+1));
+      fflush(stdout);
+    }
+    if(Verbose > 1)
+      printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+    else if(Verbose == 0)
+      printf("BER %lld/%lld (%.3g) FER %d/%d (%.3g)\n",
+             tot_errs,(long long)framebits*trials,tot_errs/((double)framebits*trials),
+             badframes,trials,(double)badframes/trials); /* tr == trials after the loop, so tr+1 counted one frame too many */
+    else
+      printf("\n");
+
+  } else {
+    /* Do time trials */
+    memset(symbols,127,sizeof(symbols));
+    printf("Starting time trials\n");
+    getrusage(RUSAGE_SELF,&start);
+    for(tr=0;tr < trials;tr++){
+      /* Initialize Viterbi decoder */
+      init_viterbi27(vp,0);
+
+      /* Decode block */
+      update_viterbi27_blk(vp,symbols,framebits);
+
+      /* Do Viterbi chainback */
+      chainback_viterbi27(vp,data,framebits,0);
+    }
+    getrusage(RUSAGE_SELF,&finish);
+    extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec);
+    printf("Execution time for %d %d-bit frames: %.2f sec\n",trials,
+           framebits,extime);
+    printf("decoder speed: %g bits/s\n",(double)trials*framebits/extime); /* product formed in double: the int product can overflow */
+  }
+  exit(0);
+}
diff --git a/vtest29.c b/vtest29.c
new file mode 100644
index 0000000..8471b54
--- /dev/null
+++ b/vtest29.c
@@ -0,0 +1,185 @@
+/* Test viterbi decoder speeds */
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <math.h>
+#include <memory.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#endif
+#include "fec.h"
+
+#if HAVE_GETOPT_LONG
+struct option Options[] = {
+  {"frame-length",1,NULL,'l'},
+  {"frame-count",1,NULL,'n'},
+  {"ebn0",1,NULL,'e'},
+  {"gain",1,NULL,'g'},
+  {"verbose",0,NULL,'v'},
+  {"force-altivec",0,NULL,'a'},
+  {"force-port",0,NULL,'p'},
+  {"force-mmx",0,NULL,'m'},
+  {"force-sse",0,NULL,'s'},
+  {"force-sse2",0,NULL,'t'},
+  {NULL},
+};
+#endif
+
+#define RATE (1./2.)
+#define MAXBYTES 10000
+
+double Gain = 32.0;
+int Verbose = 0;
+/* With -e <Eb/N0 in dB>: encode random frames, add noise, decode and report BER/FER. Without it: time the decoder on constant input */
+int main(int argc,char *argv[]){
+  int i,d,tr;
+  int sr=0,trials = 10000,errcnt,framebits=2048;
+  long long tot_errs=0;
+  unsigned char bits[MAXBYTES];
+  unsigned char data[MAXBYTES];
+  unsigned char xordata[MAXBYTES];
+  unsigned char symbols[8*2*(MAXBYTES+8)];
+  void *vp;
+  extern char *optarg;
+  struct rusage start,finish;
+  double extime;
+  double gain,esn0,ebn0;
+  time_t t;
+  int badframes=0;
+
+  time(&t);
+  srandom(t);
+  ebn0 = -100; /* sentinel: "no -e option given", selects the timing run below */
+#if HAVE_GETOPT_LONG
+  while((d = getopt_long(argc,argv,"l:n:te:g:vapmst",Options,NULL)) != EOF){
+#else
+  while((d = getopt(argc,argv,"l:n:te:g:vapmst")) != EOF){
+#endif
+    switch(d){
+    case 'a':
+      Cpu_mode = ALTIVEC;
+      break;
+    case 'p':
+      Cpu_mode = PORT;
+      break;
+    case 'm':
+      Cpu_mode = MMX;
+      break;
+    case 's':
+      Cpu_mode = SSE;
+      break;
+    case 't':
+      Cpu_mode = SSE2;
+      break;
+    case 'l':
+      framebits = atoi(optarg);
+      break;
+    case 'n':
+      trials = atoi(optarg);
+      break;
+    case 'e':
+      ebn0 = atof(optarg);
+      break;
+    case 'g':
+      Gain = atof(optarg);
+      break;
+    case 'v':
+      Verbose++;
+      break;
+    }
+  }
+  if(framebits > 8*MAXBYTES){
+    fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8);
+    framebits = MAXBYTES*8;
+  }
+  if((vp = create_viterbi29(framebits)) == NULL){
+    printf("create_viterbi29 failed\n");
+    exit(1);
+  }
+  if(ebn0 != -100){
+    esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */
+    /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing
+     * only half the noise power, and the sqrt() converts power to
+     * voltage.
+     */
+    gain = 1./sqrt(0.5/pow(10.,esn0/10.));
+
+    printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+
+    for(tr=0;tr<trials;tr++){
+      /* Encode a frame of random data */
+      for(i=0;i<framebits+8;i++){
+        int bit = (i < framebits) ? (random() & 1) : 0;
+
+        sr = ((sr << 1) | bit) & 0x3fffffff; /* keep sr bounded: an ever-growing signed left shift is undefined behaviour */
+        if(i < framebits) bits[i/8] = sr & 0xff; /* tail bits are not data; storing them could write past bits[] at the maximum frame size */
+        symbols[2*i+0] = addnoise(parity(sr & V29POLYA),gain,Gain,127.5,255);
+        symbols[2*i+1] = addnoise(parity(sr & V29POLYB),gain,Gain,127.5,255);
+      }
+      /* Decode it and make sure we get the right answer */
+      /* Initialize Viterbi decoder */
+      init_viterbi29(vp,0);
+
+      /* Decode block */
+      update_viterbi29_blk(vp,symbols,framebits+8);
+
+      /* Do Viterbi chainback */
+      chainback_viterbi29(vp,data,framebits,0);
+      errcnt = 0;
+      for(i=0;i<framebits/8;i++){
+        int e = Bitcnt[xordata[i] = data[i] ^ bits[i]];
+        errcnt += e;
+        tot_errs += e;
+      }
+      if(errcnt != 0)
+        badframes++;
+      if(Verbose > 1 && errcnt != 0){
+        printf("frame %d, %d errors: ",tr,errcnt);
+        for(i=0;i<framebits/8;i++){
+          printf("%02x",xordata[i]);
+        }
+        printf("\n");
+      }
+      if(Verbose)
+        printf("BER %lld/%lld (%10.3g) FER %d/%d (%10.3g)\r",
+               tot_errs,(long long)framebits*(tr+1),tot_errs/((double)framebits*(tr+1)),
+               badframes,tr+1,(double)badframes/(tr+1));
+      fflush(stdout);
+    }
+    if(Verbose > 1)
+      printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+    else if(Verbose == 0)
+      printf("BER %lld/%lld (%.3g) FER %d/%d (%.3g)\n",
+             tot_errs,(long long)framebits*trials,tot_errs/((double)framebits*trials),
+             badframes,trials,(double)badframes/trials); /* tr == trials after the loop, so tr+1 counted one frame too many */
+    else
+      printf("\n");
+  } else {
+    /* Do time trials */
+    memset(symbols,127,sizeof(symbols));
+    printf("Starting time trials\n");
+    getrusage(RUSAGE_SELF,&start);
+    for(tr=0;tr < trials;tr++){
+      /* Initialize Viterbi decoder */
+      init_viterbi29(vp,0);
+
+      /* Decode block */
+      update_viterbi29_blk(vp,symbols,framebits);
+
+      /* Do Viterbi chainback */
+      chainback_viterbi29(vp,data,framebits,0);
+    }
+    getrusage(RUSAGE_SELF,&finish);
+    extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec);
+    printf("Execution time for %d %d-bit frames: %.2f sec\n",trials,
+           framebits,extime);
+    printf("decoder speed: %g bits/s\n",(double)trials*framebits/extime); /* product formed in double: the int product can overflow */
+  }
+  exit(0);
+}
+
+
diff --git a/vtest39.c b/vtest39.c
new file mode 100644
index 0000000..76723b2
--- /dev/null
+++ b/vtest39.c
@@ -0,0 +1,186 @@
+/* Test viterbi decoder speeds */
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <math.h>
+#include <memory.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#endif
+#include "fec.h"
+
+#if HAVE_GETOPT_LONG
+struct option Options[] = {
+  {"frame-length",1,NULL,'l'},
+  {"frame-count",1,NULL,'n'},
+  {"ebn0",1,NULL,'e'},
+  {"gain",1,NULL,'g'},
+  {"verbose",0,NULL,'v'},
+  {"force-altivec",0,NULL,'a'},
+  {"force-port",0,NULL,'p'},
+  {"force-mmx",0,NULL,'m'},
+  {"force-sse",0,NULL,'s'},
+  {"force-sse2",0,NULL,'t'},
+  {NULL},
+};
+#endif
+
+#define RATE (1./3.)
+#define MAXBYTES 10000
+
+double Gain = 32.0;
+int Verbose = 0;
+/* With -e <Eb/N0 in dB>: encode random frames, add noise, decode and report BER/FER. Without it: time the decoder on constant input */
+int main(int argc,char *argv[]){
+  int i,d,tr;
+  int sr=0,trials = 10000,errcnt,framebits=2048;
+  long long tot_errs=0;
+  unsigned char bits[MAXBYTES];
+  unsigned char data[MAXBYTES];
+  unsigned char xordata[MAXBYTES];
+  unsigned char symbols[8*3*(MAXBYTES+8)];
+  void *vp;
+  extern char *optarg;
+  struct rusage start,finish;
+  double extime;
+  double gain,esn0,ebn0;
+  time_t t;
+  int badframes=0;
+
+  time(&t);
+  srandom(t);
+  ebn0 = -100; /* sentinel: "no -e option given", selects the timing run below */
+#if HAVE_GETOPT_LONG
+  while((d = getopt_long(argc,argv,"l:n:te:g:vapmst",Options,NULL)) != EOF){
+#else
+  while((d = getopt(argc,argv,"l:n:te:g:vapmst")) != EOF){
+#endif
+    switch(d){
+    case 'a':
+      Cpu_mode = ALTIVEC;
+      break;
+    case 'p':
+      Cpu_mode = PORT;
+      break;
+    case 'm':
+      Cpu_mode = MMX;
+      break;
+    case 's':
+      Cpu_mode = SSE;
+      break;
+    case 't':
+      Cpu_mode = SSE2;
+      break;
+    case 'l':
+      framebits = atoi(optarg);
+      break;
+    case 'n':
+      trials = atoi(optarg);
+      break;
+    case 'e':
+      ebn0 = atof(optarg);
+      break;
+    case 'g':
+      Gain = atof(optarg);
+      break;
+    case 'v':
+      Verbose++;
+      break;
+    }
+  }
+  if(framebits > 8*MAXBYTES){
+    fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8);
+    framebits = MAXBYTES*8;
+  }
+  if((vp = create_viterbi39(framebits)) == NULL){
+    printf("create_viterbi39 failed\n");
+    exit(1);
+  }
+  if(ebn0 != -100){
+    esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */
+    /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing
+     * only half the noise power, and the sqrt() converts power to
+     * voltage.
+     */
+    gain = 1./sqrt(0.5/pow(10.,esn0/10.));
+
+    printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+
+    for(tr=0;tr<trials;tr++){
+      /* Encode a frame of random data */
+      for(i=0;i<framebits+8;i++){
+        int bit = (i < framebits) ? (random() & 1) : 0;
+
+        sr = ((sr << 1) | bit) & 0x3fffffff; /* keep sr bounded: an ever-growing signed left shift is undefined behaviour */
+        if(i < framebits) bits[i/8] = sr & 0xff; /* tail bits are not data; storing them could write past bits[] at the maximum frame size */
+        symbols[3*i+0] = addnoise(parity(sr & V39POLYA),gain,Gain,127.5,255);
+        symbols[3*i+1] = addnoise(parity(sr & V39POLYB),gain,Gain,127.5,255);
+        symbols[3*i+2] = addnoise(parity(sr & V39POLYC),gain,Gain,127.5,255);
+      }
+      /* Decode it and make sure we get the right answer */
+      /* Initialize Viterbi decoder */
+      init_viterbi39(vp,0);
+
+      /* Decode block */
+      update_viterbi39_blk(vp,symbols,framebits+8);
+
+      /* Do Viterbi chainback */
+      chainback_viterbi39(vp,data,framebits,0);
+      errcnt = 0;
+      for(i=0;i<framebits/8;i++){
+        int e = Bitcnt[xordata[i] = data[i] ^ bits[i]];
+        errcnt += e;
+        tot_errs += e;
+      }
+      if(errcnt != 0)
+        badframes++;
+      if(Verbose > 1 && errcnt != 0){
+        printf("frame %d, %d errors: ",tr,errcnt);
+        for(i=0;i<framebits/8;i++){
+          printf("%02x",xordata[i]);
+        }
+        printf("\n");
+      }
+      if(Verbose)
+        printf("BER %lld/%lld (%10.3g) FER %d/%d (%10.3g)\r",
+               tot_errs,(long long)framebits*(tr+1),tot_errs/((double)framebits*(tr+1)),
+               badframes,tr+1,(double)badframes/(tr+1));
+      fflush(stdout);
+    }
+    if(Verbose > 1)
+      printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+    else if(Verbose == 0)
+      printf("BER %lld/%lld (%.3g) FER %d/%d (%.3g)\n",
+             tot_errs,(long long)framebits*trials,tot_errs/((double)framebits*trials),
+             badframes,trials,(double)badframes/trials); /* tr == trials after the loop, so tr+1 counted one frame too many */
+    else
+      printf("\n");
+  } else {
+    /* Do time trials */
+    memset(symbols,127,sizeof(symbols));
+    printf("Starting time trials\n");
+    getrusage(RUSAGE_SELF,&start);
+    for(tr=0;tr < trials;tr++){
+      /* Initialize Viterbi decoder */
+      init_viterbi39(vp,0);
+
+      /* Decode block */
+      update_viterbi39_blk(vp,symbols,framebits);
+
+      /* Do Viterbi chainback */
+      chainback_viterbi39(vp,data,framebits,0);
+    }
+    getrusage(RUSAGE_SELF,&finish);
+    extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec);
+    printf("Execution time for %d %d-bit frames: %.2f sec\n",trials,
+           framebits,extime);
+    printf("decoder speed: %g bits/s\n",(double)trials*framebits/extime); /* product formed in double: the int product can overflow */
+  }
+  exit(0);
+}
+
+
diff --git a/vtest615.c b/vtest615.c
new file mode 100644
index 0000000..4bd8c4f
--- /dev/null
+++ b/vtest615.c
@@ -0,0 +1,191 @@
+/* Test viterbi decoder speeds */
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <math.h>
+#include <memory.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#endif
+#include "fec.h"
+
+#if HAVE_GETOPT_LONG
+struct option Options[] = {
+  {"frame-length",1,NULL,'l'},
+  {"frame-count",1,NULL,'n'},
+  {"ebn0",1,NULL,'e'},
+  {"gain",1,NULL,'g'},
+  {"verbose",0,NULL,'v'},
+  {"force-altivec",0,NULL,'a'},
+  {"force-port",0,NULL,'p'},
+  {"force-mmx",0,NULL,'m'},
+  {"force-sse",0,NULL,'s'},
+  {"force-sse2",0,NULL,'t'},
+  {NULL},
+};
+#endif
+
+#define RATE (1./6.)
+#define MAXBYTES 10000
+#define OFFSET (127.5)
+#define CLIP 255
+
+double Gain = 24.0;
+int Verbose = 0;
+/* With -e <Eb/N0 in dB>: encode random frames, add noise, decode and report BER/FER. Without it: time the decoder on constant input */
+int main(int argc,char *argv[]){
+  int i,d,tr;
+  int sr=0,trials = 10,errcnt,framebits=2048;
+  long long tot_errs=0; /* wide counter, matching the other vtest programs */
+  unsigned char bits[MAXBYTES];
+  unsigned char data[MAXBYTES];
+  unsigned char xordata[MAXBYTES];
+  unsigned char symbols[8*6*(MAXBYTES+14)];
+  void *vp;
+  extern char *optarg;
+  struct rusage start,finish;
+  double extime;
+  double gain,esn0,ebn0;
+  time_t t;
+  int badframes=0;
+
+  time(&t);
+  srandom(t);
+  ebn0 = -100; /* sentinel: "no -e option given", selects the timing run below */
+#if HAVE_GETOPT_LONG
+  while((d = getopt_long(argc,argv,"l:n:te:g:vapmst",Options,NULL)) != EOF){
+#else
+  while((d = getopt(argc,argv,"l:n:te:g:vapmst")) != EOF){
+#endif
+    switch(d){
+    case 'a':
+      Cpu_mode = ALTIVEC;
+      break;
+    case 'p':
+      Cpu_mode = PORT;
+      break;
+    case 'm':
+      Cpu_mode = MMX;
+      break;
+    case 's':
+      Cpu_mode = SSE;
+      break;
+    case 't':
+      Cpu_mode = SSE2;
+      break;
+    case 'l':
+      framebits = atoi(optarg);
+      break;
+    case 'n':
+      trials = atoi(optarg);
+      break;
+    case 'e':
+      ebn0 = atof(optarg);
+      break;
+    case 'g':
+      Gain = atof(optarg);
+      break;
+    case 'v':
+      Verbose++;
+      break;
+    }
+  }
+  if(framebits > 8*MAXBYTES){
+    fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8);
+    framebits = MAXBYTES*8;
+  }
+  if((vp = create_viterbi615(framebits)) == NULL){
+    printf("create_viterbi615 failed\n");
+    exit(1);
+  }
+  if(ebn0 != -100){
+    esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */
+    /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing
+     * only half the noise power, and the sqrt() converts power to
+     * voltage.
+     */
+    gain = 1./sqrt(0.5/pow(10.,esn0/10.));
+
+    printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+
+    for(tr=0;tr<trials;tr++){
+      /* Encode a frame of random data */
+      for(i=0;i<framebits+14;i++){
+        int bit = (i < framebits) ? (random() & 1) : 0;
+
+        sr = ((sr << 1) | bit) & 0x3fffffff; /* keep sr bounded: an ever-growing signed left shift is undefined behaviour */
+        if(i < framebits) bits[i/8] = sr & 0xff; /* tail bits are not data; storing them could write past bits[] at the maximum frame size */
+        symbols[6*i+0] = addnoise(parity(sr & V615POLYA),gain,Gain,OFFSET,CLIP);
+        symbols[6*i+1] = addnoise(parity(sr & V615POLYB),gain,Gain,OFFSET,CLIP);
+        symbols[6*i+2] = addnoise(parity(sr & V615POLYC),gain,Gain,OFFSET,CLIP);
+        symbols[6*i+3] = addnoise(parity(sr & V615POLYD),gain,Gain,OFFSET,CLIP);
+        symbols[6*i+4] = addnoise(parity(sr & V615POLYE),gain,Gain,OFFSET,CLIP);
+        symbols[6*i+5] = addnoise(parity(sr & V615POLYF),gain,Gain,OFFSET,CLIP);
+      }
+      /* Decode it and make sure we get the right answer */
+      /* Initialize Viterbi decoder */
+      init_viterbi615(vp,0);
+
+      /* Decode block */
+      update_viterbi615_blk(vp,symbols,framebits+14);
+
+      /* Do Viterbi chainback */
+      chainback_viterbi615(vp,data,framebits,0);
+      errcnt = 0;
+      for(i=0;i<framebits/8;i++){
+        int e = Bitcnt[xordata[i] = data[i] ^ bits[i]];
+        errcnt += e;
+        tot_errs += e;
+      }
+      if(errcnt != 0)
+        badframes++;
+      if(Verbose > 1 && errcnt != 0){
+        printf("frame %d, %d errors: ",tr,errcnt);
+        for(i=0;i<framebits/8;i++){
+          printf("%02x",xordata[i]);
+        }
+        printf("\n");
+      }
+      if(Verbose)
+        printf("BER %lld/%lld (%10.3g) FER %d/%d (%10.3g)\r",
+               tot_errs,(long long)framebits*(tr+1),tot_errs/((double)framebits*(tr+1)),
+               badframes,(tr+1),(double)badframes/(tr+1));
+      fflush(stdout);
+
+    }
+
+    if(Verbose > 1)
+      printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+    else if(Verbose == 0)
+      printf("BER %lld/%lld (%.3g) FER %d/%d (%.3g)\n",
+             tot_errs,(long long)framebits*trials,tot_errs/((double)framebits*trials),
+             badframes,trials,(double)badframes/trials); /* tr == trials after the loop, so tr+1 counted one frame too many */
+    else
+      printf("\n");
+  } else {
+    /* Do time trials */
+    memset(symbols,127,sizeof(symbols));
+    printf("Starting time trials\n");
+    getrusage(RUSAGE_SELF,&start);
+    for(tr=0;tr < trials;tr++){
+      /* Initialize Viterbi decoder */
+      init_viterbi615(vp,0);
+
+      /* Decode block */
+      update_viterbi615_blk(vp,symbols,framebits+14);
+
+      /* Do Viterbi chainback */
+      chainback_viterbi615(vp,data,framebits,0);
+    }
+    getrusage(RUSAGE_SELF,&finish);
+    extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec);
+    printf("Execution time for %d %d-bit frames: %.2f sec\n",trials,
+           framebits,extime);
+    printf("decoder speed: %g bits/s\n",(double)trials*framebits/extime); /* product formed in double: the int product can overflow */
+  }
+  exit(0);
+}
|