diff options
107 files changed, 11792 insertions, 4 deletions
diff --git a/INSTALL b/INSTALL
new file mode 100644
index 0000000..e9e5baf
--- /dev/null
@@ -0,0 +1,39 @@
+To build and install the libfec libraries, simply say
+make test (optional)
+make install (as root)
+By default, "make install" puts the libfec libraries in
+/usr/local/lib, the include files in /usr/local/include, and the
+manual page in /usr/local/man.
+You may have an old version of the GNU assembler that cannot handle
+the relatively new SSE2 mnemonics. Update your version of the GNU
+"binutils" package.
+You may obtain the latest binutils package through your normal
+distribution channels or from:
+After running the ./configure script, optional tests can be built and
+run as follows:
+make test
+"make test" tests each routine, using the SIMD versions as
+appropriate, verifying correct operation and estimating Viterbi
+decoding speeds. These tests should always succeed unless something is broken.
+28 Mar 2004
+Phil Karn, karn@ka9q.net
diff --git a/README b/README
new file mode 100644
index 0000000..95253e2
--- /dev/null
+++ b/README
@@ -0,0 +1,120 @@
+This package is copyright 2006 by Phil Karn, KA9Q. It may be used
+under the terms of the GNU Lesser General Public License (LGPL). See
+the file "lesser.txt" in this package for license details.
+This package provides a set of functions that implement several
+popular forward error correction (FEC) algorithms and several low-level routines
+useful in modems implemented with digital signal processing (DSP).
+The following routines are provided:
+1. Viterbi decoders for the following convolutional codes:
+r=1/2 k=7 ("Voyager" code, now a widely used industry standard)
+r=1/2 k=9 (Used on the IS-95 CDMA forward link)
+r=1/6 k=15 ("Cassini" code, used by several NASA/JPL deep space missions)
+2. Reed-Solomon encoders and decoders for any user-specified code.
+3. Optimized encoder and decoder for the CCSDS-standard (255,223)
+Reed-Solomon code, with and without the CCSDS-standard "dual basis"
+symbol representation.
+4. Compute dot product between a 16-bit buffer and a set of 16-bit
+coefficients. This is the basic DSP primitive for digital filtering
+and correlation.
+5. Compute sum of squares of a buffer of 16-bit signed integers. This is
+useful in DSP for finding the total energy in a signal.
+6. Find peak value in a buffer of 16-bit signed integers, useful for
+scaling a signal to prevent overflow.
+This package automatically makes use of various SIMD (Single
+Instruction stream, Multiple Data stream) instruction sets, when
+available: MMX, SSE and SSE2 on the IA-32 (Intel) architecture, and
+Altivec on the PowerPC G4 and G5 used by Power Macintoshes.
+"Altivec" is a Motorola trademark; Apple calls it "Velocity Engine",
+and IBM calls it "VMX". Altivec is roughly comparable to SSE2 on the IA-32.
+Many of the SIMD versions run more than an order of
+magnitude faster than their portable C versions. The available SIMD
+instruction sets, if any, are determined at run time and the proper
+version of each routine is automatically selected. If no SIMD
+instructions are available, the portable C version is invoked by
+default. On targets other than IA-32 and PPC, only the portable C
+version is built.
+The SIMD-assisted versions generally produce the same results as the C
+versions, with a few minor exceptions. The Viterbi decoders in C have
+a very slightly greater Eb/No performance due to their use of 32-bit
+path metrics. On the other hand, the SIMD versions use the
+"saturating" arithmetic available in these instructions to avoid the
+integer wraparounds that can occur in C when argument ranges are not
+properly constrained. This applies primarily to the "dotprod" (dot
+product) function.
+The MMX (MultiMedia eXtensions) instruction set was introduced on
+later Pentium CPUs; it is also implemented on the Pentium II and most
+AMD CPUs starting with the K6. SSE (Streaming SIMD Extensions) was
+introduced in the Pentium III; AMD calls it "3D Now! Professional".
+Intel introduced SSE2 on the Pentium 4, and it has been picked up by
+later AMD CPUs. SSE support implies MMX support, while SSE2 support
+implies both SSE and MMX support.
+The latest IA-32 SIMD instruction set, SSE3 (also known as "Prescott
+New Instructions") was introduced in early 2004 with the latest
+("Prescott") revision of the Pentium 4. Relatively little was
+introduced with SSE3, and this library currently makes no use of it.
+See the various manual pages for details on how to use the library routines.
+Copyright 2006, Phil Karn, KA9Q
+This software may be used under the terms of the GNU Lesser General
+Public License (LGPL); see the file lesser.txt for details.
+Revision history:
+Version 1.0 released 29 May 2001
+Version 2.0 released 3 Dec 2001:
+Restructured to add support for shared libraries.
+Version 2.0.1 released 8 Dec 2001:
+Includes autoconf/configure script
+Version 2.0.2 released 4 Feb 2002:
+Add SIMD version override options
+Test for lack of SSE2 mnemonic support in 'as'
+Build only selected version
+Version 2.0.3 released 6 Feb 2002:
+Fix to parityb function in parity.h
+feclib version 1.0 released November 2003
+Merged SIMD-Viterbi, RS and DSP libraries
+Changed SIMD Viterbi decoder to detect SSE2/SSE/MMX at runtime rather than build time
+feclib version 2.0 (unreleased) Mar 2004
+General speedups and cleanups
+Switch from 4 to 8-bit input symbols on all Viterbi decoders
+Support for Altivec on PowerPC
+Support for k=15 r=1/6 Cassini/Mars Pathfinder/Mars Exploration Rover/STEREO code
+Changed license to GNU Lesser General Public License (LGPL)
+feclib version 2.1 June 5 2006
+Added error checking, fixed alignment bug in SSE2 versions of Viterbi decoders causing segfaults
+feclib version 2.1.1 June 6 2006
+Fix test/benchmark time measurement on Linux
diff --git a/README.md b/README.md
deleted file mode 100644
index fdafed0..0000000
--- a/README.md
+++ /dev/null
@@ -1,4 +0,0 @@
-This is a fork of KA9Q's FEC library
diff --git a/ccsds.h b/ccsds.h
new file mode 100644
index 0000000..ae65468
--- /dev/null
+++ b/ccsds.h
@@ -0,0 +1,5 @@
+/* Symbol type used by the CCSDS Reed-Solomon wrapper */
+typedef unsigned char data_t;
+/* Byte-indexed lookup tables defined elsewhere. decode_rs_ccsds() maps its
+ * input through Tal1tab (dual basis to conventional) and its corrected
+ * output through Taltab (conventional to dual basis).
+ */
+extern unsigned char Taltab[],Tal1tab[];
+/* Total number of symbols in a block */
+#define NN 255
+/* Number of generator polynomial roots */
+#define NROOTS 32
diff --git a/char.h b/char.h
new file mode 100644
index 0000000..25efd65
--- /dev/null
+++ b/char.h
@@ -0,0 +1,24 @@
+/* Stuff specific to the 8-bit symbol version of the general purpose RS codecs
+ *
+ * Copyright 2003, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+/* Data symbol type for this codec variant */
+typedef unsigned char data_t;
+/* Every macro below reads through a pointer variable named rs, so a pointer
+ * to the codec's control structure must be in scope under exactly that name
+ * wherever these macros are expanded.
+ */
+#define MODNN(x) modnn(rs,x)
+#define MM (rs->mm)
+#define NN (rs->nn)
+#define ALPHA_TO (rs->alpha_to)
+#define INDEX_OF (rs->index_of)
+#define GENPOLY (rs->genpoly)
+#define NROOTS (rs->nroots)
+#define FCR (rs->fcr)
+#define PRIM (rs->prim)
+#define IPRIM (rs->iprim)
+#define PAD (rs->pad)
+/* Index-form stand-in for the zero element: the decoder stores A0 wherever a
+ * polynomial-form value is 0 and tests against it before using a log value.
+ */
+#define A0 (NN)
diff --git a/configure.in b/configure.in
new file mode 100644
index 0000000..4e4110b
--- /dev/null
+++ b/configure.in
@@ -0,0 +1,83 @@
+dnl Process this file with autoconf to produce a configure script.
+dnl Checks for programs.
+if test $GCC != "yes"
+ AC_MSG_ERROR([Need GNU C compiler])
+dnl Checks for libraries.
+AC_CHECK_LIB(c, malloc)
+dnl Checks for header files.
+AC_CHECK_HEADERS(getopt.h stdio.h stdlib.h memory.h string.h)
+if test -z "$HAVE_stdio.h"
+ AC_MSG_ERROR([Need stdio.h!])
+if test -z "$HAVE_stdlib.h"
+ AC_MSG_ERROR([Need stdlib.h!])
+dnl memory.h check: test the memory.h result rather than stdlib.h a second time.
+dnl NOTE(review): the shell reads "$HAVE_memory.h" as the variable HAVE_memory
+dnl followed by a literal ".h", so this -z test (like the neighbouring header
+dnl tests) can never be true; confirm whether $ac_cv_header_memory_h was meant.
+if test -z "$HAVE_memory.h"
+ AC_MSG_ERROR([Need memory.h!])
+if test -z "$HAVE_string.h"
+ AC_MSG_ERROR([Need string.h])
+case $target_cpu in
+ ARCH_OPTION="-march=$target_cpu"
+ MLIBS="viterbi27_mmx.o mmxbfly27.o viterbi27_sse.o ssebfly27.o viterbi27_sse2.o sse2bfly27.o \
+ viterbi29_mmx.o mmxbfly29.o viterbi29_sse.o ssebfly29.o viterbi29_sse2.o sse2bfly29.o \
+ viterbi39_sse2.o viterbi39_sse.o viterbi39_mmx.o \
+ viterbi615_mmx.o viterbi615_sse.o viterbi615_sse2.o \
+ dotprod_mmx.o dotprod_mmx_assist.o \
+ dotprod_sse2.o dotprod_sse2_assist.o \
+ peakval_mmx.o peakval_mmx_assist.o \
+ peakval_sse.o peakval_sse_assist.o \
+ peakval_sse2.o peakval_sse2_assist.o \
+ sumsq.o sumsq_port.o \
+ sumsq_sse2.o sumsq_sse2_assist.o \
+ sumsq_mmx.o sumsq_mmx_assist.o \
+ cpu_features.o cpu_mode_x86.o"
+ ;;
+ ARCH_OPTION="-fno-common -faltivec"
+ MLIBS="viterbi27_av.o viterbi29_av.o viterbi39_av.o viterbi615_av.o \
+ encode_rs_av.o \
+ dotprod_av.o sumsq_av.o peakval_av.o cpu_mode_ppc.o"
+ ;;
+case $target_os in
+ SH_LIB=libfec.dylib
+ ;;
+ SH_LIB=libfec.so
+ REBIND=ldconfig
+ ;;
+dnl Checks for library functions.
+AC_CHECK_FUNCS(getopt_long memset memmove)
diff --git a/cpu_features.s b/cpu_features.s
new file mode 100644
index 0000000..ef4ba4e
--- /dev/null
+++ b/cpu_features.s
@@ -0,0 +1,15 @@
+# cpu_features: run CPUID with EAX=1 and return the resulting EDX word in EAX.
+# The caller (find_cpu_mode) tests bits 23, 25 and 26 of this value for
+# MMX, SSE and SSE2 respectively.
+# EBX, ECX and EDX are pushed on entry and popped before returning.
+.global cpu_features
+ .type cpu_features,@function
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl $1,%eax # CPUID function number
+ cpuid
+ movl %edx,%eax # return value = EDX from CPUID
+ popl %edx
+ popl %ecx
+ popl %ebx
+ ret
+ \ No newline at end of file
diff --git a/cpu_mode_ppc.c b/cpu_mode_ppc.c
new file mode 100644
index 0000000..0071558
--- /dev/null
+++ b/cpu_mode_ppc.c
@@ -0,0 +1,40 @@
+/* Determine CPU support for SIMD on Power PC
+ * Copyright 2004 Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include "fec.h"
+#ifdef __VEC__
+#include <sys/sysctl.h>
+/* Various SIMD instruction set names */
+char *Cpu_modes[] = {"Unknown","Portable C","x86 Multi Media Extensions (MMX)",
+ "x86 Streaming SIMD Extensions (SSE)",
+ "x86 Streaming SIMD Extensions 2 (SSE2)",
+ "PowerPC G4/G5 Altivec/Velocity Engine"};
+enum cpu_mode Cpu_mode;
+/* Set Cpu_mode for PowerPC builds and print the selected mode name on stderr.
+ * Returns immediately, without printing, if Cpu_mode is already something
+ * other than UNKNOWN.
+ */
+void find_cpu_mode(void){
+ if(Cpu_mode != UNKNOWN)
+ return;
+#ifdef __VEC__
+ {
+ /* Ask the OS if we have Altivec support */
+ int selectors[2] = { CTL_HW, HW_VECTORUNIT };
+ int hasVectorUnit = 0;
+ size_t length = sizeof(hasVectorUnit);
+ int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0);
+ /* Altivec only when the query succeeded and reported a vector unit */
+ if(0 == error && hasVectorUnit)
+ Cpu_mode = ALTIVEC;
+ else
+ Cpu_mode = PORT;
+ }
+ /* NOTE(review): presumably the fallback for builds without __VEC__; no
+ * #else/#endif is visible in this view, so confirm it is not unconditional */
+ Cpu_mode = PORT;
+ fprintf(stderr,"SIMD CPU detect: %s\n",Cpu_modes[Cpu_mode]);
diff --git a/cpu_mode_x86.c b/cpu_mode_x86.c
new file mode 100644
index 0000000..322018e
--- /dev/null
+++ b/cpu_mode_x86.c
@@ -0,0 +1,33 @@
+/* Determine CPU support for SIMD
+ * Copyright 2004 Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include "fec.h"
+/* Various SIMD instruction set names */
+char *Cpu_modes[] = {"Unknown","Portable C","x86 Multi Media Extensions (MMX)",
+ "x86 Streaming SIMD Extensions (SSE)",
+ "x86 Streaming SIMD Extensions 2 (SSE2)",
+ "PowerPC G4/G5 Altivec/Velocity Engine"};
+enum cpu_mode Cpu_mode;
+/* Choose the most capable SIMD mode reported by cpu_features(), store it in
+ * Cpu_mode and print its name on stderr. Does nothing once Cpu_mode has been
+ * set to anything other than UNKNOWN.
+ */
+void find_cpu_mode(void){
+  int features;
+
+  /* Already decided on an earlier call */
+  if(Cpu_mode != UNKNOWN)
+    return;
+
+  /* Probe the feature word from most capable to least capable */
+  features = cpu_features();
+  if((features & (1<<26)) != 0)
+    Cpu_mode = SSE2;   /* bit 26: SSE2 */
+  else if((features & (1<<25)) != 0)
+    Cpu_mode = SSE;    /* bit 25: SSE */
+  else if((features & (1<<23)) != 0)
+    Cpu_mode = MMX;    /* bit 23: MMX */
+  else
+    Cpu_mode = PORT;   /* no SIMD at all */
+
+  fprintf(stderr,"SIMD CPU detect: %s\n",Cpu_modes[Cpu_mode]);
diff --git a/decode_rs.c b/decode_rs.c
new file mode 100644
index 0000000..d7f97b3
--- /dev/null
+++ b/decode_rs.c
@@ -0,0 +1,262 @@
+/* Reed-Solomon decoder
+ * Copyright 2002 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#ifdef DEBUG
+#include <stdio.h>
+#include <string.h>
+#define NULL ((void *)0)
+#define min(a,b) ((a) < (b) ? (a) : (b))
+#ifdef FIXED
+#include "fixed.h"
+#elif defined(BIGSYM)
+#include "int.h"
+#include "char.h"
+#ifdef FIXED
+data_t *data, int *eras_pos, int no_eras,int pad){
+void *p,data_t *data, int *eras_pos, int no_eras){
+ struct rs *rs = (struct rs *)p;
+ int deg_lambda, el, deg_omega;
+ int i, j, r,k;
+ data_t u,q,tmp,num1,num2,den,discr_r;
+ data_t lambda[NROOTS+1], s[NROOTS]; /* Err+Eras Locator poly
+ * and syndrome poly */
+ data_t b[NROOTS+1], t[NROOTS+1], omega[NROOTS+1];
+ data_t root[NROOTS], reg[NROOTS+1], loc[NROOTS];
+ int syn_error, count;
+#ifdef FIXED
+ /* Check pad parameter for validity */
+ if(pad < 0 || pad >= NN)
+ return -1;
+ /* form the syndromes; i.e., evaluate data(x) at roots of g(x) */
+ for(i=0;i<NROOTS;i++)
+ s[i] = data[0];
+ for(j=1;j<NN-PAD;j++){
+ for(i=0;i<NROOTS;i++){
+ if(s[i] == 0){
+ s[i] = data[j];
+ } else {
+ s[i] = data[j] ^ ALPHA_TO[MODNN(INDEX_OF[s[i]] + (FCR+i)*PRIM)];
+ }
+ }
+ }
+ /* Convert syndromes to index form, checking for nonzero condition */
+ syn_error = 0;
+ for(i=0;i<NROOTS;i++){
+ syn_error |= s[i];
+ s[i] = INDEX_OF[s[i]];
+ }
+ if (!syn_error) {
+ /* if syndrome is zero, data[] is a codeword and there are no
+ * errors to correct. So return data[] unmodified
+ */
+ count = 0;
+ goto finish;
+ }
+ memset(&lambda[1],0,NROOTS*sizeof(lambda[0]));
+ lambda[0] = 1;
+ if (no_eras > 0) {
+ /* Init lambda to be the erasure locator polynomial */
+ lambda[1] = ALPHA_TO[MODNN(PRIM*(NN-1-eras_pos[0]))];
+ for (i = 1; i < no_eras; i++) {
+ u = MODNN(PRIM*(NN-1-eras_pos[i]));
+ for (j = i+1; j > 0; j--) {
+ tmp = INDEX_OF[lambda[j - 1]];
+ if(tmp != A0)
+ lambda[j] ^= ALPHA_TO[MODNN(u + tmp)];
+ }
+ }
+#if DEBUG >= 1
+ /* Test code that verifies the erasure locator polynomial just constructed
+ Needed only for decoder debugging. */
+ /* find roots of the erasure location polynomial */
+ for(i=1;i<=no_eras;i++)
+ reg[i] = INDEX_OF[lambda[i]];
+ count = 0;
+ for (i = 1,k=IPRIM-1; i <= NN; i++,k = MODNN(k+IPRIM)) {
+ q = 1;
+ for (j = 1; j <= no_eras; j++)
+ if (reg[j] != A0) {
+ reg[j] = MODNN(reg[j] + j);
+ q ^= ALPHA_TO[reg[j]];
+ }
+ if (q != 0)
+ continue;
+ /* store root and error location number indices */
+ root[count] = i;
+ loc[count] = k;
+ count++;
+ }
+ if (count != no_eras) {
+ printf("count = %d no_eras = %d\n lambda(x) is WRONG\n",count,no_eras);
+ count = -1;
+ goto finish;
+ }
+#if DEBUG >= 2
+ printf("\n Erasure positions as determined by roots of Eras Loc Poly:\n");
+ for (i = 0; i < count; i++)
+ printf("%d ", loc[i]);
+ printf("\n");
+ }
+ for(i=0;i<NROOTS+1;i++)
+ b[i] = INDEX_OF[lambda[i]];
+ /*
+ * Begin Berlekamp-Massey algorithm to determine error+erasure
+ * locator polynomial
+ */
+ r = no_eras;
+ el = no_eras;
+ while (++r <= NROOTS) { /* r is the step number */
+ /* Compute discrepancy at the r-th step in poly-form */
+ discr_r = 0;
+ for (i = 0; i < r; i++){
+ if ((lambda[i] != 0) && (s[r-i-1] != A0)) {
+ discr_r ^= ALPHA_TO[MODNN(INDEX_OF[lambda[i]] + s[r-i-1])];
+ }
+ }
+ discr_r = INDEX_OF[discr_r]; /* Index form */
+ if (discr_r == A0) {
+ /* 2 lines below: B(x) <-- x*B(x) */
+ memmove(&b[1],b,NROOTS*sizeof(b[0]));
+ b[0] = A0;
+ } else {
+ /* 7 lines below: T(x) <-- lambda(x) - discr_r*x*b(x) */
+ t[0] = lambda[0];
+ for (i = 0 ; i < NROOTS; i++) {
+ if(b[i] != A0)
+ t[i+1] = lambda[i+1] ^ ALPHA_TO[MODNN(discr_r + b[i])];
+ else
+ t[i+1] = lambda[i+1];
+ }
+ if (2 * el <= r + no_eras - 1) {
+ el = r + no_eras - el;
+ /*
+ * 2 lines below: B(x) <-- inv(discr_r) *
+ * lambda(x)
+ */
+ for (i = 0; i <= NROOTS; i++)
+ b[i] = (lambda[i] == 0) ? A0 : MODNN(INDEX_OF[lambda[i]] - discr_r + NN);
+ } else {
+ /* 2 lines below: B(x) <-- x*B(x) */
+ memmove(&b[1],b,NROOTS*sizeof(b[0]));
+ b[0] = A0;
+ }
+ memcpy(lambda,t,(NROOTS+1)*sizeof(t[0]));
+ }
+ }
+ /* Convert lambda to index form and compute deg(lambda(x)) */
+ deg_lambda = 0;
+ for(i=0;i<NROOTS+1;i++){
+ lambda[i] = INDEX_OF[lambda[i]];
+ if(lambda[i] != A0)
+ deg_lambda = i;
+ }
+ /* Find roots of the error+erasure locator polynomial by Chien search */
+ memcpy(&reg[1],&lambda[1],NROOTS*sizeof(reg[0]));
+ count = 0; /* Number of roots of lambda(x) */
+ for (i = 1,k=IPRIM-1; i <= NN; i++,k = MODNN(k+IPRIM)) {
+ q = 1; /* lambda[0] is always 0 */
+ for (j = deg_lambda; j > 0; j--){
+ if (reg[j] != A0) {
+ reg[j] = MODNN(reg[j] + j);
+ q ^= ALPHA_TO[reg[j]];
+ }
+ }
+ if (q != 0)
+ continue; /* Not a root */
+ /* store root (index-form) and error location number */
+#if DEBUG>=2
+ printf("count %d root %d loc %d\n",count,i,k);
+ root[count] = i;
+ loc[count] = k;
+ /* If we've already found max possible roots,
+ * abort the search to save time
+ */
+ if(++count == deg_lambda)
+ break;
+ }
+ if (deg_lambda != count) {
+ /*
+ * deg(lambda) unequal to number of roots => uncorrectable
+ * error detected
+ */
+ count = -1;
+ goto finish;
+ }
+ /*
+ * Compute err+eras evaluator poly omega(x) = s(x)*lambda(x) (modulo
+ * x**NROOTS). in index form. Also find deg(omega).
+ */
+ deg_omega = deg_lambda-1;
+ for (i = 0; i <= deg_omega;i++){
+ tmp = 0;
+ for(j=i;j >= 0; j--){
+ if ((s[i - j] != A0) && (lambda[j] != A0))
+ tmp ^= ALPHA_TO[MODNN(s[i - j] + lambda[j])];
+ }
+ omega[i] = INDEX_OF[tmp];
+ }
+ /*
+ * Compute error values in poly-form. num1 = omega(inv(X(l))), num2 =
+ * inv(X(l))**(FCR-1) and den = lambda_pr(inv(X(l))) all in poly-form
+ */
+ for (j = count-1; j >=0; j--) {
+ num1 = 0;
+ for (i = deg_omega; i >= 0; i--) {
+ if (omega[i] != A0)
+ num1 ^= ALPHA_TO[MODNN(omega[i] + i * root[j])];
+ }
+ num2 = ALPHA_TO[MODNN(root[j] * (FCR - 1) + NN)];
+ den = 0;
+ /* lambda[i+1] for i even is the formal derivative lambda_pr of lambda[i] */
+ for (i = min(deg_lambda,NROOTS-1) & ~1; i >= 0; i -=2) {
+ if(lambda[i+1] != A0)
+ den ^= ALPHA_TO[MODNN(lambda[i+1] + i * root[j])];
+ }
+#if DEBUG >= 1
+ if (den == 0) {
+ printf("\n ERROR: denominator = 0\n");
+ count = -1;
+ goto finish;
+ }
+ /* Apply error to data */
+ if (num1 != 0 && loc[j] >= PAD) {
+ data[loc[j]-PAD] ^= ALPHA_TO[MODNN(INDEX_OF[num1] + INDEX_OF[num2] + NN - INDEX_OF[den])];
+ }
+ }
+ finish:
+ if(eras_pos != NULL){
+ for(i=0;i<count;i++)
+ eras_pos[i] = loc[i];
+ }
+ return count;
diff --git a/decode_rs.h b/decode_rs.h
new file mode 100644
index 0000000..c165cf3
--- /dev/null
+++ b/decode_rs.h
@@ -0,0 +1,298 @@
+/* The guts of the Reed-Solomon decoder, meant to be #included
+ * into a function body with the following typedefs, macros and variables supplied
+ * according to the code parameters:
+ * data_t - a typedef for the data symbol
+ * data_t data[] - array of NN data and parity symbols to be corrected in place
+ * retval - an integer lvalue into which the decoder's return code is written
+ * NROOTS - the number of roots in the RS code generator polynomial,
+ * which is the same as the number of parity symbols in a block.
+ Integer variable or literal.
+ * NN - the total number of symbols in a RS block. Integer variable or literal.
+ * PAD - the number of pad symbols in a block. Integer variable or literal.
+ * ALPHA_TO - The address of an array of NN elements to convert Galois field
+ * elements in index (log) form to polynomial form. Read only.
+ * INDEX_OF - The address of an array of NN elements to convert Galois field
+ * elements in polynomial form to index (log) form. Read only.
+ * MODNN - a function to reduce its argument modulo NN. May be inline or a macro.
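+ * FCR - The first consecutive root of the Reed-Solomon generator polynomial.
+ * Integer variable or literal.
+ * PRIM - The primitive root of the generator poly. Integer variable or literal.
+ * IPRIM - Step used by the Chien search to walk the error location numbers
+ * (k starts at IPRIM-1 and advances by IPRIM modulo NN). Integer variable or
+ * literal. Required, although the #if tests below do not check for it.
+ * eras_pos - int array of erasure positions; read when no_eras > 0 and, unless
+ * it is NULL, overwritten on exit with the locations the decoder found.
+ * no_eras - the number of entries of eras_pos to treat as erasures.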
+ * DEBUG - If set to 1 or more, do various internal consistency checking. Leave this
+ * undefined for production code
+ * The memset(), memmove(), and memcpy() functions are used. The appropriate header
+ * file declaring these functions (usually <string.h>) must be included by the calling
+ * program.
+ */
+#if !defined(NROOTS)
+#error "NROOTS not defined"
+#if !defined(NN)
+#error "NN not defined"
+#if !defined(PAD)
+#error "PAD not defined"
+#if !defined(ALPHA_TO)
+#error "ALPHA_TO not defined"
+#if !defined(INDEX_OF)
+#error "INDEX_OF not defined"
+#if !defined(MODNN)
+#error "MODNN not defined"
+#if !defined(FCR)
+#error "FCR not defined"
+#if !defined(PRIM)
+#error "PRIM not defined"
+#if !defined(NULL)
+#define NULL ((void *)0)
+#undef MIN
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#undef A0
+#define A0 (NN)
+ int deg_lambda, el, deg_omega;
+ int i, j, r,k;
+ data_t u,q,tmp,num1,num2,den,discr_r;
+ data_t lambda[NROOTS+1], s[NROOTS]; /* Err+Eras Locator poly
+ * and syndrome poly */
+ data_t b[NROOTS+1], t[NROOTS+1], omega[NROOTS+1];
+ data_t root[NROOTS], reg[NROOTS+1], loc[NROOTS];
+ int syn_error, count;
+ /* form the syndromes; i.e., evaluate data(x) at roots of g(x) */
+ for(i=0;i<NROOTS;i++)
+ s[i] = data[0];
+ for(j=1;j<NN-PAD;j++){
+ for(i=0;i<NROOTS;i++){
+ if(s[i] == 0){
+ s[i] = data[j];
+ } else {
+ s[i] = data[j] ^ ALPHA_TO[MODNN(INDEX_OF[s[i]] + (FCR+i)*PRIM)];
+ }
+ }
+ }
+ /* Convert syndromes to index form, checking for nonzero condition */
+ syn_error = 0;
+ for(i=0;i<NROOTS;i++){
+ syn_error |= s[i];
+ s[i] = INDEX_OF[s[i]];
+ }
+ if (!syn_error) {
+ /* if syndrome is zero, data[] is a codeword and there are no
+ * errors to correct. So return data[] unmodified
+ */
+ count = 0;
+ goto finish;
+ }
+ memset(&lambda[1],0,NROOTS*sizeof(lambda[0]));
+ lambda[0] = 1;
+ if (no_eras > 0) {
+ /* Init lambda to be the erasure locator polynomial */
+ lambda[1] = ALPHA_TO[MODNN(PRIM*(NN-1-eras_pos[0]))];
+ for (i = 1; i < no_eras; i++) {
+ u = MODNN(PRIM*(NN-1-eras_pos[i]));
+ for (j = i+1; j > 0; j--) {
+ tmp = INDEX_OF[lambda[j - 1]];
+ if(tmp != A0)
+ lambda[j] ^= ALPHA_TO[MODNN(u + tmp)];
+ }
+ }
+#if DEBUG >= 1
+ /* Test code that verifies the erasure locator polynomial just constructed
+ Needed only for decoder debugging. */
+ /* find roots of the erasure location polynomial */
+ for(i=1;i<=no_eras;i++)
+ reg[i] = INDEX_OF[lambda[i]];
+ count = 0;
+ for (i = 1,k=IPRIM-1; i <= NN; i++,k = MODNN(k+IPRIM)) {
+ q = 1;
+ for (j = 1; j <= no_eras; j++)
+ if (reg[j] != A0) {
+ reg[j] = MODNN(reg[j] + j);
+ q ^= ALPHA_TO[reg[j]];
+ }
+ if (q != 0)
+ continue;
+ /* store root and error location number indices */
+ root[count] = i;
+ loc[count] = k;
+ count++;
+ }
+ if (count != no_eras) {
+ printf("count = %d no_eras = %d\n lambda(x) is WRONG\n",count,no_eras);
+ count = -1;
+ goto finish;
+ }
+#if DEBUG >= 2
+ printf("\n Erasure positions as determined by roots of Eras Loc Poly:\n");
+ for (i = 0; i < count; i++)
+ printf("%d ", loc[i]);
+ printf("\n");
+ }
+ for(i=0;i<NROOTS+1;i++)
+ b[i] = INDEX_OF[lambda[i]];
+ /*
+ * Begin Berlekamp-Massey algorithm to determine error+erasure
+ * locator polynomial
+ */
+ r = no_eras;
+ el = no_eras;
+ while (++r <= NROOTS) { /* r is the step number */
+ /* Compute discrepancy at the r-th step in poly-form */
+ discr_r = 0;
+ for (i = 0; i < r; i++){
+ if ((lambda[i] != 0) && (s[r-i-1] != A0)) {
+ discr_r ^= ALPHA_TO[MODNN(INDEX_OF[lambda[i]] + s[r-i-1])];
+ }
+ }
+ discr_r = INDEX_OF[discr_r]; /* Index form */
+ if (discr_r == A0) {
+ /* 2 lines below: B(x) <-- x*B(x) */
+ memmove(&b[1],b,NROOTS*sizeof(b[0]));
+ b[0] = A0;
+ } else {
+ /* 7 lines below: T(x) <-- lambda(x) - discr_r*x*b(x) */
+ t[0] = lambda[0];
+ for (i = 0 ; i < NROOTS; i++) {
+ if(b[i] != A0)
+ t[i+1] = lambda[i+1] ^ ALPHA_TO[MODNN(discr_r + b[i])];
+ else
+ t[i+1] = lambda[i+1];
+ }
+ if (2 * el <= r + no_eras - 1) {
+ el = r + no_eras - el;
+ /*
+ * 2 lines below: B(x) <-- inv(discr_r) *
+ * lambda(x)
+ */
+ for (i = 0; i <= NROOTS; i++)
+ b[i] = (lambda[i] == 0) ? A0 : MODNN(INDEX_OF[lambda[i]] - discr_r + NN);
+ } else {
+ /* 2 lines below: B(x) <-- x*B(x) */
+ memmove(&b[1],b,NROOTS*sizeof(b[0]));
+ b[0] = A0;
+ }
+ memcpy(lambda,t,(NROOTS+1)*sizeof(t[0]));
+ }
+ }
+ /* Convert lambda to index form and compute deg(lambda(x)) */
+ deg_lambda = 0;
+ for(i=0;i<NROOTS+1;i++){
+ lambda[i] = INDEX_OF[lambda[i]];
+ if(lambda[i] != A0)
+ deg_lambda = i;
+ }
+ /* Find roots of the error+erasure locator polynomial by Chien search */
+ memcpy(&reg[1],&lambda[1],NROOTS*sizeof(reg[0]));
+ count = 0; /* Number of roots of lambda(x) */
+ for (i = 1,k=IPRIM-1; i <= NN; i++,k = MODNN(k+IPRIM)) {
+ q = 1; /* lambda[0] is always 0 */
+ for (j = deg_lambda; j > 0; j--){
+ if (reg[j] != A0) {
+ reg[j] = MODNN(reg[j] + j);
+ q ^= ALPHA_TO[reg[j]];
+ }
+ }
+ if (q != 0)
+ continue; /* Not a root */
+ /* store root (index-form) and error location number */
+#if DEBUG>=2
+ printf("count %d root %d loc %d\n",count,i,k);
+ root[count] = i;
+ loc[count] = k;
+ /* If we've already found max possible roots,
+ * abort the search to save time
+ */
+ if(++count == deg_lambda)
+ break;
+ }
+ if (deg_lambda != count) {
+ /*
+ * deg(lambda) unequal to number of roots => uncorrectable
+ * error detected
+ */
+ count = -1;
+ goto finish;
+ }
+ /*
+ * Compute err+eras evaluator poly omega(x) = s(x)*lambda(x) (modulo
+ * x**NROOTS). in index form. Also find deg(omega).
+ */
+ deg_omega = deg_lambda-1;
+ for (i = 0; i <= deg_omega;i++){
+ tmp = 0;
+ for(j=i;j >= 0; j--){
+ if ((s[i - j] != A0) && (lambda[j] != A0))
+ tmp ^= ALPHA_TO[MODNN(s[i - j] + lambda[j])];
+ }
+ omega[i] = INDEX_OF[tmp];
+ }
+ /*
+ * Compute error values in poly-form. num1 = omega(inv(X(l))), num2 =
+ * inv(X(l))**(FCR-1) and den = lambda_pr(inv(X(l))) all in poly-form
+ */
+ for (j = count-1; j >=0; j--) {
+ num1 = 0;
+ for (i = deg_omega; i >= 0; i--) {
+ if (omega[i] != A0)
+ num1 ^= ALPHA_TO[MODNN(omega[i] + i * root[j])];
+ }
+ num2 = ALPHA_TO[MODNN(root[j] * (FCR - 1) + NN)];
+ den = 0;
+ /* lambda[i+1] for i even is the formal derivative lambda_pr of lambda[i] */
+ for (i = MIN(deg_lambda,NROOTS-1) & ~1; i >= 0; i -=2) {
+ if(lambda[i+1] != A0)
+ den ^= ALPHA_TO[MODNN(lambda[i+1] + i * root[j])];
+ }
+#if DEBUG >= 1
+ if (den == 0) {
+ printf("\n ERROR: denominator = 0\n");
+ count = -1;
+ goto finish;
+ }
+ /* Apply error to data */
+ if (num1 != 0 && loc[j] >= PAD) {
+ data[loc[j]-PAD] ^= ALPHA_TO[MODNN(INDEX_OF[num1] + INDEX_OF[num2] + NN - INDEX_OF[den])];
+ }
+ }
+ finish:
+ if(eras_pos != NULL){
+ for(i=0;i<count;i++)
+ eras_pos[i] = loc[i];
+ }
+ retval = count;
diff --git a/decode_rs_8.c b/decode_rs_8.c
new file mode 100644
index 0000000..995b0d9
--- /dev/null
+++ b/decode_rs_8.c
@@ -0,0 +1,24 @@
+/* General purpose Reed-Solomon decoder for 8-bit symbols or less
+ * Copyright 2003 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#ifdef DEBUG
+#include <stdio.h>
+#include <string.h>
+#include "fixed.h"
+/* Decode one Reed-Solomon block in place using the decoder body from
+ * decode_rs.h (code parameters presumably come from fixed.h - confirm).
+ * Returns -1 without touching data when pad is outside 0..222; otherwise
+ * returns the value the included decoder body stores in retval.
+ */
+int decode_rs_8(data_t *data, int *eras_pos, int no_eras, int pad){
+ int retval;
+ if(pad < 0 || pad > 222){
+ return -1;
+ }
+ /* The decoder is textually included here; it uses data, eras_pos and
+ * no_eras from this scope and assigns retval */
+#include "decode_rs.h"
+ return retval;
diff --git a/decode_rs_ccsds.c b/decode_rs_ccsds.c
new file mode 100644
index 0000000..0e246b4
--- /dev/null
+++ b/decode_rs_ccsds.c
@@ -0,0 +1,26 @@
+/* This function wraps around the fixed 8-bit decoder, performing the
+ * basis transformations necessary to meet the CCSDS standard
+ *
+ * Copyright 2002, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include "ccsds.h"
+#include "fec.h"
+/* Decode a CCSDS dual-basis Reed-Solomon block in place.
+ *
+ * data     - NN-pad symbols in dual-basis form; rewritten only when the
+ *            decoder reports corrections
+ * eras_pos - erasure positions, passed straight through to decode_rs_8()
+ * no_eras  - number of erasures, passed straight through to decode_rs_8()
+ * pad      - number of pad symbols in the block
+ *
+ * Returns the result of decode_rs_8(), or -1 if pad is out of range.
+ */
+int decode_rs_ccsds(data_t *data,int *eras_pos,int no_eras,int pad){
+  int i,r;
+  data_t cdata[NN];
+
+  /* Validate pad before it is used as a loop bound: a negative value would
+   * run the conversion loop past the end of cdata[]. The accepted range
+   * (0..NN-NROOTS-1, i.e. 0..222) is the one decode_rs_8() enforces. */
+  if(pad < 0 || pad > NN-NROOTS-1)
+    return -1;
+
+  /* Convert data from dual basis to conventional */
+  for(i=0;i<NN-pad;i++)
+    cdata[i] = Tal1tab[data[i]];
+
+  r = decode_rs_8(cdata,eras_pos,no_eras,pad);
+
+  if(r > 0){
+    /* Convert from conventional to dual basis */
+    for(i=0;i<NN-pad;i++)
+      data[i] = Taltab[cdata[i]];
+  }
+  return r;
diff --git a/decode_rs_char.c b/decode_rs_char.c
new file mode 100644
index 0000000..7105233
--- /dev/null
+++ b/decode_rs_char.c
@@ -0,0 +1,22 @@
+/* General purpose Reed-Solomon decoder for 8-bit symbols or less
+ * Copyright 2003 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#ifdef DEBUG
+#include <stdio.h>
+#include <string.h>
+#include "char.h"
+#include "rs-common.h"
+/* Decode one block in place with the codec whose control structure is passed
+ * as p. The decoder body is textually included from decode_rs.h and leaves
+ * its result in retval, which is returned unchanged.
+ */
+int decode_rs_char(void *p, data_t *data, int *eras_pos, int no_eras){
+ int retval;
+ /* The char.h macros (NN, NROOTS, ALPHA_TO, ...) read through this pointer */
+ struct rs *rs = (struct rs *)p;
+#include "decode_rs.h"
+ return retval;
diff --git a/decode_rs_int.c b/decode_rs_int.c
new file mode 100644
index 0000000..1ef1a1f
--- /dev/null
+++ b/decode_rs_int.c
@@ -0,0 +1,22 @@
+/* General purpose Reed-Solomon decoder
+ * Copyright 2003 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#ifdef DEBUG
+#include <stdio.h>
+#include <string.h>
+#include "int.h"
+#include "rs-common.h"
+/* Decode one block in place with the codec whose control structure is passed
+ * as p. The decoder body is textually included from decode_rs.h and leaves
+ * its result in retval, which is returned unchanged.
+ */
+int decode_rs_int(void *p, data_t *data, int *eras_pos, int no_eras){
+ int retval;
+ /* presumably the int.h macros read through this pointer, as the char.h
+ * ones do - int.h is not visible here, confirm */
+ struct rs *rs = (struct rs *)p;
+#include "decode_rs.h"
+ return retval;
diff --git a/dotprod.c b/dotprod.c
new file mode 100644
index 0000000..b3be913
--- /dev/null
+++ b/dotprod.c
@@ -0,0 +1,94 @@
+/* 16-bit signed integer dot product
+ * Switch to appropriate versions
+ * Copyright 2004 Phil Karn
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdlib.h>
+#include "fec.h"
+void *initdp_port(signed short coeffs[],int len);
+long dotprod_port(void *p,signed short *b);
+void freedp_port(void *p);
+#ifdef __i386__
+void *initdp_mmx(signed short coeffs[],int len);
+void *initdp_sse2(signed short coeffs[],int len);
+long dotprod_mmx(void *p,signed short *b);
+long dotprod_sse2(void *p,signed short *b);
+void freedp_mmx(void *p);
+void freedp_sse2(void *p);
+#ifdef __VEC__
+void *initdp_av(signed short coeffs[],int len);
+long dotprod_av(void *p,signed short *b);
+void freedp_av(void *p);
+/* Create and return a descriptor for use with the dot product function.
+ * Runs find_cpu_mode() first, then passes coeffs and len to the initdp_*
+ * variant selected by Cpu_mode and returns that variant's result.
+ */
+void *initdp(signed short coeffs[],int len){
+ find_cpu_mode();
+ switch(Cpu_mode){
+ case PORT:
+ default:
+ /* Portable C version; also used for any mode not listed below */
+ return initdp_port(coeffs,len);
+#ifdef __i386__
+ case MMX:
+ case SSE:
+ /* SSE shares the MMX implementation */
+ return initdp_mmx(coeffs,len);
+ case SSE2:
+ return initdp_sse2(coeffs,len);
+#ifdef __VEC__
+ case ALTIVEC:
+ return initdp_av(coeffs,len);
+ }
+/* Free a dot product descriptor created earlier.
+ * Dispatches on Cpu_mode to the matching freedp_* routine; the portable
+ * routine handles PORT and any mode not listed.
+ */
+void freedp(void *p){
+  switch(Cpu_mode){
+  case PORT:
+  default:
+    /* This case used to have no body, so it fell through into whichever
+     * SIMD free routine was compiled in next, or freed nothing at all when
+     * none was. */
+    freedp_port(p);
+    break;
+#ifdef __i386__
+  case MMX:
+  case SSE:
+    /* SSE shares the MMX implementation */
+    freedp_mmx(p);
+    break;
+  case SSE2:
+    freedp_sse2(p);
+    break;
+#ifdef __VEC__
+  case ALTIVEC:
+    freedp_av(p);
+    break;
+  }
+/* Compute a dot product given a descriptor and an input array
+ * The length is taken from the descriptor
+ *
+ * Dispatches on Cpu_mode to the matching dotprod_* routine and returns its
+ * result. find_cpu_mode() is not called here; initdp() calls it when a
+ * descriptor is created.
+ */
+long dotprod(void *p,signed short a[]){
+ switch(Cpu_mode){
+ case PORT:
+ default:
+ /* Portable C version; also used for any mode not listed below */
+ return dotprod_port(p,a);
+#ifdef __i386__
+ case MMX:
+ case SSE:
+ /* SSE shares the MMX implementation */
+ return dotprod_mmx(p,a);
+ case SSE2:
+ return dotprod_sse2(p,a);
+#ifdef __VEC__
+ case ALTIVEC:
+ return dotprod_av(p,a);
+ }
diff --git a/dotprod.h b/dotprod.h
new file mode 100644
index 0000000..6b62b70
--- /dev/null
+++ b/dotprod.h
@@ -0,0 +1,15 @@
+/* Internal definitions for dotproduct function */
+struct dotprod {
+ int len; /* Number of coefficients */
+ /* On a MMX or SSE machine, these hold 4 copies of the coefficients,
+ * preshifted by 0,1,2,3 words to meet all possible input data
+ * alignments (see Intel ap559 on MMX dot products).
+ *
+ * SSE2 is similar, but with 8 words at a time
+ *
+ * On a non-MMX machine, only one copy is present
+ */
+ signed short *coeffs[8];
diff --git a/dotprod_av.c b/dotprod_av.c
new file mode 100644
index 0000000..1f70471
--- /dev/null
+++ b/dotprod_av.c
@@ -0,0 +1,93 @@
+/* 16-bit signed integer dot product
+ * Altivec-assisted version
+ * Copyright 2004 Phil Karn
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdlib.h>
+#include "fec.h"
+struct dotprod {
+ int len; /* Number of coefficients */
+ /* On an Altivec machine, these hold 8 copies of the coefficients,
+ * preshifted by 0,1,..7 words to meet all possible input data
+ */
+ signed short *coeffs[8];
+/* Create and return a descriptor for use with the dot product function.
+ *
+ * coeffs - array of len coefficients; they are copied, so the caller keeps
+ *          ownership of the array
+ * len    - number of coefficients
+ *
+ * Returns a handle for dotprod_av()/freedp_av(), or NULL when len is not
+ * positive or memory cannot be allocated.
+ */
+void *initdp_av(signed short coeffs[],int len){
+  struct dotprod *dp;
+  int i,j;
+  if(len <= 0)
+    return NULL;
+  dp = (struct dotprod *)calloc(1,sizeof(struct dotprod));
+  if(dp == NULL)
+    return NULL;
+  dp->len = len;
+  /* Make 8 copies of coefficients, one for each data alignment.
+   * Copy i is preceded by i zero words; calloc() supplies the zero padding
+   * before and after the coefficients.
+   * NOTE(review): dotprod_av() reads these through vector pointers, which
+   * presumes calloc() returns 16-byte aligned storage -- confirm for the
+   * target platform.
+   */
+  for(i=0;i<8;i++){
+    dp->coeffs[i] = calloc(1+(len+i-1)/8,sizeof(vector signed short));
+    if(dp->coeffs[i] == NULL){
+      /* Out of memory: release the copies made so far and give up */
+      while(i-- > 0)
+        free(dp->coeffs[i]);
+      free(dp);
+      return NULL;
+    }
+    for(j=0;j<len;j++)
+      dp->coeffs[i][j+i] = coeffs[j];
+  }
+  return (void *)dp;
+/* Free a dot product descriptor created earlier by initdp_av().
+ * A NULL handle (as returned by initdp_av() on failure) is ignored.
+ */
+void freedp_av(void *p){
+  struct dotprod *dp = (struct dotprod *)p;
+  int i;
+  if(dp == NULL)
+    return;
+  /* free(NULL) is a no-op, so unused slots need no special casing */
+  for(i=0;i<8;i++)
+    free(dp->coeffs[i]);
+  free(dp);
+/* Compute a dot product given a descriptor and an input array
+ * The length is taken from the descriptor
+ *
+ * p - descriptor; it is used as a struct dotprod holding 8 pre-shifted
+ *     coefficient arrays
+ * a - input samples; need not be 16-byte aligned
+ *
+ * The input address is rounded down to a 16-byte boundary and the word
+ * offset al (0..7) selects the coefficient copy with the same shift.
+ * Blocks of 8 words are accumulated with vec_msums() into 32-bit partial
+ * sums, four blocks per pass while at least four remain, then one at a
+ * time. The partial sums are combined with vec_sums() and element 3 of the
+ * result is returned.
+ *
+ * NOTE(review): the address is converted through int, which presumes a
+ * pointer fits in an int on the target -- confirm.
+ */
+long dotprod_av(void *p,signed short a[]){
+ struct dotprod *dp = (struct dotprod *)p;
+ int al;
+ vector signed short *ar,*d;
+ vector signed int sums0,sums1,sums2,sums3;
+ union { vector signed int v; signed int w[4];} s;
+ int nblocks;
+ /* round ar down to beginning of 16-byte block containing 0th element of
+ * input buffer. Then set d to one of 8 sets of shifted coefficients
+ */
+ ar = (vector signed short *)((int)a & ~15);
+ al = ((int)a & 15)/sizeof(signed short);
+ d = (vector signed short *)dp->coeffs[al];
+ nblocks = (dp->len+al-1)/8+1;
+ /* Sum into four vectors each holding four 32-bit partial sums */
+ sums3 = sums2 = sums1 = sums0 = (vector signed int)(0);
+ while(nblocks >= 4){
+ sums0 = vec_msums(ar[nblocks-1],d[nblocks-1],sums0);
+ sums1 = vec_msums(ar[nblocks-2],d[nblocks-2],sums1);
+ sums2 = vec_msums(ar[nblocks-3],d[nblocks-3],sums2);
+ sums3 = vec_msums(ar[nblocks-4],d[nblocks-4],sums3);
+ nblocks -= 4;
+ }
+ sums0 = vec_adds(sums0,sums1);
+ sums2 = vec_adds(sums2,sums3);
+ sums0 = vec_adds(sums0,sums2);
+ while(nblocks-- > 0){
+ sums0 = vec_msums(ar[nblocks],d[nblocks],sums0);
+ }
+ /* Sum 4 partial sums into final result */
+ s.v = vec_sums(sums0,(vector signed int)(0));
+ return s.w[3];
diff --git a/dotprod_mmx.c b/dotprod_mmx.c
new file mode 100644
index 0000000..c516afe
--- /dev/null
+++ b/dotprod_mmx.c
@@ -0,0 +1,81 @@
+/* 16-bit signed integer dot product
+ * MMX assisted version; also for SSE
+ *
+ * Copyright 2004 Phil Karn
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdlib.h>
+#include "fec.h"
+struct dotprod {
+ int len; /* Number of coefficients */
+ /* On a MMX or SSE machine, these hold 4 copies of the coefficients,
+ * preshifted by 0,1,2,3 words to meet all possible input data
+ * alignments (see Intel ap559 on MMX dot products).
+ */
+ signed short *coeffs[4];
+long dotprod_mmx_assist(signed short *a,signed short *b,int cnt);
+/* Create and return a descriptor for use with the dot product function.
+ *
+ * coeffs - array of len coefficients; they are copied, so the caller keeps
+ *          ownership of the array
+ * len    - number of coefficients
+ *
+ * Returns a handle for dotprod_mmx()/freedp_mmx(), or NULL when len is not
+ * positive or memory cannot be allocated.
+ */
+void *initdp_mmx(signed short coeffs[],int len){
+  struct dotprod *dp;
+  int i,j;
+  if(len <= 0)
+    return NULL;
+  dp = (struct dotprod *)calloc(1,sizeof(struct dotprod));
+  if(dp == NULL)
+    return NULL;
+  dp->len = len;
+  /* Make 4 copies of coefficients, one for each data alignment.
+   * Copy i is preceded by i zero words and rounded up to whole 4-word
+   * blocks; calloc() supplies the zero padding.
+   */
+  for(i=0;i<4;i++){
+    dp->coeffs[i] = (signed short *)calloc(1+(len+i-1)/4,
+                                           4*sizeof(signed short));
+    if(dp->coeffs[i] == NULL){
+      /* Out of memory: release the copies made so far and give up */
+      while(i-- > 0)
+        free(dp->coeffs[i]);
+      free(dp);
+      return NULL;
+    }
+    for(j=0;j<len;j++)
+      dp->coeffs[i][j+i] = coeffs[j];
+  }
+  return (void *)dp;
+/* Free a dot product descriptor created earlier by initdp_mmx().
+ * A NULL handle (as returned by initdp_mmx() on failure) is ignored.
+ */
+void freedp_mmx(void *p){
+  struct dotprod *dp = (struct dotprod *)p;
+  int i;
+  if(dp == NULL)
+    return;
+  /* free(NULL) is a no-op, so unused slots need no special casing */
+  for(i=0;i<4;i++)
+    free(dp->coeffs[i]);
+  free(dp);
+/* Compute a dot product given a descriptor and an input array
+ * The length is taken from the descriptor
+ *
+ * p - descriptor; it is used as a struct dotprod holding 4 pre-shifted
+ *     coefficient arrays
+ * a - input samples; need not be 8-byte aligned
+ *
+ * Returns the value computed by dotprod_mmx_assist() over
+ * (len+al-1)/4+1 blocks of 4 words, where al is the word offset of a[]
+ * from the preceding 8-byte boundary.
+ *
+ * NOTE(review): the address is converted through int, which presumes a
+ * pointer fits in an int on the target -- confirm.
+ */
+long dotprod_mmx(void *p,signed short a[]){
+ struct dotprod *dp = (struct dotprod *)p;
+ int al;
+ signed short *ar;
+ /* Round input data address down to 8 byte boundary
+ * NB: depending on the alignment of a[], memory
+ * before a[] will be accessed. The contents don't matter since they'll
+ * be multiplied by zero coefficients. I can't conceive of any
+ * situation where this could cause a segfault since memory protection
+ * in the x86 machines is done on much larger boundaries
+ */
+ ar = (signed short *)((int)a & ~7);
+ /* Choose one of 4 sets of pre-shifted coefficients. al is both the
+ * index into dp->coeffs[] and the number of 0 words padded onto
+ * that coefficients array for alignment purposes
+ */
+ al = a - ar;
+ /* Call assembler routine to do the work, passing number of 4-word blocks */
+ return dotprod_mmx_assist(ar,dp->coeffs[al],(dp->len+al-1)/4+1);
diff --git a/dotprod_mmx_assist.s b/dotprod_mmx_assist.s
new file mode 100644
index 0000000..25deffd
--- /dev/null
+++ b/dotprod_mmx_assist.s
@@ -0,0 +1,83 @@
+# SIMD MMX dot product
+# Equivalent to the following C code:
+# long dotprod(signed short *a,signed short *b,int cnt)
+# {
+# long sum = 0;
+# cnt *= 4;
+# while(cnt--)
+# sum += *a++ * *b++;
+# return sum;
+# }
+# a and b should also be 64-bit aligned, or speed will suffer greatly
+# Copyright 1999, Phil Karn KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+ .text
+ .global dotprod_mmx_assist
+ .type dotprod_mmx_assist,@function
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %esi
+ pushl %edi
+ pushl %ecx
+ pushl %ebx
+ movl 8(%ebp),%esi # a
+ movl 12(%ebp),%edi # b
+ movl 16(%ebp),%ecx # cnt
+ pxor %mm0,%mm0 # clear running sum (in two 32-bit halves)
+# MMX dot product loop unrolled 4 times, crunching 16 terms per loop
+ .align 16
+.Loop1: subl $4,%ecx
+ jl .Loop1Done
+ movq (%esi),%mm1 # mm1 = a[3],a[2],a[1],a[0]
+ pmaddwd (%edi),%mm1 # mm1 = b[3]*a[3]+b[2]*a[2],b[1]*a[1]+b[0]*a[0]
+ paddd %mm1,%mm0
+ movq 8(%esi),%mm1
+ pmaddwd 8(%edi),%mm1
+ paddd %mm1,%mm0
+ movq 16(%esi),%mm1
+ pmaddwd 16(%edi),%mm1
+ paddd %mm1,%mm0
+ movq 24(%esi),%mm1
+ addl $32,%esi
+ pmaddwd 24(%edi),%mm1
+ addl $32,%edi
+ paddd %mm1,%mm0
+ jmp .Loop1
+ addl $4,%ecx
+# MMX dot product loop, not unrolled, crunching 4 terms per loop
+# This could be redone as Duff's Device on the unrolled loop above
+.Loop2: subl $1,%ecx
+ jl .Loop2Done
+ movq (%esi),%mm1
+ addl $8,%esi
+ pmaddwd (%edi),%mm1
+ addl $8,%edi
+ paddd %mm1,%mm0
+ jmp .Loop2
+ movd %mm0,%ebx # right-hand word to ebx
+ punpckhdq %mm0,%mm0 # left-hand word to right side of %mm0
+ movd %mm0,%eax
+ addl %ebx,%eax # running sum now in %eax
+ emms # done with MMX
+ popl %ebx
+ popl %ecx
+ popl %edi
+ popl %esi
+ movl %ebp,%esp
+ popl %ebp
+ ret
diff --git a/dotprod_port.c b/dotprod_port.c
new file mode 100644
index 0000000..ef635ec
--- /dev/null
+++ b/dotprod_port.c
@@ -0,0 +1,58 @@
+/* 16-bit signed integer dot product
+ * Portable C version
+ * Copyright 2004 Phil Karn
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdlib.h>
+#include "fec.h"
+struct dotprod {
+ int len; /* Number of coefficients */
+ signed short *coeffs;
+/* Create and return a descriptor for use with the dot product function.
+ *
+ * coeffs - array of len coefficients; they are copied, so the caller keeps
+ *          ownership of the array
+ * len    - number of coefficients
+ *
+ * Returns a handle for dotprod_port()/freedp_port(), or NULL when len is
+ * not positive or memory cannot be allocated.
+ */
+void *initdp_port(signed short coeffs[],int len){
+  struct dotprod *dp;
+  int j;
+  if(len <= 0)
+    return NULL;
+  dp = (struct dotprod *)calloc(1,sizeof(struct dotprod));
+  if(dp == NULL)
+    return NULL;
+  /* Just one copy of the coefficients for the C version */
+  dp->coeffs = (signed short *)calloc(len,sizeof(signed short));
+  if(dp->coeffs == NULL){
+    free(dp);
+    return NULL;
+  }
+  dp->len = len;
+  for(j=0;j<len;j++)
+    dp->coeffs[j] = coeffs[j];
+  return (void *)dp;
+/* Free a dot product descriptor created earlier by initdp_port().
+ * A NULL handle (as returned by initdp_port() on failure) is ignored.
+ */
+void freedp_port(void *p){
+  struct dotprod *dp = (struct dotprod *)p;
+  if(dp == NULL)
+    return;
+  /* free(NULL) is a no-op, so the coefficient pointer needs no guard */
+  free(dp->coeffs);
+  free(dp);
+/* Portable dot product: multiply each input sample by the coefficient at
+ * the same index and accumulate the products in a long.
+ * The number of terms is the len field stored in the descriptor.
+ */
+long dotprod_port(void *p,signed short a[]){
+  struct dotprod *desc = (struct dotprod *)p;
+  long acc = 0;
+  int k = 0;
+  while(k < desc->len){
+    acc += (long)a[k] * desc->coeffs[k];
+    k++;
+  }
+  return acc;
diff --git a/dotprod_sse2.c b/dotprod_sse2.c
new file mode 100644
index 0000000..1fddd18
--- /dev/null
+++ b/dotprod_sse2.c
@@ -0,0 +1,72 @@
+/* 16-bit signed integer dot product
+ * SSE2 version
+ * Copyright 2004 Phil Karn
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#define _XOPEN_SOURCE 600
+#include <stdlib.h>
+#include <memory.h>
+#include "fec.h"
+struct dotprod {
+ int len; /* Number of coefficients */
+ /* On a SSE2 machine, these hold 8 copies of the coefficients,
+ * preshifted by 0,1,..7 words to meet all possible input data
+ * alignments (see Intel ap559 on MMX dot products).
+ */
+ signed short *coeffs[8];
+long dotprod_sse2_assist(signed short *a,signed short *b,int cnt);
+/* Create and return a descriptor for use with the dot product function.
+ *
+ * coeffs - array of len coefficients; they are copied, so the caller keeps
+ *          ownership of the array
+ * len    - number of coefficients
+ *
+ * Returns a handle for dotprod_sse2()/freedp_sse2(), or NULL when len is
+ * not positive or memory cannot be allocated.
+ */
+void *initdp_sse2(signed short coeffs[],int len){
+  struct dotprod *dp;
+  int i,j,blksize;
+  void *blk;
+  if(len <= 0)
+    return NULL;
+  dp = (struct dotprod *)calloc(1,sizeof(struct dotprod));
+  if(dp == NULL)
+    return NULL;
+  dp->len = len;
+  /* Make 8 copies of coefficients, one for each data alignment,
+   * each aligned to 16-byte boundary
+   */
+  for(i=0;i<8;i++){
+    blksize = (1+(len+i-1)/8) * 8*sizeof(signed short);
+    blk = NULL;
+    /* posix_memalign() reports failure through its return value and then
+     * leaves the output pointer unspecified, so test it before any use
+     */
+    if(posix_memalign(&blk,16,blksize) != 0){
+      while(i-- > 0)
+        free(dp->coeffs[i]);
+      free(dp);
+      return NULL;
+    }
+    memset(blk,0,blksize);
+    dp->coeffs[i] = (signed short *)blk;
+    for(j=0;j<len;j++)
+      dp->coeffs[i][j+i] = coeffs[j];
+  }
+  return (void *)dp;
+/* Free a dot product descriptor created earlier by initdp_sse2().
+ * A NULL handle (as returned by initdp_sse2() on failure) is ignored.
+ */
+void freedp_sse2(void *p){
+  struct dotprod *dp = (struct dotprod *)p;
+  int i;
+  if(dp == NULL)
+    return;
+  /* free(NULL) is a no-op, so unused slots need no special casing */
+  for(i=0;i<8;i++)
+    free(dp->coeffs[i]);
+  free(dp);
+/* Compute a dot product given a descriptor and an input array
+ * The length is taken from the descriptor
+ *
+ * p - descriptor; it is used as a struct dotprod holding 8 pre-shifted
+ *     coefficient arrays
+ * a - input samples; need not be 16-byte aligned
+ *
+ * The input address is rounded down to a 16-byte boundary; the resulting
+ * word offset al selects the coefficient copy with the same shift. The
+ * return value is whatever dotprod_sse2_assist() computes over
+ * (len+al-1)/8+1 blocks of 8 words.
+ *
+ * NOTE(review): the address is converted through int, which presumes a
+ * pointer fits in an int on the target -- confirm.
+ */
+long dotprod_sse2(void *p,signed short a[]){
+ struct dotprod *dp = (struct dotprod *)p;
+ int al;
+ signed short *ar;
+ ar = (signed short *)((int)a & ~15);
+ al = a - ar;
+ /* Call assembler routine to do the work, passing number of 8-word blocks */
+ return dotprod_sse2_assist(ar,dp->coeffs[al],(dp->len+al-1)/8+1);
diff --git a/dotprod_sse2_assist.s b/dotprod_sse2_assist.s
new file mode 100644
index 0000000..47348fa
--- /dev/null
+++ b/dotprod_sse2_assist.s
@@ -0,0 +1,85 @@
+# SIMD SSE2 dot product
+# Equivalent to the following C code:
+# long dotprod(signed short *a,signed short *b,int cnt)
+# {
+# long sum = 0;
+# cnt *= 8;
+# while(cnt--)
+# sum += *a++ * *b++;
+# return sum;
+# }
+# a and b must be 128-bit aligned
+# Copyright 2001, Phil Karn KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+ .text
+ .global dotprod_sse2_assist
+ .type dotprod_sse2_assist,@function
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %esi
+ pushl %edi
+ pushl %ecx
+ pushl %ebx
+ movl 8(%ebp),%esi # a
+ movl 12(%ebp),%edi # b
+ movl 16(%ebp),%ecx # cnt
+ pxor %xmm0,%xmm0 # clear running sum (in two 32-bit halves)
+# SSE2 dot product loop unrolled 4 times, crunching 32 terms per loop
+ .align 16
+.Loop1: subl $4,%ecx
+ jl .Loop1Done
+ movdqa (%esi),%xmm1
+ pmaddwd (%edi),%xmm1
+ paddd %xmm1,%xmm0
+ movdqa 16(%esi),%xmm1
+ pmaddwd 16(%edi),%xmm1
+ paddd %xmm1,%xmm0
+ movdqa 32(%esi),%xmm1
+ pmaddwd 32(%edi),%xmm1
+ paddd %xmm1,%xmm0
+ movdqa 48(%esi),%xmm1
+ addl $64,%esi
+ pmaddwd 48(%edi),%xmm1
+ addl $64,%edi
+ paddd %xmm1,%xmm0
+ jmp .Loop1
+ addl $4,%ecx
+# SSE2 dot product loop, not unrolled, crunching 8 terms per loop
+# This could be redone as Duff's Device on the unrolled loop above
+.Loop2: subl $1,%ecx
+ jl .Loop2Done
+ movdqa (%esi),%xmm1
+ addl $16,%esi
+ pmaddwd (%edi),%xmm1
+ addl $16,%edi
+ paddd %xmm1,%xmm0
+ jmp .Loop2
+ movdqa %xmm0,%xmm1
+ psrldq $8,%xmm0
+ paddd %xmm1,%xmm0
+ movd %xmm0,%eax # right-hand word to eax
+ psrldq $4,%xmm0
+ movd %xmm0,%ebx
+ addl %ebx,%eax
+ popl %ebx
+ popl %ecx
+ popl %edi
+ popl %esi
+ movl %ebp,%esp
+ popl %ebp
+ ret
diff --git a/dsp.3 b/dsp.3
new file mode 100644
index 0000000..e9794da
--- /dev/null
+++ b/dsp.3
@@ -0,0 +1,63 @@
+.TH DSP 3
+initdp, freedp, dotprod, sumsq, peakval -\ SIMD-assisted
+digital signal processing primitives
+#include "fec.h"
+void *initdp(signed short *coeffs,int len);
+long dotprod(void *p,signed short *a);
+void freedp(void *p);
+unsigned long long sumsq(signed short *in,int cnt);
+int peakval(signed short *b,int cnt);
+These functions provide several basic primitives useful in digital
+signal processing (DSP), especially in modems. The \fBinitdp\fR,
+\fBdotprod\fR and \fBfreedp\fR functions implement an integer dot
+product useful in correlation and filtering operations on signed
+16-bit integers. \fBsumsq\fR computes the sum
+of the squares of an array of signed 16-bit integers,
+useful for measuring the energy of a signal. \fBpeakval\fR returns the
+absolute value of the largest magnitude element in the input array,
+useful for scaling a signal's amplitude.
+Each function uses IA32 or PowerPC Altivec instructions when
+available; otherwise, a portable C version is used.
+To create a FIR filter or correlator, call \fBinitdp\fR with the
+coefficients in \fBcoeffs\fR and their number in \fBlen\fR. This
+creates the appropriate data structures and returns a handle.
+To compute a dot product, pass the handle from \fBinitdp\fR and the
+input array to \fBdotprod\fR. No length field is needed as the number
+of samples will be taken from the \fBlen\fR parameter originally given
+to \fBinitdp\fR. There must be at least as many samples in the input
+array as there were coefficients passed to \fBinitdp\fR.
+When the filter or correlator is no longer needed, the data structures
+may be freed by passing the handle to \fBfreedp\fR.
+The user is responsible for scaling the inputs to \fBinitdp\fR and
+\fBdotprod\fR, as the 32-bit result from \fBdotprod\fR will silently
+wrap around in the event of overflow.
+To compute the sum of the squares of an array of signed 16-bit
+integers, use \fBsumsq\fR. This returns a 64 bit sum.
+\fBpeakval\fR computes the absolute value of each 16-bit element in
+the input array and returns the largest.
+\fBinitdp\fR returns a handle that points to a control block, or NULL in
+the event of an error (such as a memory allocation failure). \fBsumsq\fR
+and \fBpeakval\fR have no error returns.
+Phil Karn, KA9Q (karn@ka9q.net)
diff --git a/dtest.c b/dtest.c
new file mode 100644
index 0000000..394cb03
--- /dev/null
+++ b/dtest.c
@@ -0,0 +1,99 @@
+/* Test dot-product function */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include "config.h"
+#include <getopt.h>
+#include "fec.h"
+/* Long-option table for getopt_long(). Each entry maps a long name onto
+ * the short option letter handled in main()'s switch. The second field is
+ * has_arg: 0 for a plain flag, 1 when the option requires an argument.
+ */
+struct option Options[] = {
+ {"force-altivec",0,NULL,'a'},
+ {"force-port",0,NULL,'p'},
+ {"force-mmx",0,NULL,'m'},
+ {"force-sse",0,NULL,'s'},
+ {"force-sse2",0,NULL,'t'},
+ /* --trials takes a count, like -n ("n:" in the short option string);
+  * with has_arg 0 optarg would be NULL when the handler calls atoi()
+  */
+ {"trials",1,NULL,'n'},
+ {NULL},
+int main(int argc,char *argv[]){
+ short coeffs[512];
+ short input[2048];
+ int trials=1000,d;
+ int errors = 0;
+ while((d = getopt_long(argc,argv,"apmstn:",Options,NULL)) != EOF){
+ while((d = getopt(argc,argv,"apmstn:")) != EOF){
+ switch(d){
+ case 'a':
+ Cpu_mode = ALTIVEC;
+ break;
+ case 'p':
+ Cpu_mode = PORT;
+ break;
+ case 'm':
+ Cpu_mode = MMX;
+ break;
+ case 's':
+ Cpu_mode = SSE;
+ break;
+ case 't':
+ Cpu_mode = SSE2;
+ break;
+ case 'n':
+ trials = atoi(optarg);
+ break;
+ }
+ }
+ while(trials--){
+ long port_result;
+ long simd_result;
+ int ntaps;
+ int i;
+ int csum = 0;
+ int offset;
+ void *dp_simd,*dp_port;
+ /* Generate set of coefficients
+ * limit sum of absolute values to 32767 to avoid overflow
+ */
+ memset(coeffs,0,sizeof(coeffs));
+ for(i=0;i<512;i++){
+ double gv;
+ gv = normal_rand(0.,100.);
+ if(csum + fabs(gv) > 32767)
+ break;
+ coeffs[i] = gv;
+ csum += fabs(gv);
+ }
+ ntaps = i;
+ /* Compare results to portable C version for a bunch of random data buffers and offsets */
+ dp_simd = initdp(coeffs,ntaps);
+ dp_port = initdp_port(coeffs,ntaps);
+ for(i=0;i<2048;i++)
+ input[i] = random();
+ offset = random() & 511;
+ simd_result = dotprod(dp_simd,input+offset);
+ port_result = dotprod_port(dp_port,input+offset);
+ if(simd_result != port_result){
+ errors++;
+ }
+ }
+ printf("dtest: %d errors\n",errors);
+ exit(0);
diff --git a/encode_rs.c b/encode_rs.c
new file mode 100644
index 0000000..0649094
--- /dev/null
+++ b/encode_rs.c
@@ -0,0 +1,52 @@
+/* Reed-Solomon encoder
+ * Copyright 2002, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <string.h>
+#ifdef FIXED
+#include "fixed.h"
+#elif defined(BIGSYM)
+#include "int.h"
+#include "char.h"
+void ENCODE_RS(
+#ifdef FIXED
+data_t *data, data_t *bb,int pad){
+void *p,data_t *data, data_t *bb){
+ struct rs *rs = (struct rs *)p;
+ int i, j;
+ data_t feedback;
+#ifdef FIXED
+ /* Check pad parameter for validity */
+ if(pad < 0 || pad >= NN)
+ return;
+ memset(bb,0,NROOTS*sizeof(data_t));
+ for(i=0;i<NN-NROOTS-PAD;i++){
+ feedback = INDEX_OF[data[i] ^ bb[0]];
+ if(feedback != A0){ /* feedback term is non-zero */
+ /* This line is unnecessary when GENPOLY[NROOTS] is unity, as it must
+ * always be for the polynomials constructed by init_rs()
+ */
+ feedback = MODNN(NN - GENPOLY[NROOTS] + feedback);
+ for(j=1;j<NROOTS;j++)
+ bb[j] ^= ALPHA_TO[MODNN(feedback + GENPOLY[NROOTS-j])];
+ }
+ /* Shift */
+ memmove(&bb[0],&bb[1],sizeof(data_t)*(NROOTS-1));
+ if(feedback != A0)
+ bb[NROOTS-1] = ALPHA_TO[MODNN(feedback + GENPOLY[0])];
+ else
+ bb[NROOTS-1] = 0;
+ }
diff --git a/encode_rs.h b/encode_rs.h
new file mode 100644
index 0000000..2c157f9
--- /dev/null
+++ b/encode_rs.h
@@ -0,0 +1,58 @@
+/* The guts of the Reed-Solomon encoder, meant to be #included
+ * into a function body with the following typedefs, macros and variables supplied
+ * according to the code parameters:
+ * data_t - a typedef for the data symbol
+ * data_t data[] - array of NN-NROOTS-PAD elements of type data_t to be encoded
+ * data_t parity[] - an array of NROOTS elements of type data_t to be written with parity symbols
+ * NROOTS - the number of roots in the RS code generator polynomial,
+ * which is the same as the number of parity symbols in a block.
+ Integer variable or literal.
+ *
+ * NN - the total number of symbols in a RS block. Integer variable or literal.
+ * PAD - the number of pad symbols in a block. Integer variable or literal.
+ * ALPHA_TO - The address of an array of NN elements to convert Galois field
+ * elements in index (log) form to polynomial form. Read only.
+ * INDEX_OF - The address of an array of NN elements to convert Galois field
+ * elements in polynomial form to index (log) form. Read only.
+ * MODNN - a function to reduce its argument modulo NN. May be inline or a macro.
+ * GENPOLY - an array of NROOTS+1 elements containing the generator polynomial in index form
+ * The memset() and memmove() functions are used. The appropriate header
+ * file declaring these functions (usually <string.h>) must be included by the calling
+ * program.
+ * Copyright 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#undef A0
+#define A0 (NN) /* Special reserved value encoding zero in index form */
+ int i, j;
+ data_t feedback;
+ memset(parity,0,NROOTS*sizeof(data_t));
+ for(i=0;i<NN-NROOTS-PAD;i++){
+ feedback = INDEX_OF[data[i] ^ parity[0]];
+ if(feedback != A0){ /* feedback term is non-zero */
+ /* This line is unnecessary when GENPOLY[NROOTS] is unity, as it must
+ * always be for the polynomials constructed by init_rs()
+ */
+ feedback = MODNN(NN - GENPOLY[NROOTS] + feedback);
+ for(j=1;j<NROOTS;j++)
+ parity[j] ^= ALPHA_TO[MODNN(feedback + GENPOLY[NROOTS-j])];
+ }
+ /* Shift */
+ memmove(&parity[0],&parity[1],sizeof(data_t)*(NROOTS-1));
+ if(feedback != A0)
+ parity[NROOTS-1] = ALPHA_TO[MODNN(feedback + GENPOLY[0])];
+ else
+ parity[NROOTS-1] = 0;
+ }
diff --git a/encode_rs_8.c b/encode_rs_8.c
new file mode 100644
index 0000000..5aaecca
--- /dev/null
+++ b/encode_rs_8.c
@@ -0,0 +1,109 @@
+/* Reed-Solomon encoder
+ * Copyright 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <string.h>
+#include "fixed.h"
+#ifdef __VEC__
+#include <sys/sysctl.h>
+static enum {UNKNOWN=0,MMX,SSE,SSE2,ALTIVEC,PORT} cpu_mode;
+static void encode_rs_8_c(data_t *data, data_t *parity,int pad);
+#if __vec__
+static void encode_rs_8_av(data_t *data, data_t *parity,int pad);
+#if __i386__
+int cpu_features(void);
+void encode_rs_8(data_t *data, data_t *parity,int pad){
+ if(cpu_mode == UNKNOWN){
+#ifdef __i386__
+ int f;
+ /* Figure out what kind of CPU we have */
+ f = cpu_features();
+ if(f & (1<<26)){ /* SSE2 is present */
+ cpu_mode = SSE2;
+ } else if(f & (1<<25)){ /* SSE is present */
+ cpu_mode = SSE;
+ } else if(f & (1<<23)){ /* MMX is present */
+ cpu_mode = MMX;
+ } else { /* No SIMD at all */
+ cpu_mode = PORT;
+ }
+#elif __VEC__
+ /* Ask the OS if we have Altivec support */
+ int selectors[2] = { CTL_HW, HW_VECTORUNIT };
+ int hasVectorUnit = 0;
+ size_t length = sizeof(hasVectorUnit);
+ int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0);
+ if(0 == error && hasVectorUnit)
+ cpu_mode = ALTIVEC;
+ else
+ cpu_mode = PORT;
+ cpu_mode = PORT;
+ }
+ switch(cpu_mode){
+#if __vec__
+ case ALTIVEC:
+ encode_rs_8_av(data,parity,pad);
+ return;
+#if __i386__
+ case MMX:
+ case SSE:
+ case SSE2:
+ default:
+ encode_rs_8_c(data,parity,pad);
+ return;
+ }
+#if __vec__ /* PowerPC G4/G5 Altivec instructions are available */
+static vector unsigned char reverse = (vector unsigned char)(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1);
+static vector unsigned char shift_right = (vector unsigned char)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30);
+/* Lookup table for feedback multiplications
+ * These are the low half of the coefficients. Since the generator polynomial is
+ * palindromic, we form the other half by reversing this one
+ */
+extern static union { vector unsigned char v; unsigned char c[16]; } table[256];
+static void encode_rs_8_av(data_t *data, data_t *parity,int pad){
+ union { vector unsigned char v[2]; unsigned char c[32]; } shift_register;
+ int i;
+ shift_register.v[0] = (vector unsigned char)(0);
+ shift_register.v[1] = (vector unsigned char)(0);
+ for(i=0;i<NN-NROOTS-pad;i++){
+ vector unsigned char feedback0,feedback1;
+ unsigned char f;
+ f = data[i] ^ shift_register.c[31];
+ feedback1 = table[f].v;
+ feedback0 = vec_perm(feedback1,feedback1,reverse);
+ /* Shift right one byte */
+ shift_register.v[1] = vec_perm(shift_register.v[0],shift_register.v[1],shift_right) ^ feedback1;
+ shift_register.v[0] = vec_sro(shift_register.v[0],(vector unsigned char)(8)) ^ feedback0;
+ shift_register.c[0] = f;
+ }
+ for(i=0;i<NROOTS;i++)
+ parity[NROOTS-i-1] = shift_register.c[i];
+/* Portable C version */
+static void encode_rs_8_c(data_t *data, data_t *parity,int pad){
+#include "encode_rs.h"
diff --git a/encode_rs_av.c b/encode_rs_av.c
new file mode 100644
index 0000000..32e528f
--- /dev/null
+++ b/encode_rs_av.c
@@ -0,0 +1,61 @@
+/* Fast Reed-Solomon encoder for (255,223) CCSDS code on PowerPC G4/G5 using Altivec instructions
+ * Copyright 2004, Phil Karn KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <string.h>
+#include "fixed.h"
+/* Lookup table for feedback multiplications
+ * These are the low half of the coefficients. Since the generator polynomial is
+ * palindromic, we form it by reversing these on the fly
+ */
+static union { vector unsigned char v; unsigned char c[16]; } table[256];
+static vector unsigned char reverse = (vector unsigned char)(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1);
+static vector unsigned char shift_right = (vector unsigned char)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30);
+extern data_t CCSDS_alpha_to[];
+extern data_t CCSDS_index_of[];
+extern data_t CCSDS_poly[];
+void rs_init_av(){
+ int i,j;
+ /* The PowerPC is big-endian, so the low-order byte of each vector contains the highest order term in the polynomial */
+ for(j=0;j<16;j++){
+ table[0].c[j] = 0;
+ for(i=1;i<256;i++){
+ table[i].c[16-j-1] = CCSDS_alpha_to[MODNN(CCSDS_poly[j+1] + CCSDS_index_of[i])];
+ }
+ }
+#if 0
+ for(i=0;i<256;i++){
+ printf("table[%3d] = %3vu\n",i,table[i].v);
+ }
+void encode_rs_av(unsigned char *data,unsigned char *parity,int pad){
+ union { vector unsigned char v[2]; unsigned char c[32]; } shift_register;
+ int i;
+ shift_register.v[0] = (vector unsigned char)(0);
+ shift_register.v[1] = (vector unsigned char)(0);
+ for(i=0;i<NN-NROOTS-pad;i++){
+ vector unsigned char feedback0,feedback1;
+ unsigned char f;
+ f = data[i] ^ shift_register.c[31];
+ feedback1 = table[f].v;
+ feedback0 = vec_perm(feedback1,feedback1,reverse);
+ /* Shift right one byte */
+ shift_register.v[1] = vec_perm(shift_register.v[0],shift_register.v[1],shift_right) ^ feedback1;
+ shift_register.v[0] = vec_sro(shift_register.v[0],(vector unsigned char)(8)) ^ feedback0;
+ shift_register.c[0] = f;
+ }
+ for(i=0;i<NROOTS;i++)
+ parity[NROOTS-i-1] = shift_register.c[i];
diff --git a/encode_rs_ccsds.c b/encode_rs_ccsds.c
new file mode 100644
index 0000000..5a2ec70
--- /dev/null
+++ b/encode_rs_ccsds.c
@@ -0,0 +1,24 @@
+/* This function wraps around the fixed 8-bit encoder, performing the
+ * basis transformations necessary to meet the CCSDS standard
+ *
+ * Copyright 2002, Phil Karn, KA9Q
+ * fixed bug Aug 2007
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include "ccsds.h"
+#include "fec.h"
+void encode_rs_ccsds(data_t *data,data_t *parity,int pad){
+ int i;
+ data_t cdata[NN-NROOTS];
+ /* Convert data from dual basis to conventional */
+ for(i=0;i<NN-NROOTS-pad;i++)
+ cdata[i] = Tal1tab[data[i]];
+ encode_rs_8(cdata,parity,pad);
+ /* Convert parity from conventional to dual basis */
+ for(i=0;i<NROOTS;i++)
+ parity[i] = Taltab[parity[i]];
diff --git a/encode_rs_char.c b/encode_rs_char.c
new file mode 100644
index 0000000..a9bf2b8
--- /dev/null
+++ b/encode_rs_char.c
@@ -0,0 +1,15 @@
+/* Reed-Solomon encoder
+ * Copyright 2002, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <string.h>
+#include "char.h"
+#include "rs-common.h"
+void encode_rs_char(void *p,data_t *data, data_t *parity){
+ struct rs *rs = (struct rs *)p;
+#include "encode_rs.h"
diff --git a/encode_rs_int.c b/encode_rs_int.c
new file mode 100644
index 0000000..3c9ce78
--- /dev/null
+++ b/encode_rs_int.c
@@ -0,0 +1,15 @@
+/* Reed-Solomon encoder
+ * Copyright 2003, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <string.h>
+#include "int.h"
+#include "rs-common.h"
+void encode_rs_int(void *p,data_t *data, data_t *parity){
+ struct rs *rs = (struct rs *)p;
+#include "encode_rs.h"
diff --git a/exercise.c b/exercise.c
new file mode 100644
index 0000000..8ae008c
--- /dev/null
+++ b/exercise.c
@@ -0,0 +1,122 @@
+/* Exercise an RS codec a specified number of times using random
+ * data and error patterns
+ *
+ * Copyright 2002 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#define FLAG_ERASURE 1 /* Randomly flag 50% of errors as erasures */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef FIXED
+#include "fixed.h"
+#define EXERCISE exercise_8
+#elif defined(CCSDS)
+#include "fixed.h"
+#include "ccsds.h"
+#define EXERCISE exercise_ccsds
+#elif defined(BIGSYM)
+#include "int.h"
+#define EXERCISE exercise_int
+#include "char.h"
+#define EXERCISE exercise_char
+#ifdef FIXED
+#define PRINTPARM printf("(255,223):");
+#elif defined(CCSDS)
+#define PRINTPARM printf("CCSDS (255,223):");
+#define PRINTPARM printf("(%d,%d):",rs->nn,rs->nn-rs->nroots);
+/* Exercise the RS codec passed as an argument */
+#if !defined(CCSDS) && !defined(FIXED)
+void *p,
+int trials){
+#if !defined(CCSDS) && !defined(FIXED)
+ struct rs *rs = (struct rs *)p;
+ data_t block[NN],tblock[NN];
+ int i;
+ int errors;
+ int errlocs[NN];
+ int derrlocs[NROOTS];
+ int derrors;
+ int errval,errloc;
+ int erasures;
+ int decoder_errors = 0;
+ while(trials-- != 0){
+ /* Test up to the error correction capacity of the code */
+ for(errors=0;errors <= NROOTS/2;errors++){
+ /* Load block with random data and encode */
+ for(i=0;i<NN-NROOTS;i++)
+ block[i] = random() & NN;
+#if defined(CCSDS) || defined(FIXED)
+ ENCODE_RS(&block[0],&block[NN-NROOTS],0);
+ ENCODE_RS(rs,&block[0],&block[NN-NROOTS]);
+ /* Make temp copy, seed with errors */
+ memcpy(tblock,block,sizeof(tblock));
+ memset(errlocs,0,sizeof(errlocs));
+ memset(derrlocs,0,sizeof(derrlocs));
+ erasures=0;
+ for(i=0;i<errors;i++){
+ do {
+ errval = random() & NN;
+ } while(errval == 0); /* Error value must be nonzero */
+ do {
+ errloc = random() % NN;
+ } while(errlocs[errloc] != 0); /* Must not choose the same location twice */
+ errlocs[errloc] = 1;
+ if(random() & 1) /* 50-50 chance */
+ derrlocs[erasures++] = errloc;
+ tblock[errloc] ^= errval;
+ }
+ /* Decode the errored block */
+#if defined(CCSDS) || defined(FIXED)
+ derrors = DECODE_RS(tblock,derrlocs,erasures,0);
+ derrors = DECODE_RS(rs,tblock,derrlocs,erasures);
+ if(derrors != errors){
+ printf(" decoder says %d errors, true number is %d\n",derrors,errors);
+ decoder_errors++;
+ }
+ for(i=0;i<derrors;i++){
+ if(errlocs[derrlocs[i]] == 0){
+ printf(" decoder indicates error in location %d without error\n",derrlocs[i]);
+ decoder_errors++;
+ }
+ }
+ if(memcmp(tblock,block,sizeof(tblock)) != 0){
+ printf(" uncorrected errors! output ^ input:");
+ decoder_errors++;
+ for(i=0;i<NN;i++)
+ printf(" %02x",tblock[i] ^ block[i]);
+ printf("\n");
+ }
+ }
+ }
+ return decoder_errors;
diff --git a/fec.c b/fec.c
new file mode 100644
index 0000000..35960c3
--- /dev/null
+++ b/fec.c
@@ -0,0 +1,66 @@
+/* Utility routines for FEC support
+ * Copyright 2004, Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include "fec.h"
+unsigned char Partab[256];
+int P_init;
+/* Fill Partab[] so that entry v is 1 when the 8-bit value v has an odd
+ * number of 1 bits and 0 otherwise, then set P_init to record that the
+ * table has been built.
+ * Needed only on non-ia32 machines
+ */
+void partab_init(void){
+  int value;
+  /* Count the 1 bits of every byte value and keep the low bit of the count */
+  for(value=0;value<256;value++){
+    int ones = 0;
+    int bits;
+    for(bits=value;bits != 0;bits >>= 1)
+      ones += bits & 1;
+    Partab[value] = ones & 1;
+  }
+  P_init=1;
+/* Lookup table giving count of 1 bits for integers 0-255 */
+int Bitcnt[] = {
+ 0, 1, 1, 2, 1, 2, 2, 3,
+ 1, 2, 2, 3, 2, 3, 3, 4,
+ 1, 2, 2, 3, 2, 3, 3, 4,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 1, 2, 2, 3, 2, 3, 3, 4,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 1, 2, 2, 3, 2, 3, 3, 4,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 4, 5, 5, 6, 5, 6, 6, 7,
+ 1, 2, 2, 3, 2, 3, 3, 4,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 4, 5, 5, 6, 5, 6, 6, 7,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 4, 5, 5, 6, 5, 6, 6, 7,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 4, 5, 5, 6, 5, 6, 6, 7,
+ 4, 5, 5, 6, 5, 6, 6, 7,
+ 5, 6, 6, 7, 6, 7, 7, 8,
diff --git a/fec.h b/fec.h
new file mode 100644
index 0000000..08e8454
--- /dev/null
+++ b/fec.h
@@ -0,0 +1,347 @@
+/* User include file for libfec
+ * Copyright 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#ifndef _FEC_H_
+#define _FEC_H_
+/* r=1/2 k=7 convolutional encoder polynomials
+ * The NASA-DSN convention is to use V27POLYA inverted, then V27POLYB
+ * The CCSDS/NASA-GSFC convention is to use V27POLYB, then V27POLYA inverted
+ */
+#define V27POLYA 0x6d
+#define V27POLYB 0x4f
+void *create_viterbi27(int len);
+void set_viterbi27_polynomial(int polys[2]);
+int init_viterbi27(void *vp,int starting_state);
+int update_viterbi27_blk(void *vp,unsigned char sym[],int npairs);
+int chainback_viterbi27(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi27(void *vp);
+#ifdef __VEC__
+void *create_viterbi27_av(int len);
+void set_viterbi27_polynomial_av(int polys[2]);
+int init_viterbi27_av(void *p,int starting_state);
+int chainback_viterbi27_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi27_av(void *p);
+int update_viterbi27_blk_av(void *p,unsigned char *syms,int nbits);
+#ifdef __i386__
+void *create_viterbi27_mmx(int len);
+void set_viterbi27_polynomial_mmx(int polys[2]);
+int init_viterbi27_mmx(void *p,int starting_state);
+int chainback_viterbi27_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi27_mmx(void *p);
+int update_viterbi27_blk_mmx(void *p,unsigned char *syms,int nbits);
+void *create_viterbi27_sse(int len);
+void set_viterbi27_polynomial_sse(int polys[2]);
+int init_viterbi27_sse(void *p,int starting_state);
+int chainback_viterbi27_sse(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi27_sse(void *p);
+int update_viterbi27_blk_sse(void *p,unsigned char *syms,int nbits);
+void *create_viterbi27_sse2(int len);
+void set_viterbi27_polynomial_sse2(int polys[2]);
+int init_viterbi27_sse2(void *p,int starting_state);
+int chainback_viterbi27_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi27_sse2(void *p);
+int update_viterbi27_blk_sse2(void *p,unsigned char *syms,int nbits);
+void *create_viterbi27_port(int len);
+void set_viterbi27_polynomial_port(int polys[2]);
+int init_viterbi27_port(void *p,int starting_state);
+int chainback_viterbi27_port(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi27_port(void *p);
+int update_viterbi27_blk_port(void *p,unsigned char *syms,int nbits);
+/* r=1/2 k=9 convolutional encoder polynomials */
+#define V29POLYA 0x1af
+#define V29POLYB 0x11d
+void *create_viterbi29(int len);
+void set_viterbi29_polynomial(int polys[2]);
+int init_viterbi29(void *vp,int starting_state);
+int update_viterbi29_blk(void *vp,unsigned char syms[],int nbits);
+int chainback_viterbi29(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi29(void *vp);
+#ifdef __VEC__
+void *create_viterbi29_av(int len);
+void set_viterbi29_polynomial_av(int polys[2]);
+int init_viterbi29_av(void *p,int starting_state);
+int chainback_viterbi29_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi29_av(void *p);
+int update_viterbi29_blk_av(void *p,unsigned char *syms,int nbits);
+#ifdef __i386__
+void *create_viterbi29_mmx(int len);
+void set_viterbi29_polynomial_mmx(int polys[2]);
+int init_viterbi29_mmx(void *p,int starting_state);
+int chainback_viterbi29_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi29_mmx(void *p);
+int update_viterbi29_blk_mmx(void *p,unsigned char *syms,int nbits);
+void *create_viterbi29_sse(int len);
+void set_viterbi29_polynomial_sse(int polys[2]);
+int init_viterbi29_sse(void *p,int starting_state);
+int chainback_viterbi29_sse(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi29_sse(void *p);
+int update_viterbi29_blk_sse(void *p,unsigned char *syms,int nbits);
+void *create_viterbi29_sse2(int len);
+void set_viterbi29_polynomial_sse2(int polys[2]);
+int init_viterbi29_sse2(void *p,int starting_state);
+int chainback_viterbi29_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi29_sse2(void *p);
+int update_viterbi29_blk_sse2(void *p,unsigned char *syms,int nbits);
+void *create_viterbi29_port(int len);
+void set_viterbi29_polynomial_port(int polys[2]);
+int init_viterbi29_port(void *p,int starting_state);
+int chainback_viterbi29_port(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi29_port(void *p);
+int update_viterbi29_blk_port(void *p,unsigned char *syms,int nbits);
+/* r=1/3 k=9 convolutional encoder polynomials */
+#define V39POLYA 0x1ed
+#define V39POLYB 0x19b
+#define V39POLYC 0x127
+void *create_viterbi39(int len);
+void set_viterbi39_polynomial(int polys[3]);
+int init_viterbi39(void *vp,int starting_state);
+int update_viterbi39_blk(void *vp,unsigned char syms[],int nbits);
+int chainback_viterbi39(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi39(void *vp);
+#ifdef __VEC__
+void *create_viterbi39_av(int len);
+void set_viterbi39_polynomial_av(int polys[3]);
+int init_viterbi39_av(void *p,int starting_state);
+int chainback_viterbi39_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi39_av(void *p);
+int update_viterbi39_blk_av(void *p,unsigned char *syms,int nbits);
+#ifdef __i386__
+void *create_viterbi39_mmx(int len);
+void set_viterbi39_polynomial_mmx(int polys[3]);
+int init_viterbi39_mmx(void *p,int starting_state);
+int chainback_viterbi39_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi39_mmx(void *p);
+int update_viterbi39_blk_mmx(void *p,unsigned char *syms,int nbits);
+void *create_viterbi39_sse(int len);
+void set_viterbi39_polynomial_sse(int polys[3]);
+int init_viterbi39_sse(void *p,int starting_state);
+int chainback_viterbi39_sse(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi39_sse(void *p);
+int update_viterbi39_blk_sse(void *p,unsigned char *syms,int nbits);
+void *create_viterbi39_sse2(int len);
+void set_viterbi39_polynomial_sse2(int polys[3]);
+int init_viterbi39_sse2(void *p,int starting_state);
+int chainback_viterbi39_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi39_sse2(void *p);
+int update_viterbi39_blk_sse2(void *p,unsigned char *syms,int nbits);
+void *create_viterbi39_port(int len);
+void set_viterbi39_polynomial_port(int polys[3]);
+int init_viterbi39_port(void *p,int starting_state);
+int chainback_viterbi39_port(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi39_port(void *p);
+int update_viterbi39_blk_port(void *p,unsigned char *syms,int nbits);
+/* r=1/6 k=15 Cassini convolutional encoder polynomials without symbol inversion
+ * dfree = 56
+ * These bits may be left-right flipped from some textbook representations;
+ * here I have the bits entering the shift register from the right (low) end
+ *
+ * Some other spacecraft use the same code, but with the polynomials in a different order.
+ * E.g., Mars Pathfinder and STEREO swap POLYC and POLYD. All use alternate symbol inversion,
+ * so use set_viterbi615_polynomial() as appropriate.
+ */
+#define V615POLYA 042631
+#define V615POLYB 047245
+#define V615POLYC 056507
+#define V615POLYD 073363
+#define V615POLYE 077267
+#define V615POLYF 064537
+void *create_viterbi615(int len);
+void set_viterbi615_polynomial(int polys[6]);
+int init_viterbi615(void *vp,int starting_state);
+int update_viterbi615_blk(void *vp,unsigned char *syms,int nbits);
+int chainback_viterbi615(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi615(void *vp);
+#ifdef __VEC__
+void *create_viterbi615_av(int len);
+void set_viterbi615_polynomial_av(int polys[6]);
+int init_viterbi615_av(void *p,int starting_state);
+int chainback_viterbi615_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi615_av(void *p);
+int update_viterbi615_blk_av(void *p,unsigned char *syms,int nbits);
+#ifdef __i386__
+void *create_viterbi615_mmx(int len);
+void set_viterbi615_polynomial_mmx(int polys[6]);
+int init_viterbi615_mmx(void *p,int starting_state);
+int chainback_viterbi615_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi615_mmx(void *p);
+int update_viterbi615_blk_mmx(void *p,unsigned char *syms,int nbits);
+void *create_viterbi615_sse(int len);
+void set_viterbi615_polynomial_sse(int polys[6]);
+int init_viterbi615_sse(void *p,int starting_state);
+int chainback_viterbi615_sse(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi615_sse(void *p);
+int update_viterbi615_blk_sse(void *p,unsigned char *syms,int nbits);
+void *create_viterbi615_sse2(int len);
+void set_viterbi615_polynomial_sse2(int polys[6]);
+int init_viterbi615_sse2(void *p,int starting_state);
+int chainback_viterbi615_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi615_sse2(void *p);
+int update_viterbi615_blk_sse2(void *p,unsigned char *syms,int nbits);
+void *create_viterbi615_port(int len);
+void set_viterbi615_polynomial_port(int polys[6]);
+int init_viterbi615_port(void *p,int starting_state);
+int chainback_viterbi615_port(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi615_port(void *p);
+int update_viterbi615_blk_port(void *p,unsigned char *syms,int nbits);
+/* General purpose RS codec, 8-bit symbols */
+void encode_rs_char(void *rs,unsigned char *data,unsigned char *parity);
+int decode_rs_char(void *rs,unsigned char *data,int *eras_pos,
+ int no_eras);
+void *init_rs_char(int symsize,int gfpoly,
+ int fcr,int prim,int nroots,
+ int pad);
+void free_rs_char(void *rs);
+/* General purpose RS codec, integer symbols */
+void encode_rs_int(void *rs,int *data,int *parity);
+int decode_rs_int(void *rs,int *data,int *eras_pos,int no_eras);
+void *init_rs_int(int symsize,int gfpoly,int fcr,
+ int prim,int nroots,int pad);
+void free_rs_int(void *rs);
+/* CCSDS standard (255,223) RS codec with conventional (*not* dual-basis)
+ * symbol representation
+ */
+void encode_rs_8(unsigned char *data,unsigned char *parity,int pad);
+int decode_rs_8(unsigned char *data,int *eras_pos,int no_eras,int pad);
+/* CCSDS standard (255,223) RS codec with dual-basis symbol representation */
+void encode_rs_ccsds(unsigned char *data,unsigned char *parity,int pad);
+int decode_rs_ccsds(unsigned char *data,int *eras_pos,int no_eras,int pad);
+/* Tables to map from conventional->dual (Taltab) and
+ * dual->conventional (Tal1tab) bases
+ */
+extern unsigned char Taltab[],Tal1tab[];
+/* CPU SIMD instruction set available */
+extern enum cpu_mode {UNKNOWN=0,PORT,MMX,SSE,SSE2,ALTIVEC} Cpu_mode;
+void find_cpu_mode(void); /* Call this once at startup to set Cpu_mode */
+/* Determine parity of argument: 1 = odd, 0 = even */
+#ifdef __i386__
+/* ia32 version: "test" sets the CPU parity flag from x and "setpo"
+ * stores 1 in the result when that flag reports odd parity
+ */
+static inline int parityb(unsigned char x){
+ __asm__ __volatile__ ("test %1,%1;setpo %0" : "=g"(x) : "r" (x));
+ return x;
+/* Table-driven version: looks the answer up in Partab[], which is built
+ * by partab_init() the first time parityb() is called (P_init == 0)
+ */
+void partab_init(void);
+static inline int parityb(unsigned char x){
+ extern unsigned char Partab[256];
+ extern int P_init;
+ if(!P_init){
+ partab_init();
+ }
+ return Partab[x];
+/* Parity of an int: XORs the upper bytes of x onto its low byte and
+ * returns parityb() of that byte (1 = odd, 0 = even).
+ * NOTE(review): the two shifts fold 32 bits; assumes int is no wider - confirm
+ */
+static inline int parity(int x){
+ /* Fold down to one byte */
+ x ^= (x >> 16);
+ x ^= (x >> 8);
+ return parityb(x);
+/* Useful utilities for simulation */
+double normal_rand(double mean, double std_dev);
+unsigned char addnoise(int sym,double amp,double gain,double offset,int clip);
+extern int Bitcnt[];
+/* Dot product functions */
+void *initdp(signed short coeffs[],int len);
+void freedp(void *dp);
+long dotprod(void *dp,signed short a[]);
+void *initdp_port(signed short coeffs[],int len);
+void freedp_port(void *dp);
+long dotprod_port(void *dp,signed short a[]);
+#ifdef __i386__
+void *initdp_mmx(signed short coeffs[],int len);
+void freedp_mmx(void *dp);
+long dotprod_mmx(void *dp,signed short a[]);
+void *initdp_sse(signed short coeffs[],int len);
+void freedp_sse(void *dp);
+long dotprod_sse(void *dp,signed short a[]);
+void *initdp_sse2(signed short coeffs[],int len);
+void freedp_sse2(void *dp);
+long dotprod_sse2(void *dp,signed short a[]);
+#ifdef __VEC__
+void *initdp_av(signed short coeffs[],int len);
+void freedp_av(void *dp);
+long dotprod_av(void *dp,signed short a[]);
+/* Sum of squares - accepts signed shorts, produces unsigned long long */
+unsigned long long sumsq(signed short *in,int cnt);
+unsigned long long sumsq_port(signed short *in,int cnt);
+#ifdef __i386__
+unsigned long long sumsq_mmx(signed short *in,int cnt);
+unsigned long long sumsq_sse(signed short *in,int cnt);
+unsigned long long sumsq_sse2(signed short *in,int cnt);
+#ifdef __VEC__
+unsigned long long sumsq_av(signed short *in,int cnt);
+/* Low-level data structures and routines */
+int cpu_features(void);
+#endif /* _FEC_H_ */
diff --git a/fixed.h b/fixed.h
new file mode 100644
index 0000000..0ff27b2
--- /dev/null
+++ b/fixed.h
@@ -0,0 +1,33 @@
+/* Stuff specific to the CCSDS (255,223) RS codec
+ * (255,223) code over GF(256). Note: the conventional basis is still
+ * used; the dual-basis mappings are performed in [en|de]code_rs_ccsds.c
+ *
+ * Copyright 2003 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+typedef unsigned char data_t;
+/* Reduce x modulo 255 without a division.
+ * Each pass subtracts 255 and then adds the bits above the low byte back
+ * onto the low byte; since 256 is congruent to 1 modulo 255 this keeps the
+ * residue unchanged. The loop stops once x is below 255, so a value that is
+ * already below 255 (including a negative one) is returned as is.
+ */
+static inline int mod255(int x){
+ while (x >= 255) {
+ x -= 255;
+ x = (x >> 8) + (x & 255);
+ }
+ return x;
+#define MODNN(x) mod255(x)
+extern data_t CCSDS_alpha_to[];
+extern data_t CCSDS_index_of[];
+extern data_t CCSDS_poly[];
+#define MM 8
+#define NN 255
+#define ALPHA_TO CCSDS_alpha_to
+#define INDEX_OF CCSDS_index_of
+#define GENPOLY CCSDS_poly
+#define NROOTS 32
+#define FCR 112
+#define PRIM 11
+#define IPRIM 116
+#define PAD pad
diff --git a/gen_ccsds.c b/gen_ccsds.c
new file mode 100644
index 0000000..e1e2e26
--- /dev/null
+++ b/gen_ccsds.c
@@ -0,0 +1,39 @@
+/* Generate tables for CCSDS code
+ * Copyright 2002 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "char.h"
+#include "rs-common.h"
+#include "fec.h"
+/* Print the CCSDS Reed-Solomon lookup tables (alpha_to, index_of and the
+ * generator polynomial) on stdout as C array definitions.
+ *
+ * The arrays are emitted as "unsigned char" so that they agree with the
+ * "extern data_t ..." declarations in fixed.h (data_t is unsigned char)
+ * and can hold entries such as 0xff and 255 that do not fit a signed char.
+ */
+int main(){
+ struct rs *rs;
+ int i;
+ rs = init_rs_char(8,0x187,112,11,32,0); /* CCSDS standard */
+ assert(rs != NULL);
+ /* Antilog table: 256 entries, 16 per output line */
+ printf("unsigned char CCSDS_alpha_to[] = {");
+ for(i=0;i<256;i++){
+ if((i % 16) == 0)
+ printf("\n");
+ printf("0x%02x,",rs->alpha_to[i]);
+ }
+ /* Log table: 256 entries, 16 per output line */
+ printf("\n};\n\nunsigned char CCSDS_index_of[] = {");
+ for(i=0;i<256;i++){
+ if((i % 16) == 0)
+ printf("\n");
+ printf("%3d,",rs->index_of[i]);
+ }
+ /* Generator polynomial: nroots+1 = 33 coefficients */
+ printf("\n};\n\nunsigned char CCSDS_poly[] = {");
+ for(i=0;i<33;i++){
+ if((i % 16) == 0)
+ printf("\n");
+ printf("%3d,",rs->genpoly[i]);
+ }
+ printf("\n};\n");
+ exit(0);
diff --git a/gen_ccsds_tal.c b/gen_ccsds_tal.c
new file mode 100644
index 0000000..fc75503
--- /dev/null
+++ b/gen_ccsds_tal.c
@@ -0,0 +1,53 @@
+/* Conversion lookup tables from conventional alpha to Berlekamp's
+ * dual-basis representation. Used in the CCSDS version only.
+ * taltab[] -- convert conventional to dual basis
+ * tal1tab[] -- convert dual basis to conventional
+ * Note: the actual RS encoder/decoder works with the conventional basis.
+ * So data is converted from dual to conventional basis before either
+ * encoding or decoding and then converted back.
+ *
+ * Copyright 2002 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#define DTYPE unsigned char
+DTYPE Taltab[256],Tal1tab[256];
+static DTYPE tal[] = { 0x8d, 0xef, 0xec, 0x86, 0xfa, 0x99, 0xaf, 0x7b };
+/* Generate conversion lookup tables between conventional alpha representation
+ * (@**7, @**6, ...@**0)
+ * and Berlekamp's dual basis representation
+ * (l0, l1, ...l7)
+ *
+ * Fills Taltab[] and its inverse Tal1tab[], then prints both on stdout as
+ * C array definitions, 16 entries per line.
+ */
+int main(){
+ int i,j,k;
+ for(i=0;i<256;i++){/* For each value of input */
+ Taltab[i] = 0;
+ /* For every bit k set in i, XOR bit j of tal[7-k] into Taltab[i] */
+ for(j=0;j<8;j++) /* for each column of matrix */
+ for(k=0;k<8;k++){ /* for each row of matrix */
+ if(i & (1<<k))
+ Taltab[i] ^= tal[7-k] & (1<<j);
+ }
+ /* Record the reverse mapping for the value just computed */
+ Tal1tab[Taltab[i]] = i;
+ }
+ printf("unsigned char Taltab[] = {\n");
+ for(i=0;i<256;i++){
+ if((i % 16) == 0)
+ printf("\n");
+ printf("0x%02x,",Taltab[i]);
+ }
+ printf("\n};\n\nunsigned char Tal1tab[] = {");
+ for(i=0;i<256;i++){
+ if((i % 16) == 0)
+ printf("\n");
+ printf("0x%02x,",Tal1tab[i]);
+ }
+ printf("\n};\n");
+ exit(0);
diff --git a/init_rs.c b/init_rs.c
new file mode 100644
index 0000000..ef1cf47
--- /dev/null
+++ b/init_rs.c
@@ -0,0 +1,39 @@
+/* Initialize a RS codec
+ *
+ * Copyright 2002 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdlib.h>
+#include "fec.h"
+#if !defined(NULL)
+#define NULL ((void *)0)
+#include "rs-common.h"
+/* Release a codec control block and the three tables it owns.
+ * p = handle returned by the matching init routine; NULL is accepted and
+ * ignored, so the result of a failed initialization can be passed safely
+ */
+void free_rs(void *p){
+ struct rs *rs = (struct rs *)p;
+ if(rs == NULL)
+ return; /* nothing was allocated */
+ free(rs->alpha_to);
+ free(rs->index_of);
+ free(rs->genpoly);
+ free(rs);
+/* Initialize a Reed-Solomon codec
+ * symsize = symbol size, bits
+ * gfpoly = Field generator polynomial coefficients
+ * fcr = first root of RS code generator polynomial, index form
+ * prim = primitive element to generate polynomial roots
+ * nroots = RS code generator polynomial degree (number of roots)
+ * pad = padding bytes at front of shortened block
+ *
+ * Returns the new control block, or NULL when a parameter is out of range,
+ * an allocation fails, or gfpoly is not primitive. The body of the function
+ * is the shared code in init_rs.h, which fills in rs.
+ */
+void *init_rs_common(int symsize,int gfpoly,int fcr,int prim,
+ int nroots,int pad){
+ struct rs *rs;
+#include "init_rs.h"
+ return rs;
diff --git a/init_rs.h b/init_rs.h
new file mode 100644
index 0000000..2b2ae98
--- /dev/null
+++ b/init_rs.h
@@ -0,0 +1,106 @@
+/* Common code for initializing a Reed-Solomon control block (char or int symbols)
+ * Copyright 2004 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#undef NULL
+#define NULL ((void *)0)
+ /* Shared body of the init_rs_*() functions. Expects the including function
+ * to provide the parameters symsize, gfpoly, fcr, prim, nroots and pad and
+ * a "struct rs *rs" variable; leaves rs pointing at a fully built control
+ * block, or NULL on any failure.
+ */
+ int i, j, sr,root,iprim;
+ int gcd_a, gcd_b, gcd_t;
+ rs = NULL;
+ /* Check parameter ranges */
+ if(symsize < 0 || symsize > 8*sizeof(data_t)){
+ goto done;
+ }
+ /* (1<<symsize) is evaluated as an int below, so the shift must stay
+ * clear of the sign bit
+ */
+ if(symsize >= (int)(8*sizeof(int)) - 1)
+ goto done;
+ if(fcr < 0 || fcr >= (1<<symsize))
+ goto done;
+ if(prim <= 0 || prim >= (1<<symsize))
+ goto done;
+ if(nroots < 0 || nroots >= (1<<symsize))
+ goto done; /* Can't have more roots than symbol values! */
+ if(pad < 0 || pad >= ((1<<symsize) -1 - nroots))
+ goto done; /* Too much padding */
+ /* prim must be coprime to the block length (1<<symsize)-1; otherwise no
+ * multiple of prim is congruent to 1 and the search for iprim further
+ * down would never terminate
+ */
+ gcd_a = (1<<symsize)-1;
+ gcd_b = prim;
+ while(gcd_b != 0){
+ gcd_t = gcd_a % gcd_b;
+ gcd_a = gcd_b;
+ gcd_b = gcd_t;
+ }
+ if(gcd_a != 1)
+ goto done;
+ rs = (struct rs *)calloc(1,sizeof(struct rs));
+ if(rs == NULL)
+ goto done;
+ rs->mm = symsize;
+ rs->nn = (1<<symsize)-1;
+ rs->pad = pad;
+ rs->alpha_to = (data_t *)malloc(sizeof(data_t)*(rs->nn+1));
+ if(rs->alpha_to == NULL){
+ free(rs);
+ rs = NULL;
+ goto done;
+ }
+ rs->index_of = (data_t *)malloc(sizeof(data_t)*(rs->nn+1));
+ if(rs->index_of == NULL){
+ free(rs->alpha_to);
+ free(rs);
+ rs = NULL;
+ goto done;
+ }
+ /* Generate Galois field lookup tables */
+ rs->index_of[0] = A0; /* log(zero) = -inf */
+ rs->alpha_to[A0] = 0; /* alpha**-inf = 0 */
+ sr = 1;
+ for(i=0;i<rs->nn;i++){
+ rs->index_of[sr] = i;
+ rs->alpha_to[i] = sr;
+ sr <<= 1;
+ if(sr & (1<<symsize))
+ sr ^= gfpoly;
+ sr &= rs->nn;
+ }
+ if(sr != 1){
+ /* field generator polynomial is not primitive! */
+ free(rs->alpha_to);
+ free(rs->index_of);
+ free(rs);
+ rs = NULL;
+ goto done;
+ }
+ /* Form RS code generator polynomial from its roots */
+ rs->genpoly = (data_t *)malloc(sizeof(data_t)*(nroots+1));
+ if(rs->genpoly == NULL){
+ free(rs->alpha_to);
+ free(rs->index_of);
+ free(rs);
+ rs = NULL;
+ goto done;
+ }
+ rs->fcr = fcr;
+ rs->prim = prim;
+ rs->nroots = nroots;
+ /* Find prim-th root of 1, used in decoding */
+ /* (smallest value 1 + k*nn that prim divides; it exists because prim and
+ * nn were checked to be coprime above)
+ */
+ for(iprim=1;(iprim % prim) != 0;iprim += rs->nn)
+ ;
+ rs->iprim = iprim / prim;
+ rs->genpoly[0] = 1;
+ for (i = 0,root=fcr*prim; i < nroots; i++,root += prim) {
+ rs->genpoly[i+1] = 1;
+ /* Multiply rs->genpoly[] by @**(root + x) */
+ for (j = i; j > 0; j--){
+ if (rs->genpoly[j] != 0)
+ rs->genpoly[j] = rs->genpoly[j-1] ^ rs->alpha_to[modnn(rs,rs->index_of[rs->genpoly[j]] + root)];
+ else
+ rs->genpoly[j] = rs->genpoly[j-1];
+ }
+ /* rs->genpoly[0] can never be zero */
+ rs->genpoly[0] = rs->alpha_to[modnn(rs,rs->index_of[rs->genpoly[0]] + root)];
+ }
+ /* convert rs->genpoly[] to index form for quicker encoding */
+ for (i = 0; i <= nroots; i++)
+ rs->genpoly[i] = rs->index_of[rs->genpoly[i]];
+ done:;
diff --git a/init_rs_char.c b/init_rs_char.c
new file mode 100644
index 0000000..a51099a
--- /dev/null
+++ b/init_rs_char.c
@@ -0,0 +1,35 @@
+/* Initialize a RS codec
+ *
+ * Copyright 2002 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdlib.h>
+#include "char.h"
+#include "rs-common.h"
+/* Release a codec control block created by init_rs_char() together with
+ * the three tables it owns.
+ * p = handle returned by init_rs_char(); NULL is accepted and ignored, so
+ * the result of a failed initialization can be passed safely
+ */
+void free_rs_char(void *p){
+ struct rs *rs = (struct rs *)p;
+ if(rs == NULL)
+ return; /* nothing was allocated */
+ free(rs->alpha_to);
+ free(rs->index_of);
+ free(rs->genpoly);
+ free(rs);
+/* Initialize a Reed-Solomon codec
+ * symsize = symbol size, bits
+ * gfpoly = Field generator polynomial coefficients
+ * fcr = first root of RS code generator polynomial, index form
+ * prim = primitive element to generate polynomial roots
+ * nroots = RS code generator polynomial degree (number of roots)
+ * pad = padding bytes at front of shortened block
+ *
+ * Returns the new control block, or NULL when a parameter is out of range,
+ * an allocation fails, or gfpoly is not primitive. The body of the function
+ * is the shared code in init_rs.h, which fills in rs.
+ */
+void *init_rs_char(int symsize,int gfpoly,int fcr,int prim,
+ int nroots,int pad){
+ struct rs *rs;
+#include "init_rs.h"
+ return rs;
diff --git a/init_rs_int.c b/init_rs_int.c
new file mode 100644
index 0000000..a6036c2
--- /dev/null
+++ b/init_rs_int.c
@@ -0,0 +1,35 @@
+/* Initialize a RS codec
+ *
+ * Copyright 2002 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdlib.h>
+#include "int.h"
+#include "rs-common.h"
+/* Release a codec control block created by init_rs_int() together with
+ * the three tables it owns.
+ * p = handle returned by init_rs_int(); NULL is accepted and ignored, so
+ * the result of a failed initialization can be passed safely
+ */
+void free_rs_int(void *p){
+ struct rs *rs = (struct rs *)p;
+ if(rs == NULL)
+ return; /* nothing was allocated */
+ free(rs->alpha_to);
+ free(rs->index_of);
+ free(rs->genpoly);
+ free(rs);
+/* Initialize a Reed-Solomon codec
+ * symsize = symbol size, bits
+ * gfpoly = Field generator polynomial coefficients
+ * fcr = first root of RS code generator polynomial, index form
+ * prim = primitive element to generate polynomial roots
+ * nroots = RS code generator polynomial degree (number of roots)
+ * pad = padding bytes at front of shortened block
+ *
+ * Returns the new control block, or NULL when a parameter is out of range,
+ * an allocation fails, or gfpoly is not primitive. The body of the function
+ * is the shared code in init_rs.h, which fills in rs.
+ */
+void *init_rs_int(int symsize,int gfpoly,int fcr,int prim,
+ int nroots,int pad){
+ struct rs *rs;
+#include "init_rs.h"
+ return rs;
diff --git a/int.h b/int.h
new file mode 100644
index 0000000..46e865d
--- /dev/null
+++ b/int.h
@@ -0,0 +1,22 @@
+/* Stuff specific to the general (integer) version of the Reed-Solomon codecs
+ *
+ * Copyright 2003, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+typedef unsigned int data_t;
+#define MODNN(x) modnn(rs,x)
+#define MM (rs->mm)
+#define NN (rs->nn)
+#define ALPHA_TO (rs->alpha_to)
+#define INDEX_OF (rs->index_of)
+#define GENPOLY (rs->genpoly)
+#define NROOTS (rs->nroots)
+#define FCR (rs->fcr)
+#define PRIM (rs->prim)
+#define IPRIM (rs->iprim)
+#define PAD (rs->pad)
+#define A0 (NN)
diff --git a/lesser.txt b/lesser.txt
new file mode 100644
index 0000000..b1e3f5a
--- /dev/null
+++ b/lesser.txt
@@ -0,0 +1,504 @@
+ GNU LESSER GENERAL PUBLIC LICENSE
+ Version 2.1, February 1999
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+[This is the first released version of the Lesser GPL. It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+ Preamble
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+ This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it. You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations below.
+ When we speak of free software, we are referring to freedom of use,
+not price. Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+ To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights. These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+ For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you. You must make sure that they, too, receive or can get the source
+code. If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it. And you must show them these terms so they know their rights.
+ We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+ To protect each distributor, we want to make it very clear that
+there is no warranty for the free library. Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+ Finally, software patents pose a constant threat to the existence of
+any free program. We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder. Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+ Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License. This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License. We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+ When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library. The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom. The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+ We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License. It also provides other free software developers Less
+of an advantage over competing non-free programs. These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries. However, the Lesser license provides advantages in certain
+special circumstances.
+ For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it becomes
+a de-facto standard. To achieve this, non-free programs must be
+allowed to use the library. A more frequent case is that a free
+library does the same job as widely used non-free libraries. In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+ In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software. For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+ Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+ The precise terms and conditions for copying, distribution and
+modification follow. Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library". The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+ 0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+ A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+ The "Library", below, refers to any such software library or work
+which has been distributed under these terms. A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language. (Hereinafter, translation is
+included without limitation in the term "modification".)
+ "Source code" for a work means the preferred form of the work for
+making modifications to it. For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control compilation
+and installation of the library.
+ Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it). Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+ 1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+ You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+ 2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+ a) The modified work must itself be a software library.
+ b) You must cause the files modified to carry prominent notices
+ stating that you changed the files and the date of any change.
+ c) You must cause the whole of the work to be licensed at no
+ charge to all third parties under the terms of this License.
+ d) If a facility in the modified Library refers to a function or a
+ table of data to be supplied by an application program that uses
+ the facility, other than as an argument passed when the facility
+ is invoked, then you must make a good faith effort to ensure that,
+ in the event an application does not supply such function or
+ table, the facility still operates, and performs whatever part of
+ its purpose remains meaningful.
+ (For example, a function in a library to compute square roots has
+ a purpose that is entirely well-defined independent of the
+ application. Therefore, Subsection 2d requires that any
+ application-supplied function or table used by this function must
+ be optional: if the application does not supply it, the square
+ root function must still compute square roots.)
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+ 3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library. To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License. (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.) Do not make any other change in
+these notices.
+ Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+ This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+ 4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+ If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+ 5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library". Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+ However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library". The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+ When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library. The
+threshold for this to be true is not precisely defined by law.
+ If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work. (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+ Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+ 6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+ You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License. You must supply a copy of this License. If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License. Also, you must do one
+of these things:
+ a) Accompany the work with the complete corresponding
+ machine-readable source code for the Library including whatever
+ changes were used in the work (which must be distributed under
+ Sections 1 and 2 above); and, if the work is an executable linked
+ with the Library, with the complete machine-readable "work that
+ uses the Library", as object code and/or source code, so that the
+ user can modify the Library and then relink to produce a modified
+ executable containing the modified Library. (It is understood
+ that the user who changes the contents of definitions files in the
+ Library will not necessarily be able to recompile the application
+ to use the modified definitions.)
+ b) Use a suitable shared library mechanism for linking with the
+ Library. A suitable mechanism is one that (1) uses at run time a
+ copy of the library already present on the user's computer system,
+ rather than copying library functions into the executable, and (2)
+ will operate properly with a modified version of the library, if
+ the user installs one, as long as the modified version is
+ interface-compatible with the version that the work was made with.
+ c) Accompany the work with a written offer, valid for at
+ least three years, to give the same user the materials
+ specified in Subsection 6a, above, for a charge no more
+ than the cost of performing this distribution.
+ d) If distribution of the work is made by offering access to copy
+ from a designated place, offer equivalent access to copy the above
+ specified materials from the same place.
+ e) Verify that the user has already received a copy of these
+ materials or that you have already sent this user a copy.
+ For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it. However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+ It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system. Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+ 7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+ a) Accompany the combined library with a copy of the same work
+ based on the Library, uncombined with any other library
+ facilities. This must be distributed under the terms of the
+ Sections above.
+ b) Give prominent notice with the combined library of the fact
+ that part of it is a work based on the Library, and explaining
+ where to find the accompanying uncombined form of the same work.
+ 8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License. Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License. However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+ 9. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Library or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+ 10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+ 11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all. For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+If any portion of this section is held invalid or unenforceable under any
+particular circumstance, the balance of the section is intended to apply,
+and the section as a whole is intended to apply in other circumstances.
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+ 12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License may add
+an explicit geographical distribution limitation excluding those countries,
+so that distribution is permitted only in or among countries not thus
+excluded. In such case, this License incorporates the limitation as if
+written in the body of this License.
+ 13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+Each version is given a distinguishing version number. If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation. If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+ 14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission. For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this. Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
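+and reuse of software generally.
+ NO WARRANTY
+ 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+ 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+ END OF TERMS AND CONDITIONS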
+ How to Apply These Terms to Your New Libraries
+ If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change. You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms of the
+ordinary General Public License).
+ To apply these terms, attach the following notices to the library. It is
+safest to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least the
+"copyright" line and a pointer to where the full notice is found.
+ <one line to give the library's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+Also add information on how to contact you by electronic and paper mail.
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the library, if
+necessary. Here is a sample; alter the names:
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the
+ library `Frob' (a library for tweaking knobs) written by James Random Hacker.
+ <signature of Ty Coon>, 1 April 1990
+ Ty Coon, President of Vice
+That's all there is to it!
diff --git a/makefile.in b/makefile.in
new file mode 100644
index 0000000..53fdfcb
--- /dev/null
+++ b/makefile.in
@@ -0,0 +1,242 @@
+# Makefile prototype for configure
+# Copyright 2004 Phil Karn, KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+# @configure_input@
+# The @...@ placeholders are substituted when configure turns this prototype
+# into the real makefile. VPATH makes make search the source directory too.
+srcdir = @srcdir@
+prefix = @prefix@
+VPATH = @srcdir@
+# Object files that go into the library: whatever configure substitutes for
+# @MLIBS@ (presumably the machine-specific objects - confirm in configure)
+# followed by the objects listed here.
+LIBS=@MLIBS@ fec.o sim.o viterbi27.o viterbi27_port.o viterbi29.o viterbi29_port.o \
+ viterbi39.o viterbi39_port.o \
+ viterbi615.o viterbi615_port.o encode_rs_char.o encode_rs_int.o encode_rs_8.o \
+ decode_rs_char.o decode_rs_int.o decode_rs_8.o \
+ init_rs_char.o init_rs_int.o ccsds_tab.o \
+ encode_rs_ccsds.o decode_rs_ccsds.o ccsds_tal.o \
+ dotprod.o dotprod_port.o \
+ peakval.o peakval_port.o \
+ sumsq.o sumsq_port.o
+# Default target: the static library plus whatever $(SHARED_LIB) names.
+all: libfec.a $(SHARED_LIB)
+
+# None of these targets names a real output file. Declaring them phony keeps
+# a stray file called e.g. "test" or "clean" from suppressing the recipe.
+.PHONY: all test install clean distclean
+# Build every test program, then run them twice over: first with explicit
+# arguments (Viterbi tests get -e, -n and -v) as correctness checks, then the
+# four Viterbi tests again with no arguments as speed tests.
+test: vtest27 vtest29 vtest39 vtest615 rstest dtest sumsq_test peaktest
+ @echo "Correctness tests:"
+ ./vtest27 -e 3.0 -n 1000 -v
+ ./vtest29 -e 2.5 -n 1000 -v
+ ./vtest39 -e 2.5 -n 1000 -v
+ ./vtest615 -e 1.0 -n 100 -v
+ ./rstest
+ ./dtest
+ ./sumsq_test
+ ./peaktest
+ @echo "Speed tests:"
+ ./vtest27
+ ./vtest29
+ ./vtest39
+ ./vtest615
+# Install the libraries, the public header and the manual pages.
+# $(DESTDIR) is empty unless set on the command line, so a plain
+# "make install" still writes to the configured directories; packagers can
+# set it to stage the install under a temporary root.
+install: all
+	mkdir -p $(DESTDIR)@libdir@
+	install -m 644 -p $(SHARED_LIB) libfec.a $(DESTDIR)@libdir@
+# (cd @libdir@;ln -f -s $(SHARED_LIB) libfec.so)
+	mkdir -p $(DESTDIR)@includedir@
+	install -m 644 -p fec.h $(DESTDIR)@includedir@
+	mkdir -m 0755 -p $(DESTDIR)@mandir@/man3
+	install -m 644 -p simd-viterbi.3 rs.3 dsp.3 $(DESTDIR)@mandir@/man3
+# Test and benchmark programs. Each links its own object against the static
+# library with debug info; the rules ending in -lm also link the math library.
+peaktest: peaktest.o libfec.a
+ gcc -g -o $@ $^
+sumsq_test: sumsq_test.o libfec.a
+ gcc -g -o $@ $^
+dtest: dtest.o libfec.a
+ gcc -g -o $@ $^ -lm
+vtest27: vtest27.o libfec.a
+ gcc -g -o $@ $^ -lm
+vtest29: vtest29.o libfec.a
+ gcc -g -o $@ $^ -lm
+vtest39: vtest39.o libfec.a
+ gcc -g -o $@ $^ -lm
+vtest615: vtest615.o libfec.a
+ gcc -g -o $@ $^ -lm
+rstest: rstest.o libfec.a
+ gcc -g -o $@ $^
+# Not part of the "test" target; built only when requested by name.
+rs_speedtest: rs_speedtest.o libfec.a
+ gcc -g -o $@ $^
+# for some reason, the test programs without args segfault on the PPC with -O2 optimization. Dunno why - compiler bug?
+# Hence these four objects are compiled with -g only and do not use $(CFLAGS).
+vtest27.o: vtest27.c fec.h
+ gcc -g -c $<
+vtest29.o: vtest29.c fec.h
+ gcc -g -c $<
+vtest39.o: vtest39.c fec.h
+ gcc -g -c $<
+vtest615.o: vtest615.c fec.h
+ gcc -g -c $<
+# Static library: archive every object in $(LIBS), then index the archive.
+libfec.a: $(LIBS)
+ ar rv $@ $^
+ ranlib libfec.a
+# for Darwin
+# The install name recorded in the dylib is the target name itself.
+libfec.dylib: $(LIBS)
+ $(CC) -dynamiclib -install_name $@ -o $@ $^
+# for Linux et al
+# The soname is the target name itself; -lc links the C library explicitly.
+libfec.so: $(LIBS)
+ gcc -shared -Xlinker -soname=$@ -o $@ -Wl,-whole-archive $^ -Wl,-no-whole-archive -lc
+# Header dependencies only: these rules carry no recipe of their own, so they
+# just add prerequisites that force a rebuild when a listed header changes.
+dotprod.o: dotprod.c fec.h
+dotprod_port.o: dotprod_port.c fec.h
+viterbi27.o: viterbi27.c fec.h
+viterbi27_port.o: viterbi27_port.c fec.h
+viterbi29.o: viterbi29.c fec.h
+viterbi39.o: viterbi39.c fec.h
+viterbi39_port.o: viterbi39_port.c fec.h
+viterbi39_sse2.o: viterbi39_sse2.c fec.h
+viterbi39_sse.o: viterbi39_sse.c fec.h
+viterbi39_mmx.o: viterbi39_mmx.c fec.h
+# Reed-Solomon objects: the char/int variants share rs-common.h and differ in
+# char.h versus int.h; the _8 and _av variants depend on fixed.h instead.
+encode_rs_char.o: encode_rs_char.c char.h rs-common.h
+encode_rs_int.o: encode_rs_int.c int.h rs-common.h
+encode_rs_8.o: encode_rs_8.c fixed.h
+encode_rs_av.o: encode_rs_av.c fixed.h
+decode_rs_char.o: decode_rs_char.c char.h rs-common.h
+decode_rs_int.o: decode_rs_int.c int.h rs-common.h
+decode_rs_8.o: decode_rs_8.c fixed.h
+init_rs_char.o: init_rs_char.c char.h rs-common.h
+init_rs_int.o: init_rs_int.c int.h rs-common.h
+# ccsds_tab.c is a generated source: gen_ccsds is built first and its
+# standard output becomes the file.
+ccsds_tab.o: ccsds_tab.c
+ccsds_tab.c: gen_ccsds
+ ./gen_ccsds > ccsds_tab.c
+gen_ccsds: gen_ccsds.o init_rs_char.o
+ gcc -o $@ $^
+gen_ccsds.o: gen_ccsds.c
+ gcc $(CFLAGS) -c -o $@ $<
+# ccsds_tal.c is generated the same way from gen_ccsds_tal's output.
+# NOTE(review): no explicit rule for gen_ccsds_tal is visible here; it
+# presumably relies on make's built-in rules - confirm.
+ccsds_tal.o: ccsds_tal.c
+ccsds_tal.c: gen_ccsds_tal
+ ./gen_ccsds_tal > ccsds_tal.c
+# exercise.c is compiled four times; the -D flag (or its absence) selects
+# which variant each object is built for.
+exercise_char.o: exercise.c
+ gcc $(CFLAGS) -c -o $@ $<
+exercise_int.o: exercise.c
+ gcc -DBIGSYM=1 $(CFLAGS) -c -o $@ $<
+exercise_8.o: exercise.c
+ gcc -DFIXED=1 $(CFLAGS) -c -o $@ $<
+exercise_ccsds.o: exercise.c
+ gcc -DCCSDS=1 $(CFLAGS) -c -o $@ $<
+# Per-decoder object rules. The _mmx/_sse/_sse2 variants have an explicit
+# recipe that adds the matching -mmmx/-msse/-msse2 flag to $(CFLAGS); the
+# other lines only declare prerequisites.
+# NOTE(review): several prerequisite-only lines here repeat ones that appear
+# earlier in this makefile.
+viterbi27.o: viterbi27.c fec.h
+viterbi27_port.o: viterbi27_port.c fec.h
+viterbi27_av.o: viterbi27_av.c fec.h
+viterbi27_mmx.o: viterbi27_mmx.c fec.h
+ gcc $(CFLAGS) -mmmx -c -o $@ $<
+viterbi27_sse.o: viterbi27_sse.c fec.h
+ gcc $(CFLAGS) -msse -c -o $@ $<
+viterbi27_sse2.o: viterbi27_sse2.c fec.h
+ gcc $(CFLAGS) -msse2 -c -o $@ $<
+viterbi29.o: viterbi29.c fec.h
+viterbi29_port.o: viterbi29_port.c fec.h
+viterbi29_av.o: viterbi29_av.c fec.h
+viterbi29_mmx.o: viterbi29_mmx.c fec.h
+ gcc $(CFLAGS) -mmmx -c -o $@ $<
+viterbi29_sse.o: viterbi29_sse.c fec.h
+ gcc $(CFLAGS) -msse -c -o $@ $<
+viterbi29_sse2.o: viterbi29_sse2.c fec.h
+ gcc $(CFLAGS) -msse2 -c -o $@ $<
+viterbi39.o: viterbi39.c fec.h
+viterbi39_port.o: viterbi39_port.c fec.h
+viterbi39_av.o: viterbi39_av.c fec.h
+viterbi39_mmx.o: viterbi39_mmx.c fec.h
+ gcc $(CFLAGS) -mmmx -c -o $@ $<
+viterbi39_sse.o: viterbi39_sse.c fec.h
+ gcc $(CFLAGS) -msse -c -o $@ $<
+viterbi39_sse2.o: viterbi39_sse2.c fec.h
+ gcc $(CFLAGS) -msse2 -c -o $@ $<
+viterbi615.o: viterbi615.c fec.h
+viterbi615_port.o: viterbi615_port.c fec.h
+viterbi615_av.o: viterbi615_av.c fec.h
+viterbi615_mmx.o: viterbi615_mmx.c fec.h
+ gcc $(CFLAGS) -mmmx -c -o $@ $<
+viterbi615_sse.o: viterbi615_sse.c fec.h
+ gcc $(CFLAGS) -msse -c -o $@ $<
+viterbi615_sse2.o: viterbi615_sse2.c fec.h
+ gcc $(CFLAGS) -msse2 -c -o $@ $<
+cpu_mode_x86.o: cpu_mode_x86.c fec.h
+cpu_mode_ppc.o: cpu_mode_ppc.c fec.h
+# Remove everything the build produces: objects, libraries, test programs,
+# the generated CCSDS tables and their generators, and autoconf's cache.
+# This must be its own target because distclean depends on it.
+clean:
+	rm -f *.o $(SHARED_LIB) *.a rs_speedtest peaktest sumsq_test dtest vtest27 vtest29 vtest39 vtest615 rstest ccsds_tab.c ccsds_tal.c gen_ccsds gen_ccsds_tal core
+	rm -rf autom4te.cache
+
+# Everything clean removes, plus the config.* files and the makefile itself.
+distclean: clean
+	rm -f config.log config.cache config.status config.h makefile
diff --git a/mmxbfly27.s b/mmxbfly27.s
new file mode 100644
index 0000000..4abbf48
--- /dev/null
+++ b/mmxbfly27.s
@@ -0,0 +1,148 @@
+/* Intel SIMD MMX implementation of Viterbi ACS butterflies
+ for 64-state (k=7) convolutional code
+ Copyright 2004 Phil Karn, KA9Q
+ This code may be used under the terms of the GNU Lesser General Public License (LGPL)
+ int update_viterbi27_blk_mmx(struct v27 *vp,unsigned char *syms,int nbits) ;
+ Consumes two symbol bytes from syms per decoded bit, nbits times.
+ Returns 0 on success, -1 if vp is NULL.
+*/
+ # MMX (64-bit SIMD) version
+ # requires Pentium-MMX, Pentium-II or better
+ # These are offsets into struct v27, defined in viterbi27_mmx.c
+ .set DP,128
+ .set OLDMETRICS,132
+ .set NEWMETRICS,136
+ .text
+ .global update_viterbi27_blk_mmx,Mettab27_1,Mettab27_2
+ .type update_viterbi27_blk_mmx,@function
+ .align 16
+update_viterbi27_blk_mmx:
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %esi
+ pushl %edi
+ pushl %edx
+ pushl %ebx
+ movl 8(%ebp),%edx # edx = vp
+ testl %edx,%edx
+ jnz 0f
+ movl $-1,%eax # immediate -1 (without '$' this would load from address -1)
+ jmp err
+0: movl OLDMETRICS(%edx),%esi # esi -> old metrics
+ movl NEWMETRICS(%edx),%edi # edi -> new metrics
+ movl DP(%edx),%edx # edx -> decisions
+1: movl 16(%ebp),%eax # eax = nbits
+ decl %eax
+ jl 2f # passed zero, we're done
+ movl %eax,16(%ebp)
+ movl 12(%ebp),%ebx # ebx = syms
+ movw (%ebx),%ax # ax = second symbol : first symbol
+ addl $2,%ebx
+ movl %ebx,12(%ebp)
+ movb %ah,%bl
+ andl $255,%eax
+ andl $255,%ebx
+ # shift into first array index dimension slot
+ shll $5,%eax
+ shll $5,%ebx
+ # each invocation of this macro will do 8 butterflies in parallel
+ .MACRO butterfly GROUP
+ # Compute branch metrics
+ movq (Mettab27_1+8*\GROUP)(%eax),%mm3
+ movq fifteens,%mm0
+ paddb (Mettab27_2+8*\GROUP)(%ebx),%mm3
+ paddb ones,%mm3 # emulate pavgb - this may not be necessary
+ psrlq $1,%mm3
+ pand %mm0,%mm3
+ movq (8*\GROUP)(%esi),%mm6 # Incoming path metric, high bit = 0
+ movq ((8*\GROUP)+32)(%esi),%mm2 # Incoming path metric, high bit = 1
+ movq %mm6,%mm1
+ movq %mm2,%mm7
+ paddb %mm3,%mm6
+ paddb %mm3,%mm2
+ pxor %mm0,%mm3 # invert branch metric
+ paddb %mm3,%mm7 # path metric for inverted symbols
+ paddb %mm3,%mm1
+ # live registers 1 2 6 7
+ # Compare mm6 and mm7; mm1 and mm2
+ pxor %mm3,%mm3
+ movq %mm6,%mm4
+ movq %mm1,%mm5
+ psubb %mm7,%mm4 # mm4 = mm6 - mm7
+ psubb %mm2,%mm5 # mm5 = mm1 - mm2
+ pcmpgtb %mm3,%mm4 # mm4 = first set of decisions (ff = 1 better)
+ pcmpgtb %mm3,%mm5 # mm5 = second set of decisions
+ # live registers 1 2 4 5 6 7
+ # select survivors
+ movq %mm4,%mm0
+ pand %mm4,%mm7
+ movq %mm5,%mm3
+ pand %mm5,%mm2
+ pandn %mm6,%mm0
+ pandn %mm1,%mm3
+ por %mm0,%mm7 # mm7 = first set of survivors
+ por %mm3,%mm2 # mm2 = second set of survivors
+ # live registers 2 4 5 7
+ # interleave & store decisions in mm4, mm5
+ # interleave & store new branch metrics in mm2, mm7
+ movq %mm4,%mm3
+ movq %mm7,%mm0
+ punpckhbw %mm5,%mm4
+ punpcklbw %mm5,%mm3
+ punpcklbw %mm2,%mm7 # interleave second 8 new metrics
+ punpckhbw %mm2,%mm0 # interleave first 8 new metrics
+ movq %mm4,(16*\GROUP+8)(%edx)
+ movq %mm3,(16*\GROUP)(%edx)
+ movq %mm7,(16*\GROUP)(%edi)
+ movq %mm0,(16*\GROUP+8)(%edi)
+ .endm
+# invoke macro 4 times for a total of 32 butterflies
+ butterfly GROUP=0
+ butterfly GROUP=1
+ butterfly GROUP=2
+ butterfly GROUP=3
+ addl $64,%edx # bump decision pointer
+ # swap metrics
+ movl %esi,%eax
+ movl %edi,%esi
+ movl %eax,%edi
+ jmp 1b
+2: emms
+ movl 8(%ebp),%ebx # ebx = vp
+ # stash metric pointers
+ movl %esi,OLDMETRICS(%ebx)
+ movl %edi,NEWMETRICS(%ebx)
+ movl %edx,DP(%ebx) # stash incremented value of vp->dp
+ xorl %eax,%eax
+err: popl %ebx
+ popl %edx
+ popl %edi
+ popl %esi
+ popl %ebp
+ ret
+ .data
+ .align 8
+fifteens: .byte 15,15,15,15,15,15,15,15 # mask referenced by the butterfly macro
+ .align 8
+ones: .byte 1,1,1,1,1,1,1,1
diff --git a/mmxbfly29.s b/mmxbfly29.s
new file mode 100644
index 0000000..e37cab8
--- /dev/null
+++ b/mmxbfly29.s
@@ -0,0 +1,161 @@
+/* Intel SIMD MMX implementation of Viterbi ACS butterflies
+ for 256-state (k=9) convolutional code
+ Copyright 2004 Phil Karn, KA9Q
+ This code may be used under the terms of the GNU Lesser General Public License (LGPL)
+ void update_viterbi29_blk_mmx(struct v29 *vp,unsigned char *syms,int nbits);
+ Consumes two symbol bytes from syms per decoded bit, nbits times.
+ %eax is left at 0 on the normal path and -1 when vp is NULL.
+*/
+ # These are offsets into struct v29, defined in viterbi29.h
+ .set DP,512
+ .set OLDMETRICS,516
+ .set NEWMETRICS,520
+ .text
+ .global update_viterbi29_blk_mmx,Mettab29_1,Mettab29_2
+ .type update_viterbi29_blk_mmx,@function
+ .align 16
+ # MMX (64-bit SIMD) version
+ # requires Pentium-MMX, Pentium-II or better
+update_viterbi29_blk_mmx:
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %esi
+ pushl %edi
+ pushl %edx
+ pushl %ebx
+ movl 8(%ebp),%edx # edx = vp
+ testl %edx,%edx
+ jnz 0f
+ movl $-1,%eax # immediate -1 (without '$' this would load from address -1)
+ jmp err
+0: movl OLDMETRICS(%edx),%esi # esi -> old metrics
+ movl NEWMETRICS(%edx),%edi # edi -> new metrics
+ movl DP(%edx),%edx # edx -> decisions
+1: movl 16(%ebp),%eax # eax = nbits
+ decl %eax
+ jl 2f # passed zero, we're done
+ movl %eax,16(%ebp)
+ movl 12(%ebp),%ebx # ebx = syms
+ movw (%ebx),%ax # ax = second symbol : first symbol
+ addl $2,%ebx
+ movl %ebx,12(%ebp)
+ movb %ah,%bl
+ andl $255,%eax
+ andl $255,%ebx
+ # shift into first array index dimension slot
+ shll $7,%eax
+ shll $7,%ebx
+ # each invocation of this macro will do 8 butterflies in parallel
+ .MACRO butterfly GROUP
+ # Compute branch metrics
+ movq (Mettab29_1+8*\GROUP)(%eax),%mm3
+ movq fifteens,%mm0
+ paddb (Mettab29_2+8*\GROUP)(%ebx),%mm3
+ paddb ones,%mm3 # emulate pavgb - this may not be necessary
+ psrlq $1,%mm3
+ pand %mm0,%mm3
+ movq (8*\GROUP)(%esi),%mm6 # Incoming path metric, high bit = 0
+ movq ((8*\GROUP)+128)(%esi),%mm2 # Incoming path metric, high bit = 1
+ movq %mm6,%mm1
+ movq %mm2,%mm7
+ paddb %mm3,%mm6
+ paddb %mm3,%mm2
+ pxor %mm0,%mm3 # invert branch metric
+ paddb %mm3,%mm7 # path metric for inverted symbols
+ paddb %mm3,%mm1
+ # live registers 1 2 6 7
+ # Compare mm6 and mm7; mm1 and mm2
+ pxor %mm3,%mm3
+ movq %mm6,%mm4
+ movq %mm1,%mm5
+ psubb %mm7,%mm4 # mm4 = mm6 - mm7
+ psubb %mm2,%mm5 # mm5 = mm1 - mm2
+ pcmpgtb %mm3,%mm4 # mm4 = first set of decisions (ff = 1 better)
+ pcmpgtb %mm3,%mm5 # mm5 = second set of decisions
+ # live registers 1 2 4 5 6 7
+ # select survivors
+ movq %mm4,%mm0
+ pand %mm4,%mm7
+ movq %mm5,%mm3
+ pand %mm5,%mm2
+ pandn %mm6,%mm0
+ pandn %mm1,%mm3
+ por %mm0,%mm7 # mm7 = first set of survivors
+ por %mm3,%mm2 # mm2 = second set of survivors
+ # live registers 2 4 5 7
+ # interleave & store decisions in mm4, mm5
+ # interleave & store new branch metrics in mm2, mm7
+ movq %mm4,%mm3
+ movq %mm7,%mm0
+ punpckhbw %mm5,%mm4
+ punpcklbw %mm5,%mm3
+ punpcklbw %mm2,%mm7 # interleave second 8 new metrics
+ punpckhbw %mm2,%mm0 # interleave first 8 new metrics
+ movq %mm4,(16*\GROUP+8)(%edx)
+ movq %mm3,(16*\GROUP)(%edx)
+ movq %mm7,(16*\GROUP)(%edi)
+ movq %mm0,(16*\GROUP+8)(%edi)
+ .endm
+# invoke macro 16 times for a total of 128 butterflies
+ butterfly GROUP=0
+ butterfly GROUP=1
+ butterfly GROUP=2
+ butterfly GROUP=3
+ butterfly GROUP=4
+ butterfly GROUP=5
+ butterfly GROUP=6
+ butterfly GROUP=7
+ butterfly GROUP=8
+ butterfly GROUP=9
+ butterfly GROUP=10
+ butterfly GROUP=11
+ butterfly GROUP=12
+ butterfly GROUP=13
+ butterfly GROUP=14
+ butterfly GROUP=15
+ addl $256,%edx # bump decision pointer
+ # swap metrics
+ movl %esi,%eax
+ movl %edi,%esi
+ movl %eax,%edi
+ jmp 1b
+2: emms
+ movl 8(%ebp),%ebx # ebx = vp
+ # stash metric pointers
+ movl %esi,OLDMETRICS(%ebx)
+ movl %edi,NEWMETRICS(%ebx)
+ movl %edx,DP(%ebx) # stash incremented value of vp->dp
+ xorl %eax,%eax
+err: popl %ebx
+ popl %edx
+ popl %edi
+ popl %esi
+ popl %ebp
+ ret
+ .data
+ .align 8
+fifteens: .byte 15,15,15,15,15,15,15,15 # mask referenced by the butterfly macro
+ .align 8
+ones: .byte 1,1,1,1,1,1,1,1
diff --git a/peak_mmx_assist.s b/peak_mmx_assist.s
new file mode 100644
index 0000000..dae831f
--- /dev/null
+++ b/peak_mmx_assist.s
@@ -0,0 +1,70 @@
+# MMX assist routines for peakval
+# Copyright 2001 Phil Karn, KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+ .text
+# Find peak value in signed 16-bit input samples
+# int peakval_mmx(signed short *in,int cnt);
+# Works on groups of four samples; a trailing group of fewer than four
+# is not examined.  Returns the largest absolute value seen in %eax.
+ .global peakval_mmx
+ .type peakval_mmx,@function
+ .align 16
+peakval_mmx:
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %esi
+ pushl %ecx
+ pushl %ebx
+ movl 8(%ebp),%esi
+ movl 12(%ebp),%ecx
+ pxor %mm7,%mm7 # clear peak
+1: subl $4,%ecx
+ jl 2f
+ movq (%esi),%mm0
+ movq %mm0,%mm1
+ psraw $15,%mm1 # mm1 = 1's if negative, 0's if positive
+ pxor %mm1,%mm0 # complement negatives
+ psubw %mm1,%mm0 # add 1 to negatives
+ movq %mm7,%mm6 # copy previous peak
+ pcmpgtw %mm0,%mm6 # ff == old peak greater
+ pand %mm6,%mm7 # select old peaks that are greater
+ pandn %mm0,%mm6 # select new values that are greater
+ por %mm6,%mm7
+ addl $8,%esi
+ jmp 1b
+ # reduce the four per-lane peaks in mm7 to a single value in eax
+2: movd %mm7,%eax
+ psrlq $16,%mm7
+ andl $0xffff,%eax
+ movd %mm7,%edx
+ psrlq $16,%mm7
+ andl $0xffff,%edx
+ cmpl %edx,%eax
+ jnl 3f
+ movl %edx,%eax
+3:
+ movd %mm7,%edx
+ psrlq $16,%mm7
+ andl $0xffff,%edx
+ cmpl %edx,%eax
+ jnl 4f
+ movl %edx,%eax
+4:
+ movd %mm7,%edx
+ andl $0xffff,%edx
+ cmpl %edx,%eax
+ jnl 5f
+ movl %edx,%eax
+5:
+ emms
+ popl %ebx
+ popl %ecx
+ popl %esi
+ popl %ebp
+ ret
diff --git a/peak_sse2_assist.s b/peak_sse2_assist.s
new file mode 100644
index 0000000..1dee3a8
--- /dev/null
+++ b/peak_sse2_assist.s
@@ -0,0 +1,51 @@
+# SSE2 assist routines for peakval
+# Copyright 2001 Phil Karn, KA9Q
+# May be used under the terms of the GNU Public License (GPL)
+ .text
+# Find peak absolute value in signed 16-bit input samples
+# int peakval_sse2(signed short *in,int cnt);
+# Works on groups of eight samples loaded with movaps; a trailing group
+# of fewer than eight is not examined.
+ .global peakval_sse2
+ .type peakval_sse2,@function
+ .align 16
+peakval_sse2:
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %esi
+ pushl %ecx
+ movl 8(%ebp),%esi
+ movl 12(%ebp),%ecx
+ pxor %xmm7,%xmm7 # clear peak
+1: subl $8,%ecx
+ jl 2f
+ movaps (%esi),%xmm0
+ movaps %xmm0,%xmm1
+ psraw $15,%xmm1 # xmm1 = 1's if negative, 0's if positive
+ pxor %xmm1,%xmm0 # complement negatives
+ psubw %xmm1,%xmm0 # add 1 to negatives
+ pmaxsw %xmm0,%xmm7 # store peak
+ addl $16,%esi
+ jmp 1b
+ # fold the eight per-lane peaks: 8 -> 4 -> 2 -> 1
+2: movaps %xmm7,%xmm0
+ psrldq $8,%xmm0
+ pmaxsw %xmm0,%xmm7
+ movaps %xmm7,%xmm0
+ psrlq $32,%xmm0
+ pmaxsw %xmm0,%xmm7
+ movaps %xmm7,%xmm0
+ psrlq $16,%xmm0
+ pmaxsw %xmm0,%xmm7 # max value in low word of %xmm7
+ movd %xmm7,%eax
+ andl $0xffff,%eax
+ popl %ecx
+ popl %esi
+ popl %ebp
+ ret
diff --git a/peak_sse_assist.s b/peak_sse_assist.s
new file mode 100644
index 0000000..ea6fce8
--- /dev/null
+++ b/peak_sse_assist.s
@@ -0,0 +1,49 @@
+# SSE assist routines for peakval
+# Copyright 2001 Phil Karn, KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+ .text
+# Find peak absolute value in signed 16-bit input samples
+# int peakval_sse(signed short *in,int cnt);
+# Works on groups of four samples; a trailing group of fewer than four
+# is not examined.
+ .global peakval_sse
+ .type peakval_sse,@function
+ .align 16
+peakval_sse:
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %esi
+ pushl %ecx
+ movl 8(%ebp),%esi
+ movl 12(%ebp),%ecx
+ pxor %mm7,%mm7 # clear peak
+1: subl $4,%ecx
+ jl 2f
+ movq (%esi),%mm0
+ movq %mm0,%mm1
+ psraw $15,%mm1 # mm1 = 1's if negative, 0's if positive
+ pxor %mm1,%mm0 # complement negatives
+ psubw %mm1,%mm0 # add 1 to negatives
+ pmaxsw %mm0,%mm7 # store peak
+ addl $8,%esi
+ jmp 1b
+ # fold the four per-lane peaks: 4 -> 2 -> 1
+2: movq %mm7,%mm0
+ psrlq $32,%mm0
+ pmaxsw %mm0,%mm7
+ movq %mm7,%mm0
+ psrlq $16,%mm0
+ pmaxsw %mm0,%mm7 # max value in low word of %mm7
+ movd %mm7,%eax
+ andl $0xffff,%eax
+ emms
+ popl %ecx
+ popl %esi
+ popl %ebp
+ ret
diff --git a/peaktest.c b/peaktest.c
new file mode 100644
index 0000000..fa4b280
--- /dev/null
+++ b/peaktest.c
@@ -0,0 +1,38 @@
+/* Verify correctness of the peak routine
+ * Copyright 2004 Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+/* These values should trigger leading/trailing array fragment handling */
+#define NSAMP 200002
+#define OFFSET 1
+int peakval(signed short *,int);
+int peakval_port(signed short *,int);
+/* Fill a buffer with random samples, plant one known large value, and check
+ * that peakval() agrees with the portable reference peakval_port().
+ * Exits with a non-zero status on a mismatch so "make test" can notice.
+ */
+int main(){
+  int i,s;
+  int result,rresult;
+  signed short samples[NSAMP];
+
+  srandom(time(NULL));
+  for(i=0;i<NSAMP;i++){
+    do {
+      s = random() & 0x0fff;
+    } while(s == 0x8000);
+    samples[i] = s;
+  }
+  samples[5] = 25000;
+  /* Starting at OFFSET hands both routines the same sub-array */
+  rresult = peakval_port(&samples[OFFSET],NSAMP-OFFSET);
+  result = peakval(&samples[OFFSET],NSAMP-OFFSET);
+  if(result == rresult){
+    printf("OK\n");
+  } else {
+    printf("peak mismatch: %d != %d\n",result,rresult);
+    exit(EXIT_FAILURE);
+  }
+  exit(0);
+}
diff --git a/peakval.c b/peakval.c
new file mode 100644
index 0000000..811a3a9
--- /dev/null
+++ b/peakval.c
@@ -0,0 +1,39 @@
+/* Switch to appropriate version of peakval routine
+ * Copyright 2004, Phil Karn, KA9Q
+ */
+#include <stdlib.h>
+#include "fec.h"
+int peakval_port(signed short *b,int cnt);
+#ifdef __i386__
+int peakval_mmx(signed short *b,int cnt);
+int peakval_sse(signed short *b,int cnt);
+int peakval_sse2(signed short *b,int cnt);
+#endif
+#ifdef __VEC__
+int peakval_av(signed short *b,int cnt);
+#endif
+/* Return the peak absolute value of the cnt samples at b, dispatching on
+ * Cpu_mode (set up by find_cpu_mode()) to the matching implementation.
+ * Any mode without a case below uses the portable C version.
+ */
+int peakval(signed short *b,int cnt){
+  find_cpu_mode();
+  switch(Cpu_mode){
+  case PORT:
+  default:
+    return peakval_port(b,cnt);
+#ifdef __i386__
+  case MMX:
+    return peakval_mmx(b,cnt);
+  case SSE:
+    return peakval_sse(b,cnt);
+  case SSE2:
+    return peakval_sse2(b,cnt);
+#endif
+#ifdef __VEC__
+  case ALTIVEC:
+    return peakval_av(b,cnt);
+#endif
+  }
+}
diff --git a/peakval_av.c b/peakval_av.c
new file mode 100644
index 0000000..ae54c10
--- /dev/null
+++ b/peakval_av.c
@@ -0,0 +1,61 @@
+/* Return the largest absolute value of a vector of signed shorts
+ * This is the Altivec SIMD version.
+ * Copyright 2004 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include "fec.h"
+/* Largest absolute value among cnt signed shorts at in (AltiVec).
+ * Running per-lane minima and maxima are kept separately and merged at the
+ * end.  Declared int so the definition matches the prototype used by the
+ * peakval() dispatcher.
+ */
+int peakval_av(signed short *in,int cnt){
+  vector signed short x;
+  int pad;
+  union { vector signed char cv; vector signed short hv; signed short s[8]; signed char c[16];} s;
+  vector signed short smallest,largest;
+
+  smallest = (vector signed short)(0);
+  largest = (vector signed short)(0);
+  if((pad = (int)in & 15)!=0){
+    /* Load unaligned leading word */
+    x = vec_perm(vec_ld(0,in),(vector signed short)(0),vec_lvsl(0,in));
+    if(cnt < 8){ /* Shift right to chop stuff beyond end of short block */
+      s.c[15] = (8-cnt)<<4;
+      x = vec_sro(x,s.cv);
+    }
+    smallest = vec_min(smallest,x);
+    largest = vec_max(largest,x);
+    /* pad is in bytes, so pad/2 samples precede in within its 16-byte line */
+    in += 8-pad/2;
+    cnt -= 8-pad/2;
+  }
+  /* Everything is now aligned, rip through most of the block */
+  while(cnt >= 8){
+    x = vec_ld(0,in);
+    smallest = vec_min(smallest,x);
+    largest = vec_max(largest,x);
+    in += 8;
+    cnt -= 8;
+  }
+  /* Handle trailing fragment, if any */
+  if(cnt > 0){
+    x = vec_ld(0,in);
+    s.c[15] = (8-cnt)<<4;
+    x = vec_sro(x,s.cv);
+    smallest = vec_min(smallest,x);
+    largest = vec_max(largest,x);
+  }
+  /* Combine and extract result */
+  largest = vec_max(largest,vec_abs(smallest));
+  s.c[15] = 64; /* Shift right four 16-bit words */
+  largest = vec_max(largest,vec_sro(largest,s.cv));
+  s.c[15] = 32; /* Shift right two 16-bit words */
+  largest = vec_max(largest,vec_sro(largest,s.cv));
+  s.c[15] = 16; /* Shift right one 16-bit word */
+  largest = vec_max(largest,vec_sro(largest,s.cv));
+  s.hv = largest;
+  return s.s[7];
+}
diff --git a/peakval_mmx.c b/peakval_mmx.c
new file mode 100644
index 0000000..436fe88
--- /dev/null
+++ b/peakval_mmx.c
@@ -0,0 +1,34 @@
+/* Wrapper for the MMX version of peakval
+ * Copyright 2004 Phil Karn, KA9Q
+ */
+#include <stdlib.h>
+int peakval_mmx_assist(signed short *,int);
+/* Peak absolute value of cnt signed shorts at b.
+ * Samples before the first 8-byte boundary and the final cnt%4 samples are
+ * scanned here in C; peakval_mmx_assist() covers the aligned middle in
+ * groups of four.
+ */
+int peakval_mmx(signed short *b,int cnt){
+  int peak = 0;
+  int a;
+
+  /* Leading samples, up to the first 8-byte boundary */
+  while(((int)b & 7) != 0 && cnt != 0){
+    a = abs(*b);
+    if(a > peak)
+      peak = a;
+    b++;
+    cnt--;
+  }
+  a = peakval_mmx_assist(b,cnt);
+  if(a > peak)
+    peak = a;
+  /* Skip what the assist routine consumed (a multiple of 4 samples) */
+  b += cnt & ~3;
+  cnt &= 3;
+  while(cnt != 0){
+    a = abs(*b);
+    if(a > peak)
+      peak = a;
+    b++;
+    cnt--;
+  }
+  return peak;
+}
diff --git a/peakval_mmx_assist.s b/peakval_mmx_assist.s
new file mode 100644
index 0000000..553cb79
--- /dev/null
+++ b/peakval_mmx_assist.s
@@ -0,0 +1,70 @@
+# MMX assist routines for peakval
+# Copyright 2001 Phil Karn, KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+ .text
+# Find peak value in signed 16-bit input samples
+# int peakval_mmx_assist(signed short *in,int cnt);
+# Works on groups of four samples; a trailing group of fewer than four
+# is left for the caller.  Returns the largest absolute value seen in %eax.
+ .global peakval_mmx_assist
+ .type peakval_mmx_assist,@function
+ .align 16
+peakval_mmx_assist:
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %esi
+ pushl %ecx
+ pushl %ebx
+ movl 8(%ebp),%esi
+ movl 12(%ebp),%ecx
+ pxor %mm7,%mm7 # clear peak
+1: subl $4,%ecx
+ jl 2f
+ movq (%esi),%mm0
+ movq %mm0,%mm1
+ psraw $15,%mm1 # mm1 = 1's if negative, 0's if positive
+ pxor %mm1,%mm0 # complement negatives
+ psubw %mm1,%mm0 # add 1 to negatives
+ movq %mm7,%mm6 # copy previous peak
+ pcmpgtw %mm0,%mm6 # ff == old peak greater
+ pand %mm6,%mm7 # select old peaks that are greater
+ pandn %mm0,%mm6 # select new values that are greater
+ por %mm6,%mm7
+ addl $8,%esi
+ jmp 1b
+ # reduce the four per-lane peaks in mm7 to a single value in eax
+2: movd %mm7,%eax
+ psrlq $16,%mm7
+ andl $0xffff,%eax
+ movd %mm7,%edx
+ psrlq $16,%mm7
+ andl $0xffff,%edx
+ cmpl %edx,%eax
+ jnl 3f
+ movl %edx,%eax
+3:
+ movd %mm7,%edx
+ psrlq $16,%mm7
+ andl $0xffff,%edx
+ cmpl %edx,%eax
+ jnl 4f
+ movl %edx,%eax
+4:
+ movd %mm7,%edx
+ andl $0xffff,%edx
+ cmpl %edx,%eax
+ jnl 5f
+ movl %edx,%eax
+5:
+ emms
+ popl %ebx
+ popl %ecx
+ popl %esi
+ popl %ebp
+ ret
diff --git a/peakval_port.c b/peakval_port.c
new file mode 100644
index 0000000..07ab316
--- /dev/null
+++ b/peakval_port.c
@@ -0,0 +1,16 @@
+/* Portable C version of peakval
+ * Copyright 2004 Phil Karn, KA9Q
+ */
+#include <stdlib.h>
+#include "fec.h"
+/* Return the largest absolute value among the len samples at b
+ * (0 when len <= 0).
+ */
+int peakval_port(signed short *b,int len){
+  int peak = 0;
+  int a,i;
+
+  for(i=0;i<len;i++){
+    a = abs(b[i]);
+    if(a > peak)
+      peak = a;
+  }
+  return peak;
+}
diff --git a/peakval_sse.c b/peakval_sse.c
new file mode 100644
index 0000000..9868b7f
--- /dev/null
+++ b/peakval_sse.c
@@ -0,0 +1,35 @@
+/* IA-32 SSE version of peakval
+ * Copyright 2004 Phil Karn, KA9Q
+ */
+#include <stdlib.h>
+#include "fec.h"
+int peakval_sse_assist(signed short *,int);
+/* Peak absolute value of cnt signed shorts at b.
+ * Samples before the first 8-byte boundary and the final cnt%4 samples are
+ * scanned here in C; peakval_sse_assist() covers the aligned middle in
+ * groups of four.
+ */
+int peakval_sse(signed short *b,int cnt){
+  int peak = 0;
+  int a;
+
+  /* Leading samples, up to the first 8-byte boundary */
+  while(((int)b & 7) != 0 && cnt != 0){
+    a = abs(*b);
+    if(a > peak)
+      peak = a;
+    b++;
+    cnt--;
+  }
+  a = peakval_sse_assist(b,cnt);
+  if(a > peak)
+    peak = a;
+  /* Skip what the assist routine consumed (a multiple of 4 samples) */
+  b += cnt & ~3;
+  cnt &= 3;
+  while(cnt != 0){
+    a = abs(*b);
+    if(a > peak)
+      peak = a;
+    b++;
+    cnt--;
+  }
+  return peak;
+}
diff --git a/peakval_sse2.c b/peakval_sse2.c
new file mode 100644
index 0000000..79d9059
--- /dev/null
+++ b/peakval_sse2.c
@@ -0,0 +1,34 @@
+/* IA-32 SSE2 version of peakval
+ * Copyright 2004 Phil Karn, KA9Q
+ */
+#include <stdlib.h>
+#include "fec.h"
+int peakval_sse2_assist(signed short *,int);
+/* Peak absolute value of cnt signed shorts at b.
+ * Samples before the first 16-byte boundary and the final cnt%8 samples are
+ * scanned here in C; peakval_sse2_assist() covers the aligned middle in
+ * groups of eight.
+ */
+int peakval_sse2(signed short *b,int cnt){
+  int peak = 0;
+  int a;
+
+  /* Leading samples, up to the first 16-byte boundary */
+  while(((int)b & 15) != 0 && cnt != 0){
+    a = abs(*b);
+    if(a > peak)
+      peak = a;
+    b++;
+    cnt--;
+  }
+  a = peakval_sse2_assist(b,cnt);
+  if(a > peak)
+    peak = a;
+  /* Skip what the assist routine consumed (a multiple of 8 samples) */
+  b += cnt & ~7;
+  cnt &= 7;
+  while(cnt != 0){
+    a = abs(*b);
+    if(a > peak)
+      peak = a;
+    b++;
+    cnt--;
+  }
+  return peak;
+}
diff --git a/peakval_sse2_assist.s b/peakval_sse2_assist.s
new file mode 100644
index 0000000..c7a58e7
--- /dev/null
+++ b/peakval_sse2_assist.s
@@ -0,0 +1,51 @@
+# SSE2 assist routines for peakval
+# Copyright 2001 Phil Karn, KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+ .text
+# Find peak absolute value in signed 16-bit input samples
+# int peakval_sse2_assist(signed short *in,int cnt);
+# Works on groups of eight samples loaded with movaps; a trailing group
+# of fewer than eight is left for the caller.
+ .global peakval_sse2_assist
+ .type peakval_sse2_assist,@function
+ .align 16
+peakval_sse2_assist:
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %esi
+ pushl %ecx
+ movl 8(%ebp),%esi
+ movl 12(%ebp),%ecx
+ pxor %xmm7,%xmm7 # clear peak
+1: subl $8,%ecx
+ jl 2f
+ movaps (%esi),%xmm0
+ movaps %xmm0,%xmm1
+ psraw $15,%xmm1 # xmm1 = 1's if negative, 0's if positive
+ pxor %xmm1,%xmm0 # complement negatives
+ psubw %xmm1,%xmm0 # add 1 to negatives
+ pmaxsw %xmm0,%xmm7 # store peak
+ addl $16,%esi
+ jmp 1b
+ # fold the eight per-lane peaks: 8 -> 4 -> 2 -> 1
+2: movaps %xmm7,%xmm0
+ psrldq $8,%xmm0
+ pmaxsw %xmm0,%xmm7
+ movaps %xmm7,%xmm0
+ psrlq $32,%xmm0
+ pmaxsw %xmm0,%xmm7
+ movaps %xmm7,%xmm0
+ psrlq $16,%xmm0
+ pmaxsw %xmm0,%xmm7 # max value in low word of %xmm7
+ movd %xmm7,%eax
+ andl $0xffff,%eax
+ popl %ecx
+ popl %esi
+ popl %ebp
+ ret
diff --git a/peakval_sse_assist.s b/peakval_sse_assist.s
new file mode 100644
index 0000000..827c800
--- /dev/null
+++ b/peakval_sse_assist.s
@@ -0,0 +1,49 @@
+# SSE assist routines for peakval
+# Copyright 2001 Phil Karn, KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+ .text
+# Find peak absolute value in signed 16-bit input samples
+# int peakval_sse_assist(signed short *in,int cnt);
+# Works on groups of four samples; a trailing group of fewer than four
+# is left for the caller.
+ .global peakval_sse_assist
+ .type peakval_sse_assist,@function
+ .align 16
+peakval_sse_assist:
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %esi
+ pushl %ecx
+ movl 8(%ebp),%esi
+ movl 12(%ebp),%ecx
+ pxor %mm7,%mm7 # clear peak
+1: subl $4,%ecx
+ jl 2f
+ movq (%esi),%mm0
+ movq %mm0,%mm1
+ psraw $15,%mm1 # mm1 = 1's if negative, 0's if positive
+ pxor %mm1,%mm0 # complement negatives
+ psubw %mm1,%mm0 # add 1 to negatives
+ pmaxsw %mm0,%mm7 # store peak
+ addl $8,%esi
+ jmp 1b
+ # fold the four per-lane peaks: 4 -> 2 -> 1
+2: movq %mm7,%mm0
+ psrlq $32,%mm0
+ pmaxsw %mm0,%mm7
+ movq %mm7,%mm0
+ psrlq $16,%mm0
+ pmaxsw %mm0,%mm7 # max value in low word of %mm7
+ movd %mm7,%eax
+ andl $0xffff,%eax
+ emms
+ popl %ecx
+ popl %esi
+ popl %ebp
+ ret
diff --git a/rs-common.h b/rs-common.h
new file mode 100644
index 0000000..e64eb39
--- /dev/null
+++ b/rs-common.h
@@ -0,0 +1,26 @@
+/* Stuff common to all the general-purpose Reed-Solomon codecs
+ * Copyright 2004 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+/* Reed-Solomon codec control block */
+struct rs {
+  int mm; /* Bits per symbol */
+  int nn; /* Symbols per block (= (1<<mm)-1) */
+  data_t *alpha_to; /* log lookup table */
+  data_t *index_of; /* Antilog lookup table */
+  data_t *genpoly; /* Generator polynomial */
+  int nroots; /* Number of generator roots = number of parity symbols */
+  int fcr; /* First consecutive root, index form */
+  int prim; /* Primitive element, index form */
+  int iprim; /* prim-th root of 1, index form */
+  int pad; /* Padding bytes in shortened block */
+};
+/* Reduce x modulo rs->nn (= (1<<mm)-1) using subtract, shift and mask
+ * instead of a division.  Presumably only called with x >= 0 -- confirm
+ * against callers.
+ */
+static inline int modnn(struct rs *rs,int x){
+  while (x >= rs->nn) {
+    x -= rs->nn;
+    x = (x >> rs->mm) + (x & rs->nn);
+  }
+  return x;
+}
diff --git a/rs.3 b/rs.3
new file mode 100644
index 0000000..5d71503
--- /dev/null
+++ b/rs.3
@@ -0,0 +1,198 @@
+init_rs_int, encode_rs_int, decode_rs_int, free_rs_int,
+init_rs_char, encode_rs_char, decode_rs_char, free_rs_char,
+encode_rs_8, decode_rs_8, encode_rs_ccsds, decode_rs_ccsds
+\- Reed-Solomon encoding/decoding
+.ft B
+#include "fec.h"
+void *init_rs_int(int symsize,int gfpoly,int fcr,int prim,
+ int nroots,int pad);
+void encode_rs_int(void *rs,int *data,int *parity);
+int decode_rs_int(void *rs,int *data,int *eras_pos,int no_eras);
+void free_rs_int(void *rs);
+void *init_rs_char(int symsize,int gfpoly,int fcr,int prim,
+ int nroots,int pad);
+void encode_rs_char(void *rs,unsigned char *data,
+ unsigned char *parity);
+int decode_rs_char(void *rs,unsigned char *data,int *eras_pos,
+ int no_eras);
+void free_rs_char(void *rs);
+void encode_rs_8(unsigned char *data,unsigned char *parity,
+ int pad);
+int decode_rs_8(unsigned char *data,int *eras_pos,int no_eras,
+ int pad);
+void encode_rs_ccsds(unsigned char *data,unsigned char *parity,
+ int pad);
+int decode_rs_ccsds(unsigned char *data,int *eras_pos,int no_eras,
+ int pad);
+unsigned char Taltab[256];
+unsigned char Tal1tab[256];
+These functions implement Reed-Solomon error control encoding and
+decoding. For optimal performance in a variety of applications, three
+sets of functions are supplied. To access these functions, add "-lfec"
+to your linker command line.
+The functions with names ending in \fB_int\fR handle data in integer arrays,
+permitting arbitrarily large codewords limited only by machine resources.
+The functions with names ending in \fB_char\fR take unsigned char arrays and can
+handle codes with symbols of 8 bits or less (i.e., with codewords of
+255 symbols or less).
+\fBencode_rs_8\fR and \fBdecode_rs_8\fR implement a specific
+(255,223) code with 8-bit symbols specified by the CCSDS:
+a field generator of 1 + X + X^2 + X^7 + X^8 and a code
+generator with first consecutive root = 112 and a primitive element of
+11. These functions use the conventional
+polynomial form, \fInot\fR the dual-basis specified in
+the CCSDS standard, to represent symbols. This code may be
+shortened by giving a non-zero \fBpad\fR value to produce a
+(255-\fBpad\fR,223-\fBpad\fR) code. The padding will consist of the
+specified number of zeroes at the front of the full codeword.
+For full CCSDS compatibility, \fBencode_rs_ccsds\fR and
+\fBdecode_rs_ccsds\fR are provided. These functions use two lookup
+tables, \fBTaltab\fR to convert from conventional to dual-basis, and
+\fBTal1tab\fR to perform the inverse mapping from dual-basis to
+conventional form, before and after calls to \fBencode_rs_8\fR
+and \fBdecode_rs_8\fR.
+The \fB_8\fR and \fB_ccsds\fR functions do not require initialization.
+To use the general purpose RS encoder or decoder (i.e.,
+the \fB_char\fR or \fB_int\fR versions), the user must first
+call \fBinit_rs_int\fR or \fBinit_rs_char\fR as appropriate. The
+arguments are as follows:
+\fBsymsize\fR gives the symbol size in bits, up to 8 for \fBinit_rs_char\fR
+or 32 for \fBinit_rs_int\fR on a machine with 32-bit ints (though such a
+huge code would exhaust memory limits on a 32-bit machine). The resulting
+Reed-Solomon code word will have 2^\fBsymsize\fR - 1 symbols,
+each containing \fBsymsize\fR bits. The codeword may be shortened with the
+\fBpad\fR parameter described below.
+\fBgfpoly\fR gives the extended Galois field generator polynomial coefficients,
+with the 0th coefficient in the low order bit. The polynomial
+\fImust\fR be primitive; if not, the call will fail and NULL will be returned.
+\fBfcr\fR gives, in index form, the first consecutive root of the
+Reed Solomon code generator polynomial.
+\fBprim\fR gives, in index form, the primitive element in the Galois field
+used to generate the Reed Solomon code generator polynomial.
+\fBnroots\fR gives the number of roots in the Reed Solomon code
+generator polynomial. This equals the number of parity symbols
+per code block.
+\fBpad\fR gives the number of leading symbols in the codeword
+that are implicitly padded to zero in a shortened code block.
+The resulting Reed-Solomon code has parameters (N,K), where
+N = 2^\fBsymsize\fR - \fBpad\fR - 1 and K = N-\fBnroots\fR.
+The \fBencode_rs_char\fR and \fBencode_rs_int\fR functions accept
+the pointer returned by \fBinit_rs_char\fR or
+\fBinit_rs_int\fR, respectively, to
+encode a block of data using the specified code.
+The input data array is expected to
+contain K symbols (of \fBsymsize\fR bits each, right justified
+in each char or int) and \fBnroots\fR parity symbols will be placed
+into the \fBparity\fR array, right justified.
+The \fBdecode_\fR functions correct
+the errors in a Reed-Solomon codeword of N symbols up to the capability of the code.
+An optional list of "erased" symbol indices may be given in the \fBeras_pos\fR
+array to assist the decoder; this parameter may be NULL if no erasures
+are given. The number of erased symbols must be given in the \fBno_eras\fR parameter.
+To maximize performance, the encode and decode functions perform no
+"sanity checking" of their inputs. Decoder failure may result if
+\fBeras_pos\fR contains duplicate entries, and both encoder and
+decoder will fail if an input symbol exceeds its allowable range.
+(Symbol range overflow cannot occur with the \fB_8\fR or
+\fB_ccsds\fR functions,
+or with the \fB_char\fR functions when 8-bit symbols are specified.)
+The decoder corrects the symbols "in place", returning the number
+of symbols in error. If the codeword is uncorrectable, -1 is returned
+and the data block is unchanged. If \fBeras_pos\fR is non-null, it is
+used to return a list of corrected symbol positions, in no particular
+order. This means that the
+array passed through this parameter \fImust\fR have at least \fBnroots\fR
+elements to prevent a possible buffer overflow.
+The \fBfree_rs_int\fR and \fBfree_rs_char\fR functions free the internal
+space allocated by the \fBinit_rs_int\fR and \fBinit_rs_char\fR functions, respectively.
+The functions \fBencode_rs_8\fR and \fBdecode_rs_8\fR do not have
+corresponding \fBinit\fR and \fBfree\fR, nor do they take the
+\fBrs\fR argument accepted by the other functions as their parameters
+are statically compiled. These functions implement a code
+equivalent to calling \fBinit_rs_char\fR(8,0x187,112,11,32,0)
+and using the resulting pointer with \fBencode_rs_char\fR and \fBdecode_rs_char\fR.
+\fBinit_rs_int\fR and \fBinit_rs_char\fR return a pointer to an internal
+control structure that must be passed to the corresponding encode, decode
+and free functions. These functions return NULL on error.
+The \fBdecode_\fR functions return a count of corrected
+symbols, or -1 if the block was uncorrectable.
+Phil Karn, KA9Q (karn@ka9q.net), based heavily on earlier work by Robert
+Morelos-Zaragoza (robert@spectra.eng.hawaii.edu) and Hari Thirumoorthy
+(harit@spectra.eng.hawaii.edu). Extra improvements suggested by Detmar
+Welz (dwelz@web.de).
+Copyright 2004, Phil Karn, KA9Q. May be used under the terms of the
+GNU Lesser General Public License (LGPL).
+CCSDS 101.0-B-6: Telemetry Channel Coding.
+CCSDS chose the "dual basis" symbol representation because it
+simplified the implementation of a Reed-Solomon encoder in dedicated
+hardware. However, this approach holds no advantages for a software
+implementation on a general purpose computer, so use of the dual basis
+is recommended only if compatibility with the CCSDS standard is needed,
+e.g., to decode data from an existing spacecraft using the CCSDS
+standard. If you just want a fast (255,223) RS codec without needing
+to interoperate with a CCSDS standard code, use \fBencode_rs_8\fR
+and \fBdecode_rs_8\fR.
diff --git a/rs_speedtest.c b/rs_speedtest.c
new file mode 100644
index 0000000..225f160
--- /dev/null
+++ b/rs_speedtest.c
@@ -0,0 +1,54 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include "fec.h"
+/* Time trials decodes of one (255,223) block, first through the general
+ * decode_rs_char() and then through the fixed decode_rs_8(), and print the
+ * user CPU time and throughput of each.  The error-injection lines are
+ * compiled out with #if 0.
+ */
+int main(){
+  unsigned char block[255];
+  int i;
+  void *rs;
+  struct rusage start,finish;
+  double extime;
+  int trials = 10000;
+
+  for(i=0;i<223;i++)
+    block[i] = 0x01;
+  rs = init_rs_char(8,0x187,112,11,32,0);
+  if(rs == NULL){
+    fprintf(stderr,"init_rs_char failed\n");
+    exit(1);
+  }
+  encode_rs_char(rs,block,&block[223]);
+  getrusage(RUSAGE_SELF,&start);
+  for(i=0;i<trials;i++){
+#if 0
+    block[0] ^= 0xff; /* Introduce an error */
+    block[2] ^= 0xff; /* Introduce an error */
+#endif
+    decode_rs_char(rs,block,NULL,0);
+  }
+  getrusage(RUSAGE_SELF,&finish);
+  extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec);
+  printf("Execution time for %d Reed-Solomon blocks using general decoder: %.2f sec\n",trials,extime);
+  printf("decoder speed: %g bits/s\n",trials*223*8/extime);
+  encode_rs_8(block,&block[223],0);
+  getrusage(RUSAGE_SELF,&start);
+  for(i=0;i<trials;i++){
+#if 0
+    block[0] ^= 0xff; /* Introduce an error */
+    block[2] ^= 0xff; /* Introduce an error */
+#endif
+    decode_rs_8(block,NULL,0,0);
+  }
+  getrusage(RUSAGE_SELF,&finish);
+  extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec);
+  printf("Execution time for %d Reed-Solomon blocks using CCSDS decoder: %.2f sec\n",trials,extime);
+  printf("decoder speed: %g bits/s\n",trials*223*8/extime);
+  exit(0);
+}
diff --git a/rstest.c b/rstest.c
new file mode 100644
index 0000000..539b40a
--- /dev/null
+++ b/rstest.c
@@ -0,0 +1,296 @@
+/* Test the Reed-Solomon codecs
+ * for various block sizes and with random data and random error patterns
+ *
+ * Copyright 2002 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <time.h>
+#include "fec.h"
+/* One row per Reed-Solomon code to exercise. The first five fields are passed,
+ * in order, to init_rs_char() or init_rs_int(); a row whose symsize is 0
+ * terminates the table.
+ */
+struct etab {
+ int symsize; /* Bits per symbol; the block length is (1<<symsize)-1 symbols */
+ int genpoly; /* Second argument to init_rs_char()/init_rs_int() */
+ int fcs; /* Third argument to init_rs_char()/init_rs_int() */
+ int prim; /* Fourth argument to init_rs_char()/init_rs_int() */
+ int nroots; /* Number of parity symbols; data length is block length - nroots */
+ int ntrials; /* Not referenced by the test routines in this file */
+} Tab[] = {
+ {2, 0x7, 1, 1, 1, 10 },
+ {3, 0xb, 1, 1, 2, 10 },
+ {4, 0x13, 1, 1, 4, 10 },
+ {5, 0x25, 1, 1, 6, 10 },
+ {6, 0x43, 1, 1, 8, 10 },
+ {7, 0x89, 1, 1, 10, 10 },
+ {8, 0x11d, 1, 1, 32, 10 },
+ {8, 0x187, 112,11, 32, 10 }, /* Duplicates CCSDS codec */
+ {9, 0x211, 1, 1, 32, 10 },
+ {10,0x409, 1, 1, 32, 10 },
+ {11,0x805, 1, 1, 32, 10 },
+ {12,0x1053, 1, 1, 32, 5 },
+ {13,0x201b, 1, 1, 32, 2 },
+ {14,0x4443, 1, 1, 32, 1 },
+ {15,0x8003, 1, 1, 32, 1 },
+ {16,0x1100b, 1, 1, 32, 1 },
+ {0, 0, 0, 0, 0}, /* End-of-table marker (symsize == 0) */
+int exercise_char(struct etab *e);
+int exercise_int(struct etab *e);
+int exercise_8(void);
+/* Run the fixed-codec test and then one test per row of Tab[].
+ * Codes with symbols of up to 8 bits use the char codec, wider ones the
+ * int codec. The exit status is EXIT_SUCCESS when every test routine
+ * returned 0 and EXIT_FAILURE otherwise, so that "make test" can detect
+ * a failing codec instead of always seeing success.
+ */
+int main(){
+ int i;
+ int failures = 0; /* Number of test routines that returned nonzero */
+ srandom(time(NULL));
+ printf("Testing fixed CCSDS encoder...\n");
+ if(exercise_8() != 0)
+ failures++;
+ for(i=0;Tab[i].symsize != 0;i++){
+ int nn,kk;
+ int status;
+ nn = (1<<Tab[i].symsize) - 1;
+ kk = nn - Tab[i].nroots;
+ printf("Testing (%d,%d) code...\n",nn,kk);
+ if(Tab[i].symsize <= 8)
+ status = exercise_char(&Tab[i]);
+ else
+ status = exercise_int(&Tab[i]);
+ if(status != 0)
+ failures++;
+ }
+ exit(failures != 0 ? EXIT_FAILURE : EXIT_SUCCESS);
+/* Exercise the fixed (255,223) codec, encode_rs_8()/decode_rs_8().
+ * For each error count from 0 through (nn-kk)/2 a random block is encoded,
+ * that many distinct symbols are corrupted (each corrupted position is also
+ * handed to the decoder as an erasure with probability 1/2), and the decoder
+ * output is checked against the original block.
+ * Returns the number of discrepancies detected.
+ */
+int exercise_8(void){
+ int nn = 255;
+ unsigned char block[nn],tblock[nn];
+ int errlocs[nn],derrlocs[nn];
+ int i;
+ int errors;
+ int derrors,kk;
+ int errval,errloc;
+ int erasures;
+ int decoder_errors = 0;
+ /* Compute code parameters */
+ kk = 223;
+ /* Test up to the error correction capacity of the code */
+ for(errors=0;errors<=(nn-kk)/2;errors++){
+ /* Load block with random data and encode */
+ for(i=0;i<kk;i++)
+ block[i] = random() & nn;
+ memcpy(tblock,block,sizeof(block));
+ encode_rs_8(block,&block[kk],0);
+ /* Make temp copy, seed with errors */
+ memcpy(tblock,block,sizeof(block));
+ memset(errlocs,0,sizeof(errlocs));
+ memset(derrlocs,0,sizeof(derrlocs));
+ erasures=0;
+ for(i=0;i<errors;i++){
+ do {
+ errval = random() & nn;
+ } while(errval == 0); /* Error value must be nonzero */
+ do {
+ errloc = random() % nn;
+ } while(errlocs[errloc] != 0); /* Must not choose the same location twice */
+ errlocs[errloc] = 1;
+ if(random() & 1) /* 50-50 chance */
+ derrlocs[erasures++] = errloc;
+ tblock[errloc] ^= errval;
+ }
+ /* Decode the errored block */
+ derrors = decode_rs_8(tblock,derrlocs,erasures,0);
+ /* The decoder's count must match the number of symbols corrupted above */
+ if(derrors != errors){
+ printf("(%d,%d) decoder says %d errors, true number is %d\n",nn,kk,derrors,errors);
+ decoder_errors++;
+ }
+ /* Every location listed in derrlocs[] must be one that was really corrupted */
+ for(i=0;i<derrors;i++){
+ if(errlocs[derrlocs[i]] == 0){
+ printf("(%d,%d) decoder indicates error in location %d without error\n",nn,kk,derrlocs[i]);
+ decoder_errors++;
+ }
+ }
+ /* The corrected block must be identical to the encoded original */
+ if(memcmp(tblock,block,sizeof(tblock)) != 0){
+ printf("(%d,%d) decoder uncorrected errors! output ^ input:",nn,kk);
+ decoder_errors++;
+ for(i=0;i<nn;i++)
+ printf(" %02x",tblock[i] ^ block[i]);
+ printf("\n");
+ }
+ }
+ return decoder_errors;
+/* Exercise the general char-symbol codec for the code described by *e.
+ * For each error count from 0 through nroots/2 a random block is encoded,
+ * that many distinct symbols are corrupted (each corrupted position is also
+ * handed to the decoder as an erasure with probability 1/2), and the decoder
+ * output is checked against the original block.
+ * Returns -1 if the symbol size exceeds 8 bits or the codec cannot be
+ * initialized; otherwise returns the number of discrepancies detected
+ * (0 on success), matching exercise_8().
+ */
+int exercise_char(struct etab *e){
+ int nn = (1<<e->symsize) - 1;
+ unsigned char block[nn],tblock[nn];
+ int errlocs[nn],derrlocs[nn];
+ int i;
+ int errors;
+ int derrors,kk;
+ int errval,errloc;
+ int erasures;
+ int decoder_errors = 0;
+ void *rs;
+ if(e->symsize > 8)
+ return -1;
+ /* Compute code parameters */
+ kk = nn - e->nroots;
+ rs = init_rs_char(e->symsize,e->genpoly,e->fcs,e->prim,e->nroots,0);
+ if(rs == NULL){
+ printf("init_rs_char failed!\n");
+ return -1;
+ }
+ /* Test up to the error correction capacity of the code */
+ for(errors=0;errors <= e->nroots/2;errors++){
+ /* Load block with random data and encode */
+ for(i=0;i<kk;i++)
+ block[i] = random() & nn;
+ memcpy(tblock,block,sizeof(block));
+ encode_rs_char(rs,block,&block[kk]);
+ /* Make temp copy, seed with errors */
+ memcpy(tblock,block,sizeof(block));
+ memset(errlocs,0,sizeof(errlocs));
+ memset(derrlocs,0,sizeof(derrlocs));
+ erasures=0;
+ for(i=0;i<errors;i++){
+ do {
+ errval = random() & nn;
+ } while(errval == 0); /* Error value must be nonzero */
+ do {
+ errloc = random() % nn;
+ } while(errlocs[errloc] != 0); /* Must not choose the same location twice */
+ errlocs[errloc] = 1;
+ if(random() & 1) /* 50-50 chance */
+ derrlocs[erasures++] = errloc;
+ tblock[errloc] ^= errval;
+ }
+ /* Decode the errored block */
+ derrors = decode_rs_char(rs,tblock,derrlocs,erasures);
+ /* The decoder's count must match the number of symbols corrupted above */
+ if(derrors != errors){
+ printf("(%d,%d) decoder says %d errors, true number is %d\n",nn,kk,derrors,errors);
+ decoder_errors++;
+ }
+ /* Every location listed in derrlocs[] must be one that was really corrupted */
+ for(i=0;i<derrors;i++){
+ if(errlocs[derrlocs[i]] == 0){
+ printf("(%d,%d) decoder indicates error in location %d without error\n",nn,kk,derrlocs[i]);
+ decoder_errors++;
+ }
+ }
+ /* The corrected block must be identical to the encoded original */
+ if(memcmp(tblock,block,sizeof(tblock)) != 0){
+ printf("(%d,%d) decoder uncorrected errors! output ^ input:",nn,kk);
+ decoder_errors++;
+ for(i=0;i<nn;i++)
+ printf(" %02x",tblock[i] ^ block[i]);
+ printf("\n");
+ }
+ }
+ free_rs_char(rs);
+ /* Report the failures counted above instead of unconditionally returning 0 */
+ return decoder_errors;
+/* Exercise the general int-symbol codec for the code described by *e.
+ * For each error count from 0 through nroots/2 a random block is encoded,
+ * that many distinct symbols are corrupted (each corrupted position is also
+ * handed to the decoder as an erasure with probability 1/2), and the decoder
+ * output is checked against the original block.
+ * Returns -1 if the codec cannot be initialized; otherwise returns the
+ * number of discrepancies detected (0 on success), matching exercise_8().
+ */
+int exercise_int(struct etab *e){
+ int nn = (1<<e->symsize) - 1;
+ int block[nn],tblock[nn];
+ int errlocs[nn],derrlocs[nn];
+ int i;
+ int errors;
+ int derrors,kk;
+ int errval,errloc;
+ int erasures;
+ int decoder_errors = 0;
+ void *rs;
+ /* Compute code parameters */
+ kk = nn - e->nroots;
+ rs = init_rs_int(e->symsize,e->genpoly,e->fcs,e->prim,e->nroots,0);
+ if(rs == NULL){
+ printf("init_rs_int failed!\n");
+ return -1;
+ }
+ /* Test up to the error correction capacity of the code */
+ for(errors=0;errors <= e->nroots/2;errors++){
+ /* Load block with random data and encode */
+ for(i=0;i<kk;i++)
+ block[i] = random() & nn;
+ memcpy(tblock,block,sizeof(block));
+ encode_rs_int(rs,block,&block[kk]);
+ /* Make temp copy, seed with errors */
+ memcpy(tblock,block,sizeof(block));
+ memset(errlocs,0,sizeof(errlocs));
+ memset(derrlocs,0,sizeof(derrlocs));
+ erasures=0;
+ for(i=0;i<errors;i++){
+ do {
+ errval = random() & nn;
+ } while(errval == 0); /* Error value must be nonzero */
+ do {
+ errloc = random() % nn;
+ } while(errlocs[errloc] != 0); /* Must not choose the same location twice */
+ errlocs[errloc] = 1;
+ if(random() & 1) /* 50-50 chance */
+ derrlocs[erasures++] = errloc;
+ tblock[errloc] ^= errval;
+ }
+ /* Decode the errored block */
+ derrors = decode_rs_int(rs,tblock,derrlocs,erasures);
+ /* The decoder's count must match the number of symbols corrupted above */
+ if(derrors != errors){
+ printf("(%d,%d) decoder says %d errors, true number is %d\n",nn,kk,derrors,errors);
+ decoder_errors++;
+ }
+ /* Every location listed in derrlocs[] must be one that was really corrupted */
+ for(i=0;i<derrors;i++){
+ if(errlocs[derrlocs[i]] == 0){
+ printf("(%d,%d) decoder indicates error in location %d without error\n",nn,kk,derrlocs[i]);
+ decoder_errors++;
+ }
+ }
+ /* The corrected block must be identical to the encoded original */
+ if(memcmp(tblock,block,sizeof(tblock)) != 0){
+ printf("(%d,%d) decoder uncorrected errors! output ^ input:",nn,kk);
+ decoder_errors++;
+ for(i=0;i<nn;i++)
+ printf(" %02x",tblock[i] ^ block[i]);
+ printf("\n");
+ }
+ }
+ free_rs_int(rs);
+ /* Report the failures counted above instead of unconditionally returning 0 */
+ return decoder_errors;
diff --git a/sim.c b/sim.c
new file mode 100644
index 0000000..151b04c
--- /dev/null
+++ b/sim.c
@@ -0,0 +1,43 @@
+#include <math.h>
+#include <stdlib.h>
+#include "fec.h"
+#define MAX_RANDOM 0x7fffffff
+/* Generate gaussian random double with specified mean and std_dev.
+ * Each pass through the generation loop yields two values: one is returned
+ * immediately and the other is kept in static storage and returned by the
+ * next call, so the function carries state between calls.
+ */
+double normal_rand(double mean, double std_dev)
+ double fac,rsq,v1,v2;
+ static double gset; /* Saved second value, not yet scaled by mean/std_dev */
+ static int iset; /* Nonzero when gset holds an unused value */
+ if(iset){
+ /* Already got one */
+ iset = 0;
+ return mean + std_dev*gset;
+ }
+ /* Generate two evenly distributed numbers between -1 and +1
+ * that are inside the unit circle
+ */
+ do {
+ v1 = 2.0 * (double)random() / MAX_RANDOM - 1;
+ v2 = 2.0 * (double)random() / MAX_RANDOM - 1;
+ rsq = v1*v1 + v2*v2;
+ } while(rsq >= 1.0 || rsq == 0.0); /* rsq == 0 is rejected because log(rsq) is taken below */
+ fac = sqrt(-2.0*log(rsq)/rsq);
+ gset = v1*fac; /* Keep one value for the next call */
+ iset++;
+ return mean + std_dev*v2*fac;
+/* Produce one noisy sample for symbol sym: a gaussian value with unit
+ * standard deviation centered on +amp (sym nonzero) or -amp (sym zero) is
+ * scaled by gain, shifted by offset, converted to int and limited to the
+ * range 0..clip before being returned as an unsigned char.
+ */
+unsigned char addnoise(int sym,double amp,double gain,double offset,int clip){
+ int sample;
+ sample = offset + gain*normal_rand(sym?amp:-amp,1.0);
+ /* Clip to the range 0..clip */
+ if(sample < 0)
+ sample = 0;
+ else if(sample > clip)
+ sample = clip;
+ return sample;
diff --git a/simd-viterbi.3 b/simd-viterbi.3
new file mode 100644
index 0000000..4c67593
--- /dev/null
+++ b/simd-viterbi.3
@@ -0,0 +1,247 @@
+create_viterbi27, set_viterbi27_polynomial, init_viterbi27, update_viterbi27_blk,
+chainback_viterbi27, delete_viterbi27,
+create_viterbi29, set_viterbi29_polynomial, init_viterbi29, update_viterbi29_blk,
+chainback_viterbi29, delete_viterbi29,
+create_viterbi39, set_viterbi39_polynomial, init_viterbi39, update_viterbi39_blk,
+chainback_viterbi39, delete_viterbi39,
+create_viterbi615, set_viterbi615_polynomial, init_viterbi615, update_viterbi615_blk,
+chainback_viterbi615, delete_viterbi615 -\ IA32 SIMD-assisted Viterbi decoders
+.ft B
+#include "fec.h"
+void *create_viterbi27(int blocklen);
+void set_viterbi27_polynomial(int polys[2]);
+int init_viterbi27(void *vp,int starting_state);
+int update_viterbi27_blk(void *vp,unsigned char syms[],int nbits);
+int chainback_viterbi27(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi27(void *vp);
+.ft B
+void *create_viterbi29(int blocklen);
+void set_viterbi29_polynomial(int polys[2]);
+int init_viterbi29(void *vp,int starting_state);
+int update_viterbi29_blk(void *vp,unsigned char syms[],int nbits);
+int chainback_viterbi29(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi29(void *vp);
+.ft B
+void *create_viterbi39(int blocklen);
+void set_viterbi39_polynomial(int polys[3]);
+int init_viterbi39(void *vp,int starting_state);
+int update_viterbi39_blk(void *vp,unsigned char syms[],int nbits);
+int chainback_viterbi39(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi39(void *vp);
+.ft B
+void *create_viterbi615(int blocklen);
+void set_viterbi615_polynomial(int polys[6]);
+int init_viterbi615(void *vp,int starting_state);
+int update_viterbi615_blk(void *vp,unsigned char syms[],int nbits);
+int chainback_viterbi615(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi615(void *vp);
+These functions implement high performance Viterbi decoders for four
+convolutional codes: a rate 1/2 constraint length 7 (k=7) code
+("viterbi27"), a rate 1/2 k=9 code ("viterbi29"),
+a rate 1/3 k=9 code ("viterbi39") and a rate 1/6 k=15 code ("viterbi615").
+The decoders use the Intel IA32 or PowerPC SIMD instruction sets, if available, to improve
+decoding speed.
+On the IA32 there are three different SIMD instruction sets. The first
+and most common is MMX, introduced on later Intel Pentiums and then on
+the Intel Pentium II and most Intel clones (AMD K6, Transmeta Crusoe,
+etc). SSE was introduced on the Pentium III and later implemented in
+the AMD Athlon 4 (AMD calls it "3D Now! Professional"). Most
+recently, SSE2 was introduced in the Intel Pentium 4, and has been
+adopted by more recent AMD CPUs. The presence of SSE2 implies the
+existence of SSE, which in turn implies MMX.
+Altivec is the PowerPC SIMD instruction set. It is roughly comparable
+to SSE2. Altivec was introduced to the general public in the Apple
+Macintosh G4; it is also present in the G5. Altivec is actually a
+Motorola trademark; Apple calls it "Velocity Engine" and IBM calls it
+"VMX". All refer to the same thing.
+When built for the IA32 or PPC architectures, the functions
+automatically use the most powerful SIMD instruction set available. If
+no SIMD instructions are available, or if the library is built for a
+non-IA32, non-PPC machine, a portable C version is executed instead.
+Four versions of each function are provided, one for each code.
+In the following discussion, change "viterbi" to "viterbi27", "viterbi29", "viterbi39"
+or "viterbi615" as desired.
+Before Viterbi decoding can begin, an instance must first be created with
+\fBcreate_viterbi()\fR. This function creates and returns a pointer to
+an internal control structure
+containing the path metrics and the branch
+decisions. \fBcreate_viterbi()\fR takes one argument that gives the
+length of the data block in bits. You \fImust not\fR attempt to
+decode a block longer than the length given to \fBcreate_viterbi()\fR.
+Before decoding a new frame,
+\fBinit_viterbi()\fR must be called to reset the decoder state.
+It accepts the instance pointer returned by
+\fBcreate_viterbi()\fR and the initial starting state of the
+convolutional encoder (usually 0). If the initial starting state is unknown or
+incorrect, the decoder will still function but the decoded data may be
+incorrect at the start of the block.
+Blocks of received symbols are processed with calls to
+\fBupdate_viterbi_blk()\fR. The \fBnbits\fR parameter specifies the
+number of \fIdata bits\fR (not channel symbols) represented by the
+\fBsyms\fR buffer. (For rate 1/2 codes, the number of symbols in
+\fBsyms\fR is twice \fInbits\fR, and so on.)
+Each symbol is expected to range
+from 0 through 255, with 0 corresponding to a "strong 0" and 255
+corresponding to a "strong 1". The caller is responsible for
+determining the proper pairing of input symbols (commonly known as
+decoder symbol phasing).
+At the end of the block, the data is recovered with a call to
+\fBchainback_viterbi()\fR. The arguments are the pointer to the
+decoder instance, a pointer to a user-supplied buffer into which the
+decoded data is to be written, the number of data bits (not bytes)
+that are to be decoded, and the terminal state of the convolutional
+encoder at the end of the frame (usually 0). If the terminal state is
+incorrect or unknown, the decoded data bits at the end of the frame
+may be unreliable. The decoded data is written in big-endian order,
+i.e., the first bit in the frame is written into the high order bit of
+the first byte in the buffer. If the frame is not an integral number
+of bytes long, the low order bits of the last byte in the frame will
+be unused.
+Note that the decoders assume the use of a tail, i.e., the encoding
+and transmission of a sufficient number of padding bits beyond the end
+of the user data to force the convolutional encoder into the known
+terminal state given to \fBchainback_viterbi()\fR. The tail is
+always one bit less than the constraint length of the code, so the k=7
+code uses 6 tail bits (12 tail symbols), the k=9 code uses 8 tail bits
+(16 tail symbols) and the k=15 code uses 14 tail bits (84 tail symbols).
+The tail bits are not included in the length arguments to
+\fBcreate_viterbi()\fR and \fBchainback_viterbi()\fR. For example, if
+the block contains 1000 user bits, then this would be the length
+parameter given to \fBcreate_viterbi27()\fR and
+\fBchainback_viterbi27()\fR, and \fBupdate_viterbi27_blk()\fR would be called
+with a total of 2012 symbols - the last 12 encoded symbols
+representing the tail bits.
+After the call to \fBchainback_viterbi()\fR, the decoder may be reset
+with a call to \fBinit_viterbi()\fR and another block can be decoded.
+Alternatively, \fBdelete_viterbi()\fR can be called to free all resources
+used by the Viterbi decoder.
+The \fBset_viterbi_polynomial()\fR function allows use of other than the default
+code generator polynomials. Although only one set of polynomials is generally
+used with each code, there are different conventions as to their order and
+symbol polarity, and these functions simplify their use.
+The default polynomials for the viterbi27 routines
+are those of the NASA-JPL convention \fIwithout\fR symbol inversion.
+The NASA-JPL convention normally inverts the first symbol.
+The CCSDS/NASA-GSFC convention swaps the two symbols and inverts the second.
+To set the NASA-JPL convention with symbol inversion:
+.ft B
+int polys[2] = { -V27POLYA,V27POLYB };
+.ft R
+and to set the CCSDS convention with symbol inversion:
+.ft B
+int polys[2] = { V27POLYB,-V27POLYA };
+.ft R
+The default polynomials for the viterbi615 routines
+are those used by the Cassini spacecraft \fIwithout\fR
+symbol inversion. Mars Pathfinder (MPF) and STEREO
+swap the third and fourth polynomials.
+Both conventions invert the
+first, third and fifth symbols. Refer to fec.h for the polynomial constant definitions.
+To set the Cassini convention with symbol inversion, do the following:
+.ft B
+int polys[6] = { -V615POLYA,V615POLYB,-V615POLYC,V615POLYD,-V615POLYE,V615POLYF };
+.ft R
+and to set the MPF/STEREO convention with symbol inversion:
+.ft B
+int polys[6] = { -V615POLYA,V615POLYB,-V615POLYD,V615POLYC,-V615POLYE,V615POLYF };
+.ft R
+For performance reasons, calling this function changes the code
+generator polynomials for \fIall\fR instances of the corresponding Viterbi decoder,
+including those already created.
+These decoders have all been extensively tested and found to provide
+performance consistent with that expected for soft-decision Viterbi
+decoding with 8-bit symbols.
+Due to internal differences, the implementations
+vary slightly in error performance. In
+general, the portable C versions exhibit the best error performance
+because they use full-sized branch metrics, and the MMX versions
+exhibit the worst because they use 8-bit branch metrics with modulo
+comparisons. The SSE, SSE2 and Altivec implementations of the r=1/2 k=7 and
+r=1/2 k=9 codes use unsigned
+8-bit branch metrics, and are almost as good as the C versions. The
+r=1/3 k=9 and r=1/6 k=15 codes are implemented with 16-bit path metrics in all SIMD versions.
+Calling the functions listed above automatically calls the appropriate
+version of the function depending on the CPU type and available SIMD
+instructions. A particular version can also be called directly by
+appending the appropriate suffix to the function name. The available
+suffixes are "_mmx", "_sse", "_sse2", "_av" and "_port", for the MMX,
+SSE, SSE2, Altivec and portable versions, respectively. For example,
+the SSE2 version of the update_viterbi27_blk() function can be invoked
+as update_viterbi27_blk_sse2().
+Naturally, the _av functions are only available on the PowerPC and the
+_mmx, _sse and _sse2 versions are only available on IA-32. Calling
+a SIMD-enabled function on a CPU that doesn't support the appropriate
+set of instructions will result in an illegal instruction exception.
+\fBcreate_viterbi\fR returns a pointer to the structure containing
+the decoder state.
+The other functions return -1 on error, 0 otherwise.
+Phil Karn, KA9Q (karn@ka9q.net)
+This software may be used under the terms of the GNU Lesser General Public License (LGPL).
diff --git a/sqtest.c b/sqtest.c
new file mode 100644
index 0000000..b2abb09
--- /dev/null
+++ b/sqtest.c
@@ -0,0 +1,42 @@
+/* Verify correctness of the sum-of-square routines */
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+/* These values should trigger leading/trailing array fragment handling */
+#define NSAMP 200002
+#define OFFSET 1
+long long sumsq_wq(signed short *in,int cnt);
+long long sumsq_wq_ref(signed short *in,int cnt);
+/* Check sumsq_wq() against the portable reference sumsq_wq_ref() on
+ * NSAMP-OFFSET random 16-bit samples starting at samples[OFFSET], and
+ * print "OK" or the two differing sums.
+ */
+int main(){
+ int i;
+ long long result,rresult;
+ signed short samples[NSAMP];
+ srandom(time(NULL));
+ for(i=0;i<NSAMP;i++)
+ samples[i] = random() & 0xffff;
+ /* The expected value must come from the reference routine; computing it
+ * with sumsq_wq() would compare the routine under test with itself
+ */
+ rresult = sumsq_wq_ref(&samples[OFFSET],NSAMP-OFFSET);
+ result = sumsq_wq(&samples[OFFSET],NSAMP-OFFSET);
+ if(result == rresult){
+ printf("OK\n");
+ } else {
+ printf("sum mismatch: %lld != %lld\n",result,rresult);
+ }
+ exit(0);
+/* Portable reference implementation: returns the sum of the squares of
+ * the cnt samples starting at in[0].
+ */
+long long sumsq_wq_ref(signed short *in,int cnt){
+ long long sum = 0;
+ int i;
+ for(i=0;i<cnt;i++){
+ sum += (long)in[i] * in[i]; /* Widen before multiplying; accumulate in long long */
+ }
+ return sum;
diff --git a/sse2bfly27.s b/sse2bfly27.s
new file mode 100644
index 0000000..27422a2
--- /dev/null
+++ b/sse2bfly27.s
@@ -0,0 +1,202 @@
+/* Intel SIMD (SSE2) implementations of Viterbi ACS butterflies
+ for 64-state (k=7) convolutional code
+ Copyright 2003 Phil Karn, KA9Q
+ This code may be used under the terms of the GNU Lesser General Public License (LGPL)
+ void update_viterbi27_blk_sse2(struct v27 *vp,unsigned char syms[],int nbits) ;
+ # SSE2 (128-bit integer SIMD) version
+ # Requires Pentium 4 or better
+ # These are offsets into struct v27, defined in viterbi27.h
+ .set DP,128
+ .set OLDMETRICS,132
+ .set NEWMETRICS,136
+ .text
+ .global update_viterbi27_blk_sse2,Branchtab27_sse2
+ .type update_viterbi27_blk_sse2,@function
+ .align 16
+ # Function entry: set up the frame and save the registers used below
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %esi
+ pushl %edi
+ pushl %edx
+ pushl %ebx
+ movl 8(%ebp),%edx # edx = vp
+ testl %edx,%edx
+ jnz 0f
+ # vp is NULL: return -1. The '$' makes -1 an immediate operand; without it
+ # the instruction would load from memory address -1 and fault
+ movl $-1,%eax
+ jmp err
+0: movl OLDMETRICS(%edx),%esi # esi -> old metrics
+ movl NEWMETRICS(%edx),%edi # edi -> new metrics
+ movl DP(%edx),%edx # edx -> decisions
+1: movl 16(%ebp),%eax # eax = nbits
+ decl %eax
+ jl 2f # passed zero, we're done
+ movl %eax,16(%ebp)
+ xorl %eax,%eax
+ movl 12(%ebp),%ebx # ebx = syms
+ movb (%ebx),%al
+ movd %eax,%xmm6 # xmm6[0] = first symbol
+ movb 1(%ebx),%al
+ movd %eax,%xmm5 # xmm5[0] = second symbol
+ addl $2,%ebx
+ movl %ebx,12(%ebp)
+ punpcklbw %xmm6,%xmm6 # xmm6[1] = xmm6[0]
+ punpcklbw %xmm5,%xmm5
+ pshuflw $0,%xmm6,%xmm6 # copy low word to low 3
+ pshuflw $0,%xmm5,%xmm5
+ punpcklqdq %xmm6,%xmm6 # propagate to all 16
+ punpcklqdq %xmm5,%xmm5
+ # xmm6 now contains first symbol in each byte, xmm5 the second
+ movdqa thirtyones,%xmm7
+ # each invocation of this macro does 16 butterflies in parallel
+ .MACRO butterfly GROUP
+ # compute branch metrics
+ movdqa Branchtab27_sse2+(16*\GROUP),%xmm4
+ movdqa Branchtab27_sse2+32+(16*\GROUP),%xmm3
+ pxor %xmm6,%xmm4
+ pxor %xmm5,%xmm3
+ # compute 5-bit branch metric in xmm4 by adding the individual symbol metrics
+ # This is okay for this
+ # code because the worst-case metric spread (at high Eb/No) is only 120,
+ # well within the range of our unsigned 8-bit path metrics, and even within
+ # the range of signed 8-bit path metrics
+ pavgb %xmm3,%xmm4
+ psrlw $3,%xmm4
+ pand %xmm7,%xmm4
+ movdqa (16*\GROUP)(%esi),%xmm0 # Incoming path metric, high bit = 0
+ movdqa ((16*\GROUP)+32)(%esi),%xmm3 # Incoming path metric, high bit = 1
+ movdqa %xmm0,%xmm2
+ movdqa %xmm3,%xmm1
+ paddusb %xmm4,%xmm0 # note use of saturating arithmetic
+ paddusb %xmm4,%xmm3 # this shouldn't be necessary, but why not?
+ # negate branch metrics
+ pxor %xmm7,%xmm4
+ paddusb %xmm4,%xmm1
+ paddusb %xmm4,%xmm2
+ # Find survivors, leave in xmm0,2
+ pminub %xmm1,%xmm0
+ pminub %xmm3,%xmm2
+ # get decisions, leave in xmm1,3
+ pcmpeqb %xmm0,%xmm1
+ pcmpeqb %xmm2,%xmm3
+ # interleave and store new branch metrics in mm0,2
+ movdqa %xmm0,%xmm4
+ punpckhbw %xmm2,%xmm0 # interleave second 16 new metrics
+ punpcklbw %xmm2,%xmm4 # interleave first 16 new metrics
+ movdqa %xmm0,(32*\GROUP+16)(%edi)
+ movdqa %xmm4,(32*\GROUP)(%edi)
+ # interleave decisions & store
+ movdqa %xmm1,%xmm4
+ punpckhbw %xmm3,%xmm1
+ punpcklbw %xmm3,%xmm4
+ # work around bug in gas due to Intel doc error
+ .byte 0x66,0x0f,0xd7,0xd9 # pmovmskb %xmm1,%ebx
+ shll $16,%ebx
+ .byte 0x66,0x0f,0xd7,0xc4 # pmovmskb %xmm4,%eax
+ orl %eax,%ebx
+ movl %ebx,(4*\GROUP)(%edx)
+ .endm
+ # invoke macro 2 times for a total of 32 butterflies
+ butterfly GROUP=0
+ butterfly GROUP=1
+ addl $8,%edx # bump decision pointer
+ # See if we have to normalize. This requires an explanation. We don't want
+ # our path metrics to exceed 255 on the *next* iteration. Since the
+ # largest branch metric is 30, that means we don't want any to exceed 225
+ # on *this* iteration. Rather than look them all, we just pick an arbitrary one
+ # (the first) and see if it exceeds 225-120=105, where 120 is the experimentally-
+ # determined worst-case metric spread for this code and branch metrics in the range 0-30.
+ # This is extremely conservative, and empirical testing at a variety of Eb/Nos might
+ # show that a higher threshold could be used without affecting BER performance
+ movl (%edi),%eax # extract first output metric
+ andl $255,%eax
+ cmp $105,%eax
+ jle done # No, no need to normalize
+ # Normalize by finding smallest metric and subtracting it
+ # from all metrics. We can't just pick an arbitrary small constant because
+ # the minimum metric might be zero!
+ movdqa (%edi),%xmm0
+ movdqa %xmm0,%xmm4
+ movdqa 16(%edi),%xmm1
+ pminub %xmm1,%xmm4
+ movdqa 32(%edi),%xmm2
+ pminub %xmm2,%xmm4
+ movdqa 48(%edi),%xmm3
+ pminub %xmm3,%xmm4
+ # crunch down to single lowest metric
+ movdqa %xmm4,%xmm5
+ psrldq $8,%xmm5 # the count to psrldq is bytes, not bits!
+ pminub %xmm5,%xmm4
+ movdqa %xmm4,%xmm5
+ psrlq $32,%xmm5
+ pminub %xmm5,%xmm4
+ movdqa %xmm4,%xmm5
+ psrlq $16,%xmm5
+ pminub %xmm5,%xmm4
+ movdqa %xmm4,%xmm5
+ psrlq $8,%xmm5
+ pminub %xmm5,%xmm4 # now in lowest byte of %xmm4
+ punpcklbw %xmm4,%xmm4 # lowest 2 bytes
+ pshuflw $0,%xmm4,%xmm4 # lowest 8 bytes
+ punpcklqdq %xmm4,%xmm4 # all 16 bytes
+ # xmm4 now contains lowest metric in all 16 bytes
+ # subtract it from every output metric
+ psubusb %xmm4,%xmm0
+ psubusb %xmm4,%xmm1
+ psubusb %xmm4,%xmm2
+ psubusb %xmm4,%xmm3
+ movdqa %xmm0,(%edi)
+ movdqa %xmm1,16(%edi)
+ movdqa %xmm2,32(%edi)
+ movdqa %xmm3,48(%edi)
+ # swap metrics
+ movl %esi,%eax
+ movl %edi,%esi
+ movl %eax,%edi
+ jmp 1b
+2: movl 8(%ebp),%ebx # ebx = vp
+ # stash metric pointers
+ movl %esi,OLDMETRICS(%ebx)
+ movl %edi,NEWMETRICS(%ebx)
+ movl %edx,DP(%ebx) # stash incremented value of vp->dp
+ xorl %eax,%eax
+err: popl %ebx
+ popl %edx
+ popl %edi
+ popl %esi
+ popl %ebp
+ ret
+ .data
+ .align 16
+ .byte 31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31
diff --git a/sse2bfly29.s b/sse2bfly29.s
new file mode 100644
index 0000000..0fa1742
--- /dev/null
+++ b/sse2bfly29.s
@@ -0,0 +1,245 @@
+/* Intel SIMD SSE2 implementation of Viterbi ACS butterflies
+ for 256-state (k=9) convolutional code
+ Copyright 2004 Phil Karn, KA9Q
+ This code may be used under the terms of the GNU Lesser General Public License (LGPL)
+ void update_viterbi29_blk_sse2(struct v29 *vp,unsigned char *syms,int nbits) ;
+ # SSE2 (128-bit integer SIMD) version
+ # Requires Pentium 4 or better
+ # These are offsets into struct v29, defined in viterbi29.h
+ .set DP,512
+ .set OLDMETRICS,516
+ .set NEWMETRICS,520
+ .text
+ .global update_viterbi29_blk_sse2,Branchtab29_sse2
+ .type update_viterbi29_blk_sse2,@function
+ .align 16
+ # Function entry: set up the frame and save the registers used below
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %esi
+ pushl %edi
+ pushl %edx
+ pushl %ebx
+ movl 8(%ebp),%edx # edx = vp
+ testl %edx,%edx
+ jnz 0f
+ # vp is NULL: return -1. The '$' makes -1 an immediate operand; without it
+ # the instruction would load from memory address -1 and fault
+ movl $-1,%eax
+ jmp err
+0: movl OLDMETRICS(%edx),%esi # esi -> old metrics
+ movl NEWMETRICS(%edx),%edi # edi -> new metrics
+ movl DP(%edx),%edx # edx -> decisions
+1: movl 16(%ebp),%eax # eax = nbits
+ decl %eax
+ jl 2f # passed zero, we're done
+ movl %eax,16(%ebp)
+ xorl %eax,%eax
+ movl 12(%ebp),%ebx # ebx = syms
+ movb (%ebx),%al
+ movd %eax,%xmm6 # xmm6[0] = first symbol
+ movb 1(%ebx),%al
+ movd %eax,%xmm5 # xmm5[0] = second symbol
+ addl $2,%ebx
+ movl %ebx,12(%ebp)
+ punpcklbw %xmm6,%xmm6 # xmm6[1] = xmm6[0]
+ punpcklbw %xmm5,%xmm5
+ movdqa thirtyones,%xmm7
+ pshuflw $0,%xmm6,%xmm6 # copy low word to low 3
+ pshuflw $0,%xmm5,%xmm5
+ punpcklqdq %xmm6,%xmm6 # propagate to all 16
+ punpcklqdq %xmm5,%xmm5
+ # xmm6 now contains first symbol in each byte, xmm5 the second
+ movdqa thirtyones,%xmm7
+ # each invocation of this macro does 16 butterflies in parallel
+ .MACRO butterfly GROUP
+ # compute branch metrics
+ movdqa Branchtab29_sse2+(16*\GROUP),%xmm4
+ movdqa Branchtab29_sse2+128+(16*\GROUP),%xmm3
+ pxor %xmm6,%xmm4
+ pxor %xmm5,%xmm3
+ pavgb %xmm3,%xmm4
+ psrlw $3,%xmm4
+ pand %xmm7,%xmm4 # xmm4 contains branch metrics
+ movdqa (16*\GROUP)(%esi),%xmm0 # Incoming path metric, high bit = 0
+ movdqa ((16*\GROUP)+128)(%esi),%xmm3 # Incoming path metric, high bit = 1
+ movdqa %xmm0,%xmm2
+ movdqa %xmm3,%xmm1
+ paddusb %xmm4,%xmm0
+ paddusb %xmm4,%xmm3
+ # invert branch metrics
+ pxor %xmm7,%xmm4
+ paddusb %xmm4,%xmm1
+ paddusb %xmm4,%xmm2
+ # Find survivors, leave in xmm0,2
+ pminub %xmm1,%xmm0
+ pminub %xmm3,%xmm2
+ # get decisions, leave in xmm1,3
+ pcmpeqb %xmm0,%xmm1
+ pcmpeqb %xmm2,%xmm3
+ # interleave and store new branch metrics in mm0,2
+ movdqa %xmm0,%xmm4
+ punpckhbw %xmm2,%xmm0 # interleave second 16 new metrics
+ punpcklbw %xmm2,%xmm4 # interleave first 16 new metrics
+ movdqa %xmm0,(32*\GROUP+16)(%edi)
+ movdqa %xmm4,(32*\GROUP)(%edi)
+ # interleave decisions & store
+ movdqa %xmm1,%xmm4
+ punpckhbw %xmm3,%xmm1
+ punpcklbw %xmm3,%xmm4
+ # work around bug in gas due to Intel doc error
+ .byte 0x66,0x0f,0xd7,0xd9 # pmovmskb %xmm1,%ebx
+ shll $16,%ebx
+ .byte 0x66,0x0f,0xd7,0xc4 # pmovmskb %xmm4,%eax
+ orl %eax,%ebx
+ movl %ebx,(4*\GROUP)(%edx)
+ .endm
+ # invoke macro 8 times for a total of 128 butterflies
+ butterfly GROUP=0
+ butterfly GROUP=1
+ butterfly GROUP=2
+ butterfly GROUP=3
+ butterfly GROUP=4
+ butterfly GROUP=5
+ butterfly GROUP=6
+ butterfly GROUP=7
+ addl $32,%edx # bump decision pointer
+ # see if we have to normalize
+ movl (%edi),%eax # extract first output metric
+ andl $255,%eax
+ cmp $50,%eax # is it greater than 50?
+ movl $0,%eax
+ jle done # No, no need to normalize
+ # Normalize by finding smallest metric and subtracting it
+ # from all metrics
+ movdqa (%edi),%xmm0
+ pminub 16(%edi),%xmm0
+ pminub 32(%edi),%xmm0
+ pminub 48(%edi),%xmm0
+ pminub 64(%edi),%xmm0
+ pminub 80(%edi),%xmm0
+ pminub 96(%edi),%xmm0
+ pminub 112(%edi),%xmm0
+ pminub 128(%edi),%xmm0
+ pminub 144(%edi),%xmm0
+ pminub 160(%edi),%xmm0
+ pminub 176(%edi),%xmm0
+ pminub 192(%edi),%xmm0
+ pminub 208(%edi),%xmm0
+ pminub 224(%edi),%xmm0
+ pminub 240(%edi),%xmm0
+ # crunch down to single lowest metric
+ movdqa %xmm0,%xmm1
+ psrldq $8,%xmm0 # the count to psrldq is bytes, not bits!
+ pminub %xmm1,%xmm0
+ movdqa %xmm0,%xmm1
+ psrlq $32,%xmm0
+ pminub %xmm1,%xmm0
+ movdqa %xmm0,%xmm1
+ psrlq $16,%xmm0
+ pminub %xmm1,%xmm0
+ movdqa %xmm0,%xmm1
+ psrlq $8,%xmm0
+ pminub %xmm1,%xmm0
+ punpcklbw %xmm0,%xmm0 # lowest 2 bytes
+ pshuflw $0,%xmm0,%xmm0 # lowest 8 bytes
+ punpcklqdq %xmm0,%xmm0 # all 16 bytes
+ # xmm0 now contains lowest metric in all 16 bytes
+ # subtract it from every output metric
+ movdqa (%edi),%xmm1
+ psubusb %xmm0,%xmm1
+ movdqa %xmm1,(%edi)
+ movdqa 16(%edi),%xmm1
+ psubusb %xmm0,%xmm1
+ movdqa %xmm1,16(%edi)
+ movdqa 32(%edi),%xmm1
+ psubusb %xmm0,%xmm1
+ movdqa %xmm1,32(%edi)
+ movdqa 48(%edi),%xmm1
+ psubusb %xmm0,%xmm1
+ movdqa %xmm1,48(%edi)
+ movdqa 64(%edi),%xmm1
+ psubusb %xmm0,%xmm1
+ movdqa %xmm1,64(%edi)
+ movdqa 80(%edi),%xmm1
+ psubusb %xmm0,%xmm1
+ movdqa %xmm1,80(%edi)
+ movdqa 96(%edi),%xmm1
+ psubusb %xmm0,%xmm1
+ movdqa %xmm1,96(%edi)
+ movdqa 112(%edi),%xmm1
+ psubusb %xmm0,%xmm1
+ movdqa %xmm1,112(%edi)
+ movdqa 128(%edi),%xmm1
+ psubusb %xmm0,%xmm1
+ movdqa %xmm1,128(%edi)
+ movdqa 144(%edi),%xmm1
+ psubusb %xmm0,%xmm1
+ movdqa %xmm1,144(%edi)
+ movdqa 160(%edi),%xmm1
+ psubusb %xmm0,%xmm1
+ movdqa %xmm1,160(%edi)
+ movdqa 176(%edi),%xmm1
+ psubusb %xmm0,%xmm1
+ movdqa %xmm1,176(%edi)
+ movdqa 192(%edi),%xmm1
+ psubusb %xmm0,%xmm1
+ movdqa %xmm1,192(%edi)
+ movdqa 208(%edi),%xmm1
+ psubusb %xmm0,%xmm1
+ movdqa %xmm1,208(%edi)
+ movdqa 224(%edi),%xmm1
+ psubusb %xmm0,%xmm1
+ movdqa %xmm1,224(%edi)
+ movdqa 240(%edi),%xmm1
+ psubusb %xmm0,%xmm1
+ movdqa %xmm1,240(%edi)
+ # swap metrics
+ movl %esi,%eax
+ movl %edi,%esi
+ movl %eax,%edi
+ jmp 1b
+2: movl 8(%ebp),%ebx # ebx = vp
+ # stash metric pointers
+ movl %esi,OLDMETRICS(%ebx)
+ movl %edi,NEWMETRICS(%ebx)
+ movl %edx,DP(%ebx) # stash incremented value of vp->dp
+ xorl %eax,%eax
+err: popl %ebx
+ popl %edx
+ popl %edi
+ popl %esi
+ popl %ebp
+ ret
+ .data
+ .align 16
+ .byte 31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31
diff --git a/ssebfly27.s b/ssebfly27.s
new file mode 100644
index 0000000..7f445da
--- /dev/null
+++ b/ssebfly27.s
@@ -0,0 +1,205 @@
+/* Intel SIMD (SSE) implementation of Viterbi ACS butterflies
+ for 64-state (k=7) convolutional code
+ Copyright 2001 Phil Karn, KA9Q
+ This code may be used under the terms of the GNU Lesser General Public License (LGPL)
+ int update_viterbi27_blk_sse(struct v27 *vp,unsigned char syms[],int nbits) ;
+ # SSE (64-bit integer SIMD) version
+ # Requires Pentium III or better
+ # These are offsets into struct v27, defined in viterbi27.h
+ .set DP,128
+ .set OLDMETRICS,132
+ .set NEWMETRICS,136
+.global update_viterbi27_blk_sse,Branchtab27_sse
+ .type update_viterbi27_blk_sse,@function
+ .align 16
+ # Entry: build a stack frame and save the registers used below
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %esi
+ pushl %edi
+ pushl %edx
+ pushl %ebx
+ movl 8(%ebp),%edx # edx = vp
+ testl %edx,%edx
+ jnz 0f
+ # vp is NULL: return -1. The '$' is required; without it the operand
+ # is a load from absolute address -1 rather than the constant -1.
+ movl $-1,%eax
+ jmp err
+0: movl OLDMETRICS(%edx),%esi # esi -> old metrics
+ movl NEWMETRICS(%edx),%edi # edi -> new metrics
+ movl DP(%edx),%edx # edx -> decisions
+1: movl 16(%ebp),%eax # eax = nbits
+ decl %eax
+ jl 2f # passed zero, we're done
+ movl %eax,16(%ebp)
+ xorl %eax,%eax
+ movl 12(%ebp),%ebx # %ebx = syms
+ movb (%ebx),%al
+ movd %eax,%mm6 # mm6[0] = first symbol
+ movb 1(%ebx),%al
+ movd %eax,%mm5 # mm5[0] = second symbol
+ addl $2,%ebx
+ movl %ebx,12(%ebp)
+ punpcklbw %mm6,%mm6 # mm6[1] = mm6[0]
+ punpcklbw %mm5,%mm5
+ movq thirtyones,%mm7
+ pshufw $0,%mm6,%mm6 # copy low word to upper 3
+ pshufw $0,%mm5,%mm5
+ # mm6 now contains first symbol in each byte, mm5 the second
+ # each invocation of this macro does 8 butterflies in parallel
+ .MACRO butterfly GROUP
+ # compute branch metrics
+ movq Branchtab27_sse+(8*\GROUP),%mm4
+ movq Branchtab27_sse+32+(8*\GROUP),%mm3
+ pxor %mm6,%mm4
+ pxor %mm5,%mm3
+ pavgb %mm3,%mm4 # mm4 contains branch metrics
+ psrlw $3,%mm4
+ pand %mm7,%mm4
+ movq (8*\GROUP)(%esi),%mm0 # Incoming path metric, high bit = 0
+ movq ((8*\GROUP)+32)(%esi),%mm3 # Incoming path metric, high bit = 1
+ movq %mm0,%mm2
+ movq %mm3,%mm1
+ paddusb %mm4,%mm0
+ paddusb %mm4,%mm3
+ # invert branch metrics. This works only because they're 5 bits
+ pxor %mm7,%mm4
+ paddusb %mm4,%mm1
+ paddusb %mm4,%mm2
+ # Find survivors, leave in mm0,2
+ pminub %mm1,%mm0
+ pminub %mm3,%mm2
+ # get decisions, leave in mm1,3
+ pcmpeqb %mm0,%mm1
+ pcmpeqb %mm2,%mm3
+ # interleave and store new branch metrics in mm0,2
+ movq %mm0,%mm4
+ punpckhbw %mm2,%mm0 # interleave second 8 new metrics
+ punpcklbw %mm2,%mm4 # interleave first 8 new metrics
+ movq %mm0,(16*\GROUP+8)(%edi)
+ movq %mm4,(16*\GROUP)(%edi)
+ # interleave decisions, accumulate into %ebx
+ movq %mm1,%mm4
+ punpckhbw %mm3,%mm1
+ punpcklbw %mm3,%mm4
+ # Due to an error in the Intel instruction set ref (the register
+ # fields are swapped), gas assembles pmovmskb incorrectly
+ # See http://mail.gnu.org/pipermail/bug-gnu-utils/2000-August/002341.html
+ .byte 0x0f,0xd7,0xc1 # pmovmskb %mm1,%eax
+ shll $((16*\GROUP+8)&31),%eax
+ orl %eax,%ebx
+ .byte 0x0f,0xd7,0xc4 # pmovmskb %mm4,%eax
+ shll $((16*\GROUP)&31),%eax
+ orl %eax,%ebx
+ .endm
+ # invoke macro 4 times for a total of 32 butterflies
+ xorl %ebx,%ebx # clear decisions
+ butterfly GROUP=0
+ butterfly GROUP=1
+ movl %ebx,(%edx) # stash first 32 decisions
+ xorl %ebx,%ebx
+ butterfly GROUP=2
+ butterfly GROUP=3
+ movl %ebx,4(%edx) # stash second 32 decisions
+ addl $8,%edx # bump decision pointer
+ # see if we have to normalize
+ movl (%edi),%eax # extract first output metric
+ andl $255,%eax
+ cmpl $150,%eax # is it greater than 150?
+ movl $0,%eax
+ jle done # No, no need to normalize
+ # Normalize by finding smallest metric and subtracting it
+ # from all metrics
+ movq (%edi),%mm0
+ pminub 8(%edi),%mm0
+ pminub 16(%edi),%mm0
+ pminub 24(%edi),%mm0
+ pminub 32(%edi),%mm0
+ pminub 40(%edi),%mm0
+ pminub 48(%edi),%mm0
+ pminub 56(%edi),%mm0
+ # mm0 contains 8 smallest metrics
+ # crunch down to single lowest metric
+ movq %mm0,%mm1
+ psrlq $32,%mm0
+ pminub %mm1,%mm0
+ movq %mm0,%mm1
+ psrlq $16,%mm0
+ pminub %mm1,%mm0
+ movq %mm0,%mm1
+ psrlq $8,%mm0
+ pminub %mm1,%mm0
+ punpcklbw %mm0,%mm0 # expand to all 8 bytes
+ pshufw $0,%mm0,%mm0
+ # mm0 now contains lowest metric in all 8 bytes
+ # subtract it from every output metric
+ # Trashes %mm7
+ movq \MEM,%mm7
+ psubusb \REG,%mm7
+ movq %mm7,\MEM
+ .endm
+ PSUBUSBM %mm0,(%edi)
+ PSUBUSBM %mm0,8(%edi)
+ PSUBUSBM %mm0,16(%edi)
+ PSUBUSBM %mm0,24(%edi)
+ PSUBUSBM %mm0,32(%edi)
+ PSUBUSBM %mm0,40(%edi)
+ PSUBUSBM %mm0,48(%edi)
+ PSUBUSBM %mm0,56(%edi)
+ movd %mm0,%eax
+ and $0xff,%eax
+done: # swap metrics
+ movl %esi,%eax
+ movl %edi,%esi
+ movl %eax,%edi
+ jmp 1b
+2: emms
+ movl 8(%ebp),%ebx # ebx = vp
+ # stash metric pointers
+ movl %esi,OLDMETRICS(%ebx)
+ movl %edi,NEWMETRICS(%ebx)
+ movl %edx,DP(%ebx) # stash incremented value of vp->dp
+ xorl %eax,%eax
+err: popl %ebx
+ popl %edx
+ popl %edi
+ popl %esi
+ popl %ebp
+ ret
+ .data
+ .align 16
+ .byte 31,31,31,31,31,31,31,31
diff --git a/ssebfly29.s b/ssebfly29.s
new file mode 100644
index 0000000..d7d2149
--- /dev/null
+++ b/ssebfly29.s
@@ -0,0 +1,271 @@
+/* Intel SIMD SSE implementation of Viterbi ACS butterflies
+ for 256-state (k=9) convolutional code
+ Copyright 2004 Phil Karn, KA9Q
+ This code may be used under the terms of the GNU Lesser General Public License (LGPL)
+ void update_viterbi29_blk_sse(struct v29 *vp,unsigned char syms[],int nbits);
+ # SSE (64-bit integer SIMD) version
+ # Requires Pentium III or better
+ # These are offsets into struct v29, defined in viterbi29.h
+ .set DP,512
+ .set OLDMETRICS,516
+ .set NEWMETRICS,520
+ .text
+ .global update_viterbi29_blk_sse,Branchtab29_sse
+ .type update_viterbi29_blk_sse,@function
+ .align 16
+ # Entry: build a stack frame and save the registers used below
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %esi
+ pushl %edi
+ pushl %edx
+ pushl %ebx
+ movl 8(%ebp),%edx # edx = vp
+ testl %edx,%edx
+ jnz 0f
+ # vp is NULL: return -1. The '$' is required; without it the operand
+ # is a load from absolute address -1 rather than the constant -1.
+ movl $-1,%eax
+ jmp err
+0: movl OLDMETRICS(%edx),%esi # esi -> old metrics
+ movl NEWMETRICS(%edx),%edi # edi -> new metrics
+ movl DP(%edx),%edx # edx -> decisions
+1: movl 16(%ebp),%eax # eax = nbits
+ decl %eax
+ jl 2f # passed zero, we're done
+ movl %eax,16(%ebp)
+ xorl %eax,%eax
+ movl 12(%ebp),%ebx # ebx = syms
+ movb (%ebx),%al
+ movd %eax,%mm6 # mm6[0] = first symbol
+ movb 1(%ebx),%al
+ movd %eax,%mm5 # mm5[0] = second symbol
+ addl $2,%ebx
+ movl %ebx,12(%ebp)
+ punpcklbw %mm6,%mm6 # mm6[1] = mm6[0]
+ punpcklbw %mm5,%mm5
+ movq thirtyones,%mm7
+ pshufw $0,%mm6,%mm6 # copy low word to upper 3
+ pshufw $0,%mm5,%mm5
+ # mm6 now contains first symbol in each byte, mm5 the second
+ # each invocation of this macro does 8 butterflies in parallel
+ .MACRO butterfly GROUP
+ # compute branch metrics
+ movq Branchtab29_sse+(8*\GROUP),%mm4
+ movq Branchtab29_sse+128+(8*\GROUP),%mm3
+ pxor %mm6,%mm4
+ pxor %mm5,%mm3
+ pavgb %mm3,%mm4 # mm4 contains branch metrics
+ psrlw $3,%mm4
+ pand %mm7,%mm4
+ movq (8*\GROUP)(%esi),%mm0 # Incoming path metric, high bit = 0
+ movq ((8*\GROUP)+128)(%esi),%mm3 # Incoming path metric, high bit = 1
+ movq %mm0,%mm2
+ movq %mm3,%mm1
+ paddusb %mm4,%mm0
+ paddusb %mm4,%mm3
+ # invert branch metrics. This works only because they're 5 bits
+ pxor %mm7,%mm4
+ paddusb %mm4,%mm1
+ paddusb %mm4,%mm2
+ # Find survivors, leave in mm0,2
+ pminub %mm1,%mm0
+ pminub %mm3,%mm2
+ # get decisions, leave in mm1,3
+ pcmpeqb %mm0,%mm1
+ pcmpeqb %mm2,%mm3
+ # interleave and store new branch metrics in mm0,2
+ movq %mm0,%mm4
+ punpckhbw %mm2,%mm0 # interleave second 8 new metrics
+ punpcklbw %mm2,%mm4 # interleave first 8 new metrics
+ movq %mm0,(16*\GROUP+8)(%edi)
+ movq %mm4,(16*\GROUP)(%edi)
+ # interleave decisions, accumulate into %ebx
+ movq %mm1,%mm4
+ punpckhbw %mm3,%mm1
+ punpcklbw %mm3,%mm4
+ # Due to an error in the Intel instruction set ref (the register
+ # fields are swapped), gas assembles pmovmskb incorrectly
+ # See http://mail.gnu.org/pipermail/bug-gnu-utils/2000-August/002341.html
+ .byte 0x0f,0xd7,0xc1 # pmovmskb %mm1,%eax
+ shll $((16*\GROUP+8)&31),%eax
+ orl %eax,%ebx
+ .byte 0x0f,0xd7,0xc4 # pmovmskb %mm4,%eax
+ shll $((16*\GROUP)&31),%eax
+ orl %eax,%ebx
+ .endm
+ # invoke macro 16 times for a total of 128 butterflies
+ xorl %ebx,%ebx # clear decisions
+ butterfly GROUP=0
+ butterfly GROUP=1
+ movl %ebx,(%edx) # stash first 32 decisions
+ xorl %ebx,%ebx
+ butterfly GROUP=2
+ butterfly GROUP=3
+ movl %ebx,4(%edx) # stash second 32 decisions
+ xorl %ebx,%ebx # clear decisions
+ butterfly GROUP=4
+ butterfly GROUP=5
+ movl %ebx,8(%edx) # stash first 32 decisions
+ xorl %ebx,%ebx
+ butterfly GROUP=6
+ butterfly GROUP=7
+ movl %ebx,12(%edx) # stash second 32 decisions
+ xorl %ebx,%ebx # clear decisions
+ butterfly GROUP=8
+ butterfly GROUP=9
+ movl %ebx,16(%edx) # stash first 32 decisions
+ xorl %ebx,%ebx
+ butterfly GROUP=10
+ butterfly GROUP=11
+ movl %ebx,20(%edx) # stash second 32 decisions
+ xorl %ebx,%ebx # clear decisions
+ butterfly GROUP=12
+ butterfly GROUP=13
+ movl %ebx,24(%edx) # stash first 32 decisions
+ xorl %ebx,%ebx
+ butterfly GROUP=14
+ butterfly GROUP=15
+ movl %ebx,28(%edx) # stash second 32 decisions
+ addl $32,%edx # bump decision pointer
+ # see if we have to normalize
+ movl (%edi),%eax # extract first output metric
+ andl $255,%eax
+ cmp $50,%eax # is it greater than 50?
+ movl $0,%eax
+ jle done # No, no need to normalize
+ # Normalize by finding smallest metric and subtracting it
+ # from all metrics
+ movq (%edi),%mm0
+ pminub 8(%edi),%mm0
+ pminub 16(%edi),%mm0
+ pminub 24(%edi),%mm0
+ pminub 32(%edi),%mm0
+ pminub 40(%edi),%mm0
+ pminub 48(%edi),%mm0
+ pminub 56(%edi),%mm0
+ pminub 64(%edi),%mm0
+ pminub 72(%edi),%mm0
+ pminub 80(%edi),%mm0
+ pminub 88(%edi),%mm0
+ pminub 96(%edi),%mm0
+ pminub 104(%edi),%mm0
+ pminub 112(%edi),%mm0
+ pminub 120(%edi),%mm0
+ pminub 128(%edi),%mm0
+ pminub 136(%edi),%mm0
+ pminub 144(%edi),%mm0
+ pminub 152(%edi),%mm0
+ pminub 160(%edi),%mm0
+ pminub 168(%edi),%mm0
+ pminub 176(%edi),%mm0
+ pminub 184(%edi),%mm0
+ pminub 192(%edi),%mm0
+ pminub 200(%edi),%mm0
+ pminub 208(%edi),%mm0
+ pminub 216(%edi),%mm0
+ pminub 224(%edi),%mm0
+ pminub 232(%edi),%mm0
+ pminub 240(%edi),%mm0
+ pminub 248(%edi),%mm0
+ # mm0 contains 8 smallest metrics
+ # crunch down to single lowest metric
+ movq %mm0,%mm1
+ psrlq $32,%mm0
+ pminub %mm1,%mm0
+ movq %mm0,%mm1
+ psrlq $16,%mm0
+ pminub %mm1,%mm0
+ movq %mm0,%mm1
+ psrlq $8,%mm0
+ pminub %mm1,%mm0
+ movq 8(%edi),%mm1 # reload
+ punpcklbw %mm0,%mm0 # expand to all 8 bytes
+ pshufw $0,%mm0,%mm0
+ # mm0 now contains lowest metric in all 8 bytes
+ # subtract it from every output metric
+ # Trashes %mm7
+ movq \MEM,%mm7
+ psubusb \REG,%mm7
+ movq %mm7,\MEM
+ .endm
+ PSUBUSBM %mm0,(%edi)
+ PSUBUSBM %mm0,8(%edi)
+ PSUBUSBM %mm0,16(%edi)
+ PSUBUSBM %mm0,24(%edi)
+ PSUBUSBM %mm0,32(%edi)
+ PSUBUSBM %mm0,40(%edi)
+ PSUBUSBM %mm0,48(%edi)
+ PSUBUSBM %mm0,56(%edi)
+ PSUBUSBM %mm0,64(%edi)
+ PSUBUSBM %mm0,72(%edi)
+ PSUBUSBM %mm0,80(%edi)
+ PSUBUSBM %mm0,88(%edi)
+ PSUBUSBM %mm0,96(%edi)
+ PSUBUSBM %mm0,104(%edi)
+ PSUBUSBM %mm0,112(%edi)
+ PSUBUSBM %mm0,120(%edi)
+ PSUBUSBM %mm0,128(%edi)
+ PSUBUSBM %mm0,136(%edi)
+ PSUBUSBM %mm0,144(%edi)
+ PSUBUSBM %mm0,152(%edi)
+ PSUBUSBM %mm0,160(%edi)
+ PSUBUSBM %mm0,168(%edi)
+ PSUBUSBM %mm0,176(%edi)
+ PSUBUSBM %mm0,184(%edi)
+ PSUBUSBM %mm0,192(%edi)
+ PSUBUSBM %mm0,200(%edi)
+ PSUBUSBM %mm0,208(%edi)
+ PSUBUSBM %mm0,216(%edi)
+ PSUBUSBM %mm0,224(%edi)
+ PSUBUSBM %mm0,232(%edi)
+ PSUBUSBM %mm0,240(%edi)
+ PSUBUSBM %mm0,248(%edi)
+ # swap metrics
+ movl %esi,%eax
+ movl %edi,%esi
+ movl %eax,%edi
+ jmp 1b
+2: emms
+ movl 8(%ebp),%ebx # ebx = vp
+ # stash metric pointers
+ movl %esi,OLDMETRICS(%ebx)
+ movl %edi,NEWMETRICS(%ebx)
+ movl %edx,DP(%ebx) # stash incremented value of vp->dp
+ xorl %eax,%eax
+err: popl %ebx
+ popl %edx
+ popl %edi
+ popl %esi
+ popl %ebp
+ ret
+ .data
+ .align 8
+ .byte 31,31,31,31,31,31,31,31
diff --git a/sumsq.c b/sumsq.c
new file mode 100644
index 0000000..9ed6a39
--- /dev/null
+++ b/sumsq.c
@@ -0,0 +1,40 @@
+/* Compute the sum of the squares of a vector of signed shorts
+ * Copyright 2004 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdlib.h>
+#include "fec.h"
+unsigned long long sumsq_port(signed short *,int);
+#ifdef __i386__
+unsigned long long sumsq_mmx(signed short *,int);
+unsigned long long sumsq_sse(signed short *,int);
+unsigned long long sumsq_sse2(signed short *,int);
+#ifdef __VEC__
+unsigned long long sumsq_av(signed short *,int);
+/* Sum of squares of cnt signed 16-bit samples starting at in.
+ * Dispatches on the global Cpu_mode to one of the implementations
+ * declared above; unknown modes fall back to the portable version.
+ */
+unsigned long long sumsq(signed short *in,int cnt){
+ switch(Cpu_mode){
+ case PORT:
+ default:
+ return sumsq_port(in,cnt);
+#ifdef __i386__
+ /* SSE mode shares the MMX routine */
+ case SSE:
+ case MMX:
+ return sumsq_mmx(in,cnt);
+ case SSE2:
+ return sumsq_sse2(in,cnt);
+#ifdef __VEC__
+ case ALTIVEC:
+ return sumsq_av(in,cnt);
+ }
diff --git a/sumsq_av.c b/sumsq_av.c
new file mode 100644
index 0000000..53c6acf
--- /dev/null
+++ b/sumsq_av.c
@@ -0,0 +1,78 @@
+/* Compute the sum of the squares of a vector of signed shorts
+ * This is the Altivec SIMD version. It's a little hairy because Altivec
+ * does not do 64-bit operations directly, so we have to accumulate separate
+ * 32-bit sums and carries
+ * Copyright 2004 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include "fec.h"
+/* Altivec sum of squares of cnt signed 16-bit samples starting at in.
+ * Four 32-bit partial sums are kept in 'sums' and the carries out of them
+ * in 'carries'; both are folded into one 64-bit result at the end.
+ */
+unsigned long long sumsq_av(signed short *in,int cnt){
+ long long sum;
+ vector signed short x;
+ vector unsigned int sums,carries,s1,s2;
+ int pad;
+ /* Scratch union: used to build vec_sro shift counts (byte 15) and to
+  * read individual 32-bit words back out of a vector
+  */
+ union { vector unsigned char cv; vector unsigned int iv; unsigned int w[4]; unsigned char c[16];} s;
+ carries = sums = (vector unsigned int)(0);
+ /* pad = byte offset of 'in' within its 16-byte block */
+ if((pad = (int)in & 15)!=0){
+ /* Load unaligned leading word */
+ x = vec_perm(vec_ld(0,in),(vector signed short)(0),vec_lvsl(0,in));
+ if(cnt < 8){ /* Shift right to chop stuff beyond end of short block */
+ /* NOTE(review): for cnt == 0 this stores 128 in the shift byte;
+  * confirm vec_sro then discards the whole vector as intended
+  */
+ s.c[15] = (8-cnt)<<4;
+ x = vec_sro(x,s.cv);
+ }
+ sums = (vector unsigned int)vec_msum(x,x,(vector signed int)(0));
+ /* Skip the 8-pad/2 samples that lay before the next 16-byte boundary */
+ in += 8-pad/2;
+ cnt -= 8-pad/2;
+ }
+ /* Everything is now aligned, rip through most of the block */
+ while(cnt >= 8){
+ x = vec_ld(0,in);
+ /* A single vec_msum cannot overflow, but we have to sum it with
+ * the earlier terms separately to handle the carries
+ * The cast to unsigned is OK because squares are always positive
+ */
+ s1 = (vector unsigned int)vec_msum(x,x,(vector signed int)(0));
+ carries = vec_add(carries,vec_addc(sums,s1));
+ sums = vec_add(sums,s1);
+ in += 8;
+ cnt -= 8;
+ }
+ /* Handle trailing fragment, if any */
+ if(cnt > 0){
+ x = vec_ld(0,in);
+ /* Shift right by 8-cnt samples so only the first cnt remain */
+ s.c[15] = (8-cnt)<<4;
+ x = vec_sro(x,s.cv);
+ s1 = (vector unsigned int)vec_msum(x,x,(vector signed int)(0));
+ carries = vec_add(carries,vec_addc(sums,s1));
+ sums = vec_add(sums,s1);
+ }
+ /* Combine 4 sub-sums and carries */
+ s.c[15] = 64; /* Shift right two 32-bit words */
+ s1 = vec_sro(sums,s.cv);
+ s2 = vec_sro(carries,s.cv);
+ carries = vec_add(carries,vec_addc(sums,s1));
+ sums = vec_add(sums,s1);
+ carries = vec_add(carries,s2);
+ s.c[15] = 32; /* Shift right one 32-bit word */
+ s1 = vec_sro(sums,s.cv);
+ s2 = vec_sro(carries,s.cv);
+ carries = vec_add(carries,vec_addc(sums,s1));
+ sums = vec_add(sums,s1);
+ carries = vec_add(carries,s2);
+ /* Extract sum and carries from right-hand words and combine into result */
+ s.iv = sums;
+ sum = s.w[3]; /* low 32 bits of the result */
+ s.iv = carries;
+ sum += (long long)s.w[3] << 32; /* carries form the high 32 bits */
+ return sum;
diff --git a/sumsq_mmx.c b/sumsq_mmx.c
new file mode 100644
index 0000000..e766831
--- /dev/null
+++ b/sumsq_mmx.c
@@ -0,0 +1,35 @@
+/* Compute the sum of the squares of a vector of signed shorts
+ * MMX-assisted version (also used on SSE)
+ * The SSE2 and MMX assist routines both operate on multiples of
+ * 8 words; they differ only in their alignment requirements (8 bytes
+ * for MMX, 16 bytes for SSE2)
+ * Copyright 2004 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+long long sumsq_mmx_assist(signed short *,int);
+/* Sum of squares of cnt signed 16-bit samples starting at in.
+ * Samples before the first 8-byte boundary and the final cnt%8 samples are
+ * squared in C; the aligned middle is handed to sumsq_mmx_assist().
+ */
+long long sumsq_mmx(signed short *in,int cnt){
+ long long sum = 0;
+ /* Handle stuff before the next 8-byte boundary */
+ while(((int)in & 7) != 0 && cnt != 0){
+ sum += (long)in[0] * in[0];
+ in++;
+ cnt--;
+ }
+ /* The assist routine is passed the full remaining count; the code below
+  * then handles the last cnt%8 samples itself
+  */
+ sum += sumsq_mmx_assist(in,cnt);
+ in += cnt & ~7; /* skip the samples the assist routine consumed */
+ cnt &= 7;
+ /* Handle up to 7 words at end */
+ while(cnt != 0){
+ sum += (long)in[0] * in[0];
+ in++;
+ cnt--;
+ }
+ return sum;
diff --git a/sumsq_mmx_assist.s b/sumsq_mmx_assist.s
new file mode 100644
index 0000000..b3bac66
--- /dev/null
+++ b/sumsq_mmx_assist.s
@@ -0,0 +1,83 @@
+# MMX assist routines for sumsq
+# Copyright 2001 Phil Karn, KA9Q
+# May be used under the terms of the GNU Public License (GPL)
+ .text
+# Evaluate sum of squares of signed 16-bit input samples
+# long long sumsq_mmx_assist(signed short *in,int cnt);
+ .global sumsq_mmx_assist
+ .type sumsq_mmx_assist,@function
+ .align 16
+ # Prologue: set up a frame and preserve the registers used below
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %esi
+ pushl %ecx
+ pushl %ebx
+ movl 8(%ebp),%esi # esi = in
+ movl 12(%ebp),%ecx # ecx = cnt
+ xor %eax,%eax # edx:eax = 64-bit running sum, starts at zero
+ xor %edx,%edx
+ # Each pmaddwd dword is the sum of two squares and can reach 2**31
+ # (both samples -32768), so adding two such dwords with paddd could
+ # wrap. Fold every dword into edx:eax separately, with carry.
+1: subl $8,%ecx
+ jl 2f # fewer than 8 samples left: done
+ movq (%esi),%mm0 # S0 S1 S2 S3
+ pmaddwd %mm0,%mm0 # (S0^2+S1^2) (S2^2+S3^2)
+ movq 8(%esi),%mm6 # S4 S5 S6 S7
+ pmaddwd %mm6,%mm6 # (S4^2+S5^2) (S6^2+S7^2)
+ movd %mm0,%ebx # S0^2+S1^2
+ addl %ebx,%eax
+ adcl $0,%edx
+ psrlq $32,%mm0
+ movd %mm0,%ebx # S2^2+S3^2
+ addl %ebx,%eax
+ adcl $0,%edx
+ movd %mm6,%ebx # S4^2+S5^2
+ addl %ebx,%eax
+ adcl $0,%edx
+ psrlq $32,%mm6
+ movd %mm6,%ebx # S6^2+S7^2
+ addl %ebx,%eax
+ adcl $0,%edx
+ addl $16,%esi # advance past the 8 samples just processed
+ jmp 1b
+ # Leave MMX state, restore saved registers; result is in edx:eax
+2: emms
+ popl %ebx
+ popl %ecx
+ popl %esi
+ popl %ebp
+ ret
+# Evaluate sum of squares of signed 16-bit input samples
+# long sumsq_wd_mmx_assist(signed short *in,int cnt);
+# Quick version, only safe for small numbers of small input values...
+ .global sumsq_wd_mmx_assist
+ .type sumsq_wd_mmx_assist,@function
+ .align 16
+ # Prologue; only %esi is saved, %ecx and %edx are used without saving
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %esi
+ movl 8(%ebp),%esi # esi = in
+ movl 12(%ebp),%ecx # ecx = cnt
+ pxor %mm2,%mm2 # zero sum
+ # Process 8 samples per pass; the two dword lanes of mm2 accumulate
+ # with 32-bit paddd, so large totals wrap (hence "quick version")
+1: subl $8,%ecx
+ jl 2f # fewer than 8 samples left: done
+ movq (%esi),%mm0 # S0 S1 S2 S3
+ pmaddwd %mm0,%mm0 # (S0*S0+S1*S1) (S2*S2+S3*S3)
+ movq 8(%esi),%mm1 # S4 S5 S6 S7
+ pmaddwd %mm1,%mm1 # (S4*S4+S5*S5) (S6*S6+S7*S7)
+ paddd %mm1,%mm2
+ paddd %mm0,%mm2 # accumulate
+ addl $16,%esi # advance past the 8 samples just processed
+ jmp 1b
+ # Add the two 32-bit lane sums together and return the result in eax
+2: movd %mm2,%eax # even sum
+ psrlq $32,%mm2
+ movd %mm2,%edx # odd sum
+ addl %edx,%eax
+ emms
+ popl %esi
+ popl %ebp
+ ret
diff --git a/sumsq_port.c b/sumsq_port.c
new file mode 100644
index 0000000..6d0b4c1
--- /dev/null
+++ b/sumsq_port.c
@@ -0,0 +1,16 @@
+/* Compute the sum of the squares of a vector of signed shorts
+ * Portable C version
+ * Copyright 2004 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+/* Portable reference implementation: returns the sum of in[i]*in[i]
+ * for i in [0,cnt). A non-positive cnt yields 0.
+ */
+unsigned long long sumsq_port(signed short *in,int cnt){
+ long long sum = 0;
+ int i;
+ for(i=0;i<cnt;i++){
+ /* Each square is formed in int and then added to the 64-bit total */
+ sum += (int)in[i] * (int)in[i];
+ }
+ return sum;
diff --git a/sumsq_sse2.c b/sumsq_sse2.c
new file mode 100644
index 0000000..b05d2e9
--- /dev/null
+++ b/sumsq_sse2.c
@@ -0,0 +1,33 @@
+/* Compute the sum of the squares of a vector of signed shorts
+ * The SSE2 and MMX assist routines both operate on multiples of
+ * 8 words; they differ only in their alignment requirements (8 bytes
+ * for MMX, 16 bytes for SSE2)
+ * Copyright 2004 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+long long sumsq_sse2_assist(signed short *,int);
+/* Sum of squares of cnt signed 16-bit samples starting at in.
+ * Samples before the first 16-byte boundary and the final cnt%8 samples are
+ * squared in C; the aligned middle is handed to sumsq_sse2_assist().
+ */
+long long sumsq_sse2(signed short *in,int cnt){
+ long long sum = 0;
+ /* Handle stuff before the next 16-byte boundary */
+ while(((int)in & 15) != 0 && cnt != 0){
+ sum += (long)in[0] * in[0];
+ in++;
+ cnt--;
+ }
+ /* The assist routine is passed the full remaining count; the code below
+  * then handles the last cnt%8 samples itself
+  */
+ sum += sumsq_sse2_assist(in,cnt);
+ in += cnt & ~7; /* skip the samples the assist routine consumed */
+ cnt &= 7;
+ /* Handle up to 7 trailing words */
+ while(cnt != 0){
+ sum += (long)in[0] * in[0];
+ in++;
+ cnt--;
+ }
+ return sum;
diff --git a/sumsq_sse2_assist.s b/sumsq_sse2_assist.s
new file mode 100644
index 0000000..d1c4ee7
--- /dev/null
+++ b/sumsq_sse2_assist.s
@@ -0,0 +1,49 @@
+# SSE2 assist routines for sumsq
+# Copyright 2001 Phil Karn, KA9Q
+# May be used under the terms of the GNU Public License (GPL)
+ .text
+# Evaluate sum of squares of signed 16-bit input samples
+# long long sumsq_sse2_assist(signed short *in,int cnt);
+ .global sumsq_sse2_assist
+ .type sumsq_sse2_assist,@function
+ .align 16
+ # Prologue: set up a frame and preserve %esi and %ecx
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %esi
+ pushl %ecx
+ movl 8(%ebp),%esi # esi = in
+ movl 12(%ebp),%ecx # ecx = cnt
+ pxor %xmm2,%xmm2 # zero sum
+ movaps low,%xmm3 # load mask
+ # Process 8 samples per pass. xmm2 holds two 64-bit accumulators;
+ # each 32-bit pmaddwd result is widened to 64 bits before the paddq.
+1: subl $8,%ecx
+ jl 2f # fewer than 8 samples left: done
+ movaps (%esi),%xmm0 # S0 S1 S2 S3 S4 S5 S6 S7
+ pmaddwd %xmm0,%xmm0 # (S0*S0+S1*S1) (S2*S2+S3*S3) (S4*S4+S5*S5) (S6*S6+S7*S7)
+ movaps %xmm0,%xmm1
+ pand %xmm3,%xmm1 # (S0*S0+S1*S1) 0 (S4*S4+S5*S5) 0
+ paddq %xmm1,%xmm2 # sum even-numbered dwords
+ psrlq $32,%xmm0 # (S2*S2+S3*S3) 0 (S6*S6+S7*S7) 0
+ paddq %xmm0,%xmm2 # sum odd-numbered dwords
+ addl $16,%esi # advance past the 8 samples just processed
+ jmp 1b
+ # Add the upper 64-bit accumulator to the lower one and return the
+ # 64-bit total in edx:eax
+2: movaps %xmm2,%xmm0
+ psrldq $8,%xmm0
+ paddq %xmm2,%xmm0 # combine 64-bit sums
+ movd %xmm0,%eax # low 32 bits of sum
+ psrldq $4,%xmm0
+ movd %xmm0,%edx # high 32 bits of sum
+ popl %ecx
+ popl %esi
+ popl %ebp
+ ret
+ .data
+ .align 16
+low: .byte 255,255,255,255,0,0,0,0,255,255,255,255,0,0,0,0
diff --git a/sumsq_test.c b/sumsq_test.c
new file mode 100644
index 0000000..4debd47
--- /dev/null
+++ b/sumsq_test.c
@@ -0,0 +1,101 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <time.h>
+#include "config.h"
+#include <getopt.h>
+#include "fec.h"
+struct option Options[] = {
+ {"frame-length",1,NULL,'l'},
+ {"frame-count",1,NULL,'n'},
+ {"verbose",0,NULL,'v'},
+ {"force-altivec",0,NULL,'a'},
+ {"force-port",0,NULL,'p'},
+ {"force-mmx",0,NULL,'m'},
+ {"force-sse",0,NULL,'s'},
+ {"force-sse2",0,NULL,'t'},
+ {NULL},
+int Verbose = 0;
+/* Test driver for sumsq(). By default it compares sumsq() against
+ * sumsq_port() on random data with random offsets and lengths and prints
+ * any mismatch; with -T it only calls sumsq() repeatedly for timing.
+ */
+int main(int argc,char *argv[]){
+ signed short *buf;
+ int i,d,trial,trials=10000;
+ int bufsize = 2048;
+ long long port_sum,simd_sum;
+ time_t t;
+ int timetrials=0;
+ find_cpu_mode();
+ /* Seed the random generator from the current time */
+ time(&t);
+ srandom(t);
+ /* NOTE(review): the next two lines look like alternative forms of the same
+  * option loop (getopt_long vs. getopt); presumably only one is meant to be
+  * compiled - confirm against the original file
+  */
+ while((d = getopt_long(argc,argv,"vapmstl:n:T",Options,NULL)) != EOF){
+ while((d = getopt(argc,argv,"vapmstl:n:T")) != EOF){
+ switch(d){
+ case 'a':
+ Cpu_mode = ALTIVEC;
+ break;
+ case 'p':
+ Cpu_mode = PORT;
+ break;
+ case 'm':
+ Cpu_mode = MMX;
+ break;
+ case 's':
+ Cpu_mode = SSE;
+ break;
+ case 't':
+ Cpu_mode = SSE2;
+ break;
+ case 'l':
+ bufsize = atoi(optarg);
+ break;
+ case 'n':
+ trials = atoi(optarg);
+ break;
+ case 'v':
+ Verbose++;
+ break;
+ case 'T':
+ timetrials++;
+ break;
+ }
+ }
+ /* NOTE(review): the calloc result is not checked, and a bufsize of 0 from
+  * -l would make the "% bufsize" below divide by zero
+  */
+ buf = (signed short *)calloc(bufsize,sizeof(signed short));
+ if(timetrials){
+ /* Timing mode: run sumsq() over the zero-filled buffer, ignore the result */
+ for(trial=0;trial<trials;trial++){
+ (void)sumsq(buf,bufsize);
+ }
+ } else {
+ for(trial=0;trial<trials;trial++){
+ int length,offset;
+ /* Random start offset of 0..7 samples and a random length */
+ offset = random() & 7;
+ length = (random() % bufsize) - offset;
+ if(length <= 0)
+ continue;
+ for(i=0;i<bufsize;i++)
+ buf[i] = random();
+ port_sum = sumsq_port(buf+offset,length);
+ simd_sum = sumsq(buf+offset,length);
+ if(port_sum != simd_sum){
+ printf("offset %d len %d port_sum = %lld simd_sum = %lld ",offset,length,port_sum,simd_sum);
+ printf("ERROR! diff = %lld\n",simd_sum-port_sum);
+ }
+ }
+ }
+ exit(0);
diff --git a/viterbi27.c b/viterbi27.c
new file mode 100644
index 0000000..554da92
--- /dev/null
+++ b/viterbi27.c
@@ -0,0 +1,161 @@
+/* K=7 r=1/2 Viterbi decoder with optional Intel or PowerPC SIMD
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include "fec.h"
+/* Create a new instance of a Viterbi decoder
+ * Calls find_cpu_mode() first, then forwards len to the create routine
+ * for the selected Cpu_mode and returns its result. Unknown modes use
+ * the portable version.
+ */
+void *create_viterbi27(int len){
+ find_cpu_mode();
+ switch(Cpu_mode){
+ case PORT:
+ default:
+ return create_viterbi27_port(len);
+#ifdef __VEC__
+ case ALTIVEC:
+ return create_viterbi27_av(len);
+#ifdef __i386__
+ case MMX:
+ return create_viterbi27_mmx(len);
+ case SSE:
+ return create_viterbi27_sse(len);
+ case SSE2:
+ return create_viterbi27_sse2(len);
+ }
+/* Set the two code polynomials by forwarding polys to the routine for the
+ * current Cpu_mode. Unlike create_viterbi27(), this does not call
+ * find_cpu_mode() itself.
+ */
+void set_viterbi27_polynomial(int polys[2]){
+ switch(Cpu_mode){
+ case PORT:
+ default:
+ set_viterbi27_polynomial_port(polys);
+ break;
+#ifdef __VEC__
+ case ALTIVEC:
+ set_viterbi27_polynomial_av(polys);
+ break;
+#ifdef __i386__
+ case MMX:
+ set_viterbi27_polynomial_mmx(polys);
+ break;
+ case SSE:
+ set_viterbi27_polynomial_sse(polys);
+ break;
+ case SSE2:
+ set_viterbi27_polynomial_sse2(polys);
+ break;
+ }
+/* Initialize Viterbi decoder for start of new frame
+ * Forwards p and starting_state to the init routine for the current
+ * Cpu_mode and returns its result.
+ */
+int init_viterbi27(void *p,int starting_state){
+ switch(Cpu_mode){
+ case PORT:
+ default:
+ return init_viterbi27_port(p,starting_state);
+#ifdef __VEC__
+ case ALTIVEC:
+ return init_viterbi27_av(p,starting_state);
+#ifdef __i386__
+ case MMX:
+ return init_viterbi27_mmx(p,starting_state);
+ case SSE:
+ return init_viterbi27_sse(p,starting_state);
+ case SSE2:
+ return init_viterbi27_sse2(p,starting_state);
+ }
+/* Viterbi chainback
+ * Forwards all arguments to the chainback routine for the current
+ * Cpu_mode and returns its result.
+ */
+int chainback_viterbi27(
+ void *p,
+ unsigned char *data, /* Decoded output data */
+ unsigned int nbits, /* Number of data bits */
+ unsigned int endstate){ /* Terminal encoder state */
+ switch(Cpu_mode){
+ case PORT:
+ default:
+ return chainback_viterbi27_port(p,data,nbits,endstate);
+#ifdef __VEC__
+ case ALTIVEC:
+ return chainback_viterbi27_av(p,data,nbits,endstate);
+#ifdef __i386__
+ case MMX:
+ return chainback_viterbi27_mmx(p,data,nbits,endstate);
+ case SSE:
+ return chainback_viterbi27_sse(p,data,nbits,endstate);
+ case SSE2:
+ return chainback_viterbi27_sse2(p,data,nbits,endstate);
+ }
+/* Delete instance of a Viterbi decoder
+ * Forwards p to the delete routine for the current Cpu_mode.
+ */
+void delete_viterbi27(void *p){
+ switch(Cpu_mode){
+ case PORT:
+ default:
+ delete_viterbi27_port(p);
+ break;
+#ifdef __VEC__
+ case ALTIVEC:
+ delete_viterbi27_av(p);
+ break;
+#ifdef __i386__
+ case MMX:
+ delete_viterbi27_mmx(p);
+ break;
+ case SSE:
+ delete_viterbi27_sse(p);
+ break;
+ case SSE2:
+ delete_viterbi27_sse2(p);
+ break;
+ }
+/* Update decoder with a block of demodulated symbols
+ * Note that nbits is the number of decoded data bits, not the number
+ * of symbols!
+ * Returns -1 if p is NULL, otherwise 0; the return value of the
+ * mode-specific routine is not propagated.
+ */
+int update_viterbi27_blk(void *p,unsigned char syms[],int nbits){
+ if(p == NULL)
+ return -1;
+ switch(Cpu_mode){
+ case PORT:
+ default:
+ update_viterbi27_blk_port(p,syms,nbits);
+ break;
+#ifdef __VEC__
+ case ALTIVEC:
+ update_viterbi27_blk_av(p,syms,nbits);
+ break;
+#ifdef __i386__
+ case MMX:
+ update_viterbi27_blk_mmx(p,syms,nbits);
+ break;
+ case SSE:
+ update_viterbi27_blk_sse(p,syms,nbits);
+ break;
+ case SSE2:
+ update_viterbi27_blk_sse2(p,syms,nbits);
+ break;
+ }
+ return 0;
diff --git a/viterbi27_av.c b/viterbi27_av.c
new file mode 100644
index 0000000..98d7344
--- /dev/null
+++ b/viterbi27_av.c
@@ -0,0 +1,210 @@
+/* K=7 r=1/2 Viterbi decoder for PowerPC G4/G5 Altivec instructions
+ * Feb 2004, Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include <memory.h>
+#include <stdlib.h>
+#include "fec.h"
+typedef union { long long p; unsigned char c[64]; vector bool char v[4]; } decision_t;
+typedef union { long long p; unsigned char c[64]; vector unsigned char v[4]; } metric_t;
+static union branchtab27 { unsigned char c[32]; vector unsigned char v[2];} Branchtab27[2];
+static int Init = 0;
+/* State info for instance of Viterbi decoder
+ * Don't change this without also changing references in [mmx|sse|sse2]bfly27.s!
+ */
+struct v27 {
+ metric_t metrics1; /* path metric buffer 1 */
+ metric_t metrics2; /* path metric buffer 2 */
+ decision_t *dp; /* Pointer to current decision */
+ metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+ decision_t *decisions; /* Beginning of decisions for block */
+/* Initialize Viterbi decoder for start of new frame
+ * Sets every metric in buffer 1 to 63, makes buffer 1 the old metrics and
+ * buffer 2 the new metrics, rewinds dp to the start of the decision
+ * storage, and zeroes the metric of the starting state.
+ * Returns -1 if p is NULL, otherwise 0.
+ */
+int init_viterbi27_av(void *p,int starting_state){
+ struct v27 *vp = p;
+ int i;
+ if(p == NULL)
+ return -1;
+ for(i=0;i<4;i++)
+ vp->metrics1.v[i] = (vector unsigned char)(63);
+ vp->old_metrics = &vp->metrics1;
+ vp->new_metrics = &vp->metrics2;
+ vp->dp = vp->decisions;
+ vp->old_metrics->c[starting_state & 63] = 0; /* Bias known start state */
+ return 0;
+/* Fill the two branch tables from the polynomials in polys.
+ * For each of the 32 states the entry is 255 or 0, chosen by the parity of
+ * (2*state) masked with |polys[n]|, inverted when polys[n] is negative.
+ * Increments Init so create_viterbi27_av() skips its default setup.
+ */
+void set_viterbi27_polynomial_av(int polys[2]){
+ int state;
+ for(state=0;state < 32;state++){
+ Branchtab27[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0;
+ Branchtab27[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0;
+ }
+ Init++;
+/* Create a new instance of a Viterbi decoder
+ * Installs the default polynomials (V27POLYA, V27POLYB) if none have been
+ * set, allocates the decoder state plus len+6 decision entries, and
+ * initializes it with starting state 0.
+ * Returns the new decoder, or NULL if either allocation fails.
+ */
+void *create_viterbi27_av(int len){
+ struct v27 *vp;
+ if(!Init){
+ int polys[2] = { V27POLYA,V27POLYB };
+ set_viterbi27_polynomial_av(polys);
+ }
+ if((vp = (struct v27 *)malloc(sizeof(struct v27))) == NULL)
+ return NULL;
+ if((vp->decisions = (decision_t *)malloc((len+6)*sizeof(decision_t))) == NULL){
+ free(vp); /* don't leak the state block when the second allocation fails */
+ return NULL;
+ }
+ init_viterbi27_av(vp,0);
+ return vp;
+/* Viterbi chainback
+ * Walks the stored decisions backwards from endstate and writes the
+ * decoded bits into data[], one byte per 8 bits.
+ * Returns -1 if p is NULL, otherwise 0.
+ */
+int chainback_viterbi27_av(
+ void *p,
+ unsigned char *data, /* Decoded output data */
+ unsigned int nbits, /* Number of data bits */
+ unsigned int endstate){ /* Terminal encoder state */
+ struct v27 *vp = p;
+ decision_t *d;
+ /* Check for a NULL decoder before reading any of its fields */
+ if(p == NULL)
+ return -1;
+ d = (decision_t *)vp->decisions;
+ /* Make room beyond the end of the encoder register so we can
+ * accumulate a full byte of decoded data
+ */
+ endstate %= 64;
+ endstate <<= 2;
+ /* The store into data[] only needs to be done every 8 bits.
+ * But this avoids a conditional branch, and the writes will
+ * combine in the cache anyway
+ */
+ d += 6; /* Look past tail */
+ while(nbits-- != 0){
+ int k;
+ /* Decision bit for the current 6-bit state (endstate>>2) */
+ k = d[nbits].c[endstate>>2] & 1;
+ /* Shift the decision in at the top; the low 8 bits go to the output byte */
+ data[nbits>>3] = endstate = (endstate >> 1) | (k << 7);
+ }
+ return 0;
+/* Delete instance of a Viterbi decoder
+ * Frees the decision storage and then the decoder state itself.
+ * A NULL p is ignored.
+ */
+void delete_viterbi27_av(void *p){
+ struct v27 *vp = p;
+ if(vp != NULL){
+ free(vp->decisions);
+ free(vp);
+ }
+/* Process received symbols
+ * Consumes two symbols from syms per decoded bit, for nbits bits. Each bit
+ * computes all 64 new path metrics, stores one decision_t, renormalizes the
+ * metrics when needed and swaps the old/new metric buffers.
+ * Returns -1 if p is NULL, otherwise 0.
+ */
+int update_viterbi27_blk_av(void *p,unsigned char *syms,int nbits){
+ struct v27 *vp = p;
+ decision_t *d;
+ if(p == NULL)
+ return -1;
+ d = (decision_t *)vp->dp;
+ while(nbits--){
+ vector unsigned char survivor0,survivor1,sym0v,sym1v;
+ vector bool char decision0,decision1;
+ vector unsigned char metric,m_metric,m0,m1,m2,m3;
+ void *tmp;
+ /* sym0v.0 = syms[0]; sym0v.1 = syms[1] */
+ sym0v = vec_perm(vec_ld(0,syms),vec_ld(1,syms),vec_lvsl(0,syms));
+ sym1v = vec_splat(sym0v,1); /* Splat syms[1] across sym1v */
+ sym0v = vec_splat(sym0v,0); /* Splat syms[0] across sym0v */
+ syms += 2;
+ /* Do the 32 butterflies as two interleaved groups of 16 each to keep the pipes full */
+ /* Form first set of 16 branch metrics */
+ metric = vec_avg(vec_xor(Branchtab27[0].v[0],sym0v),vec_xor(Branchtab27[1].v[0],sym1v));
+ metric = vec_sr(metric,(vector unsigned char)(3));
+ /* m_metric is the complementary branch metric, 31 - metric */
+ m_metric = vec_sub((vector unsigned char)(31),metric);
+ /* Form first set of path metrics */
+ m0 = vec_adds(vp->old_metrics->v[0],metric);
+ m3 = vec_adds(vp->old_metrics->v[2],metric);
+ m1 = vec_adds(vp->old_metrics->v[2],m_metric);
+ m2 = vec_adds(vp->old_metrics->v[0],m_metric);
+ /* Form second set of 16 branch metrics */
+ metric = vec_avg(vec_xor(Branchtab27[0].v[1],sym0v),vec_xor(Branchtab27[1].v[1],sym1v));
+ metric = vec_sr(metric,(vector unsigned char)(3));
+ m_metric = vec_sub((vector unsigned char)(31),metric);
+ /* Compare and select first set */
+ decision0 = vec_cmpgt(m0,m1);
+ decision1 = vec_cmpgt(m2,m3);
+ survivor0 = vec_min(m0,m1);
+ survivor1 = vec_min(m2,m3);
+ /* Compute second set of path metrics */
+ m0 = vec_adds(vp->old_metrics->v[1],metric);
+ m3 = vec_adds(vp->old_metrics->v[3],metric);
+ m1 = vec_adds(vp->old_metrics->v[3],m_metric);
+ m2 = vec_adds(vp->old_metrics->v[1],m_metric);
+ /* Interleave and store first decisions and survivors */
+ d->v[0] = vec_mergeh(decision0,decision1);
+ d->v[1] = vec_mergel(decision0,decision1);
+ vp->new_metrics->v[0] = vec_mergeh(survivor0,survivor1);
+ vp->new_metrics->v[1] = vec_mergel(survivor0,survivor1);
+ /* Compare and select second set */
+ decision0 = vec_cmpgt(m0,m1);
+ decision1 = vec_cmpgt(m2,m3);
+ survivor0 = vec_min(m0,m1);
+ survivor1 = vec_min(m2,m3);
+ /* Interleave and store second set of decisions and survivors */
+ d->v[2] = vec_mergeh(decision0,decision1);
+ d->v[3] = vec_mergel(decision0,decision1);
+ vp->new_metrics->v[2] = vec_mergeh(survivor0,survivor1);
+ vp->new_metrics->v[3] = vec_mergel(survivor0,survivor1);
+ /* renormalize if necessary */
+ /* Only the metric of state 0 is tested against the threshold */
+ if(vp->new_metrics->c[0] >= 105){
+ vector unsigned char scale0,scale1;
+ /* Find smallest metric and splat */
+ scale0 = vec_min(vp->new_metrics->v[0],vp->new_metrics->v[1]);
+ scale1 = vec_min(vp->new_metrics->v[2],vp->new_metrics->v[3]);
+ scale0 = vec_min(scale0,scale1);
+ scale0 = vec_min(scale0,vec_sld(scale0,scale0,8));
+ scale0 = vec_min(scale0,vec_sld(scale0,scale0,4));
+ scale0 = vec_min(scale0,vec_sld(scale0,scale0,2));
+ scale0 = vec_min(scale0,vec_sld(scale0,scale0,1));
+ /* Now subtract from all metrics */
+ vp->new_metrics->v[0] = vec_subs(vp->new_metrics->v[0],scale0);
+ vp->new_metrics->v[1] = vec_subs(vp->new_metrics->v[1],scale0);
+ vp->new_metrics->v[2] = vec_subs(vp->new_metrics->v[2],scale0);
+ vp->new_metrics->v[3] = vec_subs(vp->new_metrics->v[3],scale0);
+ }
+ d++;
+ /* Swap pointers to old and new metrics */
+ tmp = vp->old_metrics;
+ vp->old_metrics = vp->new_metrics;
+ vp->new_metrics = tmp;
+ }
+ /* Remember where the next call should store its decisions */
+ vp->dp = d;
+ return 0;
diff --git a/viterbi27_mmx.c b/viterbi27_mmx.c
new file mode 100644
index 0000000..a6d5125
--- /dev/null
+++ b/viterbi27_mmx.c
@@ -0,0 +1,115 @@
+/* K=7 r=1/2 Viterbi decoder for MMX
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <mmintrin.h>
+#include "fec.h"
+typedef union { char c[64]; __m64 v[8];} decision_t; /* Decisions for one bit time: one byte per state (chainback tests bit 0), also viewable as 8 MMX words */
+typedef union { unsigned char c[64]; __m64 v[8];} metric_t; /* Path metrics for the 64 states, one byte each */
+unsigned char Mettab27_1[256][32] __attribute__ ((aligned(16))); /* Metric table for polys[0], indexed [received symbol][state]; filled by set_viterbi27_polynomial_mmx() */
+unsigned char Mettab27_2[256][32] __attribute__ ((aligned(16))); /* Same as Mettab27_1, but for polys[1] */
+static int Init = 0; /* Incremented whenever the metric tables are built; 0 means not built yet */
+/* State info for instance of Viterbi decoder
+ * Don't change this without also changing references in mmxbfly27.s!
+ */
+struct v27 {
+ metric_t metrics1; /* path metric buffer 1 */
+ metric_t metrics2; /* path metric buffer 2 */
+ decision_t *dp; /* Pointer to current decision */
+ metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+ decision_t *decisions; /* Beginning of decisions for block */
+/* Reset a decoder instance so it can start decoding a new frame.
+ * p is an instance returned by create_viterbi27_mmx(); starting_state is
+ * the known initial encoder state (only its low 6 bits are used).
+ * Returns 0 on success, -1 if p is NULL.
+ */
+int init_viterbi27_mmx(void *p,int starting_state){
+  struct v27 *vp = (struct v27 *)p;
+  int state;
+  if(vp == NULL)
+    return -1;
+  /* Every state starts out with the same path metric */
+  for(state = 0; state < 64; state++)
+    vp->metrics1.c[state] = 63;
+  vp->dp = vp->decisions;
+  vp->new_metrics = &vp->metrics2;
+  vp->old_metrics = &vp->metrics1;
+  /* Bias the known start state by giving it metric 0 */
+  vp->metrics1.c[starting_state & 63] = 0;
+  return 0;
+/* Build both MMX metric tables for a pair of generator polynomials.
+ * For a negative polynomial the absolute value is used and the expected
+ * encoder bit is inverted. Increments Init so create_viterbi27_mmx()
+ * knows the tables exist.
+ */
+void set_viterbi27_polynomial_mmx(int polys[2]){
+  int s;
+  for(s = 0; s < 32; s++){
+    int rx;
+    for(rx = 0; rx < 256; rx++){
+      int expect;
+      int m;
+      /* Metric against the first polynomial's expected bit */
+      expect = parity((2*s) & abs(polys[0])) ^ (polys[0] < 0);
+      if(expect)
+        m = 255 - rx;
+      else
+        m = rx;
+      Mettab27_1[rx][s] = m / 16;
+      /* Metric against the second polynomial's expected bit */
+      expect = parity((2*s) & abs(polys[1])) ^ (polys[1] < 0);
+      if(expect)
+        m = 255 - rx;
+      else
+        m = rx;
+      Mettab27_2[rx][s] = m / 16;
+    }
+  }
+  Init++;
+/* Allocate and initialize a decoder instance with room for len+6
+ * decision records. Builds the default metric tables first if no
+ * polynomial has been set. Returns NULL if either allocation fails.
+ */
+void *create_viterbi27_mmx(int len){
+  struct v27 *vp;
+  if(Init == 0){
+    int polys[2] = { V27POLYA, V27POLYB };
+    set_viterbi27_polynomial_mmx(polys);
+  }
+  vp = (struct v27 *)malloc(sizeof(struct v27));
+  if(vp == NULL)
+    return NULL;
+  vp->decisions = (decision_t *)malloc((len+6)*sizeof(decision_t));
+  if(vp->decisions == NULL){
+    free(vp);
+    return NULL;
+  }
+  init_viterbi27_mmx(vp,0);
+  return vp;
+/* Viterbi chainback: walk the stored decisions backwards from endstate
+ * and write the nbits decoded data bits into data[], 8 bits per byte.
+ * Returns 0 on success, -1 if p is NULL.
+ */
+int chainback_viterbi27_mmx(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v27 *vp = (struct v27 *)p;
+  decision_t *d;
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->decisions;
+  /* Keep the 6-bit state in the upper bits of an 8-bit value so that a
+   * full byte of decoded data accumulates below it as it shifts right.
+   * The loop indexes with endstate>>2, so the initial state must be
+   * shifted up by 2 here; otherwise the first lookups use the wrong state.
+   */
+  endstate &= 63;
+  endstate <<= 2;
+  d += 6; /* Look past tail */
+  while(nbits-- != 0){
+    int k;
+    /* One decision byte per state; bit 0 is the decision */
+    k = d[nbits].c[endstate>>2] & 1;
+    /* Stored on every bit; the last store to each byte holds all 8 bits */
+    data[nbits>>3] = endstate = (endstate >> 1) | (k << 7);
+  }
+  return 0;
+/* Free a decoder instance and its decision buffer; a NULL p is ignored */
+void delete_viterbi27_mmx(void *p){
+  struct v27 *vp = p;
+  if(vp == NULL)
+    return;
+  free(vp->decisions);
+  free(vp);
diff --git a/viterbi27_port.c b/viterbi27_port.c
new file mode 100644
index 0000000..7cac2b3
--- /dev/null
+++ b/viterbi27_port.c
@@ -0,0 +1,191 @@
+/* K=7 r=1/2 Viterbi decoder in portable C
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <limits.h>
+#include "fec.h"
+typedef union { unsigned int w[64]; } metric_t; /* Path metrics for the 64 states, one unsigned int each */
+typedef union { unsigned long w[2];} decision_t; /* Packed decisions for one bit time, one bit per state; the code here uses bit positions 0..31 of each word */
+static union branchtab27 { unsigned char c[32]; } Branchtab27[2] __attribute__ ((aligned(16))); /* Per-state expected symbol (0 or 255) for each polynomial; filled by set_viterbi27_polynomial_port() */
+static int Init = 0; /* Incremented whenever Branchtab27 is built; 0 means not built yet */
+/* State info for instance of Viterbi decoder
+ * Don't change this without also changing references in [mmx|sse|sse2]bfly29.s!
+ * NOTE(review): the file names above refer to K=9 assembler sources while
+ * this is the portable K=7 decoder; the warning looks copied - confirm.
+ */
+struct v27 {
+ metric_t metrics1; /* path metric buffer 1 */
+ metric_t metrics2; /* path metric buffer 2 */
+ decision_t *dp; /* Pointer to current decision */
+ metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+ decision_t *decisions; /* Beginning of decisions for block */
+/* Reset a decoder instance so it can start decoding a new frame.
+ * starting_state is the known initial encoder state (low 6 bits used).
+ * Returns 0 on success, -1 if p is NULL.
+ */
+int init_viterbi27_port(void *p,int starting_state){
+  struct v27 *vp = p;
+  int state;
+  if(vp == NULL)
+    return -1;
+  /* Every state starts out with the same path metric */
+  for(state = 0; state < 64; state++)
+    vp->metrics1.w[state] = 63;
+  vp->dp = vp->decisions;
+  vp->new_metrics = &vp->metrics2;
+  vp->old_metrics = &vp->metrics1;
+  /* Bias the known start state by giving it metric 0 */
+  vp->metrics1.w[starting_state & 63] = 0;
+  return 0;
+/* Fill Branchtab27 for a pair of generator polynomials: each entry is 255
+ * when the encoder output bit for that state is 1 and 0 otherwise. A
+ * negative polynomial means "use the absolute value and invert the bit".
+ * Increments Init so create_viterbi27_port() knows the table exists.
+ */
+void set_viterbi27_polynomial_port(int polys[2]){
+  int s;
+  for(s = 0; s < 32; s++){
+    int j;
+    for(j = 0; j < 2; j++){
+      if((polys[j] < 0) ^ parity((2*s) & abs(polys[j])))
+        Branchtab27[j].c[s] = 255;
+      else
+        Branchtab27[j].c[s] = 0;
+    }
+  }
+  Init++;
+/* Allocate and initialize a decoder instance with room for len+6
+ * decision records. Builds the default branch table first if no
+ * polynomial has been set. Returns NULL if either allocation fails.
+ */
+void *create_viterbi27_port(int len){
+  struct v27 *vp;
+  if(Init == 0){
+    int polys[2] = { V27POLYA, V27POLYB };
+    set_viterbi27_polynomial_port(polys);
+  }
+  vp = malloc(sizeof(struct v27));
+  if(vp == NULL)
+    return NULL;
+  vp->decisions = malloc((len+6)*sizeof(decision_t));
+  if(vp->decisions == NULL){
+    free(vp);
+    return NULL;
+  }
+  init_viterbi27_port(vp,0);
+  return vp;
+/* Viterbi chainback: walk the stored decisions backwards from endstate
+ * and write the nbits decoded data bits into data[], 8 bits per byte.
+ * Returns 0 on success, -1 if p is NULL.
+ */
+int chainback_viterbi27_port(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v27 *vp = p;
+  decision_t *tail;
+  if(vp == NULL)
+    return -1;
+  tail = vp->decisions + 6; /* Look past tail */
+  /* Hold the 6-bit state in the top of an 8-bit value, leaving room
+   * below it to accumulate a full byte of decoded data
+   */
+  endstate = (endstate % 64) << 2;
+  /* data[] is written on every bit instead of every 8th one: that avoids
+   * a conditional branch, and the writes combine in the cache anyway
+   */
+  while(nbits != 0){
+    unsigned int state;
+    int k;
+    nbits--;
+    state = endstate >> 2;
+    k = (tail[nbits].w[state/32] >> (state%32)) & 1;
+    endstate = (endstate >> 1) | (k << 7);
+    data[nbits>>3] = endstate;
+  }
+  return 0;
+/* Free a decoder instance and its decision buffer; a NULL p is ignored */
+void delete_viterbi27_port(void *p){
+  struct v27 *vp = p;
+  if(vp == NULL)
+    return;
+  free(vp->decisions);
+  free(vp);
+/* C-language butterfly
+ * Expects vp, d, sym0 and sym1 to be in scope at the point of use.
+ * Reads the old path metrics of states i and i+32 and writes the new
+ * metrics of states 2*i and 2*i+1. The branch metric is the sum of two
+ * byte XORs (0..510); the competing branch uses 510 - metric. For each
+ * new state the smaller candidate survives, and the decision bit (1 when
+ * the i+32 candidate wins) is ORed into bit (2*i)&31 or (2*i+1)&31 of
+ * d->w[i/16]. The comparison is made on the signed difference m0-m1.
+ */
+#define BFLY(i) {\
+unsigned int metric,m0,m1,decision;\
+ metric = (Branchtab27[0].c[i] ^ sym0) + (Branchtab27[1].c[i] ^ sym1);\
+ m0 = vp->old_metrics->w[i] + metric;\
+ m1 = vp->old_metrics->w[i+32] + (510 - metric);\
+ decision = (signed int)(m0-m1) > 0;\
+ vp->new_metrics->w[2*i] = decision ? m1 : m0;\
+ d->w[i/16] |= decision << ((2*i)&31);\
+ m0 -= (metric+metric-510);\
+ m1 += (metric+metric-510);\
+ decision = (signed int)(m0-m1) > 0;\
+ vp->new_metrics->w[2*i+1] = decision ? m1 : m0;\
+ d->w[i/16] |= decision << ((2*i+1)&31);\
+/* Run the decoder over a block of demodulated symbols.
+ * nbits counts decoded data bits, not symbols: two symbols are consumed
+ * from syms per bit and one decision record is produced per bit.
+ * Returns 0 on success, -1 if p is NULL.
+ */
+int update_viterbi27_blk_port(void *p,unsigned char *syms,int nbits){
+  struct v27 *vp = p;
+  decision_t *d;
+  if(vp == NULL)
+    return -1;
+  d = vp->dp;
+  for(; nbits != 0; nbits--){
+    metric_t *swap;
+    unsigned char sym0,sym1;
+    int b;
+    /* BFLY only ORs decision bits in, so start from a clean record */
+    d->w[0] = 0;
+    d->w[1] = 0;
+    sym0 = syms[0];
+    sym1 = syms[1];
+    syms += 2;
+    /* One butterfly per pair of old states (b, b+32) */
+    for(b = 0; b < 32; b++)
+      BFLY(b);
+    d++;
+    /* Swap pointers to old and new metrics */
+    swap = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = swap;
+  }
+  vp->dp = d;
+  return 0;
diff --git a/viterbi27_sse.c b/viterbi27_sse.c
new file mode 100644
index 0000000..cd1f287
--- /dev/null
+++ b/viterbi27_sse.c
@@ -0,0 +1,113 @@
+/* K=7 r=1/2 Viterbi decoder for SSE
+ * Feb 2004, Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <xmmintrin.h>
+#include "fec.h"
+typedef union { unsigned char c[64]; } metric_t; /* Path metrics for the 64 states, one byte each */
+typedef union { unsigned long w[2]; unsigned char c[8]; __m64 v[1];} decision_t; /* Packed decisions for one bit time; chainback reads them one bit per state through c[] */
+union branchtab27 { unsigned char c[32]; __m64 v[4];} Branchtab27_sse[2]; /* Per-state expected symbol (0 or 255) for each polynomial; filled by set_viterbi27_polynomial_sse() */
+static int Init = 0; /* Incremented whenever Branchtab27_sse is built; 0 means not built yet */
+/* State info for instance of Viterbi decoder
+ * Don't change this without also changing references in ssebfly27.s!
+ */
+struct v27 {
+ metric_t metrics1; /* path metric buffer 1 */
+ metric_t metrics2; /* path metric buffer 2 */
+ decision_t *dp; /* Pointer to current decision */
+ metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+ decision_t *decisions; /* Beginning of decisions for block */
+/* Allocate and initialize an SSE decoder instance with room for len+6
+ * decision records. Builds the default branch table first if no
+ * polynomial has been set. Returns NULL if either allocation fails.
+ */
+void *create_viterbi27_sse(int len){
+  struct v27 *vp;
+  if(!Init){
+    int polys[2] = { V27POLYA, V27POLYB };
+    set_viterbi27_polynomial_sse(polys);
+  }
+  if((vp = malloc(sizeof(struct v27))) == NULL)
+    return NULL;
+  if((vp->decisions = malloc((len+6)*sizeof(decision_t))) == NULL){
+    free(vp);
+    return NULL;
+  }
+  /* Use this file's own initializer rather than the generic
+   * init_viterbi27(): the instance layout is specific to the SSE
+   * decoder, so it must not be set up by another implementation.
+   */
+  init_viterbi27_sse(vp,0);
+  return vp;
+/* Fill Branchtab27_sse for a pair of generator polynomials: each entry
+ * is 255 when the encoder output bit for that state is 1 and 0 otherwise.
+ * A negative polynomial means "use the absolute value and invert the bit".
+ * Increments Init so create_viterbi27_sse() knows the table exists.
+ */
+void set_viterbi27_polynomial_sse(int polys[2]){
+  int s;
+  for(s = 0; s < 32; s++){
+    int j;
+    for(j = 0; j < 2; j++){
+      if((polys[j] < 0) ^ parity((2*s) & abs(polys[j])))
+        Branchtab27_sse[j].c[s] = 255;
+      else
+        Branchtab27_sse[j].c[s] = 0;
+    }
+  }
+  Init++;
+/* Reset a decoder instance so it can start decoding a new frame.
+ * starting_state is the known initial encoder state (low 6 bits used).
+ * Returns 0 on success, -1 if p is NULL.
+ */
+int init_viterbi27_sse(void *p,int starting_state){
+  struct v27 *vp = p;
+  int state;
+  if(vp == NULL)
+    return -1;
+  /* Every state starts out with the same path metric */
+  for(state = 0; state < 64; state++)
+    vp->metrics1.c[state] = 63;
+  vp->dp = vp->decisions;
+  vp->new_metrics = &vp->metrics2;
+  vp->old_metrics = &vp->metrics1;
+  /* Bias the known start state by giving it metric 0 */
+  vp->metrics1.c[starting_state & 63] = 0;
+  return 0;
+/* Viterbi chainback: walk the stored decisions backwards from endstate
+ * and write the nbits decoded data bits into data[], 8 bits per byte.
+ * Returns 0 on success, -1 if p is NULL.
+ */
+int chainback_viterbi27_sse(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v27 *vp = p;
+  decision_t *tail;
+  if(vp == NULL)
+    return -1;
+  tail = vp->decisions + 6; /* Look past tail */
+  /* Hold the 6-bit state in the top of an 8-bit value, leaving room
+   * below it to accumulate a full byte of decoded data
+   */
+  endstate = (endstate % 64) << 2;
+  /* data[] is written on every bit instead of every 8th one: that avoids
+   * a conditional branch, and the writes combine in the cache anyway
+   */
+  while(nbits != 0){
+    unsigned int state;
+    int k;
+    nbits--;
+    state = endstate >> 2;
+    /* Decisions are packed one bit per state, 8 states per byte */
+    k = (tail[nbits].c[state/8] >> (state%8)) & 1;
+    endstate = (endstate >> 1) | (k << 7);
+    data[nbits>>3] = endstate;
+  }
+  return 0;
+/* Free a decoder instance and its decision buffer; a NULL p is ignored */
+void delete_viterbi27_sse(void *p){
+  struct v27 *vp = p;
+  if(vp == NULL)
+    return;
+  free(vp->decisions);
+  free(vp);
diff --git a/viterbi27_sse2.c b/viterbi27_sse2.c
new file mode 100644
index 0000000..bc01710
--- /dev/null
+++ b/viterbi27_sse2.c
@@ -0,0 +1,180 @@
+/* K=7 r=1/2 Viterbi decoder for SSE2
+ * Feb 2004, Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <xmmintrin.h>
+#include "fec.h"
+typedef union { unsigned char c[64]; __m128i v[4]; } metric_t; /* Path metrics for the 64 states, one byte each, also viewable as 4 SSE2 vectors */
+typedef union { unsigned long w[2]; unsigned char c[8]; unsigned short s[4]; __m64 v[1];} decision_t; /* Packed decisions for one bit time; chainback reads them one bit per state through c[] */
+union branchtab27 { unsigned char c[32]; __m128i v[2];} Branchtab27_sse2[2]; /* Per-state expected symbol (0 or 255) for each polynomial; filled by set_viterbi27_polynomial_sse2() */
+static int Init = 0; /* Incremented whenever Branchtab27_sse2 is built; 0 means not built yet */
+/* State info for instance of Viterbi decoder
+ * Don't change this without also changing references in sse2bfly27.s!
+ */
+struct v27 {
+ metric_t metrics1; /* path metric buffer 1 */
+ metric_t metrics2; /* path metric buffer 2 */
+ decision_t *dp; /* Pointer to current decision */
+ metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+ decision_t *decisions; /* Beginning of decisions for block */
+/* Reset a decoder instance so it can start decoding a new frame.
+ * starting_state is the known initial encoder state (low 6 bits used).
+ * Returns 0 on success, -1 if p is NULL.
+ */
+int init_viterbi27_sse2(void *p,int starting_state){
+  struct v27 *vp = p;
+  int state;
+  if(vp == NULL)
+    return -1;
+  /* Every state starts out with the same path metric */
+  for(state = 0; state < 64; state++)
+    vp->metrics1.c[state] = 63;
+  vp->dp = vp->decisions;
+  vp->new_metrics = &vp->metrics2;
+  vp->old_metrics = &vp->metrics1;
+  /* Bias the known start state by giving it metric 0 */
+  vp->metrics1.c[starting_state & 63] = 0;
+  return 0;
+/* Fill Branchtab27_sse2 for a pair of generator polynomials: each entry
+ * is 255 when the encoder output bit for that state is 1 and 0 otherwise.
+ * A negative polynomial means "use the absolute value and invert the bit".
+ * Increments Init so create_viterbi27_sse2() knows the table exists.
+ */
+void set_viterbi27_polynomial_sse2(int polys[2]){
+  int s;
+  for(s = 0; s < 32; s++){
+    int j;
+    for(j = 0; j < 2; j++){
+      if((polys[j] < 0) ^ parity((2*s) & abs(polys[j])))
+        Branchtab27_sse2[j].c[s] = 255;
+      else
+        Branchtab27_sse2[j].c[s] = 0;
+    }
+  }
+  Init++;
+/* Allocate and initialize an SSE2 decoder instance with room for len+6
+ * decision records. Builds the default branch table first if no
+ * polynomial has been set. Returns NULL if either allocation fails.
+ */
+void *create_viterbi27_sse2(int len){
+  void *raw;
+  struct v27 *vp;
+  decision_t *dec;
+  if(Init == 0){
+    int polys[2] = { V27POLYA, V27POLYB };
+    set_viterbi27_polynomial_sse2(polys);
+  }
+  /* The instance holds __m128i members, so request storage aligned to
+   * sizeof(__m128i) instead of relying on malloc()'s default alignment
+   */
+  if(posix_memalign(&raw, sizeof(__m128i),sizeof(struct v27)) != 0)
+    return NULL;
+  vp = (struct v27 *)raw;
+  dec = (decision_t *)malloc((len+6)*sizeof(decision_t));
+  if(dec == NULL){
+    free(vp);
+    return NULL;
+  }
+  vp->decisions = dec;
+  init_viterbi27_sse2(vp,0);
+  return vp;
+/* Viterbi chainback: walk the stored decisions backwards from endstate
+ * and write the nbits decoded data bits into data[], 8 bits per byte.
+ * Returns 0 on success, -1 if p is NULL.
+ */
+int chainback_viterbi27_sse2(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v27 *vp = p;
+  decision_t *tail;
+  if(vp == NULL)
+    return -1;
+  tail = vp->decisions + 6; /* Look past tail */
+  /* Hold the 6-bit state in the top of an 8-bit value, leaving room
+   * below it to accumulate a full byte of decoded data
+   */
+  endstate = (endstate % 64) << 2;
+  /* data[] is written on every bit instead of every 8th one: that avoids
+   * a conditional branch, and the writes combine in the cache anyway
+   */
+  while(nbits != 0){
+    unsigned int state;
+    int k;
+    nbits--;
+    state = endstate >> 2;
+    /* Decisions are packed one bit per state, 8 states per byte */
+    k = (tail[nbits].c[state/8] >> (state%8)) & 1;
+    endstate = (endstate >> 1) | (k << 7);
+    data[nbits>>3] = endstate;
+  }
+  return 0;
+/* Free a decoder instance and its decision buffer; a NULL p is ignored */
+void delete_viterbi27_sse2(void *p){
+  struct v27 *vp = p;
+  if(vp == NULL)
+    return;
+  free(vp->decisions);
+  free(vp);
+#if 0
+/* This code is turned off because it's slower than my hand-crafted assembler in sse2bfly27.s. But it does work.
+ * Intrinsics version of the block update: for each of nbits data bits it
+ * consumes two symbols from syms, updates all 64 path metrics and stores
+ * one packed decision record at vp->dp.
+ * NOTE(review): this variant returns void, whereas update_viterbi27_blk_port()
+ * returns int - reconcile the signature before enabling it.
+ */
+void update_viterbi27_blk_sse2(void *p,unsigned char *syms,int nbits){
+ struct v27 *vp = p;
+ decision_t *d;
+ if(p == NULL)
+ return;
+ d = (decision_t *)vp->dp;
+ while(nbits--){
+ __m128i sym0v,sym1v;
+ void *tmp;
+ int i;
+ /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+ sym0v = _mm_set1_epi8(syms[0]);
+ sym1v = _mm_set1_epi8(syms[1]);
+ syms += 2;
+ for(i=0;i<2;i++){ /* Each pass handles 16 butterflies, i.e. one vector of the branch table */
+ __m128i decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+ /* Form branch metrics */
+ metric = _mm_avg_epu8(_mm_xor_si128(Branchtab27_sse2[0].v[i],sym0v),_mm_xor_si128(Branchtab27_sse2[1].v[i],sym1v));
+ /* There's no packed bytes right shift in SSE2, so we use the word version and mask
+ * (I'm *really* starting to like Altivec...)
+ */
+ metric = _mm_srli_epi16(metric,3);
+ metric = _mm_and_si128(metric,_mm_set1_epi8(31));
+ m_metric = _mm_sub_epi8(_mm_set1_epi8(31),metric); /* Metric of the complementary branch */
+ /* Add branch metrics to path metrics */
+ m0 = _mm_add_epi8(vp->old_metrics->v[i],metric);
+ m3 = _mm_add_epi8(vp->old_metrics->v[2+i],metric);
+ m1 = _mm_add_epi8(vp->old_metrics->v[2+i],m_metric);
+ m2 = _mm_add_epi8(vp->old_metrics->v[i],m_metric);
+ /* Compare and select, using modulo arithmetic */
+ decision0 = _mm_cmpgt_epi8(_mm_sub_epi8(m0,m1),_mm_setzero_si128());
+ decision1 = _mm_cmpgt_epi8(_mm_sub_epi8(m2,m3),_mm_setzero_si128());
+ survivor0 = _mm_or_si128(_mm_and_si128(decision0,m1),_mm_andnot_si128(decision0,m0)); /* decision ? m1 : m0, per byte */
+ survivor1 = _mm_or_si128(_mm_and_si128(decision1,m3),_mm_andnot_si128(decision1,m2)); /* decision ? m3 : m2, per byte */
+ /* Pack each set of decisions into 16 bits */
+ d->s[2*i] = _mm_movemask_epi8(_mm_unpacklo_epi8(decision0,decision1));
+ d->s[2*i+1] = _mm_movemask_epi8(_mm_unpackhi_epi8(decision0,decision1));
+ /* Store surviving metrics */
+ vp->new_metrics->v[2*i] = _mm_unpacklo_epi8(survivor0,survivor1);
+ vp->new_metrics->v[2*i+1] = _mm_unpackhi_epi8(survivor0,survivor1);
+ }
+ d++;
+ /* Swap pointers to old and new metrics */
+ tmp = vp->old_metrics;
+ vp->old_metrics = vp->new_metrics;
+ vp->new_metrics = tmp;
+ }
+ vp->dp = d;
diff --git a/viterbi29.c b/viterbi29.c
new file mode 100644
index 0000000..80cbb33
--- /dev/null
+++ b/viterbi29.c
@@ -0,0 +1,152 @@
+/* Switch to K=9 r=1/2 Viterbi decoder with optional Intel or PowerPC SIMD
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include "fec.h"
+/* Create a new instance of a Viterbi decoder
+ * Calls find_cpu_mode() and then dispatches on Cpu_mode to the matching
+ * create_viterbi29_* implementation; PORT and any unlisted mode use the
+ * portable C version. len is passed through unchanged and the selected
+ * implementation's return value is returned as-is.
+ */
+void *create_viterbi29(int len){
+ find_cpu_mode();
+ switch(Cpu_mode){
+ case PORT:
+ default:
+ return create_viterbi29_port(len);
+#ifdef __VEC__
+ case ALTIVEC:
+ return create_viterbi29_av(len);
+#ifdef __i386__
+ case MMX:
+ return create_viterbi29_mmx(len);
+ case SSE:
+ return create_viterbi29_sse(len);
+ case SSE2:
+ return create_viterbi29_sse2(len);
+ }
+/* Set the generator polynomials used by the K=9 decoder.
+ * Dispatches on Cpu_mode to the matching set_viterbi29_polynomial_*
+ * implementation; PORT and any unlisted mode use the portable C version.
+ * find_cpu_mode() is called first, exactly as create_viterbi29() does, so
+ * that calling this function before any decoder has been created updates
+ * the tables of the implementation that will actually be used.
+ */
+void set_viterbi29_polynomial(int polys[2]){
+  find_cpu_mode();
+  switch(Cpu_mode){
+  case PORT:
+  default:
+    set_viterbi29_polynomial_port(polys);
+    break;
+#ifdef __VEC__
+  case ALTIVEC:
+    set_viterbi29_polynomial_av(polys);
+    break;
+#ifdef __i386__
+  case MMX:
+    set_viterbi29_polynomial_mmx(polys);
+    break;
+  case SSE:
+    set_viterbi29_polynomial_sse(polys);
+    break;
+  case SSE2:
+    set_viterbi29_polynomial_sse2(polys);
+    break;
+  }
+/* Initialize Viterbi decoder for start of new frame
+ * Dispatches on Cpu_mode to the matching init_viterbi29_* implementation
+ * (portable C for PORT and any unlisted mode) and returns its result.
+ * Cpu_mode is read as-is; find_cpu_mode() is not called here.
+ */
+int init_viterbi29(void *p,int starting_state){
+ switch(Cpu_mode){
+ case PORT:
+ default:
+ return init_viterbi29_port(p,starting_state);
+#ifdef __VEC__
+ case ALTIVEC:
+ return init_viterbi29_av(p,starting_state);
+#ifdef __i386__
+ case MMX:
+ return init_viterbi29_mmx(p,starting_state);
+ case SSE:
+ return init_viterbi29_sse(p,starting_state);
+ case SSE2:
+ return init_viterbi29_sse2(p,starting_state);
+ }
+/* Viterbi chainback
+ * Dispatches on Cpu_mode to the matching chainback_viterbi29_*
+ * implementation (portable C for PORT and any unlisted mode), passing all
+ * arguments through unchanged, and returns its result.
+ */
+int chainback_viterbi29(
+ void *p,
+ unsigned char *data, /* Decoded output data */
+ unsigned int nbits, /* Number of data bits */
+ unsigned int endstate){ /* Terminal encoder state */
+ switch(Cpu_mode){
+ case PORT:
+ default:
+ return chainback_viterbi29_port(p,data,nbits,endstate);
+#ifdef __VEC__
+ case ALTIVEC:
+ return chainback_viterbi29_av(p,data,nbits,endstate);
+#ifdef __i386__
+ case MMX:
+ return chainback_viterbi29_mmx(p,data,nbits,endstate);
+ case SSE:
+ return chainback_viterbi29_sse(p,data,nbits,endstate);
+ case SSE2:
+ return chainback_viterbi29_sse2(p,data,nbits,endstate);
+ }
+/* Delete instance of a Viterbi decoder
+ * Dispatches on Cpu_mode to the matching delete_viterbi29_*
+ * implementation (portable C for PORT and any unlisted mode).
+ */
+void delete_viterbi29(void *p){
+ switch(Cpu_mode){
+ case PORT:
+ default:
+ delete_viterbi29_port(p);
+ break;
+#ifdef __VEC__
+ case ALTIVEC:
+ delete_viterbi29_av(p);
+ break;
+#ifdef __i386__
+ case MMX:
+ delete_viterbi29_mmx(p);
+ break;
+ case SSE:
+ delete_viterbi29_sse(p);
+ break;
+ case SSE2:
+ delete_viterbi29_sse2(p);
+ break;
+ }
+/* Update decoder with a block of demodulated symbols
+ * Note that nbits is the number of decoded data bits, not the number
+ * of symbols!
+ * Dispatches on Cpu_mode to the matching update_viterbi29_blk_*
+ * implementation (portable C for PORT and any unlisted mode) and returns
+ * its result.
+ */
+int update_viterbi29_blk(void *p,unsigned char syms[],int nbits){
+ switch(Cpu_mode){
+ case PORT:
+ default:
+ return update_viterbi29_blk_port(p,syms,nbits);
+#ifdef __VEC__
+ case ALTIVEC:
+ return update_viterbi29_blk_av(p,syms,nbits);
+#ifdef __i386__
+ case MMX:
+ return update_viterbi29_blk_mmx(p,syms,nbits);
+ case SSE:
+ return update_viterbi29_blk_sse(p,syms,nbits);
+ case SSE2:
+ return update_viterbi29_blk_sse2(p,syms,nbits);
+ }
diff --git a/viterbi29_av.c b/viterbi29_av.c
new file mode 100644
index 0000000..31c8d27
--- /dev/null
+++ b/viterbi29_av.c
@@ -0,0 +1,190 @@
+/* K=9 r=1/2 Viterbi decoder for PowerPC G4/G5 Altivec
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <sys/sysctl.h>
+#include "fec.h"
+typedef union { unsigned char c[256]; vector bool char v[16]; } decision_t; /* Decisions for one bit time: one byte per state (chainback tests bit 0), also viewable as 16 vectors */
+typedef union { unsigned char c[256]; vector unsigned char v[16]; } metric_t; /* Path metrics for the 256 states, one byte each, also viewable as 16 vectors */
+static union branchtab29 { unsigned char c[128]; vector unsigned char v[8]; } Branchtab29[2]; /* Per-state expected symbol (0 or 255) for each polynomial; filled by set_viterbi29_polynomial_av() */
+static int Init = 0; /* Incremented whenever Branchtab29 is built; 0 means not built yet */
+/* State info for instance of Viterbi decoder */
+struct v29 {
+ metric_t metrics1; /* path metric buffer 1 */
+ metric_t metrics2; /* path metric buffer 2 */
+ decision_t *dp; /* Pointer to current decision */
+ metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+ decision_t *decisions; /* Beginning of decisions for block */
+/* Reset a decoder instance so it can start decoding a new frame.
+ * starting_state is the known initial encoder state (low 8 bits used).
+ * Returns 0 on success, -1 if p is NULL.
+ */
+int init_viterbi29_av(void *p,int starting_state){
+  struct v29 *vp = p;
+  int n;
+  if(vp == NULL)
+    return -1;
+  /* Every state starts out with the same path metric, 16 states per vector */
+  for(n = 0; n < 16; n++)
+    vp->metrics1.v[n] = (vector unsigned char)(63);
+  vp->dp = vp->decisions;
+  vp->new_metrics = &vp->metrics2;
+  vp->old_metrics = &vp->metrics1;
+  /* Bias the known start state by giving it metric 0 */
+  vp->metrics1.c[starting_state & 255] = 0;
+  return 0;
+/* Fill Branchtab29 for a pair of generator polynomials: each entry is 255
+ * when the encoder output bit for that state is 1 and 0 otherwise. A
+ * negative polynomial means "use the absolute value and invert the bit".
+ * Increments Init so create_viterbi29_av() knows the table exists.
+ */
+void set_viterbi29_polynomial_av(int polys[2]){
+  int s;
+  for(s = 0; s < 128; s++){
+    int j;
+    for(j = 0; j < 2; j++){
+      if((polys[j] < 0) ^ parity((2*s) & abs(polys[j])))
+        Branchtab29[j].c[s] = 255;
+      else
+        Branchtab29[j].c[s] = 0;
+    }
+  }
+  Init++;
+/* Allocate and initialize an AltiVec decoder instance with room for
+ * len+8 decision records. Builds the default branch table first if no
+ * polynomial has been set. Returns NULL if either allocation fails.
+ */
+void *create_viterbi29_av(int len){
+  struct v29 *vp;
+  if(Init == 0){
+    int polys[2] = { V29POLYA,V29POLYB };
+    set_viterbi29_polynomial_av(polys);
+  }
+  vp = (struct v29 *)malloc(sizeof(struct v29));
+  if(vp == NULL)
+    return NULL;
+  vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t));
+  if(vp->decisions == NULL){
+    free(vp);
+    return NULL;
+  }
+  init_viterbi29_av(vp,0);
+  return vp;
+/* Viterbi chainback: walk the stored decisions backwards from endstate
+ * and write the nbits decoded data bits into data[], 8 bits per byte.
+ * Returns 0 on success, -1 if p is NULL.
+ */
+int chainback_viterbi29_av(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v29 *vp = p;
+  decision_t *tail;
+  if(vp == NULL)
+    return -1;
+  tail = (decision_t *)vp->decisions + 8; /* Look past tail */
+  /* The state is 8 bits wide, so it already fills one byte */
+  endstate = endstate % 256;
+  /* data[] is written on every bit instead of every 8th one: that avoids
+   * a conditional branch, and the writes combine in the cache anyway
+   */
+  while(nbits != 0){
+    int k;
+    nbits--;
+    /* One decision byte per state; bit 0 is the decision */
+    k = tail[nbits].c[endstate] & 1;
+    endstate = (endstate >> 1) | (k << 7);
+    data[nbits>>3] = endstate;
+  }
+  return 0;
+/* Free a decoder instance and its decision buffer; a NULL p is ignored */
+void delete_viterbi29_av(void *p){
+  struct v29 *vp = p;
+  if(vp == NULL)
+    return;
+  free(vp->decisions);
+  free(vp);
+int update_viterbi29_blk_av(void *p,unsigned char *syms,int nbits){ /* AltiVec block update: for each of nbits data bits,
+ * consumes two symbols from syms, updates all 256 path metrics and stores
+ * one decision record at vp->dp. Returns 0, or -1 if p is NULL.
+ */
+ struct v29 *vp = p;
+ decision_t *d;
+ int i;
+ if(p == NULL)
+ return -1;
+ d = (decision_t *)vp->dp;
+ while(nbits--){
+ vector unsigned char sym1v,sym2v;
+ void *tmp;
+ /* All this seems necessary just to load a byte into all elements of a vector! */
+ sym1v = vec_perm(vec_ld(0,syms),vec_ld(1,syms),vec_lvsl(0,syms)); /* sym1v.0 = syms[0]; sym1v.1 = syms[1] */
+ sym2v = vec_splat(sym1v,1); /* Splat syms[1] across sym2v */
+ sym1v = vec_splat(sym1v,0); /* Splat syms[0] across sym1v */
+ syms += 2;
+ for(i=0;i<8;i++){ /* Each pass handles 16 butterflies, i.e. one vector of the branch table */
+ vector bool char decision0,decision1;
+ vector unsigned char metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+ /* Form branch metrics */
+ metric = vec_avg(vec_xor(Branchtab29[0].v[i],sym1v),vec_xor(Branchtab29[1].v[i],sym2v));
+ metric = vec_sr(metric,(vector unsigned char)(3));
+ m_metric = (vector unsigned char)(31) - metric; /* Metric of the complementary branch */
+ /* Add branch metrics to path metrics */
+ m0 = vec_adds(vp->old_metrics->v[i],metric);
+ m3 = vec_adds(vp->old_metrics->v[8+i],metric);
+ m1 = vec_adds(vp->old_metrics->v[8+i],m_metric);
+ m2 = vec_adds(vp->old_metrics->v[i],m_metric);
+ /* Compare and select first set */
+ decision0 = vec_cmpgt(m0,m1);
+ decision1 = vec_cmpgt(m2,m3);
+ survivor0 = vec_min(m0,m1); /* The smaller metric survives */
+ survivor1 = vec_min(m2,m3);
+ /* Interleave and store decisions and survivors */
+ d->v[2*i] = vec_mergeh(decision0,decision1);
+ d->v[2*i+1] = vec_mergel(decision0,decision1);
+ vp->new_metrics->v[2*i] = vec_mergeh(survivor0,survivor1);
+ vp->new_metrics->v[2*i+1] = vec_mergel(survivor0,survivor1);
+ }
+ d++;
+ /* renormalize if necessary */
+ if(vp->new_metrics->c[0] >= 50){ /* Only the metric of state 0 is tested against the threshold */
+ int i; /* Shadows the outer i within this block */
+ vector unsigned char scale0,scale1;
+ /* Find smallest metric and splat */
+ scale0 = vp->new_metrics->v[0];
+ scale1 = vp->new_metrics->v[1];
+ for(i=2;i<16;i+=2){
+ scale0 = vec_min(scale0,vp->new_metrics->v[i]);
+ scale1 = vec_min(scale1,vp->new_metrics->v[i+1]);
+ }
+ scale0 = vec_min(scale0,scale1);
+ scale0 = vec_min(scale0,vec_sld(scale0,scale0,8)); /* Rotate-and-min steps leave the overall minimum in every byte */
+ scale0 = vec_min(scale0,vec_sld(scale0,scale0,4));
+ scale0 = vec_min(scale0,vec_sld(scale0,scale0,2));
+ scale0 = vec_min(scale0,vec_sld(scale0,scale0,1));
+ /* Now subtract from all metrics */
+ for(i=0;i<16;i++)
+ vp->new_metrics->v[i] = vec_subs(vp->new_metrics->v[i],scale0);
+ }
+ /* Swap pointers to old and new metrics */
+ tmp = vp->old_metrics;
+ vp->old_metrics = vp->new_metrics;
+ vp->new_metrics = tmp;
+ }
+ vp->dp = d;
+ return 0;
diff --git a/viterbi29_mmx.c b/viterbi29_mmx.c
new file mode 100644
index 0000000..563f40a
--- /dev/null
+++ b/viterbi29_mmx.c
@@ -0,0 +1,118 @@
+/* K=9 r=1/2 Viterbi decoder for MMX
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <mmintrin.h>
+#include "fec.h"
+typedef union { char c[256]; __m64 v[32];} decision_t; /* Decisions for one bit time: one byte per state (chainback tests bit 0), also viewable as 32 MMX words */
+typedef union { unsigned char c[256]; __m64 v[32];} metric_t; /* Path metrics for the 256 states, one byte each */
+unsigned char Mettab29_1[256][128] __attribute__ ((aligned(8))); /* Metric table for polys[0], indexed [received symbol][state]; filled by set_viterbi29_polynomial_mmx() */
+unsigned char Mettab29_2[256][128] __attribute__ ((aligned(8))); /* Same as Mettab29_1, but for polys[1] */
+static int Init = 0; /* Incremented whenever the metric tables are built; 0 means not built yet */
+/* State info for instance of Viterbi decoder
+ * Don't change this without also changing references in mmxbfly29.s!
+ */
+struct v29 {
+ metric_t metrics1; /* path metric buffer 1 */
+ metric_t metrics2; /* path metric buffer 2 */
+ decision_t *dp; /* Pointer to current decision */
+ metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+ decision_t *decisions; /* Beginning of decisions for block */
+/* Allocate and initialize an MMX decoder instance with room for len+8
+ * decision records. Builds the default metric tables first if no
+ * polynomial has been set. Returns NULL if either allocation fails.
+ */
+void *create_viterbi29_mmx(int len){
+  struct v29 *vp;
+  if(Init == 0){
+    int polys[2] = {V29POLYA,V29POLYB};
+    set_viterbi29_polynomial_mmx(polys);
+  }
+  if((vp = (struct v29 *)malloc(sizeof(struct v29))) == NULL)
+    return NULL;
+  if((vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t))) == NULL){
+    free(vp);
+    return NULL;
+  }
+  /* Use this file's own initializer rather than the generic
+   * init_viterbi29(), which dispatches on Cpu_mode and could run another
+   * implementation's initializer on this MMX-specific instance layout.
+   */
+  init_viterbi29_mmx(vp,0);
+  return vp;
+/* Build both MMX metric tables for a pair of generator polynomials.
+ * For a negative polynomial the absolute value is used and the expected
+ * encoder bit is inverted. Increments Init so create_viterbi29_mmx()
+ * knows the tables exist.
+ */
+void set_viterbi29_polynomial_mmx(int polys[2]){
+  int s;
+  for(s = 0; s < 128; s++){
+    int rx;
+    for(rx = 0; rx < 256; rx++){
+      int expect;
+      int m;
+      /* Metric against the first polynomial's expected bit */
+      expect = parity((2*s) & abs(polys[0])) ^ (polys[0] < 0);
+      if(expect)
+        m = 255 - rx;
+      else
+        m = rx;
+      Mettab29_1[rx][s] = m / 16;
+      /* Metric against the second polynomial's expected bit */
+      expect = parity((2*s) & abs(polys[1])) ^ (polys[1] < 0);
+      if(expect)
+        m = 255 - rx;
+      else
+        m = rx;
+      Mettab29_2[rx][s] = m / 16;
+    }
+  }
+  Init++;
+/* Reset a decoder instance so it can start decoding a new frame.
+ * starting_state is the known initial encoder state (low 8 bits used).
+ * Returns 0 on success, -1 if p is NULL.
+ */
+int init_viterbi29_mmx(void *p,int starting_state){
+  struct v29 *vp = p;
+  int state;
+  if(vp == NULL)
+    return -1;
+  /* Every state starts out with the same path metric */
+  for(state = 0; state < 256; state++)
+    vp->metrics1.c[state] = 63;
+  vp->dp = vp->decisions;
+  vp->new_metrics = &vp->metrics2;
+  vp->old_metrics = &vp->metrics1;
+  /* Bias the known start state by giving it metric 0 */
+  vp->metrics1.c[starting_state & 255] = 0;
+  return 0;
+/* Viterbi chainback: walk the stored decisions backwards from endstate
+ * and write the nbits decoded data bits into data[], 8 bits per byte.
+ * Returns 0 on success, -1 if p is NULL.
+ */
+int chainback_viterbi29_mmx(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v29 *vp = (struct v29 *)p;
+  decision_t *tail;
+  if(vp == NULL)
+    return -1;
+  tail = (decision_t *)vp->decisions + 8; /* Look past tail */
+  /* The state is 8 bits wide, so it already fills one byte */
+  endstate = endstate & 255;
+  while(nbits != 0){
+    int k;
+    nbits--;
+    /* One decision byte per state; bit 0 is the decision */
+    k = tail[nbits].c[endstate] & 1;
+    endstate = (endstate >> 1) | (k << 7);
+    data[nbits>>3] = endstate;
+  }
+  return 0;
+/* Free a decoder instance and its decision buffer; a NULL p is ignored */
+void delete_viterbi29_mmx(void *p){
+  struct v29 *vp = p;
+  if(vp == NULL)
+    return;
+  free(vp->decisions);
+  free(vp);
diff --git a/viterbi29_port.c b/viterbi29_port.c
new file mode 100644
index 0000000..292dce8
--- /dev/null
+++ b/viterbi29_port.c
@@ -0,0 +1,166 @@
+/* K=9 r=1/2 Viterbi decoder in portable C
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include "fec.h"
+typedef union { unsigned int w[256]; } metric_t; /* Path metrics for the 256 states, one unsigned int each */
+typedef union { unsigned long w[8];} decision_t; /* Packed decisions for one bit time, one bit per state; the code here uses bit positions 0..31 of each word */
+static union { unsigned char c[128]; } Branchtab29[2]; /* Per-state expected symbol (0 or 255) for each polynomial; filled by set_viterbi29_polynomial_port() */
+static int Init = 0; /* Incremented whenever Branchtab29 is built; 0 means not built yet */
+/* State info for instance of Viterbi decoder */
+struct v29 {
+ metric_t metrics1; /* path metric buffer 1 */
+ metric_t metrics2; /* path metric buffer 2 */
+ decision_t *dp; /* Pointer to current decision */
+ metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+ decision_t *decisions; /* Beginning of decisions for block */
+/* Reset a decoder instance so it can start decoding a new frame.
+ * starting_state is the known initial encoder state (low 8 bits used).
+ * Returns 0 on success, -1 if p is NULL.
+ */
+int init_viterbi29_port(void *p,int starting_state){
+  struct v29 *vp = p;
+  int state;
+  if(vp == NULL)
+    return -1;
+  /* Every state starts out with the same path metric */
+  for(state = 0; state < 256; state++)
+    vp->metrics1.w[state] = 63;
+  vp->dp = vp->decisions;
+  vp->new_metrics = &vp->metrics2;
+  vp->old_metrics = &vp->metrics1;
+  /* Bias the known start state by giving it metric 0 */
+  vp->metrics1.w[starting_state & 255] = 0;
+  return 0;
+/* Fill Branchtab29 for a pair of generator polynomials: each entry is 255
+ * when the encoder output bit for that state is 1 and 0 otherwise. A
+ * negative polynomial means "use the absolute value and invert the bit".
+ * Increments Init so create_viterbi29_port() knows the table exists.
+ */
+void set_viterbi29_polynomial_port(int polys[2]){
+  int s;
+  for(s = 0; s < 128; s++){
+    int j;
+    for(j = 0; j < 2; j++){
+      if((polys[j] < 0) ^ parity((2*s) & abs(polys[j])))
+        Branchtab29[j].c[s] = 255;
+      else
+        Branchtab29[j].c[s] = 0;
+    }
+  }
+  Init++;
+/* Allocate and initialize a decoder instance with room for len+8
+ * decision records. Builds the default branch table first if no
+ * polynomial has been set. Returns NULL if either allocation fails.
+ */
+void *create_viterbi29_port(int len){
+  struct v29 *vp;
+  if(Init == 0){
+    int polys[2] = {V29POLYA,V29POLYB};
+    set_viterbi29_polynomial_port(polys);
+  }
+  vp = (struct v29 *)malloc(sizeof(struct v29));
+  if(vp == NULL)
+    return NULL;
+  vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t));
+  if(vp->decisions == NULL){
+    free(vp);
+    return NULL;
+  }
+  init_viterbi29_port(vp,0);
+  return vp;
+/* Viterbi chainback: walk the stored decisions backwards from endstate
+ * and write the nbits decoded data bits into data[], 8 bits per byte.
+ * Returns 0 on success, -1 if p is NULL.
+ */
+int chainback_viterbi29_port(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v29 *vp = p;
+  decision_t *tail;
+  if(vp == NULL)
+    return -1;
+  tail = vp->decisions + 8; /* Look past tail */
+  /* The state is 8 bits wide, so it already fills one byte */
+  endstate = endstate % 256;
+  /* data[] is written on every bit instead of every 8th one: that avoids
+   * a conditional branch, and the writes combine in the cache anyway
+   */
+  while(nbits != 0){
+    int k;
+    nbits--;
+    /* Decisions are packed one bit per state, 32 states per word */
+    k = (tail[nbits].w[endstate/32] >> (endstate%32)) & 1;
+    endstate = (endstate >> 1) | (k << 7);
+    data[nbits>>3] = endstate;
+  }
+  return 0;
+/* Free a decoder instance and its decision buffer; a NULL p is ignored */
+void delete_viterbi29_port(void *p){
+  struct v29 *vp = p;
+  if(vp == NULL)
+    return;
+  free(vp->decisions);
+  free(vp);
+/* C-language butterfly
+ * Expects vp, d, sym0 and sym1 to be in scope at the point of use.
+ * Reads the old path metrics of states i and i+128 and writes the new
+ * metrics of states 2*i and 2*i+1. The branch metric is the sum of two
+ * byte XORs (0..510); the competing branch uses 510 - metric. For each
+ * new state the smaller candidate survives, and the decision bit (1 when
+ * the i+128 candidate wins) is ORed into bit (2*i)&31 or (2*i+1)&31 of
+ * d->w[i/16]. The comparison is made on the signed difference m0-m1.
+ */
+#define BFLY(i) {\
+unsigned int metric,m0,m1,decision;\
+ metric = (Branchtab29[0].c[i] ^ sym0) + (Branchtab29[1].c[i] ^ sym1);\
+ m0 = vp->old_metrics->w[i] + metric;\
+ m1 = vp->old_metrics->w[i+128] + (510 - metric);\
+ decision = (signed int)(m0-m1) > 0;\
+ vp->new_metrics->w[2*i] = decision ? m1 : m0;\
+ d->w[i/16] |= decision << ((2*i)&31);\
+ m0 -= (metric+metric-510);\
+ m1 += (metric+metric-510);\
+ decision = (signed int)(m0-m1) > 0;\
+ vp->new_metrics->w[2*i+1] = decision ? m1 : m0;\
+ d->w[i/16] |= decision << ((2*i+1)&31);\
+/* Run the decoder over a block of demodulated symbols.
+ * nbits counts decoded data bits, not symbols: two symbols are consumed
+ * from syms per bit and one decision record is produced per bit.
+ * Returns 0 on success, -1 if p is NULL.
+ */
+int update_viterbi29_blk_port(void *p,unsigned char *syms,int nbits){
+  struct v29 *vp = p;
+  decision_t *d;
+  if(vp == NULL)
+    return -1;
+  d = vp->dp;
+  for(; nbits != 0; nbits--){
+    metric_t *swap;
+    unsigned char sym0,sym1;
+    int b;
+    /* BFLY only ORs decision bits in, so start from a clean record */
+    for(b = 0; b < 8; b++)
+      d->w[b] = 0;
+    sym0 = syms[0];
+    sym1 = syms[1];
+    syms += 2;
+    /* One butterfly per pair of old states (b, b+128) */
+    for(b = 0; b < 128; b++)
+      BFLY(b);
+    d++;
+    /* Swap pointers to old and new metrics */
+    swap = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = swap;
+  }
+  vp->dp = d;
+  return 0;
diff --git a/viterbi29_sse.c b/viterbi29_sse.c
new file mode 100644
index 0000000..4a92e5f
--- /dev/null
+++ b/viterbi29_sse.c
@@ -0,0 +1,114 @@
+/* K=9 r=1/2 Viterbi decoder for SSE
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <xmmintrin.h>
+#include "fec.h"
+typedef union { unsigned char w[256]; __m64 v[32];} metric_t;
+typedef union { unsigned long w[8]; unsigned char c[32]; __m64 v[4];} decision_t;
+union branchtab29 { unsigned char c[128]; } Branchtab29_sse[2];
+static int Init = 0;
+/* State info for instance of Viterbi decoder
+ * Don't change this without also changing references in [mmx|sse|sse2]bfly29.s!
+ */
+struct v29 {
+ metric_t metrics1; /* path metric buffer 1 */
+ metric_t metrics2; /* path metric buffer 2 */
+ decision_t *dp; /* Pointer to current decision */
+ metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+ decision_t *decisions; /* Beginning of decisions for block */
+/* Create a new instance of a Viterbi decoder
+ * On first use the branch tables are loaded with the default polynomials
+ * V29POLYA/V29POLYB. Space is allocated for len+8 decision records.
+ * Returns the new instance, or NULL if either allocation fails.
+ */
+void *create_viterbi29_sse(int len){
+  struct v29 *vp;
+
+  if(!Init){
+    int polys[2] = { V29POLYA,V29POLYB };
+    set_viterbi29_polynomial_sse(polys);
+  }
+  if((vp = (struct v29 *)malloc(sizeof(struct v29))) == NULL)
+    return NULL;
+  if((vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t))) == NULL){
+    free(vp);
+    return NULL;
+  }
+  /* Use this file's own initializer. struct v29 as laid out here is
+   * private to the SSE decoder, so it must not be handed to
+   * init_viterbi29(), which is not defined in this file (presumably the
+   * CPU-mode dispatcher) and may select another implementation.
+   */
+  init_viterbi29_sse(vp,0);
+  return vp;
+}
+/* Load the branch tables from the two generator polynomials.
+ * A negative polynomial means its output is inverted. Each of the 128
+ * table entries is 255 when the (possibly inverted) parity of the tapped
+ * state bits is 1, otherwise 0. Also marks the tables as initialized.
+ */
+void set_viterbi29_polynomial_sse(int polys[2]){
+  int s;
+
+  for(s=0;s < 128;s++){
+    int shifted = 2*s;
+    int bit0 = (polys[0] < 0) ^ parity(shifted & abs(polys[0]));
+    int bit1 = (polys[1] < 0) ^ parity(shifted & abs(polys[1]));
+
+    Branchtab29_sse[0].c[s] = bit0 ? 255 : 0;
+    Branchtab29_sse[1].c[s] = bit1 ? 255 : 0;
+  }
+  Init++;
+}
+/* Prepare a decoder instance for a new frame.
+ * Every path metric starts at 200 except the given starting state,
+ * which starts at 0 so that it is favored. The decision pointer is
+ * rewound to the start of the decision buffer.
+ * Returns 0, or -1 if p is NULL.
+ */
+int init_viterbi29_sse(void *p,int starting_state){
+  struct v29 *inst = p;
+  int state;
+
+  if(inst == NULL)
+    return -1;
+  inst->old_metrics = &inst->metrics1;
+  inst->new_metrics = &inst->metrics2;
+  inst->dp = inst->decisions;
+  for(state=0;state<256;state++)
+    inst->old_metrics->w[state] = 200;
+  inst->old_metrics->w[starting_state & 255] = 0; /* Bias known start state */
+  return 0;
+}
+/* Viterbi chainback
+ * Walks the stored decisions backwards from endstate. On each step the
+ * decision bit for the current state is shifted into the top of the
+ * 8-bit state and the state byte is written to data[nbits>>3].
+ * Returns 0, or -1 if p is NULL.
+ */
+int chainback_viterbi29_sse(
+ void *p,
+ unsigned char *data, /* Decoded output data */
+ unsigned int nbits, /* Number of data bits */
+ unsigned int endstate){ /* Terminal encoder state */
+ struct v29 *vp = p;
+ decision_t *d;
+ if(p == NULL)
+ return -1;
+ d = vp->decisions;
+ /* Make room beyond the end of the encoder register so we can
+ * accumulate a full byte of decoded data
+ */
+ endstate %= 256;
+ /* The store into data[] only needs to be done every 8 bits.
+ * But this avoids a conditional branch, and the writes will
+ * combine in the cache anyway
+ */
+ d += 8; /* Look past tail */
+ while(nbits-- != 0){
+ int k;
+ /* Decisions are packed one bit per state, 8 states per byte */
+ k = (d[nbits].c[endstate/8] >> (endstate%8)) & 1;
+ data[nbits>>3] = endstate = (endstate >> 1) | (k << 7);
+ }
+ return 0;
+/* Release a decoder instance: the decision buffer is freed first, then
+ * the instance itself. A NULL pointer is ignored.
+ */
+void delete_viterbi29_sse(void *p){
+  struct v29 *inst = p;
+
+  if(inst == NULL)
+    return;
+  free(inst->decisions);
+  free(inst);
+}
diff --git a/viterbi29_sse2.c b/viterbi29_sse2.c
new file mode 100644
index 0000000..4c7336c
--- /dev/null
+++ b/viterbi29_sse2.c
@@ -0,0 +1,119 @@
+/* K=9 r=1/2 Viterbi decoder for SSE2
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <emmintrin.h>
+#include "fec.h"
+typedef union { unsigned char c[256]; __m128i v[16];} metric_t;
+typedef union { unsigned long w[8]; unsigned char c[32];} decision_t;
+union branchtab29 { unsigned char c[128]; } Branchtab29_sse2[2];
+static int Init = 0;
+/* State info for instance of Viterbi decoder
+ * Don't change this without also changing references in sse2bfly29.s!
+ */
+struct v29 {
+ metric_t metrics1; /* path metric buffer 1 */
+ metric_t metrics2; /* path metric buffer 2 */
+ decision_t *dp; /* Pointer to current decision */
+ metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+ decision_t *decisions; /* Beginning of decisions for block */
+/* Initialize Viterbi decoder for start of new frame
+ * Every path metric starts at 63 except starting_state, which starts at
+ * 0. The decision pointer is rewound to the start of the decision buffer.
+ * Returns 0, or -1 if p is NULL.
+ */
+int init_viterbi29_sse2(void *p,int starting_state){
+  struct v29 *vp = p;
+  int i;
+
+  /* Reject a NULL instance instead of dereferencing it, as the other
+   * init routines do
+   */
+  if(p == NULL)
+    return -1;
+  for(i=0;i<256;i++)
+    vp->metrics1.c[i] = 63;
+  vp->old_metrics = &vp->metrics1;
+  vp->new_metrics = &vp->metrics2;
+  vp->dp = vp->decisions;
+  vp->old_metrics->c[starting_state & 255] = 0; /* Bias known start state */
+  return 0;
+}
+/* Load the branch tables from the two generator polynomials.
+ * A negative polynomial means its output is inverted. Each of the 128
+ * table entries is 255 when the (possibly inverted) parity of the tapped
+ * state bits is 1, otherwise 0. Also marks the tables as initialized.
+ */
+void set_viterbi29_polynomial_sse2(int polys[2]){
+  int s;
+
+  for(s=0;s < 128;s++){
+    int shifted = 2*s;
+    int bit0 = (polys[0] < 0) ^ parity(shifted & abs(polys[0]));
+    int bit1 = (polys[1] < 0) ^ parity(shifted & abs(polys[1]));
+
+    Branchtab29_sse2[0].c[s] = bit0 ? 255 : 0;
+    Branchtab29_sse2[1].c[s] = bit1 ? 255 : 0;
+  }
+  Init++;
+}
+/* Create a new instance of a Viterbi decoder
+ * On first use the branch tables are loaded with the default polynomials
+ * V29POLYA/V29POLYB. The instance is allocated with the alignment of
+ * __m128i, and space is allocated for len+8 decision records.
+ * Returns the new instance, or NULL if either allocation fails.
+ */
+void *create_viterbi29_sse2(int len){
+  void *p;
+  struct v29 *vp;
+
+  if(!Init){
+    int polys[2] = {V29POLYA,V29POLYB};
+    /* Fill this file's own Branchtab29_sse2 and set its Init flag.
+     * set_viterbi29_polynomial() is not defined in this file and is not
+     * guaranteed to reach the SSE2 tables.
+     */
+    set_viterbi29_polynomial_sse2(polys);
+  }
+  /* Ordinary malloc() only returns 8-byte alignment, we need 16 */
+  if(posix_memalign(&p, sizeof(__m128i),sizeof(struct v29)))
+    return NULL;
+  vp = (struct v29 *)p;
+  if((p = malloc((len+8)*sizeof(decision_t))) == NULL){
+    free(vp);
+    return NULL;
+  }
+  vp->decisions = (decision_t *)p;
+  init_viterbi29_sse2(vp,0);
+  return vp;
+}
+/* Viterbi chainback
+ * Walks the stored decisions backwards from endstate. On each step the
+ * decision bit for the current state is shifted into the top of the
+ * 8-bit state and the state byte is written to data[nbits>>3].
+ * Returns 0, or -1 if p is NULL.
+ */
+int chainback_viterbi29_sse2(
+ void *p,
+ unsigned char *data, /* Decoded output data */
+ unsigned int nbits, /* Number of data bits */
+ unsigned int endstate){ /* Terminal encoder state */
+ struct v29 *vp = p;
+ decision_t *d;
+ if(p == NULL)
+ return -1;
+ d = vp->decisions;
+ /* Make room beyond the end of the encoder register so we can
+ * accumulate a full byte of decoded data
+ */
+ endstate %= 256;
+ /* The store into data[] only needs to be done every 8 bits.
+ * But this avoids a conditional branch, and the writes will
+ * combine in the cache anyway
+ */
+ d += 8; /* Look past tail */
+ while(nbits-- != 0){
+ int k;
+ /* Decisions are packed one bit per state, 8 states per byte */
+ k = (d[nbits].c[endstate/8] >> (endstate%8)) & 1;
+ data[nbits>>3] = endstate = (endstate >> 1) | (k << 7);
+ }
+ return 0;
+/* Release a decoder instance: the decision buffer is freed first, then
+ * the instance itself. A NULL pointer is ignored.
+ */
+void delete_viterbi29_sse2(void *p){
+  struct v29 *inst = p;
+
+  if(inst == NULL)
+    return;
+  free(inst->decisions);
+  free(inst);
+}
diff --git a/viterbi39.c b/viterbi39.c
new file mode 100644
index 0000000..ac28c2c
--- /dev/null
+++ b/viterbi39.c
@@ -0,0 +1,153 @@
+/* Switch to K=9 r=1/3 Viterbi decoder with optional Intel or PowerPC SIMD
+ * Copyright Aug 2006, Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include "fec.h"
+/* Create a new instance of a Viterbi decoder
+ * Calls find_cpu_mode() and then passes len to the create routine that
+ * matches Cpu_mode. PORT and any unrecognized mode use the portable C
+ * version. The SIMD cases are guarded by __VEC__ / __i386__.
+ * Returns whatever the selected create routine returns.
+ */
+void *create_viterbi39(int len){
+ find_cpu_mode();
+ switch(Cpu_mode){
+ case PORT:
+ default:
+ return create_viterbi39_port(len);
+#ifdef __VEC__
+ case ALTIVEC:
+ return create_viterbi39_av(len);
+#ifdef __i386__
+ case MMX:
+ return create_viterbi39_mmx(len);
+ case SSE:
+ return create_viterbi39_sse(len);
+ case SSE2:
+ return create_viterbi39_sse2(len);
+ }
+/* Set the generator polynomials used by the K=9 r=1/3 decoder
+ * polys[] holds the three polynomials and is passed unchanged to the
+ * implementation selected by Cpu_mode. PORT and any unrecognized mode
+ * use the portable C version.
+ */
+void set_viterbi39_polynomial(int polys[3]){
+  /* Resolve Cpu_mode first, exactly as create_viterbi39() does, so that
+   * a call made before any decoder has been created still configures the
+   * implementation that create_viterbi39() will later select.
+   */
+  find_cpu_mode();
+  switch(Cpu_mode){
+  case PORT:
+  default:
+    set_viterbi39_polynomial_port(polys);
+    break;
+#ifdef __VEC__
+  case ALTIVEC:
+    set_viterbi39_polynomial_av(polys);
+    break;
+#endif
+#ifdef __i386__
+  case MMX:
+    set_viterbi39_polynomial_mmx(polys);
+    break;
+  case SSE:
+    set_viterbi39_polynomial_sse(polys);
+    break;
+  case SSE2:
+    set_viterbi39_polynomial_sse2(polys);
+    break;
+#endif
+  }
+}
+/* Initialize Viterbi decoder for start of new frame
+ * Forwards p and starting_state to the init routine that matches
+ * Cpu_mode (portable C for PORT or any unrecognized mode) and returns
+ * its result. Cpu_mode is read as-is; find_cpu_mode() is not called here.
+ */
+int init_viterbi39(void *p,int starting_state){
+ switch(Cpu_mode){
+ case PORT:
+ default:
+ return init_viterbi39_port(p,starting_state);
+#ifdef __VEC__
+ case ALTIVEC:
+ return init_viterbi39_av(p,starting_state);
+#ifdef __i386__
+ case MMX:
+ return init_viterbi39_mmx(p,starting_state);
+ case SSE:
+ return init_viterbi39_sse(p,starting_state);
+ case SSE2:
+ return init_viterbi39_sse2(p,starting_state);
+ }
+/* Viterbi chainback
+ * Forwards all arguments to the chainback routine that matches Cpu_mode
+ * (portable C for PORT or any unrecognized mode) and returns its result.
+ * Cpu_mode is read as-is; find_cpu_mode() is not called here.
+ */
+int chainback_viterbi39(
+ void *p,
+ unsigned char *data, /* Decoded output data */
+ unsigned int nbits, /* Number of data bits */
+ unsigned int endstate){ /* Terminal encoder state */
+ switch(Cpu_mode){
+ case PORT:
+ default:
+ return chainback_viterbi39_port(p,data,nbits,endstate);
+#ifdef __VEC__
+ case ALTIVEC:
+ return chainback_viterbi39_av(p,data,nbits,endstate);
+#ifdef __i386__
+ case MMX:
+ return chainback_viterbi39_mmx(p,data,nbits,endstate);
+ case SSE:
+ return chainback_viterbi39_sse(p,data,nbits,endstate);
+ case SSE2:
+ return chainback_viterbi39_sse2(p,data,nbits,endstate);
+ }
+/* Delete instance of a Viterbi decoder
+ * Forwards p to the delete routine that matches Cpu_mode (portable C for
+ * PORT or any unrecognized mode).
+ * Cpu_mode is read as-is; find_cpu_mode() is not called here.
+ */
+void delete_viterbi39(void *p){
+ switch(Cpu_mode){
+ case PORT:
+ default:
+ delete_viterbi39_port(p);
+ break;
+#ifdef __VEC__
+ case ALTIVEC:
+ delete_viterbi39_av(p);
+ break;
+#ifdef __i386__
+ case MMX:
+ delete_viterbi39_mmx(p);
+ break;
+ case SSE:
+ delete_viterbi39_sse(p);
+ break;
+ case SSE2:
+ delete_viterbi39_sse2(p);
+ break;
+ }
+/* Update decoder with a block of demodulated symbols
+ * Note that nbits is the number of decoded data bits, not the number
+ * of symbols!
+ * Forwards all arguments to the update routine that matches Cpu_mode
+ * (portable C for PORT or any unrecognized mode) and returns its result.
+ */
+int update_viterbi39_blk(void *p,unsigned char syms[],int nbits){
+ switch(Cpu_mode){
+ case PORT:
+ default:
+ return update_viterbi39_blk_port(p,syms,nbits);
+#ifdef __VEC__
+ case ALTIVEC:
+ return update_viterbi39_blk_av(p,syms,nbits);
+#ifdef __i386__
+ case MMX:
+ return update_viterbi39_blk_mmx(p,syms,nbits);
+ case SSE:
+ return update_viterbi39_blk_sse(p,syms,nbits);
+ case SSE2:
+ return update_viterbi39_blk_sse2(p,syms,nbits);
+ }
diff --git a/viterbi39_av.c b/viterbi39_av.c
new file mode 100644
index 0000000..2deed51
--- /dev/null
+++ b/viterbi39_av.c
@@ -0,0 +1,251 @@
+/* K=9 r=1/3 Viterbi decoder for PowerPC G4/G5 Altivec vector instructions
+ * 8-bit offset-binary soft decision samples
+ * Copyright Aug 2006, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <limits.h>
+#include "fec.h"
+typedef union { unsigned char c[2][16]; vector unsigned char v[2]; } decision_t;
+typedef union { unsigned short s[256]; vector unsigned short v[32]; } metric_t;
+static union branchtab39 { unsigned short s[128]; vector unsigned short v[16];} Branchtab39[3];
+static int Init = 0;
+/* State info for instance of Viterbi decoder */
+struct v39 {
+ metric_t metrics1; /* path metric buffer 1 */
+ metric_t metrics2; /* path metric buffer 2 */
+ void *dp; /* Pointer to current decision */
+ metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+ void *decisions; /* Beginning of decisions for block */
+/* Initialize Viterbi decoder for start of new frame
+ * Every path metric starts at 1000 except starting_state, which starts
+ * at 0. The decision pointer is rewound to the start of the decision
+ * buffer.
+ * Returns 0, or -1 if p is NULL.
+ */
+int init_viterbi39_av(void *p,int starting_state){
+  struct v39 *vp = p;
+  int i;
+
+  /* Reject a NULL instance instead of dereferencing it, as the MMX, SSE
+   * and portable init routines do
+   */
+  if(p == NULL)
+    return -1;
+  for(i=0;i<32;i++)
+    vp->metrics1.v[i] = (vector unsigned short)(1000);
+  vp->old_metrics = &vp->metrics1;
+  vp->new_metrics = &vp->metrics2;
+  vp->dp = vp->decisions;
+  vp->old_metrics->s[starting_state & 255] = 0; /* Bias known start state */
+  return 0;
+}
+/* Load the branch tables from the three generator polynomials.
+ * A negative polynomial means its output is inverted. Each of the 128
+ * table entries is 255 when the (possibly inverted) parity of the tapped
+ * state bits is 1, otherwise 0. Also marks the tables as initialized.
+ */
+void set_viterbi39_polynomial_av(int polys[3]){
+  int s;
+
+  for(s=0;s < 128;s++){
+    int shifted = 2*s;
+    int bit0 = (polys[0] < 0) ^ parity(shifted & abs(polys[0]));
+    int bit1 = (polys[1] < 0) ^ parity(shifted & abs(polys[1]));
+    int bit2 = (polys[2] < 0) ^ parity(shifted & abs(polys[2]));
+
+    Branchtab39[0].s[s] = bit0 ? 255 : 0;
+    Branchtab39[1].s[s] = bit1 ? 255 : 0;
+    Branchtab39[2].s[s] = bit2 ? 255 : 0;
+  }
+  Init++;
+}
+/* Create a new instance of a Viterbi decoder
+ * On first use the branch tables are loaded with the default polynomials
+ * V39POLYA/V39POLYB/V39POLYC. Space is allocated for len+8 decision
+ * records.
+ * Returns the new instance, or NULL if either allocation fails.
+ */
+void *create_viterbi39_av(int len){
+  struct v39 *vp;
+
+  if(!Init){
+    int polys[3] = { V39POLYA, V39POLYB, V39POLYC };
+    set_viterbi39_polynomial_av(polys);
+  }
+  /* Check both allocations before use, as the other create routines do */
+  if((vp = (struct v39 *)malloc(sizeof(struct v39))) == NULL)
+    return NULL;
+  if((vp->decisions = malloc(sizeof(decision_t)*(len+8))) == NULL){
+    free(vp);
+    return NULL;
+  }
+  init_viterbi39_av(vp,0);
+  return vp;
+}
+/* Viterbi chainback
+ * Walks the stored decisions backwards from endstate. On each step the
+ * decision bit for the current state is shifted into the top of the
+ * 8-bit state and the state byte is written to data[nbits>>3].
+ * Returns the path metric of endstate, read before the traceback, or -1
+ * if p is NULL.
+ */
+int chainback_viterbi39_av(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v39 *vp = p;
+  decision_t *d;
+  int path_metric;
+
+  /* Reject a NULL instance instead of dereferencing it, as the MMX and
+   * SSE chainback routines do
+   */
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->decisions;
+  /* Make room beyond the end of the encoder register so we can
+   * accumulate a full byte of decoded data
+   */
+  endstate %= 256;
+  path_metric = vp->old_metrics->s[endstate];
+  /* The store into data[] only needs to be done every 8 bits.
+   * But this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  d += 8; /* Look past tail */
+  while(nbits-- != 0){
+    int k;
+
+    /* Undo the interleaved packing used by the update routine: bit 7 of
+     * the state selects the vector, the low 4 bits select the byte, and
+     * bits 4-6 select the bit within it, counted from the MSB
+     */
+    k = (d[nbits].c[endstate >> 7][endstate & 15] & (0x80 >> ((endstate>>4)&7)) ) ? 1 : 0;
+    endstate = (k << 7) | (endstate >> 1);
+    data[nbits>>3] = endstate;
+  }
+  return path_metric;
+}
+/* Release a decoder instance: the decision buffer is freed first, then
+ * the instance itself. A NULL pointer is ignored.
+ */
+void delete_viterbi39_av(void *p){
+  struct v39 *inst = p;
+
+  if(inst == NULL)
+    return;
+  free(inst->decisions);
+  free(inst);
+}
+/* Update decoder with a block of demodulated symbols (AltiVec version)
+ * nbits is the number of data bits to process; three symbols are read
+ * from syms[] per bit and one decision record is written per bit,
+ * starting at vp->dp.
+ * Returns the sum of the scale values subtracted from the metrics during
+ * renormalization in this call.
+ * NOTE(review): unlike the MMX and SSE versions, p is not checked for
+ * NULL here.
+ */
+int update_viterbi39_blk_av(void *p,unsigned char *syms,int nbits){
+ struct v39 *vp = p;
+ decision_t *d = (decision_t *)vp->dp;
+ int path_metric = 0;
+ vector unsigned char decisions = (vector unsigned char)(0);
+ while(nbits--){
+ vector unsigned short symv,sym0v,sym1v,sym2v;
+ vector unsigned char s;
+ void *tmp;
+ int i;
+ /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+ s = (vector unsigned char)vec_perm(vec_ld(0,syms),vec_ld(5,syms),vec_lvsl(0,syms));
+ symv = (vector unsigned short)vec_mergeh((vector unsigned char)(0),s); /* Unsigned byte->word unpack */
+ sym0v = vec_splat(symv,0);
+ sym1v = vec_splat(symv,1);
+ sym2v = vec_splat(symv,2);
+ syms += 3;
+ for(i=0;i<16;i++){
+ vector bool short decision0,decision1;
+ vector unsigned short metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+ /* Form branch metrics
+ * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
+ * the XOR operations constitute conditional negation.
+ * the metrics are in the range 0-765
+ */
+ m0 = vec_add(vec_xor(Branchtab39[0].v[i],sym0v),vec_xor(Branchtab39[1].v[i],sym1v));
+ m1 = vec_xor(Branchtab39[2].v[i],sym2v);
+ metric = vec_add(m0,m1);
+ m_metric = vec_sub((vector unsigned short)(765),metric);
+ /* Add branch metrics to path metrics */
+ m0 = vec_adds(vp->old_metrics->v[i],metric);
+ m3 = vec_adds(vp->old_metrics->v[16+i],metric);
+ m1 = vec_adds(vp->old_metrics->v[16+i],m_metric);
+ m2 = vec_adds(vp->old_metrics->v[i],m_metric);
+ /* Compare and select */
+ decision0 = vec_cmpgt(m0,m1);
+ decision1 = vec_cmpgt(m2,m3);
+ survivor0 = vec_min(m0,m1);
+ survivor1 = vec_min(m2,m3);
+ /* Store decisions and survivors.
+ * To save space without SSE2's handy PMOVMSKB instruction, we pack and store them in
+ * a funny interleaved fashion that we undo in the chainback function.
+ */
+ decisions = vec_add(decisions,decisions); /* Shift each byte 1 bit to the left */
+ /* Booleans are either 0xff or 0x00. Subtracting 0x00 leaves the lsb zero; subtracting
+ * 0xff is equivalent to adding 1, which sets the lsb.
+ */
+ decisions = vec_sub(decisions,(vector unsigned char)vec_pack(vec_mergeh(decision0,decision1),vec_mergel(decision0,decision1)));
+ vp->new_metrics->v[2*i] = vec_mergeh(survivor0,survivor1);
+ vp->new_metrics->v[2*i+1] = vec_mergel(survivor0,survivor1);
+ if((i % 8) == 7){
+ /* We've accumulated a total of 128 decisions, stash and start again */
+ d->v[i>>3] = decisions; /* No need to clear, the new bits will replace the old */
+ }
+ }
+#if 0
+ /* Experimentally determine metric spread
+ * The results are fixed for a given code and input symbol size
+ */
+ {
+ int i;
+ vector unsigned short min_metric;
+ vector unsigned short max_metric;
+ union { vector unsigned short v; unsigned short s[8];} t;
+ int minimum,maximum;
+ static int max_spread = 0;
+ min_metric = max_metric = vp->new_metrics->v[0];
+ for(i=1;i<32;i++){
+ min_metric = vec_min(min_metric,vp->new_metrics->v[i]);
+ max_metric = vec_max(max_metric,vp->new_metrics->v[i]);
+ }
+ min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,8));
+ max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,8));
+ min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,4));
+ max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,4));
+ min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,2));
+ max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,2));
+ t.v = min_metric;
+ minimum = t.s[0];
+ t.v = max_metric;
+ maximum = t.s[0];
+ if(maximum-minimum > max_spread){
+ max_spread = maximum-minimum;
+ printf("metric spread = %d\n",max_spread);
+ }
+ }
+ /* Renormalize if necessary. This deserves some explanation.
+ * The maximum possible spread, found by experiment, for 8 bit symbols is about 3825
+ * So by looking at one arbitrary metric we can tell if any of them have possibly saturated.
+ * However, this is very conservative. Large spreads occur only at very high Eb/No, where
+ * saturating a bad path metric doesn't do much to increase its chances of being erroneously chosen as a survivor.
+ * At more interesting (low) Eb/No ratios, the spreads are much smaller so our chances of saturating a metric
+ * by not normalizing when we should are extremely low. So either way, the risk to performance is small.
+ * All this is borne out by experiment.
+ */
+ if(vp->new_metrics->s[0] >= USHRT_MAX-5000){
+ vector unsigned short scale;
+ union { vector unsigned short v; unsigned short s[8];} t;
+ /* Find smallest metric and splat */
+ scale = vp->new_metrics->v[0];
+ for(i=1;i<32;i++)
+ scale = vec_min(scale,vp->new_metrics->v[i]);
+ scale = vec_min(scale,vec_sld(scale,scale,8));
+ scale = vec_min(scale,vec_sld(scale,scale,4));
+ scale = vec_min(scale,vec_sld(scale,scale,2));
+ /* Subtract it from all metrics
+ * Work backwards to try to improve the cache hit ratio, assuming LRU
+ */
+ for(i=31;i>=0;i--)
+ vp->new_metrics->v[i] = vec_subs(vp->new_metrics->v[i],scale);
+ t.v = scale;
+ path_metric += t.s[0];
+ }
+ d++;
+ /* Swap pointers to old and new metrics */
+ tmp = vp->old_metrics;
+ vp->old_metrics = vp->new_metrics;
+ vp->new_metrics = tmp;
+ }
+ /* Remember where the next call continues writing decisions */
+ vp->dp = d;
+ return path_metric;
diff --git a/viterbi39_mmx.c b/viterbi39_mmx.c
new file mode 100644
index 0000000..875391a
--- /dev/null
+++ b/viterbi39_mmx.c
@@ -0,0 +1,185 @@
+/* K=9 r=1/3 Viterbi decoder for x86 MMX
+ * Aug 2006, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <mmintrin.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include "fec.h"
+typedef union { unsigned char c[256]; __m64 v[32];} decision_t;
+typedef union { unsigned short s[256]; __m64 v[64];} metric_t;
+static union branchtab39 { unsigned short s[128]; __m64 v[32];} Branchtab39[3];
+static int Init = 0;
+/* State info for instance of Viterbi decoder */
+struct v39 {
+ metric_t metrics1; /* path metric buffer 1 */
+ metric_t metrics2; /* path metric buffer 2 */
+ void *dp; /* Pointer to current decision */
+ metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+ void *decisions; /* Beginning of decisions for block */
+/* Prepare a decoder instance for a new frame.
+ * Every path metric starts at 1000 except the given starting state,
+ * which starts at 0 so that it is favored. The decision pointer is
+ * rewound to the start of the decision buffer.
+ * Returns 0, or -1 if p is NULL.
+ */
+int init_viterbi39_mmx(void *p,int starting_state){
+  struct v39 *inst = p;
+  int state;
+
+  if(inst == NULL)
+    return -1;
+  inst->old_metrics = &inst->metrics1;
+  inst->new_metrics = &inst->metrics2;
+  inst->dp = inst->decisions;
+  for(state=0;state<256;state++)
+    inst->old_metrics->s[state] = 1000;
+  inst->old_metrics->s[starting_state & 255] = 0; /* Bias known start state */
+  return 0;
+}
+/* Load the branch tables from the three generator polynomials.
+ * A negative polynomial means its output is inverted. Each of the 128
+ * table entries is 255 when the (possibly inverted) parity of the tapped
+ * state bits is 1, otherwise 0. Also marks the tables as initialized.
+ */
+void set_viterbi39_polynomial_mmx(int polys[3]){
+  int state;
+
+  for(state=0;state < 128;state++){
+    /* Mask with the magnitude of the polynomial. The sign only requests
+     * inversion; masking with the raw negative value would select the
+     * wrong taps.
+     */
+    Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255:0;
+    Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255:0;
+    Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2*state) & abs(polys[2])) ? 255:0;
+  }
+  Init++;
+}
+/* Allocate and initialize a decoder instance.
+ * On first use the branch tables are loaded with the default polynomials
+ * V39POLYA/V39POLYB/V39POLYC. Space is allocated for len+8 decision
+ * records.
+ * Returns the new instance, or NULL if either allocation fails.
+ */
+void *create_viterbi39_mmx(int len){
+  struct v39 *inst;
+
+  if(!Init){
+    int defaults[3] = { V39POLYA,V39POLYB,V39POLYC };
+    set_viterbi39_polynomial_mmx(defaults);
+  }
+  inst = (struct v39 *)malloc(sizeof(struct v39));
+  if(inst == NULL)
+    return NULL;
+  inst->decisions = malloc((len+8)*sizeof(decision_t));
+  if(inst->decisions == NULL){
+    free(inst);
+    return NULL;
+  }
+  init_viterbi39_mmx(inst,0);
+  return inst;
+}
+/* Viterbi chainback
+ * Walks the stored decisions backwards from endstate. On each step the
+ * decision bit for the current state is shifted into the top of the
+ * 8-bit state and the state byte is written to data[nbits>>3].
+ * Returns the path metric of endstate, read before the traceback, or -1
+ * if p is NULL.
+ */
+int chainback_viterbi39_mmx(
+ void *p,
+ unsigned char *data, /* Decoded output data */
+ unsigned int nbits, /* Number of data bits */
+ unsigned int endstate){ /* Terminal encoder state */
+ struct v39 *vp = p;
+ decision_t *d;
+ int path_metric;
+ if(p == NULL)
+ return -1;
+ d = (decision_t *)vp->decisions;
+ endstate %= 256;
+ path_metric = vp->old_metrics->s[endstate];
+ /* The store into data[] only needs to be done every 8 bits.
+ * But this avoids a conditional branch, and the writes will
+ * combine in the cache anyway
+ */
+ d += 8; /* Look past tail */
+ while(nbits-- != 0){
+ int k;
+ /* This version stores one decision per byte, so index by state */
+ k = d[nbits].c[endstate] & 1;
+ endstate = (k << 7) | (endstate >> 1);
+ data[nbits>>3] = endstate;
+ }
+ return path_metric;
+/* Release a decoder instance: the decision buffer is freed first, then
+ * the instance itself. A NULL pointer is ignored.
+ */
+void delete_viterbi39_mmx(void *p){
+  struct v39 *inst = p;
+
+  if(inst == NULL)
+    return;
+  free(inst->decisions);
+  free(inst);
+}
+/* Update decoder with a block of demodulated symbols (MMX version)
+ * nbits is the number of data bits to process; three symbols are read
+ * from syms[] per bit and one decision record is written per bit,
+ * starting at vp->dp.
+ * Returns -1 if p is NULL. Otherwise returns 65536 times the number of
+ * bits on which metric 0 decreased (taken as a wraparound).
+ */
+int update_viterbi39_blk_mmx(void *p,unsigned char *syms,int nbits){
+ struct v39 *vp = p;
+ decision_t *d;
+ int path_metric = 0;
+ if(p == NULL)
+ return -1;
+ d = (decision_t *)vp->dp;
+ while(nbits--){
+ __m64 sym0v,sym1v,sym2v;
+ void *tmp;
+ int i;
+ /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+ sym0v = _mm_set1_pi16(syms[0]);
+ sym1v = _mm_set1_pi16(syms[1]);
+ sym2v = _mm_set1_pi16(syms[2]);
+ syms += 3;
+ for(i=0;i<32;i++){
+ __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+ /* Form branch metrics
+ * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
+ * the XOR operations constitute conditional negation.
+ * metric and m_metric (765 - metric) are in the range 0-765
+ */
+ m0 = _mm_add_pi16(_mm_xor_si64(Branchtab39[0].v[i],sym0v),_mm_xor_si64(Branchtab39[1].v[i],sym1v));
+ metric = _mm_add_pi16(_mm_xor_si64(Branchtab39[2].v[i],sym2v),m0);
+ m_metric = _mm_sub_pi16(_mm_set1_pi16(765),metric);
+ /* Add branch metrics to path metrics */
+ m0 = _mm_add_pi16(vp->old_metrics->v[i],metric);
+ m3 = _mm_add_pi16(vp->old_metrics->v[32+i],metric);
+ m1 = _mm_add_pi16(vp->old_metrics->v[32+i],m_metric);
+ m2 = _mm_add_pi16(vp->old_metrics->v[i],m_metric);
+ /* Compare and select
+ * There's no packed min instruction in MMX, so we use modulo arithmetic
+ * to form the decisions and then do the select the hard way
+ */
+ decision0 = _mm_cmpgt_pi16(_mm_sub_pi16(m0,m1),_mm_setzero_si64());
+ decision1 = _mm_cmpgt_pi16(_mm_sub_pi16(m2,m3),_mm_setzero_si64());
+ survivor0 = _mm_or_si64(_mm_and_si64(decision0,m1),_mm_andnot_si64(decision0,m0));
+ survivor1 = _mm_or_si64(_mm_and_si64(decision1,m3),_mm_andnot_si64(decision1,m2));
+ /* Merge decisions and store as bytes */
+ d->v[i] = _mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64()));
+ /* Store surviving metrics */
+ vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1);
+ vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1);
+ }
+ if(vp->new_metrics->s[0] < vp->old_metrics->s[0])
+ path_metric += 65536; /* Hack: wraparound probably occurred */
+ d++;
+ /* Swap pointers to old and new metrics */
+ tmp = vp->old_metrics;
+ vp->old_metrics = vp->new_metrics;
+ vp->new_metrics = tmp;
+ }
+ /* Remember where the next call continues writing decisions */
+ vp->dp = d;
+ /* Leave MMX state before returning to code that may use the FPU */
+ _mm_empty();
+ return path_metric;
diff --git a/viterbi39_port.c b/viterbi39_port.c
new file mode 100644
index 0000000..5685c90
--- /dev/null
+++ b/viterbi39_port.c
@@ -0,0 +1,168 @@
+/* K=9 r=1/3 Viterbi decoder in portable C
+ * Copyright Aug 2006, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include "fec.h"
+typedef union { unsigned int w[256]; } metric_t;
+typedef union { unsigned long w[8];} decision_t;
+static union { unsigned char c[128]; } Branchtab39[3];
+static int Init = 0;
+/* State info for instance of Viterbi decoder */
+struct v39 {
+ metric_t metrics1; /* path metric buffer 1 */
+ metric_t metrics2; /* path metric buffer 2 */
+ decision_t *dp; /* Pointer to current decision */
+ metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+ decision_t *decisions; /* Beginning of decisions for block */
+/* Prepare a decoder instance for a new frame.
+ * Every path metric starts at 63 except the given starting state, which
+ * starts at 0 so that it is favored. The decision pointer is rewound to
+ * the start of the decision buffer.
+ * Returns 0, or -1 if p is NULL.
+ */
+int init_viterbi39_port(void *p,int starting_state){
+  struct v39 *inst = p;
+  int state;
+
+  if(inst == NULL)
+    return -1;
+  inst->old_metrics = &inst->metrics1;
+  inst->new_metrics = &inst->metrics2;
+  inst->dp = inst->decisions;
+  for(state=0;state<256;state++)
+    inst->old_metrics->w[state] = 63;
+  inst->old_metrics->w[starting_state & 255] = 0; /* Bias known start state */
+  return 0;
+}
+/* Load the branch tables from the three generator polynomials.
+ * A negative polynomial means its output is inverted. Each of the 128
+ * table entries is 255 when the (possibly inverted) parity of the tapped
+ * state bits is 1, otherwise 0. Also marks the tables as initialized.
+ */
+void set_viterbi39_polynomial_port(int polys[3]){
+  int s;
+
+  for(s=0;s < 128;s++){
+    int shifted = 2*s;
+    int bit0 = (polys[0] < 0) ^ parity(shifted & abs(polys[0]));
+    int bit1 = (polys[1] < 0) ^ parity(shifted & abs(polys[1]));
+    int bit2 = (polys[2] < 0) ^ parity(shifted & abs(polys[2]));
+
+    Branchtab39[0].c[s] = bit0 ? 255 : 0;
+    Branchtab39[1].c[s] = bit1 ? 255 : 0;
+    Branchtab39[2].c[s] = bit2 ? 255 : 0;
+  }
+  Init++;
+}
+/* Allocate and initialize a decoder instance.
+ * On first use the branch tables are loaded with the default polynomials
+ * V39POLYA/V39POLYB/V39POLYC. Space is allocated for len+8 decision
+ * records.
+ * Returns the new instance, or NULL if either allocation fails.
+ */
+void *create_viterbi39_port(int len){
+  struct v39 *inst;
+
+  if(!Init){
+    int defaults[3] = {V39POLYA,V39POLYB,V39POLYC};
+    set_viterbi39_polynomial_port(defaults);
+  }
+  inst = (struct v39 *)malloc(sizeof(struct v39));
+  if(inst == NULL)
+    return NULL;
+  inst->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t));
+  if(inst->decisions == NULL){
+    free(inst);
+    return NULL;
+  }
+  init_viterbi39_port(inst,0);
+  return inst;
+}
+/* Viterbi chainback
+ * Walks the stored decisions backwards from endstate. On each step the
+ * decision bit for the current state is shifted into the top of the
+ * 8-bit state and the state byte is written to data[nbits>>3].
+ * Returns 0, or -1 if p is NULL.
+ */
+int chainback_viterbi39_port(
+ void *p,
+ unsigned char *data, /* Decoded output data */
+ unsigned int nbits, /* Number of data bits */
+ unsigned int endstate){ /* Terminal encoder state */
+ struct v39 *vp = p;
+ decision_t *d;
+ if(p == NULL)
+ return -1;
+ d = vp->decisions;
+ /* Make room beyond the end of the encoder register so we can
+ * accumulate a full byte of decoded data
+ */
+ endstate %= 256;
+ /* The store into data[] only needs to be done every 8 bits.
+ * But this avoids a conditional branch, and the writes will
+ * combine in the cache anyway
+ */
+ d += 8; /* Look past tail */
+ while(nbits-- != 0){
+ int k;
+ /* Decisions are packed one bit per state, 32 states per word */
+ k = (d[nbits].w[(endstate)/32] >> (endstate%32)) & 1;
+ data[nbits>>3] = endstate = (endstate >> 1) | (k << 7);
+ }
+ return 0;
+/* Release a decoder instance: the decision buffer is freed first, then
+ * the instance itself. A NULL pointer is ignored.
+ */
+void delete_viterbi39_port(void *p){
+  struct v39 *inst = p;
+
+  if(inst == NULL)
+    return;
+  free(inst->decisions);
+  free(inst);
+}
+/* C-language butterfly
+ * For old states i and i+128 this computes the two successor states 2*i
+ * and 2*i+1. The branch metric is formed by XORing the three received
+ * symbols with the branch table entries; it (or its complement,
+ * 765 - metric) is added to the old path metrics. The smaller candidate
+ * is stored in new_metrics and the decision bit is ORed into *d.
+ * Candidates are compared through the signed difference of the unsigned
+ * metrics. Expects vp, d, sym0, sym1 and sym2 to be in scope where it is
+ * used.
+ */
+#define BFLY(i) {\
+unsigned int metric,m0,m1,decision;\
+ metric = (Branchtab39[0].c[i] ^ sym0) + (Branchtab39[1].c[i] ^ sym1) + \
+ (Branchtab39[2].c[i] ^ sym2);\
+ m0 = vp->old_metrics->w[i] + metric;\
+ m1 = vp->old_metrics->w[i+128] + (765 - metric);\
+ decision = (signed int)(m0-m1) > 0;\
+ vp->new_metrics->w[2*i] = decision ? m1 : m0;\
+ d->w[i/16] |= decision << ((2*i)&31);\
+ m0 -= (metric+metric-765);\
+ m1 += (metric+metric-765);\
+ decision = (signed int)(m0-m1) > 0;\
+ vp->new_metrics->w[2*i+1] = decision ? m1 : m0;\
+ d->w[i/16] |= decision << ((2*i+1)&31);\
+/* Update decoder with a block of demodulated symbols
+ * Note that nbits is the number of decoded data bits, not the number
+ * of symbols!
+ * Three symbols are read from syms[] for every data bit, and one
+ * decision record is written per bit starting at vp->dp.
+ * Returns 0, or -1 if p is NULL.
+ */
+int update_viterbi39_blk_port(void *p,unsigned char *syms,int nbits){
+ struct v39 *vp = p;
+ decision_t *d;
+ if(p == NULL)
+ return -1;
+ d = (decision_t *)vp->dp;
+ while(nbits--){
+ void *tmp;
+ unsigned char sym0,sym1,sym2;
+ int i;
+ /* Clear this bit's decision record first: BFLY only ORs bits into it */
+ for(i=0;i<8;i++)
+ d->w[i] = 0;
+ sym0 = *syms++;
+ sym1 = *syms++;
+ sym2 = *syms++;
+ /* One butterfly per pair of old states (i, i+128) */
+ for(i=0;i<128;i++)
+ BFLY(i);
+ d++;
+ /* Swap pointers to old and new metrics */
+ tmp = vp->old_metrics;
+ vp->old_metrics = vp->new_metrics;
+ vp->new_metrics = tmp;
+ }
+ /* Remember where the next call continues writing decisions */
+ vp->dp = d;
+ return 0;
diff --git a/viterbi39_sse.c b/viterbi39_sse.c
new file mode 100644
index 0000000..c2f2865
--- /dev/null
+++ b/viterbi39_sse.c
@@ -0,0 +1,201 @@
+/* K=9 r=1/3 Viterbi decoder for x86 SSE
+ * Copyright Aug 2006, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <xmmintrin.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <limits.h>
+#include "fec.h"
+typedef union { unsigned long w[8]; unsigned char c[32];} decision_t;
+typedef union { signed short s[256]; __m64 v[64];} metric_t;
+static union branchtab39 { unsigned short s[128]; __m64 v[32];} Branchtab39[3];
+static int Init = 0;
+/* State info for instance of Viterbi decoder */
+struct v39 {
+ metric_t metrics1; /* path metric buffer 1 */
+ metric_t metrics2; /* path metric buffer 2 */
+ void *dp; /* Pointer to current decision */
+ metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+ void *decisions; /* Beginning of decisions for block */
+/* Prepare a decoder instance for a new frame.
+ * Metrics are signed here: every path metric starts at SHRT_MIN+1000
+ * except the given starting state, which starts at SHRT_MIN so that it
+ * is favored. The decision pointer is rewound to the start of the
+ * decision buffer.
+ * Returns 0, or -1 if p is NULL.
+ */
+int init_viterbi39_sse(void *p,int starting_state){
+  struct v39 *inst = p;
+  int state;
+
+  if(inst == NULL)
+    return -1;
+  inst->old_metrics = &inst->metrics1;
+  inst->new_metrics = &inst->metrics2;
+  inst->dp = inst->decisions;
+  for(state=0;state<256;state++)
+    inst->old_metrics->s[state] = (SHRT_MIN+1000);
+  inst->old_metrics->s[starting_state & 255] = SHRT_MIN; /* Bias known start state */
+  return 0;
+}
+/* Allocate and initialize a decoder instance.
+ * On first use the branch tables are loaded with the default polynomials
+ * V39POLYA/V39POLYB/V39POLYC. Space is allocated for len+8 decision
+ * records.
+ * Returns the new instance, or NULL if either allocation fails.
+ */
+void *create_viterbi39_sse(int len){
+  struct v39 *inst;
+
+  if(!Init){
+    int defaults[3] = { V39POLYA, V39POLYB, V39POLYC };
+    set_viterbi39_polynomial_sse(defaults);
+  }
+  inst = (struct v39 *)malloc(sizeof(struct v39));
+  if(inst == NULL)
+    return NULL;
+  inst->decisions = malloc((len+8)*sizeof(decision_t));
+  if(inst->decisions == NULL){
+    free(inst);
+    return NULL;
+  }
+  init_viterbi39_sse(inst,0);
+  return inst;
+}
+/* Load the branch tables from the three generator polynomials.
+ * A negative polynomial means its output is inverted. Each of the 128
+ * table entries is 255 when the (possibly inverted) parity of the tapped
+ * state bits is 1, otherwise 0. Also marks the tables as initialized.
+ */
+void set_viterbi39_polynomial_sse(int polys[3]){
+  int state;
+
+  for(state=0;state < 128;state++){
+    /* Mask with the magnitude of the polynomial. The sign only requests
+     * inversion; masking with the raw negative value would select the
+     * wrong taps.
+     */
+    Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255:0;
+    Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255:0;
+    Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2*state) & abs(polys[2])) ? 255:0;
+  }
+  Init++;
+}
+/* Viterbi chainback
+ * Walks the stored decisions backwards from endstate. On each step the
+ * decision bit for the current state is shifted into the top of the
+ * 8-bit state and the state byte is written to data[nbits>>3].
+ * Returns the path metric of endstate, read before the traceback and
+ * rebased by subtracting SHRT_MIN, or -1 if p is NULL.
+ */
+int chainback_viterbi39_sse(
+ void *p,
+ unsigned char *data, /* Decoded output data */
+ unsigned int nbits, /* Number of data bits */
+ unsigned int endstate){ /* Terminal encoder state */
+ struct v39 *vp = p;
+ decision_t *d;
+ int path_metric;
+ if(p == NULL)
+ return -1;
+ d = (decision_t *)vp->decisions;
+ endstate %= 256;
+ path_metric = vp->old_metrics->s[endstate];
+ /* The store into data[] only needs to be done every 8 bits.
+ * But this avoids a conditional branch, and the writes will
+ * combine in the cache anyway
+ */
+ d += 8; /* Look past tail */
+ while(nbits-- != 0){
+ int k;
+ /* k = (d[nbits].w[endstate/32] >> (endstate%32)) & 1;*/
+ /* Decisions are packed one bit per state, 8 states per byte */
+ k = (d[nbits].c[endstate/8] >> (endstate%8)) & 1;
+ endstate = (k << 7) | (endstate >> 1);
+ data[nbits>>3] = endstate;
+ }
+ return path_metric - SHRT_MIN;
+/* Release a decoder instance: the decision buffer is freed first, then
+ * the instance itself. A NULL pointer is ignored.
+ */
+void delete_viterbi39_sse(void *p){
+  struct v39 *inst = p;
+
+  if(inst == NULL)
+    return;
+  free(inst->decisions);
+  free(inst);
+}
+/* Update decoder with a block of demodulated symbols (SSE version)
+ * nbits is the number of data bits to process; three symbols are read
+ * from syms[] per bit and one decision record is written per bit,
+ * starting at vp->dp.
+ * Returns -1 if p is NULL. Otherwise returns the sum of the adjustments
+ * subtracted from the metrics during renormalization in this call.
+ */
+int update_viterbi39_blk_sse(void *p,unsigned char *syms,int nbits){
+ struct v39 *vp = p;
+ decision_t *d;
+ int path_metric = 0;
+ if(p == NULL)
+ return -1;
+ d = (decision_t *)vp->dp;
+ while(nbits--){
+ __m64 sym0v,sym1v,sym2v;
+ void *tmp;
+ int i;
+ /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+ sym0v = _mm_set1_pi16(syms[0]);
+ sym1v = _mm_set1_pi16(syms[1]);
+ sym2v = _mm_set1_pi16(syms[2]);
+ syms += 3;
+ for(i=0;i<32;i++){
+ __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+ /* Form branch metrics
+ * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
+ * the XOR operations constitute conditional negation.
+ * metric and m_metric (-metric) are in the range 0-765
+ */
+ m0 = _mm_add_pi16(_mm_xor_si64(Branchtab39[0].v[i],sym0v),_mm_xor_si64(Branchtab39[1].v[i],sym1v));
+ metric = _mm_add_pi16(_mm_xor_si64(Branchtab39[2].v[i],sym2v),m0);
+ m_metric = _mm_sub_pi16(_mm_set1_pi16(765),metric);
+ /* Add branch metrics to path metrics */
+ m0 = _mm_adds_pi16(vp->old_metrics->v[i],metric);
+ m3 = _mm_adds_pi16(vp->old_metrics->v[32+i],metric);
+ m1 = _mm_adds_pi16(vp->old_metrics->v[32+i],m_metric);
+ m2 = _mm_adds_pi16(vp->old_metrics->v[i],m_metric);
+ /* Compare and select */
+ survivor0 = _mm_min_pi16(m0,m1);
+ survivor1 = _mm_min_pi16(m2,m3);
+ /* A decision is set when the survivor equals the second candidate */
+ decision0 = _mm_cmpeq_pi16(survivor0,m1);
+ decision1 = _mm_cmpeq_pi16(survivor1,m3);
+ /* Pack decisions into 8 bits and store */
+ d->c[i] = _mm_movemask_pi8(_mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64())));
+ /* Store surviving metrics */
+ vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1);
+ vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1);
+ }
+ /* See if we need to renormalize
+ * Max metric spread for this code with 0-255 branch metrics is 12750
+ */
+ if(vp->new_metrics->s[0] >= SHRT_MAX-5000){
+ int i,adjust;
+ __m64 adjustv;
+ union { __m64 v; signed short w[4]; } t;
+ /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */
+ adjustv = vp->new_metrics->v[0];
+ for(i=1;i<64;i++)
+ adjustv = _mm_min_pi16(adjustv,vp->new_metrics->v[i]);
+ /* Fold the four lanes so that lane 0 holds the overall minimum */
+ adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,32));
+ adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,16));
+ t.v = adjustv;
+ adjust = t.w[0] - SHRT_MIN;
+ path_metric += adjust;
+ adjustv = _mm_set1_pi16(adjust);
+ for(i=0;i<64;i++)
+ vp->new_metrics->v[i] = _mm_sub_pi16(vp->new_metrics->v[i],adjustv);
+ }
+ d++;
+ /* Swap pointers to old and new metrics */
+ tmp = vp->old_metrics;
+ vp->old_metrics = vp->new_metrics;
+ vp->new_metrics = tmp;
+ }
+ /* Remember where the next call continues writing decisions */
+ vp->dp = d;
+ /* Leave MMX state before returning to code that may use the FPU */
+ _mm_empty();
+ return path_metric;
diff --git a/viterbi39_sse2.c b/viterbi39_sse2.c
new file mode 100644
index 0000000..f13794e
--- /dev/null
+++ b/viterbi39_sse2.c
@@ -0,0 +1,200 @@
+/* K=9 r=1/3 Viterbi decoder for x86 SSE2
+ * Copyright Mar 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <emmintrin.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <limits.h>
+#include "fec.h"
+/* Survivor decisions for one decoded bit: one bit for each of the 256 states.
+ * The chainback routine indexes w[] as endstate/32, i.e. as 32-bit words, so w[]
+ * must not be "unsigned long" (64 bits on LP64 targets). s[] is the view the
+ * update routine stores through.
+ */
+typedef union { unsigned int w[8]; unsigned short s[16];} decision_t;
+/* Path metrics for the 256 states, as shorts or as 32 vectors of 8 shorts */
+typedef union { signed short s[256]; __m128i v[32];} metric_t;
+/* One table per symbol; each holds a 0/255 entry for every one of the 128 butterflies */
+static union branchtab39 { unsigned short s[128]; __m128i v[16];} Branchtab39[3];
+static int Init = 0; /* Nonzero once Branchtab39 has been filled in */
+/* State info for instance of Viterbi decoder */
+struct v39 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  void *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  void *decisions;   /* Beginning of decisions for block */
+};
+/* Initialize Viterbi decoder for start of new frame
+ * p is a decoder instance; starting_state is the known initial encoder state
+ * (only its low 8 bits are used).
+ * Returns 0 on success, -1 if p is NULL (same convention as the K=15 decoders).
+ */
+int init_viterbi39_sse2(void *p,int starting_state){
+  struct v39 *vp = p;
+  int i;
+  if(p == NULL)
+    return -1;
+  for(i=0;i<256;i++)
+    vp->metrics1.s[i] = (SHRT_MIN+1000);
+  vp->old_metrics = &vp->metrics1;
+  vp->new_metrics = &vp->metrics2;
+  vp->dp = vp->decisions; /* Rewind to the first decision slot */
+  vp->old_metrics->s[starting_state & 255] = SHRT_MIN; /* Bias known start state */
+  return 0;
+}
+/* Create a new instance of a Viterbi decoder
+ * len is the frame length in data bits; room for len+8 decision sets is allocated.
+ * The branch tables are built from the default polynomials on first use.
+ * Returns NULL if either allocation fails.
+ */
+void *create_viterbi39_sse2(int len){
+  void *p;
+  struct v39 *vp;
+  if(!Init){
+    int polys[3] = { V39POLYA, V39POLYB, V39POLYC };
+    set_viterbi39_polynomial_sse2(polys);
+  }
+  /* Ordinary malloc() only returns 8-byte alignment, we need 16 */
+  if(posix_memalign(&p, sizeof(__m128i),sizeof(struct v39)))
+    return NULL;
+  vp = p;
+  if((vp->decisions = malloc((len+8)*sizeof(decision_t))) == NULL){
+    free(vp);
+    return NULL;
+  }
+  init_viterbi39_sse2(vp,0);
+  return vp;
+}
+/* Build the branch tables from three generator polynomials.
+ * A negative polynomial inverts the table entries; the tap mask is its magnitude,
+ * so parity is taken over abs(polys[i]), as the K=15 decoders do.
+ */
+void set_viterbi39_polynomial_sse2(int polys[3]){
+  int state;
+  int i;
+  for(state=0;state < 128;state++){
+    for(i=0;i<3;i++)
+      Branchtab39[i].s[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 255 : 0;
+  }
+  Init++;
+}
+/* Viterbi chainback
+ * Walks the stored decisions backwards from endstate for nbits bits and writes
+ * the resulting encoder state bytes into data[].
+ * Returns the path metric held for endstate, or -1 if p is NULL.
+ */
+int chainback_viterbi39_sse2(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v39 *vp = p;
+  decision_t *d;
+  int path_metric;
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->decisions;
+  endstate %= 256;
+  path_metric = vp->old_metrics->s[endstate];
+  /* The store into data[] only needs to be done every 8 bits.
+   * But this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  d += 8; /* Look past tail */
+  while(nbits-- != 0){
+    int k;
+    /* Read the 16-bit words exactly as the update routine stored them, so the
+     * lookup does not depend on the width of the w[] member
+     */
+    k = (d[nbits].s[endstate/16] >> (endstate%16)) & 1;
+    endstate = (k << 7) | (endstate >> 1);
+    data[nbits>>3] = endstate;
+  }
+  return path_metric;
+}
+/* Delete instance of a Viterbi decoder
+ * Frees the decision buffer and the decoder itself; a NULL p is ignored.
+ */
+void delete_viterbi39_sse2(void *p){
+  struct v39 *vp = p;
+  if(vp != NULL){
+    free(vp->decisions);
+    free(vp);
+  }
+}
+/* Update decoder with a block of demodulated symbols
+ * syms supplies 3 symbols per data bit; nbits counts data bits, not symbols.
+ * Returns the sum of the renormalization adjustments made during this call,
+ * or -1 if p is NULL.
+ */
+int update_viterbi39_blk_sse2(void *p,unsigned char *syms,int nbits){
+  struct v39 *vp = p;
+  decision_t *d;
+  int path_metric = 0;
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->dp;
+  while(nbits--){
+    __m128i sym0v,sym1v,sym2v;
+    void *tmp;
+    int i;
+    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+    sym0v = _mm_set1_epi16(syms[0]);
+    sym1v = _mm_set1_epi16(syms[1]);
+    sym2v = _mm_set1_epi16(syms[2]);
+    syms += 3;
+    /* SSE2 doesn't support saturated adds on unsigned shorts, so we have to use signed shorts */
+    for(i=0;i<16;i++){
+      __m128i decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+      /* Form branch metrics
+       * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
+       * the XOR operations constitute conditional negation.
+       * metric and m_metric (-metric) are in the range 0-765
+       */
+      m0 = _mm_add_epi16(_mm_xor_si128(Branchtab39[0].v[i],sym0v),_mm_xor_si128(Branchtab39[1].v[i],sym1v));
+      metric = _mm_add_epi16(_mm_xor_si128(Branchtab39[2].v[i],sym2v),m0);
+      m_metric = _mm_sub_epi16(_mm_set1_epi16(765),metric);
+      /* Add branch metrics to path metrics */
+      m0 = _mm_adds_epi16(vp->old_metrics->v[i],metric);
+      m3 = _mm_adds_epi16(vp->old_metrics->v[16+i],metric);
+      m1 = _mm_adds_epi16(vp->old_metrics->v[16+i],m_metric);
+      m2 = _mm_adds_epi16(vp->old_metrics->v[i],m_metric);
+      /* Compare and select */
+      survivor0 = _mm_min_epi16(m0,m1);
+      survivor1 = _mm_min_epi16(m2,m3);
+      decision0 = _mm_cmpeq_epi16(survivor0,m1);
+      decision1 = _mm_cmpeq_epi16(survivor1,m3);
+      /* Pack each set of decisions into 8 8-bit bytes, then interleave them and compress into 16 bits */
+      d->s[i] = _mm_movemask_epi8(_mm_unpacklo_epi8(_mm_packs_epi16(decision0,_mm_setzero_si128()),_mm_packs_epi16(decision1,_mm_setzero_si128())));
+      /* Store surviving metrics */
+      vp->new_metrics->v[2*i] = _mm_unpacklo_epi16(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = _mm_unpackhi_epi16(survivor0,survivor1);
+    }
+    /* See if we need to renormalize */
+    if(vp->new_metrics->s[0] >= SHRT_MAX-5000){
+      int adjust;
+      __m128i adjustv;
+      union { __m128i v; signed short w[8]; } t;
+      /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */
+      adjustv = vp->new_metrics->v[0];
+      for(i=1;i<32;i++)
+        adjustv = _mm_min_epi16(adjustv,vp->new_metrics->v[i]);
+      /* Fold the 8 lanes down so lane 0 holds the overall minimum */
+      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,8));
+      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,4));
+      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,2));
+      t.v = adjustv;
+      adjust = t.w[0] - SHRT_MIN;
+      path_metric += adjust;
+      adjustv = _mm_set1_epi16(adjust);
+      /* We cannot use a saturated subtract, because we often have to adjust by more than SHRT_MAX
+       * This is okay since it can't overflow anyway
+       */
+      for(i=0;i<32;i++)
+        vp->new_metrics->v[i] = _mm_sub_epi16(vp->new_metrics->v[i],adjustv);
+    }
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d;
+  return path_metric;
+}
diff --git a/viterbi615.c b/viterbi615.c
new file mode 100644
index 0000000..6dda51f
--- /dev/null
+++ b/viterbi615.c
@@ -0,0 +1,155 @@
+/* K=15 r=1/6 Viterbi decoder with optional Intel or PowerPC SIMD
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include "fec.h"
+/* Create a new instance of a Viterbi decoder
+ * Calls find_cpu_mode() and then creates the variant selected by Cpu_mode.
+ * PORT is also the fallback for any mode not compiled in.
+ */
+void *create_viterbi615(int len){
+  find_cpu_mode();
+  switch(Cpu_mode){
+  case PORT:
+  default:
+    return create_viterbi615_port(len);
+#ifdef __VEC__
+  case ALTIVEC:
+    return create_viterbi615_av(len);
+#endif
+#ifdef __i386__
+  case MMX:
+    return create_viterbi615_mmx(len);
+  case SSE:
+    return create_viterbi615_sse(len);
+  case SSE2:
+    return create_viterbi615_sse2(len);
+#endif
+  }
+}
+/* Set the six generator polynomials for the variant selected by Cpu_mode.
+ * find_cpu_mode() is called first because this routine may run before the first
+ * create_viterbi615(), the only other place in this file that calls it; otherwise
+ * the tables of a different variant than the one later created could be filled.
+ * NOTE(review): assumes find_cpu_mode() may be called repeatedly, as
+ * create_viterbi615() already does on every call.
+ */
+void set_viterbi615_polynomial(int polys[6]){
+  find_cpu_mode();
+  switch(Cpu_mode){
+  case PORT:
+  default:
+    set_viterbi615_polynomial_port(polys);
+    break;
+#ifdef __VEC__
+  case ALTIVEC:
+    set_viterbi615_polynomial_av(polys);
+    break;
+#endif
+#ifdef __i386__
+  case MMX:
+    set_viterbi615_polynomial_mmx(polys);
+    break;
+  case SSE:
+    set_viterbi615_polynomial_sse(polys);
+    break;
+  case SSE2:
+    set_viterbi615_polynomial_sse2(polys);
+    break;
+#endif
+  }
+}
+/* Initialize Viterbi decoder for start of new frame
+ * Forwards to the init routine of the variant selected by Cpu_mode and returns
+ * its result; PORT is the fallback.
+ */
+int init_viterbi615(void *p,int starting_state){
+  switch(Cpu_mode){
+  case PORT:
+  default:
+    return init_viterbi615_port(p,starting_state);
+#ifdef __VEC__
+  case ALTIVEC:
+    return init_viterbi615_av(p,starting_state);
+#endif
+#ifdef __i386__
+  case MMX:
+    return init_viterbi615_mmx(p,starting_state);
+  case SSE:
+    return init_viterbi615_sse(p,starting_state);
+  case SSE2:
+    return init_viterbi615_sse2(p,starting_state);
+#endif
+  }
+}
+/* Viterbi chainback
+ * Forwards to the chainback routine of the variant selected by Cpu_mode and
+ * returns its result; PORT is the fallback.
+ */
+int chainback_viterbi615(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  switch(Cpu_mode){
+  case PORT:
+  default:
+    return chainback_viterbi615_port(p,data,nbits,endstate);
+#ifdef __VEC__
+  case ALTIVEC:
+    return chainback_viterbi615_av(p,data,nbits,endstate);
+#endif
+#ifdef __i386__
+  case MMX:
+    return chainback_viterbi615_mmx(p,data,nbits,endstate);
+  case SSE:
+    return chainback_viterbi615_sse(p,data,nbits,endstate);
+  case SSE2:
+    return chainback_viterbi615_sse2(p,data,nbits,endstate);
+#endif
+  }
+}
+/* Delete instance of a Viterbi decoder
+ * Forwards to the delete routine of the variant selected by Cpu_mode; PORT is
+ * the fallback.
+ */
+void delete_viterbi615(void *p){
+  switch(Cpu_mode){
+  case PORT:
+  default:
+    delete_viterbi615_port(p);
+    break;
+#ifdef __VEC__
+  case ALTIVEC:
+    delete_viterbi615_av(p);
+    break;
+#endif
+#ifdef __i386__
+  case MMX:
+    delete_viterbi615_mmx(p);
+    break;
+  case SSE:
+    delete_viterbi615_sse(p);
+    break;
+  case SSE2:
+    delete_viterbi615_sse2(p);
+    break;
+#endif
+  }
+}
+/* Update decoder with a block of demodulated symbols
+ * Note that nbits is the number of decoded data bits, not the number
+ * of symbols!
+ * Forwards to the update routine of the variant selected by Cpu_mode and
+ * returns its result; PORT is the fallback.
+ */
+int update_viterbi615_blk(void *p,unsigned char syms[],int nbits){
+  switch(Cpu_mode){
+  case PORT:
+  default:
+    return update_viterbi615_blk_port(p,syms,nbits);
+#ifdef __VEC__
+  case ALTIVEC:
+    return update_viterbi615_blk_av(p,syms,nbits);
+#endif
+#ifdef __i386__
+  case MMX:
+    return update_viterbi615_blk_mmx(p,syms,nbits);
+  case SSE:
+    return update_viterbi615_blk_sse(p,syms,nbits);
+  case SSE2:
+    return update_viterbi615_blk_sse2(p,syms,nbits);
+#endif
+  }
+}
diff --git a/viterbi615_av.c b/viterbi615_av.c
new file mode 100644
index 0000000..4a6ce9c
--- /dev/null
+++ b/viterbi615_av.c
@@ -0,0 +1,257 @@
+/* K=15 r=1/6 Viterbi decoder for PowerPC G4/G5 Altivec vector instructions
+ * 8-bit offset-binary soft decision samples
+ * Copyright Mar 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <limits.h>
+#include "fec.h"
+/* Decisions for one decoded bit, stored as 128 vectors of 16 bytes in the
+ * interleaved layout that the chainback routine undoes
+ */
+typedef union { unsigned char c[128][16]; vector unsigned char v[128]; } decision_t;
+/* Path metrics for the 16384 states, as shorts or as 2048 vectors of 8 shorts */
+typedef union { unsigned short s[16384]; vector unsigned short v[2048]; } metric_t;
+/* One table per symbol; each holds a 0/255 entry for every one of the 8192 butterflies */
+static union branchtab615 { unsigned short s[8192]; vector unsigned short v[1024];} Branchtab615[6];
+static int Init = 0; /* Nonzero once Branchtab615 has been filled in */
+/* State info for instance of Viterbi decoder */
+struct v615 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  void *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  void *decisions;   /* Beginning of decisions for block */
+};
+/* Initialize Viterbi decoder for start of new frame
+ * Sets every path metric to 5000 and the metric of starting_state (low 14 bits)
+ * to 0, and rewinds the decision pointer.
+ * Returns 0 on success, -1 if p is NULL.
+ */
+int init_viterbi615_av(void *p,int starting_state){
+  struct v615 *vp = p;
+  int i;
+  if(p == NULL)
+    return -1;
+  for(i=0;i<2048;i++)
+    vp->metrics1.v[i] = (vector unsigned short)(5000);
+  vp->old_metrics = &vp->metrics1;
+  vp->new_metrics = &vp->metrics2;
+  vp->dp = vp->decisions;
+  vp->old_metrics->s[starting_state & 16383] = 0; /* Bias known start state */
+  return 0;
+}
+/* Create a new instance of a Viterbi decoder
+ * len is the frame length in data bits; room for len+14 decision sets is allocated.
+ * Returns NULL if either allocation fails (previously both results were used
+ * unchecked).
+ */
+void *create_viterbi615_av(int len){
+  struct v615 *vp;
+  if(!Init){
+    int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF };
+    set_viterbi615_polynomial_av(polys);
+  }
+  if((vp = (struct v615 *)malloc(sizeof(struct v615))) == NULL)
+    return NULL;
+  if((vp->decisions = malloc(sizeof(decision_t)*(len+14))) == NULL){
+    free(vp);
+    return NULL;
+  }
+  init_viterbi615_av(vp,0);
+  return vp;
+}
+/* Build the branch tables from six generator polynomials.
+ * A negative polynomial inverts the table entries; abs() gives the tap mask.
+ */
+void set_viterbi615_polynomial_av(int polys[6]){
+  int state;
+  int i;
+  for(state=0;state < 8192;state++){
+    for(i=0;i<6;i++)
+      Branchtab615[i].s[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 255 : 0;
+  }
+  Init++;
+}
+/* Viterbi chainback
+ * Walks the stored decisions backwards from endstate for nbits bits and writes
+ * the decoded bytes into data[].
+ * Returns the path metric held for endstate, or -1 if p is NULL.
+ */
+int chainback_viterbi615_av(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v615 *vp = p;
+  decision_t *d;
+  int path_metric;
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->decisions;
+  endstate %= 16384;
+  path_metric = vp->old_metrics->s[endstate];
+  /* The store into data[] only needs to be done every 8 bits.
+   * But this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  d += 14; /* Look past tail */
+  while(nbits-- != 0){
+    int k;
+    /* Undo the interleaved packing used by the update routine: vector
+     * endstate>>7, byte endstate&15, bit selected by (endstate>>4)&7
+     */
+    k = (d[nbits].c[endstate >> 7][endstate & 15] & (0x80 >> ((endstate>>4)&7)) ) ? 1 : 0;
+    endstate = (k << 13) | (endstate >> 1);
+    data[nbits>>3] = endstate >> 6;
+  }
+  return path_metric;
+}
+/* Delete instance of a Viterbi decoder
+ * Frees the decision buffer and the decoder itself; a NULL p is ignored.
+ */
+void delete_viterbi615_av(void *p){
+  struct v615 *vp = p;
+  if(vp != NULL){
+    free(vp->decisions);
+    free(vp);
+  }
+}
+/* Update decoder with a block of demodulated symbols
+ * syms supplies 6 symbols per data bit; nbits counts data bits, not symbols.
+ * Returns the sum of the minimum metrics subtracted by renormalization during
+ * this call.
+ */
+int update_viterbi615_blk_av(void *p,unsigned char *syms,int nbits){
+  struct v615 *vp = p;
+  decision_t *d = (decision_t *)vp->dp;
+  int path_metric = 0;
+  vector unsigned char decisions = (vector unsigned char)(0);
+  while(nbits--){
+    vector unsigned short symv,sym0v,sym1v,sym2v,sym3v,sym4v,sym5v;
+    vector unsigned char s;
+    void *tmp;
+    int i;
+    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+    s = (vector unsigned char)vec_perm(vec_ld(0,syms),vec_ld(5,syms),vec_lvsl(0,syms));
+    symv = (vector unsigned short)vec_mergeh((vector unsigned char)(0),s); /* Unsigned byte->word unpack */
+    sym0v = vec_splat(symv,0);
+    sym1v = vec_splat(symv,1);
+    sym2v = vec_splat(symv,2);
+    sym3v = vec_splat(symv,3);
+    sym4v = vec_splat(symv,4);
+    sym5v = vec_splat(symv,5);
+    syms += 6;
+    for(i=0;i<1024;i++){
+      vector bool short decision0,decision1;
+      vector unsigned short metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+      /* Form branch metrics
+       * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
+       * the XOR operations constitute conditional negation.
+       * metric and m_metric (-metric) are in the range 0-1530
+       */
+      m0 = vec_add(vec_xor(Branchtab615[0].v[i],sym0v),vec_xor(Branchtab615[1].v[i],sym1v));
+      m1 = vec_add(vec_xor(Branchtab615[2].v[i],sym2v),vec_xor(Branchtab615[3].v[i],sym3v));
+      m2 = vec_add(vec_xor(Branchtab615[4].v[i],sym4v),vec_xor(Branchtab615[5].v[i],sym5v));
+      metric = vec_add(m0,m1);
+      metric = vec_add(metric,m2);
+      m_metric = vec_sub((vector unsigned short)(1530),metric);
+      /* Add branch metrics to path metrics */
+      m0 = vec_adds(vp->old_metrics->v[i],metric);
+      m3 = vec_adds(vp->old_metrics->v[1024+i],metric);
+      m1 = vec_adds(vp->old_metrics->v[1024+i],m_metric);
+      m2 = vec_adds(vp->old_metrics->v[i],m_metric);
+      /* Compare and select */
+      decision0 = vec_cmpgt(m0,m1);
+      decision1 = vec_cmpgt(m2,m3);
+      survivor0 = vec_min(m0,m1);
+      survivor1 = vec_min(m2,m3);
+      /* Store decisions and survivors.
+       * To save space without SSE2's handy PMOVMSKB instruction, we pack and store them in
+       * a funny interleaved fashion that we undo in the chainback function.
+       */
+      decisions = vec_add(decisions,decisions); /* Shift each byte 1 bit to the left */
+      /* Booleans are either 0xff or 0x00. Subtracting 0x00 leaves the lsb zero; subtracting
+       * 0xff is equivalent to adding 1, which sets the lsb.
+       */
+      decisions = vec_sub(decisions,(vector unsigned char)vec_pack(vec_mergeh(decision0,decision1),vec_mergel(decision0,decision1)));
+      vp->new_metrics->v[2*i] = vec_mergeh(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = vec_mergel(survivor0,survivor1);
+      if((i % 8) == 7){
+        /* We've accumulated a total of 128 decisions, stash and start again */
+        d->v[i>>3] = decisions; /* No need to clear, the new bits will replace the old */
+      }
+    }
+#if 0
+    /* Experimentally determine metric spread
+     * The results are fixed for a given code and input symbol size
+     */
+    {
+      int i;
+      vector unsigned short min_metric;
+      vector unsigned short max_metric;
+      union { vector unsigned short v; unsigned short s[8];} t;
+      int minimum,maximum;
+      static int max_spread = 0;
+      min_metric = max_metric = vp->new_metrics->v[0];
+      for(i=1;i<2048;i++){
+        min_metric = vec_min(min_metric,vp->new_metrics->v[i]);
+        max_metric = vec_max(max_metric,vp->new_metrics->v[i]);
+      }
+      min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,8));
+      max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,8));
+      min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,4));
+      max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,4));
+      min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,2));
+      max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,2));
+      t.v = min_metric;
+      minimum = t.s[0];
+      t.v = max_metric;
+      maximum = t.s[0];
+      if(maximum-minimum > max_spread){
+        max_spread = maximum-minimum;
+        printf("metric spread = %d\n",max_spread);
+      }
+    }
+#endif
+    /* Renormalize if necessary. This deserves some explanation.
+     * The maximum possible spread, found by experiment, for 4-bit symbols is 405; for 8 bit symbols, it's 12750.
+     * So by looking at one arbitrary metric we can tell if any of them have possibly saturated.
+     * However, this is very conservative. Large spreads occur only at very high Eb/No, where
+     * saturating a bad path metric doesn't do much to increase its chances of being erroneously chosen as a survivor.
+     * At more interesting (low) Eb/No ratios, the spreads are much smaller so our chances of saturating a metric
+     * by not normalizing when we should are extremely low. So either way, the risk to performance is small.
+     * All this is borne out by experiment.
+     */
+    if(vp->new_metrics->s[0] >= USHRT_MAX-12750){
+      vector unsigned short scale;
+      union { vector unsigned short v; unsigned short s[8];} t;
+      /* Find smallest metric and splat */
+      scale = vp->new_metrics->v[0];
+      for(i=1;i<2048;i++)
+        scale = vec_min(scale,vp->new_metrics->v[i]);
+      scale = vec_min(scale,vec_sld(scale,scale,8));
+      scale = vec_min(scale,vec_sld(scale,scale,4));
+      scale = vec_min(scale,vec_sld(scale,scale,2));
+      /* Subtract it from all metrics
+       * Work backwards to try to improve the cache hit ratio, assuming LRU
+       */
+      for(i=2047;i>=0;i--)
+        vp->new_metrics->v[i] = vec_subs(vp->new_metrics->v[i],scale);
+      t.v = scale;
+      path_metric += t.s[0];
+    }
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d;
+  return path_metric;
+}
diff --git a/viterbi615_mmx.c b/viterbi615_mmx.c
new file mode 100644
index 0000000..89a56f7
--- /dev/null
+++ b/viterbi615_mmx.c
@@ -0,0 +1,183 @@
+/* K=15 r=1/6 Viterbi decoder for x86 MMX
+ * Mar 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <mmintrin.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include "fec.h"
+/* Decisions for one decoded bit: one byte per state (16384 states) */
+typedef union { unsigned char c[16384]; __m64 v[2048];} decision_t;
+/* Path metrics for the 16384 states, as shorts or as 4096 vectors of 4 shorts */
+typedef union { unsigned short s[16384]; __m64 v[4096];} metric_t;
+/* One table per symbol; each holds a 0/255 entry for every one of the 8192 butterflies */
+static union branchtab615 { unsigned short s[8192]; __m64 v[2048];} Branchtab615[6];
+static int Init = 0; /* Nonzero once Branchtab615 has been filled in */
+/* State info for instance of Viterbi decoder */
+struct v615 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  void *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  void *decisions;   /* Beginning of decisions for block */
+};
+/* Initialize Viterbi decoder for start of new frame
+ * Sets every path metric to 5000 and the metric of starting_state (low 14 bits)
+ * to 0, and rewinds the decision pointer.
+ * Returns 0 on success, -1 if p is NULL.
+ */
+int init_viterbi615_mmx(void *p,int starting_state){
+  struct v615 *vp = p;
+  int i;
+  if(p == NULL)
+    return -1;
+  for(i=0;i<16384;i++)
+    vp->metrics1.s[i] = 5000;
+  vp->old_metrics = &vp->metrics1;
+  vp->new_metrics = &vp->metrics2;
+  vp->dp = vp->decisions;
+  vp->old_metrics->s[starting_state & 16383] = 0; /* Bias known start state */
+  return 0;
+}
+/* Create a new instance of a Viterbi decoder
+ * len is the frame length in data bits; room for len+14 decision sets is allocated.
+ * The branch tables are built from the default polynomials on first use.
+ * Returns NULL if either allocation fails.
+ */
+void *create_viterbi615_mmx(int len){
+  struct v615 *vp;
+  if(!Init){
+    int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF };
+    set_viterbi615_polynomial_mmx(polys);
+  }
+  if((vp = (struct v615 *)malloc(sizeof(struct v615))) == NULL)
+    return NULL;
+  if((vp->decisions = malloc((len+14)*sizeof(decision_t))) == NULL){
+    free(vp);
+    return NULL;
+  }
+  init_viterbi615_mmx(vp,0);
+  return vp;
+}
+/* Build the branch tables from six generator polynomials.
+ * A negative polynomial inverts the table entries; abs() gives the tap mask.
+ */
+void set_viterbi615_polynomial_mmx(int polys[6]){
+  int state;
+  int i;
+  for(state=0;state < 8192;state++){
+    for(i=0;i<6;i++)
+      Branchtab615[i].s[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 255 : 0;
+  }
+  Init++;
+}
+/* Viterbi chainback
+ * Walks the stored decisions backwards from endstate for nbits bits and writes
+ * the decoded bytes into data[].
+ * Returns 0, or -1 if p is NULL.
+ */
+int chainback_viterbi615_mmx(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v615 *vp = p;
+  decision_t *d;
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->decisions;
+  endstate %= 16384;
+  /* The store into data[] only needs to be done every 8 bits.
+   * But this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  d += 14; /* Look past tail */
+  while(nbits-- != 0){
+    int k;
+    k = d[nbits].c[endstate] & 1; /* Decisions are stored one byte per state */
+    endstate = (k << 13) | (endstate >> 1);
+    data[nbits>>3] = endstate >> 6;
+  }
+  return 0;
+}
+/* Delete instance of a Viterbi decoder
+ * Frees the decision buffer and the decoder itself; a NULL p is ignored.
+ */
+void delete_viterbi615_mmx(void *p){
+  struct v615 *vp = p;
+  if(vp != NULL){
+    free(vp->decisions);
+    free(vp);
+  }
+}
+/* Update decoder with a block of demodulated symbols
+ * syms supplies 6 symbols per data bit; nbits counts data bits, not symbols.
+ * Returns 0, or -1 if p is NULL.
+ */
+int update_viterbi615_blk_mmx(void *p,unsigned char *syms,int nbits){
+  struct v615 *vp = p;
+  decision_t *d;
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->dp;
+  while(nbits--){
+    __m64 sym0v,sym1v,sym2v,sym3v,sym4v,sym5v;
+    void *tmp;
+    int i;
+    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+    sym0v = _mm_set1_pi16(syms[0]);
+    sym1v = _mm_set1_pi16(syms[1]);
+    sym2v = _mm_set1_pi16(syms[2]);
+    sym3v = _mm_set1_pi16(syms[3]);
+    sym4v = _mm_set1_pi16(syms[4]);
+    sym5v = _mm_set1_pi16(syms[5]);
+    syms += 6;
+    for(i=0;i<2048;i++){
+      __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+      /* Form branch metrics
+       * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
+       * the XOR operations constitute conditional negation.
+       * metric and m_metric (-metric) are in the range 0-1530
+       */
+      m0 = _mm_add_pi16(_mm_xor_si64(Branchtab615[0].v[i],sym0v),_mm_xor_si64(Branchtab615[1].v[i],sym1v));
+      m1 = _mm_add_pi16(_mm_xor_si64(Branchtab615[2].v[i],sym2v),_mm_xor_si64(Branchtab615[3].v[i],sym3v));
+      m2 = _mm_add_pi16(_mm_xor_si64(Branchtab615[4].v[i],sym4v),_mm_xor_si64(Branchtab615[5].v[i],sym5v));
+      metric = _mm_add_pi16(m0,_mm_add_pi16(m1,m2));
+      m_metric = _mm_sub_pi16(_mm_set1_pi16(1530),metric);
+      /* Add branch metrics to path metrics */
+      m0 = _mm_add_pi16(vp->old_metrics->v[i],metric);
+      m3 = _mm_add_pi16(vp->old_metrics->v[2048+i],metric);
+      m1 = _mm_add_pi16(vp->old_metrics->v[2048+i],m_metric);
+      m2 = _mm_add_pi16(vp->old_metrics->v[i],m_metric);
+      /* Compare and select
+       * There's no packed min instruction in MMX, so we use modulo arithmetic
+       * to form the decisions and then do the select the hard way
+       */
+      decision0 = _mm_cmpgt_pi16(_mm_sub_pi16(m0,m1),_mm_setzero_si64());
+      decision1 = _mm_cmpgt_pi16(_mm_sub_pi16(m2,m3),_mm_setzero_si64());
+      survivor0 = _mm_or_si64(_mm_and_si64(decision0,m1),_mm_andnot_si64(decision0,m0));
+      survivor1 = _mm_or_si64(_mm_and_si64(decision1,m3),_mm_andnot_si64(decision1,m2));
+      /* Merge decisions and store as bytes */
+      d->v[i] = _mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64()));
+      /* Store surviving metrics */
+      vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1);
+    }
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d;
+  _mm_empty(); /* Leave MMX state before returning to the caller */
+  return 0;
+}
diff --git a/viterbi615_port.c b/viterbi615_port.c
new file mode 100644
index 0000000..89bdd80
--- /dev/null
+++ b/viterbi615_port.c
@@ -0,0 +1,156 @@
+/* K=15 r=1/6 Viterbi decoder in portable C
+ * Copyright Mar 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <limits.h>
+#include "fec.h"
+/* Decisions for one decoded bit: one bit per state, 16384 states = 2048 bytes.
+ * Only c[] is used in this file; w[] is kept as 32-bit words so the union stays
+ * 2048 bytes even where "unsigned long" is 64 bits.
+ */
+typedef union { unsigned int w[512]; unsigned char c[2048];} decision_t;
+/* Path metrics for the 16384 states */
+typedef union { unsigned long w[16384]; } metric_t;
+/* One table per symbol; each holds a 0/255 entry for every one of the 8192 butterflies */
+static union branchtab615 { unsigned long w[8192]; } Branchtab615[6] __attribute__ ((aligned(16)));
+static int Init = 0; /* Nonzero once Branchtab615 has been filled in */
+/* State info for instance of Viterbi decoder */
+struct v615 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  decision_t *dp;    /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions; /* Beginning of decisions for block */
+};
+/* Create a new instance of a Viterbi decoder
+ * len is the frame length in data bits; room for len+14 decision sets is allocated.
+ * Returns NULL if either allocation fails.
+ */
+void *create_viterbi615_port(int len){
+  struct v615 *vp;
+  if(!Init){
+    int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF };
+    set_viterbi615_polynomial_port(polys);
+  }
+  if((vp = (struct v615 *)malloc(sizeof(struct v615))) == NULL)
+    return NULL;
+  if((vp->decisions = malloc((len+14)*sizeof(decision_t))) == NULL){
+    free(vp);
+    return NULL;
+  }
+  /* Must be the portable init: the generic init_viterbi615() dispatches on
+   * Cpu_mode and would apply another variant's layout to this struct
+   */
+  init_viterbi615_port(vp,0);
+  return vp;
+}
+/* Build the branch tables from six generator polynomials.
+ * A negative polynomial inverts the table entries; abs() gives the tap mask.
+ */
+void set_viterbi615_polynomial_port(int polys[6]){
+  int state;
+  int i;
+  for(state=0;state < 8192;state++){
+    for(i=0;i<6;i++)
+      Branchtab615[i].w[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 255 : 0;
+  }
+  Init++;
+}
+/* Initialize Viterbi decoder for start of new frame
+ * Sets every path metric to 1000 and the metric of starting_state (low 14 bits)
+ * to 0, and rewinds the decision pointer.
+ * Returns 0 on success, -1 if p is NULL.
+ */
+int init_viterbi615_port(void *p,int starting_state){
+  struct v615 *vp = p;
+  int i;
+  if(p == NULL)
+    return -1;
+  for(i=0;i<16384;i++)
+    vp->metrics1.w[i] = 1000;
+  vp->old_metrics = &vp->metrics1;
+  vp->new_metrics = &vp->metrics2;
+  vp->dp = vp->decisions;
+  vp->old_metrics->w[starting_state & 16383] = 0; /* Bias known start state */
+  return 0;
+}
+/* Viterbi chainback
+ * Walks the stored decisions backwards from endstate for nbits bits and writes
+ * the decoded bytes into data[].
+ * Returns 0, or -1 if p is NULL.
+ */
+int chainback_viterbi615_port(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v615 *vp = p;
+  decision_t *d;
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->decisions;
+  endstate %= 16384;
+  /* The store into data[] only needs to be done every 8 bits.
+   * But this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  d += 14; /* Look past tail */
+  while(nbits-- != 0){
+    int k;
+    k = (d[nbits].c[endstate/8] >> (endstate%8)) & 1; /* One decision bit per state */
+    endstate = (k << 13) | (endstate >> 1);
+    data[nbits>>3] = endstate >> 6;
+  }
+  return 0;
+}
+/* Delete instance of a Viterbi decoder
+ * Frees the decision buffer and the decoder itself; a NULL p is ignored.
+ */
+void delete_viterbi615_port(void *p){
+  struct v615 *vp = p;
+  if(vp != NULL){
+    free(vp->decisions);
+    free(vp);
+  }
+}
+/* C-language butterfly
+ * Expects vp, syms and d in the expanding scope. Handles butterfly i: updates
+ * new-metric entries 2*i and 2*i+1 from old-metric entries i and i+8192 and ORs
+ * the two decision bits into d->c[] (d must have been cleared beforehand).
+ */
+#define BFLY(i) {\
+unsigned long metric,m0,m1,m2,m3,decision0,decision1;\
+    metric = ((Branchtab615[0].w[i] ^ syms[0]) + (Branchtab615[1].w[i] ^ syms[1])\
+             +(Branchtab615[2].w[i] ^ syms[2]) + (Branchtab615[3].w[i] ^ syms[3])\
+             +(Branchtab615[4].w[i] ^ syms[4]) + (Branchtab615[5].w[i] ^ syms[5]));\
+    m0 = vp->old_metrics->w[i] + metric;\
+    m1 = vp->old_metrics->w[i+8192] + (1530 - metric);\
+    m2 = vp->old_metrics->w[i] + (1530-metric);\
+    m3 = vp->old_metrics->w[i+8192] + metric;\
+    decision0 = (signed long)(m0-m1) >= 0;\
+    decision1 = (signed long)(m2-m3) >= 0;\
+    vp->new_metrics->w[2*i] = decision0 ? m1 : m0;\
+    vp->new_metrics->w[2*i+1] = decision1 ? m3 : m2;\
+    d->c[i/4] |= ((decision0|(decision1<<1)) << ((2*i)&7));\
+}
+/* Update decoder with a block of demodulated symbols
+ * Note that nbits is the number of decoded data bits, not the number
+ * of symbols!
+ * Consumes 6 symbols from syms per data bit. Returns 0, or -1 if p is NULL.
+ */
+int update_viterbi615_blk_port(void *p,unsigned char *syms,int nbits){
+  struct v615 *vp = p;
+  void *tmp;
+  decision_t *d;
+  int i;
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->dp;
+  while(nbits--){
+    memset(d,0,sizeof(decision_t)); /* BFLY ORs its bits in, so start from zero */
+    for(i=0;i<8192;i++)
+      BFLY(i);
+    syms += 6;
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d;
+  return 0;
+}
diff --git a/viterbi615_sse.c b/viterbi615_sse.c
new file mode 100644
index 0000000..de0f8af
--- /dev/null
+++ b/viterbi615_sse.c
@@ -0,0 +1,201 @@
+/* K=15 r=1/6 Viterbi decoder for x86 SSE
+ * Copyright Mar 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <xmmintrin.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <limits.h>
+#include "fec.h"
+/* Decisions for one decoded bit: one bit per state, 16384 states = 2048 bytes.
+ * Only c[] is used in this file; w[] is kept as 32-bit words so the union stays
+ * 2048 bytes even where "unsigned long" is 64 bits.
+ */
+typedef union { unsigned int w[512]; unsigned char c[2048];} decision_t;
+/* Path metrics for the 16384 states, as shorts or as 4096 vectors of 4 shorts */
+typedef union { signed short s[16384]; __m64 v[4096];} metric_t;
+/* One table per symbol; each holds a 0/255 entry for every one of the 8192 butterflies */
+static union branchtab615 { unsigned short s[8192]; __m64 v[2048];} Branchtab615[6];
+static int Init = 0; /* Nonzero once Branchtab615 has been filled in */
+/* State info for instance of Viterbi decoder */
+struct v615 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  void *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  void *decisions;   /* Beginning of decisions for block */
+};
+/* Initialize Viterbi decoder for start of new frame
+ * Sets every path metric to SHRT_MIN+5000 and the metric of starting_state
+ * (low 14 bits) to SHRT_MIN, and rewinds the decision pointer.
+ * Returns 0 on success, -1 if p is NULL.
+ */
+int init_viterbi615_sse(void *p,int starting_state){
+  struct v615 *vp = p;
+  int i;
+  if(p == NULL)
+    return -1;
+  for(i=0;i<16384;i++)
+    vp->metrics1.s[i] = (SHRT_MIN+5000);
+  vp->old_metrics = &vp->metrics1;
+  vp->new_metrics = &vp->metrics2;
+  vp->dp = vp->decisions;
+  vp->old_metrics->s[starting_state & 16383] = SHRT_MIN; /* Bias known start state */
+  return 0;
+}
+/* Create a new instance of a Viterbi decoder
+ * len is the frame length in data bits; room for len+14 decision sets is allocated.
+ * The branch tables are built from the default polynomials on first use.
+ * Returns NULL if either allocation fails.
+ */
+void *create_viterbi615_sse(int len){
+  struct v615 *vp;
+  if(!Init){
+    int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF };
+    set_viterbi615_polynomial_sse(polys);
+  }
+  if((vp = (struct v615 *)malloc(sizeof(struct v615))) == NULL){
+    return NULL;
+  }
+  if((vp->decisions = malloc((len+14)*sizeof(decision_t))) == NULL){
+    free(vp);
+    return NULL;
+  }
+  init_viterbi615_sse(vp,0);
+  return vp;
+}
+/* Build the branch tables from six generator polynomials.
+ * A negative polynomial inverts the table entries; abs() gives the tap mask.
+ */
+void set_viterbi615_polynomial_sse(int polys[6]){
+  int state;
+  int i;
+  for(state=0;state < 8192;state++){
+    for(i=0;i<6;i++)
+      Branchtab615[i].s[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 255 : 0;
+  }
+  Init++;
+}
+/* Viterbi chainback
+ * Walks the stored decisions backwards from endstate for nbits bits and writes
+ * the decoded bytes into data[].
+ * Returns 0, or -1 if p is NULL.
+ */
+int chainback_viterbi615_sse(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v615 *vp = p;
+  decision_t *d;
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->decisions;
+  endstate %= 16384;
+  /* The store into data[] only needs to be done every 8 bits.
+   * But this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  d += 14; /* Look past tail */
+  while(nbits-- != 0){
+    int k;
+    /* Byte-wise lookup matches the bytes the update routine stores in c[] */
+    k = (d[nbits].c[endstate/8] >> (endstate%8)) & 1;
+    endstate = (k << 13) | (endstate >> 1);
+    data[nbits>>3] = endstate >> 6;
+  }
+  return 0;
+}
+/* Delete instance of a Viterbi decoder
+ * Frees the decision buffer and the decoder itself; a NULL p is ignored.
+ */
+void delete_viterbi615_sse(void *p){
+  struct v615 *vp = p;
+  if(vp != NULL){
+    free(vp->decisions);
+    free(vp);
+  }
+}
+/* Update decoder with a block of demodulated symbols
+ * syms supplies 6 symbols per data bit; nbits counts data bits, not symbols.
+ * Returns 0, or -1 if p is NULL.
+ */
+int update_viterbi615_blk_sse(void *p,unsigned char *syms,int nbits){
+  struct v615 *vp = p;
+  decision_t *d;
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->dp;
+  while(nbits--){
+    __m64 sym0v,sym1v,sym2v,sym3v,sym4v,sym5v;
+    void *tmp;
+    int i;
+    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+    sym0v = _mm_set1_pi16(syms[0]);
+    sym1v = _mm_set1_pi16(syms[1]);
+    sym2v = _mm_set1_pi16(syms[2]);
+    sym3v = _mm_set1_pi16(syms[3]);
+    sym4v = _mm_set1_pi16(syms[4]);
+    sym5v = _mm_set1_pi16(syms[5]);
+    syms += 6;
+    for(i=0;i<2048;i++){
+      __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+      /* Form branch metrics
+       * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
+       * the XOR operations constitute conditional negation.
+       * metric and m_metric (-metric) are in the range 0-1530
+       */
+      m0 = _mm_add_pi16(_mm_xor_si64(Branchtab615[0].v[i],sym0v),_mm_xor_si64(Branchtab615[1].v[i],sym1v));
+      m1 = _mm_add_pi16(_mm_xor_si64(Branchtab615[2].v[i],sym2v),_mm_xor_si64(Branchtab615[3].v[i],sym3v));
+      m2 = _mm_add_pi16(_mm_xor_si64(Branchtab615[4].v[i],sym4v),_mm_xor_si64(Branchtab615[5].v[i],sym5v));
+      metric = _mm_add_pi16(m0,_mm_add_pi16(m1,m2));
+      m_metric = _mm_sub_pi16(_mm_set1_pi16(1530),metric);
+      /* Add branch metrics to path metrics */
+      m0 = _mm_adds_pi16(vp->old_metrics->v[i],metric);
+      m3 = _mm_adds_pi16(vp->old_metrics->v[2048+i],metric);
+      m1 = _mm_adds_pi16(vp->old_metrics->v[2048+i],m_metric);
+      m2 = _mm_adds_pi16(vp->old_metrics->v[i],m_metric);
+      /* Compare and select */
+      survivor0 = _mm_min_pi16(m0,m1);
+      survivor1 = _mm_min_pi16(m2,m3);
+      decision0 = _mm_cmpeq_pi16(survivor0,m1);
+      decision1 = _mm_cmpeq_pi16(survivor1,m3);
+      /* Pack decisions into 8 bits and store */
+      d->c[i] = _mm_movemask_pi8(_mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64())));
+      /* Store surviving metrics */
+      vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1);
+    }
+    /* See if we need to renormalize
+     * Max metric spread for this code with 0-255 branch metrics is 12750
+     */
+    if(vp->new_metrics->s[0] >= SHRT_MAX-12750){
+      int adjust;
+      __m64 adjustv;
+      union { __m64 v; signed short w[4]; } t;
+      /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */
+      adjustv = vp->new_metrics->v[0];
+      for(i=1;i<4096;i++)
+        adjustv = _mm_min_pi16(adjustv,vp->new_metrics->v[i]);
+      /* Fold the 4 lanes down so lane 0 holds the overall minimum */
+      adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,32));
+      adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,16));
+      t.v = adjustv;
+      adjust = t.w[0] - SHRT_MIN;
+      adjustv = _mm_set1_pi16(adjust);
+      for(i=0;i<4096;i++)
+        vp->new_metrics->v[i] = _mm_sub_pi16(vp->new_metrics->v[i],adjustv);
+    }
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d;
+  _mm_empty(); /* Leave MMX state before returning to the caller */
+  return 0;
+}
diff --git a/viterbi615_sse2.c b/viterbi615_sse2.c
new file mode 100644
index 0000000..7f711e5
--- /dev/null
+++ b/viterbi615_sse2.c
@@ -0,0 +1,204 @@
+/* K=15 r=1/6 Viterbi decoder for x86 SSE2
+ * Copyright Mar 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <emmintrin.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <limits.h>
+#include "fec.h"
+/* Survivor decisions for one trellis step: 16384 states, one bit per state.
+ * The decoder stores them through the 16-bit view s[]; w[] is a 32-bit view of
+ * the same 2048 bytes. w[] is declared "unsigned int" rather than
+ * "unsigned long" so that it stays 32 bits wide on LP64 targets; with a 64-bit
+ * long the union doubles in size and indexing w[] as 32-bit words
+ * (word state/32, bit state%32) picks the wrong bit.
+ */
+typedef union { unsigned int w[512]; unsigned short s[1024];} decision_t;
+/* Path metrics for all 16384 states, viewable as shorts or as SSE2 vectors */
+typedef union { signed short s[16384]; __m128i v[2048];} metric_t;
+/* One branch table per generator polynomial; entries are 0 or 255 */
+static union branchtab615 { unsigned short s[8192]; __m128i v[1024];} Branchtab615[6];
+/* Nonzero once the branch tables have been filled in */
+static int Init = 0;
+/* State info for instance of Viterbi decoder */
+struct v615 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  void *dp; /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  void *decisions; /* Beginning of decisions for block */
+};
+/* Initialize Viterbi decoder for start of new frame.
+ * p is the instance returned by create_viterbi615_sse2(); starting_state is
+ * the known initial encoder state (only its low 14 bits are used).
+ * Returns 0 on success, -1 if p is NULL.
+ */
+int init_viterbi615_sse2(void *p,int starting_state){
+  struct v615 *vp = p;
+  int i;
+  if(p == NULL)
+    return -1;
+  /* Every state starts at the same metric, 5000 above the minimum */
+  for(i=0;i<16384;i++)
+    vp->metrics1.s[i] = (SHRT_MIN+5000);
+  vp->old_metrics = &vp->metrics1;
+  vp->new_metrics = &vp->metrics2;
+  /* Rewind the decision pointer to the start of the block */
+  vp->dp = vp->decisions;
+  vp->old_metrics->s[starting_state & 16383] = SHRT_MIN; /* Bias known start state */
+  return 0;
+}
+/* Create a new instance of a Viterbi decoder.
+ * len is the number of data bits per frame; room is reserved for len+14
+ * decision records (data plus tail). Returns the new instance, already
+ * initialized for starting state 0, or NULL if len is negative or memory
+ * cannot be obtained. Release it with delete_viterbi615_sse2().
+ */
+void *create_viterbi615_sse2(int len){
+  void *p;
+  struct v615 *vp;
+  if(len < 0)
+    return NULL;
+  /* Fill the branch tables with the default polynomials on first use */
+  if(!Init){
+    int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF };
+    set_viterbi615_polynomial_sse2(polys);
+  }
+  /* Ordinary malloc() only returns 8-byte alignment, we need 16 */
+  if(posix_memalign(&p, sizeof(__m128i),sizeof(struct v615)))
+    return NULL;
+  vp = (struct v615 *)p;
+  /* Size is computed in size_t so that a large len cannot overflow int */
+  if((p = malloc(((size_t)len+14)*sizeof(decision_t))) == NULL){
+    free(vp);
+    return NULL;
+  }
+  vp->decisions = (decision_t *)p;
+  init_viterbi615_sse2(vp,0);
+  return vp;
+}
+/* Fill the six branch tables from the generator polynomials in polys[].
+ * For each of the 8192 table entries the value is 255 when the parity of
+ * (2*state & |poly|) is 1, else 0; a negative polynomial inverts that choice.
+ * Also marks the tables as initialized.
+ */
+void set_viterbi615_polynomial_sse2(int polys[6]){
+  int state;
+  int i;
+  for(state=0;state < 8192;state++){
+    for(i=0;i<6;i++)
+      Branchtab615[i].s[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 255 : 0;
+  }
+  Init++;
+}
+/* Viterbi chainback.
+ * Walks the stored decisions backwards from endstate and writes the decoded
+ * bits, 8 per byte, into data[]. Returns 0, or -1 if p is NULL.
+ */
+int chainback_viterbi615_sse2(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v615 *vp = p;
+  decision_t *d;
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->decisions;
+  endstate %= 16384;
+  d += 14; /* Look past tail */
+  while(nbits-- != 0){
+    int k;
+    /* Read the decision through the 16-bit view, which is the view the
+     * update routine stores through (d->s[i] holds states 16*i .. 16*i+15).
+     * This does not depend on how wide the elements of w[] are.
+     */
+    k = (d[nbits].s[endstate/16] >> (endstate%16)) & 1;
+    endstate = (k << 13) | (endstate >> 1);
+    /* The store into data[] only needs to be done every 8 bits.
+     * But this avoids a conditional branch, and the writes will
+     * combine in the cache anyway
+     */
+    data[nbits>>3] = endstate >> 6;
+  }
+  return 0;
+}
+/* Delete instance of a Viterbi decoder.
+ * Frees the decision buffer and the instance itself; a NULL p is ignored.
+ */
+void delete_viterbi615_sse2(void *p){
+  struct v615 *vp = p;
+  /* The check is needed because vp is dereferenced to reach the buffer */
+  if(vp != NULL){
+    free(vp->decisions);
+    free(vp);
+  }
+}
+/* Update the decoder with a block of demodulated symbols.
+ * syms supplies 6 symbols per data bit (offset binary, 0-255). One trellis
+ * step is run per bit; each step writes one decision record starting at the
+ * instance's current decision pointer, which is advanced past them on return.
+ * Returns 0, or -1 if p is NULL. A non-positive nbits does nothing.
+ */
+int update_viterbi615_blk_sse2(void *p,unsigned char *syms,int nbits){
+  struct v615 *vp = p;
+  decision_t *d;
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->dp;
+  while(nbits-- > 0){
+    __m128i sym0v,sym1v,sym2v,sym3v,sym4v,sym5v;
+    void *tmp;
+    int i;
+    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+    sym0v = _mm_set1_epi16(syms[0]);
+    sym1v = _mm_set1_epi16(syms[1]);
+    sym2v = _mm_set1_epi16(syms[2]);
+    sym3v = _mm_set1_epi16(syms[3]);
+    sym4v = _mm_set1_epi16(syms[4]);
+    sym5v = _mm_set1_epi16(syms[5]);
+    syms += 6;
+    /* SSE2 doesn't support saturated adds on unsigned shorts, so we have to use signed shorts */
+    for(i=0;i<1024;i++){
+      __m128i decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+      /* Form branch metrics
+       * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
+       * the XOR operations constitute conditional negation.
+       * metric and m_metric (-metric) are in the range 0-1530
+       */
+      m0 = _mm_add_epi16(_mm_xor_si128(Branchtab615[0].v[i],sym0v),_mm_xor_si128(Branchtab615[1].v[i],sym1v));
+      m1 = _mm_add_epi16(_mm_xor_si128(Branchtab615[2].v[i],sym2v),_mm_xor_si128(Branchtab615[3].v[i],sym3v));
+      m2 = _mm_add_epi16(_mm_xor_si128(Branchtab615[4].v[i],sym4v),_mm_xor_si128(Branchtab615[5].v[i],sym5v));
+      metric = _mm_add_epi16(m0,_mm_add_epi16(m1,m2));
+      m_metric = _mm_sub_epi16(_mm_set1_epi16(1530),metric);
+      /* Add branch metrics to path metrics */
+      m0 = _mm_adds_epi16(vp->old_metrics->v[i],metric);
+      m3 = _mm_adds_epi16(vp->old_metrics->v[1024+i],metric);
+      m1 = _mm_adds_epi16(vp->old_metrics->v[1024+i],m_metric);
+      m2 = _mm_adds_epi16(vp->old_metrics->v[i],m_metric);
+      /* Compare and select */
+      survivor0 = _mm_min_epi16(m0,m1);
+      survivor1 = _mm_min_epi16(m2,m3);
+      decision0 = _mm_cmpeq_epi16(survivor0,m1);
+      decision1 = _mm_cmpeq_epi16(survivor1,m3);
+      /* Pack each set of decisions into 8 8-bit bytes, then interleave them and compress into 16 bits */
+      d->s[i] = _mm_movemask_epi8(_mm_unpacklo_epi8(_mm_packs_epi16(decision0,_mm_setzero_si128()),_mm_packs_epi16(decision1,_mm_setzero_si128())));
+      /* Store surviving metrics */
+      vp->new_metrics->v[2*i] = _mm_unpacklo_epi16(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = _mm_unpackhi_epi16(survivor0,survivor1);
+    }
+    /* See if we need to renormalize
+     * Max metric spread for this code with 0-255 branch metrics is 12750
+     */
+    if(vp->new_metrics->s[0] >= SHRT_MAX-12750){
+      int i,adjust;
+      __m128i adjustv;
+      union { __m128i v; signed short w[8]; } t;
+      /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */
+      adjustv = vp->new_metrics->v[0];
+      for(i=1;i<2048;i++)
+        adjustv = _mm_min_epi16(adjustv,vp->new_metrics->v[i]);
+      /* Fold the 8 lanes down so that lane 0 holds the overall minimum */
+      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,8));
+      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,4));
+      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,2));
+      t.v = adjustv;
+      adjust = t.w[0] - SHRT_MIN;
+      adjustv = _mm_set1_epi16(adjust);
+      /* We cannot use a saturated subtract, because we often have to adjust by more than SHRT_MAX
+       * This is okay since it can't overflow anyway
+       */
+      for(i=0;i<2048;i++)
+        vp->new_metrics->v[i] = _mm_sub_epi16(vp->new_metrics->v[i],adjustv);
+    }
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d;
+  return 0;
+}
diff --git a/vtest27.c b/vtest27.c
new file mode 100644
index 0000000..7256483
--- /dev/null
+++ b/vtest27.c
@@ -0,0 +1,184 @@
+/* Test viterbi decoder speeds */
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <math.h>
+#include <memory.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <getopt.h>
+#include "fec.h"
+/* Long-option table for getopt_long(); each entry maps to the short option
+ * letter handled in main(). The all-zero entry terminates the table.
+ */
+struct option Options[] = {
+  {"frame-length",required_argument,NULL,'l'},
+  {"frame-count",required_argument,NULL,'n'},
+  {"ebn0",required_argument,NULL,'e'},
+  {"gain",required_argument,NULL,'g'},
+  {"verbose",no_argument,NULL,'v'},
+  {"force-altivec",no_argument,NULL,'a'},
+  {"force-port",no_argument,NULL,'p'},
+  {"force-mmx",no_argument,NULL,'m'},
+  {"force-sse",no_argument,NULL,'s'},
+  {"force-sse2",no_argument,NULL,'t'},
+  {NULL,0,NULL,0},
+};
+#define RATE (1./2.) /* Code rate, used to convert Eb/No to Es/No */
+#define MAXBYTES 10000 /* Largest frame, in bytes, the buffers in main() can hold */
+double Gain = 32.0; /* Gain argument handed to addnoise(); set with -g */
+int Verbose = 0; /* Incremented once per -v */
+int main(int argc,char *argv[]){
+ int i,d,tr;
+ int sr=0,trials = 10000,errcnt,framebits=2048;
+ long long int tot_errs=0;
+ unsigned char bits[MAXBYTES];
+ unsigned char data[MAXBYTES];
+ unsigned char xordata[MAXBYTES];
+ unsigned char symbols[8*2*(MAXBYTES+6)];
+ void *vp;
+ extern char *optarg;
+ struct rusage start,finish;
+ double extime;
+ double gain,esn0,ebn0;
+ time_t t;
+ int badframes=0;
+ time(&t);
+ srandom(t);
+ ebn0 = -100;
+ while((d = getopt_long(argc,argv,"l:n:te:g:vapmst",Options,NULL)) != EOF){
+ while((d = getopt(argc,argv,"l:n:te:g:vapmst")) != EOF){
+ switch(d){
+ case 'a':
+ Cpu_mode = ALTIVEC;
+ break;
+ case 'p':
+ Cpu_mode = PORT;
+ break;
+ case 'm':
+ Cpu_mode = MMX;
+ break;
+ case 's':
+ Cpu_mode = SSE;
+ break;
+ case 't':
+ Cpu_mode = SSE2;
+ break;
+ case 'l':
+ framebits = atoi(optarg);
+ break;
+ case 'n':
+ trials = atoi(optarg);
+ break;
+ case 'e':
+ ebn0 = atof(optarg);
+ break;
+ case 'g':
+ Gain = atof(optarg);
+ break;
+ case 'v':
+ Verbose++;
+ break;
+ }
+ }
+ if(framebits > 8*MAXBYTES){
+ fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8);
+ framebits = MAXBYTES*8;
+ }
+ if((vp = create_viterbi27(framebits)) == NULL){
+ printf("create_viterbi27 failed\n");
+ exit(1);
+ }
+ if(ebn0 != -100){
+ esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */
+ /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing
+ * only half the noise power, and the sqrt() converts power to
+ * voltage.
+ */
+ gain = 1./sqrt(0.5/pow(10.,esn0/10.));
+ printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+ for(tr=0;tr<trials;tr++){
+ /* Encode a frame of random data */
+ for(i=0;i<framebits+6;i++){
+ int bit = (i < framebits) ? (random() & 1) : 0;
+ sr = (sr << 1) | bit;
+ bits[i/8] = sr & 0xff;
+ symbols[2*i+0] = addnoise(parity(sr & V27POLYA),gain,Gain,127.5,255);
+ symbols[2*i+1] = addnoise(parity(sr & V27POLYB),gain,Gain,127.5,255);
+ }
+ /* Decode it and make sure we get the right answer */
+ /* Initialize Viterbi decoder */
+ init_viterbi27(vp,0);
+ /* Decode block */
+ update_viterbi27_blk(vp,symbols,framebits+6);
+ /* Do Viterbi chainback */
+ chainback_viterbi27(vp,data,framebits,0);
+ errcnt = 0;
+ for(i=0;i<framebits/8;i++){
+ int e = Bitcnt[xordata[i] = data[i] ^ bits[i]];
+ errcnt += e;
+ tot_errs += e;
+ }
+ if(errcnt != 0)
+ badframes++;
+ if(Verbose > 1 && errcnt != 0){
+ printf("frame %d, %d errors: ",tr,errcnt);
+ for(i=0;i<framebits/8;i++){
+ printf("%02x",xordata[i]);
+ }
+ printf("\n");
+ }
+ if(Verbose)
+ printf("BER %lld/%lld (%10.3g) FER %d/%d (%10.3g)\r",
+ tot_errs,(long long)framebits*(tr+1),tot_errs/((double)framebits*(tr+1)),
+ badframes,tr+1,(double)badframes/(tr+1));
+ fflush(stdout);
+ }
+ if(Verbose > 1)
+ printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+ else if(Verbose == 0)
+ printf("BER %lld/%lld (%.3g) FER %d/%d (%.3g)\n",
+ tot_errs,(long long)framebits*trials,tot_errs/((double)framebits*trials),
+ badframes,tr+1,(double)badframes/(tr+1));
+ else
+ printf("\n");
+ } else {
+ /* Do time trials */
+ memset(symbols,127,sizeof(symbols));
+ printf("Starting time trials\n");
+ getrusage(RUSAGE_SELF,&start);
+ for(tr=0;tr < trials;tr++){
+ /* Initialize Viterbi decoder */
+ init_viterbi27(vp,0);
+ /* Decode block */
+ update_viterbi27_blk(vp,symbols,framebits);
+ /* Do Viterbi chainback */
+ chainback_viterbi27(vp,data,framebits,0);
+ }
+ getrusage(RUSAGE_SELF,&finish);
+ extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec);
+ printf("Execution time for %d %d-bit frames: %.2f sec\n",trials,
+ framebits,extime);
+ printf("decoder speed: %g bits/s\n",trials*framebits/extime);
+ }
+ exit(0);
diff --git a/vtest29.c b/vtest29.c
new file mode 100644
index 0000000..8471b54
--- /dev/null
+++ b/vtest29.c
@@ -0,0 +1,185 @@
+/* Test viterbi decoder speeds */
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <math.h>
+#include <memory.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <getopt.h>
+#include "fec.h"
+/* Long-option table for getopt_long(); each entry maps to the short option
+ * letter handled in main(). The all-zero entry terminates the table.
+ */
+struct option Options[] = {
+  {"frame-length",required_argument,NULL,'l'},
+  {"frame-count",required_argument,NULL,'n'},
+  {"ebn0",required_argument,NULL,'e'},
+  {"gain",required_argument,NULL,'g'},
+  {"verbose",no_argument,NULL,'v'},
+  {"force-altivec",no_argument,NULL,'a'},
+  {"force-port",no_argument,NULL,'p'},
+  {"force-mmx",no_argument,NULL,'m'},
+  {"force-sse",no_argument,NULL,'s'},
+  {"force-sse2",no_argument,NULL,'t'},
+  {NULL,0,NULL,0},
+};
+#define RATE (1./2.) /* Code rate, used to convert Eb/No to Es/No */
+#define MAXBYTES 10000 /* Largest frame, in bytes, the buffers in main() can hold */
+double Gain = 32.0; /* Gain argument handed to addnoise(); set with -g */
+int Verbose = 0; /* Incremented once per -v */
+int main(int argc,char *argv[]){
+ int i,d,tr;
+ int sr=0,trials = 10000,errcnt,framebits=2048;
+ long long tot_errs=0;
+ unsigned char bits[MAXBYTES];
+ unsigned char data[MAXBYTES];
+ unsigned char xordata[MAXBYTES];
+ unsigned char symbols[8*2*(MAXBYTES+8)];
+ void *vp;
+ extern char *optarg;
+ struct rusage start,finish;
+ double extime;
+ double gain,esn0,ebn0;
+ time_t t;
+ int badframes=0;
+ time(&t);
+ srandom(t);
+ ebn0 = -100;
+ while((d = getopt_long(argc,argv,"l:n:te:g:vapmst",Options,NULL)) != EOF){
+ while((d = getopt(argc,argv,"l:n:te:g:vapmst")) != EOF){
+ switch(d){
+ case 'a':
+ Cpu_mode = ALTIVEC;
+ break;
+ case 'p':
+ Cpu_mode = PORT;
+ break;
+ case 'm':
+ Cpu_mode = MMX;
+ break;
+ case 's':
+ Cpu_mode = SSE;
+ break;
+ case 't':
+ Cpu_mode = SSE2;
+ break;
+ case 'l':
+ framebits = atoi(optarg);
+ break;
+ case 'n':
+ trials = atoi(optarg);
+ break;
+ case 'e':
+ ebn0 = atof(optarg);
+ break;
+ case 'g':
+ Gain = atof(optarg);
+ break;
+ case 'v':
+ Verbose++;
+ break;
+ }
+ }
+ if(framebits > 8*MAXBYTES){
+ fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8);
+ framebits = MAXBYTES*8;
+ }
+ if((vp = create_viterbi29(framebits)) == NULL){
+ printf("create_viterbi29 failed\n");
+ exit(1);
+ }
+ if(ebn0 != -100){
+ esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */
+ /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing
+ * only half the noise power, and the sqrt() converts power to
+ * voltage.
+ */
+ gain = 1./sqrt(0.5/pow(10.,esn0/10.));
+ printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+ for(tr=0;tr<trials;tr++){
+ /* Encode a frame of random data */
+ for(i=0;i<framebits+8;i++){
+ int bit = (i < framebits) ? (random() & 1) : 0;
+ sr = (sr << 1) | bit;
+ bits[i/8] = sr & 0xff;
+ symbols[2*i+0] = addnoise(parity(sr & V29POLYA),gain,Gain,127.5,255);
+ symbols[2*i+1] = addnoise(parity(sr & V29POLYB),gain,Gain,127.5,255);
+ }
+ /* Decode it and make sure we get the right answer */
+ /* Initialize Viterbi decoder */
+ init_viterbi29(vp,0);
+ /* Decode block */
+ update_viterbi29_blk(vp,symbols,framebits+8);
+ /* Do Viterbi chainback */
+ chainback_viterbi29(vp,data,framebits,0);
+ errcnt = 0;
+ for(i=0;i<framebits/8;i++){
+ int e = Bitcnt[xordata[i] = data[i] ^ bits[i]];
+ errcnt += e;
+ tot_errs += e;
+ }
+ if(errcnt != 0)
+ badframes++;
+ if(Verbose > 1 && errcnt != 0){
+ printf("frame %d, %d errors: ",tr,errcnt);
+ for(i=0;i<framebits/8;i++){
+ printf("%02x",xordata[i]);
+ }
+ printf("\n");
+ }
+ if(Verbose)
+ printf("BER %lld/%lld (%10.3g) FER %d/%d (%10.3g)\r",
+ tot_errs,(long long)framebits*(tr+1),tot_errs/((double)framebits*(tr+1)),
+ badframes,tr+1,(double)badframes/(tr+1));
+ fflush(stdout);
+ }
+ if(Verbose > 1)
+ printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+ else if(Verbose == 0)
+ printf("BER %lld/%lld (%.3g) FER %d/%d (%.3g)\n",
+ tot_errs,(long long)framebits*trials,tot_errs/((double)framebits*trials),
+ badframes,tr+1,(double)badframes/(tr+1));
+ else
+ printf("\n");
+ } else {
+ /* Do time trials */
+ memset(symbols,127,sizeof(symbols));
+ printf("Starting time trials\n");
+ getrusage(RUSAGE_SELF,&start);
+ for(tr=0;tr < trials;tr++){
+ /* Initialize Viterbi decoder */
+ init_viterbi29(vp,0);
+ /* Decode block */
+ update_viterbi29_blk(vp,symbols,framebits);
+ /* Do Viterbi chainback */
+ chainback_viterbi29(vp,data,framebits,0);
+ }
+ getrusage(RUSAGE_SELF,&finish);
+ extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec);
+ printf("Execution time for %d %d-bit frames: %.2f sec\n",trials,
+ framebits,extime);
+ printf("decoder speed: %g bits/s\n",trials*framebits/extime);
+ }
+ exit(0);
diff --git a/vtest39.c b/vtest39.c
new file mode 100644
index 0000000..76723b2
--- /dev/null
+++ b/vtest39.c
@@ -0,0 +1,186 @@
+/* Test viterbi decoder speeds */
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <math.h>
+#include <memory.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <getopt.h>
+#include "fec.h"
+/* Long-option table for getopt_long(); each entry maps to the short option
+ * letter handled in main(). The all-zero entry terminates the table.
+ */
+struct option Options[] = {
+  {"frame-length",required_argument,NULL,'l'},
+  {"frame-count",required_argument,NULL,'n'},
+  {"ebn0",required_argument,NULL,'e'},
+  {"gain",required_argument,NULL,'g'},
+  {"verbose",no_argument,NULL,'v'},
+  {"force-altivec",no_argument,NULL,'a'},
+  {"force-port",no_argument,NULL,'p'},
+  {"force-mmx",no_argument,NULL,'m'},
+  {"force-sse",no_argument,NULL,'s'},
+  {"force-sse2",no_argument,NULL,'t'},
+  {NULL,0,NULL,0},
+};
+#define RATE (1./3.) /* Code rate, used to convert Eb/No to Es/No */
+#define MAXBYTES 10000 /* Largest frame, in bytes, the buffers in main() can hold */
+double Gain = 32.0; /* Gain argument handed to addnoise(); set with -g */
+int Verbose = 0; /* Incremented once per -v */
+int main(int argc,char *argv[]){
+ int i,d,tr;
+ int sr=0,trials = 10000,errcnt,framebits=2048;
+ long long tot_errs=0;
+ unsigned char bits[MAXBYTES];
+ unsigned char data[MAXBYTES];
+ unsigned char xordata[MAXBYTES];
+ unsigned char symbols[8*3*(MAXBYTES+8)];
+ void *vp;
+ extern char *optarg;
+ struct rusage start,finish;
+ double extime;
+ double gain,esn0,ebn0;
+ time_t t;
+ int badframes=0;
+ time(&t);
+ srandom(t);
+ ebn0 = -100;
+ while((d = getopt_long(argc,argv,"l:n:te:g:vapmst",Options,NULL)) != EOF){
+ while((d = getopt(argc,argv,"l:n:te:g:vapmst")) != EOF){
+ switch(d){
+ case 'a':
+ Cpu_mode = ALTIVEC;
+ break;
+ case 'p':
+ Cpu_mode = PORT;
+ break;
+ case 'm':
+ Cpu_mode = MMX;
+ break;
+ case 's':
+ Cpu_mode = SSE;
+ break;
+ case 't':
+ Cpu_mode = SSE2;
+ break;
+ case 'l':
+ framebits = atoi(optarg);
+ break;
+ case 'n':
+ trials = atoi(optarg);
+ break;
+ case 'e':
+ ebn0 = atof(optarg);
+ break;
+ case 'g':
+ Gain = atof(optarg);
+ break;
+ case 'v':
+ Verbose++;
+ break;
+ }
+ }
+ if(framebits > 8*MAXBYTES){
+ fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8);
+ framebits = MAXBYTES*8;
+ }
+ if((vp = create_viterbi39(framebits)) == NULL){
+ printf("create_viterbi39 failed\n");
+ exit(1);
+ }
+ if(ebn0 != -100){
+ esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */
+ /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing
+ * only half the noise power, and the sqrt() converts power to
+ * voltage.
+ */
+ gain = 1./sqrt(0.5/pow(10.,esn0/10.));
+ printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+ for(tr=0;tr<trials;tr++){
+ /* Encode a frame of random data */
+ for(i=0;i<framebits+8;i++){
+ int bit = (i < framebits) ? (random() & 1) : 0;
+ sr = (sr << 1) | bit;
+ bits[i/8] = sr & 0xff;
+ symbols[3*i+0] = addnoise(parity(sr & V39POLYA),gain,Gain,127.5,255);
+ symbols[3*i+1] = addnoise(parity(sr & V39POLYB),gain,Gain,127.5,255);
+ symbols[3*i+2] = addnoise(parity(sr & V39POLYC),gain,Gain,127.5,255);
+ }
+ /* Decode it and make sure we get the right answer */
+ /* Initialize Viterbi decoder */
+ init_viterbi39(vp,0);
+ /* Decode block */
+ update_viterbi39_blk(vp,symbols,framebits+8);
+ /* Do Viterbi chainback */
+ chainback_viterbi39(vp,data,framebits,0);
+ errcnt = 0;
+ for(i=0;i<framebits/8;i++){
+ int e = Bitcnt[xordata[i] = data[i] ^ bits[i]];
+ errcnt += e;
+ tot_errs += e;
+ }
+ if(errcnt != 0)
+ badframes++;
+ if(Verbose > 1 && errcnt != 0){
+ printf("frame %d, %d errors: ",tr,errcnt);
+ for(i=0;i<framebits/8;i++){
+ printf("%02x",xordata[i]);
+ }
+ printf("\n");
+ }
+ if(Verbose)
+ printf("BER %lld/%lld (%10.3g) FER %d/%d (%10.3g)\r",
+ tot_errs,(long long)framebits*(tr+1),tot_errs/((double)framebits*(tr+1)),
+ badframes,tr+1,(double)badframes/(tr+1));
+ fflush(stdout);
+ }
+ if(Verbose > 1)
+ printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+ else if(Verbose == 0)
+ printf("BER %lld/%lld (%.3g) FER %d/%d (%.3g)\n",
+ tot_errs,(long long)framebits*trials,tot_errs/((double)framebits*trials),
+ badframes,tr+1,(double)badframes/(tr+1));
+ else
+ printf("\n");
+ } else {
+ /* Do time trials */
+ memset(symbols,127,sizeof(symbols));
+ printf("Starting time trials\n");
+ getrusage(RUSAGE_SELF,&start);
+ for(tr=0;tr < trials;tr++){
+ /* Initialize Viterbi decoder */
+ init_viterbi39(vp,0);
+ /* Decode block */
+ update_viterbi39_blk(vp,symbols,framebits);
+ /* Do Viterbi chainback */
+ chainback_viterbi39(vp,data,framebits,0);
+ }
+ getrusage(RUSAGE_SELF,&finish);
+ extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec);
+ printf("Execution time for %d %d-bit frames: %.2f sec\n",trials,
+ framebits,extime);
+ printf("decoder speed: %g bits/s\n",trials*framebits/extime);
+ }
+ exit(0);
diff --git a/vtest615.c b/vtest615.c
new file mode 100644
index 0000000..4bd8c4f
--- /dev/null
+++ b/vtest615.c
@@ -0,0 +1,191 @@
+/* Test viterbi decoder speeds */
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <math.h>
+#include <memory.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <getopt.h>
+#include "fec.h"
+/* Long-option table for getopt_long(); each entry maps to the short option
+ * letter handled in main(). The all-zero entry terminates the table.
+ */
+struct option Options[] = {
+  {"frame-length",required_argument,NULL,'l'},
+  {"frame-count",required_argument,NULL,'n'},
+  {"ebn0",required_argument,NULL,'e'},
+  {"gain",required_argument,NULL,'g'},
+  {"verbose",no_argument,NULL,'v'},
+  {"force-altivec",no_argument,NULL,'a'},
+  {"force-port",no_argument,NULL,'p'},
+  {"force-mmx",no_argument,NULL,'m'},
+  {"force-sse",no_argument,NULL,'s'},
+  {"force-sse2",no_argument,NULL,'t'},
+  {NULL,0,NULL,0},
+};
+#define RATE (1./6.) /* Code rate, used to convert Eb/No to Es/No */
+#define MAXBYTES 10000 /* Largest frame, in bytes, the buffers in main() can hold */
+#define OFFSET (127.5) /* Offset argument handed to addnoise() */
+#define CLIP 255 /* Clip argument handed to addnoise() */
+double Gain = 24.0; /* Gain argument handed to addnoise(); set with -g */
+int Verbose = 0; /* Incremented once per -v */
+int main(int argc,char *argv[]){
+ int i,d,tr;
+ int sr=0,trials = 10,errcnt,framebits=2048;
+ int tot_errs=0;
+ unsigned char bits[MAXBYTES];
+ unsigned char data[MAXBYTES];
+ unsigned char xordata[MAXBYTES];
+ unsigned char symbols[8*6*(MAXBYTES+14)];
+ void *vp;
+ extern char *optarg;
+ struct rusage start,finish;
+ double extime;
+ double gain,esn0,ebn0;
+ time_t t;
+ int badframes=0;
+ time(&t);
+ srandom(t);
+ ebn0 = -100;
+ while((d = getopt_long(argc,argv,"l:n:te:g:vapmst",Options,NULL)) != EOF){
+ while((d = getopt(argc,argv,"l:n:te:g:vapmst")) != EOF){
+ switch(d){
+ case 'a':
+ Cpu_mode = ALTIVEC;
+ break;
+ case 'p':
+ Cpu_mode = PORT;
+ break;
+ case 'm':
+ Cpu_mode = MMX;
+ break;
+ case 's':
+ Cpu_mode = SSE;
+ break;
+ case 't':
+ Cpu_mode = SSE2;
+ break;
+ case 'l':
+ framebits = atoi(optarg);
+ break;
+ case 'n':
+ trials = atoi(optarg);
+ break;
+ case 'e':
+ ebn0 = atof(optarg);
+ break;
+ case 'g':
+ Gain = atof(optarg);
+ break;
+ case 'v':
+ Verbose++;
+ break;
+ }
+ }
+ if(framebits > 8*MAXBYTES){
+ fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8);
+ framebits = MAXBYTES*8;
+ }
+ if((vp = create_viterbi615(framebits)) == NULL){
+ printf("create_viterbi615 failed\n");
+ exit(1);
+ }
+ if(ebn0 != -100){
+ esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */
+ /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing
+ * only half the noise power, and the sqrt() converts power to
+ * voltage.
+ */
+ gain = 1./sqrt(0.5/pow(10.,esn0/10.));
+ printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+ for(tr=0;tr<trials;tr++){
+ /* Encode a frame of random data */
+ for(i=0;i<framebits+14;i++){
+ int bit = (i < framebits) ? (random() & 1) : 0;
+ sr = (sr << 1) | bit;
+ bits[i/8] = sr & 0xff;
+ symbols[6*i+0] = addnoise(parity(sr & V615POLYA),gain,Gain,OFFSET,CLIP);
+ symbols[6*i+1] = addnoise(parity(sr & V615POLYB),gain,Gain,OFFSET,CLIP);
+ symbols[6*i+2] = addnoise(parity(sr & V615POLYC),gain,Gain,OFFSET,CLIP);
+ symbols[6*i+3] = addnoise(parity(sr & V615POLYD),gain,Gain,OFFSET,CLIP);
+ symbols[6*i+4] = addnoise(parity(sr & V615POLYE),gain,Gain,OFFSET,CLIP);
+ symbols[6*i+5] = addnoise(parity(sr & V615POLYF),gain,Gain,OFFSET,CLIP);
+ }
+ /* Decode it and make sure we get the right answer */
+ /* Initialize Viterbi decoder */
+ init_viterbi615(vp,0);
+ /* Decode block */
+ update_viterbi615_blk(vp,symbols,framebits+14);
+ /* Do Viterbi chainback */
+ chainback_viterbi615(vp,data,framebits,0);
+ errcnt = 0;
+ for(i=0;i<framebits/8;i++){
+ int e = Bitcnt[xordata[i] = data[i] ^ bits[i]];
+ errcnt += e;
+ tot_errs += e;
+ }
+ if(errcnt != 0)
+ badframes++;
+ if(Verbose > 1 && errcnt != 0){
+ printf("frame %d, %d errors: ",tr,errcnt);
+ for(i=0;i<framebits/8;i++){
+ printf("%02x",xordata[i]);
+ }
+ printf("\n");
+ }
+ if(Verbose)
+ printf("BER %d/%d (%10.3g) FER %d/%d (%10.3g)\r",
+ tot_errs,framebits*(tr+1),tot_errs/((double)framebits*(tr+1)),
+ badframes,(tr+1),(double)badframes/(tr+1));
+ fflush(stdout);
+ }
+ if(Verbose > 1)
+ printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+ else if(Verbose == 0)
+ printf("BER %d/%d (%.3g) FER %d/%d (%.3g)\n",
+ tot_errs,framebits*(tr+1),tot_errs/((double)framebits*(tr+1)),
+ badframes,(tr+1),(double)badframes/(tr+1));
+ else
+ printf("\n");
+ } else {
+ /* Do time trials */
+ memset(symbols,127,sizeof(symbols));
+ printf("Starting time trials\n");
+ getrusage(RUSAGE_SELF,&start);
+ for(tr=0;tr < trials;tr++){
+ /* Initialize Viterbi decoder */
+ init_viterbi615(vp,0);
+ /* Decode block */
+ update_viterbi615_blk(vp,symbols,framebits+14);
+ /* Do Viterbi chainback */
+ chainback_viterbi615(vp,data,framebits,0);
+ }
+ getrusage(RUSAGE_SELF,&finish);
+ extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec);
+ printf("Execution time for %d %d-bit frames: %.2f sec\n",trials,
+ framebits,extime);
+ printf("decoder speed: %g bits/s\n",trials*framebits/extime);
+ }
+ exit(0);