107 files changed, 11792 insertions, 4 deletions
diff --git a/INSTALL b/INSTALL
new file mode 100644
index 0000000..e9e5baf
--- /dev/null
+++ b/INSTALL
@@ -0,0 +1,39 @@
+INSTALLATION INSTRUCTIONS
+
+To build and install the libfec libraries, simply say
+
+./configure
+make
+make test (optional)
+make install (as root)
+
+By default, "make install" puts the libfec libraries in
+/usr/local/lib, the include files in /usr/local/include, and the
+manual page in /usr/local/man.
+
+If the build fails with assembler errors, you may have an old version
+of the GNU assembler that cannot handle the relatively new SSE2
+mnemonics. If so, update your version of the GNU "binutils" package.
+
+You may obtain the latest binutils package through your normal
+distribution channels or from:
+
+http://sources.redhat.com/binutils/
+
+TESTING THE FEC LIBRARY
+
+After running the ./configure script, optional tests can be built and
+run as follows:
+
+make test
+
+"make test" tests each routine, using the SIMD versions as
+appropriate, verifying correct operation and estimating Viterbi
+decoding speeds. These tests should always succeed unless something is
+broken.
+
+28 Mar 2004
+Phil Karn, karn@ka9q.net
+
+
+
diff --git a/README b/README
new file mode 100644
index 0000000..95253e2
--- /dev/null
+++ b/README
@@ -0,0 +1,120 @@
+COPYRIGHT
+
+This package is copyright 2006 by Phil Karn, KA9Q. It may be used
+under the terms of the GNU Lesser General Public License (LGPL). See
+the file "lesser.txt" in this package for license details.
+
+INTRODUCTION
+
+This package provides a set of functions that implement several
+popular forward error correction (FEC) algorithms and several low-level routines
+useful in modems implemented with digital signal processing (DSP).
+
+The following routines are provided:
+
+1. Viterbi decoders for the following convolutional codes:
+
+r=1/2 k=7 ("Voyager" code, now a widely used industry standard)
+r=1/2 k=9 (Used on the IS-95 CDMA forward link)
+r=1/6 k=15 ("Cassini" code, used by several NASA/JPL deep space missions)
+
+2. Reed-Solomon encoders and decoders for any user-specified code.
+
+3. Optimized encoder and decoder for the CCSDS-standard (255,223)
+Reed-Solomon code, with and without the CCSDS-standard "dual basis"
+symbol representation.
+
+4. Compute dot product between a 16-bit buffer and a set of 16-bit
+coefficients. This is the basic DSP primitive for digital filtering
+and correlation.
+
+5. Compute sum of squares of a buffer of 16-bit signed integers. This is
+useful in DSP for finding the total energy in a signal.
+
+6. Find peak value in a buffer of 16-bit signed integers, useful for
+scaling a signal to prevent overflow.
+
+SIMD SUPPORT
+
+This package automatically makes use of various SIMD (Single
+Instruction stream, Multiple Data stream) instruction sets, when
+available: MMX, SSE and SSE2 on the IA-32 (Intel) architecture, and
+Altivec on the PowerPC G4 and G5 used by Power Macintoshes.
+
+"Altivec" is a Motorola trademark; Apple calls it "Velocity Engine",
+and IBM calls it "VMX". Altivec is roughly comparable to SSE2 on the
+IA-32.
+
+Many of the SIMD versions run more than an order of
+magnitude faster than their portable C versions. The available SIMD
+instruction sets, if any, are determined at run time and the proper
+version of each routine is automatically selected. If no SIMD
+instructions are available, the portable C version is invoked by
+default. On targets other than IA-32 and PPC, only the portable C
+version is built.
+
+The SIMD-assisted versions generally produce the same results as the C
+versions, with a few minor exceptions. The Viterbi decoders in C have
+a very slightly greater Eb/No performance due to their use of 32-bit
+path metrics. On the other hand, the SIMD versions use the
+"saturating" arithmetic available in these instructions to avoid the
+integer wraparounds that can occur in C when argument ranges are not
+properly constrained. This applies primarily to the "dotprod" (dot
+product) function.
+
+The MMX (MultiMedia eXtensions) instruction set was introduced on
+later Pentium CPUs; it is also implemented on the Pentium II and most
+AMD CPUs starting with the K6. SSE (Streaming SIMD Extensions) was
+introduced in the Pentium III; AMD calls it "3D Now! Professional".
+Intel introduced SSE2 on the Pentium 4, and it has been picked up by
+later AMD CPUs. SSE support implies MMX support, while SSE2 support
+implies both SSE and MMX support.
+
+The latest IA-32 SIMD instruction set, SSE3 (also known as "Prescott
+New Instructions") was introduced in early 2004 with the latest
+("Prescott") revision of the Pentium 4. Relatively little was
+introduced with SSE3, and this library currently makes no use of it.
+
+See the various manual pages for details on how to use the library
+routines.
+
+Copyright 2006, Phil Karn, KA9Q
+karn@ka9q.net
+http://www.ka9q.net/
+
+This software may be used under the terms of the GNU Lesser General
+Public License (LGPL); see the file lesser.txt for details.
+
+Revision history:
+Version 1.0 released 29 May 2001
+
+Version 2.0 released 3 Dec 2001:
+Restructured to add support for shared libraries.
+
+Version 2.0.1 released 8 Dec 2001:
+Includes autoconf/configure script
+
+Version 2.0.2 released 4 Feb 2002:
+Add SIMD version override options
+Test for lack of SSE2 mnemonic support in 'as'
+Build only selected version
+
+Version 2.0.3 released 6 Feb 2002:
+Fix to parityb function in parity.h
+
+feclib version 1.0 released November 2003
+Merged SIMD-Viterbi, RS and DSP libraries
+Changed SIMD Viterbi decoder to detect SSE2/SSE/MMX at runtime rather than build time
+
+feclib version 2.0 (unreleased) Mar 2004
+General speedups and cleanups
+Switch from 4 to 8-bit input symbols on all Viterbi decoders
+Support for Altivec on PowerPC
+Support for k=15 r=1/6 Cassini/Mars Pathfinder/Mars Exploration Rover/STEREO code
+Changed license to GNU Lesser General Public License (LGPL)
+
+feclib version 2.1 June 5 2006
+Added error checking, fixed alignment bug in SSE2 versions of Viterbi decoders causing segfaults
+
+feclib version 2.1.1 June 6 2006
+Fix test/benchmark time measurement on Linux
diff --git a/README.md b/README.md
deleted file mode 100644
index fdafed0..0000000
--- a/README.md
+++ /dev/null
@@ -1,4 +0,0 @@
-ka9q-fec
-========
-
-This is a fork of KA9Q's FEC library
diff --git a/ccsds.h b/ccsds.h
new file mode 100644
index 0000000..ae65468
--- /dev/null
+++ b/ccsds.h
@@ -0,0 +1,5 @@
+typedef unsigned char data_t; /* one Reed-Solomon symbol */
+extern unsigned char Taltab[],Tal1tab[]; /* basis conversion tables: Taltab = conventional->dual, Tal1tab = dual->conventional */
+#define NN 255 /* symbols per block; sizes the work buffer in decode_rs_ccsds() */
+#define NROOTS 32 /* generator polynomial roots, i.e. parity symbols per block */
+
diff --git a/char.h b/char.h
new file mode 100644
index 0000000..25efd65
--- /dev/null
+++ b/char.h
@@ -0,0 +1,24 @@
+/* Stuff specific to the 8-bit symbol version of the general purpose RS codecs
+ *
+ * Copyright 2003, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+typedef unsigned char data_t; /* one symbol */
+/* Every macro below expands to an access through a variable named rs, which must be in scope at the point of use */
+#define MODNN(x) modnn(rs,x) /* modnn() is declared elsewhere; NOTE(review): presumably rs-common.h - confirm */
+
+#define MM (rs->mm)
+#define NN (rs->nn)
+#define ALPHA_TO (rs->alpha_to) 
+#define INDEX_OF (rs->index_of)
+#define GENPOLY (rs->genpoly)
+#define NROOTS (rs->nroots)
+#define FCR (rs->fcr)
+#define PRIM (rs->prim)
+#define IPRIM (rs->iprim)
+#define PAD (rs->pad)
+#define A0 (NN) /* value the decoder uses for the zero element in index (log) form */
+
+
+
+
diff --git a/configure.in b/configure.in
new file mode 100644
index 0000000..4e4110b
--- /dev/null
+++ b/configure.in
@@ -0,0 +1,83 @@
+dnl Process this file with autoconf to produce a configure script.
+AC_INIT(viterbi27.c)
+AC_CONFIG_HEADER(config.h)
+SO_NAME=3
+VERSION=3.0.0
+AC_SUBST(SO_NAME)
+AC_SUBST(VERSION)
+
+dnl Checks for programs. GCC is quoted so that an empty value cannot break the test.
+AC_PROG_CC
+if test "$GCC" != "yes"
+then
+	AC_MSG_ERROR([Need GNU C compiler])
+fi
+dnl Checks for libraries.
+AC_CHECK_LIB(c, malloc)
+
+dnl Checks for header files. Each result is recorded in the ac_cv_header_<name>_h cache variable.
+AC_CHECK_HEADERS(getopt.h stdio.h stdlib.h memory.h string.h)
+if test "$ac_cv_header_stdio_h" != yes
+then
+	AC_MSG_ERROR([Need stdio.h!])
+fi
+if test "$ac_cv_header_stdlib_h" != yes
+then
+	AC_MSG_ERROR([Need stdlib.h!])
+fi
+if test "$ac_cv_header_memory_h" != yes
+then
+	AC_MSG_ERROR([Need memory.h!])
+fi
+if test "$ac_cv_header_string_h" != yes
+then
+	AC_MSG_ERROR([Need string.h])
+fi
+dnl Select the per-architecture SIMD objects and compiler flags; other CPUs get no extra objects.
+AC_CANONICAL_SYSTEM
+case $target_cpu in
+i386|i486|i586|i686)
+	ARCH_OPTION="-march=$target_cpu"
+	MLIBS="viterbi27_mmx.o mmxbfly27.o viterbi27_sse.o ssebfly27.o viterbi27_sse2.o sse2bfly27.o \
+	viterbi29_mmx.o mmxbfly29.o viterbi29_sse.o ssebfly29.o viterbi29_sse2.o sse2bfly29.o \
+	viterbi39_sse2.o viterbi39_sse.o viterbi39_mmx.o \
+	viterbi615_mmx.o viterbi615_sse.o viterbi615_sse2.o \
+	dotprod_mmx.o dotprod_mmx_assist.o \
+	dotprod_sse2.o dotprod_sse2_assist.o \
+	peakval_mmx.o peakval_mmx_assist.o \
+	peakval_sse.o peakval_sse_assist.o \
+	peakval_sse2.o peakval_sse2_assist.o \
+	sumsq.o sumsq_port.o \
+	sumsq_sse2.o sumsq_sse2_assist.o \
+	sumsq_mmx.o sumsq_mmx_assist.o \
+	cpu_features.o cpu_mode_x86.o"
+	;;
+powerpc*)
+	ARCH_OPTION="-fno-common -faltivec"
+	MLIBS="viterbi27_av.o viterbi29_av.o viterbi39_av.o viterbi615_av.o \
+	encode_rs_av.o \
+	dotprod_av.o sumsq_av.o peakval_av.o cpu_mode_ppc.o"
+	;;
+*)
+	MLIBS=
+esac
+case $target_os in
+darwin*)
+	SH_LIB=libfec.dylib
+	REBIND=""
+	;;
+*)
+	SH_LIB=libfec.so
+	REBIND=ldconfig
+	;;
+esac
+AC_SUBST(SH_LIB)
+AC_SUBST(REBIND)
+AC_SUBST(MLIBS)
+AC_SUBST(ARCH_OPTION)
+
+
+dnl Checks for library functions.
+AC_CHECK_FUNCS(getopt_long memset memmove)
+
+AC_OUTPUT(makefile)
diff --git a/cpu_features.s b/cpu_features.s
new file mode 100644
index 0000000..ef4ba4e
--- /dev/null
+++ b/cpu_features.s
@@ -0,0 +1,15 @@
+.text /* cpu_features(): runs CPUID with EAX=1 and returns the resulting EDX value in EAX */
+.global cpu_features
+	.type cpu_features,@function
+cpu_features:	
+	pushl %ebx /* preserve the registers CPUID overwrites */
+	pushl %ecx
+	pushl %edx
+	movl $1,%eax /* CPUID function 1 */
+	cpuid
+	movl %edx,%eax /* return value; find_cpu_mode() tests bits 23, 25 and 26 of it */
+	popl %edx
+	popl %ecx
+	popl %ebx
+	ret
+	
+\ No newline at end of file
diff --git a/cpu_mode_ppc.c b/cpu_mode_ppc.c
new file mode 100644
index 0000000..0071558
--- /dev/null
+++ b/cpu_mode_ppc.c
@@ -0,0 +1,40 @@
+/* Determine CPU support for SIMD on Power PC
+ * Copyright 2004 Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include "fec.h"
+#ifdef __VEC__
+#include <sys/sysctl.h>
+#endif
+
+/* Various SIMD instruction set names; find_cpu_mode() prints Cpu_modes[Cpu_mode]. NOTE(review): order presumably matches enum cpu_mode in fec.h - confirm */
+char *Cpu_modes[] = {"Unknown","Portable C","x86 Multi Media Extensions (MMX)",
+		   "x86 Streaming SIMD Extensions (SSE)",
+		   "x86 Streaming SIMD Extensions 2 (SSE2)",
+		   "PowerPC G4/G5 Altivec/Velocity Engine"};
+/* Detected SIMD mode; find_cpu_mode() returns early once this is no longer UNKNOWN */
+enum cpu_mode Cpu_mode;
+
+/* Work out once whether Altivec is usable, store the answer in Cpu_mode and log it to stderr. */
+void find_cpu_mode(void){
+  if(Cpu_mode != UNKNOWN)
+    return; /* already decided by an earlier call */
+
+#ifdef __VEC__
+  {
+    /* Query the CTL_HW / HW_VECTORUNIT sysctl to learn whether a vector unit is present */
+    int mib[2] = { CTL_HW, HW_VECTORUNIT };
+    int have_vec = 0;
+    size_t len = sizeof(have_vec);
+
+    if(sysctl(mib, 2, &have_vec, &len, NULL, 0) == 0 && have_vec != 0)
+      Cpu_mode = ALTIVEC;
+    else
+      Cpu_mode = PORT; /* query failed or no vector unit reported */
+  }
+#else
+  /* __VEC__ is not defined for this build, so only the portable C code applies */
+  Cpu_mode = PORT;
+#endif
+  fprintf(stderr,"SIMD CPU detect: %s\n",Cpu_modes[Cpu_mode]);
+}
diff --git a/cpu_mode_x86.c b/cpu_mode_x86.c
new file mode 100644
index 0000000..322018e
--- /dev/null
+++ b/cpu_mode_x86.c
@@ -0,0 +1,33 @@
+/* Determine CPU support for SIMD
+ * Copyright 2004 Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include "fec.h"
+
+/* Various SIMD instruction set names; find_cpu_mode() prints Cpu_modes[Cpu_mode]. NOTE(review): order presumably matches enum cpu_mode in fec.h - confirm */
+char *Cpu_modes[] = {"Unknown","Portable C","x86 Multi Media Extensions (MMX)",
+		   "x86 Streaming SIMD Extensions (SSE)",
+		   "x86 Streaming SIMD Extensions 2 (SSE2)",
+		   "PowerPC G4/G5 Altivec/Velocity Engine"};
+/* Detected SIMD mode; find_cpu_mode() returns early once this is no longer UNKNOWN */
+enum cpu_mode Cpu_mode;
+
+/* Pick the best SIMD instruction set this CPU reports, store it in Cpu_mode and log it to stderr. */
+void find_cpu_mode(void){
+  int features;
+
+  if(Cpu_mode != UNKNOWN)
+    return; /* already decided by an earlier call */
+
+  /* cpu_features() hands back a bit mask; test from the most to the least capable set */
+  features = cpu_features();
+  if((features & (1<<26)) != 0)
+    Cpu_mode = SSE2;
+  else if((features & (1<<25)) != 0)
+    Cpu_mode = SSE;
+  else if((features & (1<<23)) != 0)
+    Cpu_mode = MMX;
+  else
+    Cpu_mode = PORT; /* no SIMD at all */
+  fprintf(stderr,"SIMD CPU detect: %s\n",Cpu_modes[Cpu_mode]);
+}
diff --git a/decode_rs.c b/decode_rs.c
new file mode 100644
index 0000000..d7f97b3
--- /dev/null
+++ b/decode_rs.c
@@ -0,0 +1,262 @@
+/* Reed-Solomon decoder
+ * Copyright 2002 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+
+#ifdef DEBUG
+#include <stdio.h>
+#endif
+
+#include <string.h>
+
+#define NULL ((void *)0)
+#define	min(a,b)	((a) < (b) ? (a) : (b))
+
+#ifdef FIXED
+#include "fixed.h"
+#elif defined(BIGSYM)
+#include "int.h"
+#else
+#include "char.h"
+#endif
+
+/* Reed-Solomon errors-and-erasures decoder; data[] is corrected in place.
+ * eras_pos[0..no_eras-1] gives the known erasure locations; it may be NULL when no_eras is 0.
+ * When eras_pos is not NULL it is overwritten on return with the locations that were found.
+ * Returns the number of error/erasure locations found (0 if data[] is already a codeword),
+ * or -1 if the block is uncorrectable or an argument is rejected.
+ */
+int DECODE_RS(
+#ifdef FIXED
+data_t *data, int *eras_pos, int no_eras,int pad){
+#else
+void *p,data_t *data, int *eras_pos, int no_eras){
+  struct rs *rs = (struct rs *)p;
+#endif
+  int deg_lambda, el, deg_omega;
+  int i, j, r,k;
+  data_t u,q,tmp,num1,num2,den,discr_r;
+  data_t lambda[NROOTS+1], s[NROOTS];	/* Err+Eras Locator poly and syndrome poly */
+  data_t b[NROOTS+1], t[NROOTS+1], omega[NROOTS+1];
+  data_t root[NROOTS], reg[NROOTS+1], loc[NROOTS];
+  int syn_error, count;
+
+#ifdef FIXED
+  /* Check pad parameter for validity */
+  if(pad < 0 || pad >= NN)
+    return -1;
+#endif
+  /* The erasure setup below writes lambda[1..no_eras] and reads eras_pos[0..no_eras-1], so a
+   * count above NROOTS (lambda[] has NROOTS+1 entries) or a missing array must be refused */
+  if(no_eras > NROOTS || (no_eras > 0 && eras_pos == NULL))
+    return -1;
+
+  /* form the syndromes; i.e., evaluate data(x) at roots of g(x) */
+  for(i=0;i<NROOTS;i++)
+    s[i] = data[0];
+
+  for(j=1;j<NN-PAD;j++){
+    for(i=0;i<NROOTS;i++){
+      if(s[i] == 0){
+	s[i] = data[j];
+      } else {
+	s[i] = data[j] ^ ALPHA_TO[MODNN(INDEX_OF[s[i]] + (FCR+i)*PRIM)];
+      }
+    }
+  }
+
+  /* Convert syndromes to index form, checking for nonzero condition */
+  syn_error = 0;
+  for(i=0;i<NROOTS;i++){
+    syn_error |= s[i];
+    s[i] = INDEX_OF[s[i]];
+  }
+
+  if (!syn_error) {
+    /* if syndrome is zero, data[] is a codeword and there are no
+     * errors to correct. So return data[] unmodified
+     */
+    count = 0;
+    goto finish;
+  }
+  memset(&lambda[1],0,NROOTS*sizeof(lambda[0]));
+  lambda[0] = 1;
+
+  if (no_eras > 0) {
+    /* Init lambda to be the erasure locator polynomial */
+    lambda[1] = ALPHA_TO[MODNN(PRIM*(NN-1-eras_pos[0]))];
+    for (i = 1; i < no_eras; i++) {
+      u = MODNN(PRIM*(NN-1-eras_pos[i]));
+      for (j = i+1; j > 0; j--) {
+	tmp = INDEX_OF[lambda[j - 1]];
+	if(tmp != A0)
+	  lambda[j] ^= ALPHA_TO[MODNN(u + tmp)];
+      }
+    }
+
+#if DEBUG >= 1
+    /* Test code that verifies the erasure locator polynomial just constructed. Needed only for decoder debugging. */
+    
+    /* find roots of the erasure location polynomial */
+    for(i=1;i<=no_eras;i++)
+      reg[i] = INDEX_OF[lambda[i]];
+
+    count = 0;
+    for (i = 1,k=IPRIM-1; i <= NN; i++,k = MODNN(k+IPRIM)) {
+      q = 1;
+      for (j = 1; j <= no_eras; j++)
+	if (reg[j] != A0) {
+	  reg[j] = MODNN(reg[j] + j);
+	  q ^= ALPHA_TO[reg[j]];
+	}
+      if (q != 0)
+	continue;
+      /* store root and error location number indices */
+      root[count] = i;
+      loc[count] = k;
+      count++;
+    }
+    if (count != no_eras) {
+      printf("count = %d no_eras = %d\n lambda(x) is WRONG\n",count,no_eras);
+      count = -1;
+      goto finish;
+    }
+#if DEBUG >= 2
+    printf("\n Erasure positions as determined by roots of Eras Loc Poly:\n");
+    for (i = 0; i < count; i++)
+      printf("%d ", loc[i]);
+    printf("\n");
+#endif
+#endif
+  }
+  for(i=0;i<NROOTS+1;i++)
+    b[i] = INDEX_OF[lambda[i]];
+  
+  /* Begin Berlekamp-Massey algorithm to determine error+erasure locator polynomial */
+  r = no_eras;
+  el = no_eras;
+  while (++r <= NROOTS) {	/* r is the step number */
+    /* Compute discrepancy at the r-th step in poly-form */
+    discr_r = 0;
+    for (i = 0; i < r; i++){
+      if ((lambda[i] != 0) && (s[r-i-1] != A0)) {
+	discr_r ^= ALPHA_TO[MODNN(INDEX_OF[lambda[i]] + s[r-i-1])];
+      }
+    }
+    discr_r = INDEX_OF[discr_r];	/* Index form */
+    if (discr_r == A0) {
+      /* 2 lines below: B(x) <-- x*B(x) */
+      memmove(&b[1],b,NROOTS*sizeof(b[0]));
+      b[0] = A0;
+    } else {
+      /* 7 lines below: T(x) <-- lambda(x) - discr_r*x*b(x) */
+      t[0] = lambda[0];
+      for (i = 0 ; i < NROOTS; i++) {
+	if(b[i] != A0)
+	  t[i+1] = lambda[i+1] ^ ALPHA_TO[MODNN(discr_r + b[i])];
+	else
+	  t[i+1] = lambda[i+1];
+      }
+      if (2 * el <= r + no_eras - 1) {
+	el = r + no_eras - el;
+	/* 2 lines below: B(x) <-- inv(discr_r) * lambda(x) */
+	for (i = 0; i <= NROOTS; i++)
+	  b[i] = (lambda[i] == 0) ? A0 : MODNN(INDEX_OF[lambda[i]] - discr_r + NN);
+      } else {
+	/* 2 lines below: B(x) <-- x*B(x) */
+	memmove(&b[1],b,NROOTS*sizeof(b[0]));
+	b[0] = A0;
+      }
+      memcpy(lambda,t,(NROOTS+1)*sizeof(t[0]));
+    }
+  }
+
+  /* Convert lambda to index form and compute deg(lambda(x)) */
+  deg_lambda = 0;
+  for(i=0;i<NROOTS+1;i++){
+    lambda[i] = INDEX_OF[lambda[i]];
+    if(lambda[i] != A0)
+      deg_lambda = i;
+  }
+  /* Find roots of the error+erasure locator polynomial by Chien search */
+  memcpy(&reg[1],&lambda[1],NROOTS*sizeof(reg[0]));
+  count = 0;		/* Number of roots of lambda(x) */
+  for (i = 1,k=IPRIM-1; i <= NN; i++,k = MODNN(k+IPRIM)) {
+    q = 1; /* lambda[0] is always 0 */
+    for (j = deg_lambda; j > 0; j--){
+      if (reg[j] != A0) {
+	reg[j] = MODNN(reg[j] + j);
+	q ^= ALPHA_TO[reg[j]];
+      }
+    }
+    if (q != 0)
+      continue; /* Not a root */
+    /* store root (index-form) and error location number */
+#if DEBUG>=2
+    printf("count %d root %d loc %d\n",count,i,k);
+#endif
+    root[count] = i;
+    loc[count] = k;
+    /* If we've already found max possible roots, abort the search to save time */
+    if(++count == deg_lambda)
+      break;
+  }
+  if (deg_lambda != count) {
+    /*
+     * deg(lambda) unequal to number of roots => uncorrectable
+     * error detected
+     */
+    count = -1;
+    goto finish;
+  }
+  /*
+   * Compute err+eras evaluator poly omega(x) = s(x)*lambda(x) (modulo
+   * x**NROOTS). in index form. Also find deg(omega).
+   */
+  deg_omega = deg_lambda-1;
+  for (i = 0; i <= deg_omega;i++){
+    tmp = 0;
+    for(j=i;j >= 0; j--){
+      if ((s[i - j] != A0) && (lambda[j] != A0))
+	tmp ^= ALPHA_TO[MODNN(s[i - j] + lambda[j])];
+    }
+    omega[i] = INDEX_OF[tmp];
+  }
+
+  /*
+   * Compute error values in poly-form. num1 = omega(inv(X(l))), num2 =
+   * inv(X(l))**(FCR-1) and den = lambda_pr(inv(X(l))) all in poly-form
+   */
+  for (j = count-1; j >=0; j--) {
+    num1 = 0;
+    for (i = deg_omega; i >= 0; i--) {
+      if (omega[i] != A0)
+	num1  ^= ALPHA_TO[MODNN(omega[i] + i * root[j])];
+    }
+    num2 = ALPHA_TO[MODNN(root[j] * (FCR - 1) + NN)];
+    den = 0;
+    
+    /* lambda[i+1] for i even is the formal derivative lambda_pr of lambda[i] */
+    for (i = min(deg_lambda,NROOTS-1) & ~1; i >= 0; i -=2) {
+      if(lambda[i+1] != A0)
+	den ^= ALPHA_TO[MODNN(lambda[i+1] + i * root[j])];
+    }
+#if DEBUG >= 1
+    if (den == 0) {
+      printf("\n ERROR: denominator = 0\n");
+      count = -1;
+      goto finish;
+    }
+#endif
+    /* Apply error to data */
+    if (num1 != 0 && loc[j] >= PAD) {
+      data[loc[j]-PAD] ^= ALPHA_TO[MODNN(INDEX_OF[num1] + INDEX_OF[num2] + NN - INDEX_OF[den])];
+    }
+  }
+ finish:
+  if(eras_pos != NULL){
+    for(i=0;i<count;i++)
+      eras_pos[i] = loc[i];
+  }
+  return count;
+}
diff --git a/decode_rs.h b/decode_rs.h
new file mode 100644
index 0000000..c165cf3
--- /dev/null
+++ b/decode_rs.h
@@ -0,0 +1,298 @@
+/* The guts of the Reed-Solomon decoder, meant to be #included
+ * into a function body with the following typedefs, macros and variables supplied
+ * according to the code parameters:
+
+ * data_t - a typedef for the data symbol
+ * data_t data[] - array of NN-PAD data and parity symbols to be corrected in place
+ * int *eras_pos, int no_eras - erasure locations and their count; eras_pos may be NULL if no_eras
+ *          is 0, and when not NULL it is overwritten with the locations found
+ * retval - an integer lvalue into which the decoder's return code is written
+ * NROOTS - number of roots in the RS code generator polynomial (= parity symbols per block). Integer variable or literal.
+ * NN - the total number of symbols in a RS block. Integer variable or literal.
+ * PAD - the number of pad symbols in a block. Integer variable or literal.
+ * ALPHA_TO - The address of an array of NN elements to convert Galois field
+ *            elements in index (log) form to polynomial form. Read only.
+ * INDEX_OF - The address of an array of NN elements to convert Galois field
+ *            elements in polynomial form to index (log) form. Read only.
+ * MODNN - a function to reduce its argument modulo NN. May be inline or a macro.
+ * FCR - The first consecutive root of the Reed-Solomon generator polynomial. Integer variable or literal.
+ * PRIM - The primitive root of the generator poly. Integer variable or literal.
+ * IPRIM - The step the root search uses to turn a root index into a location number. Integer variable or literal.
+ * DEBUG - If set to 1 or more, do various internal consistency checking. Leave this
+ *         undefined for production code
+
+ * The memset(), memmove(), and memcpy() functions are used. The appropriate header
+ * file declaring these functions (usually <string.h>) must be included by the calling
+ * program.
+ */
+
+
+#if !defined(NROOTS)
+#error "NROOTS not defined"
+#endif
+
+#if !defined(NN)
+#error "NN not defined"
+#endif
+
+#if !defined(PAD)
+#error "PAD not defined"
+#endif
+
+#if !defined(ALPHA_TO)
+#error "ALPHA_TO not defined"
+#endif
+
+#if !defined(INDEX_OF)
+#error "INDEX_OF not defined"
+#endif
+
+#if !defined(MODNN)
+#error "MODNN not defined"
+#endif
+
+#if !defined(FCR)
+#error "FCR not defined"
+#endif
+
+#if !defined(PRIM)
+#error "PRIM not defined"
+#endif
+
+#if !defined(NULL)
+#define NULL ((void *)0)
+#endif
+
+#undef MIN
+#define	MIN(a,b)	((a) < (b) ? (a) : (b))
+#undef A0
+#define A0 (NN)
+
+{
+  /* Decoder body: corrects data[] in place and leaves the location count, or -1 on failure, in retval */
+  int deg_lambda, el, deg_omega;
+  int i, j, r,k;
+  data_t u,q,tmp,num1,num2,den,discr_r;
+  data_t lambda[NROOTS+1], s[NROOTS];	/* Err+Eras Locator poly and syndrome poly */
+  data_t b[NROOTS+1], t[NROOTS+1], omega[NROOTS+1];
+  data_t root[NROOTS], reg[NROOTS+1], loc[NROOTS];
+  int syn_error, count;
+
+  /* Refuse erasure lists that would overrun lambda[NROOTS+1] below, or that come without an array */
+  if(no_eras > NROOTS || (no_eras > 0 && eras_pos == NULL)){
+    count = -1;
+    goto finish;
+  }
+
+  /* form the syndromes; i.e., evaluate data(x) at roots of g(x) */
+  for(i=0;i<NROOTS;i++)
+    s[i] = data[0];
+
+  for(j=1;j<NN-PAD;j++){
+    for(i=0;i<NROOTS;i++){
+      if(s[i] == 0){
+	s[i] = data[j];
+      } else {
+	s[i] = data[j] ^ ALPHA_TO[MODNN(INDEX_OF[s[i]] + (FCR+i)*PRIM)];
+      }
+    }
+  }
+
+  /* Convert syndromes to index form, checking for nonzero condition */
+  syn_error = 0;
+  for(i=0;i<NROOTS;i++){
+    syn_error |= s[i];
+    s[i] = INDEX_OF[s[i]];
+  }
+
+  if (!syn_error) {
+    /* if syndrome is zero, data[] is a codeword and there are no
+     * errors to correct. So return data[] unmodified
+     */
+    count = 0;
+    goto finish;
+  }
+  memset(&lambda[1],0,NROOTS*sizeof(lambda[0]));
+  lambda[0] = 1;
+
+  if (no_eras > 0) {
+    /* Init lambda to be the erasure locator polynomial */
+    lambda[1] = ALPHA_TO[MODNN(PRIM*(NN-1-eras_pos[0]))];
+    for (i = 1; i < no_eras; i++) {
+      u = MODNN(PRIM*(NN-1-eras_pos[i]));
+      for (j = i+1; j > 0; j--) {
+	tmp = INDEX_OF[lambda[j - 1]];
+	if(tmp != A0)
+	  lambda[j] ^= ALPHA_TO[MODNN(u + tmp)];
+      }
+    }
+
+#if DEBUG >= 1
+    /* Test code that verifies the erasure locator polynomial just constructed
+       Needed only for decoder debugging. */
+    
+    /* find roots of the erasure location polynomial */
+    for(i=1;i<=no_eras;i++)
+      reg[i] = INDEX_OF[lambda[i]];
+
+    count = 0;
+    for (i = 1,k=IPRIM-1; i <= NN; i++,k = MODNN(k+IPRIM)) {
+      q = 1;
+      for (j = 1; j <= no_eras; j++)
+	if (reg[j] != A0) {
+	  reg[j] = MODNN(reg[j] + j);
+	  q ^= ALPHA_TO[reg[j]];
+	}
+      if (q != 0)
+	continue;
+      /* store root and error location number indices */
+      root[count] = i;
+      loc[count] = k;
+      count++;
+    }
+    if (count != no_eras) {
+      printf("count = %d no_eras = %d\n lambda(x) is WRONG\n",count,no_eras);
+      count = -1;
+      goto finish;
+    }
+#if DEBUG >= 2
+    printf("\n Erasure positions as determined by roots of Eras Loc Poly:\n");
+    for (i = 0; i < count; i++)
+      printf("%d ", loc[i]);
+    printf("\n");
+#endif
+#endif
+  }
+  for(i=0;i<NROOTS+1;i++)
+    b[i] = INDEX_OF[lambda[i]];
+  
+  /* Begin Berlekamp-Massey algorithm to determine error+erasure locator polynomial */
+  r = no_eras;
+  el = no_eras;
+  while (++r <= NROOTS) {	/* r is the step number */
+    /* Compute discrepancy at the r-th step in poly-form */
+    discr_r = 0;
+    for (i = 0; i < r; i++){
+      if ((lambda[i] != 0) && (s[r-i-1] != A0)) {
+	discr_r ^= ALPHA_TO[MODNN(INDEX_OF[lambda[i]] + s[r-i-1])];
+      }
+    }
+    discr_r = INDEX_OF[discr_r];	/* Index form */
+    if (discr_r == A0) {
+      /* 2 lines below: B(x) <-- x*B(x) */
+      memmove(&b[1],b,NROOTS*sizeof(b[0]));
+      b[0] = A0;
+    } else {
+      /* 7 lines below: T(x) <-- lambda(x) - discr_r*x*b(x) */
+      t[0] = lambda[0];
+      for (i = 0 ; i < NROOTS; i++) {
+	if(b[i] != A0)
+	  t[i+1] = lambda[i+1] ^ ALPHA_TO[MODNN(discr_r + b[i])];
+	else
+	  t[i+1] = lambda[i+1];
+      }
+      if (2 * el <= r + no_eras - 1) {
+	el = r + no_eras - el;
+	/* 2 lines below: B(x) <-- inv(discr_r) * lambda(x) */
+	for (i = 0; i <= NROOTS; i++)
+	  b[i] = (lambda[i] == 0) ? A0 : MODNN(INDEX_OF[lambda[i]] - discr_r + NN);
+      } else {
+	/* 2 lines below: B(x) <-- x*B(x) */
+	memmove(&b[1],b,NROOTS*sizeof(b[0]));
+	b[0] = A0;
+      }
+      memcpy(lambda,t,(NROOTS+1)*sizeof(t[0]));
+    }
+  }
+
+  /* Convert lambda to index form and compute deg(lambda(x)) */
+  deg_lambda = 0;
+  for(i=0;i<NROOTS+1;i++){
+    lambda[i] = INDEX_OF[lambda[i]];
+    if(lambda[i] != A0)
+      deg_lambda = i;
+  }
+  /* Find roots of the error+erasure locator polynomial by Chien search */
+  memcpy(&reg[1],&lambda[1],NROOTS*sizeof(reg[0]));
+  count = 0;		/* Number of roots of lambda(x) */
+  for (i = 1,k=IPRIM-1; i <= NN; i++,k = MODNN(k+IPRIM)) {
+    q = 1; /* lambda[0] is always 0 */
+    for (j = deg_lambda; j > 0; j--){
+      if (reg[j] != A0) {
+	reg[j] = MODNN(reg[j] + j);
+	q ^= ALPHA_TO[reg[j]];
+      }
+    }
+    if (q != 0)
+      continue; /* Not a root */
+    /* store root (index-form) and error location number */
+#if DEBUG>=2
+    printf("count %d root %d loc %d\n",count,i,k);
+#endif
+    root[count] = i;
+    loc[count] = k;
+    /* If we've already found max possible roots,
+     * abort the search to save time
+     */
+    if(++count == deg_lambda)
+      break;
+  }
+  if (deg_lambda != count) {
+    /*
+     * deg(lambda) unequal to number of roots => uncorrectable
+     * error detected
+     */
+    count = -1;
+    goto finish;
+  }
+  /*
+   * Compute err+eras evaluator poly omega(x) = s(x)*lambda(x) (modulo
+   * x**NROOTS). in index form. Also find deg(omega).
+   */
+  deg_omega = deg_lambda-1;
+  for (i = 0; i <= deg_omega;i++){
+    tmp = 0;
+    for(j=i;j >= 0; j--){
+      if ((s[i - j] != A0) && (lambda[j] != A0))
+	tmp ^= ALPHA_TO[MODNN(s[i - j] + lambda[j])];
+    }
+    omega[i] = INDEX_OF[tmp];
+  }
+
+  /*
+   * Compute error values in poly-form. num1 = omega(inv(X(l))), num2 =
+   * inv(X(l))**(FCR-1) and den = lambda_pr(inv(X(l))) all in poly-form
+   */
+  for (j = count-1; j >=0; j--) {
+    num1 = 0;
+    for (i = deg_omega; i >= 0; i--) {
+      if (omega[i] != A0)
+	num1  ^= ALPHA_TO[MODNN(omega[i] + i * root[j])];
+    }
+    num2 = ALPHA_TO[MODNN(root[j] * (FCR - 1) + NN)];
+    den = 0;
+    
+    /* lambda[i+1] for i even is the formal derivative lambda_pr of lambda[i] */
+    for (i = MIN(deg_lambda,NROOTS-1) & ~1; i >= 0; i -=2) {
+      if(lambda[i+1] != A0)
+	den ^= ALPHA_TO[MODNN(lambda[i+1] + i * root[j])];
+    }
+#if DEBUG >= 1
+    if (den == 0) {
+      printf("\n ERROR: denominator = 0\n");
+      count = -1;
+      goto finish;
+    }
+#endif
+    /* Apply error to data */
+    if (num1 != 0 && loc[j] >= PAD) {
+      data[loc[j]-PAD] ^= ALPHA_TO[MODNN(INDEX_OF[num1] + INDEX_OF[num2] + NN - INDEX_OF[den])];
+    }
+  }
+ finish:
+  if(eras_pos != NULL){
+    for(i=0;i<count;i++)
+      eras_pos[i] = loc[i];
+  }
+  retval = count;
+}
diff --git a/decode_rs_8.c b/decode_rs_8.c
new file mode 100644
index 0000000..995b0d9
--- /dev/null
+++ b/decode_rs_8.c
@@ -0,0 +1,24 @@
+/* General purpose Reed-Solomon decoder for 8-bit symbols or less
+ * Copyright 2003 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+
+#ifdef DEBUG
+#include <stdio.h>
+#endif
+
+#include <string.h>
+
+#include "fixed.h"
+
+int decode_rs_8(data_t *data, int *eras_pos, int no_eras, int pad){
+  int retval; /* written by the decoder body included below */
+
+  /* Only pad counts from 0 through 222 are accepted */
+  if(pad < 0 || pad > 222)
+    return -1;
+
+#include "decode_rs.h"
+
+  return retval;
+}
diff --git a/decode_rs_ccsds.c b/decode_rs_ccsds.c
new file mode 100644
index 0000000..0e246b4
--- /dev/null
+++ b/decode_rs_ccsds.c
@@ -0,0 +1,26 @@
+/* This function wraps around the fixed 8-bit decoder, performing the
+ * basis transformations necessary to meet the CCSDS standard
+ *
+ * Copyright 2002, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include "ccsds.h"
+#include "fec.h"
+
+int decode_rs_ccsds(data_t *data,int *eras_pos,int no_eras,int pad){
+  int i,r;
+  data_t cdata[NN]; /* data[] converted to the conventional basis */
+
+  /* Reject a bad pad before the copy: pad < 0 would run the loop past the end of cdata[NN] */
+  if(pad < 0 || pad > NN-NROOTS-1)
+    return -1; /* same range and return code as decode_rs_8() */
+  for(i=0;i<NN-pad;i++) /* dual basis -> conventional */
+    cdata[i] = Tal1tab[data[i]];
+
+  r = decode_rs_8(cdata,eras_pos,no_eras,pad);
+  if(r > 0){ /* decoder reported corrections: write the block back in dual basis */
+    for(i=0;i<NN-pad;i++)
+      data[i] = Taltab[cdata[i]];
+  }
+  return r; /* decode_rs_8() result, unchanged */
+}
diff --git a/decode_rs_char.c b/decode_rs_char.c
new file mode 100644
index 0000000..7105233
--- /dev/null
+++ b/decode_rs_char.c
@@ -0,0 +1,22 @@
+/* General purpose Reed-Solomon decoder for 8-bit symbols or less
+ * Copyright 2003 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+
+#ifdef DEBUG
+#include <stdio.h>
+#endif
+
+#include <string.h>
+
+#include "char.h"
+#include "rs-common.h"
+
+int decode_rs_char(void *p, data_t *data, int *eras_pos, int no_eras){
+  struct rs *rs = p; /* codec handle; the NN, NROOTS, ALPHA_TO, ... macros in char.h read its fields */
+  int retval;        /* written by the decoder body included below */
+
+#include "decode_rs.h"
+
+  return retval;
+}
diff --git a/decode_rs_int.c b/decode_rs_int.c
new file mode 100644
index 0000000..1ef1a1f
--- /dev/null
+++ b/decode_rs_int.c
@@ -0,0 +1,22 @@
+/* General purpose Reed-Solomon decoder
+ * Copyright 2003 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+
+#ifdef DEBUG
+#include <stdio.h>
+#endif
+
+#include <string.h>
+
+#include "int.h"
+#include "rs-common.h"
+
+int decode_rs_int(void *p, data_t *data, int *eras_pos, int no_eras){
+  struct rs *rs = p; /* codec handle; NOTE(review): presumably read through the macros in int.h - confirm */
+  int retval;        /* written by the decoder body included below */
+
+#include "decode_rs.h"
+
+  return retval;
+}
diff --git a/dotprod.c b/dotprod.c
new file mode 100644
index 0000000..b3be913
--- /dev/null
+++ b/dotprod.c
@@ -0,0 +1,94 @@
+/* 16-bit signed integer dot product
+ * Switch to appropriate versions
+ * Copyright 2004 Phil Karn
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdlib.h>
+#include "fec.h"
+
+void *initdp_port(signed short coeffs[],int len);
+long dotprod_port(void *p,signed short *b);
+void freedp_port(void *p);
+
+#ifdef __i386__
+void *initdp_mmx(signed short coeffs[],int len);
+void *initdp_sse2(signed short coeffs[],int len);
+long dotprod_mmx(void *p,signed short *b);
+long dotprod_sse2(void *p,signed short *b);
+void freedp_mmx(void *p);
+void freedp_sse2(void *p);
+#endif
+
+#ifdef __VEC__
+void *initdp_av(signed short coeffs[],int len);
+long dotprod_av(void *p,signed short *b);
+void freedp_av(void *p);
+#endif
+
+/* Build a dot product descriptor for len coefficients with the routine that matches Cpu_mode */
+void *initdp(signed short coeffs[],int len){
+  find_cpu_mode(); /* presumably fills in Cpu_mode before it is tested below - confirm */
+
+  switch(Cpu_mode){
+#ifdef __VEC__
+  case ALTIVEC:
+    return initdp_av(coeffs,len);
+#endif
+
+#ifdef __i386__
+  case SSE2:
+    return initdp_sse2(coeffs,len);
+  case MMX:
+  case SSE:
+    return initdp_mmx(coeffs,len);
+#endif
+  case PORT:
+  default: /* every mode without a SIMD routine compiled in */
+    return initdp_port(coeffs,len);
+  }
+}
+
+/* Free a descriptor made by initdp(); each mode is released by the routine paired with its initdp_*() */
+void freedp(void *p){
+  switch(Cpu_mode){
+  case PORT:
+  default:
+    return freedp_port(p); /* portable descriptors hold one coefficient pointer, not a SIMD array */
+#ifdef __i386__
+  case MMX:
+  case SSE:
+    return freedp_mmx(p);
+  case SSE2:
+    return freedp_sse2(p);
+#endif
+#ifdef __VEC__
+  case ALTIVEC:
+    return freedp_av(p);
+#endif
+  }
+}
+
+/* Dot product of a[] with the coefficients in descriptor p.
+ * The number of terms was fixed when the descriptor was built
+ */
+long dotprod(void *p,signed short a[]){
+  switch(Cpu_mode){
+#ifdef __VEC__
+  case ALTIVEC:
+    return dotprod_av(p,a);
+#endif
+
+#ifdef __i386__
+  case SSE2:
+    return dotprod_sse2(p,a);
+  case MMX:
+  case SSE:
+    return dotprod_mmx(p,a);
+#endif
+  case PORT:
+  default: /* no SIMD version applies: portable C */
+    return dotprod_port(p,a);
+  }
+}
+
+
diff --git a/dotprod.h b/dotprod.h
new file mode 100644
index 0000000..6b62b70
--- /dev/null
+++ b/dotprod.h
@@ -0,0 +1,15 @@
+/* Internal definitions for dotproduct function */
+
+struct dotprod {
+  int len; /* Number of coefficients */
+
+  /* On a MMX or SSE machine, these hold 4 copies of the coefficients,
+   * preshifted by 0,1,2,3 words to meet all possible input data
+   * alignments (see Intel ap559 on MMX dot products).
+   *
+   * SSE2 is similar, but with 8 copies preshifted by 0..7 words
+   *
+   * On a non-MMX machine, only one copy is present
+   */
+  signed short *coeffs[8]; /* NOTE(review): each dotprod_*.c declares its own struct dotprod; dotprod_port.c uses a single pointer */
+};
diff --git a/dotprod_av.c b/dotprod_av.c
new file mode 100644
index 0000000..1f70471
--- /dev/null
+++ b/dotprod_av.c
@@ -0,0 +1,93 @@
+/* 16-bit signed integer dot product
+ * Altivec-assisted version
+ * Copyright 2004 Phil Karn
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdlib.h>
+#include "fec.h"
+
+struct dotprod {
+  int len; /* Number of coefficients */
+
+  /* On an Altivec machine, these hold 8 copies of the coefficients,
+   * preshifted by 0,1,..7 words to meet all possible input data alignments
+   */
+  signed short *coeffs[8]; /* coeffs[i] starts with i zero words; filled by initdp_av() */
+};
+
+/* Create a dot product descriptor; returns NULL if len is not positive or memory runs out */
+void *initdp_av(signed short coeffs[],int len){
+  struct dotprod *dp;
+  int i,j;
+
+  if(len <= 0 || (dp = (struct dotprod *)calloc(1,sizeof(struct dotprod))) == NULL)
+    return NULL;
+  dp->len = len;
+  /* Make 8 copies of coefficients, one per data alignment (assumes calloc() storage is 16-byte aligned) */
+  for(i=0;i<8;i++){
+    dp->coeffs[i] = calloc(1+(len+i-1)/8,sizeof(vector signed short));
+    if(dp->coeffs[i] == NULL){ /* out of memory: release the copies built so far */
+      while(i-- > 0)
+        free(dp->coeffs[i]);
+      free(dp);
+      return NULL;
+    }
+    for(j=0;j<len;j++)
+      dp->coeffs[i][j+i] = coeffs[j]; /* copy i is shifted right by i words */
+  }
+  return (void *)dp;
+}
+
+/* Free a descriptor created by initdp_av(); NULL (initdp_av's result for len == 0) is ignored */
+void freedp_av(void *p){
+  struct dotprod *dp = (struct dotprod *)p;
+  int i;
+  if(dp == NULL)
+    return;
+  for(i=0;i<8;i++)
+    free(dp->coeffs[i]); /* free(NULL) is a no-op, so empty slots need no test */
+  free(dp);
+}
+
+/* Compute a dot product given a descriptor and an input array
+ * The length is taken from the descriptor (dp->len); p must come from initdp_av()
+ */
+long dotprod_av(void *p,signed short a[]){
+  struct dotprod *dp = (struct dotprod *)p;
+  int al; /* offset of a[0] within its 16-byte block, in 16-bit words */
+  vector signed short *ar,*d; /* aligned view of the input, and the matching coefficient copy */
+  vector signed int sums0,sums1,sums2,sums3;
+  union { vector signed int v; signed int w[4];} s; /* lets the final vector be read as four ints */
+  int nblocks; /* number of 8-word vectors to process */
+    
+  /* round ar down to beginning of 16-byte block containing 0th element of
+   * input buffer. Then set d to one of 8 sets of shifted coefficients
+   */
+  ar = (vector signed short *)((int)a & ~15); /* NOTE(review): pointer is narrowed to int - only safe where pointers fit in int */
+  al = ((int)a & 15)/sizeof(signed short);
+  d = (vector signed short *)dp->coeffs[al];
+  
+  nblocks = (dp->len+al-1)/8+1; /* vectors needed to cover the al leading words plus len samples */
+  
+  /* Sum into four vectors each holding four 32-bit partial sums */
+  sums3 = sums2 = sums1 = sums0 = (vector signed int)(0);
+  while(nblocks >= 4){ /* four vectors per pass, working from the end of the buffer toward the start */
+    sums0 = vec_msums(ar[nblocks-1],d[nblocks-1],sums0);
+    sums1 = vec_msums(ar[nblocks-2],d[nblocks-2],sums1);
+    sums2 = vec_msums(ar[nblocks-3],d[nblocks-3],sums2);
+    sums3 = vec_msums(ar[nblocks-4],d[nblocks-4],sums3);
+    nblocks -= 4;
+  }
+  sums0 = vec_adds(sums0,sums1); /* fold the four accumulators into sums0 */
+  sums2 = vec_adds(sums2,sums3);
+  sums0 = vec_adds(sums0,sums2);
+  while(nblocks-- > 0){ /* the 0..3 vectors left over */
+    sums0 = vec_msums(ar[nblocks],d[nblocks],sums0);
+  }
+  /* Sum 4 partial sums into final result */
+  s.v = vec_sums(sums0,(vector signed int)(0));
+  
+  return s.w[3]; /* the total is read from element 3 of the vec_sums() result */
+}
+
+
diff --git a/dotprod_mmx.c b/dotprod_mmx.c
new file mode 100644
index 0000000..c516afe
--- /dev/null
+++ b/dotprod_mmx.c
@@ -0,0 +1,81 @@
+/* 16-bit signed integer dot product
+ * MMX assisted version; also for SSE
+ *
+ * Copyright 2004 Phil Karn
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdlib.h>
+#include "fec.h"
+
+struct dotprod {
+  int len; /* Number of coefficients */
+
+  /* On a MMX or SSE machine, these hold 4 copies of the coefficients,
+   * preshifted by 0,1,2,3 words to meet all possible input data
+   * alignments (see Intel ap559 on MMX dot products).
+   */
+  signed short *coeffs[4]; /* coeffs[i] starts with i zero words; filled by initdp_mmx() */
+};
+long dotprod_mmx_assist(signed short *a,signed short *b,int cnt);
+
+/* Create a dot product descriptor; returns NULL if len is not positive or memory runs out */
+void *initdp_mmx(signed short coeffs[],int len){
+  struct dotprod *dp;
+  int i,j;
+
+  if(len <= 0 || (dp = (struct dotprod *)calloc(1,sizeof(struct dotprod))) == NULL)
+    return NULL;
+  dp->len = len;
+  /* Make 4 copies of coefficients, one for each data alignment */
+  for(i=0;i<4;i++){
+    dp->coeffs[i] = (signed short *)calloc(1+(len+i-1)/4,4*sizeof(signed short));
+    if(dp->coeffs[i] == NULL){ /* out of memory: release the copies built so far */
+      while(i-- > 0)
+        free(dp->coeffs[i]);
+      free(dp);
+      return NULL;
+    }
+    for(j=0;j<len;j++)
+      dp->coeffs[i][j+i] = coeffs[j]; /* copy i is shifted right by i words */
+  }
+  return (void *)dp;
+}
+
+/* Free a descriptor created by initdp_mmx(); NULL (initdp_mmx's result for len == 0) is ignored */
+void freedp_mmx(void *p){
+  struct dotprod *dp = (struct dotprod *)p;
+  int i;
+  if(dp == NULL)
+    return;
+  for(i=0;i<4;i++)
+    free(dp->coeffs[i]); /* free(NULL) is a no-op, so empty slots need no test */
+  free(dp);
+}
+
+/* Dot product of a[] with the coefficients held in descriptor p (MMX/SSE path).
+ * The term count comes from the descriptor, not from the caller
+ */
+long dotprod_mmx(void *p,signed short a[]){
+  struct dotprod *desc = (struct dotprod *)p;
+  signed short *base;
+  int shift,nblocks;
+
+  /* base is a[] rounded down to an 8-byte boundary, so up to 3 words of
+   * memory in front of a[] are read as well. Their values do not matter
+   * because the selected coefficient copy holds zeros in those positions
+   */
+  base = (signed short *)((int)a & ~7);
+
+  /* shift = words between base and a[0]. It selects which of the 4
+   * pre-shifted coefficient copies lines up with the data, and is also
+   * the number of zero words at the front of that copy
+   */
+  shift = a - base;
+
+  /* Number of 4-word blocks that cover the shift+len words */
+  nblocks = (desc->len+shift-1)/4+1;
+
+  /* The assembler routine does the multiply-accumulate work */
+  return dotprod_mmx_assist(base,desc->coeffs[shift],nblocks);
+}
+
diff --git a/dotprod_mmx_assist.s b/dotprod_mmx_assist.s
new file mode 100644
index 0000000..25deffd
--- /dev/null
+++ b/dotprod_mmx_assist.s
@@ -0,0 +1,83 @@
+# SIMD MMX dot product
+# Equivalent to the following C code:
+# long dotprod(signed short *a,signed short *b,int cnt)
+# {
+#	long sum = 0; 
+#	cnt *= 4; 
+#	while(cnt--)
+#		sum += *a++ * *b++;
+#	return sum;
+# }
+# a and b should also be 64-bit aligned, or speed will suffer greatly
+# Copyright 1999, Phil Karn KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+	
+	.text
+	.global dotprod_mmx_assist
+	.type dotprod_mmx_assist,@function
+dotprod_mmx_assist:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi		# save every register used below
+	pushl %edi
+	pushl %ecx
+	pushl %ebx
+	movl 8(%ebp),%esi	# a
+	movl 12(%ebp),%edi	# b
+	movl 16(%ebp),%ecx	# cnt (number of 4-word blocks)
+	pxor %mm0,%mm0		# clear running sum (in two 32-bit halves)
+	
+# MMX dot product loop unrolled 4 times, crunching 16 terms per loop
+	.align 16
+.Loop1:	subl $4,%ecx		# claim 4 blocks; fewer than 4 left -> tail loop
+	jl   .Loop1Done
+	
+	movq (%esi),%mm1	# mm1 = a[3],a[2],a[1],a[0]
+ 	pmaddwd (%edi),%mm1	# mm1 = b[3]*a[3]+b[2]*a[2],b[1]*a[1]+b[0]*a[0]
+	paddd %mm1,%mm0		# add both 32-bit products into the running sum
+	
+	movq 8(%esi),%mm1
+	pmaddwd 8(%edi),%mm1
+	paddd %mm1,%mm0
+
+	movq 16(%esi),%mm1
+	pmaddwd 16(%edi),%mm1
+	paddd %mm1,%mm0
+
+	movq 24(%esi),%mm1
+	addl $32,%esi		# advance a by 4 blocks (32 bytes)
+	pmaddwd 24(%edi),%mm1
+	addl $32,%edi		# advance b by 4 blocks (32 bytes)
+	paddd %mm1,%mm0
+
+	jmp .Loop1
+.Loop1Done:
+	
+	addl $4,%ecx		# undo the last subl: ecx = blocks still to do
+	
+# MMX dot product loop, not unrolled, crunching 4 terms per loop
+# This could be redone as Duff's Device on the unrolled loop above
+.Loop2:	subl $1,%ecx
+	jl   .Loop2Done
+	
+	movq (%esi),%mm1
+	addl $8,%esi
+	pmaddwd (%edi),%mm1
+	addl $8,%edi
+	paddd %mm1,%mm0
+	jmp .Loop2
+.Loop2Done:
+	
+	movd %mm0,%ebx		# right-hand word to ebx
+	punpckhdq %mm0,%mm0	# left-hand word to right side of %mm0
+	movd %mm0,%eax
+	addl %ebx,%eax		# running sum now in %eax
+	emms			# done with MMX
+	
+	popl %ebx		# restore registers in reverse order of the pushes
+	popl %ecx
+	popl %edi
+	popl %esi
+	movl %ebp,%esp
+	popl %ebp
+	ret
diff --git a/dotprod_port.c b/dotprod_port.c
new file mode 100644
index 0000000..ef635ec
--- /dev/null
+++ b/dotprod_port.c
@@ -0,0 +1,58 @@
+/* 16-bit signed integer dot product
+ * Portable C version
+ * Copyright 2004 Phil Karn
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdlib.h>
+#include "fec.h"
+
+struct dotprod {
+  int len; /* Number of coefficients */
+
+  signed short *coeffs; /* single heap copy of the len coefficients, made by initdp_port() */
+};
+
+/* Create a dot product descriptor; returns NULL if len is not positive or memory runs out */
+void *initdp_port(signed short coeffs[],int len){
+  struct dotprod *dp;
+  int j;
+
+  if(len <= 0 || (dp = (struct dotprod *)calloc(1,sizeof(struct dotprod))) == NULL)
+    return NULL;
+  dp->len = len;
+  /* Just one copy of the coefficients for the C version */
+  dp->coeffs = (signed short *)calloc(len,sizeof(signed short));
+  if(dp->coeffs == NULL){ /* out of memory: do not hand back a half-built descriptor */
+    free(dp);
+    return NULL;
+  }
+  for(j=0;j<len;j++)
+    dp->coeffs[j] = coeffs[j];
+  return (void *)dp;
+}
+
+/* Free a descriptor created by initdp_port(); NULL (initdp_port's result for len == 0) is ignored */
+void freedp_port(void *p){
+  struct dotprod *dp = (struct dotprod *)p;
+  if(dp == NULL)
+    return;
+  free(dp->coeffs); /* free(NULL) is a no-op, so no test is needed */
+  free(dp);
+}
+
+/* Dot product of a[] with the coefficients stored in descriptor p (portable C).
+ * The number of terms is the len recorded in the descriptor
+ */
+long dotprod_port(void *p,signed short a[]){
+  const struct dotprod *desc = (const struct dotprod *)p;
+  const signed short *c = desc->coeffs;
+  long sum = 0;
+  int n;
+
+  /* Each product is formed in long arithmetic before it is accumulated */
+  for(n = desc->len; n > 0; n--)
+    sum += (long)*a++ * *c++;
+  return sum;
+}
+
+
diff --git a/dotprod_sse2.c b/dotprod_sse2.c
new file mode 100644
index 0000000..1fddd18
--- /dev/null
+++ b/dotprod_sse2.c
@@ -0,0 +1,72 @@
+/* 16-bit signed integer dot product
+ * SSE2 version
+ * Copyright 2004 Phil Karn
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#define _XOPEN_SOURCE 600
+#include <stdlib.h>
+#include <memory.h>
+#include "fec.h"
+
+struct dotprod {
+  int len; /* Number of coefficients */
+
+  /* On a SSE2 machine, these hold 8 copies of the coefficients,
+   * preshifted by 0,1,..7 words to meet all possible input data
+   * alignments (see Intel ap559 on MMX dot products).
+   */
+  signed short *coeffs[8]; /* coeffs[i] starts with i zero words; each is 16-byte aligned by initdp_sse2() */
+};
+
+long dotprod_sse2_assist(signed short *a,signed short *b,int cnt);
+
+/* Create a dot product descriptor; returns NULL if len is not positive or memory runs out */
+void *initdp_sse2(signed short coeffs[],int len){
+  struct dotprod *dp;
+  int i,j,blksize;
+
+  if(len <= 0 || (dp = (struct dotprod *)calloc(1,sizeof(struct dotprod))) == NULL)
+    return NULL;
+  dp->len = len;
+
+  /* Make 8 copies of coefficients, one for each data alignment, each aligned to 16-byte boundary */
+  for(i=0;i<8;i++){
+    blksize = (1+(len+i-1)/8) * 8*sizeof(signed short);
+    if(posix_memalign((void **)&dp->coeffs[i],16,blksize) != 0){ /* nonzero return = failure */
+      while(i-- > 0) /* release the copies built so far */
+        free(dp->coeffs[i]);
+      free(dp);
+      return NULL;
+    }
+    memset(dp->coeffs[i],0,blksize); /* padding words must be zero */
+    for(j=0;j<len;j++)
+      dp->coeffs[i][j+i] = coeffs[j]; /* copy i is shifted right by i words */
+  }
+  return (void *)dp;
+}
+
+/* Free a descriptor created by initdp_sse2(); NULL (initdp_sse2's result for len == 0) is ignored */
+void freedp_sse2(void *p){
+  struct dotprod *dp = (struct dotprod *)p;
+  int i;
+  if(dp == NULL)
+    return;
+  for(i=0;i<8;i++)
+    free(dp->coeffs[i]); /* free(NULL) is a no-op, so empty slots need no test */
+  free(dp);
+}
+
+/* Dot product of a[] with the coefficients held in descriptor p (SSE2 path).
+ * The term count comes from the descriptor, not from the caller
+ */
+long dotprod_sse2(void *p,signed short a[]){
+  struct dotprod *desc = (struct dotprod *)p;
+  /* Start of the 16-byte block that contains a[0] */
+  signed short *base = (signed short *)((int)a & ~15);
+  /* Words between base and a[0]; selects the matching pre-shifted coefficient copy */
+  int shift = a - base;
+  /* Number of 8-word blocks that cover the shift+len words */
+  int nblocks = (desc->len+shift-1)/8+1;
+
+  return dotprod_sse2_assist(base,desc->coeffs[shift],nblocks);
+}
diff --git a/dotprod_sse2_assist.s b/dotprod_sse2_assist.s
new file mode 100644
index 0000000..47348fa
--- /dev/null
+++ b/dotprod_sse2_assist.s
@@ -0,0 +1,85 @@
+# SIMD SSE2 dot product
+# Equivalent to the following C code:
+# long dotprod(signed short *a,signed short *b,int cnt)
+# {
+#	long sum = 0; 
+#	cnt *= 8; 
+#	while(cnt--)
+#		sum += *a++ * *b++;
+#	return sum;
+# }
+# a and b must be 128-bit aligned
+# Copyright 2001, Phil Karn KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+	
+	.text
+	.global dotprod_sse2_assist
+	.type dotprod_sse2_assist,@function
+dotprod_sse2_assist:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi		# save every register used below
+	pushl %edi
+	pushl %ecx
+	pushl %ebx
+	movl 8(%ebp),%esi	# a
+	movl 12(%ebp),%edi	# b
+	movl 16(%ebp),%ecx	# cnt (number of 8-word blocks)
+	pxor %xmm0,%xmm0		# clear running sum (in four 32-bit lanes)
+	
+# SSE2 dot product loop unrolled 4 times, crunching 32 terms per loop
+	.align 16
+.Loop1:	subl $4,%ecx		# claim 4 blocks; fewer than 4 left -> tail loop
+	jl   .Loop1Done
+	
+	movdqa (%esi),%xmm1	# aligned load of 8 input words
+ 	pmaddwd (%edi),%xmm1	# multiply word pairs, add adjacent products -> 4 dwords
+	paddd %xmm1,%xmm0	# accumulate
+	
+	movdqa 16(%esi),%xmm1
+	pmaddwd 16(%edi),%xmm1
+	paddd %xmm1,%xmm0
+
+	movdqa 32(%esi),%xmm1
+	pmaddwd 32(%edi),%xmm1
+	paddd %xmm1,%xmm0
+
+	movdqa 48(%esi),%xmm1
+	addl $64,%esi		# advance a by 4 blocks (64 bytes)
+	pmaddwd 48(%edi),%xmm1
+	addl $64,%edi		# advance b by 4 blocks (64 bytes)
+	paddd %xmm1,%xmm0
+
+	jmp .Loop1
+.Loop1Done:
+	
+	addl $4,%ecx		# undo the last subl: ecx = blocks still to do
+	
+# SSE2 dot product loop, not unrolled, crunching 8 terms per loop
+# This could be redone as Duff's Device on the unrolled loop above
+.Loop2:	subl $1,%ecx
+	jl   .Loop2Done
+	
+	movdqa (%esi),%xmm1
+	addl $16,%esi
+	pmaddwd (%edi),%xmm1
+	addl $16,%edi
+	paddd %xmm1,%xmm0
+	jmp .Loop2
+.Loop2Done:
+
+	movdqa %xmm0,%xmm1	# keep a copy of the four partial sums
+	psrldq $8,%xmm0		# move the upper two lanes down
+	paddd %xmm1,%xmm0	# lanes 0,1 now hold lane0+lane2, lane1+lane3
+	movd %xmm0,%eax		# right-hand word to eax
+	psrldq $4,%xmm0		# bring lane 1 down to lane 0
+	movd %xmm0,%ebx
+	addl %ebx,%eax		# total of all four lanes in %eax
+
+	popl %ebx		# restore registers in reverse order of the pushes
+	popl %ecx
+	popl %edi
+	popl %esi
+	movl %ebp,%esp
+	popl %ebp
+	ret
diff --git a/dsp.3 b/dsp.3
new file mode 100644
index 0000000..e9794da
--- /dev/null
+++ b/dsp.3
@@ -0,0 +1,63 @@
+.TH DSP 3
+.SH NAME
+initdp, freedp, dotprod, sumsq, peakval \- SIMD-assisted
+digital signal processing primitives
+.SH SYNOPSIS
+.nf
+.ft
+#include "fec.h"
+
+void *initdp(signed short *coeffs,int len);
+long dotprod(void *p,signed short *a);
+void freedp(void *p);
+
+unsigned long long sumsq(signed short *in,int cnt);
+
+int peakval(signed short *b,int cnt);
+
+.SH DESCRIPTION
+These functions provide several basic primitives useful in digital
+signal processing (DSP), especially in modems.  The \fBinitdp\fR,
+\fBdotprod\fR and \fBfreedp\fR functions implement an integer dot
+product useful in correlation and filtering operations on signed
+16-bit integers. \fBsumsq\fR computes the sum
+of the squares of an array of signed 16-bit integers,
+useful for measuring the energy of a signal. \fBpeakval\fR returns the
+absolute value of the largest magnitude element in the input array,
+useful for scaling a signal's amplitude.
+
+Each function uses IA32 or PowerPC Altivec instructions when
+available; otherwise, a portable C version is used.
+
+.SH USAGE
+To create a FIR filter or correlator, call \fBinitdp\fR with the
+coefficients in \fBcoeffs\fR and their number in \fBlen\fR.  This
+creates the appropriate data structures and returns a handle.
+
+To compute a dot product, pass the handle from \fBinitdp\fR and the
+input array to \fBdotprod\fR. No length field is needed as the number
+of samples will be taken from the \fBlen\fR parameter originally given
+to \fBinitdp\fR. There must be at least as many samples in the input
+array as there were coefficients passed to \fBinitdp\fR.
+
+When the filter or correlator is no longer needed, the data structures
+may be freed by passing the handle to \fBfreedp\fR.
+
+The user is responsible for scaling the inputs to \fBinitdp\fR and
+\fBdotprod\fR, as the 32-bit result from \fBdotprod\fR will silently
+wrap around in the event of overflow.
+
+To compute the sum of the squares of an array of signed 16-bit
+integers, use \fBsumsq\fR. This returns a 64 bit sum.
+
+\fBpeakval\fR computes the absolute value of each 16-bit element in
+the input array and returns the largest.
+
+.SH RETURN VALUES
+
+\fBinitdp\fR returns a handle that points to a control block, or NULL in
+the event of an error (such as a memory allocation failure). \fBsumsq\fR
+and \fBpeakval\fR have no error returns.
+
+.SH AUTHOR and COPYRIGHT
+Phil Karn, KA9Q (karn@ka9q.net)
diff --git a/dtest.c b/dtest.c
new file mode 100644
index 0000000..394cb03
--- /dev/null
+++ b/dtest.c
@@ -0,0 +1,99 @@
+/* Test dot-product function */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include "config.h"
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#endif
+#include "fec.h"
+
+#if HAVE_GETOPT_LONG
+struct option Options[] = { /* long spellings of the "apmstn:" short options parsed in main() */
+  {"force-altivec",0,NULL,'a'},
+  {"force-port",0,NULL,'p'},
+  {"force-mmx",0,NULL,'m'},
+  {"force-sse",0,NULL,'s'},
+  {"force-sse2",0,NULL,'t'},
+  {"trials",1,NULL,'n'}, /* takes a value (has_arg = 1), matching "n:" and the atoi(optarg) in main() */
+  {NULL},
+};
+#endif
+
+int main(int argc,char *argv[]){ /* compares dotprod() against dotprod_port() on random data and prints the mismatch count */
+  short coeffs[512];
+  short input[2048];
+  int trials=1000,d; /* default trial count; d holds the current option letter */
+  int errors = 0; /* number of trials where the two results differed */
+
+#if HAVE_GETOPT_LONG
+  while((d = getopt_long(argc,argv,"apmstn:",Options,NULL)) != EOF){
+#else
+  while((d = getopt(argc,argv,"apmstn:")) != EOF){
+#endif
+    switch(d){ /* a,p,m,s,t force Cpu_mode; n sets the trial count */
+    case 'a':
+      Cpu_mode = ALTIVEC;
+      break;
+    case 'p':
+      Cpu_mode = PORT;
+      break;
+    case 'm':
+      Cpu_mode = MMX;
+      break;
+    case 's':
+      Cpu_mode = SSE;
+      break;
+    case 't':
+      Cpu_mode = SSE2;
+      break;
+    case 'n':
+      trials = atoi(optarg);
+      break;
+    }
+  }
+
+  while(trials--){
+    long port_result;
+    long simd_result;
+    int ntaps; /* number of coefficients generated for this trial */
+    int i;
+    int csum = 0; /* running sum of |coefficient| */
+    int offset; /* start index into input[], 0..511 */
+    void *dp_simd,*dp_port;
+
+    /* Generate set of coefficients
+     * limit sum of absolute values to 32767 to avoid overflow
+     */
+    memset(coeffs,0,sizeof(coeffs));
+    for(i=0;i<512;i++){
+      double gv;
+
+      gv = normal_rand(0.,100.);
+      if(csum + fabs(gv) > 32767)
+	break;
+      coeffs[i] = gv;
+      csum += fabs(gv);
+    }
+    ntaps = i;
+
+    /* Compare the result to the portable C version for one random data buffer and offset per trial */
+    dp_simd = initdp(coeffs,ntaps);
+    dp_port = initdp_port(coeffs,ntaps);
+    
+    for(i=0;i<2048;i++)
+      input[i] = random();
+    
+    offset = random() & 511;
+
+    simd_result = dotprod(dp_simd,input+offset);
+    port_result = dotprod_port(dp_port,input+offset);
+    if(simd_result != port_result){
+      errors++;
+    }
+  } /* NOTE(review): dp_simd and dp_port are never released, so each trial leaks both descriptors */
+  printf("dtest: %d errors\n",errors);
+  exit(0); /* NOTE(review): exit status is 0 even when errors > 0 */
+}
diff --git a/encode_rs.c b/encode_rs.c
new file mode 100644
index 0000000..0649094
--- /dev/null
+++ b/encode_rs.c
@@ -0,0 +1,52 @@
+/* Reed-Solomon encoder
+ * Copyright 2002, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <string.h>
+
+#ifdef FIXED
+#include "fixed.h"
+#elif defined(BIGSYM)
+#include "int.h"
+#else
+#include "char.h"
+#endif
+
+void ENCODE_RS( /* RS encoder: reads data[], writes NROOTS parity symbols into bb[] */
+#ifdef FIXED
+data_t *data, data_t *bb,int pad){
+#else
+void *p,data_t *data, data_t *bb){
+  struct rs *rs = (struct rs *)p; /* presumably used by the NN/NROOTS/... macros of the non-FIXED headers - confirm */
+#endif
+  int i, j;
+  data_t feedback; /* feedback term, held in index (log) form */
+
+#ifdef FIXED
+  /* Check pad parameter for validity */
+  if(pad < 0 || pad >= NN)
+    return;
+#endif
+
+  memset(bb,0,NROOTS*sizeof(data_t)); /* parity register starts cleared */
+
+  for(i=0;i<NN-NROOTS-PAD;i++){ /* one pass per data symbol */
+    feedback = INDEX_OF[data[i] ^ bb[0]];
+    if(feedback != A0){      /* feedback term is non-zero */
+#ifdef UNNORMALIZED
+      /* This line is unnecessary when GENPOLY[NROOTS] is unity, as it must
+       * always be for the polynomials constructed by init_rs()
+       */
+      feedback = MODNN(NN - GENPOLY[NROOTS] + feedback);
+#endif
+      for(j=1;j<NROOTS;j++)
+	bb[j] ^= ALPHA_TO[MODNN(feedback + GENPOLY[NROOTS-j])];
+    }
+    /* Shift */
+    memmove(&bb[0],&bb[1],sizeof(data_t)*(NROOTS-1));
+    if(feedback != A0)
+      bb[NROOTS-1] = ALPHA_TO[MODNN(feedback + GENPOLY[0])];
+    else
+      bb[NROOTS-1] = 0; /* zero feedback shifts in a zero */
+  }
+}
diff --git a/encode_rs.h b/encode_rs.h
new file mode 100644
index 0000000..2c157f9
--- /dev/null
+++ b/encode_rs.h
@@ -0,0 +1,58 @@
+/* The guts of the Reed-Solomon encoder, meant to be #included
+ * into a function body with the following typedefs, macros and variables supplied
+ * according to the code parameters:
+
+ * data_t - a typedef for the data symbol
+ * data_t data[] - array of NN-NROOTS-PAD and type data_t to be encoded
+ * data_t parity[] - an array of NROOTS and type data_t to be written with parity symbols
+ * NROOTS - the number of roots in the RS code generator polynomial,
+ *          which is the same as the number of parity symbols in a block.
+            Integer variable or literal.
+	    * 
+ * NN - the total number of symbols in a RS block. Integer variable or literal.
+ * PAD - the number of pad symbols in a block. Integer variable or literal.
+ * ALPHA_TO - The address of an array of NN elements to convert Galois field
+ *            elements in index (log) form to polynomial form. Read only.
+ * INDEX_OF - The address of an array of NN elements to convert Galois field
+ *            elements in polynomial form to index (log) form. Read only.
+ * MODNN - a function to reduce its argument modulo NN. May be inline or a macro.
+ * GENPOLY - an array of NROOTS+1 elements containing the generator polynomial in index form
+
+ * The memset() and memmove() functions are used. The appropriate header
+ * file declaring these functions (usually <string.h>) must be included by the calling
+ * program.
+
+ * Copyright 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+
+
+#undef A0
+#define A0 (NN) /* Special reserved value encoding zero in index form */
+
+{
+  int sym, k;
+  data_t fb;
+
+  /* The parity register starts out cleared */
+  memset(parity,0,NROOTS*sizeof(data_t));
+
+  for(sym=0;sym<NN-NROOTS-PAD;sym++){
+    fb = INDEX_OF[data[sym] ^ parity[0]]; /* feedback term, in index (log) form */
+    if(fb == A0){
+      /* Zero feedback: nothing to add in, just shift and feed a zero */
+      memmove(&parity[0],&parity[1],sizeof(data_t)*(NROOTS-1));
+      parity[NROOTS-1] = 0;
+      continue;
+    }
+#ifdef UNNORMALIZED
+    /* Unnecessary when GENPOLY[NROOTS] is unity, as init_rs() always makes it */
+    fb = MODNN(NN - GENPOLY[NROOTS] + fb);
+#endif
+    for(k=1;k<NROOTS;k++)
+      parity[k] ^= ALPHA_TO[MODNN(fb + GENPOLY[NROOTS-k])];
+    /* Shift, then load the last stage from the GENPOLY[0] term */
+    memmove(&parity[0],&parity[1],sizeof(data_t)*(NROOTS-1));
+    parity[NROOTS-1] = ALPHA_TO[MODNN(fb + GENPOLY[0])];
+  }
+}
diff --git a/encode_rs_8.c b/encode_rs_8.c
new file mode 100644
index 0000000..5aaecca
--- /dev/null
+++ b/encode_rs_8.c
@@ -0,0 +1,109 @@
+/* Reed-Solomon encoder
+ * Copyright 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <string.h>
+#include "fixed.h"
+#ifdef __VEC__
+#include <sys/sysctl.h>
+#endif
+
+
+static enum {UNKNOWN=0,MMX,SSE,SSE2,ALTIVEC,PORT} cpu_mode; /* UNKNOWN (the zero initial value) until encode_rs_8() first runs */
+
+static void encode_rs_8_c(data_t *data, data_t *parity,int pad); /* portable C encoder, defined at the end of this file */
+#if __vec__ /* NOTE(review): lowercase __vec__ here, but the CPU probe in encode_rs_8() tests __VEC__ - confirm which is intended */
+static void encode_rs_8_av(data_t *data, data_t *parity,int pad);
+#endif
+#if __i386__
+int cpu_features(void); /* defined elsewhere; encode_rs_8() tests bits 23, 25 and 26 of the result */
+#endif
+
+void encode_rs_8(data_t *data, data_t *parity,int pad){ /* fixed 8-bit RS encoder; picks a routine by CPU type */
+  if(pad < 0 || pad >= NN) /* same validity test as ENCODE_RS in encode_rs.c; parity[] is left untouched */
+    return;
+  if(cpu_mode == UNKNOWN){ /* first call: work out the CPU type once */
+#ifdef __i386__
+    int f = cpu_features(); /* Figure out what kind of CPU we have */
+    if(f & (1<<26)){ /* SSE2 is present */
+      cpu_mode = SSE2;
+    } else if(f & (1<<25)){ /* SSE is present */
+      cpu_mode = SSE;
+    } else if(f & (1<<23)){ /* MMX is present */
+      cpu_mode = MMX;
+    } else { /* No SIMD at all */
+      cpu_mode = PORT;
+    }
+#elif __VEC__
+    /* Ask the OS if we have Altivec support */
+    int selectors[2] = { CTL_HW, HW_VECTORUNIT };
+    int hasVectorUnit = 0;
+    size_t length = sizeof(hasVectorUnit);
+    int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0);
+    if(0 == error && hasVectorUnit)
+      cpu_mode = ALTIVEC;
+    else
+      cpu_mode = PORT;
+#else
+    cpu_mode = PORT;
+#endif
+  }
+  switch(cpu_mode){
+#if __vec__
+  case ALTIVEC:
+    encode_rs_8_av(data,parity,pad);
+    return;
+#endif
+#if __i386__
+  case MMX:
+  case SSE:
+  case SSE2:
+#endif
+  default: /* every x86 mode and PORT use the portable C encoder */
+    encode_rs_8_c(data,parity,pad);
+    return;
+  }
+}
+
+#if __vec__ /* PowerPC G4/G5 Altivec path. NOTE(review): the CPU probe in encode_rs_8() tests __VEC__, not __vec__ - confirm */
+
+static vector unsigned char reverse = (vector unsigned char)(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1);
+static vector unsigned char shift_right = (vector unsigned char)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30);
+
+/* Lookup table for feedback multiplications
+ * These are the low half of the coefficients. Since the generator polynomial is
+ * palindromic, we form the other half by reversing this one
+ */
+extern static union { vector unsigned char v; unsigned char c[16]; } table[256]; /* NOTE(review): 'extern static' names two storage classes; nothing in this file fills table[] */
+
+static void encode_rs_8_av(data_t *data, data_t *parity,int pad){ /* Altivec encoder: reads data[], writes NROOTS bytes to parity[] */
+  union { vector unsigned char v[2]; unsigned char c[32]; } shift_register; /* 32 bytes, addressable as two vectors or as bytes */
+  int i;
+
+  shift_register.v[0] = (vector unsigned char)(0);
+  shift_register.v[1] = (vector unsigned char)(0);
+  
+  for(i=0;i<NN-NROOTS-pad;i++){ /* one pass per data symbol */
+    vector unsigned char feedback0,feedback1;
+    unsigned char f;
+
+    f = data[i] ^ shift_register.c[31]; /* feedback symbol: input XOR last register byte */
+    feedback1 = table[f].v;
+    feedback0 = vec_perm(feedback1,feedback1,reverse); /* bytes of feedback1 reordered by the reverse[] pattern */
+
+    /* Shift right one byte */
+    shift_register.v[1] = vec_perm(shift_register.v[0],shift_register.v[1],shift_right) ^ feedback1;
+    shift_register.v[0] = vec_sro(shift_register.v[0],(vector unsigned char)(8)) ^ feedback0;
+    shift_register.c[0] = f;
+  }
+  for(i=0;i<NROOTS;i++)
+    parity[NROOTS-i-1] = shift_register.c[i]; /* register bytes are copied out in reverse order */
+}
+#endif
+
+/* Portable C version: the body comes from encode_rs.h, which reads data[] and writes parity[] */
+static void encode_rs_8_c(data_t *data, data_t *parity,int pad){
+/* pad is not named in encode_rs.h; presumably fixed.h maps the PAD macro onto it - confirm */
+#include "encode_rs.h"
+
+}
diff --git a/encode_rs_av.c b/encode_rs_av.c
new file mode 100644
index 0000000..32e528f
--- /dev/null
+++ b/encode_rs_av.c
@@ -0,0 +1,61 @@
+/* Fast Reed-Solomon encoder for (255,223) CCSDS code on PowerPC G4/G5 using Altivec instructions
+ * Copyright 2004, Phil Karn KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <string.h>
+#include "fixed.h"
+
+/* Lookup table for feedback multiplications
+ * These are the low half of the coefficients. Since the generator polynomial is
+ * palindromic, we form it by reversing these on the fly
+ */
+static union { vector unsigned char v; unsigned char c[16]; } table[256]; /* one 16-byte row per symbol value; filled by rs_init_av() */
+
+static vector unsigned char reverse = (vector unsigned char)(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1); /* vec_perm pattern used to build feedback0 */
+static vector unsigned char shift_right = (vector unsigned char)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30); /* vec_perm pattern: bytes 15..30 of a 32-byte pair */
+
+extern data_t CCSDS_alpha_to[]; /* tables defined elsewhere; read by rs_init_av() */
+extern data_t CCSDS_index_of[];
+extern data_t CCSDS_poly[];
+
+void rs_init_av(){
+  int sym,term;
+
+  /* Row 0 is all zeros. The PowerPC is big-endian, so the low-order byte of each
+   * vector contains the highest order term in the polynomial: term t goes to byte 16-t */
+  for(term=0;term<16;term++)
+    table[0].c[term] = 0;
+  for(sym=1;sym<256;sym++)
+    for(term=1;term<=16;term++)
+      table[sym].c[16-term] = CCSDS_alpha_to[MODNN(CCSDS_poly[term] + CCSDS_index_of[sym])];
+#if 0
+  for(sym=0;sym<256;sym++){
+    printf("table[%3d] = %3vu\n",sym,table[sym].v);
+  }
+#endif
+}
+
+void encode_rs_av(unsigned char *data,unsigned char *parity,int pad){ /* Altivec encoder: reads data[], writes NROOTS bytes to parity[]; needs rs_init_av() to have filled table[] */
+  union { vector unsigned char v[2]; unsigned char c[32]; } shift_register; /* 32 bytes, addressable as two vectors or as bytes */
+  int i;
+
+  shift_register.v[0] = (vector unsigned char)(0);
+  shift_register.v[1] = (vector unsigned char)(0);
+  
+  for(i=0;i<NN-NROOTS-pad;i++){ /* one pass per data symbol */
+    vector unsigned char feedback0,feedback1;
+    unsigned char f;
+
+    f = data[i] ^ shift_register.c[31]; /* feedback symbol: input XOR last register byte */
+    feedback1 = table[f].v;
+    feedback0 = vec_perm(feedback1,feedback1,reverse); /* bytes of feedback1 reordered by the reverse[] pattern */
+
+    /* Shift right one byte */
+    shift_register.v[1] = vec_perm(shift_register.v[0],shift_register.v[1],shift_right) ^ feedback1;
+    shift_register.v[0] = vec_sro(shift_register.v[0],(vector unsigned char)(8)) ^ feedback0;
+    shift_register.c[0] = f;
+  }
+  for(i=0;i<NROOTS;i++)
+    parity[NROOTS-i-1] = shift_register.c[i]; /* register bytes are copied out in reverse order */
+}
diff --git a/encode_rs_ccsds.c b/encode_rs_ccsds.c
new file mode 100644
index 0000000..5a2ec70
--- /dev/null
+++ b/encode_rs_ccsds.c
@@ -0,0 +1,24 @@
+/* This function wraps around the fixed 8-bit encoder, performing the
+ * basis transformations necessary to meet the CCSDS standard
+ *
+ * Copyright 2002, Phil Karn, KA9Q
+ * fixed bug Aug 2007
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include "ccsds.h"
+#include "fec.h"
+/* Encode one block to the CCSDS standard: dual-basis data in, dual-basis parity out */
+void encode_rs_ccsds(data_t *data,data_t *parity,int pad){
+  int i;
+  data_t cdata[NN-NROOTS]; /* conventional-basis copy of data[] */
+
+  if(pad < 0 || pad >= NN) /* pad < 0 would overrun cdata[]; same test as ENCODE_RS in encode_rs.c */
+    return;
+  /* Convert data from dual basis to conventional */
+  for(i=0;i<NN-NROOTS-pad;i++)
+    cdata[i] = Tal1tab[data[i]];
+  encode_rs_8(cdata,parity,pad);
+  /* Convert parity from conventional to dual basis */
+  for(i=0;i<NROOTS;i++)
+    parity[i] = Taltab[parity[i]];
+}
diff --git a/encode_rs_char.c b/encode_rs_char.c
new file mode 100644
index 0000000..a9bf2b8
--- /dev/null
+++ b/encode_rs_char.c
@@ -0,0 +1,15 @@
+/* Reed-Solomon encoder
+ * Copyright 2002, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <string.h>
+
+#include "char.h"
+#include "rs-common.h"
+
+void encode_rs_char(void *p,data_t *data, data_t *parity){ /* RS encoder entry point built on char.h */
+  struct rs *rs = (struct rs *)p; /* p is the codec handle; presumably the char.h macros read the code parameters through rs - confirm */
+/* encode_rs.h supplies the whole encoder body: it reads data[] and writes NROOTS symbols into parity[] */
+#include "encode_rs.h"
+
+}
diff --git a/encode_rs_int.c b/encode_rs_int.c
new file mode 100644
index 0000000..3c9ce78
--- /dev/null
+++ b/encode_rs_int.c
@@ -0,0 +1,15 @@
+/* Reed-Solomon encoder
+ * Copyright 2003, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <string.h>
+
+#include "int.h"
+#include "rs-common.h"
+
+void encode_rs_int(void *p,data_t *data, data_t *parity){ /* RS encoder entry point built on int.h */
+  struct rs *rs = (struct rs *)p; /* p is the codec handle; presumably the int.h macros read the code parameters through rs - confirm */
+/* encode_rs.h supplies the whole encoder body: it reads data[] and writes NROOTS symbols into parity[] */
+#include "encode_rs.h"
+
+}
diff --git a/exercise.c b/exercise.c
new file mode 100644
index 0000000..8ae008c
--- /dev/null
+++ b/exercise.c
@@ -0,0 +1,122 @@
+/* Exercise an RS codec a specified number of times using random
+ * data and error patterns
+ *
+ * Copyright 2002 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#define FLAG_ERASURE 1 /* Randomly flag 50% of errors as erasures */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef FIXED
+#include "fixed.h"
+#define EXERCISE exercise_8
+#elif defined(CCSDS)
+#include "fixed.h"
+#include "ccsds.h"
+#define EXERCISE exercise_ccsds
+#elif defined(BIGSYM)
+#include "int.h"
+#define EXERCISE exercise_int
+#else
+#include "char.h"
+#define EXERCISE exercise_char
+#endif
+
+#ifdef FIXED
+#define PRINTPARM printf("(255,223):");
+#elif defined(CCSDS)
+#define PRINTPARM printf("CCSDS (255,223):");
+#else
+#define PRINTPARM printf("(%d,%d):",rs->nn,rs->nn-rs->nroots);
+#endif
+
+/* Exercise the RS codec passed as an argument: each trial encodes random data, injects 0..NROOTS/2 errors, decodes and checks; returns the number of failed checks */
+int EXERCISE(
+#if !defined(CCSDS) && !defined(FIXED)
+void *p,
+#endif
+int trials){
+#if !defined(CCSDS) && !defined(FIXED)
+  struct rs *rs = (struct rs *)p;
+#endif
+  data_t block[NN],tblock[NN]; /* block: reference codeword; tblock: copy that gets corrupted and decoded */
+  int i;
+  int errors; /* number of errors injected in the current pass */
+  int errlocs[NN]; /* errlocs[k] != 0 marks position k as deliberately corrupted */
+  int derrlocs[NROOTS]; /* erasure positions given to the decoder; read back afterwards as its reported locations */
+  int derrors; /* value returned by the decoder */
+  int errval,errloc;
+  int erasures; /* how many entries of derrlocs[] were flagged as erasures */
+  int decoder_errors = 0; /* running count of failed checks; this is the return value */
+
+  while(trials-- != 0){ /* ends only when the counter hits exactly 0, so trials is expected to be >= 0 */
+    /* Test up to the error correction capacity of the code */
+    for(errors=0;errors <= NROOTS/2;errors++){
+
+      /* Load block with random data and encode */
+      for(i=0;i<NN-NROOTS;i++)
+	block[i] = random() & NN;
+      
+#if defined(CCSDS) || defined(FIXED)
+      ENCODE_RS(&block[0],&block[NN-NROOTS],0);
+#else
+      ENCODE_RS(rs,&block[0],&block[NN-NROOTS]);
+#endif
+
+      /* Make temp copy, seed with errors */
+      memcpy(tblock,block,sizeof(tblock));
+      memset(errlocs,0,sizeof(errlocs));
+      memset(derrlocs,0,sizeof(derrlocs));
+      erasures=0;
+      for(i=0;i<errors;i++){
+	do {
+	  errval = random() & NN;
+	} while(errval == 0); /* Error value must be nonzero */
+
+	do {
+	  errloc = random() % NN;
+	} while(errlocs[errloc] != 0); /* Must not choose the same location twice */
+
+	errlocs[errloc] = 1;
+
+#if FLAG_ERASURE
+	if(random() & 1) /* 50-50 chance */
+	  derrlocs[erasures++] = errloc;
+#endif
+	tblock[errloc] ^= errval; /* XOR with a nonzero value guarantees the symbol changes */
+      }
+
+      /* Decode the errored block */
+#if defined(CCSDS) || defined(FIXED)
+      derrors = DECODE_RS(tblock,derrlocs,erasures,0);
+#else
+      derrors = DECODE_RS(rs,tblock,derrlocs,erasures);
+#endif
+
+      if(derrors != errors){ /* check 1: reported count must equal the number injected */
+	PRINTPARM
+	printf(" decoder says %d errors, true number is %d\n",derrors,errors);
+	decoder_errors++;
+      }
+      for(i=0;i<derrors;i++){ /* check 2: every reported location must be one that was corrupted */
+	if(errlocs[derrlocs[i]] == 0){
+	  PRINTPARM
+	  printf(" decoder indicates error in location %d without error\n",derrlocs[i]);
+	  decoder_errors++;
+	}
+      }
+      if(memcmp(tblock,block,sizeof(tblock)) != 0){ /* check 3: decoded block must equal the original codeword */
+	PRINTPARM
+	printf(" uncorrected errors! output ^ input:");
+	decoder_errors++;
+	for(i=0;i<NN;i++)
+	  printf(" %02x",tblock[i] ^ block[i]);
+	printf("\n");
+      }
+    }
+  }
+  return decoder_errors;
+}
diff --git a/fec.c b/fec.c
new file mode 100644
index 0000000..35960c3
--- /dev/null
+++ b/fec.c
@@ -0,0 +1,66 @@
+/* Utility routines for FEC support
+ * Copyright 2004, Phil Karn, KA9Q
+ */
+
+#include <stdio.h>
+#include "fec.h"
+
+unsigned char Partab[256];
+int P_init;
+
+/* Create 256-entry odd-parity lookup table
+ * Needed only on non-ia32 machines
+ */
+void partab_init(void){
+  int value,bits,odd;
+
+  /* Entry v becomes 1 when v has an odd number of set bits, else 0 */
+  for(value=0;value<256;value++){
+    odd = 0;
+
+    /* XOR the bits together one at a time, lowest bit first */
+    for(bits=value;bits != 0;bits >>= 1)
+      odd ^= bits & 1;
+
+    Partab[value] = odd;
+  }
+  /* Record that the table has been filled in */
+  P_init = 1;
+}
+
+/* Lookup table: Bitcnt[n] is the count of 1 bits in n, for n = 0-255 (8 entries per row) */
+int Bitcnt[] = {
+ 0, 1, 1, 2, 1, 2, 2, 3,
+ 1, 2, 2, 3, 2, 3, 3, 4,
+ 1, 2, 2, 3, 2, 3, 3, 4,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 1, 2, 2, 3, 2, 3, 3, 4,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 1, 2, 2, 3, 2, 3, 3, 4,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 4, 5, 5, 6, 5, 6, 6, 7,
+ 1, 2, 2, 3, 2, 3, 3, 4,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 4, 5, 5, 6, 5, 6, 6, 7,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 4, 5, 5, 6, 5, 6, 6, 7,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 4, 5, 5, 6, 5, 6, 6, 7,
+ 4, 5, 5, 6, 5, 6, 6, 7,
+ 5, 6, 6, 7, 6, 7, 7, 8,
+};
+
diff --git a/fec.h b/fec.h
new file mode 100644
index 0000000..08e8454
--- /dev/null
+++ b/fec.h
@@ -0,0 +1,347 @@
+/* User include file for libfec
+ * Copyright 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+
+#ifndef _FEC_H_
+#define _FEC_H_
+
+/* r=1/2 k=7 convolutional encoder polynomials
+ * The NASA-DSN convention is to use V27POLYA inverted, then V27POLYB
+ * The CCSDS/NASA-GSFC convention is to use V27POLYB, then V27POLYA inverted
+ */
+#define	V27POLYA	0x6d
+#define	V27POLYB	0x4f
+
+void *create_viterbi27(int len);
+void set_viterbi27_polynomial(int polys[2]);
+int init_viterbi27(void *vp,int starting_state);
+int update_viterbi27_blk(void *vp,unsigned char sym[],int npairs);
+int chainback_viterbi27(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi27(void *vp);
+
+#ifdef __VEC__
+void *create_viterbi27_av(int len);
+void set_viterbi27_polynomial_av(int polys[2]);
+int init_viterbi27_av(void *p,int starting_state);
+int chainback_viterbi27_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi27_av(void *p);
+int update_viterbi27_blk_av(void *p,unsigned char *syms,int nbits);
+#endif
+
+#ifdef __i386__
+void *create_viterbi27_mmx(int len);
+void set_viterbi27_polynomial_mmx(int polys[2]);
+int init_viterbi27_mmx(void *p,int starting_state);
+int chainback_viterbi27_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi27_mmx(void *p);
+int update_viterbi27_blk_mmx(void *p,unsigned char *syms,int nbits);
+
+void *create_viterbi27_sse(int len);
+void set_viterbi27_polynomial_sse(int polys[2]);
+int init_viterbi27_sse(void *p,int starting_state);
+int chainback_viterbi27_sse(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi27_sse(void *p);
+int update_viterbi27_blk_sse(void *p,unsigned char *syms,int nbits);
+
+void *create_viterbi27_sse2(int len);
+void set_viterbi27_polynomial_sse2(int polys[2]);
+int init_viterbi27_sse2(void *p,int starting_state);
+int chainback_viterbi27_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi27_sse2(void *p);
+int update_viterbi27_blk_sse2(void *p,unsigned char *syms,int nbits);
+#endif
+
+void *create_viterbi27_port(int len);
+void set_viterbi27_polynomial_port(int polys[2]);
+int init_viterbi27_port(void *p,int starting_state);
+int chainback_viterbi27_port(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi27_port(void *p);
+int update_viterbi27_blk_port(void *p,unsigned char *syms,int nbits);
+
+/* r=1/2 k=9 convolutional encoder polynomials */
+#define	V29POLYA	0x1af
+#define	V29POLYB	0x11d
+
+void *create_viterbi29(int len);
+void set_viterbi29_polynomial(int polys[2]);
+int init_viterbi29(void *vp,int starting_state);
+int update_viterbi29_blk(void *vp,unsigned char syms[],int nbits);
+int chainback_viterbi29(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi29(void *vp);
+
+#ifdef __VEC__
+void *create_viterbi29_av(int len);
+void set_viterbi29_polynomial_av(int polys[2]);
+int init_viterbi29_av(void *p,int starting_state);
+int chainback_viterbi29_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi29_av(void *p);
+int update_viterbi29_blk_av(void *p,unsigned char *syms,int nbits);
+#endif
+
+#ifdef __i386__
+void *create_viterbi29_mmx(int len);
+void set_viterbi29_polynomial_mmx(int polys[2]);
+int init_viterbi29_mmx(void *p,int starting_state);
+int chainback_viterbi29_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi29_mmx(void *p);
+int update_viterbi29_blk_mmx(void *p,unsigned char *syms,int nbits);
+
+void *create_viterbi29_sse(int len);
+void set_viterbi29_polynomial_sse(int polys[2]);
+int init_viterbi29_sse(void *p,int starting_state);
+int chainback_viterbi29_sse(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi29_sse(void *p);
+int update_viterbi29_blk_sse(void *p,unsigned char *syms,int nbits);
+
+void *create_viterbi29_sse2(int len);
+void set_viterbi29_polynomial_sse2(int polys[2]);
+int init_viterbi29_sse2(void *p,int starting_state);
+int chainback_viterbi29_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi29_sse2(void *p);
+int update_viterbi29_blk_sse2(void *p,unsigned char *syms,int nbits);
+#endif
+
+void *create_viterbi29_port(int len);
+void set_viterbi29_polynomial_port(int polys[2]);
+int init_viterbi29_port(void *p,int starting_state);
+int chainback_viterbi29_port(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi29_port(void *p);
+int update_viterbi29_blk_port(void *p,unsigned char *syms,int nbits);
+
+/* r=1/3 k=9 convolutional encoder polynomials */
+#define	V39POLYA	0x1ed
+#define	V39POLYB	0x19b
+#define	V39POLYC	0x127
+
+void *create_viterbi39(int len);
+void set_viterbi39_polynomial(int polys[3]);
+int init_viterbi39(void *vp,int starting_state);
+int update_viterbi39_blk(void *vp,unsigned char syms[],int nbits);
+int chainback_viterbi39(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi39(void *vp);
+
+#ifdef __VEC__
+void *create_viterbi39_av(int len);
+void set_viterbi39_polynomial_av(int polys[3]);
+int init_viterbi39_av(void *p,int starting_state);
+int chainback_viterbi39_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi39_av(void *p);
+int update_viterbi39_blk_av(void *p,unsigned char *syms,int nbits);
+#endif
+
+#ifdef __i386__
+void *create_viterbi39_mmx(int len);
+void set_viterbi39_polynomial_mmx(int polys[3]);
+int init_viterbi39_mmx(void *p,int starting_state);
+int chainback_viterbi39_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi39_mmx(void *p);
+int update_viterbi39_blk_mmx(void *p,unsigned char *syms,int nbits);
+
+void *create_viterbi39_sse(int len);
+void set_viterbi39_polynomial_sse(int polys[3]);
+int init_viterbi39_sse(void *p,int starting_state);
+int chainback_viterbi39_sse(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi39_sse(void *p);
+int update_viterbi39_blk_sse(void *p,unsigned char *syms,int nbits);
+
+void *create_viterbi39_sse2(int len);
+void set_viterbi39_polynomial_sse2(int polys[3]);
+int init_viterbi39_sse2(void *p,int starting_state);
+int chainback_viterbi39_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi39_sse2(void *p);
+int update_viterbi39_blk_sse2(void *p,unsigned char *syms,int nbits);
+#endif
+
+void *create_viterbi39_port(int len);
+void set_viterbi39_polynomial_port(int polys[3]);
+int init_viterbi39_port(void *p,int starting_state);
+int chainback_viterbi39_port(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi39_port(void *p);
+int update_viterbi39_blk_port(void *p,unsigned char *syms,int nbits);
+
+
+/* r=1/6 k=15 Cassini convolutional encoder polynomials without symbol inversion
+ * dfree = 56
+ * These bits may be left-right flipped from some textbook representations;
+ * here I have the bits entering the shift register from the right (low) end
+ *
+ * Some other spacecraft use the same code, but with the polynomials in a different order.
+ * E.g., Mars Pathfinder and STEREO swap POLYC and POLYD. All use alternate symbol inversion,
+ * so use set_viterbi615_polynomial() as appropriate.
+ */
+#define	V615POLYA	042631
+#define	V615POLYB	047245
+#define V615POLYC       056507
+#define V615POLYD       073363
+#define V615POLYE       077267
+#define V615POLYF       064537
+
+void *create_viterbi615(int len);
+void set_viterbi615_polynomial(int polys[6]);
+int init_viterbi615(void *vp,int starting_state);
+int update_viterbi615_blk(void *vp,unsigned char *syms,int nbits);
+int chainback_viterbi615(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi615(void *vp);
+
+#ifdef __VEC__
+void *create_viterbi615_av(int len);
+void set_viterbi615_polynomial_av(int polys[6]);
+int init_viterbi615_av(void *p,int starting_state);
+int chainback_viterbi615_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi615_av(void *p);
+int update_viterbi615_blk_av(void *p,unsigned char *syms,int nbits);
+#endif
+
+#ifdef __i386__
+void *create_viterbi615_mmx(int len);
+void set_viterbi615_polynomial_mmx(int polys[6]);
+int init_viterbi615_mmx(void *p,int starting_state);
+int chainback_viterbi615_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi615_mmx(void *p);
+int update_viterbi615_blk_mmx(void *p,unsigned char *syms,int nbits);
+
+void *create_viterbi615_sse(int len);
+void set_viterbi615_polynomial_sse(int polys[6]);
+int init_viterbi615_sse(void *p,int starting_state);
+int chainback_viterbi615_sse(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi615_sse(void *p);
+int update_viterbi615_blk_sse(void *p,unsigned char *syms,int nbits);
+
+void *create_viterbi615_sse2(int len);
+void set_viterbi615_polynomial_sse2(int polys[6]);
+int init_viterbi615_sse2(void *p,int starting_state);
+int chainback_viterbi615_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi615_sse2(void *p);
+int update_viterbi615_blk_sse2(void *p,unsigned char *syms,int nbits);
+
+#endif
+
+void *create_viterbi615_port(int len);
+void set_viterbi615_polynomial_port(int polys[6]);
+int init_viterbi615_port(void *p,int starting_state);
+int chainback_viterbi615_port(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi615_port(void *p);
+int update_viterbi615_blk_port(void *p,unsigned char *syms,int nbits);
+
+
+/* General purpose RS codec, 8-bit symbols */
+void encode_rs_char(void *rs,unsigned char *data,unsigned char *parity);
+int decode_rs_char(void *rs,unsigned char *data,int *eras_pos,
+		   int no_eras);
+void *init_rs_char(int symsize,int gfpoly,
+		   int fcr,int prim,int nroots,
+		   int pad);
+void free_rs_char(void *rs);
+
+/* General purpose RS codec, integer symbols */
+void encode_rs_int(void *rs,int *data,int *parity);
+int decode_rs_int(void *rs,int *data,int *eras_pos,int no_eras);
+void *init_rs_int(int symsize,int gfpoly,int fcr,
+		  int prim,int nroots,int pad);
+void free_rs_int(void *rs);
+
+/* CCSDS standard (255,223) RS codec with conventional (*not* dual-basis)
+ * symbol representation
+ */
+void encode_rs_8(unsigned char *data,unsigned char *parity,int pad);
+int decode_rs_8(unsigned char *data,int *eras_pos,int no_eras,int pad);
+
+/* CCSDS standard (255,223) RS codec with dual-basis symbol representation */
+void encode_rs_ccsds(unsigned char *data,unsigned char *parity,int pad);
+int decode_rs_ccsds(unsigned char *data,int *eras_pos,int no_eras,int pad);
+
+/* Tables to map from conventional->dual (Taltab) and
+ * dual->conventional (Tal1tab) bases
+ */
+extern unsigned char Taltab[],Tal1tab[];
+
+
+/* CPU SIMD instruction set available */
+extern enum cpu_mode {UNKNOWN=0,PORT,MMX,SSE,SSE2,ALTIVEC} Cpu_mode;
+void find_cpu_mode(void); /* Call this once at startup to set Cpu_mode */
+
+/* Determine parity of argument: 1 = odd, 0 = even */
+#ifdef __i386__
+static inline int parityb(unsigned char x){ /* i386 version: reads the CPU parity flag instead of using a table */
+  __asm__ __volatile__ ("test %1,%1;setpo %0" : "=g"(x) : "r" (x)); /* test sets the flags from x; setpo stores 1 for odd parity, 0 for even. NOTE(review): "=g" admits any general operand while setpo needs a byte location - confirm the constraint */
+  return x;
+}
+#else
+void partab_init(void); /* fills Partab[] and sets P_init; defined in fec.c */
+/* Portable version: look the parity of x up in Partab[], building the table on first use */
+static inline int parityb(unsigned char x){
+  extern unsigned char Partab[256];
+  extern int P_init;
+  if(!P_init)
+    partab_init();
+
+  return Partab[x];
+}
+#endif
+
+
+static inline int parity(int x){
+  /* XOR the upper 16 bits into the lower 16, then the second byte into the first */
+  int folded = x ^ (x >> 16);
+  folded ^= folded >> 8;
+  return parityb(folded); /* parityb() takes an unsigned char, so only the low byte is used */
+}
+
+/* Useful utilities for simulation */
+double normal_rand(double mean, double std_dev);
+unsigned char addnoise(int sym,double amp,double gain,double offset,int clip);
+
+extern int Bitcnt[];
+
+/* Dot product functions */
+void *initdp(signed short coeffs[],int len);
+void freedp(void *dp);
+long dotprod(void *dp,signed short a[]);
+
+void *initdp_port(signed short coeffs[],int len);
+void freedp_port(void *dp);
+long dotprod_port(void *dp,signed short a[]);
+
+#ifdef __i386__
+void *initdp_mmx(signed short coeffs[],int len);
+void freedp_mmx(void *dp);
+long dotprod_mmx(void *dp,signed short a[]);
+
+void *initdp_sse(signed short coeffs[],int len);
+void freedp_sse(void *dp);
+long dotprod_sse(void *dp,signed short a[]);
+
+void *initdp_sse2(signed short coeffs[],int len);
+void freedp_sse2(void *dp);
+long dotprod_sse2(void *dp,signed short a[]);
+#endif
+
+#ifdef __VEC__
+void *initdp_av(signed short coeffs[],int len);
+void freedp_av(void *dp);
+long dotprod_av(void *dp,signed short a[]);
+#endif
+
+/* Sum of squares - accepts signed shorts, produces unsigned long long */
+unsigned long long sumsq(signed short *in,int cnt);
+unsigned long long sumsq_port(signed short *in,int cnt);
+
+#ifdef __i386__
+unsigned long long sumsq_mmx(signed short *in,int cnt);
+unsigned long long sumsq_sse(signed short *in,int cnt);
+unsigned long long sumsq_sse2(signed short *in,int cnt);
+#endif
+#ifdef __VEC__
+unsigned long long sumsq_av(signed short *in,int cnt);
+#endif
+
+
+/* Low-level data structures and routines */
+
+int cpu_features(void);
+
+#endif /* _FEC_H_ */
+
+
+
diff --git a/fixed.h b/fixed.h
new file mode 100644
index 0000000..0ff27b2
--- /dev/null
+++ b/fixed.h
@@ -0,0 +1,33 @@
+/* Stuff specific to the CCSDS (255,223) RS codec
+ * (255,223) code over GF(256). Note: the conventional basis is still
+ * used; the dual-basis mappings are performed in [en|de]code_rs_ccsds.c
+ *
+ * Copyright 2003 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+typedef unsigned char data_t;
+
+static inline int mod255(int x){
+  /* Remove one 255, then fold the high bits onto the low byte (256 == 1 mod 255); repeat until x < 255 */
+  for(;x >= 255;x = (x >> 8) + (x & 255))
+    x -= 255;
+
+  return x;
+}
+#define MODNN(x) mod255(x)
+
+extern data_t CCSDS_alpha_to[];
+extern data_t CCSDS_index_of[];
+extern data_t CCSDS_poly[];
+
+#define MM 8
+#define NN 255
+#define ALPHA_TO CCSDS_alpha_to
+#define INDEX_OF CCSDS_index_of
+#define GENPOLY CCSDS_poly
+#define NROOTS 32
+#define FCR 112
+#define PRIM 11
+#define IPRIM 116
+#define PAD pad
+
diff --git a/gen_ccsds.c b/gen_ccsds.c
new file mode 100644
index 0000000..e1e2e26
--- /dev/null
+++ b/gen_ccsds.c
@@ -0,0 +1,39 @@
+/* Generate tables for CCSDS code
+ * Copyright 2002 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "char.h"
+#include "rs-common.h"
+#include "fec.h"
+
+int main(void){ /* Prints the CCSDS field and generator-polynomial tables as C source on stdout */
+  struct rs *rs;
+  int i;
+
+  rs = init_rs_char(8,0x187,112,11,32,0); /* CCSDS standard */
+  assert(rs != NULL);
+  /* Emit the tables as unsigned char: fixed.h declares them as data_t, and entries exceed 127 */
+  printf("unsigned char CCSDS_alpha_to[] = {");
+  for(i=0;i<256;i++){
+    if((i % 16) == 0)
+      printf("\n");
+    printf("0x%02x,",rs->alpha_to[i]);
+  }
+  printf("\n};\n\nunsigned char CCSDS_index_of[] = {");
+  for(i=0;i<256;i++){
+    if((i % 16) == 0)
+      printf("\n");
+    printf("%3d,",rs->index_of[i]);
+  }
+  printf("\n};\n\nunsigned char CCSDS_poly[] = {");
+  for(i=0;i<33;i++){ /* nroots+1 = 33 generator polynomial coefficients */
+    if((i % 16) == 0)
+      printf("\n");
+    printf("%3d,",rs->genpoly[i]);
+  }
+  printf("\n};\n");
+  exit(0);
+}
diff --git a/gen_ccsds_tal.c b/gen_ccsds_tal.c
new file mode 100644
index 0000000..fc75503
--- /dev/null
+++ b/gen_ccsds_tal.c
@@ -0,0 +1,53 @@
+/* Conversion lookup tables from conventional alpha to Berlekamp's
+ * dual-basis representation. Used in the CCSDS version only.
+ * taltab[] -- convert conventional to dual basis
+ * tal1tab[] -- convert dual basis to conventional
+
+ * Note: the actual RS encoder/decoder works with the conventional basis.
+ * So data is converted from dual to conventional basis before either
+ * encoding or decoding and then converted back.
+ *
+ * Copyright 2002 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+
+#define DTYPE unsigned char
+DTYPE Taltab[256],Tal1tab[256];
+
+static DTYPE tal[] = { 0x8d, 0xef, 0xec, 0x86, 0xfa, 0x99, 0xaf, 0x7b };
+
+/* Generate conversion lookup tables between conventional alpha representation
+ * (@**7, @**6, ...@**0)
+ *  and Berlekamp's dual basis representation
+ * (l0, l1, ...l7)
+ */
+int main(){
+  int val,bit,n;
+  DTYPE dual;
+
+  for(val=0;val<256;val++){
+    dual = 0;
+    for(bit=0;bit<8;bit++) /* input bit k selects row tal[7-k]; selected rows are XORed together */
+      if(val & (1<<bit))
+        dual ^= tal[7-bit];
+    Taltab[val] = dual;
+    Tal1tab[dual] = val; /* record the inverse mapping as we go */
+  }
+  printf("unsigned char Taltab[] = {\n");
+  for(n=0;n<256;n++){
+    if(n % 16 == 0)
+      printf("\n");
+    printf("0x%02x,",Taltab[n]);
+  }
+  printf("\n};\n\nunsigned char Tal1tab[] = {");
+  for(n=0;n<256;n++){
+    if(n % 16 == 0)
+      printf("\n");
+    printf("0x%02x,",Tal1tab[n]);
+  }
+  printf("\n};\n");
+  exit(0);
+}
+
diff --git a/init_rs.c b/init_rs.c
new file mode 100644
index 0000000..ef1cf47
--- /dev/null
+++ b/init_rs.c
@@ -0,0 +1,39 @@
+/* Initialize a RS codec
+ *
+ * Copyright 2002 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdlib.h>
+#include "fec.h"
+
+#if !defined(NULL)
+#define NULL ((void *)0)
+#endif
+
+#include "rs-common.h"
+
+void free_rs(void *p){ /* Release the control block and the three tables hanging off it */
+  struct rs *rs = (struct rs *)p;
+  if(rs == NULL) return; /* a failed init yields NULL; ignore it as free(NULL) would */
+  free(rs->alpha_to);
+  free(rs->index_of);
+  free(rs->genpoly);
+  free(rs);
+}
+
+/* Initialize a Reed-Solomon codec
+ * symsize = symbol size, bits
+ * gfpoly = Field generator polynomial coefficients
+ * fcr = first root of RS code generator polynomial, index form
+ * prim = primitive element to generate polynomial roots
+ * nroots = RS code generator polynomial degree (number of roots)
+ * pad = padding bytes at front of shortened block
+ */
+void *init_rs_common(int symsize,int gfpoly,int fcr,int prim,
+	int nroots,int pad){
+  struct rs *rs; /* set by the included code: the new control block, or NULL on failure */
+  /* init_rs.h is a brace-enclosed block that validates the arguments and fills in *rs */
+#include "init_rs.h"
+  /* NOTE(review): this file includes neither char.h nor int.h, so where data_t comes from is unclear - confirm it is built */
+  return rs;
+}
diff --git a/init_rs.h b/init_rs.h
new file mode 100644
index 0000000..2b2ae98
--- /dev/null
+++ b/init_rs.h
@@ -0,0 +1,106 @@
+/* Common code for initializing a Reed-Solomon control block (char or int symbols)
+ * Copyright 2004 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#undef NULL
+#define NULL ((void *)0)
+
+{
+  int i, j, sr,root,iprim;
+  /* Fills in *rs from the including function's arguments; rs stays NULL on any failure */
+  rs = NULL;
+  /* Check parameter ranges; symsize is also capped so that 1<<symsize fits in an int */
+  if(symsize < 0 || symsize > 8*sizeof(data_t) || symsize >= 8*sizeof(int)-1)
+    goto done;
+  if(fcr < 0 || fcr >= (1<<symsize))
+    goto done;
+  if(prim <= 0 || prim >= (1<<symsize))
+    goto done;
+  if(nroots < 0 || nroots >= (1<<symsize))
+    goto done; /* Can't have more roots than symbol values! */
+  if(pad < 0 || pad >= ((1<<symsize) -1 - nroots))
+    goto done; /* Too much padding */
+  /* prim must be coprime to nn, else the iprim search below can never succeed */
+  for(i=(1<<symsize)-1,j=prim;j != 0;sr=i%j,i=j,j=sr)
+    ; /* Euclid: leaves gcd(nn,prim) in i */
+  if(i != 1)
+    goto done;
+
+  rs = (struct rs *)calloc(1,sizeof(struct rs));
+  if(rs == NULL)
+    goto done;
+  rs->mm = symsize;
+  rs->nn = (1<<symsize)-1;
+  rs->pad = pad;
+
+  rs->alpha_to = (data_t *)malloc(sizeof(data_t)*(rs->nn+1));
+  if(rs->alpha_to == NULL){
+    free(rs);
+    rs = NULL;
+    goto done;
+  }
+  rs->index_of = (data_t *)malloc(sizeof(data_t)*(rs->nn+1));
+  if(rs->index_of == NULL){
+    free(rs->alpha_to);
+    free(rs);
+    rs = NULL;
+    goto done;
+  }
+
+  /* Generate Galois field lookup tables */
+  rs->index_of[0] = A0; /* log(zero) = -inf */
+  rs->alpha_to[A0] = 0; /* alpha**-inf = 0 */
+  sr = 1;
+  for(i=0;i<rs->nn;i++){
+    rs->index_of[sr] = i;
+    rs->alpha_to[i] = sr;
+    sr <<= 1;
+    if(sr & (1<<symsize))
+      sr ^= gfpoly;
+    sr &= rs->nn;
+  }
+  if(sr != 1){
+    /* field generator polynomial is not primitive! */
+    free(rs->alpha_to);
+    free(rs->index_of);
+    free(rs);
+    rs = NULL;
+    goto done;
+  }
+
+  /* Form RS code generator polynomial from its roots */
+  rs->genpoly = (data_t *)malloc(sizeof(data_t)*(nroots+1));
+  if(rs->genpoly == NULL){
+    free(rs->alpha_to);
+    free(rs->index_of);
+    free(rs);
+    rs = NULL;
+    goto done;
+  }
+  rs->fcr = fcr;
+  rs->prim = prim;
+  rs->nroots = nroots;
+
+  /* Find prim-th root of 1, used in decoding */
+  for(iprim=1;(iprim % prim) != 0;iprim += rs->nn)
+    ;
+  rs->iprim = iprim / prim;
+
+  rs->genpoly[0] = 1;
+  for (i = 0,root=fcr*prim; i < nroots; i++,root += prim) {
+    rs->genpoly[i+1] = 1;
+    /* Multiply rs->genpoly[] by  @**(root + x) */
+    for (j = i; j > 0; j--){
+      if (rs->genpoly[j] != 0)
+	rs->genpoly[j] = rs->genpoly[j-1] ^ rs->alpha_to[modnn(rs,rs->index_of[rs->genpoly[j]] + root)];
+      else
+	rs->genpoly[j] = rs->genpoly[j-1];
+    }
+    /* rs->genpoly[0] can never be zero */
+    rs->genpoly[0] = rs->alpha_to[modnn(rs,rs->index_of[rs->genpoly[0]] + root)];
+  }
+  /* convert rs->genpoly[] to index form for quicker encoding */
+  for (i = 0; i <= nroots; i++)
+    rs->genpoly[i] = rs->index_of[rs->genpoly[i]];
+ done:;
+}
diff --git a/init_rs_char.c b/init_rs_char.c
new file mode 100644
index 0000000..a51099a
--- /dev/null
+++ b/init_rs_char.c
@@ -0,0 +1,35 @@
+/* Initialize a RS codec
+ *
+ * Copyright 2002 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdlib.h>
+
+#include "char.h"
+#include "rs-common.h"
+
+void free_rs_char(void *p){ /* Release everything allocated by init_rs_char() */
+  struct rs *rs = (struct rs *)p;
+  if(rs == NULL) return; /* a failed init yields NULL; ignore it as free(NULL) would */
+  free(rs->alpha_to);
+  free(rs->index_of);
+  free(rs->genpoly);
+  free(rs);
+}
+
+/* Initialize a Reed-Solomon codec
+ * symsize = symbol size, bits
+ * gfpoly = Field generator polynomial coefficients
+ * fcr = first root of RS code generator polynomial, index form
+ * prim = primitive element to generate polynomial roots
+ * nroots = RS code generator polynomial degree (number of roots)
+ * pad = padding bytes at front of shortened block
+ */
+void *init_rs_char(int symsize,int gfpoly,int fcr,int prim,
+	int nroots,int pad){
+  struct rs *rs; /* set by the included code: the new control block, or NULL on failure */
+  /* init_rs.h is a brace-enclosed block that validates the arguments and fills in *rs */
+#include "init_rs.h"
+
+  return rs;
+}
diff --git a/init_rs_int.c b/init_rs_int.c
new file mode 100644
index 0000000..a6036c2
--- /dev/null
+++ b/init_rs_int.c
@@ -0,0 +1,35 @@
+/* Initialize a RS codec
+ *
+ * Copyright 2002 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdlib.h>
+
+#include "int.h"
+#include "rs-common.h"
+
+void free_rs_int(void *p){ /* Release everything allocated by init_rs_int() */
+  struct rs *rs = (struct rs *)p;
+  if(rs == NULL) return; /* a failed init yields NULL; ignore it as free(NULL) would */
+  free(rs->alpha_to);
+  free(rs->index_of);
+  free(rs->genpoly);
+  free(rs);
+}
+
+/* Initialize a Reed-Solomon codec
+ * symsize = symbol size, bits
+ * gfpoly = Field generator polynomial coefficients
+ * fcr = first root of RS code generator polynomial, index form
+ * prim = primitive element to generate polynomial roots
+ * nroots = RS code generator polynomial degree (number of roots)
+ * pad = padding bytes at front of shortened block
+ */
+void *init_rs_int(int symsize,int gfpoly,int fcr,int prim,
+	int nroots,int pad){
+  struct rs *rs; /* set by the included code: the new control block, or NULL on failure */
+  /* init_rs.h is a brace-enclosed block that validates the arguments and fills in *rs */
+#include "init_rs.h"
+
+  return rs;
+}
diff --git a/int.h b/int.h
new file mode 100644
index 0000000..46e865d
--- /dev/null
+++ b/int.h
@@ -0,0 +1,22 @@
+/* Stuff specific to the general (integer) version of the Reed-Solomon codecs
+ *
+ * Copyright 2003, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+typedef unsigned int data_t;
+
+#define MODNN(x) modnn(rs,x)
+
+#define MM (rs->mm)
+#define NN (rs->nn)
+#define ALPHA_TO (rs->alpha_to) 
+#define INDEX_OF (rs->index_of)
+#define GENPOLY (rs->genpoly)
+#define NROOTS (rs->nroots)
+#define FCR (rs->fcr)
+#define PRIM (rs->prim)
+#define IPRIM (rs->iprim)
+#define PAD (rs->pad)
+#define A0 (NN)
+
+
diff --git a/lesser.txt b/lesser.txt
new file mode 100644
index 0000000..b1e3f5a
--- /dev/null
+++ b/lesser.txt
@@ -0,0 +1,504 @@
+		  GNU LESSER GENERAL PUBLIC LICENSE
+		       Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+     59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL.  It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+  This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it.  You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations below.
+
+  When we speak of free software, we are referring to freedom of use,
+not price.  Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+  To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights.  These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+  For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you.  You must make sure that they, too, receive or can get the source
+code.  If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it.  And you must show them these terms so they know their rights.
+
+  We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+  To protect each distributor, we want to make it very clear that
+there is no warranty for the free library.  Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+
+  Finally, software patents pose a constant threat to the existence of
+any free program.  We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder.  Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+  Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License.  This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License.  We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+  When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library.  The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom.  The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+  We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License.  It also provides other free software developers Less
+of an advantage over competing non-free programs.  These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries.  However, the Lesser license provides advantages in certain
+special circumstances.
+
+  For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it becomes
+a de-facto standard.  To achieve this, non-free programs must be
+allowed to use the library.  A more frequent case is that a free
+library does the same job as widely used non-free libraries.  In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+  In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software.  For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+  Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.  Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library".  The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+
+		  GNU LESSER GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+  A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+  The "Library", below, refers to any such software library or work
+which has been distributed under these terms.  A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language.  (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+  "Source code" for a work means the preferred form of the work for
+making modifications to it.  For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control compilation
+and installation of the library.
+
+  Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it).  Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+  
+  1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+  You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+  2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) The modified work must itself be a software library.
+
+    b) You must cause the files modified to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    c) You must cause the whole of the work to be licensed at no
+    charge to all third parties under the terms of this License.
+
+    d) If a facility in the modified Library refers to a function or a
+    table of data to be supplied by an application program that uses
+    the facility, other than as an argument passed when the facility
+    is invoked, then you must make a good faith effort to ensure that,
+    in the event an application does not supply such function or
+    table, the facility still operates, and performs whatever part of
+    its purpose remains meaningful.
+
+    (For example, a function in a library to compute square roots has
+    a purpose that is entirely well-defined independent of the
+    application.  Therefore, Subsection 2d requires that any
+    application-supplied function or table used by this function must
+    be optional: if the application does not supply it, the square
+    root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library.  To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License.  (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.)  Do not make any other change in
+these notices.
+
+  Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+  This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+  4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+  If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library".  Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+  However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library".  The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+  When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library.  The
+threshold for this to be true is not precisely defined by law.
+
+  If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work.  (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+  Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+
+  6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+  You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License.  You must supply a copy of this License.  If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License.  Also, you must do one
+of these things:
+
+    a) Accompany the work with the complete corresponding
+    machine-readable source code for the Library including whatever
+    changes were used in the work (which must be distributed under
+    Sections 1 and 2 above); and, if the work is an executable linked
+    with the Library, with the complete machine-readable "work that
+    uses the Library", as object code and/or source code, so that the
+    user can modify the Library and then relink to produce a modified
+    executable containing the modified Library.  (It is understood
+    that the user who changes the contents of definitions files in the
+    Library will not necessarily be able to recompile the application
+    to use the modified definitions.)
+
+    b) Use a suitable shared library mechanism for linking with the
+    Library.  A suitable mechanism is one that (1) uses at run time a
+    copy of the library already present on the user's computer system,
+    rather than copying library functions into the executable, and (2)
+    will operate properly with a modified version of the library, if
+    the user installs one, as long as the modified version is
+    interface-compatible with the version that the work was made with.
+
+    c) Accompany the work with a written offer, valid for at
+    least three years, to give the same user the materials
+    specified in Subsection 6a, above, for a charge no more
+    than the cost of performing this distribution.
+
+    d) If distribution of the work is made by offering access to copy
+    from a designated place, offer equivalent access to copy the above
+    specified materials from the same place.
+
+    e) Verify that the user has already received a copy of these
+    materials or that you have already sent this user a copy.
+
+  For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it.  However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+  It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system.  Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+
+  7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+    a) Accompany the combined library with a copy of the same work
+    based on the Library, uncombined with any other library
+    facilities.  This must be distributed under the terms of the
+    Sections above.
+
+    b) Give prominent notice with the combined library of the fact
+    that part of it is a work based on the Library, and explaining
+    where to find the accompanying uncombined form of the same work.
+
+  8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License.  Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License.  However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+  9. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Library or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+  10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+
+  11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under any
+particular circumstance, the balance of the section is intended to apply,
+and the section as a whole is intended to apply in other circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License may add
+an explicit geographical distribution limitation excluding those countries,
+so that distribution is permitted only in or among countries not thus
+excluded.  In such case, this License incorporates the limitation as if
+written in the body of this License.
+
+  13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation.  If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+
+  14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission.  For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this.  Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+			    NO WARRANTY
+
+  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+
+           How to Apply These Terms to Your New Libraries
+
+  If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change.  You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms of the
+ordinary General Public License).
+
+  To apply these terms, attach the following notices to the library.  It is
+safest to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least the
+"copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the library's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the library, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the
+  library `Frob' (a library for tweaking knobs) written by James Random Hacker.
+
+  <signature of Ty Coon>, 1 April 1990
+  Ty Coon, President of Vice
+
+That's all there is to it!
+
+
diff --git a/makefile.in b/makefile.in
new file mode 100644
index 0000000..53fdfcb
--- /dev/null
+++ b/makefile.in
@@ -0,0 +1,242 @@
+# Makefile prototype for configure
+# Copyright 2004 Phil Karn, KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+
+# @configure_input@
+srcdir = @srcdir@
+prefix = @prefix@
+exec_prefix=@exec_prefix@
+VPATH = @srcdir@
+CC=@CC@
+LIBS=@MLIBS@ fec.o sim.o viterbi27.o viterbi27_port.o viterbi29.o viterbi29_port.o \
+	viterbi39.o viterbi39_port.o \
+	viterbi615.o viterbi615_port.o encode_rs_char.o encode_rs_int.o encode_rs_8.o \
+	decode_rs_char.o decode_rs_int.o decode_rs_8.o \
+	init_rs_char.o init_rs_int.o ccsds_tab.o \
+	encode_rs_ccsds.o decode_rs_ccsds.o ccsds_tal.o \
+	dotprod.o dotprod_port.o \
+	peakval.o peakval_port.o \
+	sumsq.o sumsq_port.o
+# Note: LIBS above lists the object files that make up libfec, not libraries to link against
+CFLAGS=@CFLAGS@ -I. -Wall @ARCH_OPTION@
+# SHARED_LIB is substituted by configure; libfec.so and libfec.dylib both have rules in this file
+SHARED_LIB=@SH_LIB@
+
+all: libfec.a $(SHARED_LIB)
+# Runs every test program with fixed arguments, then reruns the four Viterbi tests bare as speed tests
+test: vtest27 vtest29 vtest39 vtest615 rstest dtest sumsq_test peaktest
+	@echo "Correctness tests:"
+	./vtest27 -e 3.0 -n 1000 -v
+	./vtest29 -e 2.5 -n 1000 -v
+	./vtest39 -e 2.5 -n 1000 -v
+	./vtest615 -e 1.0 -n 100 -v
+	./rstest
+	./dtest
+	./sumsq_test
+	./peaktest
+	@echo "Speed tests:"
+	./vtest27
+	./vtest29
+	./vtest39
+	./vtest615
+
+install: all
+	mkdir -p @libdir@ 
+	install -m 644 -p $(SHARED_LIB) libfec.a @libdir@
+# Disabled symlink step, kept for reference: (cd @libdir@;ln -f -s $(SHARED_LIB) libfec.so)
+	@REBIND@
+	mkdir -p @includedir@
+	install -m 644 -p fec.h @includedir@
+	mkdir -m 0755 -p @mandir@/man3
+	install -m 644 -p simd-viterbi.3 rs.3 dsp.3 @mandir@/man3
+# Test programs: each links its own object with the static libfec.a, using the configured compiler
+peaktest: peaktest.o libfec.a
+	$(CC) -g -o $@ $^
+
+sumsq_test: sumsq_test.o libfec.a
+	$(CC) -g -o $@ $^
+
+dtest: dtest.o libfec.a
+	$(CC) -g -o $@ $^ -lm
+
+vtest27: vtest27.o libfec.a
+	$(CC) -g -o $@ $^ -lm
+
+vtest29: vtest29.o libfec.a
+	$(CC) -g -o $@ $^ -lm
+
+vtest39: vtest39.o libfec.a
+	$(CC) -g -o $@ $^ -lm
+
+vtest615: vtest615.o libfec.a
+	$(CC) -g -o $@ $^ -lm
+
+rstest: rstest.o libfec.a
+	$(CC) -g -o $@ $^
+
+rs_speedtest: rs_speedtest.o libfec.a
+	$(CC) -g -o $@ $^
+
+# for some reason, the test programs without args segfault on the PPC with -O2 optimization. Dunno why - compiler bug? Hence -g only here, not $(CFLAGS).
+vtest27.o: vtest27.c fec.h
+	$(CC) -g -c $<
+
+vtest29.o: vtest29.c fec.h
+	$(CC) -g -c $<
+
+vtest39.o: vtest39.c fec.h
+	$(CC) -g -c $<
+
+vtest615.o: vtest615.c fec.h
+	$(CC) -g -c $<
+
+libfec.a: $(LIBS)
+	ar rv $@ $^
+	ranlib $@
+
+# Shared library for Darwin
+libfec.dylib: $(LIBS)
+	$(CC) -dynamiclib -install_name $@ -o $@ $^
+
+# Shared library for Linux et al (GNU ld options); uses $(CC) like the Darwin rule
+libfec.so: $(LIBS)
+	$(CC) -shared -Xlinker -soname=$@ -o $@ -Wl,-whole-archive $^ -Wl,-no-whole-archive -lc
+# Prerequisite-only rules: no recipe is given here, so unless another rule supplies one make's implicit .c -> .o rule applies
+dotprod.o: dotprod.c fec.h
+
+dotprod_port.o: dotprod_port.c fec.h
+
+viterbi27.o: viterbi27.c fec.h
+
+viterbi27_port.o: viterbi27_port.c fec.h
+
+viterbi29.o: viterbi29.c fec.h
+
+viterbi39.o: viterbi39.c fec.h
+
+viterbi39_port.o: viterbi39_port.c fec.h
+
+viterbi39_sse2.o: viterbi39_sse2.c fec.h
+
+viterbi39_sse.o: viterbi39_sse.c fec.h
+
+viterbi39_mmx.o: viterbi39_mmx.c fec.h
+
+encode_rs_char.o: encode_rs_char.c char.h rs-common.h
+
+encode_rs_int.o: encode_rs_int.c int.h rs-common.h
+
+encode_rs_8.o: encode_rs_8.c fixed.h
+
+encode_rs_av.o: encode_rs_av.c fixed.h
+
+decode_rs_char.o: decode_rs_char.c char.h rs-common.h
+
+decode_rs_int.o: decode_rs_int.c int.h rs-common.h
+
+decode_rs_8.o: decode_rs_8.c fixed.h
+# init_rs_char.c and init_rs_int.c both #include "init_rs.h", so it is a prerequisite too
+init_rs_char.o: init_rs_char.c char.h rs-common.h init_rs.h
+
+init_rs_int.o: init_rs_int.c int.h rs-common.h init_rs.h
+
+ccsds_tab.o: ccsds_tab.c
+# Generated CCSDS tables: the gen_* helpers are built and run during the build, and their output becomes C source
+ccsds_tab.c: gen_ccsds
+	./gen_ccsds > ccsds_tab.c
+
+gen_ccsds: gen_ccsds.o init_rs_char.o
+	$(CC) -o $@ $^
+
+gen_ccsds.o: gen_ccsds.c
+	$(CC) $(CFLAGS) -c -o $@ $<
+
+ccsds_tal.o: ccsds_tal.c
+
+ccsds_tal.c: gen_ccsds_tal
+	./gen_ccsds_tal > ccsds_tal.c
+# exercise.c is compiled four times, selected by -D flag: none, BIGSYM, FIXED, CCSDS
+exercise_char.o: exercise.c
+	$(CC) $(CFLAGS) -c -o $@ $<
+
+exercise_int.o: exercise.c
+	$(CC) -DBIGSYM=1 $(CFLAGS) -c -o $@ $<
+
+exercise_8.o: exercise.c
+	$(CC) -DFIXED=1 $(CFLAGS) -c -o $@ $<
+
+exercise_ccsds.o: exercise.c
+	$(CC) -DCCSDS=1 $(CFLAGS) -c -o $@ $<
+# Per-decoder objects: the _mmx/_sse/_sse2 variants add the matching -m flag; rules without a recipe fall back to make's implicit rule
+viterbi27.o: viterbi27.c fec.h
+
+viterbi27_port.o: viterbi27_port.c fec.h
+
+viterbi27_av.o: viterbi27_av.c fec.h
+
+viterbi27_mmx.o: viterbi27_mmx.c fec.h
+	$(CC) $(CFLAGS) -mmmx -c -o $@ $<
+
+viterbi27_sse.o: viterbi27_sse.c fec.h
+	$(CC) $(CFLAGS) -msse -c -o $@ $<
+
+viterbi27_sse2.o: viterbi27_sse2.c fec.h
+	$(CC) $(CFLAGS) -msse2 -c -o $@ $<
+
+viterbi29.o: viterbi29.c fec.h
+
+viterbi29_port.o: viterbi29_port.c fec.h
+
+viterbi29_av.o: viterbi29_av.c fec.h
+
+viterbi29_mmx.o: viterbi29_mmx.c fec.h
+	$(CC) $(CFLAGS) -mmmx -c -o $@ $<
+
+viterbi29_sse.o: viterbi29_sse.c fec.h
+	$(CC) $(CFLAGS) -msse -c -o $@ $<
+
+viterbi29_sse2.o: viterbi29_sse2.c fec.h
+	$(CC) $(CFLAGS) -msse2 -c -o $@ $<
+
+viterbi39.o: viterbi39.c fec.h
+
+viterbi39_port.o: viterbi39_port.c fec.h
+
+viterbi39_av.o: viterbi39_av.c fec.h
+
+viterbi39_mmx.o: viterbi39_mmx.c fec.h
+	$(CC) $(CFLAGS) -mmmx -c -o $@ $<
+
+viterbi39_sse.o: viterbi39_sse.c fec.h
+	$(CC) $(CFLAGS) -msse -c -o $@ $<
+
+viterbi39_sse2.o: viterbi39_sse2.c fec.h
+	$(CC) $(CFLAGS) -msse2 -c -o $@ $<
+
+viterbi615.o: viterbi615.c fec.h
+
+viterbi615_port.o: viterbi615_port.c fec.h
+
+viterbi615_av.o: viterbi615_av.c fec.h
+
+viterbi615_mmx.o: viterbi615_mmx.c fec.h
+	$(CC) $(CFLAGS) -mmmx -c -o $@ $<
+
+viterbi615_sse.o: viterbi615_sse.c fec.h
+	$(CC) $(CFLAGS) -msse -c -o $@ $<
+
+viterbi615_sse2.o: viterbi615_sse2.c fec.h
+	$(CC) $(CFLAGS) -msse2 -c -o $@ $<
+
+cpu_mode_x86.o: cpu_mode_x86.c fec.h
+
+cpu_mode_ppc.o: cpu_mode_ppc.c fec.h
+
+
+clean:
+	rm -f *.o $(SHARED_LIB) *.a rs_speedtest peaktest sumsq_test dtest vtest27 vtest29 vtest39 vtest615 rstest ccsds_tab.c ccsds_tal.c gen_ccsds gen_ccsds_tal core
+	rm -rf autom4te.cache
+.PHONY: all test install clean distclean  # these name actions, not files, so a stray file of the same name must not suppress them
+distclean: clean
+	rm -f config.log config.cache config.status config.h makefile
+
diff --git a/mmxbfly27.s b/mmxbfly27.s
new file mode 100644
index 0000000..4abbf48
--- /dev/null
+++ b/mmxbfly27.s
@@ -0,0 +1,148 @@
+/* Intel SIMD MMX implementation of Viterbi ACS butterflies
+   for 64-state (k=7) convolutional code
+   Copyright 2004 Phil Karn, KA9Q
+   This code may be used under the terms of the GNU Lesser General Public License (LGPL)
+
+   int update_viterbi27_blk_mmx(struct v27 *vp,unsigned char *syms,int nbits) ; 
+*/
+	# MMX (64-bit SIMD) version
+	# requires Pentium-MMX, Pentium-II or better
+
+	# These are offsets into struct v27, defined in viterbi27_mmx.c
+	.set DP,128
+	.set OLDMETRICS,132
+	.set NEWMETRICS,136
+	.text	
+	.global update_viterbi27_blk_mmx,Mettab27_1,Mettab27_2
+	.type update_viterbi27_blk_mmx,@function
+	.align 16
+	
+update_viterbi27_blk_mmx:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %edi
+	pushl %edx
+	pushl %ebx
+	# Stack args: 8(%ebp)=vp, 12(%ebp)=syms, 16(%ebp)=nbits.  Returns 0, or -1 if vp is NULL
+	movl 8(%ebp),%edx	# edx = vp
+	testl %edx,%edx
+	jnz  0f
+	movl $-1,%eax		# error return; needs the $ immediate prefix, a bare -1 is a memory operand
+	jmp  err		# no MMX instruction has run yet, so emms is not needed on this path
+0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
+	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
+	movl DP(%edx),%edx	# edx -> decisions
+
+1:	movl 16(%ebp),%eax	# eax = nbits
+	decl %eax
+	jl   2f			# passed zero, we're done
+	movl %eax,16(%ebp)	# remaining bit count is kept in the argument slot
+
+	movl 12(%ebp),%ebx	# ebx = syms
+	movw (%ebx),%ax		# ax = second symbol : first symbol
+	addl $2,%ebx
+	movl %ebx,12(%ebp)	# advanced symbol pointer is kept in the argument slot
+
+	movb %ah,%bl		# bl = second symbol
+	andl $255,%eax		# eax = first symbol, zero extended
+	andl $255,%ebx		# ebx = second symbol, zero extended
+
+	# shift into first array index dimension slot
+	shll $5,%eax
+	shll $5,%ebx
+
+	# each invocation of this macro will do 8 butterflies in parallel
+	.MACRO butterfly GROUP
+	# Compute branch metrics
+	movq (Mettab27_1+8*\GROUP)(%eax),%mm3
+	movq fifteens,%mm0	
+
+	paddb (Mettab27_2+8*\GROUP)(%ebx),%mm3
+	paddb ones,%mm3  # emulate pavgb - this may not be necessary
+	psrlq $1,%mm3
+	pand %mm0,%mm3
+
+	movq (8*\GROUP)(%esi),%mm6	# Incoming path metric, high bit = 0
+	movq ((8*\GROUP)+32)(%esi),%mm2 # Incoming path metric, high bit = 1
+	movq %mm6,%mm1	
+	movq %mm2,%mm7
+	
+	paddb %mm3,%mm6
+	paddb %mm3,%mm2
+	pxor  %mm0,%mm3		 # invert branch metric
+	paddb %mm3,%mm7		 # path metric for inverted symbols
+	paddb %mm3,%mm1
+
+	# live registers 1 2 6 7
+	# Compare mm6 and mm7;  mm1 and mm2
+	pxor %mm3,%mm3	
+	movq %mm6,%mm4
+	movq %mm1,%mm5	
+	psubb %mm7,%mm4		# mm4 = mm6 - mm7
+	psubb %mm2,%mm5		# mm5 = mm1 - mm2
+	pcmpgtb %mm3,%mm4	# mm4 = first set of decisions (ff = 1 better)
+	pcmpgtb %mm3,%mm5	# mm5 = second set of decisions		
+
+	# live registers 1 2 4 5 6 7
+	# select survivors
+	movq %mm4,%mm0
+	pand %mm4,%mm7	
+	movq %mm5,%mm3	
+	pand %mm5,%mm2	
+	pandn %mm6,%mm0
+	pandn %mm1,%mm3	
+	por %mm0,%mm7		# mm7 = first set of survivors
+	por %mm3,%mm2		# mm2 = second set of survivors	
+
+	# live registers 2 4 5 7
+	# interleave & store decisions in mm4, mm5
+	# interleave & store new path metrics (the survivors) in mm2, mm7
+	movq %mm4,%mm3
+	movq %mm7,%mm0	
+	punpckhbw %mm5,%mm4
+	punpcklbw %mm5,%mm3
+	punpcklbw %mm2,%mm7	# interleave second 8 new metrics
+	punpckhbw %mm2,%mm0	# interleave first 8 new metrics
+	movq %mm4,(16*\GROUP+8)(%edx)
+	movq %mm3,(16*\GROUP)(%edx)
+	movq %mm7,(16*\GROUP)(%edi)
+	movq %mm0,(16*\GROUP+8)(%edi)	
+
+	.endm
+
+# invoke macro 4 times for a total of 32 butterflies
+	butterfly GROUP=0
+	butterfly GROUP=1
+	butterfly GROUP=2
+	butterfly GROUP=3
+
+	addl $64,%edx		# bump decision pointer			
+
+	# swap metrics
+	movl %esi,%eax
+	movl %edi,%esi
+	movl %eax,%edi
+	jmp 1b
+
+2:	emms
+	movl 8(%ebp),%ebx	# ebx = vp
+	# stash metric pointers
+	movl %esi,OLDMETRICS(%ebx)
+	movl %edi,NEWMETRICS(%ebx)
+	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
+	xorl %eax,%eax		# return 0 = success
+err:	popl %ebx
+	popl %edx
+	popl %edi
+	popl %esi
+	popl %ebp
+	ret
+
+	.data
+	.align 8
+fifteens:	
+	.byte 15,15,15,15,15,15,15,15
+	
+	.align 8
+ones:	.byte 1,1,1,1,1,1,1,1
diff --git a/mmxbfly29.s b/mmxbfly29.s
new file mode 100644
index 0000000..e37cab8
--- /dev/null
+++ b/mmxbfly29.s
@@ -0,0 +1,161 @@
+/* Intel SIMD MMX implementation of Viterbi ACS butterflies
+   for 256-state (k=9) convolutional code
+   Copyright 2004 Phil Karn, KA9Q
+   This code may be used under the terms of the GNU Lesser General Public License (LGPL)
+
+   int update_viterbi29_blk_mmx(struct v29 *vp,unsigned char *syms,int nbits); 
+*/
+
+	# These are offsets into struct v29, defined in viterbi29.h
+	.set DP,512
+	.set OLDMETRICS,516
+	.set NEWMETRICS,520
+	.text	
+	.global update_viterbi29_blk_mmx,Mettab29_1,Mettab29_2
+	.type update_viterbi29_blk_mmx,@function
+	.align 16
+	
+	# MMX (64-bit SIMD) version
+	# requires Pentium-MMX, Pentium-II or better
+
+update_viterbi29_blk_mmx:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %edi
+	pushl %edx
+	pushl %ebx
+	
+	# Stack args: 8(%ebp)=vp, 12(%ebp)=syms, 16(%ebp)=nbits.  Returns 0, or -1 if vp is NULL
+	movl 8(%ebp),%edx	# edx = vp
+	testl %edx,%edx
+	jnz  0f
+	movl $-1,%eax		# error return; needs the $ immediate prefix, a bare -1 is a memory operand
+	jmp  err		# no MMX instruction has run yet, so emms is not needed on this path
+0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
+	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
+	movl DP(%edx),%edx	# edx -> decisions
+
+1:	movl 16(%ebp),%eax	# eax = nbits
+	decl %eax
+	jl   2f			# passed zero, we're done
+	movl %eax,16(%ebp)	# remaining bit count is kept in the argument slot
+
+	movl 12(%ebp),%ebx	# ebx = syms
+	movw (%ebx),%ax		# ax = second symbol : first symbol
+	addl $2,%ebx
+	movl %ebx,12(%ebp)	# advanced symbol pointer is kept in the argument slot
+
+	movb %ah,%bl		# bl = second symbol
+	andl $255,%eax		# eax = first symbol, zero extended
+	andl $255,%ebx		# ebx = second symbol, zero extended
+	
+	# shift into first array index dimension slot
+	shll $7,%eax
+	shll $7,%ebx
+
+	# each invocation of this macro will do 8 butterflies in parallel
+	.MACRO butterfly GROUP
+	# Compute branch metrics
+	movq (Mettab29_1+8*\GROUP)(%eax),%mm3
+	movq fifteens,%mm0	
+	paddb (Mettab29_2+8*\GROUP)(%ebx),%mm3
+	paddb ones,%mm3  # emulate pavgb - this may not be necessary
+	psrlq $1,%mm3
+	pand %mm0,%mm3
+
+	movq (8*\GROUP)(%esi),%mm6	# Incoming path metric, high bit = 0
+	movq ((8*\GROUP)+128)(%esi),%mm2 # Incoming path metric, high bit = 1
+	movq %mm6,%mm1	
+	movq %mm2,%mm7
+	
+	paddb %mm3,%mm6
+	paddb %mm3,%mm2
+	pxor  %mm0,%mm3		 # invert branch metric
+	paddb %mm3,%mm7		 # path metric for inverted symbols
+	paddb %mm3,%mm1
+
+	# live registers 1 2 6 7
+	# Compare mm6 and mm7;  mm1 and mm2
+	pxor %mm3,%mm3	
+	movq %mm6,%mm4
+	movq %mm1,%mm5	
+	psubb %mm7,%mm4		# mm4 = mm6 - mm7
+	psubb %mm2,%mm5		# mm5 = mm1 - mm2
+	pcmpgtb %mm3,%mm4	# mm4 = first set of decisions (ff = 1 better)
+	pcmpgtb %mm3,%mm5	# mm5 = second set of decisions		
+
+	# live registers 1 2 4 5 6 7
+	# select survivors
+	movq %mm4,%mm0
+	pand %mm4,%mm7	
+	movq %mm5,%mm3	
+	pand %mm5,%mm2	
+	pandn %mm6,%mm0
+	pandn %mm1,%mm3	
+	por %mm0,%mm7		# mm7 = first set of survivors
+	por %mm3,%mm2		# mm2 = second set of survivors	
+
+	# live registers 2 4 5 7
+	# interleave & store decisions in mm4, mm5
+	# interleave & store new path metrics (the survivors) in mm2, mm7
+	movq %mm4,%mm3
+	movq %mm7,%mm0	
+	punpckhbw %mm5,%mm4
+	punpcklbw %mm5,%mm3
+	punpcklbw %mm2,%mm7	# interleave second 8 new metrics
+	punpckhbw %mm2,%mm0	# interleave first 8 new metrics
+	movq %mm4,(16*\GROUP+8)(%edx)
+	movq %mm3,(16*\GROUP)(%edx)
+	movq %mm7,(16*\GROUP)(%edi)
+	movq %mm0,(16*\GROUP+8)(%edi)	
+
+	.endm
+
+# invoke macro 16 times for a total of 128 butterflies
+	butterfly GROUP=0
+	butterfly GROUP=1
+	butterfly GROUP=2
+	butterfly GROUP=3
+	butterfly GROUP=4
+	butterfly GROUP=5
+	butterfly GROUP=6
+	butterfly GROUP=7
+	butterfly GROUP=8
+	butterfly GROUP=9
+	butterfly GROUP=10
+	butterfly GROUP=11
+	butterfly GROUP=12
+	butterfly GROUP=13
+	butterfly GROUP=14
+	butterfly GROUP=15
+
+	addl $256,%edx		# bump decision pointer			
+
+	# swap metrics
+	movl %esi,%eax
+	movl %edi,%esi
+	movl %eax,%edi
+	jmp 1b
+
+2:	emms
+	movl 8(%ebp),%ebx	# ebx = vp
+	# stash metric pointers
+	movl %esi,OLDMETRICS(%ebx)
+	movl %edi,NEWMETRICS(%ebx)
+	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
+	xorl %eax,%eax		# return 0 = success
+err:	popl %ebx
+	popl %edx
+	popl %edi
+	popl %esi
+	popl %ebp
+	ret
+
+	.data
+	.align 8
+fifteens:	
+	.byte 15,15,15,15,15,15,15,15
+
+	.align 8
+ones:	.byte 1,1,1,1,1,1,1,1
diff --git a/peak_mmx_assist.s b/peak_mmx_assist.s
new file mode 100644
index 0000000..dae831f
--- /dev/null
+++ b/peak_mmx_assist.s
@@ -0,0 +1,70 @@
+# MMX assist routines for peakval
+# Copyright 2001 Phil Karn, KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+
+	.text
+
+# Find peak absolute value in signed 16-bit input samples
+#  int peakval_mmx(signed short *in,int cnt);	
+	.global peakval_mmx
+	.type peakval_mmx,@function
+	.align 16
+peakval_mmx:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %ecx
+	pushl %ebx		# saved and restored, although ebx is not used below
+
+	movl 8(%ebp),%esi	# esi = in
+	movl 12(%ebp),%ecx	# ecx = cnt
+
+	pxor %mm7,%mm7		# clear peak (four 16-bit running maxima)
+	
+1:	subl $4,%ecx		# each pass consumes four samples
+	jl 2f			# fewer than four left: stop, the remainder is not examined here
+	movq (%esi),%mm0	# load four samples
+	movq %mm0,%mm1	
+	psraw $15,%mm1		# mm1 = 1's if negative, 0's if positive
+	pxor %mm1,%mm0		# complement negatives
+	psubw %mm1,%mm0		# add 1 to negatives
+	movq %mm7,%mm6		# copy previous peak
+	pcmpgtw %mm0,%mm6	# ff == old peak greater
+	pand %mm6,%mm7		# select old peaks that are greater
+	pandn %mm0,%mm6		# select new values that are greater
+	por %mm6,%mm7		# merge: per-word maximum of old peak and new values
+	
+	addl $8,%esi		# advance four samples (8 bytes)
+	jmp 1b	
+
+2:	movd %mm7,%eax		# reduce the four per-word peaks to one value in eax
+	psrlq $16,%mm7
+	andl $0xffff,%eax	# eax = word 0
+	
+	movd %mm7,%edx
+	psrlq $16,%mm7
+	andl $0xffff,%edx	# edx = word 1
+	cmpl %edx,%eax
+	jnl  3f
+	movl %edx,%eax
+3:				# eax = max(word 0, word 1)
+	movd %mm7,%edx
+	psrlq $16,%mm7
+	andl $0xffff,%edx	# edx = word 2
+	cmpl %edx,%eax
+	jnl 4f
+	movl %edx,%eax
+4:				# eax = max(words 0..2)
+	movd %mm7,%edx
+	andl $0xffff,%edx	# edx = word 3
+	cmpl %edx,%eax
+	jnl 5f
+	movl %edx,%eax
+5:				# eax = max of all four words = return value
+	emms
+	popl %ebx
+	popl %ecx
+	popl %esi
+	popl %ebp
+	ret
+	
diff --git a/peak_sse2_assist.s b/peak_sse2_assist.s
new file mode 100644
index 0000000..1dee3a8
--- /dev/null
+++ b/peak_sse2_assist.s
@@ -0,0 +1,51 @@
+# SSE2 assist routines for peakval
+# Copyright 2001 Phil Karn, KA9Q
+# May be used under the terms of the GNU Public License (GPL)
+
+	.text
+
+# Find peak absolute value in signed 16-bit input samples
+#  int peakval_sse2(signed short *in,int cnt);
+	.global peakval_sse2
+	.type peakval_sse2,@function
+	.align 16
+peakval_sse2:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %ecx
+
+	movl 8(%ebp),%esi	# esi = in (movaps below requires 16-byte alignment)
+	movl 12(%ebp),%ecx	# ecx = cnt
+
+	pxor %xmm7,%xmm7	# clear peak
+	
+1:	subl $8,%ecx		# each pass consumes eight samples
+	jl 2f			# fewer than eight left: stop, the remainder is not examined here
+	movaps (%esi),%xmm0	# load eight samples
+	movaps %xmm0,%xmm1	
+	psraw $15,%xmm1		# xmm1 = 1's if negative, 0's if positive
+	pxor %xmm1,%xmm0	# complement negatives
+	psubw %xmm1,%xmm0	# add 1 to negatives
+	pmaxsw %xmm0,%xmm7	# store peak
+	
+	addl $16,%esi		# advance eight samples (16 bytes)
+	jmp 1b
+
+2:	movaps %xmm7,%xmm0	# horizontal reduction of the eight per-word peaks
+	psrldq $8,%xmm0		# bring the upper four words down
+	pmaxsw %xmm0,%xmm7
+	movaps %xmm7,%xmm0
+	psrlq $32,%xmm0		# bring words 2,3 down onto words 0,1
+	pmaxsw %xmm0,%xmm7
+	movaps %xmm7,%xmm0
+	psrlq $16,%xmm0		# bring word 1 down onto word 0
+	pmaxsw %xmm0,%xmm7	# max value in low word of %xmm7
+	
+	movd %xmm7,%eax
+	andl $0xffff,%eax	# return only the low 16-bit word
+
+	popl %ecx
+	popl %esi
+	popl %ebp
+	ret
diff --git a/peak_sse_assist.s b/peak_sse_assist.s
new file mode 100644
index 0000000..ea6fce8
--- /dev/null
+++ b/peak_sse_assist.s
@@ -0,0 +1,49 @@
+# SSE assist routines for peakval
+# Copyright 2001 Phil Karn, KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+
+	.text
+
+# Find peak absolute value in signed 16-bit input samples
+#  int peakval_sse(signed short *in,int cnt);
+	.global peakval_sse
+	.type peakval_sse,@function
+	.align 16
+peakval_sse:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %ecx
+
+	movl 8(%ebp),%esi	# esi = in
+	movl 12(%ebp),%ecx	# ecx = cnt
+
+	pxor %mm7,%mm7		# clear peak
+	
+1:	subl $4,%ecx		# each pass consumes four samples
+	jl 2f			# fewer than four left: stop, the remainder is not examined here
+	movq (%esi),%mm0	# load four samples
+	movq %mm0,%mm1	
+	psraw $15,%mm1		# mm1 = 1's if negative, 0's if positive
+	pxor %mm1,%mm0		# complement negatives
+	psubw %mm1,%mm0		# add 1 to negatives
+	pmaxsw %mm0,%mm7	# store peak
+	
+	addl $8,%esi		# advance four samples (8 bytes)
+	jmp 1b	
+
+2:	movq %mm7,%mm0		# horizontal reduction of the four per-word peaks
+	psrlq $32,%mm0		# bring words 2,3 down onto words 0,1
+	pmaxsw %mm0,%mm7
+	movq %mm7,%mm0
+	psrlq $16,%mm0		# bring word 1 down onto word 0
+	pmaxsw %mm0,%mm7	# max value in low word of %mm7
+	
+	movd %mm7,%eax
+	andl $0xffff,%eax	# return only the low 16-bit word
+
+	emms
+	popl %ecx
+	popl %esi
+	popl %ebp
+	ret
diff --git a/peaktest.c b/peaktest.c
new file mode 100644
index 0000000..fa4b280
--- /dev/null
+++ b/peaktest.c
@@ -0,0 +1,38 @@
+/* Verify correctness of the peak routine
+ * Copyright 2004 Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+/* These values should trigger leading/trailing array fragment handling */
+#define NSAMP 200002
+#define OFFSET 1
+
+int peakval(signed short *,int);
+int peakval_port(signed short *,int);
+
+/* Compare peakval() with peakval_port() on random data; exit status is 0 on a match, 1 on a mismatch */
+int main(){
+  int i;
+  int result,rresult;
+  signed short samples[NSAMP];
+
+  srandom(time(NULL));
+
+  /* Background samples are 12-bit non-negative values (0..4095).  With this
+   * mask a sample can never equal 0x8000, so no rejection loop is needed. */
+  for(i=0;i<NSAMP;i++)
+    samples[i] = random() & 0x0fff;
+  samples[5] = 25000; /* planted peak, larger than any background sample */
+
+  rresult = peakval_port(&samples[OFFSET],NSAMP-OFFSET);
+  result = peakval(&samples[OFFSET],NSAMP-OFFSET);
+  if(result != rresult){
+    printf("peak mismatch: %d != %d\n",result,rresult);
+    exit(1); /* non-zero status so the caller (e.g. make) can detect the failure */
+  }
+
+  printf("OK\n");
+  exit(0);
+}
diff --git a/peakval.c b/peakval.c
new file mode 100644
index 0000000..811a3a9
--- /dev/null
+++ b/peakval.c
@@ -0,0 +1,39 @@
+/* Switch to appropriate version of peakval routine
+ * Copyright 2004, Phil Karn, KA9Q
+ */
+
+#include <stdlib.h>
+#include "fec.h"
+
+int peakval_port(signed short *b,int cnt);
+#ifdef __i386__
+int peakval_mmx(signed short *b,int cnt);
+int peakval_sse(signed short *b,int cnt);
+int peakval_sse2(signed short *b,int cnt);
+#endif
+
+#ifdef __VEC__
+int peakval_av(signed short *b,int cnt);
+#endif
+
+/* Call find_cpu_mode(), then forward to the peakval variant matching Cpu_mode
+ * among those compiled in for this target; otherwise use the portable C one.
+ */
+int peakval(signed short *b,int cnt){
+  find_cpu_mode();
+
+#ifdef __i386__
+  if(Cpu_mode == MMX)
+    return peakval_mmx(b,cnt);
+  if(Cpu_mode == SSE)
+    return peakval_sse(b,cnt);
+  if(Cpu_mode == SSE2)
+    return peakval_sse2(b,cnt);
+#endif
+#ifdef __VEC__
+  if(Cpu_mode == ALTIVEC)
+    return peakval_av(b,cnt);
+#endif
+  /* PORT, or any mode without a variant built for this target */
+  return peakval_port(b,cnt);
+}
diff --git a/peakval_av.c b/peakval_av.c
new file mode 100644
index 0000000..ae54c10
--- /dev/null
+++ b/peakval_av.c
@@ -0,0 +1,61 @@
+/* Return the largest absolute value of a vector of signed shorts
+
+ * This is the Altivec SIMD version.
+
+ * Copyright 2004 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+
+#include "fec.h"
+
+int peakval_av(signed short *in,int cnt){ /* int return matches the declaration used by peakval() */
+  vector signed short x;
+  int pad;
+  union { vector signed char cv; vector signed short hv; signed short s[8]; signed char c[16];} s;
+  vector signed short smallest,largest;
+  if(cnt <= 0) return 0; /* nothing to scan; also keeps the (8-cnt)<<4 shift counts below in range */
+  smallest = (vector signed short)(0);
+  largest = (vector signed short)(0);
+  if((pad = (int)in & 15)!=0){ /* pad = byte offset of in within its 16-byte block */
+    /* Load unaligned leading word */
+    x = vec_perm(vec_ld(0,in),(vector signed short)(0),vec_lvsl(0,in));
+    if(cnt < 8){ /* Shift right to chop stuff beyond end of short block */
+      s.c[15] = (8-cnt)<<4; /* (8-cnt) 16-bit words, expressed in bits */
+      x = vec_sro(x,s.cv);
+    }
+    smallest = vec_min(smallest,x);
+    largest = vec_max(largest,x);
+    in += 8-pad/2;  /* step over the samples covered by the leading word */
+    cnt -= 8-pad/2; /* may go negative for short blocks; the tests below cope */
+  }
+  /* Everything is now aligned, rip through most of the block */
+  while(cnt >= 8){
+    x = vec_ld(0,in);
+    smallest = vec_min(smallest,x);
+    largest = vec_max(largest,x);
+    in += 8;
+    cnt -= 8;
+  }
+  /* Handle trailing fragment, if any */
+  if(cnt > 0){
+    x = vec_ld(0,in);
+    s.c[15] = (8-cnt)<<4; /* shift out the samples past the end of the block */
+    x = vec_sro(x,s.cv);
+    smallest = vec_min(smallest,x);
+    largest = vec_max(largest,x);
+  }
+  /* Combine and extract result */
+  largest = vec_max(largest,vec_abs(smallest));
+
+  s.c[15] = 64; /* Shift right four 16-bit words */
+  largest = vec_max(largest,vec_sro(largest,s.cv));
+
+  s.c[15] = 32; /* Shift right two 16-bit words */
+  largest = vec_max(largest,vec_sro(largest,s.cv));
+
+  s.c[15] = 16; /* Shift right one 16-bit word */
+  largest = vec_max(largest,vec_sro(largest,s.cv));
+
+  s.hv = largest;
+  return s.s[7]; /* after the three folds the overall maximum sits in the last element */
+}
diff --git a/peakval_mmx.c b/peakval_mmx.c
new file mode 100644
index 0000000..436fe88
--- /dev/null
+++ b/peakval_mmx.c
@@ -0,0 +1,34 @@
+/* Wrapper for the MMX version of peakval
+ * Copyright 2004 Phil Karn, KA9Q
+ */
+
+#include <stdlib.h>
+
+int peakval_mmx_assist(signed short *,int);
+
+/* Largest absolute value in b[0..cnt-1]: scalar code handles the unaligned
+ * head and the short tail, peakval_mmx_assist() handles the aligned middle.
+ */
+int peakval_mmx(signed short *b,int cnt){
+  int best = 0; /* running maximum of |sample| */
+  int mag;
+
+  /* Head: step one sample at a time until b sits on an 8-byte boundary */
+  for(;((int)b & 7) != 0 && cnt != 0;b++,cnt--){
+    mag = abs(*b);
+    best = (mag > best) ? mag : best;
+  }
+  /* Middle: hand everything that is left to the assembler routine */
+  mag = peakval_mmx_assist(b,cnt);
+  best = (mag > best) ? mag : best;
+
+  /* Tail: skip the whole groups of four, then finish the 0-3 leftovers */
+  b += cnt & ~3;
+  cnt &= 3;
+  for(;cnt != 0;b++,cnt--){
+    mag = abs(*b);
+    best = (mag > best) ? mag : best;
+  }
+
+  return best;
+}
diff --git a/peakval_mmx_assist.s b/peakval_mmx_assist.s
new file mode 100644
index 0000000..553cb79
--- /dev/null
+++ b/peakval_mmx_assist.s
@@ -0,0 +1,70 @@
+# MMX assist routines for peakval
+# Copyright 2001 Phil Karn, KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+
+	.text
+
+# Find peak absolute value in signed 16-bit input samples
+#  int peakval_mmx_assist(signed short *in,int cnt);	
+	.global peakval_mmx_assist
+	.type peakval_mmx_assist,@function
+	.align 16
+peakval_mmx_assist:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %ecx
+	pushl %ebx		# saved and restored, although ebx is not used below
+
+	movl 8(%ebp),%esi	# esi = in
+	movl 12(%ebp),%ecx	# ecx = cnt
+
+	pxor %mm7,%mm7		# clear peak (four 16-bit running maxima)
+	
+1:	subl $4,%ecx		# each pass consumes four samples
+	jl 2f			# fewer than four left: stop, the remainder is not examined here
+	movq (%esi),%mm0	# load four samples
+	movq %mm0,%mm1	
+	psraw $15,%mm1		# mm1 = 1's if negative, 0's if positive
+	pxor %mm1,%mm0		# complement negatives
+	psubw %mm1,%mm0		# add 1 to negatives
+	movq %mm7,%mm6		# copy previous peak
+	pcmpgtw %mm0,%mm6	# ff == old peak greater
+	pand %mm6,%mm7		# select old peaks that are greater
+	pandn %mm0,%mm6		# select new values that are greater
+	por %mm6,%mm7		# merge: per-word maximum of old peak and new values
+	
+	addl $8,%esi		# advance four samples (8 bytes)
+	jmp 1b	
+
+2:	movd %mm7,%eax		# reduce the four per-word peaks to one value in eax
+	psrlq $16,%mm7
+	andl $0xffff,%eax	# eax = word 0
+	
+	movd %mm7,%edx
+	psrlq $16,%mm7
+	andl $0xffff,%edx	# edx = word 1
+	cmpl %edx,%eax
+	jnl  3f
+	movl %edx,%eax
+3:				# eax = max(word 0, word 1)
+	movd %mm7,%edx
+	psrlq $16,%mm7
+	andl $0xffff,%edx	# edx = word 2
+	cmpl %edx,%eax
+	jnl 4f
+	movl %edx,%eax
+4:				# eax = max(words 0..2)
+	movd %mm7,%edx
+	andl $0xffff,%edx	# edx = word 3
+	cmpl %edx,%eax
+	jnl 5f
+	movl %edx,%eax
+5:				# eax = max of all four words = return value
+	emms
+	popl %ebx
+	popl %ecx
+	popl %esi
+	popl %ebp
+	ret
+	
diff --git a/peakval_port.c b/peakval_port.c
new file mode 100644
index 0000000..07ab316
--- /dev/null
+++ b/peakval_port.c
@@ -0,0 +1,16 @@
+/* Portable C version of peakval
+ * Copyright 2004 Phil Karn, KA9Q
+ */
+#include <stdlib.h>
+#include "fec.h"
+/* Portable reference: largest abs() over b[0..len-1]; 0 when len <= 0 */
+int peakval_port(signed short *b,int len){
+  int best = 0;
+  int mag;
+
+  for(;len > 0;len--,b++){
+    mag = abs(*b);
+    best = (mag > best) ? mag : best;
+  }
+  return best;
+}
diff --git a/peakval_sse.c b/peakval_sse.c
new file mode 100644
index 0000000..9868b7f
--- /dev/null
+++ b/peakval_sse.c
@@ -0,0 +1,35 @@
+/* IA-32 SSE version of peakval
+ * Copyright 2004 Phil Karn, KA9Q
+ */
+
+#include <stdlib.h>
+#include "fec.h"
+
+int peakval_sse_assist(signed short *,int);
+
+/* Largest absolute value in b[0..cnt-1]: scalar code handles the unaligned
+ * head and the short tail, peakval_sse_assist() handles the aligned middle.
+ */
+int peakval_sse(signed short *b,int cnt){
+  int best = 0; /* running maximum of |sample| */
+  int mag;
+
+  /* Head: step one sample at a time until b sits on an 8-byte boundary */
+  for(;((int)b & 7) != 0 && cnt != 0;b++,cnt--){
+    mag = abs(*b);
+    best = (mag > best) ? mag : best;
+  }
+  /* Middle: hand everything that is left to the assembler routine */
+  mag = peakval_sse_assist(b,cnt);
+  best = (mag > best) ? mag : best;
+
+  /* Tail: skip the whole groups of four, then finish the 0-3 leftovers */
+  b += cnt & ~3;
+  cnt &= 3;
+  for(;cnt != 0;b++,cnt--){
+    mag = abs(*b);
+    best = (mag > best) ? mag : best;
+  }
+
+  return best;
+}
diff --git a/peakval_sse2.c b/peakval_sse2.c
new file mode 100644
index 0000000..79d9059
--- /dev/null
+++ b/peakval_sse2.c
@@ -0,0 +1,34 @@
+/* IA-32 SSE2 version of peakval
+ * Copyright 2004 Phil Karn, KA9Q
+ */
+#include <stdlib.h>
+#include "fec.h"
+
+int peakval_sse2_assist(signed short *,int);
+
+/* Largest absolute value in b[0..cnt-1]: scalar code handles the unaligned
+ * head and the short tail, peakval_sse2_assist() handles the aligned middle.
+ */
+int peakval_sse2(signed short *b,int cnt){
+  int best = 0; /* running maximum of |sample| */
+  int mag;
+
+  /* Head: step one sample at a time until b sits on a 16-byte boundary */
+  for(;((int)b & 15) != 0 && cnt != 0;b++,cnt--){
+    mag = abs(*b);
+    best = (mag > best) ? mag : best;
+  }
+  /* Middle: hand everything that is left to the assembler routine */
+  mag = peakval_sse2_assist(b,cnt);
+  best = (mag > best) ? mag : best;
+
+  /* Tail: skip the whole groups of eight, then finish the 0-7 leftovers */
+  b += cnt & ~7;
+  cnt &= 7;
+  for(;cnt != 0;b++,cnt--){
+    mag = abs(*b);
+    best = (mag > best) ? mag : best;
+  }
+
+  return best;
+}
diff --git a/peakval_sse2_assist.s b/peakval_sse2_assist.s
new file mode 100644
index 0000000..c7a58e7
--- /dev/null
+++ b/peakval_sse2_assist.s
@@ -0,0 +1,51 @@
+# SSE2 assist routines for peakval
+# Copyright 2001 Phil Karn, KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+
+	.text
+
+# Find peak absolute value in signed 16-bit input samples
+#  int peakval_sse2_assist(signed short *in,int cnt);
+	.global peakval_sse2_assist
+	.type peakval_sse2_assist,@function
+	.align 16
+peakval_sse2_assist:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %ecx
+
+	movl 8(%ebp),%esi	# esi = in (movaps below requires 16-byte alignment)
+	movl 12(%ebp),%ecx	# ecx = cnt
+
+	pxor %xmm7,%xmm7	# clear peak
+	
+1:	subl $8,%ecx		# each pass consumes eight samples
+	jl 2f			# fewer than eight left: stop, the remainder is not examined here
+	movaps (%esi),%xmm0	# load eight samples
+	movaps %xmm0,%xmm1	
+	psraw $15,%xmm1		# xmm1 = 1's if negative, 0's if positive
+	pxor %xmm1,%xmm0	# complement negatives
+	psubw %xmm1,%xmm0	# add 1 to negatives
+	pmaxsw %xmm0,%xmm7	# store peak
+	
+	addl $16,%esi		# advance eight samples (16 bytes)
+	jmp 1b
+
+2:	movaps %xmm7,%xmm0	# horizontal reduction of the eight per-word peaks
+	psrldq $8,%xmm0		# bring the upper four words down
+	pmaxsw %xmm0,%xmm7
+	movaps %xmm7,%xmm0
+	psrlq $32,%xmm0		# bring words 2,3 down onto words 0,1
+	pmaxsw %xmm0,%xmm7
+	movaps %xmm7,%xmm0
+	psrlq $16,%xmm0		# bring word 1 down onto word 0
+	pmaxsw %xmm0,%xmm7	# max value in low word of %xmm7
+	
+	movd %xmm7,%eax
+	andl $0xffff,%eax	# return only the low 16-bit word
+
+	popl %ecx
+	popl %esi
+	popl %ebp
+	ret
diff --git a/peakval_sse_assist.s b/peakval_sse_assist.s
new file mode 100644
index 0000000..827c800
--- /dev/null
+++ b/peakval_sse_assist.s
@@ -0,0 +1,49 @@
+# SSE assist routines for peakval
+# Copyright 2001 Phil Karn, KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+
+	.text
+
+# Find peak absolute value in signed 16-bit input samples
+#  int peakval_sse_assist(signed short *in,int cnt);
+	.global peakval_sse_assist
+	.type peakval_sse_assist,@function
+	.align 16
+peakval_sse_assist:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %ecx
+
+	movl 8(%ebp),%esi	# esi = in
+	movl 12(%ebp),%ecx	# ecx = cnt
+
+	pxor %mm7,%mm7		# clear peak
+	
+1:	subl $4,%ecx		# each pass consumes four samples
+	jl 2f			# fewer than four left: stop, the remainder is not examined here
+	movq (%esi),%mm0	# load four samples
+	movq %mm0,%mm1	
+	psraw $15,%mm1		# mm1 = 1's if negative, 0's if positive
+	pxor %mm1,%mm0		# complement negatives
+	psubw %mm1,%mm0		# add 1 to negatives
+	pmaxsw %mm0,%mm7	# store peak
+	
+	addl $8,%esi		# advance four samples (8 bytes)
+	jmp 1b	
+
+2:	movq %mm7,%mm0		# horizontal reduction of the four per-word peaks
+	psrlq $32,%mm0		# bring words 2,3 down onto words 0,1
+	pmaxsw %mm0,%mm7
+	movq %mm7,%mm0
+	psrlq $16,%mm0		# bring word 1 down onto word 0
+	pmaxsw %mm0,%mm7	# max value in low word of %mm7
+	
+	movd %mm7,%eax
+	andl $0xffff,%eax	# return only the low 16-bit word
+
+	emms
+	popl %ecx
+	popl %esi
+	popl %ebp
+	ret
diff --git a/rs-common.h b/rs-common.h
new file mode 100644
index 0000000..e64eb39
--- /dev/null
+++ b/rs-common.h
@@ -0,0 +1,26 @@
+/* Stuff common to all the general-purpose Reed-Solomon codecs
+ * Copyright 2004 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+
+/* Reed-Solomon codec control block */
+struct rs {
+  int mm;              /* Bits per symbol */
+  int nn;              /* Symbols per block (= (1<<mm)-1) */
+  data_t *alpha_to;     /* Antilog table, index form -> polynomial form (NOTE(review): was labelled "log"; confirm in init_rs) */
+  data_t *index_of;     /* Log table, polynomial form -> index form (NOTE(review): was labelled "antilog"; confirm in init_rs) */
+  data_t *genpoly;      /* Generator polynomial */
+  int nroots;     /* Number of generator roots = number of parity symbols */
+  int fcr;        /* First consecutive root, index form */
+  int prim;       /* Primitive element, index form */
+  int iprim;      /* prim-th root of 1, index form */
+  int pad;        /* Padding bytes in shortened block */
+};
+
+/* Reduce non-negative x modulo rs->nn without a division */
+static inline int modnn(struct rs *rs,int x){
+  /* per pass: subtract nn, then add the bits above mm back onto the low mm bits */
+  for(;x >= rs->nn;x = (x >> rs->mm) + (x & rs->nn))
+    x -= rs->nn;
+  return x;
+}
diff --git a/rs.3 b/rs.3
new file mode 100644
index 0000000..5d71503
--- /dev/null
+++ b/rs.3
@@ -0,0 +1,198 @@
+.TH REED-SOLOMON 3
+.SH NAME
+init_rs_int, encode_rs_int, decode_rs_int, free_rs_int,
+init_rs_char, encode_rs_char, decode_rs_char, free_rs_char,
+encode_rs_8, decode_rs_8, encode_rs_ccsds, decode_rs_ccsds
+\- Reed-Solomon encoding/decoding
+.SH SYNOPSIS
+.nf
+.ft B
+#include "fec.h"
+
+void *init_rs_int(int symsize,int gfpoly,int fcr,int prim,
+     int nroots,int pad);
+
+void encode_rs_int(void *rs,int *data,int *parity);
+
+int decode_rs_int(void *rs,int *data,int *eras_pos,int no_eras);
+
+void free_rs_int(void *rs);
+
+
+void *init_rs_char(int symsize,int gfpoly,int fcr,int prim,
+     int nroots,int pad);
+
+void encode_rs_char(void *rs,unsigned char *data,
+     unsigned char *parity);
+
+int decode_rs_char(void *rs,unsigned char *data,int *eras_pos,
+     int no_eras);
+
+void free_rs_char(void *rs);
+
+
+void encode_rs_8(unsigned char *data,unsigned char *parity,
+     int pad);
+
+int decode_rs_8(unsigned char *data,int *eras_pos,int no_eras,
+     int pad);
+
+
+void encode_rs_ccsds(unsigned char *data,unsigned char *parity,
+     int pad);
+
+int decode_rs_ccsds(unsigned char *data,int *eras_pos,int no_eras,
+     int pad);
+
+unsigned char Taltab[256];
+unsigned char Tal1tab[256];
+
+.fi
+
+.SH DESCRIPTION
+These functions implement Reed-Solomon error control encoding and
+decoding. For optimal performance in a variety of applications, three
+sets of functions are supplied. To access these functions, add "-lfec"
+to your linker command line.
+
+The functions with names ending in \fB_int\fR handle data in integer arrays,
+permitting arbitrarily large codewords limited only by machine
+resources.
+
+The functions with names ending in \fB_char\fR take unsigned char arrays and can
+handle codes with symbols of 8 bits or less (i.e., with codewords of
+255 symbols or less).
+
+\fBencode_rs_8\fR and \fBdecode_rs_8\fR implement a specific
+(255,223) code with 8-bit symbols specified by the CCSDS:
+a field generator of 1 + X + X^2 + X^7 + X^8 and a code
+generator with first consecutive root = 112 and a primitive element of
+11. These functions use the conventional
+polynomial form, \fInot\fR the dual-basis specified in
+the CCSDS standard, to represent symbols. This code may be
+shortened by giving a non-zero \fBpad\fR value to produce a
+(255-\fBpad\fR,223-\fBpad\fR) code. The padding will consist of the
+specified number of zeroes at the front of the full codeword.
+
+For full CCSDS compatibility, \fBencode_rs_ccsds\fR and
+\fBdecode_rs_ccsds\fR are provided. These functions use two lookup
+tables, \fBTaltab\fR to convert from conventional to dual-basis, and
+\fBTal1tab\fR to perform the inverse mapping from dual-basis to
+conventional form, before and after calls to \fBencode_rs_8\fR
+and \fBdecode_rs_8\fR.
+
+The \fB_8\fR and \fB_ccsds\fR functions do not require initialization.
+
+To use the general purpose RS encoder or decoder (i.e.,
+the \fB_char\fR or \fB_int\fR versions), the user must first
+call \fBinit_rs_int\fR or \fBinit_rs_char\fR as appropriate. The
+arguments are as follows:
+
+\fBsymsize\fR gives the symbol size in bits, up to 8 for \fBinit_rs_char\fR
+or 32 for \fBinit_rs_int\fR on a machine with 32-bit ints (though such a
+huge code would exhaust memory limits on a 32-bit machine). The resulting
+Reed-Solomon code word will have 2^\fBsymsize\fR - 1 symbols,
+each containing \fBsymsize\fR bits. The codeword may be shortened with the
+\fBpad\fR parameter described below.
+
+\fBgfpoly\fR gives the extended Galois field generator polynomial coefficients,
+with the 0th coefficient in the low order bit. The polynomial
+\fImust\fR be primitive; if not, the call will fail and NULL will be
+returned.
+
+\fBfcr\fR gives, in index form, the first consecutive root of the
+Reed Solomon code generator polynomial.
+
+\fBprim\fR gives, in index form, the primitive element in the Galois field
+used to generate the Reed Solomon code generator polynomial.
+
+\fBnroots\fR gives the number of roots in the Reed Solomon code
+generator polynomial. This equals the number of parity symbols
+per code block.
+
+\fBpad\fR gives the number of leading symbols in the codeword
+that are implicitly padded to zero in a shortened code block. 
+
+The resulting Reed-Solomon code has parameters (N,K), where
+N = 2^\fBsymsize\fR - \fBpad\fR - 1 and K = N-\fBnroots\fR.
+
+The \fBencode_rs_char\fR and \fBencode_rs_int\fR functions accept
+the pointer returned by \fBinit_rs_char\fR or
+\fBinit_rs_int\fR, respectively, to
+encode a block of data using the specified code.
+The input data array is expected to
+contain K symbols (of \fBsymsize\fR bits each, right justified
+in each char or int) and \fBnroots\fR parity symbols will be placed
+into the \fBparity\fR array, right justified.
+
+The \fBdecode_\fR functions correct
+the errors in a Reed-Solomon codeword of N symbols up to the capability of the code.
+An optional list of "erased" symbol indices may be given in the \fBeras_pos\fR
+array to assist the decoder; this parameter may be NULL if no erasures
+are given. The number of erased symbols must be given in the \fBno_eras\fR
+parameter.
+
+To maximize performance, the encode and decode functions perform no
+"sanity checking" of their inputs. Decoder failure may result if
+\fBeras_pos\fR contains duplicate entries, and both encoder and
+decoder will fail if an input symbol exceeds its allowable range.
+(Symbol range overflow cannot occur with the \fB_8\fR or
+\fB_ccsds\fR functions,
+or with the \fB_char\fR functions when 8-bit symbols are specified.)
+
+The decoder corrects the symbols "in place", returning the number
+of symbols in error. If the codeword is uncorrectable, -1 is returned
+and the data block is unchanged. If \fBeras_pos\fR is non-null, it is
+used to return a list of corrected symbol positions, in no particular
+order.  This means that the
+array passed through this parameter \fImust\fR have at least \fBnroots\fR
+elements to prevent a possible buffer overflow.
+
+The \fBfree_rs_int\fR and \fBfree_rs_char\fR functions free the internal
+space allocated by the \fBinit_rs_int\fR and \fBinit_rs_char\fR functions,
+respectively.
+
+The functions \fBencode_rs_8\fR and \fBdecode_rs_8\fR do not have
+corresponding \fBinit\fR and \fBfree\fR, nor do they take the
+\fBrs\fR argument accepted by the other functions as their parameters
+are statically compiled. These functions implement a code
+equivalent to calling
+
+\fBinit_rs_char\fR(8,0x187,112,11,32,pad);
+
+and using the resulting pointer with \fBencode_rs_char\fR and
+\fBdecode_rs_char\fR.
+
+.SH RETURN VALUES
+\fBinit_rs_int\fR and \fBinit_rs_char\fR return a pointer to an internal
+control structure that must be passed to the corresponding encode, decode
+and free functions. These functions return NULL on error.
+
+The \fBdecode_\fR functions return a count of corrected
+symbols, or -1 if the block was uncorrectable.
+
+.SH AUTHOR
+Phil Karn, KA9Q (karn@ka9q.net), based heavily on earlier work by Robert
+Morelos-Zaragoza (robert@spectra.eng.hawaii.edu) and Hari Thirumoorthy
+(harit@spectra.eng.hawaii.edu). Extra improvements suggested by Detmar
+Welz (dwelz@web.de).
+
+.SH COPYRIGHT
+Copyright 2004, Phil Karn, KA9Q. May be used under the terms of the
+GNU Lesser General Public License (LGPL).
+
+.SH SEE ALSO
+CCSDS 101.0-B-6: Telemetry Channel Coding.
+http://www.ccsds.org/documents/101x0b6.pdf
+
+.SH NOTE
+CCSDS chose the "dual basis" symbol representation because it
+simplified the implementation of a Reed-Solomon encoder in dedicated
+hardware. However, this approach holds no advantages for a software
+implementation on a general purpose computer, so use of the dual basis
+is recommended only if compatibility with the CCSDS standard is needed,
+e.g., to decode data from an existing spacecraft using the CCSDS
+standard. If you just want a fast (255,223) RS codec without needing
+to interoperate with a CCSDS standard code, use \fBencode_rs_8\fR
+and \fBdecode_rs_8\fR.
+
diff --git a/rs_speedtest.c b/rs_speedtest.c
new file mode 100644
index 0000000..225f160
--- /dev/null
+++ b/rs_speedtest.c
@@ -0,0 +1,54 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include "fec.h"
+
+int main(){
+  unsigned char block[255];
+  int i;
+  void *rs;
+  struct rusage start,finish;
+  double extime;
+  int trials = 10000;
+
+  for(i=0;i<223;i++)
+    block[i] = 0x01;
+
+  rs = init_rs_char(8,0x187,112,11,32,0);
+  encode_rs_char(rs,block,&block[223]);
+
+  getrusage(RUSAGE_SELF,&start);
+  for(i=0;i<trials;i++){
+#if 0
+    block[0] ^= 0xff; /* Introduce an error */
+    block[2] ^= 0xff; /* Introduce an error */
+#endif
+    decode_rs_char(rs,block,NULL,0);
+  }
+  getrusage(RUSAGE_SELF,&finish);
+  extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec);
+  
+  printf("Execution time for %d Reed-Solomon blocks using general decoder: %.2f sec\n",trials,extime);
+  printf("decoder speed: %g bits/s\n",trials*223*8/extime);
+
+
+  encode_rs_8(block,&block[223],0);
+  getrusage(RUSAGE_SELF,&start);
+  for(i=0;i<trials;i++){
+#if 0
+    block[0] ^= 0xff; /* Introduce an error */
+    block[2] ^= 0xff; /* Introduce an error */
+#endif
+    decode_rs_8(block,NULL,0,0);
+  }
+  getrusage(RUSAGE_SELF,&finish);
+  extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec);
+  printf("Execution time for %d Reed-Solomon blocks using CCSDS decoder: %.2f sec\n",trials,extime);
+  printf("decoder speed: %g bits/s\n",trials*223*8/extime);
+
+  exit(0);
+}
+
diff --git a/rstest.c b/rstest.c
new file mode 100644
index 0000000..539b40a
--- /dev/null
+++ b/rstest.c
@@ -0,0 +1,296 @@
+/* Test the Reed-Solomon codecs
+ * for various block sizes and with random data and random error patterns
+ *
+ * Copyright 2002 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <time.h>
+#include "fec.h"
+
+
+struct etab { /* one Reed-Solomon code configuration to exercise */
+  int symsize; /* symbol size in bits; the block length is (1<<symsize)-1 symbols */
+  int genpoly; /* passed as the 2nd argument of init_rs_char()/init_rs_int() */
+  int fcs; /* passed as the 3rd argument of the init routine */
+  int prim; /* passed as the 4th argument of the init routine */
+  int nroots; /* parity symbols per block; data symbols = block length - nroots */
+  int ntrials; /* not read by any routine in this file */
+} Tab[] = {
+  {2, 0x7,     1,   1, 1, 10 },
+  {3, 0xb,     1,   1, 2, 10 },
+  {4, 0x13,    1,   1, 4, 10 },
+  {5, 0x25,    1,   1, 6, 10 },
+  {6, 0x43,    1,   1, 8, 10 },
+  {7, 0x89,    1,   1, 10, 10 },
+  {8, 0x11d,   1,   1, 32, 10 },
+  {8, 0x187,   112,11, 32, 10 }, /* Duplicates CCSDS codec */
+  {9, 0x211,   1,   1, 32, 10 },
+  {10,0x409,   1,   1, 32, 10 },
+  {11,0x805,   1,   1, 32, 10 },
+  {12,0x1053,  1,   1, 32, 5 },
+  {13,0x201b,  1,   1, 32, 2 },
+  {14,0x4443,  1,   1, 32, 1 },
+  {15,0x8003,  1,   1, 32, 1 },
+  {16,0x1100b, 1,   1, 32, 1 },
+  {0, 0, 0, 0, 0}, /* symsize == 0 ends the table (main stops here) */
+};
+
+int exercise_char(struct etab *e);
+int exercise_int(struct etab *e);
+int exercise_8(void);
+
+int main(){
+  int i;
+  int failed = 0; /* becomes 1 once any exercise routine returns nonzero */
+
+  srandom(time(NULL));
+  printf("Testing fixed CCSDS encoder...\n");
+  failed |= (exercise_8() != 0);
+  for(i=0;Tab[i].symsize != 0;i++){
+    int nn,kk;
+
+    nn = (1<<Tab[i].symsize) - 1;
+    kk = nn - Tab[i].nroots;
+    printf("Testing (%d,%d) code...\n",nn,kk);
+    if(Tab[i].symsize <= 8) /* symbols of up to 8 bits use the char codec */
+      failed |= (exercise_char(&Tab[i]) != 0);
+    else
+      failed |= (exercise_int(&Tab[i]) != 0);
+  }
+  exit(failed); /* let "make test" see failures through the exit status */
+}
+
+/* Exercise the fixed CCSDS (255,223) codec with 0 through 16 random symbol errors
+ * per block. Returns the number of discrepancies that were reported. */
+int exercise_8(void){
+  int nn = 255;
+  unsigned char block[nn],tblock[nn];
+  int errlocs[nn],derrlocs[nn];
+  int i;
+  int errors;
+  int derrors,kk;
+  int errval,errloc;
+  int erasures;
+  int decoder_errors = 0;
+
+  /* Compute code parameters */
+  kk = 223;
+
+  /* Test up to the error correction capacity of the code */
+  for(errors=0;errors<=(nn-kk)/2;errors++){
+
+    /* Load block with random data and encode */
+    for(i=0;i<kk;i++)
+      block[i] = random() & nn;
+    encode_rs_8(block,&block[kk],0);
+
+    /* Make temp copy (only now, once the parity bytes exist), seed with errors */
+    memcpy(tblock,block,sizeof(block));
+    memset(errlocs,0,sizeof(errlocs));
+    memset(derrlocs,0,sizeof(derrlocs));
+    erasures=0;
+    for(i=0;i<errors;i++){
+      do {
+        errval = random() & nn;
+      } while(errval == 0); /* Error value must be nonzero */
+
+      do {
+        errloc = random() % nn;
+      } while(errlocs[errloc] != 0); /* Must not choose the same location twice */
+
+      errlocs[errloc] = 1;
+
+#if FLAG_ERASURE
+      if(random() & 1) /* 50-50 chance */
+        derrlocs[erasures++] = errloc;
+#endif
+      tblock[errloc] ^= errval;
+    }
+
+    /* Decode the errored block */
+    derrors = decode_rs_8(tblock,derrlocs,erasures,0);
+
+    if(derrors != errors){
+      printf("(%d,%d) decoder says %d errors, true number is %d\n",nn,kk,derrors,errors);
+      decoder_errors++;
+    }
+    for(i=0;i<derrors;i++){
+      if(errlocs[derrlocs[i]] == 0){
+        printf("(%d,%d) decoder indicates error in location %d without error\n",nn,kk,derrlocs[i]);
+        decoder_errors++;
+      }
+    }
+    if(memcmp(tblock,block,sizeof(tblock)) != 0){
+      printf("(%d,%d) decoder uncorrected errors! output ^ input:",nn,kk);
+      decoder_errors++;
+      for(i=0;i<nn;i++)
+        printf(" %02x",tblock[i] ^ block[i]);
+      printf("\n");
+    }
+  }
+  return decoder_errors;
+}
+
+
+/* Exercise the char-symbol codec described by *e with 0..nroots/2 random symbol errors.
+ * Returns -1 if the codec cannot be set up, else the number of discrepancies reported. */
+int exercise_char(struct etab *e){
+  int nn = (1<<e->symsize) - 1;
+  unsigned char block[nn],tblock[nn];
+  int errlocs[nn],derrlocs[nn];
+  int i;
+  int errors;
+  int derrors,kk;
+  int errval,errloc;
+  int erasures;
+  int decoder_errors = 0;
+  void *rs;
+
+  if(e->symsize > 8)
+    return -1;
+
+  /* Compute code parameters */
+  kk = nn - e->nroots;
+
+  rs = init_rs_char(e->symsize,e->genpoly,e->fcs,e->prim,e->nroots,0);
+  if(rs == NULL){
+    printf("init_rs_char failed!\n");
+    return -1;
+  }
+  /* Test up to the error correction capacity of the code */
+  for(errors=0;errors <= e->nroots/2;errors++){
+    /* Load block with random data and encode */
+    for(i=0;i<kk;i++)
+      block[i] = random() & nn;
+    encode_rs_char(rs,block,&block[kk]);
+
+    /* Make temp copy (only now, once the parity symbols exist), seed with errors */
+    memcpy(tblock,block,sizeof(block));
+    memset(errlocs,0,sizeof(errlocs));
+    memset(derrlocs,0,sizeof(derrlocs));
+    erasures=0;
+    for(i=0;i<errors;i++){
+      do {
+        errval = random() & nn;
+      } while(errval == 0); /* Error value must be nonzero */
+
+      do {
+        errloc = random() % nn;
+      } while(errlocs[errloc] != 0); /* Must not choose the same location twice */
+
+      errlocs[errloc] = 1;
+
+#if FLAG_ERASURE
+      if(random() & 1) /* 50-50 chance */
+        derrlocs[erasures++] = errloc;
+#endif
+      tblock[errloc] ^= errval;
+    }
+
+    /* Decode the errored block */
+    derrors = decode_rs_char(rs,tblock,derrlocs,erasures);
+
+    if(derrors != errors){
+      printf("(%d,%d) decoder says %d errors, true number is %d\n",nn,kk,derrors,errors);
+      decoder_errors++;
+    }
+    for(i=0;i<derrors;i++){
+      if(errlocs[derrlocs[i]] == 0){
+        printf("(%d,%d) decoder indicates error in location %d without error\n",nn,kk,derrlocs[i]);
+        decoder_errors++;
+      }
+    }
+    if(memcmp(tblock,block,sizeof(tblock)) != 0){
+      printf("(%d,%d) decoder uncorrected errors! output ^ input:",nn,kk);
+      decoder_errors++;
+      for(i=0;i<nn;i++)
+        printf(" %02x",tblock[i] ^ block[i]);
+      printf("\n");
+    }
+  }
+
+  free_rs_char(rs);
+  return decoder_errors; /* was a constant 0, which hid every reported discrepancy */
+}
+
+/* Exercise the int-symbol codec described by *e with 0..nroots/2 random symbol errors.
+ * Returns -1 if the codec cannot be set up, else the number of discrepancies reported. */
+int exercise_int(struct etab *e){
+  int nn = (1<<e->symsize) - 1;
+  int block[nn],tblock[nn];
+  int errlocs[nn],derrlocs[nn];
+  int i;
+  int errors;
+  int derrors,kk;
+  int errval,errloc;
+  int erasures;
+  int decoder_errors = 0;
+  void *rs;
+
+  /* Compute code parameters */
+  kk = nn - e->nroots;
+
+  rs = init_rs_int(e->symsize,e->genpoly,e->fcs,e->prim,e->nroots,0);
+  if(rs == NULL){
+    printf("init_rs_int failed!\n");
+    return -1;
+  }
+  /* Test up to the error correction capacity of the code */
+  for(errors=0;errors <= e->nroots/2;errors++){
+    /* Load block with random data and encode */
+    for(i=0;i<kk;i++)
+      block[i] = random() & nn;
+    encode_rs_int(rs,block,&block[kk]);
+
+    /* Make temp copy (only now, once the parity symbols exist), seed with errors */
+    memcpy(tblock,block,sizeof(block));
+    memset(errlocs,0,sizeof(errlocs));
+    memset(derrlocs,0,sizeof(derrlocs));
+    erasures=0;
+    for(i=0;i<errors;i++){
+      do {
+        errval = random() & nn;
+      } while(errval == 0); /* Error value must be nonzero */
+
+      do {
+        errloc = random() % nn;
+      } while(errlocs[errloc] != 0); /* Must not choose the same location twice */
+
+      errlocs[errloc] = 1;
+
+#if FLAG_ERASURE
+      if(random() & 1) /* 50-50 chance */
+        derrlocs[erasures++] = errloc;
+#endif
+      tblock[errloc] ^= errval;
+    }
+
+    /* Decode the errored block */
+    derrors = decode_rs_int(rs,tblock,derrlocs,erasures);
+
+    if(derrors != errors){
+      printf("(%d,%d) decoder says %d errors, true number is %d\n",nn,kk,derrors,errors);
+      decoder_errors++;
+    }
+    for(i=0;i<derrors;i++){
+      if(errlocs[derrlocs[i]] == 0){
+        printf("(%d,%d) decoder indicates error in location %d without error\n",nn,kk,derrlocs[i]);
+        decoder_errors++;
+      }
+    }
+    if(memcmp(tblock,block,sizeof(tblock)) != 0){
+      printf("(%d,%d) decoder uncorrected errors! output ^ input:",nn,kk);
+      decoder_errors++;
+      for(i=0;i<nn;i++)
+        printf(" %02x",tblock[i] ^ block[i]);
+      printf("\n");
+    }
+  }
+
+  free_rs_int(rs);
+  return decoder_errors; /* was a constant 0, which hid every reported discrepancy */
+}
diff --git a/sim.c b/sim.c
new file mode 100644
index 0000000..151b04c
--- /dev/null
+++ b/sim.c
@@ -0,0 +1,43 @@
+#include <math.h>
+#include <stdlib.h>
+#include "fec.h"
+
+#define	MAX_RANDOM	0x7fffffff
+
+/* Return a gaussian random double with the given mean and standard deviation */
+double normal_rand(double mean, double std_dev)
+{
+  static int have_spare;   /* nonzero while spare holds an unused deviate */
+  static double spare;     /* second deviate of the most recent pair */
+  double x,y,radius2,scale;
+
+  if(have_spare){
+    /* Hand out the deviate saved by the previous call */
+    have_spare = 0;
+    return mean + std_dev*spare;
+  }
+  /* Draw points with both coordinates between -1 and +1 until one
+   * falls inside the unit circle and is not the origin */
+  for(;;){
+    x = 2.0 * (double)random() / MAX_RANDOM - 1;
+    y = 2.0 * (double)random() / MAX_RANDOM - 1;
+    radius2 = x*x + y*y;
+    if(!(radius2 >= 1.0 || radius2 == 0.0))
+      break;
+  }
+  scale = sqrt(-2.0*log(radius2)/radius2);
+  spare = x*scale;   /* keep one deviate for the next call */
+  have_spare = 1;
+  return mean + std_dev*y*scale;
+}
+
+unsigned char addnoise(int sym,double amp,double gain,double offset,int clip){
+  /* A nonzero sym is centred on +amp, a zero sym on -amp; the noise has unit std_dev */
+  double level = sym ? amp : -amp;
+  int noisy = offset + gain*normal_rand(level,1.0);
+  /* Limit the sample to the range 0..clip */
+  if(noisy < 0)
+    return 0;
+  if(noisy > clip)
+    return clip;
+  return noisy;
+}
diff --git a/simd-viterbi.3 b/simd-viterbi.3
new file mode 100644
index 0000000..4c67593
--- /dev/null
+++ b/simd-viterbi.3
@@ -0,0 +1,247 @@
+.TH SIMD-VITERBI 3
+.SH NAME
+create_viterbi27, set_viterbi27_polynomial, init_viterbi27, update_viterbi27_blk,
+chainback_viterbi27, delete_viterbi27,
+create_viterbi29, set_viterbi29_polynomial, init_viterbi29, update_viterbi29_blk,
+chainback_viterbi29, delete_viterbi29,
+create_viterbi39, set_viterbi39_polynomial, init_viterbi39, update_viterbi39_blk,
+chainback_viterbi39, delete_viterbi39,
+create_viterbi615, set_viterbi615_polynomial, init_viterbi615, update_viterbi615_blk,
+chainback_viterbi615, delete_viterbi615 \- IA32 SIMD-assisted Viterbi decoders
+.SH SYNOPSIS
+.nf
+.ft B
+#include "fec.h"
+void *create_viterbi27(int blocklen);
+void set_viterbi27_polynomial(int polys[2]);
+int init_viterbi27(void *vp,int starting_state);
+int update_viterbi27_blk(void *vp,unsigned char syms[],int nbits);
+int chainback_viterbi27(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi27(void *vp);
+.fi
+.sp
+.nf
+.ft B
+void *create_viterbi29(int blocklen);
+void set_viterbi29_polynomial(int polys[2]);
+int init_viterbi29(void *vp,int starting_state);
+int update_viterbi29_blk(void *vp,unsigned char syms[],int nbits);
+int chainback_viterbi29(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi29(void *vp);
+.fi
+.sp
+.nf
+.ft B
+void *create_viterbi39(int blocklen);
+void set_viterbi39_polynomial(int polys[3]);
+int init_viterbi39(void *vp,int starting_state);
+int update_viterbi39_blk(void *vp,unsigned char syms[],int nbits);
+int chainback_viterbi39(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi39(void *vp);
+.fi
+.sp
+.nf
+.ft B
+void *create_viterbi615(int blocklen);
+void set_viterbi615_polynomial(int polys[6]);
+int init_viterbi615(void *vp,int starting_state);
+int update_viterbi615_blk(void *vp,unsigned char syms[],int nbits);
+int chainback_viterbi615(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi615(void *vp);
+.fi
+.SH DESCRIPTION
+These functions implement high performance Viterbi decoders for four
+convolutional codes: a rate 1/2 constraint length 7 (k=7) code
+("viterbi27"), a rate 1/2 k=9 code ("viterbi29"),
+a rate 1/3 k=9 code ("viterbi39") and a rate 1/6 k=15 code ("viterbi615").
+The decoders use the Intel IA32 or PowerPC SIMD instruction sets, if available, to improve
+decoding speed.
+
+On the IA32 there are three different SIMD instruction sets. The first
+and most common is MMX, introduced on later Intel Pentiums and then on
+the Intel Pentium II and most Intel clones (AMD K6, Transmeta Crusoe,
+etc).  SSE was introduced on the Pentium III and later implemented in
+the AMD Athlon 4 (AMD calls it "3D Now!  Professional"). Most
+recently, SSE2 was introduced in the Intel Pentium 4, and has been
+adopted by more recent AMD CPUs. The presence of SSE2 implies the
+existence of SSE, which in turn implies MMX.
+
+Altivec is the PowerPC SIMD instruction set. It is roughly comparable
+to SSE2. Altivec was introduced to the general public in the Apple
+Macintosh G4; it is also present in the G5. Altivec is actually a
+Motorola trademark; Apple calls it "Velocity Engine" and IBM calls it
+"VMX". All refer to the same thing.
+
+When built for the IA32 or PPC architectures, the functions
+automatically use the most powerful SIMD instruction set available. If
+no SIMD instructions are available, or if the library is built for a
+non-IA32, non-PPC machine, a portable C version is executed
+instead.
+
+.SH USAGE
+Four versions of each function are provided, one for each code.
+In the following discussion, change "viterbi" to "viterbi27", "viterbi29", "viterbi39"
+or "viterbi615" as desired. 
+
+Before Viterbi decoding can begin, an instance must first be created with
+\fBcreate_viterbi()\fR.  This function creates and returns a pointer to
+an internal control structure
+containing the path metrics and the branch
+decisions. \fBcreate_viterbi()\fR takes one argument that gives the
+length of the data block in bits. You \fImust not\fR attempt to
+decode a block longer than the length given to \fBcreate_viterbi()\fR.
+
+Before decoding a new frame,
+\fBinit_viterbi()\fR must be called to reset the decoder state.
+It accepts the instance pointer returned by
+\fBcreate_viterbi()\fR and the initial starting state of the
+convolutional encoder (usually 0). If the initial starting state is unknown or
+incorrect, the decoder will still function but the decoded data may be
+incorrect at the start of the block.
+
+Blocks of received symbols are processed with calls to
+\fBupdate_viterbi_blk()\fR.  The \fBnbits\fR parameter specifies the
+number of \fIdata bits\fR (not channel symbols) represented by the
+\fBsyms\fR buffer. (For rate 1/2 codes, the number of symbols in
+\fBsyms\fR is twice \fInbits\fR, and so on.)
+Each symbol is expected to range
+from 0 through 255, with 0 corresponding to a "strong 0" and 255
+corresponding to a "strong 1". The caller is responsible for
+determining the proper pairing of input symbols (commonly known as
+decoder symbol phasing).
+
+At the end of the block, the data is recovered with a call to
+\fBchainback_viterbi()\fR. The arguments are the pointer to the
+decoder instance, a pointer to a user-supplied buffer into which the
+decoded data is to be written, the number of data bits (not bytes)
+that are to be decoded, and the terminal state of the convolutional
+encoder at the end of the frame (usually 0). If the terminal state is
+incorrect or unknown, the decoded data bits at the end of the frame
+may be unreliable. The decoded data is written in big-endian order,
+i.e., the first bit in the frame is written into the high order bit of
+the first byte in the buffer. If the frame is not an integral number
+of bytes long, the low order bits of the last byte in the frame will
+be unused.
+
+Note that the decoders assume the use of a tail, i.e., the encoding
+and transmission of a sufficient number of padding bits beyond the end
+of the user data to force the convolutional encoder into the known
+terminal state given to \fBchainback_viterbi()\fR. The tail is
+always one bit less than the constraint length of the code, so the k=7
+code uses 6 tail bits (12 tail symbols), the k=9 codes use 8 tail bits
+(16 tail symbols at rate 1/2, 24 at rate 1/3) and the k=15 code uses
+14 tail bits (84 tail symbols).
+
+The tail bits are not included in the length arguments to
+\fBcreate_viterbi()\fR and \fBchainback_viterbi()\fR. For example, if
+the block contains 1000 user bits, then this would be the length
+parameter given to \fBcreate_viterbi27()\fR and
+\fBchainback_viterbi27()\fR, and \fBupdate_viterbi27_blk()\fR would be called
+with a total of 2012 symbols - the last 12 encoded symbols
+representing the tail bits.
+
+After the call to \fBchainback_viterbi()\fR, the decoder may be reset
+with a call to \fBinit_viterbi()\fR and another block can be decoded.
+Alternatively, \fBdelete_viterbi()\fR can be called to free all resources
+used by the Viterbi decoder.
+
+The \fBset_viterbi_polynomial()\fR function allows use of other than the default
+code generator polynomials. Although only one set of polynomials is generally
+used with each code, there are different conventions as to their order and
+symbol polarity, and these functions simplify their use.
+
+The default polynomials for the viterbi27 routines
+are those of the NASA-JPL convention \fIwithout\fR symbol inversion.
+The NASA-JPL convention normally inverts the first symbol.
+The CCSDS/NASA-GSFC convention swaps the two symbols and inverts the second.
+.sp
+To set the NASA-JPL convention with symbol inversion:
+.sp
+.nf
+.ft B
+int polys[2] = { -V27POLYA,V27POLYB };
+set_viterbi27_polynomial(polys);
+.ft R
+.fi
+.sp
+and to set the CCSDS convention with symbol inversion:
+.sp
+.nf
+.ft B
+int polys[2] = { V27POLYB,-V27POLYA };
+set_viterbi27_polynomial(polys);
+.ft R
+.fi
+.sp
+The default polynomials for the viterbi615 routines
+are those used by the Cassini spacecraft \fIwithout\fR
+symbol inversion. Mars Pathfinder (MPF) and STEREO
+swap the third and fourth polynomials.
+Both conventions invert the
+first, third and fifth symbols. Refer to fec.h for the polynomial constant definitions.
+.sp
+To set the Cassini convention with symbol inversion, do the following:
+
+.nf
+.ft B
+int polys[6] = { -V615POLYA,V615POLYB,-V615POLYC,V615POLYD,-V615POLYE,V615POLYF };
+set_viterbi615_polynomial(polys);
+.ft R
+.fi
+.sp
+and to set the MPF/STEREO convention with symbol inversion:
+.sp
+.nf
+.ft B
+int polys[6] = { -V615POLYA,V615POLYB,-V615POLYD,V615POLYC,-V615POLYE,V615POLYF };
+set_viterbi615_polynomial(polys);
+.ft R
+.fi
+
+For performance reasons, calling this function changes the code
+generator polynomials for \fIall\fR instances of the corresponding Viterbi decoder,
+including those already created.
+
+.SH ERROR PERFORMANCE
+These decoders have all been extensively tested and found to provide
+performance consistent with that expected for soft-decision Viterbi
+decoding with 8-bit symbols.
+
+Due to internal differences, the implementations
+vary slightly in error performance. In
+general, the portable C versions exhibit the best error performance
+because they use full-sized branch metrics, and the MMX versions
+exhibit the worst because they use 8-bit branch metrics with modulo
+comparisons. The SSE, SSE2 and Altivec implementations of the r=1/2 k=7 and
+r=1/2 k=9 codes use unsigned
+8-bit branch metrics, and are almost as good as the C versions.  The
+r=1/3 k=9 and r=1/6 k=15 codes are implemented with 16-bit path metrics in all SIMD
+versions.
+
+.SH DIRECT ACCESS TO SPECIFIC FUNCTION VERSIONS
+Calling the functions listed above automatically calls the appropriate
+version of the function depending on the CPU type and available SIMD
+instructions. A particular version can also be called directly by
+appending the appropriate suffix to the function name. The available
+suffixes are "_mmx", "_sse", "_sse2", "_av" and "_port", for the MMX,
+SSE, SSE2, Altivec and portable versions, respectively. For example,
+the SSE2 version of the update_viterbi27_blk() function can be invoked
+as update_viterbi27_blk_sse2().
+
+Naturally, the _av functions are only available on the PowerPC and the
+_mmx, _sse and _sse2 versions are only available on IA-32. Calling
+a SIMD-enabled function on a CPU that doesn't support the appropriate
+set of instructions will result in an illegal instruction exception.
+
+.SH RETURN VALUES
+\fBcreate_viterbi\fR returns a pointer to the structure containing
+the decoder state. 
+The other functions return -1 on error, 0 otherwise.
+
+.SH AUTHOR & COPYRIGHT
+Phil Karn, KA9Q (karn@ka9q.net)
+
+.SH LICENSE
+This software may be used under the terms of the GNU Lesser General Public License (LGPL).
+
+
diff --git a/sqtest.c b/sqtest.c
new file mode 100644
index 0000000..b2abb09
--- /dev/null
+++ b/sqtest.c
@@ -0,0 +1,42 @@
+/* Verify correctness of the sum-of-square routines */
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+/* These values should trigger leading/trailing array fragment handling */
+#define NSAMP 200002
+#define OFFSET 1
+
+long long sumsq_wq(signed short *in,int cnt);
+long long sumsq_wq_ref(signed short *in,int cnt);
+
+int main(){
+  int i;
+  long long result,rresult; /* optimized result and reference result */
+  signed short samples[NSAMP];
+
+  srandom(time(NULL));
+
+  for(i=0;i<NSAMP;i++)
+    samples[i] = random() & 0xffff;
+
+  rresult = sumsq_wq_ref(&samples[OFFSET],NSAMP-OFFSET); /* plain C reference */
+  result = sumsq_wq(&samples[OFFSET],NSAMP-OFFSET); /* routine under test */
+  if(result == rresult){
+    printf("OK\n");
+  } else {
+    printf("sum mismatch: %lld != %lld\n",result,rresult);
+  }
+  exit(0);
+}
+
+long long sumsq_wq_ref(signed short *in,int cnt){
+  /* Plain C reference: add up the square of each of the cnt samples */
+  const signed short *p = in;
+  long long total = 0;
+  int left;
+  for(left=cnt;left>0;left--,p++)
+    total += (long)*p * *p;
+  return total;
+}
+
diff --git a/sse2bfly27.s b/sse2bfly27.s
new file mode 100644
index 0000000..27422a2
--- /dev/null
+++ b/sse2bfly27.s
@@ -0,0 +1,202 @@
+/* Intel SIMD (SSE2) implementations of Viterbi ACS butterflies
+   for 64-state (k=7) convolutional code
+   Copyright 2003 Phil Karn, KA9Q
+   This code may be used under the terms of the GNU Lesser General Public License (LGPL)
+
+   int update_viterbi27_blk_sse2(struct v27 *vp,unsigned char syms[],int nbits) ;
+*/
+	# SSE2 (128-bit integer SIMD) version
+	# Requires Pentium 4 or better
+
+	# These are offsets into struct v27, defined in viterbi27.h
+	.set DP,128
+	.set OLDMETRICS,132
+	.set NEWMETRICS,136
+	.text	
+	.global update_viterbi27_blk_sse2,Branchtab27_sse2
+	.type update_viterbi27_blk_sse2,@function
+	.align 16
+	
+update_viterbi27_blk_sse2:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi		# saved here, restored at err
+	pushl %edi
+	pushl %edx
+	pushl %ebx
+	
+	movl 8(%ebp),%edx	# edx = vp
+	testl %edx,%edx
+	jnz  0f
+	movl $-1,%eax		# null vp: return -1 ('$' = immediate; without it this loads from address -1)
+	jmp  err		
+0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
+	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
+	movl DP(%edx),%edx	# edx -> decisions
+
+1:	movl 16(%ebp),%eax	# eax = nbits
+	decl %eax
+	jl   2f			# passed zero, we're done
+	movl %eax,16(%ebp)
+
+	xorl %eax,%eax
+	movl 12(%ebp),%ebx	# ebx = syms
+	movb (%ebx),%al
+	movd %eax,%xmm6		# xmm6[0] = first symbol
+	movb 1(%ebx),%al
+	movd %eax,%xmm5		# xmm5[0] = second symbol
+	addl $2,%ebx
+	movl %ebx,12(%ebp)
+
+	punpcklbw %xmm6,%xmm6	# xmm6[1] = xmm6[0]
+	punpcklbw %xmm5,%xmm5
+	pshuflw $0,%xmm6,%xmm6	# copy low word to low 3
+	pshuflw $0,%xmm5,%xmm5
+	punpcklqdq %xmm6,%xmm6  # propagate to all 16
+	punpcklqdq %xmm5,%xmm5
+	# xmm6 now contains first symbol in each byte, xmm5 the second
+
+	movdqa thirtyones,%xmm7
+	
+	# each invocation of this macro does 16 butterflies in parallel
+	.MACRO butterfly GROUP
+	# compute branch metrics
+	movdqa Branchtab27_sse2+(16*\GROUP),%xmm4
+	movdqa Branchtab27_sse2+32+(16*\GROUP),%xmm3
+	pxor %xmm6,%xmm4
+	pxor %xmm5,%xmm3
+	
+	# compute 5-bit branch metric in xmm4 by adding the individual symbol metrics
+	# This is okay for this
+	# code because the worst-case metric spread (at high Eb/No) is only 120,
+	# well within the range of our unsigned 8-bit path metrics, and even within
+	# the range of signed 8-bit path metrics
+	pavgb %xmm3,%xmm4
+	psrlw $3,%xmm4
+
+	pand %xmm7,%xmm4
+
+	movdqa (16*\GROUP)(%esi),%xmm0	# Incoming path metric, high bit = 0
+	movdqa ((16*\GROUP)+32)(%esi),%xmm3	# Incoming path metric, high bit = 1
+	movdqa %xmm0,%xmm2
+	movdqa %xmm3,%xmm1
+	paddusb %xmm4,%xmm0	# note use of saturating arithmetic
+	paddusb %xmm4,%xmm3	# this shouldn't be necessary, but why not?
+	
+	# negate branch metrics
+	pxor %xmm7,%xmm4
+	paddusb %xmm4,%xmm1
+	paddusb %xmm4,%xmm2	
+	
+	# Find survivors, leave in xmm0,2
+	pminub %xmm1,%xmm0
+	pminub %xmm3,%xmm2
+	# get decisions, leave in xmm1,3
+	pcmpeqb %xmm0,%xmm1
+	pcmpeqb %xmm2,%xmm3
+	
+	# interleave and store new branch metrics in mm0,2
+	movdqa %xmm0,%xmm4
+	punpckhbw %xmm2,%xmm0	# interleave second 16 new metrics
+	punpcklbw %xmm2,%xmm4	# interleave first 16 new metrics
+	movdqa %xmm0,(32*\GROUP+16)(%edi)
+	movdqa %xmm4,(32*\GROUP)(%edi)
+
+	# interleave decisions & store
+	movdqa %xmm1,%xmm4
+	punpckhbw %xmm3,%xmm1
+	punpcklbw %xmm3,%xmm4
+	# work around bug in gas due to Intel doc error
+	.byte 0x66,0x0f,0xd7,0xd9	# pmovmskb %xmm1,%ebx
+	shll $16,%ebx
+	.byte 0x66,0x0f,0xd7,0xc4	# pmovmskb %xmm4,%eax
+	orl %eax,%ebx
+	movl %ebx,(4*\GROUP)(%edx)
+	.endm
+
+	# invoke macro 2 times for a total of 32 butterflies
+	butterfly GROUP=0
+	butterfly GROUP=1
+
+	addl $8,%edx		# bump decision pointer
+		
+	# See if we have to normalize. This requires an explanation. We don't want
+	# our path metrics to exceed 255 on the *next* iteration. Since the
+	# largest branch metric is 30, that means we don't want any to exceed 225
+	# on *this* iteration. Rather than look them all, we just pick an arbitrary one
+	# (the first) and see if it exceeds 225-120=105, where 120 is the experimentally-
+	# determined worst-case metric spread for this code and branch metrics in the range 0-30.
+	
+	# This is extremely conservative, and empirical testing at a variety of Eb/Nos might
+	# show that a higher threshold could be used without affecting BER performance
+	movl (%edi),%eax	# extract first output metric
+	andl $255,%eax
+	cmp $105,%eax
+	jle done		# No, no need to normalize
+
+	# Normalize by finding smallest metric and subtracting it
+	# from all metrics. We can't just pick an arbitrary small constant because
+	# the minimum metric might be zero!
+	movdqa (%edi),%xmm0
+	movdqa %xmm0,%xmm4	
+	movdqa 16(%edi),%xmm1
+	pminub %xmm1,%xmm4
+	movdqa 32(%edi),%xmm2
+	pminub %xmm2,%xmm4	
+	movdqa 48(%edi),%xmm3	
+	pminub %xmm3,%xmm4
+
+	# crunch down to single lowest metric
+	movdqa %xmm4,%xmm5
+	psrldq $8,%xmm5     # the count to psrldq is bytes, not bits!
+	pminub %xmm5,%xmm4
+	movdqa %xmm4,%xmm5
+	psrlq $32,%xmm5
+	pminub %xmm5,%xmm4
+	movdqa %xmm4,%xmm5
+	psrlq $16,%xmm5
+	pminub %xmm5,%xmm4
+	movdqa %xmm4,%xmm5
+	psrlq $8,%xmm5
+	pminub %xmm5,%xmm4	# now in lowest byte of %xmm4
+
+	punpcklbw %xmm4,%xmm4	# lowest 2 bytes
+	pshuflw $0,%xmm4,%xmm4  # lowest 8 bytes
+	punpcklqdq %xmm4,%xmm4	# all 16 bytes
+	
+	# xmm4 now contains lowest metric in all 16 bytes
+	# subtract it from every output metric
+	psubusb %xmm4,%xmm0
+	psubusb %xmm4,%xmm1
+	psubusb %xmm4,%xmm2
+	psubusb %xmm4,%xmm3	
+	movdqa %xmm0,(%edi)
+	movdqa %xmm1,16(%edi)	
+	movdqa %xmm2,32(%edi)	
+	movdqa %xmm3,48(%edi)	
+	
+done:		
+	# swap metrics
+	movl %esi,%eax
+	movl %edi,%esi
+	movl %eax,%edi
+	jmp 1b
+	
+2:	movl 8(%ebp),%ebx	# ebx = vp
+	# stash metric pointers
+	movl %esi,OLDMETRICS(%ebx)
+	movl %edi,NEWMETRICS(%ebx)
+	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
+	xorl %eax,%eax
+err:	popl %ebx
+	popl %edx
+	popl %edi
+	popl %esi
+	popl %ebp
+	ret
+
+	.data
+	.align 16
+
+thirtyones:
+	.byte 31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31
diff --git a/sse2bfly29.s b/sse2bfly29.s
new file mode 100644
index 0000000..0fa1742
--- /dev/null
+++ b/sse2bfly29.s
@@ -0,0 +1,245 @@
+/* Intel SIMD SSE2 implementation of Viterbi ACS butterflies
+   for 256-state (k=9) convolutional code
+   Copyright 2004 Phil Karn, KA9Q
+   This code may be used under the terms of the GNU Lesser General Public License (LGPL)
+
+   int update_viterbi29_blk_sse2(struct v29 *vp,unsigned char *syms,int nbits) ;
+*/
+
+	# SSE2 (128-bit integer SIMD) version
+	# Requires Pentium 4 or better
+	# These are offsets into struct v29, defined in viterbi29.h
+	.set DP,512
+	.set OLDMETRICS,516
+	.set NEWMETRICS,520
+
+	.text	
+	.global update_viterbi29_blk_sse2,Branchtab29_sse2
+	.type update_viterbi29_blk_sse2,@function
+	.align 16
+	
+update_viterbi29_blk_sse2:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi		# saved here, restored at err
+	pushl %edi
+	pushl %edx
+	pushl %ebx
+	
+	movl 8(%ebp),%edx	# edx = vp
+	testl %edx,%edx
+	jnz  0f
+	movl $-1,%eax		# null vp: return -1 ('$' = immediate; without it this loads from address -1)
+	jmp  err		
+0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
+	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
+	movl DP(%edx),%edx	# edx -> decisions
+
+1:	movl 16(%ebp),%eax	# eax = nbits
+	decl %eax
+	jl   2f			# passed zero, we're done
+	movl %eax,16(%ebp)
+	
+	xorl %eax,%eax
+	movl 12(%ebp),%ebx	# ebx = syms
+	movb (%ebx),%al
+	movd %eax,%xmm6		# xmm6[0] = first symbol
+	movb 1(%ebx),%al
+	movd %eax,%xmm5		# xmm5[0] = second symbol
+	addl $2,%ebx
+	movl %ebx,12(%ebp)
+
+	punpcklbw %xmm6,%xmm6	# xmm6[1] = xmm6[0]
+	punpcklbw %xmm5,%xmm5
+	movdqa thirtyones,%xmm7
+	pshuflw $0,%xmm6,%xmm6	# copy low word to low 3
+	pshuflw $0,%xmm5,%xmm5
+	punpcklqdq %xmm6,%xmm6  # propagate to all 16
+	punpcklqdq %xmm5,%xmm5
+	# xmm6 now contains first symbol in each byte, xmm5 the second
+
+	movdqa thirtyones,%xmm7
+	
+	# each invocation of this macro does 16 butterflies in parallel
+	.MACRO butterfly GROUP
+	# compute branch metrics
+	movdqa Branchtab29_sse2+(16*\GROUP),%xmm4
+	movdqa Branchtab29_sse2+128+(16*\GROUP),%xmm3
+	pxor %xmm6,%xmm4
+	pxor %xmm5,%xmm3
+	pavgb %xmm3,%xmm4
+	psrlw $3,%xmm4
+
+	pand %xmm7,%xmm4	# xmm4 contains branch metrics
+	
+	movdqa (16*\GROUP)(%esi),%xmm0	# Incoming path metric, high bit = 0
+	movdqa ((16*\GROUP)+128)(%esi),%xmm3	# Incoming path metric, high bit = 1
+	movdqa %xmm0,%xmm2
+	movdqa %xmm3,%xmm1
+	paddusb %xmm4,%xmm0
+	paddusb %xmm4,%xmm3
+	
+	# invert branch metrics
+	pxor %xmm7,%xmm4
+	
+	paddusb %xmm4,%xmm1
+	paddusb %xmm4,%xmm2
+	
+	# Find survivors, leave in xmm0,2
+	pminub %xmm1,%xmm0
+	pminub %xmm3,%xmm2
+	# get decisions, leave in xmm1,3
+	pcmpeqb %xmm0,%xmm1
+	pcmpeqb %xmm2,%xmm3
+	
+	# interleave and store new branch metrics in mm0,2
+	movdqa %xmm0,%xmm4
+	punpckhbw %xmm2,%xmm0	# interleave second 16 new metrics
+	punpcklbw %xmm2,%xmm4	# interleave first 16 new metrics
+	movdqa %xmm0,(32*\GROUP+16)(%edi)
+	movdqa %xmm4,(32*\GROUP)(%edi)
+
+	# interleave decisions & store
+	movdqa %xmm1,%xmm4
+	punpckhbw %xmm3,%xmm1
+	punpcklbw %xmm3,%xmm4
+	# work around bug in gas due to Intel doc error
+	.byte 0x66,0x0f,0xd7,0xd9	# pmovmskb %xmm1,%ebx
+	shll $16,%ebx
+	.byte 0x66,0x0f,0xd7,0xc4	# pmovmskb %xmm4,%eax
+	orl %eax,%ebx
+	movl %ebx,(4*\GROUP)(%edx)
+	.endm
+
+	# invoke macro 8 times for a total of 128 butterflies
+	butterfly GROUP=0
+	butterfly GROUP=1
+	butterfly GROUP=2
+	butterfly GROUP=3
+	butterfly GROUP=4
+	butterfly GROUP=5
+	butterfly GROUP=6
+	butterfly GROUP=7
+
+	addl $32,%edx		# bump decision pointer
+		
+	# see if we have to normalize
+	movl (%edi),%eax	# extract first output metric
+	andl $255,%eax
+	cmp $50,%eax		# is it greater than 50?
+	movl $0,%eax
+	jle done		# No, no need to normalize
+
+	# Normalize by finding smallest metric and subtracting it
+	# from all metrics
+	movdqa (%edi),%xmm0
+	pminub 16(%edi),%xmm0
+	pminub 32(%edi),%xmm0
+	pminub 48(%edi),%xmm0
+	pminub 64(%edi),%xmm0
+	pminub 80(%edi),%xmm0
+	pminub 96(%edi),%xmm0	
+	pminub 112(%edi),%xmm0	
+	pminub 128(%edi),%xmm0
+	pminub 144(%edi),%xmm0
+	pminub 160(%edi),%xmm0
+	pminub 176(%edi),%xmm0
+	pminub 192(%edi),%xmm0
+	pminub 208(%edi),%xmm0
+	pminub 224(%edi),%xmm0
+	pminub 240(%edi),%xmm0							
+
+	# crunch down to single lowest metric
+	movdqa %xmm0,%xmm1
+	psrldq $8,%xmm0     # the count to psrldq is bytes, not bits!
+	pminub %xmm1,%xmm0
+	movdqa %xmm0,%xmm1
+	psrlq $32,%xmm0
+	pminub %xmm1,%xmm0
+	movdqa %xmm0,%xmm1
+	psrlq $16,%xmm0
+	pminub %xmm1,%xmm0
+	movdqa %xmm0,%xmm1
+	psrlq $8,%xmm0
+	pminub %xmm1,%xmm0
+
+	punpcklbw %xmm0,%xmm0	# lowest 2 bytes
+	pshuflw $0,%xmm0,%xmm0  # lowest 8 bytes
+	punpcklqdq %xmm0,%xmm0	# all 16 bytes
+
+	# xmm0 now contains lowest metric in all 16 bytes
+	# subtract it from every output metric
+	movdqa (%edi),%xmm1
+	psubusb %xmm0,%xmm1
+	movdqa %xmm1,(%edi)
+	movdqa 16(%edi),%xmm1
+	psubusb %xmm0,%xmm1
+	movdqa %xmm1,16(%edi)	
+	movdqa 32(%edi),%xmm1
+	psubusb %xmm0,%xmm1	
+	movdqa %xmm1,32(%edi)	
+	movdqa 48(%edi),%xmm1
+	psubusb %xmm0,%xmm1	
+	movdqa %xmm1,48(%edi)	
+	movdqa 64(%edi),%xmm1
+	psubusb %xmm0,%xmm1	
+	movdqa %xmm1,64(%edi)	
+	movdqa 80(%edi),%xmm1
+	psubusb %xmm0,%xmm1	
+	movdqa %xmm1,80(%edi)	
+	movdqa 96(%edi),%xmm1	
+	psubusb %xmm0,%xmm1	
+	movdqa %xmm1,96(%edi)	
+	movdqa 112(%edi),%xmm1	
+	psubusb %xmm0,%xmm1	
+	movdqa %xmm1,112(%edi)	
+	movdqa 128(%edi),%xmm1
+	psubusb %xmm0,%xmm1	
+	movdqa %xmm1,128(%edi)	
+	movdqa 144(%edi),%xmm1
+	psubusb %xmm0,%xmm1	
+	movdqa %xmm1,144(%edi)	
+	movdqa 160(%edi),%xmm1
+	psubusb %xmm0,%xmm1	
+	movdqa %xmm1,160(%edi)	
+	movdqa 176(%edi),%xmm1
+	psubusb %xmm0,%xmm1
+	movdqa %xmm1,176(%edi)	
+	movdqa 192(%edi),%xmm1
+	psubusb %xmm0,%xmm1	
+	movdqa %xmm1,192(%edi)	
+	movdqa 208(%edi),%xmm1
+	psubusb %xmm0,%xmm1
+	movdqa %xmm1,208(%edi)	
+	movdqa 224(%edi),%xmm1
+	psubusb %xmm0,%xmm1	
+	movdqa %xmm1,224(%edi)	
+	movdqa 240(%edi),%xmm1							
+	psubusb %xmm0,%xmm1	
+	movdqa %xmm1,240(%edi)	
+	
+done:		
+	# swap metrics
+	movl %esi,%eax
+	movl %edi,%esi
+	movl %eax,%edi
+	jmp 1b
+	
+2:	movl 8(%ebp),%ebx	# ebx = vp
+	# stash metric pointers
+	movl %esi,OLDMETRICS(%ebx)
+	movl %edi,NEWMETRICS(%ebx)
+	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
+	xorl %eax,%eax
+err:	popl %ebx
+	popl %edx
+	popl %edi
+	popl %esi
+	popl %ebp
+	ret
+	
+	.data
+	.align 16
+thirtyones:	
+	.byte 31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31
+
diff --git a/ssebfly27.s b/ssebfly27.s
new file mode 100644
index 0000000..7f445da
--- /dev/null
+++ b/ssebfly27.s
@@ -0,0 +1,205 @@
+/* Intel SIMD (SSE) implementation of Viterbi ACS butterflies
+   for 64-state (k=7) convolutional code
+   Copyright 2001 Phil Karn, KA9Q
+   This code may be used under the terms of the GNU Lesser General Public License (LGPL)
+
+   int update_viterbi27_blk_sse(struct v27 *vp,unsigned char syms[],int nbits) ; 
+*/
+
+	# SSE (64-bit integer SIMD) version
+	# Requires Pentium III or better
+
+	# These are offsets into struct v27, defined in viterbi27.h
+	.set DP,128
+	.set OLDMETRICS,132
+	.set NEWMETRICS,136
+.text	
+.global update_viterbi27_blk_sse,Branchtab27_sse
+	.type update_viterbi27_blk_sse,@function
+	.align 16
+	
+update_viterbi27_blk_sse:	# int update_viterbi27_blk_sse(struct v27 *vp,unsigned char syms[],int nbits)
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %edi
+	pushl %edx
+	pushl %ebx
+	# args: 8(%ebp) = vp, 12(%ebp) = syms, 16(%ebp) = nbits; returns 0, or -1 if vp is NULL
+	movl 8(%ebp),%edx	# edx = vp
+	testl %edx,%edx
+	jnz  0f
+	movl $-1,%eax		# immediate -1 (without the $ this is a load from address -1)
+	jmp  err
+0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
+	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
+	movl DP(%edx),%edx	# edx -> decisions
+
+1:	movl 16(%ebp),%eax	# eax = nbits
+	decl %eax
+	jl   2f			# passed zero, we're done
+	movl %eax,16(%ebp)
+
+	xorl %eax,%eax
+	movl 12(%ebp),%ebx	# %ebx = syms
+	movb (%ebx),%al
+	movd %eax,%mm6		# mm6[0] = first symbol
+	movb 1(%ebx),%al
+	movd %eax,%mm5		# mm5[0] = second symbol
+	addl $2,%ebx
+	movl %ebx,12(%ebp)	# syms += 2, kept in the argument slot
+
+	punpcklbw %mm6,%mm6	# mm6[1] = mm6[0]
+	punpcklbw %mm5,%mm5
+	movq thirtyones,%mm7	# reloaded every bit because normalization trashes mm7
+
+	pshufw $0,%mm6,%mm6	# copy low word to upper 3
+	pshufw $0,%mm5,%mm5
+	# mm6 now contains first symbol in each byte, mm5 the second
+
+	# each invocation of this macro does 8 butterflies in parallel
+	.MACRO butterfly GROUP
+	# compute branch metrics
+	movq Branchtab27_sse+(8*\GROUP),%mm4
+	movq Branchtab27_sse+32+(8*\GROUP),%mm3
+	pxor %mm6,%mm4
+	pxor %mm5,%mm3
+	pavgb %mm3,%mm4			# mm4 contains branch metrics
+	psrlw $3,%mm4
+	pand %mm7,%mm4			# keep 5 bits per byte (drops bits shifted in across bytes)
+
+	movq (8*\GROUP)(%esi),%mm0	# Incoming path metric, high bit = 0
+	movq ((8*\GROUP)+32)(%esi),%mm3	# Incoming path metric, high bit = 1
+	movq %mm0,%mm2
+	movq %mm3,%mm1
+	paddusb %mm4,%mm0
+	paddusb %mm4,%mm3
+
+	# invert branch metrics. This works only because they're 5 bits
+	pxor %mm7,%mm4
+
+	paddusb %mm4,%mm1
+	paddusb %mm4,%mm2
+
+	# Find survivors, leave in mm0,2
+	pminub %mm1,%mm0
+	pminub %mm3,%mm2
+	# get decisions, leave in mm1,3
+	pcmpeqb %mm0,%mm1
+	pcmpeqb %mm2,%mm3
+
+	# interleave the surviving path metrics from mm0,2 and store them as new metrics
+	movq %mm0,%mm4
+	punpckhbw %mm2,%mm0	# interleave second 8 new metrics
+	punpcklbw %mm2,%mm4	# interleave first 8 new metrics
+	movq %mm0,(16*\GROUP+8)(%edi)
+	movq %mm4,(16*\GROUP)(%edi)
+
+	# interleave decisions, accumulate into %ebx
+	movq %mm1,%mm4
+	punpckhbw %mm3,%mm1
+	punpcklbw %mm3,%mm4
+	# Due to an error in the Intel instruction set ref (the register
+	# fields are swapped), gas assembles pmovmskb incorrectly
+	# See http://mail.gnu.org/pipermail/bug-gnu-utils/2000-August/002341.html
+	.byte 0x0f,0xd7,0xc1	# pmovmskb %mm1,%eax
+	shll $((16*\GROUP+8)&31),%eax
+	orl %eax,%ebx
+	.byte 0x0f,0xd7,0xc4	# pmovmskb %mm4,%eax
+	shll $((16*\GROUP)&31),%eax
+	orl %eax,%ebx
+	.endm
+
+	# invoke macro 4 times for a total of 32 butterflies
+	xorl %ebx,%ebx		# clear decisions
+	butterfly GROUP=0
+	butterfly GROUP=1
+	movl %ebx,(%edx)	# stash first 32 decisions
+	xorl %ebx,%ebx
+	butterfly GROUP=2
+	butterfly GROUP=3
+	movl %ebx,4(%edx)	# stash second 32 decisions
+
+	addl $8,%edx		# bump decision pointer
+
+	# see if we have to normalize
+	movl (%edi),%eax	# extract first output metric
+	andl $255,%eax
+	cmpl $150,%eax		# is it greater than 150?
+	movl $0,%eax		# mov leaves the flags from cmpl untouched
+	jle done		# No, no need to normalize
+
+	# Normalize by finding smallest metric and subtracting it
+	# from all metrics
+	movq (%edi),%mm0
+	pminub 8(%edi),%mm0
+	pminub 16(%edi),%mm0
+	pminub 24(%edi),%mm0
+	pminub 32(%edi),%mm0
+	pminub 40(%edi),%mm0
+	pminub 48(%edi),%mm0
+	pminub 56(%edi),%mm0
+	# mm0 contains 8 smallest metrics
+	# crunch down to single lowest metric
+	movq %mm0,%mm1
+	psrlq $32,%mm0
+	pminub %mm1,%mm0
+	movq %mm0,%mm1
+	psrlq $16,%mm0
+	pminub %mm1,%mm0
+	movq %mm0,%mm1
+	psrlq $8,%mm0
+	pminub %mm1,%mm0
+	punpcklbw %mm0,%mm0	# expand to all 8 bytes
+	pshufw $0,%mm0,%mm0
+
+	# mm0 now contains lowest metric in all 8 bytes
+	# subtract it from every output metric
+	# Trashes %mm7
+	.macro PSUBUSBM REG,MEM
+	movq \MEM,%mm7
+	psubusb \REG,%mm7
+	movq %mm7,\MEM
+	.endm
+
+	PSUBUSBM %mm0,(%edi)
+	PSUBUSBM %mm0,8(%edi)
+	PSUBUSBM %mm0,16(%edi)
+	PSUBUSBM %mm0,24(%edi)
+	PSUBUSBM %mm0,32(%edi)
+	PSUBUSBM %mm0,40(%edi)
+	PSUBUSBM %mm0,48(%edi)
+	PSUBUSBM %mm0,56(%edi)
+
+	movd %mm0,%eax		# lowest metric; not used, %eax is overwritten at done
+	and $0xff,%eax
+
+done:	# swap metrics
+	movl %esi,%eax
+	movl %edi,%esi
+	movl %eax,%edi
+	jmp 1b
+
+2:	emms
+	movl 8(%ebp),%ebx	# ebx = vp
+	# stash metric pointers
+	movl %esi,OLDMETRICS(%ebx)
+	movl %edi,NEWMETRICS(%ebx)
+	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
+	xorl %eax,%eax		# return 0
+err:	popl %ebx
+	popl %edx
+	popl %edi
+	popl %esi
+	popl %ebp
+
+	ret
+
+	.data
+	
+	.align 16
+thirtyones:
+	.byte 31,31,31,31,31,31,31,31
+	
+	
+
diff --git a/ssebfly29.s b/ssebfly29.s
new file mode 100644
index 0000000..d7d2149
--- /dev/null
+++ b/ssebfly29.s
@@ -0,0 +1,271 @@
+/* Intel SIMD SSE implementation of Viterbi ACS butterflies
+   for 256-state (k=9) convolutional code
+   Copyright 2004 Phil Karn, KA9Q
+   This code may be used under the terms of the GNU Lesser General Public License (LGPL)
+
+   void update_viterbi29_blk_sse(struct v29 *vp,unsigned char syms[],int nbits); 
+*/
+	# SSE (64-bit integer SIMD) version
+	# Requires Pentium III or better
+	# These are offsets into struct v29, defined in viterbi29.h
+	.set DP,512
+	.set OLDMETRICS,516
+	.set NEWMETRICS,520
+	.text	
+	.global update_viterbi29_blk_sse,Branchtab29_sse
+	.type update_viterbi29_blk_sse,@function
+	.align 16
+	
+update_viterbi29_blk_sse:	# update_viterbi29_blk_sse(struct v29 *vp,unsigned char syms[],int nbits)
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %edi
+	pushl %edx
+	pushl %ebx
+	# args: 8(%ebp) = vp, 12(%ebp) = syms, 16(%ebp) = nbits; %eax is 0 on exit, or -1 if vp is NULL
+	movl 8(%ebp),%edx	# edx = vp
+	testl %edx,%edx
+	jnz  0f
+	movl $-1,%eax		# immediate -1 (without the $ this is a load from address -1)
+	jmp  err
+0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
+	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
+	movl DP(%edx),%edx	# edx -> decisions
+
+1:	movl 16(%ebp),%eax	# eax = nbits
+	decl %eax
+	jl   2f			# passed zero, we're done
+	movl %eax,16(%ebp)
+
+	xorl %eax,%eax
+	movl 12(%ebp),%ebx	# ebx = syms
+	movb (%ebx),%al
+	movd %eax,%mm6		# mm6[0] = first symbol
+	movb 1(%ebx),%al
+	movd %eax,%mm5		# mm5[0] = second symbol
+	addl $2,%ebx
+	movl %ebx,12(%ebp)	# syms += 2, kept in the argument slot
+
+	punpcklbw %mm6,%mm6	# mm6[1] = mm6[0]
+	punpcklbw %mm5,%mm5
+
+	movq thirtyones,%mm7	# reloaded every bit because normalization trashes mm7
+	pshufw $0,%mm6,%mm6	# copy low word to upper 3
+	pshufw $0,%mm5,%mm5
+	# mm6 now contains first symbol in each byte, mm5 the second
+
+	# each invocation of this macro does 8 butterflies in parallel
+	.MACRO butterfly GROUP
+	# compute branch metrics
+	movq Branchtab29_sse+(8*\GROUP),%mm4
+	movq Branchtab29_sse+128+(8*\GROUP),%mm3
+	pxor %mm6,%mm4
+	pxor %mm5,%mm3
+	pavgb %mm3,%mm4			# mm4 contains branch metrics
+	psrlw $3,%mm4
+	pand %mm7,%mm4			# keep 5 bits per byte (drops bits shifted in across bytes)
+
+	movq (8*\GROUP)(%esi),%mm0	# Incoming path metric, high bit = 0
+	movq ((8*\GROUP)+128)(%esi),%mm3	# Incoming path metric, high bit = 1
+	movq %mm0,%mm2
+	movq %mm3,%mm1
+	paddusb %mm4,%mm0
+	paddusb %mm4,%mm3
+
+	# invert branch metrics. This works only because they're 5 bits
+	pxor %mm7,%mm4
+
+	paddusb %mm4,%mm1
+	paddusb %mm4,%mm2
+
+	# Find survivors, leave in mm0,2
+	pminub %mm1,%mm0
+	pminub %mm3,%mm2
+	# get decisions, leave in mm1,3
+	pcmpeqb %mm0,%mm1
+	pcmpeqb %mm2,%mm3
+
+	# interleave the surviving path metrics from mm0,2 and store them as new metrics
+	movq %mm0,%mm4
+	punpckhbw %mm2,%mm0	# interleave second 8 new metrics
+	punpcklbw %mm2,%mm4	# interleave first 8 new metrics
+	movq %mm0,(16*\GROUP+8)(%edi)
+	movq %mm4,(16*\GROUP)(%edi)
+
+	# interleave decisions, accumulate into %ebx
+	movq %mm1,%mm4
+	punpckhbw %mm3,%mm1
+	punpcklbw %mm3,%mm4
+	# Due to an error in the Intel instruction set ref (the register
+	# fields are swapped), gas assembles pmovmskb incorrectly
+	# See http://mail.gnu.org/pipermail/bug-gnu-utils/2000-August/002341.html
+	.byte 0x0f,0xd7,0xc1	# pmovmskb %mm1,%eax
+	shll $((16*\GROUP+8)&31),%eax
+	orl %eax,%ebx
+	.byte 0x0f,0xd7,0xc4	# pmovmskb %mm4,%eax
+	shll $((16*\GROUP)&31),%eax
+	orl %eax,%ebx
+	.endm
+
+	# invoke macro 16 times for a total of 128 butterflies
+	xorl %ebx,%ebx		# clear decisions
+	butterfly GROUP=0
+	butterfly GROUP=1
+	movl %ebx,(%edx)	# stash decisions 0-31
+	xorl %ebx,%ebx
+	butterfly GROUP=2
+	butterfly GROUP=3
+	movl %ebx,4(%edx)	# stash decisions 32-63
+	xorl %ebx,%ebx		# clear decisions
+	butterfly GROUP=4
+	butterfly GROUP=5
+	movl %ebx,8(%edx)	# stash decisions 64-95
+	xorl %ebx,%ebx
+	butterfly GROUP=6
+	butterfly GROUP=7
+	movl %ebx,12(%edx)	# stash decisions 96-127
+	xorl %ebx,%ebx		# clear decisions
+	butterfly GROUP=8
+	butterfly GROUP=9
+	movl %ebx,16(%edx)	# stash decisions 128-159
+	xorl %ebx,%ebx
+	butterfly GROUP=10
+	butterfly GROUP=11
+	movl %ebx,20(%edx)	# stash decisions 160-191
+	xorl %ebx,%ebx		# clear decisions
+	butterfly GROUP=12
+	butterfly GROUP=13
+	movl %ebx,24(%edx)	# stash decisions 192-223
+	xorl %ebx,%ebx
+	butterfly GROUP=14
+	butterfly GROUP=15
+	movl %ebx,28(%edx)	# stash decisions 224-255
+
+	addl $32,%edx		# bump decision pointer
+
+	# see if we have to normalize
+	movl (%edi),%eax	# extract first output metric
+	andl $255,%eax
+	cmp $50,%eax		# is it greater than 50?
+	movl $0,%eax		# mov leaves the flags from cmp untouched
+	jle done		# No, no need to normalize
+
+	# Normalize by finding smallest metric and subtracting it
+	# from all metrics
+	movq (%edi),%mm0
+	pminub 8(%edi),%mm0
+	pminub 16(%edi),%mm0
+	pminub 24(%edi),%mm0
+	pminub 32(%edi),%mm0
+	pminub 40(%edi),%mm0
+	pminub 48(%edi),%mm0
+	pminub 56(%edi),%mm0
+	pminub 64(%edi),%mm0
+	pminub 72(%edi),%mm0
+	pminub 80(%edi),%mm0
+	pminub 88(%edi),%mm0
+	pminub 96(%edi),%mm0
+	pminub 104(%edi),%mm0
+	pminub 112(%edi),%mm0
+	pminub 120(%edi),%mm0
+	pminub 128(%edi),%mm0
+	pminub 136(%edi),%mm0
+	pminub 144(%edi),%mm0
+	pminub 152(%edi),%mm0
+	pminub 160(%edi),%mm0
+	pminub 168(%edi),%mm0
+	pminub 176(%edi),%mm0
+	pminub 184(%edi),%mm0
+	pminub 192(%edi),%mm0
+	pminub 200(%edi),%mm0
+	pminub 208(%edi),%mm0
+	pminub 216(%edi),%mm0
+	pminub 224(%edi),%mm0
+	pminub 232(%edi),%mm0
+	pminub 240(%edi),%mm0
+	pminub 248(%edi),%mm0
+	# mm0 contains 8 smallest metrics
+	# crunch down to single lowest metric
+	movq %mm0,%mm1
+	psrlq $32,%mm0
+	pminub %mm1,%mm0
+	movq %mm0,%mm1
+	psrlq $16,%mm0
+	pminub %mm1,%mm0
+	movq %mm0,%mm1
+	psrlq $8,%mm0
+	pminub %mm1,%mm0
+	movq 8(%edi),%mm1	# reload (value is not read again before the next butterfly overwrites mm1)
+	punpcklbw %mm0,%mm0	# expand to all 8 bytes
+	pshufw $0,%mm0,%mm0
+
+	# mm0 now contains lowest metric in all 8 bytes
+	# subtract it from every output metric
+	# Trashes %mm7
+	.macro PSUBUSBM REG,MEM
+	movq \MEM,%mm7
+	psubusb \REG,%mm7
+	movq %mm7,\MEM
+	.endm
+
+	PSUBUSBM %mm0,(%edi)
+	PSUBUSBM %mm0,8(%edi)
+	PSUBUSBM %mm0,16(%edi)
+	PSUBUSBM %mm0,24(%edi)
+	PSUBUSBM %mm0,32(%edi)
+	PSUBUSBM %mm0,40(%edi)
+	PSUBUSBM %mm0,48(%edi)
+	PSUBUSBM %mm0,56(%edi)
+	PSUBUSBM %mm0,64(%edi)
+	PSUBUSBM %mm0,72(%edi)
+	PSUBUSBM %mm0,80(%edi)
+	PSUBUSBM %mm0,88(%edi)
+	PSUBUSBM %mm0,96(%edi)
+	PSUBUSBM %mm0,104(%edi)
+	PSUBUSBM %mm0,112(%edi)
+	PSUBUSBM %mm0,120(%edi)
+	PSUBUSBM %mm0,128(%edi)
+	PSUBUSBM %mm0,136(%edi)
+	PSUBUSBM %mm0,144(%edi)
+	PSUBUSBM %mm0,152(%edi)
+	PSUBUSBM %mm0,160(%edi)
+	PSUBUSBM %mm0,168(%edi)
+	PSUBUSBM %mm0,176(%edi)
+	PSUBUSBM %mm0,184(%edi)
+	PSUBUSBM %mm0,192(%edi)
+	PSUBUSBM %mm0,200(%edi)
+	PSUBUSBM %mm0,208(%edi)
+	PSUBUSBM %mm0,216(%edi)
+	PSUBUSBM %mm0,224(%edi)
+	PSUBUSBM %mm0,232(%edi)
+	PSUBUSBM %mm0,240(%edi)
+	PSUBUSBM %mm0,248(%edi)
+
+done:
+	# swap metrics
+	movl %esi,%eax
+	movl %edi,%esi
+	movl %eax,%edi
+	jmp 1b
+
+2:	emms
+	movl 8(%ebp),%ebx	# ebx = vp
+	# stash metric pointers
+	movl %esi,OLDMETRICS(%ebx)
+	movl %edi,NEWMETRICS(%ebx)
+	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
+	xorl %eax,%eax		# eax = 0 on normal completion
+err:	popl %ebx
+	popl %edx
+	popl %edi
+	popl %esi
+	popl %ebp
+	ret
+
+	.data
+	.align 8
+thirtyones:	
+	.byte 31,31,31,31,31,31,31,31
+	
+
diff --git a/sumsq.c b/sumsq.c
new file mode 100644
index 0000000..9ed6a39
--- /dev/null
+++ b/sumsq.c
@@ -0,0 +1,40 @@
+/* Compute the sum of the squares of a vector of signed shorts
+
+ * Copyright 2004 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+
+#include <stdlib.h>
+#include "fec.h"
+
+unsigned long long sumsq_port(signed short *,int);
+
+#ifdef __i386__
+unsigned long long sumsq_mmx(signed short *,int);
+unsigned long long sumsq_sse(signed short *,int);
+unsigned long long sumsq_sse2(signed short *,int);
+#endif
+
+#ifdef __VEC__
+unsigned long long sumsq_av(signed short *,int);
+#endif
+
+unsigned long long sumsq(signed short *in,int cnt){
+  /* Hand in/cnt to the sum-of-squares routine that matches Cpu_mode;
+   * only routines compiled in for this architecture are candidates */
+#ifdef __i386__
+  /* SSE mode shares the MMX routine */
+  if(Cpu_mode == SSE || Cpu_mode == MMX)
+    return sumsq_mmx(in,cnt);
+  if(Cpu_mode == SSE2)
+    return sumsq_sse2(in,cnt);
+#endif
+
+#ifdef __VEC__
+  if(Cpu_mode == ALTIVEC)
+    return sumsq_av(in,cnt);
+#endif
+
+  /* PORT, or any mode without a routine compiled in */
+  return sumsq_port(in,cnt);
+}
diff --git a/sumsq_av.c b/sumsq_av.c
new file mode 100644
index 0000000..53c6acf
--- /dev/null
+++ b/sumsq_av.c
@@ -0,0 +1,78 @@
+/* Compute the sum of the squares of a vector of signed shorts
+
+ * This is the Altivec SIMD version. It's a little hairy because Altivec
+ * does not do 64-bit operations directly, so we have to accumulate separate
+ * 32-bit sums and carries
+
+ * Copyright 2004 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+
+#include "fec.h"
+
+unsigned long long sumsq_av(signed short *in,int cnt){ /* sum of in[i]*in[i] for i in 0..cnt-1 */
+  long long sum;
+  vector signed short x;
+  vector unsigned int sums,carries,s1,s2;
+  int pad;
+  union { vector unsigned char cv; vector unsigned int iv; unsigned int w[4]; unsigned char c[16];} s;
+  if(cnt <= 0) return 0; /* empty block; with cnt==0 the (8-cnt)<<4 shift below would be 128, which vec_sro treats as no shift */
+  carries = sums = (vector unsigned int)(0);
+  if((pad = (int)in & 15)!=0){
+    /* Load unaligned leading word */
+    x = vec_perm(vec_ld(0,in),(vector signed short)(0),vec_lvsl(0,in));
+    if(cnt < 8){ /* Shift right to chop stuff beyond end of short block */
+      s.c[15] = (8-cnt)<<4; /* (8-cnt) halfwords expressed in bits; only this byte of s is used as the shift count */
+      x = vec_sro(x,s.cv);
+    }
+    sums = (vector unsigned int)vec_msum(x,x,(vector signed int)(0));
+    in += 8-pad/2; /* pad is in bytes, so 8-pad/2 samples were consumed up to the 16-byte boundary */
+    cnt -= 8-pad/2; /* may go negative for a short block; both loops below are then skipped */
+  }
+  /* Everything is now aligned, rip through most of the block */
+  while(cnt >= 8){
+    x = vec_ld(0,in);
+    /* A single vec_msum cannot overflow, but we have to sum it with
+     * the earlier terms separately to handle the carries
+     * The cast to unsigned is OK because squares are always positive
+     */
+    s1 = (vector unsigned int)vec_msum(x,x,(vector signed int)(0));
+    carries = vec_add(carries,vec_addc(sums,s1));
+    sums = vec_add(sums,s1);
+    in += 8;
+    cnt -= 8;
+  }
+  /* Handle trailing fragment, if any */
+  if(cnt > 0){
+    x = vec_ld(0,in);
+    s.c[15] = (8-cnt)<<4; /* drop the 8-cnt samples past the end of the block */
+    x = vec_sro(x,s.cv);
+    s1 = (vector unsigned int)vec_msum(x,x,(vector signed int)(0));
+    carries = vec_add(carries,vec_addc(sums,s1));
+    sums = vec_add(sums,s1);
+  }
+  /* Combine 4 sub-sums and carries */
+  s.c[15] = 64; /* Shift right two 32-bit words */
+  s1 = vec_sro(sums,s.cv);
+  s2 = vec_sro(carries,s.cv);
+  carries = vec_add(carries,vec_addc(sums,s1));
+  sums = vec_add(sums,s1);
+  carries = vec_add(carries,s2);
+
+  s.c[15] = 32; /* Shift right one 32-bit word */
+  s1 = vec_sro(sums,s.cv);
+  s2 = vec_sro(carries,s.cv);
+  carries = vec_add(carries,vec_addc(sums,s1));
+  sums = vec_add(sums,s1);
+  carries = vec_add(carries,s2);
+
+  /* Extract sum and carries from right-hand words and combine into result */
+  s.iv = sums;
+  sum = s.w[3];
+
+  s.iv = carries;
+  sum += (long long)s.w[3] << 32;
+
+  return sum;
+}
+
diff --git a/sumsq_mmx.c b/sumsq_mmx.c
new file mode 100644
index 0000000..e766831
--- /dev/null
+++ b/sumsq_mmx.c
@@ -0,0 +1,35 @@
+/* Compute the sum of the squares of a vector of signed shorts
+
+ *  MMX-assisted version (also used on SSE)
+
+ * The SSE2 and MMX assist routines both operate on multiples of
+ * 8 words; they differ only in their alignment requirements (8 bytes
+ * for MMX, 16 bytes for SSE2)
+
+ * Copyright 2004 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser Public License (LGPL)
+ */
+
+long long sumsq_mmx_assist(signed short *,int);
+
+long long sumsq_mmx(signed short *in,int cnt){ /* sum of in[i]*in[i] for i in 0..cnt-1 */
+  long long sum = 0;
+  if(cnt <= 0) return 0; /* nothing to sum; a negative cnt would otherwise step in backwards below */
+  /* Handle stuff before the next 8-byte boundary */
+  while(((int)in & 7) != 0 && cnt != 0){
+    sum += (long)in[0] * in[0];
+    in++;
+    cnt--;
+  }
+  sum += sumsq_mmx_assist(in,cnt); /* assist is handed the aligned remainder; the code below assumes it covers cnt & ~7 words */
+  in += cnt & ~7;
+  cnt &= 7;
+
+  /* Handle up to 7 words at end */
+  while(cnt != 0){
+    sum += (long)in[0] * in[0];
+    in++;
+    cnt--;
+  }
+  return sum;
+}
diff --git a/sumsq_mmx_assist.s b/sumsq_mmx_assist.s
new file mode 100644
index 0000000..b3bac66
--- /dev/null
+++ b/sumsq_mmx_assist.s
@@ -0,0 +1,83 @@
+# MMX assist routines for sumsq
+# Copyright 2001 Phil Karn, KA9Q
+# May be used under the terms of the GNU Public License (GPL)
+
+	.text
+
+# Evaluate sum of squares of signed 16-bit input samples
+#  long long sumsq_mmx_assist(signed short *in,int cnt);
+	.macro ACCUM REG
+	movd \REG,%ebx		# low dword of REG ...
+	addl %ebx,%eax		# ... added as unsigned to the 64-bit sum in %edx:%eax
+	adcl $0,%edx
+	psrlq $32,\REG		# bring the high dword down
+	movd \REG,%ebx
+	addl %ebx,%eax
+	adcl $0,%edx
+	.endm
+	.global sumsq_mmx_assist
+	.type sumsq_mmx_assist,@function
+	.align 16
+sumsq_mmx_assist:		# processes cnt & ~7 samples; result in %edx:%eax
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %ecx
+	pushl %ebx
+	movl 8(%ebp),%esi	# esi = in
+	movl 12(%ebp),%ecx	# ecx = cnt
+	xor %eax,%eax		# edx:eax = 0
+	xor %edx,%edx
+	# A pmaddwd dword can reach 2**31 (two -32768 samples): it fits unsigned, but paddd of two such dwords wraps, so add each one separately
+1:	subl $8,%ecx
+	jl 2f			# fewer than 8 samples left
+	movq (%esi),%mm0	# S0 S1 S2 S3
+	pmaddwd %mm0,%mm0	# (S0^2+S1^2) (S2^2+S3^2)
+	movq 8(%esi),%mm6	# S4 S5 S6 S7
+	pmaddwd %mm6,%mm6	# (S4^2+S5^2) (S6^2+S7^2)
+	ACCUM %mm0
+	ACCUM %mm6
+	addl $16,%esi
+	jmp 1b
+2:	emms
+	popl %ebx
+	popl %ecx
+	popl %esi
+	popl %ebp
+	ret
+	
+# Evaluate sum of squares of signed 16-bit input samples
+#  long sumsq_wd_mmx_assist(signed short *in,int cnt);
+#  Quick version, only safe for small numbers of small input values...
+	.global sumsq_wd_mmx_assist
+	.type sumsq_wd_mmx_assist,@function
+	.align 16
+sumsq_wd_mmx_assist:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	# the sum is kept in two 32-bit lanes with no carry, hence the "small values" caveat
+	movl 8(%ebp),%esi	# esi = in
+	movl 12(%ebp),%ecx	# ecx = cnt
+	pxor %mm2,%mm2		# zero sum
+
+1:	subl $8,%ecx		# 8 samples per pass
+	jl 2f			# fewer than 8 left: stop (leftover samples are not summed here)
+	movq (%esi),%mm0	# S0 S1 S2 S3
+	pmaddwd %mm0,%mm0	# (S0*S0+S1*S1) (S2*S2+S3*S3)
+	movq 8(%esi),%mm1	# S4 S5 S6 S7
+	pmaddwd %mm1,%mm1	# (S4*S4+S5*S5) (S6*S6+S7*S7)
+	paddd %mm1,%mm2
+	paddd %mm0,%mm2		# accumulate
+
+	addl $16,%esi
+	jmp 1b	
+
+2:	movd %mm2,%eax		# even sum	
+	psrlq $32,%mm2
+	movd %mm2,%edx		# odd sum
+	addl %edx,%eax		# 32-bit total returned in eax
+	emms
+	popl %esi
+	popl %ebp
+	ret
diff --git a/sumsq_port.c b/sumsq_port.c
new file mode 100644
index 0000000..6d0b4c1
--- /dev/null
+++ b/sumsq_port.c
@@ -0,0 +1,16 @@
+/* Compute the sum of the squares of a vector of signed shorts
+
+ *  Portable C version
+ * Copyright 2004 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+
+unsigned long long sumsq_port(signed short *in,int cnt){
+  long long total = 0; /* 64-bit running sum of the squares */
+  int remaining;       /* samples still to be added */
+  for(remaining = cnt;remaining > 0;remaining--){
+    int sample = *in++; /* widen to int before squaring */
+    total += sample * sample;
+  }
+  return total;
+}
diff --git a/sumsq_sse2.c b/sumsq_sse2.c
new file mode 100644
index 0000000..b05d2e9
--- /dev/null
+++ b/sumsq_sse2.c
@@ -0,0 +1,33 @@
+/* Compute the sum of the squares of a vector of signed shorts
+
+ * The SSE2 and MMX assist routines both operate on multiples of
+ * 8 words; they differ only in their alignment requirements (8 bytes
+ * for MMX, 16 bytes for SSE2)
+
+ * Copyright 2004 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser Public License (LGPL)
+ */
+
+long long sumsq_sse2_assist(signed short *,int);
+
+long long sumsq_sse2(signed short *in,int cnt){ /* sum of in[i]*in[i] for i in 0..cnt-1 */
+  long long sum = 0;
+  if(cnt <= 0) return 0; /* nothing to sum; a negative cnt would otherwise step in backwards below */
+  /* Handle stuff before the next 16-byte boundary */
+  while(((int)in & 15) != 0 && cnt != 0){
+    sum += (long)in[0] * in[0];
+    in++;
+    cnt--;
+  }
+  sum += sumsq_sse2_assist(in,cnt); /* assist is handed the aligned remainder; the code below assumes it covers cnt & ~7 words */
+  in += cnt & ~7;
+  cnt &= 7;
+
+  /* Handle up to 7 trailing words */
+  while(cnt != 0){
+    sum += (long)in[0] * in[0];
+    in++;
+    cnt--;
+  }
+  return sum;
+}
diff --git a/sumsq_sse2_assist.s b/sumsq_sse2_assist.s
new file mode 100644
index 0000000..d1c4ee7
--- /dev/null
+++ b/sumsq_sse2_assist.s
@@ -0,0 +1,49 @@
+# SSE2 assist routines for sumsq
+# Copyright 2001 Phil Karn, KA9Q
+# May be used under the terms of the GNU Public License (GPL)
+
+	.text
+# Evaluate sum of squares of signed 16-bit input samples
+#  long long sumsq_sse2_assist(signed short *in,int cnt);	
+	.global sumsq_sse2_assist
+	.type sumsq_sse2_assist,@function
+	.align 16
+sumsq_sse2_assist:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %ecx
+	# two 64-bit partial sums are kept in xmm2; 8 samples are consumed per pass
+	movl 8(%ebp),%esi	# esi = in (read with movaps, so it must be 16-byte aligned)
+	movl 12(%ebp),%ecx	# ecx = cnt
+	pxor %xmm2,%xmm2		# zero sum
+	movaps low,%xmm3		# load mask
+
+1:	subl $8,%ecx
+	jl 2f			# fewer than 8 samples left
+	movaps (%esi),%xmm0	# S0 S1 S2 S3 S4 S5 S6 S7
+	pmaddwd %xmm0,%xmm0	# (S0*S0+S1*S1) (S2*S2+S3*S3) (S4*S4+S5*S5) (S6*S6+S7*S7)
+	movaps %xmm0,%xmm1
+	pand %xmm3,%xmm1	# (S0*S0+S1*S1) 0 (S4*S4+S5*S5) 0
+	paddq %xmm1,%xmm2	# sum even-numbered dwords
+	psrlq $32,%xmm0		# (S2*S2+S3*S3) 0 (S6*S6+S7*S7) 0
+	paddq %xmm0,%xmm2	# sum odd-numbered dwords
+	addl $16,%esi
+	jmp 1b	
+
+2:	movaps %xmm2,%xmm0
+	psrldq $8,%xmm0		# upper 64-bit sum moved down to the low quadword
+	paddq %xmm2,%xmm0	# combine 64-bit sums
+
+	movd %xmm0,%eax		# low 32 bits of sum
+	psrldq $4,%xmm0
+	movd %xmm0,%edx		# high 32 bits of sum
+	
+	popl %ecx
+	popl %esi
+	popl %ebp
+	ret
+
+	.data
+	.align 16
+low:	.byte 255,255,255,255,0,0,0,0,255,255,255,255,0,0,0,0
diff --git a/sumsq_test.c b/sumsq_test.c
new file mode 100644
index 0000000..4debd47
--- /dev/null
+++ b/sumsq_test.c
@@ -0,0 +1,101 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <time.h>
+#include "config.h"
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#endif
+#include "fec.h"
+
+#if HAVE_GETOPT_LONG
+struct option Options[] = { /* long spellings of the short options parsed in main() */
+  {"frame-length",1,NULL,'l'}, /* takes an argument */
+  {"frame-count",1,NULL,'n'}, /* takes an argument */
+  {"verbose",0,NULL,'v'},
+  {"force-altivec",0,NULL,'a'},
+  {"force-port",0,NULL,'p'},
+  {"force-mmx",0,NULL,'m'},
+  {"force-sse",0,NULL,'s'},
+  {"force-sse2",0,NULL,'t'},
+  {NULL}, /* terminator; -T has no long spelling in this table */
+};
+#endif
+
+int Verbose = 0;
+
+int main(int argc,char *argv[]){ /* compare sumsq() against sumsq_port(), or just time sumsq() with -T */
+  signed short *buf;
+  int i,d,trial,trials=10000;
+  int bufsize = 2048;
+  long long port_sum,simd_sum;
+  time_t t;
+  int timetrials=0;
+
+  find_cpu_mode();
+  time(&t);
+  srandom(t);
+
+#if HAVE_GETOPT_LONG
+  while((d = getopt_long(argc,argv,"vapmstl:n:T",Options,NULL)) != EOF){
+#else
+  while((d = getopt(argc,argv,"vapmstl:n:T")) != EOF){
+#endif
+    switch(d){
+    case 'a':
+      Cpu_mode = ALTIVEC;
+      break;
+    case 'p':
+      Cpu_mode = PORT;
+      break;
+    case 'm':
+      Cpu_mode = MMX;
+      break;
+    case 's':
+      Cpu_mode = SSE;
+      break;
+    case 't':
+      Cpu_mode = SSE2;
+      break;
+    case 'l':
+      bufsize = atoi(optarg);
+      break;
+    case 'n':
+      trials = atoi(optarg);
+      break;
+    case 'v':
+      Verbose++;
+      break;
+    case 'T':
+      timetrials++;
+      break;
+    }
+  }
+  buf = (bufsize > 0) ? calloc(bufsize,sizeof(signed short)) : NULL; /* bufsize is also a modulus below, so it must be positive */
+  if(buf == NULL){
+    fprintf(stderr,"%s: bad frame length %d or out of memory\n",argv[0],bufsize);
+    exit(1);
+  }
+  if(timetrials){
+    for(trial=0;trial<trials;trial++){
+      (void)sumsq(buf,bufsize);
+    }
+  } else {
+    for(trial=0;trial<trials;trial++){
+      int length,offset;
+      offset = random() & 7; /* random start offset of 0..7 samples */
+      length = (random() % bufsize) - offset;
+      if(length <= 0)
+	continue;
+      for(i=0;i<bufsize;i++)
+	buf[i] = random();
+      port_sum = sumsq_port(buf+offset,length);
+      simd_sum = sumsq(buf+offset,length);
+      if(port_sum != simd_sum){
+	printf("offset %d len %d port_sum = %lld simd_sum = %lld ",offset,length,port_sum,simd_sum);
+	printf("ERROR! diff = %lld\n",simd_sum-port_sum);
+      }
+    }
+  }
+  exit(0);
+}
diff --git a/viterbi27.c b/viterbi27.c
new file mode 100644
index 0000000..554da92
--- /dev/null
+++ b/viterbi27.c
@@ -0,0 +1,161 @@
+/* K=7 r=1/2 Viterbi decoder with optional Intel or PowerPC SIMD
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include "fec.h"
+
+/* Create a new instance of a K=7 r=1/2 Viterbi decoder.
+ * Runs find_cpu_mode(), then hands len to the create routine that
+ * corresponds to Cpu_mode; the portable routine is the fallback.
+ */
+void *create_viterbi27(int len){
+  find_cpu_mode();
+
+#ifdef __VEC__
+  if(Cpu_mode == ALTIVEC)
+    return create_viterbi27_av(len);
+#endif
+#ifdef __i386__
+  if(Cpu_mode == MMX)
+    return create_viterbi27_mmx(len);
+  if(Cpu_mode == SSE)
+    return create_viterbi27_sse(len);
+  if(Cpu_mode == SSE2)
+    return create_viterbi27_sse2(len);
+#endif
+  /* PORT, or a mode with no routine compiled in */
+  return create_viterbi27_port(len);
+}
+
+/* Pass the two generator polynomials in polys[] to the implementation
+ * that corresponds to Cpu_mode (the portable one by default).
+ */
+void set_viterbi27_polynomial(int polys[2]){
+#ifdef __VEC__
+  if(Cpu_mode == ALTIVEC){
+    set_viterbi27_polynomial_av(polys);
+    return;
+  }
+#endif
+#ifdef __i386__
+  if(Cpu_mode == MMX){
+    set_viterbi27_polynomial_mmx(polys);
+    return;
+  } else if(Cpu_mode == SSE){
+    set_viterbi27_polynomial_sse(polys);
+    return;
+  } else if(Cpu_mode == SSE2){
+    set_viterbi27_polynomial_sse2(polys);
+    return;
+  }
+#endif
+  set_viterbi27_polynomial_port(polys);
+}
+
+/* Initialize Viterbi decoder p for the start of a new frame.
+ * p and starting_state are passed unchanged to the init routine that
+ * corresponds to Cpu_mode, and that routine's result is returned. */
+int init_viterbi27(void *p,int starting_state){
+#ifdef __VEC__
+  if(Cpu_mode == ALTIVEC)
+    return init_viterbi27_av(p,starting_state);
+#endif
+#ifdef __i386__
+  if(Cpu_mode == MMX)
+    return init_viterbi27_mmx(p,starting_state);
+  if(Cpu_mode == SSE)
+    return init_viterbi27_sse(p,starting_state);
+  if(Cpu_mode == SSE2)
+    return init_viterbi27_sse2(p,starting_state);
+#endif
+
+  /* PORT, or a mode with no routine compiled in */
+  return init_viterbi27_port(p,starting_state);
+}
+
+/* Viterbi chainback: forwards all four arguments to the chainback
+ * routine that corresponds to Cpu_mode and returns its result
+ */
+int chainback_viterbi27(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+
+#ifdef __VEC__
+  if(Cpu_mode == ALTIVEC)
+    return chainback_viterbi27_av(p,data,nbits,endstate);
+#endif
+#ifdef __i386__
+  if(Cpu_mode == MMX)
+    return chainback_viterbi27_mmx(p,data,nbits,endstate);
+  if(Cpu_mode == SSE)
+    return chainback_viterbi27_sse(p,data,nbits,endstate);
+  if(Cpu_mode == SSE2)
+    return chainback_viterbi27_sse2(p,data,nbits,endstate);
+#endif
+
+  /* PORT, or a mode with no routine compiled in */
+  return chainback_viterbi27_port(p,data,nbits,endstate);
+}
+
+/* Delete instance of a Viterbi decoder.
+ * p is passed to the delete routine that corresponds to Cpu_mode,
+ * falling back to the portable routine for any other mode.
+ */
+void delete_viterbi27(void *p){
+#ifdef __VEC__
+  if(Cpu_mode == ALTIVEC){
+    delete_viterbi27_av(p);
+    return;
+  }
+#endif
+#ifdef __i386__
+  if(Cpu_mode == MMX){
+    delete_viterbi27_mmx(p);
+    return;
+  } else if(Cpu_mode == SSE){
+    delete_viterbi27_sse(p);
+    return;
+  } else if(Cpu_mode == SSE2){
+    delete_viterbi27_sse2(p);
+    return;
+  }
+#endif
+  delete_viterbi27_port(p);
+}
+
+/* Update decoder with a block of demodulated symbols
+ * Note that nbits is the number of decoded data bits, not the number
+ * of symbols!
+ * Returns -1 when p is NULL and 0 otherwise; the value returned by
+ * the selected implementation is not passed on.
+ */
+int update_viterbi27_blk(void *p,unsigned char syms[],int nbits){
+  if(p == NULL)
+    return -1;
+
+#ifdef __VEC__
+  if(Cpu_mode == ALTIVEC){
+    update_viterbi27_blk_av(p,syms,nbits);
+    return 0;
+  }
+#endif
+#ifdef __i386__
+  if(Cpu_mode == MMX){
+    update_viterbi27_blk_mmx(p,syms,nbits);
+    return 0;
+  } else if(Cpu_mode == SSE){
+    update_viterbi27_blk_sse(p,syms,nbits);
+    return 0;
+  } else if(Cpu_mode == SSE2){
+    update_viterbi27_blk_sse2(p,syms,nbits);
+    return 0;
+  }
+#endif
+  /* PORT, or a mode with no routine compiled in */
+  update_viterbi27_blk_port(p,syms,nbits);
+  return 0;
+}
diff --git a/viterbi27_av.c b/viterbi27_av.c
new file mode 100644
index 0000000..98d7344
--- /dev/null
+++ b/viterbi27_av.c
@@ -0,0 +1,210 @@
+/* K=7 r=1/2 Viterbi decoder for PowerPC G4/G5 Altivec instructions
+ * Feb 2004, Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include <memory.h>
+#include <stdlib.h>
+#include "fec.h"
+
+typedef union { long long p; unsigned char c[64]; vector bool char v[4]; } decision_t;
+typedef union { long long p; unsigned char c[64]; vector unsigned char v[4]; } metric_t;
+
+static union branchtab27 { unsigned char c[32]; vector unsigned char v[2];} Branchtab27[2];
+static int Init = 0;
+
+/* State info for instance of Viterbi decoder
+ * Don't change this without also changing references in [mmx|sse|sse2]bfly27.s!
+ * NOTE(review): this said bfly29.s, which looks like a copy/paste slip in a k=7 file - confirm */
+struct v27 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  decision_t *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions;   /* Beginning of decisions for block */
+};
+
+/* Reset decoder p for a new frame: every path metric in buffer 1 becomes 63,
+ * except the one for starting_state (mod 64), which becomes 0; -1 if p is NULL */
+int init_viterbi27_av(void *p,int starting_state){
+  struct v27 *dec = p;
+  int quad = 4; /* metrics1 is written as 4 vectors */
+  if(dec == NULL)
+    return -1;
+  dec->dp = dec->decisions; /* rewind to the first decision of the block */
+  dec->new_metrics = &dec->metrics2;
+  dec->old_metrics = &dec->metrics1;
+  while(quad-- > 0)
+    dec->metrics1.v[quad] = (vector unsigned char)(63);
+  dec->metrics1.c[starting_state & 63] = 0; /* Bias known start state */
+  return 0;
+}
+
+void set_viterbi27_polynomial_av(int polys[2]){
+  int state,k;
+  /* Branchtab27[k].c[state] is 255 or 0 from parity of (2*state)&|polys[k]|, flipped when polys[k] is negative */
+  for(state=0;state < 32;state++)
+    for(k=0;k < 2;k++){
+      Branchtab27[k].c[state] = ((polys[k] < 0) ^ parity((2*state) & abs(polys[k]))) ? 255 : 0;
+    }
+  Init++;
+}
+
+/* Create a new instance of a Viterbi decoder with len+6 decision slots; NULL if an allocation fails */
+void *create_viterbi27_av(int len){
+  struct v27 *dec;
+  if(Init == 0){ /* no polynomials set yet: use V27POLYA/V27POLYB */
+    int defaults[2] = { V27POLYA,V27POLYB };
+    set_viterbi27_polynomial_av(defaults);
+  }
+  dec = (struct v27 *)malloc(sizeof(struct v27));
+  if(dec == NULL) return NULL;
+  dec->decisions = (decision_t *)malloc((len+6)*sizeof(decision_t));
+  if(dec->decisions == NULL){
+    free(dec);
+    return NULL;
+  }
+  init_viterbi27_av(dec,0);
+  return dec;
+}
+
+/* Viterbi chainback: returns 0, or -1 if p is NULL */
+int chainback_viterbi27_av(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v27 *vp = p;
+  decision_t *d; /* set only after the NULL check below */
+
+  if(p == NULL)
+    return -1;
+
+  /* Make room beyond the end of the encoder register so we can
+   * accumulate a full byte of decoded data
+   */
+  endstate %= 64;
+  endstate <<= 2;
+
+  /* The store into data[] only needs to be done every 8 bits.
+   * But this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  d = vp->decisions + 6; /* Look past tail */
+  while(nbits-- != 0){
+    int k;
+    
+    k = d[nbits].c[endstate>>2] & 1; /* decision byte for the current state, reduced to one bit */
+    data[nbits>>3] = endstate = (endstate >> 1) | (k << 7);
+  }
+  return 0;
+}
+
+/* Free decoder p together with its decision buffer; a NULL p is ignored */
+void delete_viterbi27_av(void *p){
+  struct v27 *dec = p;
+
+  if(dec == NULL)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
+
+/* Process received symbols: two bytes of syms per decoded bit, nbits times; returns 0, or -1 if p is NULL */
+int update_viterbi27_blk_av(void *p,unsigned char *syms,int nbits){
+  struct v27 *vp = p;
+  decision_t *d;
+
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->dp;
+  while(nbits--){
+    vector unsigned char survivor0,survivor1,sym0v,sym1v;
+    vector bool char decision0,decision1;
+    vector unsigned char metric,m_metric,m0,m1,m2,m3;
+    void *tmp;
+
+    /* sym0v.0 = syms[0]; sym0v.1 = syms[1] */
+    sym0v = vec_perm(vec_ld(0,syms),vec_ld(1,syms),vec_lvsl(0,syms));
+
+    sym1v = vec_splat(sym0v,1); /* Splat syms[1] across sym1v */
+    sym0v = vec_splat(sym0v,0); /* Splat syms[0] across sym0v */
+    syms += 2;
+
+    /* Do the 32 butterflies as two interleaved groups of 16 each to keep the pipes full */
+
+    /* Form first set of 16 branch metrics */
+    metric = vec_avg(vec_xor(Branchtab27[0].v[0],sym0v),vec_xor(Branchtab27[1].v[0],sym1v));
+    metric = vec_sr(metric,(vector unsigned char)(3)); /* reduce each metric to 5 bits */
+    m_metric = vec_sub((vector unsigned char)(31),metric); /* complementary metric, 31 - metric */
+    
+    /* Form first set of path metrics */
+    m0 = vec_adds(vp->old_metrics->v[0],metric);
+    m3 = vec_adds(vp->old_metrics->v[2],metric);
+    m1 = vec_adds(vp->old_metrics->v[2],m_metric);
+    m2 = vec_adds(vp->old_metrics->v[0],m_metric);
+    
+    /* Form second set of 16 branch metrics */
+    metric = vec_avg(vec_xor(Branchtab27[0].v[1],sym0v),vec_xor(Branchtab27[1].v[1],sym1v));
+    metric = vec_sr(metric,(vector unsigned char)(3));
+    m_metric = vec_sub((vector unsigned char)(31),metric);
+
+    /* Compare and select first set */
+    decision0 = vec_cmpgt(m0,m1); /* set where m1 is the smaller (surviving) metric */
+    decision1 = vec_cmpgt(m2,m3); /* set where m3 is the smaller (surviving) metric */
+    survivor0 = vec_min(m0,m1);
+    survivor1 = vec_min(m2,m3);
+    
+    /* Compute second set of path metrics */
+    m0 = vec_adds(vp->old_metrics->v[1],metric);
+    m3 = vec_adds(vp->old_metrics->v[3],metric);
+    m1 = vec_adds(vp->old_metrics->v[3],m_metric);
+    m2 = vec_adds(vp->old_metrics->v[1],m_metric);
+
+    /* Interleave and store first decisions and survivors */
+    d->v[0] = vec_mergeh(decision0,decision1);
+    d->v[1] = vec_mergel(decision0,decision1);
+    vp->new_metrics->v[0] = vec_mergeh(survivor0,survivor1);
+    vp->new_metrics->v[1] = vec_mergel(survivor0,survivor1);
+    
+    /* Compare and select second set */
+    decision0 = vec_cmpgt(m0,m1);
+    decision1 = vec_cmpgt(m2,m3);
+    survivor0 = vec_min(m0,m1);
+    survivor1 = vec_min(m2,m3);
+
+    /* Interleave and store second set of decisions and survivors */
+    d->v[2] = vec_mergeh(decision0,decision1);
+    d->v[3] = vec_mergel(decision0,decision1);
+    vp->new_metrics->v[2] = vec_mergeh(survivor0,survivor1);
+    vp->new_metrics->v[3] = vec_mergel(survivor0,survivor1);
+   
+    /* renormalize if necessary */
+    if(vp->new_metrics->c[0] >= 105){ /* only the first new metric is tested */
+      vector unsigned char scale0,scale1;
+
+      /* Find smallest metric and splat */
+      scale0 = vec_min(vp->new_metrics->v[0],vp->new_metrics->v[1]);
+      scale1 = vec_min(vp->new_metrics->v[2],vp->new_metrics->v[3]);
+      scale0 = vec_min(scale0,scale1);
+      scale0 = vec_min(scale0,vec_sld(scale0,scale0,8)); /* fold with rotations by 8, 4, 2 and 1 bytes */
+      scale0 = vec_min(scale0,vec_sld(scale0,scale0,4));
+      scale0 = vec_min(scale0,vec_sld(scale0,scale0,2));
+      scale0 = vec_min(scale0,vec_sld(scale0,scale0,1));
+
+      /* Now subtract from all metrics */
+      vp->new_metrics->v[0] = vec_subs(vp->new_metrics->v[0],scale0);
+      vp->new_metrics->v[1] = vec_subs(vp->new_metrics->v[1],scale0);
+      vp->new_metrics->v[2] = vec_subs(vp->new_metrics->v[2],scale0);
+      vp->new_metrics->v[3] = vec_subs(vp->new_metrics->v[3],scale0);
+    }
+    d++; /* one decision_t is written per decoded bit */
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d; /* remember where the next block's decisions go */
+
+  return 0;
+}
+
diff --git a/viterbi27_mmx.c b/viterbi27_mmx.c
new file mode 100644
index 0000000..a6d5125
--- /dev/null
+++ b/viterbi27_mmx.c
@@ -0,0 +1,115 @@
+/* K=7 r=1/2 Viterbi decoder for MMX
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <mmintrin.h>
+#include "fec.h"
+
+typedef union { char c[64]; __m64 v[8];} decision_t; /* One decision byte per state; chainback reads bit 0 of c[state] */
+typedef union { unsigned char c[64]; __m64 v[8];} metric_t; /* One 8-bit path metric per state, also viewable as 8 MMX words */
+/* Branch metric tables indexed [received symbol][state 0..31]; filled by set_viterbi27_polynomial_mmx() */
+unsigned char Mettab27_1[256][32] __attribute__ ((aligned(16)));
+unsigned char Mettab27_2[256][32] __attribute__ ((aligned(16)));
+static int Init = 0; /* Nonzero once the metric tables have been filled */
+
+/* State info for instance of Viterbi decoder
+ * Don't change this without also changing references in mmxbfly27.s!
+ */
+struct v27 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  decision_t *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions;   /* Beginning of decisions for block (allocated in create, freed in delete) */
+};
+
+/* Reset an MMX decoder instance for a new frame; returns 0, or -1 if p is NULL */
+int init_viterbi27_mmx(void *p,int starting_state){
+  struct v27 *vp = (struct v27 *)p;
+  int state = 0;
+
+  if(vp == NULL)
+    return -1;
+  while(state < 64)
+    vp->metrics1.c[state++] = 63; /* same initial metric for every state */
+
+  vp->dp = vp->decisions;
+  vp->new_metrics = &vp->metrics2;
+  vp->old_metrics = &vp->metrics1;
+  vp->old_metrics->c[starting_state & 63] = 0; /* Bias known start state */
+  return 0;
+}
+
+/* Build both branch metric tables from the two generator polynomials and mark them initialized */
+void set_viterbi27_polynomial_mmx(int polys[2]){
+  int state,symbol;
+
+  for(state=0;state < 32;state++){
+    /* Expected encoder output bits for this state; a negative polynomial inverts the bit */
+    int bit0 = parity((2*state) & abs(polys[0])) ^ (polys[0] < 0);
+    int bit1 = parity((2*state) & abs(polys[1])) ^ (polys[1] < 0);
+
+    for(symbol = 0;symbol < 256;symbol++){
+      /* Distance of the received symbol from the expected bit, scaled down to 0..15 */
+      Mettab27_1[symbol][state] = (bit0 ? (255-symbol):symbol) / 16;
+      Mettab27_2[symbol][state] = (bit1 ? (255-symbol):symbol) / 16;
+    }
+  }
+  Init++;
+}
+
+
+/* Allocate an MMX decoder with decision storage for len data bits plus 6 tail bits; NULL on failure */
+void *create_viterbi27_mmx(int len){
+  struct v27 *vp;
+  if(Init == 0){ /* first use: install the default polynomials */
+    int polys[2] = { V27POLYA, V27POLYB };
+    set_viterbi27_polynomial_mmx(polys);
+  }
+  vp = (struct v27 *)malloc(sizeof(struct v27));
+  if(vp == NULL)
+    return NULL;
+  vp->decisions = (decision_t *)malloc((len+6)*sizeof(decision_t));
+  if(vp->decisions == NULL){
+    free(vp);
+    return NULL;
+  }
+  init_viterbi27_mmx(vp,0);
+  return vp;
+}
+
+/* Viterbi chainback: walk the stored decisions backwards from endstate, writing decoded bytes to data[] */
+int chainback_viterbi27_mmx(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  /* Returns 0 on success, -1 if p is NULL */
+  struct v27 *vp = (struct v27 *)p;
+  decision_t *d;
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->decisions;
+  /* Hold the 6-bit state in the top of an 8-bit register: the loop indexes with endstate>>2 and shifts new bits in at bit 7 */
+  endstate = (endstate & 63) << 2;
+  d += 6; /* Look past tail */
+  while(nbits-- != 0){
+    int k;
+    /* Bit 0 of the decision byte for the current state is the bit shifted back into the register */
+    k = d[nbits].c[endstate>>2] & 1;
+    data[nbits>>3] = endstate = (endstate >> 1) | (k << 7);
+  }
+  return 0;
+}
+
+/* Release a decoder instance together with its decision buffer; NULL is ignored */
+void delete_viterbi27_mmx(void *p){
+  struct v27 *vp = p;
+
+  if(vp == NULL)
+    return;
+  free(vp->decisions);
+  free(vp);
+}
diff --git a/viterbi27_port.c b/viterbi27_port.c
new file mode 100644
index 0000000..7cac2b3
--- /dev/null
+++ b/viterbi27_port.c
@@ -0,0 +1,191 @@
+/* K=7 r=1/2 Viterbi decoder in portable C
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <limits.h>
+#include "fec.h"
+
+
+typedef union { unsigned int w[64]; } metric_t; /* One unsigned int path metric per state */
+typedef union { unsigned long w[2];} decision_t; /* Decision bits for one data bit; BFLY packs 32 per word */
+static union branchtab27 { unsigned char c[32]; } Branchtab27[2] __attribute__ ((aligned(16))); /* 0 or 255 per butterfly, one table per polynomial */
+static int Init = 0; /* Nonzero once Branchtab27 has been filled */
+
+/* State info for instance of Viterbi decoder
+ * Don't change this without also changing references in [mmx|sse|sse2]bfly27.s!
+ * NOTE(review): this said bfly29.s; the K=7 SIMD sources name *bfly27.s - confirm which file, if any, applies */
+struct v27 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  decision_t *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions;   /* Beginning of decisions for block (allocated in create, freed in delete) */
+};
+
+/* Reset a portable-C decoder instance for a new frame; returns 0, or -1 if p is NULL */
+int init_viterbi27_port(void *p,int starting_state){
+  struct v27 *vp = p;
+  int state = 0;
+
+  if(vp == NULL)
+    return -1;
+  while(state < 64)
+    vp->metrics1.w[state++] = 63; /* same initial metric for every state */
+
+  vp->dp = vp->decisions;
+  vp->new_metrics = &vp->metrics2;
+  vp->old_metrics = &vp->metrics1;
+  vp->old_metrics->w[starting_state & 63] = 0; /* Bias known start state */
+  return 0;
+}
+
+/* Fill Branchtab27 with 0 or 255 for each of the 32 butterflies and both polynomials, then mark it initialized */
+void set_viterbi27_polynomial_port(int polys[2]){
+  int state,j;
+
+  for(state=0;state < 32;state++)
+    for(j=0;j < 2;j++) /* a negative polynomial inverts the table entry */
+      Branchtab27[j].c[state] = ((polys[j] < 0) ^ parity((2*state) & abs(polys[j]))) ? 255 : 0;
+  Init++;
+}
+
+/* Allocate a portable-C decoder with decision storage for len data bits plus 6 tail bits; NULL on failure */
+void *create_viterbi27_port(int len){
+  struct v27 *vp;
+  if(Init == 0){ /* first use: install the default polynomials */
+    int polys[2] = { V27POLYA, V27POLYB };
+    set_viterbi27_polynomial_port(polys);
+  }
+  vp = malloc(sizeof(struct v27));
+  if(vp == NULL)
+    return NULL;
+  vp->decisions = malloc((len+6)*sizeof(decision_t));
+  if(vp->decisions == NULL){
+    free(vp);
+    return NULL;
+  }
+  init_viterbi27_port(vp,0);
+  return vp;
+}
+
+/* Viterbi chainback: trace decisions back from endstate, emitting nbits decoded bits into data[] */
+int chainback_viterbi27_port(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v27 *vp = p;
+  decision_t *past_tail;
+
+  if(vp == NULL)
+    return -1;
+  /* Skip the decisions belonging to the 6 tail bits */
+  past_tail = vp->decisions + 6;
+  /* Park the 6-bit state in the upper bits of an 8-bit register so that
+   * shifting in decoded bits accumulates a full byte of output
+   */
+  endstate = (endstate % 64) << 2;
+
+  /* data[] is rewritten on every bit rather than once per byte;
+   * that avoids a conditional branch and the stores combine in
+   * the cache anyway
+   */
+  while(nbits-- != 0){
+    unsigned int state = endstate >> 2;
+    int k = (past_tail[nbits].w[state/32] >> (state%32)) & 1;
+
+    endstate = (endstate >> 1) | (k << 7);
+    data[nbits>>3] = endstate;
+  }
+  return 0;
+}
+
+/* Release a decoder instance together with its decision buffer; NULL is ignored */
+void delete_viterbi27_port(void *p){
+  struct v27 *vp = p;
+
+  if(vp == NULL)
+    return;
+  free(vp->decisions);
+  free(vp);
+}
+
+/* C-language butterfly: add-compare-select on old states i and i+32, writing new states 2i and 2i+1. Expects vp, d, sym0, sym1 in scope; the argument is parenthesized so expressions expand safely */
+#define BFLY(i) {\
+unsigned int metric,m0,m1,decision;\
+    metric = (Branchtab27[0].c[(i)] ^ sym0) + (Branchtab27[1].c[(i)] ^ sym1);\
+    m0 = vp->old_metrics->w[(i)] + metric;\
+    m1 = vp->old_metrics->w[(i)+32] + (510 - metric);\
+    decision = (signed int)(m0-m1) > 0; /* compare via signed difference */\
+    vp->new_metrics->w[2*(i)] = decision ? m1 : m0;\
+    d->w[(i)/16] |= decision << ((2*(i))&31);\
+    m0 -= (metric+metric-510); /* swap the branch metrics for the second new state */\
+    m1 += (metric+metric-510);\
+    decision = (signed int)(m0-m1) > 0;\
+    vp->new_metrics->w[2*(i)+1] = decision ? m1 : m0;\
+    d->w[(i)/16] |= decision << ((2*(i)+1)&31);\
+}
+
+/* Update decoder with a block of demodulated symbols
+ * Note that nbits is the number of decoded data bits, not the number
+ * of symbols! Two symbols are consumed from syms[] per data bit.
+ * Returns 0, or -1 if p is NULL. */
+int update_viterbi27_blk_port(void *p,unsigned char *syms,int nbits){
+  struct v27 *vp = p;
+  void *tmp;
+  decision_t *d;
+
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->dp;
+  while(nbits--){
+    unsigned char sym0,sym1;
+    /* BFLY ORs bits into the decision words, so they must start out clear */
+    d->w[0] = d->w[1] = 0;
+    sym0 = *syms++;
+    sym1 = *syms++;
+    /* One butterfly per pair of old states (i, i+32), written out 32 times */
+    BFLY(0);
+    BFLY(1);
+    BFLY(2);
+    BFLY(3);
+    BFLY(4);
+    BFLY(5);
+    BFLY(6);
+    BFLY(7);
+    BFLY(8);
+    BFLY(9);
+    BFLY(10);
+    BFLY(11);
+    BFLY(12);
+    BFLY(13);
+    BFLY(14);
+    BFLY(15);
+    BFLY(16);
+    BFLY(17);
+    BFLY(18);
+    BFLY(19);
+    BFLY(20);
+    BFLY(21);
+    BFLY(22);
+    BFLY(23);
+    BFLY(24);
+    BFLY(25);
+    BFLY(26);
+    BFLY(27);
+    BFLY(28);
+    BFLY(29);
+    BFLY(30);
+    BFLY(31);
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }    
+  vp->dp = d; /* remember where the next block's decisions go */
+  return 0;
+}
diff --git a/viterbi27_sse.c b/viterbi27_sse.c
new file mode 100644
index 0000000..cd1f287
--- /dev/null
+++ b/viterbi27_sse.c
@@ -0,0 +1,113 @@
+/* K=7 r=1/2 Viterbi decoder for SSE
+ * Feb 2004, Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <xmmintrin.h>
+#include "fec.h"
+
+typedef union { unsigned char c[64]; } metric_t; /* One 8-bit path metric per state */
+typedef union { unsigned long w[2]; unsigned char c[8]; __m64 v[1];} decision_t; /* Decision bits; chainback reads bit state%8 of c[state/8] */
+union branchtab27 { unsigned char c[32]; __m64 v[4];} Branchtab27_sse[2]; /* 0 or 255 per butterfly, one table per polynomial */
+static int Init = 0; /* Nonzero once Branchtab27_sse has been filled */
+
+/* State info for instance of Viterbi decoder
+ * Don't change this without also changing references in ssebfly27.s!
+ */
+struct v27 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  decision_t *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions;   /* Beginning of decisions for block (allocated in create, freed in delete) */
+};
+
+/* Create a new instance of a Viterbi decoder with room for len data bits plus the 6-bit tail; NULL on allocation failure */
+void *create_viterbi27_sse(int len){
+  struct v27 *vp;
+
+  if(!Init){ /* First use: install the default polynomials */
+    int polys[2] = { V27POLYA, V27POLYB };
+    set_viterbi27_polynomial_sse(polys);
+  }
+  if((vp = malloc(sizeof(struct v27))) == NULL)
+    return NULL;
+  if((vp->decisions = malloc((len+6)*sizeof(decision_t))) == NULL){
+    free(vp);
+    return NULL;
+  }
+  /* Use this file's own init: the generic init_viterbi27() may pick another variant whose struct v27 layout differs from this one */
+  init_viterbi27_sse(vp,0);
+  return vp;
+}
+
+/* Fill Branchtab27_sse with 0 or 255 for each of the 32 butterflies and both polynomials, then mark it initialized */
+void set_viterbi27_polynomial_sse(int polys[2]){
+  int state,j;
+
+  for(state=0;state < 32;state++)
+    for(j=0;j < 2;j++) /* a negative polynomial inverts the table entry */
+      Branchtab27_sse[j].c[state] = ((polys[j] < 0) ^ parity((2*state) & abs(polys[j]))) ? 255 : 0;
+  Init++;
+}
+
+/* Reset an SSE decoder instance for a new frame; returns 0, or -1 if p is NULL */
+int init_viterbi27_sse(void *p,int starting_state){
+  struct v27 *vp = p;
+  int state = 0;
+
+  if(vp == NULL)
+    return -1;
+  while(state < 64)
+    vp->metrics1.c[state++] = 63; /* same initial metric for every state */
+
+  vp->dp = vp->decisions;
+  vp->new_metrics = &vp->metrics2;
+  vp->old_metrics = &vp->metrics1;
+  vp->old_metrics->c[starting_state & 63] = 0; /* Bias known start state */
+  return 0;
+}
+
+/* Viterbi chainback: trace decisions back from endstate, emitting nbits decoded bits into data[] */
+int chainback_viterbi27_sse(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v27 *vp = p;
+  decision_t *past_tail;
+
+  if(vp == NULL)
+    return -1;
+
+  /* Skip the decisions belonging to the 6 tail bits */
+  past_tail = vp->decisions + 6;
+  /* Park the 6-bit state in the upper bits of an 8-bit register so that
+   * shifting in decoded bits accumulates a full byte of output
+   */
+  endstate = (endstate % 64) << 2;
+
+  /* data[] is rewritten on every bit rather than once per byte;
+   * that avoids a conditional branch and the stores combine in
+   * the cache anyway
+   */
+  while(nbits-- != 0){
+    unsigned int state = endstate >> 2;
+    int k = (past_tail[nbits].c[state/8] >> (state%8)) & 1;
+
+    endstate = (endstate >> 1) | (k << 7);
+    data[nbits>>3] = endstate;
+  }
+  return 0;
+}
+
+/* Release a decoder instance together with its decision buffer; NULL is ignored */
+void delete_viterbi27_sse(void *p){
+  struct v27 *vp = p;
+
+  if(vp == NULL)
+    return;
+  free(vp->decisions);
+  free(vp);
+}
diff --git a/viterbi27_sse2.c b/viterbi27_sse2.c
new file mode 100644
index 0000000..bc01710
--- /dev/null
+++ b/viterbi27_sse2.c
@@ -0,0 +1,180 @@
+/* K=7 r=1/2 Viterbi decoder for SSE2
+ * Feb 2004, Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <xmmintrin.h>
+#include "fec.h"
+
+typedef union { unsigned char c[64]; __m128i v[4]; } metric_t; /* One 8-bit path metric per state, also viewable as 4 SSE2 vectors */
+typedef union { unsigned long w[2]; unsigned char c[8]; unsigned short s[4]; __m64 v[1];} decision_t; /* Decision bits; chainback reads bit state%8 of c[state/8] */
+union branchtab27 { unsigned char c[32]; __m128i v[2];} Branchtab27_sse2[2]; /* 0 or 255 per butterfly, one table per polynomial */
+static int Init = 0; /* Nonzero once Branchtab27_sse2 has been filled */
+
+/* State info for instance of Viterbi decoder
+ * Don't change this without also changing references in sse2bfly27.s!
+ */
+struct v27 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  decision_t *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions;   /* Beginning of decisions for block (allocated in create, freed in delete) */
+};
+
+/* Reset an SSE2 decoder instance for a new frame; returns 0, or -1 if p is NULL */
+int init_viterbi27_sse2(void *p,int starting_state){
+  struct v27 *vp = p;
+  int state = 0;
+
+  if(vp == NULL)
+    return -1;
+  while(state < 64)
+    vp->metrics1.c[state++] = 63; /* same initial metric for every state */
+
+  vp->dp = vp->decisions;
+  vp->new_metrics = &vp->metrics2;
+  vp->old_metrics = &vp->metrics1;
+  vp->old_metrics->c[starting_state & 63] = 0; /* Bias known start state */
+  return 0;
+}
+
+/* Fill Branchtab27_sse2 with 0 or 255 for each of the 32 butterflies and both polynomials, then mark it initialized */
+void set_viterbi27_polynomial_sse2(int polys[2]){
+  int state,j;
+
+  for(state=0;state < 32;state++)
+    for(j=0;j < 2;j++) /* a negative polynomial inverts the table entry */
+      Branchtab27_sse2[j].c[state] = ((polys[j] < 0) ^ parity((2*state) & abs(polys[j]))) ? 255 : 0;
+  Init++;
+}
+
+
+/* Allocate an aligned SSE2 decoder with decision storage for len data bits plus 6 tail bits; NULL on failure */
+void *create_viterbi27_sse2(int len){
+  void *raw;
+  struct v27 *vp;
+  decision_t *decisions;
+
+  if(Init == 0){ /* first use: install the default polynomials */
+    int polys[2] = { V27POLYA, V27POLYB };
+    set_viterbi27_polynomial_sse2(polys);
+  }
+  /* The instance holds __m128i metrics, so it is aligned to sizeof(__m128i) rather than relying on malloc() */
+  if(posix_memalign(&raw, sizeof(__m128i),sizeof(struct v27)) != 0)
+    return NULL;
+  vp = (struct v27 *)raw;
+  decisions = (decision_t *)malloc((len+6)*sizeof(decision_t));
+  if(decisions == NULL){
+    free(vp);
+    return NULL;
+  }
+  vp->decisions = decisions;
+  init_viterbi27_sse2(vp,0);
+  return vp;
+}
+
+/* Viterbi chainback: trace decisions back from endstate, emitting nbits decoded bits into data[] */
+int chainback_viterbi27_sse2(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v27 *vp = p;
+  decision_t *past_tail;
+
+  if(vp == NULL)
+    return -1;
+  /* Skip the decisions belonging to the 6 tail bits */
+  past_tail = vp->decisions + 6;
+  /* Park the 6-bit state in the upper bits of an 8-bit register so that
+   * shifting in decoded bits accumulates a full byte of output
+   */
+  endstate = (endstate % 64) << 2;
+
+  /* data[] is rewritten on every bit rather than once per byte;
+   * that avoids a conditional branch and the stores combine in
+   * the cache anyway
+   */
+  while(nbits-- != 0){
+    unsigned int state = endstate >> 2;
+    int k = (past_tail[nbits].c[state/8] >> (state%8)) & 1;
+
+    endstate = (endstate >> 1) | (k << 7);
+    data[nbits>>3] = endstate;
+  }
+  return 0;
+}
+
+/* Release a decoder instance together with its decision buffer; NULL is ignored */
+void delete_viterbi27_sse2(void *p){
+  struct v27 *vp = p;
+
+  if(vp == NULL)
+    return;
+  free(vp->decisions);
+  free(vp);
+}
+
+
+#if 0
+/* This code is turned off because it's slower than my hand-crafted assembler in sse2bfly27.s. But it does work. NOTE(review): it returns void while update_viterbi27_blk_port returns int - reconcile before re-enabling */
+void update_viterbi27_blk_sse2(void *p,unsigned char *syms,int nbits){
+  struct v27 *vp = p;
+  decision_t *d;
+
+  if(p == NULL)
+    return;
+  d = (decision_t *)vp->dp;
+  while(nbits--){
+    __m128i sym0v,sym1v;
+    void *tmp;
+    int i;
+    
+    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+    sym0v = _mm_set1_epi8(syms[0]);
+    sym1v = _mm_set1_epi8(syms[1]);
+    syms += 2;
+
+    for(i=0;i<2;i++){
+      __m128i decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+
+      /* Form branch metrics */
+      metric = _mm_avg_epu8(_mm_xor_si128(Branchtab27_sse2[0].v[i],sym0v),_mm_xor_si128(Branchtab27_sse2[1].v[i],sym1v));
+      /* There's no packed bytes right shift in SSE2, so we use the word version and mask
+       * (I'm *really* starting to like Altivec...)
+       */
+      metric = _mm_srli_epi16(metric,3);
+      metric = _mm_and_si128(metric,_mm_set1_epi8(31));
+      m_metric = _mm_sub_epi8(_mm_set1_epi8(31),metric);
+    
+      /* Add branch metrics to path metrics */
+      m0 = _mm_add_epi8(vp->old_metrics->v[i],metric);
+      m3 = _mm_add_epi8(vp->old_metrics->v[2+i],metric);
+      m1 = _mm_add_epi8(vp->old_metrics->v[2+i],m_metric);
+      m2 = _mm_add_epi8(vp->old_metrics->v[i],m_metric);
+    
+      /* Compare and select, using modulo arithmetic */
+      decision0 = _mm_cmpgt_epi8(_mm_sub_epi8(m0,m1),_mm_setzero_si128());
+      decision1 = _mm_cmpgt_epi8(_mm_sub_epi8(m2,m3),_mm_setzero_si128());
+      survivor0 = _mm_or_si128(_mm_and_si128(decision0,m1),_mm_andnot_si128(decision0,m0));
+      survivor1 = _mm_or_si128(_mm_and_si128(decision1,m3),_mm_andnot_si128(decision1,m2));
+ 
+      /* Pack each set of decisions into 16 bits */
+      d->s[2*i] = _mm_movemask_epi8(_mm_unpacklo_epi8(decision0,decision1));
+      d->s[2*i+1] = _mm_movemask_epi8(_mm_unpackhi_epi8(decision0,decision1));
+
+      /* Store surviving metrics */
+      vp->new_metrics->v[2*i] = _mm_unpacklo_epi8(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = _mm_unpackhi_epi8(survivor0,survivor1);
+    }
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d; /* remember where the next block's decisions go */
+}
+#endif
diff --git a/viterbi29.c b/viterbi29.c
new file mode 100644
index 0000000..80cbb33
--- /dev/null
+++ b/viterbi29.c
@@ -0,0 +1,152 @@
+/* Switch to K=9 r=1/2 Viterbi decoder with optional Intel or PowerPC SIMD
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include "fec.h"
+
+/* Call find_cpu_mode(), then create a K=9 decoder with the implementation selected by Cpu_mode */
+void *create_viterbi29(int len){
+  find_cpu_mode();
+
+  switch(Cpu_mode){
+#ifdef __i386__
+  case SSE2:
+    return create_viterbi29_sse2(len);
+  case SSE:
+    return create_viterbi29_sse(len);
+  case MMX:
+    return create_viterbi29_mmx(len);
+#endif
+#ifdef __VEC__
+  case ALTIVEC:
+    return create_viterbi29_av(len);
+#endif
+  case PORT:
+  default: /* any mode not compiled in falls back to portable C */
+    return create_viterbi29_port(len);
+  }
+}
+
+void set_viterbi29_polynomial(int polys[2]){ /* Install the K=9 generator polynomials in the implementation selected by Cpu_mode */
+  find_cpu_mode(); /* establish Cpu_mode first, so a call made before create_viterbi29() programs the tables the decoder will actually use */
+  switch(Cpu_mode){
+  default: /* PORT, and any mode not compiled in */
+    set_viterbi29_polynomial_port(polys);
+    break;
+#ifdef __VEC__
+  case ALTIVEC:
+    set_viterbi29_polynomial_av(polys);
+    break;
+#endif
+#ifdef __i386__
+  case MMX:
+    set_viterbi29_polynomial_mmx(polys);
+    break;
+  case SSE:
+    set_viterbi29_polynomial_sse(polys);
+    break;
+  case SSE2:
+    set_viterbi29_polynomial_sse2(polys);
+    break;
+#endif
+  }
+}
+
+/* Start a new frame on decoder p, using the implementation selected by Cpu_mode */
+int init_viterbi29(void *p,int starting_state){
+  switch(Cpu_mode){
+#ifdef __i386__
+  case SSE2:
+    return init_viterbi29_sse2(p,starting_state);
+  case SSE:
+    return init_viterbi29_sse(p,starting_state);
+  case MMX:
+    return init_viterbi29_mmx(p,starting_state);
+#endif
+#ifdef __VEC__
+  case ALTIVEC:
+    return init_viterbi29_av(p,starting_state);
+#endif
+  case PORT:
+  default:
+    return init_viterbi29_port(p,starting_state);
+  }
+}
+
+/* Viterbi chainback, forwarded to the implementation selected by Cpu_mode */
+int chainback_viterbi29(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+
+  switch(Cpu_mode){
+#ifdef __i386__
+  case SSE2:
+    return chainback_viterbi29_sse2(p,data,nbits,endstate);
+  case SSE:
+    return chainback_viterbi29_sse(p,data,nbits,endstate);
+  case MMX:
+    return chainback_viterbi29_mmx(p,data,nbits,endstate);
+#endif
+#ifdef __VEC__
+  case ALTIVEC:
+    return chainback_viterbi29_av(p,data,nbits,endstate);
+#endif
+  case PORT:
+  default:
+    return chainback_viterbi29_port(p,data,nbits,endstate);
+  }
+}
+
+/* Destroy decoder p with the implementation selected by Cpu_mode */
+void delete_viterbi29(void *p){
+  switch(Cpu_mode){
+#ifdef __i386__
+  case SSE2:
+    delete_viterbi29_sse2(p);
+    break;
+  case SSE:
+    delete_viterbi29_sse(p);
+    break;
+  case MMX:
+    delete_viterbi29_mmx(p);
+    break;
+#endif
+#ifdef __VEC__
+  case ALTIVEC:
+    delete_viterbi29_av(p);
+    break;
+#endif
+  case PORT:
+  default:
+    delete_viterbi29_port(p);
+    break;
+  }
+}
+
+/* Feed a block of demodulated symbols to the implementation selected
+ * by Cpu_mode. nbits counts decoded data bits, not symbols; the
+ * callee's return value is passed straight through.
+ */
+int update_viterbi29_blk(void *p,unsigned char syms[],int nbits){
+  switch(Cpu_mode){
+#ifdef __i386__
+  case SSE2:
+    return update_viterbi29_blk_sse2(p,syms,nbits);
+  case SSE:
+    return update_viterbi29_blk_sse(p,syms,nbits);
+  case MMX:
+    return update_viterbi29_blk_mmx(p,syms,nbits);
+#endif
+#ifdef __VEC__
+  case ALTIVEC:
+    return update_viterbi29_blk_av(p,syms,nbits);
+#endif
+  case PORT:
+  default:
+    return update_viterbi29_blk_port(p,syms,nbits);
+  }
+}
diff --git a/viterbi29_av.c b/viterbi29_av.c
new file mode 100644
index 0000000..31c8d27
--- /dev/null
+++ b/viterbi29_av.c
@@ -0,0 +1,190 @@
+/* K=9 r=1/2 Viterbi decoder for PowerPC G4/G5 Altivec
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <sys/sysctl.h>
+#include "fec.h"
+
+typedef union { unsigned char c[256]; vector bool char v[16]; } decision_t; /* One decision byte per state; chainback reads bit 0 of c[state] */
+typedef union { unsigned char c[256]; vector unsigned char v[16]; } metric_t; /* One 8-bit path metric per state, as 16 vectors */
+/* Per-butterfly table, 0 or 255 per entry, one table per polynomial; filled by set_viterbi29_polynomial_av() */
+static union branchtab29 { unsigned char c[128]; vector unsigned char v[8]; } Branchtab29[2];
+static int Init = 0; /* Nonzero once Branchtab29 has been filled */
+
+/* State info for instance of Viterbi decoder */
+struct v29 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  decision_t *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions;   /* Beginning of decisions for block (allocated in create, freed in delete) */
+};
+
+/* Reset an Altivec decoder instance for a new frame; returns 0, or -1 if p is NULL */
+int init_viterbi29_av(void *p,int starting_state){
+  struct v29 *vp = p;
+  int n = 0;
+
+  if(vp == NULL)
+    return -1;
+  while(n < 16)
+    vp->metrics1.v[n++] = (vector unsigned char)(63); /* 16 vectors cover all 256 metrics */
+
+  vp->dp = vp->decisions;
+  vp->new_metrics = &vp->metrics2;
+  vp->old_metrics = &vp->metrics1;
+  vp->old_metrics->c[starting_state & 255] = 0; /* Bias known start state */
+  return 0;
+}
+
+/* Fill Branchtab29 with 0 or 255 for each of the 128 butterflies and both polynomials, then mark it initialized */
+void set_viterbi29_polynomial_av(int polys[2]){
+  int state,j;
+
+  for(state=0;state < 128;state++)
+    for(j=0;j < 2;j++) /* a negative polynomial inverts the table entry */
+      Branchtab29[j].c[state] = ((polys[j] < 0) ^ parity((2*state) & abs(polys[j]))) ? 255 : 0;
+  Init++;
+}
+
+/* Allocate an Altivec decoder with decision storage for len data bits plus 8 tail bits; NULL on failure */
+void *create_viterbi29_av(int len){
+  struct v29 *vp;
+  if(Init == 0){ /* first use: install the default polynomials */
+    int polys[2] = { V29POLYA,V29POLYB };
+    set_viterbi29_polynomial_av(polys);
+  }
+  if((vp = (struct v29 *)malloc(sizeof(struct v29))) == NULL)
+    return NULL;
+  vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t));
+  if(vp->decisions == NULL){
+    free(vp);
+    return NULL;
+  }
+  init_viterbi29_av(vp,0);
+  return vp;
+}
+
+/* Viterbi chainback: trace decisions back from endstate, emitting nbits decoded bits into data[] */
+int chainback_viterbi29_av(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v29 *vp = p;
+  decision_t *past_tail;
+
+  if(vp == NULL)
+    return -1;
+
+  /* Skip the decisions belonging to the 8 tail bits */
+  past_tail = (decision_t *)vp->decisions + 8;
+
+  /* Reduce the terminal state to the 8 bits that index the 256-entry decision array */
+  endstate %= 256;
+
+  /* data[] is rewritten on every bit rather than once per byte;
+   * that avoids a conditional branch and the stores combine in
+   * the cache anyway
+   */
+  while(nbits-- != 0){
+    int k = past_tail[nbits].c[endstate] & 1;
+
+    endstate = (endstate >> 1) | (k << 7);
+    data[nbits>>3] = endstate;
+  }
+  return 0;
+}
+
+
+/* Release a decoder instance together with its decision buffer; NULL is ignored */
+void delete_viterbi29_av(void *p){
+  struct v29 *vp = p;
+
+  if(vp == NULL)
+    return;
+  free(vp->decisions);
+  free(vp);
+}
+
+/* Update the Altivec K=9 decoder with nbits data bits (two symbols each) taken from syms[]; returns 0, or -1 if p is NULL */
+int update_viterbi29_blk_av(void *p,unsigned char *syms,int nbits){
+  struct v29 *vp = p;
+  decision_t *d;
+  int i;
+
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->dp;
+
+  while(nbits--){
+    vector unsigned char sym1v,sym2v;
+    void *tmp;
+    
+    /* All this seems necessary just to load a byte into all elements of a vector! */
+    sym1v = vec_perm(vec_ld(0,syms),vec_ld(1,syms),vec_lvsl(0,syms)); /* sym1v.0 = syms[0]; sym1v.1 = syms[1] */
+    sym2v = vec_splat(sym1v,1); /* Splat syms[1] across sym2v */
+    sym1v = vec_splat(sym1v,0); /* Splat syms[0] across sym1v */
+    syms += 2;
+    
+    for(i=0;i<8;i++){
+      vector bool char decision0,decision1;
+      vector unsigned char metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+
+      /* Form branch metrics */
+      metric = vec_avg(vec_xor(Branchtab29[0].v[i],sym1v),vec_xor(Branchtab29[1].v[i],sym2v));
+      metric = vec_sr(metric,(vector unsigned char)(3)); /* scale the averaged bytes down to 0..31 */
+      m_metric = (vector unsigned char)(31) - metric; /* metric of the complementary branch */
+    
+      /* Add branch metrics to path metrics */
+      m0 = vec_adds(vp->old_metrics->v[i],metric);
+      m3 = vec_adds(vp->old_metrics->v[8+i],metric);
+      m1 = vec_adds(vp->old_metrics->v[8+i],m_metric);
+      m2 = vec_adds(vp->old_metrics->v[i],m_metric);
+    
+      /* Compare and select first set */
+      decision0 = vec_cmpgt(m0,m1);
+      decision1 = vec_cmpgt(m2,m3);
+      survivor0 = vec_min(m0,m1);
+      survivor1 = vec_min(m2,m3);
+
+      /* Interleave and store decisions and survivors */
+      d->v[2*i] = vec_mergeh(decision0,decision1);
+      d->v[2*i+1] = vec_mergel(decision0,decision1);
+      vp->new_metrics->v[2*i] = vec_mergeh(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = vec_mergel(survivor0,survivor1);
+    }
+    d++;
+    /* renormalize if necessary: once metric 0 reaches 50, subtract the smallest metric from all of them */
+    if(vp->new_metrics->c[0] >= 50){
+      int i; /* shadows the outer i, which is not needed again in this iteration */
+      vector unsigned char scale0,scale1;
+
+      /* Find smallest metric and splat */
+      scale0 = vp->new_metrics->v[0];
+      scale1 = vp->new_metrics->v[1];
+      for(i=2;i<16;i+=2){
+	scale0 = vec_min(scale0,vp->new_metrics->v[i]);
+	scale1 = vec_min(scale1,vp->new_metrics->v[i+1]);
+      }
+      scale0 = vec_min(scale0,scale1);
+      scale0 = vec_min(scale0,vec_sld(scale0,scale0,8));
+      scale0 = vec_min(scale0,vec_sld(scale0,scale0,4));
+      scale0 = vec_min(scale0,vec_sld(scale0,scale0,2));
+      scale0 = vec_min(scale0,vec_sld(scale0,scale0,1));
+
+      /* Now subtract from all metrics */
+      for(i=0;i<16;i++)
+	vp->new_metrics->v[i] = vec_subs(vp->new_metrics->v[i],scale0);
+    }
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d; /* remember where the next block's decisions go */
+  return 0;
+}
diff --git a/viterbi29_mmx.c b/viterbi29_mmx.c
new file mode 100644
index 0000000..563f40a
--- /dev/null
+++ b/viterbi29_mmx.c
@@ -0,0 +1,118 @@
+/* K=9 r=1/2 Viterbi decoder for MMX
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <mmintrin.h>
+#include "fec.h"
+
+typedef union { char c[256]; __m64 v[32];} decision_t; /* One decision byte per state; chainback reads bit 0 of c[state] */
+typedef union { unsigned char c[256]; __m64 v[32];} metric_t; /* One 8-bit path metric per state, also viewable as 32 MMX words */
+/* Branch metric tables indexed [received symbol][state 0..127]; filled by set_viterbi29_polynomial_mmx() */
+unsigned char Mettab29_1[256][128] __attribute__ ((aligned(8)));
+unsigned char Mettab29_2[256][128] __attribute__ ((aligned(8)));
+static int Init = 0; /* Nonzero once the metric tables have been filled */
+
+/* State info for instance of Viterbi decoder
+ * Don't change this without also changing references in mmxbfly29.s!
+ */
+struct v29 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  decision_t *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions;   /* Beginning of decisions for block (allocated in create, freed in delete) */
+};
+
+/* Create a new instance of a Viterbi decoder with room for len data bits plus the 8-bit tail; NULL on allocation failure */
+void *create_viterbi29_mmx(int len){
+  struct v29 *vp;
+
+  if(Init == 0){ /* First use: install the default polynomials */
+    int polys[2] = {V29POLYA,V29POLYB};
+    set_viterbi29_polynomial_mmx(polys);
+  }
+  if((vp = (struct v29 *)malloc(sizeof(struct v29))) == NULL)
+    return NULL;
+  if((vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t))) == NULL){
+    free(vp);
+    return NULL;
+  }
+  /* Use this file's own init: the generic init_viterbi29() dispatches on Cpu_mode and would
+   * otherwise run e.g. the portable init, whose unsigned int metric buffers overrun this allocation */
+  init_viterbi29_mmx(vp,0);
+  return vp;
+}
+
+/* Build both branch metric tables from the two generator polynomials and mark them initialized */
+void set_viterbi29_polynomial_mmx(int polys[2]){
+  int state,symbol;
+
+  for(state=0;state < 128;state++){
+    /* Expected encoder output bits for this state; a negative polynomial inverts the bit */
+    int bit0 = parity((2*state) & abs(polys[0])) ^ (polys[0] < 0);
+    int bit1 = parity((2*state) & abs(polys[1])) ^ (polys[1] < 0);
+
+    for(symbol = 0;symbol < 256;symbol++){
+      /* Distance of the received symbol from the expected bit, scaled down to 0..15 */
+      Mettab29_1[symbol][state] = (bit0 ? (255-symbol):symbol) / 16;
+      Mettab29_2[symbol][state] = (bit1 ? (255-symbol):symbol) / 16;
+    }
+  }
+
+  Init++;
+}
+
+/* Reset an MMX decoder instance for a new frame; returns 0, or -1 if p is NULL */
+int init_viterbi29_mmx(void *p,int starting_state){
+  struct v29 *vp = p;
+  int state = 0;
+
+  if(vp == NULL)
+    return -1;
+  while(state < 256)
+    vp->metrics1.c[state++] = 63; /* same initial metric for every state */
+
+  vp->dp = vp->decisions;
+  vp->new_metrics = &vp->metrics2;
+  vp->old_metrics = &vp->metrics1;
+  vp->old_metrics->c[starting_state & 255] = 0; /* Bias known start state */
+  return 0;
+}
+
+/* Viterbi chainback: trace decisions back from endstate, emitting nbits decoded bits into data[] */
+int chainback_viterbi29_mmx(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v29 *vp = (struct v29 *)p;
+  decision_t *past_tail;
+
+  if(vp == NULL)
+    return -1;
+
+  /* Skip the decisions belonging to the 8 tail bits */
+  past_tail = (decision_t *)vp->decisions + 8;
+  /* Reduce the terminal state to the 8 bits that index the 256-entry decision array */
+  endstate &= 255;
+  while(nbits-- != 0){
+    int k = past_tail[nbits].c[endstate] & 1;
+
+    endstate = (endstate >> 1) | (k << 7);
+    data[nbits>>3] = endstate;
+  }
+  return 0;
+}
+
+/* Release a decoder instance together with its decision buffer; NULL is ignored */
+void delete_viterbi29_mmx(void *p){
+  struct v29 *vp = p;
+
+  if(vp == NULL)
+    return;
+  free(vp->decisions);
+  free(vp);
+}
diff --git a/viterbi29_port.c b/viterbi29_port.c
new file mode 100644
index 0000000..292dce8
--- /dev/null
+++ b/viterbi29_port.c
@@ -0,0 +1,166 @@
+/* K=9 r=1/2 Viterbi decoder in portable C
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include "fec.h"
+
+typedef union { unsigned int w[256]; } metric_t; /* path metrics, one per state */
+typedef union { unsigned long w[8];} decision_t; /* decisions for one data bit; state s is bit s%32 of w[s/32], higher bits of each word stay unused */
+
+static union { unsigned char c[128]; } Branchtab29[2]; /* 0 or 255 per state, one table per polynomial */
+static int Init = 0; /* nonzero once set_viterbi29_polynomial_port() has run */
+
+/* State info for instance of Viterbi decoder */
+struct v29 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  decision_t *dp;          /* Pointer to current decision (advanced by update, reset by init) */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions;   /* Beginning of decisions for block (allocated in create, freed in delete) */
+};
+
+/* Initialize Viterbi decoder for start of new frame.
+ * Returns -1 if p is NULL, otherwise 0. */
+int init_viterbi29_port(void *p,int starting_state){
+  struct v29 *vp = p;
+  int n = 256;
+  if(vp == NULL)
+    return -1;
+  while(n-- > 0)
+    vp->metrics1.w[n] = 63; /* same starting metric for every state */
+
+  vp->dp = vp->decisions; /* rewind to the first decision record */
+  vp->new_metrics = &vp->metrics2;
+  vp->old_metrics = &vp->metrics1;
+  vp->old_metrics->w[starting_state & 255] = 0; /* Bias known start state */
+  return 0;
+}
+
+/* Build both branch tables: 255 where the (possibly inverted) parity bit is 1, else 0 */
+void set_viterbi29_polynomial_port(int polys[2]){
+  int st,j;
+  for(st=0;st < 128;st++){
+    for(j=0;j < 2;j++)
+      Branchtab29[j].c[st] = ((polys[j] < 0) ^ parity((2*st) & abs(polys[j]))) ? 255 : 0;
+  }
+  Init++;
+}
+
+
+/* Create a new instance of a Viterbi decoder with len+8 decision records.
+ * Returns NULL if either allocation fails. */
+void *create_viterbi29_port(int len){
+  struct v29 *vp;
+  if(Init == 0){ /* no polynomials installed yet: use V29POLYA/V29POLYB */
+    int polys[2] = {V29POLYA,V29POLYB};
+    set_viterbi29_polynomial_port(polys);
+  }
+  vp = (struct v29 *)malloc(sizeof(struct v29));
+  if(vp == NULL)
+    return NULL;
+  vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t));
+  if(vp->decisions == NULL){
+    free(vp);
+    return NULL;
+  }
+  init_viterbi29_port(vp,0);
+  return vp;
+}
+
+
+/* Viterbi chainback: trace the surviving path backwards from endstate and
+ * store the decoded bits in data[]. Returns -1 if p is NULL, otherwise 0. */
+int chainback_viterbi29_port(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v29 *vp = p;
+  decision_t *tail;
+  unsigned int bit;
+
+  if(vp == NULL)
+    return -1;
+
+  /* Only the low 8 bits of the terminal state are used */
+  endstate &= 255;
+
+  /* The store into data[] only needs to be done every 8 bits.
+   * But this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  tail = vp->decisions + 8; /* Look past tail */
+  for(bit = nbits; bit-- != 0; ){
+    /* Decision bits are packed 32 per word, indexed by state */
+    unsigned long word = tail[bit].w[endstate >> 5];
+    unsigned int k = (word >> (endstate & 31)) & 1;
+
+    data[bit>>3] = endstate = (endstate >> 1) | (k << 7);
+  }
+  return 0;
+}
+
+
+/* Delete instance of a Viterbi decoder; a NULL pointer is ignored */
+void delete_viterbi29_port(void *p){
+  struct v29 *decoder = p;
+
+  if(decoder == NULL)
+    return;
+  free(decoder->decisions);
+  free(decoder);
+}
+
+/* C-language butterfly: add-compare-select from old states i and i+128 into new states 2*i and 2*i+1; uses vp, d, sym0 and sym1 from the caller's scope */
+#define BFLY(i) {\
+unsigned int metric,m0,m1,decision;\
+    metric = (Branchtab29[0].c[i] ^ sym0) + (Branchtab29[1].c[i] ^ sym1); /* branch metric, 0..510 for 8-bit symbols */\
+    m0 = vp->old_metrics->w[i] + metric;\
+    m1 = vp->old_metrics->w[i+128] + (510 - metric);\
+    decision = (signed int)(m0-m1) > 0; /* 1 when the path through state i+128 has the smaller metric */\
+    vp->new_metrics->w[2*i] = decision ? m1 : m0;\
+    d->w[i/16] |= decision << ((2*i)&31);\
+    m0 -= (metric+metric-510); /* now old[i] + (510 - metric) */\
+    m1 += (metric+metric-510); /* now old[i+128] + metric */\
+    decision = (signed int)(m0-m1) > 0;\
+    vp->new_metrics->w[2*i+1] = decision ? m1 : m0;\
+    d->w[i/16] |= decision << ((2*i+1)&31);\
+}
+
+/* Update decoder with a block of demodulated symbols.
+ * Two symbols are consumed from syms[] per decoded bit, so nbits is the
+ * number of decoded data bits, not the number of symbols!
+ * Returns -1 if p is NULL, otherwise 0.
+ */
+int update_viterbi29_blk_port(void *p,unsigned char *syms,int nbits){
+  struct v29 *vp = p;
+  decision_t *d; /* name is referenced by BFLY() */
+
+  if(vp == NULL)
+    return -1;
+  d = (decision_t *)vp->dp;
+  for(;nbits != 0;nbits--){
+    metric_t *swap;
+    unsigned char sym0,sym1; /* names are referenced by BFLY() */
+    int i;
+
+    /* BFLY() only ORs bits in, so start from a clean decision record */
+    memset(d->w,0,sizeof(d->w));
+    sym0 = syms[0];
+    sym1 = syms[1];
+    syms += 2;
+
+    for(i=0;i<128;i++)
+      BFLY(i);
+
+    d++; /* next decision record */
+    swap = vp->old_metrics; /* the metrics just written become the input for the next bit */
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = swap;
+  }
+  vp->dp = d;
+  return 0;
+}
diff --git a/viterbi29_sse.c b/viterbi29_sse.c
new file mode 100644
index 0000000..4a92e5f
--- /dev/null
+++ b/viterbi29_sse.c
@@ -0,0 +1,114 @@
+/* K=9 r=1/2 Viterbi decoder for SSE
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <xmmintrin.h>
+#include "fec.h"
+
+typedef union { unsigned char w[256]; __m64 v[32];} metric_t; /* path metrics, one byte per state; w and v view the same storage */
+typedef union { unsigned long w[8]; unsigned char c[32]; __m64 v[4];} decision_t; /* decisions for one data bit; chainback reads state s as bit s%8 of c[s/8] */
+
+union branchtab29 { unsigned char c[128]; } Branchtab29_sse[2]; /* 0 or 255 per state, one table per polynomial; NOTE(review): non-static, presumably for the assembler butterfly - confirm */
+static int Init = 0; /* nonzero once set_viterbi29_polynomial_sse() has run */
+
+/* State info for instance of Viterbi decoder
+ * Don't change this without also changing references in [mmx|sse|sse2]bfly29.s!
+ */
+struct v29 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  decision_t *dp;          /* Pointer to current decision (reset to decisions by init) */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions;   /* Beginning of decisions for block (allocated in create, freed in delete) */
+};
+
+/* Create a new instance of a Viterbi decoder with len+8 decision records; returns NULL if allocation fails */
+void *create_viterbi29_sse(int len){
+  struct v29 *vp;
+
+  if(!Init){
+    int polys[2] = { V29POLYA,V29POLYB };
+    set_viterbi29_polynomial_sse(polys);
+  }
+  if((vp = (struct v29 *)malloc(sizeof(struct v29))) == NULL)
+    return NULL;
+  if((vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t))) == NULL){
+    free(vp);
+    return NULL;
+  }
+  /* Call this file's initializer, not the init_viterbi29() dispatcher: the object has the SSE layout */
+  init_viterbi29_sse(vp,0);
+  return vp;
+}
+
+/* Build both branch tables: 255 where the (possibly inverted) parity bit is 1, else 0 */
+void set_viterbi29_polynomial_sse(int polys[2]){
+  int st,j;
+  for(st=0;st < 128;st++){
+    for(j=0;j < 2;j++)
+      Branchtab29_sse[j].c[st] = ((polys[j] < 0) ^ parity((2*st) & abs(polys[j]))) ? 255 : 0;
+  }
+  Init++;
+}
+
+/* Initialize Viterbi decoder for start of new frame.
+ * Returns -1 if p is NULL, otherwise 0. */
+int init_viterbi29_sse(void *p,int starting_state){
+  struct v29 *vp = p;
+  int n = 256;
+  if(vp == NULL)
+    return -1;
+  while(n-- > 0)
+    vp->metrics1.w[n] = 200; /* same starting metric for every state */
+
+  vp->dp = vp->decisions; /* rewind to the first decision record */
+  vp->new_metrics = &vp->metrics2;
+  vp->old_metrics = &vp->metrics1;
+  vp->old_metrics->w[starting_state & 255] = 0; /* Bias known start state */
+  return 0;
+}
+
+/* Viterbi chainback: trace the surviving path backwards from endstate and
+ * store the decoded bits in data[]. Returns -1 if p is NULL, otherwise 0. */
+int chainback_viterbi29_sse(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v29 *vp = p;
+  decision_t *tail;
+  unsigned int bit;
+
+  if(vp == NULL)
+    return -1;
+
+  /* Only the low 8 bits of the terminal state are used */
+  endstate &= 255;
+
+  /* The store into data[] only needs to be done every 8 bits.
+   * But this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  tail = vp->decisions + 8; /* Look past tail */
+  for(bit = nbits; bit-- != 0; ){
+    /* Decision bits are packed eight per byte, indexed by state */
+    unsigned int k = (tail[bit].c[endstate >> 3] >> (endstate & 7)) & 1;
+
+    data[bit>>3] = endstate = (endstate >> 1) | (k << 7);
+  }
+  return 0;
+}
+
+
+/* Delete instance of a Viterbi decoder; a NULL pointer is ignored */
+void delete_viterbi29_sse(void *p){
+  struct v29 *decoder = p;
+
+  if(decoder == NULL)
+    return;
+  free(decoder->decisions);
+  free(decoder);
+}
diff --git a/viterbi29_sse2.c b/viterbi29_sse2.c
new file mode 100644
index 0000000..4c7336c
--- /dev/null
+++ b/viterbi29_sse2.c
@@ -0,0 +1,119 @@
+/* K=9 r=1/2 Viterbi decoder for SSE2
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <emmintrin.h>
+#include "fec.h"
+
+typedef union { unsigned char c[256]; __m128i v[16];} metric_t; /* path metrics, one byte per state; c and v view the same storage */
+typedef union { unsigned long w[8]; unsigned char c[32];} decision_t; /* decisions for one data bit; chainback reads state s as bit s%8 of c[s/8] */
+
+union branchtab29 { unsigned char c[128]; } Branchtab29_sse2[2]; /* 0 or 255 per state, one table per polynomial; NOTE(review): non-static, presumably for the assembler butterfly - confirm */
+static int Init = 0; /* nonzero once set_viterbi29_polynomial_sse2() has run */
+
+/* State info for instance of Viterbi decoder
+ * Don't change this without also changing references in sse2bfly29.s!
+ */
+struct v29 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  decision_t *dp;          /* Pointer to current decision (reset to decisions by init) */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions;   /* Beginning of decisions for block (allocated in create, freed in delete) */
+};
+
+/* Initialize Viterbi decoder for start of new frame; returns -1 if p is NULL, else 0 */
+int init_viterbi29_sse2(void *p,int starting_state){
+  struct v29 *vp = p;
+  int i;
+  if(p == NULL) /* nothing to initialize; avoid dereferencing NULL */
+    return -1;
+  for(i=0;i<256;i++)
+    vp->metrics1.c[i] = 63;
+  vp->old_metrics = &vp->metrics1;
+  vp->new_metrics = &vp->metrics2;
+  vp->dp = vp->decisions;
+  vp->old_metrics->c[starting_state & 255] = 0; /* Bias known start state */
+  return 0;
+}
+
+/* Build both branch tables: 255 where the (possibly inverted) parity bit is 1, else 0 */
+void set_viterbi29_polynomial_sse2(int polys[2]){
+  int st,j;
+  for(st=0;st < 128;st++){
+    for(j=0;j < 2;j++)
+      Branchtab29_sse2[j].c[st] = ((polys[j] < 0) ^ parity((2*st) & abs(polys[j]))) ? 255 : 0;
+  }
+  Init++;
+}
+
+
+/* Create a new instance of a Viterbi decoder with len+8 decision records; returns NULL if allocation fails */
+void *create_viterbi29_sse2(int len){
+  void *p;
+  struct v29 *vp;
+
+  if(!Init){
+    int polys[2] = {V29POLYA,V29POLYB};
+    /* Fill this file's Branchtab29_sse2 directly; the generic dispatcher may select another variant */
+    set_viterbi29_polynomial_sse2(polys);
+  }
+  /* Ordinary malloc() only returns 8-byte alignment, we need 16 */
+  if(posix_memalign(&p, sizeof(__m128i),sizeof(struct v29)))
+    return NULL;
+  vp = (struct v29 *)p;
+  if((p = malloc((len+8)*sizeof(decision_t))) == NULL){
+    free(vp);
+    return NULL;
+  }
+  vp->decisions = (decision_t *)p;
+  init_viterbi29_sse2(vp,0);
+  return vp;
+}
+
+
+/* Viterbi chainback: trace the surviving path backwards from endstate and
+ * store the decoded bits in data[]. Returns -1 if p is NULL, otherwise 0. */
+int chainback_viterbi29_sse2(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v29 *vp = p;
+  decision_t *tail;
+  unsigned int bit;
+
+  if(vp == NULL)
+    return -1;
+
+  /* Only the low 8 bits of the terminal state are used */
+  endstate &= 255;
+
+  /* The store into data[] only needs to be done every 8 bits.
+   * But this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  tail = vp->decisions + 8; /* Look past tail */
+  for(bit = nbits; bit-- != 0; ){
+    /* Decision bits are packed eight per byte, indexed by state */
+    unsigned int packed = tail[bit].c[endstate >> 3];
+    unsigned int k = (packed >> (endstate & 7)) & 1;
+
+    data[bit>>3] = endstate = (endstate >> 1) | (k << 7);
+  }
+  return 0;
+}
+
+
+/* Delete instance of a Viterbi decoder; a NULL pointer is ignored */
+void delete_viterbi29_sse2(void *p){
+  struct v29 *decoder = p;
+
+  if(decoder == NULL)
+    return;
+  free(decoder->decisions);
+  free(decoder);
+}
diff --git a/viterbi39.c b/viterbi39.c
new file mode 100644
index 0000000..ac28c2c
--- /dev/null
+++ b/viterbi39.c
@@ -0,0 +1,153 @@
+/* Switch to K=9 r=1/3 Viterbi decoder with optional Intel or PowerPC SIMD
+ * Copyright Aug 2006, Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include "fec.h"
+
+/* Create a new instance of a Viterbi decoder using the variant selected by Cpu_mode */
+void *create_viterbi39(int len){
+  find_cpu_mode();
+
+  switch(Cpu_mode){
+#ifdef __i386__
+  case SSE2:
+    return create_viterbi39_sse2(len);
+  case SSE:
+    return create_viterbi39_sse(len);
+  case MMX:
+    return create_viterbi39_mmx(len);
+#endif
+#ifdef __VEC__
+  case ALTIVEC:
+    return create_viterbi39_av(len);
+#endif
+  default: /* PORT, or a mode with no variant compiled in */
+    break;
+  }
+  return create_viterbi39_port(len);
+}
+
+/* Install new polynomials in the decoder variant selected by Cpu_mode */
+void set_viterbi39_polynomial(int polys[3]){
+  find_cpu_mode(); /* may be the first libfec call: pick the variant create_viterbi39() will use */
+  switch(Cpu_mode){
+#ifdef __VEC__
+  case ALTIVEC:
+    set_viterbi39_polynomial_av(polys);
+    break;
+#endif
+#ifdef __i386__
+  case MMX:
+    set_viterbi39_polynomial_mmx(polys);
+    break;
+  case SSE:
+    set_viterbi39_polynomial_sse(polys);
+    break;
+  case SSE2:
+    set_viterbi39_polynomial_sse2(polys);
+    break;
+#endif
+  default: /* PORT, or a mode with no variant compiled in */
+    set_viterbi39_polynomial_port(polys);
+  }
+}
+
+
+/* Initialize Viterbi decoder for start of new frame (forwards to the variant selected by Cpu_mode) */
+int init_viterbi39(void *p,int starting_state){
+  switch(Cpu_mode){
+#ifdef __i386__
+  case SSE2:
+    return init_viterbi39_sse2(p,starting_state);
+  case SSE:
+    return init_viterbi39_sse(p,starting_state);
+  case MMX:
+    return init_viterbi39_mmx(p,starting_state);
+#endif
+#ifdef __VEC__
+  case ALTIVEC:
+    return init_viterbi39_av(p,starting_state);
+#endif
+  default: /* PORT, or a mode with no variant compiled in */
+    break;
+  }
+  return init_viterbi39_port(p,starting_state);
+}
+
+/* Viterbi chainback: forwards to the chainback routine of the variant
+ * selected by Cpu_mode and returns whatever that routine returns */
+int chainback_viterbi39(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  switch(Cpu_mode){
+#ifdef __i386__
+  case SSE2:
+    return chainback_viterbi39_sse2(p,data,nbits,endstate);
+  case SSE:
+    return chainback_viterbi39_sse(p,data,nbits,endstate);
+  case MMX:
+    return chainback_viterbi39_mmx(p,data,nbits,endstate);
+#endif
+#ifdef __VEC__
+  case ALTIVEC:
+    return chainback_viterbi39_av(p,data,nbits,endstate);
+#endif
+  default: /* PORT, or a mode with no variant compiled in */
+    break;
+  }
+  return chainback_viterbi39_port(p,data,nbits,endstate);
+}
+
+/* Delete instance of a Viterbi decoder by handing it to the delete
+ * routine of the variant selected by Cpu_mode */
+void delete_viterbi39(void *p){
+  switch(Cpu_mode){
+#ifdef __i386__
+  case SSE2:
+    delete_viterbi39_sse2(p);
+    return;
+  case SSE:
+    delete_viterbi39_sse(p);
+    return;
+  case MMX:
+    delete_viterbi39_mmx(p);
+    return;
+#endif
+#ifdef __VEC__
+  case ALTIVEC:
+    delete_viterbi39_av(p);
+    return;
+#endif
+  default: /* PORT, or a mode with no variant compiled in */
+    break;
+  }
+  delete_viterbi39_port(p);
+}
+
+/* Update decoder with a block of demodulated symbols, using the variant
+ * selected by Cpu_mode. Note that nbits is the number of decoded data
+ * bits, not the number of symbols!
+ */
+int update_viterbi39_blk(void *p,unsigned char syms[],int nbits){
+  switch(Cpu_mode){
+#ifdef __i386__
+  case SSE2:
+    return update_viterbi39_blk_sse2(p,syms,nbits);
+  case SSE:
+    return update_viterbi39_blk_sse(p,syms,nbits);
+  case MMX:
+    return update_viterbi39_blk_mmx(p,syms,nbits);
+#endif
+#ifdef __VEC__
+  case ALTIVEC:
+    return update_viterbi39_blk_av(p,syms,nbits);
+#endif
+  default: /* PORT, or a mode with no variant compiled in */
+    break;
+  }
+  return update_viterbi39_blk_port(p,syms,nbits);
+}
diff --git a/viterbi39_av.c b/viterbi39_av.c
new file mode 100644
index 0000000..2deed51
--- /dev/null
+++ b/viterbi39_av.c
@@ -0,0 +1,251 @@
+/* K=9 r=1/3 Viterbi decoder for PowerPC G4/G5 Altivec vector instructions
+ * 8-bit offset-binary soft decision samples
+ * Copyright Aug 2006, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <limits.h>
+#include "fec.h"
+
+typedef union { unsigned char c[2][16]; vector unsigned char v[2]; } decision_t; /* decisions for one data bit; c and v view the same storage (bit layout is undone in chainback) */
+typedef union { unsigned short s[256]; vector unsigned short v[32]; } metric_t; /* path metrics, one per state; s and v view the same storage */
+
+static union branchtab39 { unsigned short s[128]; vector unsigned short v[16];} Branchtab39[3]; /* 0 or 255 per state, one table per polynomial */
+static int Init = 0; /* nonzero once set_viterbi39_polynomial_av() has run */
+
+/* State info for instance of Viterbi decoder */
+struct v39 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  void *dp;          /* Pointer to current decision (used as decision_t *) */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  void *decisions;   /* Beginning of decisions for block (allocated in create, freed in delete) */
+};
+
+/* Initialize Viterbi decoder for start of new frame; returns -1 if p is NULL, else 0 */
+int init_viterbi39_av(void *p,int starting_state){
+  struct v39 *vp = p;
+  int i;
+  if(p == NULL) /* nothing to initialize; avoid dereferencing NULL */
+    return -1;
+  for(i=0;i<32;i++)
+    vp->metrics1.v[i] = (vector unsigned short)(1000);
+  vp->old_metrics = &vp->metrics1;
+  vp->new_metrics = &vp->metrics2;
+  vp->dp = vp->decisions;
+  vp->old_metrics->s[starting_state & 255] = 0; /* Bias known start state */
+  return 0;
+}
+
+/* Build the three branch tables: 255 where parity((2*state) & |poly|),
+ * inverted for a negative polynomial, is 1; otherwise 0 */
+void set_viterbi39_polynomial_av(int polys[3]){
+  int st,j;
+  for(st=0;st < 128;st++){
+    for(j=0;j < 3;j++)
+      Branchtab39[j].s[st] = ((polys[j] < 0) ^ parity((2*st) & abs(polys[j]))) ? 255 : 0;
+  }
+  Init++;
+}
+
+/* Create a new instance of a Viterbi decoder with len+8 decision records; returns NULL if allocation fails */
+void *create_viterbi39_av(int len){
+  struct v39 *vp;
+  int polys[3] = { V39POLYA, V39POLYB, V39POLYC };
+  if(!Init) /* no polynomials installed yet */
+    set_viterbi39_polynomial_av(polys);
+  vp = (struct v39 *)malloc(sizeof(struct v39));
+  if(vp == NULL || (vp->decisions = malloc(sizeof(decision_t)*(len+8))) == NULL){
+    free(vp); /* free(NULL) is a no-op, so both failure cases land here */
+    return NULL;
+  }
+  init_viterbi39_av(vp,0);
+  return vp;
+}
+
+/* Viterbi chainback; returns the path metric stored for endstate, or -1 if p is NULL */
+int chainback_viterbi39_av(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v39 *vp = p;
+  decision_t *d;
+  int path_metric;
+
+  if(p == NULL) /* check before touching vp->decisions */
+    return -1;
+  d = (decision_t *)vp->decisions;
+  endstate %= 256; /* keep the state inside the 256-entry metric table */
+
+  path_metric = vp->old_metrics->s[endstate];
+
+  /* The store into data[] only needs to be done every 8 bits.
+   * But this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  d += 8; /* Look past tail */
+  while(nbits-- != 0){
+    int k;
+    
+    k = (d[nbits].c[endstate >> 7][endstate & 15] & (0x80 >> ((endstate>>4)&7)) ) ? 1 : 0;
+    endstate = (k << 7) | (endstate >> 1);
+    data[nbits>>3] = endstate;
+  }
+  return path_metric;
+}
+
+/* Delete instance of a Viterbi decoder; a NULL pointer is ignored */
+void delete_viterbi39_av(void *p){
+  struct v39 *decoder = p;
+
+  if(decoder == NULL)
+    return;
+  free(decoder->decisions);
+  free(decoder);
+}
+/* Update decoder with a block of demodulated symbols (three consumed per decoded bit); returns the total removed from the metrics by renormalization */
+int update_viterbi39_blk_av(void *p,unsigned char *syms,int nbits){
+  struct v39 *vp = p; /* NOTE(review): p is not checked for NULL before use */
+  decision_t *d = (decision_t *)vp->dp;
+  int path_metric = 0; /* accumulates the scale subtracted on each renormalization */
+  vector unsigned char decisions = (vector unsigned char)(0);
+
+  while(nbits--){
+    vector unsigned short symv,sym0v,sym1v,sym2v;
+    vector unsigned char s;
+    void *tmp;
+    int i;
+    
+    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+    s = (vector unsigned char)vec_perm(vec_ld(0,syms),vec_ld(5,syms),vec_lvsl(0,syms));
+
+    symv = (vector unsigned short)vec_mergeh((vector unsigned char)(0),s);    /* Unsigned byte->word unpack */ 
+    sym0v = vec_splat(symv,0);
+    sym1v = vec_splat(symv,1);
+    sym2v = vec_splat(symv,2);
+    syms += 3; /* three symbols per decoded bit */
+    
+    for(i=0;i<16;i++){
+      vector bool short decision0,decision1;
+      vector unsigned short metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+
+      /* Form branch metrics
+       * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
+       * the XOR operations constitute conditional negation.
+       * the metrics are in the range 0-765
+       */
+      m0 = vec_add(vec_xor(Branchtab39[0].v[i],sym0v),vec_xor(Branchtab39[1].v[i],sym1v));
+      m1 = vec_xor(Branchtab39[2].v[i],sym2v);
+      metric = vec_add(m0,m1);
+      m_metric = vec_sub((vector unsigned short)(765),metric);
+    
+      /* Add branch metrics to path metrics */
+      m0 = vec_adds(vp->old_metrics->v[i],metric);
+      m3 = vec_adds(vp->old_metrics->v[16+i],metric);
+      m1 = vec_adds(vp->old_metrics->v[16+i],m_metric);
+      m2 = vec_adds(vp->old_metrics->v[i],m_metric);
+    
+      /* Compare and select */
+      decision0 = vec_cmpgt(m0,m1);
+      decision1 = vec_cmpgt(m2,m3);
+      survivor0 = vec_min(m0,m1);
+      survivor1 = vec_min(m2,m3);
+    
+      /* Store decisions and survivors.
+       * To save space without SSE2's handy PMOVMSKB instruction, we pack and store them in
+       * a funny interleaved fashion that we undo in the chainback function.
+       */
+      decisions = vec_add(decisions,decisions); /* Shift each byte 1 bit to the left */
+
+      /* Booleans are either 0xff or 0x00. Subtracting 0x00 leaves the lsb zero; subtracting
+       * 0xff is equivalent to adding 1, which sets the lsb.
+       */
+      decisions = vec_sub(decisions,(vector unsigned char)vec_pack(vec_mergeh(decision0,decision1),vec_mergel(decision0,decision1)));
+
+      vp->new_metrics->v[2*i] = vec_mergeh(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = vec_mergel(survivor0,survivor1);
+
+      if((i % 8) == 7){
+	/* We've accumulated a total of 128 decisions, stash and start again */
+	d->v[i>>3] = decisions; /* No need to clear, the new bits will replace the old */
+      }
+    }
+#if 0
+    /* Experimentally determine metric spread
+     * The results are fixed for a given code and input symbol size
+     */
+    {
+      int i;
+      vector unsigned short min_metric;
+      vector unsigned short max_metric;
+      union { vector unsigned short v; unsigned short s[8];} t;
+      int minimum,maximum;
+      static int max_spread = 0;
+
+      min_metric = max_metric = vp->new_metrics->v[0];
+      for(i=1;i<32;i++){
+	min_metric = vec_min(min_metric,vp->new_metrics->v[i]);
+	max_metric = vec_max(max_metric,vp->new_metrics->v[i]);
+      }
+      min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,8));
+      max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,8));
+      min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,4));
+      max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,4));
+      min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,2));
+      max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,2));
+
+      t.v = min_metric;
+      minimum = t.s[0];
+      t.v = max_metric;
+      maximum = t.s[0];
+      if(maximum-minimum > max_spread){
+	max_spread = maximum-minimum;
+	printf("metric spread = %d\n",max_spread);
+      }
+    }
+#endif
+
+    /* Renormalize if necessary. This deserves some explanation.
+     * The maximum possible spread, found by experiment, for 8 bit symbols is about 3825
+     * So by looking at one arbitrary metric we can tell if any of them have possibly saturated.
+     * However, this is very conservative. Large spreads occur only at very high Eb/No, where
+     * saturating a bad path metric doesn't do much to increase its chances of being erroneously chosen as a survivor.
+
+     * At more interesting (low) Eb/No ratios, the spreads are much smaller so our chances of saturating a metric
+     * by not normalizing when we should are extremely low. So either way, the risk to performance is small.
+
+     * All this is borne out by experiment.
+     */
+    if(vp->new_metrics->s[0] >= USHRT_MAX-5000){
+      vector unsigned short scale;
+      union { vector unsigned short v; unsigned short s[8];} t;
+      
+      /* Find smallest metric and splat */
+      scale = vp->new_metrics->v[0];
+      for(i=1;i<32;i++)
+	scale = vec_min(scale,vp->new_metrics->v[i]);
+
+      scale = vec_min(scale,vec_sld(scale,scale,8));
+      scale = vec_min(scale,vec_sld(scale,scale,4));
+      scale = vec_min(scale,vec_sld(scale,scale,2));
+
+      /* Subtract it from all metrics
+       * Work backwards to try to improve the cache hit ratio, assuming LRU
+       */
+      for(i=31;i>=0;i--)
+	vp->new_metrics->v[i] = vec_subs(vp->new_metrics->v[i],scale);
+      t.v = scale;
+      path_metric += t.s[0];
+    }
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d;
+  return path_metric;
+}
diff --git a/viterbi39_mmx.c b/viterbi39_mmx.c
new file mode 100644
index 0000000..875391a
--- /dev/null
+++ b/viterbi39_mmx.c
@@ -0,0 +1,185 @@
+/* K=9 r=1/3 Viterbi decoder for x86 MMX
+ * Aug 2006, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <mmintrin.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include "fec.h"
+
+typedef union { unsigned char c[256]; __m64 v[32];} decision_t; /* decisions for one data bit, one byte per state; c and v view the same storage */
+typedef union { unsigned short s[256]; __m64 v[64];} metric_t; /* path metrics, one 16-bit value per state; s and v view the same storage */
+
+static union branchtab39 { unsigned short s[128]; __m64 v[32];} Branchtab39[3]; /* 0 or 255 per state, one table per polynomial */
+static int Init = 0; /* nonzero once set_viterbi39_polynomial_mmx() has run */
+
+/* State info for instance of Viterbi decoder */
+struct v39 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  void *dp;          /* Pointer to current decision (used as decision_t *) */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  void *decisions;   /* Beginning of decisions for block (allocated in create, freed in delete) */
+};
+
+/* Initialize Viterbi decoder for start of new frame.
+ * Returns -1 if p is NULL, otherwise 0. */
+int init_viterbi39_mmx(void *p,int starting_state){
+  struct v39 *vp = p;
+  int n = 256;
+  if(vp == NULL)
+    return -1;
+  while(n-- > 0)
+    vp->metrics1.s[n] = 1000; /* same starting metric for every state */
+
+  vp->dp = vp->decisions; /* rewind to the first decision record */
+  vp->new_metrics = &vp->metrics2;
+  vp->old_metrics = &vp->metrics1;
+  vp->old_metrics->s[starting_state & 255] = 0; /* Bias known start state */
+  return 0;
+}
+
+/* Build the three branch tables (0 or 255 per state); a negative polynomial means inverted output, so mask with its magnitude */
+void set_viterbi39_polynomial_mmx(int polys[3]){
+  int state;
+  for(state=0;state < 128;state++){
+    Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255:0;
+    Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255:0;
+    Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2*state) & abs(polys[2])) ? 255:0;
+  }
+  Init++;
+}
+
+/* Create a new instance of a Viterbi decoder with len+8 decision records.
+ * Returns NULL if either allocation fails. */
+void *create_viterbi39_mmx(int len){
+  struct v39 *vp;
+
+  if(Init == 0){ /* no polynomials installed yet: use V39POLYA/B/C */
+    int polys[3] = { V39POLYA,V39POLYB,V39POLYC };
+    set_viterbi39_polynomial_mmx(polys);
+  }
+  vp = (struct v39 *)malloc(sizeof(struct v39));
+  if(vp == NULL || (vp->decisions = malloc((len+8)*sizeof(decision_t))) == NULL){
+    free(vp); /* no-op when vp itself is NULL */
+    return NULL;
+  }
+  init_viterbi39_mmx(vp,0);
+  return vp;
+}
+
+
+
+/* Viterbi chainback: trace the decisions backwards from endstate into data[].
+ * Returns the path metric stored for endstate, or -1 if p is NULL. */
+int chainback_viterbi39_mmx(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v39 *vp = p;
+  decision_t *tail;
+  unsigned int bit;
+  int path_metric;
+
+  if(vp == NULL)
+    return -1;
+
+  endstate &= 255; /* keep the state index inside the 256-entry tables */
+
+  path_metric = vp->old_metrics->s[endstate];
+
+  /* The store into data[] only needs to be done every 8 bits.
+   * But this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  tail = (decision_t *)vp->decisions + 8; /* Look past tail */
+  for(bit = nbits; bit-- != 0; ){
+    unsigned int k = tail[bit].c[endstate] & 1; /* one decision byte per state */
+
+    /* Shift the decision in at the top; the state doubles as the output byte */
+    endstate = (k << 7) | (endstate >> 1);
+    data[bit>>3] = endstate;
+  }
+  return path_metric;
+}
+
+/* Delete instance of a Viterbi decoder; a NULL pointer is ignored */
+void delete_viterbi39_mmx(void *p){
+  struct v39 *decoder = p;
+
+  if(decoder == NULL)
+    return;
+  free(decoder->decisions);
+  free(decoder);
+}
+
+/* Update decoder with a block of demodulated symbols (three consumed per decoded bit); returns -1 if p is NULL, else 65536 per suspected metric wraparound */
+int update_viterbi39_blk_mmx(void *p,unsigned char *syms,int nbits){
+  struct v39 *vp = p;
+  decision_t *d;
+  int path_metric = 0;
+
+  if(p == NULL)
+    return -1;
+
+  d = (decision_t *)vp->dp;
+  
+  while(nbits--){
+    __m64 sym0v,sym1v,sym2v;
+    void *tmp;
+    int i;
+    
+    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+    sym0v = _mm_set1_pi16(syms[0]);
+    sym1v = _mm_set1_pi16(syms[1]);
+    sym2v = _mm_set1_pi16(syms[2]);
+    syms += 3;
+
+    for(i=0;i<32;i++){
+      __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+
+      /* Form branch metrics
+       * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
+       * the XOR operations constitute conditional negation.
+       * metric and m_metric (765-metric) are in the range 0-765
+       */
+      m0 = _mm_add_pi16(_mm_xor_si64(Branchtab39[0].v[i],sym0v),_mm_xor_si64(Branchtab39[1].v[i],sym1v));
+      metric = _mm_add_pi16(_mm_xor_si64(Branchtab39[2].v[i],sym2v),m0);
+      m_metric = _mm_sub_pi16(_mm_set1_pi16(765),metric);
+    
+      /* Add branch metrics to path metrics */
+      m0 = _mm_add_pi16(vp->old_metrics->v[i],metric);
+      m3 = _mm_add_pi16(vp->old_metrics->v[32+i],metric);
+      m1 = _mm_add_pi16(vp->old_metrics->v[32+i],m_metric);
+      m2 = _mm_add_pi16(vp->old_metrics->v[i],m_metric);
+    
+      /* Compare and select
+       * There's no packed min instruction in MMX, so we use modulo arithmetic
+       * to form the decisions and then do the select the hard way
+       */
+      decision0 = _mm_cmpgt_pi16(_mm_sub_pi16(m0,m1),_mm_setzero_si64());
+      decision1 = _mm_cmpgt_pi16(_mm_sub_pi16(m2,m3),_mm_setzero_si64());
+      survivor0 = _mm_or_si64(_mm_and_si64(decision0,m1),_mm_andnot_si64(decision0,m0));
+      survivor1 = _mm_or_si64(_mm_and_si64(decision1,m3),_mm_andnot_si64(decision1,m2));
+ 
+      /* Merge decisions and store as bytes */
+      d->v[i] = _mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64()));
+
+      /* Store surviving metrics */
+      vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1);
+    }
+    if(vp->new_metrics->s[0] < vp->old_metrics->s[0])
+      path_metric += 65536; /* Hack: wraparound probably occurred */
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d;
+  _mm_empty();
+  return path_metric;
+}
diff --git a/viterbi39_port.c b/viterbi39_port.c
new file mode 100644
index 0000000..5685c90
--- /dev/null
+++ b/viterbi39_port.c
@@ -0,0 +1,168 @@
+/* K=9 r=1/3 Viterbi decoder in portable C
+ * Copyright Aug 2006, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include "fec.h"
+
+typedef union { unsigned int w[256]; } metric_t; /* One path metric per encoder state (256 states) */
+typedef union { unsigned long w[8];} decision_t; /* One decision bit per state for one data bit; only bits 0-31 of each word are used (see BFLY and the chainback) */
+
+static union { unsigned char c[128]; } Branchtab39[3]; /* One table per polynomial: 0 or 255 for each of the 128 butterflies */
+static int Init = 0; /* Nonzero once set_viterbi39_polynomial_port() has filled Branchtab39 */
+
+/* Per-instance decoder state: two path-metric buffers plus the decision history for a block */
+struct v39 {
+  metric_t metrics1; /* path metric buffer 1 (old_metrics points here after init) */
+  metric_t metrics2; /* path metric buffer 2 (new_metrics points here after init) */
+  decision_t *dp;          /* Next decision record to be written by the update routine */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions;   /* Start of the malloc'ed decision array; freed by the delete routine */
+};
+
+/* Reset a decoder for a new frame starting in starting_state; returns 0, or -1 if p is NULL */
+int init_viterbi39_port(void *p,int starting_state){
+  struct v39 *dec = p;
+  unsigned int *m;
+
+  if(dec == NULL)
+    return -1;
+  /* Every state starts at 63 except the known start state, which gets 0 */
+  for(m = dec->metrics1.w; m != dec->metrics1.w + 256; m++)
+    *m = 63;
+  dec->metrics1.w[starting_state & 255] = 0;
+  dec->old_metrics = &dec->metrics1;
+  dec->new_metrics = &dec->metrics2;
+  dec->dp = dec->decisions; /* Rewind to the start of the decision buffer */
+  return 0;
+}
+
+void set_viterbi39_polynomial_port(int polys[3]){ /* Fill Branchtab39 from three polynomials; a negative value inverts that symbol */
+  int state,n;
+
+  for(state=0;state < 128;state++)
+    for(n=0;n<3;n++){
+      int bit = (polys[n] < 0) ^ parity((2*state) & abs(polys[n])); /* sign flag XOR parity */
+      Branchtab39[n].c[state] = bit ? 255 : 0;
+    }
+  Init++; /* create_viterbi39_port() loads the defaults only while Init is 0 */
+}
+
+/* Allocate a decoder with room for len+8 decision records; returns NULL if malloc fails */
+void *create_viterbi39_port(int len){
+  struct v39 *dec;
+
+  if(Init == 0){ /* First use: install the default generator polynomials */
+    int defaults[3] = {V39POLYA,V39POLYB,V39POLYC};
+    set_viterbi39_polynomial_port(defaults);
+  }
+  dec = (struct v39 *)malloc(sizeof(struct v39));
+  if(dec == NULL)
+    return NULL;
+  dec->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t));
+  if(dec->decisions == NULL){
+    free(dec);
+    return NULL;
+  }
+  init_viterbi39_port(dec,0); /* Start in state 0 */
+  return dec;
+}
+
+
+/* Trace the survivor path back from endstate into data[]; returns 0, or -1 if p is NULL */
+int chainback_viterbi39_port(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v39 *dec = p;
+  decision_t *hist;
+
+  if(dec == NULL)
+    return -1;
+  hist = dec->decisions + 8; /* Look past tail */
+  endstate &= 255; /* Only 8 state bits are meaningful */
+
+  /* data[] is rewritten on every bit rather than once per byte; the last
+   * write to each byte (made when nbits is a multiple of 8) leaves the 8
+   * decoded bits for that byte.
+   */
+  while(nbits != 0){
+    unsigned int word,shift;
+    int k;
+
+    nbits--;
+    word = endstate/32;
+    shift = endstate%32;
+    k = (hist[nbits].w[word] >> shift) & 1;
+    endstate = (endstate >> 1) | (k << 7);
+    data[nbits>>3] = endstate;
+  }
+  return 0;
+}
+
+
+/* Free a decoder instance and its decision buffer; a NULL p is ignored */
+void delete_viterbi39_port(void *p){
+  struct v39 *dec = p;
+
+  if(dec == NULL)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
+
+/* C-language butterfly: old states i and i+128 -> new states 2i and 2i+1. Uses vp, d, sym0-sym2 from the caller; i is evaluated several times */
+#define BFLY(i) do{\
+unsigned int metric,m0,m1,decision;\
+    metric = (Branchtab39[0].c[(i)] ^ sym0) + (Branchtab39[1].c[(i)] ^ sym1) + \
+     (Branchtab39[2].c[(i)] ^ sym2);\
+    m0 = vp->old_metrics->w[(i)] + metric;\
+    m1 = vp->old_metrics->w[(i)+128] + (765 - metric);\
+    decision = (signed int)(m0-m1) > 0; /* 1 selects the path from state i+128 */\
+    vp->new_metrics->w[2*(i)] = decision ? m1 : m0;\
+    d->w[(i)/16] |= decision << ((2*(i))&31);\
+    m0 -= (metric+metric-765); /* now old[i] + (765 - metric) */\
+    m1 += (metric+metric-765); /* now old[i+128] + metric */\
+    decision = (signed int)(m0-m1) > 0;\
+    vp->new_metrics->w[2*(i)+1] = decision ? m1 : m0;\
+    d->w[(i)/16] |= decision << ((2*(i)+1)&31);\
+}while(0)
+
+/* Update decoder with a block of demodulated symbols.
+ * nbits counts decoded data bits; three symbols are read from syms for each.
+ * Returns 0, or -1 if p is NULL.
+ */
+
+int update_viterbi39_blk_port(void *p,unsigned char *syms,int nbits){
+  struct v39 *vp = p; /* BFLY refers to vp, d and sym0..sym2 by name */
+  decision_t *d;
+
+  if(vp == NULL)
+    return -1;
+
+  for(d = (decision_t *)vp->dp; nbits-- != 0; d++){
+    unsigned char sym0 = syms[0];
+    unsigned char sym1 = syms[1];
+    unsigned char sym2 = syms[2];
+    metric_t *swap;
+    int n;
+
+    syms += 3;
+    /* BFLY ORs decision bits in, so the record must start out clear */
+    for(n=0;n<8;n++)
+      d->w[n] = 0;
+
+    for(n=0;n<128;n++)
+      BFLY(n);
+
+    /* The metrics just written are the input for the next bit */
+    swap = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = swap;
+  }
+  vp->dp = d;
+  return 0;
+}
diff --git a/viterbi39_sse.c b/viterbi39_sse.c
new file mode 100644
index 0000000..c2f2865
--- /dev/null
+++ b/viterbi39_sse.c
@@ -0,0 +1,201 @@
+/* K=9 r=1/3 Viterbi decoder for x86 SSE
+ * Copyright Aug 2006, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <xmmintrin.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <limits.h>
+#include "fec.h"
+
+typedef union { unsigned long w[8]; unsigned char c[32];} decision_t; /* Decisions for one data bit; this file reads and writes the byte view c[], 8 decision bits per byte */
+typedef union { signed short s[256]; __m64 v[64];} metric_t; /* 256 signed 16-bit path metrics, also viewed as 64 vectors of 4 */
+
+static union branchtab39 { unsigned short s[128]; __m64 v[32];} Branchtab39[3]; /* One table per polynomial: 0 or 255 for each of the 128 butterflies */
+static int Init = 0; /* Nonzero once set_viterbi39_polynomial_sse() has filled Branchtab39 */
+
+/* Per-instance decoder state: two path-metric buffers plus the decision history for a block */
+struct v39 {
+  metric_t metrics1; /* path metric buffer 1 (old_metrics points here after init) */
+  metric_t metrics2; /* path metric buffer 2 (new_metrics points here after init) */
+  void *dp;          /* Next decision_t to be written by the update routine */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  void *decisions;   /* Start of the malloc'ed decision_t array; freed by the delete routine */
+};
+
+/* Prepare a decoder for a new frame that begins in starting_state; returns 0, or -1 if p is NULL */
+int init_viterbi39_sse(void *p,int starting_state){
+  struct v39 *dec = p;
+  int state;
+
+  if(dec == NULL)
+    return -1;
+  dec->old_metrics = &dec->metrics1;
+  dec->new_metrics = &dec->metrics2;
+  dec->dp = dec->decisions;
+
+  /* The start state gets the lowest metric; every other state sits 1000 above it */
+  for(state=0;state<256;state++)
+    dec->old_metrics->s[state] = (state == (starting_state & 255)) ? SHRT_MIN : SHRT_MIN+1000;
+  return 0;
+}
+
+/* Allocate a decoder with room for len+8 decision records; returns NULL if malloc fails */
+void *create_viterbi39_sse(int len){
+  struct v39 *dec;
+
+  if(Init == 0){ /* First use: install the default generator polynomials */
+    int defaults[3] = { V39POLYA, V39POLYB, V39POLYC };
+    set_viterbi39_polynomial_sse(defaults);
+  }
+  dec = (struct v39 *)malloc(sizeof(struct v39));
+  if(dec == NULL)
+    return NULL;
+  dec->decisions = malloc((len+8)*sizeof(decision_t));
+  if(dec->decisions == NULL){
+    free(dec);
+    return NULL;
+  }
+  init_viterbi39_sse(dec,0); /* Start in state 0 */
+  return dec;
+}
+
+void set_viterbi39_polynomial_sse(int polys[3]){ /* Fill Branchtab39; a negative polynomial inverts that symbol */
+  int state;
+  /* Mask with abs(): the sign is only the inversion flag, so its two's-complement bits must not reach parity() */
+  for(state=0;state < 128;state++){
+    Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255:0;
+    Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255:0;
+    Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2*state) & abs(polys[2])) ? 255:0;
+  }
+  Init++; /* create_viterbi39_sse() loads the defaults only while Init is 0 */
+}
+
+/* Trace the survivor path back from endstate into data[]; returns that state's metric minus SHRT_MIN, or -1 if p is NULL */
+int chainback_viterbi39_sse(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v39 *dec = p;
+  decision_t *hist;
+  int end_metric;
+
+  if(dec == NULL)
+    return -1;
+
+  endstate &= 255; /* Only 8 state bits are meaningful */
+  end_metric = dec->old_metrics->s[endstate] - SHRT_MIN;
+  hist = (decision_t *)dec->decisions + 8; /* Look past tail */
+
+  /* data[] is rewritten on every bit rather than once per byte; the last
+   * write to each byte (made when nbits is a multiple of 8) leaves the 8
+   * decoded bits for that byte.
+   */
+  while(nbits != 0){
+    unsigned int bit;
+
+    nbits--;
+    /* Decisions are stored 8 per byte by the update routine */
+    bit = (hist[nbits].c[endstate >> 3] >> (endstate & 7)) & 1;
+    endstate = (bit << 7) | (endstate >> 1);
+    data[nbits>>3] = endstate;
+  }
+  return end_metric;
+}
+
+/* Release the decision buffer and the decoder itself; does nothing when p is NULL */
+void delete_viterbi39_sse(void *p){
+  struct v39 *dec = p;
+
+  if(!dec)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
+
+
+int update_viterbi39_blk_sse(void *p,unsigned char *syms,int nbits){ /* Process nbits data bits (3 symbols each); returns the summed renormalization adjustments, or -1 if p is NULL */
+  struct v39 *vp = p;
+  decision_t *d;
+  int path_metric = 0; /* Accumulates every renormalization adjustment made below */
+
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->dp;
+  while(nbits--){
+    __m64 sym0v,sym1v,sym2v;
+    void *tmp;
+    int i;
+
+    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+    sym0v = _mm_set1_pi16(syms[0]);
+    sym1v = _mm_set1_pi16(syms[1]);
+    sym2v = _mm_set1_pi16(syms[2]);
+    syms += 3;
+
+    for(i=0;i<32;i++){ /* 32 iterations x 4 lanes = 128 butterflies */
+      __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+
+      /* Form branch metrics
+       * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
+       * the XOR operations constitute conditional negation.
+       * metric and m_metric (765-metric) are in the range 0-765
+       */
+      m0 = _mm_add_pi16(_mm_xor_si64(Branchtab39[0].v[i],sym0v),_mm_xor_si64(Branchtab39[1].v[i],sym1v));
+      metric = _mm_add_pi16(_mm_xor_si64(Branchtab39[2].v[i],sym2v),m0);
+      m_metric = _mm_sub_pi16(_mm_set1_pi16(765),metric);
+    
+      /* Add branch metrics to path metrics (saturating signed adds) */
+      m0 = _mm_adds_pi16(vp->old_metrics->v[i],metric);
+      m3 = _mm_adds_pi16(vp->old_metrics->v[32+i],metric);
+      m1 = _mm_adds_pi16(vp->old_metrics->v[32+i],m_metric);
+      m2 = _mm_adds_pi16(vp->old_metrics->v[i],m_metric);
+    
+      /* Compare and select: a decision lane is all ones when the path through v[32+i] (m1 or m3) survives */
+      survivor0 = _mm_min_pi16(m0,m1);
+      survivor1 = _mm_min_pi16(m2,m3);
+      decision0 = _mm_cmpeq_pi16(survivor0,m1);
+      decision1 = _mm_cmpeq_pi16(survivor1,m3);
+ 
+      /* Pack decisions into 8 bits and store */
+      d->c[i] = _mm_movemask_pi8(_mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64())));
+
+      /* Store surviving metrics, interleaving survivor0 and survivor1 lanes */
+      vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1);
+    }
+    /* See if we need to renormalize
+     * Max metric spread for this code with 0-255 branch metrics is 12750
+     */
+    if(vp->new_metrics->s[0] >= SHRT_MAX-5000){ /* Only metric 0 is sampled to decide */
+      int i,adjust;
+      __m64 adjustv;
+      union { __m64 v; signed short w[4]; } t;
+
+      /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */
+      adjustv = vp->new_metrics->v[0];
+      for(i=1;i<64;i++)
+	adjustv = _mm_min_pi16(adjustv,vp->new_metrics->v[i]);
+
+      adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,32)); /* Fold 4 lanes to 2 */
+      adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,16)); /* Fold 2 lanes to 1; the minimum ends up in lane 0 */
+      t.v = adjustv;
+      adjust = t.w[0] - SHRT_MIN;
+      path_metric += adjust;
+      adjustv = _mm_set1_pi16(adjust);
+      
+      for(i=0;i<64;i++)
+	vp->new_metrics->v[i] = _mm_sub_pi16(vp->new_metrics->v[i],adjustv);
+    }
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d;
+  _mm_empty(); /* Called once all __m64 work is finished, before returning */
+  return path_metric;
+}
diff --git a/viterbi39_sse2.c b/viterbi39_sse2.c
new file mode 100644
index 0000000..f13794e
--- /dev/null
+++ b/viterbi39_sse2.c
@@ -0,0 +1,200 @@
+/* K=9 r=1/3 Viterbi decoder for x86 SSE2
+ * Copyright Mar 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <emmintrin.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <limits.h>
+#include "fec.h"
+
+typedef union { unsigned long w[8]; unsigned short s[16];} decision_t; /* Decisions for one data bit: written through s[], read back through w[] with 32 bits per word in the chainback. NOTE(review): that pairing assumes a 32-bit unsigned long - confirm */
+typedef union { signed short s[256]; __m128i v[32];} metric_t; /* 256 signed 16-bit path metrics, also viewed as 32 vectors of 8 */
+
+static union branchtab39 { unsigned short s[128]; __m128i v[16];} Branchtab39[3]; /* One table per polynomial: 0 or 255 for each of the 128 butterflies */
+static int Init = 0; /* Nonzero once set_viterbi39_polynomial_sse2() has filled Branchtab39 */
+
+/* Per-instance decoder state; create_viterbi39_sse2() allocates it with posix_memalign at __m128i alignment */
+struct v39 {
+  metric_t metrics1; /* path metric buffer 1 (old_metrics points here after init) */
+  metric_t metrics2; /* path metric buffer 2 (new_metrics points here after init) */
+  void *dp;          /* Next decision_t to be written by the update routine */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  void *decisions;   /* Start of the malloc'ed decision_t array; freed by the delete routine */
+};
+
+/* Initialize Viterbi decoder for start of new frame; returns 0, or -1 if p is NULL */
+int init_viterbi39_sse2(void *p,int starting_state){
+  struct v39 *vp = p;
+  int i;
+  if(p == NULL) /* Same guard as the port and SSE versions */
+    return -1;
+  for(i=0;i<256;i++)
+    vp->metrics1.s[i] = (SHRT_MIN+1000);
+  vp->old_metrics = &vp->metrics1;
+  vp->new_metrics = &vp->metrics2;
+  vp->dp = vp->decisions;
+  vp->old_metrics->s[starting_state & 255] = SHRT_MIN; /* Bias known start state */
+  return 0;
+}
+
+/* Allocate a 16-byte-aligned decoder with room for len+8 decision records; NULL on allocation failure */
+void *create_viterbi39_sse2(int len){
+  void *raw;
+  struct v39 *dec;
+
+  if(Init == 0){ /* First use: install the default generator polynomials */
+    int defaults[3] = { V39POLYA, V39POLYB, V39POLYC };
+
+    set_viterbi39_polynomial_sse2(defaults);
+  }
+  /* The __m128i metric buffers need 16-byte alignment, which plain malloc() may not provide */
+  if(posix_memalign(&raw, sizeof(__m128i),sizeof(struct v39)) != 0)
+    return NULL;
+  dec = (struct v39 *)raw;
+
+  dec->decisions = malloc((len+8)*sizeof(decision_t));
+  if(dec->decisions == NULL){
+    free(dec);
+    return NULL;
+  }
+  init_viterbi39_sse2(dec,0); /* Start in state 0 */
+  return dec;
+}
+
+void set_viterbi39_polynomial_sse2(int polys[3]){ /* Fill Branchtab39; a negative polynomial inverts that symbol */
+  int state;
+  /* Mask with abs(): the sign is only the inversion flag, so its two's-complement bits must not reach parity() */
+  for(state=0;state < 128;state++){
+    Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255:0;
+    Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255:0;
+    Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2*state) & abs(polys[2])) ? 255:0;
+  }
+  Init++; /* create_viterbi39_sse2() loads the defaults only while Init is 0 */
+}
+
+/* Viterbi chainback; returns the stored path metric of endstate, or -1 if p is NULL */
+int chainback_viterbi39_sse2(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v39 *vp = p;
+  decision_t *d;
+  int path_metric;
+
+  if(p == NULL) /* Same guard as the port and SSE chainbacks */
+    return -1;
+  endstate %= 256;
+  path_metric = vp->old_metrics->s[endstate];
+
+  /* The store into data[] only needs to be done every 8 bits.
+   * But this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  d = (decision_t *)vp->decisions + 8; /* Look past tail */
+  while(nbits-- != 0){
+    int k;
+    k = (d[nbits].w[endstate/32] >> (endstate%32)) & 1;
+    endstate = (k << 7) | (endstate >> 1);
+    data[nbits>>3] = endstate;
+  }
+  return path_metric; /* NOTE(review): the SSE version returns path_metric - SHRT_MIN; confirm which convention callers expect */
+}
+
+/* Free the decision buffer and then the decoder; a NULL p is ignored */
+void delete_viterbi39_sse2(void *p){
+  struct v39 *dec = p;
+
+  if(dec == NULL)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
+
+
+int update_viterbi39_blk_sse2(void *p,unsigned char *syms,int nbits){ /* Process nbits data bits (3 symbols each); returns the summed renormalization adjustments, or -1 if p is NULL */
+  struct v39 *vp = p;
+  decision_t *d;
+  int path_metric = 0;
+
+  if(p == NULL) /* Same guard as the port and SSE versions */
+    return -1;
+  d = (decision_t *)vp->dp;
+  while(nbits--){
+    __m128i sym0v,sym1v,sym2v;
+    void *tmp;
+    int i;
+
+    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+    sym0v = _mm_set1_epi16(syms[0]);
+    sym1v = _mm_set1_epi16(syms[1]);
+    sym2v = _mm_set1_epi16(syms[2]);
+    syms += 3;
+
+    /* SSE2 doesn't support saturated adds on unsigned shorts, so we have to use signed shorts */
+    for(i=0;i<16;i++){
+      __m128i decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+
+      /* Form branch metrics
+       * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
+       * the XOR operations constitute conditional negation.
+       * metric and m_metric (765-metric) are in the range 0-765
+       */
+      m0 = _mm_add_epi16(_mm_xor_si128(Branchtab39[0].v[i],sym0v),_mm_xor_si128(Branchtab39[1].v[i],sym1v));
+      metric = _mm_add_epi16(_mm_xor_si128(Branchtab39[2].v[i],sym2v),m0);
+      m_metric = _mm_sub_epi16(_mm_set1_epi16(765),metric);
+    
+      /* Add branch metrics to path metrics */
+      m0 = _mm_adds_epi16(vp->old_metrics->v[i],metric);
+      m3 = _mm_adds_epi16(vp->old_metrics->v[16+i],metric);
+      m1 = _mm_adds_epi16(vp->old_metrics->v[16+i],m_metric);
+      m2 = _mm_adds_epi16(vp->old_metrics->v[i],m_metric);
+    
+      /* Compare and select */
+      survivor0 = _mm_min_epi16(m0,m1);
+      survivor1 = _mm_min_epi16(m2,m3);
+      decision0 = _mm_cmpeq_epi16(survivor0,m1);
+      decision1 = _mm_cmpeq_epi16(survivor1,m3);
+ 
+      /* Pack each set of decisions into 8 8-bit bytes, then interleave them and compress into 16 bits */
+      d->s[i] = _mm_movemask_epi8(_mm_unpacklo_epi8(_mm_packs_epi16(decision0,_mm_setzero_si128()),_mm_packs_epi16(decision1,_mm_setzero_si128())));
+
+      /* Store surviving metrics */
+      vp->new_metrics->v[2*i] = _mm_unpacklo_epi16(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = _mm_unpackhi_epi16(survivor0,survivor1);
+    }
+    /* See if we need to renormalize */
+    if(vp->new_metrics->s[0] >= SHRT_MAX-5000){
+      int i,adjust;
+      __m128i adjustv;
+      union { __m128i v; signed short w[8]; } t;
+      /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */
+      adjustv = vp->new_metrics->v[0];
+      for(i=1;i<32;i++)
+	adjustv = _mm_min_epi16(adjustv,vp->new_metrics->v[i]);
+      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,8));
+      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,4));
+      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,2));
+      t.v = adjustv;
+      adjust = t.w[0] - SHRT_MIN;
+      path_metric += adjust;
+      adjustv = _mm_set1_epi16(adjust);
+      /* We cannot use a saturated subtract, because we often have to adjust by more than SHRT_MAX
+       * This is okay since it can't overflow anyway
+       */
+      for(i=0;i<32;i++)
+	vp->new_metrics->v[i] = _mm_sub_epi16(vp->new_metrics->v[i],adjustv);
+    }
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d;
+  return path_metric;
+}
+
+
diff --git a/viterbi615.c b/viterbi615.c
new file mode 100644
index 0000000..6dda51f
--- /dev/null
+++ b/viterbi615.c
@@ -0,0 +1,155 @@
+/* K=15 r=1/6 Viterbi decoder with optional Intel or PowerPC SIMD
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include "fec.h"
+
+/* Call find_cpu_mode(), then create a decoder with the implementation matching Cpu_mode */
+void *create_viterbi615(int len){
+
+  find_cpu_mode();
+
+  switch(Cpu_mode){
+#if defined(__i386__)
+  case SSE2:
+    return create_viterbi615_sse2(len);
+  case SSE:
+    return create_viterbi615_sse(len);
+  case MMX:
+    return create_viterbi615_mmx(len);
+#endif
+#if defined(__VEC__)
+  case ALTIVEC:
+    return create_viterbi615_av(len);
+#endif
+  case PORT:
+  default: /* Portable C handles every mode not compiled in above */
+    return create_viterbi615_port(len);
+  }
+}
+
+void set_viterbi615_polynomial(int polys[6]){ /* Forward the six polynomials to the implementation matching Cpu_mode */
+  find_cpu_mode(); /* As in create_viterbi615(): without it, a call made before the first create would dispatch on an unset Cpu_mode */
+  switch(Cpu_mode){
+  case PORT:
+  default:
+    set_viterbi615_polynomial_port(polys);
+    break;
+#ifdef __VEC__
+  case ALTIVEC:
+    set_viterbi615_polynomial_av(polys);
+    break;
+#endif
+#ifdef __i386__
+  case MMX:
+    set_viterbi615_polynomial_mmx(polys);
+    break;
+  case SSE:
+    set_viterbi615_polynomial_sse(polys);
+    break;
+  case SSE2:
+    set_viterbi615_polynomial_sse2(polys);
+    break;
+#endif
+  }
+}
+
+/* Reset decoder p for a new frame via the implementation matching Cpu_mode; returns that routine's result */
+int init_viterbi615(void *p,int starting_state){
+  switch(Cpu_mode){
+#if defined(__i386__)
+  case SSE2:
+    return init_viterbi615_sse2(p,starting_state);
+  case SSE:
+    return init_viterbi615_sse(p,starting_state);
+  case MMX:
+    return init_viterbi615_mmx(p,starting_state);
+#endif
+#if defined(__VEC__)
+  case ALTIVEC:
+    return init_viterbi615_av(p,starting_state);
+#endif
+  case PORT:
+  default: /* Portable C handles every mode not compiled in above */
+    return init_viterbi615_port(p,starting_state);
+  }
+}
+
+/* Run the chainback of the implementation matching Cpu_mode; returns that routine's result */
+int chainback_viterbi615(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+
+  switch(Cpu_mode){
+#if defined(__i386__)
+  case SSE2:
+    return chainback_viterbi615_sse2(p,data,nbits,endstate);
+  case SSE:
+    return chainback_viterbi615_sse(p,data,nbits,endstate);
+  case MMX:
+    return chainback_viterbi615_mmx(p,data,nbits,endstate);
+#endif
+#if defined(__VEC__)
+  case ALTIVEC:
+    return chainback_viterbi615_av(p,data,nbits,endstate);
+#endif
+  case PORT:
+  default: /* Portable C handles every mode not compiled in above */
+    return chainback_viterbi615_port(p,data,nbits,endstate);
+  }
+}
+
+/* Hand decoder p to the delete routine of the implementation matching Cpu_mode */
+void delete_viterbi615(void *p){
+  switch(Cpu_mode){
+#if defined(__i386__)
+  case SSE2:
+    delete_viterbi615_sse2(p);
+    return;
+  case SSE:
+    delete_viterbi615_sse(p);
+    return;
+  case MMX:
+    delete_viterbi615_mmx(p);
+    return;
+#endif
+#if defined(__VEC__)
+  case ALTIVEC:
+    delete_viterbi615_av(p);
+    return;
+#endif
+  case PORT:
+  default: /* Portable C handles every mode not compiled in above */
+    delete_viterbi615_port(p);
+    return;
+  }
+}
+
+/* Feed a block of demodulated symbols to the implementation matching
+ * Cpu_mode. nbits is the number of decoded data bits, not the number
+ * of symbols. Returns whatever the selected routine returns.
+ */
+int update_viterbi615_blk(void *p,unsigned char syms[],int nbits){
+  switch(Cpu_mode){
+#if defined(__i386__)
+  case SSE2:
+    return update_viterbi615_blk_sse2(p,syms,nbits);
+  case SSE:
+    return update_viterbi615_blk_sse(p,syms,nbits);
+  case MMX:
+    return update_viterbi615_blk_mmx(p,syms,nbits);
+#endif
+#if defined(__VEC__)
+  case ALTIVEC:
+    return update_viterbi615_blk_av(p,syms,nbits);
+#endif
+  case PORT:
+  default: /* Portable C handles every mode not compiled in above */
+    return update_viterbi615_blk_port(p,syms,nbits);
+  }
+}
+
diff --git a/viterbi615_av.c b/viterbi615_av.c
new file mode 100644
index 0000000..4a6ce9c
--- /dev/null
+++ b/viterbi615_av.c
@@ -0,0 +1,257 @@
+/* K=15 r=1/6 Viterbi decoder for PowerPC G4/G5 Altivec vector instructions
+ * 8-bit offset-binary soft decision samples
+ * Copyright Mar 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <limits.h>
+#include "fec.h"
+
+typedef union { unsigned char c[128][16]; vector unsigned char v[128]; } decision_t; /* Decisions for one data bit: 128 vectors of 16 bytes, stored in the interleaved order that the chainback undoes */
+typedef union { unsigned short s[16384]; vector unsigned short v[2048]; } metric_t; /* 16384 unsigned 16-bit path metrics, also viewed as 2048 vectors of 8 */
+
+static union branchtab615 { unsigned short s[8192]; vector unsigned short v[1024];} Branchtab615[6]; /* One table per polynomial: 0 or 255 for each of the 8192 butterflies */
+static int Init = 0; /* Nonzero once set_viterbi615_polynomial_av() has filled Branchtab615 */
+
+/* Per-instance decoder state: two path-metric buffers plus the decision history for a block */
+struct v615 {
+  metric_t metrics1; /* path metric buffer 1 (old_metrics points here after init) */
+  metric_t metrics2; /* path metric buffer 2 (new_metrics points here after init) */
+  void *dp;          /* Next decision_t to be written by the update routine */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  void *decisions;   /* Start of the malloc'ed decision_t array; freed by the delete routine */
+};
+
+/* Reset a decoder for a new frame starting in starting_state; returns 0, or -1 if p is NULL */
+int init_viterbi615_av(void *p,int starting_state){
+  struct v615 *dec = p;
+  int n;
+
+  if(dec == NULL)
+    return -1;
+
+  dec->old_metrics = &dec->metrics1;
+  dec->new_metrics = &dec->metrics2;
+  dec->dp = dec->decisions; /* Rewind to the start of the decision buffer */
+
+  for(n=0;n<2048;n++)
+    dec->metrics1.v[n] = (vector unsigned short)(5000);
+  dec->metrics1.s[starting_state & 16383] = 0; /* The known start state gets metric 0, all others 5000 */
+  return 0;
+}
+
+/* Create a new instance of a Viterbi decoder with room for len+14 decision records; returns NULL if malloc fails */
+void *create_viterbi615_av(int len){
+  struct v615 *vp = (struct v615 *)malloc(sizeof(struct v615));
+  int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF };
+  if(!Init) /* First use: install the default generator polynomials */
+    set_viterbi615_polynomial_av(polys);
+  if(vp != NULL && (vp->decisions = malloc(sizeof(decision_t)*(len+14))) == NULL){
+    free(vp); /* Decision buffer could not be allocated: release the instance too */
+    vp = NULL;
+  }
+  init_viterbi615_av(vp,0); /* Returns -1 without touching anything when vp is NULL */
+  return vp;
+}
+
+void set_viterbi615_polynomial_av(int polys[6]){ /* Fill Branchtab615 from six polynomials; a negative value inverts that symbol */
+  int st,n;
+
+  for(st=0;st < 8192;st++)
+    for(n=0;n<6;n++){
+      int bit = (polys[n] < 0) ^ parity((2*st) & abs(polys[n])); /* sign flag XOR parity */
+      Branchtab615[n].s[st] = bit ? 255 : 0;
+    }
+  Init++; /* create_viterbi615_av() loads the defaults only while Init is 0 */
+}
+
+
+/* Viterbi chainback; returns the stored path metric of endstate, or -1 if p is NULL */
+int chainback_viterbi615_av(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v615 *vp = p;
+  decision_t *d;
+  int path_metric;
+
+  if(p == NULL) /* Same guard as init_viterbi615_av() */
+    return -1;
+  endstate %= 16384;
+  path_metric = vp->old_metrics->s[endstate];
+
+  /* The store into data[] only needs to be done every 8 bits.
+   * But this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  d = (decision_t *)vp->decisions + 14; /* Look past tail */
+  while(nbits-- != 0){
+    int k;
+    k = (d[nbits].c[endstate >> 7][endstate & 15] & (0x80 >> ((endstate>>4)&7)) ) ? 1 : 0; /* Undo the interleaved packing done by the update routine */
+    endstate = (k << 13) | (endstate >> 1);
+    data[nbits>>3] = endstate >> 6; /* Top 8 of the 14 state bits */
+  }
+  return path_metric;
+}
+
+/* Free a decoder instance and its decision buffer; a NULL p is ignored */
+void delete_viterbi615_av(void *p){
+  struct v615 *dec = p;
+
+  if(dec == NULL)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
+
+int update_viterbi615_blk_av(void *p,unsigned char *syms,int nbits){ /* Process nbits data bits (6 symbols each); returns the summed renormalization adjustments, or -1 if p is NULL */
+  struct v615 *vp = p;
+  decision_t *d;
+  int path_metric = 0;
+  vector unsigned char decisions = (vector unsigned char)(0);
+
+  if(p == NULL) /* Same guard as init_viterbi615_av() */
+    return -1;
+  d = (decision_t *)vp->dp;
+  while(nbits--){
+    vector unsigned short symv,sym0v,sym1v,sym2v,sym3v,sym4v,sym5v;
+    vector unsigned char s;
+    void *tmp;
+    int i;
+    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+    s = (vector unsigned char)vec_perm(vec_ld(0,syms),vec_ld(5,syms),vec_lvsl(0,syms));
+    symv = (vector unsigned short)vec_mergeh((vector unsigned char)(0),s);    /* Unsigned byte->word unpack */ 
+    sym0v = vec_splat(symv,0);
+    sym1v = vec_splat(symv,1);
+    sym2v = vec_splat(symv,2);
+    sym3v = vec_splat(symv,3);
+    sym4v = vec_splat(symv,4);
+    sym5v = vec_splat(symv,5);
+    syms += 6;
+    for(i=0;i<1024;i++){
+      vector bool short decision0,decision1;
+      vector unsigned short metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+
+      /* Form branch metrics
+       * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
+       * the XOR operations constitute conditional negation.
+       * metric and m_metric (1530-metric) are in the range 0-1530
+       */
+      m0 = vec_add(vec_xor(Branchtab615[0].v[i],sym0v),vec_xor(Branchtab615[1].v[i],sym1v));
+      m1 = vec_add(vec_xor(Branchtab615[2].v[i],sym2v),vec_xor(Branchtab615[3].v[i],sym3v));
+      m2 = vec_add(vec_xor(Branchtab615[4].v[i],sym4v),vec_xor(Branchtab615[5].v[i],sym5v));
+      metric = vec_add(m0,m1);
+      metric = vec_add(metric,m2);
+      m_metric = vec_sub((vector unsigned short)(1530),metric);
+    
+      /* Add branch metrics to path metrics */
+      m0 = vec_adds(vp->old_metrics->v[i],metric);
+      m3 = vec_adds(vp->old_metrics->v[1024+i],metric);
+      m1 = vec_adds(vp->old_metrics->v[1024+i],m_metric);
+      m2 = vec_adds(vp->old_metrics->v[i],m_metric);
+    
+      /* Compare and select */
+      decision0 = vec_cmpgt(m0,m1);
+      decision1 = vec_cmpgt(m2,m3);
+      survivor0 = vec_min(m0,m1);
+      survivor1 = vec_min(m2,m3);
+    
+      /* Store decisions and survivors.
+       * To save space without SSE2's handy PMOVMSKB instruction, we pack and store them in
+       * a funny interleaved fashion that we undo in the chainback function.
+       */
+      decisions = vec_add(decisions,decisions); /* Shift each byte 1 bit to the left */
+
+      /* Booleans are either 0xff or 0x00. Subtracting 0x00 leaves the lsb zero; subtracting
+       * 0xff is equivalent to adding 1, which sets the lsb.
+       */
+      decisions = vec_sub(decisions,(vector unsigned char)vec_pack(vec_mergeh(decision0,decision1),vec_mergel(decision0,decision1)));
+
+      vp->new_metrics->v[2*i] = vec_mergeh(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = vec_mergel(survivor0,survivor1);
+
+      if((i % 8) == 7){
+	/* We've accumulated a total of 128 decisions, stash and start again */
+	d->v[i>>3] = decisions; /* No need to clear, the new bits will replace the old */
+      }
+    }
+#if 0
+    /* Experimentally determine metric spread
+     * The results are fixed for a given code and input symbol size
+     */
+    {
+      int i;
+      vector unsigned short min_metric;
+      vector unsigned short max_metric;
+      union { vector unsigned short v; unsigned short s[8];} t;
+      int minimum,maximum;
+      static int max_spread = 0;
+
+      min_metric = max_metric = vp->new_metrics->v[0];
+      for(i=1;i<2048;i++){
+	min_metric = vec_min(min_metric,vp->new_metrics->v[i]);
+	max_metric = vec_max(max_metric,vp->new_metrics->v[i]);
+      }
+      min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,8));
+      max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,8));
+      min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,4));
+      max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,4));
+      min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,2));
+      max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,2));
+
+      t.v = min_metric;
+      minimum = t.s[0];
+      t.v = max_metric;
+      maximum = t.s[0];
+      if(maximum-minimum > max_spread){
+	max_spread = maximum-minimum;
+	printf("metric spread = %d\n",max_spread);
+      }
+    }
+#endif
+
+    /* Renormalize if necessary. This deserves some explanation.
+
+     * The maximum possible spread, found by experiment, for 4-bit symbols is 405; for 8 bit symbols, it's 12750.
+     * So by looking at one arbitrary metric we can tell if any of them have possibly saturated.
+     * However, this is very conservative. Large spreads occur only at very high Eb/No, where
+     * saturating a bad path metric doesn't do much to increase its chances of being erroneously chosen as a survivor.
+
+     * At more interesting (low) Eb/No ratios, the spreads are much smaller so our chances of saturating a metric
+     * by not normalizing when we should are extremely low. So either way, the risk to performance is small.
+
+     * All this is borne out by experiment.
+     */
+    if(vp->new_metrics->s[0] >= USHRT_MAX-12750){
+      vector unsigned short scale;
+      union { vector unsigned short v; unsigned short s[8];} t;
+      
+      /* Find smallest metric and splat */
+      scale = vp->new_metrics->v[0];
+      for(i=1;i<2048;i++)
+	scale = vec_min(scale,vp->new_metrics->v[i]);
+
+      scale = vec_min(scale,vec_sld(scale,scale,8));
+      scale = vec_min(scale,vec_sld(scale,scale,4));
+      scale = vec_min(scale,vec_sld(scale,scale,2));
+
+      /* Subtract it from all metrics
+       * Work backwards to try to improve the cache hit ratio, assuming LRU
+       */
+      for(i=2047;i>=0;i--)
+	vp->new_metrics->v[i] = vec_subs(vp->new_metrics->v[i],scale);
+      t.v = scale;
+      path_metric += t.s[0];
+    }
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d;
+  return path_metric;
+}
diff --git a/viterbi615_mmx.c b/viterbi615_mmx.c
new file mode 100644
index 0000000..89a56f7
--- /dev/null
+++ b/viterbi615_mmx.c
@@ -0,0 +1,183 @@
+/* K=15 r=1/6 Viterbi decoder for x86 MMX
+ * Mar 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <mmintrin.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include "fec.h"
+
+typedef union { unsigned char c[16384]; __m64 v[2048];} decision_t;
+typedef union { unsigned short s[16384]; __m64 v[4096];} metric_t;
+
+static union branchtab615 { unsigned short s[8192]; __m64 v[2048];} Branchtab615[6];
+static int Init = 0;
+
+/* State info for instance of Viterbi decoder */
+struct v615 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  void *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  void *decisions;   /* Beginning of decisions for block */
+};
+
+/* Reset a decoder instance for the start of a new frame; returns 0, or -1 if p is NULL */
+int init_viterbi615_mmx(void *p,int starting_state){
+  struct v615 *dec = p;
+  int state;
+
+  if(dec == NULL)
+    return -1;
+  /* Rewind the decision pointer and make metrics1 the "old" buffer */
+  dec->dp = dec->decisions;
+  dec->old_metrics = &dec->metrics1;
+  dec->new_metrics = &dec->metrics2;
+  for(state=0;state<16384;state++)
+    dec->old_metrics->s[state] = 5000;
+  dec->old_metrics->s[starting_state & 16383] = 0; /* Bias known start state */
+  return 0;
+}
+
+/* Allocate a decoder with len+14 decision records and initialize it for state 0; NULL if allocation fails */
+void *create_viterbi615_mmx(int len){
+  struct v615 *dec;
+  if(Init == 0){ /* Branch table not built yet: load the default polynomials */
+    int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF };
+    set_viterbi615_polynomial_mmx(polys);
+  }
+  dec = (struct v615 *)malloc(sizeof(struct v615));
+  if(dec == NULL)
+    return NULL;
+  dec->decisions = malloc((len+14)*sizeof(decision_t));
+  if(dec->decisions == NULL){
+    free(dec);
+    return NULL;
+  }
+  init_viterbi615_mmx(dec,0);
+  return dec;
+}
+
+void set_viterbi615_polynomial_mmx(int polys[6]){
+  int state,sym;
+  /* Each entry is 255 or 0 according to parity() of the tapped state bits, inverted for a negative polynomial */
+  for(sym=0;sym<6;sym++){
+    int invert = polys[sym] < 0;
+    for(state=0;state < 8192;state++)
+      Branchtab615[sym].s[state] = (invert ^ parity((2*state) & abs(polys[sym]))) ? 255 : 0;
+  }
+  Init++; /* create_viterbi615_mmx() skips the default setup once this is nonzero */
+}
+
+/* Viterbi chainback: trace the stored decisions back from endstate to recover nbits data bits */
+int chainback_viterbi615_mmx(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v615 *dec = p;
+  decision_t *past_tail;
+  unsigned int bit;
+
+  if(dec == NULL)
+    return -1;
+
+  /* Look past tail: the walk indexes decision records starting 14 entries in */
+  past_tail = (decision_t *)dec->decisions + 14;
+  endstate &= 16383;
+
+  /* data[] is stored on every bit although once per 8 bits would do;
+   * this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  for(bit = nbits;bit-- != 0;){
+    /* Each decision record holds one byte per state; bit 0 is the decision */
+    unsigned int k = past_tail[bit].c[endstate] & 1;
+
+    endstate = (k << 13) | (endstate >> 1);
+    data[bit>>3] = endstate >> 6;
+  }
+  return 0;
+}
+
+/* Release a decoder instance together with its decision buffer; a NULL p is ignored */
+void delete_viterbi615_mmx(void *p){
+  struct v615 *dec = p;
+
+  if(dec == NULL)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
+
+
+int update_viterbi615_blk_mmx(void *p,unsigned char *syms,int nbits){
+  struct v615 *vp = p;
+  decision_t *d;
+
+  if(p == NULL)
+    return -1;
+
+  d = (decision_t *)vp->dp;
+  
+  while(nbits--){
+    __m64 sym0v,sym1v,sym2v,sym3v,sym4v,sym5v;
+    void *tmp;
+    int i;
+    
+    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+    sym0v = _mm_set1_pi16(syms[0]);
+    sym1v = _mm_set1_pi16(syms[1]);
+    sym2v = _mm_set1_pi16(syms[2]);
+    sym3v = _mm_set1_pi16(syms[3]);
+    sym4v = _mm_set1_pi16(syms[4]);
+    sym5v = _mm_set1_pi16(syms[5]);
+    syms += 6;
+
+    for(i=0;i<2048;i++){
+      __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+
+      /* Form branch metrics
+       * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
+       * the XOR operations constitute conditional negation.
+       * metric and m_metric (-metric) are in the range 0-1530
+       */
+      m0 = _mm_add_pi16(_mm_xor_si64(Branchtab615[0].v[i],sym0v),_mm_xor_si64(Branchtab615[1].v[i],sym1v));
+      m1 = _mm_add_pi16(_mm_xor_si64(Branchtab615[2].v[i],sym2v),_mm_xor_si64(Branchtab615[3].v[i],sym3v));
+      m2 = _mm_add_pi16(_mm_xor_si64(Branchtab615[4].v[i],sym4v),_mm_xor_si64(Branchtab615[5].v[i],sym5v));
+      metric = _mm_add_pi16(m0,_mm_add_pi16(m1,m2));
+      m_metric = _mm_sub_pi16(_mm_set1_pi16(1530),metric);
+    
+      /* Add branch metrics to path metrics */
+      m0 = _mm_add_pi16(vp->old_metrics->v[i],metric);
+      m3 = _mm_add_pi16(vp->old_metrics->v[2048+i],metric);
+      m1 = _mm_add_pi16(vp->old_metrics->v[2048+i],m_metric);
+      m2 = _mm_add_pi16(vp->old_metrics->v[i],m_metric);
+    
+      /* Compare and select
+       * There's no packed min instruction in MMX, so we use modulo arithmetic
+       * to form the decisions and then do the select the hard way
+       */
+      decision0 = _mm_cmpgt_pi16(_mm_sub_pi16(m0,m1),_mm_setzero_si64());
+      decision1 = _mm_cmpgt_pi16(_mm_sub_pi16(m2,m3),_mm_setzero_si64());
+      survivor0 = _mm_or_si64(_mm_and_si64(decision0,m1),_mm_andnot_si64(decision0,m0));
+      survivor1 = _mm_or_si64(_mm_and_si64(decision1,m3),_mm_andnot_si64(decision1,m2));
+ 
+      /* Merge decisions and store as bytes */
+      d->v[i] = _mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64()));
+
+      /* Store surviving metrics */
+      vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1);
+    }
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d;
+  _mm_empty();
+  return 0;
+}
diff --git a/viterbi615_port.c b/viterbi615_port.c
new file mode 100644
index 0000000..89bdd80
--- /dev/null
+++ b/viterbi615_port.c
@@ -0,0 +1,156 @@
+/* K=15 r=1/6 Viterbi decoder in portable C
+ * Copyright Mar 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <limits.h>
+#include "fec.h"
+
+typedef union { unsigned long w[512]; unsigned char c[2048];} decision_t;
+typedef union { unsigned long w[16384]; } metric_t;
+
+static union branchtab615 { unsigned long w[8192]; } Branchtab615[6] __attribute__ ((aligned(16)));
+static int Init = 0;
+
+/* State info for instance of Viterbi decoder */
+struct v615 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  decision_t *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions;   /* Beginning of decisions for block */
+};
+
+/* Create a new instance of a Viterbi decoder with len+14 decision records; returns NULL if allocation fails */
+void *create_viterbi615_port(int len){
+  struct v615 *vp;
+
+  if(!Init){
+    int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF };
+    set_viterbi615_polynomial_port(polys);
+  }
+  if((vp = (struct v615 *)malloc(sizeof(struct v615))) == NULL)
+    return NULL;
+  if((vp->decisions = malloc((len+14)*sizeof(decision_t))) == NULL){
+    free(vp);
+    return NULL;
+  }
+  init_viterbi615_port(vp,0); /* This file's own initializer, so it always matches the struct v615 layout defined here */
+  return vp;
+}
+
+void set_viterbi615_polynomial_port(int polys[6]){
+  int state,sym;
+  /* Fill one table per polynomial: 255 where the (possibly inverted) parity() of the tapped bits is set, else 0 */
+  for(sym=0;sym<6;sym++){
+    int invert = polys[sym] < 0, taps = abs(polys[sym]);
+    for(state=0;state < 8192;state++)
+      Branchtab615[sym].w[state] = (invert ^ parity((2*state) & taps)) ? 255 : 0;
+  }
+  Init++; /* create_viterbi615_port() skips the default setup once this is nonzero */
+}
+
+/* Prepare a decoder for a new frame: reset the metric pointers, rewind dp, and favor starting_state */
+int init_viterbi615_port(void *p,int starting_state){
+  struct v615 *dec = p;
+  unsigned long *metric;
+  int state;
+  if(dec == NULL)
+    return -1;
+  dec->old_metrics = &dec->metrics1;
+  dec->new_metrics = &dec->metrics2;
+  dec->dp = dec->decisions;
+  metric = dec->metrics1.w;
+  for(state=0;state<16384;state++)
+    metric[state] = 1000;
+  metric[starting_state & 16383] = 0; /* Bias known start state */
+  return 0;
+}
+
+/* Viterbi chainback: walk the stored decisions back from endstate, writing nbits decoded bits to data[] */
+int chainback_viterbi615_port(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v615 *dec = p;
+  decision_t *past_tail;
+  unsigned int bit;
+
+  if(dec == NULL)
+    return -1;
+  past_tail = (decision_t *)dec->decisions + 14; /* Look past tail */
+  endstate &= 16383;
+
+  /* Storing into data[] on every bit (not just every 8th)
+   * avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  for(bit = nbits;bit-- != 0;){
+    /* One decision bit per state, packed eight to a byte */
+    unsigned int k = (past_tail[bit].c[endstate/8] >> (endstate%8)) & 1;
+
+    endstate = (k << 13) | (endstate >> 1);
+    data[bit>>3] = endstate >> 6;
+  }
+  return 0;
+}
+
+/* Free the decision buffer and then the decoder itself; does nothing when p is NULL */
+void delete_viterbi615_port(void *p){
+  struct v615 *dec = p;
+
+  if(dec == NULL)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
+
+/* C-language butterfly: add-compare-select on old states i and i+8192 giving new states 2i and 2i+1; ORs the two decision bits into d->c[] (needs vp, syms, d in scope) */
+#define BFLY(i) {\
+unsigned long metric,m0,m1,m2,m3,decision0,decision1;\
+    metric = ((Branchtab615[0].w[i] ^ syms[0]) + (Branchtab615[1].w[i] ^ syms[1])\
+	      +(Branchtab615[2].w[i] ^ syms[2]) + (Branchtab615[3].w[i] ^ syms[3])\
+	      +(Branchtab615[4].w[i] ^ syms[4]) + (Branchtab615[5].w[i] ^ syms[5]));\
+    m0 = vp->old_metrics->w[i] + metric;\
+    m1 = vp->old_metrics->w[i+8192] + (1530 - metric);\
+    m2 = vp->old_metrics->w[i] + (1530-metric);\
+    m3 = vp->old_metrics->w[i+8192] + metric;\
+    decision0 = (signed long)(m0-m1) >= 0;\
+    decision1 = (signed long)(m2-m3) >= 0;\
+    vp->new_metrics->w[2*i] = decision0 ? m1 : m0;\
+    vp->new_metrics->w[2*i+1] = decision1 ? m3 : m2;\
+    d->c[i/4] |= ((decision0|(decision1<<1)) << ((2*i)&7));\
+}
+/* Update decoder with a block of demodulated symbols.
+ * nbits counts decoded data bits, not symbols: each bit
+ * consumes six entries of syms[]. Returns 0, or -1 if p is NULL.
+ */
+
+int update_viterbi615_blk_port(void *p,unsigned char *syms,int nbits){
+  struct v615 *vp = p;
+  decision_t *d;
+
+  if(vp == NULL)
+    return -1;
+
+  for(d = (decision_t *)vp->dp;nbits-- != 0;d++){
+    metric_t *spent = vp->old_metrics;
+    int i;
+
+    /* BFLY ORs bits into d->c[], so start each record from zero */
+    memset(d,0,sizeof(decision_t));
+    for(i=0;i<8192;i++)
+      BFLY(i);
+    syms += 6;
+
+    /* Swap pointers to old and new metrics */
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = spent;
+  }
+  vp->dp = d;
+  return 0;
+}
+
diff --git a/viterbi615_sse.c b/viterbi615_sse.c
new file mode 100644
index 0000000..de0f8af
--- /dev/null
+++ b/viterbi615_sse.c
@@ -0,0 +1,201 @@
+/* K=15 r=1/6 Viterbi decoder for x86 SSE
+ * Copyright Mar 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <xmmintrin.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <limits.h>
+#include "fec.h"
+
+typedef union { unsigned long w[512]; unsigned char c[2048];} decision_t;
+typedef union { signed short s[16384]; __m64 v[4096];} metric_t;
+
+static union branchtab615 { unsigned short s[8192]; __m64 v[2048];} Branchtab615[6];
+static int Init = 0;
+
+/* State info for instance of Viterbi decoder */
+struct v615 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  void *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  void *decisions;   /* Beginning of decisions for block */
+};
+
+/* Start a new frame: every path metric gets SHRT_MIN+5000 except the known start state, which gets SHRT_MIN */
+int init_viterbi615_sse(void *p,int starting_state){
+  struct v615 *dec = p;
+  signed short *metric;
+  int n;
+  if(dec == NULL)
+    return -1;
+  metric = dec->metrics1.s;
+  for(n=0;n<16384;n++)
+    metric[n] = (SHRT_MIN+5000);
+  metric[starting_state & 16383] = SHRT_MIN; /* Bias known start state */
+  dec->old_metrics = &dec->metrics1;
+  dec->new_metrics = &dec->metrics2;
+  dec->dp = dec->decisions;
+  return 0;
+}
+
+/* Build a decoder with len+14 decision records, initialized for state 0; NULL when either allocation fails */
+void *create_viterbi615_sse(int len){
+  struct v615 *dec;
+
+  if(Init == 0){ /* Branch table not built yet: load the default polynomials */
+    int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF };
+    set_viterbi615_polynomial_sse(polys);
+  }
+  dec = (struct v615 *)malloc(sizeof(struct v615));
+  if(dec == NULL)
+    return NULL;
+  dec->decisions = malloc((len+14)*sizeof(decision_t));
+  if(dec->decisions == NULL){
+    free(dec);
+    return NULL;
+  }
+  init_viterbi615_sse(dec,0);
+  return dec;
+}
+
+void set_viterbi615_polynomial_sse(int polys[6]){
+  int state,sym;
+  /* Table value is 255 or 0, chosen by parity() of the tapped state bits and flipped for a negative polynomial */
+  for(sym=0;sym<6;sym++){
+    int invert = polys[sym] < 0;
+    for(state=0;state < 8192;state++)
+      Branchtab615[sym].s[state] = (invert ^ parity((2*state) & abs(polys[sym]))) ? 255 : 0;
+  }
+  Init++; /* create_viterbi615_sse() skips the default setup once this is nonzero */
+}
+
+/* Viterbi chainback: recover nbits decoded bits into data[] by walking decisions back from endstate */
+int chainback_viterbi615_sse(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v615 *dec = p;
+  decision_t *past_tail;
+  unsigned int bit;
+
+  if(dec == NULL)
+    return -1;
+
+  past_tail = (decision_t *)dec->decisions + 14; /* Look past tail */
+  endstate &= 16383;
+
+  /* Writing data[] for every bit is more than needed (every 8th
+   * would do), but it avoids a conditional branch, and the writes
+   * will combine in the cache anyway
+   */
+  for(bit = nbits;bit-- != 0;){
+    /* Decisions are read bytewise: bit endstate%8 of byte endstate/8 */
+    unsigned int k = (past_tail[bit].c[endstate/8] >> (endstate%8)) & 1;
+
+    endstate = (k << 13) | (endstate >> 1);
+    data[bit>>3] = endstate >> 6;
+  }
+  return 0;
+}
+
+/* Dispose of a decoder: frees its decisions array, then the instance; NULL p is a no-op */
+void delete_viterbi615_sse(void *p){
+  struct v615 *dec = p;
+
+  if(dec == NULL)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
+
+
+int update_viterbi615_blk_sse(void *p,unsigned char *syms,int nbits){
+  struct v615 *vp = p;
+  decision_t *d;
+
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->dp;
+  while(nbits--){
+    __m64 sym0v,sym1v,sym2v,sym3v,sym4v,sym5v;
+    void *tmp;
+    int i;
+
+    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+    sym0v = _mm_set1_pi16(syms[0]);
+    sym1v = _mm_set1_pi16(syms[1]);
+    sym2v = _mm_set1_pi16(syms[2]);
+    sym3v = _mm_set1_pi16(syms[3]);
+    sym4v = _mm_set1_pi16(syms[4]);
+    sym5v = _mm_set1_pi16(syms[5]);
+    syms += 6;
+
+    for(i=0;i<2048;i++){
+      __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+
+      /* Form branch metrics
+       * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
+       * the XOR operations constitute conditional negation.
+       * metric and m_metric (-metric) are in the range 0-1530
+       */
+      m0 = _mm_add_pi16(_mm_xor_si64(Branchtab615[0].v[i],sym0v),_mm_xor_si64(Branchtab615[1].v[i],sym1v));
+      m1 = _mm_add_pi16(_mm_xor_si64(Branchtab615[2].v[i],sym2v),_mm_xor_si64(Branchtab615[3].v[i],sym3v));
+      m2 = _mm_add_pi16(_mm_xor_si64(Branchtab615[4].v[i],sym4v),_mm_xor_si64(Branchtab615[5].v[i],sym5v));
+      metric = _mm_add_pi16(m0,_mm_add_pi16(m1,m2));
+      m_metric = _mm_sub_pi16(_mm_set1_pi16(1530),metric);
+    
+      /* Add branch metrics to path metrics */
+      m0 = _mm_adds_pi16(vp->old_metrics->v[i],metric);
+      m3 = _mm_adds_pi16(vp->old_metrics->v[2048+i],metric);
+      m1 = _mm_adds_pi16(vp->old_metrics->v[2048+i],m_metric);
+      m2 = _mm_adds_pi16(vp->old_metrics->v[i],m_metric);
+    
+      /* Compare and select */
+      survivor0 = _mm_min_pi16(m0,m1);
+      survivor1 = _mm_min_pi16(m2,m3);
+      decision0 = _mm_cmpeq_pi16(survivor0,m1);
+      decision1 = _mm_cmpeq_pi16(survivor1,m3);
+ 
+      /* Pack decisions into 8 bits and store */
+      d->c[i] = _mm_movemask_pi8(_mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64())));
+
+      /* Store surviving metrics */
+      vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1);
+    }
+    /* See if we need to renormalize
+     * Max metric spread for this code with 0-255 branch metrics is 12750
+     */
+    if(vp->new_metrics->s[0] >= SHRT_MAX-12750){
+      int i,adjust;
+      __m64 adjustv;
+      union { __m64 v; signed short w[4]; } t;
+
+      /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */
+      adjustv = vp->new_metrics->v[0];
+      for(i=1;i<4096;i++)
+	adjustv = _mm_min_pi16(adjustv,vp->new_metrics->v[i]);
+
+      adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,32));
+      adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,16));    
+      t.v = adjustv;
+      adjust = t.w[0] - SHRT_MIN;
+      adjustv = _mm_set1_pi16(adjust);
+      
+      for(i=0;i<4096;i++)
+	vp->new_metrics->v[i] = _mm_sub_pi16(vp->new_metrics->v[i],adjustv);
+    }
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d;
+  _mm_empty();
+  return 0;
+}
diff --git a/viterbi615_sse2.c b/viterbi615_sse2.c
new file mode 100644
index 0000000..7f711e5
--- /dev/null
+++ b/viterbi615_sse2.c
@@ -0,0 +1,204 @@
+/* K=15 r=1/6 Viterbi decoder for x86 SSE2
+ * Copyright Mar 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <emmintrin.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <limits.h>
+#include "fec.h"
+
+typedef union { unsigned long w[512]; unsigned short s[1024];} decision_t;
+typedef union { signed short s[16384]; __m128i v[2048];} metric_t;
+
+static union branchtab615 { unsigned short s[8192]; __m128i v[1024];} Branchtab615[6];
+static int Init = 0;
+
+/* State info for instance of Viterbi decoder */
+struct v615 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  void *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  void *decisions;   /* Beginning of decisions for block */
+};
+
+/* Initialize Viterbi decoder for start of new frame; returns -1 when p is NULL, otherwise 0 */
+int init_viterbi615_sse2(void *p,int starting_state){
+  struct v615 *dec = p;
+  int state = 16384;
+
+  if(dec == NULL)
+    return -1;
+  while(state-- > 0)
+    dec->metrics1.s[state] = (SHRT_MIN+5000);
+  dec->metrics1.s[starting_state & 16383] = SHRT_MIN; /* Bias known start state */
+
+  dec->dp = dec->decisions;
+  dec->old_metrics = &dec->metrics1;
+  dec->new_metrics = &dec->metrics2;
+  return 0;
+}
+
+/* Allocate an aligned decoder instance with room for len+14 decision records; NULL on failure */
+void *create_viterbi615_sse2(int len){
+  void *raw;
+  struct v615 *dec;
+  decision_t *decisions;
+  if(Init == 0){ /* Branch table not built yet: load the default polynomials */
+    int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF };
+    set_viterbi615_polynomial_sse2(polys);
+  }
+  /* The metric buffers hold __m128i vectors, so ask for sizeof(__m128i)
+   * alignment explicitly rather than relying on what malloc() provides */
+  if(posix_memalign(&raw,sizeof(__m128i),sizeof(struct v615)) != 0)
+    return NULL;
+  dec = (struct v615 *)raw;
+  decisions = (decision_t *)malloc((len+14)*sizeof(decision_t));
+  if(decisions == NULL){
+    free(dec);
+    return NULL;
+  }
+  dec->decisions = decisions;
+  init_viterbi615_sse2(dec,0);
+  return dec;
+}
+
+void set_viterbi615_polynomial_sse2(int polys[6]){
+  int state,sym;
+  /* Per polynomial and state: 255 if parity() of the tapped bits, inverted for a negative polynomial, is set; else 0 */
+  for(sym=0;sym<6;sym++){
+    int invert = polys[sym] < 0, taps = abs(polys[sym]);
+    for(state=0;state < 8192;state++)
+      Branchtab615[sym].s[state] = (invert ^ parity((2*state) & taps)) ? 255 : 0;
+  }
+  Init++; /* create_viterbi615_sse2() skips the default setup once this is nonzero */
+}
+
+/* Viterbi chainback; returns 0, or -1 if p is NULL (as the other chainback_viterbi615_* variants do) */
+int chainback_viterbi615_sse2(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v615 *vp = p;
+  decision_t *d;
+
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->decisions + 14; /* Look past tail */
+  endstate %= 16384;
+  /* The store into data[] only needs to be done every 8 bits.
+   * But this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  while(nbits-- != 0){
+    /* s[] is what the update routine writes (16 decisions per entry); w[endstate/32] breaks when long is 64 bits */
+    int k = (d[nbits].s[endstate/16] >> (endstate%16)) & 1;
+    endstate = (k << 13) | (endstate >> 1);
+    data[nbits>>3] = endstate >> 6;
+  }
+  return 0;
+}
+
+/* Tear down a decoder: free the decision storage first, then the instance; NULL p is ignored */
+void delete_viterbi615_sse2(void *p){
+  struct v615 *dec = p;
+
+  if(dec == NULL)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
+
+/* Run nbits trellis steps over syms (6 symbols per bit), storing decisions from vp->dp onward; returns 0, or -1 if p is NULL */
+int update_viterbi615_blk_sse2(void *p,unsigned char *syms,int nbits){
+  struct v615 *vp = p;
+  decision_t *d;
+
+  if(p == NULL)
+    return -1; /* Same contract as the other update_viterbi615_blk_* variants */
+  d = (decision_t *)vp->dp;
+  while(nbits--){
+    __m128i sym0v,sym1v,sym2v,sym3v,sym4v,sym5v;
+    void *tmp;
+    int i;
+    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+    sym0v = _mm_set1_epi16(syms[0]);
+    sym1v = _mm_set1_epi16(syms[1]);
+    sym2v = _mm_set1_epi16(syms[2]);
+    sym3v = _mm_set1_epi16(syms[3]);
+    sym4v = _mm_set1_epi16(syms[4]);
+    sym5v = _mm_set1_epi16(syms[5]);
+    syms += 6;
+
+    /* Path metrics are kept as signed shorts: SSE2 has a packed 16-bit minimum only for signed values (_mm_min_epi16) */
+    for(i=0;i<1024;i++){
+      __m128i decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+
+      /* Form branch metrics
+       * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
+       * the XOR operations constitute conditional negation.
+       * metric and m_metric (-metric) are in the range 0-1530
+       */
+      m0 = _mm_add_epi16(_mm_xor_si128(Branchtab615[0].v[i],sym0v),_mm_xor_si128(Branchtab615[1].v[i],sym1v));
+      m1 = _mm_add_epi16(_mm_xor_si128(Branchtab615[2].v[i],sym2v),_mm_xor_si128(Branchtab615[3].v[i],sym3v));
+      m2 = _mm_add_epi16(_mm_xor_si128(Branchtab615[4].v[i],sym4v),_mm_xor_si128(Branchtab615[5].v[i],sym5v));
+      metric = _mm_add_epi16(m0,_mm_add_epi16(m1,m2));
+      m_metric = _mm_sub_epi16(_mm_set1_epi16(1530),metric);
+
+      /* Add branch metrics to path metrics */
+      m0 = _mm_adds_epi16(vp->old_metrics->v[i],metric);
+      m3 = _mm_adds_epi16(vp->old_metrics->v[1024+i],metric);
+      m1 = _mm_adds_epi16(vp->old_metrics->v[1024+i],m_metric);
+      m2 = _mm_adds_epi16(vp->old_metrics->v[i],m_metric);
+
+      /* Compare and select */
+      survivor0 = _mm_min_epi16(m0,m1);
+      survivor1 = _mm_min_epi16(m2,m3);
+      decision0 = _mm_cmpeq_epi16(survivor0,m1);
+      decision1 = _mm_cmpeq_epi16(survivor1,m3);
+
+      /* Pack each set of decisions into 8 8-bit bytes, then interleave them and compress into 16 bits */
+      d->s[i] = _mm_movemask_epi8(_mm_unpacklo_epi8(_mm_packs_epi16(decision0,_mm_setzero_si128()),_mm_packs_epi16(decision1,_mm_setzero_si128())));
+
+      /* Store surviving metrics */
+      vp->new_metrics->v[2*i] = _mm_unpacklo_epi16(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = _mm_unpackhi_epi16(survivor0,survivor1);
+    }
+    /* See if we need to renormalize
+     * Max metric spread for this code with 0-255 branch metrics is 12750
+     */
+    if(vp->new_metrics->s[0] >= SHRT_MAX-12750){
+      int adjust;
+      __m128i adjustv;
+      union { __m128i v; signed short w[8]; } t;
+
+      /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */
+      adjustv = vp->new_metrics->v[0];
+      for(i=1;i<2048;i++)
+	adjustv = _mm_min_epi16(adjustv,vp->new_metrics->v[i]);
+      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,8));
+      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,4));
+      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,2));
+      t.v = adjustv;
+      adjust = t.w[0] - SHRT_MIN;
+      adjustv = _mm_set1_epi16(adjust);
+      /* We cannot use a saturated subtract, because we often have to adjust by more than SHRT_MAX
+       * This is okay since it can't overflow anyway
+       */
+      for(i=0;i<2048;i++)
+	vp->new_metrics->v[i] = _mm_sub_epi16(vp->new_metrics->v[i],adjustv);
+    }
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d;
+  return 0;
+}
+
+
diff --git a/vtest27.c b/vtest27.c
new file mode 100644
index 0000000..7256483
--- /dev/null
+++ b/vtest27.c
@@ -0,0 +1,184 @@
+/* Test viterbi decoder speeds */
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <math.h>
+#include <memory.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#endif
+#include "fec.h"
+
+#if HAVE_GETOPT_LONG
+struct option Options[] = {
+  {"frame-length",1,NULL,'l'},
+  {"frame-count",1,NULL,'n'},
+  {"ebn0",1,NULL,'e'},
+  {"gain",1,NULL,'g'},
+  {"verbose",0,NULL,'v'},
+  {"force-altivec",0,NULL,'a'},
+  {"force-port",0,NULL,'p'},
+  {"force-mmx",0,NULL,'m'},
+  {"force-sse",0,NULL,'s'},
+  {"force-sse2",0,NULL,'t'},
+  {NULL},
+};
+#endif
+
+#define RATE (1./2.)
+#define MAXBYTES 10000
+
+double Gain = 32.0;
+int Verbose = 0;
+/* K=7 r=1/2 test driver: measures BER/FER when an Eb/N0 is given with -e, otherwise times the decoder */
+int main(int argc,char *argv[]){
+  int i,d,tr;
+  int sr=0,trials = 10000,errcnt,framebits=2048;
+  long long int tot_errs=0;
+  unsigned char bits[MAXBYTES+1]; /* +1: at the maximum frame size the 6 tail bits spill into one more byte */
+  unsigned char data[MAXBYTES];
+  unsigned char xordata[MAXBYTES];
+  unsigned char symbols[8*2*(MAXBYTES+6)];
+  void *vp;
+  extern char *optarg;
+  struct rusage start,finish;
+  double extime;
+  double gain,esn0,ebn0;
+  time_t t;
+  int badframes=0;
+
+  time(&t);
+  srandom(t);
+  ebn0 = -100;
+#if HAVE_GETOPT_LONG
+  while((d = getopt_long(argc,argv,"l:n:e:g:vapmst",Options,NULL)) != EOF){
+#else
+  while((d = getopt(argc,argv,"l:n:e:g:vapmst")) != EOF){
+#endif
+    switch(d){
+    case 'a':
+      Cpu_mode = ALTIVEC;
+      break;
+    case 'p':
+      Cpu_mode = PORT;
+      break;
+    case 'm':
+      Cpu_mode = MMX;
+      break;
+    case 's':
+      Cpu_mode = SSE;
+      break;
+    case 't':
+      Cpu_mode = SSE2;
+      break;
+    case 'l':
+      framebits = atoi(optarg);
+      break;
+    case 'n':
+      trials = atoi(optarg);
+      break;
+    case 'e':
+      ebn0 = atof(optarg);
+      break;
+    case 'g':
+      Gain = atof(optarg);
+      break;
+    case 'v':
+      Verbose++;
+      break;
+    }
+  }
+  if(framebits > 8*MAXBYTES){
+    fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8);
+    framebits = MAXBYTES*8;
+  }
+  if((vp = create_viterbi27(framebits)) == NULL){
+    printf("create_viterbi27 failed\n");
+    exit(1);
+  }
+  if(ebn0 != -100){
+    esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */
+    /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing
+     * only half the noise power, and the sqrt() converts power to
+     * voltage.
+     */
+    gain = 1./sqrt(0.5/pow(10.,esn0/10.));
+    
+    printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+    
+    for(tr=0;tr<trials;tr++){
+      /* Encode a frame of random data */
+      for(i=0;i<framebits+6;i++){
+	int bit = (i < framebits) ? (random() & 1) : 0;
+	
+	sr = (sr << 1) | bit;
+	bits[i/8] = sr & 0xff;
+	symbols[2*i+0] = addnoise(parity(sr & V27POLYA),gain,Gain,127.5,255);
+	symbols[2*i+1] = addnoise(parity(sr & V27POLYB),gain,Gain,127.5,255);
+      }
+      /* Decode it and make sure we get the right answer */
+      /* Initialize Viterbi decoder */
+      init_viterbi27(vp,0);
+      
+      /* Decode block */
+      update_viterbi27_blk(vp,symbols,framebits+6);
+      
+      /* Do Viterbi chainback */
+      chainback_viterbi27(vp,data,framebits,0);
+      errcnt = 0;
+      for(i=0;i<framebits/8;i++){
+	int e = Bitcnt[xordata[i] = data[i] ^ bits[i]];
+	errcnt += e;
+	tot_errs += e;
+      }
+      if(errcnt != 0)
+	badframes++;
+      if(Verbose > 1 && errcnt != 0){
+	printf("frame %d, %d errors: ",tr,errcnt);
+	for(i=0;i<framebits/8;i++){
+	  printf("%02x",xordata[i]);
+	}
+	printf("\n");
+      }
+      if(Verbose)
+	printf("BER %lld/%lld (%10.3g) FER %d/%d (%10.3g)\r",
+	       tot_errs,(long long)framebits*(tr+1),tot_errs/((double)framebits*(tr+1)),
+	       badframes,tr+1,(double)badframes/(tr+1));
+      fflush(stdout);
+    }
+    if(Verbose > 1)
+      printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+    else if(Verbose == 0) /* tr == trials once the loop has finished, so the frame count is trials, not tr+1 */
+      printf("BER %lld/%lld (%.3g) FER %d/%d (%.3g)\n",
+	     tot_errs,(long long)framebits*trials,tot_errs/((double)framebits*trials),
+	     badframes,trials,(double)badframes/trials);
+    else
+      printf("\n");
+
+  } else {
+    /* Do time trials */
+    memset(symbols,127,sizeof(symbols));
+    printf("Starting time trials\n");
+    getrusage(RUSAGE_SELF,&start);
+    for(tr=0;tr < trials;tr++){
+      /* Initialize Viterbi decoder */
+      init_viterbi27(vp,0);
+      
+      /* Decode block */
+      update_viterbi27_blk(vp,symbols,framebits);
+      
+      /* Do Viterbi chainback */
+      chainback_viterbi27(vp,data,framebits,0);
+    }
+    getrusage(RUSAGE_SELF,&finish);
+    extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec);
+    printf("Execution time for %d %d-bit frames: %.2f sec\n",trials,
+	   framebits,extime);
+    printf("decoder speed: %g bits/s\n",(double)trials*framebits/extime); /* double product: trials*framebits can exceed INT_MAX */
+  }
+  exit(0);
+}
diff --git a/vtest29.c b/vtest29.c
new file mode 100644
index 0000000..8471b54
--- /dev/null
+++ b/vtest29.c
@@ -0,0 +1,185 @@
+/* Test viterbi decoder speeds */
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <math.h>
+#include <memory.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#endif
+#include "fec.h"
+
+#if HAVE_GETOPT_LONG
+struct option Options[] = {
+  {"frame-length",1,NULL,'l'},
+  {"frame-count",1,NULL,'n'},
+  {"ebn0",1,NULL,'e'},
+  {"gain",1,NULL,'g'},
+  {"verbose",0,NULL,'v'},
+  {"force-altivec",0,NULL,'a'},
+  {"force-port",0,NULL,'p'},
+  {"force-mmx",0,NULL,'m'},
+  {"force-sse",0,NULL,'s'},
+  {"force-sse2",0,NULL,'t'},
+  {NULL},
+};
+#endif
+
+#define RATE (1./2.)
+#define MAXBYTES 10000
+
+double Gain = 32.0;
+int Verbose = 0;
+/* K=9 r=1/2 test driver: measures BER/FER when an Eb/N0 is given with -e, otherwise times the decoder */
+int main(int argc,char *argv[]){
+  int i,d,tr;
+  int sr=0,trials = 10000,errcnt,framebits=2048;
+  long long tot_errs=0;
+  unsigned char bits[MAXBYTES+1]; /* +1: at the maximum frame size the 8 tail bits spill into one more byte */
+  unsigned char data[MAXBYTES];
+  unsigned char xordata[MAXBYTES];
+  unsigned char symbols[8*2*(MAXBYTES+8)];
+  void *vp;
+  extern char *optarg;
+  struct rusage start,finish;
+  double extime;
+  double gain,esn0,ebn0;
+  time_t t;
+  int badframes=0;
+
+  time(&t);
+  srandom(t);
+  ebn0 = -100;
+#if HAVE_GETOPT_LONG
+  while((d = getopt_long(argc,argv,"l:n:e:g:vapmst",Options,NULL)) != EOF){
+#else
+  while((d = getopt(argc,argv,"l:n:e:g:vapmst")) != EOF){
+#endif
+    switch(d){
+    case 'a':
+      Cpu_mode = ALTIVEC;
+      break;
+    case 'p':
+      Cpu_mode = PORT;
+      break;
+    case 'm':
+      Cpu_mode = MMX;
+      break;
+    case 's':
+      Cpu_mode = SSE;
+      break;
+    case 't':
+      Cpu_mode = SSE2;
+      break;
+    case 'l':
+      framebits = atoi(optarg);
+      break;
+    case 'n':
+      trials = atoi(optarg);
+      break;
+    case 'e':
+      ebn0 = atof(optarg);
+      break;
+    case 'g':
+      Gain = atof(optarg);
+      break;
+    case 'v':
+      Verbose++;
+      break;
+    }
+  }
+  if(framebits > 8*MAXBYTES){
+    fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8);
+    framebits = MAXBYTES*8;
+  }
+  if((vp = create_viterbi29(framebits)) == NULL){
+    printf("create_viterbi29 failed\n");
+    exit(1);
+  }
+  if(ebn0 != -100){
+    esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */
+    /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing
+     * only half the noise power, and the sqrt() converts power to
+     * voltage.
+     */
+    gain = 1./sqrt(0.5/pow(10.,esn0/10.));
+    
+    printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+    
+    for(tr=0;tr<trials;tr++){
+      /* Encode a frame of random data */
+      for(i=0;i<framebits+8;i++){
+	int bit = (i < framebits) ? (random() & 1) : 0;
+	
+	sr = (sr << 1) | bit;
+	bits[i/8] = sr & 0xff;
+	symbols[2*i+0] = addnoise(parity(sr & V29POLYA),gain,Gain,127.5,255);
+	symbols[2*i+1] = addnoise(parity(sr & V29POLYB),gain,Gain,127.5,255);
+      }
+      /* Decode it and make sure we get the right answer */
+      /* Initialize Viterbi decoder */
+      init_viterbi29(vp,0);
+      
+      /* Decode block */
+      update_viterbi29_blk(vp,symbols,framebits+8);
+      
+      /* Do Viterbi chainback */
+      chainback_viterbi29(vp,data,framebits,0);
+      errcnt = 0;
+      for(i=0;i<framebits/8;i++){
+	int e = Bitcnt[xordata[i] = data[i] ^ bits[i]];
+	errcnt += e;
+	tot_errs += e;
+      }
+      if(errcnt != 0)
+	badframes++;
+      if(Verbose > 1 && errcnt != 0){
+	printf("frame %d, %d errors: ",tr,errcnt);
+	for(i=0;i<framebits/8;i++){
+	  printf("%02x",xordata[i]);
+	}
+	printf("\n");
+      }
+      if(Verbose)
+	printf("BER %lld/%lld (%10.3g) FER %d/%d (%10.3g)\r",
+	       tot_errs,(long long)framebits*(tr+1),tot_errs/((double)framebits*(tr+1)),
+	       badframes,tr+1,(double)badframes/(tr+1));
+      fflush(stdout);
+    }
+    if(Verbose > 1)
+      printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+    else if(Verbose == 0) /* tr == trials once the loop has finished, so the frame count is trials, not tr+1 */
+      printf("BER %lld/%lld (%.3g) FER %d/%d (%.3g)\n",
+	     tot_errs,(long long)framebits*trials,tot_errs/((double)framebits*trials),
+	     badframes,trials,(double)badframes/trials);
+    else
+      printf("\n");
+  } else {
+    /* Do time trials */
+    memset(symbols,127,sizeof(symbols));
+    printf("Starting time trials\n");
+    getrusage(RUSAGE_SELF,&start);
+    for(tr=0;tr < trials;tr++){
+      /* Initialize Viterbi decoder */
+      init_viterbi29(vp,0);
+      
+      /* Decode block */
+      update_viterbi29_blk(vp,symbols,framebits);
+      
+      /* Do Viterbi chainback */
+      chainback_viterbi29(vp,data,framebits,0);
+    }
+    getrusage(RUSAGE_SELF,&finish);
+    extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec);
+    printf("Execution time for %d %d-bit frames: %.2f sec\n",trials,
+	   framebits,extime);
+    printf("decoder speed: %g bits/s\n",(double)trials*framebits/extime); /* double product: trials*framebits can exceed INT_MAX */
+  }
+  exit(0);
+}
+
+
diff --git a/vtest39.c b/vtest39.c
new file mode 100644
index 0000000..76723b2
--- /dev/null
+++ b/vtest39.c
@@ -0,0 +1,186 @@
+/* Test viterbi decoder speeds */
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <math.h>
+#include <memory.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#endif
+#include "fec.h"
+
+#if HAVE_GETOPT_LONG
+struct option Options[] = { /* long-option spellings of the single-letter switches handled in main() */
+  {"frame-length",  required_argument, NULL, 'l'},
+  {"frame-count",   required_argument, NULL, 'n'},
+  {"ebn0",          required_argument, NULL, 'e'},
+  {"gain",          required_argument, NULL, 'g'},
+  {"verbose",       no_argument,       NULL, 'v'},
+  {"force-altivec", no_argument,       NULL, 'a'},
+  {"force-port",    no_argument,       NULL, 'p'},
+  {"force-mmx",     no_argument,       NULL, 'm'},
+  {"force-sse",     no_argument,       NULL, 's'},
+  {"force-sse2",    no_argument,       NULL, 't'},
+  {NULL, 0, NULL, 0}, /* all-zero entry terminates the table */
+};
+#endif
+
+#define RATE (1./3.) /* code rate; main() uses it to convert Eb/N0 to Es/N0 */
+#define MAXBYTES 10000 /* size in bytes of the frame buffers; main() caps frames at 8*MAXBYTES bits */
+
+double Gain = 32.0; /* third argument of every addnoise() call; overridden with -g */
+int Verbose = 0; /* incremented once per -v; non-zero enables progress output in main() */
+
+/* Test driver for the rate-1/3 viterbi39 decoder.
+ * With -e <Eb/N0 in dB>: encode random frames, pass them through addnoise(),
+ * decode them and report bit (BER) and frame (FER) error counts. Without -e:
+ * time the decoder on a constant symbol buffer. Other options: -l frame bits,
+ * -n frame count, -g Gain, -v verbosity, -a/-p/-m/-s/-t force a Cpu_mode.
+ */
+int main(int argc,char *argv[]){
+  int i,d,tr;
+  unsigned int sr=0; /* encoder shift register; unsigned so the never-reset left shift wraps instead of overflowing */
+  int trials = 10000,errcnt,framebits=2048,badframes=0;
+  long long tot_errs=0;
+  unsigned char bits[MAXBYTES+1]; /* +1: the 8 tail bits of a maximum-length frame are stored one byte past the data */
+  unsigned char data[MAXBYTES];
+  unsigned char xordata[MAXBYTES];
+  unsigned char symbols[8*3*(MAXBYTES+8)];
+  void *vp;
+  extern char *optarg;
+  struct rusage start,finish;
+  double extime;
+  double gain,esn0,ebn0;
+  time_t t;
+
+  time(&t);
+  srandom(t);
+  ebn0 = -100; /* sentinel: no -e given, so run the time trials */
+#if HAVE_GETOPT_LONG
+  while((d = getopt_long(argc,argv,"l:n:e:g:vapmst",Options,NULL)) != -1){
+#else
+  while((d = getopt(argc,argv,"l:n:e:g:vapmst")) != -1){
+#endif
+    switch(d){
+    case 'a':
+      Cpu_mode = ALTIVEC;
+      break;
+    case 'p':
+      Cpu_mode = PORT;
+      break;
+    case 'm':
+      Cpu_mode = MMX;
+      break;
+    case 's':
+      Cpu_mode = SSE;
+      break;
+    case 't':
+      Cpu_mode = SSE2;
+      break;
+    case 'l':
+      framebits = atoi(optarg);
+      break;
+    case 'n':
+      trials = atoi(optarg);
+      break;
+    case 'e':
+      ebn0 = atof(optarg);
+      break;
+    case 'g':
+      Gain = atof(optarg);
+      break;
+    case 'v':
+      Verbose++;
+      break;
+    }
+  }
+  if(framebits > 8*MAXBYTES){
+    fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8);
+    framebits = MAXBYTES*8;
+  }
+  if((vp = create_viterbi39(framebits)) == NULL){
+    printf("create_viterbi39 failed\n");
+    exit(1);
+  }
+  if(ebn0 != -100){
+    esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */
+    /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing
+     * only half the noise power, and the sqrt() converts power to
+     * voltage.
+     */
+    gain = 1./sqrt(0.5/pow(10.,esn0/10.));
+    printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+    for(tr=0;tr<trials;tr++){
+      /* Encode a frame of random data */
+      for(i=0;i<framebits+8;i++){
+	int bit = (i < framebits) ? (random() & 1) : 0;
+	sr = (sr << 1) | bit;
+	bits[i/8] = sr & 0xff;
+	symbols[3*i+0] = addnoise(parity(sr & V39POLYA),gain,Gain,127.5,255);
+	symbols[3*i+1] = addnoise(parity(sr & V39POLYB),gain,Gain,127.5,255);
+	symbols[3*i+2] = addnoise(parity(sr & V39POLYC),gain,Gain,127.5,255);
+      }
+      /* Decode it and make sure we get the right answer */
+      /* Initialize Viterbi decoder */
+      init_viterbi39(vp,0);
+      /* Decode block */
+      update_viterbi39_blk(vp,symbols,framebits+8);
+      /* Do Viterbi chainback */
+      chainback_viterbi39(vp,data,framebits,0);
+      errcnt = 0;
+      for(i=0;i<framebits/8;i++){
+	int e = Bitcnt[xordata[i] = data[i] ^ bits[i]];
+	errcnt += e;
+	tot_errs += e;
+      }
+      if(errcnt != 0)
+	badframes++;
+      if(Verbose > 1 && errcnt != 0){
+	printf("frame %d, %d errors: ",tr,errcnt);
+	for(i=0;i<framebits/8;i++){
+	  printf("%02x",xordata[i]);
+	}
+	printf("\n");
+      }
+      if(Verbose)
+	printf("BER %lld/%lld (%10.3g) FER %d/%d (%10.3g)\r",
+	       tot_errs,(long long)framebits*(tr+1),tot_errs/((double)framebits*(tr+1)),
+	       badframes,tr+1,(double)badframes/(tr+1));
+      fflush(stdout);
+    }
+    /* The loop has finished, so tr == trials: the frame count is trials, not tr+1 */
+    if(Verbose > 1)
+      printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+    else if(Verbose == 0)
+      printf("BER %lld/%lld (%.3g) FER %d/%d (%.3g)\n",
+	     tot_errs,(long long)framebits*trials,tot_errs/((double)framebits*trials),
+	     badframes,trials,(double)badframes/trials);
+    else
+      printf("\n");
+  } else {
+    /* Do time trials */
+    memset(symbols,127,sizeof(symbols));
+    printf("Starting time trials\n");
+    getrusage(RUSAGE_SELF,&start);
+    for(tr=0;tr < trials;tr++){
+      /* Initialize Viterbi decoder */
+      init_viterbi39(vp,0);
+      /* Decode block */
+      update_viterbi39_blk(vp,symbols,framebits);
+      /* Do Viterbi chainback */
+      chainback_viterbi39(vp,data,framebits,0);
+    }
+    getrusage(RUSAGE_SELF,&finish);
+    extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec);
+    printf("Execution time for %d %d-bit frames: %.2f sec\n",trials,
+	   framebits,extime);
+    printf("decoder speed: %g bits/s\n",(double)trials*framebits/extime); /* multiply in double: the int product can overflow */
+  }
+  exit(0);
+}
+
+
diff --git a/vtest615.c b/vtest615.c
new file mode 100644
index 0000000..4bd8c4f
--- /dev/null
+++ b/vtest615.c
@@ -0,0 +1,191 @@
+/* Test viterbi decoder speeds */
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <math.h>
+#include <memory.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#endif
+#include "fec.h"
+
+#if HAVE_GETOPT_LONG
+struct option Options[] = { /* long-option spellings of the single-letter switches handled in main() */
+  {"frame-length",  required_argument, NULL, 'l'},
+  {"frame-count",   required_argument, NULL, 'n'},
+  {"ebn0",          required_argument, NULL, 'e'},
+  {"gain",          required_argument, NULL, 'g'},
+  {"verbose",       no_argument,       NULL, 'v'},
+  {"force-altivec", no_argument,       NULL, 'a'},
+  {"force-port",    no_argument,       NULL, 'p'},
+  {"force-mmx",     no_argument,       NULL, 'm'},
+  {"force-sse",     no_argument,       NULL, 's'},
+  {"force-sse2",    no_argument,       NULL, 't'},
+  {NULL, 0, NULL, 0}, /* all-zero entry terminates the table */
+};
+#endif
+
+#define RATE (1./6.) /* code rate; main() uses it to convert Eb/N0 to Es/N0 */
+#define MAXBYTES 10000 /* size in bytes of the frame buffers; main() caps frames at 8*MAXBYTES bits */
+#define OFFSET (127.5) /* fourth argument of every addnoise() call (presumably the symbol offset; confirm against addnoise) */
+#define CLIP 255 /* fifth argument of every addnoise() call (presumably the clipping level; confirm against addnoise) */
+
+double Gain = 24.0; /* third argument of every addnoise() call; overridden with -g */
+int Verbose = 0; /* incremented once per -v; non-zero enables progress output in main() */
+
+/* Test driver for the rate-1/6 viterbi615 decoder.
+ * With -e <Eb/N0 in dB>: encode random frames, pass them through addnoise(),
+ * decode them and report bit (BER) and frame (FER) error counts. Without -e:
+ * time the decoder on a constant symbol buffer. Other options: -l frame bits,
+ * -n frame count, -g Gain, -v verbosity, -a/-p/-m/-s/-t force a Cpu_mode.
+ */
+int main(int argc,char *argv[]){
+  int i,d,tr;
+  unsigned int sr=0; /* encoder shift register; unsigned so the never-reset left shift wraps instead of overflowing */
+  int trials = 10,errcnt,framebits=2048,badframes=0;
+  long long tot_errs=0; /* 64-bit, as in vtest39.c, so long runs cannot overflow the error count */
+  unsigned char bits[MAXBYTES+2]; /* +2: the 14 tail bits of a maximum-length frame reach two bytes past the data */
+  unsigned char data[MAXBYTES];
+  unsigned char xordata[MAXBYTES];
+  unsigned char symbols[8*6*(MAXBYTES+14)];
+  void *vp;
+  extern char *optarg;
+  struct rusage start,finish;
+  double extime;
+  double gain,esn0,ebn0;
+  time_t t;
+
+  time(&t);
+  srandom(t);
+  ebn0 = -100; /* sentinel: no -e given, so run the time trials */
+#if HAVE_GETOPT_LONG
+  while((d = getopt_long(argc,argv,"l:n:e:g:vapmst",Options,NULL)) != -1){
+#else
+  while((d = getopt(argc,argv,"l:n:e:g:vapmst")) != -1){
+#endif
+    switch(d){
+    case 'a':
+      Cpu_mode = ALTIVEC;
+      break;
+    case 'p':
+      Cpu_mode = PORT;
+      break;
+    case 'm':
+      Cpu_mode = MMX;
+      break;
+    case 's':
+      Cpu_mode = SSE;
+      break;
+    case 't':
+      Cpu_mode = SSE2;
+      break;
+    case 'l':
+      framebits = atoi(optarg);
+      break;
+    case 'n':
+      trials = atoi(optarg);
+      break;
+    case 'e':
+      ebn0 = atof(optarg);
+      break;
+    case 'g':
+      Gain = atof(optarg);
+      break;
+    case 'v':
+      Verbose++;
+      break;
+    }
+  }
+  if(framebits > 8*MAXBYTES){
+    fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8);
+    framebits = MAXBYTES*8;
+  }
+  if((vp = create_viterbi615(framebits)) == NULL){
+    printf("create_viterbi615 failed\n");
+    exit(1);
+  }
+  if(ebn0 != -100){
+    esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */
+    /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing
+     * only half the noise power, and the sqrt() converts power to
+     * voltage.
+     */
+    gain = 1./sqrt(0.5/pow(10.,esn0/10.));
+
+    printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+
+    for(tr=0;tr<trials;tr++){
+      /* Encode a frame of random data */
+      for(i=0;i<framebits+14;i++){
+	int bit = (i < framebits) ? (random() & 1) : 0;
+	sr = (sr << 1) | bit;
+	bits[i/8] = sr & 0xff;
+	symbols[6*i+0] = addnoise(parity(sr & V615POLYA),gain,Gain,OFFSET,CLIP);
+	symbols[6*i+1] = addnoise(parity(sr & V615POLYB),gain,Gain,OFFSET,CLIP);
+	symbols[6*i+2] = addnoise(parity(sr & V615POLYC),gain,Gain,OFFSET,CLIP);
+	symbols[6*i+3] = addnoise(parity(sr & V615POLYD),gain,Gain,OFFSET,CLIP);
+	symbols[6*i+4] = addnoise(parity(sr & V615POLYE),gain,Gain,OFFSET,CLIP);
+	symbols[6*i+5] = addnoise(parity(sr & V615POLYF),gain,Gain,OFFSET,CLIP);
+      }
+      /* Decode it and make sure we get the right answer */
+      /* Initialize Viterbi decoder */
+      init_viterbi615(vp,0);
+      /* Decode block */
+      update_viterbi615_blk(vp,symbols,framebits+14);
+      /* Do Viterbi chainback */
+      chainback_viterbi615(vp,data,framebits,0);
+      errcnt = 0;
+      for(i=0;i<framebits/8;i++){
+	int e = Bitcnt[xordata[i] = data[i] ^ bits[i]];
+	errcnt += e;
+	tot_errs += e;
+      }
+      if(errcnt != 0)
+	badframes++;
+      if(Verbose > 1 && errcnt != 0){
+	printf("frame %d, %d errors: ",tr,errcnt);
+	for(i=0;i<framebits/8;i++){
+	  printf("%02x",xordata[i]);
+	}
+	printf("\n");
+      }
+      if(Verbose)
+	printf("BER %lld/%lld (%10.3g) FER %d/%d (%10.3g)\r",
+	       tot_errs,(long long)framebits*(tr+1),tot_errs/((double)framebits*(tr+1)),
+	       badframes,(tr+1),(double)badframes/(tr+1));
+      fflush(stdout);
+    }
+    /* The loop has finished, so tr == trials: the frame count is trials, not tr+1 */
+    if(Verbose > 1)
+      printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+    else if(Verbose == 0)
+      printf("BER %lld/%lld (%.3g) FER %d/%d (%.3g)\n",
+	     tot_errs,(long long)framebits*trials,tot_errs/((double)framebits*trials),
+	     badframes,trials,(double)badframes/trials);
+    else
+      printf("\n");
+  } else {
+    /* Do time trials */
+    memset(symbols,127,sizeof(symbols));
+    printf("Starting time trials\n");
+    getrusage(RUSAGE_SELF,&start);
+    for(tr=0;tr < trials;tr++){
+      /* Initialize Viterbi decoder */
+      init_viterbi615(vp,0);
+      /* Decode block */
+      update_viterbi615_blk(vp,symbols,framebits+14);
+      /* Do Viterbi chainback */
+      chainback_viterbi615(vp,data,framebits,0);
+    }
+    getrusage(RUSAGE_SELF,&finish);
+    extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec);
+    printf("Execution time for %d %d-bit frames: %.2f sec\n",trials,
+	   framebits,extime);
+    printf("decoder speed: %g bits/s\n",(double)trials*framebits/extime); /* multiply in double: the int product can overflow */
+  }
+  exit(0);
+}