diff options
author | Matthias P. Braendli <matthias.braendli@mpb.li> | 2014-01-02 21:55:13 +0100 |
---|---|---|
committer | Matthias P. Braendli <matthias.braendli@mpb.li> | 2014-01-02 21:55:13 +0100 |
commit | a31630e0d5b9880c716d9004ef4154396ba41ebc (patch) | |
tree | aebbd3b132e5f2dd31bc34750ccded2378fc687a /ssebfly27.s | |
parent | 9aaac5be9db5e1537badc65242412ef14c5096e3 (diff) | |
download | ka9q-fec-a31630e0d5b9880c716d9004ef4154396ba41ebc.tar.gz ka9q-fec-a31630e0d5b9880c716d9004ef4154396ba41ebc.tar.bz2 ka9q-fec-a31630e0d5b9880c716d9004ef4154396ba41ebc.zip |
Extract fec-3.0.1
Diffstat (limited to 'ssebfly27.s')
-rw-r--r-- | ssebfly27.s | 205 |
1 files changed, 205 insertions, 0 deletions
diff --git a/ssebfly27.s b/ssebfly27.s new file mode 100644 index 0000000..7f445da --- /dev/null +++ b/ssebfly27.s @@ -0,0 +1,205 @@ +/* Intel SIMD (SSE) implementation of Viterbi ACS butterflies + for 64-state (k=7) convolutional code + Copyright 2001 Phil Karn, KA9Q + This code may be used under the terms of the GNU Lesser General Public License (LGPL) + + int update_viterbi27_blk_sse(struct v27 *vp,unsigned char syms[],int nbits) ; +*/ + + # SSE (64-bit integer SIMD) version + # Requires Pentium III or better + + # These are offsets into struct v27, defined in viterbi27.h + .set DP,128 + .set OLDMETRICS,132 + .set NEWMETRICS,136 +.text +.global update_viterbi27_blk_sse,Branchtab27_sse + .type update_viterbi27_blk_sse,@function + .align 16 + +update_viterbi27_blk_sse: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %edi + pushl %edx + pushl %ebx + + movl 8(%ebp),%edx # edx = vp + testl %edx,%edx + jnz 0f + movl -1,%eax + jmp err +0: movl OLDMETRICS(%edx),%esi # esi -> old metrics + movl NEWMETRICS(%edx),%edi # edi -> new metrics + movl DP(%edx),%edx # edx -> decisions + +1: movl 16(%ebp),%eax # eax = nbits + decl %eax + jl 2f # passed zero, we're done + movl %eax,16(%ebp) + + xorl %eax,%eax + movl 12(%ebp),%ebx # %ebx = syms + movb (%ebx),%al + movd %eax,%mm6 # mm6[0] = first symbol + movb 1(%ebx),%al + movd %eax,%mm5 # mm5[0] = second symbol + addl $2,%ebx + movl %ebx,12(%ebp) + + punpcklbw %mm6,%mm6 # mm6[1] = mm6[0] + punpcklbw %mm5,%mm5 + movq thirtyones,%mm7 + + pshufw $0,%mm6,%mm6 # copy low word to upper 3 + pshufw $0,%mm5,%mm5 + # mm6 now contains first symbol in each byte, mm5 the second + + # each invocation of this macro does 8 butterflies in parallel + .MACRO butterfly GROUP + # compute branch metrics + movq Branchtab27_sse+(8*\GROUP),%mm4 + movq Branchtab27_sse+32+(8*\GROUP),%mm3 + pxor %mm6,%mm4 + pxor %mm5,%mm3 + pavgb %mm3,%mm4 # mm4 contains branch metrics + psrlw $3,%mm4 + pand %mm7,%mm4 + + movq (8*\GROUP)(%esi),%mm0 # Incoming path metric, high bit = 0 + movq ((8*\GROUP)+32)(%esi),%mm3 # Incoming path metric, high bit = 1 + movq %mm0,%mm2 + movq %mm3,%mm1 + paddusb %mm4,%mm0 + paddusb %mm4,%mm3 + + # invert branch metrics. This works only because they're 5 bits + pxor %mm7,%mm4 + + paddusb %mm4,%mm1 + paddusb %mm4,%mm2 + + # Find survivors, leave in mm0,2 + pminub %mm1,%mm0 + pminub %mm3,%mm2 + # get decisions, leave in mm1,3 + pcmpeqb %mm0,%mm1 + pcmpeqb %mm2,%mm3 + + # interleave and store new branch metrics in mm0,2 + movq %mm0,%mm4 + punpckhbw %mm2,%mm0 # interleave second 8 new metrics + punpcklbw %mm2,%mm4 # interleave first 8 new metrics + movq %mm0,(16*\GROUP+8)(%edi) + movq %mm4,(16*\GROUP)(%edi) + + # interleave decisions, accumulate into %ebx + movq %mm1,%mm4 + punpckhbw %mm3,%mm1 + punpcklbw %mm3,%mm4 + # Due to an error in the Intel instruction set ref (the register + # fields are swapped), gas assembles pmovmskb incorrectly + # See http://mail.gnu.org/pipermail/bug-gnu-utils/2000-August/002341.html + .byte 0x0f,0xd7,0xc1 # pmovmskb %mm1,%eax + shll $((16*\GROUP+8)&31),%eax + orl %eax,%ebx + .byte 0x0f,0xd7,0xc4 # pmovmskb %mm4,%eax + shll $((16*\GROUP)&31),%eax + orl %eax,%ebx + .endm + + # invoke macro 4 times for a total of 32 butterflies + xorl %ebx,%ebx # clear decisions + butterfly GROUP=0 + butterfly GROUP=1 + movl %ebx,(%edx) # stash first 32 decisions + xorl %ebx,%ebx + butterfly GROUP=2 + butterfly GROUP=3 + movl %ebx,4(%edx) # stash second 32 decisions + + addl $8,%edx # bump decision pointer + + # see if we have to normalize + movl (%edi),%eax # extract first output metric + andl $255,%eax + cmpl $150,%eax # is it greater than 150? + movl $0,%eax + jle done # No, no need to normalize + + # Normalize by finding smallest metric and subtracting it + # from all metrics + movq (%edi),%mm0 + pminub 8(%edi),%mm0 + pminub 16(%edi),%mm0 + pminub 24(%edi),%mm0 + pminub 32(%edi),%mm0 + pminub 40(%edi),%mm0 + pminub 48(%edi),%mm0 + pminub 56(%edi),%mm0 + # mm0 contains 8 smallest metrics + # crunch down to single lowest metric + movq %mm0,%mm1 + psrlq $32,%mm0 + pminub %mm1,%mm0 + movq %mm0,%mm1 + psrlq $16,%mm0 + pminub %mm1,%mm0 + movq %mm0,%mm1 + psrlq $8,%mm0 + pminub %mm1,%mm0 + punpcklbw %mm0,%mm0 # expand to all 8 bytes + pshufw $0,%mm0,%mm0 + + # mm0 now contains lowest metric in all 8 bytes + # subtract it from every output metric + # Trashes %mm7 + .macro PSUBUSBM REG,MEM + movq \MEM,%mm7 + psubusb \REG,%mm7 + movq %mm7,\MEM + .endm + + PSUBUSBM %mm0,(%edi) + PSUBUSBM %mm0,8(%edi) + PSUBUSBM %mm0,16(%edi) + PSUBUSBM %mm0,24(%edi) + PSUBUSBM %mm0,32(%edi) + PSUBUSBM %mm0,40(%edi) + PSUBUSBM %mm0,48(%edi) + PSUBUSBM %mm0,56(%edi) + + movd %mm0,%eax + and $0xff,%eax + +done: # swap metrics + movl %esi,%eax + movl %edi,%esi + movl %eax,%edi + jmp 1b + +2: emms + movl 8(%ebp),%ebx # ebx = vp + # stash metric pointers + movl %esi,OLDMETRICS(%ebx) + movl %edi,NEWMETRICS(%ebx) + movl %edx,DP(%ebx) # stash incremented value of vp->dp + xorl %eax,%eax +err: popl %ebx + popl %edx + popl %edi + popl %esi + popl %ebp + + ret + + .data + + .align 16 +thirtyones: + .byte 31,31,31,31,31,31,31,31 + + + |