?? viterbigsm.asm
字號:
// BL 2005-03-15 Optimized to 4750 cycles
// REGISTER USAGE:
//
// Dregs: R0, R1, R2, R3, R4, R5, R6, R7 (8 out of 8)
// Pregs: P0, P1, P2, P3, P4, P5 (6 out of 6)
// FP, SP: FP (1 out of 2)
// DAGs: I0, I1,I2,I3 (4 out of 4)
// Mregs: M2, m0 (2 out of 4)
// Lregs: L0, L1, L2, L3 (4 out of 4)
// Bregs: B0, B1, B2, B3 (4 out of 4)
// Accums: A0 (1 out of 2)
//
//
// RESTRICTIONS:
//
// 1. The soft decision, branch metric, and state buffers are assumed to be
// aligned on a 32-bit boundary.
//
// 2. The ouput data is provided on a 32-bit boundary. (Providing on a 16-bit boundary
// will consume an additional 12 cycles.)
//
// 3. The transition and soft decision arrays require an additional alocated memory element.
// These elements are used to balance the ACS loop. (Removing this optimization
// will reult in ~3% degredation in cycle count.)
//
// Notes, assumptions:
//
// 1. The following code sequence implements a rate n/2, 16-state viterbi decoder
// (constraint length = 5) for a GSM frame (189 bits) including traceback.
//
// 2. The work required to perform a complete butterfly is divided into two
// portions: (a) branch metric calculation, and (b) add-compare-selects (ACS).
// (a) requires one 16-bit add (done with H+L(SGN)), and one 16-bit load.
// (b) requires four 16-bit adds, two 16-bit compares, 2 bit updates (VMAX),
// two 16-bit loads (old path metrics) and two 16-bit stores (new path metrics).
// Since (b) is done with a quad 16-bit add/subtract instruction and a VMAX,
// the total inner-loop time per butterfly is 3 cycles. The ideal for a 16-state
// rate 1/2 ACS is
//
// 3*8 = 24 cycles/bit
//
// There are three instructions in the decoder
// inner loop:
//
// R2.H = SIGN(R0.H) * R1.H, R2.L = SIGN(R0.L) * R1.L ; (a) Branch Metric Calculation
// R5 = R3 +|- R2 , R4 = R3 -|+ R2; (b) Quad 16-bit add/subtract
// R6 = VIT_MAX( R5 , R4 ) (ASR); (c) Two 16-bit Compares, and 2
// bit updates.
//
// 3. The SIGN instruction is used to compute the branch metric used by each
// butterfly. It takes as input a pair of values representing the received
// symbol, and another pair of values which are +1 or -1. The latter come
// from a pre-computed table that holds all the branch metric information for
// a specific set of polynomials. As all symbols are assumed to be binary,
// distance metrics between a received symbol and a branch metric are computed
// by adding and subtracting the values of the symbol according to the
// transition of a branch.
//
// R2.H = SIGN(R0.H) * R1.H, R2.L = SIGN(R0.L) * R1.L ;
//
// _________________________
// R1 = | S1 | S0 |
// |____________|____________|
//
// _________________________
// R0 = | +1 | -1 | R0 holds pre-computed
// |____________|____________| branch metrics
//
//
// _________________________
// R2 = | D | D |
// |____________|____________|
//
//
// For a rate 1/n system, the branch metric for a butterfly is equal to sum of
// the components of a received symbol, with appropriate sign
//
// With the branch metric add/subtract from accumulated path metrics.
//
// R5 = R3 +|- R2 , R4 = R3 -|+ R2;
//
// _________________________
// R3 = | PM0 | PM1 |
// |____________|____________|
//
//
// D PM00 R5
// PM0 -+----------+---
// \ / PM01
// -D \ /
// +
// -D / \
// / \ PM10 R4
// PM1 -+----------+--
// D PM11
//
// VIT_MAX is a 2-way parallel comparison of four updated path metrics, resulting
// in 2 new path metrics as well as a 2 bit field indicating the selection
// results. This 2 bit field is shifted into accumulator A0. This instruction
// implements the selections of a complete butterfly for a rate 1/n system.
//
//
// R6 = VIT_MAX( R5 , R4 ) (ASR);
//
//
// ________________________________
// R6 = | MAX(PM10,PM11)| MAX(PM00,PM01) | 2-bit field 00,01,10,11
// |_______________|________________| (BB)
//
// ________________________________
// BB>> |BBxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx| (A0 >> 2)
// |________________________________|
//
// 4. The initial and alternate state arrays are setup in three circular
// buffers with identical base addresses and lengths.
//
// ------
// Initial State - >> | A0 |
// (I1) ------
// (B1,B2,B3) | A1 |
// ------
// .
// .
// .
// ------
// Initial State +N/2 ->>| A7 |
// (I2) ------
// .
// .
// .
// ------
// Alternate State - >>| B0 |
// (I3) ------
// | B1 |
// ------
// .
// .
// .
// ------
// | B7 |
// ------ Circular Buffer Length = 64
// (L1,L2,L3)
//
// 5. For the traceback portion of the benchmark, the C-code has been re-written to
// better match our implementation. The following routine
// provides the same output as the traceback C-code.
//
// void traceback16(short *trans, short *OUTaddr, int samples)
// {
// first state
// int i;
// int bit;
// int state=0;
//
// for (i=samples-1; i>=0; i--) {
// bit = (trans[i]>>state)&1;
// OUTaddr[i>>4] |= ((state&1)<<(i&15));
// state = state>>1 | bit << 3;
// }
// }
//
// With N=189 bits we have a total of 12 16-bit. or 6 32-bit words which
// need to be packed with the output bits.
//
// 189 = 11*16 + 13
// = 5*32 + 28
//
// Since the first two contains a total of 28-bits, we calculate these
// prior to entering into the main loop. This loop provides for the remaining 10
// 16-bit words. The inner loop calculates two bits per iteration. The outer
// loop provide for 2 16 bit words per iteration.
//
// The BITMUX instruction is used to collect the output word during
// the traceback through the 189 element history array. A special instruction
// provides for shifting the MSB of two source registers into the LSBs
// of an accumulator.
//
// BITMUX( R4 , R5, A0) (ASL);
//
// The output above is reported as 7.5 cycles times (N-1). Since the initial
// state is zero, state(0)=0, the calculation of the state(1) requires
// a masking operation and a shift. The main loop provides throughput of
// 7.5 cycles per bit with the first state being provided in 2 cycles, and
// the array alignment and store operations consuming an additional 18 cycles.
// These value included in the cycle count equation for the kernel
//
// 6. Memory usage summary:
// soft input 189*2 words
// trans array 189 words
// path cost 32 words
// decoded output 12 words
// pointer 24 bytes
//
//*************************************************************************/
#define N_bits 189
#define N_state 16
#define N_state_2 28
#define SOFTDEC_OFF 8
#define DECISION_HIST_OFF 12
#define BRANCH_METRIC_OFF 16
#define STATE1_OFF 20
#define STATE2_OFF 24
#define OUTBITS_OFF 28
.global _viterbi_1_2_189;
.section program;
_viterbi_1_2_189:
// Setup counter for outer loop and modifiers
// for adjusting pointers to initial & alternate state arrays
LINK 16;
[--SP]=(R7:4);
[--SP]=(P5:3);
SP += -16;
[FP+SOFTDEC_OFF]=R0;
[FP+DECISION_HIST_OFF]=R1;
[FP+BRANCH_METRIC_OFF]=R2;
P3 = N_bits;
M2 = N_state;
R0 = [FP+BRANCH_METRIC_OFF];
I0 = R0;
// ptr to soft input symbols (array of 2*n short)
P0 = [FP+SOFTDEC_OFF];
// Setup for initial and alternate state circular buffers
R0 = [FP+STATE1_OFF];
R1 = [FP+STATE2_OFF];
I1 = R0;
B1 = R0;
I2 = R0;
B2 = R0;
I3 = R1;
B3 = R0;
R0 = R1 - R0;
R1 = R0 + R0;
L1 = R1;
L2 = R1;
L3 = R1;
R1 = N_state;
R0 = R0 - R1;
M0 = R0;
R2 = 4;
R0 = R0 - R1;
R0 = R0 + R2;
M3 = R0;
// ptr to decision array (array of n short)
// Increment I2 to point to initial state + N_state/2
// Increment I3 to point to alternate state array.
P2 = [FP+DECISION_HIST_OFF]; // m0 = 28;
// Setup loop counters of traceback section
P5 = 24;
P4 = 6;
P2 += -2;
// the following section implements the add-compare-select portion of
// the viterbi decoder.
ViterbiACS:
// Setup outer loop for 189 iterations
// load soft decision data, precomputed branch metrics
// and initial state R0 = (D1 D0), R3.L = PM0,
// R1=(InputData[SymNo*2+1],InputData[SymNo*2])
R1 = [ P0 ++ ] || R3.L = W[ I1++];
A0 = 0 || I2+=M2 || R0 = [I0++];
I3 -= M3;
// LSETUP is moved up against the first line of the loop
LSETUP ( l$0 , l$0end ) LC0 = P3;
// apply sum-on-sign instruction, and load R3.H = PM1
// transition history is stored trans[i-1] = R7.L.
//.align 16;
l$0:
R0.H = R0.L = SIGN(R0.H) * R1.H + SIGN(R0.L) * R1.L || R2 = [I0--];
R2.H = R2.L = SIGN(R2.H) * R1.H + SIGN(R2.L) * R1.L || R3.H = W [ I2 ++ ] || W [ P2++] = R7;
R5 = R3 +|- R0 , R4 = R3 -|+ R0 || [ I3 ++ M3] = R6 || R1.H = W [ I2 ++ ];
R6 = VIT_MAX( R5 , R4 ) (ASR) || R1.L = W [ I1 ++ ];
R5 = R1 +|- R2 , R4 = R1 -|+ R2 || [ I3 ++ ] = R6 || R3.H = W [ I2 ++ ];
R6 = VIT_MAX( R5 , R4 ) (ASR) || R3.L = W [ I1 ++ ];
R5 = R3 +|- R0 , R4 = R3 -|+ R0 || [ I3 ++ ] = R6 || R1.H = W [ I2 ++ ];
R6 = VIT_MAX( R5 , R4 ) (ASR) || R1.L = W [ I1 ++ ];
R5 = R1 +|- R2 , R4 = R1 -|+ R2 || [ I3 ++ ] = R6 || R3.H = W [ I2 ++ ];
R6 = VIT_MAX( R5 , R4 ) (ASR) || R3.L = W [ I1 ++ ] ;
R4 = R3 +|- R0 , R5 = R3 -|+ R0 || [ I3 ++ ] = R6 || R1.H = W [ I2 ++ ];
R6 = VIT_MAX( R5 , R4 ) (ASR) || R1.L = W [ I1 ++ ];
R4 = R1 +|- R2 , R5 = R1 -|+ R2 || [ I3 ++ ] = R6 || R3.H = W [ I2 ++ ];
R6 = VIT_MAX( R5 , R4 ) (ASR) || R3.L = W [ I1 ++ ];
R4 = R3 +|- R0 , R5 = R3 -|+ R0 || [ I3 ++ ] = R6 || R1.H = W [ I2 ++ ];
R6 = VIT_MAX( R5 , R4 ) (ASR) || R1.L = W [ I1 ++ ];
// repeat of loop sequence with adjustment of pointers to
// alternate array
R4 = R1 +|- R2 , R5 = R1 -|+ R2 || I2 += M0 || [I3++ ] = R6;
R6 = VIT_MAX( R5 , R4 ) (ASR) || I1 += M0 || R0 = [I0++];
// extract trans[i], load R1=(InputData[SymNo*2+1],InputData[SymNo*2])
// and load R3.L = PM0
l$0end: R7.L = A0 (TFU) || R1 = [P0++] || R3.L = W[ I1++];
// store last decision history element trans[189],
// and initialize A0 to zero for begining of traceback.
W [ P2--] = R7;
// The following section implements the trace-back portion of
// the viterbi decoder.
tracebak16:
R1 = 32;
// Clear R2 and R3 - load output pointer
// [FP+STATE_OFF]is no longer used to
// contain state variables
// It will contain the inner loop counter
A1 = A0 = 0 || P0 = [FP+STATE1_OFF];
R3 = A1, R2 = A0 || P1 = [FP+OUTBITS_OFF];
// Disable circular buffer
L0 = 0;
// Set trans pointer
I0 = P2;
// Store 32 to memory
[P0] = R1;
// Low part of register R0 used for the DEPOSIT instruction
R0.L = 0x301; // position = 3 and length = 1
// The first time the inner loop will be called, it will
// execute 24 times
LSETUP ( l$0b , l$0be ) LC0 = P4;
l$0b: LSETUP ( l$1b , l$1be ) LC1 = P5;
l$1b:
// Negate the state, load trans[i] and update inner loop counter
R7.L = R3.H - R3.L (NS) || R0.H = W [ I0 --] || P5 = [P0];
// trans[i] >> state
R0.H = LSHIFT R0.H BY R7.L;
// state = state>>1 | bit << 3;
R3.L = R3.L >> 1;
R3 = DEPOSIT(R3,R0);
// OUTaddr[i>>4] |= ((state&1)<<(i&15));
CC = BITTST(R0,16);
l$1be: R2 = ROT R2 BY 1;
l$0be: [P1++ ] = R2;
// Disable circular buffers
L1 = 0;
L2 = 0;
L3 = 0;
//benchmark_end:
SP += 16;
P0=[FP+4];
(P5:3)=[SP++];
(R7:4)=[SP++];
UNLINK;
JUMP (P0);
_viterbi_1_2_189.end:
?? 快捷鍵說明
復(fù)制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -