?? bkfir2_flp32.asm
字號:
/* FIR.asm

32 bit floating point FIR filter.

rev1.0 PM, 9/2003
rev1.1 PM, 12/2003
rev1.2 PM, 7/2004, project passed into VDSP3.5
rev1.3 RG, 5/2005, project passed into VDSP4.0

OVERVIEW:
=========
- this program implements a floating point block FIR
- it can be compiled and run on TS101 and TS201.
- TS101 has only 2 memory blocks (section 1 and 2) where data buffers may be placed.
- TS201 has 5 memory blocks (section 1, 2, 3, 4, 5) where data buffers may be placed.
- In this program only 2 memory blocks have been used to maintain compatibility between
  TS101 and TS201
- 2 tcl files are provided, one for TS101 and one for TS201. Both of them build
  the corresponding project and save the output buffer into a file output.dat
- .align_code 4 instruction has been introduced throughout the main part of the program
  to optimize the cycle count for TS201 (On this processor they may even be discarded
  if the cycle count is not of interest). For TS101, the assembly option -align-branch-lines
  placed in the project properties tab has the same effect (On TS101 they are a must for
  IF instructions).

This program achieves high efficiency and small filter granularity
by exploiting a high level of data re-use.

Efficiency 95%
Size 80 words
M0: 75%, M1: 0%
Filter granularity: 4

Test for a 72-tap filter. In general, this code computes filters with
length a multiple of 4. Filters with different L's require zero padding.

The buffer of coefficients is accessed with a circular buffer pointer
to save one instruction (hence one cycle) for pointer update.

Outer loop unrolled 4 times, with 4 outputs computed simultaneously
in CBX and CBY. Data misaligned with respect to coefficients is
achieved by two different multiplications per line.

With this high data re-use (each data point is used in 4 multiplications),
one can either (a) decrease memory bus utilization, or (b) improve filter
granularity. This program improves filter granularity, and accesses data
mostly in single word loads.
(There is an alternate version of this program
that decreases mem utilization to 25% using quad data loads, but has filter
granularity of 8.)

Restrictions: the length of data and coefficients must be a multiple of 4.

I/O FILES USED:
===============
The data is stored in file data.asm in the
following buffers:
   buff1      -- N samples of input data, zero padded with L trailing zeros
   coeff_buff -- L coefficients
   output     -- N samples of output data computed by this program

DESCRIPTION:
============
This filter is based on computing outputs yi and yi+1 on CBY, and outputs
yi+2 and yi+3 on CBX.

The trick used here to achieve off by one asymmetry between the comp blocks is to
issue different multiplications to each one. So for each filter coefficient, one
issues 4 multiplications, where the 4 data points are rotated from one filter
coefficient to the next. Hence, for each set of 4 multiplications, one only needs to
load a single data point.

All data loads are broadcast, thereby creating the illusion that both comp blocks
have read access to the "same" register file. This way, one can simplify the movement
of data between the blocks. For instance, by loading x0 into both xR0 and yR0, then
any comp block can read x0.

Computations are mapped according to:

            CBX      CBY
           ------   ------
     r7:    yi+3     yi+1
     r6:    yi+2     yi

where r7 and r6 are the accumulation registers that hold the respective output.

This diagram shows the organization for the first few input points:

CBX high              c0  c1  c2  c3  c4  c5  ---> yi+3
CBX low           c0  c1  c2  c3  c4  c5  ---> yi+2
CBY high      c0  c1  c2  c3  c4  c5  ---> yi+1
CBY low   c0  c1  c2  c3  c4  c5  ---> yi
input     x0  x1  x2  x3  x4  x5

All the 4 computations associated with coefficient c0 are computed in 2 cycles, and
accumulated on xR7:6 and yR7:6. After this, all 4 computations of c1 are done in the next
2 cycles, and so on.

PERFORMANCE:
============
   EFFICIENCY:
   ----------
An N=200 sample input with L=72 takes 7461 cycles on TS201 (7417 on TS101),
for an efficiency of 97%.
Only one stall is present in the LC1 loop.
All the other instructions contain 2 multiplications, one in Y and the other in X blocks.
The expression is 4+N/4*[(L/4-1)*8+9]=NL/2+4+N/4=7254

MEMORY MAP:
===========

  Data buffer with                      Coeff circular buffer
  zero pad for filter init:             (negative increments):

  |----------------|                    |--------|
  |  0   0   0   0 | buff1    <-- j0    | c1  c0 | coeff_buffer
  |                |                    |     c2 |
  |  0   0   0   0 | buff1+68           |        |
  |----------------|                    |        |
  | x3  x2  x1  x0 | buff1+72           |        |
  |             x4 |                    |c71 c70 | coeff_buffer+70  <-j1
  |                |                    |--------|
  |x199       x196 |
  |----------------|

- at the end of the program, the cycle_count variable contains the
  cycle count of the main program

NOTE: in this source file the data buffer is declared as "inputs", the
coefficient buffer as "coeffs", and the coefficient pointer is k1; the
diagram above labels them buff1, coeff_buffer and j1 respectively.
*************************************************************************/

#define N       200     // number of data points in input
#define L       72      // number of filter coefficients
#define N_MAX   300     // maximum buffer size

.section data1;

// input data buffer of length N+L-1:
// L-1 leading zeros, and N data points.
.align 4;
.var inputs[N+L-1] = "input.dat";
// scratch location used in the cache loading process of the inputs buffer.
// The preload loop below touches (L+N)/4 quad words, i.e. one word more than
// inputs holds; presumably temp absorbs that extra word -- confirm against
// the linker layout.
.var temp;
.var cycle_count;       // execution cycle counts

.section data2;

// Coefficients buffer with L entries.
.align 4;
.var coeffs[L] = "coefficients.dat";
.var output[N];         // buffer used for storing calculated outputs

/************************************************************************/
#ifdef __ADSPTS201__
#include <defts201.h>
#endif
#include "cache_macros.h"

/* Main Program Section */
.section program;

/************************************** Power up code *****************************************/
main :
powerup:
#ifdef __ADSPTS201__
/*in the case of TS201, at the beginning of the program the
cache must be enabled. The procedure is contained in the
cache_enable macro that uses the refresh rate as input parameter
 -if CCLK=500MHz, refresh_rate=750
 -if CCLK=400MHz, refresh_rate=600
 -if CCLK=300MHz, refresh_rate=450
 -if CCLK=250MHz, refresh_rate=375
*/
    cache_enable(750);

//cache preload: every quad word of the inputs buffer is read and written back in place.
    j0 = j31 + inputs; LC0 = (L+N)/4;;
//due to a TS201 rev0 anomaly, the initialization of LC0 must be at least
//4 instruction lines before the end of the loop (jump instruction included)
//and the loop must be at least 2 cycles long
    nop;nop;;
.align_code 4;
ini_cache1:
    xr3:0 = q[j0+=0];;
.align_code 4;
    if NLC0E, jump ini_cache1; q[j0+=4] = xr3:0;;

//same preload for the coefficients buffer.
    j0 = j31 + coeffs; LC0 = L/4;;
//due to a TS201 rev0 anomaly, the initialization of LC0 must be at least
//4 instruction lines before the end of the loop (jump instruction included)
//and the loop must be at least 2 cycles long
    nop;nop;;
.align_code 4;
ini_cache:
    xr3:0 = q[j0+=0];;
.align_code 4;
    if NLC0E, jump ini_cache; q[j0+=4] = xr3:0;;
#endif
end_powerup:

//consider a second loop in order to eliminate the effects of starting
//with the BTB unloaded. The second iteration gives the right value
//for the cycle count
    j24 = j31 + 2;;

.align_code 4;
count_loop:
//read cycle counter
    ini_cycle_count;

/************************************** Start of code *****************************************/
//j0 points to data; LC1 counts the outer loop (N/4 passes, 4 outputs stored per pass)
    j0 = j31 + inputs; LC1 = N/4;;
//k1 points in a circular fashion to the coefficients (base kb1, length kl1),
//starting from the last coefficient pair and stepping backwards;
//LC0 counts the inner loop
    k1 = k31 + coeffs + (L-2); LC0 = L/4-1;;
    kb1 = k31 + coeffs;;
    kl1 = k31 + L;;
//j2 points to the output buffer
    j2 = j31 + output;;

//in the following, r31:28 contain the coefficients, r3:0 the input data.
//Each instruction line below is laid out in columns:
//   memory access | CBX multiply | CBY multiply | accumulate / store
//The products belong to these outputs:
//
//                       CBX:    CBY:
//   fr4 -> fr6          yi+2    yi
//   fr5 -> fr7          yi+3    yi+1

    r31:30=cb l[k1+=-2];    r3:0=q[j0+=4];;
    r0=[j0+=1];             xfr6=r2*r31;    yfr6=r0*r31;;
    r29:28=cb l[k1+=-2];    xfr7=r3*r31;    yfr7=r1*r31;;
    r1=[j0+=1];             xfr4=r3*r30;    yfr4=r1*r30;;
                            xfr5=r0*r30;    yfr5=r2*r30;;

//inner loop: 4 coefficients (8 instruction lines) per pass
.align_code 4;
loop:
    r2=[j0+=1];             xfr4=r0*r29;    yfr4=r2*r29;    fr6=r6+r4;;
    r31:30=cb l[k1+=-2];    xfr5=r1*r29;    yfr5=r3*r29;    fr7=r7+r5;;
    r3=[j0+=1];             xfr4=r1*r28;    yfr4=r3*r28;    fr6=r6+r4;;
                            xfr5=r2*r28;    yfr5=r0*r28;    fr7=r7+r5;;
    r0=[j0+=1];             xfr4=r2*r31;    yfr4=r0*r31;    fr6=r6+r4;;
    r29:28=cb l[k1+=-2];    xfr5=r3*r31;    yfr5=r1*r31;    fr7=r7+r5;;
    r1=[j0+=1];             xfr4=r3*r30;    yfr4=r1*r30;    fr6=r6+r4;;
.align_code 4;
    if NLC0E, jump loop;    xfr5=r0*r30;    yfr5=r2*r30;    fr7=r7+r5;;

//last two coefficients of the current 4 outputs; j0 is rewound by L-2 so that
//it addresses the data of the next 4 outputs
    r2=[j0+=-(L-2)];        xfr4=r0*r29;    yfr4=r2*r29;    fr6=r6+r4;;
//here the data for the next 4 filters is fetched
//NOTE: compared with the loop body, the two CBY yfr5 products (r0*r28 and
//r3*r29) are issued in the opposite order -- presumably so that r0 is
//consumed before the r1:0 load two lines below replaces it.
    r31:30=cb l[k1+=-2];    xfr5=r1*r29;    yfr5=r0*r28;    fr7=r7+r5;;
    r1:0=l[j0+=2];          xfr4=r1*r28;    yfr4=r3*r28;    fr6=r6+r4;;
    r3:2=l[j0+=2];          xfr5=r2*r28;    yfr5=r3*r29;    fr7=r7+r5;;
//here the data for the next 4 filters begins to be elaborated;
//the final sums of the current outputs go to r9:8 while r7:6 restart
//stall
    r0=[j0+=1];             xfr6=r2*r31;    yfr6=r0*r31;    fr8=r6+r4;;
    r29:28=cb l[k1+=-2];    xfr7=r3*r31;    yfr7=r1*r31;    fr9=r7+r5;;
    r1=[j0+=1]; LC0 = L/4-1; xfr4=r3*r30;   yfr4=r1*r30;;
//store now the outputs
.align_code 4;
    if NLC1E, jump loop;    xfr5=r0*r30;    yfr5=r2*r30;    q[j2+=4] = r9:8;;

/******************************************* Done ***********************************************/
//read cycle counter and compute the program's cycle count
    comp_cycle_count;
    j24 = j24 - 1;;
.align_code 4;
    if NJEQ, jump count_loop;;

main.end:
___lib_prog_term:
    nop;nop;nop;nop;;
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -