?? skl_dct_llm.cpp
字號:
/* * skl_dct_LLM.cpp * * LLM implementation of Forward Discrete Cosine Transform * designed for [0..255] input only. * * See: * Loeffler C., Ligtenberg A., and Moschytz C.S.: * Practical Fast 1D DCT Algorithm with Eleven Multiplications, * Proc. ICASSP 1989, 988-991. * * (C) 2002 Pascal Massimino * details at http://skal.planet-d.net/coding/dct.html ********************************************************/#include <math.h>#include <stdio.h>typedef short SKL_INT16;typedef long SKL_INT32;#define IN_TYPE SKL_INT16#define OUT_TYPE SKL_INT16#define TMP_TYPE SKL_INT32//////////////////////////////////////////////////////////#define SHIFTL(m,n) ((m)<<(n))#define SHIFTRH(m,n) ( ( (m)+HALF(n) ) >> (n) )#define FX(x,n) ((IN_TYPE)floor((x)*(1<<(n)) + .5))#define HALF(n) (1<<((n)-1))#define RMULT(x, y, n) ( (IN_TYPE)(((TMP_TYPE)(x)*(TMP_TYPE)(y))>>(n)) )#define PMULHW(x, y) ( (IN_TYPE)( ((TMP_TYPE)(x)*(TMP_TYPE)(y))>>16 ) )#define RMULT3(x, y, n) ( ( PMULHW(x,y) + HALF((n)-16) ) >> ((n)-16) )#define SWAP(m1,m2,tmp) (tmp)=(m1); (m1)=(m2); (m2)=(tmp)#define STORE(a, m) In[(a)*8] = (OUT_TYPE)(m);#define LOAD(m, a) (m) = (IN_TYPE) In[(a)*8]#define LOAD_BUTF(m1, m2, a, b) \ (m1) = (IN_TYPE)In[(a)*8] - (IN_TYPE)In[(b)*8]; \ (m2) = (IN_TYPE)In[(a)*8] + (IN_TYPE)In[(b)*8]#define BUTF(a, b, tmp) \ (tmp) = (a)+(b); \ (a) = (a)-(b); \ (b) = (tmp)#define BUTFSAFE(a, b, tmp) \ (tmp) = (IN_TYPE)( ((b)+(a)+1) >> 1 ); \ (a) = (IN_TYPE)( ((a)-(b)+1) >> 1 ); \ (b) = (tmp)#define ROTATE(a, b, Rot, tmp) \ (tmp) = RMULT( (a), (Rot), 15 ) - (b); \ (a) = (a) + RMULT( (b), (Rot), 15 ); \ (b) = (tmp)#define ROTATE16(a, b, Rot, tmp) \ (tmp)= PMULHW( (a), (Rot) ) - (b); \ (a) = (a) + PMULHW( (b), (Rot) ); \ (b) = (tmp)#define ROTATE15(a, b, Rot, tmp) \ (tmp)= (b) - (a) - PMULHW( (a), (Rot) ); \ (a) = (a) + (b) + PMULHW( (b), (Rot) ); \ (b) = (tmp)//////////////////////////////////////////////////////////static const IN_TYPE tan_1_16 = FX( tan(1.*M_PI/16), 15 );static const IN_TYPE tan_1_16_16b = FX( tan(1.*M_PI/16), 16 );static const IN_TYPE tan_2_16 = FX( tan(2.*M_PI/16), 15 );static const IN_TYPE tan_2_16_16b = FX( sqrt(2.0)-1., 16 );static const IN_TYPE tan_3_16 = FX( tan(3.*M_PI/16), 15 );static const IN_TYPE tan_3_16_16b = FX( tan(3.*M_PI/16)-1., 16 );static const IN_TYPE cos_4_16 = FX( cos(4.*M_PI/16), 15 );static const IN_TYPE ucos_4_16 = 0xb505; // unsignedstatic const IN_TYPE ocos_4_16 = FX( cos(4.*M_PI/16)-.5, 16 );static const IN_TYPE cos_4_16_16b = FX( cos(4.*M_PI/16)-.5, 17 );static const IN_TYPE cos3_cos1 = FX( cos(3.*M_PI/16)/cos(1.*M_PI/16), 15 );static const IN_TYPE ocos3_cos1 = FX( cos(3.*M_PI/16)/cos(1.*M_PI/16) - .5, 16 );static const double c1 = cos(1.*M_PI/16);static const double s1 = sin(1.*M_PI/16);static const double t1 = tan(1.*M_PI/16);static const double c2 = cos(2.*M_PI/16);static const double s2 = sin(2.*M_PI/16);static const double c3 = cos(3.*M_PI/16);static const double s3 = sin(3.*M_PI/16);static const double t3 = tan(3.*M_PI/16);static const double c4 = cos(4.*M_PI/16);static const double c6 = cos(6.*M_PI/16);#define PASSB 2static void DCT_FWD_PASS1(OUT_TYPE *In){ IN_TYPE mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; IN_TYPE Spill; LOAD_BUTF(mm7, mm0, 0, 7); LOAD_BUTF(mm4, mm3, 3, 4); LOAD_BUTF(mm6, mm1, 1, 6); LOAD_BUTF(mm5, mm2, 2, 5); BUTF( mm0, mm3, Spill ); BUTF( mm1, mm2, Spill ); BUTF( mm3, mm2, Spill ); mm0 = SHIFTL(mm0, PASSB); mm1 = SHIFTL(mm1, PASSB); mm2 = SHIFTL(mm2, PASSB); mm3 = SHIFTL(mm3, PASSB); STORE(0, mm2); STORE(4, mm3); // Rot6 Spill = PMULHW(mm0, tan_2_16_16b) + mm0; mm0 = mm0+mm1; mm1 = (Spill - mm0) | 1; mm0 = (Spill + mm0) | 1; STORE(2, mm0); STORE(6, mm1); // Rot1/3 mm4 = SHIFTL(mm4, PASSB+1); mm5 = SHIFTL(mm5, PASSB); mm6 = SHIFTL(mm6, PASSB); mm7 = SHIFTL(mm7, PASSB+1); BUTF( mm6, mm5, Spill ); mm6 = PMULHW( mm6, cos_4_16_16b ) + mm6; mm5 = PMULHW( mm5, cos_4_16_16b ) + mm5; mm5 |= 1; mm6 |= 1; BUTF( mm7, mm5, Spill ); BUTF( mm4, mm6, Spill ); ROTATE16( mm5, mm6, tan_1_16_16b, Spill ); ROTATE15( mm4, mm7, tan_3_16_16b, Spill ); STORE(1, mm5); STORE(3, mm7); STORE(5, mm4); STORE(7, mm6);}static void DCT_FWD_PASS2(OUT_TYPE *In){ IN_TYPE mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; IN_TYPE Spill; LOAD_BUTF(mm7, mm0, 0, 7); LOAD_BUTF(mm4, mm3, 3, 4); LOAD_BUTF(mm6, mm1, 1, 6); LOAD_BUTF(mm5, mm2, 2, 5); BUTF( mm0, mm3, Spill ); BUTF( mm1, mm2, Spill );// BUTF( mm3, mm2, Spill ); mm2 >>= 1; mm3 >>= 1; Spill = mm2; mm2 = mm3+Spill; mm3 = mm3-Spill; STORE(0, mm2); STORE(4, mm3); // Rot6 Spill = PMULHW(mm0, tan_2_16_16b) + mm0; mm0 = mm0+mm1; mm1 = (Spill - mm0) | 1; mm0 = (Spill + mm0) | 1; STORE(2, mm0); STORE(6, mm1); // Rot1/3 BUTF( mm6, mm5, Spill ); mm6 = PMULHW( mm6, ucos_4_16 ) + mm6; mm5 = PMULHW( mm5, ucos_4_16 ) + mm5; mm5 |= 1; BUTF( mm7, mm5, Spill ); BUTF( mm4, mm6, Spill ); ROTATE16( mm5, mm6, tan_1_16_16b, Spill ); ROTATE15( mm4, mm7, tan_3_16_16b, Spill ); STORE(1, mm5); STORE(3, mm7); STORE(5, mm4); STORE(7, mm6);}static IN_TYPE Final[64] = { FX(c4*c4, 15)-1, FX(c4*c1, 14), FX(c4*c6, 15), FX(c4*c3, 14), FX(c4*c4, 15)-1, FX(c4*c3, 14), FX(c4*c2, 15), FX(c4*c1, 14), FX(c1*c4, 14), FX(c1*c1, 13), FX(c1*c6, 14), FX(c1*c3, 13), FX(c1*c4, 14), FX(c1*c3, 13), FX(c1*c2, 14), FX(c1*c1, 13), FX(c6*c4, 14), FX(c6*c1, 13), FX(c6*c6, 14), FX(c6*c3, 13), FX(c6*c4, 14), FX(c6*c3, 13), FX(c6*c2, 14), FX(c6*c1, 13), FX(c3*c4, 14), FX(c3*c1, 13), FX(c3*c6, 14), FX(c3*c3, 13), FX(c3*c4, 14), FX(c3*c3, 13), FX(c3*c2, 14), FX(c3*c1, 13), FX(c4*c4, 15)-1, FX(c4*c1, 14), FX(c4*c6, 15), FX(c4*c3, 14), FX(c4*c4, 15)-1, FX(c4*c3, 14), FX(c4*c2, 15), FX(c4*c1, 14), FX(c3*c4, 14), FX(c3*c1, 13), FX(c3*c6, 14), FX(c3*c3, 13), FX(c3*c4, 14), FX(c3*c3, 13), FX(c3*c2, 14), FX(c3*c1, 13), FX(c2*c4, 14), FX(c2*c1, 13), FX(c2*c6, 14), FX(c2*c3, 13), FX(c2*c4, 14), FX(c2*c3, 13), FX(c2*c2, 14), FX(c2*c1, 13), FX(c1*c4, 14), FX(c1*c1, 13), FX(c1*c6, 14), FX(c1*c3, 13), FX(c1*c4, 14), FX(c1*c3, 13), FX(c1*c2, 14), FX(c1*c1, 13)};static void Print_Cst() { printf( "Mults dw" ); for(int k=0; k<64; ++k) { printf( " 0x%.4x,", Final[k]>>PASSB ); if ((k&7)==7) if (k!=63) printf( "\n dw"); else printf("\n"); } printf( "Halves dw" ); for(int l=0; l<64; ++l) { IN_TYPE M = Final[l]>>PASSB; IN_TYPE H = HALF(16)/M; printf( " 0x%.4x,", H ); if ((l&7)==7) if (l!=63) printf( "\n dw"); else printf("\n"); } printf( "tan_1_16_16b = 0x%x\n", tan_1_16_16b); printf( "tan_2_16_16b = 0x%x\n", tan_2_16_16b); printf( "tan_3_16_16b = 0x%x\n", tan_3_16_16b); printf( "cos_4_16_16b = 0x%x\n", cos_4_16_16b);}static void FINISH(OUT_TYPE *In){// Print_Cst(); for(int i=0; i<64; ++i) { IN_TYPE M = Final[i]>>PASSB; IN_TYPE H = HALF(16)/M; In[i] = (OUT_TYPE)PMULHW( In[i]+H, M ); }}///////////////////////////////////////////////////////////*IEEE1180-like Errors specs:Peak error: 1.0000Peak MSE: 0.1813Overall MSE: 0.0502Peak ME: 0.1813Overall ME: -0.0214*/static void TRANSPOSE( OUT_TYPE *In ){ for(int i=0; i<8; ++i) for(int j=i+1; j<8; ++j) { SKL_INT16 Tmp = In[i+8*j]; In[i+8*j] = In[j+8*i]; In[j+8*i] = Tmp; }}void Skl_Dct16_LLM( OUT_TYPE *In ){ int i; TRANSPOSE(In); for(i=0; i<8; ++i) DCT_FWD_PASS1(In+i); TRANSPOSE(In); for(i=0; i<8; ++i) DCT_FWD_PASS2(In+i); FINISH(In);}//////////////////////////////////////////////////////////// NASM source///////////////////////////////////////////////////////////*; [BITS 32];//////////////////////////////////////////////////////////////////////SECTION .dataalign 16Mults dw 0x0fff, 0x0b18, 0x08a8, 0x0968, 0x0fff, 0x0968, 0x14e7, 0x0b18, dw 0x0b18, 0x07b2, 0x0601, 0x0686, 0x0b18, 0x0686, 0x0e7f, 0x07b2, dw 0x0454, 0x0300, 0x0257, 0x028b, 0x0454, 0x028b, 0x05a8, 0x0300, dw 0x0968, 0x0686, 0x0517, 0x0587, 0x0968, 0x0587, 0x0c4a, 0x0686, dw 0x0fff, 0x0b18, 0x08a8, 0x0968, 0x0fff, 0x0968, 0x14e7, 0x0b18, dw 0x0968, 0x0686, 0x0517, 0x0587, 0x0968, 0x0587, 0x0c4a, 0x0686, dw 0x0a73, 0x073f, 0x05a8, 0x0625, 0x0a73, 0x0625, 0x0da8, 0x073f, dw 0x0b18, 0x07b2, 0x0601, 0x0686, 0x0b18, 0x0686, 0x0e7f, 0x07b2Halves dw 0x0008, 0x000b, 0x000e, 0x000d, 0x0008, 0x000d, 0x0006, 0x000b, dw 0x000b, 0x0010, 0x0015, 0x0013, 0x000b, 0x0013, 0x0008, 0x0010, dw 0x001d, 0x002a, 0x0036, 0x0032, 0x001d, 0x0032, 0x0016, 0x002a, dw 0x000d, 0x0013, 0x0019, 0x0017, 0x000d, 0x0017, 0x000a, 0x0013, dw 0x0008, 0x000b, 0x000e, 0x000d, 0x0008, 0x000d, 0x0006, 0x000b, dw 0x000d, 0x0013, 0x0019, 0x0017, 0x000d, 0x0017, 0x000a, 0x0013, dw 0x000c, 0x0011, 0x0016, 0x0014, 0x000c, 0x0014, 0x0009, 0x0011, dw 0x000b, 0x0010, 0x0015, 0x0013, 0x000b, 0x0013, 0x0008, 0x0010tan_1_16_16b times 8 dw 0x32ectan_2_16_16b times 8 dw 0x6a0atan_3_16_16b times 8 dw 0xab0eucos_4_16_16b times 8 dw 0xb505Rounder times 8 dw 0x0001Spill times 8 dw 0;//////////////////////////////////////////////////////////////////////SECTION .text
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -