?? fdct_sse2_skal.asm
字號:
;-----------------------------------------------------------------------------
; (continued) Tail of a macro whose %macro line lies before this excerpt.
; NOTE(review): presumably iLLM_PASS -- the functions below invoke it with a
; single src/dst argument, matching the lone %1 used here. Confirm against
; the preceding part of the file.
; Only the line structure is restored; the instruction stream is unchanged.
;-----------------------------------------------------------------------------

  movdqa xmm3, [sqrt2]
  movdqa xmm7, xmm4
  movdqa xmm6, xmm2
  psubsw xmm4, xmm1             ; tp17-tp35 = t1
  psubsw xmm2, xmm0             ; tm17-tm35 = b3
  paddsw xmm1, xmm7             ; tp17+tp35 = b0
  paddsw xmm0, xmm6             ; tm17+tm35 = t2

    ; xmm1 = b0, xmm2 = b3. preserved

  movdqa xmm6, xmm4
  psubsw xmm4, xmm0             ; t1-t2
  paddsw xmm0, xmm6             ; t1+t2

  pmulhw xmm4, xmm3             ; (t1-t2)/(2.sqrt2)
  pmulhw xmm0, xmm3             ; (t1+t2)/(2.sqrt2)

  paddsw xmm0, xmm0             ; 2.(t1+t2) = b1
  paddsw xmm4, xmm4             ; 2.(t1-t2) = b2

  movdqa xmm7, [tan2]           ; t2
  movdqa xmm3, [%1+2*16]        ; x2
  movdqa xmm6, [%1+6*16]        ; x6
  movdqa xmm5, xmm7             ; t2

  pmulhw xmm7, xmm6             ; x6*t2
  pmulhw xmm5, xmm3             ; x2*t2

  paddsw xmm7, xmm3             ; x2+x6*t2 = tp26
  psubsw xmm5, xmm6             ; x2*t2-x6 = tm26

    ; use:xmm3,xmm5,xmm6,xmm7   frozen: xmm0,xmm4,xmm1,xmm2

  movdqa xmm3, [%1+0*16]        ; x0
  movdqa xmm6, [%1+4*16]        ; x4

  psubsw xmm3, xmm6             ; x0-x4 = tm04
  paddsw xmm6, xmm6             ; 2.x4
  paddsw xmm6, xmm3             ; x0+x4 = tp04

  psubsw xmm3, xmm5             ; tm04-tm26 = a2
  psubsw xmm6, xmm7             ; tp04-tp26 = a3
  paddsw xmm5, xmm5             ; 2.tm26
  paddsw xmm7, xmm7             ; 2.tp26
  paddsw xmm5, xmm3             ; tm04+tm26 = a1
  paddsw xmm7, xmm6             ; tp04+tp26 = a0

  psubsw xmm5, xmm0             ; a1-b1
  psubsw xmm3, xmm4             ; a2-b2
  paddsw xmm0, xmm0             ; 2.b1
  paddsw xmm4, xmm4             ; 2.b2
  paddsw xmm0, xmm5             ; a1+b1
  paddsw xmm4, xmm3             ; a2+b2

  psraw  xmm5, 6                ; out6
  psraw  xmm3, 6                ; out5
  psraw  xmm0, 6                ; out1
  psraw  xmm4, 6                ; out2

  movdqa [%1+6*16], xmm5
  movdqa [%1+5*16], xmm3
  movdqa [%1+1*16], xmm0
  movdqa [%1+2*16], xmm4

    ; reminder: xmm1=b0, xmm2=b3, xmm7=a0, xmm6=a3

  movdqa xmm0, xmm7
  movdqa xmm4, xmm6
  psubsw xmm7, xmm1             ; a0-b0
  psubsw xmm6, xmm2             ; a3-b3
  paddsw xmm1, xmm0             ; a0+b0
  paddsw xmm2, xmm4             ; a3+b3

  psraw  xmm1, 6                ; out0
  psraw  xmm7, 6                ; out7
  psraw  xmm2, 6                ; out3
  psraw  xmm6, 6                ; out4

  movdqa [%1+0*16], xmm1
  movdqa [%1+3*16], xmm2
  movdqa [%1+4*16], xmm6
  movdqa [%1+7*16], xmm7
%endmacro

;-----------------------------------------------------------------------------
; Function idct (the straightforward version)
;
; In:    [esp+4] = pointer to the coefficient block, transformed in place.
;        The block is accessed with movdqa at offsets 0*16 .. 7*16, so the
;        pointer must be 16-byte aligned and cover at least 128 bytes.
; Flow:  iMTX_MULT is applied to rows 0..7 (each with its own table and
;        rounder), then iLLM_PASS is run once over the whole block.
; Uses:  ecx, xmm0-xmm7 (plus whatever iMTX_MULT touches; its definition is
;        not part of this excerpt).
; NOTE(review): the argument is read from the stack, which looks like a
; 32-bit cdecl-style entry point -- confirm against the C prototype.
;-----------------------------------------------------------------------------

ALIGN 16
idct_sse2_skal:
  mov ecx, [esp+4]              ; ecx = block pointer (first stack argument)

  iMTX_MULT 0, iTab1, Idct_Rnd0, 11
  iMTX_MULT 1, iTab2, Idct_Rnd1, 11
  iMTX_MULT 2, iTab3, Idct_Rnd2, 11
  iMTX_MULT 3, iTab4, Idct_Rnd3, 11
  iMTX_MULT 4, iTab1, Idct_Rnd4, 11
  iMTX_MULT 5, iTab4, Idct_Rnd5, 11
  iMTX_MULT 6, iTab3, Idct_Rnd6, 11
  iMTX_MULT 7, iTab2, Idct_Rnd7, 11

  iLLM_PASS ecx+0
  ret

;-----------------------------------------------------------------------------
; Helper macro TEST_ROW (test a null row)
;
; %1 = address expression of a 16-byte row, %2 = label to jump to.
; ORs together the four dwords at %1, %1+4, %1+8 and %1+12 and takes a near
; jump to %2 when the result is zero, i.e. when all 16 bytes are zero.
; Clobbers: eax, edx, flags.
;-----------------------------------------------------------------------------

%macro TEST_ROW 2               ; %1:src, %2:label x8
  mov eax, [%1   ]
  mov edx, [%1+ 8]
  or  eax, [%1+ 4]
  or  edx, [%1+12]
  or  eax, edx                  ; ZF set <=> the whole row is zero
  jz near %2
%endmacro

;-----------------------------------------------------------------------------
; Function idct (this one skips null rows)
;
; In:    [esp+4] = pointer to the coefficient block, transformed in place
;        (same alignment/size requirements as idct_sse2_skal, because the
;        same iLLM_PASS is run at the end).
; Flow:  every 16-byte row is tested with TEST_ROW before its iMTX_MULT.
;          rows 0..2: a null row is overwritten with the 8-byte constant
;                     Idct_Sparse_RndN, stored twice to fill the row;
;          rows 3..7: a null row is simply skipped and left untouched.
;        iLLM_PASS then runs unconditionally over the whole block.
; Uses:  eax, edx, ecx, mm0, xmm0-xmm7, flags (plus whatever iMTX_MULT
;        touches; its definition is not part of this excerpt).
; NOTE(review): mm0 is written on the null-row paths and no emms is issued
; in this function -- presumably the caller restores the x87/MMX state;
; confirm.
;-----------------------------------------------------------------------------

ALIGN 16
idct_sse2_sparse_skal:
  mov ecx, [esp+ 4]             ; Src

  TEST_ROW ecx, .Row0_Round
  iMTX_MULT 0, iTab1, Idct_Rnd0, 11
  jmp .Row1
.Row0_Round:
  movq mm0, [Idct_Sparse_Rnd0]
  movq [ecx  ], mm0
  movq [ecx+8], mm0

.Row1:
  TEST_ROW ecx+16, .Row1_Round
  iMTX_MULT 1, iTab2, Idct_Rnd1, 11
  jmp .Row2
.Row1_Round:
  movq mm0, [Idct_Sparse_Rnd1]
  movq [ecx+16  ], mm0
  movq [ecx+16+8], mm0

.Row2:
  TEST_ROW ecx+32, .Row2_Round
  iMTX_MULT 2, iTab3, Idct_Rnd2, 11
  jmp .Row3
.Row2_Round:
  movq mm0, [Idct_Sparse_Rnd2]
  movq [ecx+32  ], mm0
  movq [ecx+32+8], mm0

.Row3:
  TEST_ROW ecx+48, .Row4
  iMTX_MULT 3, iTab4, Idct_Rnd3, 11
  jmp .Row4                     ; targets the very next label (kept as in the original)

.Row4:
  TEST_ROW ecx+64, .Row5
  iMTX_MULT 4, iTab1, Idct_Rnd4, 11
  jmp .Row5                     ; targets the very next label (kept as in the original)

.Row5:
  TEST_ROW ecx+80, .Row6
  iMTX_MULT 5, iTab4, Idct_Rnd5, 11

.Row6:
  TEST_ROW ecx+96, .Row7
  iMTX_MULT 6, iTab3, Idct_Rnd6, 11

.Row7:
  TEST_ROW ecx+112, .End
  iMTX_MULT 7, iTab2, Idct_Rnd7, 11
.End:

  iLLM_PASS ecx+0
  ret

;-----------------------------------------------------------------------------
; Helper macro fLLM_PASS
;
; %1 = address expression of the block (eight 16-byte rows, read and written
;      in place with movdqa, so it must be 16-byte aligned).
; %2 = left-shift count applied to the intermediate terms; t5 and t6 are
;      shifted by %2+1 to compensate the halved cos4 factor.
; All arithmetic is word-wise and saturating (paddsw/psubsw), so the eight
; 16-bit lanes of each row are processed in parallel.
; Reads the constants tan1, tan2, tan3, sqrt2 and Rounder1 (defined elsewhere).
; Clobbers: xmm0-xmm7.
;-----------------------------------------------------------------------------

%macro fLLM_PASS 2              ; %1: src/dst, %2:Shift

  movdqa xmm0, [%1+0*16]        ; In0
  movdqa xmm2, [%1+2*16]        ; In2
  movdqa xmm3, xmm0
  movdqa xmm4, xmm2
  movdqa xmm7, [%1+7*16]        ; In7
  movdqa xmm5, [%1+5*16]        ; In5

  psubsw xmm0, xmm7             ; t7 = In0-In7
  paddsw xmm7, xmm3             ; t0 = In0+In7
  psubsw xmm2, xmm5             ; t5 = In2-In5
  paddsw xmm5, xmm4             ; t2 = In2+In5

  movdqa xmm3, [%1+3*16]        ; In3
  movdqa xmm4, [%1+4*16]        ; In4
  movdqa xmm1, xmm3
  psubsw xmm3, xmm4             ; t4 = In3-In4
  paddsw xmm4, xmm1             ; t3 = In3+In4
  movdqa xmm6, [%1+6*16]        ; In6
  movdqa xmm1, [%1+1*16]        ; In1
  psubsw xmm1, xmm6             ; t6 = In1-In6
  paddsw xmm6, [%1+1*16]        ; t1 = In1+In6

  psubsw xmm7, xmm4             ; tm03 = t0-t3
  psubsw xmm6, xmm5             ; tm12 = t1-t2
  paddsw xmm4, xmm4             ; 2.t3
  paddsw xmm5, xmm5             ; 2.t2
  paddsw xmm4, xmm7             ; tp03 = t0+t3
  paddsw xmm5, xmm6             ; tp12 = t1+t2

  psllw  xmm2, %2+1             ; shift t5 (shift +1 to..
  psllw  xmm1, %2+1             ; shift t6  ..compensate cos4/2)
  psllw  xmm4, %2               ; shift t3
  psllw  xmm5, %2               ; shift t2
  psllw  xmm7, %2               ; shift t0
  psllw  xmm6, %2               ; shift t1
  psllw  xmm3, %2               ; shift t4
  psllw  xmm0, %2               ; shift t7

  psubsw xmm4, xmm5             ; out4 = tp03-tp12
  psubsw xmm1, xmm2             ; xmm1: t6-t5
  paddsw xmm5, xmm5
  paddsw xmm2, xmm2
  paddsw xmm5, xmm4             ; out0 = tp03+tp12
  movdqa [%1+4*16], xmm4        ; => out4
  paddsw xmm2, xmm1             ; xmm2: t6+t5
  movdqa [%1+0*16], xmm5        ; => out0

  movdqa xmm4, [tan2]           ; xmm4 <= tan2
  pmulhw xmm4, xmm7             ; tm03*tan2
  movdqa xmm5, [tan2]           ; xmm5 <= tan2
  psubsw xmm4, xmm6             ; out6 = tm03*tan2 - tm12
  pmulhw xmm5, xmm6             ; tm12*tan2
  paddsw xmm5, xmm7             ; out2 = tm12*tan2 + tm03

  movdqa xmm6, [sqrt2]
  movdqa xmm7, [Rounder1]

  pmulhw xmm2, xmm6             ; xmm2: tp65 = (t6 + t5)*cos4
  por    xmm5, xmm7             ; correct out2
  por    xmm4, xmm7             ; correct out6
  pmulhw xmm1, xmm6             ; xmm1: tm65 = (t6 - t5)*cos4
  por    xmm2, xmm7             ; correct tp65

  movdqa [%1+2*16], xmm5        ; => out2
  movdqa xmm5, xmm3             ; save t4
  movdqa [%1+6*16], xmm4        ; => out6
  movdqa xmm4, xmm0             ; save t7

  psubsw xmm3, xmm1             ; xmm3: tm465 = t4 - tm65
  psubsw xmm0, xmm2             ; xmm0: tm765 = t7 - tp65
  paddsw xmm2, xmm4             ; xmm2: tp765 = t7 + tp65
  paddsw xmm1, xmm5             ; xmm1: tp465 = t4 + tm65

  movdqa xmm4, [tan3]           ; tan3 - 1
  movdqa xmm5, [tan1]           ; tan1

  movdqa xmm7, xmm3             ; save tm465
  pmulhw xmm3, xmm4             ; tm465*(tan3-1)
  movdqa xmm6, xmm1             ; save tp465
  pmulhw xmm1, xmm5             ; tp465*tan1

  paddsw xmm3, xmm7             ; tm465*tan3
  pmulhw xmm4, xmm0             ; tm765*(tan3-1)
  paddsw xmm4, xmm0             ; tm765*tan3
  pmulhw xmm5, xmm2             ; tp765*tan1

  paddsw xmm1, xmm2             ; out1 = tp765 + tp465*tan1
  psubsw xmm0, xmm3             ; out3 = tm765 - tm465*tan3
  paddsw xmm7, xmm4             ; out5 = tm465 + tm765*tan3
  psubsw xmm5, xmm6             ; out7 =-tp465 + tp765*tan1

  movdqa [%1+1*16], xmm1        ; => out1
  movdqa [%1+3*16], xmm0        ; => out3
  movdqa [%1+5*16], xmm7        ; => out5
  movdqa [%1+7*16], xmm5        ; => out7

%endmacro

;-----------------------------------------------------------------------------
; Helper macro fMTX_MULT
;
; %1 = row index (the row lives at [ecx+%1*16]), %2 = coefficient table
; (four 16-byte entries at %2+0, +16, +32, +48), %3 = 16-byte rounder.
; Transforms one 16-byte row in place: builds sums/differences of mirrored
; words, multiplies them against the table with pmaddwd, keeps the high
; 16 bits of each 32-bit sum, packs with signed saturation, adds the
; rounder and shifts right arithmetically by 4.
; Expects ecx to hold the 16-byte aligned block pointer.
; Clobbers: xmm0-xmm3.
;-----------------------------------------------------------------------------

%macro fMTX_MULT 3              ; %1=src, %2 = Coeffs, %3=rounders

  movdqa    xmm0, [ecx+%1*16+0]     ; xmm0 = [0123][4567]
  pshufhw   xmm1, xmm0, 00011011b   ; xmm1 = [----][7654]
  pshufd    xmm0, xmm0, 01000100b
  pshufd    xmm1, xmm1, 11101110b

  movdqa    xmm2, xmm0
  paddsw    xmm0, xmm1              ; xmm0 = [a0 a1 a2 a3]
  psubsw    xmm2, xmm1              ; xmm2 = [b0 b1 b2 b3]

  punpckldq xmm0, xmm2              ; xmm0 = [a0 a1 b0 b1][a2 a3 b2 b3]
  pshufd    xmm2, xmm0, 01001110b   ; xmm2 = [a2 a3 b2 b3][a0 a1 b0 b1]

    ;  [M00 M01    M16 M17] [M06 M07    M22 M23]  x xmm0 = [0 /1 /2'/3']
    ;  [M02 M03    M18 M19] [M04 M05    M20 M21]  x xmm2 = [0'/1'/2 /3 ]
    ;  [M08 M09    M24 M25] [M14 M15    M30 M31]  x xmm0 = [4 /5 /6'/7']
    ;  [M10 M11    M26 M27] [M12 M13    M28 M29]  x xmm2 = [4'/5'/6 /7 ]

  movdqa    xmm1, [%2+16]
  movdqa    xmm3, [%2+32]
  pmaddwd   xmm1, xmm2
  pmaddwd   xmm3, xmm0
  pmaddwd   xmm2, [%2+48]
  pmaddwd   xmm0, [%2+ 0]

  paddd     xmm0, xmm1              ; [ out0 | out1 ][ out2 | out3 ]
  paddd     xmm2, xmm3              ; [ out4 | out5 ][ out6 | out7 ]
  psrad     xmm0, 16
  psrad     xmm2, 16

  packssdw  xmm0, xmm2              ; [ out0 .. out7 ]
  paddsw    xmm0, [%3]              ; Round

  psraw     xmm0, 4                 ; => [-2048, 2047]

  movdqa    [ecx+%1*16+0], xmm0
%endmacro

;-----------------------------------------------------------------------------
; Function Forward DCT
;
; In:    [esp+4] = pointer to the block, transformed in place. It is
;        accessed with movdqa at offsets 0*16 .. 7*16, so the pointer must
;        be 16-byte aligned and cover at least 128 bytes.
; Flow:  fLLM_PASS runs once over the whole block with a shift of 3, then
;        fMTX_MULT is applied to each of the eight rows with the table and
;        rounder listed below (the rounder assignment is reproduced exactly
;        as found; Fdct_Rnd* are defined elsewhere).
; Uses:  ecx, xmm0-xmm7.
; NOTE(review): the argument is read from the stack, which looks like a
; 32-bit cdecl-style entry point -- confirm against the C prototype.
;-----------------------------------------------------------------------------

ALIGN 16
fdct_sse2_skal:
  mov ecx, [esp+4]              ; ecx = block pointer (first stack argument)

  fLLM_PASS ecx+0, 3

  fMTX_MULT 0, fTab1, Fdct_Rnd0
  fMTX_MULT 1, fTab2, Fdct_Rnd2
  fMTX_MULT 2, fTab3, Fdct_Rnd1
  fMTX_MULT 3, fTab4, Fdct_Rnd1
  fMTX_MULT 4, fTab1, Fdct_Rnd0
  fMTX_MULT 5, fTab4, Fdct_Rnd1
  fMTX_MULT 6, fTab3, Fdct_Rnd1
  fMTX_MULT 7, fTab2, Fdct_Rnd1
  ret
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -