?? pixel-sse2.asm
字號:
; NOTE(review): the instructions up to the first %endmacro are the tail of a
; macro whose "%macro" header lies before this part of the file (presumably
; SSD_INC_2x16P_SSE2, which the SSD functions below invoke -- confirm).
; They widen the bytes in xmm1..xmm4 to words against xmm7, square and
; pair-sum them with pmaddwd, step both pointers down two rows and add the
; partial sums into xmm0.
    movdqa      xmm4, xmm3
    punpcklbw   xmm1, xmm7
    punpckhbw   xmm2, xmm7
    punpcklbw   xmm3, xmm7
    punpckhbw   xmm4, xmm7
    pmaddwd     xmm1, xmm1
    pmaddwd     xmm2, xmm2
    pmaddwd     xmm3, xmm3
    pmaddwd     xmm4, xmm4

    lea         eax, [eax+2*ebx]        ; pix1 += 2*stride1
    lea         ecx, [ecx+2*edx]        ; pix2 += 2*stride2

    paddd       xmm1, xmm2
    paddd       xmm3, xmm4
    paddd       xmm0, xmm1
    paddd       xmm0, xmm3
%endmacro

;;; Common SSD prologue: saves ebx, loads the four stack arguments and clears
;;; xmm7 (zero for unpacking) and xmm0 (accumulator).
;;; The offsets already include the 4 bytes taken by "push ebx".
%macro SSD_START_SSE2 0
    push        ebx

    mov         eax, [esp+ 8]           ; pix1
    mov         ebx, [esp+12]           ; stride1
    mov         ecx, [esp+16]           ; pix2
    mov         edx, [esp+20]           ; stride2

    pxor        xmm7, xmm7              ; zero
    pxor        xmm0, xmm0              ; xmm0 holds the sum
%endmacro

;;; Common SSD epilogue: folds the four dwords of xmm0 into its low dword,
;;; returns that value in eax, restores ebx and returns to the caller.
;;; destroys xmm1
%macro SSD_END_SSE2 0
    movdqa      xmm1, xmm0
    psrldq      xmm1, 8
    paddd       xmm0, xmm1              ; dwords 0+2 and 1+3

    movdqa      xmm1, xmm0
    psrldq      xmm1, 4
    paddd       xmm0, xmm1              ; low dword = total

    movd        eax, xmm0

    pop         ebx
    ret
%endmacro

ALIGN 16
;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
; Each SSD_INC_2x16P_SSE2 invocation advances both pointers by two rows,
; so 8 repetitions cover 16 rows.
x264_pixel_ssd_16x16_sse2:
    SSD_START_SSE2
%rep 8
    SSD_INC_2x16P_SSE2
%endrep
    SSD_END_SSE2

ALIGN 16
;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_ssd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
; Same as the 16x16 version with 4 repetitions (8 rows).
x264_pixel_ssd_16x8_sse2:
    SSD_START_SSE2
%rep 4
    SSD_INC_2x16P_SSE2
%endrep
    SSD_END_SSE2

; %1=(row2, row0) %2=(row3, row1) %3=junk
; output in %1=(row3, row0) and %3=(row2, row1)
; %2 is overwritten as scratch.
%macro HADAMARD4x4_SSE2 3
    movdqa      %3, %1
    paddw       %1, %2
    psubw       %3, %2
    movdqa      %2, %1
    punpcklqdq  %1, %3
    punpckhqdq  %2, %3
    movdqa      %3, %1
    paddw       %1, %2
    psubw       %3, %2
%endmacro

;;; two HADAMARD4x4_SSE2 running side-by-side
;;; (the same steps as above, interleaved for two independent register sets)
%macro HADAMARD4x4_TWO_SSE2 6 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6)
    movdqa      %3, %1
    movdqa      %6, %4
    paddw       %1, %2
    paddw       %4, %5
    psubw       %3, %2
    psubw       %6, %5
    movdqa      %2, %1
    movdqa      %5, %4
    punpcklqdq  %1, %3
    punpcklqdq  %4, %6
    punpckhqdq  %2, %3
    punpckhqdq  %5, %6
    movdqa      %3, %1
    movdqa      %6, %4
    paddw       %1, %2
    paddw       %4, %5
    psubw       %3, %2
    psubw       %6, %5
%endmacro

;;; %1=(row3, row0) %2=(row2, row1) %3=junk, output in %1 and %2
%macro TRANSPOSE4x4_TWIST_SSE2 3
    movdqa      %3, %1
    punpcklwd   %1, %2
    punpckhwd   %2, %3                  ; backwards because the high quadwords are already swapped

    movdqa      %3, %1
    punpckldq   %1, %2
    punpckhdq   %3, %2

    movdqa      %2, %1
    punpcklqdq  %1, %3
    punpckhqdq  %2, %3
%endmacro

;;; two TRANSPOSE4x4_TWIST_SSE2 running side-by-side
%macro TRANSPOSE4x4_TWIST_TWO_SSE2 6 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6)
    movdqa      %3, %1
    movdqa      %6, %4
    punpcklwd   %1, %2
    punpcklwd   %4, %5
    punpckhwd   %2, %3                  ; operands reversed, as in the single version
    punpckhwd   %5, %6
    movdqa      %3, %1
    movdqa      %6, %4
    punpckldq   %1, %2
    punpckldq   %4, %5
    punpckhdq   %3, %2
    punpckhdq   %6, %5
    movdqa      %2, %1
    movdqa      %5, %4
    punpcklqdq  %1, %3
    punpcklqdq  %4, %6
    punpckhqdq  %2, %3
    punpckhqdq  %5, %6
%endmacro

;;; loads the difference of two 4x4 blocks into xmm0,xmm1 and xmm4,xmm5 in interleaved-row order
;;; destroys xmm2, 3
;;; the value in xmm7 doesn't matter: it's only subtracted from itself
;;; Reads 8 bytes from each of 4 rows of pix1 (eax, stride ebx) and pix2
;;; (ecx, stride edx) and leaves both pointers advanced by 4 rows.
%macro LOAD4x8_DIFF_SSE2 0
    ; row 0
    movq        xmm0, [eax]
    movq        xmm4, [ecx]
    punpcklbw   xmm0, xmm7
    punpcklbw   xmm4, xmm7
    psubw       xmm0, xmm4

    ; row 1
    movq        xmm1, [eax+ebx]
    movq        xmm5, [ecx+edx]
    lea         eax, [eax+2*ebx]
    lea         ecx, [ecx+2*edx]
    punpcklbw   xmm1, xmm7
    punpcklbw   xmm5, xmm7
    psubw       xmm1, xmm5

    ; row 2
    movq        xmm2, [eax]
    movq        xmm4, [ecx]
    punpcklbw   xmm2, xmm7
    punpcklbw   xmm4, xmm7
    psubw       xmm2, xmm4
    movdqa      xmm4, xmm0
    punpcklqdq  xmm0, xmm2              ; rows 0 and 2
    punpckhqdq  xmm4, xmm2              ; next 4x4 rows 0 and 2

    ; row 3
    movq        xmm3, [eax+ebx]
    movq        xmm5, [ecx+edx]
    lea         eax, [eax+2*ebx]
    lea         ecx, [ecx+2*edx]
    punpcklbw   xmm3, xmm7
    punpcklbw   xmm5, xmm7
    psubw       xmm3, xmm5
    movdqa      xmm5, xmm1
    punpcklqdq  xmm1, xmm3              ; rows 1 and 3
    punpckhqdq  xmm5, xmm3              ; next 4x4 rows 1 and 3
%endmacro

;;; Adds the absolute values of the words in %1 and %2 to %4.
;;; abs(x) is formed as max(x, 0-x); %1 and %2 are left holding the absolute
;;; values, %3 is scratch. The adds are unsigned-saturating (paddusw).
%macro SUM4x4_SSE2 4    ; 02 13 junk sum
    pxor        %3, %3
    psubw       %3, %1
    pmaxsw      %1, %3

    pxor        %3, %3
    psubw       %3, %2
    pmaxsw      %2, %3

    paddusw     %4, %1
    paddusw     %4, %2
%endmacro

;;; two SUM4x4_SSE2 running side-by-side
;;; Unlike the single version, each pair is first added together (%1+=%2,
;;; %4+=%5) before being added to the sum register %7.
%macro SUM4x4_TWO_SSE2 7 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6) sum
    pxor        %3, %3
    pxor        %6, %6
    psubw       %3, %1
    psubw       %6, %4
    pmaxsw      %1, %3
    pmaxsw      %4, %6
    pxor        %3, %3
    pxor        %6, %6
    psubw       %3, %2
    psubw       %6, %5
    pmaxsw      %2, %3
    pmaxsw      %5, %6
    paddusw     %1, %2
    paddusw     %4, %5
    paddusw     %7, %1
    paddusw     %7, %4
%endmacro

;;; Reduces the word sums in %1 to a single value returned in eax.
;;; %2 is scratch; ebx is overwritten by picgetgot.
;;; NOTE(review): picgetgot, pd_0000ffff and GOT_ebx are defined outside this
;;; part of the file; pd_0000ffff presumably keeps the low word of each dword
;;; -- confirm against its definition.
%macro SUM_MM_SSE2 2    ; sum junk
    ; ebx is no longer used at this point, so no push needed
    picgetgot ebx
    ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
    psrlw       %1, 1
    movdqa      %2, %1
    psrldq      %1, 2
    paddusw     %1, %2                  ; add neighbouring words
    pand        %1, [pd_0000ffff GOT_ebx]
    movdqa      %2, %1
    psrldq      %1, 4
    paddd       %1, %2
    movdqa      %2, %1
    psrldq      %1, 8
    paddd       %1, %2                  ; low dword = total
    movd        eax,%1
%endmacro

;;; Processes one 8-wide, 4-row strip (two 4x4 blocks side by side) and adds
;;; the result to xmm6. The register arguments follow where each step
;;; leaves its outputs (HADAMARD: %1/%3 and %4/%6; TRANSPOSE: %1/%2 and %4/%5).
;;; destroys xmm0-xmm5; advances eax and ecx by 4 rows
%macro SATD_TWO_SSE2 0
    LOAD4x8_DIFF_SSE2
    HADAMARD4x4_TWO_SSE2        xmm0, xmm1, xmm2, xmm4, xmm5, xmm3
    TRANSPOSE4x4_TWIST_TWO_SSE2 xmm0, xmm2, xmm1, xmm4, xmm3, xmm5
    HADAMARD4x4_TWO_SSE2        xmm0, xmm2, xmm1, xmm4, xmm3, xmm5
    SUM4x4_TWO_SSE2             xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6
%endmacro

;;; Common SATD prologue: saves ebx, loads the four stack arguments and
;;; clears the accumulator xmm6.
;;; The offsets already include the 4 bytes taken by "push ebx".
%macro SATD_START 0
    push        ebx

    mov         eax, [esp+ 8]           ; pix1
    mov         ebx, [esp+12]           ; stride1
    mov         ecx, [esp+16]           ; pix2
    mov         edx, [esp+20]           ; stride2

    pxor        xmm6, xmm6
%endmacro

;;; Common SATD epilogue: reduces xmm6 into eax (xmm7 as scratch), restores
;;; ebx and returns to the caller.
%macro SATD_END 0
    SUM_MM_SSE2 xmm6, xmm7

    pop         ebx
    ret
%endmacro

ALIGN 16
;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_satd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_16x16_sse2:
    SATD_START

    ; left 8 columns, 16 rows
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2

    ; reload the original pointers and step 8 bytes right
    mov         eax, [esp+ 8]
    mov         ecx, [esp+16]
    lea         eax, [eax+8]
    lea         ecx, [ecx+8]

    ; right 8 columns, 16 rows
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2

    SATD_END

ALIGN 16
;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_satd_8x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_8x16_sse2:
    SATD_START

    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_TWO_SSE2

    SATD_END

ALIGN 16
;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_satd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_16x8_sse2:
    SATD_START

    ; left 8 columns, 8 rows
    SATD_TWO_SSE2
    SATD_TWO_SSE2

    ; reload the original pointers and step 8 bytes right
    mov         eax, [esp+ 8]
    mov         ecx, [esp+16]
    lea         eax, [eax+8]
    lea         ecx, [ecx+8]

    ; right 8 columns, 8 rows
    SATD_TWO_SSE2
    SATD_TWO_SSE2

    SATD_END

ALIGN 16
;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_satd_8x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_8x8_sse2:
    SATD_START

    SATD_TWO_SSE2
    SATD_TWO_SSE2

    SATD_END

ALIGN 16
;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_8x4_sse2:
    SATD_START

    SATD_TWO_SSE2

    SATD_END
?? 快捷鍵說明
複製代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -