; skl_img_x86.asm
; (code-viewer page control: font size)
;/********************************************************
; * Some code. Copyright (C) 2003 by Pascal Massimino.   *
; * All Rights Reserved.      (http://skal.planet-d.net) *
; * For Educational/Academic use ONLY. See 'LICENSE.TXT'.*
; ********************************************************/
;//////////////////////////////////////////////////////////////////////
;
; Block-matching metrics (SAD = sum of absolute differences,
; SSD = sum of squared differences), MMX implementations.
;
; Syntax : NASM, 32-bit code.
; Calling convention used by every function in this section:
;   - arguments are read from the stack at [esp+4], [esp+8], [esp+12]
;   - the 32-bit result is returned in eax
;   - eax, ecx, edx and mm0-mm7 are overwritten
;   - no EMMS is executed before 'ret'
;     NOTE(review): presumably the caller is expected to issue EMMS
;     before any x87 code -- confirm against the C side.
;   - 'BpS' is added to the source pointers after each row, i.e. it is
;     the distance in bytes between two consecutive rows.
;
;//////////////////////////////////////////////////////////////////////
; [BITS 32]

%include "../../include/skl_syst/skl_nasm.h"

globl Skl_SAD_4x4_MMX
globl Skl_SAD_4x8_MMX
globl Skl_SAD_8x4_MMX
globl Skl_SAD_8x8_MMX
globl Skl_SAD_8x16_MMX
globl Skl_SAD_16x8_MMX
globl Skl_SAD_16x16_MMX
globl Skl_SAD_16x8_Field_MMX
globl Skl_SAD_16x7_Self_MMX

globl Skl_SSD_4x4_MMX
globl Skl_SSD_8x8_MMX
globl Skl_SSD_16x16_MMX
globl Skl_SSD_16x8_Field_MMX

globl Skl_SAD_4x4_SSE
globl Skl_SAD_4x8_SSE
globl Skl_SAD_8x4_SSE
globl Skl_SAD_8x8_SSE
globl Skl_SAD_8x16_SSE
globl Skl_SAD_16x8_SSE
globl Skl_SAD_16x16_SSE
globl Skl_SAD_16x8_Field_SSE
globl Skl_SAD_16x7_Self_SSE

globl Skl_SAD_Avrg_16x16_SSE
globl Skl_SAD_Avrg_16x8_SSE
globl Skl_SAD_Avrg_8x16_SSE
globl Skl_SAD_Avrg_8x8_SSE

globl Skl_Mean_16x16_MMX
globl Skl_Mean_8x8_MMX
globl Skl_Mean_4x4_MMX
globl Skl_Sqr_16x16_MMX
globl Skl_Sqr_8x8_MMX
globl Skl_Sqr_4x4_MMX

globl Skl_Mean_16x16_SSE
globl Skl_Mean_8x8_SSE
globl Skl_Mean_4x4_SSE
globl Skl_Abs_Dev_16x16_SSE
globl Skl_Sqr_Dev_16x16_SSE

globl Skl_SAD_16x16_SSE2
globl Skl_SAD_16x8_Field_SSE2
globl Skl_SAD_16x7_Self_SSE2
globl Skl_Mean_16x16_SSE2
globl Skl_Sqr_16x16_SSE2
globl Skl_Abs_Dev_16x16_SSE2

DATA

align 16
One:    times 8 dw 1            ; for summing 4 words

TEXT

;//////////////////////////////////////////////////////////////////////
; COLLAPSE_MMX
;   In : mm6 = four 16-bit partial sums (w0, w1, w2, w3)
;   Out: eax = w0 + w1 + w2 + w3
;   Clobbers mm6, mm7.
;   pmaddwd against words of 1 adds adjacent word pairs into dwords
;   (the words are treated as signed 16-bit values by pmaddwd).
;//////////////////////////////////////////////////////////////////////

%macro COLLAPSE_MMX 0
        movq      mm7, mm6
        pmaddwd   mm6, [One]            ; mm6 = { w0+w1, w2+w3 }
        psrlq     mm7, 32               ; mm7 words = { w2, w3, 0, 0 }
        pmaddwd   mm7, [One]            ; mm7 = { w2+w3, 0 }
        paddd     mm6, mm7              ; low dword = w0+w1+w2+w3
        movd      eax, mm6
%endmacro

        ; 4-pixel-wide variant: currently the same reduction.
%macro COLLAPSE_4_MMX 0
        COLLAPSE_MMX
%endmacro

;//////////////////////////////////////////////////////////////////////
;//
;// MMX impl
;//
;//////////////////////////////////////////////////////////////////////

;//////////////////////////////////////////////////////////////////////
; Shared SAD helpers
;//////////////////////////////////////////////////////////////////////

        ; Load the three stack arguments of the two-source SAD functions.
%macro SAD_ARGS_MMX 0
        mov       eax, [esp+ 4]         ; Src1
        mov       edx, [esp+ 8]         ; Src2
        mov       ecx, [esp+12]         ; BpS
%endmacro

        ; Clear the SAD accumulator (mm6) and the zero register (mm7)
        ; that the unpack instructions below interleave with.
%macro SAD_RESET_MMX 0
        pxor      mm6, mm6              ; accum
        pxor      mm7, mm7              ; zero
%endmacro

;----------------------------------------------------------------------
; SAD_PAIR_MMX  second_offset, pointer_step
;
;   Accumulates |a-b| for two 8-byte groups into the four word lanes of
;   mm6 (saturating adds), then advances both pointers:
;       group 0 : [eax]     vs [edx]
;       group 1 : [eax+%1]  vs [edx+%1]
;       eax += %2 ; edx += %2
;   |a-b| is built as (a -sat b) | (b -sat a) using unsigned saturating
;   byte subtraction.
;   In : eax, edx = source pointers, mm6 = accumulator, mm7 = 0
;   Clobbers mm0-mm5.
;----------------------------------------------------------------------
%macro SAD_PAIR_MMX 2
        movq      mm0, [eax]
        movq      mm1, [edx]
        movq      mm2, [eax+%1]
        movq      mm3, [edx+%1]
        lea       edx, [edx+%2]         ; we do our best *not* to go 16b, here
        movq      mm4, mm0              ; keep a copy of a0
        psubusb   mm0, mm1              ; a0 -sat b0
        movq      mm5, mm2              ; keep a copy of a1
        psubusb   mm2, mm3              ; a1 -sat b1
        psubusb   mm1, mm4              ; b0 -sat a0
        por       mm0, mm1              ; |a0-b0| (8 bytes)
        psubusb   mm3, mm5              ; b1 -sat a1
        por       mm2, mm3              ; |a1-b1| (8 bytes)
        movq      mm1, mm0
        punpcklbw mm0, mm7              ; widen low  4 bytes to words
        punpckhbw mm1, mm7              ; widen high 4 bytes to words
        movq      mm3, mm2
        punpcklbw mm2, mm7
        paddusw   mm0, mm1
        lea       eax, [eax+%2]
        punpckhbw mm3, mm7
        paddusw   mm6, mm0
        paddusw   mm2, mm3
        paddusw   mm6, mm2
%endmacro

;//////////////////////////////////////////////////////////////////////
; Skl_SAD_16x16_MMX
;//////////////////////////////////////////////////////////////////////

        ; One row of 16 pixels: the two groups are the left and right
        ; 8 bytes of the same row; pointers advance by one row (ecx).
%macro SAD_16x16_MMX 0
        SAD_PAIR_MMX 8, ecx
%endmacro

;----------------------------------------------------------------------
; Skl_SAD_16x16_MMX
;   In : [esp+4] = Src1, [esp+8] = Src2, [esp+12] = BpS
;   Out: eax = SAD over 16 rows of 16 bytes
;----------------------------------------------------------------------
align 16
Skl_SAD_16x16_MMX:                      ; 179c  (original author's note)
        SAD_ARGS_MMX
        SAD_RESET_MMX
%rep 16
        SAD_16x16_MMX
%endrep
        COLLAPSE_MMX
        ret

;----------------------------------------------------------------------
; Skl_SAD_16x8_MMX
;   In : [esp+4] = Src1, [esp+8] = Src2, [esp+12] = BpS
;   Out: eax = SAD over 8 rows of 16 bytes
;----------------------------------------------------------------------
align 16
Skl_SAD_16x8_MMX:
        SAD_ARGS_MMX
        SAD_RESET_MMX
%rep 8
        SAD_16x16_MMX
%endrep
        COLLAPSE_MMX
        ret

;----------------------------------------------------------------------
; Skl_SAD_16x8_Field_MMX
;   Same as Skl_SAD_16x8_MMX but the row step is doubled, so the 8 rows
;   read are every other row starting at Src1/Src2.
;   In : [esp+4] = Src1, [esp+8] = Src2, [esp+12] = BpS
;   Out: eax = SAD over 8 rows of 16 bytes, rows 2*BpS apart
;----------------------------------------------------------------------
align 16
Skl_SAD_16x8_Field_MMX:                 ; 179c  (original author's note)
        SAD_ARGS_MMX
        lea       ecx, [ecx+ecx]        ; 2.BpS
        SAD_RESET_MMX
%rep 8
        SAD_16x16_MMX
%endrep
        COLLAPSE_MMX
        ret

;//////////////////////////////////////////////////////////////////////
; Skl_SAD_16x7_Self_MMX
;//////////////////////////////////////////////////////////////////////

        ; Same per-row work as SAD_16x16_MMX (kept under its own name).
%macro SAD_16x7_MMX 0
        SAD_PAIR_MMX 8, ecx
%endmacro

;----------------------------------------------------------------------
; Skl_SAD_16x7_Self_MMX
;   SAD between each 16-byte row and the row that follows it, for 7
;   consecutive row pairs of a single source (Src2 = Src + BpS).
;   In : [esp+4] = Src, [esp+8] = BpS
;   Out: eax = accumulated SAD
;----------------------------------------------------------------------
align 16
Skl_SAD_16x7_Self_MMX:
        mov       eax, [esp+ 4]         ; Src
        mov       ecx, [esp+ 8]         ; BpS
        lea       edx, [eax+ecx]        ; Src2 = next row of Src
        SAD_RESET_MMX
%rep 7
        SAD_16x7_MMX
%endrep
        COLLAPSE_MMX
        ret

;//////////////////////////////////////////////////////////////////////
; Skl_SAD_8x8_MMX
;//////////////////////////////////////////////////////////////////////

        ; Two rows of 8 pixels per invocation: the second group is the
        ; next row ([ptr+ecx]); pointers advance by two rows.
%macro SAD_8x8_MMX 0
        SAD_PAIR_MMX ecx, 2*ecx
%endmacro

;----------------------------------------------------------------------
; Skl_SAD_8x4_MMX
;   In : [esp+4] = Src1, [esp+8] = Src2, [esp+12] = BpS
;   Out: eax = SAD over 4 rows of 8 bytes
;----------------------------------------------------------------------
align 16
Skl_SAD_8x4_MMX:
        SAD_ARGS_MMX
        SAD_RESET_MMX
%rep 2
        SAD_8x8_MMX
%endrep
        COLLAPSE_MMX
        ret

;----------------------------------------------------------------------
; Skl_SAD_8x8_MMX
;   In : [esp+4] = Src1, [esp+8] = Src2, [esp+12] = BpS
;   Out: eax = SAD over 8 rows of 8 bytes
;----------------------------------------------------------------------
align 16
Skl_SAD_8x8_MMX:                        ; 57c  (original author's note)
        SAD_ARGS_MMX
        SAD_RESET_MMX
%rep 4
        SAD_8x8_MMX
%endrep
        COLLAPSE_MMX
        ret

;----------------------------------------------------------------------
; Skl_SAD_8x16_MMX
;   In : [esp+4] = Src1, [esp+8] = Src2, [esp+12] = BpS
;   Out: eax = SAD over 16 rows of 8 bytes
;----------------------------------------------------------------------
align 16
Skl_SAD_8x16_MMX:
        SAD_ARGS_MMX
        SAD_RESET_MMX
%rep 8
        SAD_8x8_MMX
%endrep
        COLLAPSE_MMX
        ret

;//////////////////////////////////////////////////////////////////////
; Skl_SAD_4x4_MMX
;//////////////////////////////////////////////////////////////////////

;----------------------------------------------------------------------
; SAD_4x4_MMX
;   Two rows of 4 pixels per invocation (movd loads 4 bytes per row);
;   pointers advance by two rows.
;   In : eax, edx = source pointers, ecx = BpS, mm6 = accum, mm7 = 0
;   Clobbers mm0-mm5.
;----------------------------------------------------------------------
%macro SAD_4x4_MMX 0
        movd      mm0, [eax]
        movd      mm1, [edx]
        movd      mm2, [eax+ecx]
        movd      mm3, [edx+ecx]
        lea       edx, [edx+2*ecx]      ; we do our best *not* to go 16b, here
        movq      mm4, mm0
        psubusb   mm0, mm1
        movq      mm5, mm2
        psubusb   mm2, mm3
        psubusb   mm1, mm4
        por       mm0, mm1              ; |a0-b0| in the low 4 bytes
        psubusb   mm3, mm5
        por       mm2, mm3              ; |a1-b1| in the low 4 bytes
        punpcklbw mm0, mm7              ; only the low half carries data
        punpcklbw mm2, mm7
        paddusw   mm6, mm0
        lea       eax, [eax+2*ecx]
        paddusw   mm6, mm2
%endmacro

;----------------------------------------------------------------------
; Skl_SAD_4x4_MMX
;   In : [esp+4] = Src1, [esp+8] = Src2, [esp+12] = BpS
;   Out: eax = SAD over 4 rows of 4 bytes
;----------------------------------------------------------------------
align 16
Skl_SAD_4x4_MMX:                        ; 57c  (original author's note)
        SAD_ARGS_MMX
        SAD_RESET_MMX
%rep 2
        SAD_4x4_MMX
%endrep
        COLLAPSE_4_MMX
        ret

;----------------------------------------------------------------------
; Skl_SAD_4x8_MMX
;   In : [esp+4] = Src1, [esp+8] = Src2, [esp+12] = BpS
;   Out: eax = SAD over 8 rows of 4 bytes
;----------------------------------------------------------------------
align 16
Skl_SAD_4x8_MMX:
        SAD_ARGS_MMX
        SAD_RESET_MMX
%rep 4
        SAD_4x4_MMX
%endrep
        COLLAPSE_4_MMX
        ret

;//////////////////////////////////////////////////////////////////////
; Shared SSD helpers
;   Register roles differ from the SAD code:
;     eax = Src1, ecx = Src2, edx = BpS, mm7 = accumulator (two dwords),
;     mm6 = zero.
;//////////////////////////////////////////////////////////////////////

        ; Load the three stack arguments of the SSD functions.
%macro SSD_ARGS_MMX 0
        mov       eax, [esp+ 4]         ; Src1
        mov       ecx, [esp+ 8]         ; Src2
        mov       edx, [esp+12]         ; BpS
%endmacro

        ; Clear the SSD accumulator (mm7) and the zero register (mm6).
%macro SSD_RESET_MMX 0
        pxor      mm7, mm7              ; accum
        pxor      mm6, mm6              ; zero
%endmacro

;----------------------------------------------------------------------
; SSD_SQDIFF_MMX  regA, regB
;   regA / regB each hold 8 source bytes. Widens both to words,
;   subtracts, squares and pair-sums with pmaddwd, and adds the four
;   resulting dwords into mm7.
;   Clobbers regA, regB, mm4, mm5.
;----------------------------------------------------------------------
%macro SSD_SQDIFF_MMX 2
        movq      mm4, %1
        movq      mm5, %2
        punpcklbw %1, mm6               ; low  4 bytes -> words
        punpcklbw %2, mm6
        punpckhbw mm4, mm6              ; high 4 bytes -> words
        punpckhbw mm5, mm6
        psubw     %1, %2                ; signed word differences
        psubw     mm4, mm5
        pmaddwd   %1, %1                ; d*d, adjacent pairs summed
        pmaddwd   mm4, mm4
        paddd     mm7, %1
        paddd     mm7, mm4
%endmacro

;----------------------------------------------------------------------
; SSD_PAIR_MMX  second_offset, pointer_step
;   Squared-difference accumulation for two 8-byte groups:
;       group 0 : [eax]     vs [ecx]
;       group 1 : [eax+%1]  vs [ecx+%1]
;       eax += %2 ; ecx += %2
;   Clobbers mm0-mm5.
;----------------------------------------------------------------------
%macro SSD_PAIR_MMX 2
        movq      mm0, [eax]
        movq      mm1, [ecx]
        movq      mm2, [eax+%1]
        movq      mm3, [ecx+%1]
        lea       eax, [eax+%2]
        lea       ecx, [ecx+%2]
        SSD_SQDIFF_MMX mm0, mm1
        SSD_SQDIFF_MMX mm2, mm3
%endmacro

        ; Fold the two dword partial sums of mm7 into eax.
        ; Clobbers mm6, mm7.
%macro SSD_COLLAPSE_MMX 0
        movq      mm6, mm7
        psrlq     mm7, 32               ; bring the high dword down
        paddd     mm6, mm7
        movd      eax, mm6
%endmacro

;//////////////////////////////////////////////////////////////////////
; Skl_SSD_16x16_MMX
;//////////////////////////////////////////////////////////////////////

        ; One row of 16 pixels; pointers advance by one row (edx).
%macro SSD_16x16_MMX 0
        SSD_PAIR_MMX 8, edx
%endmacro

;----------------------------------------------------------------------
; Skl_SSD_16x16_MMX
;   In : [esp+4] = Src1, [esp+8] = Src2, [esp+12] = BpS
;   Out: eax = SSD over 16 rows of 16 bytes
;----------------------------------------------------------------------
align 16
Skl_SSD_16x16_MMX:
        SSD_ARGS_MMX
        SSD_RESET_MMX
%rep 16
        SSD_16x16_MMX
%endrep
        SSD_COLLAPSE_MMX
        ret

;----------------------------------------------------------------------
; Skl_SSD_16x8_Field_MMX
;   Row step is doubled, so the 8 rows read are every other row.
;   In : [esp+4] = Src1, [esp+8] = Src2, [esp+12] = BpS
;   Out: eax = SSD over 8 rows of 16 bytes, rows 2*BpS apart
;----------------------------------------------------------------------
align 16
Skl_SSD_16x8_Field_MMX:
        SSD_ARGS_MMX
        lea       edx, [edx+edx]        ; 2.BpS
        SSD_RESET_MMX
%rep 8
        SSD_16x16_MMX
%endrep
        SSD_COLLAPSE_MMX
        ret

;//////////////////////////////////////////////////////////////////////
; Skl_SSD_8x8_MMX
;//////////////////////////////////////////////////////////////////////

        ; Two rows of 8 pixels per invocation; pointers advance two rows.
%macro SSD_8x8_MMX 0
        SSD_PAIR_MMX edx, 2*edx
%endmacro

;----------------------------------------------------------------------
; Skl_SSD_8x8_MMX
;   In : [esp+4] = Src1, [esp+8] = Src2, [esp+12] = BpS
;   Out: eax = SSD over 8 rows of 8 bytes
;----------------------------------------------------------------------
align 16
Skl_SSD_8x8_MMX:
        SSD_ARGS_MMX
        SSD_RESET_MMX
%rep 4
        SSD_8x8_MMX
%endrep
        SSD_COLLAPSE_MMX
        ret

;//////////////////////////////////////////////////////////////////////
; Skl_SSD_4x4_MMX
;//////////////////////////////////////////////////////////////////////

;----------------------------------------------------------------------
; SSD_4x4_MMX
;   Two rows of 4 pixels ([ptr] and [ptr+edx]) per invocation.
;   Unlike the wider macros this one does NOT advance eax/ecx; the
;   caller has to step the pointers between invocations.
;   In : eax, ecx = source pointers, edx = BpS, mm7 = accum, mm6 = 0
;   Clobbers mm0-mm3.
;----------------------------------------------------------------------
%macro SSD_4x4_MMX 0
        movd      mm0, [eax]
        movd      mm1, [ecx]
        movd      mm2, [eax+edx]
        movd      mm3, [ecx+edx]
        punpcklbw mm0, mm6
        punpcklbw mm1, mm6
        punpcklbw mm2, mm6
        punpcklbw mm3, mm6
        psubw     mm0, mm1
        psubw     mm2, mm3
        pmaddwd   mm0, mm0
        pmaddwd   mm2, mm2
        paddd     mm7, mm0
        paddd     mm7, mm2
%endmacro

align 16
Skl_SSD_4x4_MMX:
        SSD_ARGS_MMX
        SSD_RESET_MMX
        ; NOTE(review): the remainder of Skl_SSD_4x4_MMX (row processing,
        ; reduction and 'ret') is not present in this excerpt.
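; ---------------------------------------------------------------------
; Keyboard shortcut help (appears to be code-viewer page text, not part
; of the assembly source):
;   Copy code ............. Ctrl + C
;   Search code ........... Ctrl + F
;   Full-screen mode ...... F11
;   Switch theme .......... Ctrl + Shift + D
;   Show shortcuts ........ ?
;   Increase font size .... Ctrl + =
;   Decrease font size .... Ctrl + -
; ---------------------------------------------------------------------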