?? skl_mb_mmx.asm
字號:
;/********************************************************
; * Some code. Copyright (C) 2003 by Pascal Massimino.   *
; * All Rights Reserved.      (http://skal.planet-d.net) *
; * For Educational/Academic use ONLY. See 'LICENSE.TXT'.*
; ********************************************************/
;//////////////////////////////////////////////////////////////////////
;// Macro-block processing
;//////////////////////////////////////////////////////////////////////
;
; Syntax : NASM (Intel operand order), 32-bit code.
; Args   : every entry point below reads three stack arguments through
;          PROLOG0:  [esp+4] = Dst, [esp+8] = Src, [esp+12] = BpS.
;          NOTE(review): presumably the C prototype is
;            void f(uint8_t *Dst, const uint8_t *Src, int BpS)
;          with BpS = bytes per scan-line -- confirm against the header.
; Clobb  : eax, ecx, edx and mm0-mm7 (all caller-saved in the usual
;          32-bit C conventions).  None of the routines in this part of
;          the file executes `emms`; presumably the caller does -- verify.
;
; [BITS 32]

%include "../../include/skl_syst/skl_nasm.h"

globl Skl_Add_8x4_FF_MMX
globl Skl_Add_8x4_FH_Rnd0_MMX
globl Skl_Add_8x4_HF_Rnd0_MMX
globl Skl_Add_8x4_HH_Rnd0_MMX
globl Skl_Add_8x8_FF_MMX
globl Skl_Add_8x8_FH_Rnd0_MMX
globl Skl_Add_8x8_HF_Rnd0_MMX
globl Skl_Add_8x8_HH_Rnd0_MMX
globl Skl_Add_16x8_FF_MMX
globl Skl_Add_16x8_FH_Rnd0_MMX
globl Skl_Add_16x8_HF_Rnd0_MMX
globl Skl_Add_16x8_HH_Rnd0_MMX

globl Skl_Copy_8x4_FF_MMX
globl Skl_Copy_8x4_FH_Rnd1_MMX
globl Skl_Copy_8x4_HF_Rnd1_MMX
globl Skl_Copy_8x4_HH_Rnd1_MMX
globl Skl_Copy_8x4_FH_Rnd0_MMX
globl Skl_Copy_8x4_HF_Rnd0_MMX
globl Skl_Copy_8x4_HH_Rnd0_MMX
globl Skl_Copy_8x8_FF_MMX
globl Skl_Copy_8x8_FH_Rnd1_MMX
globl Skl_Copy_8x8_HF_Rnd1_MMX
globl Skl_Copy_8x8_HH_Rnd1_MMX
globl Skl_Copy_8x8_FH_Rnd0_MMX
globl Skl_Copy_8x8_HF_Rnd0_MMX
globl Skl_Copy_8x8_HH_Rnd0_MMX
globl Skl_Copy_16x8_FF_MMX
globl Skl_Copy_16x8_FH_Rnd1_MMX
globl Skl_Copy_16x8_HF_Rnd1_MMX
globl Skl_Copy_16x8_HH_Rnd1_MMX
globl Skl_Copy_16x8_FH_Rnd0_MMX
globl Skl_Copy_16x8_HF_Rnd0_MMX
globl Skl_Copy_16x8_HH_Rnd0_MMX

globl Skl_H_Pass_2Taps_MMX
globl Skl_V_Pass_2Taps_MMX
globl Skl_HV_Pass_2Taps_MMX

globl Skl_Copy_16x8_8To16_MMX
globl Skl_Copy_8x8_8To16_MMX
globl Skl_Diff_16x8_8To16_MMX
globl Skl_Diff_8x8_8To16_MMX
globl Skl_Diff_16x8_88To16_MMX
globl Skl_Diff_8x8_88To16_MMX
globl Skl_Move_16x8_MMX
globl Skl_Move_8x8_MMX

globl Skl_Copy_Upsampled_8x8_16To8_MMX
globl Skl_Add_Upsampled_8x8_16To8_MMX
globl Skl_HFilter_31_MMX
globl Skl_VFilter_31_x86
globl Skl_HFilter_31_x86
globl Skl_Filter_18x18_To_8x8_MMX
globl Skl_Filter_Diff_18x18_To_8x8_MMX

;//////////////////////////////////////////////////////////////////////

DATA

align 16
  ; 4 x 16-bit constants, one per word lane of an MMX register.
Rounder2_MMX  times 4 dw 2
Rounder1_MMX  times 4 dw 1
Rounder0_MMX  times 4 dw 0

Up31    dw  3, 1, 3, 1
Up13    dw  1, 3, 1, 3
Cst0    dw  0, 0, 0, 0
Cst2    dw  2, 2, 2, 2
Cst3    dw  3, 3, 3, 3
Cst32   dw 32,32,32,32
Cst2000 dw  2, 0, 0, 0
Cst0002 dw  0, 0, 0, 2
Mask_ff dw 0xff,0xff,0xff,0xff

TEXT

;//////////////////////////////////////////////////////////////////////
;//
;// Half-pixel interpolation functions
;//
;//////////////////////////////////////////////////////////////////////

  ; PROLOG0: fetch the three stack arguments.
  ;   ecx = Dst, eax = Src, edx = BpS

%macro PROLOG0 0
  mov ecx, [esp+ 4] ; Dst
  mov eax, [esp+ 8] ; Src
  mov edx, [esp+12] ; BpS
%endmacro

  ; PROLOG: common register set-up.
  ;   mm6 = 0 (used as the zero source when unpacking bytes to words)
  ;   mm7 = rounder given as %1
  ;   mm5 = Rounder1_MMX when %2 is non-zero (rounder of the mix with Dst)

%macro PROLOG 2   ; %1: Rounder, %2 load Dst-Rounder
  pxor mm6, mm6
  movq mm7, [%1]    ; TODO: dangerous! (eax isn't checked)
%if %2
  movq mm5, [Rounder1_MMX]
%endif

  PROLOG0
%endmacro

  ; performs: mm0 == (mm0+mm2)  mm1 == (mm1+mm3)
  ; On entry mm0/mm1 both hold the same 8 bytes of the first operand and
  ; mm2/mm3 the same 8 bytes of the second; low halves are widened in
  ; mm0/mm2, high halves in mm1/mm3 (mm6 must be zero).

%macro MIX 0
  punpcklbw mm0, mm6
  punpcklbw mm2, mm6
  punpckhbw mm1, mm6
  punpckhbw mm3, mm6
  paddusw mm0, mm2
  paddusw mm1, mm3
%endmacro

  ; MIX_DST: mm0/mm1 hold word sums of two source pixels, mm2 holds 8 Dst
  ; bytes.  Computes  (((sum + mm7) >> 1) + Dst + mm5) >> 1  and packs the
  ; result back to 8 bytes in mm0.  Clobbers mm1, mm2, mm3.

%macro MIX_DST 0
  movq mm3, mm2
  paddusw mm0, mm7  ; rounder
  paddusw mm1, mm7  ; rounder
  punpcklbw mm2, mm6
  punpckhbw mm3, mm6
  psrlw mm0, 1
  psrlw mm1, 1

  paddusw mm0, mm2  ; mix Src(mm0/mm1) with Dst(mm2/mm3)
  paddusw mm1, mm3
  paddusw mm0, mm5
  paddusw mm1, mm5
  psrlw mm0, 1
  psrlw mm1, 1

  packuswb mm0, mm1
%endmacro

  ; MIX2: same input layout as MIX; computes (a + b + mm7) >> 1 on the
  ; eight pixels and packs the result to bytes in mm0.

%macro MIX2 0
  punpcklbw mm0, mm6
  punpcklbw mm2, mm6
  paddusw mm0, mm2
  paddusw mm0, mm7
  punpckhbw mm1, mm6
  punpckhbw mm3, mm6
  paddusw mm1, mm7
  paddusw mm1, mm3
  psrlw mm0, 1
  psrlw mm1, 1

  packuswb mm0, mm1
%endmacro

;//////////////////////////////////////////////////////////////////////
;// Add functions
;//////////////////////////////////////////////////////////////////////

  ; ADD_FF_MMX: one 8-pixel row of  Dst = (Src + Dst + mm5) >> 1.
  ; %1 is the row step multiplier applied to eax/ecx afterwards; pass 0
  ; on the last row to skip the pointer updates.

%macro ADD_FF_MMX 1
  movq mm0, [eax]
  movq mm2, [ecx]
  movq mm1, mm0
  movq mm3, mm2
%if (%1!=0)
  lea eax,[eax+%1*edx]
%endif
  MIX
  paddusw mm0, mm5  ; rounder
  paddusw mm1, mm5  ; rounder
  psrlw mm0, 1
  psrlw mm1, 1

  packuswb mm0, mm1
  movq [ecx], mm0
%if (%1!=0)
  lea ecx,[ecx+%1*edx]
%endif
%endmacro

  ; 8x4 block: Dst = average(Src, Dst), no sub-pixel interpolation.

align 16
Skl_Add_8x4_FF_MMX:
  PROLOG Rounder1_MMX, 1
  ADD_FF_MMX 1
  ADD_FF_MMX 1
  ADD_FF_MMX 1
  ADD_FF_MMX 0
  ret

  ; 8x8 block: Dst = average(Src, Dst), no sub-pixel interpolation.

align 16
Skl_Add_8x8_FF_MMX:
  PROLOG Rounder1_MMX, 1
  ADD_FF_MMX 1
  ADD_FF_MMX 1
  ADD_FF_MMX 1
  ADD_FF_MMX 1
  ADD_FF_MMX 1
  ADD_FF_MMX 1
  ADD_FF_MMX 1
  ADD_FF_MMX 0
  ret

  ; ADD_16x8_FF_MMX: one 16-pixel row (two 8-byte halves) of
  ; Dst = (Src + Dst + mm5) >> 1.  Advances eax by one row; the caller
  ; advances ecx.

%macro ADD_16x8_FF_MMX 0
  movq mm0, [eax]
  movq mm2, [ecx]
  movq mm1, mm0
  movq mm3, mm2
  MIX
  paddusw mm0, mm5  ; dst-rounder
  paddusw mm1, mm5  ; dst-rounder
  psrlw mm0, 1
  psrlw mm1, 1
  packuswb mm0, mm1
  movq [ecx], mm0

  movq mm0, [eax+8]
  movq mm2, [ecx+8]
  movq mm1, mm0
  movq mm3, mm2
  lea eax,[eax+edx]
  MIX
  paddusw mm0, mm5  ; dst-rounder
  paddusw mm1, mm5  ; dst-rounder
  psrlw mm0, 1
  psrlw mm1, 1
  packuswb mm0, mm1
  movq [ecx+8], mm0
%endmacro

  ; 16x8 block: Dst = average(Src, Dst), no sub-pixel interpolation.

align 16
Skl_Add_16x8_FF_MMX:
  PROLOG Rounder1_MMX, 1
  ADD_16x8_FF_MMX
  lea ecx,[ecx+edx]
  ADD_16x8_FF_MMX
  lea ecx,[ecx+edx]
  ADD_16x8_FF_MMX
  lea ecx,[ecx+edx]
  ADD_16x8_FF_MMX
  lea ecx,[ecx+edx]
  ADD_16x8_FF_MMX
  lea ecx,[ecx+edx]
  ADD_16x8_FF_MMX
  lea ecx,[ecx+edx]
  ADD_16x8_FF_MMX
  lea ecx,[ecx+edx]
  ADD_16x8_FF_MMX
  ret

;//////////////////////////////////////////////////////////////////////

  ; ADD_FH_MMX: one 8-pixel row; averages Src[x] with Src[x+1]
  ; (reads 9 source bytes), then averages that with Dst.
  ; Advances eax by one row; the caller advances ecx.

%macro ADD_FH_MMX 0
  movq mm0, [eax]
  movq mm2, [eax+1]
  movq mm1, mm0
  movq mm3, mm2

  lea eax,[eax+edx]

  MIX
  movq mm2, [ecx]   ; prepare mix with Dst[0]
  MIX_DST
  movq [ecx], mm0
%endmacro

align 16
Skl_Add_8x4_FH_Rnd0_MMX:
  PROLOG Rounder1_MMX, 1
  ADD_FH_MMX
  lea ecx,[ecx+edx]
  ADD_FH_MMX
  lea ecx,[ecx+edx]
  ADD_FH_MMX
  lea ecx,[ecx+edx]
  ADD_FH_MMX
  ret

align 16
Skl_Add_8x8_FH_Rnd0_MMX:
  PROLOG Rounder1_MMX, 1
  ADD_FH_MMX
  lea ecx,[ecx+edx]
  ADD_FH_MMX
  lea ecx,[ecx+edx]
  ADD_FH_MMX
  lea ecx,[ecx+edx]
  ADD_FH_MMX
  lea ecx,[ecx+edx]
  ADD_FH_MMX
  lea ecx,[ecx+edx]
  ADD_FH_MMX
  lea ecx,[ecx+edx]
  ADD_FH_MMX
  lea ecx,[ecx+edx]
  ADD_FH_MMX
  ret

  ; ADD_16x8_FH_MMX: 16-pixel-wide variant of ADD_FH_MMX
  ; (reads 17 source bytes per row).

%macro ADD_16x8_FH_MMX 0
  movq mm0, [eax]
  movq mm2, [eax+1]
  movq mm1, mm0
  movq mm3, mm2
  MIX
  movq mm2, [ecx]   ; prepare mix with Dst[0]
  MIX_DST
  movq [ecx], mm0

  movq mm0, [eax+8]
  movq mm2, [eax+9]
  movq mm1, mm0
  movq mm3, mm2
  lea eax,[eax+edx]
  MIX
  movq mm2, [ecx+8] ; prepare mix with Dst[8]
  MIX_DST
  movq [ecx+8], mm0
%endmacro

align 16
Skl_Add_16x8_FH_Rnd0_MMX:
  PROLOG Rounder1_MMX, 1
  ADD_16x8_FH_MMX
  lea ecx,[ecx+edx]
  ADD_16x8_FH_MMX
  lea ecx,[ecx+edx]
  ADD_16x8_FH_MMX
  lea ecx,[ecx+edx]
  ADD_16x8_FH_MMX
  lea ecx,[ecx+edx]
  ADD_16x8_FH_MMX
  lea ecx,[ecx+edx]
  ADD_16x8_FH_MMX
  lea ecx,[ecx+edx]
  ADD_16x8_FH_MMX
  lea ecx,[ecx+edx]
  ADD_16x8_FH_MMX
  ret

;//////////////////////////////////////////////////////////////////////

  ; ADD_HF_MMX: one 8-pixel row; averages Src[x] with the pixel one row
  ; below (Src[x+BpS]), then averages that with Dst.
  ; Advances eax by one row; the caller advances ecx.

%macro ADD_HF_MMX 0
  movq mm0, [eax]
  movq mm2, [eax+edx]
  movq mm1, mm0
  movq mm3, mm2

  lea eax,[eax+edx]

  MIX
  movq mm2, [ecx]   ; prepare mix with Dst[0]
  MIX_DST
  movq [ecx], mm0
%endmacro

align 16
Skl_Add_8x4_HF_Rnd0_MMX:
  PROLOG Rounder1_MMX, 1
  ADD_HF_MMX
  lea ecx,[ecx+edx]
  ADD_HF_MMX
  lea ecx,[ecx+edx]
  ADD_HF_MMX
  lea ecx,[ecx+edx]
  ADD_HF_MMX
  ret

align 16
Skl_Add_8x8_HF_Rnd0_MMX:
  PROLOG Rounder1_MMX, 1
  ADD_HF_MMX
  lea ecx,[ecx+edx]
  ADD_HF_MMX
  lea ecx,[ecx+edx]
  ADD_HF_MMX
  lea ecx,[ecx+edx]
  ADD_HF_MMX
  lea ecx,[ecx+edx]
  ADD_HF_MMX
  lea ecx,[ecx+edx]
  ADD_HF_MMX
  lea ecx,[ecx+edx]
  ADD_HF_MMX
  lea ecx,[ecx+edx]
  ADD_HF_MMX
  ret

  ; ADD_16x8_HF_MMX: 16-pixel-wide variant of ADD_HF_MMX.

%macro ADD_16x8_HF_MMX 0
  movq mm0, [eax]
  movq mm2, [eax+edx]
  movq mm1, mm0
  movq mm3, mm2
  MIX
  movq mm2, [ecx]   ; prepare mix with Dst[0]
  MIX_DST
  movq [ecx], mm0

  movq mm0, [eax+8]
  movq mm2, [eax+edx+8]
  movq mm1, mm0
  movq mm3, mm2
  lea eax,[eax+edx]
  MIX
  movq mm2, [ecx+8] ; prepare mix with Dst[8]
  MIX_DST
  movq [ecx+8], mm0
%endmacro

align 16
Skl_Add_16x8_HF_Rnd0_MMX:
  PROLOG Rounder1_MMX, 1
  ADD_16x8_HF_MMX
  lea ecx,[ecx+edx]
  ADD_16x8_HF_MMX
  lea ecx,[ecx+edx]
  ADD_16x8_HF_MMX
  lea ecx,[ecx+edx]
  ADD_16x8_HF_MMX
  lea ecx,[ecx+edx]
  ADD_16x8_HF_MMX
  lea ecx,[ecx+edx]
  ADD_16x8_HF_MMX
  lea ecx,[ecx+edx]
  ADD_16x8_HF_MMX
  lea ecx,[ecx+edx]
  ADD_16x8_HF_MMX
  ret

;//////////////////////////////////////////////////////////////////////

  ; ADD_HH_MMX: one 8-pixel row of the 4-tap (2x2) average mixed with Dst:
  ;   t   = (prev[x] + prev[x+1] + cur[x] + cur[x+1] + mm7) >> 2
  ;   Dst = (t + Dst + 1) >> 1
  ; In : mm2/mm3 = word sums (Src[x]+Src[x+1]) of the previous row
  ; Out: mm2/mm3 = the same sums for the row just read (reused by the next
  ;      invocation); eax advanced by one row.
  ; mm4/mm5 are scratch here, which is why callers pass 0 as PROLOG's %2
  ; and the Dst rounder is read from memory instead.

%macro ADD_HH_MMX 0
  lea eax,[eax+edx]

    ; transfer prev line to mm0/mm1
  movq mm0, mm2
  movq mm1, mm3

    ; load new line in mm2/mm3
  movq mm2, [eax]
  movq mm4, [eax+1]
  movq mm3, mm2
  movq mm5, mm4

  punpcklbw mm2, mm6
  punpcklbw mm4, mm6
  paddusw mm2, mm4
  punpckhbw mm3, mm6
  punpckhbw mm5, mm6
  paddusw mm3, mm5

    ; mix current line (mm2/mm3) with previous (mm0,mm1);
    ; we'll preserve mm2/mm3 for next line...

  paddusw mm0, mm2
  paddusw mm1, mm3

  movq mm4, [ecx]   ; prepare mix with Dst[0]
  movq mm5, mm4

  paddusw mm0, mm7  ; finish mixing current line
  paddusw mm1, mm7

  punpcklbw mm4, mm6
  punpckhbw mm5, mm6

  psrlw mm0, 2
  psrlw mm1, 2

  paddusw mm0, mm4  ; mix Src(mm0/mm1) with Dst(mm4/mm5)
  paddusw mm1, mm5

  paddusw mm0, [Rounder1_MMX]
  paddusw mm1, [Rounder1_MMX]

  psrlw mm0, 1
  psrlw mm1, 1

  packuswb mm0, mm1

  movq [ecx], mm0
%endmacro

align 16
Skl_Add_8x4_HH_Rnd0_MMX:
  PROLOG Rounder2_MMX, 0    ; mm5 is busy. Don't load dst-rounder

    ; preprocess first line
  movq mm0, [eax]
  movq mm2, [eax+1]
  movq mm1, mm0
  movq mm3, mm2

  punpcklbw mm0, mm6
  punpcklbw mm2, mm6
  punpckhbw mm1, mm6
  punpckhbw mm3, mm6
  paddusw mm2, mm0
  paddusw mm3, mm1

    ; Input: mm2/mm3 contains the value (Src[0]+Src[1]) of previous line

  ADD_HH_MMX
  lea ecx,[ecx+edx]
  ADD_HH_MMX
  lea ecx,[ecx+edx]
  ADD_HH_MMX
  lea ecx,[ecx+edx]
  ADD_HH_MMX
  ret

align 16
Skl_Add_8x8_HH_Rnd0_MMX:
  PROLOG Rounder2_MMX, 0    ; mm5 is busy. Don't load dst-rounder

    ; preprocess first line
  movq mm0, [eax]
  movq mm2, [eax+1]
  movq mm1, mm0
  movq mm3, mm2

  punpcklbw mm0, mm6
  punpcklbw mm2, mm6
  punpckhbw mm1, mm6
  punpckhbw mm3, mm6
  paddusw mm2, mm0
  paddusw mm3, mm1

    ; Input: mm2/mm3 contains the value (Src[0]+Src[1]) of previous line

  ADD_HH_MMX
  lea ecx,[ecx+edx]
  ADD_HH_MMX
  lea ecx,[ecx+edx]
  ADD_HH_MMX
  lea ecx,[ecx+edx]
  ADD_HH_MMX
  lea ecx,[ecx+edx]
  ADD_HH_MMX
  lea ecx,[ecx+edx]
  ADD_HH_MMX
  lea ecx,[ecx+edx]
  ADD_HH_MMX
  lea ecx,[ecx+edx]
  ADD_HH_MMX
  ret

  ; 16x8 variant: processed as two independent 8-pixel-wide columns; the
  ; pointers are reloaded from the stack before the second column.

align 16
Skl_Add_16x8_HH_Rnd0_MMX:
  PROLOG Rounder2_MMX, 0

    ; preprocess first line
  movq mm0, [eax]
  movq mm2, [eax+1]
  movq mm1, mm0
  movq mm3, mm2

  punpcklbw mm0, mm6
  punpcklbw mm2, mm6
  punpckhbw mm1, mm6
  punpckhbw mm3, mm6
  paddusw mm2, mm0
  paddusw mm3, mm1

    ; Input: mm2/mm3 contains the value (Src[0]+Src[1]) of previous line

  ADD_HH_MMX
  lea ecx,[ecx+edx]
  ADD_HH_MMX
  lea ecx,[ecx+edx]
  ADD_HH_MMX
  lea ecx,[ecx+edx]
  ADD_HH_MMX
  lea ecx,[ecx+edx]
  ADD_HH_MMX
  lea ecx,[ecx+edx]
  ADD_HH_MMX
  lea ecx,[ecx+edx]
  ADD_HH_MMX
  lea ecx,[ecx+edx]
  ADD_HH_MMX

    ; second column

  mov ecx, [esp+ 4] ; Dst
  mov eax, [esp+ 8] ; Src
  lea ecx, [ecx+8]
  lea eax, [eax+8]

    ; preprocess first line
  movq mm0, [eax]
  movq mm2, [eax+1]
  movq mm1, mm0
  movq mm3, mm2

  punpcklbw mm0, mm6
  punpcklbw mm2, mm6
  punpckhbw mm1, mm6
  punpckhbw mm3, mm6
  paddusw mm2, mm0
  paddusw mm3, mm1

    ; Input: mm2/mm3 contains the value (Src[0]+Src[1]) of previous line

  ADD_HH_MMX
  lea ecx,[ecx+edx]
  ADD_HH_MMX
  lea ecx,[ecx+edx]
  ADD_HH_MMX
  lea ecx,[ecx+edx]
  ADD_HH_MMX
  lea ecx,[ecx+edx]
  ADD_HH_MMX
  lea ecx,[ecx+edx]
  ADD_HH_MMX
  lea ecx,[ecx+edx]
  ADD_HH_MMX
  lea ecx,[ecx+edx]
  ADD_HH_MMX
  ret

;//////////////////////////////////////////////////////////////////////
;// Copy functions
;//////////////////////////////////////////////////////////////////////

  ; COPY_FF_8: copy two 8-byte rows from Src to Dst.  With %1 == 1 the
  ; pointers are left untouched (used for the final pair of rows);
  ; otherwise eax and ecx both advance by two rows.

%macro COPY_FF_8 1  ; %1:phase
  movq mm0, [eax]
  movq mm1, [eax+edx]
  movq [ecx], mm0
%if (%1!=1)
  lea eax, [eax+2*edx]
%endif
  movq [ecx+edx], mm1
%if (%1!=1)
  lea ecx, [ecx+2*edx]
%endif
%endmacro

  ; Straight 8x4 copy.  edx is turned into 3*BpS to address the last row.

align 16
Skl_Copy_8x4_FF_MMX:  ; 9c
  PROLOG0
  movq mm0, [eax      ]
  movq mm1, [eax+edx  ]
  movq mm2, [eax+2*edx]
  movq [ecx      ], mm0
  movq [ecx+edx  ], mm1
  movq [ecx+2*edx], mm2
  lea edx, [edx+2*edx]
  movq mm1, [eax+edx]
  movq [ecx+edx], mm1
  ret

  ; Straight 8x8 copy, two rows per macro invocation.

align 16
Skl_Copy_8x8_FF_MMX:  ; 14c
  PROLOG0
  COPY_FF_8 0
  COPY_FF_8 0
  COPY_FF_8 0
  COPY_FF_8 1
  ret

  ; COPY_FF_16: copy two 16-byte rows; same %1 convention as COPY_FF_8.

%macro COPY_FF_16 1
  movq mm0, [eax]
  movq mm1, [eax+8]
  movq mm2, [eax+edx]
  movq mm3, [eax+edx+8]
  movq [ecx], mm0
  movq [ecx+8], mm1
  movq [ecx+edx], mm2
%if (%1!=1)
  lea eax, [eax+2*edx]
%endif
  movq [ecx+edx+8], mm3
%if (%1!=1)
  lea ecx, [ecx+2*edx]
%endif
%endmacro

  ; Straight 16x8 copy, two rows per macro invocation.

align 16
Skl_Copy_16x8_FF_MMX: ; 26c
  PROLOG0
  COPY_FF_16 0
  COPY_FF_16 0
  COPY_FF_16 0
  COPY_FF_16 1
  ret

;//////////////////////////////////////////////////////////////////////

  ; COPY_FH_MMX: one 8-pixel row of
  ;   Dst[x] = (Src[x] + Src[x+1] + mm7) >> 1
  ; (reads 9 source bytes).  Advances eax by one row; the caller
  ; advances ecx.

%macro COPY_FH_MMX 0
  movq mm0, [eax]
  movq mm2, [eax+1]
  movq mm1, mm0
  movq mm3, mm2

  lea eax,[eax+edx]

  MIX2
  movq [ecx], mm0
%endmacro

  ; The Rnd0 entry points add 1 before the shift, the Rnd1 ones add 0.

align 16
Skl_Copy_8x4_FH_Rnd0_MMX:
  PROLOG Rounder1_MMX, 0
  COPY_FH_MMX
  lea ecx,[ecx+edx]
  COPY_FH_MMX
  lea ecx,[ecx+edx]
  COPY_FH_MMX
  lea ecx,[ecx+edx]
  COPY_FH_MMX
  ret

align 16
Skl_Copy_8x4_FH_Rnd1_MMX:
  PROLOG Rounder0_MMX, 0
  COPY_FH_MMX
  lea ecx,[ecx+edx]
  COPY_FH_MMX
  lea ecx,[ecx+edx]
  COPY_FH_MMX
  lea ecx,[ecx+edx]
  COPY_FH_MMX
  ret

align 16
Skl_Copy_8x8_FH_Rnd0_MMX:
  PROLOG Rounder1_MMX, 0
  COPY_FH_MMX
  lea ecx,[ecx+edx]
  COPY_FH_MMX
  lea ecx,[ecx+edx]
  COPY_FH_MMX
  lea ecx,[ecx+edx]
  COPY_FH_MMX
  lea ecx,[ecx+edx]
  COPY_FH_MMX
  lea ecx,[ecx+edx]
  COPY_FH_MMX
  lea ecx,[ecx+edx]
  COPY_FH_MMX
  lea ecx,[ecx+edx]
  COPY_FH_MMX
  ; NOTE(review): the visible listing stops here.  The rest of this
  ; function (presumably a `ret`) and the remaining routines declared
  ; `globl` above are not part of this chunk -- restore from the full file.
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -