;/*****************************************************************************
; *
; * XVID MPEG-4 VIDEO CODEC
; * sse2 sum of absolute difference
; *
; * Copyright(C) 2002 Dmitry Rozhdestvensky
; *
; * This file is part of XviD, a free MPEG-4 video encoder/decoder
; *
; * XviD is free software; you can redistribute it and/or modify it
; * under the terms of the GNU General Public License as published by
; * the Free Software Foundation; either version 2 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with this program; if not, write to the Free Software
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * Under section 8 of the GNU General Public License, the copyright
; * holders of XVID explicitly forbid distribution in the following
; * countries:
; *
; *    - Japan
; *    - United States of America
; *
; * Linking XviD statically or dynamically with other modules is making a
; * combined work based on XviD. Thus, the terms and conditions of the
; * GNU General Public License cover the whole combination.
; *
; * As a special exception, the copyright holders of XviD give you
; * permission to link XviD with independent modules that communicate with
; * XviD solely through the VFW1.1 and DShow interfaces, regardless of the
; * license terms of these independent modules, and to copy and distribute
; * the resulting combined work under terms of your choice, provided that
; * every copy of the combined work is accompanied by a complete copy of
; * the source code of XviD (the version of XviD used to produce the
; * combined work), being distributed under the terms of the GNU General
; * Public License plus this exception. An independent module is a module
; * which is not derived from or based on XviD.
; *
; * Note that people who make modified versions of XviD are not obligated
; * to grant this special exception for their modified versions; it is
; * their choice whether to do so.
; * The GNU General Public License gives
; * permission to release a modified version without this exception; this
; * exception also makes it possible to release a modified version which
; * carries forward this exception.
; *
; * $Id: sad_sse2.asm,v 1.7 2002/11/17 00:32:06 edgomez Exp $
; *
; ****************************************************************************/

bits 32

%macro cglobal 1
	%ifdef PREFIX
		global _%1
		%define %1 _%1
	%else
		global %1
	%endif
%endmacro

%define sad_debug 0              ;1=unaligned 2=ref unaligned 3=aligned 0=autodetect
%define dev_debug 2              ;1=unaligned 2=aligned 0=autodetect
%define test_stride_alignment 0  ;test stride for alignment while autodetect
%define early_return 0           ;use early return in sad

section .data

align 64
buffer  times 4*8  dd 0          ;8 128-bit words
zero    times 4    dd 0

section .text

cglobal sad16_sse2
cglobal dev16_sse2

;===========================================================================
;       General macros for SSE2 code
;===========================================================================

%macro load_stride 1
        mov ecx,%1
        add ecx,ecx
        mov edx,ecx
        add ecx,%1              ;stride*3
        add edx,edx             ;stride*4
%endmacro

%macro sad8lines 1
        psadbw  xmm0,[%1]
        psadbw  xmm1,[%1+ebx]
        psadbw  xmm2,[%1+ebx*2]
        psadbw  xmm3,[%1+ecx]

        add     %1,edx

        psadbw  xmm4,[%1]
        psadbw  xmm5,[%1+ebx]
        psadbw  xmm6,[%1+ebx*2]
        psadbw  xmm7,[%1+ecx]

        add     %1,edx
%endmacro

%macro after_sad 1 ; Summarizes 0th and 4th words of all xmm registers
        paddusw xmm0,xmm1
        paddusw xmm2,xmm3
        paddusw xmm4,xmm5
        paddusw xmm6,xmm7

        paddusw xmm0,xmm2
        paddusw xmm4,xmm6
        paddusw xmm4,xmm0

        pshufd  xmm5,xmm4,11111110b
        paddusw xmm5,xmm4

        pextrw  %1,xmm5,0       ;less latency than movd
%endmacro

%macro restore 1                ;restores used registers
%if %1=1
        pop ebp
%endif
        pop edi
        pop esi
        pop ebx
%endmacro

;===========================================================================
;
; uint32_t sad16_sse2 (const uint8_t * const cur,
;                      const uint8_t * const ref,
;                      const uint32_t stride,
;                      const uint32_t best_sad);
;
;
;===========================================================================
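; A minimal scalar sketch (added for illustration; not part of the original
; file, and the helper name is made up) of what sad16_sse2 computes: the sum
; of absolute differences over a 16x16 block. The early_return path below
; may return a partial sum once it already exceeds best_sad.
;
;   uint32_t sad16_scalar(const uint8_t *cur, const uint8_t *ref,
;                         uint32_t stride)
;   {
;       uint32_t sad = 0;
;       for (int y = 0; y < 16; y++)
;           for (int x = 0; x < 16; x++) {
;               int d = (int)cur[y*stride + x] - (int)ref[y*stride + x];
;               sad += (d < 0) ? -d : d;
;           }
;       return sad;
;   }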
align 16
sad16_sse2
        push ebx
        push esi
        push edi

        mov ebx,[esp + 3*4 + 12]        ;stride

%if sad_debug<>0
        mov edi,[esp + 3*4 + 4]
        mov esi,[esp + 3*4 + 8]
%endif

%if sad_debug=1
        jmp sad16_sse2_ul
%endif
%if sad_debug=2
        jmp sad16_sse2_semial
%endif
%if sad_debug=3
        jmp sad16_sse2_al
%endif

%if test_stride_alignment<>0
        test ebx,15
        jnz sad16_sse2_ul
%endif

        mov edi,[esp + 3*4 + 4]         ;cur (most likely aligned)

        test edi,15
        cmovz esi,[esp + 3*4 + 8]       ;load esi if edi is aligned
        cmovnz esi,edi                  ;move to esi and load edi
        cmovnz edi,[esp + 3*4 + 8]      ;if not
        jnz esi_unaligned

        test esi,15
        jnz near sad16_sse2_semial
        jmp sad16_sse2_al

esi_unaligned:
        test edi,15
        jnz near sad16_sse2_ul
        jmp sad16_sse2_semial

;===========================================================================
;       Branch requires 16-byte alignment of esi and edi and stride
;===========================================================================

%macro sad16x8_al 1
        movdqa  xmm0,[esi]
        movdqa  xmm1,[esi+ebx]
        movdqa  xmm2,[esi+ebx*2]
        movdqa  xmm3,[esi+ecx]

        add     esi,edx

        movdqa  xmm4,[esi]
        movdqa  xmm5,[esi+ebx]
        movdqa  xmm6,[esi+ebx*2]
        movdqa  xmm7,[esi+ecx]

        add     esi,edx

        sad8lines edi

        after_sad %1
%endmacro

align 16
sad16_sse2_al
        load_stride ebx

        sad16x8_al eax

%if early_return=1
        cmp eax,[esp + 3*4 + 16]        ;best_sad
        jg continue_al
%endif

        sad16x8_al ebx

        add eax,ebx

continue_al:
        restore 0

        ret

;===========================================================================
;       Branch requires 16-byte alignment of the edi and stride only
;===========================================================================

%macro sad16x8_semial 1
        movdqu  xmm0,[esi]
        movdqu  xmm1,[esi+ebx]
        movdqu  xmm2,[esi+ebx*2]
        movdqu  xmm3,[esi+ecx]

        add     esi,edx

        movdqu  xmm4,[esi]
        movdqu  xmm5,[esi+ebx]
        movdqu  xmm6,[esi+ebx*2]
        movdqu  xmm7,[esi+ecx]

        add     esi,edx

        sad8lines edi

        after_sad %1
%endmacro

align 16
sad16_sse2_semial
        load_stride ebx

        sad16x8_semial eax

%if early_return=1
        cmp eax,[esp + 3*4 + 16]        ;best_sad
        jg cont_semial
%endif

        sad16x8_semial ebx

        add eax,ebx

cont_semial:
        restore 0

        ret

;===========================================================================
;       Branch does not require alignment, even stride
;===========================================================================

%macro sad16x4_ul 1
        movdqu  xmm0,[esi]
        movdqu  xmm1,[esi+ebx]
        movdqu  xmm2,[esi+ebx*2]
        movdqu  xmm3,[esi+ecx]

        add     esi,edx

        movdqu  xmm4,[edi]
        movdqu  xmm5,[edi+ebx]
        movdqu  xmm6,[edi+ebx*2]
        movdqu  xmm7,[edi+ecx]

        add     edi,edx

        psadbw  xmm4,xmm0
        psadbw  xmm5,xmm1
        psadbw  xmm6,xmm2
        psadbw  xmm7,xmm3

        paddusw xmm4,xmm5
        paddusw xmm6,xmm7
        paddusw xmm4,xmm6

        pshufd  xmm7,xmm4,11111110b
        paddusw xmm7,xmm4

        pextrw  %1,xmm7,0
%endmacro

align 16
sad16_sse2_ul
        load_stride ebx

        push ebp

        sad16x4_ul eax

%if early_return=1
        cmp eax,[esp + 4*4 + 16]        ;best_sad
        jg continue_ul
%endif

        sad16x4_ul ebp
        add eax,ebp

%if early_return=1
        cmp eax,[esp + 4*4 + 16]        ;best_sad
        jg continue_ul
%endif

        sad16x4_ul ebp
        add eax,ebp

%if early_return=1
        cmp eax,[esp + 4*4 + 16]        ;best_sad
        jg continue_ul
%endif

        sad16x4_ul ebp
        add eax,ebp

continue_ul:
        restore 1

        ret

;===========================================================================
;
; uint32_t dev16_sse2(const uint8_t * const cur,
;                     const uint32_t stride);
;
; experimental!
;
;===========================================================================
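; A minimal scalar sketch (added for illustration; not part of the original
; file, and the helper name is made up) of what dev16_sse2 is meant to
; compute: the sum of absolute deviations of a 16x16 block from its
; truncated mean.
;
;   uint32_t dev16_scalar(const uint8_t *cur, uint32_t stride)
;   {
;       uint32_t sum = 0, dev = 0;
;       for (int y = 0; y < 16; y++)
;           for (int x = 0; x < 16; x++)
;               sum += cur[y*stride + x];
;       uint8_t mean = (uint8_t)(sum / 256);    /* 16*16 = 256 pixels */
;       for (int y = 0; y < 16; y++)
;           for (int x = 0; x < 16; x++) {
;               int d = (int)cur[y*stride + x] - (int)mean;
;               dev += (d < 0) ? -d : d;
;           }
;       return dev;
;   }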
align 16
dev16_sse2
        push ebx
        push esi
        push edi
        push ebp

        mov esi, [esp + 4*4 + 4]        ; cur
        mov ebx, [esp + 4*4 + 8]        ; stride
        mov edi, buffer

%if dev_debug=1
        jmp dev16_sse2_ul
%endif
%if dev_debug=2
        jmp dev16_sse2_al
%endif

        test esi,15
        jnz near dev16_sse2_ul

%if test_stride_alignment=1
        test ebx,15
        jnz dev16_sse2_ul
%endif

        mov edi,esi
        jmp dev16_sse2_al

;===========================================================================
;       Branch requires alignment of both the cur and stride
;===========================================================================

%macro make_mean 0
        add eax,ebp             ;mean 16-bit
        mov al,ah               ;eax= {0 0 mean/256 mean/256}
        mov ebp,eax
        shl ebp,16
        or eax,ebp
%endmacro

%macro sad_mean16x8_al 3        ;destination,0=zero,1=mean from eax,source
%if %2=0
        pxor xmm0,xmm0
%else
        movd xmm0,eax
        pshufd xmm0,xmm0,0
%endif
        movdqa xmm1,xmm0
        movdqa xmm2,xmm0
        movdqa xmm3,xmm0
        movdqa xmm4,xmm0
        movdqa xmm5,xmm0
        movdqa xmm6,xmm0
        movdqa xmm7,xmm0

        sad8lines %3

        after_sad %1
%endmacro

align 16
dev16_sse2_al
        load_stride ebx

        sad_mean16x8_al eax,0,esi
        sad_mean16x8_al ebp,0,esi

        make_mean

        sad_mean16x8_al ebp,1,edi
        sad_mean16x8_al eax,1,edi

        add eax,ebp

        restore 1

        ret

;===========================================================================
;       Branch does not require alignment
;===========================================================================

%macro sad_mean16x8_ul 2
        pxor xmm7,xmm7

        movdqu xmm0,[%1]
        movdqu xmm1,[%1+ebx]
        movdqu xmm2,[%1+ebx*2]
        movdqu xmm3,[%1+ecx]

        add %1,edx

        movdqa [buffer+16*0],xmm0
        movdqa [buffer+16*1],xmm1
        movdqa [buffer+16*2],xmm2
        movdqa [buffer+16*3],xmm3

        movdqu xmm4,[%1]
        movdqu xmm5,[%1+ebx]
        movdqu xmm6,[%1+ebx*2]

        movdqa [buffer+16*4],xmm4
        movdqa [buffer+16*5],xmm5
        movdqa [buffer+16*6],xmm6

        psadbw xmm0,xmm7
        psadbw xmm1,xmm7
        psadbw xmm2,xmm7
        psadbw xmm3,xmm7
        psadbw xmm4,xmm7
        psadbw xmm5,xmm7
        psadbw xmm6,xmm7

        movdqu xmm7,[%1+ecx]
        movdqa [buffer+16*7],xmm7
        psadbw xmm7,[zero]

        add %1,edx

        after_sad %2
%endmacro

align 16
dev16_sse2_ul
        load_stride ebx

        sad_mean16x8_ul esi,eax
        sad_mean16x8_ul esi,ebp

        make_mean

        sad_mean16x8_al ebp,1,edi
        sad_mean16x8_al eax,1,edi

        add eax,ebp

        restore 1

        ret
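; Usage sketch (illustration only; the function-pointer and feature-flag
; names below are assumptions, not defined in this file). A C caller would
; declare the exported symbols and select them only when SSE2 is available:
;
;   extern uint32_t sad16_sse2(const uint8_t *cur, const uint8_t *ref,
;                              uint32_t stride, uint32_t best_sad);
;   extern uint32_t dev16_sse2(const uint8_t *cur, uint32_t stride);
;
;   /* hypothetical dispatch */
;   static uint32_t (*sad16)(const uint8_t*, const uint8_t*, uint32_t, uint32_t);
;   static uint32_t (*dev16)(const uint8_t*, uint32_t);
;
;   if (cpu_flags & CPU_HAS_SSE2) {   /* CPU_HAS_SSE2: assumed feature flag */
;       sad16 = sad16_sse2;
;       dev16 = dev16_sse2;
;   }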