sad_xmm.asm
;/*****************************************************************************
; *
; * XVID MPEG-4 VIDEO CODEC
; * xmm (extended mmx) sum of absolute difference
; *
; * Copyright(C) 2002 Peter Ross <pross@xvid.org>
; * Copyright(C) 2002 Michael Militzer <michael@xvid.org>
; * Copyright(C) 2002 Pascal Massimino <skal@planet-d.net>
; *
; * This file is part of XviD, a free MPEG-4 video encoder/decoder
; *
; * XviD is free software; you can redistribute it and/or modify it
; * under the terms of the GNU General Public License as published by
; * the Free Software Foundation; either version 2 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with this program; if not, write to the Free Software
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * Under section 8 of the GNU General Public License, the copyright
; * holders of XVID explicitly forbid distribution in the following
; * countries:
; *
; * - Japan
; * - United States of America
; *
; * Linking XviD statically or dynamically with other modules is making a
; * combined work based on XviD. Thus, the terms and conditions of the
; * GNU General Public License cover the whole combination.
; *
; * As a special exception, the copyright holders of XviD give you
; * permission to link XviD with independent modules that communicate with
; * XviD solely through the VFW1.1 and DShow interfaces, regardless of the
; * license terms of these independent modules, and to copy and distribute
; * the resulting combined work under terms of your choice, provided that
; * every copy of the combined work is accompanied by a complete copy of
; * the source code of XviD (the version of XviD used to produce the
; * combined work), being distributed under the terms of the GNU General
; * Public License plus this exception. An independent module is a module
; * which is not derived from or based on XviD.
; *
; * Note that people who make modified versions of XviD are not obligated
; * to grant this special exception for their modified versions; it is
; * their choice whether to do so. The GNU General Public License gives
; * permission to release a modified version without this exception; this
; * exception also makes it possible to release a modified version which
; * carries forward this exception.
; *
; * $Id: sad_xmm.asm,v 1.5 2002/11/17 00:32:06 edgomez Exp $
; *
; *************************************************************************/

bits 32

%macro cglobal 1
  %ifdef PREFIX
    global _%1
    %define %1 _%1
  %else
    global %1
  %endif
%endmacro

section .data

align 16
mmx_one times 4 dw 1

section .text

cglobal sad16_xmm
cglobal sad8_xmm
cglobal sad16bi_xmm
cglobal sad8bi_xmm
cglobal dev16_xmm

; (plain-C reference models of these routines are sketched after the listing)

;===========================================================================
;
; uint32_t sad16_xmm(const uint8_t * const cur,
;                    const uint8_t * const ref,
;                    const uint32_t stride,
;                    const uint32_t best_sad);
;
;===========================================================================

%macro SAD_16x16_SSE 0
  movq mm0, [eax]
  psadbw mm0, [edx]
  movq mm1, [eax+8]
  add eax, ecx
  psadbw mm1, [edx+8]
  paddusw mm5, mm0
  add edx, ecx
  paddusw mm6, mm1
%endmacro

align 16
sad16_xmm:
  mov eax, [esp+ 4] ; Src1
  mov edx, [esp+ 8] ; Src2
  mov ecx, [esp+12] ; Stride

  pxor mm5, mm5     ; accum1
  pxor mm6, mm6     ; accum2

  ; one macro invocation per row: 16 rows
  SAD_16x16_SSE
  SAD_16x16_SSE
  SAD_16x16_SSE
  SAD_16x16_SSE
  SAD_16x16_SSE
  SAD_16x16_SSE
  SAD_16x16_SSE
  SAD_16x16_SSE
  SAD_16x16_SSE
  SAD_16x16_SSE
  SAD_16x16_SSE
  SAD_16x16_SSE
  SAD_16x16_SSE
  SAD_16x16_SSE
  SAD_16x16_SSE
  SAD_16x16_SSE

  paddusw mm6, mm5
  movd eax, mm6
  ret

;===========================================================================
;
; uint32_t sad8_xmm(const uint8_t * const cur,
;                   const uint8_t * const ref,
;                   const uint32_t stride);
;
;===========================================================================

%macro SAD_8x8_SSE 0
  movq mm0, [eax]
  movq mm1, [eax+ecx]
  psadbw mm0, [edx]
  psadbw mm1, [edx+ecx]
  add eax, ebx
  add edx, ebx
  paddusw mm5, mm0
  paddusw mm6, mm1
%endmacro

align 16
sad8_xmm:
  mov eax, [esp+ 4] ; Src1
  mov edx, [esp+ 8] ; Src2
  mov ecx, [esp+12] ; Stride
  push ebx
  lea ebx, [ecx+ecx]

  pxor mm5, mm5     ; accum1
  pxor mm6, mm6     ; accum2

  ; each invocation covers two rows: rows 0-5
  SAD_8x8_SSE
  SAD_8x8_SSE
  SAD_8x8_SSE

  ; rows 6-7 inline, so ebx can be restored before the final adds
  movq mm0, [eax]
  movq mm1, [eax+ecx]
  psadbw mm0, [edx]
  psadbw mm1, [edx+ecx]
  pop ebx
  paddusw mm5, mm0
  paddusw mm6, mm1

  paddusw mm6, mm5
  movd eax, mm6
  ret

;===========================================================================
;
; uint32_t sad16bi_xmm(const uint8_t * const cur,
;                      const uint8_t * const ref1,
;                      const uint8_t * const ref2,
;                      const uint32_t stride);
;
;===========================================================================

%macro SADBI_16x16_SSE 0
  movq mm0, [eax]
  movq mm1, [eax+8]

  movq mm2, [edx]
  movq mm3, [edx+8]

  pavgb mm2, [ebx]
  add edx, ecx

  pavgb mm3, [ebx+8]
  add ebx, ecx

  psadbw mm0, mm2
  add eax, ecx

  psadbw mm1, mm3
  paddusw mm5, mm0
  paddusw mm6, mm1
%endmacro

align 16
sad16bi_xmm:
  push ebx
  mov eax, [esp+4+ 4] ; Src
  mov edx, [esp+4+ 8] ; Ref1
  mov ebx, [esp+4+12] ; Ref2
  mov ecx, [esp+4+16] ; Stride

  pxor mm5, mm5       ; accum1
  pxor mm6, mm6       ; accum2

  ; one macro invocation per row: 16 rows
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE

  paddusw mm6, mm5
  movd eax, mm6
  pop ebx
  ret

;===========================================================================
;
; uint32_t sad8bi_xmm(const uint8_t * const cur,
;                     const uint8_t * const ref1,
;                     const uint8_t * const ref2,
;                     const uint32_t stride);
;
;===========================================================================

%macro SADBI_8x8_XMM 0
  movq mm0, [eax]
  movq mm1, [eax+ecx]

  movq mm2, [edx]
  movq mm3, [edx+ecx]

  pavgb mm2, [ebx]
  lea edx, [edx+2*ecx]

  pavgb mm3, [ebx+ecx]
  lea ebx, [ebx+2*ecx]

  psadbw mm0, mm2
  lea eax, [eax+2*ecx]

  psadbw mm1, mm3
  paddusw mm5, mm0
  paddusw mm6, mm1
%endmacro

align 16
sad8bi_xmm:
  push ebx
  mov eax, [esp+4+ 4] ; Src
  mov edx, [esp+4+ 8] ; Ref1
  mov ebx, [esp+4+12] ; Ref2
  mov ecx, [esp+4+16] ; Stride

  pxor mm5, mm5       ; accum1
  pxor mm6, mm6       ; accum2
.Loop
  ; each invocation covers two rows: 4 x 2 = 8 rows
  SADBI_8x8_XMM
  SADBI_8x8_XMM
  SADBI_8x8_XMM
  SADBI_8x8_XMM

  paddusw mm6, mm5
  movd eax, mm6
  pop ebx
  ret

;===========================================================================
;
; uint32_t dev16_xmm(const uint8_t * const cur,
;                    const uint32_t stride);
;
;===========================================================================

%macro MEAN_16x16_SSE 0
  movq mm0, [eax]
  movq mm1, [eax+8]
  psadbw mm0, mm7
  psadbw mm1, mm7
  add eax, ecx
  paddw mm5, mm0
  paddw mm6, mm1
%endmacro

%macro ABS_16x16_SSE 0
  movq mm0, [eax]
  movq mm1, [eax+8]
  psadbw mm0, mm4
  psadbw mm1, mm4
  lea eax, [eax+ecx]
  paddw mm5, mm0
  paddw mm6, mm1
%endmacro

align 16
dev16_xmm:
  mov eax, [esp+ 4] ; Src
  mov ecx, [esp+ 8] ; Stride

  pxor mm7, mm7     ; zero
  pxor mm5, mm5     ; mean accums
  pxor mm6, mm6

  ; first pass: sum of all pixels (psadbw against zero), 16 rows
  MEAN_16x16_SSE
  MEAN_16x16_SSE
  MEAN_16x16_SSE
  MEAN_16x16_SSE
  MEAN_16x16_SSE
  MEAN_16x16_SSE
  MEAN_16x16_SSE
  MEAN_16x16_SSE
  MEAN_16x16_SSE
  MEAN_16x16_SSE
  MEAN_16x16_SSE
  MEAN_16x16_SSE
  MEAN_16x16_SSE
  MEAN_16x16_SSE
  MEAN_16x16_SSE
  MEAN_16x16_SSE

  paddusw mm6, mm5

  movq mm4, mm6
  psllq mm4, 32
  paddd mm4, mm6
  psrld mm4, 8      ; /= (16*16)

  packssdw mm4, mm4
  packuswb mm4, mm4 ; mm4 contains the mean

  mov eax, [esp+ 4] ; Src

  pxor mm5, mm5     ; sums
  pxor mm6, mm6

  ; second pass: sum of |pixel - mean| (psadbw against the packed mean), 16 rows
  ABS_16x16_SSE
  ABS_16x16_SSE
  ABS_16x16_SSE
  ABS_16x16_SSE
  ABS_16x16_SSE
  ABS_16x16_SSE
  ABS_16x16_SSE
  ABS_16x16_SSE
  ABS_16x16_SSE
  ABS_16x16_SSE
  ABS_16x16_SSE
  ABS_16x16_SSE
  ABS_16x16_SSE
  ABS_16x16_SSE
  ABS_16x16_SSE
  ABS_16x16_SSE

  paddusw mm6, mm5
  movq mm7, mm6
  psllq mm7, 32
  paddd mm6, mm7

  movd eax, mm6
  ret
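The following plain-C reference models are editorial sketches of what the routines above compute; the `*_ref` names are ours for illustration and are not part of XviD's sources. `sad16_xmm` and `sad8_xmm` return the sum of absolute byte differences between a 16x16 (resp. 8x8) current block and a reference block; each `psadbw` performs eight of those absolute differences and folds them into one accumulator. Note that `best_sad` is accepted by `sad16_xmm` but, as in the assembly, never read, so this version performs no early exit.

```c
#include <stdint.h>
#include <stdlib.h>

/* Reference model for sad16_xmm: 16x16 sum of absolute differences.
 * best_sad is ignored, mirroring the assembly, which never loads it. */
static uint32_t
sad16_ref(const uint8_t *cur, const uint8_t *ref,
          uint32_t stride, uint32_t best_sad)
{
    uint32_t sad = 0;
    (void)best_sad;
    for (int y = 0; y < 16; y++) {
        for (int x = 0; x < 16; x++)
            sad += (uint32_t)abs(cur[x] - ref[x]);
        cur += stride;
        ref += stride;
    }
    return sad;
}

/* Reference model for sad8_xmm: the same sum over an 8x8 block. */
static uint32_t
sad8_ref(const uint8_t *cur, const uint8_t *ref, uint32_t stride)
{
    uint32_t sad = 0;
    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++)
            sad += (uint32_t)abs(cur[x] - ref[x]);
        cur += stride;
        ref += stride;
    }
    return sad;
}
```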
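The bidirectional variants `sad16bi_xmm` and `sad8bi_xmm` compare the current block against the rounded average of two reference blocks (forward and backward prediction). `pavgb` averages unsigned bytes as `(a + b + 1) >> 1`, i.e. it rounds ties upward, so a faithful model must use the same rounding. A sketch under the same caveats, with one hypothetical helper parameterized by block size:

```c
#include <stdint.h>

/* Reference model for sad16bi_xmm / sad8bi_xmm: SAD between cur and the
 * pavgb-style rounded mean of ref1 and ref2. size is 16 or 8. */
static uint32_t
sadbi_ref(const uint8_t *cur, const uint8_t *ref1, const uint8_t *ref2,
          uint32_t stride, int size)
{
    uint32_t sad = 0;
    for (int y = 0; y < size; y++) {
        for (int x = 0; x < size; x++) {
            int avg = (ref1[x] + ref2[x] + 1) >> 1;  /* pavgb rounding */
            int d = cur[x] - avg;
            sad += (uint32_t)(d < 0 ? -d : d);
        }
        cur  += stride;
        ref1 += stride;
        ref2 += stride;
    }
    return sad;
}
```

`sad16bi_xmm` corresponds to `sadbi_ref(..., 16)` and `sad8bi_xmm` to `sadbi_ref(..., 8)`; in the assembly, each SADBI_8x8_XMM invocation covers two rows, so the four invocations after `.Loop` cover all eight rows of the block.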
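`dev16_xmm` returns the sum of absolute deviations of a 16x16 block from its own mean: the first pass sums all pixels with `psadbw` against zero, `psrld mm4, 8` divides that sum by 256 with truncation, `packssdw`/`packuswb` broadcast the mean into every byte of mm4, and the second pass accumulates |pixel - mean| with `psadbw` against mm4. Since the block sum is at most 255*256, the truncated mean always fits in one byte and the packing never saturates. A hedged C model of the same computation:

```c
#include <stdint.h>
#include <stdlib.h>

/* Reference model for dev16_xmm: sum of |pixel - mean| over a 16x16 block.
 * The mean is the truncated integer average (sum >> 8), matching psrld by 8. */
static uint32_t
dev16_ref(const uint8_t *cur, uint32_t stride)
{
    uint32_t sum = 0;
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            sum += cur[y * stride + x];

    uint8_t mean = (uint8_t)(sum >> 8);   /* /= 16*16, truncating */

    uint32_t dev = 0;
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            dev += (uint32_t)abs(cur[y * stride + x] - mean);

    return dev;
}
```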