; plugin_ssim-a.asm
;/****************************************************************************
; *
; * XVID MPEG-4 VIDEO CODEC
; * - optimized SSIM routines -
; *
; * Copyright(C) 2006 Johannes Reinhardt <johannes.reinhardt@gmx.de>
; *
; * This program is free software; you can redistribute it and/or modify it
; * under the terms of the GNU General Public License as published by
; * the Free Software Foundation; either version 2 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with this program; if not, write to the Free Software
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; *
; ***************************************************************************/
; Assemble everything below as 32-bit x86 code.
BITS 32
;-----------------------------------------------------------------------
; cglobal <name>
; Exports <name> as a global symbol. Two optional build-time defines
; change how the symbol is declared:
;   PREFIX     - the exported symbol is _<name>, and <name> is %define'd
;                so later uses of the bare name expand to the underscored
;                form.
;   MARK_FUNCS - the global declaration carries ":function <size>", where
;                the size is written as <name>.endfunc - <name>; each
;                exported routine therefore ends with a local ".endfunc"
;                label.
; NOTE(review): when PREFIX and MARK_FUNCS are both defined, the %define
; below makes the bare name expand to text that includes the
; ":function ..." suffix, which would also be substituted where the
; label is defined. Confirm this combination is never built.
;-----------------------------------------------------------------------
%macro cglobal 1
%ifdef PREFIX
%ifdef MARK_FUNCS
global _%1:function %1.endfunc-%1
%define %1 _%1:function %1.endfunc-%1
%else
global _%1
%define %1 _%1
%endif
%else
%ifdef MARK_FUNCS
global %1:function %1.endfunc-%1
%else
global %1
%endif
%endif
%endmacro
;-----------------------------------------------------------------------
; ACC_ROW dst, tmp
; Processes two consecutive 8-byte rows and steps the row pointer.
;   In:   ecx = address of the first row
;         edx = distance in bytes between rows
;         mm0 = second PSADBW operand; with mm0 = 0 (as lum_8x8_mmx sets
;               it) each PSADBW yields the plain sum of the 8 bytes
;   Out:  dst = PSADBW(row at ecx) + PSADBW(row at ecx+edx)
;         ecx = ecx + 2*edx
;   Clobbers: tmp
; NOTE(review): PSADBW on MMX registers is not part of the baseline MMX
; instruction set; confirm callers only run on CPUs that provide it.
;-----------------------------------------------------------------------
%macro ACC_ROW 2
        movq    %1, [ecx]               ; first row
        psadbw  %1, mm0
        movq    %2, [ecx+edx]           ; second row
        psadbw  %2, mm0
        paddw   %1, %2                  ; dst = both rows combined
        lea     ecx, [ecx+2*edx]        ; advance two rows (flags untouched)
%endmacro
;-----------------------------------------------------------------------
; CONSIM_1x8_SSE2
; Accumulates the squared and cross products of one 8-pixel row.
;   In:   ecx  = address of 8 bytes of the original row
;         edx  = address of 8 bytes of the compared row
;         xmm2 = 0 (used to zero-extend bytes to words)
;   Out:  xmm5 += pairwise sums of orig*orig   (4 dwords)
;         xmm6 += pairwise sums of comp*comp   (4 dwords)
;         xmm7 += pairwise sums of orig*comp   (4 dwords)
;   Clobbers: xmm0, xmm1, xmm3
;
; Only the low 8 bytes of each load are consumed by PUNPCKLBW, so the
; rows are fetched with 8-byte MOVQ loads. A 16-byte MOVDQU would read
; 8 bytes beyond each row, which can run past the end of the buffer.
;-----------------------------------------------------------------------
%macro CONSIM_1x8_SSE2 0
        movq      xmm0, [ecx]           ; 8 orig bytes, upper half zeroed
        movq      xmm1, [edx]           ; 8 comp bytes, upper half zeroed
        punpcklbw xmm0, xmm2            ; orig bytes -> 8 words
        punpcklbw xmm1, xmm2            ; comp bytes -> 8 words
        movaps    xmm3, xmm0            ; keep orig words for the cross term
        pmaddwd   xmm0, xmm0            ; orig*orig, adjacent pairs summed
        pmaddwd   xmm3, xmm1            ; orig*comp (before xmm1 is squared)
        pmaddwd   xmm1, xmm1            ; comp*comp
        paddd     xmm5, xmm0
        paddd     xmm6, xmm1
        paddd     xmm7, xmm3
%endmacro
;-----------------------------------------------------------------------
; CONSIM_HALF_MMX unpack_op
; Helper for CONSIM_1x8_MMX: handles four of the eight pixels of a row.
; unpack_op is punpcklbw (low four bytes) or punpckhbw (high four bytes).
;   In:   ecx = original row, edx = compared row, mm2 = 0
;   Out:  mm5 += orig*orig, mm6 += comp*comp, mm7 += orig*comp
;         (each as two dwords of pairwise sums)
;   Clobbers: mm0, mm1, mm3, mm4
;-----------------------------------------------------------------------
%macro CONSIM_HALF_MMX 1
        movq    mm0, [ecx]              ; orig bytes
        movq    mm1, [edx]              ; comp bytes
        %1      mm0, mm2                ; selected half -> 4 words
        %1      mm1, mm2
        movq    mm3, mm0
        movq    mm4, mm1
        pmaddwd mm3, mm3                ; orig*orig
        pmaddwd mm4, mm4                ; comp*comp
        pmaddwd mm1, mm0                ; orig*comp
        paddd   mm5, mm3
        paddd   mm6, mm4
        paddd   mm7, mm1
%endmacro

;-----------------------------------------------------------------------
; CONSIM_1x8_MMX
; Accumulates the squared and cross products of one 8-pixel row into
; mm5 / mm6 / mm7, low half of the row first, then the high half.
; Register contract is that of CONSIM_HALF_MMX.
;-----------------------------------------------------------------------
%macro CONSIM_1x8_MMX 0
        CONSIM_HALF_MMX punpcklbw
        CONSIM_HALF_MMX punpckhbw
%endmacro
;-----------------------------------------------------------------------
; CONSIM_STORE_TERM acc, off_a, off_b, off_out
; Helper for CONSIM_WRITEOUT. With a = dword [esp+off_a] and
; b = dword [esp+off_b], it stores
;       low32(acc) - ((a*b + 32) >> 6)
; through the pointer found at [esp+off_out]. Only the low 32 bits of
; the product are used.
;   Clobbers: eax, ecx, edx, flags
;-----------------------------------------------------------------------
%macro CONSIM_STORE_TERM 4
        mov     eax, [esp + %2]
        mul     dword [esp + %3]        ; edx:eax = a*b
        add     eax, 32                 ; +32 rounds the shift below
        shr     eax, 6                  ; divide by 64
        movd    ecx, %1                 ; low dword of the accumulator
        sub     ecx, eax
        mov     edx, [esp + %4]         ; destination pointer
        mov     [edx], ecx
%endmacro

;-----------------------------------------------------------------------
; CONSIM_WRITEOUT acc_devo, acc_devc, acc_corr
; Writes the three results of a consim routine. It must be expanded
; while esp still has its function-entry value, because the arguments
; are read at fixed offsets:
;       [esp+16] lumo   [esp+20] lumc
;       [esp+24] pdevo  [esp+28] pdevc  [esp+32] pcorr
; Results:
;       *pdevo = acc_devo - ((lumo*lumo + 32) >> 6)
;       *pdevc = acc_devc - ((lumc*lumc + 32) >> 6)
;       *pcorr = acc_corr - ((lumo*lumc + 32) >> 6)
;   Clobbers: eax, ecx, edx, flags
;-----------------------------------------------------------------------
%macro CONSIM_WRITEOUT 3
        CONSIM_STORE_TERM %1, 16, 16, 24
        CONSIM_STORE_TERM %2, 20, 20, 28
        CONSIM_STORE_TERM %3, 16, 20, 32
%endmacro
; Code section, followed by the three entry points this file exports.
; Symbol decoration is handled by the cglobal macro.
SECTION .text
cglobal lum_8x8_mmx
cglobal consim_sse2
cglobal consim_mmx
;-----------------------------------------------------------------------
; int lum_8x8_mmx(uint8_t *ptr, uint32_t stride)
; (same prototype as the C routine lum_8x8_c)
;
; Returns the sum of all 64 bytes of an 8x8 block.
;   In:   [esp+4] = ptr     address of the top-left byte
;         [esp+8] = stride  distance in bytes between rows
;   Out:  eax = sum
;   Clobbers: ecx, edx, mm0-mm7
; No EMMS is executed here.
; NOTE(review): confirm callers issue EMMS before using x87 code, and
; that this routine is only selected on CPUs that support PSADBW on MMX
; registers (used by ACC_ROW).
;-----------------------------------------------------------------------
ALIGN 16
lum_8x8_mmx:
        pxor    mm0, mm0                ; zero operand for ACC_ROW
        mov     edx, [esp + 8]          ; stride
        mov     ecx, [esp + 4]          ; ptr

        ACC_ROW mm1, mm2                ; rows 0-1
        ACC_ROW mm3, mm4                ; rows 2-3
        ACC_ROW mm5, mm6                ; rows 4-5
        ACC_ROW mm7, mm4                ; rows 6-7 (mm4 reused as scratch)

        paddw   mm5, mm7                ; rows 4-7
        paddw   mm1, mm3                ; rows 0-3
        paddw   mm1, mm5                ; whole block
        movd    eax, mm1
        ret
.endfunc
;-----------------------------------------------------------------------
; consim_sse2(ptro, ptrc, stride, lumo, lumc, pdevo, pdevc, pcorr)
;
; Over an 8x8 block of the original (ptro) and compared (ptrc) images,
; sums orig*orig, comp*comp and orig*comp, then writes the three
; results through pdevo, pdevc and pcorr (see CONSIM_WRITEOUT for the
; exact formula involving lumo and lumc).
;   In (stack, 32-bit):
;         [esp+4]  ptro     [esp+8]  ptrc     [esp+12] stride (bytes)
;         [esp+16] lumo     [esp+20] lumc
;         [esp+24] pdevo    [esp+28] pdevc    [esp+32] pcorr
;   Clobbers: eax, ecx, edx, flags, and the xmm registers used by
;             CONSIM_1x8_SSE2 plus xmm0-xmm2 and xmm5-xmm7
;
; The accumulators are simply zeroed before the row loop. The former
; "broadcast lumo/c" unpack/shuffle sequence acted on registers that had
; just been cleared, so it had no effect and has been removed.
;-----------------------------------------------------------------------
ALIGN 16
consim_sse2:
        mov     ecx, [esp + 4]          ; ptro
        mov     edx, [esp + 8]          ; ptrc
        mov     eax, [esp + 12]         ; stride

        pxor    xmm2, xmm2              ; zero, for byte->word unpacking
        pxor    xmm5, xmm5              ; sum of orig*orig
        pxor    xmm6, xmm6              ; sum of comp*comp
        pxor    xmm7, xmm7              ; sum of orig*comp

        ; Rows 0..6: accumulate, then step both pointers by one row.
%rep 7
        CONSIM_1x8_SSE2
        add     ecx, eax
        add     edx, eax
%endrep
        ; Row 7: no pointer advance needed afterwards.
        CONSIM_1x8_SSE2

        ; Fold the four dwords of each accumulator into dword 0.
        pshufd  xmm0, xmm5, 0x0E        ; dwords 2,3 -> positions 0,1
        paddd   xmm5, xmm0
        pshufd  xmm0, xmm5, 0x01        ; dword 1 -> position 0
        paddd   xmm5, xmm0

        pshufd  xmm1, xmm6, 0x0E
        paddd   xmm6, xmm1
        pshufd  xmm1, xmm6, 0x01
        paddd   xmm6, xmm1

        pshufd  xmm2, xmm7, 0x0E
        paddd   xmm7, xmm2
        pshufd  xmm2, xmm7, 0x01
        paddd   xmm7, xmm2

        CONSIM_WRITEOUT xmm5, xmm6, xmm7
        ret
.endfunc
;-----------------------------------------------------------------------
; consim_mmx(ptro, ptrc, stride, lumo, lumc, pdevo, pdevc, pcorr)
;
; MMX counterpart of consim_sse2. Over an 8x8 block it sums orig*orig,
; comp*comp and orig*comp, then writes the three results through pdevo,
; pdevc and pcorr via CONSIM_WRITEOUT.
;   In (stack, 32-bit):
;         [esp+4]  ptro     [esp+8]  ptrc     [esp+12] stride (bytes)
;         [esp+16] lumo     [esp+20] lumc
;         [esp+24] pdevo    [esp+28] pdevc    [esp+32] pcorr
;   Clobbers: eax, ecx, edx, flags, mm0-mm7
; No EMMS is executed here.
; NOTE(review): confirm callers issue EMMS before using x87 code.
;-----------------------------------------------------------------------
ALIGN 16
consim_mmx:
        mov     ecx, [esp + 4]          ; ptro
        mov     edx, [esp + 8]          ; ptrc
        mov     eax, [esp + 12]         ; stride

        pxor    mm2, mm2                ; zero, for byte->word unpacking
        pxor    mm5, mm5                ; sum of orig*orig
        pxor    mm6, mm6                ; sum of comp*comp
        pxor    mm7, mm7                ; sum of orig*comp

        ; Rows 0..6: accumulate, then step both pointers by one row.
%rep 7
        CONSIM_1x8_MMX
        add     ecx, eax
        add     edx, eax
%endrep
        ; Row 7: last row, pointers are not needed afterwards.
        CONSIM_1x8_MMX

        ; Add the high dword of each accumulator into its low dword.
        movq    mm0, mm5
        movq    mm1, mm6
        movq    mm2, mm7
        psrlq   mm0, 32
        psrlq   mm1, 32
        psrlq   mm2, 32
        paddd   mm5, mm0
        paddd   mm6, mm1
        paddd   mm7, mm2

        CONSIM_WRITEOUT mm5, mm6, mm7
        ret
.endfunc
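; ---------------------------------------------------------------------
; Residue from the web code viewer this file was copied from; it is not
; part of the original source.
; Keyboard shortcuts:
;   Copy code             Ctrl + C
;   Search code           Ctrl + F
;   Full-screen mode      F11
;   Toggle theme          Ctrl + Shift + D
;   Show shortcuts        ?
;   Increase font size    Ctrl + =
;   Decrease font size    Ctrl + -
; ---------------------------------------------------------------------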