?? mc-a2.asm
字號:
;*****************************************************************************
;* mc-a2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Mathieu Monnier <manao@melix.net>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

; Byte coefficients consumed by pmaddubsw (signed operand) in the ssse3 paths.
filt_mul20: times 16 db 20          ; 20*x + 20*y on each interleaved byte pair
filt_mul51: times 8 db 1, -5        ; 1*x - 5*y  on each interleaved byte pair

; Word constants added before the final arithmetic right shift.
pw_1:  times 8 dw 1
pw_16: times 8 dw 16
pw_32: times 8 dw 32

SECTION .text

;-----------------------------------------------------------------------------
; LOAD_ADD dst, mem_a, mem_b, tmp
;   dst = widen(low bytes of mem_a) + widen(low bytes of mem_b), as words.
;   Requires m0 == 0 (used as the zero half of punpcklbw). Clobbers tmp.
;-----------------------------------------------------------------------------
%macro LOAD_ADD 4
    movh       %4, %3
    movh       %1, %2
    punpcklbw  %4, m0
    punpcklbw  %1, m0
    paddw      %1, %4
%endmacro

;-----------------------------------------------------------------------------
; LOAD_ADD_2 dst_lo, dst_hi, mem_a, mem_b, tmp_lo, tmp_hi
;   Full-register variant of LOAD_ADD: loads mem_a and mem_b with mova,
;   widens both the low and the high byte halves to words, and produces
;   dst_lo = lo(mem_a)+lo(mem_b), dst_hi = hi(mem_a)+hi(mem_b).
;   Requires m0 == 0. Clobbers tmp_lo and tmp_hi.
;-----------------------------------------------------------------------------
%macro LOAD_ADD_2 6
    mova       %5, %3
    mova       %1, %4
    mova       %6, %5
    mova       %2, %1
    punpcklbw  %5, m0
    punpcklbw  %1, m0
    punpckhbw  %6, m0
    punpckhbw  %2, m0
    paddw      %1, %5
    paddw      %2, %6
%endmacro

;-----------------------------------------------------------------------------
; FILT_V2
;   Two interleaved 6-tap evaluations on words using shifts instead of
;   multiplies:  m1 = m1 - 5*m2 + 20*m3   and   m4 = m4 - 5*m5 + 20*m6.
;   Inputs (a,b,c) = (m1,m2,m3) and (m4,m5,m6). m2, m3, m5, m6 are destroyed.
;-----------------------------------------------------------------------------
%macro FILT_V2 0
    psubw  m1, m2  ; a-b
    psubw  m4, m5
    psubw  m2, m3  ; b-c
    psubw  m5, m6
    psllw  m2, 2
    psllw  m5, 2
    psubw  m1, m2  ; a-5*b+4*c
    psubw  m4, m5
    psllw  m3, 4
    psllw  m6, 4
    paddw  m1, m3  ; a-5*b+20*c
    paddw  m4, m6
%endmacro

;-----------------------------------------------------------------------------
; FILT_H a, b, c
;   %1 = (a - 5*b + 20*c) / 16, computed on words with two arithmetic shifts
;   and no multiply (intermediate shifts truncate). %2 and %3 are preserved.
;-----------------------------------------------------------------------------
%macro FILT_H 3
    psubw  %1, %2  ; a-b
    psraw  %1, 2   ; (a-b)/4
    psubw  %1, %2  ; (a-b)/4-b
    paddw  %1, %3  ; (a-b)/4-b+c
    psraw  %1, 2   ; ((a-b)/4-b+c)/4
    paddw  %1, %3  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
%endmacro

;-----------------------------------------------------------------------------
; FILT_H2 a0, b0, c0, a1, b1, c1
;   Same arithmetic as FILT_H, applied to two independent (a,b,c) triples
;   with the instructions interleaved. Results land in %1 and %4.
;-----------------------------------------------------------------------------
%macro FILT_H2 6
    psubw  %1, %2
    psubw  %4, %5
    psraw  %1, 2
    psraw  %4, 2
    psubw  %1, %2
    psubw  %4, %5
    paddw  %1, %3
    paddw  %4, %6
    psraw  %1, 2
    psraw  %4, 2
    paddw  %1, %3
    paddw  %4, %6
%endmacro

;-----------------------------------------------------------------------------
; FILT_PACK lo, hi, shift
;   Adds the word constant held in m7 to both inputs, shifts each right
;   arithmetically by `shift`, then packs lo:hi to unsigned-saturated bytes
;   in %1. The caller must have loaded m7 beforehand.
;-----------------------------------------------------------------------------
%macro FILT_PACK 3
    paddw     %1, m7
    paddw     %2, m7
    psraw     %1, %3
    psraw     %2, %3
    packuswb  %1, %2
%endmacro

INIT_MMX

;-----------------------------------------------------------------------------
; HPEL_V cpu_suffix [, xmm_regs_used = 0]
;   Emits x264_hpel_filter_v_<cpu_suffix>. The optional second argument is
;   forwarded to cglobal as the XMM register count.
;
;   r0 = dst, r1 = src, r2 = buf, r3 = stride, r4 = width (per the prototype).
;   r1 is moved back two rows and r5 = src + stride, so the six input rows are
;   [r1], [r1+r3], [r1+r3*2], [r5], [r5+r3], [r5+r3*2].
;   dst/buf are advanced by width and r4 is negated, so the loop runs r4 from
;   -width up to 0. Each iteration stores 2*mmsize bytes of unrounded 16-bit
;   sums to buf and mmsize rounded bytes ((x+16)>>5) to dst.
;   All vector accesses use mova — assumes suitably aligned pointers; confirm
;   against callers.
;-----------------------------------------------------------------------------
%macro HPEL_V 1-2 0
;-----------------------------------------------------------------------------
; void x264_hpel_filter_v_mmxext( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_v_%1, 5,6,%2
%ifdef WIN64
    movsxd   r4, r4d                ; width is a 32-bit int: sign-extend before pointer math
%endif
    lea r5, [r1+r3]
    sub r1, r3
    sub r1, r3
    add r0, r4
    lea r2, [r2+r4*2]
    neg r4
%ifnidn %1, ssse3
    pxor m0, m0                     ; zero register for byte->word unpacking
%else
    mova m0, [filt_mul51 GLOBAL]    ; (1,-5) coefficient pairs for pmaddubsw
%endif
.loop:
%ifidn %1, ssse3
    mova m1, [r1]
    mova m4, [r1+r3]
    mova m2, [r5+r3*2]
    mova m5, [r5+r3]
    mova m3, [r1+r3*2]
    mova m6, [r5]
    SBUTTERFLY bw, 1, 4, 7
    SBUTTERFLY bw, 2, 5, 7
    SBUTTERFLY bw, 3, 6, 7
    pmaddubsw m1, m0
    pmaddubsw m4, m0
    pmaddubsw m2, m0
    pmaddubsw m5, m0
    pmaddubsw m3, [filt_mul20 GLOBAL]
    pmaddubsw m6, [filt_mul20 GLOBAL]
    paddw  m1, m2
    paddw  m4, m5
    paddw  m1, m3
    paddw  m4, m6
%else
    LOAD_ADD_2 m1, m4, [r1     ], [r5+r3*2], m6, m7            ; a0 / a1
    LOAD_ADD_2 m2, m5, [r1+r3  ], [r5+r3  ], m6, m7            ; b0 / b1
    LOAD_ADD   m3,     [r1+r3*2], [r5     ], m7                ; c0
    LOAD_ADD   m6,     [r1+r3*2+mmsize/2], [r5+mmsize/2], m7   ; c1
    FILT_V2
%endif
    mova      m7, [pw_16 GLOBAL]
    mova      [r2+r4*2], m1         ; keep unrounded 16-bit sums in buf
    mova      [r2+r4*2+mmsize], m4
    paddw     m1, m7
    paddw     m4, m7
    psraw     m1, 5
    psraw     m4, 5
    packuswb  m1, m4
    mova      [r0+r4], m1
    add r1, mmsize
    add r5, mmsize
    add r4, mmsize
    jl .loop
    REP_RET
%endmacro
HPEL_V mmxext

;-----------------------------------------------------------------------------
; void x264_hpel_filter_c_mmxext( uint8_t *dst, int16_t *buf, int width );
;
; r0 = dst, r1 = buf, r2 = width. Pointers are advanced by width and r2 is
; negated so the loop counts r2 from -width up to 0. Each iteration filters
; the 16-bit values of buf horizontally and writes 8 output bytes
; ((x+32)>>6, unsigned-saturated) with a non-temporal store.
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_c_mmxext, 3,3
    add r0, r2
    lea r1, [r1+r2*2]
    neg r2
    %define src r1+r2*2
    movq m7, [pw_32 GLOBAL]         ; rounding constant consumed by FILT_PACK
.loop:
    movq   m1, [src-4]
    movq   m2, [src-2]
    movq   m3, [src  ]
    movq   m4, [src+4]
    movq   m5, [src+6]
    paddw  m3, [src+2]  ; c0
    paddw  m2, m4       ; b0
    paddw  m1, m5       ; a0
    movq   m6, [src+8]
    paddw  m4, [src+14] ; a1
    paddw  m5, [src+12] ; b1
    paddw  m6, [src+10] ; c1
    FILT_H2 m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, 6
    movntq [r0+r2], m1
    add r2, 8
    jl .loop
    REP_RET

;-----------------------------------------------------------------------------
; void x264_hpel_filter_h_mmxext( uint8_t *dst, uint8_t *src, int width );
;
; r0 = dst, r1 = src, r2 = width; same negative-index loop as above.
; Each iteration reads bytes src-2 .. src+10, widens them to words, and
; writes 8 output bytes ((x+1)>>1 after FILT_H2) with a non-temporal store.
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_h_mmxext, 3,3
    add r0, r2
    add r1, r2
    neg r2
    %define src r1+r2
    pxor m0, m0
.loop:
    movd       m1, [src-2]
    movd       m2, [src-1]
    movd       m3, [src  ]
    movd       m6, [src+1]
    movd       m4, [src+2]
    movd       m5, [src+3]
    punpcklbw  m1, m0
    punpcklbw  m2, m0
    punpcklbw  m3, m0
    punpcklbw  m6, m0
    punpcklbw  m4, m0
    punpcklbw  m5, m0
    paddw      m3, m6 ; c0
    paddw      m2, m4 ; b0
    paddw      m1, m5 ; a0
    movd       m7, [src+7]
    movd       m6, [src+6]
    punpcklbw  m7, m0
    punpcklbw  m6, m0
    paddw      m4, m7 ; a1 (src+2 + src+7: outer taps of the second group)
    paddw      m5, m6 ; b1
    movd       m7, [src+5]
    movd       m6, [src+4]
    punpcklbw  m7, m0
    punpcklbw  m6, m0
    paddw      m6, m7 ; c1 (src+4 + src+5: centre taps of the second group)
    movq       m7, [pw_1 GLOBAL]    ; m7 was scratch above; reload rounding constant
    FILT_H2 m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, 1
    movntq     [r0+r2], m1
    add r2, 8
    jl .loop
    REP_RET

INIT_XMM

;-----------------------------------------------------------------------------
; HPEL_C cpu_suffix
;   Emits x264_hpel_filter_c_<cpu_suffix>: horizontal pass over the 16-bit
;   intermediates in buf, 8 output bytes per iteration ((x+32)>>6).
;
;   The pw_32 constant lives in m7 for ssse3, in m8 on other x86_64 builds,
;   and is a memory operand otherwise (tpw_32).
;   The sse2_misalign variant reads the shifted neighbours straight from
;   memory at unaligned offsets; the other variants load three aligned
;   vectors and build the neighbours with PALIGNR (m7 is passed as its temp).
;-----------------------------------------------------------------------------
%macro HPEL_C 1
;-----------------------------------------------------------------------------
; void x264_hpel_filter_c_sse2( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_c_%1, 3,3,9
    add r0, r2
    lea r1, [r1+r2*2]
    neg r2
    %define src r1+r2*2
%ifidn %1, ssse3
    mova    m7, [pw_32 GLOBAL]
    %define tpw_32 m7
%elifdef ARCH_X86_64
    mova    m8, [pw_32 GLOBAL]
    %define tpw_32 m8
%else
    %define tpw_32 [pw_32 GLOBAL]
%endif
.loop:
%ifidn %1,sse2_misalign
    movu    m0, [src-4]
    movu    m1, [src-2]
    mova    m2, [src]
    paddw   m0, [src+6]
    paddw   m1, [src+4]
    paddw   m2, [src+2]
%else
    mova    m6, [src-16]
    mova    m2, [src]
    mova    m3, [src+16]
    mova    m0, m2
    mova    m1, m2
    mova    m4, m3
    mova    m5, m3
    PALIGNR m3, m2, 2, m7
    PALIGNR m4, m2, 4, m7
    PALIGNR m5, m2, 6, m7
    PALIGNR m0, m6, 12, m7
    PALIGNR m1, m6, 14, m7
    paddw   m2, m3
    paddw   m1, m4
    paddw   m0, m5
%endif
    FILT_H   m0, m1, m2
    paddw    m0, tpw_32
    psraw    m0, 6
    packuswb m0, m0
    movq [r0+r2], m0
    add r2, 8
    jl .loop
    REP_RET
%endmacro

;-----------------------------------------------------------------------------
; void x264_hpel_filter_h_sse2( uint8_t *dst, uint8_t *src, int width );
;
; r0 = dst, r1 = src, r2 = width; negative-index loop as in the mmxext
; version. Each iteration reads bytes src-2 .. src+18, widens to words, and
; writes 16 output bytes with a non-temporal store.
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_h_sse2, 3,3,8
    add r0, r2
    add r1, r2
    neg r2
    %define src r1+r2
    pxor m0, m0
.loop:
    movh       m1, [src-2]
    movh       m2, [src-1]
    movh       m3, [src  ]
    movh       m4, [src+1]
    movh       m5, [src+2]
    movh       m6, [src+3]
    punpcklbw  m1, m0
    punpcklbw  m2, m0
    punpcklbw  m3, m0
    punpcklbw  m4, m0
    punpcklbw  m5, m0
    punpcklbw  m6, m0
    paddw      m3, m4 ; c0
    paddw      m2, m5 ; b0
    paddw      m1, m6 ; a0
    movh       m4, [src+6]
    movh       m5, [src+7]
    movh       m6, [src+10]
    movh       m7, [src+11]
    punpcklbw  m4, m0
    punpcklbw  m5, m0
    punpcklbw  m6, m0
    punpcklbw  m7, m0
    paddw      m5, m6 ; b1
    paddw      m4, m7 ; a1
    movh       m6, [src+8]
    movh       m7, [src+9]
    punpcklbw  m6, m0
    punpcklbw  m7, m0
    paddw      m6, m7 ; c1
    mova       m7, [pw_1 GLOBAL] ; FIXME xmm8
    FILT_H2 m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, 1
    movntdq    [r0+r2], m1
    add r2, 16
    jl .loop
    REP_RET

%ifndef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_hpel_filter_h_ssse3( uint8_t *dst, uint8_t *src, int width );
;
; Only built when ARCH_X86_64 is not defined. Keeps a sliding window of
; widened 8-pixel groups in registers (m1 = previous, m2 = current on loop
; entry) and derives the shifted neighbours with palignr instead of
; reloading from memory. Produces 16 output bytes per iteration.
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_h_ssse3, 3,3
    add r0, r2
    add r1, r2
    neg r2
    %define src r1+r2
    pxor m0, m0
    movh m1, [src-8]
    punpcklbw m1, m0         ; 00 -1 00 -2 00 -3 00 -4 00 -5 00 -6 00 -7 00 -8
    movh m2, [src]
    punpcklbw m2, m0
    mova       m7, [pw_1 GLOBAL]
.loop:
    movh m3, [src+8]
    punpcklbw m3, m0

    mova       m4, m2
    palignr    m2, m1, 14
    mova       m5, m3
    palignr    m3, m4, 4
    paddw      m3, m2

    mova       m2, m4
    palignr    m4, m1, 12
    mova       m1, m5
    palignr    m5, m2, 6
    paddw      m5, m4

    mova       m4, m1
    palignr    m1, m2, 2
    paddw      m1, m2

    FILT_H     m5, m3, m1

    movh m1, [src+16]
    punpcklbw m1, m0

    mova       m3, m4
    palignr    m4, m2, 14
    mova       m6, m1
    palignr    m1, m3, 4
    paddw      m1, m4

    mova       m4, m3
    palignr    m3, m2, 12
    mova       m2, m6
    palignr    m6, m4, 6
    paddw      m6, m3

    mova       m3, m2
    palignr    m2, m4, 2
    paddw      m2, m4

    FILT_H     m6, m1, m2
    FILT_PACK  m5, m6, 1

    movdqa     [r0+r2], m5
    add r2, 16                      ; sets the flags tested by jl below
    mova       m2, m3               ; register moves do not disturb those flags
    mova       m1, m4
    jl .loop
    REP_RET
%endif

; Instantiate the macro-generated variants. PALIGNR selects the emulated
; (PALIGNR_MMX) or native (PALIGNR_SSSE3) implementation used inside HPEL_C.
%define PALIGNR PALIGNR_MMX
%ifndef ARCH_X86_64
HPEL_C sse2
%endif
HPEL_V sse2, 8
HPEL_C sse2_misalign
%define PALIGNR PALIGNR_SSSE3
HPEL_C ssse3
; The ssse3 body of HPEL_V writes m6 and m7 (xmm6/xmm7), which are callee-saved
; under the Win64 ABI, so the XMM register count must be declared exactly as
; for the sse2 instantiation above; otherwise cglobal will not preserve them.
HPEL_V ssse3, 8

; NOTE(review): the %endif that closes the following %ifdef ARCH_X86_64 is not
; visible in this excerpt; it is presumably further down the file.
%ifdef ARCH_X86_64

;-----------------------------------------------------------------------------
; DO_FILT_V out_lo, out_hi, tmp_a, tmp_b, dst_offset, cpu_suffix
;   Vertical 6-tap pass over 16 pixels. Rows are read from r3 (+r2, +r2*2)
;   and r1 (+r2, +r2*2). The unrounded word sums are left in %1 (low half)
;   and %2 (high half); the rounded bytes ((x + m15) >> 5) are stored
;   non-temporally to [r11+r4+%5]. Advances r3 and r1 by 16.
;   Expects m15 to hold the rounding constant; the non-ssse3 path also needs
;   m0 == 0 (LOAD_ADD_2). Clobbers m0-m6 plus %3 and %4.
;-----------------------------------------------------------------------------
%macro DO_FILT_V 6
%ifidn %6, ssse3
    mova m1, [r3]
    mova m2, [r3+r2]
    mova %3, [r3+r2*2]
    mova m3, [r1]
    mova %4, [r1+r2]
    mova m0, [r1+r2*2]
    mova %2, [filt_mul51 GLOBAL]
    mova m4, m1
    punpcklbw m1, m2
    punpckhbw m4, m2
    mova m2, m0
    punpcklbw m0, %4
    punpckhbw m2, %4
    mova %1, m3
    punpcklbw m3, %3
    punpckhbw %1, %3
    mova %3, m3
    mova %4, %1
    pmaddubsw m1, %2
    pmaddubsw m4, %2
    pmaddubsw m0, %2
    pmaddubsw m2, %2
    pmaddubsw m3, [filt_mul20 GLOBAL]
    pmaddubsw %1, [filt_mul20 GLOBAL]
    psrlw     %3, 8
    psrlw     %4, 8
    paddw m1, m0
    paddw m4, m2
    paddw m1, m3
    paddw m4, %1
%else
    LOAD_ADD_2 m1, m4, [r3     ], [r1+r2*2], m2, m5            ; a0 / a1
    LOAD_ADD_2 m2, m5, [r3+r2  ], [r1+r2  ], m3, m6            ; b0 / b1
    LOAD_ADD_2 m3, m6, [r3+r2*2], [r1     ], %3, %4            ; c0 / c1
    FILT_V2
%endif
    mova      %1, m1
    mova      %2, m4
    paddw     m1, m15
    paddw     m4, m15
    add       r3, 16
    add       r1, 16
    psraw     m1, 5
    psraw     m4, 5
    packuswb  m1, m4
    movntps  [r11+r4+%5], m1
%endmacro

;-----------------------------------------------------------------------------
; DO_FILT_H prev, cur, next, shift
;   Horizontal 6-tap pass over the eight words of `cur`, pulling neighbours
;   from `prev` and `next` with PALIGNR. The result, (FILT_H + m15) >> shift,
;   replaces %3; the original contents of %3 are moved into %1.
;   Clobbers m1-m4.
;-----------------------------------------------------------------------------
%macro DO_FILT_H 4
    mova      m1, %2
    PALIGNR   m1, %1, 12, m4
    mova      m2, %2
    PALIGNR   m2, %1, 14, m4
    mova      %1, %3
    PALIGNR   %3, %2, 6, m4
    mova      m3, %1
    PALIGNR   m3, %2, 4, m4
    mova      m4, %1
    paddw     %3, m1
    PALIGNR   m4, %2, 2, m1
    paddw     m3, m2
    paddw     m4, %2
    FILT_H    %3, m3, m4
    paddw     %3, m15
    psraw     %3, %4
%endmacro

;-----------------------------------------------------------------------------
; DO_FILT_CC a, b, out_lo, out_hi
;   Two DO_FILT_H passes with shift 6, packed to bytes and stored
;   non-temporally to [r5+r4].
;-----------------------------------------------------------------------------
%macro DO_FILT_CC 4
    DO_FILT_H %1, %2, %3, 6
    DO_FILT_H %2, %1, %4, 6
    packuswb  %3, %4
    movntps   [r5+r4], %3
%endmacro

;-----------------------------------------------------------------------------
; DO_FILT_HH a, b, out_lo, out_hi
;   Two DO_FILT_H passes with shift 1, packed to bytes and stored
;   non-temporally to [r0+r4].
;-----------------------------------------------------------------------------
%macro DO_FILT_HH 4
    DO_FILT_H %1, %2, %3, 1
    DO_FILT_H %2, %1, %4, 1
    packuswb  %3, %4
    movntps   [r0+r4], %3
%endmacro

;-----------------------------------------------------------------------------
; DO_FILT_H2 a0, b0, c0, a1, b1, c1
;   One DO_FILT_H with shift 6 on (%1,%2,%3), then m15 is shifted right by 5
;   in place before a second DO_FILT_H with shift 1 on (%4,%5,%6); the two
;   results are packed into %6. Note that m15 is left modified.
;-----------------------------------------------------------------------------
%macro DO_FILT_H2 6
    DO_FILT_H %1, %2, %3, 6
    psrlw    m15, 5
    DO_FILT_H %4, %5, %6, 1
    packuswb  %6, %3
%endmacro
?? 快捷鍵說明
複製代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -