; mc-a2.asm
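; The x264_hpel_filter_* routines below produce the three H.264 half-pel planes
; (horizontal, vertical, centre) with the standard 6-tap (1,-5,20,20,-5,1) filter.
; As a reading aid, here is a plain-C sketch of the same computation.  It is
; modelled on x264's C fallback as I understand it: the clip helper, the scratch
; buffer handling and the exact loop bounds are illustrative assumptions, not a
; verbatim reference.
;
;   #include <stdint.h>
;   #include <stdlib.h>
;
;   static inline uint8_t clip_uint8( int x )
;   {
;       return x < 0 ? 0 : x > 255 ? 255 : x;
;   }
;
;   /* 6-tap (1,-5,20,20,-5,1) filter along step d; uses the enclosing loop variable x */
;   #define TAPFILTER( pix, d ) \
;       ( (pix)[x-2*(d)] - 5*(pix)[x-(d)] + 20*(pix)[x] + 20*(pix)[x+(d)] \
;       - 5*(pix)[x+2*(d)] + (pix)[x+3*(d)] )
;
;   /* dsth: horizontal half-pel, dstv: vertical half-pel, dstc: centre (both).
;    * Assumes padded planes, so the few accesses outside [0,width) are safe. */
;   static void hpel_filter_c( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                              uint8_t *src, int stride, int width, int height )
;   {
;       int16_t *buf = malloc( (width+5) * sizeof(int16_t) );
;       for( int y = 0; y < height; y++ )
;       {
;           for( int x = -2; x < width+3; x++ )
;           {
;               int v = TAPFILTER( src, stride );      /* vertical pass */
;               dstv[x] = clip_uint8( (v + 16) >> 5 );
;               buf[x+2] = v;                          /* keep full precision for dstc */
;           }
;           for( int x = 0; x < width; x++ )           /* horizontal pass over the 16-bit row */
;               dstc[x] = clip_uint8( (TAPFILTER( buf+2, 1 ) + 512) >> 10 );
;           for( int x = 0; x < width; x++ )           /* horizontal pass over the source */
;               dsth[x] = clip_uint8( (TAPFILTER( src, 1 ) + 16) >> 5 );
;           dsth += stride;
;           dstv += stride;
;           dstc += stride;
;           src  += stride;
;       }
;       free( buf );
;   }
;
; The asm does the same work 16 pixels per iteration, keeping the 16-bit vertical
; intermediates in registers; the values recycled through m15 are the small
; rounding constants (see the pw_1 / pw_16 comments).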
%macro HPEL 1
;-----------------------------------------------------------------------------
; void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                             uint8_t *src, int stride, int width, int height)
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_%1, 7,7,16
%ifdef WIN64
    movsxd     r4, r4d
    movsxd     r5, r5d
%endif
    mov        r10, r3
    sub        r5, 16
    mov        r11, r1
    and        r10, 15
    sub        r3, r10
    add        r0, r5
    add        r11, r5
    add        r10, r5
    add        r5, r2
    mov        r2, r4
    neg        r10
    lea        r1, [r3+r2]
    sub        r3, r2
    sub        r3, r2
    mov        r4, r10
%ifidn %1, sse2
    pxor       m0, m0
%endif
    pcmpeqw    m15, m15
    psrlw      m15, 15 ; pw_1
    psllw      m15, 4
;ALIGN 16
.loopy:
; first filter_v
; prefetching does not help here! lots of variants tested, all slower
    DO_FILT_V  m8, m7, m13, m12, 0, %1
;ALIGN 16
.loopx:
    DO_FILT_V  m6, m5, m11, m10, 16, %1
.lastx:
    paddw      m15, m15
    DO_FILT_CC m9, m8, m7, m6
    movdqa     m7, m12 ; not really necessary, but seems free and
    movdqa     m6, m11 ; gives far shorter code
    psrlw      m15, 5
    DO_FILT_HH m14, m13, m7, m6
    psllw      m15, 4 ; pw_16
    movdqa     m7, m5
    movdqa     m12, m10
    add        r4, 16
    jl .loopx
    cmp        r4, 16
    jl .lastx
; setup regs for next y
    sub        r4, r10
    sub        r4, r2
    sub        r1, r4
    sub        r3, r4
    add        r0, r2
    add        r11, r2
    add        r5, r2
    mov        r4, r10
    sub        r6d, 1
    jg .loopy
    sfence
    RET
%endmacro

%define PALIGNR PALIGNR_MMX
HPEL sse2
%define PALIGNR PALIGNR_SSSE3
HPEL ssse3
%endif

cglobal x264_sfence
    sfence
    ret

;-----------------------------------------------------------------------------
; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst,
;                              uint8_t *src, int i_src, int w, int h)
;-----------------------------------------------------------------------------
cglobal x264_plane_copy_mmxext, 6,7
    movsxdifnidn r1, r1d
    movsxdifnidn r3, r3d
    add    r4d, 3
    and    r4d, ~3
    mov    r6d, r4d
    and    r6d, ~15
    sub    r1, r6
    sub    r3, r6
.loopy:
    mov    r6d, r4d
    sub    r6d, 64
    jl     .endx
.loopx:
    prefetchnta [r2+256]
    movq   mm0, [r2   ]
    movq   mm1, [r2+ 8]
    movq   mm2, [r2+16]
    movq   mm3, [r2+24]
    movq   mm4, [r2+32]
    movq   mm5, [r2+40]
    movq   mm6, [r2+48]
    movq   mm7, [r2+56]
    movntq [r0   ], mm0
    movntq [r0+ 8], mm1
    movntq [r0+16], mm2
    movntq [r0+24], mm3
    movntq [r0+32], mm4
    movntq [r0+40], mm5
    movntq [r0+48], mm6
    movntq [r0+56], mm7
    add    r2, 64
    add    r0, 64
    sub    r6d, 64
    jge    .loopx
.endx:
    prefetchnta [r2+256]
    add    r6d, 48
    jl     .end16
.loop16:
    movq   mm0, [r2  ]
    movq   mm1, [r2+8]
    movntq [r0  ], mm0
    movntq [r0+8], mm1
    add    r2, 16
    add    r0, 16
    sub    r6d, 16
    jge    .loop16
.end16:
    add    r6d, 12
    jl     .end4
.loop4:
    movd   mm2, [r2+r6]
    movd   [r0+r6], mm2
    sub    r6d, 4
    jge    .loop4
.end4:
    add    r2, r3
    add    r0, r1
    dec    r5d
    jg     .loopy
    sfence
    emms
    RET

; These functions are not general-use; not only do the SSE ones require aligned input,
; but they also will fail if given a non-mod16 size or a size less than 64.
; memzero SSE will fail for non-mod128.

;-----------------------------------------------------------------------------
; void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
cglobal x264_memcpy_aligned_mmx, 3,3
    test r2d, 16
    jz .copy32
    sub r2d, 16
    movq mm0, [r1 + r2 + 0]
    movq mm1, [r1 + r2 + 8]
    movq [r0 + r2 + 0], mm0
    movq [r0 + r2 + 8], mm1
.copy32:
    sub r2d, 32
    movq mm0, [r1 + r2 + 0]
    movq mm1, [r1 + r2 + 8]
    movq mm2, [r1 + r2 + 16]
    movq mm3, [r1 + r2 + 24]
    movq [r0 + r2 + 0], mm0
    movq [r0 + r2 + 8], mm1
    movq [r0 + r2 + 16], mm2
    movq [r0 + r2 + 24], mm3
    jg .copy32
    REP_RET
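; Given the caveats above (16-byte-aligned buffers, size a multiple of 16 and at
; least 64 bytes, and a multiple of 128 for the SSE memzero), callers have to
; round their sizes up.  A hypothetical C caller is sketched below; only the
; x264_memcpy_aligned_sse2 prototype comes from this file, the helper name and
; the use of C11 aligned_alloc are illustrative assumptions.
;
;   #include <stdint.h>
;   #include <stdlib.h>
;
;   void *x264_memcpy_aligned_sse2( void *dst, const void *src, size_t n );
;
;   /* Hypothetical helper: duplicate 'count' uint16_t entries via the SSE2
;    * aligned copy.  The byte count is rounded up to a multiple of 16 and to
;    * the 64-byte minimum; src is assumed 16-byte aligned and padded to match. */
;   static uint16_t *copy_costs_aligned( const uint16_t *src, size_t count )
;   {
;       size_t bytes = (count * sizeof(uint16_t) + 15) & ~(size_t)15;
;       if( bytes < 64 )
;           bytes = 64;
;       uint16_t *dst = aligned_alloc( 16, bytes );  /* C11; bytes is a multiple of 16 */
;       if( dst )
;           x264_memcpy_aligned_sse2( dst, src, bytes );
;       return dst;
;   }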
;-----------------------------------------------------------------------------
; void *x264_memcpy_aligned_sse2( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
cglobal x264_memcpy_aligned_sse2, 3,3
    test r2d, 16
    jz .copy32
    sub r2d, 16
    movdqa xmm0, [r1 + r2]
    movdqa [r0 + r2], xmm0
.copy32:
    test r2d, 32
    jz .copy64
    sub r2d, 32
    movdqa xmm0, [r1 + r2 + 0]
    movdqa [r0 + r2 + 0], xmm0
    movdqa xmm1, [r1 + r2 + 16]
    movdqa [r0 + r2 + 16], xmm1
.copy64:
    sub r2d, 64
    movdqa xmm0, [r1 + r2 + 0]
    movdqa [r0 + r2 + 0], xmm0
    movdqa xmm1, [r1 + r2 + 16]
    movdqa [r0 + r2 + 16], xmm1
    movdqa xmm2, [r1 + r2 + 32]
    movdqa [r0 + r2 + 32], xmm2
    movdqa xmm3, [r1 + r2 + 48]
    movdqa [r0 + r2 + 48], xmm3
    jg .copy64
    REP_RET

;-----------------------------------------------------------------------------
; void *x264_memzero_aligned( void *dst, size_t n );
;-----------------------------------------------------------------------------
%macro MEMZERO 1
cglobal x264_memzero_aligned_%1, 2,2
    pxor m0, m0
.loop:
    sub r1d, mmsize*8
%assign i 0
%rep 8
    mova [r0 + r1 + i], m0
%assign i i+mmsize
%endrep
    jg .loop
    REP_RET
%endmacro

INIT_MMX
MEMZERO mmx
INIT_XMM
MEMZERO sse2

;-----------------------------------------------------------------------------
; void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride )
;-----------------------------------------------------------------------------
cglobal x264_integral_init4h_sse4, 3,4
    lea     r3, [r0+r2*2]
    add     r1, r2
    neg     r2
    pxor    m4, m4
.loop:
    movdqa  m0, [r1+r2]
    movdqa  m1, [r1+r2+16]
    palignr m1, m0, 8
    mpsadbw m0, m4, 0
    mpsadbw m1, m4, 0
    paddw   m0, [r0+r2*2]
    paddw   m1, [r0+r2*2+16]
    movdqa  [r3+r2*2   ], m0
    movdqa  [r3+r2*2+16], m1
    add     r2, 16
    jl .loop
    REP_RET

cglobal x264_integral_init8h_sse4, 3,4
    lea     r3, [r0+r2*2]
    add     r1, r2
    neg     r2
    pxor    m4, m4
.loop:
    movdqa  m0, [r1+r2]
    movdqa  m1, [r1+r2+16]
    palignr m1, m0, 8
    movdqa  m2, m0
    movdqa  m3, m1
    mpsadbw m0, m4, 0
    mpsadbw m1, m4, 0
    mpsadbw m2, m4, 4
    mpsadbw m3, m4, 4
    paddw   m0, [r0+r2*2]
    paddw   m1, [r0+r2*2+16]
    paddw   m0, m2
    paddw   m1, m3
    movdqa  [r3+r2*2   ], m0
    movdqa  [r3+r2*2+16], m1
    add     r2, 16
    jl .loop
    REP_RET

%macro INTEGRAL_INIT 1
;-----------------------------------------------------------------------------
; void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride )
;-----------------------------------------------------------------------------
cglobal x264_integral_init4v_%1, 3,5
    shl   r2, 1
    add   r0, r2
    add   r1, r2
    lea   r3, [r0+r2*4]
    lea   r4, [r0+r2*8]
    neg   r2
.loop:
    movu  m0, [r0+r2+8]
    mova  m2, [r0+r2]
    movu  m1, [r4+r2+8]
    paddw m0, m2
    paddw m1, [r4+r2]
    mova  m3, [r3+r2]
    psubw m1, m0
    psubw m3, m2
    mova  [r0+r2], m1
    mova  [r1+r2], m3
    add   r2, mmsize
    jl .loop
    REP_RET

;-----------------------------------------------------------------------------
; void x264_integral_init8v_mmx( uint16_t *sum8, int stride )
;-----------------------------------------------------------------------------
cglobal x264_integral_init8v_%1, 3,3
    shl   r1, 1
    add   r0, r1
    lea   r2, [r0+r1*8]
    neg   r1
.loop:
    mova  m0, [r2+r1]
    mova  m1, [r2+r1+mmsize]
    psubw m0, [r0+r1]
    psubw m1, [r0+r1+mmsize]
    mova  [r0+r1], m0
    mova  [r0+r1+mmsize], m1
    add   r1, 2*mmsize
    jl .loop
    REP_RET
%endmacro

INIT_MMX
INTEGRAL_INIT mmx
INIT_XMM
INTEGRAL_INIT sse2
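; In plain C, the two integral-image stages above amount to the following sketch
; (modelled on x264's C fallback; the loop bounds and border handling here are
; assumptions).  init4h builds running sums of 4 horizontally adjacent pixels,
; accumulated row over row, which is what mpsadbw against a zero register plus
; paddw computes; init4v then differences rows of that array into 4x4 and 8x8
; block sums, matching the paddw/psubw sequence.  The 8-wide variants do the
; same with 8 pixels per sum.
;
;   #include <stdint.h>
;
;   /* sum points at the row being produced; sum[x-stride] is the row above. */
;   static void integral_init4h_c( uint16_t *sum, uint8_t *pix, int stride )
;   {
;       int v = pix[0] + pix[1] + pix[2] + pix[3];
;       for( int x = 0; x < stride-4; x++ )
;       {
;           sum[x] = v + sum[x-stride];
;           v += pix[x+4] - pix[x];   /* slide the 4-pixel window */
;       }
;   }
;
;   /* sum4[x] becomes a 4x4 block sum; sum8[x] is rewritten in place as an
;    * 8x8 block sum, using the rows 4 and 8 lines further down. */
;   static void integral_init4v_c( uint16_t *sum8, uint16_t *sum4, int stride )
;   {
;       for( int x = 0; x < stride-8; x++ )
;           sum4[x] = sum8[x+4*stride] - sum8[x];
;       for( int x = 0; x < stride-8; x++ )
;           sum8[x] = sum8[x+8*stride] + sum8[x+8*stride+4]
;                   - sum8[x]          - sum8[x+4];
;   }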
%macro FILT8x4 7
    mova      %3, [r0+%7]
    mova      %4, [r0+r5+%7]
    pavgb     %3, %4
    pavgb     %4, [r0+r5*2+%7]
    PALIGNR   %1, %3, 1, m6
    PALIGNR   %2, %4, 1, m6
    pavgb     %1, %3
    pavgb     %2, %4
    mova      %5, %1
    mova      %6, %2
    pand      %1, m7
    pand      %2, m7
    psrlw     %5, 8
    psrlw     %6, 8
%endmacro

%macro FILT16x2 4
    mova      m3, [r0+%4+mmsize]
    mova      m2, [r0+%4]
    pavgb     m3, [r0+%4+r5+mmsize]
    pavgb     m2, [r0+%4+r5]
    PALIGNR   %1, m3, 1, m6
    pavgb     %1, m3
    PALIGNR   m3, m2, 1, m6
    pavgb     m3, m2
    mova      m5, m3
    mova      m4, %1
    pand      m3, m7
    pand      %1, m7
    psrlw     m5, 8
    psrlw     m4, 8
    packuswb  m3, %1
    packuswb  m5, m4
    mova    [%2], m3
    mova    [%3], m5
    mova      %1, m2
%endmacro

%macro FILT8x2U 3
    mova      m3, [r0+%3+8]
    mova      m2, [r0+%3]
    pavgb     m3, [r0+%3+r5+8]
    pavgb     m2, [r0+%3+r5]
    mova      m1, [r0+%3+9]
    mova      m0, [r0+%3+1]
    pavgb     m1, [r0+%3+r5+9]
    pavgb     m0, [r0+%3+r5+1]
    pavgb     m1, m3
    pavgb     m0, m2
    mova      m3, m1
    mova      m2, m0
    pand      m1, m7
    pand      m0, m7
    psrlw     m3, 8
    psrlw     m2, 8
    packuswb  m0, m1
    packuswb  m2, m3
    mova    [%1], m0
    mova    [%2], m2
%endmacro

;-----------------------------------------------------------------------------
; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                              int src_stride, int dst_stride, int width, int height )
;-----------------------------------------------------------------------------
%macro FRAME_INIT_LOWRES 1-2 0 ; FIXME
cglobal x264_frame_init_lowres_core_%1, 6,7,%2
%ifdef WIN64
    movsxd   r5, r5d
%endif
    ; src += 2*(height-1)*stride + 2*width
    mov      r6d, r8m
    dec      r6d
    imul     r6d, r5d
    add      r6d, r7m
    lea      r0, [r0+r6*2]
    ; dst += (height-1)*stride + width
    mov      r6d, r8m
    dec      r6d
    imul     r6d, r6m
    add      r6d, r7m
    add      r1, r6
    add      r2, r6
    add      r3, r6
    add      r4, r6
    ; gap = stride - width
    mov      r6d, r6m
    sub      r6d, r7m
    PUSH     r6
    %define dst_gap [rsp+gprsize]
    mov      r6d, r5d
    sub      r6d, r7m
    shl      r6d, 1
    PUSH     r6
    %define src_gap [rsp]
%if mmsize == 16
    ; adjust for the odd end case
    mov      r6d, r7m
    and      r6d, 8
    sub      r1, r6
    sub      r2, r6
    sub      r3, r6
    sub      r4, r6
    add      dst_gap, r6d
%endif ; mmsize
    pcmpeqb  m7, m7
    psrlw    m7, 8
.vloop:
    mov      r6d, r7m
%ifnidn %1, mmxext
    mova     m0, [r0]
    mova     m1, [r0+r5]
    pavgb    m0, m1
    pavgb    m1, [r0+r5*2]
%endif
%if mmsize == 16
    test     r6d, 8
    jz .hloop
    sub      r0, 16
    FILT8x4  m0, m1, m2, m3, m4, m5, 0
    packuswb m0, m4
    packuswb m1, m5
    movq   [r1], m0
    movhps [r2], m0
    movq   [r3], m1
    movhps [r4], m1
    mova     m0, m2
    mova     m1, m3
    sub      r6d, 8
%endif ; mmsize
.hloop:
    sub      r0, mmsize*2
    sub      r1, mmsize
    sub      r2, mmsize
    sub      r3, mmsize
    sub      r4, mmsize
%ifdef m8
    FILT8x4  m0, m1, m2, m3, m10, m11, mmsize
    mova     m8, m0
    mova     m9, m1
    FILT8x4  m2, m3, m0, m1, m4, m5, 0
    packuswb m2, m8
    packuswb m3, m9
    packuswb m4, m10
    packuswb m5, m11
    mova   [r1], m2
    mova   [r2], m4
    mova   [r3], m3
    mova   [r4], m5
%elifidn %1, mmxext
    FILT8x2U r1, r2, 0
    FILT8x2U r3, r4, r5
%else
    FILT16x2 m0, r1, r2, 0
    FILT16x2 m1, r3, r4, r5
%endif
    sub      r6d, mmsize
    jg .hloop
.skip:
    mov      r6, dst_gap
    sub      r0, src_gap
    sub      r1, r6
    sub      r2, r6
    sub      r3, r6
    sub      r4, r6
    dec      dword r8m
    jg .vloop
    ADD     rsp, 2*gprsize
    emms
    RET
%endmacro ; FRAME_INIT_LOWRES

INIT_MMX
%define PALIGNR PALIGNR_MMX
FRAME_INIT_LOWRES mmxext
%ifndef ARCH_X86_64
FRAME_INIT_LOWRES cache32_mmxext
%endif
INIT_XMM
FRAME_INIT_LOWRES sse2, 12
%define PALIGNR PALIGNR_SSSE3
FRAME_INIT_LOWRES ssse3, 12
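; For reference, the nested pavgb averages in FILT8x4/FILT16x2/FILT8x2U boil down
; to a 2x downscale with four half-pixel phases (full-pel, +half horizontal,
; +half vertical, +both), one output plane per phase.  A plain-C sketch of the
; result follows, modelled on x264's C fallback: the nested rounding matches
; pavgb's round-up behaviour, and padded planes are assumed so the 2*x+2 reads
; are safe.
;
;   #include <stdint.h>
;
;   /* avg(avg(a,b), avg(c,d)), each stage rounding up like pavgb */
;   #define AVG2(a,b)     ( ((a)+(b)+1) >> 1 )
;   #define FILT(a,b,c,d) AVG2( AVG2(a,b), AVG2(c,d) )
;
;   static void frame_init_lowres_core_c( uint8_t *src0, uint8_t *dst0, uint8_t *dsth,
;                                         uint8_t *dstv, uint8_t *dstc,
;                                         int src_stride, int dst_stride,
;                                         int width, int height )
;   {
;       for( int y = 0; y < height; y++ )
;       {
;           uint8_t *src1 = src0 + src_stride;
;           uint8_t *src2 = src1 + src_stride;
;           for( int x = 0; x < width; x++ )
;           {
;               /* four half-pel phases of the half-resolution image */
;               dst0[x] = FILT( src0[2*x  ], src1[2*x  ], src0[2*x+1], src1[2*x+1] );
;               dsth[x] = FILT( src0[2*x+1], src1[2*x+1], src0[2*x+2], src1[2*x+2] );
;               dstv[x] = FILT( src1[2*x  ], src2[2*x  ], src1[2*x+1], src2[2*x+1] );
;               dstc[x] = FILT( src1[2*x+1], src2[2*x+1], src1[2*x+2], src2[2*x+2] );
;           }
;           src0 += src_stride * 2;
;           dst0 += dst_stride;
;           dsth += dst_stride;
;           dstv += dst_stride;
;           dstc += dst_stride;
;       }
;   }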