bcopy.s
Font size:
Hi,

The following code is the file support.s from the FreeBSD 2.6 distribution
for i386. I included the entire file so you can pick and choose as you like,
and you can pick up the license. There's a generic bcopy that does
overlapping, uses rep movs in the largest chunk possible, etc. That might do
the trick. There's a few macros around but hopefully you can decipher them.

Later,
FM
--
Frank W. Miller
Cornfed Systems Inc
www.cornfed.com

--

/*-
 * Copyright (c) 1993 The Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * NOTE(review): this copy was recovered from a web page that stripped the
 * original newlines, which made the file unassemblable.  The line structure
 * below is reconstructed; no instruction, directive, or operand has been
 * changed.  Syntax is AT&T (GAS), i386, FreeBSD kernel.  The final routine
 * (i586_bcopy) is truncated in this copy -- see the note at the end.
 */

#include "npx.h"
#include "opt_cpu.h"

#include <machine/asmacros.h>
#include <machine/cputypes.h>
#include <machine/pmap.h>
#include <machine/specialreg.h>

#include "assym.s"

#define KDSEL		0x10	/* kernel data selector */
#define IDXSHIFT	10

	.data

	/*
	 * Indirect dispatch vectors: each holds a pointer to the routine
	 * to use; they default to the generic_* implementations and can be
	 * repointed to CPU-specific versions (e.g. the i586 FPU copies).
	 */
	.globl	_bcopy_vector
_bcopy_vector:
	.long	_generic_bcopy
	.globl	_bzero
_bzero:
	.long	_generic_bzero
	.globl	_copyin_vector
_copyin_vector:
	.long	_generic_copyin
	.globl	_copyout_vector
_copyout_vector:
	.long	_generic_copyout
	.globl	_ovbcopy_vector
_ovbcopy_vector:
	.long	_generic_bcopy

#if defined(I586_CPU) && NNPX > 0
	/*
	 * Byte lock guarding kernel use of the FPU: 0xfe = free.
	 * Acquired with `sarb $1' (carry set if it was already held),
	 * released by storing 0xfe back.
	 */
kernel_fpu_lock:
	.byte	0xfe
	.space	3
#endif

	.text

/*
 * bcopy family
 * void bzero(void *buf, u_int len)
 */

ENTRY(generic_bzero)
	pushl	%edi
	movl	8(%esp),%edi		/* edi = buf */
	movl	12(%esp),%ecx		/* ecx = len */
	xorl	%eax,%eax		/* fill value = 0 */
	shrl	$2,%ecx			/* store len/4 dwords first... */
	cld
	rep stosl
	movl	12(%esp),%ecx
	andl	$3,%ecx			/* ...then the remaining 0-3 bytes */
	rep stosb
	popl	%edi
	ret

#if defined(I486_CPU)
ENTRY(i486_bzero)
	movl	4(%esp),%edx		/* edx = buf */
	movl	8(%esp),%ecx		/* ecx = len */
	xorl	%eax,%eax		/* fill value = 0 */
/*
 * do 64 byte chunks first
 *
 * XXX this is probably over-unrolled at least for DX2's
 */
2:
	cmpl	$64,%ecx
	jb	3f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	movl	%eax,16(%edx)
	movl	%eax,20(%edx)
	movl	%eax,24(%edx)
	movl	%eax,28(%edx)
	movl	%eax,32(%edx)
	movl	%eax,36(%edx)
	movl	%eax,40(%edx)
	movl	%eax,44(%edx)
	movl	%eax,48(%edx)
	movl	%eax,52(%edx)
	movl	%eax,56(%edx)
	movl	%eax,60(%edx)
	addl	$64,%edx
	subl	$64,%ecx
	jnz	2b
	ret

/*
 * do 16 byte chunks
 */
	SUPERALIGN_TEXT
3:
	cmpl	$16,%ecx
	jb	4f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	addl	$16,%edx
	subl	$16,%ecx
	jnz	3b
	ret

/*
 * do 4 byte chunks
 */
	SUPERALIGN_TEXT
4:
	cmpl	$4,%ecx
	jb	5f
	movl	%eax,(%edx)
	addl	$4,%edx
	subl	$4,%ecx
	jnz	4b
	ret

/*
 * do 1 byte chunks
 * a jump table seems to be faster than a loop or more range reductions
 *
 * XXX need a const section for non-text
 */
	.data
jtab:
	.long	do0
	.long	do1
	.long	do2
	.long	do3

	.text
	SUPERALIGN_TEXT
5:
	jmp	jtab(,%ecx,4)		/* dispatch on remaining 0-3 bytes */

	SUPERALIGN_TEXT
do3:
	movw	%ax,(%edx)
	movb	%al,2(%edx)
	ret

	SUPERALIGN_TEXT
do2:
	movw	%ax,(%edx)
	ret

	SUPERALIGN_TEXT
do1:
	movb	%al,(%edx)
	ret

	SUPERALIGN_TEXT
do0:
	ret
#endif

#if defined(I586_CPU) && NNPX > 0
ENTRY(i586_bzero)
	movl	4(%esp),%edx		/* edx = buf */
	movl	8(%esp),%ecx		/* ecx = len */

	/*
	 * The FPU register method is twice as fast as the integer register
	 * method unless the target is in the L1 cache and we pre-allocate a
	 * cache line for it (then the integer register method is 4-5 times
	 * faster).  However, we never pre-allocate cache lines, since that
	 * would make the integer method 25% or more slower for the common
	 * case when the target isn't in either the L1 cache or the L2 cache.
	 * Thus we normally use the FPU register method unless the overhead
	 * would be too large.
	 */
	cmpl	$256,%ecx	/* empirical; clts, fninit, smsw cost a lot */
	jb	intreg_i586_bzero

	/*
	 * The FPU registers may belong to an application or to fastmove()
	 * or to another invocation of bcopy() or ourself in a higher level
	 * interrupt or trap handler.  Preserving the registers is
	 * complicated since we avoid it if possible at all levels.  We
	 * want to localize the complications even when that increases them.
	 * Here the extra work involves preserving CR0_TS in TS.
	 * `npxproc != NULL' is supposed to be the condition that all the
	 * FPU resources belong to an application, but npxproc and CR0_TS
	 * aren't set atomically enough for this condition to work in
	 * interrupt handlers.
	 *
	 * Case 1: FPU registers belong to the application: we must preserve
	 * the registers if we use them, so we only use the FPU register
	 * method if the target size is large enough to amortize the extra
	 * overhead for preserving them.  CR0_TS must be preserved although
	 * it is very likely to end up as set.
	 *
	 * Case 2: FPU registers belong to fastmove(): fastmove() currently
	 * makes the registers look like they belong to an application so
	 * that cpu_switch() and savectx() don't have to know about it, so
	 * this case reduces to case 1.
	 *
	 * Case 3: FPU registers belong to the kernel: don't use the FPU
	 * register method.  This case is unlikely, and supporting it would
	 * be more complicated and might take too much stack.
	 *
	 * Case 4: FPU registers don't belong to anyone: the FPU registers
	 * don't need to be preserved, so we always use the FPU register
	 * method.  CR0_TS must be preserved although it is very likely to
	 * always end up as clear.
	 */
	cmpl	$0,_npxproc
	je	i586_bz1
	cmpl	$256+184,%ecx		/* empirical; not quite 2*108 more */
	jb	intreg_i586_bzero
	sarb	$1,kernel_fpu_lock	/* try to take the FPU lock */
	jc	intreg_i586_bzero	/* already held: fall back */
	smsw	%ax			/* remember CR0_TS */
	clts
	subl	$108,%esp		/* 108 bytes = fnsave state image */
	fnsave	0(%esp)
	jmp	i586_bz2

i586_bz1:
	sarb	$1,kernel_fpu_lock
	jc	intreg_i586_bzero
	smsw	%ax
	clts
	fninit				/* XXX should avoid needing this */
i586_bz2:
	fldz

	/*
	 * Align to an 8 byte boundary (misalignment in the main loop would
	 * cost a factor of >= 2).  Avoid jumps (at little cost if it is
	 * already aligned) by always zeroing 8 bytes and using the part up
	 * to the _next_ alignment position.
	 */
	fstl	0(%edx)
	addl	%edx,%ecx		/* part of %ecx -= new_%edx - %edx */
	addl	$8,%edx
	andl	$~7,%edx
	subl	%edx,%ecx

	/*
	 * Similarly align `len' to a multiple of 8.
	 */
	fstl	-8(%edx,%ecx)
	decl	%ecx
	andl	$~7,%ecx

	/*
	 * This wouldn't be any faster if it were unrolled, since the loop
	 * control instructions are much faster than the fstl and/or done
	 * in parallel with it so their overhead is insignificant.
	 */
fpureg_i586_bzero_loop:
	fstl	0(%edx)
	addl	$8,%edx
	subl	$8,%ecx
	cmpl	$8,%ecx
	jae	fpureg_i586_bzero_loop

	cmpl	$0,_npxproc
	je	i586_bz3
	frstor	0(%esp)			/* restore application FPU state */
	addl	$108,%esp
	lmsw	%ax			/* restore saved CR0_TS */
	movb	$0xfe,kernel_fpu_lock	/* release the FPU lock */
	ret

i586_bz3:
	fstpl	%st(0)			/* pop the zero we pushed */
	lmsw	%ax
	movb	$0xfe,kernel_fpu_lock
	ret

intreg_i586_bzero:
	/*
	 * `rep stos' seems to be the best method in practice for small
	 * counts.  Fancy methods usually take too long to start up due
	 * to cache and BTB misses.
	 */
	pushl	%edi
	movl	%edx,%edi
	xorl	%eax,%eax
	shrl	$2,%ecx
	cld
	rep stosl
	movl	12(%esp),%ecx
	andl	$3,%ecx
	jne	1f
	popl	%edi
	ret

1:
	rep stosb
	popl	%edi
	ret
#endif /* I586_CPU && NNPX > 0 */

/* fillw(pat, base, cnt) */
ENTRY(fillw)
	pushl	%edi
	movl	8(%esp),%eax		/* eax = pat */
	movl	12(%esp),%edi		/* edi = base */
	movl	16(%esp),%ecx		/* ecx = cnt (words) */
	cld
	rep stosw
	popl	%edi
	ret

ENTRY(bcopyb)
bcopyb:
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi		/* esi = src */
	movl	16(%esp),%edi		/* edi = dst */
	movl	20(%esp),%ecx		/* ecx = cnt */
	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax		/* overlapping && src < dst? */
	jb	1f
	cld				/* nope, copy forwards */
	rep movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi		/* copy backwards. */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	std
	rep movsb
	popl	%edi
	popl	%esi
	cld				/* restore expected direction flag */
	ret

ENTRY(bcopy)
	MEXITCOUNT
	jmp	*_bcopy_vector		/* dispatch via the boot-time vector */

ENTRY(ovbcopy)
	MEXITCOUNT
	jmp	*_ovbcopy_vector

/*
 * generic_bcopy(src, dst, cnt)
 *  ws@tools.de     (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
 */
ENTRY(generic_bcopy)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi		/* esi = src */
	movl	16(%esp),%edi		/* edi = dst */
	movl	20(%esp),%ecx		/* ecx = cnt */

	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax		/* overlapping && src < dst? */
	jb	1f

	shrl	$2,%ecx			/* copy by 32-bit words */
	cld				/* nope, copy forwards */
	rep movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx			/* any bytes left? */
	rep movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi		/* copy backwards */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	andl	$3,%ecx			/* any fractional bytes? */
	std
	rep movsb
	movl	20(%esp),%ecx		/* copy remainder by 32-bit words */
	shrl	$2,%ecx
	subl	$3,%esi
	subl	$3,%edi
	rep movsl
	popl	%edi
	popl	%esi
	cld
	ret

#if defined(I586_CPU) && NNPX > 0
ENTRY(i586_bcopy)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi		/* esi = src */
	movl	16(%esp),%edi		/* edi = dst */
	movl	20(%esp),%ecx		/* ecx = cnt */

	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax		/* overlapping && src < dst? */
	jb	1f

	cmpl	$1024,%ecx
	jb	small_i586_bcopy
	sarb	$1,kernel_fpu_lock	/* try to take the FPU lock */
	jc	small_i586_bcopy
	cmpl	$0,_npxproc
	je	i586_bc1
	smsw	%dx			/* remember CR0_TS */
	clts
	subl	$108,%esp		/* 108 bytes = fnsave state image */
	fnsave	0(%esp)
	jmp	4f

i586_bc1:
	smsw	%dx
	clts
	fninit				/* XXX should avoid needing this */

	ALIGN_TEXT
4:
	pushl	%ecx
#define	DCACHE_SIZE	8192
	cmpl	$(DCACHE_SIZE-512)/2,%ecx
	jbe	2f
	movl	$(DCACHE_SIZE-512)/2,%ecx
2:
	subl	%ecx,0(%esp)		/* saved count -= this pass */
	cmpl	$256,%ecx
	jb	5f
	/* XXX should prefetch if %ecx >= 32 */
	pushl	%esi
	pushl	%ecx

	ALIGN_TEXT
3:
	/* Touch one dword per cache line to pull the source into cache. */
	movl	0(%esi),%eax
	movl	32(%esi),%eax
	movl	64(%esi),%eax
	movl	96(%esi),%eax
	movl	128(%esi),%eax
	movl	160(%esi),%eax
	movl	192(%esi),%eax
	movl	224(%esi),%eax
	addl	$256,%esi
	subl	$256,%ecx
	cmpl	$256,%ecx
	jae	3b
	popl	%ecx
	popl	%esi

5:
	ALIGN_TEXT
large_i586_bcopy_loop:
	fildq	0(%esi)
	fildq	8(%esi)
	fildq	16(%esi)
	fildq	24(%esi)
	fildq	32(%esi)
	fildq	40(%esi)
	fildq	48(%esi)
	fildq	56(%esi)

/*
 * NOTE(review): the recovered source is truncated at this point.  The
 * remainder of i586_bcopy -- including the matching #endif and the labels
 * `1:' and `small_i586_bcopy' referenced above -- is missing from this copy
 * and must be restored from the original FreeBSD sys/i386/i386/support.s.
 */
Keyboard shortcuts
Copy code
Ctrl + C
Search code
Ctrl + F
Full-screen mode
F11
Toggle theme
Ctrl + Shift + D
Show shortcuts
?
Increase font size
Ctrl + =
Decrease font size
Ctrl + -