?? lcompasm.asm
字號:
; NASM assembly language code for LCOMP compression
; (C) 2009, Larry Mugim
;
; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt
;
; MINGW g++: nasm lCompAsm.asm -f win32 --prefix _
; DJGPP g++: nasm lCompAsm.asm -f coff --prefix _
; Borland, Mars: nasm lCompAsm.asm -f obj --prefix _
; Linux: nasm lCompAsm.asm -f elf
;
; For other Windows compilers try -f win32 or -f obj. Some old versions
; of Linux should use -f aout instead of -f elf.
;
; This code will only work on a Pentium-MMX or higher. It doesn't
; use extended (Katmai/SSE) instructions. It won't work
; in 64-bit mode.
section .text use32 class=CODE
; Reset after MMX
global do_emms
do_emms:
emms
ret
; Vector product a*b of n signed words, returning signed dword scaled
; down by 8 bits. n is rounded up to a multiple of 8.
global dot_product ; (short* a, short* b, int n)
align 16
dot_product:
mov eax, [esp+4] ; a
mov edx, [esp+8] ; b
mov ecx, [esp+12] ; n
add ecx, 7 ; n rounding up
and ecx, -8
jz .done
sub eax, 8
sub edx, 8
pxor mm0, mm0 ; sum = 0
.loop: ; each loop sums 4 products
movq mm1, [eax+ecx*2] ; put halves of vector product in mm0
pmaddwd mm1, [edx+ecx*2]
movq mm2, [eax+ecx*2-8]
pmaddwd mm2, [edx+ecx*2-8]
psrad mm1, 8
psrad mm2, 8
paddd mm0, mm1
paddd mm0, mm2
sub ecx, 8
ja .loop
movq mm1, mm0 ; add 2 halves of mm0 and return in eax
psrlq mm1, 32
paddd mm0, mm1
movd eax, mm0
emms
.done
ret
; This should work on a Pentium 4 or higher in 32-bit mode,
; but it isn't much faster than the MMX version so I don't use it.
global dot_product_sse2 ; (short* a, short* b, int n)
align 16
dot_product_sse2:
mov eax, [esp+4] ; a
mov edx, [esp+8] ; b
mov ecx, [esp+12] ; n
add ecx, 7 ; n rounding up
and ecx, -8
jz .done
sub eax, 16
sub edx, 16
pxor xmm0, xmm0 ; sum = 0
.loop: ; each loop sums 4 products
movdqa xmm1, [eax+ecx*2] ; put parital sums of vector product in xmm0
pmaddwd xmm1, [edx+ecx*2]
psrad xmm1, 8
paddd xmm0, xmm1
sub ecx, 8
ja .loop
movdqa xmm1, xmm0 ; add 4 parts of xmm0 and return in eax
psrldq xmm1, 8
paddd xmm0, xmm1
movdqa xmm1, xmm0
psrldq xmm1, 4
paddd xmm0, xmm1
movd eax, xmm0
.done
ret
; Train n neural network weights w[n] on inputs t[n] and err.
; w[i] += t[i]*err*2+1 >> 17 bounded to +- 32K.
; n is rounded up to a multiple of 8.
global train ; (short* t, short* w, int n, int err)
align 16
train:
mov eax, [esp+16] ; err
and eax, 0xffff ; put 4 copies of err in mm0
movd mm0, eax
movd mm1, eax
psllq mm1, 16
por mm0, mm1
movq mm1, mm0
psllq mm1, 32
por mm0, mm1
pcmpeqb mm1, mm1 ; 4 copies of 1 in mm1
psrlw mm1, 15
mov eax, [esp+4] ; t
mov edx, [esp+8] ; w
mov ecx, [esp+12] ; n
add ecx, 7 ; n/8 rounding up
and ecx, -8
sub eax, 8
sub edx, 8
jz .done
.loop: ; each iteration adjusts 8 weights
movq mm2, [edx+ecx*2] ; w[i]
movq mm3, [eax+ecx*2] ; t[i]
movq mm4, [edx+ecx*2-8] ; w[i]
movq mm5, [eax+ecx*2-8] ; t[i]
paddsw mm3, mm3
paddsw mm5, mm5
pmulhw mm3, mm0
pmulhw mm5, mm0
paddsw mm3, mm1
paddsw mm5, mm1
psraw mm3, 1
psraw mm5, 1
paddsw mm2, mm3
paddsw mm4, mm5
movq [edx+ecx*2], mm2
movq [edx+ecx*2-8], mm4
sub ecx, 8
ja .loop
.done:
emms
ret
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -