?? sparcv9-mont.pl
字號:
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# December 2005
#
# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
# for undertaken effort are multiple. First of all, UltraSPARC is not
# the whole SPARCv9 universe and other VIS-free implementations deserve
# optimized code as much. Secondly, newly introduced UltraSPARC T1,
# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive pathes,
# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
# several integrated RSA/DSA accelerator circuits accessible through
# kernel driver [only(*)], but having decent user-land software
# implementation is important too. Finally, reasons like desire to
# experiment with dedicated squaring procedure. Yes, this module
# implements one, because it was easiest to draft it in SPARCv9
# instructions...
#
# (*)	Engine accessing the driver in question is on my TODO list.
#	For reference, acceleator is estimated to give 6 to 10 times
#	improvement on single-threaded RSA sign. It should be noted
#	that 6-10x improvement coefficient does not actually mean
#	something extraordinary in terms of absolute [single-threaded]
#	performance, as SPARCv9 instruction set is by all means least
#	suitable for high performance crypto among other 64 bit
#	platforms. 6-10x factor simply places T1 in same performance
#	domain as say AMD64 and IA-64. Improvement of RSA verify don't
#	appear impressive at all, but it's the sign operation which is
#	far more critical/interesting.
#
# You might notice that inner loops are modulo-scheduled:-) This has
# essentially negligible impact on UltraSPARC performance, it's
# Fujitsu SPARC64 V users who should notice and hopefully appreciate
# the advantage... Currently this module surpasses sparcv9a-mont.pl
# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
# module still have hidden potential [see TODO list there], which is
# estimated to be larger than 20%...

# Incoming argument registers of bn_mul_mont (SPARC calling convention):
# int bn_mul_mont(
$rp="%i0";	# BN_ULONG *rp,
$ap="%i1";	# const BN_ULONG *ap,
$bp="%i2";	# const BN_ULONG *bp,
$np="%i3";	# const BN_ULONG *np,
$n0="%i4";	# const BN_ULONG *n0,
$num="%i5";	# int num);

# 64-bit ABI needs the 2047-byte stack bias and a larger register-save
# frame; detect it from the compiler flags passed on the command line.
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64)	{ $bias=2047; $frame=192; }
else		{ $bias=0;    $frame=128; }

# Scratch-register allocation used throughout the generated code.
$car0="%o0";
$car1="%o1";
$car2="%o2";	# 1 bit
$acc0="%o3";
$acc1="%o4";
$mask="%g1";	# 32 bits, what a waste...
$tmp0="%g4";
$tmp1="%g5";

$i="%l0";
$j="%l1";
$mul0="%l2";
$mul1="%l3";
$tp="%l4";
$apj="%l5";
$npj="%l6";
$tpj="%l7";

$fname="bn_mul_mont_int";

$code=<<___;
.section	".text",#alloc,#execinstr
.global	$fname
.align	32
$fname:
	cmp	%o5,4			! 128 bits minimum
	bge,pt	%icc,.Lenter
	sethi	%hi(0xffffffff),$mask
	retl
	clr	%o0
.align	32
.Lenter:
	save	%sp,-$frame,%sp
	sll	$num,2,$num		! num*=4
	or	$mask,%lo(0xffffffff),$mask
	ld	[$n0],$n0
	cmp	$ap,$bp
	and	$num,$mask,$num
	ld	[$bp],$mul0		! bp[0]
	nop

	add	%sp,$bias,%o7		! real top of stack
	ld	[$ap],$car0		! ap[0] ! redundant in squaring context
	sub	%o7,$num,%o7
	ld	[$ap+4],$apj		! ap[1]
	and	%o7,-1024,%o7
	ld	[$np],$car1		! np[0]
	sub	%o7,$bias,%sp		! alloca
	ld	[$np+4],$npj		! np[1]
	be,pt	`$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
	mov	12,$j

	mulx	$car0,$mul0,$car0	! ap[0]*bp[0]
	mulx	$apj,$mul0,$tmp0	!prologue! ap[1]*bp[0]
	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj		!prologue!

	mulx	$n0,$acc0,$mul1		! "t[0]"*n0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1	!prologue! np[1]*"t[0]"*n0
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj		!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0		!prologue!

.L1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j			! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]
	cmp	$j,$num
	mov	$tmp1,$acc1
	srlx	$car1,32,$car1
	bl	%icc,.L1st
	add	$tp,4,$tp		! tp++
!.L1st

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2

	mov	4,$i			! i++
	ld	[$bp+4],$mul0		! bp[1]
.Louter:
	add	%sp,$bias+$frame,$tp
	ld	[$ap],$car0		! ap[0]
	ld	[$ap+4],$apj		! ap[1]
	ld	[$np],$car1		! np[0]
	ld	[$np+4],$npj		! np[1]
	ld	[$tp],$tmp1		! tp[0]
	ld	[$tp+4],$tpj		! tp[1]
	mov	12,$j

	mulx	$car0,$mul0,$car0
	mulx	$apj,$mul0,$tmp0	!prologue!
	add	$tmp1,$car0,$car0
	ld	[$ap+8],$apj		!prologue!
	and	$car0,$mask,$acc0

	mulx	$n0,$acc0,$mul1
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1	!prologue!
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj		!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0		!prologue!

.Linner:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	add	$acc0,$car0,$car0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	and	$car0,$mask,$acc0
	ld	[$tp+8],$tpj		! tp[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j			! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1
	mov	$tmp1,$acc1
	cmp	$j,$num
	bl	%icc,.Linner
	add	$tp,4,$tp		! tp++
!.Linner

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	add	$acc0,$car0,$car0
	ld	[$tp+8],$tpj		! tp[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1

	add	$tpj,$car0,$car0
	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]		! tp[j-1]
	srlx	$car0,32,$car0
	add	$i,4,$i			! i++
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	cmp	$i,$num
	add	$car2,$car1,$car1
	st	$car1,[$tp+8]

	srlx	$car1,32,$car2
	bl,a	%icc,.Louter
	ld	[$bp+$i],$mul0		! bp[i]
!.Louter

	add	$tp,12,$tp

.Ltail:
	add	$np,$num,$np
	add	$rp,$num,$rp
	mov	$tp,$ap
	sub	%g0,$num,%o7		! k=-num
	ba	.Lsub
	subcc	%g0,%g0,%g0		! clear %icc.c
.align	16
.Lsub:	ld	[$tp+%o7],%o0
	ld	[$np+%o7],%o1
	subccc	%o0,%o1,%o1		! tp[j]-np[j]
	add	$rp,%o7,$i
	add	%o7,4,%o7
	brnz	%o7,.Lsub
	st	%o1,[$i]
	subc	$car2,0,$car2		! handle upmost overflow bit
	and	$tp,$car2,$ap
	andn	$rp,$car2,$np
	or	$ap,$np,$ap
	sub	%g0,$num,%o7

.Lcopy:	ld	[$ap+%o7],%o0		! copy or in-place refresh
	st	%g0,[$tp+%o7]		! zap tp
	st	%o0,[$rp+%o7]
	add	%o7,4,%o7
	brnz	%o7,.Lcopy
	nop
	mov	1,%i0
	ret
	restore
___

########
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
######## code without following dedicated squaring procedure.
########
$sbit="%i2";		# re-use $bp!

$code.=<<___;
.align	32
.Lbn_sqr_mont:
	mulx	$mul0,$mul0,$car0		! ap[0]*ap[0]
	mulx	$apj,$mul0,$tmp0		!prologue!
	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj			!prologue!

	mulx	$n0,$acc0,$mul1			! "t[0]"*n0
	srlx	$car0,32,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1		! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1		!prologue!
	and	$car0,1,$sbit
	ld	[$np+8],$npj			!prologue!
	srlx	$car0,1,$car0
	add	$acc0,$car1,$car1
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0			!prologue!

.Lsqr_1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0		! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	ld	[$ap+$j],$apj			! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj			! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	mov	$tmp1,$acc1
	srlx	$acc0,32,$sbit
	add	$j,4,$j				! j++
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	mov	$tmp0,$acc0
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_1st
	add	$tp,4,$tp			! tp++
!.Lsqr_1st

	mulx	$apj,$mul0,$tmp0		! epilogue
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0		! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0		! ap[j]*a0+c0
	add	$tmp1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	or	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2

	ld	[%sp+$bias+$frame],$tmp0	! tp[0]
	ld	[%sp+$bias+$frame+4],$tmp1	! tp[1]
	ld	[%sp+$bias+$frame+8],$tpj	! tp[2]
	ld	[$ap+4],$mul0			! ap[1]
	ld	[$ap+8],$apj			! ap[2]
	ld	[$np],$car1			! np[0]
	ld	[$np+4],$npj			! np[1]
	mulx	$n0,$tmp0,$mul1

	mulx	$mul0,$mul0,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1
	add	$tmp0,$car1,$car1
	and	$car0,$mask,$acc0
	ld	[$np+8],$npj			! np[2]
	srlx	$car1,32,$car1
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	and	$car0,1,$sbit
	add	$acc1,$car1,$car1
	srlx	$car0,1,$car0
	mov	12,$j
	st	$car1,[%sp+$bias+$frame]	! tp[0]=
	srlx	$car1,32,$car1
	add	%sp,$bias+$frame+4,$tp

.Lsqr_2nd:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$car1,$car1
	ld	[$ap+$j],$apj			! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj			! np[j]
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	ld	[$tp+8],$tpj			! tp[j]
	add	$acc0,$acc0,$acc0
	add	$j,4,$j				! j++
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_2nd
	add	$tp,4,$tp			! tp++
!.Lsqr_2nd

	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	or	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+8],$mul0			! ap[2]
	ld	[$np],$car1			! np[0]
	ld	[$np+4],$npj			! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	mov	8,$i

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0
	mov	4,$j

.Lsqr_outer:
.Lsqr_inner1:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner1
	add	$tp,4,$tp
!.Lsqr_inner1

	add	$j,4,$j
	ld	[$ap+$j],$apj			! ap[j]
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	ld	[$np+$j],$npj			! np[j]
	add	$acc0,$car1,$car1
	ld	[$tp+8],$tpj			! tp[j]
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$j,4,$j
	cmp	$j,$num
	be,pn	%icc,.Lsqr_no_inner2
	add	$tp,4,$tp

.Lsqr_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj			! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj			! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	ld	[$tp+8],$tpj			! tp[j]
	or	$sbit,$acc0,$acc0
	add	$j,4,$j				! j++
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner2
	add	$tp,4,$tp			! tp++

.Lsqr_no_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	or	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	add	$i,4,$i				! i++
	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+$i],$mul0			! ap[j]
	ld	[$np],$car1			! np[0]
	ld	[$np+4],$npj			! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	add	$i,4,$tmp0

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0
	cmp	$tmp0,$num			! i<num-1
	bl	%icc,.Lsqr_outer
	mov	4,$j

.Lsqr_last:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_last
	add	$tp,4,$tp
!.Lsqr_last

	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0		! recover $car0
	or	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	ba	.Ltail
	add	$tp,8,$tp
.type	$fname,#function
.size	$fname,(.-$fname)
.asciz	"Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	32
___

# Resolve compile-time `...` expressions (e.g. the %icc/%xcc branch
# predicate selection) and emit the finished assembly on stdout.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -