?? cmll-x86.pl
字號:
#!/usr/bin/env perl

# ====================================================================
# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
#
# This module may be used under the terms of either the GNU General
# Public License version 2 or later, the GNU Lesser General Public
# License version 2.1 or later, the Mozilla Public License version
# 1.1 or the BSD License. The exact terms of either license are
# distributed along with this module. For further details see
# http://www.openssl.org/~appro/camellia/.
# ====================================================================

# Performance in cycles per processed byte (less is better) in
# 'openssl speed ...' benchmark:
#
#                         AMD K8   Core2    PIII     P4
# -evp camellia-128-ecb   21.5     22.8     27.0     28.9
# + over gcc 3.4.6        +90/11%  +70/10%  +53/4%   +160/64%
# + over icc 8.0          +48/19%  +21/15%  +21/17%  +55/37%
#
# camellia-128-cbc        17.3     21.1     23.9     25.9
#
# 128-bit key setup       196      280      256      240      cycles/key
# + over gcc 3.4.6        +30/0%   +17/11%  +11/0%   +63/40%
# + over icc 8.0          +18/3%   +10/0%   +10/3%   +21/10%
#
# Pairs of numbers in "+" rows represent performance improvement over
# compiler generated position-independent code, PIC, and non-PIC
# respectively. PIC results are of greater relevance, as this module
# is position-independent, i.e. suitable for a shared library or PIE.
# Position independence "costs" one register, which is why compilers
# are so close with non-PIC results, they have an extra register to
# spare. CBC results are better than ECB ones thanks to "zero-copy"
# private _x86_* interface, and are ~30-40% better than with compiler
# generated cmll_cbc.o, and reach ~80-90% of x86_64 performance on
# same CPU (where applicable).

# Locate the directory this script lives in so that the perlasm helper
# (x86asm.pl) can be found either next to it or in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# When set, the OpenSSL-style entry points (Camellia_encrypt and
# Camellia_decrypt) are generated in addition to the *Block* ones.
$OPENSSL=1;

&asm_init($ARGV[0],"cmll-586.pl",$ARGV[$#ARGV] eq "386");

# Register allocation shared by every routine below.
@T=("eax","ebx","ecx","edx");   # the four 32-bit state words s[0-3]
$idx="esi";                     # table index / prefetched key word / data pointer
$key="edi";                     # key schedule pointer
$Tbl="ebp";                     # address of Camellia_SBOX

# stack frame layout in _x86_Camellia_* routines, frame is allocated
# by caller
$__ra=&DWP(0,"esp");    # return address
$__s0=&DWP(4,"esp");    # s0 backing store
$__s1=&DWP(8,"esp");    # s1 backing store
$__s2=&DWP(12,"esp");   # s2 backing store
$__s3=&DWP(16,"esp");   # s3 backing store
$__end=&DWP(20,"esp");  # pointer to end/start of key schedule

# stack frame layout in Camellia_[en|de]crypt routines, which differs from
# above by 4 and overlaps by pointer to end/start of key schedule
$_end=&DWP(16,"esp");
$_esp=&DWP(20,"esp");

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to optimize code size.
$SBOX1_1110=0;          # Camellia_SBOX[0]
$SBOX4_4404=4;          # Camellia_SBOX[1]
$SBOX2_0222=2048;       # Camellia_SBOX[2]
$SBOX3_3033=2052;       # Camellia_SBOX[3]
&static_label("Camellia_SIGMA");
&static_label("Camellia_SBOX");

# Camellia_Feistel($i, $seed, $frame)
#
# Emits the instructions for one Feistel round.
#   $i     - round index; its parity selects which pair of @T registers
#            is the round input and which pair of stack slots is updated.
#   $seed  - byte offset of the round keys relative to $key (default 0).
#            A negative seed makes the key stride -8 instead of 8.
#   $frame - byte offset from %esp of the s0 backing-store slot (default 0).
#
# On entry $idx must already hold the first key word of the round; on
# exit it holds the prefetched first key word of the next round.
sub Camellia_Feistel {
my $i=@_[0];
my $seed=defined(@_[1])?@_[1]:0;
my $scale=$seed<0?-8:8;
my $frame=defined(@_[2])?@_[2]:0;
my $j=($i&1)*2;
# NOTE: "my" binds only to $t0 here; $t1, $t2 and $t3 are package
# variables. Left unchanged to preserve the original behavior.
my $t0=@T[($j)%4],$t1=@T[($j+1)%4],$t2=@T[($j+2)%4],$t3=@T[($j+3)%4];

    &xor    ($t0,$idx);                             # t0^=key[0]
    &xor    ($t1,&DWP($seed+$i*$scale+4,$key));     # t1^=key[1]
    &movz   ($idx,&HB($t0));                        # (t0>>8)&0xff
    &mov    ($t3,&DWP($SBOX3_3033,$Tbl,$idx,8));    # t3=SBOX3_3033[0]
    &movz   ($idx,&LB($t0));                        # (t0>>0)&0xff
    &xor    ($t3,&DWP($SBOX4_4404,$Tbl,$idx,8));    # t3^=SBOX4_4404[0]
    &shr    ($t0,16);
    &movz   ($idx,&LB($t1));                        # (t1>>0)&0xff
    &mov    ($t2,&DWP($SBOX1_1110,$Tbl,$idx,8));    # t2=SBOX1_1110[1]
    &movz   ($idx,&HB($t0));                        # (t0>>24)&0xff
    &xor    ($t3,&DWP($SBOX1_1110,$Tbl,$idx,8));    # t3^=SBOX1_1110[0]
    &movz   ($idx,&HB($t1));                        # (t1>>8)&0xff
    &xor    ($t2,&DWP($SBOX4_4404,$Tbl,$idx,8));    # t2^=SBOX4_4404[1]
    &shr    ($t1,16);
    &movz   ($t0,&LB($t0));                         # (t0>>16)&0xff
    &xor    ($t3,&DWP($SBOX2_0222,$Tbl,$t0,8));     # t3^=SBOX2_0222[0]
    &movz   ($idx,&HB($t1));                        # (t1>>24)&0xff
    &mov    ($t0,&DWP($frame+4*(($j+3)%4),"esp"));  # prefetch "s3"
    &xor    ($t2,$t3);                              # t2^=t3
    &rotr   ($t3,8);                                # t3=RightRotate(t3,8)
    &xor    ($t2,&DWP($SBOX2_0222,$Tbl,$idx,8));    # t2^=SBOX2_0222[1]
    &movz   ($idx,&LB($t1));                        # (t1>>16)&0xff
    &mov    ($t1,&DWP($frame+4*(($j+2)%4),"esp"));  # prefetch "s2"
    &xor    ($t3,$t0);                              # t3^=s3
    &xor    ($t2,&DWP($SBOX3_3033,$Tbl,$idx,8));    # t2^=SBOX3_3033[1]
    &mov    ($idx,&DWP($seed+($i+1)*$scale,$key));  # prefetch key[i+1]
    &xor    ($t3,$t2);                              # t3^=t2
    &mov    (&DWP($frame+4*(($j+3)%4),"esp"),$t3);  # s3=t3
    &xor    ($t2,$t1);                              # t2^=s2
    &mov    (&DWP($frame+4*(($j+2)%4),"esp"),$t2);  # s2=t2
}

# void Camellia_EncryptBlock_Rounds(
#       int grandRounds,
#       const Byte plaintext[],
#       const KEY_TABLE_TYPE keyTable,
#       Byte ciphertext[])
#
# Sets up an aligned private stack frame, loads and byte-swaps the four
# plaintext words, calls _x86_Camellia_encrypt, then byte-swaps and
# stores the four result words.
&function_begin("Camellia_EncryptBlock_Rounds");
    &mov    ("eax",&wparam(0));     # load grandRounds
    &mov    ($idx,&wparam(1));      # load plaintext pointer
    &mov    ($key,&wparam(2));      # load key schedule pointer

    &mov    ("ebx","esp");
    &sub    ("esp",7*4);            # place for s[0-3],keyEnd,esp and ra
    &and    ("esp",-64);

    # place stack frame just "above mod 1024" the key schedule
    # this ensures that cache associativity of 2 suffices
    &lea    ("ecx",&DWP(-64-63,$key));
    &sub    ("ecx","esp");
    &neg    ("ecx");
    &and    ("ecx",0x3C0);          # modulo 1024, but aligned to cache-line
    &sub    ("esp","ecx");
    &add    ("esp",4);              # 4 is reserved for callee's return address

    &shl    ("eax",6);              # grandRounds*64
    &lea    ("eax",&DWP(0,$key,"eax"));
    &mov    ($_esp,"ebx");          # save %esp
    &mov    ($_end,"eax");          # save keyEnd

    # position-independent way of obtaining the address of Camellia_SBOX
    &call   (&label("pic_point"));
    &set_label("pic_point");
    &blindpop($Tbl);
    &lea    ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

    &mov    (@T[0],&DWP(0,$idx));   # load plaintext
    &mov    (@T[1],&DWP(4,$idx));
    &mov    (@T[2],&DWP(8,$idx));
    &bswap  (@T[0]);
    &mov    (@T[3],&DWP(12,$idx));
    &bswap  (@T[1]);
    &bswap  (@T[2]);
    &bswap  (@T[3]);

    &call   ("_x86_Camellia_encrypt");

    &mov    ("esp",$_esp);          # restore caller's %esp
    &bswap  (@T[0]);
    &mov    ($idx,&wparam(3));      # load ciphertext pointer
    &bswap  (@T[1]);
    &bswap  (@T[2]);
    &bswap  (@T[3]);
    &mov    (&DWP(0,$idx),@T[0]);   # write ciphertext
    &mov    (&DWP(4,$idx),@T[1]);
    &mov    (&DWP(8,$idx),@T[2]);
    &mov    (&DWP(12,$idx),@T[3]);
&function_end("Camellia_EncryptBlock_Rounds");

# V1.x API
# Replaces the first argument (keyBitLength) with the grandRounds count
# and jumps to Camellia_EncryptBlock_Rounds.
&function_begin_B("Camellia_EncryptBlock");
    &mov    ("eax",128);
    &sub    ("eax",&wparam(0));     # load keyBitLength
    &mov    ("eax",3);
    &adc    ("eax",0);              # keyBitLength==128?3:4
    &mov    (&wparam(0),"eax");
    &jmp    (&label("Camellia_EncryptBlock_Rounds"));
&function_end_B("Camellia_EncryptBlock");

if ($OPENSSL) {
# void Camellia_encrypt(
#       const unsigned char *in,
#       unsigned char *out,
#       const CAMELLIA_KEY *key)
#
# Same as Camellia_EncryptBlock_Rounds, except that the grandRounds
# counter is read from offset 272 of the key structure.
&function_begin("Camellia_encrypt");
    &mov    ($idx,&wparam(0));      # load plaintext pointer
    &mov    ($key,&wparam(2));      # load key schedule pointer

    &mov    ("ebx","esp");
    &sub    ("esp",7*4);            # place for s[0-3],keyEnd,esp and ra
    &and    ("esp",-64);
    &mov    ("eax",&DWP(272,$key)); # load grandRounds counter

    # place stack frame just "above mod 1024" the key schedule
    # this ensures that cache associativity of 2 suffices
    &lea    ("ecx",&DWP(-64-63,$key));
    &sub    ("ecx","esp");
    &neg    ("ecx");
    &and    ("ecx",0x3C0);          # modulo 1024, but aligned to cache-line
    &sub    ("esp","ecx");
    &add    ("esp",4);              # 4 is reserved for callee's return address

    &shl    ("eax",6);              # grandRounds*64
    &lea    ("eax",&DWP(0,$key,"eax"));
    &mov    ($_esp,"ebx");          # save %esp
    &mov    ($_end,"eax");          # save keyEnd

    # position-independent way of obtaining the address of Camellia_SBOX
    &call   (&label("pic_point"));
    &set_label("pic_point");
    &blindpop($Tbl);
    &lea    ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

    &mov    (@T[0],&DWP(0,$idx));   # load plaintext
    &mov    (@T[1],&DWP(4,$idx));
    &mov    (@T[2],&DWP(8,$idx));
    &bswap  (@T[0]);
    &mov    (@T[3],&DWP(12,$idx));
    &bswap  (@T[1]);
    &bswap  (@T[2]);
    &bswap  (@T[3]);

    &call   ("_x86_Camellia_encrypt");

    &mov    ("esp",$_esp);          # restore caller's %esp
    &bswap  (@T[0]);
    &mov    ($idx,&wparam(1));      # load ciphertext pointer
    &bswap  (@T[1]);
    &bswap  (@T[2]);
    &bswap  (@T[3]);
    &mov    (&DWP(0,$idx),@T[0]);   # write ciphertext
    &mov    (&DWP(4,$idx),@T[1]);
    &mov    (&DWP(8,$idx),@T[2]);
    &mov    (&DWP(12,$idx),@T[3]);
&function_end("Camellia_encrypt");
}

# Private encryption core.
# Input:  @T[0-3] hold the block, $key points at the key schedule,
#         $Tbl at Camellia_SBOX, and the caller has allocated the
#         frame described by $__s0..$__end above.
# Output: @T[0-3] hold the result; $key has been advanced to the end
#         of the key schedule.
&function_begin_B("_x86_Camellia_encrypt");
    &xor    (@T[0],&DWP(0,$key));   # ^=key[0-3]
    &xor    (@T[1],&DWP(4,$key));
    &xor    (@T[2],&DWP(8,$key));
    &xor    (@T[3],&DWP(12,$key));
    &mov    ($idx,&DWP(16,$key));   # prefetch key[4]

    &mov    ($__s0,@T[0]);          # save s[0-3]
    &mov    ($__s1,@T[1]);
    &mov    ($__s2,@T[2]);
    &mov    ($__s3,@T[3]);

&set_label("loop",16);
    # six Feistel rounds per pass; keys start at offset 16, the frame
    # is at 4(%esp) because of the return address
    for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16,4); }

    &add    ($key,16*4);
    &cmp    ($key,$__end);
    &je     (&label("done"));

    # @T[0-1] are preloaded, $idx is preloaded with key[0]
    &and    ($idx,@T[0]);
    &mov    (@T[3],$__s3);
    &rotl   ($idx,1);
    &mov    (@T[2],@T[3]);
    &xor    (@T[1],$idx);
    &or     (@T[2],&DWP(12,$key));
    &mov    ($__s1,@T[1]);          # s1^=LeftRotate(s0&key[0],1);
    &xor    (@T[2],$__s2);

    &mov    ($idx,&DWP(4,$key));
    &mov    ($__s2,@T[2]);          # s2^=s3|key[3];
    &or     ($idx,@T[1]);
    &and    (@T[2],&DWP(8,$key));
    &xor    (@T[0],$idx);
    &rotl   (@T[2],1);
    &mov    ($__s0,@T[0]);          # s0^=s1|key[1];
    &xor    (@T[3],@T[2]);
    &mov    ($idx,&DWP(16,$key));   # prefetch key[4]
    &mov    ($__s3,@T[3]);          # s3^=LeftRotate(s2&key[2],1);
    &jmp    (&label("loop"));

&set_label("done",8);
    &mov    (@T[2],@T[0]);          # SwapHalf
    &mov    (@T[3],@T[1]);
    &mov    (@T[0],$__s2);
    &mov    (@T[1],$__s3);
    &xor    (@T[0],$idx);           # $idx is preloaded with key[0]
    &xor    (@T[1],&DWP(4,$key));
    &xor    (@T[2],&DWP(8,$key));
    &xor    (@T[3],&DWP(12,$key));
    &ret    ();
&function_end_B("_x86_Camellia_encrypt");

# void Camellia_DecryptBlock_Rounds(
#       int grandRounds,
#       const Byte ciphertext[],
#       const KEY_TABLE_TYPE keyTable,
#       Byte plaintext[])
#
# Mirror image of Camellia_EncryptBlock_Rounds: the start of the key
# schedule is saved in the frame and $key is pointed at its end before
# _x86_Camellia_decrypt is called.
&function_begin("Camellia_DecryptBlock_Rounds");
    &mov    ("eax",&wparam(0));     # load grandRounds
    &mov    ($idx,&wparam(1));      # load ciphertext pointer
    &mov    ($key,&wparam(2));      # load key schedule pointer

    &mov    ("ebx","esp");
    &sub    ("esp",7*4);            # place for s[0-3],keyEnd,esp and ra
    &and    ("esp",-64);

    # place stack frame just "above mod 1024" the key schedule
    # this ensures that cache associativity of 2 suffices
    &lea    ("ecx",&DWP(-64-63,$key));
    &sub    ("ecx","esp");
    &neg    ("ecx");
    &and    ("ecx",0x3C0);          # modulo 1024, but aligned to cache-line
    &sub    ("esp","ecx");
    &add    ("esp",4);              # 4 is reserved for callee's return address

    &shl    ("eax",6);              # grandRounds*64
    &mov    (&DWP(4*4,"esp"),$key); # save keyStart
    &lea    ($key,&DWP(0,$key,"eax"));
    &mov    (&DWP(5*4,"esp"),"ebx");# save %esp

    # position-independent way of obtaining the address of Camellia_SBOX
    &call   (&label("pic_point"));
    &set_label("pic_point");
    &blindpop($Tbl);
    &lea    ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

    &mov    (@T[0],&DWP(0,$idx));   # load ciphertext
    &mov    (@T[1],&DWP(4,$idx));
    &mov    (@T[2],&DWP(8,$idx));
    &bswap  (@T[0]);
    &mov    (@T[3],&DWP(12,$idx));
    &bswap  (@T[1]);
    &bswap  (@T[2]);
    # All four words must be byte-swapped on input, exactly as in the
    # encrypt entry points and as done on output below; the swap of
    # @T[3] used to be missing.
    &bswap  (@T[3]);

    &call   ("_x86_Camellia_decrypt");

    &mov    ("esp",&DWP(5*4,"esp")); # restore caller's %esp
    &bswap  (@T[0]);
    &mov    ($idx,&wparam(3));      # load plaintext pointer
    &bswap  (@T[1]);
    &bswap  (@T[2]);
    &bswap  (@T[3]);
    &mov    (&DWP(0,$idx),@T[0]);   # write plaintext
    &mov    (&DWP(4,$idx),@T[1]);
    &mov    (&DWP(8,$idx),@T[2]);
    &mov    (&DWP(12,$idx),@T[3]);
&function_end("Camellia_DecryptBlock_Rounds");

# V1.x API
# Replaces the first argument (keyBitLength) with the grandRounds count
# and jumps to Camellia_DecryptBlock_Rounds.
&function_begin_B("Camellia_DecryptBlock");
    &mov    ("eax",128);
    &sub    ("eax",&wparam(0));     # load keyBitLength
    &mov    ("eax",3);
    &adc    ("eax",0);              # keyBitLength==128?3:4
    &mov    (&wparam(0),"eax");
    &jmp    (&label("Camellia_DecryptBlock_Rounds"));
&function_end_B("Camellia_DecryptBlock");

if ($OPENSSL) {
# void Camellia_decrypt(
#       const unsigned char *in,
#       unsigned char *out,
#       const CAMELLIA_KEY *key)
&function_begin("Camellia_decrypt");
    &mov    ($idx,&wparam(0));      # load ciphertext pointer
    &mov    ($key,&wparam(2));      # load key schedule pointer

    &mov    ("ebx","esp");
    &sub    ("esp",7*4);            # place for s[0-3],keyEnd,esp and ra
    &and    ("esp",-64);
    &mov    ("eax",&DWP(272,$key)); # load grandRounds counter

    # place stack frame just "above mod 1024" the key schedule
    # this ensures that cache associativity of 2 suffices
    &lea    ("ecx",&DWP(-64-63,$key));
    &sub    ("ecx","esp");
    &neg    ("ecx");
    &and    ("ecx",0x3C0);          # modulo 1024, but aligned to cache-line
    &sub    ("esp","ecx");
    &add    ("esp",4);              # 4 is reserved for callee's return address

    &shl    ("eax",6);              # grandRounds*64
    &mov    (&DWP(4*4,"esp"),$key); # save keyStart
# NOTE(review): the available source is truncated at this point, inside
# Camellia_decrypt. The remainder of this routine (and the closing brace
# of the "if ($OPENSSL)" block) must be restored from the upstream file.
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -