?? idea.c
字號:
/* idea.c - C source code for IDEA block cipher. IDEA (International Data
* Encryption Algorithm), formerly known as IPES (Improved Proposed Encryption
* Standard). Algorithm developed by Xuejia Lai and James L. Massey, of ETH
* Zurich. This implementation modified and derived from original C code
* developed by Xuejia Lai. Zero-based indexing added, names changed from IPES
* to IDEA. CFB functions added. Random number routines added. Optimized for
* speed 21 Oct 92 by Colin Plumb <colin@nsq.gts.org>. This code assumes that
* each pair of 8-bit bytes comprising a 16-bit word in the key and in the
* cipher block are externally represented with the Most Significant Byte
* (MSB) first, regardless of internal native byte order of the target CPU. */
#ifdef TEST
#include <stdio.h>
#include <time.h>
#endif
#ifdef sgi
#define HIGHFIRST
#endif
#ifdef sun
#define HIGHFIRST
#define const
#endif
#include "idea.h"
/* FAR is used below to qualify pointers to caller-supplied buffers.
 * NOTE(review): IFAR is not defined in this file; presumably it comes from
 * idea.h -- confirm. */
#define FAR IFAR
/* Build selection:
 *   _WIN32 not defined and _M_I86 defined -> USE_ASM (inline-assembly cipher)
 *   _WIN32 defined                        -> IDEA32  (wider-than-16-bit temps)
 */
#ifndef _WIN32
#ifdef _M_I86
#define USE_ASM
#endif
#else
#define IDEA32
#endif
/* Smaller of x and y. Function-like macro: each argument may be evaluated
 * twice, so do not pass expressions with side effects. */
#define min(x, y) (((x) < (y)) ? (x) : (y))
#define TRUE 1
#define FALSE 0
/* Cipher block size: 8 bytes, i.e. four 16-bit words. */
#define IDEABLOCKSIZE 8
#define ROUNDS 8 /* Don't change this value, should be 8 */
/* Six subkeys per round plus four more: 52 words when ROUNDS is 8. */
#define KEYLEN (6*ROUNDS+4) /* length of key schedule */
/* Basic types. These are macros rather than typedefs, so they expand
 * textually wherever they are used. */
#define byte unsigned char
#define word16 unsigned short
#define boolean int
#define word32 unsigned long
#define byteptr unsigned char FAR *
/* A complete key schedule (encryption or decryption): KEYLEN 16-bit words. */
typedef word16 IDEAkey[KEYLEN];
/* uint16 is the type used for working temporaries. With IDEA32 it is a plain
 * unsigned int, so values may carry bits above bit 15 and low16() must mask
 * them off; otherwise it is word16 itself and low16() is a no-op. */
#ifdef IDEA32 /* Use >16-bit temporaries */
#define low16(x) ((x) & 0xFFFF)
typedef unsigned int uint16; /* at LEAST 16 bits, maybe more */
#else
#define low16(x) (x) /* this is only ever applied to uint16's */
typedef word16 uint16;
#endif
/* CONST decorates helper functions whose result depends only on their
 * arguments.
 * NOTE(review): GCC predefines __GNUC__ (two leading and two trailing
 * underscores); _GNUC_ as tested here looks like a typo, in which case CONST
 * always expands to nothing -- confirm before changing. */
#ifdef _GNUC_
/* __const__ simply means there are no side effects for this function,
* which is useful info for the gcc optimizer */
#define CONST __const__
#else
#define CONST
#endif
/* Key-schedule helpers defined later in this file. */
static void en_key_idea(word16 *userkey, word16 *Z);
static void de_key_idea(IDEAkey Z, IDEAkey DK);
/* Multiply a by b modulo (2**16)+1, where an operand of 0 stands for 2**16.
 *
 * If either operand is 0 the product is the negation of the other operand
 * modulo (2**16)+1, which is 1 minus that operand in 16-bit arithmetic.
 * Otherwise the 32-bit product is reduced by subtracting its high half from
 * its low half and adding 1 back when that subtraction borrows.
 *
 * The result is not masked here: when uint16 is wider than 16 bits the
 * caller is expected to apply low16() to it. */
#ifdef SMALL_CACHE
CONST static uint16 mul(register uint16 a, register uint16 b)
{
    register word32 product;

    /* Zero operands are handled first so the common path falls through. */
    if (!a)
        return 1 - b;
    if (!b)
        return 1 - a;

    product = (word32)a * b;
    b = low16(product);      /* low half of the product */
    a = product >> 16;       /* high half of the product */
    return b - a + (b < a);  /* low - high, +1 if the subtraction borrowed */
}
#endif /* SMALL_CACHE */
/* Return the multiplicative inverse of x modulo (2**16)+1, found with
 * Euclid's GCD algorithm.
 *
 * Each pass of the loop performs two division steps so that the roles of the
 * two remainders (x and rem) and of the two running coefficients never have
 * to be swapped. Inputs 0 and 1 are returned unchanged because they are
 * their own inverses. */
CONST static uint16 inv(uint16 x)
{
    uint16 coef_a, coef_b;
    uint16 quot, rem;

    if (x <= 1)
        return x; /* 0 and 1 are self-inverse */

    /* First division step, with the modulus as the dividend. */
    coef_b = 0x10001 / x; /* x >= 2, so the quotient fits in 16 bits */
    rem = 0x10001 % x;
    coef_a = 1;

    while (rem != 1) {
        quot = x / rem;
        x = x % rem;
        coef_a += quot * coef_b;
        if (x == 1)
            return coef_a;

        quot = rem / x;
        rem = rem % x;
        coef_b += quot * coef_a;
    }

    /* Finishing on this side of the unrolled pair yields the negated
     * coefficient, hence 1 - coef_b reduced to 16 bits. */
    return low16(1 - coef_b);
}
/* Expand the user key into the IDEA encryption subkeys.
 *
 * userkey: 8 words of key material, copied verbatim into Z[0..7].
 * Z:       output schedule of KEYLEN words.
 *
 * Every further word is assembled from two words of the preceding group of
 * eight: the low 7 bits of one word, shifted up by 9, joined with the high 9
 * bits of the following word (indices wrap within the group). Z is advanced
 * by 8 each time a group has been completed. */
static void en_key_idea(word16 *userkey, word16 *Z)
{
    int slot, filled;

    for (filled = 0; filled < 8; filled++)
        Z[filled] = userkey[filled];

    slot = 0;
    while (filled < KEYLEN) {
        slot++;
        Z[slot + 7] = (Z[slot & 7] << 9) | (Z[(slot + 1) & 7] >> 7);
        Z += slot & 8;   /* slot reached 8: move on to the next group */
        slot &= 7;
        filled++;
    }
}
/* Compute IDEA decryption subkeys DK from encryption subkeys Z.
 *
 * Z is read front to back while the result is assembled back to front in a
 * local scratch schedule T, so Z and DK *may* overlap: nothing is stored to
 * DK until all of Z has been consumed.
 *
 * Each group of four words becomes inv(), negate, negate, inv(). In the
 * ROUNDS-1 middle groups the two negated words are also exchanged, while the
 * first and last groups keep them in order. The two words between groups are
 * copied unchanged, keeping their relative order.
 *
 * The scratch copy holds key material, so it is cleared through a
 * volatile-qualified pointer: plain stores to a local that is never read
 * again are dead stores which an optimizer is free to remove. */
static void de_key_idea(IDEAkey Z, IDEAkey DK)
{
    int j;
    uint16 t1, t2, t3;
    IDEAkey T;                       /* scratch schedule, filled from the end */
    word16 *p = T + KEYLEN;
    volatile word16 *scrub = T;      /* used only to wipe T afterwards */

    /* First input group -> last output group (negated words keep order). */
    t1 = inv(*Z++);
    t2 = -*Z++;
    t3 = -*Z++;
    *--p = inv(*Z++);
    *--p = t3;
    *--p = t2;
    *--p = t1;

    for (j = 1; j < ROUNDS; j++) {
        /* Two pass-through words. */
        t1 = *Z++;
        *--p = *Z++;
        *--p = t1;

        /* Middle group: the negated words are stored exchanged. */
        t1 = inv(*Z++);
        t2 = -*Z++;
        t3 = -*Z++;
        *--p = inv(*Z++);
        *--p = t2;
        *--p = t3;
        *--p = t1;
    }

    /* Final pair of pass-through words. */
    t1 = *Z++;
    *--p = *Z++;
    *--p = t1;

    /* Last input group -> first output group (negated words keep order). */
    t1 = inv(*Z++);
    t2 = -*Z++;
    t3 = -*Z++;
    *--p = inv(*Z++);
    *--p = t3;
    *--p = t2;
    *--p = t1;

    /* Copy the result out and destroy the temporary copy. */
    for (j = 0; j < KEYLEN; j++) {
        DK[j] = T[j];
        scrub[j] = 0;
    }
}
/* MUL(x,y) computes x = x*y, modulo 0x10001. Requires two temps, t16 and t32,
 * to be declared by the code that uses the macro.
 * x must be a side-effect-free lvalue (it is evaluated several times). y may
 * be anything, but unlike x, must be strictly 16 bits even if low16() is
 * #defined. All of these are intended to be equivalent; see which is faster
 * on your machine.
 *
 *   SMALL_CACHE : call the out-of-line mul() function.
 *   AVOID_JUMPS : branch-free form; both operands are decremented first and
 *                 the product is rebuilt as (x-1)*(y-1) + (x-1) + (y-1) + 1.
 *   default     : inline form that tests each operand for zero and handles
 *                 a zero operand as 1 minus the other operand.
 *
 * NOTE(review): in the AVOID_JUMPS form, when x and y are both 0 the two
 * decremented values are 0xFFFF and the sum equals 2**32, which does not fit
 * in word32 if that type is exactly 32 bits wide -- confirm this case gives
 * the expected result on the target before enabling AVOID_JUMPS. */
#ifdef SMALL_CACHE
#define MUL(x,y) (x = mul(low16(x),y))
#else
#ifdef AVOID_JUMPS
#define MUL(x,y) (x = low16(x-1), t16 = low16((y)-1), \
t32 = (word32)x*t16+x+t16+1, x = low16(t32), \
t16 = t32>>16, x = x-t16+(x<t16) )
#else
#define MUL(x,y) ((t16 = (y)) ? (x=low16(x)) ? \
t32 = (word32)x*t16, x = low16(t32), t16 = t32>>16, \
x = x-t16+(x<t16) : \
(x = 1-t16) : (x = 1-x))
#endif
#endif
#ifdef USE_ASM
static void cipher_idea(word16 FAR *inblock, word16 FAR *outblock, IDEAkey zkey)
{
word16 sx1, sx4, skk, done8;
__asm {
;A while ago I posted a message claiming a speed of 238,000
;bytes/sec for an implementation of IDEA on a 33Mh 486. Below is
;an explanation and some code to show how it works. The basic
;trick should be useful on many (but not all) processors. I
;expect only those familiar with IDEA and its reference
;implementation will be able to follow the discussion. See:
;
;Lai, Xueja and Massey, James L. A Proposal for a New Block
;Encryption Standard, Eurocrypt 90
;
;For those who have been asking for the code, sorry I kept
;putting it off. I wanted to get it out of Turbo Pascal
;ideal-mode, but I never had the time.
;
;Colin Plum wrote IDEA-386 code which is included in PGP
;2.3a and uses the same tricks. I don't know who's is
;faster, but I expect they will be very close. Now
;here's how it's done.
;
;A major bottleneck in software IDEA is the mul() routine, which
;is used 34 times per 64 bit block. The routine performs
;multiplication in the multiplicative group mod 2^16+1. The two
;factors are each in a 16 bit word, and the output is also in a 16
;bit word. Note that 0 is not a member of the multiplicative
;group and 2^16 does not fit in 16 bits. We therefor use the 0
;word to represent 2^16. Now group elements map one to one onto
;all possible 16 bit words, since 2^16+1 is prime.
;
;Here is (essentially) the reference implementation from [Lai].
;
;
;unsigned mul( unsigned a, unsigned b ) {
; long int p ;
; long unsigned q ;
; if( a==0 ) p= 0x00010001 - b ;
; else if( b==0 ) p= 0x00010001 - a ;
; else {
; q= a*b;
; p= (q & 0xffff) - (q>>16)
; if( p<0 ) p= p + 0x00010001 ;
; }
; return (unsigned)(p & 0xffff) ;
;}
;
;
;Note the method of reducing a 32 bit word modulo 2^16+1. We
;subtract the high word from the low word, and add the modulus
;back if the result is less than 0. [Lai] contains a proof that
;this works, and you can convince yourself fairly easily.
;
;To speed up this routine, we note that the tests for a=0 and b=0
;will rarely be true. With the possible exception of the first 2
;of the 34 multiplications, 0 should be no more likely than any of
;the other 65535 numbers. Note that if (and only if) either a or
;b is 0 then q will also be 0, and we can check for this in one
;instruction if our processor sets a zero flag for multiplication
;(as the 68000 does but 80x86 does not).
;
;Fortunately p will also be zero after the subtraction if and only
;if either a or b is 0. Proof: r will be zero when the high order
;word of q equals the low order word, and that happens when q is
;divisible by 00010001 hex. Since 00010001h = 2^16+1 is prime,
;this happens if either a or b is a multiple of 2^16+1, and 0 is
;the only such multiple which will fit in a 16 bit word.
;
;The speed-up strategy is to proceed under the assumption that a
;and b are not 0, check to be sure in one instruction, and
;recompute if the assumption was wrong. Here's some 8086
;assembler code:
;
; mov ax, [a]
; mul [b] ; ax is implied. q is now in DX AX
; sub ax, dx ; mod 2^16+1
; jnz not0 ; Jump if neither op was 0. Usually taken.
;
; mov ax, 1 ; recompute result knowing one op is 0.
; sub ax, [a]
; sub ax, [b]
; jmp out ; Just jump over adding the carry.
;not0:
; adc ax, 0 ; If r<0 add 1, otherwise do nothing.
;out: ; Result is now in ax
;
;
;Note that when r<0 we add 1 instead of 2^16+1 since the 2^16 part
;overflows out of the result. The "adc ax, 0" does all the work
;of checking for a negative result and adding the modulus if
;needed.
;
;The multiplication takes 9 instructions, 4 of which are rarely
;executed. I believe similar tricks are possible on many
;processors. The one drawback to the check-after-multiply tactic
;is that we can't let the multiply overwrite the only copy of an
;operand.
;
;Note that most software implementations of IDEA will run at
;slightly different speeds when 0's come up in the multiply
;routine. The reference implementation is faster on 0, this one
;is faster on non-zero. This may be a problem for some real-time
;stuff, and also suggests an attack based on timing.
;
;Finally, below is an implementation of the complete encryption
;function in 8086 assembler, to replace the cipher_idea() function
;in PGP. It takes the same parameters as the function from PGP,
;and uses the c language calling conventions. I tested it using
;the debug features of the idea.c file in PGP. You will need to
;add segment/assume directives. This version uses no global data
;and should be reentrant.
;
;The handling of zero multipliers is outside the inner loop so
;that a short conditional jump can loop back to the beginning.
;Forward conditional jumps are usually not taken and backward
;jumps are usually taken, which is consistent with 586 branch
;prediction (or so I've heard). Stalls where the output of one
;instruction is needed for the next seem unavoidable.
;
;Last I heard, IDEA was patent pending. My code is up for grabs,
;although I would get a kick out being credited if you use it.
;On the other hand Colin's code is already tested and ready
;to assemble and link with PGP.
;
;--Bryan
;
;____________________CODE STARTS BELOW THIS LINE_________
; Called as: cipher_idea(inbuff, outbuff, zkey)
; All arguments must be near pointers addressed off DS.
; push ax ; My compiler assumes these are not saved.
; push bx
; push cx
; push dx
push si
push di
; Put the 16 bit sub-blocks in registers and/or local variables
mov si, [inblock]
mov ax, [si]
mov [sx1], ax ; x1 is in ax and sx1
mov di, [si+2] ; x2 is in di
mov bx, [si+4] ; x3 is in bx
mov dx, [si+6]
mov [sx4], dx ; x4 is in sx4
mov si, [zkey] ; si points to next subkey
mov [done8], si
add [done8], 96 ; we will be finished with 8 rounds
; when si=done8
LLloop: ; 8 rounds of this
add di, [si+2] ; x2+=zkey[2] is in di
add bx, [si+4] ; x3+=zkey[4] is in bx
mul [Word ptr si] ;x1 *= zkey[0]
sub ax, dx
jz LLx1 ; if 0, use special case multiply
adc ax, 0
LLx1out:
mov [sx1], ax ; x1 is in ax and sx1
xor ax, bx ; ax= x1^x3
mul [Word ptr si+8] ; compute kk
sub ax, dx ; if 0, use special case multiply
jz LLkk
adc ax, 0
LLkkout:
mov cx, ax ; kk is in cx
mov ax, [sx4] ; x4 *= zkey[6]
mul [Word ptr si+6]
sub ax, dx
jz LLx4 ; if 0, use special case multiply
adc ax, 0
LLx4out:
mov [sx4], ax ; x4 is in sx4 and ax
xor ax, di ; x4^x2
add ax, cx ; kk+(x2^x4)
mul [Word ptr si+10]; compute t1
sub ax, dx
jz LLt1 ; if 0, use special case multiply
adc ax, 0
LLt1out: ; t1 is in ax
add cx, ax ; t2 is in cx kk+t1
xor [sx4], cx ; x4 in sx4
xor di, cx ; new x3 in di
xor bx, ax ; new x2 in bx
xchg bx, di ; x2 in di, x3 in bx
xor ax, [sx1] ; x1 in ax
mov [sx1], ax ; and [sx1]
add si, 12 ; point to next subkey
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -