// motion.c
// Font size:
unsigned char *blk1,*blk2;
int lx,hx,hy,h;
int distlim;
{
unsigned char *p1,*p1a,*p2;
int i,j;
int s,v;
if(fastMotionCompensationLevel)
{
lx <<= fastMotionCompensationLevel;
h >>= fastMotionCompensationLevel;
}
if (!hx && !hy)
{
// For MMX-capable processors
if(cpu_MMX)
{
_asm
{
mov esi, blk1 ;// esi = blk1
mov edi, blk2 ;// edi = blk2
mov ecx, h ;// ecx = h
mov edx, lx ;// edx = lx
mov eax, 0 ;// eax = s
dist1__l1:
movq mm0, [esi] ;// load esi[0..7] into mm0
movq mm1, [edi] ;// load edi[0..7] into mm1
movq mm2, [esi+8] ;// load esi[8..15] into mm2
movq mm3, [edi+8] ;// load edi[8..15] into mm3
movq mm4, mm0
movq mm5, mm1
movq mm6, mm2
movq mm7, mm3
psubusb mm0, mm1
psubusb mm2, mm3
psubusb mm5, mm4
psubusb mm7, mm6
pxor mm1, mm1 ;// mm1 = 0
por mm0, mm5 ;// mm0 = abs(esi[0..7] - edi[0..7])
por mm2, mm7 ;// mm2 = abs(esi[8..15] - edi[8..15])
movq mm4, mm0
movq mm6, mm2
punpcklbw mm0, mm1 ;// unpack the lower 4 bytes into mm0
punpcklbw mm2, mm1 ;// unpack the lower 4 bytes into mm2
punpckhbw mm4, mm1 ;// unpack the upper 4 bytes into mm4
punpckhbw mm6, mm1 ;// unpack the upper 4 bytes into mm6
paddw mm0, mm2
paddw mm4, mm6
paddw mm0, mm4 ;// mm0 += mm2 + mm4 + mm6
movq mm5, mm0
punpcklwd mm0, mm1 ;// unpack the lower 2 words into mm0
punpckhwd mm5, mm1 ;// unpack the upper 2 words into mm5
paddd mm0, mm5 ;// mm0 += mm5
movd ebx, mm0 ;// load lower dword of mm0 into ebx
add eax, ebx ;// eax += ebx
psrlq mm0, 32 ;// shift mm0 to get upper dword
movd ebx, mm0 ;// load lower dword of mm0 into ebx
add eax, ebx ;// eax += ebx
cmp eax, distlim ;// compare eax with distlim
jge dist1__l2 ;// terminate if eax >= distlim
add esi, edx ;// esi += edx
add edi, edx ;// edi += edx
dec ecx ;// decrement ecx
jnz dist1__l1 ;// loop while not zero
dist1__l2:
mov s, eax ;// s = eax
emms ;// empty MMX state
}
}
else
{
s = 0;
p1 = blk1;
p2 = blk2;
for (j=0; j<h; j++)
{
if ((v = p1[0] - p2[0])<0) v = -v; s+= v;
if ((v = p1[1] - p2[1])<0) v = -v; s+= v;
if ((v = p1[2] - p2[2])<0) v = -v; s+= v;
if ((v = p1[3] - p2[3])<0) v = -v; s+= v;
if ((v = p1[4] - p2[4])<0) v = -v; s+= v;
if ((v = p1[5] - p2[5])<0) v = -v; s+= v;
if ((v = p1[6] - p2[6])<0) v = -v; s+= v;
if ((v = p1[7] - p2[7])<0) v = -v; s+= v;
if ((v = p1[8] - p2[8])<0) v = -v; s+= v;
if ((v = p1[9] - p2[9])<0) v = -v; s+= v;
if ((v = p1[10] - p2[10])<0) v = -v; s+= v;
if ((v = p1[11] - p2[11])<0) v = -v; s+= v;
if ((v = p1[12] - p2[12])<0) v = -v; s+= v;
if ((v = p1[13] - p2[13])<0) v = -v; s+= v;
if ((v = p1[14] - p2[14])<0) v = -v; s+= v;
if ((v = p1[15] - p2[15])<0) v = -v; s+= v;
if (s >= distlim)
break;
p1+= lx;
p2+= lx;
}
}
}
else if (hx && !hy)
{
// For CPUs that support 3DNow
if (cpu_3DNow)
{
s = 0;
p1 = blk1;
p2 = blk2;
for (j=0; j<h; j++)
{
#define femmsInst __asm _emit 0x0F __asm _emit 0x0E
#define pavgusbmm0mm2 __asm _emit 0x0F __asm _emit 0x0F __asm _emit 0xC2 __asm _emit 0xBF
#define pavgusbmm1mm3 __asm _emit 0x0F __asm _emit 0x0F __asm _emit 0xCB __asm _emit 0xBF
__asm
{
femmsInst
mov eax, p1
movq mm0,[eax]
movq mm1,[eax+8]
movq mm2,[eax+1]
movq mm3,[eax+9]
pavgusbmm0mm2
movq tmpResult,mm0
pavgusbmm1mm3
movq tmpResult+8,mm1
femmsInst
}
s += calcDist16(tmpResult,p2);
p1+= lx;
p2+= lx;
}
}
else if(cpu_MMX)
{
_asm
{
mov esi, blk1 ;// esi = blk1
mov edi, blk2 ;// edi = blk2
mov ecx, h ;// ecx = h
mov edx, lx ;// edx = lx
mov eax, 0 ;// eax = s
pxor mm7, mm7 ;// mm7 = 0
dist1__l3:
movq mm5, PACKED_1 ;// mm5 = (1, 1, 1, 1)
movd mm0, [esi] ;// lower 4 bytes in mm0 = esi[0..3]
movd mm1, [esi+1] ;// lower 4 bytes in mm1 = esi[1..4]
movd mm2, [esi+4] ;// lower 4 bytes in mm2 = esi[4..7]
movd mm3, [esi+5] ;// lower 4 bytes in mm3 = esi[5..8]
punpcklbw mm0, mm7 ;// unpack the lower 4 bytes into mm0
punpcklbw mm1, mm7 ;// unpack the lower 4 bytes into mm1
punpcklbw mm2, mm7 ;// unpack the lower 4 bytes into mm2
punpcklbw mm3, mm7 ;// unpack the lower 4 bytes into mm3
paddw mm0, mm1 ;// mm0 += mm1
paddw mm2, mm3 ;// mm2 += mm3
paddw mm0, mm5 ;// mm0 += (1,1,1,1)
paddw mm2, mm5 ;// mm2 += (1,1,1,1)
psrlw mm0, 1 ;// mm0 >>= 1
psrlw mm2, 1 ;// mm2 >>= 1
movd mm4, [edi] ;// lower 4 bytes in mm4 = edi[0..3]
movd mm6, [edi+4] ;// lower 4 bytes in mm6 = edi[0..3]
punpcklbw mm4, mm7 ;// unpack the lower 4 bytes into mm4
punpcklbw mm6, mm7 ;// unpack the lower 4 bytes into mm6
movq mm1, mm0
movq mm3, mm2
psubusw mm0, mm4
psubusw mm2, mm6
psubusw mm4, mm1
psubusw mm6, mm3
por mm0, mm4 ;// mm0 = abs((esi[0..3] + esi[1..4]) >> 1) - edi[0..3])
por mm2, mm6 ;// mm2 = abs((esi[4..7] + esi[5..8]) >> 1) - edi[4..7])
movq mm4, mm0
movq mm6, mm2
punpcklwd mm0, mm7 ;// unpack the lower 4 word into mm0
punpckhwd mm4, mm7 ;// unpack the lower 4 word into mm4
punpcklwd mm2, mm7 ;// unpack the lower 4 word into mm2
punpckhwd mm6, mm7 ;// unpack the lower 4 word into mm6
paddd mm0, mm2
paddd mm4, mm6
paddd mm0, mm4 ;// mm0 += mm2 + mm4 + mm6
movd ebx, mm0 ;// load lower dword of mm0 into ebx
add eax, ebx ;// eax += ebx
psrlq mm0, 32 ;// shift mm0 to get upper dword
movd ebx, mm0 ;// load lower dword of mm0 into ebx
add eax, ebx ;// eax += ebx
movd mm0, [esi+8] ;// lower 4 bytes in mm0 = esi[8..11]
movd mm1, [esi+9] ;// lower 4 bytes in mm1 = esi[9..12]
movd mm2, [esi+12] ;// lower 4 bytes in mm2 = esi[12..15]
movd mm3, [esi+13] ;// lower 4 bytes in mm3 = esi[13..16]
punpcklbw mm0, mm7 ;// unpack the lower 4 bytes into mm0
punpcklbw mm1, mm7 ;// unpack the lower 4 bytes into mm1
punpcklbw mm2, mm7 ;// unpack the lower 4 bytes into mm2
punpcklbw mm3, mm7 ;// unpack the lower 4 bytes into mm3
paddw mm0, mm1 ;// mm0 += mm1
paddw mm2, mm3 ;// mm2 += mm3
paddw mm0, mm5 ;// mm0 += (1,1,1,1)
paddw mm2, mm5 ;// mm2 += (1,1,1,1)
psrlw mm0, 1 ;// mm0 >>= 1
psrlw mm2, 1 ;// mm2 >>= 1
movd mm4, [edi+8] ;// lower 4 bytes in mm4 = edi[8..11]
movd mm6, [edi+12] ;// lower 4 bytes in mm6 = edi[12..15]
punpcklbw mm4, mm7 ;// unpack the lower 4 bytes into mm4
punpcklbw mm6, mm7 ;// unpack the lower 4 bytes into mm6
movq mm1, mm0
movq mm3, mm2
psubusw mm0, mm4
psubusw mm2, mm6
psubusw mm4, mm1
psubusw mm6, mm3
por mm0, mm4 ;// mm0 = abs((esi[8..11] + esi[9..12]) >> 1) - edi[8..11])
por mm2, mm6 ;// mm2 = abs((esi[12..15] + esi[13..16]) >> 1) - edi[12..15])
movq mm4, mm0
movq mm6, mm2
punpcklwd mm0, mm7 ;// unpack the lower 4 word into mm0
punpckhwd mm4, mm7 ;// unpack the lower 4 word into mm4
punpcklwd mm2, mm7 ;// unpack the lower 4 word into mm2
punpckhwd mm6, mm7 ;// unpack the lower 4 word into mm6
paddd mm0, mm2
paddd mm4, mm6
paddd mm0, mm4 ;// mm0 += mm2 + mm4 + mm6
movd ebx, mm0 ;// load lower dword of mm0 into ebx
add eax, ebx ;// eax += ebx
psrlq mm0, 32 ;// shift mm0 to get upper dword
movd ebx, mm0 ;// load lower dword of mm0 into ebx
add eax, ebx ;// eax += ebx
add esi, edx ;// esi += edx
add edi, edx ;// edi += edx
dec ecx ;// decrement ecx
jnz dist1__l3 ;// loop while not zero
mov s, eax ;// s = eax
emms ;// empty MMX state
}
}
else
{
s = 0;
p1 = blk1;
p2 = blk2;
for (j=0; j<h; j++)
{
for (i=0; i<16; i++)
{
v = ((unsigned int)(p1[i]+p1[i+1]+1)>>1) - p2[i];
if (v>=0)
s+= v;
else
s-= v;
}
p1+= lx;
p2+= lx;
}
}
}
else if (!hx && hy)
{
if(cpu_3DNow)
{
s = 0;
p1 = blk1;
p2 = blk2;
p1a = p1 + lx;
for (j=0; j<h; j++)
{
__asm
{
// femms
femmsInst
mov eax, p1
movq mm0,[eax]
movq mm1,[eax+8]
mov eax,p1a
movq mm2,[eax]
movq mm3,[eax+8]
pavgusbmm0mm2
movq tmpResult,mm0
pavgusbmm1mm3
movq tmpResult+8,mm1
femmsInst
}
s += calcDist16(tmpResult,p2);
p1 = p1a;
p1a+= lx;
p2+= lx;
}
}
else if(cpu_MMX)
{
_asm
{
mov esi, blk1 ;// esi = blk1
mov edi, blk2 ;// edi = blk2
mov edx, lx ;// edx = lx
mov ecx, h ;// ecx = h
mov eax, 0 ;// eax = s
pxor mm7, mm7 ;// mm7 = 0
dist1__l4:
movq mm5, PACKED_1 ;// mm5 = (1, 1, 1, 1)
movd mm0, [esi] ;// lower 4 bytes in mm0 = esi[0..3]
movd mm1, [esi+edx] ;// lower 4 bytes in mm1 = (esi + edx)[0..3]
movd mm2, [esi+4] ;// lower 4 bytes in mm2 = esi[4..7]
movd mm3, [esi+edx+4] ;// lower 4 bytes in mm3 = (esi + edx)[4..7]
punpcklbw mm0, mm7 ;// unpack the lower 4 bytes into mm0
punpcklbw mm1, mm7 ;// unpack the lower 4 bytes into mm1
punpcklbw mm2, mm7 ;// unpack the lower 4 bytes into mm2
punpcklbw mm3, mm7 ;// unpack the lower 4 bytes into mm3
paddw mm0, mm1 ;// mm0 += mm1
paddw mm2, mm3 ;// mm2 += mm3
paddw mm0, mm5 ;// mm0 += (1, 1, 1, 1)
paddw mm2, mm5 ;// mm2 += (1, 1, 1, 1)
psrlw mm0, 1 ;// mm0 >>= 1
psrlw mm2, 1 ;// mm0 >>= 1
movd mm4, [edi] ;// lower 4 bytes in mm4 = edi[0..3]
movd mm6, [edi+4] ;// lower 4 bytes in mm6 = edi[4..7]
punpcklbw mm4, mm7 ;// unpack the lower 4 bytes into mm4
punpcklbw mm6, mm7 ;// unpack the lower 4 bytes into mm6
movq mm1, mm0
movq mm3, mm2
psubusw mm0, mm4
psubusw mm2, mm6
psubusw mm4, mm1
psubusw mm6, mm3
por mm0, mm4 ;// mm0 = abs((esi[0..3] + (esi + edx)[0..3]) >> 1) - edi[0..3])
por mm2, mm6 ;// mm2 = abs((esi[4..7] + (esi + edx)[4..7]) >> 1) - edi[4..7])
movq mm4, mm0
movq mm6, mm2
punpcklwd mm0, mm7 ;// unpack lower 4 words into mm0
punpckhwd mm4, mm7 ;// unpack upper 4 words into mm4
punpcklwd mm2, mm7 ;// unpack lower 4 words into mm2
punpckhwd mm6, mm7 ;// unpack upper 4 words into mm6
paddd mm0, mm2
paddd mm4, mm6
paddd mm0, mm4 ;// mm0 += mm2 + mm4 + mm6
movd ebx, mm0 ;// load lower dword of mm0 into ebx
add eax, ebx ;// eax += ebx
psrlq mm0, 32 ;// shift mm0 to get upper dword
movd ebx, mm0 ;// load lower dword of mm0 into ebx
add eax, ebx ;// eax += ebx
movd mm0, [esi+8] ;// lower 4 bytes in mm0 = esi[8..11]
movd mm1, [esi+edx+8] ;// lower 4 bytes in mm1 = (esi + edx)[8..11]
movd mm2, [esi+12] ;// lower 4 bytes in mm2 = esi[12..15]
movd mm3, [esi+edx+12] ;// lower 4 bytes in mm3 = (esi + edx)[12..15]
punpcklbw mm0, mm7 ;// unpack the lower 4 bytes into mm0
punpcklbw mm1, mm7 ;// unpack the lower 4 bytes into mm1
punpcklbw mm2, mm7 ;// unpack the lower 4 bytes into mm2
punpcklbw mm3, mm7 ;// unpack the lower 4 bytes into mm3
paddw mm0, mm1 ;// mm0 += mm1
paddw mm2, mm3 ;// mm2 += mm3
paddw mm0, mm5 ;// mm0 += (1, 1, 1, 1)
paddw mm2, mm5 ;// mm2 += (1, 1, 1, 1)
psrlw mm0, 1 ;// mm0 >>= 1
psrlw mm2, 1 ;// mm0 >>= 1
movd mm4, [edi+8] ;// lower 4 bytes in mm4 = edi[8..11]
movd mm6, [edi+12] ;// lower 4 bytes in mm6 = edi[12..15]
punpcklbw mm4, mm7 ;// unpack the lower 4 bytes into mm4
punpcklbw mm6, mm7 ;// unpack the lower 4 bytes into mm6
movq mm1, mm0
movq mm3, mm2
psubusw mm0, mm4
psubusw mm2, mm6
psubusw mm4, mm1
psubusw mm6, mm3
por mm0, mm4 ;// mm0 = abs((esi[8..11] + (esi + edx)[8..11]) >> 1) - edi[8..11])
por mm2, mm6 ;// mm2 = abs((esi[12..15] + (esi + edx)[12..15]) >> 1) - edi[12..15])
movq mm4, mm0
movq mm6, mm2
punpcklwd mm0, mm7 ;// unpack the lower 2 words into mm0
punpckhwd mm4, mm7 ;// unpack the upper 2 words into mm4
punpcklwd mm2, mm7 ;// unpack the lower 2 words into mm2
punpckhwd mm6, mm7 ;// unpack the upper 2 words into mm6
paddd mm0, mm2
paddd mm4, mm6
paddd mm0, mm4 ;// mm0 += mm2 + mm4 + mm6
movd ebx, mm0 ;// load lower dword of mm0 into ebx
add eax, ebx ;// eax += ebx
psrlq mm0, 32 ;// shift mm0 to get upper dword
movd ebx, mm0 ;// load lower dword of mm0 into ebx
add eax, ebx ;// eax += ebx
add esi, edx ;// esi += edx
add edi, edx ;// edi += edx
dec ecx ;// decrement ecx
jnz dist1__l4 ;// loop while not zero
mov s, eax ;// s = eax
emms ;// empty MMX state
}
}
else
{
s = 0;
p1 = blk1;
p2 = blk2;
p1a = p1 + lx;
for (j=0; j<h; j++)
{
for (i=0; i<16; i++)
{
v = ((unsigned int)(p1[i]+p1a[i]+1)>>1) - p2[i];
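// Keyboard shortcuts
// Copy code
// Ctrl + C
// Search code
// Ctrl + F
// Full-screen mode
// F11
// Toggle theme
// Ctrl + Shift + D
// Show shortcuts
// ?
// Increase font size
// Ctrl + =
// Decrease font size
// Ctrl + -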