; yuv12-rgb16.s
; (code-viewer page residue, not part of the program: "Font size:")
sar ebx, 1
add esi, ebx
add edx, esi
neg ebx
mov [esp+FrameWidth],ebx
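; Register Usage (as seen in the loops below):
;   edx, esi - chroma plane cursors, indexed with ebx (the loop comments label
;              [edx+ebx] as the u samples and [esi+ebx] as the v samples)
;   ebp      - luma row cursor inside the block loop; AspectCount scratch in
;              PrepareChromaLine
;   edi      - output cursor (16 bytes are written per row per iteration)
;   ebx      - negative column index reloaded from FrameWidth, counts up by 4
;   eax      - scratch: output pitch for the odd row, luma pitch
;   mm0-mm7  - pixel arithmetic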
;------------------------------------------------------------------------------
PrepareChromaLine:
; Per-row-pair setup, executed once before each pass of the 8x2 block loop.
;   ebx          <- FrameWidth slot (the negated column counter stored earlier)
;   tmpCCOPitch  <- CCOPitch, or 0 when the aspect counter runs out
;   AspectCount  <- AspectCount - 2, reloaded by AspectAdjustmentCount on expiry
; With tmpCCOPitch = 0 the odd row is later stored at the same address as the
; even row and the output cursor is not advanced past a second row.
        mov     eax, [esp+CCOPitch]                 ; default even->odd output distance
        mov     ebx, [esp+FrameWidth]               ; column counter for the block loop
        mov     ebp, [esp+AspectCount]
        mov     [esp+tmpCCOPitch], eax
        sub     ebp, 2                              ; two source rows are consumed per pass
        ja      continue                            ; still above zero (unsigned): keep pitch

        ; Aspect counter expired: collapse this row pair into one output row.
        xor     eax, eax
        add     ebp, [esp+AspectAdjustmentCount]    ; re-arm the counter
        mov     [esp+tmpCCOPitch], eax              ; tmpCCOPitch = 0

continue:
        mov     [esp+AspectCount], ebp
; do_next_8x2_block: converts one block of 8 pixels x 2 rows per iteration.
; Reads 4 bytes from each chroma plane ([edx+ebx], [esi+ebx]) and 8 luma bytes
; from each of two rows ([tmpYCursorEven+2*ebx], [tmpYCursorOdd+2*ebx]), and
; writes 16 bytes (8 packed 16-bit pixels) to [edi] and to [edi+tmpCCOPitch].
; The chroma products computed for the even row are parked in the temp_mmx
; scratch area on the stack and reused for the odd row:
;   temp_mmx+0   v words (after subtracting Minusg)
;   temp_mmx+8   4 chroma-G terms
;   temp_mmx+16  low chroma-B terms     temp_mmx+40  high chroma-B terms
;   temp_mmx+24  low chroma-R terms     temp_mmx+32  high chroma-R terms
; Shift counts and clamp limits come from stack slots (xRightShift, xLeftShift,
; xUpperLimit), so the exact bit layout of the 16-bit pixel is decided by the
; code that fills those slots (not visible here).
; Loop control: ebx is a negative index that is advanced by 4 per iteration and
; the loop repeats while it is still negative.
do_next_8x2_block:
mov ebp,[esp+tmpYCursorEven]
; ---- even row ----
movd mm1, [edx+ebx] ; 4 u bytes
pxor mm0, mm0 ; mm0 = 0, used to zero-extend bytes to words
movd mm2, [esi+ebx] ; 4 v bytes
punpcklbw mm1, mm0 ; zero-extend the 4 u bytes to words
psubw mm1, [Minusg] ; subtract Minusg from each u word (original note: u-128)
punpcklbw mm2, mm0 ; zero-extend the 4 v bytes to words
psubw mm2, [Minusg] ; subtract Minusg from each v word (original note: v-128)
movq mm3, mm1 ; copy of the u words, used for the high (u,v) pairs
movq mm5, mm1 ; second copy of the u words, used later for the B terms
punpcklwd mm1, mm2 ; interleave the low 2 (u,v) word pairs
pmaddwd mm1, [UVtG] ; multiply-add each (u,v) pair with UVtG -> 2 dword chroma-G terms
punpckhwd mm3, mm2 ; interleave the high 2 (u,v) word pairs
pmaddwd mm3, [UVtG] ; same multiply-add for the high pairs
movq [temp_mmx+esp], mm2 ; park the v words for the R computation below
movq mm6, [ebp+2*ebx] ; mm6 = 8 y bytes of the even row
psubusb mm6, [Yadd] ; subtract Yadd with unsigned saturation (original note: y-16)
packssdw mm1, mm3 ; pack the 4 dword chroma-G terms to signed words
movq mm7, mm6 ; copy of the 8 adjusted y bytes
punpcklbw mm6, mm0 ; low 4 y values as words
pmullw mm6, [Ymul] ; low 4 luma terms
punpckhbw mm7, mm0 ; high 4 y values as words
pmullw mm7, [Ymul] ; high 4 luma terms
movq mm4, mm1 ; copy of the chroma-G terms for the high half
movq [temp_mmx+esp+8], mm1 ; park the 4 chroma-G terms for the odd row
punpcklwd mm1, mm1 ; duplicate the low 2 chroma-G terms (one term per 2 pixels)
movq mm0, mm6 ; mm0 = low luma terms, kept for B and R
punpckhwd mm4, mm4 ; duplicate the high 2 chroma-G terms
movq mm3, mm7 ; mm3 = high luma terms, kept for B and R
psubw mm6, mm1 ; low 4 G = luma - chroma-G
psraw mm6, [esp+GRightShift] ; scale low G down by the count in GRightShift
psubw mm7, mm4 ; high 4 G = luma - chroma-G (signed 16 bit)
movq mm2, mm5 ; copy of the u words for the high half
punpcklwd mm5, mm5 ; duplicate the low 2 u words
pmullw mm5, [UtB] ; low chroma-B terms
punpckhwd mm2, mm2 ; duplicate the high 2 u words
psraw mm7, [esp+GRightShift] ; scale high G down
pmullw mm2, [UtB] ; high chroma-B terms
packuswb mm6, mm7 ; mm6: G7 G6 G5 G4 G3 G2 G1 G0
movq [temp_mmx+esp+16], mm5 ; park low chroma-B terms for the odd row
paddw mm5, mm0 ; 4 low B values in signed 16 bit
movq [temp_mmx+esp+40], mm2 ; park high chroma-B terms for the odd row
paddw mm2, mm3 ; 4 high B values in signed 16 bit
psraw mm5, [esp+BRightShift] ; scale low B down (original note: by 6+(8-5))
psraw mm2, [esp+BRightShift] ; scale high B down (original note: by 6+(8-5))
packuswb mm5, mm2 ; mm5: B7 B6 B5 B4 B3 B2 B1 B0
movq mm2, [temp_mmx+esp] ; reload the 4 v words
movq mm1, mm5 ; mm1 = B bytes (this copy is the one clamped and packed)
movq mm7, mm2 ; copy of the v words for the high half
punpcklwd mm2, mm2 ; duplicate the low 2 v words
pmullw mm2, [VtR] ; low chroma-R terms
punpckhwd mm7, mm7 ; duplicate the high 2 v words
pmullw mm7, [VtR] ; high chroma-R terms
paddusb mm1, [esp+BUpperLimit] ; clamp B, step 1: saturating add of the limit...
movq [temp_mmx+esp+24], mm2 ; park low chroma-R terms for the odd row
paddw mm2, mm0 ; 4 low R values in signed 16 bit
psraw mm2, [esp+RRightShift] ; scale low R down (original note: by 6+(8-5))
pxor mm4, mm4 ; mm4 = 0; NOTE(review): mm4 is overwritten in the odd-row code before it is read again
movq [temp_mmx+esp+32], mm7 ; park high chroma-R terms for the odd row
paddw mm7, mm3 ; 4 high R values in signed 16 bit
psraw mm7, [esp+RRightShift] ; scale high R down (original note: by 6+(8-5))
psubusb mm1, [esp+BUpperLimit] ; ...step 2: saturating subtract -> each B byte <= 0FFh - limit
packuswb mm2, mm7 ; mm2: R7 R6 R5 R4 R3 R2 R1 R0
paddusb mm6, [esp+GUpperLimit] ; clamp G with the same add/subtract pair
psubusb mm6, [esp+GUpperLimit] ; each G byte <= 0FFh - limit
paddusb mm2, [esp+RUpperLimit] ; clamp R with the same add/subtract pair
psubusb mm2, [esp+RUpperLimit] ; each R byte <= 0FFh - limit
; Pack the three 8-byte colour vectors into eight 16-bit pixels.
; input:
; mm6: G7 G6 G5 G4 G3 G2 G1 G0
; mm1: B7 B6 B5 B4 B3 B2 B1 B0
; mm2: R7 R6 R5 R4 R3 R2 R1 R0
; assuming 8 original pixels in 0-H representation in mm6, mm1, mm2
; where H=2**xBITS-1 (x is for R G B)
; output:
; mm1- result: 4 low RGB16
; mm7- result: 4 high RGB16
; using: mm0- zero register
; mm3- temporary results
; algorithm (as given by the original author):
; for (i=0; i<8; i++) {
; RGB[i]=256*(R[i]<<(8-5))+(G[i]<<5)+B[i];
; }
psllq mm2, [esp+RLeftShift] ; move R to the top of each byte (whole-qword shift; relies on R being clamped above so no bits cross byte boundaries)
movq mm7, mm1 ; mm7 = copy of B for the high half
; note: B needs no shift, it already sits in the least significant bits of its byte.
; Interleaving B (low byte) with shifted R (high byte) yields R:B words.
punpcklbw mm1, mm2 ; mm1: 4 low 16 bit RB
pxor mm0, mm0 ; mm0: 0
punpckhbw mm7, mm2 ; mm7: 4 high 16 bit RB
movq mm3, mm6 ; mm3: G
punpcklbw mm6, mm0 ; mm6: low 4 G 16 bit
psllw mm6, [esp+GLeftShift] ; shift low G into its field (original note: 5 positions)
punpckhbw mm3, mm0 ; mm3: high 4 G 16 bit
por mm1, mm6 ; mm1: low 4 finished pixels
psllw mm3, [esp+GLeftShift] ; shift high G into its field
por mm7, mm3 ; mm7: high 4 finished pixels
mov ebp,[esp+tmpYCursorOdd] ; loaded early to save cycles before the odd row
movq [edi], mm1 ; store even-row pixels 0-3 (original note: aligned)
; ---- odd row: reuses the chroma terms parked in temp_mmx ----
movq mm1, [ebp+2*ebx] ; mm1 = 8 y bytes of the odd row
pxor mm2, mm2 ; mm2 = 0 for the byte->word unpack
psubusb mm1, [Yadd] ; subtract Yadd with unsigned saturation (original note: y-16)
movq mm5, mm1 ; copy of the adjusted y bytes
punpcklbw mm1, mm2 ; low 4 y values as words
pmullw mm1, [Ymul] ; low 4 luma terms
punpckhbw mm5, mm2 ; high 4 y values as words
pmullw mm5, [Ymul] ; high 4 luma terms
movq [edi+8], mm7 ; store even-row pixels 4-7 (original note: aligned)
movq mm0, mm1 ; copy of the low luma terms for R
paddw mm0, [temp_mmx+esp+24] ; low 4 R
movq mm6, mm5 ; copy of the high luma terms (for B, then G)
psraw mm0, [esp+RRightShift] ; scale low R down (original note: by 6+(8-5))
paddw mm5, [temp_mmx+esp+32] ; high 4 R
movq mm2, mm1 ; copy of the low luma terms for B
psraw mm5, [esp+RRightShift] ; scale high R down (original note: by 6+(8-5))
paddw mm2, [temp_mmx+esp+16] ; low 4 B
packuswb mm0, mm5 ; mm0: R7 R6 R5 R4 R3 R2 R1 R0
psraw mm2, [esp+BRightShift] ; scale low B down (original note: by 6+(8-5))
movq mm5, mm6 ; mm5 = high luma terms, kept for G
paddw mm6, [temp_mmx+esp+40] ; high 4 B
psraw mm6, [esp+BRightShift] ; scale high B down (original note: by 6+(8-5))
movq mm3, [temp_mmx+esp+8] ; reload the 4 chroma-G terms
packuswb mm2, mm6 ; mm2: B7 B6 B5 B4 B3 B2 B1 B0
movq mm4, mm3 ; copy of the chroma-G terms for the high half
punpcklwd mm3, mm3 ; duplicate the low 2 chroma-G terms
punpckhwd mm4, mm4 ; duplicate the high 2 chroma-G terms
psubw mm1, mm3 ; 4 low G
psraw mm1, [esp+GRightShift] ; scale low G down
psubw mm5, mm4 ; 4 high G values in signed 16 bit
psraw mm5, [esp+GRightShift] ; scale high G down
paddusb mm2, [esp+BUpperLimit] ; clamp B (mm2), step 1: saturating add of the limit
packuswb mm1, mm5 ; mm1: G7 G6 G5 G4 G3 G2 G1 G0
psubusb mm2, [esp+BUpperLimit] ; clamp B, step 2: each byte <= 0FFh - limit
paddusb mm1, [esp+GUpperLimit] ; clamp G, step 1
psubusb mm1, [esp+GUpperLimit] ; clamp G, step 2
paddusb mm0, [esp+RUpperLimit] ; clamp R, step 1
mov eax,[esp+tmpCCOPitch] ; eax = byte offset from the even output row to the odd one (0 when the row pair is collapsed)
psubusb mm0, [esp+RUpperLimit] ; clamp R, step 2
; Pack the odd-row colour vectors into eight 16-bit pixels.
; mm1: G7 G6 G5 G4 G3 G2 G1 G0
; mm2: B7 B6 B5 B4 B3 B2 B1 B0
; mm0: R7 R6 R5 R4 R3 R2 R1 R0
; output:
; mm2- result: 4 low RGB16
; mm7- result: 4 high RGB16
; using: mm4- zero register
; mm3- temporary results
psllq mm0, [esp+RLeftShift] ; move R to the top of each byte
movq mm7, mm2 ; mm7: Save B
; note: B needs no shift, it already sits in the least significant bits of its byte.
; Interleaving B (low byte) with shifted R (high byte) yields R:B words.
punpcklbw mm2, mm0 ; mm2: 4 low 16 bit RB
pxor mm4, mm4 ; mm4: 0
movq mm3, mm1 ; mm3: G
punpckhbw mm7, mm0 ; mm7: 4 high 16 bit RB
punpcklbw mm1, mm4 ; mm1: low 4 G 16 bit
punpckhbw mm3, mm4 ; mm3: high 4 G 16 bit
psllw mm1, [esp+GLeftShift] ; shift low G into its field (original note: 5 positions)
por mm2, mm1 ; mm2: low 4 finished pixels
psllw mm3, [esp+GLeftShift] ; shift high G into its field
por mm7, mm3 ; mm7: high 4 finished pixels
movq [edi+eax], mm2 ; store odd-row pixels 0-3
movq [edi+eax+8], mm7 ; store odd-row pixels 4-7 (original note: aligned)
add edi, 16 ; advance output by 16 bytes (8 pixels x 16 bit)
add ebx, 4 ; advance by 4 chroma samples = 8 luma pixels (luma is indexed with 2*ebx)
jl near do_next_8x2_block ; repeat while the column index is still negative
; ---- end of one row pair: step every cursor to the next pair ----
; (An earlier macro-based variant, which updated the aspect counter here and
;  ended the loop on a YLimit comparison, was left commented out by the
;  original author; the live logic below replaces it.)

        ; Output cursor: jump from the end of the even row to the start of the
        ; next one, then over the odd row. tmpCCOPitch is 0 when the pair was
        ; collapsed into a single output row, so nothing is skipped in that case.
        add     edi, [esp+CCOSkipDistance]
        add     edi, [esp+tmpCCOPitch]

        ; Luma cursors: the new even row is the row after the old odd row, and
        ; the new odd row is one pitch further.
        mov     ebp, [esp+tmpYCursorOdd]
        mov     eax, [esp+YPitch]
        add     ebp, eax
        mov     [esp+tmpYCursorEven], ebp
        add     ebp, eax
        mov     [esp+tmpYCursorOdd], ebp

        ; Chroma cursors: one chroma row serves both luma rows, so advance once.
        add     esi, [esp+ChromaPitch]
        add     edx, [esp+ChromaPitch]

        ; Two rows done. Only the low 16 bits of the FrameHeight slot are
        ; updated and tested.
        ; NOTE(review): confirm the slot never holds a value wider than 16 bits.
        sub     word [esp+FrameHeight], 2
        ja      near PrepareChromaLine              ; rows remain (unsigned > 0)
;------------------------------------------------------------------------------
; finish: common exit path.
; Clears the MMX state, releases the LocalFrameSize bytes of locals and restores
; the four saved registers. The pop order here must mirror the push order of
; the prologue, which is not visible in this part of the file.
;------------------------------------------------------------------------------
finish:
        emms                                        ; empty the MMX state before returning
        add     esp, LocalFrameSize                 ; release the local frame
        pop     ebx
        pop     ebp
        pop     edi
        pop     esi
        ret
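; ---- code-viewer page residue (not part of the program): keyboard shortcuts ----
;   Copy code             Ctrl + C
;   Search code           Ctrl + F
;   Full-screen mode      F11
;   Toggle theme          Ctrl + Shift + D
;   Show shortcuts        ?
;   Increase font size    Ctrl + =
;   Decrease font size    Ctrl + -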