/// baojinlong@sohu.com
// If you have any problems, contact me.
// The plain asm function is nearly as fast as the MMX one!
/// asm code
// Saturation lookup table. After init_clip() succeeds, clip[i] is valid for
// i in [-1024, 1023]: 0 for i < 0, i for 0..255, and 255 for i > 255.
unsigned char *clip;

// Allocates the 2048-entry table and points `clip` at its middle so that
// negative indices can be used directly.
// If the allocation fails, `clip` is left unchanged (the original code wrote
// through an invalid pointer in that case).
void init_clip()
{
    unsigned char *base = (unsigned char *)malloc(2048);
    if (base == 0)
        return;

    clip = base + 1024;
    for (int i = -1024; i < 1024; i++) {
        if (i < 0)
            clip[i] = 0;
        else if (i > 255)
            clip[i] = 255;
        else
            clip[i] = (unsigned char)i;
    }
}
// Reference conversion:
//   r = 1.164(y-16) + 1.596(v-128)
//   g = 1.164(y-16) - 0.391(u-128) - 0.813(v-128)
//   b = 1.164(y-16) + 2.018(u-128)

// Coefficients scaled by 64 (6 fractional bits) and rounded.
const static int p_1164 = 75;   // 1.164 * 64
const static int p_1596 = 102;  // 1.596 * 64
const static int p_0391 = 25;   // 0.391 * 64
const static int p_0813 = 52;   // 0.813 * 64
const static int p_2018 = 129;  // 2.018 * 64

// Byte masks: even bytes / odd bytes of a 32-bit word.
const static int ooffooff = 0x00ff00ff;
const static int ffooffoo = 0xff00ff00;

// Constant offsets, one copy per 16-bit lane. Twice each value, taken
// modulo 65536, is the offset scaled by 64:
//   2*25632 == -14272 (mod 65536) = -223    * 64
//   2*4349  ==   8698             ~  135.9  * 64
//   2*23906 == -17724 (mod 65536) ~ -276.93 * 64
const static short p_223[] = { 25632, 25632 };
const static short p_135[] = { 4349, 4349 };
const static short p_277[] = { 23906, 23906 };
// Converts planar YUV 4:2:0 to 4-byte pixels using plain x86 integer
// instructions, two 16-bit lanes packed in each 32-bit register.
//
//   y    - luma plane, read as rows of `w` bytes
//   u, v - chroma planes; two bytes of each are consumed per 4 luma pixels
//          and the same chroma pair is applied to two consecutive luma rows
//   r    - output; each pixel is written as bytes B, G, R at offsets 0, 1, 2
//          (the byte at offset 3 is never written); rows are w*4 bytes apart
//   h    - height of the y matrix; rows are processed in pairs (h >> 1)
//   w    - width of the y matrix; pixels are processed in groups of 4 (w >> 2)
//
// Chroma type must be 4:2:0. init_clip() must have been called first: the
// `clip` table is used to saturate every output byte.
//
// Formulas implemented (coefficients scaled by 64):
//   r = 1.164*y + 1.596*v - 223
//   g = 1.164*y - 0.391*u - 0.813*v + 135.9
//   b = 1.164*y + 2.018*u - 276.93
//
// Relative to the collapsed original text, this version restores one
// instruction per line so that trailing `//` comments no longer swallow the
// instructions that followed them, and returns early when there is no full
// 4x2 block to convert (the loop counters would otherwise wrap around).
void paroll_yuv2rgb(unsigned char *y, unsigned char *u, unsigned char *v, unsigned char *r, int h , int w)
{
    // Packed (2 x 16-bit) partial products for the current 4-pixel group.
    int py1164_20;          // 75*y2 | 75*y0
    int py1164_31;          // 75*y3 | 75*y1
    int pv1596;             // 102*v1 | 102*v0
    int pv0813;             // 52*v1  | 52*v0
    int pu0391;             // 25*u1  | 25*u0
    int pu2018;             // 129*u1 | 129*u0
    // Packed results before clipping (still scaled by 64).
    int pr20, pr31, pg20, pg31, pb20, pb31;

    int rw = w << 2;        // bytes per output row
    int rws16 = rw - 16;    // step back to the first row, 4 pixels further on
    int lw = w >> 2;        // 4-pixel groups left in the current row pair
    int lh = h >> 1;        // row pairs left
    int lw0 = lw;           // reload value for lw
    int iclip = (int)clip;  // clip table base as an integer for address math

    if (lw <= 0 || lh <= 0)
        return;             // no complete 4x2 block; avoids counter wrap-around

    __asm {
        mov     esi,y

llw:
        // ---- load two V and two U samples, form the chroma products ----
        mov     edi,v
        add     [v],2
        movzx   ebx,byte ptr [edi]
        movzx   eax,byte ptr [edi+1]
        mov     edi,u
        add     [u],2
        shl     eax,16
        or      eax,ebx                     // 00 v1 00 v0
        movzx   ecx,byte ptr [edi+1]
        mov     ebx,eax
        mul     [p_0813]
        shl     ecx,16
        mov     [pv0813],eax
        mov     eax,ebx
        mul     dword ptr p_1596
        movzx   ebx,byte ptr [edi]
        mov     [pv1596],eax
        mov     eax,ecx
        or      eax,ebx                     // 00 u1 00 u0
        mov     ecx,[esi]                   // y3 y2 y1 y0
        mov     ebx,eax
        mul     dword ptr p_0391
        mov     edi,ecx
        mov     [pu0391],eax
        mov     eax,ebx
        mul     dword ptr p_2018
        and     ecx,ooffooff                // 0 y2 0 y0
        mov     [pu2018],eax

        // ---- first row: luma products ----
        mov     eax,ecx
        mul     [p_1164]                    // y2 y0
        and     edi,ffooffoo                // y3 0 y1 0
        mov     [py1164_20],eax
        mov     eax,edi
        shr     eax,8                       // 0 y3 0 y1
        mul     [p_1164]                    // y3 y1

        // red
        mov     ecx,[pv1596]
        mov     ebx,[py1164_20]
        mov     edx,dword ptr p_223
        mov     [py1164_31],eax
        add     eax,ecx
        add     ebx,ecx
        shr     eax,1
        shr     ebx,1
        add     eax,edx
        add     ebx,edx
        shl     eax,1
        shl     ebx,1
        mov     [pr31],eax                  // r3 r1
        mov     [pr20],ebx                  // r2 r0

        // blue
        mov     ecx,[pu2018]
        mov     eax,[py1164_20]
        mov     ebx,[py1164_31]
        mov     edx,dword ptr [p_277]
        add     eax,ecx
        add     ebx,ecx
        shr     eax,1
        shr     ebx,1
        add     eax,edx
        add     ebx,edx
        shl     eax,1
        shl     ebx,1
        mov     [pb20],eax
        mov     [pb31],ebx

        // green
        mov     eax,[py1164_20]
        mov     ebx,[py1164_31]
        mov     ecx,[pu0391]
        mov     edx,[pv0813]
        shr     eax,1
        shr     ebx,1
        shr     ecx,1
        shr     edx,1
        sub     eax,ecx
        sub     ebx,ecx
        mov     ecx,dword ptr [p_135]
        sub     eax,edx
        sub     ebx,edx
        add     eax,ecx
        add     ebx,ecx
        shl     eax,1
        shl     ebx,1
        mov     [pg20],eax
        mov     [pg31],ebx

        // ---- first row: clip and output ----
        mov     edi,r

        lea     edx, [pr20]
        mov     ecx,iclip
        movsx   eax,word ptr [edx]
        movsx   ebx,word ptr [edx+2]
        sar     eax,6
        sar     ebx,6
        add     eax,ecx
        add     ebx,ecx
        xor     ecx,ecx
        xor     edx,edx
        mov     cl,[eax]                    // r0
        mov     dl,[ebx]                    // r2
        mov     [edi+2],cl
        mov     [edi+10],dl

        lea     edx,[pr31]
        mov     ecx,iclip
        movsx   eax,word ptr [edx]
        movsx   ebx,word ptr [edx+2]
        sar     eax,6
        sar     ebx,6
        add     eax,ecx
        add     ebx,ecx
        xor     ecx,ecx
        xor     edx,edx
        mov     cl,[eax]                    // r1
        mov     dl,[ebx]                    // r3
        mov     [edi+6],cl
        mov     [edi+14],dl

        lea     edx,[pg20]
        mov     ecx,iclip
        movsx   eax,word ptr [edx]
        movsx   ebx,word ptr [edx+2]
        sar     eax,6
        sar     ebx,6
        add     eax,ecx
        add     ebx,ecx
        xor     ecx,ecx
        xor     edx,edx
        mov     cl,[eax]                    // g0
        mov     dl,[ebx]                    // g2
        mov     [edi+1],cl
        mov     [edi+9],dl

        lea     edx,[pg31]
        mov     ecx,iclip
        movsx   eax,word ptr [edx]
        movsx   ebx,word ptr [edx+2]
        sar     eax,6
        sar     ebx,6
        add     eax,ecx
        add     ebx,ecx
        xor     ecx,ecx
        xor     edx,edx
        mov     cl,[eax]                    // g1
        mov     dl,[ebx]                    // g3
        mov     [edi+5],cl
        mov     [edi+13],dl

        lea     edx,[pb20]
        mov     ecx,iclip
        movsx   eax,word ptr [edx]
        movsx   ebx,word ptr [edx+2]
        sar     eax,6
        sar     ebx,6
        add     eax,ecx
        add     ebx,ecx
        xor     ecx,ecx
        xor     edx,edx
        mov     cl,[eax]                    // b0
        mov     dl,[ebx]                    // b2
        mov     [edi],cl
        mov     [edi+8],dl

        lea     edx,[pb31]
        mov     ecx,iclip
        movsx   eax,word ptr [edx]
        movsx   ebx,word ptr [edx+2]
        sar     eax,6
        sar     ebx,6
        add     eax,ecx
        add     ebx,ecx
        xor     ecx,ecx
        xor     edx,edx
        mov     cl,[eax]                    // b1
        mov     dl,[ebx]                    // b3
        // advance to the second row before the last two stores
        mov     ebx,rw
        add     esi,w
        add     [r],ebx
        mov     eax,[esi]                   // y3 y2 y1 y0
        mov     [edi+4],cl
        mov     [edi+12],dl

        // ---- next row of y: luma products (chroma products are reused) ----
        mov     ebx,eax
        and     eax,ooffooff                // 0 y2 0 y0
        mul     [p_1164]
        and     ebx,ffooffoo                // y3 0 y1 0
        shr     ebx,8
        mov     [py1164_20],eax
        mov     eax,ebx
        mul     [p_1164]

        // red
        mov     ecx,pv1596
        mov     ebx,py1164_20
        mov     edx,dword ptr p_223
        mov     [py1164_31],eax
        add     eax,ecx
        add     ebx,ecx
        shr     eax,1
        shr     ebx,1
        add     eax,edx
        add     ebx,edx
        shl     eax,1
        shl     ebx,1
        mov     [pr31],eax                  // r3 r1
        mov     [pr20],ebx                  // r2 r0

        // blue
        mov     ecx,[pu2018]
        mov     eax,[py1164_20]
        mov     ebx,[py1164_31]
        mov     edx,dword ptr [p_277]
        add     eax,ecx
        add     ebx,ecx
        shr     eax,1
        shr     ebx,1
        add     eax,edx
        add     ebx,edx
        shl     eax,1
        shl     ebx,1
        mov     [pb20],eax
        mov     [pb31],ebx

        // green
        mov     ecx,[pu0391]
        mov     eax,[py1164_20]
        mov     ebx,[py1164_31]
        mov     edx,[pv0813]
        shr     ecx,1
        shr     eax,1
        shr     ebx,1
        shr     edx,1
        sub     eax,ecx
        sub     ebx,ecx
        mov     ecx,dword ptr [p_135]
        sub     eax,edx
        sub     ebx,edx
        add     eax,ecx
        add     ebx,ecx
        shl     eax,1
        shl     ebx,1
        mov     [pg20],eax
        mov     [pg31],ebx

        // ---- second row: clip and output ----
        mov     edi,r

        lea     edx,[pr20]
        mov     ecx,iclip
        movsx   eax,word ptr [edx]
        movsx   ebx,word ptr [edx+2]
        sar     eax,6
        sar     ebx,6
        add     eax,ecx
        add     ebx,ecx
        xor     ecx,ecx
        xor     edx,edx
        mov     cl,[eax]                    // r0
        mov     dl,[ebx]                    // r2
        mov     [edi+2],cl
        mov     [edi+10],dl

        lea     edx,[pr31]
        mov     ecx,iclip
        movsx   eax,word ptr [edx]
        movsx   ebx,word ptr [edx+2]
        sar     eax,6
        sar     ebx,6
        add     eax,ecx
        add     ebx,ecx
        xor     ecx,ecx
        xor     edx,edx
        mov     cl,[eax]                    // r1
        mov     dl,[ebx]                    // r3
        mov     [edi+6],cl
        mov     [edi+14],dl

        lea     edx,[pg20]
        mov     ecx,iclip
        movsx   eax,word ptr [edx]
        movsx   ebx,word ptr [edx+2]
        sar     eax,6
        sar     ebx,6
        add     eax,ecx
        add     ebx,ecx
        xor     ecx,ecx
        xor     edx,edx
        mov     cl,[eax]                    // g0
        mov     dl,[ebx]                    // g2
        mov     [edi+1],cl
        mov     [edi+9],dl

        lea     edx,[pg31]
        mov     ecx,iclip
        movsx   eax,word ptr [edx]
        movsx   ebx,word ptr [edx+2]
        sar     eax,6
        sar     ebx,6
        add     eax,ecx
        add     ebx,ecx
        xor     ecx,ecx
        xor     edx,edx
        mov     cl,[eax]                    // g1
        mov     dl,[ebx]                    // g3
        mov     [edi+5],cl
        mov     [edi+13],dl

        lea     edx,[pb20]
        mov     ecx,iclip
        movsx   eax,word ptr [edx]
        movsx   ebx,word ptr [edx+2]
        sar     eax,6
        sar     ebx,6
        add     eax,ecx
        add     ebx,ecx
        xor     ecx,ecx
        xor     edx,edx
        mov     cl,[eax]                    // b0
        mov     dl,[ebx]                    // b2
        mov     [edi],cl
        mov     [edi+8],dl

        lea     edx,[pb31]
        mov     ecx,iclip
        movsx   eax,word ptr [edx]          // b1
        movsx   ebx,word ptr [edx+2]        // b3
        sar     eax,6
        sar     ebx,6
        add     eax,ecx
        add     ebx,ecx
        xor     ecx,ecx
        xor     edx,edx
        mov     cl,[eax]                    // b1
        mov     dl,[ebx]                    // b3
        mov     [edi+4],cl
        mov     [edi+12],dl

        // ---- back to the first row, 4 pixels to the right ----
        mov     eax,rws16
        sub     esi,w
        add     esi,4
        sub     [r],eax
        sub     [lw],1
        jnz     llw

        // ---- row pair finished: skip the second row, reload the counter ----
        mov     eax,lw0
        mov     ebx,rw
        add     esi,w
        add     [r],ebx
        mov     [lw],eax
        sub     [lh],1
        jnz     llw
    }
}
// asm code end
// mmx code begin
#ifdef __yuv2rgb_mul32
// Per-lane bias values subtracted from luma (16) and chroma (128).
const static short t16[4] = { 16, 16, 16, 16 };
const static short t128[4] = { 128, 128, 128, 128 };

// Conversion coefficients scaled by 4096 (12 fractional bits), one copy per
// 16-bit lane.
const short t1164[4] = { 4768, 4768, 4768, 4768 };  // 1.164 * 4096
const short t1596[4] = { 6538, 6538, 6538, 6538 };  // 1.596 * 4096
const short t0391[4] = { 1602, 1602, 1602, 1602 };  // 0.391 * 4096
const short t0813[4] = { 3330, 3330, 3330, 3330 };  // 0.813 * 4096
const short t2018[4] = { 8266, 8266, 8266, 8266 };  // 2.018 * 4096

// r = 1.164(y-16) + 1.596(v-128)
// g = 1.164(y-16) - 0.391(u-128) - 0.813(v-128)
// b = 1.164(y-16) + 2.018(u-128)

// Tested with `#ifdef ___0rgb` inside yuv2rgb4XmmxC420 to choose the byte
// order of the stored pixels.
#define ___0rgb
// VideoPlayer::yuv2rgb4XmmxC420 -- variant compiled when __yuv2rgb_mul32 is defined.
//
// MMX conversion of planar YUV 4:2:0 to 4-byte pixels using 32-bit intermediate
// products (pmullw/pmulhw pairs recombined with punpck*wd) and a final
// arithmetic shift right by 12.
//
//   lpY, lpU, lpV - source planes
//   lpRGB         - destination; rows are rgbwidth = nSrcWidth<<2 bytes apart
//   nSrcHeight    - the outer loop runs nSrcHeight>>1 times (two rows per pass)
//   nSrcWidth     - the inner loop runs nSrcWidth>>3 times (8 pixels per pass)
//
// Per inner pass: 4 bytes are loaded from lpU and from lpV, 8 luma bytes are
// loaded from the current row and 8 from the row nyw bytes further on, and
// 32 bytes are stored to each of the two corresponding output rows.
// The chroma products are kept in the t*_10 / t*_32 locals and reused for
// the second row.
//
// NOTE(review): this text appears to have lost its original line breaks.
// Several instructions and declarations follow a `//` or `file://` marker on
// the same physical line (`file://` looks like a garbled `//`), and the
// #ifdef/#else/#endif directives are fused into instruction lines. Restore
// one statement per line before compiling.
// NOTE(review): the Y loads are written `movq mm3,dword ptr [eax]`; a movq
// moves 8 bytes, so `qword ptr` is presumably intended -- confirm.
void VideoPlayer::yuv2rgb4XmmxC420(unsigned char *lpY, unsigned char *lpU, unsigned char *lpV, unsigned char *lpRGB, int nSrcHeight , int nSrcWidth)
{ int rgbwidth=nSrcWidth<<2;// 32 bits 0rgb; int nyw=nSrcWidth; int col=nSrcWidth>>3; int row=nSrcHeight>>1;
// 32-bit chroma products; suffix _10 holds samples 1 and 0, _32 holds samples 3 and 2.
int t1596v_128_10[2]; int t1596v_128_32[2]; int t0813v_128_10[2]; int t0813v_128_32[2]; int t0391u_128_10[2]; int t0391u_128_32[2]; int t2018u_128_10[2]; int t2018u_128_32[2];
// Register roles: esi = U pointer, edi = V pointer, eax = Y pointer,
// edx = output pointer, ecx = column counter, ebx = row-pair counter.
__asm { mov esi,lpU mov edi,lpV mov eax,lpY mov edx,lpRGB mov ecx,col mov ebx,row
rrr: pxor mm0,mm0
// Chroma stage: (u-128) and (v-128) times each coefficient, stored to the t* locals.
movq mm3,qword ptr t128 movd mm2,dword ptr [edi] file://00 00 00 00 v3 v2 v1 v0 movd mm1,dword ptr [esi] file://00 00 00 00 u3 u2 u1 u0 punpcklbw mm2,mm0 file://00 v3 00 v2 00 v1 00 v0 punpcklbw mm1,mm0 file://00 u3 00 u2 00 u1 00 u0 psubsw mm1,mm3 file://u-128 psubsw mm2,mm3 file://v-128file://compute u,v datafile://t0391u_128 movq mm7,qword ptr t0391 movq mm3,mm1 movq mm4,mm1 pmullw mm4,mm7 pmulhw mm3,mm7 movq mm7,mm4 punpckhwd mm4,mm3 file://t0391u_128_32-->mm4 punpcklwd mm7,mm3 file://t0391u_128_10-->mm7 movq qword ptr t0391u_128_32,mm4 movq qword ptr t0391u_128_10,mm7file://t2018u_128 movq mm7,qword ptr t2018 movq mm3,mm1 pmullw mm1,mm7 pmulhw mm3,mm7 movq mm7,mm1 punpckhwd mm1,mm3 file://t2018u_128_32-->mm1 punpcklwd mm7,mm3 file://t2018u_128_10-->mm7 movq qword ptr t2018u_128_32,mm1 movq qword ptr t2018u_128_10,mm7file://t1596v_128 movq mm7,qword ptr t1596 movq mm3,mm2 movq mm4,mm2 pmullw mm4,mm7 pmulhw mm3,mm7 movq mm7,mm4 punpckhwd mm4,mm3 file://t1596v_128_32-->mm4 punpcklwd mm7,mm3 file://t1596v_128_10-->mm7 movq qword ptr t1596v_128_32,mm4 movq qword ptr t1596v_128_10,mm7file://t0813v_128 movq mm7,qword ptr t0813 movq mm3,mm2 pmullw mm2,mm7 pmulhw mm3,mm7 movq mm7,mm2 punpckhwd mm2,mm3 file://t0813v_128_32-->mm2 punpcklwd mm7,mm3 file://t0813v_128_10-->mm7 movq qword ptr t0813v_128_32,mm2 movq qword ptr t0813v_128_10,mm7
// First Y row: load 8 luma bytes, reorder lanes, subtract 16, multiply by t1164.
movq mm3,dword ptr [eax] // 76 54 32 10 pxor mm0,mm0 movq mm2,mm3 punpcklbw mm2,mm0 // 03 02 01 00 punpckhbw mm3,mm0 // 07 06 05 04 movq mm4,mm2 movq mm5,mm3 punpcklwd mm2,mm0 // 00 01 00 00 punpckhwd mm0,mm4 // 03 00 02 00 pxor mm4,mm4 por mm0,mm2 // 03 01 02 00--->mm0 movq mm7,qword ptr t16 punpcklwd mm3,mm4 // 00 05 00 04 punpckhwd mm4,mm5 // 07 00 06 00 por mm4,mm3 // 07 05 06 04-->mm5 psubsw mm0,mm7 file://y-16 movq mm5,mm4 psubsw mm5,mm7 file://y-16file://compute movq mm7,qword ptr t1164 movq mm6,mm0 file://y3 y1 y2 y0 pmullw mm6,mm7 pmulhw mm0,mm7 movq mm7,mm6 punpckhwd mm7,mm0 // y3 y1 file://1.164(y-16)-->mm7 punpcklwd mm6,mm0 // y2 y0 file://1.164(y-16)-->mm6
movq mm0,qword ptr t1596v_128_10 movq mm1,mm6 // y2 y0 movq mm2,mm7 // y3 y1// r=1.164(y-16)+1.596(v-128) paddd mm1,mm0 // r2 r0 paddd mm2,mm0 // r3 r1 psrad mm1,12 psrad mm2,12 movq mm0,mm1 punpckhdq mm1,mm2 // r3 r2 punpckldq mm0,mm2 // r1 r0 packssdw mm0,mm1 // r3 r2 r1 r0 --->mm0
movq mm1,qword ptr t0391u_128_10 movq mm4,qword ptr t0813v_128_10// g=1.164(y-16)-0.391(u-128)-0.813(v-128) movq mm2,mm6 movq mm3,mm7 psubd mm2,mm1 psubd mm3,mm1 psubd mm2,mm4 psubd mm3,mm4 psrad mm2,12 psrad mm3,12 movq mm4,mm2 movq mm1,qword ptr t2018u_128_10 punpckhdq mm2,mm3 punpckldq mm4,mm3 packssdw mm4,mm2 // g3 g2 g1 g0 --->mm4
// b=1.164(y-16)+2.018(u-128) paddd mm6,mm1 // b2 b0 paddd mm7,mm1 // b3 b1 psrad mm6,12 psrad mm7,12 movq mm1,mm6 punpckhdq mm1,mm7 punpckldq mm6,mm7 pxor mm2,mm2 packssdw mm6,mm1 // b3 b2 b1 b0 --->mm6
// b-->mm6,g-->mm4,r-->mm0
#ifdef ___0rgb packuswb mm6,mm2 packuswb mm4,mm2 packuswb mm0,mm2 punpcklbw mm6,mm4 // g3 b3 g2 b2 g1 b1 g0 b0 -->mm6 punpcklbw mm0,mm2 // 00 r3 00 r2 00 r1 00 r0 -->mm0 movq mm7,mm6 punpcklwd mm6,mm0 // 00 r1 g1 b1 00 r0 g0 b0 punpckhwd mm7,mm0 // 00 r3 g3 b3 00 r2 g2 b2 movq qword ptr[edx], mm6 movq qword ptr[edx+8], mm7#else packuswb mm0,mm2 file://r packuswb mm4,mm2 file://g packuswb mm6,mm2 file://b
punpcklbw mm0,mm4 // g3 r3 g2 r2 g1 r1 g0 r0 -->mm0 punpcklbw mm6,mm2 // 00 b3 00 b2 00 b1 00 b0 -->mm6 movq mm7,mm0
punpcklwd mm0,mm6 // 00 b1 g1 r1 00 r0 g0 b0 punpckhwd mm7,mm6 // 00 r3 g3 b3 00 r2 g2 b2 movq qword ptr[edx], mm0 movq qword ptr[edx+8], mm7#endiffile://compute movq mm7,qword ptr t1164 movq mm6,mm5 file://y7 y5 y6 y4 pmullw mm6,mm7 pmulhw mm5,mm7 movq mm7,mm6 punpckhwd mm7,mm5 // y7 y5 file://1.164(y-16)-->mm7 punpcklwd mm6,mm5 // y6 y4 file://1.164(y-16)-->mm6
movq mm0,qword ptr t1596v_128_32 movq mm1,mm6 // y6 y4 movq mm2,mm7 // y7 y5// r=1.164(y-16)+1.596(v-128) paddd mm1,mm0 // r2 r0 paddd mm2,mm0 // r3 r1 psrad mm2,12 psrad mm1,12 movq mm0,mm1 punpckhdq mm1,mm2 // r3 r2 punpckldq mm0,mm2 // r1 r0 packssdw mm0,mm1 // r3 r2 r1 r0 --->mm0
movq mm1,qword ptr t0391u_128_32 movq mm4,qword ptr t0813v_128_32file://g=1.164(y-16)-0.391(u-128)-0.813(v-128) movq mm2,mm6 movq mm3,mm7 psubd mm2,mm1 psubd mm3,mm1 psubd mm2,mm4 psubd mm3,mm4 psrad mm2,12 psrad mm3,12 movq mm1,qword ptr t2018u_128_32 movq mm4,mm2 punpckhdq mm2,mm3 punpckldq mm4,mm3 packssdw mm4,mm2 // g3 g2 g1 g0 --->mm4
// b=1.164(y-16)+2.018(u-128) paddd mm6,mm1 // b2 b0 paddd mm7,mm1 // b3 b1 psrad mm6,12 psrad mm7,12 movq mm1,mm6 punpckhdq mm1,mm7 punpckldq mm6,mm7 pxor mm2,mm2 packssdw mm6,mm1 // b3 b2 b1 b0 --->mm6
// b-->mm6,g-->mm4,r-->mm0#ifdef ___0rgb packuswb mm6,mm2 packuswb mm4,mm2 punpcklbw mm6,mm4 // g3 b3 g2 b2 g1 b1 g0 b0 -->mm6 packuswb mm0,mm2 punpcklbw mm0,mm2 // 00 r3 00 r2 00 r1 00 r0 -->mm0 movq mm7,mm6 punpcklwd mm6,mm0 // 00 r1 g1 b1 00 r0 g0 b0 punpckhwd mm7,mm0 // 00 r3 g3 b3 00 r2 g2 b2 movq qword ptr[edx+16], mm6 movq qword ptr[edx+24], mm7#else packuswb mm0,mm2 file://r packuswb mm4,mm2 file://g packuswb mm6,mm2 file://b
punpcklbw mm0,mm4 // g3 r3 g2 r2 g1 r1 g0 r0 -->mm0 punpcklbw mm6,mm2 // 00 b3 00 b2 00 b1 00 b0 -->mm6 movq mm7,mm0
punpcklwd mm0,mm6 // 00 b1 g1 r1 00 r0 g0 b0 punpckhwd mm7,mm6 // 00 r3 g3 b3 00 r2 g2 b2 movq qword ptr[edx+16], mm0 movq qword ptr[edx+24], mm7#endif/file://second stage , next row of y add eax,nyw add edx,rgbwidth
// Second Y row: same sequence as above, reusing the stored chroma products.
movq mm3,dword ptr [eax] // 76 54 32 10 pxor mm0,mm0 movq mm2,mm3 punpcklbw mm2,mm0 // 03 02 01 00 punpckhbw mm3,mm0 // 07 06 05 04 movq mm4,mm2 punpcklwd mm2,mm0 // 00 01 00 00 punpckhwd mm0,mm4 // 03 00 02 00 pxor mm4,mm4 por mm0,mm2 // 03 01 02 00--->mm0 movq mm7,qword ptr t16 movq mm5,mm3 punpcklwd mm3,mm4 // 00 05 00 04 punpckhwd mm4,mm5 // 07 00 06 00 por mm4,mm3 // 07 05 06 04-->mm4 psubsw mm0,mm7 file://y-16 movq mm5,mm4 psubsw mm5,mm7 file://y-16 file://compute movq mm7,qword ptr t1164 movq mm6,mm0 file://y3 y1 y2 y0 pmullw mm6,mm7 pmulhw mm0,mm7 movq mm7,mm6 punpckhwd mm7,mm0 // y3 y1 file://1.164(y-16)-->mm7 punpcklwd mm6,mm0 // y2 y0 file://1.164(y-16)-->mm6
movq mm0,qword ptr t1596v_128_10 movq mm1,mm6 // y2 y0 movq mm2,mm7 // y3 y1// r=1.164(y-16)+1.596(v-128) paddd mm1,mm0 // r2 r0 paddd mm2,mm0 // r3 r1 psrad mm2,12 psrad mm1,12 movq mm0,mm1 punpckhdq mm1,mm2 // r3 r2 punpckldq mm0,mm2 // r1 r0 packssdw mm0,mm1 // r3 r2 r1 r0 --->mm0
movq mm1,qword ptr t0391u_128_10 movq mm4,qword ptr t0813v_128_10file://g=1.164(y-16)-0.391(u-128)-0.813(v-128) movq mm2,mm6 movq mm3,mm7 psubd mm2,mm1 psubd mm3,mm1 psubd mm2,mm4 psubd mm3,mm4 psrad mm2,12 psrad mm3,12 movq mm4,mm2 movq mm1,qword ptr t2018u_128_10 punpckhdq mm2,mm3 punpckldq mm4,mm3 packssdw mm4,mm2 // g3 g2 g1 g0 --->mm4
// b=1.164(y-16)+2.018(u-128) paddd mm6,mm1 // b2 b0 paddd mm7,mm1 // b3 b1 psrad mm6,12 psrad mm7,12 movq mm1,mm6 punpckhdq mm1,mm7 punpckldq mm6,mm7 pxor mm2,mm2 packssdw mm6,mm1 // b3 b2 b1 b0 --->mm6
// b-->mm6,g-->mm4,r-->mm0#ifdef ___0rgb packuswb mm6,mm2 packuswb mm4,mm2 punpcklbw mm6,mm4 // g3 b3 g2 b2 g1 b1 g0 b0 -->mm6 packuswb mm0,mm2 punpcklbw mm0,mm2 // 00 r3 00 r2 00 r1 00 r0 -->mm0 movq mm7,mm6 punpcklwd mm6,mm0 // 00 r1 g1 b1 00 r0 g0 b0 punpckhwd mm7,mm0 // 00 r3 g3 b3 00 r2 g2 b2 movq qword ptr[edx], mm6 movq qword ptr[edx+8], mm7#else packuswb mm0,mm2 file://r packuswb mm4,mm2 file://g packuswb mm6,mm2 file://b
punpcklbw mm0,mm4 // g3 r3 g2 r2 g1 r1 g0 r0 -->mm0 punpcklbw mm6,mm2 // 00 b3 00 b2 00 b1 00 b0 -->mm6 movq mm7,mm0
punpcklwd mm0,mm6 // 00 b1 g1 r1 00 r0 g0 b0 punpckhwd mm7,mm6 // 00 r3 g3 b3 00 r2 g2 b2 movq qword ptr[edx], mm0 movq qword ptr[edx+8], mm7#endiffile://compute movq mm7,qword ptr t1164 movq mm6,mm5 file://y7 y5 y6 y4 pmullw mm6,mm7 pmulhw mm5,mm7 movq mm7,mm6 punpckhwd mm7,mm5 // y7 y5 file://1.164(y-16)-->mm7 punpcklwd mm6,mm5 // y6 y4 file://1.164(y-16)-->mm6
movq mm0,qword ptr t1596v_128_32 movq mm1,mm6 // y6 y4 movq mm2,mm7 // y7 y5// r=1.164(y-16)+1.596(v-128) paddd mm1,mm0 // r2 r0 paddd mm2,mm0 // r3 r1 psrad mm1,12 psrad mm2,12 movq mm0,mm1 punpckhdq mm1,mm2 // r3 r2 punpckldq mm0,mm2 // r1 r0 packssdw mm0,mm1 // r3 r2 r1 r0 --->mm0
movq mm1,qword ptr t0391u_128_32 movq mm4,qword ptr t0813v_128_32file://g=1.164(y-16)-0.391(u-128)-0.813(v-128) movq mm2,mm6 movq mm3,mm7 psubd mm2,mm1 psubd mm3,mm1 psubd mm2,mm4 psubd mm3,mm4 psrad mm2,12 psrad mm3,12 movq mm1,qword ptr t2018u_128_32 movq mm4,mm2 punpckhdq mm2,mm3 punpckldq mm4,mm3 packssdw mm4,mm2 // g3 g2 g1 g0 --->mm4
// b=1.164(y-16)+2.018(u-128) paddd mm6,mm1 // b2 b0 paddd mm7,mm1 // b3 b1 psrad mm6,12 psrad mm7,12 movq mm1,mm6 punpckhdq mm1,mm7 punpckldq mm6,mm7 pxor mm2,mm2 packssdw mm6,mm1 // b3 b2 b1 b0 --->mm6
// b-->mm6,g-->mm4,r-->mm0#ifdef ___0rgb packuswb mm6,mm2 packuswb mm4,mm2 punpcklbw mm6,mm4 // g3 b3 g2 b2 g1 b1 g0 b0 -->mm6 packuswb mm0,mm2 punpcklbw mm0,mm2 // 00 r3 00 r2 00 r1 00 r0 -->mm0 movq mm7,mm6 punpcklwd mm6,mm0 // 00 r1 g1 b1 00 r0 g0 b0 punpckhwd mm7,mm0 // 00 r3 g3 b3 00 r2 g2 b2 movq qword ptr[edx+16], mm6 movq qword ptr[edx+24], mm7#else packuswb mm0,mm2 file://r packuswb mm4,mm2 file://g packuswb mm6,mm2 file://b
punpcklbw mm0,mm4 // g3 r3 g2 r2 g1 r1 g0 r0 -->mm0 punpcklbw mm6,mm2 // 00 b3 00 b2 00 b1 00 b0 -->mm6 movq mm7,mm0
punpcklwd mm0,mm6 // 00 b1 g1 r1 00 r0 g0 b0 punpckhwd mm7,mm6 // 00 r3 g3 b3 00 r2 g2 b2 movq qword ptr[edx+16], mm0 movq qword ptr[edx+24], mm7#endif sub eax,nyw sub edx,rgbwidth add esi,4 add edi,4 add eax,8 add edx,32 dec ecx jnz rrr
// End of a row pair: reload the column counter, step both pointers past the
// second row, loop on the row-pair counter, then emms to leave MMX state.
mov ecx,col add eax,nyw add edx,rgbwidth dec ebx jnz rrr emms }}
#else
// Coefficients for the 16-bit pmulhw path, one copy per lane. pmulhw keeps
// the high 16 bits of the product, so each coefficient is scaled by
// 2^(16 - s), where s is the left shift applied to the other operand before
// the multiply (noted as "<<s" below).
// In the collapsed original line the "//<<s" comments swallowed each closing
// "};" and every declaration after the first; they are separated here.
short t1164[4] = { 19071, 19071, 19071, 19071 };  // 1.164 * 2^14, <<2
short t1596[4] = { 26149, 26149, 26149, 26149 };  // 1.596 * 2^14, <<2
short t0391[4] = { 25625, 25625, 25625, 25625 };  // 0.391 * 2^16, <<0
short t0813[4] = { 26641, 26641, 26641, 26641 };  // 0.813 * 2^15, <<1
short t2018[4] = { 16532, 16532, 16532, 16532 };  // 2.018 * 2^13, <<3

// Per-lane bias values subtracted from luma (16) and chroma (128).
short t16[4] = { 16, 16, 16, 16 };
short t128[4] = { 128, 128, 128, 128 };
// VideoPlayer::yuv2rgb4XmmxC420 -- variant compiled when __yuv2rgb_mul32 is NOT defined.
//
// MMX conversion of planar YUV 4:2:0 to 4-byte pixels using 16-bit
// arithmetic only: operands are pre-shifted with psllw and multiplied with
// pmulhw, then combined with saturating adds/subtracts and packed with
// packuswb.
//
//   lpY, lpU, lpV - source planes
//   lpRGB         - destination; rows are rgbwidth = nSrcWidth<<2 bytes apart
//   nSrcHeight    - the outer loop runs nSrcHeight>>1 times (two rows per pass)
//   nSrcWidth     - the inner loop runs nSrcWidth>>3 times (8 pixels per pass)
//
// Per inner pass: 4 bytes are loaded from lpU and from lpV; for each of two
// luma rows, 8 luma bytes are split into even pixels (kept in mm0) and odd
// pixels (spilled to the local `ty`). The even-pixel results are stored
// first, then read back from [edx] / [edx+8] and interleaved with the
// odd-pixel results before the final 32-byte store.
//
// NOTE(review): this text appears to have lost its original line breaks.
// Several instructions and declarations follow a `//` or `file://` marker on
// the same physical line (`file://` looks like a garbled `//`), the four
// #define directives share one line, and the closing #endif is fused onto
// the last line. Restore one statement per line before compiling.
// NOTE(review): the Y loads are written `movq mm5,dword ptr [eax]`; a movq
// moves 8 bytes, so `qword ptr` is presumably intended -- confirm.
void VideoPlayer::yuv2rgb4XmmxC420(unsigned char *lpY, unsigned char *lpU, unsigned char *lpV, unsigned char *lpRGB, int nSrcHeight, int nSrcWidth){ int rgbwidth=nSrcWidth<<2;// 32 bits rgb0; int nyw=nSrcWidth; int col=nSrcWidth>>3; int row=nSrcHeight>>1;
// Symbolic names for the registers that hold the four chroma terms across both rows.
#define mmt2018u mm1 #define mmt0813v mm2 #define mmt0391u mm3 #define mmt1596v mm4
// Spill slot for the odd-pixel luma terms (y7 y5 y3 y1).
__int64 ty;
// Register roles: esi = U pointer, edi = V pointer, eax = Y pointer,
// edx = output pointer, ecx = column counter, ebx = row-pair counter.
__asm { mov esi,lpU mov edi,lpV mov eax,lpY mov edx,lpRGB mov ecx,col mov ebx,row
rrr: pxor mm0,mm0
movq mm3,qword ptr t128 movq mm4,qword ptr t0391 movq mm5,qword ptr t2018 movq mm6,qword ptr t1596 movq mm7,qword ptr t0813
movd mm1,dword ptr [esi] movd mm2,dword ptr [edi] punpcklbw mm1,mm0 punpcklbw mm2,mm0
file://copute u,v psubsw mm1,mm3 file://u-128 psubsw mm2,mm3 file://v-128 movq mm3,mm1 psllw mm1,3 pmulhw mm3,mm4 // t0391u-->mm3 pmulhw mm1,mm5 // t2018u-->mm1 movq mm4,mm2 psllw mm2,1 psllw mm4,2 pmulhw mm2,mm7 // t0813v-->mm2 pmulhw mm4,mm6 // t1596v-->mm4
// First Y row: split even/odd pixels, subtract 16, scale by t1164; even pixels first.
movq mm5,dword ptr [eax] // 76 54 32 10 pxor mm0,mm0 movq mm6,mm5 punpcklbw mm5,mm0 // 03 02 01 00 punpckhbw mm0,mm6 // 70 60 50 40 por mm0,mm5 // 73 62 51 40 pxor mm6,mm6 pxor mm5,mm5 punpckhbw mm6,mm0 // 70 30 60 20 punpcklbw mm0,mm5 // 05 01 04 00 por mm0,mm6 // 75 31 64 20 pxor mm5,mm5 movq mm6,mm0 punpckhbw mm6,mm5 // y7 y5 y3 y1 punpcklbw mm0,mm5 movq mm5,qword ptr t16 movq mm7,qword ptr t1164 psubsw mm6,mm5 psubsw mm0,mm5 psllw mm6,2 psllw mm0,2 pmulhw mm6,mm7 pmulhw mm0,mm7 // y6 y4 y2 y0 -->mm0 movq qword ptr ty,mm6 // y7 y5 y3 y1 -->ty file://compute pxor mm7,mm7 movq mm5,mmt1596v movq mm6,mm0 file://copy 1.164(y-16) paddsw mm5,mm0 // r=1.164(y-16)+1.596(v-128) r-->mm5 psubsw mm6,mmt0391u file://1.164(y-16)-0.391(u-128) psubsw mm6,mmt0813v//g=1.164(y-16)-0.391(u-128)-0.813(v-128) g-->mm6 paddsw mm0,mmt2018u // b=1.164(y-16)+2.018(u-128) b-->mm0
// b-->mm0,g-->mm6,r-->mm5 packuswb mm6,mm7 packuswb mm0,mm7 punpcklbw mm0,mm6 // g3 b3 g2 b2 g1 b1 g0 b0 -->mm0 packuswb mm5,mm7 punpcklbw mm5,mm7 // 00 r3 00 r2 00 r1 00 r0 -->mm5 movq mm7,mm0 punpcklwd mm0,mm5 // 00 r1 g1 b1 00 r0 g0 b0 punpckhwd mm7,mm5 // 00 r3 g3 b3 00 r2 g2 b2 movq qword ptr[edx], mm0 movq mm0,qword ptr ty movq qword ptr[edx+8], mm7
file://compute pxor mm7,mm7 movq mm5,mmt1596v paddsw mm5,mm0 // r=1.164(y-16)+1.596(v-128) r-->mm5 movq mm6,mm0 file://copy 1.164(y-16) psubsw mm6,mmt0391u file://1.164(y-16)-0.391(u-128) psubsw mm6,mmt0813v//g=1.164(y-16)-0.391(u-128)-0.813(v-128) g-->mm6 paddsw mm0,mmt2018u // b=1.164(y-16)+2.018(u-128) b-->mm0
// b-->mm0,g-->mm6,r-->mm5 packuswb mm6,mm7 packuswb mm0,mm7 punpcklbw mm0,mm6 // g3 b3 g2 b2 g1 b1 g0 b0 -->mm0 packuswb mm5,mm7 punpcklbw mm5,mm7 // 00 r3 00 r2 00 r1 00 r0 -->mm5 movq mm7,mm0 movq mm6,[edx] // 2 0
punpcklwd mm0,mm5 // 00 r1 g1 b1 00 r0 g0 b0 punpckhwd mm7,mm5 // 00 r3 g3 b3 00 r2 g2 b2
movq mm5,mm6 punpckldq mm6,mm0 // 1 0 punpckhdq mm5,mm0 // 3 2 movq mm0,[edx+8] // 4 6 movq [edx],mm6 movq [edx+8], mm5 movq mm6,mm0 punpckhdq mm0,mm7 // 7 6 punpckldq mm6,mm7 // 5 4 movq [edx+24], mm0 movq [edx+16],mm6
file://next row of y add eax,nyw add edx,rgbwidth
// Second Y row: same sequence, reusing the chroma terms still held in mm1..mm4.
movq mm5,dword ptr [eax] // 76 54 32 10 pxor mm0,mm0 movq mm6,mm5 punpcklbw mm5,mm0 // 03 02 01 00 punpckhbw mm0,mm6 // 70 60 50 40 por mm0,mm5 // 73 62 51 40 pxor mm6,mm6 pxor mm5,mm5 punpckhbw mm6,mm0 // 70 30 60 20 punpcklbw mm0,mm5 // 05 01 04 00 por mm0,mm6 // 75 31 64 20 pxor mm5,mm5 movq mm6,mm0 punpckhbw mm6,mm5 // y7 y5 y3 y1 punpcklbw mm0,mm5 movq mm5,qword ptr t16 movq mm7,qword ptr t1164 psubsw mm6,mm5 psubsw mm0,mm5 psllw mm6,2 psllw mm0,2 pmulhw mm6,mm7 pmulhw mm0,mm7 // y6 y4 y2 y0 -->mm0 movq qword ptr ty,mm6 // y7 y5 y3 y1 -->ty file://compute pxor mm7,mm7 movq mm5,mmt1596v paddsw mm5,mm0 // r=1.164(y-16)+1.596(v-128) r-->mm5 movq mm6,mm0 file://copy 1.164(y-16) psubsw mm6,mmt0391u file://1.164(y-16)-0.391(u-128) psubsw mm6,mmt0813v//g=1.164(y-16)-0.391(u-128)-0.813(v-128) g-->mm6 paddsw mm0,mmt2018u // b=1.164(y-16)+2.018(u-128) b-->mm0
// b-->mm0,g-->mm6,r-->mm5 packuswb mm6,mm7 packuswb mm0,mm7 punpcklbw mm0,mm6 // g3 b3 g2 b2 g1 b1 g0 b0 -->mm0 packuswb mm5,mm7 punpcklbw mm5,mm7 // 00 r3 00 r2 00 r1 00 r0 -->mm5 movq mm7,mm0 punpcklwd mm0,mm5 // 00 r1 g1 b1 00 r0 g0 b0 punpckhwd mm7,mm5 // 00 r3 g3 b3 00 r2 g2 b2 movq qword ptr[edx], mm0 movq qword ptr[edx+8], mm7
file://compute movq mm0,qword ptr ty pxor mm7,mm7 movq mm5,mmt1596v paddsw mm5,mm0 // r=1.164(y-16)+1.596(v-128) r-->mm5 movq mm6,mm0 file://copy 1.164(y-16) psubsw mm6,mmt0391u file://1.164(y-16)-0.391(u-128) psubsw mm6,mmt0813v//g=1.164(y-16)-0.391(u-128)-0.813(v-128) g-->mm6 paddsw mm0,mmt2018u // b=1.164(y-16)+2.018(u-128) b-->mm0
// b-->mm0,g-->mm6,r-->mm5 packuswb mm6,mm7 packuswb mm0,mm7 punpcklbw mm0,mm6 // g3 b3 g2 b2 g1 b1 g0 b0 -->mm0 packuswb mm5,mm7 punpcklbw mm5,mm7 // 00 r3 00 r2 00 r1 00 r0 -->mm5 movq mm7,mm0
movq mm6,[edx] // 2 0
punpcklwd mm0,mm5 // 00 r1 g1 b1 00 r0 g0 b0 punpckhwd mm7,mm5 // 00 r3 g3 b3 00 r2 g2 b2
movq mm5,mm6 punpckldq mm6,mm0 // 1 0 punpckhdq mm5,mm0 // 3 2 movq mm0,[edx+8] // 4 6 movq [edx],mm6 movq [edx+8], mm5 movq mm6,mm0 punpckhdq mm0,mm7 // 7 6 punpckldq mm6,mm7 // 5 4 movq [edx+24], mm0 movq [edx+16],mm6 sub eax,nyw sub edx,rgbwidth add esi,4 add edi,4 add eax,8 add edx,32 dec ecx jnz rrr
// End of a row pair: reload the column counter, step both pointers past the
// second row, loop on the row-pair counter, then emms to leave MMX state.
mov ecx,col add eax,nyw add edx,rgbwidth dec ebx jnz rrr emms }}#endif