postproc/swscale_template.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134


// Software scaling and colorspace conversion routines for MPlayer

// Scratch rows for the scaler. Entries hold sample*weight sums, which is why
// they are wider than a byte.
//   pix_buf_y : four horizontally scaled luma rows, addressed modulo 4
//   pix_buf_uv: two chroma rows; each row keeps one chroma plane in entries
//               [0..2047] and the other plane in entries [2048..4095]
static unsigned pix_buf_y[4][2048];
static unsigned pix_buf_uv[2][2 * 2048];

// Saturation lookup for the C code paths (three 256-entry segments,
// filled in by SwScale_Init()):
static unsigned char clip_table[3 * 256];

// yuv->rgb fixed-point lookup tables, one entry per 8-bit sample value.
// Each table is named after the hex multiplier SwScale_Init() uses for it.
static int yuvtab_2568[256];
static int yuvtab_3343[256];
static int yuvtab_0c92[256];
static int yuvtab_1a1e[256];
static int yuvtab_40cf[256];

// Saturating lookup of one colour channel.
// v is the 13-bit fixed-point sum built from the yuvtab_* tables (the luma
// table already carries the +256 bias that selects the middle segment of
// clip_table). Extreme sample combinations push v>>13 below 0 or above the
// last table entry, so the index is clamped instead of being used blindly;
// SwScale_Init() stores 0 in the first entry and 255 in the last one, which
// makes the clamped lookup equivalent to normal saturation.
static unsigned char sws_clip_channel(int v)
{
    const int last = (int)sizeof(clip_table) - 1;
    int idx;

    if (v < 0)
        return clip_table[0];
    idx = v >> 13;
    if (idx > last)
        idx = last;
    return clip_table[idx];
}

// *** bilinear scaling and yuv->rgb conversion of yv12 slices:
// *** Note: it's called multiple times while decoding a frame, first time y==0
// *** Designed to upscale, but may work for downscale too.
//
// Parameters:
//   srcptr[0..2]  start of the current slice in the luma plane and in the two
//                 half-resolution chroma planes
//   stride[0..2]  bytes per row of each source plane
//   y, h          first source line of this slice and its number of lines;
//                 y==0 restarts the per-frame counters
//   dstptr        base of the destination image; rows are addressed with the
//                 frame-wide output row counter, not relative to the slice
//   dststride     bytes per destination row
//   dstw          output pixels per row; silently clamped to the capacity of
//                 the temporary line buffers
//   dstbpp        bytes to advance per output pixel (three bytes are written)
//   s_xinc        horizontal step, 8.8 fixed point:   (src_width << 8) / dst_width
//   s_yinc        vertical step, 16.16 fixed point:   (src_height << 16) / dst_height
//
// The position counters and the line buffers are static, so only one frame
// can be in flight at a time and the function is not reentrant.
//
// NOTE(review): the output row counter starts at -2 and only the pass with
// y0==0 skips rendering, so the first rendered row(s) appear to be written
// before dstptr -- confirm how callers set up dstptr.
// NOTE(review): the horizontal filters read src[xx+1], which can be one
// sample past the last source column; the source width is not known here.
void SwScale_YV12slice_brg24(unsigned char* srcptr[],int stride[], int y, int h,
			     unsigned char* dstptr, int dststride, int dstw, int dstbpp,
			     unsigned int s_xinc,unsigned int s_yinc){

    // chroma planes are half as wide, so they advance at half the luma step
    const unsigned int s_xinc2 = s_xinc >> 1;

    // capacity of one temporary luma row, and offset of the second chroma
    // plane inside a pix_buf_uv row (it occupies the upper half)
    const int max_w = (int)(sizeof(pix_buf_y[0]) / sizeof(pix_buf_y[0][0]));
    const int v_off = (int)(sizeof(pix_buf_uv[0]) / sizeof(pix_buf_uv[0][0]) / 2);

    // state carried from one slice call to the next within a frame
    static int s_srcypos;   // vertical source position, 16.16 fixed point
    static int s_ypos;      // next output row
    static int s_last_ypos; // last source line that was scaled horizontally

    // never write past the end of the temporary line buffers
    if (dstw > max_w) dstw = max_w;
    if (dstw > v_off) dstw = v_off;

    if (y == 0) {
        s_srcypos = -2 * s_yinc;
        s_ypos = -2;
        s_last_ypos = -2;
    } // reset counters

    while (1) {
        unsigned char *dest = dstptr + dststride * s_ypos;
        int y0 = 2 + (s_srcypos >> 16);   // luma line needed for this output row
        int y1 = 1 + (s_srcypos >> 17);   // matching chroma line
        // vertical blend weights (0..255) for the newer line; ^255 gives the
        // complementary weight for the older line
        int yalpha = (s_srcypos & 0xFFFF) >> 8;
        int yalpha1 = yalpha ^ 255;
        int uvalpha = ((s_srcypos >> 1) & 0xFFFF) >> 8;
        int uvalpha1 = uvalpha ^ 255;
        // buf0/uvbuf0 hold the previously scaled line, buf1/uvbuf1 receive
        // (or already hold) the current one
        unsigned int *buf0 = pix_buf_y[y0 & 3];
        unsigned int *buf1 = pix_buf_y[((y0 + 1) & 3)];
        unsigned int *uvbuf0 = pix_buf_uv[y1 & 1];
        unsigned int *uvbuf1 = pix_buf_uv[(y1 & 1) ^ 1];
        int i;

        // the line we need is not part of this slice yet: wait for the next call
        if (y0 >= y + h) break;

        s_ypos++; s_srcypos += s_yinc;

        if (s_last_ypos != y0) {
            unsigned char *src = srcptr[0] + (y0 - y) * stride[0];
            unsigned int xpos = 0;
            s_last_ypos = y0;
            // *** horizontal scale Y line to temp buffer
            // this loop should be rewritten in MMX assembly!!!!
            for (i = 0; i < dstw; i++) {
                register unsigned int xx = xpos >> 8;
                register unsigned int xalpha = xpos & 0xFF;
                buf1[i] = (src[xx] * (xalpha ^ 255) + src[xx + 1] * xalpha);
                xpos += s_xinc;
            }
            // *** horizontal scale U and V lines to temp buffer
            // (only on even luma lines: one chroma line covers two luma lines)
            if (!(y0 & 1)) {
                unsigned char *src1 = srcptr[1] + (y1 - y / 2) * stride[1];
                unsigned char *src2 = srcptr[2] + (y1 - y / 2) * stride[2];
                xpos = 0;
                // this loop should be rewritten in MMX assembly!!!!
                for (i = 0; i < dstw; i++) {
                    register unsigned int xx = xpos >> 8;
                    register unsigned int xalpha = xpos & 0xFF;
                    uvbuf1[i] = (src1[xx] * (xalpha ^ 255) + src1[xx + 1] * xalpha);
                    uvbuf1[i + v_off] = (src2[xx] * (xalpha ^ 255) + src2[xx + 1] * xalpha);
                    xpos += s_xinc2;
                }
            }
            // the very first line has no predecessor to blend with yet
            if (!y0) continue;
        }

        // this loop should be rewritten in MMX assembly!!!!
        // Note1: this code can be restricted to n*8 (or n*16) width lines to simplify optimization...
        // Note2: instead of using lookup tabs, mmx version could do the multiply...
        // Note3: maybe we should make separated 15/16, 24 and 32bpp version of this:
        for (i = 0; i < dstw; i++) {
            // vertical linear interpolation && yuv2rgb in a single step:
            int Y = yuvtab_2568[((buf0[i] * yalpha1 + buf1[i] * yalpha) >> 16)];
            int U = ((uvbuf0[i] * uvalpha1 + uvbuf1[i] * uvalpha) >> 16);
            int V = ((uvbuf0[i + v_off] * uvalpha1 + uvbuf1[i + v_off] * uvalpha) >> 16);
#if 1
            // 24/32 bpp
            dest[0] = sws_clip_channel(Y + yuvtab_3343[U]);
            dest[1] = sws_clip_channel(Y + yuvtab_0c92[V] + yuvtab_1a1e[U]);
            dest[2] = sws_clip_channel(Y + yuvtab_40cf[V]);
#else
            unsigned short *d = (unsigned short *)dest;
            unsigned int r = sws_clip_channel(Y + yuvtab_3343[U]);
            unsigned int g = sws_clip_channel(Y + yuvtab_0c92[V] + yuvtab_1a1e[U]);
            unsigned int b = sws_clip_channel(Y + yuvtab_40cf[V]);
            d[0] = ((r >> 3) << 10) | ((g >> 3) << 5) | ((b >> 3)); // 15bpp
//          d[0]=((r>>3)<<11)|((g>>2)<<5)|((b>>3)); // 16bpp
#endif
            dest += dstbpp;
        }

    }

}


// Fills the saturation table and the fixed-point yuv->rgb lookup tables.
// Must run before SwScale_YV12slice_brg24() is used, since that function
// reads all of these tables.
void SwScale_Init()
{
    int v;

    // clip_table has three 256-entry segments: everything in the first one
    // saturates to 0, the middle one is the identity, the last one saturates
    // to 255.
    for (v = 0; v < 256; ++v) {
        clip_table[v]       = 0;
        clip_table[256 + v] = (unsigned char)v;
        clip_table[512 + v] = 255;
    }

    // Coefficient tables: luma is taken relative to 16 and carries a bias of
    // 256 (in 13-bit fixed point) so that sums land in the middle segment of
    // clip_table; chroma is taken relative to 128.
    for (v = 0; v < 256; ++v) {
        const int luma   = v - 16;
        const int chroma = v - 128;

        yuvtab_2568[v] = (0x2568 * luma) + (256 << 13);
        yuvtab_3343[v] =  0x3343 * chroma;
        yuvtab_0c92[v] = -0x0c92 * chroma;
        yuvtab_1a1e[v] = -0x1a1e * chroma;
        yuvtab_40cf[v] =  0x40cf * chroma;
    }
}