8 files changed, 617 insertions, 808 deletions
diff --git a/postproc/rgb2rgb.h b/postproc/rgb2rgb.h
index 1a7720c7aa..c74a197bfa 100644
--- a/postproc/rgb2rgb.h
+++ b/postproc/rgb2rgb.h
@@ -84,13 +84,15 @@ extern void yvu9_to_yuy2(const uint8_t *src1, const uint8_t *src2, const uint8_t
 #define MODE_RGB  0x1
 #define MODE_BGR  0x2
 
-typedef void (* yuv2rgb_fun) (uint8_t * image, uint8_t * py,
+static void yuv2rgb(uint8_t * image, uint8_t * py,
 			      uint8_t * pu, uint8_t * pv,
 			      unsigned h_size, unsigned v_size,
-			      int rgb_stride, int y_stride, int uv_stride);
+			      int rgb_stride, int y_stride, int uv_stride){
+printf("broken, this should use the swscaler\n");
+}
 
-extern yuv2rgb_fun yuv2rgb;
-
-void yuv2rgb_init (unsigned bpp, int mode);
+static void yuv2rgb_init (unsigned bpp, int mode){
+printf("broken, this should use the swscaler\n");
+}
 
 #endif
diff --git a/postproc/swscale.c b/postproc/swscale.c
index b4febf948f..dbbf0726e8 100644
--- a/postproc/swscale.c
+++ b/postproc/swscale.c
@@ -1,5 +1,5 @@
 /*
-    Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
+    Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -62,6 +62,7 @@ untested special converters
 #include <stdlib.h>
 #endif
 #include "swscale.h"
+#include "swscale_internal.h"
 #include "../cpudetect.h"
 #include "../bswap.h"
 #include "../libvo/img_format.h"
@@ -147,7 +148,6 @@ add support for Y8 output
 optimize bgr24 & bgr32
 add BGR4 output support
 write special BGR->BGR scaler
-deglobalize yuv2rgb*.c
 */
 
 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
@@ -230,8 +230,6 @@ void (*swScale)(SwsContext *context, uint8_t* src[], int srcStride[], int srcSli
              int srcSliceH, uint8_t* dst[], int dstStride[])=NULL;
 
 static SwsVector *getConvVec(SwsVector *a, SwsVector *b);
-static inline void orderYUV(int format, uint8_t * sortedP[], int sortedStride[], uint8_t * p[], int stride[]);
-void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_gU[256], int table_gV[256], void *table_bU[256]);
 
 extern const uint8_t dither_2x2_4[2][8];
 extern const uint8_t dither_2x2_8[2][8];
@@ -1634,18 +1632,6 @@ static void PlanarToNV12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[],
 		interleaveBytes( src[2],src[1],dst,c->srcW,srcSliceH,srcStride[2],srcStride[1],dstStride[0] );
 }
 
-
-/* Warper functions for yuv2bgr */
-static void planarYuvToBgr(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
-             int srcSliceH, uint8_t* dstParam[], int dstStride[]){
-	uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
-
-	if(c->srcFormat==IMGFMT_YV12)
-		yuv2rgb( dst,src[0],src[1],src[2],c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
-	else /* I420 & IYUV */
-		yuv2rgb( dst,src[0],src[2],src[1],c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
-}
-
 static void PlanarToYuy2Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
              int srcSliceH, uint8_t* dstParam[], int dstStride[]){
 	uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
@@ -1773,7 +1759,7 @@ static void yvu9toyv12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], in
 /**
  * bring pointers in YUV order instead of YVU
  */
-static inline void orderYUV(int format, uint8_t * sortedP[], int sortedStride[], uint8_t * p[], int stride[]){
+inline void sws_orderYUV(int format, uint8_t * sortedP[], int sortedStride[], uint8_t * p[], int stride[]){
 	if(format == IMGFMT_YV12 || format == IMGFMT_YVU9 
            || format == IMGFMT_444P || format == IMGFMT_422P || format == IMGFMT_411P){
 		sortedP[0]= p[0];
@@ -1814,8 +1800,8 @@ static void simpleCopy(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[],
 	uint8_t *src[3];
 	uint8_t *dst[3];
 
-	orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
-	orderYUV(c->dstFormat, dst, dstStride, dstParam, dstStrideParam);
+	sws_orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
+	sws_orderYUV(c->dstFormat, dst, dstStride, dstParam, dstStrideParam);
 
 	if(isPacked(c->srcFormat))
 	{
@@ -1923,41 +1909,51 @@ static void getSubSampleFactors(int *h, int *v, int format){
 	}
 }
 
-static uint16_t roundToInt16(float f){
-	     if(f<-0x7FFF) f= -0x7FFF;
-	else if(f> 0x7FFF) f=  0x7FFF;
-	
-	return (int)floor(f + 0.5);
+static uint16_t roundToInt16(int64_t f){
+	int r= (f + (1<<15))>>16;
+	     if(r<-0x7FFF) return 0x8000;
+	else if(r> 0x7FFF) return 0x7FFF;
+	else               return r;
 }
 
 /**
- * @param colorspace colorspace
+ * @param inv_table the yuv2rgb coeffs, normally Inverse_Table_6_9[x]
  * @param fullRange if 1 then the luma range is 0..255 if 0 its 16..235
+ * @return -1 if not supported
  */
-void setInputColorspaceDetails(SwsContext *c, int colorspace, int fullRange, float brightness, float contrast, float saturation){
-
-	float crv =  Inverse_Table_6_9[colorspace][0]/65536.0;
-	float cbu =  Inverse_Table_6_9[colorspace][1]/65536.0;
-	float cgu = -Inverse_Table_6_9[colorspace][2]/65536.0;
-	float cgv = -Inverse_Table_6_9[colorspace][3]/65536.0;
-	float cy  = 1.0;
-	float oy  = 0;
+int sws_setColorspaceDetails(SwsContext *c, const int inv_table[4], int srcRange, const int table[4], int dstRange, int brightness, int contrast, int saturation){
+	int64_t crv =  inv_table[0];
+	int64_t cbu =  inv_table[1];
+	int64_t cgu = -inv_table[2];
+	int64_t cgv = -inv_table[3];
+	int64_t cy  = 1<<16;
+	int64_t oy  = 0;
+
+	if(isYUV(c->dstFormat) || isGray(c->dstFormat)) return -1;
+	memcpy(c->srcColorspaceTable, inv_table, sizeof(int)*4);
+	memcpy(c->dstColorspaceTable,     table, sizeof(int)*4);
+
+	c->brightness= brightness;
+	c->contrast  = contrast;
+	c->saturation= saturation;
+	c->srcRange  = srcRange;
+	c->dstRange  = dstRange;
 
 	c->uOffset=   0x0400040004000400LL;
 	c->vOffset=   0x0400040004000400LL;
 
-	if(!fullRange){
-		cy= (cy*255.0) / 219.0;
-		oy= 16.0;
+	if(!srcRange){
+		cy= (cy*255) / 219;
+		oy= 16<<16;
 	}
 
-	cy *= contrast;
-	crv*= contrast * saturation;
-	cbu*= contrast * saturation;
-	cgu*= contrast * saturation;
-	cgv*= contrast * saturation;
+	cy = (cy *contrast             )>>16;
+	crv= (crv*contrast * saturation)>>32;
+	cbu= (cbu*contrast * saturation)>>32;
+	cgu= (cgu*contrast * saturation)>>32;
+	cgv= (cgv*contrast * saturation)>>32;
 
-	oy -= 256.0*brightness;
+	oy -= 256*brightness;
 
 	c->yCoeff=    roundToInt16(cy *8192) * 0x0001000100010001ULL;
 	c->vrCoeff=   roundToInt16(crv*8192) * 0x0001000100010001ULL;
@@ -1965,6 +1961,28 @@ void setInputColorspaceDetails(SwsContext *c, int colorspace, int fullRange, flo
 	c->vgCoeff=   roundToInt16(cgv*8192) * 0x0001000100010001ULL;
 	c->ugCoeff=   roundToInt16(cgu*8192) * 0x0001000100010001ULL;
 	c->yOffset=   roundToInt16(oy *   8) * 0x0001000100010001ULL;
+
+	yuv2rgb_c_init_tables(c, inv_table, srcRange, brightness, contrast, saturation);
+	//FIXME factorize
+	
+	return 0;
+}
+
+/**
+ * @return -1 if not supported
+ */
+int sws_getColorspaceDetails(SwsContext *c, int **inv_table, int *srcRange, int **table, int *dstRange, int *brightness, int *contrast, int *saturation){
+	if(isYUV(c->dstFormat) || isGray(c->dstFormat)) return -1;
+
+	*inv_table = c->srcColorspaceTable;
+	*table     = c->dstColorspaceTable;
+	*srcRange  = c->srcRange;
+	*dstRange  = c->dstRange;
+	*brightness= c->brightness;
+	*contrast  = c->contrast;
+	*saturation= c->saturation;
+	
+	return 0;	
 }
 
 SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
@@ -2026,8 +2044,6 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
 	c->dstFormat= dstFormat;
 	c->srcFormat= srcFormat;
 
-	setInputColorspaceDetails(c, SWS_CS_DEFAULT, 0, 0.0, 1.0, 1.0);
-	
 	usesFilter=0;
 	if(dstFilter->lumV!=NULL && dstFilter->lumV->length>1) usesFilter=1;
 	if(dstFilter->lumH!=NULL && dstFilter->lumH->length>1) usesFilter=1;
@@ -2054,17 +2070,14 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
 
 	c->chrIntHSubSample= c->chrDstHSubSample;
 	c->chrIntVSubSample= c->chrSrcVSubSample;
-	
+
 	// note the -((-x)>>y) is so that we allways round toward +inf
 	c->chrSrcW= -((-srcW) >> c->chrSrcHSubSample);
 	c->chrSrcH= -((-srcH) >> c->chrSrcVSubSample);
 	c->chrDstW= -((-dstW) >> c->chrDstHSubSample);
 	c->chrDstH= -((-dstH) >> c->chrDstVSubSample);
-	
-	if(isBGR(dstFormat))
-		c->yuvTable= yuv2rgb_c_init(dstFormat & 0xFF, MODE_RGB, c->table_rV, c->table_gU, c->table_gV, c->table_bU);
-	if(isRGB(dstFormat))
-		c->yuvTable= yuv2rgb_c_init(dstFormat & 0xFF, MODE_BGR, c->table_rV, c->table_gU, c->table_gV, c->table_bU);
+
+	sws_setColorspaceDetails(c, Inverse_Table_6_9[SWS_CS_DEFAULT], 0, Inverse_Table_6_9[SWS_CS_DEFAULT] /* FIXME*/, 0, 0, 1<<16, 1<<16); 
 
 	/* unscaled special Cases */
 	if(unscaled && !usesFilter)
@@ -2075,19 +2088,9 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
 			c->swScale= PlanarToNV12Wrapper;
 		}
 		/* yuv2bgr */
-		if((srcFormat==IMGFMT_YV12 || srcFormat==IMGFMT_I420) && isBGR(dstFormat))
+		if((srcFormat==IMGFMT_YV12 || srcFormat==IMGFMT_I420 || srcFormat==IMGFMT_422P) && (isBGR(dstFormat) || isRGB(dstFormat)))
 		{
-			// FIXME multiple yuv2rgb converters wont work that way cuz that thing is full of globals&statics
-			//FIXME rgb vs. bgr ? 
-#ifdef WORDS_BIGENDIAN
-			if(dstFormat==IMGFMT_BGR32)
-				yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_BGR);
-			else
-				yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
-#else
-			yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
-#endif
-			c->swScale= planarYuvToBgr;
+			c->swScale= yuv2rgb_get_func_ptr(c);
 		}
 		
 		if( srcFormat==IMGFMT_YVU9 && (dstFormat==IMGFMT_YV12 || dstFormat==IMGFMT_I420) )
diff --git a/postproc/swscale.h b/postproc/swscale.h
index fd1539b1e6..0471d6a998 100644
--- a/postproc/swscale.h
+++ b/postproc/swscale.h
@@ -1,5 +1,5 @@
 /*
-    Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
+    Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -16,6 +16,11 @@
     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
 
+#ifndef SWSCALE_H
+#define SWSCALE_H
+
+#include "swscale_internal.h" //FIXME HACK REMOVE
+
 /* values for the flags, the stuff on the command line is different */
 #define SWS_FAST_BILINEAR 1
 #define SWS_BILINEAR 2
@@ -44,8 +49,6 @@
 #define SWS_FULL_CHR_H_INP	0x4000
 #define SWS_DIRECT_BGR		0x8000
 
-#define MAX_FILTER_SIZE 256
-
 #define SWS_MAX_REDUCE_CUTOFF 0.002
 
 #define SWS_CS_ITU709		1
@@ -56,97 +59,6 @@
 #define SWS_CS_SMPTE240M 	7
 #define SWS_CS_DEFAULT 		5
 
-/* this struct should be aligned on at least 32-byte boundary */
-typedef struct SwsContext{
-	int srcW, srcH, dstH;
-	int chrSrcW, chrSrcH, chrDstW, chrDstH;
-	int lumXInc, chrXInc;
-	int lumYInc, chrYInc;
-	int dstFormat, srcFormat;
-	int chrSrcHSubSample, chrSrcVSubSample;
-	int chrIntHSubSample, chrIntVSubSample;
-	int chrDstHSubSample, chrDstVSubSample;
-	int vChrDrop;
-
-	int16_t **lumPixBuf;
-	int16_t **chrPixBuf;
-	int16_t *hLumFilter;
-	int16_t *hLumFilterPos;
-	int16_t *hChrFilter;
-	int16_t *hChrFilterPos;
-	int16_t *vLumFilter;
-	int16_t *vLumFilterPos;
-	int16_t *vChrFilter;
-	int16_t *vChrFilterPos;
-
-	uint8_t formatConvBuffer[4000]; //FIXME dynamic alloc, but we have to change alot of code for this to be usefull
-
-	int hLumFilterSize;
-	int hChrFilterSize;
-	int vLumFilterSize;
-	int vChrFilterSize;
-	int vLumBufSize;
-	int vChrBufSize;
-
-	uint8_t __attribute__((aligned(32))) funnyYCode[10000];
-	uint8_t __attribute__((aligned(32))) funnyUVCode[10000];
-	int32_t *lumMmx2FilterPos;
-	int32_t *chrMmx2FilterPos;
-	int16_t *lumMmx2Filter;
-	int16_t *chrMmx2Filter;
-
-	int canMMX2BeUsed;
-
-	int lastInLumBuf;
-	int lastInChrBuf;
-	int lumBufIndex;
-	int chrBufIndex;
-	int dstY;
-	int flags;
-	void * yuvTable;
-	void * table_rV[256];
-	void * table_gU[256];
-	int    table_gV[256];
-	void * table_bU[256];
-
-	void (*swScale)(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
-             int srcSliceH, uint8_t* dst[], int dstStride[]);
-
-#define RED_DITHER   "0*8"
-#define GREEN_DITHER "1*8"
-#define BLUE_DITHER  "2*8"
-#define Y_COEFF      "3*8"
-#define VR_COEFF     "4*8"
-#define UB_COEFF     "5*8"
-#define VG_COEFF     "6*8"
-#define UG_COEFF     "7*8"
-#define Y_OFFSET     "8*8"
-#define U_OFFSET     "9*8"
-#define V_OFFSET     "10*8"
-#define LUM_MMX_FILTER_OFFSET "11*8"
-#define CHR_MMX_FILTER_OFFSET "11*8+4*4*256"
-#define DSTW_OFFSET  "11*8+4*4*256*2"
-#define ESP_OFFSET  "11*8+4*4*256*2+4"
-                  
-	uint64_t redDither   __attribute__((aligned(8)));
-	uint64_t greenDither __attribute__((aligned(8)));
-	uint64_t blueDither  __attribute__((aligned(8)));
-
-	uint64_t yCoeff      __attribute__((aligned(8)));
-	uint64_t vrCoeff     __attribute__((aligned(8)));
-	uint64_t ubCoeff     __attribute__((aligned(8)));
-	uint64_t vgCoeff     __attribute__((aligned(8)));
-	uint64_t ugCoeff     __attribute__((aligned(8)));
-	uint64_t yOffset     __attribute__((aligned(8)));
-	uint64_t uOffset     __attribute__((aligned(8)));
-	uint64_t vOffset     __attribute__((aligned(8)));
-	int32_t  lumMmxFilter[4*MAX_FILTER_SIZE];
-	int32_t  chrMmxFilter[4*MAX_FILTER_SIZE];
-	int dstW;
-	int esp;
-} SwsContext;
-//FIXME check init (where 0)
-//FIXME split private & public
 
 
 // when used for filters they must have an odd number of elements
@@ -185,6 +97,9 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
 			 SwsFilter *srcFilter, SwsFilter *dstFilter);
 void swsGetFlagsAndFilterFromCmdLine(int *flags, SwsFilter **srcFilterParam, SwsFilter **dstFilterParam);
 
+int sws_setColorspaceDetails(SwsContext *c, const int inv_table[4], int srcRange, const int table[4], int dstRange, int brightness, int contrast, int saturation);
+int sws_getColorspaceDetails(SwsContext *c, int **inv_table, int *srcRange, int **table, int *dstRange, int *brightness, int *contrast, int *saturation);
+
 SwsVector *getGaussianVec(double variance, double quality);
 SwsVector *getConstVec(double c, int length);
 SwsVector *getIdentityVec(void);
@@ -199,3 +114,4 @@ SwsVector *cloneVec(SwsVector *a);
 void printVec(SwsVector *a);
 void freeVec(SwsVector *a);
 
+#endif
diff --git a/postproc/swscale_internal.h b/postproc/swscale_internal.h
new file mode 100644
index 0000000000..0697d97e62
--- /dev/null
+++ b/postproc/swscale_internal.h
@@ -0,0 +1,130 @@
+/*
+    Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#ifndef SWSCALE_INTERNAL_H
+#define SWSCALE_INTERNAL_H
+
+#define MAX_FILTER_SIZE 256
+
+struct SwsContext;
+
+typedef void (*SwsFunc)(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
+             int srcSliceH, uint8_t* dst[], int dstStride[]);
+
+/* this struct should be aligned on at least 32-byte boundary */
+typedef struct SwsContext{
+	int srcW, srcH, dstH;
+	int chrSrcW, chrSrcH, chrDstW, chrDstH;
+	int lumXInc, chrXInc;
+	int lumYInc, chrYInc;
+	int dstFormat, srcFormat;
+	int chrSrcHSubSample, chrSrcVSubSample;
+	int chrIntHSubSample, chrIntVSubSample;
+	int chrDstHSubSample, chrDstVSubSample;
+	int vChrDrop;
+
+	int16_t **lumPixBuf;
+	int16_t **chrPixBuf;
+	int16_t *hLumFilter;
+	int16_t *hLumFilterPos;
+	int16_t *hChrFilter;
+	int16_t *hChrFilterPos;
+	int16_t *vLumFilter;
+	int16_t *vLumFilterPos;
+	int16_t *vChrFilter;
+	int16_t *vChrFilterPos;
+
+	uint8_t formatConvBuffer[4000]; //FIXME dynamic alloc, but we have to change alot of code for this to be usefull
+
+	int hLumFilterSize;
+	int hChrFilterSize;
+	int vLumFilterSize;
+	int vChrFilterSize;
+	int vLumBufSize;
+	int vChrBufSize;
+
+	uint8_t __attribute__((aligned(32))) funnyYCode[10000];
+	uint8_t __attribute__((aligned(32))) funnyUVCode[10000];
+	int32_t *lumMmx2FilterPos;
+	int32_t *chrMmx2FilterPos;
+	int16_t *lumMmx2Filter;
+	int16_t *chrMmx2Filter;
+
+	int canMMX2BeUsed;
+
+	int lastInLumBuf;
+	int lastInChrBuf;
+	int lumBufIndex;
+	int chrBufIndex;
+	int dstY;
+	int flags;
+	void * yuvTable;			// pointer to the yuv->rgb table start so it can be freed()
+	void * table_rV[256];
+	void * table_gU[256];
+	int    table_gV[256];
+	void * table_bU[256];
+
+	//Colorspace stuff
+	int contrast, brightness, saturation;	// for sws_getColorspaceDetails
+	int srcColorspaceTable[4];
+	int dstColorspaceTable[4];
+	int srcRange, dstRange;
+
+	SwsFunc swScale;
+
+#define RED_DITHER   "0*8"
+#define GREEN_DITHER "1*8"
+#define BLUE_DITHER  "2*8"
+#define Y_COEFF      "3*8"
+#define VR_COEFF     "4*8"
+#define UB_COEFF     "5*8"
+#define VG_COEFF     "6*8"
+#define UG_COEFF     "7*8"
+#define Y_OFFSET     "8*8"
+#define U_OFFSET     "9*8"
+#define V_OFFSET     "10*8"
+#define LUM_MMX_FILTER_OFFSET "11*8"
+#define CHR_MMX_FILTER_OFFSET "11*8+4*4*256"
+#define DSTW_OFFSET  "11*8+4*4*256*2"
+#define ESP_OFFSET  "11*8+4*4*256*2+4"
+                  
+	uint64_t redDither   __attribute__((aligned(8)));
+	uint64_t greenDither __attribute__((aligned(8)));
+	uint64_t blueDither  __attribute__((aligned(8)));
+
+	uint64_t yCoeff      __attribute__((aligned(8)));
+	uint64_t vrCoeff     __attribute__((aligned(8)));
+	uint64_t ubCoeff     __attribute__((aligned(8)));
+	uint64_t vgCoeff     __attribute__((aligned(8)));
+	uint64_t ugCoeff     __attribute__((aligned(8)));
+	uint64_t yOffset     __attribute__((aligned(8)));
+	uint64_t uOffset     __attribute__((aligned(8)));
+	uint64_t vOffset     __attribute__((aligned(8)));
+	int32_t  lumMmxFilter[4*MAX_FILTER_SIZE];
+	int32_t  chrMmxFilter[4*MAX_FILTER_SIZE];
+	int dstW;
+	int esp;
+} SwsContext;
+//FIXME check init (where 0)
+//FIXME split private & public
+
+inline void sws_orderYUV(int format, uint8_t * sortedP[], int sortedStride[], uint8_t * p[], int stride[]);
+SwsFunc yuv2rgb_get_func_ptr (SwsContext *c);
+int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation);
+
+#endif
diff --git a/postproc/swscale_template.c b/postproc/swscale_template.c
index b108fb1529..4166e50203 100644
--- a/postproc/swscale_template.c
+++ b/postproc/swscale_template.c
@@ -1,5 +1,5 @@
 /*
-    Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
+    Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -741,7 +741,6 @@ static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t *
 				    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
 				    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
 {
-	int dummy=0;
 #ifdef HAVE_MMX
 	if(uDest != NULL)
 	{
@@ -2553,8 +2552,8 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStridePar
 	uint8_t *src[3];
 	uint8_t *dst[3];
 	
-	orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
-	orderYUV(c->dstFormat, dst, dstStride, dstParam, dstStrideParam);
+	sws_orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
+	sws_orderYUV(c->dstFormat, dst, dstStride, dstParam, dstStrideParam);
 
 	if(isPacked(c->srcFormat)){
 		src[0]=
diff --git a/postproc/yuv2rgb.c b/postproc/yuv2rgb.c
index 658d8a37db..94196c4c2d 100644
--- a/postproc/yuv2rgb.c
+++ b/postproc/yuv2rgb.c
@@ -27,18 +27,23 @@
  *
  * MMX/MMX2 Template stuff from Michael Niedermayer (michaelni@gmx.at) (needed for fast movntq support)
  * 1,4,8bpp support by Michael Niedermayer (michaelni@gmx.at)
+ * context / deglobalize stuff by Michael Niedermayer
  */
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <inttypes.h>
+#include <assert.h>
 
 #include "config.h"
 //#include "video_out.h"
 #include "rgb2rgb.h"
+#include "swscale.h"
+#include "swscale_internal.h"
 #include "../cpudetect.h"
 #include "../mangle.h"
 #include "../mp_msg.h"
+#include "../libvo/img_format.h" //FIXME try to reduce dependency of such stuff
 
 #ifdef HAVE_MLIB
 #include "yuv2rgb_mlib.c"
@@ -46,10 +51,6 @@
 
 #define DITHER1XBPP // only for mmx
 
-#ifdef ARCH_X86
-#define CAN_COMPILE_X86_ASM
-#endif
-
 const uint8_t  __attribute__((aligned(8))) dither_2x2_4[2][8]={
 {  1,   3,   1,   3,   1,   3,   1,   3, },
 {  2,   0,   2,   0,   2,   0,   2,   0, },
@@ -157,21 +158,10 @@ const uint8_t  __attribute__((aligned(8))) dither_8x8_220[8][8]={
 };
 #endif
 
-#ifdef CAN_COMPILE_X86_ASM
+#ifdef ARCH_X86
 
 /* hope these constant values are cache line aligned */
-uint64_t __attribute__((aligned(8))) mmx_80w = 0x0080008000800080;
-uint64_t __attribute__((aligned(8))) mmx_10w = 0x1010101010101010;
 uint64_t __attribute__((aligned(8))) mmx_00ffw = 0x00ff00ff00ff00ff;
-uint64_t __attribute__((aligned(8))) mmx_Y_coeff = 0x253f253f253f253f;
-
-/* hope these constant values are cache line aligned */
-uint64_t __attribute__((aligned(8))) mmx_U_green = 0xf37df37df37df37d;
-uint64_t __attribute__((aligned(8))) mmx_U_blue = 0x4093409340934093;
-uint64_t __attribute__((aligned(8))) mmx_V_red = 0x3312331233123312;
-uint64_t __attribute__((aligned(8))) mmx_V_green = 0xe5fce5fce5fce5fc;
-
-/* hope these constant values are cache line aligned */
 uint64_t __attribute__((aligned(8))) mmx_redmask = 0xf8f8f8f8f8f8f8f8;
 uint64_t __attribute__((aligned(8))) mmx_grnmask = 0xfcfcfcfcfcfcfcfc;
 
@@ -217,8 +207,6 @@ uint64_t __attribute__((aligned(8))) dither8[2]={
 
 #endif // CAN_COMPILE_X86_ASM
 
-uint32_t matrix_coefficients = 6;
-
 const int32_t Inverse_Table_6_9[8][4] = {
     {117504, 138453, 13954, 34903}, /* no sequence_display_extension */
     {117504, 138453, 13954, 34903}, /* ITU-R Rec. 709 (1990) */
@@ -230,82 +218,12 @@ const int32_t Inverse_Table_6_9[8][4] = {
     {117579, 136230, 16907, 35559}  /* SMPTE 240M (1987) */
 };
 
-void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_gU[256], int table_gV[256], void *table_bU[256]);
-
-yuv2rgb_fun yuv2rgb= NULL;
-
-static void (* yuv2rgb_c_internal) (uint8_t *, uint8_t *,
-				    uint8_t *, uint8_t *,
-				    void *, void *, int, int);
-
-static void yuv2rgb_c (void * dst, uint8_t * py,
-		       uint8_t * pu, uint8_t * pv,
-		       unsigned h_size, unsigned v_size,
-		       unsigned rgb_stride, unsigned y_stride, unsigned uv_stride)
-{
-    v_size >>= 1;
-
-    while (v_size--) {
-	yuv2rgb_c_internal (py, py + y_stride, pu, pv, dst, dst + rgb_stride,
-			    h_size, v_size<<1);
-
-	py += 2 * y_stride;
-	pu += uv_stride;
-	pv += uv_stride;
-	dst += 2 * rgb_stride;
-    }
-}
-
-void * table_rV[256];
-void * table_gU[256];
-int table_gV[256];
-void * table_bU[256];
-
-void yuv2rgb_init (unsigned bpp, int mode)
-{
-    if(yuv2rgb) return;
-#ifdef CAN_COMPILE_X86_ASM
-    if(gCpuCaps.hasMMX2)
-    {
-	if (yuv2rgb == NULL /*&& (config.flags & VO_MMX_ENABLE)*/) {
-		yuv2rgb = yuv2rgb_init_MMX2 (bpp, mode);
-		if (yuv2rgb != NULL)
-			mp_msg(MSGT_SWS,MSGL_INFO,"Using MMX2 for colorspace transform\n");
-		else
-			mp_msg(MSGT_SWS,MSGL_WARN,"Cannot init MMX2 colorspace transform\n");
-	}
-    }
-    else if(gCpuCaps.hasMMX)
-    {
-	if (yuv2rgb == NULL /*&& (config.flags & VO_MMX_ENABLE)*/) {
-		yuv2rgb = yuv2rgb_init_MMX (bpp, mode);
-		if (yuv2rgb != NULL)
-			mp_msg(MSGT_SWS,MSGL_INFO,"Using MMX for colorspace transform\n");
-		else
-			mp_msg(MSGT_SWS,MSGL_WARN,"Cannot init MMX colorspace transform\n");
-	}
-    }
-#endif
-#ifdef HAVE_MLIB
-    if (yuv2rgb == NULL /*&& (config.flags & VO_MLIB_ENABLE)*/) {
-	yuv2rgb = yuv2rgb_init_mlib (bpp, mode);
-	if (yuv2rgb != NULL)
-	    mp_msg(MSGT_SWS,MSGL_INFO,"Using mlib for colorspace transform\n");
-    }
-#endif
-    if (yuv2rgb == NULL) {
-	mp_msg(MSGT_SWS,MSGL_INFO,"No accelerated colorspace conversion found\n");
-	yuv2rgb_c_init (bpp, mode, table_rV, table_gU, table_gV, table_bU);
-	yuv2rgb = (yuv2rgb_fun)yuv2rgb_c;
-    }
-}
-
 #define RGB(i)					\
 	U = pu[i];				\
 	V = pv[i];				\
-	r = table_rV[V];			\
-	g = table_gU[U] + table_gV[V];		\
-	b = table_bU[U];
+	r = c->table_rV[V];			\
+	g = c->table_gU[U] + c->table_gV[V];		\
+	b = c->table_bU[U];
 
 #define DST1(i)					\
 	Y = py_1[2*i];				\
@@ -343,19 +261,42 @@ void yuv2rgb_init (unsigned bpp, int mode)
 	Y = py_2[2*i+1];						\
 	dst_2[6*i+3] = b[Y]; dst_2[6*i+4] = g[Y]; dst_2[6*i+5] = r[Y];
 
-static void yuv2rgb_c_32 (uint8_t * py_1, uint8_t * py_2,
-			  uint8_t * pu, uint8_t * pv,
-			  void * _dst_1, void * _dst_2, int h_size, int v_pos)
-{
-    int U, V, Y;
-    uint32_t * r, * g, * b;
-    uint32_t * dst_1, * dst_2;
-
-    h_size >>= 3;
-    dst_1 = _dst_1;
-    dst_2 = _dst_2;
+#define PROLOG(func_name, dst_type) \
+static void func_name(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY, \
+             int srcSliceH, uint8_t* dst[], int dstStride[]){\
+    uint8_t *src[3];\
+    int srcStride[3];\
+    int y;\
+\
+    sws_orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);\
+    if(c->srcFormat == IMGFMT_422P){\
+	srcStride[1] *= 2;\
+	srcStride[2] *= 2;\
+    }\
+    for(y=0; y<srcSliceH; y+=2){\
+	dst_type *dst_1= (dst_type*)(dst[0] + (y+srcSliceY  )*dstStride[0]);\
+	dst_type *dst_2= (dst_type*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);\
+	dst_type *r, *g, *b;\
+	uint8_t *py_1= src[0] + y*srcStride[0];\
+	uint8_t *py_2= py_1 + srcStride[0];\
+	uint8_t *pu= src[1] + (y>>1)*srcStride[1];\
+	uint8_t *pv= src[2] + (y>>1)*srcStride[2];\
+	unsigned int h_size= c->dstW>>3;\
+	while (h_size--) {\
+	    int U, V, Y;\
+
+#define EPILOG(dst_delta)\
+	    pu += 4;\
+	    pv += 4;\
+	    py_1 += 8;\
+	    py_2 += 8;\
+	    dst_1 += dst_delta;\
+	    dst_2 += dst_delta;\
+	}\
+    }\
+}
 
-    while (h_size--) {
+PROLOG(yuv2rgb_c_32, uint32_t)
 	RGB(0);
 	DST1(0);
 	DST2(0);
@@ -371,30 +312,9 @@ static void yuv2rgb_c_32 (uint8_t * py_1, uint8_t * py_2,
 	RGB(3);
 	DST2(3);
 	DST1(3);
+EPILOG(8)
 
-	pu += 4;
-	pv += 4;
-	py_1 += 8;
-	py_2 += 8;
-	dst_1 += 8;
-	dst_2 += 8;
-    }
-}
-
-// This is very near from the yuv2rgb_c_32 code
-static void yuv2rgb_c_24_rgb (uint8_t * py_1, uint8_t * py_2,
-			      uint8_t * pu, uint8_t * pv,
-			      void * _dst_1, void * _dst_2, int h_size, int v_pos)
-{
-    int U, V, Y;
-    uint8_t * r, * g, * b;
-    uint8_t * dst_1, * dst_2;
-
-    h_size >>= 3;
-    dst_1 = _dst_1;
-    dst_2 = _dst_2;
-
-    while (h_size--) {
+PROLOG(yuv2rgb_c_24_rgb, uint8_t)
 	RGB(0);
 	DST1RGB(0);
 	DST2RGB(0);
@@ -410,30 +330,10 @@ static void yuv2rgb_c_24_rgb (uint8_t * py_1, uint8_t * py_2,
 	RGB(3);
 	DST2RGB(3);
 	DST1RGB(3);
-
-	pu += 4;
-	pv += 4;
-	py_1 += 8;
-	py_2 += 8;
-	dst_1 += 24;
-	dst_2 += 24;
-    }
-}
+EPILOG(24)
 
 // only trivial mods from yuv2rgb_c_24_rgb
-static void yuv2rgb_c_24_bgr (uint8_t * py_1, uint8_t * py_2,
-			      uint8_t * pu, uint8_t * pv,
-			      void * _dst_1, void * _dst_2, int h_size, int v_pos)
-{
-    int U, V, Y;
-    uint8_t * r, * g, * b;
-    uint8_t * dst_1, * dst_2;
-
-    h_size >>= 3;
-    dst_1 = _dst_1;
-    dst_2 = _dst_2;
-
-    while (h_size--) {
+PROLOG(yuv2rgb_c_24_bgr, uint8_t)
 	RGB(0);
 	DST1BGR(0);
 	DST2BGR(0);
@@ -449,31 +349,11 @@ static void yuv2rgb_c_24_bgr (uint8_t * py_1, uint8_t * py_2,
 	RGB(3);
 	DST2BGR(3);
 	DST1BGR(3);
-
-	pu += 4;
-	pv += 4;
-	py_1 += 8;
-	py_2 += 8;
-	dst_1 += 24;
-	dst_2 += 24;
-    }
-}
+EPILOG(24)
 
 // This is exactly the same code as yuv2rgb_c_32 except for the types of
 // r, g, b, dst_1, dst_2
-static void yuv2rgb_c_16 (uint8_t * py_1, uint8_t * py_2,
-			  uint8_t * pu, uint8_t * pv,
-			  void * _dst_1, void * _dst_2, int h_size, int v_pos)
-{
-    int U, V, Y;
-    uint16_t * r, * g, * b;
-    uint16_t * dst_1, * dst_2;
-
-    h_size >>= 3;
-    dst_1 = _dst_1;
-    dst_2 = _dst_2;
-
-    while (h_size--) {
+PROLOG(yuv2rgb_c_16, uint16_t)
 	RGB(0);
 	DST1(0);
 	DST2(0);
@@ -489,31 +369,11 @@ static void yuv2rgb_c_16 (uint8_t * py_1, uint8_t * py_2,
 	RGB(3);
 	DST2(3);
 	DST1(3);
-
-	pu += 4;
-	pv += 4;
-	py_1 += 8;
-	py_2 += 8;
-	dst_1 += 8;
-	dst_2 += 8;
-    }
-}
+EPILOG(8)
 
 // This is exactly the same code as yuv2rgb_c_32 except for the types of
 // r, g, b, dst_1, dst_2
-static void yuv2rgb_c_8  (uint8_t * py_1, uint8_t * py_2,
-			  uint8_t * pu, uint8_t * pv,
-			  void * _dst_1, void * _dst_2, int h_size, int v_pos)
-{
-    int U, V, Y;
-    uint8_t * r, * g, * b;
-    uint8_t * dst_1, * dst_2;
-
-    h_size >>= 3;
-    dst_1 = _dst_1;
-    dst_2 = _dst_2;
-
-    while (h_size--) {
+PROLOG(yuv2rgb_c_8, uint8_t)
 	RGB(0);
 	DST1(0);
 	DST2(0);
@@ -529,32 +389,12 @@ static void yuv2rgb_c_8  (uint8_t * py_1, uint8_t * py_2,
 	RGB(3);
 	DST2(3);
 	DST1(3);
-
-	pu += 4;
-	pv += 4;
-	py_1 += 8;
-	py_2 += 8;
-	dst_1 += 8;
-	dst_2 += 8;
-    }
-}
+EPILOG(8)
 
 // r, g, b, dst_1, dst_2
-static void yuv2rgb_c_8_ordered_dither  (uint8_t * py_1, uint8_t * py_2,
-			  uint8_t * pu, uint8_t * pv,
-			  void * _dst_1, void * _dst_2, int h_size, int v_pos)
-{
-    int U, V, Y;
-    uint8_t * r, * g, * b;
-    uint8_t * dst_1, * dst_2;
-
-    h_size >>= 3;
-    dst_1 = _dst_1;
-    dst_2 = _dst_2;
-
-    while (h_size--) {
-	const uint8_t *d32= dither_8x8_32[v_pos&7];
-	const uint8_t *d64= dither_8x8_73[v_pos&7];
+PROLOG(yuv2rgb_c_8_ordered_dither, uint8_t)
+	const uint8_t *d32= dither_8x8_32[y&7];
+	const uint8_t *d64= dither_8x8_73[y&7];
 #define DST1bpp8(i,o)					\
 	Y = py_1[2*i];				\
 	dst_1[2*i] = r[Y+d32[0+o]] + g[Y+d32[0+o]] + b[Y+d64[0+o]];	\
@@ -583,32 +423,12 @@ static void yuv2rgb_c_8_ordered_dither  (uint8_t * py_1, uint8_t * py_2,
 	RGB(3);
 	DST2bpp8(3,6);
 	DST1bpp8(3,6);
-
-	pu += 4;
-	pv += 4;
-	py_1 += 8;
-	py_2 += 8;
-	dst_1 += 8;
-	dst_2 += 8;
-    }
-}
+EPILOG(8)
 
 
 // This is exactly the same code as yuv2rgb_c_32 except for the types of
 // r, g, b, dst_1, dst_2
-static void yuv2rgb_c_4  (uint8_t * py_1, uint8_t * py_2,
-			  uint8_t * pu, uint8_t * pv,
-			  void * _dst_1, void * _dst_2, int h_size, int v_pos)
-{
-    int U, V, Y;
-    uint8_t * r, * g, * b;
-    uint8_t * dst_1, * dst_2;
-
-    h_size >>= 3;
-    dst_1 = _dst_1;
-    dst_2 = _dst_2;
-
-    while (h_size--) {
+PROLOG(yuv2rgb_c_4, uint8_t)
         int acc;
 #define DST1_4(i)					\
 	Y = py_1[2*i];				\
@@ -639,31 +459,11 @@ static void yuv2rgb_c_4  (uint8_t * py_1, uint8_t * py_2,
 	RGB(3);
 	DST2_4(3);
 	DST1_4(3);
+EPILOG(4)
 
-	pu += 4;
-	pv += 4;
-	py_1 += 8;
-	py_2 += 8;
-	dst_1 += 4;
-	dst_2 += 4;
-    }
-}
-
-static void yuv2rgb_c_4_ordered_dither  (uint8_t * py_1, uint8_t * py_2,
-			  uint8_t * pu, uint8_t * pv,
-			  void * _dst_1, void * _dst_2, int h_size, int v_pos)
-{
-    int U, V, Y;
-    uint8_t * r, * g, * b;
-    uint8_t * dst_1, * dst_2;
-
-    h_size >>= 3;
-    dst_1 = _dst_1;
-    dst_2 = _dst_2;
-
-    while (h_size--) {
-	const uint8_t *d64= dither_8x8_73[v_pos&7];
-	const uint8_t *d128=dither_