From ea9075f2c7f342807f37edb0d4bab57bb88aad52 Mon Sep 17 00:00:00 2001 From: gpoirier Date: Sun, 13 May 2007 19:22:32 +0000 Subject: Blackfin optimized YUV420 to RGB CSC Color Space Converters. YUV2 -> RGB BGR for 565, 555 and 888 a.k.a. 24bit color. Speed-up compared to C version compiled with -O3 187.28% Patch by Marc Hoffman %mmh A pleasantst P com% Original thread: Date: May 9, 2007 2:46 AM Subject: [FFmpeg-devel] PATCH BlackFin yuv2rgb color space conversion git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@23307 b3059339-0415-0410-9bf9-f77b7e298cf2 --- libswscale/Makefile | 3 + libswscale/internal_bfin.S | 454 ++++++++++++++++++++++++++++++++++++++++++ libswscale/swscale.c | 4 +- libswscale/swscale.h | 1 + libswscale/swscale_internal.h | 16 ++ libswscale/yuv2rgb.c | 8 + libswscale/yuv2rgb_bfin.c | 205 +++++++++++++++++++ 7 files changed, 690 insertions(+), 1 deletion(-) create mode 100644 libswscale/internal_bfin.S create mode 100644 libswscale/yuv2rgb_bfin.c diff --git a/libswscale/Makefile b/libswscale/Makefile index f6adf2a514..dac331e474 100644 --- a/libswscale/Makefile +++ b/libswscale/Makefile @@ -12,6 +12,9 @@ OBJS= swscale.o rgb2rgb.o OBJS-$(TARGET_ALTIVEC) += yuv2rgb_altivec.o OBJS-$(CONFIG_GPL) += yuv2rgb.o +OBJS-$(TARGET_ARCH_BFIN) += yuv2rgb_bfin.o +ASM_OBJS-$(TARGET_ARCH_BFIN) += internal_bfin.o + HEADERS = swscale.h rgb2rgb.h include ../common.mak diff --git a/libswscale/internal_bfin.S b/libswscale/internal_bfin.S new file mode 100644 index 0000000000..d61b07e9cb --- /dev/null +++ b/libswscale/internal_bfin.S @@ -0,0 +1,454 @@ +/* + * Copyright (C) 2007 Marc Hoffman + * April 20, 2007 + * + * Blackfin Video Color Space Converters Operations + * convert I420 YV12 to RGB in various formats, + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + + +/* + YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock + and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts + + + The following calculation is used for the conversion: + + r = clipz((y-oy)*cy + crv*(v-128)) + g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128)) + b = clipz((y-oy)*cy + cbu*(u-128)) + + y,u,v are pre scaled by a factor of 4 i.e. left shifted to gain precision. + + + New factorization to elliminate the truncation error which was + occuring due to the byteop3p. + + + 1) use the bytop16m to subtract quad bytes we use this in U8 this + then so the offsets need to be renormalized to 8bits. + + 2) scale operands up by a factor of 4 not 8 because Blackfin + multiplies include a shift. + + 3) compute into the accumulators cy*yx0, cy*yx1 + + 4) compute each of the linear equations + r = clipz((y-oy)*cy + crv*(v-128)) + + g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128)) + + b = clipz((y-oy)*cy + cbu*(u-128)) + + reuse of the accumulators requires that we actually multiply + twice once with addition and the second time with a subtaction. + + because of this we need to compute the equations in the order R B + then G saving the writes for B in the case of 24/32 bit color + formats. + + api: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, + int dW, uint32_t *coeffs); + + A B + --- --- + i2 = cb i3 = cr + i1 = coeff i0 = y + + Where coeffs have the following layout in memory. + + uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv; + + coeffs is a pointer to oy. + + the {rgb} masks are only utilized by the 565 packing algorithm. Note the data + replication is used to simplify the internal algorithms for the dual mac architecture + of BlackFin. + + All routines are exported with _ff_bfin_ as a symbol prefix + + rough performance gain compared against -O3: + + 2779809/1484290 187.28% + + which translates to ~33c/pel to ~57c/pel for the reference vs 17.5 + c/pel for the optimized implementations. Not sure why there is such a + huge variation on the reference codes on Blackfin I guess it must have + to do with the memory system. + +*/ + +#define mL1 .l1.text +#define mL3 .text +#define MEM mL1 + +#define DEFUN(fname,where,interface) \ + .section where; \ + .global _ff_bfin_ ## fname; \ + .type _ff_bfin_ ## fname, STT_FUNC; \ + .align 8; \ + _ff_bfin_ ## fname + +#define DEFUN_END(fname) \ + .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname + + +.text + +#define COEFF_LEN 11*4 +#define COEFF_REL_CY_OFF 4*4 + +#define ARG_OUT 20 +#define ARG_W 24 +#define ARG_COEFF 28 + +DEFUN(yuv2rgb565_line,MEM, + (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)): + link 0; + [--sp] = (r7:4); + p1 = [fp+ARG_OUT]; + r3 = [fp+ARG_W]; + + i0 = r0; + i2 = r1; + i3 = r2; + + r0 = [fp+ARG_COEFF]; + i1 = r0; + b1 = i1; + l1 = COEFF_LEN; + m0 = COEFF_REL_CY_OFF; + p0 = r3; + + r0 = [i0++]; // 2Y + r1.l = w[i2++]; // 2u + r1.h = w[i3++]; // 2v + p0 = p0>>2; + + lsetup (.L0565, .L1565) lc0 = p0; + + /* + uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv + r0 -- used to load 4ys + r1 -- used to load 2us,2vs + r4 -- y3,y2 + r5 -- y1,y0 + r6 -- u1,u0 + r7 -- v1,v0 + */ + r2=[i1++]; // oy +.L0565: + /* + rrrrrrrr gggggggg bbbbbbbb + 5432109876543210 + bbbbb >>3 + gggggggg <<3 + rrrrrrrr <<8 + rrrrrggggggbbbbb + */ + (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc + (r7,r6) = byteop16m (r1:0, r3:2) (r); + r5 = r5 << 2 (v); // y1,y0 + r4 = r4 << 2 (v); // y3,y2 + r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero + r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy + /* Y' = y*cy */ + a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv + + /* R = Y+ crv*(Cr-128) */ + r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); + a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask + r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu + r2 = r2 >> 3 (v); + r3 = r2 & r5; + + /* B = Y+ cbu*(Cb-128) */ + r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l); + a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask + r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu + r2 = r2 << 8 (v); + r2 = r2 & r5; + r3 = r3 | r2; + + /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ + a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv + r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); + r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask + r2 = r2 << 3 (v); + r2 = r2 & r5; + r3 = r3 | r2; + [p1++]=r3 || r1=[i1++]; // cy + + /* Y' = y*cy */ + + a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv + + /* R = Y+ crv*(Cr-128) */ + r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); + a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask + r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu + r2 = r2 >> 3 (v); + r3 = r2 & r5; + + /* B = Y+ cbu*(Cb-128) */ + r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h); + a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask + r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu + r2 = r2 << 8 (v); + r2 = r2 & r5; + r3 = r3 | r2; + + /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ + a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv + r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask + r2 = byteop3p(r3:2, r1:0)(LO) || r0 = [i0++]; // 2Y + r2 = r2 << 3 (v) || r1.l = w[i2++]; // 2u + r2 = r2 & r5; + r3 = r3 | r2; + [p1++]=r3 || r1.h = w[i3++]; // 2v +.L1565: r2=[i1++]; // oy + + l1 = 0; + + (r7:4) = [sp++]; + unlink; + rts; +DEFUN_END(yuv2rgb565_line) + +DEFUN(yuv2rgb555_line,MEM, + (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)): + link 0; + [--sp] = (r7:4); + p1 = [fp+ARG_OUT]; + r3 = [fp+ARG_W]; + + i0 = r0; + i2 = r1; + i3 = r2; + + r0 = [fp+ARG_COEFF]; + i1 = r0; + b1 = i1; + l1 = COEFF_LEN; + m0 = COEFF_REL_CY_OFF; + p0 = r3; + + r0 = [i0++]; // 2Y + r1.l = w[i2++]; // 2u + r1.h = w[i3++]; // 2v + p0 = p0>>2; + + lsetup (.L0555, .L1555) lc0 = p0; + + /* + uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv + r0 -- used to load 4ys + r1 -- used to load 2us,2vs + r4 -- y3,y2 + r5 -- y1,y0 + r6 -- u1,u0 + r7 -- v1,v0 + */ + r2=[i1++]; // oy +.L0555: + /* + rrrrrrrr gggggggg bbbbbbbb + 5432109876543210 + bbbbb >>3 + gggggggg <<2 + rrrrrrrr <<7 + xrrrrrgggggbbbbb + */ + + (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc + (r7,r6) = byteop16m (r1:0, r3:2) (r); + r5 = r5 << 2 (v); // y1,y0 + r4 = r4 << 2 (v); // y3,y2 + r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero + r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy + /* Y' = y*cy */ + a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv + + /* R = Y+ crv*(Cr-128) */ + r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); + a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask + r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu + r2 = r2 >> 3 (v); + r3 = r2 & r5; + + /* B = Y+ cbu*(Cb-128) */ + r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l); + a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask + r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu + r2 = r2 << 7 (v); + r2 = r2 & r5; + r3 = r3 | r2; + + /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ + a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv + r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); + r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask + r2 = r2 << 2 (v); + r2 = r2 & r5; + r3 = r3 | r2; + [p1++]=r3 || r1=[i1++]; // cy + + /* Y' = y*cy */ + + a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv + + /* R = Y+ crv*(Cr-128) */ + r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); + a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask + r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu + r2 = r2 >> 3 (v); + r3 = r2 & r5; + + /* B = Y+ cbu*(Cb-128) */ + r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h); + a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask + r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu + r2 = r2 << 7 (v); + r2 = r2 & r5; + r3 = r3 | r2; + + /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ + a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv + r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask + r2 = byteop3p(r3:2, r1:0)(LO) || r0=[i0++]; // 4Y + r2 = r2 << 2 (v) || r1.l=w[i2++]; // 2u + r2 = r2 & r5; + r3 = r3 | r2; + [p1++]=r3 || r1.h=w[i3++]; // 2v + +.L1555: r2=[i1++]; // oy + + l1 = 0; + + (r7:4) = [sp++]; + unlink; + rts; +DEFUN_END(yuv2rgb555_line) + +DEFUN(yuv2rgb24_line,MEM, + (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)): + link 0; + [--sp] = (r7:4); + p1 = [fp+ARG_OUT]; + r3 = [fp+ARG_W]; + p2 = p1; + p2 += 3; + + i0 = r0; + i2 = r1; + i3 = r2; + + r0 = [fp+ARG_COEFF]; // coeff buffer + i1 = r0; + b1 = i1; + l1 = COEFF_LEN; + m0 = COEFF_REL_CY_OFF; + p0 = r3; + + r0 = [i0++]; // 2Y + r1.l = w[i2++]; // 2u + r1.h = w[i3++]; // 2v + p0 = p0>>2; + + lsetup (.L0888, .L1888) lc0 = p0; + + /* + uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv + r0 -- used to load 4ys + r1 -- used to load 2us,2vs + r4 -- y3,y2 + r5 -- y1,y0 + r6 -- u1,u0 + r7 -- v1,v0 + */ + r2=[i1++]; // oy +.L0888: + (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc + (r7,r6) = byteop16m (r1:0, r3:2) (r); + r5 = r5 << 2 (v); // y1,y0 + r4 = r4 << 2 (v); // y3,y2 + r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero + r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy + + /* Y' = y*cy */ + a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv + + /* R = Y+ crv*(Cr-128) */ + r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); + a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask + r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu + r2=r2>>16 || B[p1++]=r2; + B[p2++]=r2; + + /* B = Y+ cbu*(Cb-128) */ + r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l); + a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask + r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu + + /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ + a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv + r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); + r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask, oy,cy,zero + + r2=r2>>16 || B[p1++]=r2; + B[p2++]=r2; + + r3=r3>>16 || B[p1++]=r3; + B[p2++]=r3 || r1=[i1++]; // cy + + p1+=3; + p2+=3; + /* Y' = y*cy */ + a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv + + /* R = Y+ crv*(Cr-128) */ + r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); + a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask + r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu + r2=r2>>16 || B[p1++]=r2; + B[p2++]=r2; + + /* B = Y+ cbu*(Cb-128) */ + r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h); + a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask + r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu + + /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ + a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv + r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); + r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++]; // gmask + r2=r2>>16 || B[p1++]=r2 || r0 = [i0++]; // 4y + B[p2++]=r2 || r1.l = w[i2++]; // 2u + r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v + B[p2++]=r3 || r2=[i1++]; // oy + + p1+=3; +.L1888: p2+=3; + + l1 = 0; + + (r7:4) = [sp++]; + unlink; + rts; +DEFUN_END(yuv2rgb888_line) diff --git a/libswscale/swscale.c b/libswscale/swscale.c index 583bf1781c..f6261e6461 100644 --- a/libswscale/swscale.c +++ b/libswscale/swscale.c @@ -1992,7 +1992,7 @@ SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH #endif #if !defined(RUNTIME_CPUDETECT) || !defined (CONFIG_GPL) //ensure that the flags match the compiled variant if cpudetect is off - flags &= ~(SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_MMX2|SWS_CPU_CAPS_3DNOW|SWS_CPU_CAPS_ALTIVEC); + flags &= ~(SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_MMX2|SWS_CPU_CAPS_3DNOW|SWS_CPU_CAPS_ALTIVEC|SWS_CPU_CAPS_BFIN); #ifdef HAVE_MMX2 flags |= SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_MMX2; #elif defined (HAVE_3DNOW) @@ -2001,6 +2001,8 @@ SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH flags |= SWS_CPU_CAPS_MMX; #elif defined (HAVE_ALTIVEC) flags |= SWS_CPU_CAPS_ALTIVEC; +#elif defined (ARCH_BFIN) + flags |= SWS_CPU_CAPS_BFIN; #endif #endif /* RUNTIME_CPUDETECT */ if (clip_table[512] != 255) globalInit(); diff --git a/libswscale/swscale.h b/libswscale/swscale.h index a86ef45348..adad02f5a5 100644 --- a/libswscale/swscale.h +++ b/libswscale/swscale.h @@ -74,6 +74,7 @@ extern "C" { #define SWS_CPU_CAPS_MMX2 0x20000000 #define SWS_CPU_CAPS_3DNOW 0x40000000 #define SWS_CPU_CAPS_ALTIVEC 0x10000000 +#define SWS_CPU_CAPS_BFIN 0x01000000 #define SWS_MAX_REDUCE_CUTOFF 0.002 diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h index 2d766f4746..85062ca2dc 100644 --- a/libswscale/swscale_internal.h +++ b/libswscale/swscale_internal.h @@ -162,6 +162,22 @@ typedef struct SwsContext{ #endif + +#ifdef ARCH_BFIN + uint32_t oy __attribute__((aligned(4))); + uint32_t oc __attribute__((aligned(4))); + uint32_t zero __attribute__((aligned(4))); + uint32_t cy __attribute__((aligned(4))); + uint32_t crv __attribute__((aligned(4))); + uint32_t rmask __attribute__((aligned(4))); + uint32_t cbu __attribute__((aligned(4))); + uint32_t bmask __attribute__((aligned(4))); + uint32_t cgu __attribute__((aligned(4))); + uint32_t cgv __attribute__((aligned(4))); + uint32_t gmask __attribute__((aligned(4))); +#endif + + } SwsContext; //FIXME check init (where 0) diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c index 98a3569d85..9f3f071213 100644 --- a/libswscale/yuv2rgb.c +++ b/libswscale/yuv2rgb.c @@ -611,6 +611,14 @@ SwsFunc yuv2rgb_get_func_ptr (SwsContext *c) } #endif +#ifdef ARCH_BFIN + if (c->flags & SWS_CPU_CAPS_BFIN) + { + SwsFunc t = ff_bfin_yuv2rgb_get_func_ptr (c); + if (t) return t; + } +#endif + av_log(c, AV_LOG_WARNING, "No accelerated colorspace conversion found\n"); switch(c->dstFormat){ diff --git a/libswscale/yuv2rgb_bfin.c b/libswscale/yuv2rgb_bfin.c new file mode 100644 index 0000000000..98c86c06f9 --- /dev/null +++ b/libswscale/yuv2rgb_bfin.c @@ -0,0 +1,205 @@ +/* + * Copyright (C) 2007 Marc Hoffman + * April 20, 2007 + * + * Blackfin Video Color Space Converters Operations + * convert I420 YV12 to RGB in various formats, + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include +#include +#include +#include +#include "config.h" +#ifdef HAVE_MALLOC_H +#include +#endif +#include +#include +#include "rgb2rgb.h" +#include "swscale.h" +#include "swscale_internal.h" + + +#define L1CODE __attribute__ ((l1_text)) + +extern void ff_bfin_yuv2rgb555_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out, + int w, uint32_t *coeffs) L1CODE; + +extern void ff_bfin_yuv2rgb565_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out, + int w, uint32_t *coeffs) L1CODE; + +extern void ff_bfin_yuv2rgb24_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out, + int w, uint32_t *coeffs) L1CODE; + +typedef void (* ltransform_t)(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out, + int w, uint32_t *coeffs); + + +static void bfin_prepare_coefficients (SwsContext *c, int rgb, int masks) +{ + int oy; + oy = c->yOffset&0xffff; + oy = oy >> 3; // keep everything U8.0 for offset calculation + + c->oc = 128*0x01010101U; + c->oy = oy*0x01010101U; + + /* copy 64bit vector coeffs down to 32bit vector coeffs */ + c->cy = c->yCoeff; + c->zero = 0; + + if (rgb) { + c->crv = c->vrCoeff; + c->cbu = c->ubCoeff; + c->cgu = c->ugCoeff; + c->cgv = c->vgCoeff; + } else { + c->crv = c->ubCoeff; + c->cbu = c->vrCoeff; + c->cgu = c->vgCoeff; + c->cgv = c->ugCoeff; + } + + + if (masks == 555) { + c->rmask = 0x001f * 0x00010001U; + c->gmask = 0x03e0 * 0x00010001U; + c->bmask = 0x7c00 * 0x00010001U; + } else if (masks == 565) { + c->rmask = 0x001f * 0x00010001U; + c->gmask = 0x07e0 * 0x00010001U; + c->bmask = 0xf800 * 0x00010001U; + } +} + +static int core_yuv420_rgb (SwsContext *c, + uint8_t **in, int *instrides, + int srcSliceY, int srcSliceH, + uint8_t **oplanes, int *outstrides, + ltransform_t lcscf, int rgb, int masks) +{ + uint8_t *py,*pu,*pv,*op; + int w = instrides[0]; + int h2 = srcSliceH>>1; + int i; + + bfin_prepare_coefficients (c, rgb, masks); + + py = in[0]; + pu = in[1+(1^rgb)]; + pv = in[1+(0^rgb)]; + + op = oplanes[0] + srcSliceY*outstrides[0]; + + for (i=0;ioy); + + py += instrides[0]; + op += outstrides[0]; + + lcscf (py,pu,pv,op,w,&c->oy); + + py += instrides[0]; + pu += instrides[1]; + pv += instrides[2]; + op += outstrides[0]; + } + + return srcSliceH; +} + + +static int bfin_yuv420_rgb555 (SwsContext *c, + uint8_t **in, int *instrides, + int srcSliceY, int srcSliceH, + uint8_t **oplanes, int *outstrides) +{ + return core_yuv420_rgb (c,in,instrides,srcSliceY,srcSliceH,oplanes,outstrides, + ff_bfin_yuv2rgb555_line, 1, 555); +} + +static int bfin_yuv420_bgr555 (SwsContext *c, + uint8_t **in, int *instrides, + int srcSliceY, int srcSliceH, + uint8_t **oplanes, int *outstrides) +{ + return core_yuv420_rgb (c,in,instrides,srcSliceY,srcSliceH,oplanes,outstrides, + ff_bfin_yuv2rgb555_line, 0, 555); +} + +static int bfin_yuv420_rgb24 (SwsContext *c, + uint8_t **in, int *instrides, + int srcSliceY, int srcSliceH, + uint8_t **oplanes, int *outstrides) +{ + return core_yuv420_rgb (c,in,instrides,srcSliceY,srcSliceH,oplanes,outstrides, + ff_bfin_yuv2rgb24_line, 1, 888); +} + +static int bfin_yuv420_bgr24 (SwsContext *c, + uint8_t **in, int *instrides, + int srcSliceY, int srcSliceH, + uint8_t **oplanes, int *outstrides) +{ + return core_yuv420_rgb (c,in,instrides,srcSliceY,srcSliceH,oplanes,outstrides, + ff_bfin_yuv2rgb24_line, 0, 888); +} + +static int bfin_yuv420_rgb565 (SwsContext *c, + uint8_t **in, int *instrides, + int srcSliceY, int srcSliceH, + uint8_t **oplanes, int *outstrides) +{ + return core_yuv420_rgb (c,in,instrides,srcSliceY,srcSliceH,oplanes,outstrides, + ff_bfin_yuv2rgb565_line, 1, 565); +} + +static int bfin_yuv420_bgr565 (SwsContext *c, + uint8_t **in, int *instrides, + int srcSliceY, int srcSliceH, + uint8_t **oplanes, int *outstrides) +{ + return core_yuv420_rgb (c,in,instrides,srcSliceY,srcSliceH,oplanes,outstrides, + ff_bfin_yuv2rgb565_line, 0, 565); +} + + +SwsFunc ff_bfin_yuv2rgb_get_func_ptr (SwsContext *c) +{ + SwsFunc f; + + switch(c->dstFormat) { + case PIX_FMT_RGB555: f = bfin_yuv420_rgb555; break; + case PIX_FMT_BGR555: f = bfin_yuv420_bgr555; break; + case PIX_FMT_RGB565: f = bfin_yuv420_rgb565; break; + case PIX_FMT_BGR565: f = bfin_yuv420_bgr565; break; + case PIX_FMT_RGB24: f = bfin_yuv420_rgb24; break; + case PIX_FMT_BGR24: f = bfin_yuv420_bgr24; break; + default: + return 0; + } + + av_log(c, AV_LOG_INFO, "BlackFin Accelerated Color Space Converter %s\n", + sws_format_name (c->dstFormat)); + + return f; +} -- cgit v1.2.3