3 files changed, 272 insertions, 8 deletions
diff --git a/libvo/d3d_shader_yuv.hlsl b/libvo/d3d_shader_yuv.hlsl
index 9d46e536fc..b17e257210 100644
--- a/libvo/d3d_shader_yuv.hlsl
+++ b/libvo/d3d_shader_yuv.hlsl
@@ -1,20 +1,44 @@
 // Compile with:
 // fxc.exe /Tps_2_0 /Fhd3d_shader_yuv.h d3d_shader_yuv.hlsl /Vnd3d_shader_yuv
+// fxc.exe /Tps_2_0 /Fhd3d_shader_yuv_2ch.h d3d_shader_yuv.hlsl /Vnd3d_shader_yuv_2ch /DUSE_2CH=1
+
+// Be careful with this shader. Since we don't load it with D3DX, constants
+// can't be looked up by name: every uniform variable must be mapped to a
+// hardcoded constant slot, and the C code has to set exactly that slot.
 
 sampler2D tex0 : register(s0);
 sampler2D tex1 : register(s1);
 sampler2D tex2 : register(s2);
 
 uniform float4x4 colormatrix : register(c0);
+uniform float2 depth : register(c5);
+
+#ifdef USE_2CH
+
+float1 sample(sampler2D tex, float2 t)
+{
+    // Sample from A8L8 as if it were a single L16 value: .x carries the low
+    // byte and .w the high byte; dot() with the depth weights merges them.
+    return dot(tex2D(tex, t).xw, depth);
+}
+
+#else
+
+float1 sample(sampler2D tex, float2 t)
+{
+    return tex2D(tex, t).x;  // single channel texture, the value is in .x
+}
+
+#endif
 
 float4 main(float2 t0 : TEXCOORD0,
             float2 t1 : TEXCOORD1,
             float2 t2 : TEXCOORD2)
             : COLOR
 {
-    float4 c = float4(tex2D(tex0, t0).x,
-                      tex2D(tex1, t1).x,
-                      tex2D(tex2, t2).x,
+    float4 c = float4(sample(tex0, t0),
+                      sample(tex1, t1),
+                      sample(tex2, t2),
                       1);
     return mul(c, colormatrix);
 }
diff --git a/libvo/d3d_shader_yuv_2ch.h b/libvo/d3d_shader_yuv_2ch.h
new file mode 100644
index 0000000000..45dcc73992
--- /dev/null
+++ b/libvo/d3d_shader_yuv_2ch.h
@@ -0,0 +1,170 @@
+#if 0
+//
+// Generated by Microsoft (R) HLSL Shader Compiler 9.27.952.3022
+//
+//   fxc /Tps_2_0 /Fhz:\tmp\mplayer\libvo\d3d_shader_yuv_2ch.h
+//    z:\tmp\mplayer\libvo\d3d_shader_yuv.hlsl /Vnd3d_shader_yuv_2ch
+//    /DUSE_2CH=1
+//
+//
+// Parameters:
+//
+//   float4x4 colormatrix;
+//   float2 depth;
+//   sampler2D tex0;
+//   sampler2D tex1;
+//   sampler2D tex2;
+//
+//
+// Registers:
+//
+//   Name         Reg   Size
+//   ------------ ----- ----
+//   colormatrix  c0       4
+//   depth        c5       1
+//   tex0         s0       1
+//   tex1         s1       1
+//   tex2         s2       1
+//
+
+    ps_2_0
+    def c4, 1, 0, 0, 0
+    dcl t0.xy
+    dcl t1.xy
+    dcl t2.xy
+    dcl_2d s0
+    dcl_2d s1
+    dcl_2d s2
+    texld r0, t0, s0
+    texld r1, t1, s1
+    texld r2, t2, s2
+    mul r0.x, r0.x, c5.x
+    mad r0.x, r0.w, c5.y, r0.x
+    mul r1.x, r1.x, c5.x
+    mad r0.y, r1.w, c5.y, r1.x
+    mul r1.x, r2.x, c5.x
+    mad r0.z, r2.w, c5.y, r1.x
+    mov r0.w, c4.x
+    dp4 r1.x, r0, c0
+    dp4 r1.y, r0, c1
+    dp4 r1.z, r0, c2
+    dp4 r1.w, r0, c3
+    mov oC0, r1
+
+// approximately 15 instruction slots used (3 texture, 12 arithmetic)
+#endif
+
+const BYTE d3d_shader_yuv_2ch[] =
+{
+      0,   2, 255, 255, 254, 255, 
+     78,   0,  67,  84,  65,  66, 
+     28,   0,   0,   0,   3,   1, 
+      0,   0,   0,   2, 255, 255, 
+      5,   0,   0,   0,  28,   0, 
+      0,   0,   0,   1,   0,   0, 
+    252,   0,   0,   0, 128,   0, 
+      0,   0,   2,   0,   0,   0, 
+      4,   0,   2,   0, 140,   0, 
+      0,   0,   0,   0,   0,   0, 
+    156,   0,   0,   0,   2,   0, 
+      5,   0,   1,   0,  22,   0, 
+    164,   0,   0,   0,   0,   0, 
+      0,   0, 180,   0,   0,   0, 
+      3,   0,   0,   0,   1,   0, 
+      2,   0, 188,   0,   0,   0, 
+      0,   0,   0,   0, 204,   0, 
+      0,   0,   3,   0,   1,   0, 
+      1,   0,   6,   0, 212,   0, 
+      0,   0,   0,   0,   0,   0, 
+    228,   0,   0,   0,   3,   0, 
+      2,   0,   1,   0,  10,   0, 
+    236,   0,   0,   0,   0,   0, 
+      0,   0,  99, 111, 108, 111, 
+    114, 109,  97, 116, 114, 105, 
+    120,   0,   3,   0,   3,   0, 
+      4,   0,   4,   0,   1,   0, 
+      0,   0,   0,   0,   0,   0, 
+    100, 101, 112, 116, 104,   0, 
+    171, 171,   1,   0,   3,   0, 
+      1,   0,   2,   0,   1,   0, 
+      0,   0,   0,   0,   0,   0, 
+    116, 101, 120,  48,   0, 171, 
+    171, 171,   4,   0,  12,   0, 
+      1,   0,   1,   0,   1,   0, 
+      0,   0,   0,   0,   0,   0, 
+    116, 101, 120,  49,   0, 171, 
+    171, 171,   4,   0,  12,   0, 
+      1,   0,   1,   0,   1,   0, 
+      0,   0,   0,   0,   0,   0, 
+    116, 101, 120,  50,   0, 171, 
+    171, 171,   4,   0,  12,   0, 
+      1,   0,   1,   0,   1,   0, 
+      0,   0,   0,   0,   0,   0, 
+    112, 115,  95,  50,  95,  48, 
+      0,  77, 105,  99, 114, 111, 
+    115, 111, 102, 116,  32,  40, 
+     82,  41,  32,  72,  76,  83, 
+     76,  32,  83, 104,  97, 100, 
+    101, 114,  32,  67, 111, 109, 
+    112, 105, 108, 101, 114,  32, 
+     57,  46,  50,  55,  46,  57, 
+     53,  50,  46,  51,  48,  50, 
+     50,   0,  81,   0,   0,   5, 
+      4,   0,  15, 160,   0,   0, 
+    128,  63,   0,   0,   0,   0, 
+      0,   0,   0,   0,   0,   0, 
+      0,   0,  31,   0,   0,   2, 
+      0,   0,   0, 128,   0,   0, 
+      3, 176,  31,   0,   0,   2, 
+      0,   0,   0, 128,   1,   0, 
+      3, 176,  31,   0,   0,   2, 
+      0,   0,   0, 128,   2,   0, 
+      3, 176,  31,   0,   0,   2, 
+      0,   0,   0, 144,   0,   8, 
+     15, 160,  31,   0,   0,   2, 
+      0,   0,   0, 144,   1,   8, 
+     15, 160,  31,   0,   0,   2, 
+      0,   0,   0, 144,   2,   8, 
+     15, 160,  66,   0,   0,   3, 
+      0,   0,  15, 128,   0,   0, 
+    228, 176,   0,   8, 228, 160, 
+     66,   0,   0,   3,   1,   0, 
+     15, 128,   1,   0, 228, 176, 
+      1,   8, 228, 160,  66,   0, 
+      0,   3,   2,   0,  15, 128, 
+      2,   0, 228, 176,   2,   8, 
+    228, 160,   5,   0,   0,   3, 
+      0,   0,   1, 128,   0,   0, 
+      0, 128,   5,   0,   0, 160, 
+      4,   0,   0,   4,   0,   0, 
+      1, 128,   0,   0, 255, 128, 
+      5,   0,  85, 160,   0,   0, 
+      0, 128,   5,   0,   0,   3, 
+      1,   0,   1, 128,   1,   0, 
+      0, 128,   5,   0,   0, 160, 
+      4,   0,   0,   4,   0,   0, 
+      2, 128,   1,   0, 255, 128, 
+      5,   0,  85, 160,   1,   0, 
+      0, 128,   5,   0,   0,   3, 
+      1,   0,   1, 128,   2,   0, 
+      0, 128,   5,   0,   0, 160, 
+      4,   0,   0,   4,   0,   0, 
+      4, 128,   2,   0, 255, 128, 
+      5,   0,  85, 160,   1,   0, 
+      0, 128,   1,   0,   0,   2, 
+      0,   0,   8, 128,   4,   0, 
+      0, 160,   9,   0,   0,   3, 
+      1,   0,   1, 128,   0,   0, 
+    228, 128,   0,   0, 228, 160, 
+      9,   0,   0,   3,   1,   0, 
+      2, 128,   0,   0, 228, 128, 
+      1,   0, 228, 160,   9,   0, 
+      0,   3,   1,   0,   4, 128, 
+      0,   0, 228, 128,   2,   0, 
+    228, 160,   9,   0,   0,   3, 
+      1,   0,   8, 128,   0,   0, 
+    228, 128,   3,   0, 228, 160, 
+      1,   0,   0,   2,   0,   8, 
+     15, 128,   1,   0, 228, 128, 
+    255, 255,   0,   0
+};
diff --git a/libvo/vo_direct3d.c b/libvo/vo_direct3d.c
index 95bff26454..fdecee8c94 100644
--- a/libvo/vo_direct3d.c
+++ b/libvo/vo_direct3d.c
@@ -48,13 +48,15 @@
 
 // shaders generated by fxc.exe from d3d_shader_yuv.hlsl
 #include "d3d_shader_yuv.h"
+#include "d3d_shader_yuv_2ch.h"
 
 
 // TODO: beg someone to add this (there is already IMGFMT_Y8)
 // equals MAKEFOURCC('Y', '1', '6', ' ')
 #define IMGFMT_Y16 0x20363159
+#define IMGFMT_A8Y8 MAKEFOURCC('A', '8', 'Y', '8')
 
-#define IMGFMT_IS_Y(x) ((x) == IMGFMT_Y8 || (x) == IMGFMT_Y16)
+#define IMGFMT_IS_Y(x) ((x) == IMGFMT_Y8 || (x) == IMGFMT_Y16 || (x) == IMGFMT_A8Y8)
 #define IMGFMT_Y_DEPTH(x) ((x) == IMGFMT_Y8 ? 8 : 16)
 
 #define DEVTYPE D3DDEVTYPE_HAL
@@ -132,6 +134,7 @@ typedef struct d3d_priv {
     int opt_texture_memory;
     int opt_swap_discard;
     int opt_exact_backbuffer;
+    int opt_16bit_textures;
 
     struct vo *vo;
 
@@ -151,6 +154,7 @@ typedef struct d3d_priv {
                                 StretchRect */
     bool use_shaders;           /**< use shader for YUV color conversion
                                 (or possibly for RGB video equalizers) */
+    bool use_2ch_hack;          /**< 2 byte YUV formats use 2 channel hack */
 
     int plane_count;
     struct texplane planes[3];
@@ -190,6 +194,7 @@ typedef struct d3d_priv {
     int max_texture_height;         /**< from the device capabilities */
 
     D3DMATRIX d3d_colormatrix;
+    float d3d_depth_vector[4];
     struct mp_csp_details colorspace;
     struct mp_csp_equalizer video_eq;
 
@@ -224,6 +229,7 @@ static const struct fmt_entry fmt_table[] = {
     // grayscale (can be considered both packed and planar)
     {IMGFMT_Y8,    D3DFMT_L8},
     {IMGFMT_Y16,   D3DFMT_L16},
+    {IMGFMT_A8Y8,  D3DFMT_A8L8},
     {0},
 };
 
@@ -1013,6 +1019,9 @@ static uint32_t d3d_draw_frame(d3d_priv *priv)
             IDirect3DDevice9_SetPixelShaderConstantF(priv->d3d_device, 0,
                                                      &priv->d3d_colormatrix._11,
                                                      4);
+            IDirect3DDevice9_SetPixelShaderConstantF(priv->d3d_device, 5,
+                                                     priv->d3d_depth_vector,
+                                                     1);
         }
 
         IDirect3DDevice9_SetFVF(priv->d3d_device, D3DFVF_VIDEO_VERTEX);
@@ -1130,7 +1139,14 @@ static D3DFORMAT check_shader_conversion(d3d_priv *priv, uint32_t fmt)
     bool is_8bit = component_bits == 8;
     if (!is_8bit && priv->opt_only_8bit)
         return 0;
-    return check_format(priv, is_8bit ? IMGFMT_Y8 : IMGFMT_Y16, true);
+    int texfmt = IMGFMT_Y8;
+    if (!is_8bit) {
+        if (priv->opt_16bit_textures)
+            texfmt = IMGFMT_Y16;
+        else
+            texfmt = IMGFMT_A8Y8;
+    }
+    return check_format(priv, texfmt, true);
 }
 
 // Return if the image format can be used. If it can, decide which rendering
@@ -1165,6 +1181,7 @@ static bool init_rendering_mode(d3d_priv *priv, uint32_t fmt, bool initialize)
 
     priv->use_shaders = false;
     priv->use_textures = false;
+    priv->use_2ch_hack = false;
     priv->movie_src_fmt = 0;
     priv->pixel_shader_data = NULL;
     priv->plane_count = 0;
@@ -1209,7 +1226,14 @@ static bool init_rendering_mode(d3d_priv *priv, uint32_t fmt, bool initialize)
                     planes[n].clearval = get_chroma_clear_val(component_bits);
                 }
             }
-            priv->pixel_shader_data = d3d_shader_yuv;
+            if (shader_d3dfmt != D3DFMT_A8L8) {
+                priv->pixel_shader_data = d3d_shader_yuv;
+            } else {
+                mp_msg(MSGT_VO, MSGL_WARN, "<vo_direct3d>Using YUV 2ch hack.\n");
+
+                priv->pixel_shader_data = d3d_shader_yuv_2ch;
+                priv->use_2ch_hack = true;
+            }
         }
 
         for (n = 0; n < priv->plane_count; n++) {
@@ -1253,6 +1277,30 @@ static int query_format(d3d_priv *priv, uint32_t movie_fmt)
  *                                                                          *
  ****************************************************************************/
 
+static void get_2ch_depth_multiplier(int depth, float *out_f1, float *out_f2) {
+    // Compute the weights (out_f1, out_f2) for the low/high byte channels of
+    // a two channel texture that stores one depth-bit value per texel.
+    //  The suffixes _i8 and _i16 denote 8/16 bit fixed point values.
+    //  The suffix _f denotes a float, ideally in the range 0.0-1.0.
+    //  c_i8 is a two component vector, sampled from a two channel texture
+    //  (c_i8.x is the low byte, c_i8.y is the high byte).
+    //  r_f is the resulting color scalar value.
+    //  c_i8 = c_f * (2^8-1)
+    //  r_i16 = c_i8.x + c_i8.y * 2^8
+    //  r_f = r_i16 / (2^16-1)
+    //      = c_f.x * (2^8-1) / (2^16-1) + c_f.y * (2^8-1) * 2^8 / (2^16-1)
+    //      = c_f.x * ((2^8-1) / (2^16-1)) + c_f.y * (2^8 * ((2^8-1) / (2^16-1)))
+    // out = ((2^8-1) / (2^16-1),  2^8 * ((2^8-1) / (2^16-1)))
+    // The resulting color is r_f = dot(c_f, out).
+    // The same holds for other bit depths, such as 10 bit: (2^depth-1) is
+    // then the maximum possible value, so r_i16 has to be divided by it
+    // instead, i.e. the factor (2^16-1) in the formula above is replaced
+    // with (2^depth-1). depth must be > 0 and (1 << depth) must fit an int.
+    float factor = (float)((1 << 8) - 1) / (float)((1 << depth) - 1);
+    *out_f1 = factor;
+    *out_f2 = 256.0 * factor;
+}
+
 static void update_colorspace(d3d_priv *priv)
 {
     float coeff[3][4];
@@ -1260,8 +1308,19 @@ static void update_colorspace(d3d_priv *priv)
     mp_csp_copy_equalizer_values(&csp, &priv->video_eq);
 
     if (priv->use_shaders) {
-        csp.input_bits = priv->planes[0].bits_per_pixel;
-        csp.texture_bits = (csp.input_bits + 7) & ~7;
+        if (!priv->use_2ch_hack) {
+            csp.input_bits = priv->planes[0].bits_per_pixel;
+            csp.texture_bits = (csp.input_bits + 7) & ~7;
+        } else {
+            float f1, f2;
+            get_2ch_depth_multiplier(priv->planes[0].bits_per_pixel, &f1, &f2);
+            priv->d3d_depth_vector[0] = f1;
+            priv->d3d_depth_vector[1] = f2;
+            priv->d3d_depth_vector[2] = priv->d3d_depth_vector[3] = 0;
+            // no change
+            csp.input_bits = 8;
+            csp.texture_bits = 8;
+        }
     }
 
     mp_get_yuv2rgb_coeffs(&csp, coeff);
@@ -1314,6 +1373,14 @@ const char *options_help_text = "-vo direct3d command line help:\n"
 "        Might be slower too, as it must (?) clear every frame.\n"
 "    exact-backbuffer\n"
 "        Always resize the backbuffer to window size.\n"
+"    no16bit-textures\n"
+"        Don't use textures with a 16 bit color channel for YUV formats that\n"
+"        use more than 8 bits per component. Instead, use D3DFMT_A8L8 textures\n"
+"        and compute the values sampled from the 2 channels back into one.\n"
+"        Might be slower, since the shader becomes slightly more complicated.\n"
+"        Might work better, if your drivers either don't support D3DFMT_L16,\n"
+"        or if either the texture unit or the shaders don't operate in at least\n"
+"        16 bit precision.\n"
 "";
 
 /** @brief libvo Callback: Preinitialize the video card.
@@ -1331,6 +1398,8 @@ static int preinit_internal(struct vo *vo, const char *arg, bool allow_shaders)
     *priv = (d3d_priv) {
         .vo = vo,
 
+        .opt_16bit_textures = true,
+
         .colorspace = MP_CSP_DETAILS_DEFAULTS,
         .video_eq = { MP_CSP_EQ_CAPS_COLORMATRIX },
     };
@@ -1351,6 +1420,7 @@ static int preinit_internal(struct vo *vo, const char *arg, bool allow_shaders)
         {"texture-memory", OPT_ARG_INT, &priv->opt_texture_memory},
         {"swap-discard", OPT_ARG_BOOL, &priv->opt_swap_discard},
         {"exact-backbuffer", OPT_ARG_BOOL, &priv->opt_exact_backbuffer},
+        {"16bit-textures", OPT_ARG_BOOL, &priv->opt_16bit_textures},
         {NULL}
     };
     if (subopt_parse(arg, subopts) != 0) {