From 0b6eb24b9a8034287f67f800fc61d07b7f018891 Mon Sep 17 00:00:00 2001
From: arpi <arpi@b3059339-0415-0410-9bf9-f77b7e298cf2>
Date: Sun, 6 Apr 2003 16:36:02 +0000
Subject: Importing libmpeg2 from mpeg2dec-0.3.1

git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@9853 b3059339-0415-0410-9bf9-f77b7e298cf2
---
 libmpeg2/Makefile           |    5 +-
 libmpeg2/attributes.h       |   14 +-
 libmpeg2/header.c           |  714 +++++++++++++++++-----
 libmpeg2/idct.c             |  416 +++++++------
 libmpeg2/idct_mlib.c        |   25 +-
 libmpeg2/idct_mmx.c         |  605 +++++++++++--------
 libmpeg2/mm_accel.h         |   30 -
 libmpeg2/mmx.h              |   26 +-
 libmpeg2/motion_comp.c      |  142 ++---
 libmpeg2/motion_comp_mlib.c |  148 ++---
 libmpeg2/motion_comp_mmx.c  |  530 ++++++++---------
 libmpeg2/mpeg2.h            |  176 ++++--
 libmpeg2/mpeg2_internal.h   |  287 +++++----
 libmpeg2/slice.c            | 1368 +++++++++++++++++++++----------------------
 libmpeg2/sse.h              |  256 --------
 libmpeg2/stats.c            |  315 ----------
 libmpeg2/vlc.h              |   59 +-
 17 files changed, 2605 insertions(+), 2511 deletions(-)
 delete mode 100644 libmpeg2/mm_accel.h
 delete mode 100644 libmpeg2/sse.h
 delete mode 100644 libmpeg2/stats.c

(limited to 'libmpeg2')

diff --git a/libmpeg2/Makefile b/libmpeg2/Makefile
index 914b41844d..6ee925ddb9 100644
--- a/libmpeg2/Makefile
+++ b/libmpeg2/Makefile
@@ -3,9 +3,8 @@ LIBNAME = libmpeg2.a
 
 include ../config.mak
 
-SRCS	= header.c idct.c idct_mmx.c idct_mlib.c \
-		motion_comp.c motion_comp_mmx.c motion_comp_mlib.c \
-		slice.c stats.c # decode.c
+SRCS	= alloc.c cpu_accel.c cpu_state.c decode.c header.c idct.c idct_alpha.c idct_altivec.c idct_mlib.c idct_mmx.c motion_comp.c motion_comp_alpha.c motion_comp_altivec.c motion_comp_mlib.c motion_comp_mmx.c slice.c
+
 OBJS	= $(SRCS:.c=.o)
 INCLUDE = -I. -I../libvo -I.. $(EXTRA_INC) $(MLIB_INC)
 CFLAGS  = $(OPTFLAGS) $(INCLUDE) -DMPG12PLAY
diff --git a/libmpeg2/attributes.h b/libmpeg2/attributes.h
index ab7105c2df..96a86b26c0 100644
--- a/libmpeg2/attributes.h
+++ b/libmpeg2/attributes.h
@@ -1,8 +1,10 @@
 /*
  * attributes.h
- * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
  *
  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
  *
  * mpeg2dec is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -21,7 +23,15 @@
 
 /* use gcc attribs to align critical data structures */
 #ifdef ATTRIBUTE_ALIGNED_MAX
-#define ATTR_ALIGN(align) __attribute__ ((__aligned__ ((ATTRIBUTE_ALIGNED_MAX < (align)) ? ATTRIBUTE_ALIGNED_MAX : (align))))
+#define ATTR_ALIGN(align) __attribute__ ((__aligned__ ((ATTRIBUTE_ALIGNED_MAX < (align)) ? ATTRIBUTE_ALIGNED_MAX : (align))))	/* (align) stays parenthesized so expression arguments expand safely */
 #else
 #define ATTR_ALIGN(align)
 #endif
+
+#ifdef HAVE_BUILTIN_EXPECT
+#define likely(x) __builtin_expect ((x) != 0, 1)
+#define unlikely(x) __builtin_expect ((x) != 0, 0)
+#else
+#define likely(x) (x)
+#define unlikely(x) (x)
+#endif
diff --git a/libmpeg2/header.c b/libmpeg2/header.c
index 68483a71c1..548d6bf21e 100644
--- a/libmpeg2/header.c
+++ b/libmpeg2/header.c
@@ -1,8 +1,10 @@
 /*
- * slice.c
- * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ * header.c
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
  *
  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
  *
  * mpeg2dec is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -22,13 +24,23 @@
 #include "config.h"
 
 #include <inttypes.h>
-#include <stdio.h>
+#include <stdlib.h>	/* defines NULL */
+#include <string.h>	/* memcmp */
 
+#include "mpeg2.h"
 #include "mpeg2_internal.h"
+#include "convert.h"
 #include "attributes.h"
 
+#define SEQ_EXT 2
+#define SEQ_DISPLAY_EXT 4
+#define QUANT_MATRIX_EXT 8
+#define COPYRIGHT_EXT 0x10
+#define PIC_DISPLAY_EXT 0x80
+#define PIC_CODING_EXT 0x100
+
 /* default intra quant matrix, in zig-zag order */
-static uint8_t default_intra_quantizer_matrix[64] ATTR_ALIGN(16) = {
+static const uint8_t default_intra_quantizer_matrix[64] ATTR_ALIGN(16) = {
     8,
     16, 16,
     19, 16, 19,
@@ -46,214 +58,634 @@ static uint8_t default_intra_quantizer_matrix[64] ATTR_ALIGN(16) = {
     83
 };
 
-uint8_t scan_norm[64] ATTR_ALIGN(16) =
-{
+uint8_t mpeg2_scan_norm[64] ATTR_ALIGN(16) = {
     /* Zig-Zag scan pattern */
-     0, 1, 8,16, 9, 2, 3,10,
-    17,24,32,25,18,11, 4, 5,
-    12,19,26,33,40,48,41,34,
-    27,20,13, 6, 7,14,21,28,
-    35,42,49,56,57,50,43,36,
-    29,22,15,23,30,37,44,51,
-    58,59,52,45,38,31,39,46,
-    53,60,61,54,47,55,62,63
+     0,  1,  8, 16,  9,  2,  3, 10, 17, 24, 32, 25, 18, 11,  4,  5,
+    12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13,  6,  7, 14, 21, 28,
+    35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
+    58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63
 };
 
-uint8_t scan_alt[64] ATTR_ALIGN(16) =
-{
+uint8_t mpeg2_scan_alt[64] ATTR_ALIGN(16) = {
     /* Alternate scan pattern */
-    0,8,16,24,1,9,2,10,17,25,32,40,48,56,57,49,
-    41,33,26,18,3,11,4,12,19,27,34,42,50,58,35,43,
-    51,59,20,28,5,13,6,14,21,29,36,44,52,60,37,45,
-    53,61,22,30,7,15,23,31,38,46,54,62,39,47,55,63
+     0, 8,  16, 24,  1,  9,  2, 10, 17, 25, 32, 40, 48, 56, 57, 49,
+    41, 33, 26, 18,  3, 11,  4, 12, 19, 27, 34, 42, 50, 58, 35, 43,
+    51, 59, 20, 28,  5, 13,  6, 14, 21, 29, 36, 44, 52, 60, 37, 45,
+    53, 61, 22, 30,  7, 15, 23, 31, 38, 46, 54, 62, 39, 47, 55, 63
 };
 
-void header_state_init (picture_t * picture)
+void mpeg2_header_state_init (mpeg2dec_t * mpeg2dec)
 {
-    picture->scan = scan_norm;
+    mpeg2dec->decoder.scan = mpeg2_scan_norm;
+    mpeg2dec->picture = mpeg2dec->pictures;
+    mpeg2dec->fbuf[0] = &mpeg2dec->fbuf_alloc[0].fbuf;
+    mpeg2dec->fbuf[1] = &mpeg2dec->fbuf_alloc[1].fbuf;
+    mpeg2dec->fbuf[2] = &mpeg2dec->fbuf_alloc[2].fbuf;
+    mpeg2dec->first = 1;
+    mpeg2dec->alloc_index = 0;
+    mpeg2dec->alloc_index_user = 0;
 }
 
-int header_process_sequence_header (picture_t * picture, uint8_t * buffer)
+static void reset_info (mpeg2_info_t * info)	/* NULL the picture, frame-buffer and user-data fields of *info; info->sequence is not touched */
 {
+    info->current_picture = info->current_picture_2nd = NULL;	/* both current-picture slots */
+    info->display_picture = info->display_picture_2nd = NULL;	/* both display-picture slots */
+    info->current_fbuf = info->display_fbuf = info->discard_fbuf = NULL;	/* all three frame-buffer pointers */
+    info->user_data = NULL;	info->user_data_len = 0;	/* pointer and length are cleared together */
+}
+
+int mpeg2_header_sequence (mpeg2dec_t * mpeg2dec)
+{
+    uint8_t * buffer = mpeg2dec->chunk_start;
+    sequence_t * sequence = &(mpeg2dec->new_sequence);
+    decoder_t * decoder = &(mpeg2dec->decoder);
+    static const unsigned int frame_period[9] = {	/* read-only table indexed by buffer[3] & 15; presumably 27 MHz ticks per frame (27000000 / 1080000 == 25) - confirm */
+	0, 1126125, 1125000, 1080000, 900900, 900000, 540000, 450450, 450000
+    };
     int width, height;
     int i;
 
-    if ((buffer[6] & 0x20) != 0x20){
-	printf("missing marker bit!\n");
-	return 1;	/* missing marker_bit */
-    }
+    if ((buffer[6] & 0x20) != 0x20)	/* missing marker_bit */
+	return 1;
 
-    height = (buffer[0] << 16) | (buffer[1] << 8) | buffer[2];
+    i = (buffer[0] << 16) | (buffer[1] << 8) | buffer[2];
+    sequence->display_width = sequence->picture_width = width = i >> 12;
+    sequence->display_height = sequence->picture_height = height = i & 0xfff;
+    decoder->width = sequence->width = width = (width + 15) & ~15;
+    decoder->height = sequence->height = height = (height + 15) & ~15;
+    decoder->vertical_position_extension = (height > 2800);
+    sequence->chroma_width = width >> 1;
+    sequence->chroma_height = height >> 1;
 
-    picture->display_picture_width = (height >> 12);
-    picture->display_picture_height = (height & 0xfff);
+    sequence->flags = SEQ_FLAG_PROGRESSIVE_SEQUENCE;
 
-    width = ((height >> 12) + 15) & ~15;
-    height = ((height & 0xfff) + 15) & ~15;
+    sequence->pixel_width = buffer[3] >> 4;	/* aspect ratio */
+    sequence->frame_period = 0;
+    if ((buffer[3] & 15) < 9)
+	sequence->frame_period = frame_period[buffer[3] & 15];
 
-    if ((width > 768) || (height > 576)){
-	printf("size restrictions for MP@ML or MPEG1 exceeded! (%dx%d)\n",width,height);
-//	return 1;	/* size restrictions for MP@ML or MPEG1 */
-    }
-    
-    picture->coded_picture_width = width;
-    picture->coded_picture_height = height;
+    sequence->byte_rate = (buffer[4]<<10) | (buffer[5]<<2) | (buffer[6]>>6);
 
-    /* this is not used by the decoder */
-    picture->aspect_ratio_information = buffer[3] >> 4;
-    picture->frame_rate_code = buffer[3] & 15;
-    picture->bitrate = (buffer[4]<<10)|(buffer[5]<<2)|(buffer[6]>>6);
+    sequence->vbv_buffer_size = ((buffer[6]<<16)|(buffer[7]<<8))&0x1ff800;
+
+    if (buffer[7] & 4)
+	sequence->flags |= SEQ_FLAG_CONSTRAINED_PARAMETERS;
 
     if (buffer[7] & 2) {
 	for (i = 0; i < 64; i++)
-	    picture->intra_quantizer_matrix[scan_norm[i]] =
+	    decoder->intra_quantizer_matrix[mpeg2_scan_norm[i]] =
 		(buffer[i+7] << 7) | (buffer[i+8] >> 1);
 	buffer += 64;
-    } else {
+    } else
 	for (i = 0; i < 64; i++)
-	    picture->intra_quantizer_matrix[scan_norm[i]] =
+	    decoder->intra_quantizer_matrix[mpeg2_scan_norm[i]] =
 		default_intra_quantizer_matrix [i];
-    }
 
-    if (buffer[7] & 1) {
+    if (buffer[7] & 1)
 	for (i = 0; i < 64; i++)
-	    picture->non_intra_quantizer_matrix[scan_norm[i]] =
+	    decoder->non_intra_quantizer_matrix[mpeg2_scan_norm[i]] =
 		buffer[i+8];
-    } else {
+    else
 	for (i = 0; i < 64; i++)
-	    picture->non_intra_quantizer_matrix[i] = 16;
+	    decoder->non_intra_quantizer_matrix[i] = 16;
+
+    sequence->profile_level_id = 0x80;
+    sequence->colour_primaries = 1;
+    sequence->transfer_characteristics = 1;
+    sequence->matrix_coefficients = 1;
+
+    decoder->mpeg1 = 1;
+    decoder->intra_dc_precision = 0;
+    decoder->frame_pred_frame_dct = 1;
+    decoder->q_scale_type = 0;
+    decoder->concealment_motion_vectors = 0;
+    decoder->scan = mpeg2_scan_norm;
+    decoder->picture_structure = FRAME_PICTURE;
+
+    mpeg2dec->ext_state = SEQ_EXT;
+    mpeg2dec->state = STATE_SEQUENCE;
+    mpeg2dec->display_offset_x = mpeg2dec->display_offset_y = 0;
+
+    reset_info (&(mpeg2dec->info));
+    return 0;
+}
+
+static int sequence_ext (mpeg2dec_t * mpeg2dec)	/* parse a sequence extension chunk into new_sequence and decoder; returns 0 if accepted, 1 if rejected */
+{
+    uint8_t * buffer = mpeg2dec->chunk_start;
+    sequence_t * sequence = &(mpeg2dec->new_sequence);
+    decoder_t * decoder = &(mpeg2dec->decoder);
+    int width, height;
+    uint32_t flags;
+
+    if (!(buffer[3] & 1))	/* missing marker_bit */
+	return 1;
+
+    sequence->profile_level_id = (buffer[0] << 4) | (buffer[1] >> 4);
+
+    width = sequence->display_width = sequence->picture_width +=	/* add the 2-bit horizontal size extension as bits 12-13 */
+	((buffer[1] << 13) | (buffer[2] << 5)) & 0x3000;
+    height = sequence->display_height = sequence->picture_height +=	/* add the 2-bit vertical size extension as bits 12-13 */
+	(buffer[2] << 7) & 0x3000;
+    decoder->vertical_position_extension = (height > 2800);
+    flags = sequence->flags | SEQ_FLAG_MPEG2;
+    if (!(buffer[1] & 8)) {	/* progressive_sequence bit is clear */
+	flags &= ~SEQ_FLAG_PROGRESSIVE_SEQUENCE;
+	height = (height + 31) & ~31;	/* round coded height up to a multiple of 32 */
+    }
+    if (buffer[5] & 0x80)	/* low_delay is the top bit of buffer[5] */
+	flags |= SEQ_FLAG_LOW_DELAY;
+    sequence->flags = flags;
+    decoder->width = sequence->width = width = (width + 15) & ~15;	/* round up to a multiple of 16 */
+    decoder->height = sequence->height = height = (height + 15) & ~15;
+    switch (buffer[1] & 6) {	/* chroma format selects the chroma plane size */
+    case 0:	/* invalid */
+	return 1;
+    case 2:	/* 4:2:0 */
+	height >>= 1;	/* fall through: 4:2:0 also halves the width */
+    case 4:	/* 4:2:2 */
+	width >>= 1;	/* value 6 has no case, so chroma size stays equal to luma size */
     }
+    sequence->chroma_width = width;
+    sequence->chroma_height = height;
 
-    /* MPEG1 - for testing only */
-    picture->mpeg1 = 1;
-    picture->intra_dc_precision = 0;
-    picture->frame_pred_frame_dct = 1;
-    picture->q_scale_type = 0;
-    picture->concealment_motion_vectors = 0;
-    /* picture->alternate_scan = 0; */
-    picture->picture_structure = FRAME_PICTURE;
-    /* picture->second_field = 0; */
+    sequence->byte_rate += ((buffer[2]<<25) | (buffer[3]<<17)) & 0x3ffc0000;	/* 12-bit bit rate extension lands in bits 18-29 */
+
+    sequence->vbv_buffer_size |= buffer[4] << 21;	/* 8-bit vbv size extension above the bits set by the sequence header */
+
+    sequence->frame_period =	/* period * (frame_rate_extension_d + 1) / (frame_rate_extension_n + 1) */
+	sequence->frame_period * ((buffer[5]&31)+1) / (((buffer[5]>>5)&3)+1);	/* d = bits 4-0, n = bits 6-5 just below low_delay; the former >>2 re-read bits of d */
+
+    decoder->mpeg1 = 0;	/* a sequence extension marks the stream as MPEG-2 */
+
+    mpeg2dec->ext_state = SEQ_DISPLAY_EXT;	/* only a sequence display extension is accepted next */
 
     return 0;
 }
 
-static int header_process_sequence_extension (picture_t * picture,
-					      uint8_t * buffer)
+static int sequence_display_ext (mpeg2dec_t * mpeg2dec)
 {
-    /* check chroma format, size extensions, marker bit */
-    if (((buffer[1] & 0x07) != 0x02) || (buffer[2] & 0xe0) ||
-	((buffer[3] & 0x01) != 0x01))
+    uint8_t * buffer = mpeg2dec->chunk_start;
+    sequence_t * sequence = &(mpeg2dec->new_sequence);
+    uint32_t flags;
+
+    flags = ((sequence->flags & ~SEQ_MASK_VIDEO_FORMAT) |
+	     ((buffer[0]<<4) & SEQ_MASK_VIDEO_FORMAT));
+    if (buffer[0] & 1) {
+	flags |= SEQ_FLAG_COLOUR_DESCRIPTION;
+	sequence->colour_primaries = buffer[1];
+	sequence->transfer_characteristics = buffer[2];
+	sequence->matrix_coefficients = buffer[3];
+	buffer += 3;
+    }
+
+    if (!(buffer[2] & 2))	/* missing marker_bit */
 	return 1;
 
-    /* this is not used by the decoder */
-    picture->progressive_sequence = (buffer[1] >> 3) & 1;
+    sequence->display_width = (buffer[1] << 6) | (buffer[2] >> 2);
+    sequence->display_height =
+	((buffer[2]& 1 ) << 13) | (buffer[3] << 5) | (buffer[4] >> 3);
+
+    return 0;
+}
+
+static inline void finalize_sequence (sequence_t * sequence)
+{
+    int width;
+    int height;
+
+    sequence->byte_rate *= 50;
+
+    if (sequence->flags & SEQ_FLAG_MPEG2) {
+	switch (sequence->pixel_width) {
+	case 1:		/* square pixels */
+	    sequence->pixel_width = sequence->pixel_height = 1;	return;
+	case 2:		/* 4:3 aspect ratio */
+	    width = 4; height = 3;	break;
+	case 3:		/* 16:9 aspect ratio */
+	    width = 16; height = 9;	break;
+	case 4:		/* 2.21:1 aspect ratio */
+	    width = 221; height = 100;	break;
+	default:	/* illegal */
+	    sequence->pixel_width = sequence->pixel_height = 0;	return;
+	}
+	width *= sequence->display_height;
+	height *= sequence->display_width;
+
+    } else {
+	if (sequence->byte_rate == 50 * 0x3ffff) 
+	    sequence->byte_rate = 0;        /* mpeg-1 VBR */ 
+
+	switch (sequence->pixel_width) {
+	case 0:	case 15:	/* illegal */
+	    sequence->pixel_width = sequence->pixel_height = 0;		return;
+	case 1:	/* square pixels */
+	    sequence->pixel_width = sequence->pixel_height = 1;		return;
+	case 3:	/* 720x576 16:9 */
+	    sequence->pixel_width = 64;	sequence->pixel_height = 45;	return;
+	case 6:	/* 720x480 16:9 */
+	    sequence->pixel_width = 32;	sequence->pixel_height = 27;	return;
+	case 12:	/* 720*480 4:3 */
+	    sequence->pixel_width = 8;	sequence->pixel_height = 9;	return;
+	default:
+	    height = 88 * sequence->pixel_width + 1171;
+	    width = 2000;
+	}
+    }
 
-    if (picture->progressive_sequence)
-	picture->coded_picture_height =
-	    (picture->coded_picture_height + 31) & ~31;
+    sequence->pixel_width = width;
+    sequence->pixel_height = height;
+    while (width) {	/* find greatest common divisor */
+	int tmp = width;
+	width = height % tmp;
+	height = tmp;
+    }
+    sequence->pixel_width /= height;
+    sequence->pixel_height /= height;
+}
 
-    /* MPEG1 - for testing only */
-    picture->mpeg1 = 0;
+void mpeg2_header_sequence_finalize (mpeg2dec_t * mpeg2dec)	/* finish new_sequence, detect a repeated header, then publish it as the active sequence */
+{
+    sequence_t * sequence = &(mpeg2dec->new_sequence);
+
+    finalize_sequence (sequence);	/* scales byte_rate and reduces the pixel aspect ratio */
+
+    /*
+     * according to 6.1.1.6, repeat sequence headers should be
+     * identical to the original. However some DVDs don't respect that
+     * and have different bitrates in the repeat sequence headers. So
+     * we'll ignore that in the comparison and still consider these as
+     * repeat sequence headers.
+     */
+    mpeg2dec->sequence.byte_rate = sequence->byte_rate;	/* equalize byte_rate first so it cannot make the memcmp differ */
+    if (!memcmp (&(mpeg2dec->sequence), sequence, sizeof (sequence_t)))	/* NOTE(review): memcmp also compares any padding bytes in sequence_t - confirm there are none */
+	mpeg2dec->state = STATE_SEQUENCE_REPEATED;
+    mpeg2dec->sequence = *sequence;	/* the new header becomes the active sequence either way */
+
+    mpeg2dec->info.sequence = &(mpeg2dec->sequence);	/* expose the active sequence through info */
+}
 
+int mpeg2_header_gop (mpeg2dec_t * mpeg2dec)	/* switch to STATE_GOP and clear the info fields; always returns 0 */
+{
+    mpeg2dec->state = STATE_GOP;
+    reset_info (&(mpeg2dec->info));	/* NULLs the picture, fbuf and user_data pointers */
     return 0;
 }
 
-static int header_process_quant_matrix_extension (picture_t * picture,
-						  uint8_t * buffer)
+void mpeg2_set_fbuf (mpeg2dec_t * mpeg2dec, int coding_type)
 {
     int i;
 
-    if (buffer[0] & 8) {
-	for (i = 0; i < 64; i++)
-	    picture->intra_quantizer_matrix[scan_norm[i]] =
-		(buffer[i] << 5) | (buffer[i+1] >> 3);
-	buffer += 64;
+    for (i = 0; i < 3; i++)
+	if (mpeg2dec->fbuf[1] != &mpeg2dec->fbuf_alloc[i].fbuf &&
+	    mpeg2dec->fbuf[2] != &mpeg2dec->fbuf_alloc[i].fbuf) {
+	    mpeg2dec->fbuf[0] = &mpeg2dec->fbuf_alloc[i].fbuf;
+	    mpeg2dec->info.current_fbuf = mpeg2dec->fbuf[0];
+	    if ((coding_type == B_TYPE) ||
+		(mpeg2dec->sequence.flags & SEQ_FLAG_LOW_DELAY)) {
+		if ((coding_type == B_TYPE) || (mpeg2dec->convert_start))
+		    mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[0];
+		mpeg2dec->info.display_fbuf = mpeg2dec->fbuf[0];
+	    }
+	    break;
+	}
+}
+
+int mpeg2_header_picture_start (mpeg2dec_t * mpeg2dec)
+{
+    decoder_t * decoder = &(mpeg2dec->decoder);
+    picture_t * picture;
+
+    if (mpeg2dec->state != STATE_SLICE_1ST) {
+	mpeg2dec->state = STATE_PICTURE;
+	picture = mpeg2dec->pictures;
+	if ((decoder->coding_type != PIC_FLAG_CODING_TYPE_B) ^
+	    (mpeg2dec->picture >= mpeg2dec->pictures + 2))
+	    picture += 2;
+    } else {
+	mpeg2dec->state = STATE_PICTURE_2ND;
+	picture = mpeg2dec->picture + 1;	/* second field picture */
     }
+    mpeg2dec->picture = picture;
+    picture->flags = 0;
+    if (mpeg2dec->num_pts) {
+	if (mpeg2dec->bytes_since_pts >= 4) {
+	    mpeg2dec->num_pts = 0;
+	    picture->pts = mpeg2dec->pts_current;
+	    picture->flags = PIC_FLAG_PTS;
+	} else if (mpeg2dec->num_pts > 1) {
+	    mpeg2dec->num_pts = 1;
+	    picture->pts = mpeg2dec->pts_previous;
+	    picture->flags = PIC_FLAG_PTS;
+	}
+    }
+    picture->display_offset[0].x = picture->display_offset[1].x =
+	picture->display_offset[2].x = mpeg2dec->display_offset_x;
+    picture->display_offset[0].y = picture->display_offset[1].y =
+	picture->display_offset[2].y = mpeg2dec->display_offset_y;
+    return mpeg2_parse_header (mpeg2dec);
+}
 
-    if (buffer[0] & 4) {
-	for (i = 0; i < 64; i++)
-	    picture->non_intra_quantizer_matrix[scan_norm[i]] =
-		(buffer[i] << 6) | (buffer[i+1] >> 2);
+int mpeg2_header_picture (mpeg2dec_t * mpeg2dec)
+{
+    uint8_t * buffer = mpeg2dec->chunk_start;
+    picture_t * picture = mpeg2dec->picture;
+    decoder_t * decoder = &(mpeg2dec->decoder);
+    int type;
+    int low_delay;
+
+    type = (buffer [1] >> 3) & 7;
+    low_delay = mpeg2dec->sequence.flags & SEQ_FLAG_LOW_DELAY;
+
+    if (mpeg2dec->state == STATE_PICTURE) {
+	picture_t * other;
+
+	decoder->second_field = 0;
+	other = mpeg2dec->pictures;
+	if (other == picture)
+	    other += 2;
+	if (decoder->coding_type != PIC_FLAG_CODING_TYPE_B) {
+	    mpeg2dec->fbuf[2] = mpeg2dec->fbuf[1];
+	    mpeg2dec->fbuf[1] = mpeg2dec->fbuf[0];
+	}
+	mpeg2dec->fbuf[0] = NULL;
+	reset_info (&(mpeg2dec->info));
+	mpeg2dec->info.current_picture = picture;
+	mpeg2dec->info.display_picture = picture;
+	if (type != PIC_FLAG_CODING_TYPE_B) {
+	    if (!low_delay) {
+		if (mpeg2dec->first) {
+		    mpeg2dec->info.display_picture = NULL;
+		    mpeg2dec->first = 0;
+		} else {
+		    mpeg2dec->info.display_picture = other;
+		    if (other->nb_fields == 1)
+			mpeg2dec->info.display_picture_2nd = other + 1;
+		    mpeg2dec->info.display_fbuf = mpeg2dec->fbuf[1];
+		}
+	    }
+	    if (!low_delay + !mpeg2dec->convert_start)
+		mpeg2dec->info.discard_fbuf =
+		    mpeg2dec->fbuf[!low_delay + !mpeg2dec->convert_start];
+	}
+	if (!mpeg2dec->custom_fbuf) {
+	    while (mpeg2dec->alloc_index < 3) {
+		fbuf_t * fbuf;
+
+		fbuf = &(mpeg2dec->fbuf_alloc[mpeg2dec->alloc_index++].fbuf);
+		fbuf->id = NULL;
+		if (mpeg2dec->convert_start) {    
+		    fbuf->buf[0] =
+			(uint8_t *) mpeg2_malloc (mpeg2dec->convert_size[0],
+						  ALLOC_CONVERTED);
+		    fbuf->buf[1] = fbuf->buf[0] + mpeg2dec->convert_size[1];
+		    fbuf->buf[2] = fbuf->buf[0] + mpeg2dec->convert_size[2];
+		} else {
+		    int size;
+		    size = mpeg2dec->decoder.width * mpeg2dec->decoder.height;
+		    fbuf->buf[0] = (uint8_t *) mpeg2_malloc (6 * size >> 2,
+							     ALLOC_YUV);
+		    fbuf->buf[1] = fbuf->buf[0] + size;
+		    fbuf->buf[2] = fbuf->buf[1] + (size >> 2);
+		}
+	    }
+	    mpeg2_set_fbuf (mpeg2dec, type);
+	}
+    } else {
+	decoder->second_field = 1;
+	mpeg2dec->info.current_picture_2nd = picture;
+	mpeg2dec->info.user_data = NULL; mpeg2dec->info.user_data_len = 0;
+	if (low_delay || type == PIC_FLAG_CODING_TYPE_B)
+	    mpeg2dec->info.display_picture_2nd = picture;
+    }
+    mpeg2dec->ext_state = PIC_CODING_EXT;
+
+    picture->temporal_reference = (buffer[0] << 2) | (buffer[1] >> 6);
+
+    decoder->coding_type = type;
+    picture->flags |= type;
+
+    if (type == PIC_FLAG_CODING_TYPE_P || type == PIC_FLAG_CODING_TYPE_B) {
+	/* forward_f_code and backward_f_code - used in mpeg1 only */
+	decoder->f_motion.f_code[1] = (buffer[3] >> 2) & 1;
+	decoder->f_motion.f_code[0] =
+	    (((buffer[3] << 1) | (buffer[4] >> 7)) & 7) - 1;
+	decoder->b_motion.f_code[1] = (buffer[4] >> 6) & 1;
+	decoder->b_motion.f_code[0] = ((buffer[4] >> 3) & 7) - 1;
     }
 
+    /* XXXXXX decode extra_information_picture as well */
+
+    picture->nb_fields = 2;
+
     return 0;
 }
 
-static int header_process_picture_coding_extension (picture_t * picture, uint8_t * buffer)
+static int picture_coding_ext (mpeg2dec_t * mpeg2dec)
 {
+    uint8_t * buffer = mpeg2dec->chunk_start;
+    picture_t * picture = mpeg2dec->picture;
+    decoder_t * decoder = &(mpeg2dec->decoder);
+    uint32_t flags;
+
     /* pre subtract 1 for use later in compute_motion_vector */
-    picture->f_motion.f_code[0] = (buffer[0] & 15) - 1;
-    picture->f_motion.f_code[1] = (buffer[1] >> 4) - 1;
-    picture->b_motion.f_code[0] = (buffer[1] & 15) - 1;
-    picture->b_motion.f_code[1] = (buffer[2] >> 4) - 1;
-
-    picture->intra_dc_precision = (buffer[2] >> 2) & 3;
-    picture->picture_structure = buffer[2] & 3;
-    picture->frame_pred_frame_dct = (buffer[3] >> 6) & 1;
-    picture->concealment_motion_vectors = (buffer[3] >> 5) & 1;
-    picture->q_scale_type = (buffer[3] >> 4) & 1;
-    picture->intra_vlc_format = (buffer[3] >> 3) & 1;
-
-    if (buffer[3] & 4)	/* alternate_scan */
-	picture->scan = scan_alt;
-    else
-	picture->scan = scan_norm;
-
-    /* these are not used by the decoder */
-    picture->top_field_first = buffer[3] >> 7;
-    picture->repeat_first_field = (buffer[3] >> 1) & 1;
-    picture->progressive_frame = buffer[4] >> 7;
-
-    // repeat_first implementation by A'rpi/ESP-team, based on libmpeg3:
-    picture->display_time=100;
-    if(picture->repeat_first_field){
-        if(picture->progressive_sequence){
-            if(picture->top_field_first)
-                picture->display_time+=200;
-            else
-                picture->display_time+=100;
-        } else
-        if(picture->progressive_frame){
-                picture->display_time+=50;
-        }
+    decoder->f_motion.f_code[0] = (buffer[0] & 15) - 1;
+    decoder->f_motion.f_code[1] = (buffer[1] >> 4) - 1;
+    decoder->b_motion.f_code[0] = (buffer[1] & 15) - 1;
+    decoder->b_motion.f_code[1] = (buffer[2] >> 4) - 1;
+
+    flags = picture->flags;
+    decoder->intra_dc_precision = (buffer[2] >> 2) & 3;
+    decoder->picture_structure = buffer[2] & 3;
+    switch (decoder->picture_structure) {
+    case TOP_FIELD:
+	flags |= PIC_FLAG_TOP_FIELD_FIRST;
+    case BOTTOM_FIELD:
+	picture->nb_fields = 1;
+	break;
+    case FRAME_PICTURE:
+	if (!(mpeg2dec->sequence.flags & SEQ_FLAG_PROGRESSIVE_SEQUENCE)) {
+	    picture->nb_fields = (buffer[3] & 2) ? 3 : 2;
+	    flags |= (buffer[3] & 128) ? PIC_FLAG_TOP_FIELD_FIRST : 0;
+	} else
+	    picture->nb_fields = (buffer[3]&2) ? ((buffer[3]&128) ? 6 : 4) : 2;
+	break;
+    default:
+	return 1;
+    }
+    decoder->top_field_first = buffer[3] >> 7;
+    decoder->frame_pred_frame_dct = (buffer[3] >> 6) & 1;
+    decoder->concealment_motion_vectors = (buffer[3] >> 5) & 1;
+    decoder->q_scale_type = (buffer[3] >> 4) & 1;
+    decoder->intra_vlc_format = (buffer[3] >> 3) & 1;
+    decoder->scan = (buffer[3] & 4) ? mpeg2_scan_alt : mpeg2_scan_norm;
+    flags |= (buffer[4] & 0x80) ? PIC_FLAG_PROGRESSIVE_FRAME : 0;
+    if (buffer[4] & 0x40)
+	flags |= (((buffer[4]<<26) | (buffer[5]<<18) | (buffer[6]<<10)) &
+		  PIC_MASK_COMPOSITE_DISPLAY) | PIC_FLAG_COMPOSITE_DISPLAY;
+    picture->flags = flags;
+
+    mpeg2dec->ext_state = PIC_DISPLAY_EXT | COPYRIGHT_EXT | QUANT_MATRIX_EXT;
+
+    return 0;
+}
+
+static int picture_display_ext (mpeg2dec_t * mpeg2dec)
+{
+    uint8_t * buffer = mpeg2dec->chunk_start;
+    picture_t * picture = mpeg2dec->picture;
+    int i, nb_pos;
+
+    nb_pos = picture->nb_fields;
+    if (mpeg2dec->sequence.flags & SEQ_FLAG_PROGRESSIVE_SEQUENCE)
+	nb_pos >>= 1;
+
+    for (i = 0; i < nb_pos; i++) {
+	int x, y;
+
+	x = ((buffer[4*i] << 24) | (buffer[4*i+1] << 16) |
+	     (buffer[4*i+2] << 8) | buffer[4*i+3]) >> (11-2*i);
+	y = ((buffer[4*i+2] << 24) | (buffer[4*i+3] << 16) |
+	     (buffer[4*i+4] << 8) | buffer[4*i+5]) >> (10-2*i);
+	if (! (x & y & 1))
+	    return 1;
+	picture->display_offset[i].x = mpeg2dec->display_offset_x = x >> 1;
+	picture->display_offset[i].y = mpeg2dec->display_offset_y = y >> 1;
+    }
+    for (; i < 3; i++) {
+	picture->display_offset[i].x = mpeg2dec->display_offset_x;
+	picture->display_offset[i].y = mpeg2dec->display_offset_y;
     }
-    //temopral hack. We calc time on every field, so if we have 2 fields
-    // interlaced we'll end with double time for 1 frame
-    if( picture->picture_structure!=3 ) picture->display_time/=2;
     return 0;
 }
 
-int header_process_extension (picture_t * picture, uint8_t * buffer)
+static int copyright_ext (mpeg2dec_t * mpeg2dec)
 {
-    switch (buffer[0] & 0xf0) {
-    case 0x10:	/* sequence extension */
-	return header_process_sequence_extension (picture, buffer);
+    return 0;
+}
 
-    case 0x30:	/* quant matrix extension */
-	return header_process_quant_matrix_extension (picture, buffer);
+static int quant_matrix_ext (mpeg2dec_t * mpeg2dec)
+{
+    uint8_t * buffer = mpeg2dec->chunk_start;
+    decoder_t * decoder = &(mpeg2dec->decoder);
+    int i;
 
-    case 0x80:	/* picture coding extension */
-	return header_process_picture_coding_extension (picture, buffer);
+    if (buffer[0] & 8) {
+	for (i = 0; i < 64; i++)
+	    decoder->intra_quantizer_matrix[mpeg2_scan_norm[i]] =
+		(buffer[i] << 5) | (buffer[i+1] >> 3);
+	buffer += 64;
     }
 
+    if (buffer[0] & 4)
+	for (i = 0; i < 64; i++)
+	    decoder->non_intra_quantizer_matrix[mpeg2_scan_norm[i]] =
+		(buffer[i] << 6) | (buffer[i+1] >> 2);
+
     return 0;
 }
 
-int header_process_picture_header (picture_t *picture, uint8_t * buffer)
+int mpeg2_header_extension (mpeg2dec_t * mpeg2dec)	/* dispatch an extension chunk to its parser; returns the parser's result, or 0 when the chunk is ignored */
 {
-    picture->picture_coding_type = (buffer [1] >> 3) & 7;
+    static int (* parser[]) (mpeg2dec_t *) = {	/* indexed by extension id; 0 marks ids with no parser */
+	0, sequence_ext, sequence_display_ext, quant_matrix_ext,
+	copyright_ext, 0, 0, picture_display_ext, picture_coding_ext
+    };
+    int ext, ext_bit;
+
+    ext = mpeg2dec->chunk_start[0] >> 4;	/* extension id is the high nibble of the first byte */
+    ext_bit = 1 << ext;	/* lines up with the *_EXT masks: SEQ_EXT == 1 << 1 ... PIC_CODING_EXT == 1 << 8 */
+
+    if (!(mpeg2dec->ext_state & ext_bit))
+	return 0;	/* ignore illegal extensions */
+    mpeg2dec->ext_state &= ~ext_bit;	/* clear the bit so this extension is not parsed again until ext_state is reassigned */
+    return parser[ext] (mpeg2dec);	/* NOTE(review): in bounds and non-NULL only while ext_state holds just the *_EXT bits - confirm at every assignment */
+}
 
-    /* forward_f_code and backward_f_code - used in mpeg1 only */
-    picture->f_motion.f_code[1] = (buffer[3] >> 2) & 1;
-    picture->f_motion.f_code[0] =
-	(((buffer[3] << 1) | (buffer[4] >> 7)) & 7) - 1;
-    picture->b_motion.f_code[1] = (buffer[4] >> 6) & 1;
-    picture->b_motion.f_code[0] = ((buffer[4] >> 3) & 7) - 1;
+int mpeg2_header_user_data (mpeg2dec_t * mpeg2dec)	/* record or extend the user-data span exposed in info; always returns 0 */
+{
+    if (!mpeg2dec->info.user_data_len)
+	mpeg2dec->info.user_data = mpeg2dec->chunk_start;	/* first user-data chunk: remember where it starts */
+    else
+	mpeg2dec->info.user_data_len += 3;	/* presumably the bytes separating consecutive user-data chunks - TODO confirm */
+    mpeg2dec->info.user_data_len += (mpeg2dec->chunk_ptr - 4 -
+				     mpeg2dec->chunk_start);	/* chunk length, less the last 4 bytes before chunk_ptr */
+    mpeg2dec->chunk_start = mpeg2dec->chunk_ptr - 1;	/* next chunk is taken to start one byte before chunk_ptr */
+    
+    return 0;
+}
 
-    /* move in header_process_picture_header */
-        picture->second_field =
-            (picture->picture_structure != FRAME_PICTURE) &&
-            !(picture->second_field);
+int mpeg2_header_slice_start (mpeg2dec_t * mpeg2dec)
+{
+    mpeg2dec->state = ((mpeg2dec->picture->nb_fields > 1 ||
+			mpeg2dec->state == STATE_PICTURE_2ND) ?
+		       STATE_SLICE : STATE_SLICE_1ST);
+
+    if (!(mpeg2dec->nb_decode_slices))
+	mpeg2dec->picture->flags |= PIC_FLAG_SKIP;
+    else if (mpeg2dec->convert_start) {
+	int flags;
+
+	switch (mpeg2dec->decoder.picture_structure) {
+	case TOP_FIELD:		flags = CONVERT_TOP_FIELD;	break;
+	case BOTTOM_FIELD:	flags = CONVERT_BOTTOM_FIELD;	break;
+	default:
+	    flags =
+		((mpeg2dec->sequence.flags & SEQ_FLAG_PROGRESSIVE_SEQUENCE) ?
+		 CONVERT_FRAME : CONVERT_BOTH_FIELDS);
+	}
+	mpeg2dec->convert_start (mpeg2dec->convert_id,
+				 mpeg2dec->fbuf[0]->buf, flags);
+
+	mpeg2dec->decoder.convert = mpeg2dec->convert_copy;
+	mpeg2dec->decoder.fbuf_id = mpeg2dec->convert_id;
+
+	if (mpeg2dec->decoder.coding_type == B_TYPE)
+	    mpeg2_init_fbuf (&(mpeg2dec->decoder), mpeg2dec->yuv_buf[2],
+			     mpeg2dec->yuv_buf[mpeg2dec->yuv_index ^ 1],
+			     mpeg2dec->yuv_buf[mpeg2dec->yuv_index]);
+	else {
+	    mpeg2_init_fbuf (&(mpeg2dec->decoder),
+			     mpeg2dec->yuv_buf[mpeg2dec->yuv_index ^ 1],
+			     mpeg2dec->yuv_buf[mpeg2dec->yuv_index],
+			     mpeg2dec->yuv_buf[mpeg2dec->yuv_index]);
+	    if (mpeg2dec->state == STATE_SLICE)
+		mpeg2dec->yuv_index ^= 1;
+	}
+    } else {
+	int b_type;
 
+	mpeg2dec->decoder.convert = NULL;
+	b_type = (mpeg2dec->decoder.coding_type == B_TYPE);
+	mpeg2_init_fbuf (&(mpeg2dec->decoder), mpeg2dec->fbuf[0]->buf,
+			 mpeg2dec->fbuf[b_type + 1]->buf,
+			 mpeg2dec->fbuf[b_type]->buf);
+    }
+    mpeg2dec->action = NULL;
     return 0;
 }
+
+int mpeg2_header_end (mpeg2dec_t * mpeg2dec)	/* set up the final display/discard info, invalidate the state, and return STATE_END */
+{
+    picture_t * picture;
+    int b_type;
+
+    picture = mpeg2dec->pictures;
+    if (mpeg2dec->picture < picture + 2)
+	picture = mpeg2dec->pictures + 2;	/* pick pictures[0] or pictures[2], whichever group the current picture is not in */
+
+    mpeg2dec->state = STATE_INVALID;
+    reset_info (&(mpeg2dec->info));	/* start from empty info, then fill in only what applies below */
+    b_type = (mpeg2dec->decoder.coding_type == B_TYPE);	/* 1 if the last coding type was B, else 0; used as an fbuf index offset */
+    if (!(mpeg2dec->sequence.flags & SEQ_FLAG_LOW_DELAY)) {
+	mpeg2dec->info.display_picture = picture;
+	if (picture->nb_fields == 1)
+	    mpeg2dec->info.display_picture_2nd = picture + 1;	/* a one-field picture has its second field in the next slot */
+	mpeg2dec->info.display_fbuf = mpeg2dec->fbuf[b_type];
+	if (!mpeg2dec->convert_start)	/* discard_fbuf is only reported when no convert_start callback is set */
+	    mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[b_type + 1];
+    } else if (!mpeg2dec->convert_start)
+	mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[b_type];	/* low delay: nothing to display here, only a buffer to discard */
+    mpeg2dec->action = mpeg2_seek_sequence;	/* next action is to look for a sequence header */
+    return STATE_END;
+}
diff --git a/libmpeg2/idct.c b/libmpeg2/idct.c
index 1e869c37de..bcae078156 100644
--- a/libmpeg2/idct.c
+++ b/libmpeg2/idct.c
@@ -1,12 +1,10 @@
 /*
  * idct.c
- * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
- *
- * Portions of this code are from the MPEG software simulation group
- * idct implementation. This code will be replaced with a new
- * implementation soon.
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
  *
  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
  *
  * mpeg2dec is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -23,27 +21,14 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
-/**********************************************************/
-/* inverse two dimensional DCT, Chen-Wang algorithm */
-/* (cf. IEEE ASSP-32, pp. 803-816, Aug. 1984) */
-/* 32-bit integer arithmetic (8 bit coefficients) */
-/* 11 mults, 29 adds per DCT */
-/* sE, 18.8.91 */
-/**********************************************************/
-/* coefficients extended to 12 bit for IEEE1180-1990 */
-/* compliance sE, 2.1.94 */
-/**********************************************************/
-
-/* this code assumes >> to be a two's-complement arithmetic */
-/* right shift: (-2)>>1 == -1 , (-3)>>1 == -2 */
-
 #include "config.h"
 
-#include <stdio.h>
+#include <stdlib.h>
 #include <inttypes.h>
 
+#include "mpeg2.h"
 #include "mpeg2_internal.h"
-#include "mm_accel.h"
+#include "attributes.h"
 
 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
@@ -53,199 +38,131 @@
 #define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
 
 /* idct main entry point  */
-void (*idct_block_copy) (int16_t * block, uint8_t * dest, int stride);
-void (*idct_block_add) (int16_t * block, uint8_t * dest, int stride);
-
-static void idct_block_copy_c (int16_t *block, uint8_t * dest, int stride);
-static void idct_block_add_c (int16_t *block, uint8_t * dest, int stride);
+void (* mpeg2_idct_copy) (int16_t * block, uint8_t * dest, int stride);	/* assigned in mpeg2_idct_init */
+void (* mpeg2_idct_add) (int last, int16_t * block,	/* assigned in mpeg2_idct_init */
+			 uint8_t * dest, int stride);
 
 static uint8_t clip_lut[1024];
-#define CLIP(i) ((clip_lut+384)[ (i)])
+#define CLIP(i) ((clip_lut+384)[(i)])
 
-void idct_init (void)
-{
-#ifdef ARCH_X86
-    if (config.flags & MM_ACCEL_X86_MMXEXT) {
-	printf ("libmpeg2: Using MMXEXT for IDCT transform\n");
-	idct_block_copy = idct_block_copy_mmxext;
-	idct_block_add = idct_block_add_mmxext;
-	idct_mmx_init ();
-    } else if (config.flags & MM_ACCEL_X86_MMX) {
-	printf ("libmpeg2: Using MMX for IDCT transform\n");
-	idct_block_copy = idct_block_copy_mmx;
-	idct_block_add = idct_block_add_mmx;
-	idct_mmx_init ();
-    } else
-#endif
-#ifdef LIBMPEG2_MLIB
-    if (config.flags & MM_ACCEL_MLIB) {
-	printf ("libmpeg2: Using mlib for IDCT transform\n");
-	idct_block_copy = idct_block_copy_mlib;
-	idct_block_add = idct_block_add_mlib;
-    } else
+#if 0	/* direct form with 4 multiplications; disabled, kept as reference */
+#define BUTTERFLY(t0,t1,W0,W1,d0,d1)	\
+do {					\
+    t0 = W0*d0 + W1*d1;			\
+    t1 = W0*d1 - W1*d0;			\
+} while (0)
+#else	/* algebraically the same t0/t1 with 3 multiplications, sharing W0 * (d0 + d1) */
+#define BUTTERFLY(t0,t1,W0,W1,d0,d1)	\
+do {					\
+    int tmp = W0 * (d0 + d1);		\
+    t0 = tmp + (W1 - W0) * d1;		\
+    t1 = tmp - (W1 + W0) * d0;		\
+} while (0)
 #endif
-    {
-	int i;
-
-	printf ("libmpeg2: No accelerated IDCT transform found\n");
-	idct_block_copy = idct_block_copy_c;
-	idct_block_add = idct_block_add_c;
-	for (i = -384; i < 640; i++)
-	    clip_lut[i+384] = (i < 0) ? 0 : ((i > 255) ? 255 : i);
-    }
-}
 
-/* row (horizontal) IDCT
- *
- * 7 pi 1
- * dst[k] = sum c[l] * src[l] * cos ( -- * ( k + - ) * l )
- * l=0 8 2
- *
- * where: c[0] = 128
- * c[1..7] = 128*sqrt (2)
- */
-
-static inline void idct_row (int16_t * block)
+static inline void idct_row (int16_t * const block)	/* in-place 1-D IDCT of the 8 values block[0..7] */
 {
-    int x0, x1, x2, x3, x4, x5, x6, x7, x8;
-
-    x1 = block[4] << 11;
-    x2 = block[6];
-    x3 = block[2];
-    x4 = block[1];
-    x5 = block[7];
-    x6 = block[5];
-    x7 = block[3];
+    int d0, d1, d2, d3;
+    int a0, a1, a2, a3, b0, b1, b2, b3;
+    int t0, t1, t2, t3;
 
     /* shortcut */
-    if (! (x1 | x2 | x3 | x4 | x5 | x6 | x7 )) {
-	block[0] = block[1] = block[2] = block[3] = block[4] =
-	    block[5] = block[6] = block[7] = block[0]<<3;
+    if (likely (!(block[1] | ((int32_t *)block)[1] | ((int32_t *)block)[2] |	/* block[1] and block[2..7] (read as three 32-bit words) are all zero */
+		  ((int32_t *)block)[3]))) {
+	uint32_t tmp = (uint16_t) (block[0] << 3);	/* every output equals block[0] << 3 */
+	tmp |= tmp << 16;	/* duplicate into both 16-bit halves */
+	((int32_t *)block)[0] = tmp;	/* four 32-bit stores cover block[0..7] */
+	((int32_t *)block)[1] = tmp;
+	((int32_t *)block)[2] = tmp;
+	((int32_t *)block)[3] = tmp;
 	return;
     }
 
-    x0 = (block[0] << 11) + 128; /* for proper rounding in the fourth stage */
-
-    /* first stage */
-    x8 = W7 * (x4 + x5);
-    x4 = x8 + (W1 - W7) * x4;
-    x5 = x8 - (W1 + W7) * x5;
-    x8 = W3 * (x6 + x7);
-    x6 = x8 - (W3 - W5) * x6;
-    x7 = x8 - (W3 + W5) * x7;
- 
-    /* second stage */
-    x8 = x0 + x1;
-    x0 -= x1;
-    x1 = W6 * (x3 + x2);
-    x2 = x1 - (W2 + W6) * x2;
-    x3 = x1 + (W2 - W6) * x3;
-    x1 = x4 + x6;
-    x4 -= x6;
-    x6 = x5 + x7;
-    x5 -= x7;
- 
-    /* third stage */
-    x7 = x8 + x3;
-    x8 -= x3;
-    x3 = x0 + x2;
-    x0 -= x2;
-    x2 = (181 * (x4 + x5) + 128) >> 8;
-    x4 = (181 * (x4 - x5) + 128) >> 8;
- 
-    /* fourth stage */
-    block[0] = (x7 + x1) >> 8;
-    block[1] = (x3 + x2) >> 8;
-    block[2] = (x0 + x4) >> 8;
-    block[3] = (x8 + x6) >> 8;
-    block[4] = (x8 - x6) >> 8;
-    block[5] = (x0 - x4) >> 8;
-    block[6] = (x3 - x2) >> 8;
-    block[7] = (x7 - x1) >> 8;
+    d0 = (block[0] << 11) + 128;	/* + 128 rounds the final >> 8 */
+    d1 = block[1];
+    d2 = block[2] << 11;
+    d3 = block[3];
+    t0 = d0 + d2;
+    t1 = d0 - d2;
+    BUTTERFLY (t2, t3, W6, W2, d3, d1);	/* t2 = W6*d3 + W2*d1, t3 = W6*d1 - W2*d3 */
+    a0 = t0 + t2;	/* a0..a3: terms built from block[0..3] */
+    a1 = t1 + t3;
+    a2 = t1 - t3;
+    a3 = t0 - t2;
+
+    d0 = block[4];
+    d1 = block[5];
+    d2 = block[6];
+    d3 = block[7];
+    BUTTERFLY (t0, t1, W7, W1, d3, d0);	/* t0 = W7*d3 + W1*d0, t1 = W7*d0 - W1*d3 */
+    BUTTERFLY (t2, t3, W3, W5, d1, d2);	/* t2 = W3*d1 + W5*d2, t3 = W3*d2 - W5*d1 */
+    b0 = t0 + t2;	/* b0..b3: terms built from block[4..7] */
+    b3 = t1 + t3;
+    t0 -= t2;
+    t1 -= t3;
+    b1 = ((t0 + t1) * 181) >> 8;	/* 181/256 is about 1/sqrt(2) */
+    b2 = ((t0 - t1) * 181) >> 8;
+
+    block[0] = (a0 + b0) >> 8;	/* outputs are a +/- b, scaled down by 2^8 */
+    block[1] = (a1 + b1) >> 8;
+    block[2] = (a2 + b2) >> 8;
+    block[3] = (a3 + b3) >> 8;
+    block[4] = (a3 - b3) >> 8;
+    block[5] = (a2 - b2) >> 8;
+    block[6] = (a1 - b1) >> 8;
+    block[7] = (a0 - b0) >> 8;
 }
 
-/* column (vertical) IDCT
- *
- * 7 pi 1
- * dst[8*k] = sum c[l] * src[8*l] * cos ( -- * ( k + - ) * l )
- * l=0 8 2
- *
- * where: c[0] = 1/1024
- * c[1..7] = (1/1024)*sqrt (2)
- */
-
-static inline void idct_col (int16_t *block)
+static inline void idct_col (int16_t * const block)	/* in-place 1-D IDCT of the 8 values block[8*0..8*7] (stride 8) */
 {
-    int x0, x1, x2, x3, x4, x5, x6, x7, x8;
-
-    /* shortcut */
-    x1 = block [8*4] << 8;
-    x2 = block [8*6];
-    x3 = block [8*2];
-    x4 = block [8*1];
-    x5 = block [8*7];
-    x6 = block [8*5];
-    x7 = block [8*3];
-
-#if 0
-    if (! (x1 | x2 | x3 | x4 | x5 | x6 | x7 )) {
-	block[8*0] = block[8*1] = block[8*2] = block[8*3] = block[8*4] =
-	    block[8*5] = block[8*6] = block[8*7] = (block[8*0] + 32) >> 6;
-	return;
-    }
-#endif
-
-    x0 = (block[8*0] << 8) + 8192;
-
-    /* first stage */
-    x8 = W7 * (x4 + x5) + 4;
-    x4 = (x8 + (W1 - W7) * x4) >> 3;
-    x5 = (x8 - (W1 + W7) * x5) >> 3;
-    x8 = W3 * (x6 + x7) + 4;
-    x6 = (x8 - (W3 - W5) * x6) >> 3;
-    x7 = (x8 - (W3 + W5) * x7) >> 3;
- 
-    /* second stage */
-    x8 = x0 + x1;
-    x0 -= x1;
-    x1 = W6 * (x3 + x2) + 4;
-    x2 = (x1 - (W2 + W6) * x2) >> 3;
-    x3 = (x1 + (W2 - W6) * x3) >> 3;
-    x1 = x4 + x6;
-    x4 -= x6;
-    x6 = x5 + x7;
-    x5 -= x7;
- 
-    /* third stage */
-    x7 = x8 + x3;
-    x8 -= x3;
-    x3 = x0 + x2;
-    x0 -= x2;
-    x2 = (181 * (x4 + x5) + 128) >> 8;
-    x4 = (181 * (x4 - x5) + 128) >> 8;
- 
-    /* fourth stage */
-    block[8*0] = (x7 + x1) >> 14;
-    block[8*1] = (x3 + x2) >> 14;
-    block[8*2] = (x0 + x4) >> 14;
-    block[8*3] = (x8 + x6) >> 14;
-    block[8*4] = (x8 - x6) >> 14;
-    block[8*5] = (x0 - x4) >> 14;
-    block[8*6] = (x3 - x2) >> 14;
-    block[8*7] = (x7 - x1) >> 14;
+    int d0, d1, d2, d3;
+    int a0, a1, a2, a3, b0, b1, b2, b3;
+    int t0, t1, t2, t3;
+
+    d0 = (block[8*0] << 11) + 65536;	/* + 65536 rounds the final >> 17 */
+    d1 = block[8*1];
+    d2 = block[8*2] << 11;
+    d3 = block[8*3];
+    t0 = d0 + d2;
+    t1 = d0 - d2;
+    BUTTERFLY (t2, t3, W6, W2, d3, d1);	/* t2 = W6*d3 + W2*d1, t3 = W6*d1 - W2*d3 */
+    a0 = t0 + t2;	/* a0..a3: terms built from block[8*0..8*3] */
+    a1 = t1 + t3;
+    a2 = t1 - t3;
+    a3 = t0 - t2;
+
+    d0 = block[8*4];
+    d1 = block[8*5];
+    d2 = block[8*6];
+    d3 = block[8*7];
+    BUTTERFLY (t0, t1, W7, W1, d3, d0);	/* t0 = W7*d3 + W1*d0, t1 = W7*d0 - W1*d3 */
+    BUTTERFLY (t2, t3, W3, W5, d1, d2);	/* t2 = W3*d1 + W5*d2, t3 = W3*d2 - W5*d1 */
+    b0 = t0 + t2;	/* b0..b3: terms built from block[8*4..8*7] */
+    b3 = t1 + t3;
+    t0 = (t0 - t2) >> 8;	/* shifted first, so the * 181 below is about a division by sqrt(2) */
+    t1 = (t1 - t3) >> 8;
+    b1 = (t0 + t1) * 181;
+    b2 = (t0 - t1) * 181;
+
+    block[8*0] = (a0 + b0) >> 17;	/* outputs are a +/- b, scaled down by 2^17 */
+    block[8*1] = (a1 + b1) >> 17;
+    block[8*2] = (a2 + b2) >> 17;
+    block[8*3] = (a3 + b3) >> 17;
+    block[8*4] = (a3 - b3) >> 17;
+    block[8*5] = (a2 - b2) >> 17;
+    block[8*6] = (a1 - b1) >> 17;
+    block[8*7] = (a0 - b0) >> 17;
 }
 
-void idct_block_copy_c (int16_t * block, uint8_t * dest, int stride)
+static void mpeg2_idct_copy_c (int16_t * block, uint8_t * dest,
+			       const int stride)
 {
     int i;
 
     for (i = 0; i < 8; i++)
 	idct_row (block + 8 * i);
-
     for (i = 0; i < 8; i++)
 	idct_col (block + i);
-
-    i = 8;
     do {
 	dest[0] = CLIP (block[0]);
 	dest[1] = CLIP (block[1]);
@@ -256,33 +173,112 @@ void idct_block_copy_c (int16_t * block, uint8_t * dest, int stride)
 	dest[6] = CLIP (block[6]);
 	dest[7] = CLIP (block[7]);
 
+	block[0] = 0;	block[1] = 0;	block[2] = 0;	block[3] = 0;
+	block[4] = 0;	block[5] = 0;	block[6] = 0;	block[7] = 0;
+
 	dest += stride;
 	block += 8;
     } while (--i);
 }
 
-void idct_block_add_c (int16_t * block, uint8_t * dest, int stride)
+static void mpeg2_idct_add_c (const int last, int16_t * block,	/* adds the IDCT of block to the 8x8 pixels at dest and clears what it consumed */
+			      uint8_t * dest, const int stride)
 {
     int i;
 
-    for (i = 0; i < 8; i++)
-	idct_row (block + 8 * i);
-
-    for (i = 0; i < 8; i++)
-	idct_col (block + i);
+    if (last != 129 || (block[0] & 7) == 4) {	/* full IDCT path; NOTE(review): the meaning of last == 129 is defined by the caller, not visible here */
+	for (i = 0; i < 8; i++)
+	    idct_row (block + 8 * i);
+	for (i = 0; i < 8; i++)
+	    idct_col (block + i);
+	do {	/* i == 8 here (left by the column loop), so the body runs 8 times */
+	    dest[0] = CLIP (block[0] + dest[0]);
+	    dest[1] = CLIP (block[1] + dest[1]);
+	    dest[2] = CLIP (block[2] + dest[2]);
+	    dest[3] = CLIP (block[3] + dest[3]);
+	    dest[4] = CLIP (block[4] + dest[4]);
+	    dest[5] = CLIP (block[5] + dest[5]);
+	    dest[6] = CLIP (block[6] + dest[6]);
+	    dest[7] = CLIP (block[7] + dest[7]);
+
+	    block[0] = 0;	block[1] = 0;	block[2] = 0;	block[3] = 0;	/* zero the row just consumed */
+	    block[4] = 0;	block[5] = 0;	block[6] = 0;	block[7] = 0;
+
+	    dest += stride;
+	    block += 8;
+	} while (--i);
+    } else {	/* shortcut: one constant is added to all 64 pixels */
+	int DC;
+
+	DC = (block[0] + 4) >> 3;	/* block[0] / 8 with rounding */
+	block[0] = block[63] = 0;	/* only these two entries are cleared on this path */
+	i = 8;
+	do {
+	    dest[0] = CLIP (DC + dest[0]);
+	    dest[1] = CLIP (DC + dest[1]);
+	    dest[2] = CLIP (DC + dest[2]);
+	    dest[3] = CLIP (DC + dest[3]);
+	    dest[4] = CLIP (DC + dest[4]);
+	    dest[5] = CLIP (DC + dest[5]);
+	    dest[6] = CLIP (DC + dest[6]);
+	    dest[7] = CLIP (DC + dest[7]);
+	    dest += stride;
+	} while (--i);
+    }
+}
 
-    i = 8;
-    do {
-	dest[0] = CLIP (block[0] + dest[0]);
-	dest[1] = CLIP (block[1] + dest[1]);
-	dest[2] = CLIP (block[2] + dest[2]);
-	dest[3] = CLIP (block[3] + dest[3]);
-	dest[4] = CLIP (block[4] + dest[4]);
-	dest[5] = CLIP (block[5] + dest[5]);
-	dest[6] = CLIP (block[6] + dest[6]);
-	dest[7] = CLIP (block[7] + dest[7]);
+void mpeg2_idct_init (uint32_t accel)	/* picks the mpeg2_idct_copy / mpeg2_idct_add implementations from the accel flag bits */
+{
+#ifdef ARCH_X86
+    if (accel & MPEG2_ACCEL_X86_MMXEXT) {
+	mpeg2_idct_copy = mpeg2_idct_copy_mmxext;
+	mpeg2_idct_add = mpeg2_idct_add_mmxext;
+	mpeg2_idct_mmx_init ();
+    } else if (accel & MPEG2_ACCEL_X86_MMX) {
+	mpeg2_idct_copy = mpeg2_idct_copy_mmx;
+	mpeg2_idct_add = mpeg2_idct_add_mmx;
+	mpeg2_idct_mmx_init ();
+    } else	/* continues with the next compiled-in test or the C fallback */
+#endif
+#ifdef ARCH_PPC
+    if (accel & MPEG2_ACCEL_PPC_ALTIVEC) {
+	mpeg2_idct_copy = mpeg2_idct_copy_altivec;
+	mpeg2_idct_add = mpeg2_idct_add_altivec;
+	mpeg2_idct_altivec_init ();
+    } else
+#endif
+#ifdef ARCH_ALPHA
+    if (accel & MPEG2_ACCEL_ALPHA_MVI) {
+	mpeg2_idct_copy = mpeg2_idct_copy_mvi;
+	mpeg2_idct_add = mpeg2_idct_add_mvi;
+	mpeg2_idct_alpha_init (0);
+    } else if (accel & MPEG2_ACCEL_ALPHA) {
+	mpeg2_idct_copy = mpeg2_idct_copy_alpha;
+	mpeg2_idct_add = mpeg2_idct_add_alpha;
+	mpeg2_idct_alpha_init (1);
+    } else
+#endif
+#ifdef LIBMPEG2_MLIB
+    if (accel & MPEG2_ACCEL_MLIB) {
+	mpeg2_idct_copy = mpeg2_idct_copy_mlib_non_ieee;	/* copy always uses the non-IEEE routine */
+	mpeg2_idct_add = (getenv ("MLIB_NON_IEEE") ?	/* add: non-IEEE routine only if this environment variable is set */
+			  mpeg2_idct_add_mlib_non_ieee : mpeg2_idct_add_mlib);
+    } else
+#endif
+    {	/* plain C implementation */
+	extern uint8_t mpeg2_scan_norm[64];	/* NOTE(review): both tables are rewritten in place below, so running this branch twice permutes them twice */
+	extern uint8_t mpeg2_scan_alt[64];
+	int i, j;
 
-	dest += stride;
-	block += 8;
-    } while (--i);
+	mpeg2_idct_copy = mpeg2_idct_copy_c;
+	mpeg2_idct_add = mpeg2_idct_add_c;
+	for (i = -384; i < 640; i++)	/* clip_lut[i + 384] = i clamped to 0..255 */
+	    clip_lut[i+384] = (i < 0) ? 0 : ((i > 255) ? 255 : i);
+	for (i = 0; i < 64; i++) {	/* rotate bits 0-2 and bits 3-5 of every scan entry right by one */
+	    j = mpeg2_scan_norm[i];
+	    mpeg2_scan_norm[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);	/* bits 1,2,4,5 move down one place; bits 0,3 move up two */
+	    j = mpeg2_scan_alt[i];
+	    mpeg2_scan_alt[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
+	}
+    }
 }
diff --git a/libmpeg2/idct_mlib.c b/libmpeg2/idct_mlib.c
index 876ab574a4..eae2a2f1be 100644
--- a/libmpeg2/idct_mlib.c
+++ b/libmpeg2/idct_mlib.c
@@ -1,8 +1,9 @@
 /*
  * idct_mlib.c
- * Copyright (C) 1999-2001 H�kan Hjort <d95hjort@dtek.chalmers.se>
+ * Copyright (C) 1999-2002 Håkan Hjort <d95hjort@dtek.chalmers.se>
  *
  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
  *
  * mpeg2dec is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -23,25 +24,37 @@
 
 #ifdef LIBMPEG2_MLIB
 
-#include <inttypes.h>
 #include <mlib_types.h>
 #include <mlib_status.h>
 #include <mlib_sys.h>
 #include <mlib_video.h>
+#include <string.h>
+#include <inttypes.h>
 
+#include "mpeg2.h"
 #include "mpeg2_internal.h"
 
-void idct_block_copy_mlib (int16_t * block, uint8_t * dest, int stride)
+void mpeg2_idct_add_mlib (const int last, int16_t * const block,	/* last is not used in this body */
+			  uint8_t * const dest, const int stride)
+{
+    mlib_VideoIDCT_IEEE_S16_S16 (block, block);	/* block is passed as both arguments (in-place transform) */
+    mlib_VideoAddBlock_U8_S16 (dest, block, stride);	/* presumably adds block onto the pixels at dest - confirm against the mediaLib docs */
+    memset (block, 0, 64 * sizeof (uint16_t));	/* zero all 64 coefficients; uint16_t has the same size as the int16_t elements */
+}
+
+void mpeg2_idct_copy_mlib_non_ieee (int16_t * const block,	/* mediaLib IDCT written to dest, then block is cleared */
+				    uint8_t * const dest, const int stride)
 {
     mlib_VideoIDCT8x8_U8_S16 (dest, block, stride);
+    memset (block, 0, 64 * sizeof (uint16_t));	/* zero all 64 coefficients; uint16_t has the same size as the int16_t elements */
 }
 
-void idct_block_add_mlib (int16_t * block, uint8_t * dest, int stride)
+void mpeg2_idct_add_mlib_non_ieee (const int last, int16_t * const block,	/* last is not used in this body */
+				   uint8_t * const dest, const int stride)
 {
-    /* Should we use mlib_VideoIDCT_IEEE_S16_S16 here ?? */
-    /* it's ~30% slower. */
     mlib_VideoIDCT8x8_S16_S16 (block, block);
     mlib_VideoAddBlock_U8_S16 (dest, block, stride);
+    memset (block, 0, 64 * sizeof (uint16_t));	/* zero all 64 coefficients; uint16_t has the same size as the int16_t elements */
 }
 
 #endif
diff --git a/libmpeg2/idct_mmx.c b/libmpeg2/idct_mmx.c
index 70b3b9b95e..4915b93750 100644
--- a/libmpeg2/idct_mmx.c
+++ b/libmpeg2/idct_mmx.c
@@ -1,8 +1,10 @@
 /*
  * idct_mmx.c
- * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
  *
  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
  *
  * mpeg2dec is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -25,6 +27,7 @@
 
 #include <inttypes.h>
 
+#include "mpeg2.h"
 #include "mpeg2_internal.h"
 #include "attributes.h"
 #include "mmx.h"
@@ -87,104 +90,107 @@ static inline void idct_row (int16_t * row, int offset,
 						   c5, -c1,  c3, -c1,	\
 						   c7,  c3,  c7, -c5 }
 
-static inline void mmxext_row_head (int16_t * row, int offset, int16_t * table)
+static inline void mmxext_row_head (int16_t * const row, const int offset,
+				    const int16_t * const table)
 {
-    movq_m2r (*(row+offset), mm2);	// mm2 = x6 x4 x2 x0
+    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */
 
-    movq_m2r (*(row+offset+4), mm5);	// mm5 = x7 x5 x3 x1
-    movq_r2r (mm2, mm0);		// mm0 = x6 x4 x2 x0
+    movq_m2r (*(row+offset+4), mm5);	/* mm5 = x7 x5 x3 x1 */
+    movq_r2r (mm2, mm0);		/* mm0 = x6 x4 x2 x0 */
 
-    movq_m2r (*table, mm3);		// mm3 = -C2 -C4 C2 C4
-    movq_r2r (mm5, mm6);		// mm6 = x7 x5 x3 x1
+    movq_m2r (*table, mm3);		/* mm3 = -C2 -C4 C2 C4 */
+    movq_r2r (mm5, mm6);		/* mm6 = x7 x5 x3 x1 */
 
-    movq_m2r (*(table+4), mm4);		// mm4 = C6 C4 C6 C4
-    pmaddwd_r2r (mm0, mm3);		// mm3 = -C4*x4-C2*x6 C4*x0+C2*x2
+    movq_m2r (*(table+4), mm4);		/* mm4 = C6 C4 C6 C4 */
+    pmaddwd_r2r (mm0, mm3);		/* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */
 
-    pshufw_r2r (mm2, mm2, 0x4e);	// mm2 = x2 x0 x6 x4
+    pshufw_r2r (mm2, mm2, 0x4e);	/* mm2 = x2 x0 x6 x4 */
 }
 
-static inline void mmxext_row (int16_t * table, int32_t * rounder)
+static inline void mmxext_row (const int16_t * const table,
+			       const int32_t * const rounder)
 {
-    movq_m2r (*(table+8), mm1);		// mm1 = -C5 -C1 C3 C1
-    pmaddwd_r2r (mm2, mm4);		// mm4 = C4*x0+C6*x2 C4*x4+C6*x6
+    movq_m2r (*(table+8), mm1);		/* mm1 = -C5 -C1 C3 C1 */
+    pmaddwd_r2r (mm2, mm4);		/* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */
 
-    pmaddwd_m2r (*(table+16), mm0);	// mm0 = C4*x4-C6*x6 C4*x0-C6*x2
-    pshufw_r2r (mm6, mm6, 0x4e);	// mm6 = x3 x1 x7 x5
+    pmaddwd_m2r (*(table+16), mm0);	/* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */
+    pshufw_r2r (mm6, mm6, 0x4e);	/* mm6 = x3 x1 x7 x5 */
 
-    movq_m2r (*(table+12), mm7);	// mm7 = -C7 C3 C7 C5
-    pmaddwd_r2r (mm5, mm1);		// mm1 = -C1*x5-C5*x7 C1*x1+C3*x3
+    movq_m2r (*(table+12), mm7);	/* mm7 = -C7 C3 C7 C5 */
+    pmaddwd_r2r (mm5, mm1);		/* mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 */
 
-    paddd_m2r (*rounder, mm3);		// mm3 += rounder
-    pmaddwd_r2r (mm6, mm7);		// mm7 = C3*x1-C7*x3 C5*x5+C7*x7
+    paddd_m2r (*rounder, mm3);		/* mm3 += rounder */
+    pmaddwd_r2r (mm6, mm7);		/* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */
 
-    pmaddwd_m2r (*(table+20), mm2);	// mm2 = C4*x0-C2*x2 -C4*x4+C2*x6
-    paddd_r2r (mm4, mm3);		// mm3 = a1 a0 + rounder
+    pmaddwd_m2r (*(table+20), mm2);	/* mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 */
+    paddd_r2r (mm4, mm3);		/* mm3 = a1 a0 + rounder */
 
-    pmaddwd_m2r (*(table+24), mm5);	// mm5 = C3*x5-C1*x7 C5*x1-C1*x3
-    movq_r2r (mm3, mm4);		// mm4 = a1 a0 + rounder
+    pmaddwd_m2r (*(table+24), mm5);	/* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */
+    movq_r2r (mm3, mm4);		/* mm4 = a1 a0 + rounder */
 
-    pmaddwd_m2r (*(table+28), mm6);	// mm6 = C7*x1-C5*x3 C7*x5+C3*x7
-    paddd_r2r (mm7, mm1);		// mm1 = b1 b0
+    pmaddwd_m2r (*(table+28), mm6);	/* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */
+    paddd_r2r (mm7, mm1);		/* mm1 = b1 b0 */
 
-    paddd_m2r (*rounder, mm0);		// mm0 += rounder
-    psubd_r2r (mm1, mm3);		// mm3 = a1-b1 a0-b0 + rounder
+    paddd_m2r (*rounder, mm0);		/* mm0 += rounder */
+    psubd_r2r (mm1, mm3);		/* mm3 = a1-b1 a0-b0 + rounder */
 
-    psrad_i2r (ROW_SHIFT, mm3);		// mm3 = y6 y7
-    paddd_r2r (mm4, mm1);		// mm1 = a1+b1 a0+b0 + rounder
+    psrad_i2r (ROW_SHIFT, mm3);		/* mm3 = y6 y7 */
+    paddd_r2r (mm4, mm1);		/* mm1 = a1+b1 a0+b0 + rounder */
 
-    paddd_r2r (mm2, mm0);		// mm0 = a3 a2 + rounder
-    psrad_i2r (ROW_SHIFT, mm1);		// mm1 = y1 y0
+    paddd_r2r (mm2, mm0);		/* mm0 = a3 a2 + rounder */
+    psrad_i2r (ROW_SHIFT, mm1);		/* mm1 = y1 y0 */
 
-    paddd_r2r (mm6, mm5);		// mm5 = b3 b2
-    movq_r2r (mm0, mm4);		// mm4 = a3 a2 + rounder
+    paddd_r2r (mm6, mm5);		/* mm5 = b3 b2 */
+    movq_r2r (mm0, mm4);		/* mm4 = a3 a2 + rounder */
 
-    paddd_r2r (mm5, mm0);		// mm0 = a3+b3 a2+b2 + rounder
-    psubd_r2r (mm5, mm4);		// mm4 = a3-b3 a2-b2 + rounder
+    paddd_r2r (mm5, mm0);		/* mm0 = a3+b3 a2+b2 + rounder */
+    psubd_r2r (mm5, mm4);		/* mm4 = a3-b3 a2-b2 + rounder */
 }
 
-static inline void mmxext_row_tail (int16_t * row, int store)
+static inline void mmxext_row_tail (int16_t * const row, const int store)
 {
-    psrad_i2r (ROW_SHIFT, mm0);		// mm0 = y3 y2
+    psrad_i2r (ROW_SHIFT, mm0);		/* mm0 = y3 y2 */
 
-    psrad_i2r (ROW_SHIFT, mm4);		// mm4 = y4 y5
+    psrad_i2r (ROW_SHIFT, mm4);		/* mm4 = y4 y5 */
 
-    packssdw_r2r (mm0, mm1);		// mm1 = y3 y2 y1 y0
+    packssdw_r2r (mm0, mm1);		/* mm1 = y3 y2 y1 y0 */
 
-    packssdw_r2r (mm3, mm4);		// mm4 = y6 y7 y4 y5
+    packssdw_r2r (mm3, mm4);		/* mm4 = y6 y7 y4 y5 */
 
-    movq_r2m (mm1, *(row+store));	// save y3 y2 y1 y0
-    pshufw_r2r (mm4, mm4, 0xb1);	// mm4 = y7 y6 y5 y4
+    movq_r2m (mm1, *(row+store));	/* save y3 y2 y1 y0 */
+    pshufw_r2r (mm4, mm4, 0xb1);	/* mm4 = y7 y6 y5 y4 */
 
     /* slot */
 
-    movq_r2m (mm4, *(row+store+4));	// save y7 y6 y5 y4
+    movq_r2m (mm4, *(row+store+4));	/* save y7 y6 y5 y4 */
 }
 
-static inline void mmxext_row_mid (int16_t * row, int store,
-				   int offset, int16_t * table)
+static inline void mmxext_row_mid (int16_t * const row, const int store,
+				   const int offset,
+				   const int16_t * const table)
 {
-    movq_m2r (*(row+offset), mm2);	// mm2 = x6 x4 x2 x0
-    psrad_i2r (ROW_SHIFT, mm0);		// mm0 = y3 y2
+    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */
+    psrad_i2r (ROW_SHIFT, mm0);		/* mm0 = y3 y2 */
 
-    movq_m2r (*(row+offset+4), mm5);	// mm5 = x7 x5 x3 x1
-    psrad_i2r (ROW_SHIFT, mm4);		// mm4 = y4 y5
+    movq_m2r (*(row+offset+4), mm5);	/* mm5 = x7 x5 x3 x1 */
+    psrad_i2r (ROW_SHIFT, mm4);		/* mm4 = y4 y5 */
 
-    packssdw_r2r (mm0, mm1);		// mm1 = y3 y2 y1 y0
-    movq_r2r (mm5, mm6);		// mm6 = x7 x5 x3 x1
+    packssdw_r2r (mm0, mm1);		/* mm1 = y3 y2 y1 y0 */
+    movq_r2r (mm5, mm6);		/* mm6 = x7 x5 x3 x1 */
 
-    packssdw_r2r (mm3, mm4);		// mm4 = y6 y7 y4 y5
-    movq_r2r (mm2, mm0);		// mm0 = x6 x4 x2 x0
+    packssdw_r2r (mm3, mm4);		/* mm4 = y6 y7 y4 y5 */
+    movq_r2r (mm2, mm0);		/* mm0 = x6 x4 x2 x0 */
 
-    movq_r2m (mm1, *(row+store));	// save y3 y2 y1 y0
-    pshufw_r2r (mm4, mm4, 0xb1);	// mm4 = y7 y6 y5 y4
+    movq_r2m (mm1, *(row+store));	/* save y3 y2 y1 y0 */
+    pshufw_r2r (mm4, mm4, 0xb1);	/* mm4 = y7 y6 y5 y4 */
 
-    movq_m2r (*table, mm3);		// mm3 = -C2 -C4 C2 C4
-    movq_r2m (mm4, *(row+store+4));	// save y7 y6 y5 y4
+    movq_m2r (*table, mm3);		/* mm3 = -C2 -C4 C2 C4 */
+    movq_r2m (mm4, *(row+store+4));	/* save y7 y6 y5 y4 */
 
-    pmaddwd_r2r (mm0, mm3);		// mm3 = -C4*x4-C2*x6 C4*x0+C2*x2
+    pmaddwd_r2r (mm0, mm3);		/* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */
 
-    movq_m2r (*(table+4), mm4);		// mm4 = C6 C4 C6 C4
-    pshufw_r2r (mm2, mm2, 0x4e);	// mm2 = x2 x0 x6 x4
+    movq_m2r (*(table+4), mm4);		/* mm4 = C6 C4 C6 C4 */
+    pshufw_r2r (mm2, mm2, 0x4e);	/* mm2 = x2 x0 x6 x4 */
 }
 
 
@@ -199,125 +205,127 @@ static inline void mmxext_row_mid (int16_t * row, int store,
 					   c5, -c1,  c7, -c5,	\
 					   c7,  c3,  c3, -c1 }
 
-static inline void mmx_row_head (int16_t * row, int offset, int16_t * table)
+static inline void mmx_row_head (int16_t * const row, const int offset,
+				 const int16_t * const table)
 {
-    movq_m2r (*(row+offset), mm2);	// mm2 = x6 x4 x2 x0
+    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */
 
-    movq_m2r (*(row+offset+4), mm5);	// mm5 = x7 x5 x3 x1
-    movq_r2r (mm2, mm0);		// mm0 = x6 x4 x2 x0
+    movq_m2r (*(row+offset+4), mm5);	/* mm5 = x7 x5 x3 x1 */
+    movq_r2r (mm2, mm0);		/* mm0 = x6 x4 x2 x0 */
 
-    movq_m2r (*table, mm3);		// mm3 = C6 C4 C2 C4
-    movq_r2r (mm5, mm6);		// mm6 = x7 x5 x3 x1
+    movq_m2r (*table, mm3);		/* mm3 = C6 C4 C2 C4 */
+    movq_r2r (mm5, mm6);		/* mm6 = x7 x5 x3 x1 */
 
-    punpckldq_r2r (mm0, mm0);		// mm0 = x2 x0 x2 x0
+    punpckldq_r2r (mm0, mm0);		/* mm0 = x2 x0 x2 x0 */
 
-    movq_m2r (*(table+4), mm4);		// mm4 = -C2 -C4 C6 C4
-    pmaddwd_r2r (mm0, mm3);		// mm3 = C4*x0+C6*x2 C4*x0+C2*x2
+    movq_m2r (*(table+4), mm4);		/* mm4 = -C2 -C4 C6 C4 */
+    pmaddwd_r2r (mm0, mm3);		/* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */
 
-    movq_m2r (*(table+8), mm1);		// mm1 = -C7 C3 C3 C1
-    punpckhdq_r2r (mm2, mm2);		// mm2 = x6 x4 x6 x4
+    movq_m2r (*(table+8), mm1);		/* mm1 = -C7 C3 C3 C1 */
+    punpckhdq_r2r (mm2, mm2);		/* mm2 = x6 x4 x6 x4 */
 }
 
-static inline void mmx_row (int16_t * table, int32_t * rounder)
+static inline void mmx_row (const int16_t * const table,
+			    const int32_t * const rounder)
 {
-    pmaddwd_r2r (mm2, mm4);		// mm4 = -C4*x4-C2*x6 C4*x4+C6*x6
-    punpckldq_r2r (mm5, mm5);		// mm5 = x3 x1 x3 x1
+    pmaddwd_r2r (mm2, mm4);		/* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */
+    punpckldq_r2r (mm5, mm5);		/* mm5 = x3 x1 x3 x1 */
 
-    pmaddwd_m2r (*(table+16), mm0);	// mm0 = C4*x0-C2*x2 C4*x0-C6*x2
-    punpckhdq_r2r (mm6, mm6);		// mm6 = x7 x5 x7 x5
+    pmaddwd_m2r (*(table+16), mm0);	/* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */
+    punpckhdq_r2r (mm6, mm6);		/* mm6 = x7 x5 x7 x5 */
 
-    movq_m2r (*(table+12), mm7);	// mm7 = -C5 -C1 C7 C5
-    pmaddwd_r2r (mm5, mm1);		// mm1 = C3*x1-C7*x3 C1*x1+C3*x3
+    movq_m2r (*(table+12), mm7);	/* mm7 = -C5 -C1 C7 C5 */
+    pmaddwd_r2r (mm5, mm1);		/* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */
 
-    paddd_m2r (*rounder, mm3);		// mm3 += rounder
-    pmaddwd_r2r (mm6, mm7);		// mm7 = -C1*x5-C5*x7 C5*x5+C7*x7
+    paddd_m2r (*rounder, mm3);		/* mm3 += rounder */
+    pmaddwd_r2r (mm6, mm7);		/* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */
 
-    pmaddwd_m2r (*(table+20), mm2);	// mm2 = C4*x4-C6*x6 -C4*x4+C2*x6
-    paddd_r2r (mm4, mm3);		// mm3 = a1 a0 + rounder
+    pmaddwd_m2r (*(table+20), mm2);	/* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */
+    paddd_r2r (mm4, mm3);		/* mm3 = a1 a0 + rounder */
 
-    pmaddwd_m2r (*(table+24), mm5);	// mm5 = C7*x1-C5*x3 C5*x1-C1*x3
-    movq_r2r (mm3, mm4);		// mm4 = a1 a0 + rounder
+    pmaddwd_m2r (*(table+24), mm5);	/* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */
+    movq_r2r (mm3, mm4);		/* mm4 = a1 a0 + rounder */
 
-    pmaddwd_m2r (*(table+28), mm6);	// mm6 = C3*x5-C1*x7 C7*x5+C3*x7
-    paddd_r2r (mm7, mm1);		// mm1 = b1 b0
+    pmaddwd_m2r (*(table+28), mm6);	/* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */
+    paddd_r2r (mm7, mm1);		/* mm1 = b1 b0 */
 
-    paddd_m2r (*rounder, mm0);		// mm0 += rounder
-    psubd_r2r (mm1, mm3);		// mm3 = a1-b1 a0-b0 + rounder
+    paddd_m2r (*rounder, mm0);		/* mm0 += rounder */
+    psubd_r2r (mm1, mm3);		/* mm3 = a1-b1 a0-b0 + rounder */
 
-    psrad_i2r (ROW_SHIFT, mm3);		// mm3 = y6 y7
-    paddd_r2r (mm4, mm1);		// mm1 = a1+b1 a0+b0 + rounder
+    psrad_i2r (ROW_SHIFT, mm3);		/* mm3 = y6 y7 */
+    paddd_r2r (mm4, mm1);		/* mm1 = a1+b1 a0+b0 + rounder */
 
-    paddd_r2r (mm2, mm0);		// mm0 = a3 a2 + rounder
-    psrad_i2r (ROW_SHIFT, mm1);		// mm1 = y1 y0
+    paddd_r2r (mm2, mm0);		/* mm0 = a3 a2 + rounder */
+    psrad_i2r (ROW_SHIFT, mm1);		/* mm1 = y1 y0 */
 
-    paddd_r2r (mm6, mm5);		// mm5 = b3 b2
-    movq_r2r (mm0, mm7);		// mm7 = a3 a2 + rounder
+    paddd_r2r (mm6, mm5);		/* mm5 = b3 b2 */
+    movq_r2r (mm0, mm7);		/* mm7 = a3 a2 + rounder */
 
-    paddd_r2r (mm5, mm0);		// mm0 = a3+b3 a2+b2 + rounder
-    psubd_r2r (mm5, mm7);		// mm7 = a3-b3 a2-b2 + rounder
+    paddd_r2r (mm5, mm0);		/* mm0 = a3+b3 a2+b2 + rounder */
+    psubd_r2r (mm5, mm7);		/* mm7 = a3-b3 a2-b2 + rounder */
 }
 
-static inline void mmx_row_tail (int16_t * row, int store)
+static inline void mmx_row_tail (int16_t * const row, const int store)
 {
-    psrad_i2r (ROW_SHIFT, mm0);		// mm0 = y3 y2
+    psrad_i2r (ROW_SHIFT, mm0);		/* mm0 = y3 y2 */
 
-    psrad_i2r (ROW_SHIFT, mm7);		// mm7 = y4 y5
+    psrad_i2r (ROW_SHIFT, mm7);		/* mm7 = y4 y5 */
 
-    packssdw_r2r (mm0, mm1);		// mm1 = y3 y2 y1 y0
+    packssdw_r2r (mm0, mm1);		/* mm1 = y3 y2 y1 y0 */
 
-    packssdw_r2r (mm3, mm7);		// mm7 = y6 y7 y4 y5
+    packssdw_r2r (mm3, mm7);		/* mm7 = y6 y7 y4 y5 */
 
-    movq_r2m (mm1, *(row+store));	// save y3 y2 y1 y0
-    movq_r2r (mm7, mm4);		// mm4 = y6 y7 y4 y5
+    movq_r2m (mm1, *(row+store));	/* save y3 y2 y1 y0 */
+    movq_r2r (mm7, mm4);		/* mm4 = y6 y7 y4 y5 */
 
-    pslld_i2r (16, mm7);		// mm7 = y7 0 y5 0
+    pslld_i2r (16, mm7);		/* mm7 = y7 0 y5 0 */
 
-    psrld_i2r (16, mm4);		// mm4 = 0 y6 0 y4
+    psrld_i2r (16, mm4);		/* mm4 = 0 y6 0 y4 */
 
-    por_r2r (mm4, mm7);			// mm7 = y7 y6 y5 y4
+    por_r2r (mm4, mm7);			/* mm7 = y7 y6 y5 y4 */
 
     /* slot */
 
-    movq_r2m (mm7, *(row+store+4));	// save y7 y6 y5 y4
+    movq_r2m (mm7, *(row+store+4));	/* save y7 y6 y5 y4 */
 }
 
-static inline void mmx_row_mid (int16_t * row, int store,
-				int offset, int16_t * table)
+static inline void mmx_row_mid (int16_t * const row, const int store,
+				const int offset, const int16_t * const table)
 {
-    movq_m2r (*(row+offset), mm2);	// mm2 = x6 x4 x2 x0
-    psrad_i2r (ROW_SHIFT, mm0);		// mm0 = y3 y2
+    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */
+    psrad_i2r (ROW_SHIFT, mm0);		/* mm0 = y3 y2 */
 
-    movq_m2r (*(row+offset+4), mm5);	// mm5 = x7 x5 x3 x1
-    psrad_i2r (ROW_SHIFT, mm7);		// mm7 = y4 y5
+    movq_m2r (*(row+offset+4), mm5);	/* mm5 = x7 x5 x3 x1 */
+    psrad_i2r (ROW_SHIFT, mm7);		/* mm7 = y4 y5 */
 
-    packssdw_r2r (mm0, mm1);		// mm1 = y3 y2 y1 y0
-    movq_r2r (mm5, mm6);		// mm6 = x7 x5 x3 x1
+    packssdw_r2r (mm0, mm1);		/* mm1 = y3 y2 y1 y0 */
+    movq_r2r (mm5, mm6);		/* mm6 = x7 x5 x3 x1 */
 
-    packssdw_r2r (mm3, mm7);		// mm7 = y6 y7 y4 y5
-    movq_r2r (mm2, mm0);		// mm0 = x6 x4 x2 x0
+    packssdw_r2r (mm3, mm7);		/* mm7 = y6 y7 y4 y5 */
+    movq_r2r (mm2, mm0);		/* mm0 = x6 x4 x2 x0 */
 
-    movq_r2m (mm1, *(row+store));	// save y3 y2 y1 y0
-    movq_r2r (mm7, mm1);		// mm1 = y6 y7 y4 y5
+    movq_r2m (mm1, *(row+store));	/* save y3 y2 y1 y0 */
+    movq_r2r (mm7, mm1);		/* mm1 = y6 y7 y4 y5 */
 
-    punpckldq_r2r (mm0, mm0);		// mm0 = x2 x0 x2 x0
-    psrld_i2r (16, mm7);		// mm7 = 0 y6 0 y4
+    punpckldq_r2r (mm0, mm0);		/* mm0 = x2 x0 x2 x0 */
+    psrld_i2r (16, mm7);		/* mm7 = 0 y6 0 y4 */
 
-    movq_m2r (*table, mm3);		// mm3 = C6 C4 C2 C4
-    pslld_i2r (16, mm1);		// mm1 = y7 0 y5 0
+    movq_m2r (*table, mm3);		/* mm3 = C6 C4 C2 C4 */
+    pslld_i2r (16, mm1);		/* mm1 = y7 0 y5 0 */
 
-    movq_m2r (*(table+4), mm4);		// mm4 = -C2 -C4 C6 C4
-    por_r2r (mm1, mm7);			// mm7 = y7 y6 y5 y4
+    movq_m2r (*(table+4), mm4);		/* mm4 = -C2 -C4 C6 C4 */
+    por_r2r (mm1, mm7);			/* mm7 = y7 y6 y5 y4 */
 
-    movq_m2r (*(table+8), mm1);		// mm1 = -C7 C3 C3 C1
-    punpckhdq_r2r (mm2, mm2);		// mm2 = x6 x4 x6 x4
+    movq_m2r (*(table+8), mm1);		/* mm1 = -C7 C3 C3 C1 */
+    punpckhdq_r2r (mm2, mm2);		/* mm2 = x6 x4 x6 x4 */
 
-    movq_r2m (mm7, *(row+store+4));	// save y7 y6 y5 y4
-    pmaddwd_r2r (mm0, mm3);		// mm3 = C4*x0+C6*x2 C4*x0+C2*x2
+    movq_r2m (mm7, *(row+store+4));	/* save y7 y6 y5 y4 */
+    pmaddwd_r2r (mm0, mm3);		/* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */
 }
 
 
 #if 0
-// C column IDCT - its just here to document the MMXEXT and MMX versions
+/* C column IDCT - its just here to document the MMXEXT and MMX versions */
 static inline void idct_col (int16_t * col, int offset)
 {
 /* multiplication - as implemented on mmx */
@@ -388,178 +396,178 @@ static inline void idct_col (int16_t * col, int offset)
 #endif
 
 
-// MMX column IDCT
-static inline void idct_col (int16_t * col, int offset)
+/* MMX column IDCT */
+static inline void idct_col (int16_t * const col, const int offset)
 {
 #define T1 13036
 #define T2 27146
 #define T3 43790
 #define C4 23170
 
-    static short _T1[] ATTR_ALIGN(8) = {T1,T1,T1,T1};
-    static short _T2[] ATTR_ALIGN(8) = {T2,T2,T2,T2};
-    static short _T3[] ATTR_ALIGN(8) = {T3,T3,T3,T3};
-    static short _C4[] ATTR_ALIGN(8) = {C4,C4,C4,C4};
+    static const short _T1[] ATTR_ALIGN(8) = {T1,T1,T1,T1};
+    static const short _T2[] ATTR_ALIGN(8) = {T2,T2,T2,T2};
+    static const short _T3[] ATTR_ALIGN(8) = {T3,T3,T3,T3};
+    static const short _C4[] ATTR_ALIGN(8) = {C4,C4,C4,C4};
 
     /* column code adapted from peter gubanov */
     /* http://www.elecard.com/peter/idct.shtml */
 
-    movq_m2r (*_T1, mm0);		// mm0 = T1
+    movq_m2r (*_T1, mm0);		/* mm0 = T1 */
 
-    movq_m2r (*(col+offset+1*8), mm1);	// mm1 = x1
-    movq_r2r (mm0, mm2);		// mm2 = T1
+    movq_m2r (*(col+offset+1*8), mm1);	/* mm1 = x1 */
+    movq_r2r (mm0, mm2);		/* mm2 = T1 */
 
-    movq_m2r (*(col+offset+7*8), mm4);	// mm4 = x7
-    pmulhw_r2r (mm1, mm0);		// mm0 = T1*x1
+    movq_m2r (*(col+offset+7*8), mm4);	/* mm4 = x7 */
+    pmulhw_r2r (mm1, mm0);		/* mm0 = T1*x1 */
 
-    movq_m2r (*_T3, mm5);		// mm5 = T3
-    pmulhw_r2r (mm4, mm2);		// mm2 = T1*x7
+    movq_m2r (*_T3, mm5);		/* mm5 = T3 */
+    pmulhw_r2r (mm4, mm2);		/* mm2 = T1*x7 */
 
-    movq_m2r (*(col+offset+5*8), mm6);	// mm6 = x5
-    movq_r2r (mm5, mm7);		// mm7 = T3-1
+    movq_m2r (*(col+offset+5*8), mm6);	/* mm6 = x5 */
+    movq_r2r (mm5, mm7);		/* mm7 = T3-1 */
 
-    movq_m2r (*(col+offset+3*8), mm3);	// mm3 = x3
-    psubsw_r2r (mm4, mm0);		// mm0 = v17
+    movq_m2r (*(col+offset+3*8), mm3);	/* mm3 = x3 */
+    psubsw_r2r (mm4, mm0);		/* mm0 = v17 */
 
-    movq_m2r (*_T2, mm4);		// mm4 = T2
-    pmulhw_r2r (mm3, mm5);		// mm5 = (T3-1)*x3
+    movq_m2r (*_T2, mm4);		/* mm4 = T2 */
+    pmulhw_r2r (mm3, mm5);		/* mm5 = (T3-1)*x3 */
 
-    paddsw_r2r (mm2, mm1);		// mm1 = u17
-    pmulhw_r2r (mm6, mm7);		// mm7 = (T3-1)*x5
+    paddsw_r2r (mm2, mm1);		/* mm1 = u17 */
+    pmulhw_r2r (mm6, mm7);		/* mm7 = (T3-1)*x5 */
 
     /* slot */
 
-    movq_r2r (mm4, mm2);		// mm2 = T2
-    paddsw_r2r (mm3, mm5);		// m