From 0b6eb24b9a8034287f67f800fc61d07b7f018891 Mon Sep 17 00:00:00 2001 From: arpi Date: Sun, 6 Apr 2003 16:36:02 +0000 Subject: Importing libmpeg2 from mpeg2dec-0.3.1 git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@9853 b3059339-0415-0410-9bf9-f77b7e298cf2 --- libmpeg2/Makefile | 5 +- libmpeg2/attributes.h | 14 +- libmpeg2/header.c | 714 +++++++++++++++++----- libmpeg2/idct.c | 416 +++++++------ libmpeg2/idct_mlib.c | 25 +- libmpeg2/idct_mmx.c | 605 +++++++++++-------- libmpeg2/mm_accel.h | 30 - libmpeg2/mmx.h | 26 +- libmpeg2/motion_comp.c | 142 ++--- libmpeg2/motion_comp_mlib.c | 148 ++--- libmpeg2/motion_comp_mmx.c | 530 ++++++++--------- libmpeg2/mpeg2.h | 176 ++++-- libmpeg2/mpeg2_internal.h | 287 +++++---- libmpeg2/slice.c | 1368 +++++++++++++++++++++---------------------- libmpeg2/sse.h | 256 -------- libmpeg2/stats.c | 315 ---------- libmpeg2/vlc.h | 59 +- 17 files changed, 2605 insertions(+), 2511 deletions(-) delete mode 100644 libmpeg2/mm_accel.h delete mode 100644 libmpeg2/sse.h delete mode 100644 libmpeg2/stats.c (limited to 'libmpeg2') diff --git a/libmpeg2/Makefile b/libmpeg2/Makefile index 914b41844d..6ee925ddb9 100644 --- a/libmpeg2/Makefile +++ b/libmpeg2/Makefile @@ -3,9 +3,8 @@ LIBNAME = libmpeg2.a include ../config.mak -SRCS = header.c idct.c idct_mmx.c idct_mlib.c \ - motion_comp.c motion_comp_mmx.c motion_comp_mlib.c \ - slice.c stats.c # decode.c +SRCS = alloc.c cpu_accel.c cpu_state.c decode.c header.c idct.c idct_alpha.c idct_altivec.c idct_mlib.c idct_mmx.c motion_comp.c motion_comp_alpha.c motion_comp_altivec.c motion_comp_mlib.c motion_comp_mmx.c slice.c + OBJS = $(SRCS:.c=.o) INCLUDE = -I. -I../libvo -I.. $(EXTRA_INC) $(MLIB_INC) CFLAGS = $(OPTFLAGS) $(INCLUDE) -DMPG12PLAY diff --git a/libmpeg2/attributes.h b/libmpeg2/attributes.h index ab7105c2df..96a86b26c0 100644 --- a/libmpeg2/attributes.h +++ b/libmpeg2/attributes.h @@ -1,8 +1,10 @@ /* * attributes.h - * Copyright (C) 1999-2001 Aaron Holtzman + * Copyright (C) 2000-2002 Michel Lespinasse + * Copyright (C) 1999-2000 Aaron Holtzman * * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. + * See http://libmpeg2.sourceforge.net/ for updates. * * mpeg2dec is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -21,7 +23,15 @@ /* use gcc attribs to align critical data structures */ #ifdef ATTRIBUTE_ALIGNED_MAX -#define ATTR_ALIGN(align) __attribute__ ((__aligned__ ((ATTRIBUTE_ALIGNED_MAX < (align)) ? ATTRIBUTE_ALIGNED_MAX : (align)))) +#define ATTR_ALIGN(align) __attribute__ ((__aligned__ ((ATTRIBUTE_ALIGNED_MAX < align) ? ATTRIBUTE_ALIGNED_MAX : align))) #else #define ATTR_ALIGN(align) #endif + +#ifdef HAVE_BUILTIN_EXPECT +#define likely(x) __builtin_expect ((x) != 0, 1) +#define unlikely(x) __builtin_expect ((x) != 0, 0) +#else +#define likely(x) (x) +#define unlikely(x) (x) +#endif diff --git a/libmpeg2/header.c b/libmpeg2/header.c index 68483a71c1..548d6bf21e 100644 --- a/libmpeg2/header.c +++ b/libmpeg2/header.c @@ -1,8 +1,10 @@ /* - * slice.c - * Copyright (C) 1999-2001 Aaron Holtzman + * header.c + * Copyright (C) 2000-2002 Michel Lespinasse + * Copyright (C) 1999-2000 Aaron Holtzman * * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. + * See http://libmpeg2.sourceforge.net/ for updates. * * mpeg2dec is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -22,13 +24,23 @@ #include "config.h" #include -#include +#include /* defines NULL */ +#include /* memcmp */ +#include "mpeg2.h" #include "mpeg2_internal.h" +#include "convert.h" #include "attributes.h" +#define SEQ_EXT 2 +#define SEQ_DISPLAY_EXT 4 +#define QUANT_MATRIX_EXT 8 +#define COPYRIGHT_EXT 0x10 +#define PIC_DISPLAY_EXT 0x80 +#define PIC_CODING_EXT 0x100 + /* default intra quant matrix, in zig-zag order */ -static uint8_t default_intra_quantizer_matrix[64] ATTR_ALIGN(16) = { +static const uint8_t default_intra_quantizer_matrix[64] ATTR_ALIGN(16) = { 8, 16, 16, 19, 16, 19, @@ -46,214 +58,634 @@ static uint8_t default_intra_quantizer_matrix[64] ATTR_ALIGN(16) = { 83 }; -uint8_t scan_norm[64] ATTR_ALIGN(16) = -{ +uint8_t mpeg2_scan_norm[64] ATTR_ALIGN(16) = { /* Zig-Zag scan pattern */ - 0, 1, 8,16, 9, 2, 3,10, - 17,24,32,25,18,11, 4, 5, - 12,19,26,33,40,48,41,34, - 27,20,13, 6, 7,14,21,28, - 35,42,49,56,57,50,43,36, - 29,22,15,23,30,37,44,51, - 58,59,52,45,38,31,39,46, - 53,60,61,54,47,55,62,63 + 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63 }; -uint8_t scan_alt[64] ATTR_ALIGN(16) = -{ +uint8_t mpeg2_scan_alt[64] ATTR_ALIGN(16) = { /* Alternate scan pattern */ - 0,8,16,24,1,9,2,10,17,25,32,40,48,56,57,49, - 41,33,26,18,3,11,4,12,19,27,34,42,50,58,35,43, - 51,59,20,28,5,13,6,14,21,29,36,44,52,60,37,45, - 53,61,22,30,7,15,23,31,38,46,54,62,39,47,55,63 + 0, 8, 16, 24, 1, 9, 2, 10, 17, 25, 32, 40, 48, 56, 57, 49, + 41, 33, 26, 18, 3, 11, 4, 12, 19, 27, 34, 42, 50, 58, 35, 43, + 51, 59, 20, 28, 5, 13, 6, 14, 21, 29, 36, 44, 52, 60, 37, 45, + 53, 61, 22, 30, 7, 15, 23, 31, 38, 46, 54, 62, 39, 47, 55, 63 }; -void header_state_init (picture_t * picture) +void mpeg2_header_state_init (mpeg2dec_t * mpeg2dec) { - picture->scan = scan_norm; + mpeg2dec->decoder.scan = mpeg2_scan_norm; + mpeg2dec->picture = mpeg2dec->pictures; + mpeg2dec->fbuf[0] = &mpeg2dec->fbuf_alloc[0].fbuf; + mpeg2dec->fbuf[1] = &mpeg2dec->fbuf_alloc[1].fbuf; + mpeg2dec->fbuf[2] = &mpeg2dec->fbuf_alloc[2].fbuf; + mpeg2dec->first = 1; + mpeg2dec->alloc_index = 0; + mpeg2dec->alloc_index_user = 0; } -int header_process_sequence_header (picture_t * picture, uint8_t * buffer) +static void reset_info (mpeg2_info_t * info) { + info->current_picture = info->current_picture_2nd = NULL; + info->display_picture = info->display_picture_2nd = NULL; + info->current_fbuf = info->display_fbuf = info->discard_fbuf = NULL; + info->user_data = NULL; info->user_data_len = 0; +} + +int mpeg2_header_sequence (mpeg2dec_t * mpeg2dec) +{ + uint8_t * buffer = mpeg2dec->chunk_start; + sequence_t * sequence = &(mpeg2dec->new_sequence); + decoder_t * decoder = &(mpeg2dec->decoder); + static unsigned int frame_period[9] = { + 0, 1126125, 1125000, 1080000, 900900, 900000, 540000, 450450, 450000 + }; int width, height; int i; - if ((buffer[6] & 0x20) != 0x20){ - printf("missing marker bit!\n"); - return 1; /* missing marker_bit */ - } + if ((buffer[6] & 0x20) != 0x20) /* missing marker_bit */ + return 1; - height = (buffer[0] << 16) | (buffer[1] << 8) | buffer[2]; + i = (buffer[0] << 16) | (buffer[1] << 8) | buffer[2]; + sequence->display_width = sequence->picture_width = width = i >> 12; + sequence->display_height = sequence->picture_height = height = i & 0xfff; + decoder->width = sequence->width = width = (width + 15) & ~15; + decoder->height = sequence->height = height = (height + 15) & ~15; + decoder->vertical_position_extension = (height > 2800); + sequence->chroma_width = width >> 1; + sequence->chroma_height = height >> 1; - picture->display_picture_width = (height >> 12); - picture->display_picture_height = (height & 0xfff); + sequence->flags = SEQ_FLAG_PROGRESSIVE_SEQUENCE; - width = ((height >> 12) + 15) & ~15; - height = ((height & 0xfff) + 15) & ~15; + sequence->pixel_width = buffer[3] >> 4; /* aspect ratio */ + sequence->frame_period = 0; + if ((buffer[3] & 15) < 9) + sequence->frame_period = frame_period[buffer[3] & 15]; - if ((width > 768) || (height > 576)){ - printf("size restrictions for MP@ML or MPEG1 exceeded! (%dx%d)\n",width,height); -// return 1; /* size restrictions for MP@ML or MPEG1 */ - } - - picture->coded_picture_width = width; - picture->coded_picture_height = height; + sequence->byte_rate = (buffer[4]<<10) | (buffer[5]<<2) | (buffer[6]>>6); - /* this is not used by the decoder */ - picture->aspect_ratio_information = buffer[3] >> 4; - picture->frame_rate_code = buffer[3] & 15; - picture->bitrate = (buffer[4]<<10)|(buffer[5]<<2)|(buffer[6]>>6); + sequence->vbv_buffer_size = ((buffer[6]<<16)|(buffer[7]<<8))&0x1ff800; + + if (buffer[7] & 4) + sequence->flags |= SEQ_FLAG_CONSTRAINED_PARAMETERS; if (buffer[7] & 2) { for (i = 0; i < 64; i++) - picture->intra_quantizer_matrix[scan_norm[i]] = + decoder->intra_quantizer_matrix[mpeg2_scan_norm[i]] = (buffer[i+7] << 7) | (buffer[i+8] >> 1); buffer += 64; - } else { + } else for (i = 0; i < 64; i++) - picture->intra_quantizer_matrix[scan_norm[i]] = + decoder->intra_quantizer_matrix[mpeg2_scan_norm[i]] = default_intra_quantizer_matrix [i]; - } - if (buffer[7] & 1) { + if (buffer[7] & 1) for (i = 0; i < 64; i++) - picture->non_intra_quantizer_matrix[scan_norm[i]] = + decoder->non_intra_quantizer_matrix[mpeg2_scan_norm[i]] = buffer[i+8]; - } else { + else for (i = 0; i < 64; i++) - picture->non_intra_quantizer_matrix[i] = 16; + decoder->non_intra_quantizer_matrix[i] = 16; + + sequence->profile_level_id = 0x80; + sequence->colour_primaries = 1; + sequence->transfer_characteristics = 1; + sequence->matrix_coefficients = 1; + + decoder->mpeg1 = 1; + decoder->intra_dc_precision = 0; + decoder->frame_pred_frame_dct = 1; + decoder->q_scale_type = 0; + decoder->concealment_motion_vectors = 0; + decoder->scan = mpeg2_scan_norm; + decoder->picture_structure = FRAME_PICTURE; + + mpeg2dec->ext_state = SEQ_EXT; + mpeg2dec->state = STATE_SEQUENCE; + mpeg2dec->display_offset_x = mpeg2dec->display_offset_y = 0; + + reset_info (&(mpeg2dec->info)); + return 0; +} + +static int sequence_ext (mpeg2dec_t * mpeg2dec) +{ + uint8_t * buffer = mpeg2dec->chunk_start; + sequence_t * sequence = &(mpeg2dec->new_sequence); + decoder_t * decoder = &(mpeg2dec->decoder); + int width, height; + uint32_t flags; + + if (!(buffer[3] & 1)) + return 1; + + sequence->profile_level_id = (buffer[0] << 4) | (buffer[1] >> 4); + + width = sequence->display_width = sequence->picture_width += + ((buffer[1] << 13) | (buffer[2] << 5)) & 0x3000; + height = sequence->display_height = sequence->picture_height += + (buffer[2] << 7) & 0x3000; + decoder->vertical_position_extension = (height > 2800); + flags = sequence->flags | SEQ_FLAG_MPEG2; + if (!(buffer[1] & 8)) { + flags &= ~SEQ_FLAG_PROGRESSIVE_SEQUENCE; + height = (height + 31) & ~31; + } + if (buffer[5] & 0x80) + flags |= SEQ_FLAG_LOW_DELAY; + sequence->flags = flags; + decoder->width = sequence->width = width = (width + 15) & ~15; + decoder->height = sequence->height = height = (height + 15) & ~15; + switch (buffer[1] & 6) { + case 0: /* invalid */ + return 1; + case 2: /* 4:2:0 */ + height >>= 1; + case 4: /* 4:2:2 */ + width >>= 1; } + sequence->chroma_width = width; + sequence->chroma_height = height; - /* MPEG1 - for testing only */ - picture->mpeg1 = 1; - picture->intra_dc_precision = 0; - picture->frame_pred_frame_dct = 1; - picture->q_scale_type = 0; - picture->concealment_motion_vectors = 0; - /* picture->alternate_scan = 0; */ - picture->picture_structure = FRAME_PICTURE; - /* picture->second_field = 0; */ + sequence->byte_rate += ((buffer[2]<<25) | (buffer[3]<<17)) & 0x3ffc0000; + + sequence->vbv_buffer_size |= buffer[4] << 21; + + sequence->frame_period = + sequence->frame_period * ((buffer[5]&31)+1) / (((buffer[5]>>2)&3)+1); + + decoder->mpeg1 = 0; + + mpeg2dec->ext_state = SEQ_DISPLAY_EXT; return 0; } -static int header_process_sequence_extension (picture_t * picture, - uint8_t * buffer) +static int sequence_display_ext (mpeg2dec_t * mpeg2dec) { - /* check chroma format, size extensions, marker bit */ - if (((buffer[1] & 0x07) != 0x02) || (buffer[2] & 0xe0) || - ((buffer[3] & 0x01) != 0x01)) + uint8_t * buffer = mpeg2dec->chunk_start; + sequence_t * sequence = &(mpeg2dec->new_sequence); + uint32_t flags; + + flags = ((sequence->flags & ~SEQ_MASK_VIDEO_FORMAT) | + ((buffer[0]<<4) & SEQ_MASK_VIDEO_FORMAT)); + if (buffer[0] & 1) { + flags |= SEQ_FLAG_COLOUR_DESCRIPTION; + sequence->colour_primaries = buffer[1]; + sequence->transfer_characteristics = buffer[2]; + sequence->matrix_coefficients = buffer[3]; + buffer += 3; + } + + if (!(buffer[2] & 2)) /* missing marker_bit */ return 1; - /* this is not used by the decoder */ - picture->progressive_sequence = (buffer[1] >> 3) & 1; + sequence->display_width = (buffer[1] << 6) | (buffer[2] >> 2); + sequence->display_height = + ((buffer[2]& 1 ) << 13) | (buffer[3] << 5) | (buffer[4] >> 3); + + return 0; +} + +static inline void finalize_sequence (sequence_t * sequence) +{ + int width; + int height; + + sequence->byte_rate *= 50; + + if (sequence->flags & SEQ_FLAG_MPEG2) { + switch (sequence->pixel_width) { + case 1: /* square pixels */ + sequence->pixel_width = sequence->pixel_height = 1; return; + case 2: /* 4:3 aspect ratio */ + width = 4; height = 3; break; + case 3: /* 16:9 aspect ratio */ + width = 16; height = 9; break; + case 4: /* 2.21:1 aspect ratio */ + width = 221; height = 100; break; + default: /* illegal */ + sequence->pixel_width = sequence->pixel_height = 0; return; + } + width *= sequence->display_height; + height *= sequence->display_width; + + } else { + if (sequence->byte_rate == 50 * 0x3ffff) + sequence->byte_rate = 0; /* mpeg-1 VBR */ + + switch (sequence->pixel_width) { + case 0: case 15: /* illegal */ + sequence->pixel_width = sequence->pixel_height = 0; return; + case 1: /* square pixels */ + sequence->pixel_width = sequence->pixel_height = 1; return; + case 3: /* 720x576 16:9 */ + sequence->pixel_width = 64; sequence->pixel_height = 45; return; + case 6: /* 720x480 16:9 */ + sequence->pixel_width = 32; sequence->pixel_height = 27; return; + case 12: /* 720*480 4:3 */ + sequence->pixel_width = 8; sequence->pixel_height = 9; return; + default: + height = 88 * sequence->pixel_width + 1171; + width = 2000; + } + } - if (picture->progressive_sequence) - picture->coded_picture_height = - (picture->coded_picture_height + 31) & ~31; + sequence->pixel_width = width; + sequence->pixel_height = height; + while (width) { /* find greatest common divisor */ + int tmp = width; + width = height % tmp; + height = tmp; + } + sequence->pixel_width /= height; + sequence->pixel_height /= height; +} - /* MPEG1 - for testing only */ - picture->mpeg1 = 0; +void mpeg2_header_sequence_finalize (mpeg2dec_t * mpeg2dec) +{ + sequence_t * sequence = &(mpeg2dec->new_sequence); + + finalize_sequence (sequence); + + /* + * according to 6.1.1.6, repeat sequence headers should be + * identical to the original. However some DVDs dont respect that + * and have different bitrates in the repeat sequence headers. So + * we'll ignore that in the comparison and still consider these as + * repeat sequence headers. + */ + mpeg2dec->sequence.byte_rate = sequence->byte_rate; + if (!memcmp (&(mpeg2dec->sequence), sequence, sizeof (sequence_t))) + mpeg2dec->state = STATE_SEQUENCE_REPEATED; + mpeg2dec->sequence = *sequence; + + mpeg2dec->info.sequence = &(mpeg2dec->sequence); +} +int mpeg2_header_gop (mpeg2dec_t * mpeg2dec) +{ + mpeg2dec->state = STATE_GOP; + reset_info (&(mpeg2dec->info)); return 0; } -static int header_process_quant_matrix_extension (picture_t * picture, - uint8_t * buffer) +void mpeg2_set_fbuf (mpeg2dec_t * mpeg2dec, int coding_type) { int i; - if (buffer[0] & 8) { - for (i = 0; i < 64; i++) - picture->intra_quantizer_matrix[scan_norm[i]] = - (buffer[i] << 5) | (buffer[i+1] >> 3); - buffer += 64; + for (i = 0; i < 3; i++) + if (mpeg2dec->fbuf[1] != &mpeg2dec->fbuf_alloc[i].fbuf && + mpeg2dec->fbuf[2] != &mpeg2dec->fbuf_alloc[i].fbuf) { + mpeg2dec->fbuf[0] = &mpeg2dec->fbuf_alloc[i].fbuf; + mpeg2dec->info.current_fbuf = mpeg2dec->fbuf[0]; + if ((coding_type == B_TYPE) || + (mpeg2dec->sequence.flags & SEQ_FLAG_LOW_DELAY)) { + if ((coding_type == B_TYPE) || (mpeg2dec->convert_start)) + mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[0]; + mpeg2dec->info.display_fbuf = mpeg2dec->fbuf[0]; + } + break; + } +} + +int mpeg2_header_picture_start (mpeg2dec_t * mpeg2dec) +{ + decoder_t * decoder = &(mpeg2dec->decoder); + picture_t * picture; + + if (mpeg2dec->state != STATE_SLICE_1ST) { + mpeg2dec->state = STATE_PICTURE; + picture = mpeg2dec->pictures; + if ((decoder->coding_type != PIC_FLAG_CODING_TYPE_B) ^ + (mpeg2dec->picture >= mpeg2dec->pictures + 2)) + picture += 2; + } else { + mpeg2dec->state = STATE_PICTURE_2ND; + picture = mpeg2dec->picture + 1; /* second field picture */ } + mpeg2dec->picture = picture; + picture->flags = 0; + if (mpeg2dec->num_pts) { + if (mpeg2dec->bytes_since_pts >= 4) { + mpeg2dec->num_pts = 0; + picture->pts = mpeg2dec->pts_current; + picture->flags = PIC_FLAG_PTS; + } else if (mpeg2dec->num_pts > 1) { + mpeg2dec->num_pts = 1; + picture->pts = mpeg2dec->pts_previous; + picture->flags = PIC_FLAG_PTS; + } + } + picture->display_offset[0].x = picture->display_offset[1].x = + picture->display_offset[2].x = mpeg2dec->display_offset_x; + picture->display_offset[0].y = picture->display_offset[1].y = + picture->display_offset[2].y = mpeg2dec->display_offset_y; + return mpeg2_parse_header (mpeg2dec); +} - if (buffer[0] & 4) { - for (i = 0; i < 64; i++) - picture->non_intra_quantizer_matrix[scan_norm[i]] = - (buffer[i] << 6) | (buffer[i+1] >> 2); +int mpeg2_header_picture (mpeg2dec_t * mpeg2dec) +{ + uint8_t * buffer = mpeg2dec->chunk_start; + picture_t * picture = mpeg2dec->picture; + decoder_t * decoder = &(mpeg2dec->decoder); + int type; + int low_delay; + + type = (buffer [1] >> 3) & 7; + low_delay = mpeg2dec->sequence.flags & SEQ_FLAG_LOW_DELAY; + + if (mpeg2dec->state == STATE_PICTURE) { + picture_t * other; + + decoder->second_field = 0; + other = mpeg2dec->pictures; + if (other == picture) + other += 2; + if (decoder->coding_type != PIC_FLAG_CODING_TYPE_B) { + mpeg2dec->fbuf[2] = mpeg2dec->fbuf[1]; + mpeg2dec->fbuf[1] = mpeg2dec->fbuf[0]; + } + mpeg2dec->fbuf[0] = NULL; + reset_info (&(mpeg2dec->info)); + mpeg2dec->info.current_picture = picture; + mpeg2dec->info.display_picture = picture; + if (type != PIC_FLAG_CODING_TYPE_B) { + if (!low_delay) { + if (mpeg2dec->first) { + mpeg2dec->info.display_picture = NULL; + mpeg2dec->first = 0; + } else { + mpeg2dec->info.display_picture = other; + if (other->nb_fields == 1) + mpeg2dec->info.display_picture_2nd = other + 1; + mpeg2dec->info.display_fbuf = mpeg2dec->fbuf[1]; + } + } + if (!low_delay + !mpeg2dec->convert_start) + mpeg2dec->info.discard_fbuf = + mpeg2dec->fbuf[!low_delay + !mpeg2dec->convert_start]; + } + if (!mpeg2dec->custom_fbuf) { + while (mpeg2dec->alloc_index < 3) { + fbuf_t * fbuf; + + fbuf = &(mpeg2dec->fbuf_alloc[mpeg2dec->alloc_index++].fbuf); + fbuf->id = NULL; + if (mpeg2dec->convert_start) { + fbuf->buf[0] = + (uint8_t *) mpeg2_malloc (mpeg2dec->convert_size[0], + ALLOC_CONVERTED); + fbuf->buf[1] = fbuf->buf[0] + mpeg2dec->convert_size[1]; + fbuf->buf[2] = fbuf->buf[0] + mpeg2dec->convert_size[2]; + } else { + int size; + size = mpeg2dec->decoder.width * mpeg2dec->decoder.height; + fbuf->buf[0] = (uint8_t *) mpeg2_malloc (6 * size >> 2, + ALLOC_YUV); + fbuf->buf[1] = fbuf->buf[0] + size; + fbuf->buf[2] = fbuf->buf[1] + (size >> 2); + } + } + mpeg2_set_fbuf (mpeg2dec, type); + } + } else { + decoder->second_field = 1; + mpeg2dec->info.current_picture_2nd = picture; + mpeg2dec->info.user_data = NULL; mpeg2dec->info.user_data_len = 0; + if (low_delay || type == PIC_FLAG_CODING_TYPE_B) + mpeg2dec->info.display_picture_2nd = picture; + } + mpeg2dec->ext_state = PIC_CODING_EXT; + + picture->temporal_reference = (buffer[0] << 2) | (buffer[1] >> 6); + + decoder->coding_type = type; + picture->flags |= type; + + if (type == PIC_FLAG_CODING_TYPE_P || type == PIC_FLAG_CODING_TYPE_B) { + /* forward_f_code and backward_f_code - used in mpeg1 only */ + decoder->f_motion.f_code[1] = (buffer[3] >> 2) & 1; + decoder->f_motion.f_code[0] = + (((buffer[3] << 1) | (buffer[4] >> 7)) & 7) - 1; + decoder->b_motion.f_code[1] = (buffer[4] >> 6) & 1; + decoder->b_motion.f_code[0] = ((buffer[4] >> 3) & 7) - 1; } + /* XXXXXX decode extra_information_picture as well */ + + picture->nb_fields = 2; + return 0; } -static int header_process_picture_coding_extension (picture_t * picture, uint8_t * buffer) +static int picture_coding_ext (mpeg2dec_t * mpeg2dec) { + uint8_t * buffer = mpeg2dec->chunk_start; + picture_t * picture = mpeg2dec->picture; + decoder_t * decoder = &(mpeg2dec->decoder); + uint32_t flags; + /* pre subtract 1 for use later in compute_motion_vector */ - picture->f_motion.f_code[0] = (buffer[0] & 15) - 1; - picture->f_motion.f_code[1] = (buffer[1] >> 4) - 1; - picture->b_motion.f_code[0] = (buffer[1] & 15) - 1; - picture->b_motion.f_code[1] = (buffer[2] >> 4) - 1; - - picture->intra_dc_precision = (buffer[2] >> 2) & 3; - picture->picture_structure = buffer[2] & 3; - picture->frame_pred_frame_dct = (buffer[3] >> 6) & 1; - picture->concealment_motion_vectors = (buffer[3] >> 5) & 1; - picture->q_scale_type = (buffer[3] >> 4) & 1; - picture->intra_vlc_format = (buffer[3] >> 3) & 1; - - if (buffer[3] & 4) /* alternate_scan */ - picture->scan = scan_alt; - else - picture->scan = scan_norm; - - /* these are not used by the decoder */ - picture->top_field_first = buffer[3] >> 7; - picture->repeat_first_field = (buffer[3] >> 1) & 1; - picture->progressive_frame = buffer[4] >> 7; - - // repeat_first implementation by A'rpi/ESP-team, based on libmpeg3: - picture->display_time=100; - if(picture->repeat_first_field){ - if(picture->progressive_sequence){ - if(picture->top_field_first) - picture->display_time+=200; - else - picture->display_time+=100; - } else - if(picture->progressive_frame){ - picture->display_time+=50; - } + decoder->f_motion.f_code[0] = (buffer[0] & 15) - 1; + decoder->f_motion.f_code[1] = (buffer[1] >> 4) - 1; + decoder->b_motion.f_code[0] = (buffer[1] & 15) - 1; + decoder->b_motion.f_code[1] = (buffer[2] >> 4) - 1; + + flags = picture->flags; + decoder->intra_dc_precision = (buffer[2] >> 2) & 3; + decoder->picture_structure = buffer[2] & 3; + switch (decoder->picture_structure) { + case TOP_FIELD: + flags |= PIC_FLAG_TOP_FIELD_FIRST; + case BOTTOM_FIELD: + picture->nb_fields = 1; + break; + case FRAME_PICTURE: + if (!(mpeg2dec->sequence.flags & SEQ_FLAG_PROGRESSIVE_SEQUENCE)) { + picture->nb_fields = (buffer[3] & 2) ? 3 : 2; + flags |= (buffer[3] & 128) ? PIC_FLAG_TOP_FIELD_FIRST : 0; + } else + picture->nb_fields = (buffer[3]&2) ? ((buffer[3]&128) ? 6 : 4) : 2; + break; + default: + return 1; + } + decoder->top_field_first = buffer[3] >> 7; + decoder->frame_pred_frame_dct = (buffer[3] >> 6) & 1; + decoder->concealment_motion_vectors = (buffer[3] >> 5) & 1; + decoder->q_scale_type = (buffer[3] >> 4) & 1; + decoder->intra_vlc_format = (buffer[3] >> 3) & 1; + decoder->scan = (buffer[3] & 4) ? mpeg2_scan_alt : mpeg2_scan_norm; + flags |= (buffer[4] & 0x80) ? PIC_FLAG_PROGRESSIVE_FRAME : 0; + if (buffer[4] & 0x40) + flags |= (((buffer[4]<<26) | (buffer[5]<<18) | (buffer[6]<<10)) & + PIC_MASK_COMPOSITE_DISPLAY) | PIC_FLAG_COMPOSITE_DISPLAY; + picture->flags = flags; + + mpeg2dec->ext_state = PIC_DISPLAY_EXT | COPYRIGHT_EXT | QUANT_MATRIX_EXT; + + return 0; +} + +static int picture_display_ext (mpeg2dec_t * mpeg2dec) +{ + uint8_t * buffer = mpeg2dec->chunk_start; + picture_t * picture = mpeg2dec->picture; + int i, nb_pos; + + nb_pos = picture->nb_fields; + if (mpeg2dec->sequence.flags & SEQ_FLAG_PROGRESSIVE_SEQUENCE) + nb_pos >>= 1; + + for (i = 0; i < nb_pos; i++) { + int x, y; + + x = ((buffer[4*i] << 24) | (buffer[4*i+1] << 16) | + (buffer[4*i+2] << 8) | buffer[4*i+3]) >> (11-2*i); + y = ((buffer[4*i+2] << 24) | (buffer[4*i+3] << 16) | + (buffer[4*i+4] << 8) | buffer[4*i+5]) >> (10-2*i); + if (! (x & y & 1)) + return 1; + picture->display_offset[i].x = mpeg2dec->display_offset_x = x >> 1; + picture->display_offset[i].y = mpeg2dec->display_offset_y = y >> 1; + } + for (; i < 3; i++) { + picture->display_offset[i].x = mpeg2dec->display_offset_x; + picture->display_offset[i].y = mpeg2dec->display_offset_y; } - //temopral hack. We calc time on every field, so if we have 2 fields - // interlaced we'll end with double time for 1 frame - if( picture->picture_structure!=3 ) picture->display_time/=2; return 0; } -int header_process_extension (picture_t * picture, uint8_t * buffer) +static int copyright_ext (mpeg2dec_t * mpeg2dec) { - switch (buffer[0] & 0xf0) { - case 0x10: /* sequence extension */ - return header_process_sequence_extension (picture, buffer); + return 0; +} - case 0x30: /* quant matrix extension */ - return header_process_quant_matrix_extension (picture, buffer); +static int quant_matrix_ext (mpeg2dec_t * mpeg2dec) +{ + uint8_t * buffer = mpeg2dec->chunk_start; + decoder_t * decoder = &(mpeg2dec->decoder); + int i; - case 0x80: /* picture coding extension */ - return header_process_picture_coding_extension (picture, buffer); + if (buffer[0] & 8) { + for (i = 0; i < 64; i++) + decoder->intra_quantizer_matrix[mpeg2_scan_norm[i]] = + (buffer[i] << 5) | (buffer[i+1] >> 3); + buffer += 64; } + if (buffer[0] & 4) + for (i = 0; i < 64; i++) + decoder->non_intra_quantizer_matrix[mpeg2_scan_norm[i]] = + (buffer[i] << 6) | (buffer[i+1] >> 2); + return 0; } -int header_process_picture_header (picture_t *picture, uint8_t * buffer) +int mpeg2_header_extension (mpeg2dec_t * mpeg2dec) { - picture->picture_coding_type = (buffer [1] >> 3) & 7; + static int (* parser[]) (mpeg2dec_t *) = { + 0, sequence_ext, sequence_display_ext, quant_matrix_ext, + copyright_ext, 0, 0, picture_display_ext, picture_coding_ext + }; + int ext, ext_bit; + + ext = mpeg2dec->chunk_start[0] >> 4; + ext_bit = 1 << ext; + + if (!(mpeg2dec->ext_state & ext_bit)) + return 0; /* ignore illegal extensions */ + mpeg2dec->ext_state &= ~ext_bit; + return parser[ext] (mpeg2dec); +} - /* forward_f_code and backward_f_code - used in mpeg1 only */ - picture->f_motion.f_code[1] = (buffer[3] >> 2) & 1; - picture->f_motion.f_code[0] = - (((buffer[3] << 1) | (buffer[4] >> 7)) & 7) - 1; - picture->b_motion.f_code[1] = (buffer[4] >> 6) & 1; - picture->b_motion.f_code[0] = ((buffer[4] >> 3) & 7) - 1; +int mpeg2_header_user_data (mpeg2dec_t * mpeg2dec) +{ + if (!mpeg2dec->info.user_data_len) + mpeg2dec->info.user_data = mpeg2dec->chunk_start; + else + mpeg2dec->info.user_data_len += 3; + mpeg2dec->info.user_data_len += (mpeg2dec->chunk_ptr - 4 - + mpeg2dec->chunk_start); + mpeg2dec->chunk_start = mpeg2dec->chunk_ptr - 1; + + return 0; +} - /* move in header_process_picture_header */ - picture->second_field = - (picture->picture_structure != FRAME_PICTURE) && - !(picture->second_field); +int mpeg2_header_slice_start (mpeg2dec_t * mpeg2dec) +{ + mpeg2dec->state = ((mpeg2dec->picture->nb_fields > 1 || + mpeg2dec->state == STATE_PICTURE_2ND) ? + STATE_SLICE : STATE_SLICE_1ST); + + if (!(mpeg2dec->nb_decode_slices)) + mpeg2dec->picture->flags |= PIC_FLAG_SKIP; + else if (mpeg2dec->convert_start) { + int flags; + + switch (mpeg2dec->decoder.picture_structure) { + case TOP_FIELD: flags = CONVERT_TOP_FIELD; break; + case BOTTOM_FIELD: flags = CONVERT_BOTTOM_FIELD; break; + default: + flags = + ((mpeg2dec->sequence.flags & SEQ_FLAG_PROGRESSIVE_SEQUENCE) ? + CONVERT_FRAME : CONVERT_BOTH_FIELDS); + } + mpeg2dec->convert_start (mpeg2dec->convert_id, + mpeg2dec->fbuf[0]->buf, flags); + + mpeg2dec->decoder.convert = mpeg2dec->convert_copy; + mpeg2dec->decoder.fbuf_id = mpeg2dec->convert_id; + + if (mpeg2dec->decoder.coding_type == B_TYPE) + mpeg2_init_fbuf (&(mpeg2dec->decoder), mpeg2dec->yuv_buf[2], + mpeg2dec->yuv_buf[mpeg2dec->yuv_index ^ 1], + mpeg2dec->yuv_buf[mpeg2dec->yuv_index]); + else { + mpeg2_init_fbuf (&(mpeg2dec->decoder), + mpeg2dec->yuv_buf[mpeg2dec->yuv_index ^ 1], + mpeg2dec->yuv_buf[mpeg2dec->yuv_index], + mpeg2dec->yuv_buf[mpeg2dec->yuv_index]); + if (mpeg2dec->state == STATE_SLICE) + mpeg2dec->yuv_index ^= 1; + } + } else { + int b_type; + mpeg2dec->decoder.convert = NULL; + b_type = (mpeg2dec->decoder.coding_type == B_TYPE); + mpeg2_init_fbuf (&(mpeg2dec->decoder), mpeg2dec->fbuf[0]->buf, + mpeg2dec->fbuf[b_type + 1]->buf, + mpeg2dec->fbuf[b_type]->buf); + } + mpeg2dec->action = NULL; return 0; } + +int mpeg2_header_end (mpeg2dec_t * mpeg2dec) +{ + picture_t * picture; + int b_type; + + picture = mpeg2dec->pictures; + if (mpeg2dec->picture < picture + 2) + picture = mpeg2dec->pictures + 2; + + mpeg2dec->state = STATE_INVALID; + reset_info (&(mpeg2dec->info)); + b_type = (mpeg2dec->decoder.coding_type == B_TYPE); + if (!(mpeg2dec->sequence.flags & SEQ_FLAG_LOW_DELAY)) { + mpeg2dec->info.display_picture = picture; + if (picture->nb_fields == 1) + mpeg2dec->info.display_picture_2nd = picture + 1; + mpeg2dec->info.display_fbuf = mpeg2dec->fbuf[b_type]; + if (!mpeg2dec->convert_start) + mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[b_type + 1]; + } else if (!mpeg2dec->convert_start) + mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[b_type]; + mpeg2dec->action = mpeg2_seek_sequence; + return STATE_END; +} diff --git a/libmpeg2/idct.c b/libmpeg2/idct.c index 1e869c37de..bcae078156 100644 --- a/libmpeg2/idct.c +++ b/libmpeg2/idct.c @@ -1,12 +1,10 @@ /* * idct.c - * Copyright (C) 1999-2001 Aaron Holtzman - * - * Portions of this code are from the MPEG software simulation group - * idct implementation. This code will be replaced with a new - * implementation soon. + * Copyright (C) 2000-2002 Michel Lespinasse + * Copyright (C) 1999-2000 Aaron Holtzman * * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. + * See http://libmpeg2.sourceforge.net/ for updates. * * mpeg2dec is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -23,27 +21,14 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -/**********************************************************/ -/* inverse two dimensional DCT, Chen-Wang algorithm */ -/* (cf. IEEE ASSP-32, pp. 803-816, Aug. 1984) */ -/* 32-bit integer arithmetic (8 bit coefficients) */ -/* 11 mults, 29 adds per DCT */ -/* sE, 18.8.91 */ -/**********************************************************/ -/* coefficients extended to 12 bit for IEEE1180-1990 */ -/* compliance sE, 2.1.94 */ -/**********************************************************/ - -/* this code assumes >> to be a two's-complement arithmetic */ -/* right shift: (-2)>>1 == -1 , (-3)>>1 == -2 */ - #include "config.h" -#include +#include #include +#include "mpeg2.h" #include "mpeg2_internal.h" -#include "mm_accel.h" +#include "attributes.h" #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */ #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */ @@ -53,199 +38,131 @@ #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */ /* idct main entry point */ -void (*idct_block_copy) (int16_t * block, uint8_t * dest, int stride); -void (*idct_block_add) (int16_t * block, uint8_t * dest, int stride); - -static void idct_block_copy_c (int16_t *block, uint8_t * dest, int stride); -static void idct_block_add_c (int16_t *block, uint8_t * dest, int stride); +void (* mpeg2_idct_copy) (int16_t * block, uint8_t * dest, int stride); +void (* mpeg2_idct_add) (int last, int16_t * block, + uint8_t * dest, int stride); static uint8_t clip_lut[1024]; -#define CLIP(i) ((clip_lut+384)[ (i)]) +#define CLIP(i) ((clip_lut+384)[(i)]) -void idct_init (void) -{ -#ifdef ARCH_X86 - if (config.flags & MM_ACCEL_X86_MMXEXT) { - printf ("libmpeg2: Using MMXEXT for IDCT transform\n"); - idct_block_copy = idct_block_copy_mmxext; - idct_block_add = idct_block_add_mmxext; - idct_mmx_init (); - } else if (config.flags & MM_ACCEL_X86_MMX) { - printf ("libmpeg2: Using MMX for IDCT transform\n"); - idct_block_copy = idct_block_copy_mmx; - idct_block_add = idct_block_add_mmx; - idct_mmx_init (); - } else -#endif -#ifdef LIBMPEG2_MLIB - if (config.flags & MM_ACCEL_MLIB) { - printf ("libmpeg2: Using mlib for IDCT transform\n"); - idct_block_copy = idct_block_copy_mlib; - idct_block_add = idct_block_add_mlib; - } else +#if 0 +#define BUTTERFLY(t0,t1,W0,W1,d0,d1) \ +do { \ + t0 = W0*d0 + W1*d1; \ + t1 = W0*d1 - W1*d0; \ +} while (0) +#else +#define BUTTERFLY(t0,t1,W0,W1,d0,d1) \ +do { \ + int tmp = W0 * (d0 + d1); \ + t0 = tmp + (W1 - W0) * d1; \ + t1 = tmp - (W1 + W0) * d0; \ +} while (0) #endif - { - int i; - - printf ("libmpeg2: No accelerated IDCT transform found\n"); - idct_block_copy = idct_block_copy_c; - idct_block_add = idct_block_add_c; - for (i = -384; i < 640; i++) - clip_lut[i+384] = (i < 0) ? 0 : ((i > 255) ? 255 : i); - } -} -/* row (horizontal) IDCT - * - * 7 pi 1 - * dst[k] = sum c[l] * src[l] * cos ( -- * ( k + - ) * l ) - * l=0 8 2 - * - * where: c[0] = 128 - * c[1..7] = 128*sqrt (2) - */ - -static inline void idct_row (int16_t * block) +static void inline idct_row (int16_t * const block) { - int x0, x1, x2, x3, x4, x5, x6, x7, x8; - - x1 = block[4] << 11; - x2 = block[6]; - x3 = block[2]; - x4 = block[1]; - x5 = block[7]; - x6 = block[5]; - x7 = block[3]; + int d0, d1, d2, d3; + int a0, a1, a2, a3, b0, b1, b2, b3; + int t0, t1, t2, t3; /* shortcut */ - if (! (x1 | x2 | x3 | x4 | x5 | x6 | x7 )) { - block[0] = block[1] = block[2] = block[3] = block[4] = - block[5] = block[6] = block[7] = block[0]<<3; + if (likely (!(block[1] | ((int32_t *)block)[1] | ((int32_t *)block)[2] | + ((int32_t *)block)[3]))) { + uint32_t tmp = (uint16_t) (block[0] << 3); + tmp |= tmp << 16; + ((int32_t *)block)[0] = tmp; + ((int32_t *)block)[1] = tmp; + ((int32_t *)block)[2] = tmp; + ((int32_t *)block)[3] = tmp; return; } - x0 = (block[0] << 11) + 128; /* for proper rounding in the fourth stage */ - - /* first stage */ - x8 = W7 * (x4 + x5); - x4 = x8 + (W1 - W7) * x4; - x5 = x8 - (W1 + W7) * x5; - x8 = W3 * (x6 + x7); - x6 = x8 - (W3 - W5) * x6; - x7 = x8 - (W3 + W5) * x7; - - /* second stage */ - x8 = x0 + x1; - x0 -= x1; - x1 = W6 * (x3 + x2); - x2 = x1 - (W2 + W6) * x2; - x3 = x1 + (W2 - W6) * x3; - x1 = x4 + x6; - x4 -= x6; - x6 = x5 + x7; - x5 -= x7; - - /* third stage */ - x7 = x8 + x3; - x8 -= x3; - x3 = x0 + x2; - x0 -= x2; - x2 = (181 * (x4 + x5) + 128) >> 8; - x4 = (181 * (x4 - x5) + 128) >> 8; - - /* fourth stage */ - block[0] = (x7 + x1) >> 8; - block[1] = (x3 + x2) >> 8; - block[2] = (x0 + x4) >> 8; - block[3] = (x8 + x6) >> 8; - block[4] = (x8 - x6) >> 8; - block[5] = (x0 - x4) >> 8; - block[6] = (x3 - x2) >> 8; - block[7] = (x7 - x1) >> 8; + d0 = (block[0] << 11) + 128; + d1 = block[1]; + d2 = block[2] << 11; + d3 = block[3]; + t0 = d0 + d2; + t1 = d0 - d2; + BUTTERFLY (t2, t3, W6, W2, d3, d1); + a0 = t0 + t2; + a1 = t1 + t3; + a2 = t1 - t3; + a3 = t0 - t2; + + d0 = block[4]; + d1 = block[5]; + d2 = block[6]; + d3 = block[7]; + BUTTERFLY (t0, t1, W7, W1, d3, d0); + BUTTERFLY (t2, t3, W3, W5, d1, d2); + b0 = t0 + t2; + b3 = t1 + t3; + t0 -= t2; + t1 -= t3; + b1 = ((t0 + t1) * 181) >> 8; + b2 = ((t0 - t1) * 181) >> 8; + + block[0] = (a0 + b0) >> 8; + block[1] = (a1 + b1) >> 8; + block[2] = (a2 + b2) >> 8; + block[3] = (a3 + b3) >> 8; + block[4] = (a3 - b3) >> 8; + block[5] = (a2 - b2) >> 8; + block[6] = (a1 - b1) >> 8; + block[7] = (a0 - b0) >> 8; } -/* column (vertical) IDCT - * - * 7 pi 1 - * dst[8*k] = sum c[l] * src[8*l] * cos ( -- * ( k + - ) * l ) - * l=0 8 2 - * - * where: c[0] = 1/1024 - * c[1..7] = (1/1024)*sqrt (2) - */ - -static inline void idct_col (int16_t *block) +static void inline idct_col (int16_t * const block) { - int x0, x1, x2, x3, x4, x5, x6, x7, x8; - - /* shortcut */ - x1 = block [8*4] << 8; - x2 = block [8*6]; - x3 = block [8*2]; - x4 = block [8*1]; - x5 = block [8*7]; - x6 = block [8*5]; - x7 = block [8*3]; - -#if 0 - if (! (x1 | x2 | x3 | x4 | x5 | x6 | x7 )) { - block[8*0] = block[8*1] = block[8*2] = block[8*3] = block[8*4] = - block[8*5] = block[8*6] = block[8*7] = (block[8*0] + 32) >> 6; - return; - } -#endif - - x0 = (block[8*0] << 8) + 8192; - - /* first stage */ - x8 = W7 * (x4 + x5) + 4; - x4 = (x8 + (W1 - W7) * x4) >> 3; - x5 = (x8 - (W1 + W7) * x5) >> 3; - x8 = W3 * (x6 + x7) + 4; - x6 = (x8 - (W3 - W5) * x6) >> 3; - x7 = (x8 - (W3 + W5) * x7) >> 3; - - /* second stage */ - x8 = x0 + x1; - x0 -= x1; - x1 = W6 * (x3 + x2) + 4; - x2 = (x1 - (W2 + W6) * x2) >> 3; - x3 = (x1 + (W2 - W6) * x3) >> 3; - x1 = x4 + x6; - x4 -= x6; - x6 = x5 + x7; - x5 -= x7; - - /* third stage */ - x7 = x8 + x3; - x8 -= x3; - x3 = x0 + x2; - x0 -= x2; - x2 = (181 * (x4 + x5) + 128) >> 8; - x4 = (181 * (x4 - x5) + 128) >> 8; - - /* fourth stage */ - block[8*0] = (x7 + x1) >> 14; - block[8*1] = (x3 + x2) >> 14; - block[8*2] = (x0 + x4) >> 14; - block[8*3] = (x8 + x6) >> 14; - block[8*4] = (x8 - x6) >> 14; - block[8*5] = (x0 - x4) >> 14; - block[8*6] = (x3 - x2) >> 14; - block[8*7] = (x7 - x1) >> 14; + int d0, d1, d2, d3; + int a0, a1, a2, a3, b0, b1, b2, b3; + int t0, t1, t2, t3; + + d0 = (block[8*0] << 11) + 65536; + d1 = block[8*1]; + d2 = block[8*2] << 11; + d3 = block[8*3]; + t0 = d0 + d2; + t1 = d0 - d2; + BUTTERFLY (t2, t3, W6, W2, d3, d1); + a0 = t0 + t2; + a1 = t1 + t3; + a2 = t1 - t3; + a3 = t0 - t2; + + d0 = block[8*4]; + d1 = block[8*5]; + d2 = block[8*6]; + d3 = block[8*7]; + BUTTERFLY (t0, t1, W7, W1, d3, d0); + BUTTERFLY (t2, t3, W3, W5, d1, d2); + b0 = t0 + t2; + b3 = t1 + t3; + t0 = (t0 - t2) >> 8; + t1 = (t1 - t3) >> 8; + b1 = (t0 + t1) * 181; + b2 = (t0 - t1) * 181; + + block[8*0] = (a0 + b0) >> 17; + block[8*1] = (a1 + b1) >> 17; + block[8*2] = (a2 + b2) >> 17; + block[8*3] = (a3 + b3) >> 17; + block[8*4] = (a3 - b3) >> 17; + block[8*5] = (a2 - b2) >> 17; + block[8*6] = (a1 - b1) >> 17; + block[8*7] = (a0 - b0) >> 17; } -void idct_block_copy_c (int16_t * block, uint8_t * dest, int stride) +static void mpeg2_idct_copy_c (int16_t * block, uint8_t * dest, + const int stride) { int i; for (i = 0; i < 8; i++) idct_row (block + 8 * i); - for (i = 0; i < 8; i++) idct_col (block + i); - - i = 8; do { dest[0] = CLIP (block[0]); dest[1] = CLIP (block[1]); @@ -256,33 +173,112 @@ void idct_block_copy_c (int16_t * block, uint8_t * dest, int stride) dest[6] = CLIP (block[6]); dest[7] = CLIP (block[7]); + block[0] = 0; block[1] = 0; block[2] = 0; block[3] = 0; + block[4] = 0; block[5] = 0; block[6] = 0; block[7] = 0; + dest += stride; block += 8; } while (--i); } -void idct_block_add_c (int16_t * block, uint8_t * dest, int stride) +static void mpeg2_idct_add_c (const int last, int16_t * block, + uint8_t * dest, const int stride) { int i; - for (i = 0; i < 8; i++) - idct_row (block + 8 * i); - - for (i = 0; i < 8; i++) - idct_col (block + i); + if (last != 129 || (block[0] & 7) == 4) { + for (i = 0; i < 8; i++) + idct_row (block + 8 * i); + for (i = 0; i < 8; i++) + idct_col (block + i); + do { + dest[0] = CLIP (block[0] + dest[0]); + dest[1] = CLIP (block[1] + dest[1]); + dest[2] = CLIP (block[2] + dest[2]); + dest[3] = CLIP (block[3] + dest[3]); + dest[4] = CLIP (block[4] + dest[4]); + dest[5] = CLIP (block[5] + dest[5]); + dest[6] = CLIP (block[6] + dest[6]); + dest[7] = CLIP (block[7] + dest[7]); + + block[0] = 0; block[1] = 0; block[2] = 0; block[3] = 0; + block[4] = 0; block[5] = 0; block[6] = 0; block[7] = 0; + + dest += stride; + block += 8; + } while (--i); + } else { + int DC; + + DC = (block[0] + 4) >> 3; + block[0] = block[63] = 0; + i = 8; + do { + dest[0] = CLIP (DC + dest[0]); + dest[1] = CLIP (DC + dest[1]); + dest[2] = CLIP (DC + dest[2]); + dest[3] = CLIP (DC + dest[3]); + dest[4] = CLIP (DC + dest[4]); + dest[5] = CLIP (DC + dest[5]); + dest[6] = CLIP (DC + dest[6]); + dest[7] = CLIP (DC + dest[7]); + dest += stride; + } while (--i); + } +} - i = 8; - do { - dest[0] = CLIP (block[0] + dest[0]); - dest[1] = CLIP (block[1] + dest[1]); - dest[2] = CLIP (block[2] + dest[2]); - dest[3] = CLIP (block[3] + dest[3]); - dest[4] = CLIP (block[4] + dest[4]); - dest[5] = CLIP (block[5] + dest[5]); - dest[6] = CLIP (block[6] + dest[6]); - dest[7] = CLIP (block[7] + dest[7]); +void mpeg2_idct_init (uint32_t accel) +{ +#ifdef ARCH_X86 + if (accel & MPEG2_ACCEL_X86_MMXEXT) { + mpeg2_idct_copy = mpeg2_idct_copy_mmxext; + mpeg2_idct_add = mpeg2_idct_add_mmxext; + mpeg2_idct_mmx_init (); + } else if (accel & MPEG2_ACCEL_X86_MMX) { + mpeg2_idct_copy = mpeg2_idct_copy_mmx; + mpeg2_idct_add = mpeg2_idct_add_mmx; + mpeg2_idct_mmx_init (); + } else +#endif +#ifdef ARCH_PPC + if (accel & MPEG2_ACCEL_PPC_ALTIVEC) { + mpeg2_idct_copy = mpeg2_idct_copy_altivec; + mpeg2_idct_add = mpeg2_idct_add_altivec; + mpeg2_idct_altivec_init (); + } else +#endif +#ifdef ARCH_ALPHA + if (accel & MPEG2_ACCEL_ALPHA_MVI) { + mpeg2_idct_copy = mpeg2_idct_copy_mvi; + mpeg2_idct_add = mpeg2_idct_add_mvi; + mpeg2_idct_alpha_init (0); + } else if (accel & MPEG2_ACCEL_ALPHA) { + mpeg2_idct_copy = mpeg2_idct_copy_alpha; + mpeg2_idct_add = mpeg2_idct_add_alpha; + mpeg2_idct_alpha_init (1); + } else +#endif +#ifdef LIBMPEG2_MLIB + if (accel & MPEG2_ACCEL_MLIB) { + mpeg2_idct_copy = mpeg2_idct_copy_mlib_non_ieee; + mpeg2_idct_add = (getenv ("MLIB_NON_IEEE") ? + mpeg2_idct_add_mlib_non_ieee : mpeg2_idct_add_mlib); + } else +#endif + { + extern uint8_t mpeg2_scan_norm[64]; + extern uint8_t mpeg2_scan_alt[64]; + int i, j; - dest += stride; - block += 8; - } while (--i); + mpeg2_idct_copy = mpeg2_idct_copy_c; + mpeg2_idct_add = mpeg2_idct_add_c; + for (i = -384; i < 640; i++) + clip_lut[i+384] = (i < 0) ? 0 : ((i > 255) ? 255 : i); + for (i = 0; i < 64; i++) { + j = mpeg2_scan_norm[i]; + mpeg2_scan_norm[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2); + j = mpeg2_scan_alt[i]; + mpeg2_scan_alt[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2); + } + } } diff --git a/libmpeg2/idct_mlib.c b/libmpeg2/idct_mlib.c index 876ab574a4..eae2a2f1be 100644 --- a/libmpeg2/idct_mlib.c +++ b/libmpeg2/idct_mlib.c @@ -1,8 +1,9 @@ /* * idct_mlib.c - * Copyright (C) 1999-2001 Håkan Hjort + * Copyright (C) 1999-2002 Håkan Hjort * * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. + * See http://libmpeg2.sourceforge.net/ for updates. * * mpeg2dec is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -23,25 +24,37 @@ #ifdef LIBMPEG2_MLIB -#include #include #include #include #include +#include +#include +#include "mpeg2.h" #include "mpeg2_internal.h" -void idct_block_copy_mlib (int16_t * block, uint8_t * dest, int stride) +void mpeg2_idct_add_mlib (const int last, int16_t * const block, + uint8_t * const dest, const int stride) +{ + mlib_VideoIDCT_IEEE_S16_S16 (block, block); + mlib_VideoAddBlock_U8_S16 (dest, block, stride); + memset (block, 0, 64 * sizeof (uint16_t)); +} + +void mpeg2_idct_copy_mlib_non_ieee (int16_t * const block, + uint8_t * const dest, const int stride) { mlib_VideoIDCT8x8_U8_S16 (dest, block, stride); + memset (block, 0, 64 * sizeof (uint16_t)); } -void idct_block_add_mlib (int16_t * block, uint8_t * dest, int stride) +void mpeg2_idct_add_mlib_non_ieee (const int last, int16_t * const block, + uint8_t * const dest, const int stride) { - /* Should we use mlib_VideoIDCT_IEEE_S16_S16 here ?? */ - /* it's ~30% slower. */ mlib_VideoIDCT8x8_S16_S16 (block, block); mlib_VideoAddBlock_U8_S16 (dest, block, stride); + memset (block, 0, 64 * sizeof (uint16_t)); } #endif diff --git a/libmpeg2/idct_mmx.c b/libmpeg2/idct_mmx.c index 70b3b9b95e..4915b93750 100644 --- a/libmpeg2/idct_mmx.c +++ b/libmpeg2/idct_mmx.c @@ -1,8 +1,10 @@ /* * idct_mmx.c - * Copyright (C) 1999-2001 Aaron Holtzman + * Copyright (C) 2000-2002 Michel Lespinasse + * Copyright (C) 1999-2000 Aaron Holtzman * * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. + * See http://libmpeg2.sourceforge.net/ for updates. * * mpeg2dec is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -25,6 +27,7 @@ #include +#include "mpeg2.h" #include "mpeg2_internal.h" #include "attributes.h" #include "mmx.h" @@ -87,104 +90,107 @@ static inline void idct_row (int16_t * row, int offset, c5, -c1, c3, -c1, \ c7, c3, c7, -c5 } -static inline void mmxext_row_head (int16_t * row, int offset, int16_t * table) +static inline void mmxext_row_head (int16_t * const row, const int offset, + const int16_t * const table) { - movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 + movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ - movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 - movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 + movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ + movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ - movq_m2r (*table, mm3); // mm3 = -C2 -C4 C2 C4 - movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 + movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */ + movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ - movq_m2r (*(table+4), mm4); // mm4 = C6 C4 C6 C4 - pmaddwd_r2r (mm0, mm3); // mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 + movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */ + pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */ - pshufw_r2r (mm2, mm2, 0x4e); // mm2 = x2 x0 x6 x4 + pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ } -static inline void mmxext_row (int16_t * table, int32_t * rounder) +static inline void mmxext_row (const int16_t * const table, + const int32_t * const rounder) { - movq_m2r (*(table+8), mm1); // mm1 = -C5 -C1 C3 C1 - pmaddwd_r2r (mm2, mm4); // mm4 = C4*x0+C6*x2 C4*x4+C6*x6 + movq_m2r (*(table+8), mm1); /* mm1 = -C5 -C1 C3 C1 */ + pmaddwd_r2r (mm2, mm4); /* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */ - pmaddwd_m2r (*(table+16), mm0); // mm0 = C4*x4-C6*x6 C4*x0-C6*x2 - pshufw_r2r (mm6, mm6, 0x4e); // mm6 = x3 x1 x7 x5 + pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */ + pshufw_r2r (mm6, mm6, 0x4e); /* mm6 = x3 x1 x7 x5 */ - movq_m2r (*(table+12), mm7); // mm7 = -C7 C3 C7 C5 - pmaddwd_r2r (mm5, mm1); // mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 + movq_m2r (*(table+12), mm7); /* mm7 = -C7 C3 C7 C5 */ + pmaddwd_r2r (mm5, mm1); /* mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 */ - paddd_m2r (*rounder, mm3); // mm3 += rounder - pmaddwd_r2r (mm6, mm7); // mm7 = C3*x1-C7*x3 C5*x5+C7*x7 + paddd_m2r (*rounder, mm3); /* mm3 += rounder */ + pmaddwd_r2r (mm6, mm7); /* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */ - pmaddwd_m2r (*(table+20), mm2); // mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 - paddd_r2r (mm4, mm3); // mm3 = a1 a0 + rounder + pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 */ + paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */ - pmaddwd_m2r (*(table+24), mm5); // mm5 = C3*x5-C1*x7 C5*x1-C1*x3 - movq_r2r (mm3, mm4); // mm4 = a1 a0 + rounder + pmaddwd_m2r (*(table+24), mm5); /* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */ + movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */ - pmaddwd_m2r (*(table+28), mm6); // mm6 = C7*x1-C5*x3 C7*x5+C3*x7 - paddd_r2r (mm7, mm1); // mm1 = b1 b0 + pmaddwd_m2r (*(table+28), mm6); /* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */ + paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */ - paddd_m2r (*rounder, mm0); // mm0 += rounder - psubd_r2r (mm1, mm3); // mm3 = a1-b1 a0-b0 + rounder + paddd_m2r (*rounder, mm0); /* mm0 += rounder */ + psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */ - psrad_i2r (ROW_SHIFT, mm3); // mm3 = y6 y7 - paddd_r2r (mm4, mm1); // mm1 = a1+b1 a0+b0 + rounder + psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */ + paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */ - paddd_r2r (mm2, mm0); // mm0 = a3 a2 + rounder - psrad_i2r (ROW_SHIFT, mm1); // mm1 = y1 y0 + paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */ + psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */ - paddd_r2r (mm6, mm5); // mm5 = b3 b2 - movq_r2r (mm0, mm4); // mm4 = a3 a2 + rounder + paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */ + movq_r2r (mm0, mm4); /* mm4 = a3 a2 + rounder */ - paddd_r2r (mm5, mm0); // mm0 = a3+b3 a2+b2 + rounder - psubd_r2r (mm5, mm4); // mm4 = a3-b3 a2-b2 + rounder + paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */ + psubd_r2r (mm5, mm4); /* mm4 = a3-b3 a2-b2 + rounder */ } -static inline void mmxext_row_tail (int16_t * row, int store) +static inline void mmxext_row_tail (int16_t * const row, const int store) { - psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 + psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ - psrad_i2r (ROW_SHIFT, mm4); // mm4 = y4 y5 + psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */ - packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 + packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ - packssdw_r2r (mm3, mm4); // mm4 = y6 y7 y4 y5 + packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */ - movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 - pshufw_r2r (mm4, mm4, 0xb1); // mm4 = y7 y6 y5 y4 + movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ + pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ /* slot */ - movq_r2m (mm4, *(row+store+4)); // save y7 y6 y5 y4 + movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ } -static inline void mmxext_row_mid (int16_t * row, int store, - int offset, int16_t * table) +static inline void mmxext_row_mid (int16_t * const row, const int store, + const int offset, + const int16_t * const table) { - movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 - psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 + movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ + psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ - movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 - psrad_i2r (ROW_SHIFT, mm4); // mm4 = y4 y5 + movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ + psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */ - packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 - movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 + packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ + movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ - packssdw_r2r (mm3, mm4); // mm4 = y6 y7 y4 y5 - movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 + packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */ + movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ - movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 - pshufw_r2r (mm4, mm4, 0xb1); // mm4 = y7 y6 y5 y4 + movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ + pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ - movq_m2r (*table, mm3); // mm3 = -C2 -C4 C2 C4 - movq_r2m (mm4, *(row+store+4)); // save y7 y6 y5 y4 + movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */ + movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ - pmaddwd_r2r (mm0, mm3); // mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 + pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */ - movq_m2r (*(table+4), mm4); // mm4 = C6 C4 C6 C4 - pshufw_r2r (mm2, mm2, 0x4e); // mm2 = x2 x0 x6 x4 + movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */ + pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ } @@ -199,125 +205,127 @@ static inline void mmxext_row_mid (int16_t * row, int store, c5, -c1, c7, -c5, \ c7, c3, c3, -c1 } -static inline void mmx_row_head (int16_t * row, int offset, int16_t * table) +static inline void mmx_row_head (int16_t * const row, const int offset, + const int16_t * const table) { - movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 + movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ - movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 - movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 + movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ + movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ - movq_m2r (*table, mm3); // mm3 = C6 C4 C2 C4 - movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 + movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */ + movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ - punpckldq_r2r (mm0, mm0); // mm0 = x2 x0 x2 x0 + punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */ - movq_m2r (*(table+4), mm4); // mm4 = -C2 -C4 C6 C4 - pmaddwd_r2r (mm0, mm3); // mm3 = C4*x0+C6*x2 C4*x0+C2*x2 + movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */ + pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ - movq_m2r (*(table+8), mm1); // mm1 = -C7 C3 C3 C1 - punpckhdq_r2r (mm2, mm2); // mm2 = x6 x4 x6 x4 + movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */ + punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */ } -static inline void mmx_row (int16_t * table, int32_t * rounder) +static inline void mmx_row (const int16_t * const table, + const int32_t * const rounder) { - pmaddwd_r2r (mm2, mm4); // mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 - punpckldq_r2r (mm5, mm5); // mm5 = x3 x1 x3 x1 + pmaddwd_r2r (mm2, mm4); /* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */ + punpckldq_r2r (mm5, mm5); /* mm5 = x3 x1 x3 x1 */ - pmaddwd_m2r (*(table+16), mm0); // mm0 = C4*x0-C2*x2 C4*x0-C6*x2 - punpckhdq_r2r (mm6, mm6); // mm6 = x7 x5 x7 x5 + pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */ + punpckhdq_r2r (mm6, mm6); /* mm6 = x7 x5 x7 x5 */ - movq_m2r (*(table+12), mm7); // mm7 = -C5 -C1 C7 C5 - pmaddwd_r2r (mm5, mm1); // mm1 = C3*x1-C7*x3 C1*x1+C3*x3 + movq_m2r (*(table+12), mm7); /* mm7 = -C5 -C1 C7 C5 */ + pmaddwd_r2r (mm5, mm1); /* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */ - paddd_m2r (*rounder, mm3); // mm3 += rounder - pmaddwd_r2r (mm6, mm7); // mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 + paddd_m2r (*rounder, mm3); /* mm3 += rounder */ + pmaddwd_r2r (mm6, mm7); /* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */ - pmaddwd_m2r (*(table+20), mm2); // mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 - paddd_r2r (mm4, mm3); // mm3 = a1 a0 + rounder + pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */ + paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */ - pmaddwd_m2r (*(table+24), mm5); // mm5 = C7*x1-C5*x3 C5*x1-C1*x3 - movq_r2r (mm3, mm4); // mm4 = a1 a0 + rounder + pmaddwd_m2r (*(table+24), mm5); /* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */ + movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */ - pmaddwd_m2r (*(table+28), mm6); // mm6 = C3*x5-C1*x7 C7*x5+C3*x7 - paddd_r2r (mm7, mm1); // mm1 = b1 b0 + pmaddwd_m2r (*(table+28), mm6); /* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */ + paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */ - paddd_m2r (*rounder, mm0); // mm0 += rounder - psubd_r2r (mm1, mm3); // mm3 = a1-b1 a0-b0 + rounder + paddd_m2r (*rounder, mm0); /* mm0 += rounder */ + psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */ - psrad_i2r (ROW_SHIFT, mm3); // mm3 = y6 y7 - paddd_r2r (mm4, mm1); // mm1 = a1+b1 a0+b0 + rounder + psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */ + paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */ - paddd_r2r (mm2, mm0); // mm0 = a3 a2 + rounder - psrad_i2r (ROW_SHIFT, mm1); // mm1 = y1 y0 + paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */ + psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */ - paddd_r2r (mm6, mm5); // mm5 = b3 b2 - movq_r2r (mm0, mm7); // mm7 = a3 a2 + rounder + paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */ + movq_r2r (mm0, mm7); /* mm7 = a3 a2 + rounder */ - paddd_r2r (mm5, mm0); // mm0 = a3+b3 a2+b2 + rounder - psubd_r2r (mm5, mm7); // mm7 = a3-b3 a2-b2 + rounder + paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */ + psubd_r2r (mm5, mm7); /* mm7 = a3-b3 a2-b2 + rounder */ } -static inline void mmx_row_tail (int16_t * row, int store) +static inline void mmx_row_tail (int16_t * const row, const int store) { - psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 + psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ - psrad_i2r (ROW_SHIFT, mm7); // mm7 = y4 y5 + psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */ - packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 + packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ - packssdw_r2r (mm3, mm7); // mm7 = y6 y7 y4 y5 + packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */ - movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 - movq_r2r (mm7, mm4); // mm4 = y6 y7 y4 y5 + movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ + movq_r2r (mm7, mm4); /* mm4 = y6 y7 y4 y5 */ - pslld_i2r (16, mm7); // mm7 = y7 0 y5 0 + pslld_i2r (16, mm7); /* mm7 = y7 0 y5 0 */ - psrld_i2r (16, mm4); // mm4 = 0 y6 0 y4 + psrld_i2r (16, mm4); /* mm4 = 0 y6 0 y4 */ - por_r2r (mm4, mm7); // mm7 = y7 y6 y5 y4 + por_r2r (mm4, mm7); /* mm7 = y7 y6 y5 y4 */ /* slot */ - movq_r2m (mm7, *(row+store+4)); // save y7 y6 y5 y4 + movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ } -static inline void mmx_row_mid (int16_t * row, int store, - int offset, int16_t * table) +static inline void mmx_row_mid (int16_t * const row, const int store, + const int offset, const int16_t * const table) { - movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 - psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 + movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ + psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ - movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 - psrad_i2r (ROW_SHIFT, mm7); // mm7 = y4 y5 + movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ + psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */ - packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 - movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 + packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ + movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ - packssdw_r2r (mm3, mm7); // mm7 = y6 y7 y4 y5 - movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 + packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */ + movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ - movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 - movq_r2r (mm7, mm1); // mm1 = y6 y7 y4 y5 + movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ + movq_r2r (mm7, mm1); /* mm1 = y6 y7 y4 y5 */ - punpckldq_r2r (mm0, mm0); // mm0 = x2 x0 x2 x0 - psrld_i2r (16, mm7); // mm7 = 0 y6 0 y4 + punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */ + psrld_i2r (16, mm7); /* mm7 = 0 y6 0 y4 */ - movq_m2r (*table, mm3); // mm3 = C6 C4 C2 C4 - pslld_i2r (16, mm1); // mm1 = y7 0 y5 0 + movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */ + pslld_i2r (16, mm1); /* mm1 = y7 0 y5 0 */ - movq_m2r (*(table+4), mm4); // mm4 = -C2 -C4 C6 C4 - por_r2r (mm1, mm7); // mm7 = y7 y6 y5 y4 + movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */ + por_r2r (mm1, mm7); /* mm7 = y7 y6 y5 y4 */ - movq_m2r (*(table+8), mm1); // mm1 = -C7 C3 C3 C1 - punpckhdq_r2r (mm2, mm2); // mm2 = x6 x4 x6 x4 + movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */ + punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */ - movq_r2m (mm7, *(row+store+4)); // save y7 y6 y5 y4 - pmaddwd_r2r (mm0, mm3); // mm3 = C4*x0+C6*x2 C4*x0+C2*x2 + movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ + pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ } #if 0 -// C column IDCT - its just here to document the MMXEXT and MMX versions +/* C column IDCT - its just here to document the MMXEXT and MMX versions */ static inline void idct_col (int16_t * col, int offset) { /* multiplication - as implemented on mmx */ @@ -388,178 +396,178 @@ static inline void idct_col (int16_t * col, int offset) #endif -// MMX column IDCT -static inline void idct_col (int16_t * col, int offset) +/* MMX column IDCT */ +static inline void idct_col (int16_t * const col, const int offset) { #define T1 13036 #define T2 27146 #define T3 43790 #define C4 23170 - static short _T1[] ATTR_ALIGN(8) = {T1,T1,T1,T1}; - static short _T2[] ATTR_ALIGN(8) = {T2,T2,T2,T2}; - static short _T3[] ATTR_ALIGN(8) = {T3,T3,T3,T3}; - static short _C4[] ATTR_ALIGN(8) = {C4,C4,C4,C4}; + static const short _T1[] ATTR_ALIGN(8) = {T1,T1,T1,T1}; + static const short _T2[] ATTR_ALIGN(8) = {T2,T2,T2,T2}; + static const short _T3[] ATTR_ALIGN(8) = {T3,T3,T3,T3}; + static const short _C4[] ATTR_ALIGN(8) = {C4,C4,C4,C4}; /* column code adapted from peter gubanov */ /* http://www.elecard.com/peter/idct.shtml */ - movq_m2r (*_T1, mm0); // mm0 = T1 + movq_m2r (*_T1, mm0); /* mm0 = T1 */ - movq_m2r (*(col+offset+1*8), mm1); // mm1 = x1 - movq_r2r (mm0, mm2); // mm2 = T1 + movq_m2r (*(col+offset+1*8), mm1); /* mm1 = x1 */ + movq_r2r (mm0, mm2); /* mm2 = T1 */ - movq_m2r (*(col+offset+7*8), mm4); // mm4 = x7 - pmulhw_r2r (mm1, mm0); // mm0 = T1*x1 + movq_m2r (*(col+offset+7*8), mm4); /* mm4 = x7 */ + pmulhw_r2r (mm1, mm0); /* mm0 = T1*x1 */ - movq_m2r (*_T3, mm5); // mm5 = T3 - pmulhw_r2r (mm4, mm2); // mm2 = T1*x7 + movq_m2r (*_T3, mm5); /* mm5 = T3 */ + pmulhw_r2r (mm4, mm2); /* mm2 = T1*x7 */ - movq_m2r (*(col+offset+5*8), mm6); // mm6 = x5 - movq_r2r (mm5, mm7); // mm7 = T3-1 + movq_m2r (*(col+offset+5*8), mm6); /* mm6 = x5 */ + movq_r2r (mm5, mm7); /* mm7 = T3-1 */ - movq_m2r (*(col+offset+3*8), mm3); // mm3 = x3 - psubsw_r2r (mm4, mm0); // mm0 = v17 + movq_m2r (*(col+offset+3*8), mm3); /* mm3 = x3 */ + psubsw_r2r (mm4, mm0); /* mm0 = v17 */ - movq_m2r (*_T2, mm4); // mm4 = T2 - pmulhw_r2r (mm3, mm5); // mm5 = (T3-1)*x3 + movq_m2r (*_T2, mm4); /* mm4 = T2 */ + pmulhw_r2r (mm3, mm5); /* mm5 = (T3-1)*x3 */ - paddsw_r2r (mm2, mm1); // mm1 = u17 - pmulhw_r2r (mm6, mm7); // mm7 = (T3-1)*x5 + paddsw_r2r (mm2, mm1); /* mm1 = u17 */ + pmulhw_r2r (mm6, mm7); /* mm7 = (T3-1)*x5 */ /* slot */ - movq_r2r (mm4, mm2); // mm2 = T2 - paddsw_r2r (mm3, mm5); // m