From d34041569e71fc9bd772354e94dc9d16061072a5 Mon Sep 17 00:00:00 2001 From: arpi_esp Date: Sat, 24 Feb 2001 20:28:24 +0000 Subject: Initial revision git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@2 b3059339-0415-0410-9bf9-f77b7e298cf2 --- libmpeg2/Makefile | 35 + libmpeg2/attributes.h | 31 + libmpeg2/decode.c | 319 ++++++++ libmpeg2/header.c | 273 +++++++ libmpeg2/idct.c | 289 +++++++ libmpeg2/idct_mlib.c | 47 ++ libmpeg2/idct_mmx.c | 706 +++++++++++++++++ libmpeg2/mm_accel.h | 30 + libmpeg2/mmx.h | 255 ++++++ libmpeg2/motion_comp.c | 125 +++ libmpeg2/motion_comp_mlib.c | 180 +++++ libmpeg2/motion_comp_mmx.c | 1025 ++++++++++++++++++++++++ libmpeg2/mpeg2.h | 57 ++ libmpeg2/mpeg2_internal.h | 220 ++++++ libmpeg2/slice.c | 1797 +++++++++++++++++++++++++++++++++++++++++++ libmpeg2/sse.h | 256 ++++++ libmpeg2/stats.c | 316 ++++++++ libmpeg2/vlc.h | 425 ++++++++++ 18 files changed, 6386 insertions(+) create mode 100644 libmpeg2/Makefile create mode 100644 libmpeg2/attributes.h create mode 100644 libmpeg2/decode.c create mode 100644 libmpeg2/header.c create mode 100644 libmpeg2/idct.c create mode 100644 libmpeg2/idct_mlib.c create mode 100644 libmpeg2/idct_mmx.c create mode 100644 libmpeg2/mm_accel.h create mode 100644 libmpeg2/mmx.h create mode 100644 libmpeg2/motion_comp.c create mode 100644 libmpeg2/motion_comp_mlib.c create mode 100644 libmpeg2/motion_comp_mmx.c create mode 100644 libmpeg2/mpeg2.h create mode 100644 libmpeg2/mpeg2_internal.h create mode 100644 libmpeg2/slice.c create mode 100644 libmpeg2/sse.h create mode 100644 libmpeg2/stats.c create mode 100644 libmpeg2/vlc.h (limited to 'libmpeg2') diff --git a/libmpeg2/Makefile b/libmpeg2/Makefile new file mode 100644 index 0000000000..5e2f8e8cfc --- /dev/null +++ b/libmpeg2/Makefile @@ -0,0 +1,35 @@ + +LIBNAME = libmpeg2.a + +include ../config.mak + +SRCS = decode.c header.c idct.c idct_mmx.c motion_comp.c motion_comp_mmx.c slice.c stats.c +OBJS = decode.o header.o idct.o idct_mmx.o motion_comp.o motion_comp_mmx.o slice.o stats.o +CFLAGS = $(OPTFLAGS) -DMPG12PLAY +INCLUDE = -I. -I../libvo -I.. + +.SUFFIXES: .c .o + +# .PHONY: all clean + +.c.o: + $(CC) -c $(CFLAGS) $(INCLUDE) -o $@ $< + +$(LIBNAME): $(OBJS) + $(AR) r $(LIBNAME) $(OBJS) + +all: $(LIBNAME) + +clean: + rm -f *.o *.a *~ + +distclean: + makedepend + rm -f Makefile.bak *.o *.a *~ + +dep: depend + +depend: + makedepend -- $(CFLAGS) -- $(SRCS) &> /dev/null + +# DO NOT DELETE diff --git a/libmpeg2/attributes.h b/libmpeg2/attributes.h new file mode 100644 index 0000000000..dfbf129411 --- /dev/null +++ b/libmpeg2/attributes.h @@ -0,0 +1,31 @@ +/* + * attributes.h + * Copyright (C) 1999-2000 Aaron Holtzman + * + * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. + * + * mpeg2dec is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * mpeg2dec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +//use gcc attribs to align critical data structures + +/* maximum supported data alignment */ +#define ATTRIBUTE_ALIGNED_MAX 64 + +#ifdef ATTRIBUTE_ALIGNED_MAX +#define ATTR_ALIGN(align) __attribute__ ((__aligned__ ((ATTRIBUTE_ALIGNED_MAX < align) ? ATTRIBUTE_ALIGNED_MAX : align))) +#else +#define ATTR_ALIGN(align) +#endif diff --git a/libmpeg2/decode.c b/libmpeg2/decode.c new file mode 100644 index 0000000000..e8bbb02112 --- /dev/null +++ b/libmpeg2/decode.c @@ -0,0 +1,319 @@ +/* Copyright (C) Aaron Holtzman - Nov 1999 */ +/* Some cleanup & hacking by A'rpi/ESP-team - Oct 2000 */ + +/* mpeg2dec version: */ +#define PACKAGE "mpeg2dec" +//#define VERSION "0.1.7-cvs" +#define VERSION "0.1.8-cvs" + +#include +#include +#include +#include +#include + +#include "config.h" + +//#include "video_out.h" + +#include "mpeg2.h" +#include "mpeg2_internal.h" + +#include "../linux/shmem.h" + +//#include "motion_comp.h" +//#include "idct.h" +//#include "header.h" +//#include "slice.h" +//#include "stats.h" + +#include "attributes.h" +#ifdef __i386__ +#include "mmx.h" +#endif + +//this is where we keep the state of the decoder +//picture_t picture_data; +//picture_t *picture=&picture_data; +picture_t *picture=NULL; + +//global config struct +mpeg2_config_t config; + +// the maximum chunk size is determined by vbv_buffer_size which is 224K for +// MP@ML streams. (we make no pretenses ofdecoding anything more than that) +//static uint8_t chunk_buffer[224 * 1024 + 4]; +//static uint32_t shift = 0; + +static int drop_flag = 0; +static int drop_frame = 0; + +int quant_store[MBR+1][MBC+1]; // [Review] + +void mpeg2_init (void) +{ + + printf (PACKAGE"-"VERSION" (C) 2000 Aaron Holtzman \n"); + config.flags = 0; +#ifdef HAVE_MMX + config.flags |= MM_ACCEL_X86_MMX; +#endif +#ifdef HAVE_SSE + config.flags |= MM_ACCEL_X86_MMXEXT; +#endif +#ifdef HAVE_3DNOW + config.flags |= MM_ACCEL_X86_3DNOW; +#endif +#ifdef HAVE_MLIB + config.flags |= MM_ACCEL_MLIB; +#endif + + printf("libmpeg2 config flags = 0x%X\n",config.flags); + + picture=shmem_alloc(sizeof(picture_t)); // !!! NEW HACK :) !!! + + header_state_init (picture); + picture->repeat_count=0; + + picture->pp_options=0; + + idct_init (); + motion_comp_init (); +} + +void mpeg2_allocate_image_buffers (picture_t * picture) +{ + int frame_size,buff_size; + unsigned char *base=NULL; + + // height+1 requires for yuv2rgb_mmx code (it reads next line after last) + frame_size = picture->coded_picture_width * (1+picture->coded_picture_height); + frame_size = (frame_size+31)&(~31); // align to 32 byte boundary + buff_size = frame_size + (frame_size/4)*2; // 4Y + 1U + 1V + + // allocate images in YV12 format + base = shmem_alloc(buff_size); + picture->throwaway_frame[0] = base; + picture->throwaway_frame[1] = base + frame_size * 5 / 4; + picture->throwaway_frame[2] = base + frame_size; + + base = shmem_alloc(buff_size); + picture->backward_reference_frame[0] = base; + picture->backward_reference_frame[1] = base + frame_size * 5 / 4; + picture->backward_reference_frame[2] = base + frame_size; + + base = shmem_alloc(buff_size); + picture->forward_reference_frame[0] = base; + picture->forward_reference_frame[1] = base + frame_size * 5 / 4; + picture->forward_reference_frame[2] = base + frame_size; + + base = shmem_alloc(buff_size); + picture->pp_frame[0] = base; + picture->pp_frame[1] = base + frame_size * 5 / 4; + picture->pp_frame[2] = base + frame_size; + +} + +static void decode_reorder_frames (void) +{ + if (picture->picture_coding_type != B_TYPE) { + + //reuse the soon to be outdated forward reference frame + picture->current_frame[0] = picture->forward_reference_frame[0]; + picture->current_frame[1] = picture->forward_reference_frame[1]; + picture->current_frame[2] = picture->forward_reference_frame[2]; + + //make the backward reference frame the new forward reference frame + picture->forward_reference_frame[0] = + picture->backward_reference_frame[0]; + picture->forward_reference_frame[1] = + picture->backward_reference_frame[1]; + picture->forward_reference_frame[2] = + picture->backward_reference_frame[2]; + + picture->backward_reference_frame[0] = picture->current_frame[0]; + picture->backward_reference_frame[1] = picture->current_frame[1]; + picture->backward_reference_frame[2] = picture->current_frame[2]; + + } else { + + picture->current_frame[0] = picture->throwaway_frame[0]; + picture->current_frame[1] = picture->throwaway_frame[1]; + picture->current_frame[2] = picture->throwaway_frame[2]; + + } +} + +static int in_slice_flag=0; + +static int parse_chunk (vo_functions_t * output, int code, uint8_t * buffer) +{ + int is_frame_done = 0; + + stats_header (code, buffer); + + is_frame_done = in_slice_flag && ((!code) || (code >= 0xb0)); + if (is_frame_done) { + in_slice_flag = 0; + + if(picture->picture_structure != FRAME_PICTURE) printf("Field! %d \n",picture->second_field); + + if ( ((HACK_MODE == 2) || (picture->mpeg1)) + && ((picture->picture_structure == FRAME_PICTURE) || + (picture->second_field)) + ) { + uint8_t ** bar; + int stride[3]; + + if (picture->picture_coding_type == B_TYPE) + bar = picture->throwaway_frame; + else + bar = picture->forward_reference_frame; + + stride[0]=picture->coded_picture_width; + stride[1]=stride[2]=stride[0]/2; + + if(picture->pp_options){ + // apply OpenDivX postprocess filter + postprocess(bar, stride[0], + picture->pp_frame, stride[0], + picture->coded_picture_width, picture->coded_picture_height, + &quant_store[1][1], (MBC+1), picture->pp_options); + output->draw_slice (picture->pp_frame, stride, + picture->display_picture_width, + picture->display_picture_height, 0, 0); + } else { + output->draw_slice (bar, stride, + picture->display_picture_width, + picture->display_picture_height, 0, 0); + } + + } +#ifdef ARCH_X86 + if (config.flags & MM_ACCEL_X86_MMX) emms (); +#endif + output->flip_page (); + } + + switch (code) { + case 0x00: /* picture_start_code */ + if (header_process_picture_header (picture, buffer)) { + printf ("bad picture header\n"); + exit (1); + } + + drop_frame = drop_flag && (picture->picture_coding_type == B_TYPE); + //decode_reorder_frames (); + break; + + case 0xb3: /* sequence_header_code */ + if (header_process_sequence_header (picture, buffer)) { + printf ("bad sequence header\n"); + exit (1); + } + break; + + case 0xb5: /* extension_start_code */ + if (header_process_extension (picture, buffer)) { + printf ("bad extension\n"); + exit (1); + } + break; + + default: +// if (code >= 0xb9) printf ("stream not demultiplexed ?\n"); + if (code >= 0xb0) break; + + if (!(in_slice_flag)) { + in_slice_flag = 1; + + if(!(picture->second_field)) decode_reorder_frames (); + } + + if (!drop_frame) { + uint8_t ** bar; + + slice_process (picture, code, buffer); + + if ((HACK_MODE < 2) && (!(picture->mpeg1))) { + uint8_t * foo[3]; + uint8_t ** bar; + //frame_t * bar; + int stride[3]; + int offset; + + if (picture->picture_coding_type == B_TYPE) + bar = picture->throwaway_frame; + else + bar = picture->forward_reference_frame; + + offset = (code-1) * 4 * picture->coded_picture_width; + if ((! HACK_MODE) && (picture->picture_coding_type == B_TYPE)) + offset = 0; + + foo[0] = bar[0] + 4 * offset; + foo[1] = bar[1] + offset; + foo[2] = bar[2] + offset; + + stride[0]=picture->coded_picture_width; + stride[1]=stride[2]=stride[0]/2; + + output->draw_slice (foo, stride, + picture->display_picture_width, 16, 0, (code-1)*16); + } +#ifdef ARCH_X86 + if (config.flags & MM_ACCEL_X86_MMX) emms (); +#endif + + } + } + + return is_frame_done; +} + + +int mpeg2_decode_data (vo_functions_t *output, uint8_t *current, uint8_t *end) +{ + //static uint8_t code = 0xff; + //static uint8_t chunk_buffer[65536]; + //static uint8_t *chunk_ptr = chunk_buffer; + //static uint32_t shift = 0; + uint8_t code; + uint8_t *pos=NULL; + uint8_t *start=current; + int ret = 0; + +// printf("RCVD %d bytes\n",end-current); + +while(current + * + * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. + * + * mpeg2dec is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * mpeg2dec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "config.h" + +#include + +#include "mpeg2_internal.h" +#include "attributes.h" + +// default intra quant matrix, in zig-zag order +static uint8_t default_intra_quantizer_matrix[64] ATTR_ALIGN(16) = { + 8, + 16, 16, + 19, 16, 19, + 22, 22, 22, 22, + 22, 22, 26, 24, 26, + 27, 27, 27, 26, 26, 26, + 26, 27, 27, 27, 29, 29, 29, + 34, 34, 34, 29, 29, 29, 27, 27, + 29, 29, 32, 32, 34, 34, 37, + 38, 37, 35, 35, 34, 35, + 38, 38, 40, 40, 40, + 48, 48, 46, 46, + 56, 56, 58, + 69, 69, + 83 +}; + +uint8_t scan_norm[64] ATTR_ALIGN(16) = +{ + // Zig-Zag scan pattern + 0, 1, 8,16, 9, 2, 3,10, + 17,24,32,25,18,11, 4, 5, + 12,19,26,33,40,48,41,34, + 27,20,13, 6, 7,14,21,28, + 35,42,49,56,57,50,43,36, + 29,22,15,23,30,37,44,51, + 58,59,52,45,38,31,39,46, + 53,60,61,54,47,55,62,63 +}; + +uint8_t scan_alt[64] ATTR_ALIGN(16) = +{ + // Alternate scan pattern + 0,8,16,24,1,9,2,10,17,25,32,40,48,56,57,49, + 41,33,26,18,3,11,4,12,19,27,34,42,50,58,35,43, + 51,59,20,28,5,13,6,14,21,29,36,44,52,60,37,45, + 53,61,22,30,7,15,23,31,38,46,54,62,39,47,55,63 +}; + +void header_state_init (picture_t * picture) +{ + //FIXME we should set pointers to the real scan matrices here (mmx vs + //normal) instead of the ifdefs in header_process_picture_coding_extension + + picture->scan = scan_norm; +} + +static const int frameratecode2framerate[16] = { + 0, 24000*10000/1001, 24*10000,25*10000, 30000*10000/1001, 30*10000,50*10000,60000*10000/1001, + 60*10000, 0,0,0,0,0,0,0 +}; + +int header_process_sequence_header (picture_t * picture, uint8_t * buffer) +{ + unsigned int h_size; + unsigned int v_size; + int i; + + if ((buffer[6] & 0x20) != 0x20) + return 1; // missing marker_bit + + v_size = (buffer[0] << 16) | (buffer[1] << 8) | buffer[2]; + + picture->display_picture_width = (v_size >> 12); + picture->display_picture_height = (v_size & 0xfff); + + h_size = ((v_size >> 12) + 15) & ~15; + v_size = ((v_size & 0xfff) + 15) & ~15; + + if ((h_size > 768) || (v_size > 576)) + return 1; // size restrictions for MP@ML or MPEG1 + + //XXX this needs field fixups + picture->coded_picture_width = h_size; + picture->coded_picture_height = v_size; + picture->last_mba = ((h_size * v_size) >> 8) - 1; + + // this is not used by the decoder + picture->aspect_ratio_information = buffer[3] >> 4; + picture->frame_rate_code = buffer[3] & 15; + picture->frame_rate = frameratecode2framerate[picture->frame_rate_code]; + + picture->bitrate = (buffer[4]<<10)|(buffer[5]<<2)|(buffer[6]>>6); + + if (buffer[7] & 2) { + for (i = 0; i < 64; i++) + picture->intra_quantizer_matrix[scan_norm[i]] = + (buffer[i+7] << 7) | (buffer[i+8] >> 1); + buffer += 64; + } else { + for (i = 0; i < 64; i++) + picture->intra_quantizer_matrix[scan_norm[i]] = + default_intra_quantizer_matrix [i]; + } + + if (buffer[7] & 1) { + for (i = 0; i < 64; i++) + picture->non_intra_quantizer_matrix[scan_norm[i]] = + buffer[i+8]; + } else { + for (i = 0; i < 64; i++) + picture->non_intra_quantizer_matrix[i] = 16; + } + + // MPEG1 - for testing only + picture->mpeg1 = 1; + picture->intra_dc_precision = 0; + picture->frame_pred_frame_dct = 1; + picture->q_scale_type = 0; + picture->concealment_motion_vectors = 0; + //picture->alternate_scan = 0; + picture->picture_structure = FRAME_PICTURE; + //picture->second_field = 0; + + return 0; +} + +static int header_process_sequence_extension (picture_t * picture, + uint8_t * buffer) +{ + // MPEG1 - for testing only + picture->mpeg1 = 0; + + // check chroma format, size extensions, marker bit + if(((buffer[1]>>1)&3)!=1){ + printf("This CHROMA format not yet supported :(\n"); + return 1; + } + if ((buffer[1] & 1) || (buffer[2] & 0xe0)){ + printf("Big resolution video not yet supported :(\n"); + return 1; + } + if((buffer[3] & 0x01) != 0x01) return 1; // marker bit + + + // this is not used by the decoder + picture->progressive_sequence = (buffer[1] >> 3) & 1; + + if (picture->progressive_sequence) + picture->coded_picture_height = + (picture->coded_picture_height + 31) & ~31; + picture->bitrate>>=1; // hack + + return 0; +} + +static int header_process_quant_matrix_extension (picture_t * picture, + uint8_t * buffer) +{ + int i; + + if (buffer[0] & 8) { + for (i = 0; i < 64; i++) + picture->intra_quantizer_matrix[scan_norm[i]] = + (buffer[i] << 5) | (buffer[i+1] >> 3); + buffer += 64; + } + + if (buffer[0] & 4) { + for (i = 0; i < 64; i++) + picture->non_intra_quantizer_matrix[scan_norm[i]] = + (buffer[i] << 6) | (buffer[i+1] >> 2); + } + + return 0; +} + +static int header_process_picture_coding_extension (picture_t * picture, uint8_t * buffer) +{ + //pre subtract 1 for use later in compute_motion_vector + picture->f_code[0][0] = (buffer[0] & 15) - 1; + picture->f_code[0][1] = (buffer[1] >> 4) - 1; + picture->f_code[1][0] = (buffer[1] & 15) - 1; + picture->f_code[1][1] = (buffer[2] >> 4) - 1; + + picture->intra_dc_precision = (buffer[2] >> 2) & 3; + picture->picture_structure = buffer[2] & 3; + picture->frame_pred_frame_dct = (buffer[3] >> 6) & 1; + picture->concealment_motion_vectors = (buffer[3] >> 5) & 1; + picture->q_scale_type = (buffer[3] >> 4) & 1; + picture->intra_vlc_format = (buffer[3] >> 3) & 1; + + if (buffer[3] & 4) // alternate_scan + picture->scan = scan_alt; + else + picture->scan = scan_norm; + + // these are not used by the decoder + picture->top_field_first = buffer[3] >> 7; + picture->repeat_first_field = (buffer[3] >> 1) & 1; + picture->progressive_frame = buffer[4] >> 7; + + // repeat_first implementation by A'rpi/ESP-team, based on libmpeg3: + if(picture->repeat_count>=100) picture->repeat_count=0; + if(picture->repeat_first_field){ + if(picture->progressive_sequence){ + if(picture->top_field_first) + picture->repeat_count+=200; + else + picture->repeat_count+=100; + } else + if(picture->progressive_frame){ + picture->repeat_count+=50; + } + } + + return 0; +} + +int header_process_extension (picture_t * picture, uint8_t * buffer) +{ + switch (buffer[0] & 0xf0) { + case 0x10: // sequence extension + return header_process_sequence_extension (picture, buffer); + + case 0x30: // quant matrix extension + return header_process_quant_matrix_extension (picture, buffer); + + case 0x80: // picture coding extension + return header_process_picture_coding_extension (picture, buffer); + } + + return 0; +} + +int header_process_picture_header (picture_t *picture, uint8_t * buffer) +{ + picture->picture_coding_type = (buffer [1] >> 3) & 7; + + // forward_f_code and backward_f_code - used in mpeg1 only + picture->f_code[0][1] = (buffer[3] >> 2) & 1; + picture->f_code[0][0] = + (((buffer[3] << 1) | (buffer[4] >> 7)) & 7) - 1; + picture->f_code[1][1] = (buffer[4] >> 6) & 1; + picture->f_code[1][0] = ((buffer[4] >> 3) & 7) - 1; + + // move in header_process_picture_header + picture->second_field = + (picture->picture_structure != FRAME_PICTURE) && + !(picture->second_field); + + return 0; +} diff --git a/libmpeg2/idct.c b/libmpeg2/idct.c new file mode 100644 index 0000000000..7411e176dd --- /dev/null +++ b/libmpeg2/idct.c @@ -0,0 +1,289 @@ +/* + * idct.c + * Copyright (C) 1999-2000 Aaron Holtzman + * + * Portions of this code are from the MPEG software simulation group + * idct implementation. This code will be replaced with a new + * implementation soon. + * + * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. + * + * mpeg2dec is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * mpeg2dec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/**********************************************************/ +/* inverse two dimensional DCT, Chen-Wang algorithm */ +/* (cf. IEEE ASSP-32, pp. 803-816, Aug. 1984) */ +/* 32-bit integer arithmetic (8 bit coefficients) */ +/* 11 mults, 29 adds per DCT */ +/* sE, 18.8.91 */ +/**********************************************************/ +/* coefficients extended to 12 bit for IEEE1180-1990 */ +/* compliance sE, 2.1.94 */ +/**********************************************************/ + +/* this code assumes >> to be a two's-complement arithmetic */ +/* right shift: (-2)>>1 == -1 , (-3)>>1 == -2 */ + +#include "config.h" + +#include +#include + +#include "mpeg2_internal.h" +#include "mm_accel.h" + +#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */ +#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */ +#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */ +#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */ +#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */ +#define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */ + + +// idct main entry point +void (*idct_block_copy) (int16_t * block, uint8_t * dest, int stride); +void (*idct_block_add) (int16_t * block, uint8_t * dest, int stride); + +static void idct_block_copy_c (int16_t *block, uint8_t * dest, int stride); +static void idct_block_add_c (int16_t *block, uint8_t * dest, int stride); + +static uint8_t clip_lut[1024]; +#define CLIP(i) ((clip_lut+384)[ (i)]) + +void idct_init (void) +{ +#ifdef ARCH_X86 + if (config.flags & MM_ACCEL_X86_MMXEXT) { + fprintf (stderr, "Using MMXEXT for IDCT transform\n"); + idct_block_copy = idct_block_copy_mmxext; + idct_block_add = idct_block_add_mmxext; + idct_mmx_init (); + } else if (config.flags & MM_ACCEL_X86_MMX) { + fprintf (stderr, "Using MMX for IDCT transform\n"); + idct_block_copy = idct_block_copy_mmx; + idct_block_add = idct_block_add_mmx; + idct_mmx_init (); + } else +#endif +#ifdef LIBMPEG2_MLIB + if (config.flags & MM_ACCEL_MLIB) { + fprintf (stderr, "Using mlib for IDCT transform\n"); + idct_block_copy = idct_block_copy_mlib; + idct_block_add = idct_block_add_mlib; + } else +#endif + { + int i; + + fprintf (stderr, "No accelerated IDCT transform found\n"); + idct_block_copy = idct_block_copy_c; + idct_block_add = idct_block_add_c; + for (i = -384; i < 640; i++) + clip_lut[i+384] = (i < 0) ? 0 : ((i > 255) ? 255 : i); + } +} + +/* row (horizontal) IDCT + * + * 7 pi 1 + * dst[k] = sum c[l] * src[l] * cos ( -- * ( k + - ) * l ) + * l=0 8 2 + * + * where: c[0] = 128 + * c[1..7] = 128*sqrt (2) + */ + +static void inline idct_row (int16_t * block) +{ + int x0, x1, x2, x3, x4, x5, x6, x7, x8; + + x1 = block[4] << 11; + x2 = block[6]; + x3 = block[2]; + x4 = block[1]; + x5 = block[7]; + x6 = block[5]; + x7 = block[3]; + + /* shortcut */ + if (! (x1 | x2 | x3 | x4 | x5 | x6 | x7 )) { + block[0] = block[1] = block[2] = block[3] = block[4] = + block[5] = block[6] = block[7] = block[0]<<3; + return; + } + + x0 = (block[0] << 11) + 128; /* for proper rounding in the fourth stage */ + + /* first stage */ + x8 = W7 * (x4 + x5); + x4 = x8 + (W1 - W7) * x4; + x5 = x8 - (W1 + W7) * x5; + x8 = W3 * (x6 + x7); + x6 = x8 - (W3 - W5) * x6; + x7 = x8 - (W3 + W5) * x7; + + /* second stage */ + x8 = x0 + x1; + x0 -= x1; + x1 = W6 * (x3 + x2); + x2 = x1 - (W2 + W6) * x2; + x3 = x1 + (W2 - W6) * x3; + x1 = x4 + x6; + x4 -= x6; + x6 = x5 + x7; + x5 -= x7; + + /* third stage */ + x7 = x8 + x3; + x8 -= x3; + x3 = x0 + x2; + x0 -= x2; + x2 = (181 * (x4 + x5) + 128) >> 8; + x4 = (181 * (x4 - x5) + 128) >> 8; + + /* fourth stage */ + block[0] = (x7 + x1) >> 8; + block[1] = (x3 + x2) >> 8; + block[2] = (x0 + x4) >> 8; + block[3] = (x8 + x6) >> 8; + block[4] = (x8 - x6) >> 8; + block[5] = (x0 - x4) >> 8; + block[6] = (x3 - x2) >> 8; + block[7] = (x7 - x1) >> 8; +} + +/* column (vertical) IDCT + * + * 7 pi 1 + * dst[8*k] = sum c[l] * src[8*l] * cos ( -- * ( k + - ) * l ) + * l=0 8 2 + * + * where: c[0] = 1/1024 + * c[1..7] = (1/1024)*sqrt (2) + */ + +static void inline idct_col (int16_t *block) +{ + int x0, x1, x2, x3, x4, x5, x6, x7, x8; + + /* shortcut */ + x1 = block [8*4] << 8; + x2 = block [8*6]; + x3 = block [8*2]; + x4 = block [8*1]; + x5 = block [8*7]; + x6 = block [8*5]; + x7 = block [8*3]; + +#if 0 + if (! (x1 | x2 | x3 | x4 | x5 | x6 | x7 )) { + block[8*0] = block[8*1] = block[8*2] = block[8*3] = block[8*4] = + block[8*5] = block[8*6] = block[8*7] = (block[8*0] + 32) >> 6; + return; + } +#endif + + x0 = (block[8*0] << 8) + 8192; + + /* first stage */ + x8 = W7 * (x4 + x5) + 4; + x4 = (x8 + (W1 - W7) * x4) >> 3; + x5 = (x8 - (W1 + W7) * x5) >> 3; + x8 = W3 * (x6 + x7) + 4; + x6 = (x8 - (W3 - W5) * x6) >> 3; + x7 = (x8 - (W3 + W5) * x7) >> 3; + + /* second stage */ + x8 = x0 + x1; + x0 -= x1; + x1 = W6 * (x3 + x2) + 4; + x2 = (x1 - (W2 + W6) * x2) >> 3; + x3 = (x1 + (W2 - W6) * x3) >> 3; + x1 = x4 + x6; + x4 -= x6; + x6 = x5 + x7; + x5 -= x7; + + /* third stage */ + x7 = x8 + x3; + x8 -= x3; + x3 = x0 + x2; + x0 -= x2; + x2 = (181 * (x4 + x5) + 128) >> 8; + x4 = (181 * (x4 - x5) + 128) >> 8; + + /* fourth stage */ + block[8*0] = (x7 + x1) >> 14; + block[8*1] = (x3 + x2) >> 14; + block[8*2] = (x0 + x4) >> 14; + block[8*3] = (x8 + x6) >> 14; + block[8*4] = (x8 - x6) >> 14; + block[8*5] = (x0 - x4) >> 14; + block[8*6] = (x3 - x2) >> 14; + block[8*7] = (x7 - x1) >> 14; +} + +void idct_block_copy_c (int16_t * block, uint8_t * dest, int stride) +{ + int i; + + for (i = 0; i < 8; i++) + idct_row (block + 8 * i); + + for (i = 0; i < 8; i++) + idct_col (block + i); + + i = 8; + do { + dest[0] = CLIP (block[0]); + dest[1] = CLIP (block[1]); + dest[2] = CLIP (block[2]); + dest[3] = CLIP (block[3]); + dest[4] = CLIP (block[4]); + dest[5] = CLIP (block[5]); + dest[6] = CLIP (block[6]); + dest[7] = CLIP (block[7]); + + dest += stride; + block += 8; + } while (--i); +} + +void idct_block_add_c (int16_t * block, uint8_t * dest, int stride) +{ + int i; + + for (i = 0; i < 8; i++) + idct_row (block + 8 * i); + + for (i = 0; i < 8; i++) + idct_col (block + i); + + i = 8; + do { + dest[0] = CLIP (block[0] + dest[0]); + dest[1] = CLIP (block[1] + dest[1]); + dest[2] = CLIP (block[2] + dest[2]); + dest[3] = CLIP (block[3] + dest[3]); + dest[4] = CLIP (block[4] + dest[4]); + dest[5] = CLIP (block[5] + dest[5]); + dest[6] = CLIP (block[6] + dest[6]); + dest[7] = CLIP (block[7] + dest[7]); + + dest += stride; + block += 8; + } while (--i); +} diff --git a/libmpeg2/idct_mlib.c b/libmpeg2/idct_mlib.c new file mode 100644 index 0000000000..055ee75fa6 --- /dev/null +++ b/libmpeg2/idct_mlib.c @@ -0,0 +1,47 @@ +/* + * idct_mlib.c + * Copyright (C) 1999 Håkan Hjort + * + * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. + * + * mpeg2dec is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * mpeg2dec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "config.h" + +#ifdef LIBMPEG2_MLIB + +#include +#include +#include +#include +#include + +#include "mpeg2_internal.h" + +void idct_block_copy_mlib (int16_t * block, uint8_t * dest, int stride) +{ + mlib_VideoIDCT8x8_U8_S16 (dest, block, stride); +} + +void idct_block_add_mlib (int16_t * block, uint8_t * dest, int stride) +{ + // Should we use mlib_VideoIDCT_IEEE_S16_S16 here ?? + // it's ~30% slower. + mlib_VideoIDCT8x8_S16_S16 (block, block); + mlib_VideoAddBlock_U8_S16 (dest, block, stride); +} + +#endif diff --git a/libmpeg2/idct_mmx.c b/libmpeg2/idct_mmx.c new file mode 100644 index 0000000000..03ea5d7580 --- /dev/null +++ b/libmpeg2/idct_mmx.c @@ -0,0 +1,706 @@ +/* + * idct_mmx.c + * Copyright (C) 1999-2000 Aaron Holtzman + * + * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. + * + * mpeg2dec is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * mpeg2dec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "config.h" + +#ifdef ARCH_X86 + +#include + +#include "mpeg2_internal.h" +#include "attributes.h" +#include "mmx.h" + +#define ROW_SHIFT 11 +#define COL_SHIFT 6 + +#define round(bias) ((int)(((bias)+0.5) * (1<> ROW_SHIFT; + row[1] = (a1 + b1) >> ROW_SHIFT; + row[2] = (a2 + b2) >> ROW_SHIFT; + row[3] = (a3 + b3) >> ROW_SHIFT; + row[4] = (a3 - b3) >> ROW_SHIFT; + row[5] = (a2 - b2) >> ROW_SHIFT; + row[6] = (a1 - b1) >> ROW_SHIFT; + row[7] = (a0 - b0) >> ROW_SHIFT; +} +#endif + + +// MMXEXT row IDCT + +#define mmxext_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, -c4, -c2, \ + c4, c6, c4, c6, \ + c1, c3, -c1, -c5, \ + c5, c7, c3, -c7, \ + c4, -c6, c4, -c6, \ + -c4, c2, c4, -c2, \ + c5, -c1, c3, -c1, \ + c7, c3, c7, -c5 } + +static inline void mmxext_row_head (int16_t * row, int offset, int16_t * table) +{ + movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 + + movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 + movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 + + movq_m2r (*table, mm3); // mm3 = -C2 -C4 C2 C4 + movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 + + movq_m2r (*(table+4), mm4); // mm4 = C6 C4 C6 C4 + pmaddwd_r2r (mm0, mm3); // mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 + + pshufw_r2r (mm2, mm2, 0x4e); // mm2 = x2 x0 x6 x4 +} + +static inline void mmxext_row (int16_t * table, int32_t * rounder) +{ + movq_m2r (*(table+8), mm1); // mm1 = -C5 -C1 C3 C1 + pmaddwd_r2r (mm2, mm4); // mm4 = C4*x0+C6*x2 C4*x4+C6*x6 + + pmaddwd_m2r (*(table+16), mm0); // mm0 = C4*x4-C6*x6 C4*x0-C6*x2 + pshufw_r2r (mm6, mm6, 0x4e); // mm6 = x3 x1 x7 x5 + + movq_m2r (*(table+12), mm7); // mm7 = -C7 C3 C7 C5 + pmaddwd_r2r (mm5, mm1); // mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 + + paddd_m2r (*rounder, mm3); // mm3 += rounder + pmaddwd_r2r (mm6, mm7); // mm7 = C3*x1-C7*x3 C5*x5+C7*x7 + + pmaddwd_m2r (*(table+20), mm2); // mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 + paddd_r2r (mm4, mm3); // mm3 = a1 a0 + rounder + + pmaddwd_m2r (*(table+24), mm5); // mm5 = C3*x5-C1*x7 C5*x1-C1*x3 + movq_r2r (mm3, mm4); // mm4 = a1 a0 + rounder + + pmaddwd_m2r (*(table+28), mm6); // mm6 = C7*x1-C5*x3 C7*x5+C3*x7 + paddd_r2r (mm7, mm1); // mm1 = b1 b0 + + paddd_m2r (*rounder, mm0); // mm0 += rounder + psubd_r2r (mm1, mm3); // mm3 = a1-b1 a0-b0 + rounder + + psrad_i2r (ROW_SHIFT, mm3); // mm3 = y6 y7 + paddd_r2r (mm4, mm1); // mm1 = a1+b1 a0+b0 + rounder + + paddd_r2r (mm2, mm0); // mm0 = a3 a2 + rounder + psrad_i2r (ROW_SHIFT, mm1); // mm1 = y1 y0 + + paddd_r2r (mm6, mm5); // mm5 = b3 b2 + movq_r2r (mm0, mm4); // mm4 = a3 a2 + rounder + + paddd_r2r (mm5, mm0); // mm0 = a3+b3 a2+b2 + rounder + psubd_r2r (mm5, mm4); // mm4 = a3-b3 a2-b2 + rounder +} + +static inline void mmxext_row_tail (int16_t * row, int store) +{ + psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 + + psrad_i2r (ROW_SHIFT, mm4); // mm4 = y4 y5 + + packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 + + packssdw_r2r (mm3, mm4); // mm4 = y6 y7 y4 y5 + + movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 + pshufw_r2r (mm4, mm4, 0xb1); // mm4 = y7 y6 y5 y4 + + // slot + + movq_r2m (mm4, *(row+store+4)); // save y7 y6 y5 y4 +} + +static inline void mmxext_row_mid (int16_t * row, int store, + int offset, int16_t * table) +{ + movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 + psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 + + movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 + psrad_i2r (ROW_SHIFT, mm4); // mm4 = y4 y5 + + packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 + movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 + + packssdw_r2r (mm3, mm4); // mm4 = y6 y7 y4 y5 + movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 + + movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 + pshufw_r2r (mm4, mm4, 0xb1); // mm4 = y7 y6 y5 y4 + + movq_m2r (*table, mm3); // mm3 = -C2 -C4 C2 C4 + movq_r2m (mm4, *(row+store+4)); // save y7 y6 y5 y4 + + pmaddwd_r2r (mm0, mm3); // mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 + + movq_m2r (*(table+4), mm4); // mm4 = C6 C4 C6 C4 + pshufw_r2r (mm2, mm2, 0x4e); // mm2 = x2 x0 x6 x4 +} + + +// MMX row IDCT + +#define mmx_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, c4, c6, \ + c4, c6, -c4, -c2, \ + c1, c3, c3, -c7, \ + c5, c7, -c1, -c5, \ + c4, -c6, c4, -c2, \ + -c4, c2, c4, -c6, \ + c5, -c1, c7, -c5, \ + c7, c3, c3, -c1 } + +static inline void mmx_row_head (int16_t * row, int offset, int16_t * table) +{ + movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 + + movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 + movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 + + movq_m2r (*table, mm3); // mm3 = C6 C4 C2 C4 + movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 + + punpckldq_r2r (mm0, mm0); // mm0 = x2 x0 x2 x0 + + movq_m2r (*(table+4), mm4); // mm4 = -C2 -C4 C6 C4 + pmaddwd_r2r (mm0, mm3); // mm3 = C4*x0+C6*x2 C4*x0+C2*x2 + + movq_m2r (*(table+8), mm1); // mm1 = -C7 C3 C3 C1 + punpckhdq_r2r (mm2, mm2); // mm2 = x6 x4 x6 x4 +} + +static inline void mmx_row (int16_t * table, int32_t * rounder) +{ + pmaddwd_r2r (mm2, mm4); // mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 + punpckldq_r2r (mm5, mm5); // mm5 = x3 x1 x3 x1 + + pmaddwd_m2r (*(table+16), mm0); // mm0 = C4*x0-C2*x2 C4*x0-C6*x2 + punpckhdq_r2r (mm6, mm6); // mm6 = x7 x5 x7 x5 + + movq_m2r (*(table+12), mm7); // mm7 = -C5 -C1 C7 C5 + pmaddwd_r2r (mm5, mm1); // mm1 = C3*x1-C7*x3 C1*x1+C3*x3 + + paddd_m2r (*rounder, mm3); // mm3 += rounder + pmaddwd_r2r (mm6, mm7); // mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 + + pmaddwd_m2r (*(table+20), mm2); // mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 + paddd_r2r (mm4, mm3); // mm3 = a1 a0 + rounder + + pmaddwd_m2r (*(table+24), mm5); // mm5 = C7*x1-C5*x3 C5*x1-C1*x3 + movq_r2r (mm3, mm4); // mm4 = a1 a0 + rounder + + pmaddwd_m2r (*(table+28), mm6); // mm6 = C3*x5-C1*x7 C7*x5+C3*x7 + paddd_r2r (mm7, mm1); // mm1 = b1 b0 + + paddd_m2r (*rounder, mm0); // mm0 += rounder + psubd_r2r (mm1, mm3); // mm3 = a1-b1 a0-b0 + rounder + + psrad_i2r (ROW_SHIFT, mm3); // mm3 = y6 y7 + paddd_r2r (mm4, mm1); // mm1 = a1+b1 a0+b0 + rounder + + paddd_r2r (mm2, mm0); // mm0 = a3 a2 + rounder + psrad_i2r (ROW_SHIFT, mm1); // mm1 = y1 y0 + + paddd_r2r (mm6, mm5); // mm5 = b3 b2 + movq_r2r (mm0, mm7); // mm7 = a3 a2 + rounder + + paddd_r2r (mm5, mm0); // mm0 = a3+b3 a2+b2 + rounder + psubd_r2r (mm5, mm7); // mm7 = a3-b3 a2-b2 + rounder +} + +static inline void mmx_row_tail (int16_t * row, int store) +{ + psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 + + psrad_i2r (ROW_SHIFT, mm7); // mm7 = y4 y5 + + packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 + + packssdw_r2r (mm3, mm7); // mm7 = y6 y7 y4 y5 + + movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 + movq_r2r (mm7, mm4); // mm4 = y6 y7 y4 y5 + + pslld_i2r (16, mm7); // mm7 = y7 0 y5 0 + + psrld_i2r (16, mm4); // mm4 = 0 y6 0 y4 + + por_r2r (mm4, mm7); // mm7 = y7 y6 y5 y4 + + // slot + + movq_r2m (mm7, *(row+store+4)); // save y7 y6 y5 y4 +} + +static inline void mmx_row_mid (int16_t * row, int store, + int offset, int16_t * table) +{ + movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 + psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 + + movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 + psrad_i2r (ROW_SHIFT, mm7); // mm7 = y4 y5 + + packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 + movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 + + packssdw_r2r (mm3, mm7); // mm7 = y6 y7 y4 y5 + movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 + + movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 + movq_r2r (mm7, mm1); // mm1 = y6 y7 y4 y5 + + punpckldq_r2r (mm0, mm0); // mm0 = x2 x0 x2 x0 + psrld_i2r (16, mm7); // mm7 = 0 y6 0 y4 + + movq_m2r (*table, mm3); // mm3 = C6 C4 C2 C4 + pslld_i2r (16, mm1); // mm1 = y7 0 y5 0 + + movq_m2r (*(table+4), mm4); // mm4 = -C2 -C4 C6 C4 + por_r2r (mm1, mm7); // mm7 = y7 y6 y5 y4 + + movq_m2r (*(table+8), mm1); // mm1 = -C7 C3 C3 C1 + punpckhdq_r2r (mm2, mm2); // mm2 = x6 x4 x6 x4 + + movq_r2m (mm7, *(row+store+4)); // save y7 y6 y5 y4 + pmaddwd_r2r (mm0, mm3); // mm3 = C4*x0+C6*x2 C4*x0+C2*x2 +} + + +#if 0 +// C column IDCT - its just here to document the MMXEXT and MMX versions +static inline void idct_col (int16_t * col, int offset) +{ +// multiplication - as implemented on mmx +#define F(c,x) (((c) * (x)) >> 16) + +// saturation - it helps us handle torture test cases +#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x)) + + int16_t x0, x1, x2, x3, x4, x5, x6, x7; + int16_t y0, y1, y2, y3, y4, y5, y6, y7; + int16_t a0, a1, a2, a3, b0, b1, b2, b3; + int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12; + + col += offset; + + x0 = col[0*8]; + x1 = col[1*8]; + x2 = col[2*8]; + x3 = col[3*8]; + x4 = col[4*8]; + x5 = col[5*8]; + x6 = col[6*8]; + x7 = col[7*8]; + + u04 = S (x0 + x4); + v04 = S (x0 - x4); + u26 = S (F (T2, x6) + x2); // -0.5 + v26 = S (F (T2, x2) - x6); // -0.5 + + a0 = S (u04 + u26); + a1 = S (v04 + v26); + a2 = S (v04 - v26); + a3 = S (u04 - u26); + + u17 = S (F (T1, x7) + x1); // -0.5 + v17 = S (F (T1, x1) - x7); // -0.5 + u35 = S (F (T3, x5) + x3); // -0.5 + v35 = S (F (T3, x3) - x5); // -0.5 + + b0 = S (u17 + u35); + b3 = S (v17 - v35); + u12 = S (u17 - u35); + v12 = S (v17 + v35); + u12 = S (2 * F (C4, u12)); // -0.5 + v12 = S (2 * F (C4, v12)); // -0.5 + b1 = S (u12 + v12); + b2 = S (u12 - v12); + + y0 = S (a0 + b0) >> COL_SHIFT; + y1 = S (a1 + b1) >> COL_SHIFT; + y2 = S (a2 + b2) >> COL_SHIFT; + y3 = S (a3 + b3) >> COL_SHIFT; + + y4 = S (a3 - b3) >> COL_SHIFT; + y5 = S (a2 - b2) >> COL_SHIFT; + y6 = S (a1 - b1) >> COL_SHIFT; + y7 = S (a0 - b0) >> COL_SHIFT; + + col[0*8] = y0; + col[1*8] = y1; + col[2*8] = y2; + col[3*8] = y3; + col[4*8] = y4; + col[5*8] = y5; + col[6*8] = y6; + col[7*8] = y7; +} +#endif + + +// MMX column IDCT +static inline void idct_col (int16_t * col, int offset) +{ +#define T1 13036 +#define T2 27146 +#define T3 43790 +#define C4 23170 + + static short _T1[] ATTR_ALIGN(8) = {T1,T1,T1,T1}; + static short _T2[] ATTR_ALIGN(8) = {T2,T2,T2,T2}; + static short _T3[] ATTR_ALIGN(8) = {T3,T3,T3,T3}; + static short _C4[] ATTR_ALIGN(8) = {C4,C4,C4,C4}; + static mmx_t scratch0, scratch1; + + /* column code adapted from peter gubanov */ + /* http://www.elecard.com/peter/idct.shtml */ + + movq_m2r (*_T1, mm0); // mm0 = T1 + + movq_m2r (*(col+offset+1*8), mm1); // mm1 = x1 + movq_r2r (mm0, mm2); // mm2 = T1 + + movq_m2r (*(col+offset+7*8), mm4); // mm4 = x7 + pmulhw_r2r (mm1, mm0); // mm0 = T1*x1 + + movq_m2r (*_T3, mm5); // mm5 = T3 + pmulhw_r2r (mm4, mm2); // mm2 = T1*x7 + + movq_m2r (*(col+offset+5*8), mm6); // mm6 = x5 + movq_r2r (mm5, mm7); // mm7 = T3-1 + + movq_m2r (*(col+offset+3*8), mm3); // mm3 = x3 + psubsw_r2r (mm4, mm0); // mm0 = v17 + + movq_m2r (*_T2, mm4); // mm4 = T2 + pmulhw_r2r (mm3, mm5); // mm5 = (T3-1)*x3 + + paddsw_r2r (mm2, mm1); // mm1 = u17 + pmulhw_r2r (mm6, mm7); // mm7 = (T3-1)*x5 + + // slot + + movq_r2r (mm4, mm2); // mm2 = T2 + paddsw_r2r (mm3, mm5); // mm5 = T3*x3 + + pmulhw_m2r (*(col+offset+2*8), mm4);// mm4 = T2*x2 + paddsw_r2r (mm6, mm7); // mm7 = T3*x5 + + psubsw_r2r (mm6, mm5); // mm5 = v35 + paddsw_r2r (mm3, mm7); // mm7 = u35 + + movq_m2r (*(col+offset+6*8), mm3); // mm3 = x6 + movq_r2r (mm0, mm6); // mm6 = v17 + + pmulhw_r2r (mm3, mm2); // mm2 = T2*x6 + psubsw_r2r (mm5, mm0); // mm0 = b3 + + psubsw_r2r (mm3, mm4); // mm4 = v26 + paddsw_r2r (mm6, mm5); // mm5 = v12 + + movq_r2m (mm0, scratch0); // save b3 + movq_r2r (mm1, mm6); // mm6 = u17 + + paddsw_m2r (*(col+offset+2*8), mm2);// mm2 = u26 + paddsw_r2r (mm7, mm6); // mm6 = b0 + + psubsw_r2r (mm7, mm1); // mm1 = u12 + movq_r2r (mm1, mm7); // mm7 = u12 + + movq_m2r (*(col+offset+0*8), mm3); // mm3 = x0 + paddsw_r2r (mm5, mm1); // mm1 = u12+v12 + + movq_m2r (*_C4, mm0); // mm0 = C4/2 + psubsw_r2r (mm5, mm7); // mm7 = u12-v12 + + movq_r2m (mm6, scratch1); // save b0 + pmulhw_r2r (mm0, mm1); // mm1 = b1/2 + + movq_r2r (mm4, mm6); // mm6 = v26 + pmulhw_r2r (mm0, mm7); // mm7 = b2/2 + + movq_m2r (*(col+offset+4*8), mm5); // mm5 = x4 + movq_r2r (mm3, mm0); // mm0 = x0 + + psubsw_r2r (mm5, mm3); // mm3 = v04 + paddsw_r2r (mm5, mm0); // mm0 = u04 + + paddsw_r2r (mm3, mm4); // mm4 = a1 + movq_r2r (mm0, mm5); // mm5 = u04 + + psubsw_r2r (mm6, mm3); // mm3 = a2 + paddsw_r2r (mm2, mm5); // mm5 = a0 + + paddsw_r2r (mm1, mm1); // mm1 = b1 + psubsw_r2r (mm2, mm0); // mm0 = a3 + + paddsw_r2r (mm7, mm7); // mm7 = b2 + movq_r2r (mm3, mm2); // mm2 = a2 + + movq_r2r (mm4, mm6); // mm6 = a1 + paddsw_r2r (mm7, mm3); // mm3 = a2+b2 + + psraw_i2r (COL_SHIFT, mm3); // mm3 = y2 + paddsw_r2r (mm1, mm4); // mm4 = a1+b1 + + psraw_i2r (COL_SHIFT, mm4); // mm4 = y1 + psubsw_r2r (mm1, mm6); // mm6 = a1-b1 + + movq_m2r (scratch1, mm1); // mm1 = b0 + psubsw_r2r (mm7, mm2); // mm2 = a2-b2 + + psraw_i2r (COL_SHIFT, mm6); // mm6 = y6 + movq_r2r (mm5, mm7); // mm7 = a0 + + movq_r2m (mm4, *(col+offset+1*8)); // save y1 + psraw_i2r (COL_SHIFT, mm2); // mm2 = y5 + + movq_r2m (mm3, *(col+offset+2*8)); // save y2 + paddsw_r2r (mm1, mm5); // mm5 = a0+b0 + + movq_m2r (scratch0, mm4); // mm4 = b3 + psubsw_r2r (mm1, mm7); // mm7 = a0-b0 + + psraw_i2r (COL_SHIFT, mm5); // mm5 = y0 + movq_r2r (mm0, mm3); // mm3 = a3 + + movq_r2m (mm2, *(col+offset+5*8)); // save y5 + psubsw_r2r (mm4, mm3); // mm3 = a3-b3 + + psraw_i2r (COL_SHIFT, mm7); // mm7 = y7 + paddsw_r2r (mm0, mm4); // mm4 = a3+b3 + + movq_r2m (mm5, *(col+offset+0*8)); // save y0 + psraw_i2r (COL_SHIFT, mm3); // mm3 = y4 + + movq_r2m (mm6, *(col+offset+6*8)); // save y6 + psraw_i2r (COL_SHIFT, mm4); // mm4 = y3 + + movq_r2m (mm7, *(col+offset+7*8)); // save y7 + + movq_r2m (mm3, *(col+offset+4*8)); // save y4 + + movq_r2m (mm4, *(col+offset+3*8)); // save y3 +} + + +static int32_t rounder0[] ATTR_ALIGN(8) = + rounder ((1 << (COL_SHIFT - 1)) - 0.5); +static int32_t rounder4[] ATTR_ALIGN(8) = rounder (0); +static int32_t rounder1[] ATTR_ALIGN(8) = + rounder (1.25683487303); // C1*(C1/C4+C1+C7)/2 +static int32_t rounder7[] ATTR_ALIGN(8) = + rounder (-0.25); // C1*(C7/C4+C7-C1)/2 +static int32_t rounder2[] ATTR_ALIGN(8) = + rounder (0.60355339059); // C2 * (C6+C2)/2 +static int32_t rounder6[] ATTR_ALIGN(8) = + rounder (-0.25); // C2 * (C6-C2)/2 +static int32_t rounder3[] ATTR_ALIGN(8) = + rounder (0.087788325588); // C3*(-C3/C4+C3+C5)/2 +static int32_t rounder5[] ATTR_ALIGN(8) = + rounder (-0.441341716183); // C3*(-C5/C4+C5-C3)/2 + + +#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \ +static inline void idct (int16_t * block) \ +{ \ + static int16_t table04[] ATTR_ALIGN(16) = \ + table (22725, 21407, 19266, 16384, 12873, 8867, 4520); \ + static int16_t table17[] ATTR_ALIGN(16) = \ + table (31521, 29692, 26722, 22725, 17855, 12299, 6270); \ + static int16_t table26[] ATTR_ALIGN(16) = \ + table (29692, 27969, 25172, 21407, 16819, 11585, 5906); \ + static int16_t table35[] ATTR_ALIGN(16) = \ + table (26722, 25172, 22654, 19266, 15137, 10426, 5315); \ + \ + idct_row_head (block, 0*8, table04); \ + idct_row (table04, rounder0); \ + idct_row_mid (block, 0*8, 4*8, table04); \ + idct_row (table04, rounder4); \ + idct_row_mid (block, 4*8, 1*8, table17); \ + idct_row (table17, rounder1); \ + idct_row_mid (block, 1*8, 7*8, table17); \ + idct_row (table17, rounder7); \ + idct_row_mid (block, 7*8, 2*8, table26); \ + idct_row (table26, rounder2); \ + idct_row_mid (block, 2*8, 6*8, table26); \ + idct_row (table26, rounder6); \ + idct_row_mid (block, 6*8, 3*8, table35); \ + idct_row (table35, rounder3); \ + idct_row_mid (block, 3*8, 5*8, table35); \ + idct_row (table35, rounder5); \ + idct_row_tail (block, 5*8); \ + \ + idct_col (block, 0); \ + idct_col (block, 4); \ +} + + +#define COPY_MMX(offset,r0,r1,r2) \ +do { \ + movq_m2r (*(block+offset), r0); \ + dest += stride; \ + movq_m2r (*(block+offset+4), r1); \ + movq_r2m (r2, *dest); \ + packuswb_r2r (r1, r0); \ +} while (0) + +static void block_copy (int16_t * block, uint8_t * dest, int stride) +{ + movq_m2r (*(block+0*8), mm0); + movq_m2r (*(block+0*8+4), mm1); + movq_m2r (*(block+1*8), mm2); + packuswb_r2r (mm1, mm0); + movq_m2r (*(block+1*8+4), mm3); + movq_r2m (mm0, *dest); + packuswb_r2r (mm3, mm2); + COPY_MMX (2*8, mm0, mm1, mm2); + COPY_MMX (3*8, mm2, mm3, mm0); + COPY_MMX (4*8, mm0, mm1, mm2); + COPY_MMX (5*8, mm2, mm3, mm0); + COPY_MMX (6*8, mm0, mm1, mm2); + COPY_MMX (7*8, mm2, mm3, mm0); + movq_r2m (mm2, *(dest+stride)); +} + + +#define ADD_MMX(offset,r1,r2,r3,r4) \ +do { \ + movq_m2r (*(dest+2*stride), r1); \ + packuswb_r2r (r4, r3); \ + movq_r2r (r1, r2); \ + dest += stride; \ + movq_r2m (r3, *dest); \ + punpcklbw_r2r (mm0, r1); \ + paddsw_m2r (*(block+offset), r1); \ + punpckhbw_r2r (mm0, r2); \ + paddsw_m2r (*(block+offset+4), r2); \ +} while (0) + +static void block_add (int16_t * block, uint8_t * dest, int stride) +{ + movq_m2r (*dest, mm1); + pxor_r2r (mm0, mm0); + movq_m2r (*(dest+stride), mm3); + movq_r2r (mm1, mm2); + punpcklbw_r2r (mm0, mm1); + movq_r2r (mm3, mm4); + paddsw_m2r (*(block+0*8), mm1); + punpckhbw_r2r (mm0, mm2); + paddsw_m2r (*(block+0*8+4), mm2); + punpcklbw_r2r (mm0, mm3); + paddsw_m2r (*(block+1*8), mm3); + packuswb_r2r (mm2, mm1); + punpckhbw_r2r (mm0, mm4); + movq_r2m (mm1, *dest); + paddsw_m2r (*(block+1*8+4), mm4); + ADD_MMX (2*8, mm1, mm2, mm3, mm4); + ADD_MMX (3*8, mm3, mm4, mm1, mm2); + ADD_MMX (4*8, mm1, mm2, mm3, mm4); + ADD_MMX (5*8, mm3, mm4, mm1, mm2); + ADD_MMX (6*8, mm1, mm2, mm3, mm4); + ADD_MMX (7*8, mm3, mm4, mm1, mm2); + packuswb_r2r (mm4, mm3); + movq_r2m (mm3, *(dest+stride)); +} + + +declare_idct (mmxext_idct, mmxext_table, + mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid) + +void idct_block_copy_mmxext (int16_t * block, uint8_t * dest, int stride) +{ + mmxext_idct (block); + block_copy (block, dest, stride); +} + +void idct_block_add_mmxext (int16_t * block, uint8_t * dest, int stride) +{ + mmxext_idct (block); + block_add (block, dest, stride); +} + + +declare_idct (mmx_idct, mmx_table, + mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid) + +void idct_block_copy_mmx (int16_t * block, uint8_t * dest, int stride) +{ + mmx_idct (block); + block_copy (block, dest, stride); +} + +void idct_block_add_mmx (int16_t * block, uint8_t * dest, int stride) +{ + mmx_idct (block); + block_add (block, dest, stride); +} + + +void idct_mmx_init (void) +{ + extern uint8_t scan_norm[64]; + extern uint8_t scan_alt[64]; + int i, j; + + // the mmx/mmxext idct uses a reordered input, so we patch scan tables + + for (i = 0; i < 64; i++) { + j = scan_norm[i]; + scan_norm[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2); + j = scan_alt[i]; + scan_alt[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2); + } +} + +#endif diff --git a/libmpeg2/mm_accel.h b/libmpeg2/mm_accel.h new file mode 100644 index 0000000000..133d6acb03 --- /dev/null +++ b/libmpeg2/mm_accel.h @@ -0,0 +1,30 @@ +/* + * oms_accel.h + * Copyright (C) 1999-2000 Aaron Holtzman + * + * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. + * + * mpeg2dec is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * mpeg2dec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +// generic accelerations +#define MM_ACCEL_MLIB 0x00000001 + +// x86 accelerations +#define MM_ACCEL_X86_MMX 0x80000000 +#define MM_ACCEL_X86_3DNOW 0x40000000 +#define MM_ACCEL_X86_MMXEXT 0x20000000 + +//uint32_t mm_accel (void); diff --git a/libmpeg2/mmx.h b/libmpeg2/mmx.h new file mode 100644 index 0000000000..bab97b8b1f --- /dev/null +++ b/libmpeg2/mmx.h @@ -0,0 +1,255 @@ +/* + * mmx.h + * Copyright (C) 1997-1999 H. Dietz and R. Fisher + * + * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. + * + * mpeg2dec is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * mpeg2dec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * The type of an value that fits in an MMX register (note that long + * long constant values MUST be suffixed by LL and unsigned long long + * values by ULL, lest they be truncated by the compiler) + */ + +typedef union { + long long q; /* Quadword (64-bit) value */ + unsigned long long uq; /* Unsigned Quadword */ + int d[2]; /* 2 Doubleword (32-bit) values */ + unsigned int ud[2]; /* 2 Unsigned Doubleword */ + short w[4]; /* 4 Word (16-bit) values */ + unsigned short uw[4]; /* 4 Unsigned Word */ + char b[8]; /* 8 Byte (8-bit) values */ + unsigned char ub[8]; /* 8 Unsigned Byte */ + float s[2]; /* Single-precision (32-bit) value */ +} ATTR_ALIGN(8) mmx_t; /* On an 8-byte (64-bit) boundary */ + + +#define mmx_i2r(op,imm,reg) \ + __asm__ __volatile__ (#op " %0, %%" #reg \ + : /* nothing */ \ + : "X" (imm) ) + +#define mmx_m2r(op,mem,reg) \ + __asm__ __volatile__ (#op " %0, %%" #reg \ + : /* nothing */ \ + : "X" (mem)) + +#define mmx_r2m(op,reg,mem) \ + __asm__ __volatile__ (#op " %%" #reg ", %0" \ + : "=X" (mem) \ + : /* nothing */ ) + +#define mmx_r2r(op,regs,regd) \ + __asm__ __volatile__ (#op " %" #regs ", %" #regd) + + +#define emms() __asm__ __volatile__ ("emms") + +#define movd_m2r(var,reg) mmx_m2r (movd, var, reg) +#define movd_r2m(reg,var) mmx_r2m (movd, reg, var) +#define movd_r2r(regs,regd) mmx_r2r (movd, regs, regd) + +#define movq_m2r(var,reg) mmx_m2r (movq, var, reg) +#define movq_r2m(reg,var) mmx_r2m (movq, reg, var) +#define movq_r2r(regs,regd) mmx_r2r (movq, regs, regd) + +#define packssdw_m2r(var,reg) mmx_m2r (packssdw, var, reg) +#define packssdw_r2r(regs,regd) mmx_r2r (packssdw, regs, regd) +#define packsswb_m2r(var,reg) mmx_m2r (packsswb, var, reg) +#define packsswb_r2r(regs,regd) mmx_r2r (packsswb, regs, regd) + +#define packuswb_m2r(var,reg) mmx_m2r (packuswb, var, reg) +#define packuswb_r2r(regs,regd) mmx_r2r (packuswb, regs, regd) + +#define paddb_m2r(var,reg) mmx_m2r (paddb, var, reg) +#define paddb_r2r(regs,regd) mmx_r2r (paddb, regs, regd) +#define paddd_m2r(var,reg) mmx_m2r (paddd, var, reg) +#define paddd_r2r(regs,regd) mmx_r2r (paddd, regs, regd) +#define paddw_m2r(var,reg) mmx_m2r (paddw, var, reg) +#define paddw_r2r(regs,regd) mmx_r2r (paddw, regs, regd) + +#define paddsb_m2r(var,reg) mmx_m2r (paddsb, var, reg) +#define paddsb_r2r(regs,regd) mmx_r2r (paddsb, regs, regd) +#define paddsw_m2r(var,reg) mmx_m2r (paddsw, var, reg) +#define paddsw_r2r(regs,regd) mmx_r2r (paddsw, regs, regd) + +#define paddusb_m2r(var,reg) mmx_m2r (paddusb, var, reg) +#define paddusb_r2r(regs,regd) mmx_r2r (paddusb, regs, regd) +#define paddusw_m2r(var,reg) mmx_m2r (paddusw, var, reg) +#define paddusw_r2r(regs,regd) mmx_r2r (paddusw, regs, regd) + +#define pand_m2r(var,reg) mmx_m2r (pand, var, reg) +#define pand_r2r(regs,regd) mmx_r2r (pand, regs, regd) + +#define pandn_m2r(var,reg) mmx_m2r (pandn, var, reg) +#define pandn_r2r(regs,regd) mmx_r2r (pandn, regs, regd) + +#define pcmpeqb_m2r(var,reg) mmx_m2r (pcmpeqb, var, reg) +#define pcmpeqb_r2r(regs,regd) mmx_r2r (pcmpeqb, regs, regd) +#define pcmpeqd_m2r(var,reg) mmx_m2r (pcmpeqd, var, reg) +#define pcmpeqd_r2r(regs,regd) mmx_r2r (pcmpeqd, regs, regd) +#define pcmpeqw_m2r(var,reg) mmx_m2r (pcmpeqw, var, reg) +#define pcmpeqw_r2r(regs,regd) mmx_r2r (pcmpeqw, regs, regd) + +#define pcmpgtb_m2r(var,reg) mmx_m2r (pcmpgtb, var, reg) +#define pcmpgtb_r2r(regs,regd) mmx_r2r (pcmpgtb, regs, regd) +#define pcmpgtd_m2r(var,reg) mmx_m2r (pcmpgtd, var, reg) +#define pcmpgtd_r2r(regs,regd) mmx_r2r (pcmpgtd, regs, regd) +#define pcmpgtw_m2r(var,reg) mmx_m2r (pcmpgtw, var, reg) +#define pcmpgtw_r2r(regs,regd) mmx_r2r (pcmpgtw, regs, regd) + +#define pmaddwd_m2r(var,reg) mmx_m2r (pmaddwd, var, reg) +#define pmaddwd_r2r(regs,regd) mmx_r2r (pmaddwd, regs, regd) + +#define pmulhw_m2r(var,reg) mmx_m2r (pmulhw, var, reg) +#define pmulhw_r2r(regs,regd) mmx_r2r (pmulhw, regs, regd) + +#define pmullw_m2r(var,reg) mmx_m2r (pmullw, var, reg) +#define pmullw_r2r(regs,regd) mmx_r2r (pmullw, regs, regd) + +#define por_m2r(var,reg) mmx_m2r (por, var, reg) +#define por_r2r(regs,regd) mmx_r2r (por, regs, regd) + +#define pslld_i2r(imm,reg) mmx_i2r (pslld, imm, reg) +#define pslld_m2r(var,reg) mmx_m2r (pslld, var, reg) +#define pslld_r2r(regs,regd) mmx_r2r (pslld, regs, regd) +#define psllq_i2r(imm,reg) mmx_i2r (psllq, imm, reg) +#define psllq_m2r(var,reg) mmx_m2r (psllq, var, reg) +#define psllq_r2r(regs,regd) mmx_r2r (psllq, regs, regd) +#define psllw_i2r(imm,reg) mmx_i2r (psllw, imm, reg) +#define psllw_m2r(var,reg) mmx_m2r (psllw, var, reg) +#define psllw_r2r(regs,regd) mmx_r2r (psllw, regs, regd) + +#define psrad_i2r(imm,reg) mmx_i2r (psrad, imm, reg) +#define psrad_m2r(var,reg) mmx_m2r (psrad, var, reg) +#define psrad_r2r(regs,regd) mmx_r2r (psrad, regs, regd) +#define psraw_i2r(imm,reg) mmx_i2r (psraw, imm, reg) +#define psraw_m2r(var,reg) mmx_m2r (psraw, var, reg) +#define psraw_r2r(regs,regd) mmx_r2r (psraw, regs, regd) + +#define psrld_i2r(imm,reg) mmx_i2r (psrld, imm, reg) +#define psrld_m2r(var,reg) mmx_m2r (psrld, var, reg) +#define psrld_r2r(regs,regd) mmx_r2r (psrld, regs, regd) +#define psrlq_i2r(imm,reg) mmx_i2r (psrlq, imm, reg) +#define psrlq_m2r(var,reg) mmx_m2r (psrlq, var, reg) +#define psrlq_r2r(regs,regd) mmx_r2r (psrlq, regs, regd) +#define psrlw_i2r(imm,reg) mmx_i2r (psrlw, imm, reg) +#define psrlw_m2r(var,reg) mmx_m2r (psrlw, var, reg) +#define psrlw_r2r(regs,regd) mmx_r2r (psrlw, regs, regd) + +#define psubb_m2r(var,reg) mmx_m2r (psubb, var, reg) +#define psubb_r2r(regs,regd) mmx_r2r (psubb, regs, regd) +#define psubd_m2r(var,reg) mmx_m2r (psubd, var, reg) +#define psubd_r2r(regs,regd) mmx_r2r (psubd, regs, regd) +#define psubw_m2r(var,reg) mmx_m2r (psubw, var, reg) +#define psubw_r2r(regs,regd) mmx_r2r (psubw, regs, regd) + +#define psubsb_m2r(var,reg) mmx_m2r (psubsb, var, reg) +#define psubsb_r2r(regs,regd) mmx_r2r (psubsb, regs, regd) +#define psubsw_m2r(var,reg) mmx_m2r (psubsw, var, reg) +#define psubsw_r2r(regs,regd) mmx_r2r (psubsw, regs, regd) + +#define psubusb_m2r(var,reg) mmx_m2r (psubusb, var, reg) +#define psubusb_r2r(regs,regd) mmx_r2r (psubusb, regs, regd) +#define psubusw_m2r(var,reg) mmx_m2r (psubusw, var, reg) +#define psubusw_r2r(regs,regd) mmx_r2r (psubusw, regs, regd) + +#define punpckhbw_m2r(var,reg) mmx_m2r (punpckhbw, var, reg) +#define punpckhbw_r2r(regs,regd) mmx_r2r (punpckhbw, regs, regd) +#define punpckhdq_m2r(var,reg) mmx_m2r (punpckhdq, var, reg) +#define punpckhdq_r2r(regs,regd) mmx_r2r (punpckhdq, regs, regd) +#define punpckhwd_m2r(var,reg) mmx_m2r (punpckhwd, var, reg) +#define punpckhwd_r2r(regs,regd) mmx_r2r (punpckhwd, regs, regd) + +#define punpcklbw_m2r(var,reg) mmx_m2r (punpcklbw, var, reg) +#define punpcklbw_r2r(regs,regd) mmx_r2r (punpcklbw, regs, regd) +#define punpckldq_m2r(var,reg) mmx_m2r (punpckldq, var, reg) +#define punpckldq_r2r(regs,regd) mmx_r2r (punpckldq, regs, regd) +#define punpcklwd_m2r(var,reg) mmx_m2r (punpcklwd, var, reg) +#define punpcklwd_r2r(regs,regd) mmx_r2r (punpcklwd, regs, regd) + +#define pxor_m2r(var,reg) mmx_m2r (pxor, var, reg) +#define pxor_r2r(regs,regd) mmx_r2r (pxor, regs, regd) + + +/* 3DNOW extensions */ + +#define pavgusb_m2r(var,reg) mmx_m2r (pavgusb, var, reg) +#define pavgusb_r2r(regs,regd) mmx_r2r (pavgusb, regs, regd) + + +/* AMD MMX extensions - also available in intel SSE */ + + +#define mmx_m2ri(op,mem,reg,imm) \ + __asm__ __volatile__ (#op " %1, %0, %%" #reg \ + : /* nothing */ \ + : "X" (mem), "X" (imm)) +#define mmx_r2ri(op,regs,regd,imm) \ + __asm__ __volatile__ (#op " %0, %%" #regs ", %%" #regd \ + : /* nothing */ \ + : "X" (imm) ) + +#define mmx_fetch(mem,hint) \ + __asm__ __volatile__ ("prefetch" #hint " %0" \ + : /* nothing */ \ + : "X" (mem)) + + +#define maskmovq(regs,maskreg) mmx_r2ri (maskmovq, regs, maskreg) + +#define movntq_r2m(mmreg,var) mmx_r2m (movntq, mmreg, var) + +#define pavgb_m2r(var,reg) mmx_m2r (pavgb, var, reg) +#define pavgb_r2r(regs,regd) mmx_r2r (pavgb, regs, regd) +#define pavgw_m2r(var,reg) mmx_m2r (pavgw, var, reg) +#define pavgw_r2r(regs,regd) mmx_r2r (pavgw, regs, regd) + +#define pextrw_r2r(mmreg,reg,imm) mmx_r2ri (pextrw, mmreg, reg, imm) + +#define pinsrw_r2r(reg,mmreg,imm) mmx_r2ri (pinsrw, reg, mmreg, imm) + +#define pmaxsw_m2r(var,reg) mmx_m2r (pmaxsw, var, reg) +#define pmaxsw_r2r(regs,regd) mmx_r2r (pmaxsw, regs, regd) + +#define pmaxub_m2r(var,reg) mmx_m2r (pmaxub, var, reg) +#define pmaxub_r2r(regs,regd) mmx_r2r (pmaxub, regs, regd) + +#define pminsw_m2r(var,reg) mmx_m2r (pminsw, var, reg) +#define pminsw_r2r(regs,regd) mmx_r2r (pminsw, regs, regd) + +#define pminub_m2r(var,reg) mmx_m2r (pminub, var, reg) +#define pminub_r2r(regs,regd) mmx_r2r (pminub, regs, regd) + +#define pmovmskb(mmreg,reg) \ + __asm__ __volatile__ ("movmskps %" #mmreg ", %" #reg) + +#define pmulhuw_m2r(var,reg) mmx_m2r (pmulhuw, var, reg) +#define pmulhuw_r2r(regs,regd) mmx_r2r (pmulhuw, regs, regd) + +#define prefetcht0(mem) mmx_fetch (mem, t0) +#define prefetcht1(mem) mmx_fetch (mem, t1) +#define prefetcht2(mem) mmx_fetch (mem, t2) +#define prefetchnta(mem) mmx_fetch (mem, nta) + +#define psadbw_m2r(var,reg) mmx_m2r (psadbw, var, reg) +#define psadbw_r2r(regs,regd) mmx_r2r (psadbw, regs, regd) + +#define pshufw_m2r(var,reg,imm) mmx_m2ri(pshufw, var, reg, imm) +#define pshufw_r2r(r