diff options
author | arpi_esp <arpi_esp@b3059339-0415-0410-9bf9-f77b7e298cf2> | 2001-02-24 20:28:24 +0000 |
---|---|---|
committer | arpi_esp <arpi_esp@b3059339-0415-0410-9bf9-f77b7e298cf2> | 2001-02-24 20:28:24 +0000 |
commit | d34041569e71fc9bd772354e94dc9d16061072a5 (patch) | |
tree | 8f481cae1c70f32d1756fbe5f39000577b73042d /mp3lib | |
parent | e95a95ece09bac96bdfd37322f96c6f57ef79ebc (diff) | |
download | mpv-d34041569e71fc9bd772354e94dc9d16061072a5.tar.bz2 mpv-d34041569e71fc9bd772354e94dc9d16061072a5.tar.xz |
Initial revision
git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@2 b3059339-0415-0410-9bf9-f77b7e298cf2
Diffstat (limited to 'mp3lib')
-rw-r--r-- | mp3lib/Makefile | 34 | ||||
-rw-r--r-- | mp3lib/config.mak | 6 | ||||
-rw-r--r-- | mp3lib/d_cpu.h | 17 | ||||
-rw-r--r-- | mp3lib/d_cpu.s | 94 | ||||
-rw-r--r-- | mp3lib/dct12.c | 139 | ||||
-rw-r--r-- | mp3lib/dct36.c | 264 | ||||
-rw-r--r-- | mp3lib/dct36_3dnow.s | 499 | ||||
-rw-r--r-- | mp3lib/dct64.c | 313 | ||||
-rw-r--r-- | mp3lib/dct64_3dnow.s | 706 | ||||
-rw-r--r-- | mp3lib/dct64_i386.c | 314 | ||||
-rw-r--r-- | mp3lib/decod386.c | 194 | ||||
-rw-r--r-- | mp3lib/decode_3dnow.s | 265 | ||||
-rw-r--r-- | mp3lib/decode_i586.s | 321 | ||||
-rw-r--r-- | mp3lib/equalizer.c | 79 | ||||
-rw-r--r-- | mp3lib/huffman.h | 332 | ||||
-rw-r--r-- | mp3lib/l2tables.h | 154 | ||||
-rw-r--r-- | mp3lib/layer2.c | 295 | ||||
-rw-r--r-- | mp3lib/layer3.c | 1703 | ||||
-rw-r--r-- | mp3lib/mp3.h | 31 | ||||
-rw-r--r-- | mp3lib/mpg123.h | 125 | ||||
-rw-r--r-- | mp3lib/sr1.c | 489 | ||||
-rw-r--r-- | mp3lib/tabinit.c | 89 |
22 files changed, 6463 insertions, 0 deletions
diff --git a/mp3lib/Makefile b/mp3lib/Makefile new file mode 100644 index 0000000000..774fed3fd4 --- /dev/null +++ b/mp3lib/Makefile @@ -0,0 +1,34 @@ + +include config.mak + +SRCS = sr1.c d_cpu.s decode_i586.s $(OPTIONAL_SRCS) +OBJS = sr1.o d_cpu.o decode_i586.o $(OPTIONAL_OBJS) +CFLAGS = $(OPTFLAGS) + +.SUFFIXES: .c .o + +# .PHONY: all clean + +.c.o: + $(CC) -c $(CFLAGS) -o $@ $< + +.s.o: + $(CC) -c $(CFLAGS) -o $@ $< + +libMP3.a: $(OBJS) + $(AR) r libMP3.a $(OBJS) + +all: libMP3.a + +clean: + rm -f *~ *.o *.a + +distclean: + makedepend + rm -f *~ *.o *.a Makefile.bak + +dep: depend + +depend: + makedepend -- $(CFLAGS) -- $(SRCS) &>/dev/null +# DO NOT DELETE diff --git a/mp3lib/config.mak b/mp3lib/config.mak new file mode 100644 index 0000000000..8aacf246da --- /dev/null +++ b/mp3lib/config.mak @@ -0,0 +1,6 @@ + +include ../config.mak + +OPTIONAL_SRCS = +OPTIONAL_OBJS = + diff --git a/mp3lib/d_cpu.h b/mp3lib/d_cpu.h new file mode 100644 index 0000000000..d215c4d42a --- /dev/null +++ b/mp3lib/d_cpu.h @@ -0,0 +1,17 @@ + +// -------------------------------------------------------------------------- +// Cpu function detect by Pontscho/fresh!mindworkz +// -------------------------------------------------------------------------- + +#ifndef __MY_CPUIDENT +#define __MY_CPUIDENT + +unsigned int _CpuID; +unsigned int _i586; +unsigned int _3dnow; + +extern unsigned long CpuDetect( void ); +extern unsigned long ipentium( void ); +extern unsigned long a3dnow( void ); + +#endif
\ No newline at end of file diff --git a/mp3lib/d_cpu.s b/mp3lib/d_cpu.s new file mode 100644 index 0000000000..34ca9730c3 --- /dev/null +++ b/mp3lib/d_cpu.s @@ -0,0 +1,94 @@ + +/ --------------------------------------------------------------------------- +/ Cpu function detect by Pontscho/fresh!mindworkz +/ (c) 2000 - 2000 +/ --------------------------------------------------------------------------- + +.text + +.globl CpuDetect +.globl ipentium +.globl a3dnow + +/ --------------------------------------------------------------------------- +/ in C: unsigned long CpuDetect( void ); +/ return: cpu ident number. +/ --------------------------------------------------------------------------- +CpuDetect: + pushl %ebx + pushl %ecx + pushl %edx + + movl $1,%eax + cpuid + + popl %edx + popl %ecx + popl %ebx + ret + +/ --------------------------------------------------------------------------- +/ in C: unsigled long ipentium( void ); +/ return: 0 if the processor is not P5 or above else above 1. +/ --------------------------------------------------------------------------- +ipentium: + pushl %ebx + pushl %ecx + pushl %edx + pushfl + popl %eax + movl %eax,%ebx + xorl $0x00200000,%eax + pushl %eax + popfl + pushfl + popl %eax + cmpl %eax,%ebx + jz no_cpuid + movl $1,%eax + cpuid + shrl $8,%eax + cmpl $5,%eax + jb no_cpuid + movl $1,%eax + jmp exit +no_cpuid: + xorl %eax,%eax +exit: + popl %edx + popl %ecx + popl %ebx + ret + +/ --------------------------------------------------------------------------- +/ in C: unsigned long a3dnow( void ); +/ return: 0 if this processor not requiment 3dnow! else above 1. +/ --------------------------------------------------------------------------- +a3dnow: + pushl %ebx + pushl %edx + pushl %ecx + + + call ipentium + shrl $1,%eax + jnc no_3dnow + + movl $0x80000000,%eax + cpuid + cmpl $0x80000000,%eax + jbe no_3dnow + movl $0x80000001,%eax + cpuid + testl $0x80000000,%edx + jz no_3dnow + movl $1,%eax + jmp exit2 +no_3dnow: + xorl %eax,%eax +exit2: + + popl %ecx + popl %edx + popl %ebx + ret diff --git a/mp3lib/dct12.c b/mp3lib/dct12.c new file mode 100644 index 0000000000..61f2b29900 --- /dev/null +++ b/mp3lib/dct12.c @@ -0,0 +1,139 @@ +/* + * new DCT12 + */ +static void dct12(real *in,real *rawout1,real *rawout2,register real *wi,register real *ts) +{ +#define DCT12_PART1 \ + in5 = in[5*3]; \ + in5 += (in4 = in[4*3]); \ + in4 += (in3 = in[3*3]); \ + in3 += (in2 = in[2*3]); \ + in2 += (in1 = in[1*3]); \ + in1 += (in0 = in[0*3]); \ + \ + in5 += in3; in3 += in1; \ + \ + in2 *= COS6_1; \ + in3 *= COS6_1; \ + +#define DCT12_PART2 \ + in0 += in4 * COS6_2; \ + \ + in4 = in0 + in2; \ + in0 -= in2; \ + \ + in1 += in5 * COS6_2; \ + \ + in5 = (in1 + in3) * tfcos12[0]; \ + in1 = (in1 - in3) * tfcos12[2]; \ + \ + in3 = in4 + in5; \ + in4 -= in5; \ + \ + in2 = in0 + in1; \ + in0 -= in1; + + + { + real in0,in1,in2,in3,in4,in5; + register real *out1 = rawout1; + ts[SBLIMIT*0] = out1[0]; ts[SBLIMIT*1] = out1[1]; ts[SBLIMIT*2] = out1[2]; + ts[SBLIMIT*3] = out1[3]; ts[SBLIMIT*4] = out1[4]; ts[SBLIMIT*5] = out1[5]; + + DCT12_PART1 + + { + real tmp0,tmp1 = (in0 - in4); + { + real tmp2 = (in1 - in5) * tfcos12[1]; + tmp0 = tmp1 + tmp2; + tmp1 -= tmp2; + } + ts[(17-1)*SBLIMIT] = out1[17-1] + tmp0 * wi[11-1]; + ts[(12+1)*SBLIMIT] = out1[12+1] + tmp0 * wi[6+1]; + ts[(6 +1)*SBLIMIT] = out1[6 +1] + tmp1 * wi[1]; + ts[(11-1)*SBLIMIT] = out1[11-1] + tmp1 * wi[5-1]; + } + + DCT12_PART2 + + ts[(17-0)*SBLIMIT] = out1[17-0] + in2 * wi[11-0]; + ts[(12+0)*SBLIMIT] = out1[12+0] + in2 * wi[6+0]; + ts[(12+2)*SBLIMIT] = out1[12+2] + in3 * wi[6+2]; + ts[(17-2)*SBLIMIT] = out1[17-2] + in3 * wi[11-2]; + + ts[(6+0)*SBLIMIT] = out1[6+0] + in0 * wi[0]; + ts[(11-0)*SBLIMIT] = out1[11-0] + in0 * wi[5-0]; + ts[(6+2)*SBLIMIT] = out1[6+2] + in4 * wi[2]; + ts[(11-2)*SBLIMIT] = out1[11-2] + in4 * wi[5-2]; + } + + in++; + + { + real in0,in1,in2,in3,in4,in5; + register real *out2 = rawout2; + + DCT12_PART1 + + { + real tmp0,tmp1 = (in0 - in4); + { + real tmp2 = (in1 - in5) * tfcos12[1]; + tmp0 = tmp1 + tmp2; + tmp1 -= tmp2; + } + out2[5-1] = tmp0 * wi[11-1]; + out2[0+1] = tmp0 * wi[6+1]; + ts[(12+1)*SBLIMIT] += tmp1 * wi[1]; + ts[(17-1)*SBLIMIT] += tmp1 * wi[5-1]; + } + + DCT12_PART2 + + out2[5-0] = in2 * wi[11-0]; + out2[0+0] = in2 * wi[6+0]; + out2[0+2] = in3 * wi[6+2]; + out2[5-2] = in3 * wi[11-2]; + + ts[(12+0)*SBLIMIT] += in0 * wi[0]; + ts[(17-0)*SBLIMIT] += in0 * wi[5-0]; + ts[(12+2)*SBLIMIT] += in4 * wi[2]; + ts[(17-2)*SBLIMIT] += in4 * wi[5-2]; + } + + in++; + + { + real in0,in1,in2,in3,in4,in5; + register real *out2 = rawout2; + out2[12]=out2[13]=out2[14]=out2[15]=out2[16]=out2[17]=0.0; + + DCT12_PART1 + + { + real tmp0,tmp1 = (in0 - in4); + { + real tmp2 = (in1 - in5) * tfcos12[1]; + tmp0 = tmp1 + tmp2; + tmp1 -= tmp2; + } + out2[11-1] = tmp0 * wi[11-1]; + out2[6 +1] = tmp0 * wi[6+1]; + out2[0+1] += tmp1 * wi[1]; + out2[5-1] += tmp1 * wi[5-1]; + } + + DCT12_PART2 + + out2[11-0] = in2 * wi[11-0]; + out2[6 +0] = in2 * wi[6+0]; + out2[6 +2] = in3 * wi[6+2]; + out2[11-2] = in3 * wi[11-2]; + + out2[0+0] += in0 * wi[0]; + out2[5-0] += in0 * wi[5-0]; + out2[0+2] += in4 * wi[2]; + out2[5-2] += in4 * wi[5-2]; + } +} diff --git a/mp3lib/dct36.c b/mp3lib/dct36.c new file mode 100644 index 0000000000..04992f09cc --- /dev/null +++ b/mp3lib/dct36.c @@ -0,0 +1,264 @@ +/* +// This is an optimized DCT from Jeff Tsay's maplay 1.2+ package. +// Saved one multiplication by doing the 'twiddle factor' stuff +// together with the window mul. (MH) +// +// This uses Byeong Gi Lee's Fast Cosine Transform algorithm, but the +// 9 point IDCT needs to be reduced further. Unfortunately, I don't +// know how to do that, because 9 is not an even number. - Jeff. +// +////////////////////////////////////////////////////////////////// +// +// 9 Point Inverse Discrete Cosine Transform +// +// This piece of code is Copyright 1997 Mikko Tommila and is freely usable +// by anybody. The algorithm itself is of course in the public domain. +// +// Again derived heuristically from the 9-point WFTA. +// +// The algorithm is optimized (?) for speed, not for small rounding errors or +// good readability. +// +// 36 additions, 11 multiplications +// +// Again this is very likely sub-optimal. +// +// The code is optimized to use a minimum number of temporary variables, +// so it should compile quite well even on 8-register Intel x86 processors. +// This makes the code quite obfuscated and very difficult to understand. +// +// References: +// [1] S. Winograd: "On Computing the Discrete Fourier Transform", +// Mathematics of Computation, Volume 32, Number 141, January 1978, +// Pages 175-199 +*/ + +/*------------------------------------------------------------------*/ +/* */ +/* Function: Calculation of the inverse MDCT */ +/* */ +/*------------------------------------------------------------------*/ + +static void dct36(real *inbuf,real *o1,real *o2,real *wintab,real *tsbuf) +{ +#ifdef NEW_DCT9 + real tmp[18]; +#endif + + { + register real *in = inbuf; + + in[17]+=in[16]; in[16]+=in[15]; in[15]+=in[14]; + in[14]+=in[13]; in[13]+=in[12]; in[12]+=in[11]; + in[11]+=in[10]; in[10]+=in[9]; in[9] +=in[8]; + in[8] +=in[7]; in[7] +=in[6]; in[6] +=in[5]; + in[5] +=in[4]; in[4] +=in[3]; in[3] +=in[2]; + in[2] +=in[1]; in[1] +=in[0]; + + in[17]+=in[15]; in[15]+=in[13]; in[13]+=in[11]; in[11]+=in[9]; + in[9] +=in[7]; in[7] +=in[5]; in[5] +=in[3]; in[3] +=in[1]; + + +#ifdef NEW_DCT9 + { + real t0, t1, t2, t3, t4, t5, t6, t7; + + t1 = COS6_2 * in[12]; + t2 = COS6_2 * (in[8] + in[16] - in[4]); + + t3 = in[0] + t1; + t4 = in[0] - t1 - t1; + t5 = t4 - t2; + + t0 = cos9[0] * (in[4] + in[8]); + t1 = cos9[1] * (in[8] - in[16]); + + tmp[4] = t4 + t2 + t2; + t2 = cos9[2] * (in[4] + in[16]); + + t6 = t3 - t0 - t2; + t0 += t3 + t1; + t3 += t2 - t1; + + t2 = cos18[0] * (in[2] + in[10]); + t4 = cos18[1] * (in[10] - in[14]); + t7 = COS6_1 * in[6]; + + t1 = t2 + t4 + t7; + tmp[0] = t0 + t1; + tmp[8] = t0 - t1; + t1 = cos18[2] * (in[2] + in[14]); + t2 += t1 - t7; + + tmp[3] = t3 + t2; + t0 = COS6_1 * (in[10] + in[14] - in[2]); + tmp[5] = t3 - t2; + + t4 -= t1 + t7; + + tmp[1] = t5 - t0; + tmp[7] = t5 + t0; + tmp[2] = t6 + t4; + tmp[6] = t6 - t4; + } + + { + real t0, t1, t2, t3, t4, t5, t6, t7; + + t1 = COS6_2 * in[13]; + t2 = COS6_2 * (in[9] + in[17] - in[5]); + + t3 = in[1] + t1; + t4 = in[1] - t1 - t1; + t5 = t4 - t2; + + t0 = cos9[0] * (in[5] + in[9]); + t1 = cos9[1] * (in[9] - in[17]); + + tmp[13] = (t4 + t2 + t2) * tfcos36[17-13]; + t2 = cos9[2] * (in[5] + in[17]); + + t6 = t3 - t0 - t2; + t0 += t3 + t1; + t3 += t2 - t1; + + t2 = cos18[0] * (in[3] + in[11]); + t4 = cos18[1] * (in[11] - in[15]); + t7 = COS6_1 * in[7]; + + t1 = t2 + t4 + t7; + tmp[17] = (t0 + t1) * tfcos36[17-17]; + tmp[9] = (t0 - t1) * tfcos36[17-9]; + t1 = cos18[2] * (in[3] + in[15]); + t2 += t1 - t7; + + tmp[14] = (t3 + t2) * tfcos36[17-14]; + t0 = COS6_1 * (in[11] + in[15] - in[3]); + tmp[12] = (t3 - t2) * tfcos36[17-12]; + + t4 -= t1 + t7; + + tmp[16] = (t5 - t0) * tfcos36[17-16]; + tmp[10] = (t5 + t0) * tfcos36[17-10]; + tmp[15] = (t6 + t4) * tfcos36[17-15]; + tmp[11] = (t6 - t4) * tfcos36[17-11]; + } + +#define MACRO(v) { \ + real tmpval; \ + real sum0 = tmp[(v)]; \ + real sum1 = tmp[17-(v)]; \ + out2[9+(v)] = (tmpval = sum0 + sum1) * w[27+(v)]; \ + out2[8-(v)] = tmpval * w[26-(v)]; \ + sum0 -= sum1; \ + ts[SBLIMIT*(8-(v))] = out1[8-(v)] + sum0 * w[8-(v)]; \ + ts[SBLIMIT*(9+(v))] = out1[9+(v)] + sum0 * w[9+(v)]; } + +{ + register real *out2 = o2; + register real *w = wintab; + register real *out1 = o1; + register real *ts = tsbuf; + + MACRO(0); + MACRO(1); + MACRO(2); + MACRO(3); + MACRO(4); + MACRO(5); + MACRO(6); + MACRO(7); + MACRO(8); +} + +#else + + { + +#define MACRO0(v) { \ + real tmp; \ + out2[9+(v)] = (tmp = sum0 + sum1) * w[27+(v)]; \ + out2[8-(v)] = tmp * w[26-(v)]; } \ + sum0 -= sum1; \ + ts[SBLIMIT*(8-(v))] = out1[8-(v)] + sum0 * w[8-(v)]; \ + ts[SBLIMIT*(9+(v))] = out1[9+(v)] + sum0 * w[9+(v)]; +#define MACRO1(v) { \ + real sum0,sum1; \ + sum0 = tmp1a + tmp2a; \ + sum1 = (tmp1b + tmp2b) * tfcos36[(v)]; \ + MACRO0(v); } +#define MACRO2(v) { \ + real sum0,sum1; \ + sum0 = tmp2a - tmp1a; \ + sum1 = (tmp2b - tmp1b) * tfcos36[(v)]; \ + MACRO0(v); } + + register const real *c = nCOS9; + register real *out2 = o2; + register real *w = wintab; + register real *out1 = o1; + register real *ts = tsbuf; + + real ta33,ta66,tb33,tb66; + + ta33 = in[2*3+0] * c[3]; + ta66 = in[2*6+0] * c[6]; + tb33 = in[2*3+1] * c[3]; + tb66 = in[2*6+1] * c[6]; + + { + real tmp1a,tmp2a,tmp1b,tmp2b; + tmp1a = in[2*1+0] * c[1] + ta33 + in[2*5+0] * c[5] + in[2*7+0] * c[7]; + tmp1b = in[2*1+1] * c[1] + tb33 + in[2*5+1] * c[5] + in[2*7+1] * c[7]; + tmp2a = in[2*0+0] + in[2*2+0] * c[2] + in[2*4+0] * c[4] + ta66 + in[2*8+0] * c[8]; + tmp2b = in[2*0+1] + in[2*2+1] * c[2] + in[2*4+1] * c[4] + tb66 + in[2*8+1] * c[8]; + + MACRO1(0); + MACRO2(8); + } + + { + real tmp1a,tmp2a,tmp1b,tmp2b; + tmp1a = ( in[2*1+0] - in[2*5+0] - in[2*7+0] ) * c[3]; + tmp1b = ( in[2*1+1] - in[2*5+1] - in[2*7+1] ) * c[3]; + tmp2a = ( in[2*2+0] - in[2*4+0] - in[2*8+0] ) * c[6] - in[2*6+0] + in[2*0+0]; + tmp2b = ( in[2*2+1] - in[2*4+1] - in[2*8+1] ) * c[6] - in[2*6+1] + in[2*0+1]; + + MACRO1(1); + MACRO2(7); + } + + { + real tmp1a,tmp2a,tmp1b,tmp2b; + tmp1a = in[2*1+0] * c[5] - ta33 - in[2*5+0] * c[7] + in[2*7+0] * c[1]; + tmp1b = in[2*1+1] * c[5] - tb33 - in[2*5+1] * c[7] + in[2*7+1] * c[1]; + tmp2a = in[2*0+0] - in[2*2+0] * c[8] - in[2*4+0] * c[2] + ta66 + in[2*8+0] * c[4]; + tmp2b = in[2*0+1] - in[2*2+1] * c[8] - in[2*4+1] * c[2] + tb66 + in[2*8+1] * c[4]; + + MACRO1(2); + MACRO2(6); + } + + { + real tmp1a,tmp2a,tmp1b,tmp2b; + tmp1a = in[2*1+0] * c[7] - ta33 + in[2*5+0] * c[1] - in[2*7+0] * c[5]; + tmp1b = in[2*1+1] * c[7] - tb33 + in[2*5+1] * c[1] - in[2*7+1] * c[5]; + tmp2a = in[2*0+0] - in[2*2+0] * c[4] + in[2*4+0] * c[8] + ta66 - in[2*8+0] * c[2]; + tmp2b = in[2*0+1] - in[2*2+1] * c[4] + in[2*4+1] * c[8] + tb66 - in[2*8+1] * c[2]; + + MACRO1(3); + MACRO2(5); + } + + { + real sum0,sum1; + sum0 = in[2*0+0] - in[2*2+0] + in[2*4+0] - in[2*6+0] + in[2*8+0]; + sum1 = (in[2*0+1] - in[2*2+1] + in[2*4+1] - in[2*6+1] + in[2*8+1] ) * tfcos36[4]; + MACRO0(4); + } + } +#endif + + } +} + diff --git a/mp3lib/dct36_3dnow.s b/mp3lib/dct36_3dnow.s new file mode 100644 index 0000000000..a729bb4646 --- /dev/null +++ b/mp3lib/dct36_3dnow.s @@ -0,0 +1,499 @@ +/ +/ dct36_3dnow.s - 3DNow! optimized dct36() +/ +/ This code based 'dct36_3dnow.s' by Syuuhei Kashiyama +/ <squash@mb.kcom.ne.jp>,only two types of changes have been made: +/ +/ - remove PREFETCH instruction for speedup +/ - change function name for support 3DNow! automatic detect +/ +/ You can find Kashiyama's original 3dnow! support patch +/ (for mpg123-0.59o) at +/ http://user.ecc.u-tokyo.ac.jp/~g810370/linux-simd/ (Japanese). +/ +/ by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1999 +/ <kim@comtec.co.jp> - after 1.Apr.1999 +/ + +/// +/// Replacement of dct36() with AMD's 3DNow! SIMD operations support +/// +/// Syuuhei Kashiyama <squash@mb.kcom.ne.jp> +/// +/// The author of this program disclaim whole expressed or implied +/// warranties with regard to this program, and in no event shall the +/// author of this program liable to whatever resulted from the use of +/// this program. Use it at your own risk. +/// + + .globl dct36_3dnow + .type dct36_3dnow,@function +dct36_3dnow: + pushl %ebp + movl %esp,%ebp + subl $120,%esp + pushl %esi + pushl %ebx + movl 8(%ebp),%eax + movl 12(%ebp),%esi + movl 16(%ebp),%ecx + movl 20(%ebp),%edx + movl 24(%ebp),%ebx + leal -128(%ebp),%esp + + femms + movq (%eax),%mm0 + movq 4(%eax),%mm1 + pfadd %mm1,%mm0 + movq %mm0,4(%eax) + psrlq $32,%mm1 + movq 12(%eax),%mm2 + punpckldq %mm2,%mm1 + pfadd %mm2,%mm1 + movq %mm1,12(%eax) + psrlq $32,%mm2 + movq 20(%eax),%mm3 + punpckldq %mm3,%mm2 + pfadd %mm3,%mm2 + movq %mm2,20(%eax) + psrlq $32,%mm3 + movq 28(%eax),%mm4 + punpckldq %mm4,%mm3 + pfadd %mm4,%mm3 + movq %mm3,28(%eax) + psrlq $32,%mm4 + movq 36(%eax),%mm5 + punpckldq %mm5,%mm4 + pfadd %mm5,%mm4 + movq %mm4,36(%eax) + psrlq $32,%mm5 + movq 44(%eax),%mm6 + punpckldq %mm6,%mm5 + pfadd %mm6,%mm5 + movq %mm5,44(%eax) + psrlq $32,%mm6 + movq 52(%eax),%mm7 + punpckldq %mm7,%mm6 + pfadd %mm7,%mm6 + movq %mm6,52(%eax) + psrlq $32,%mm7 + movq 60(%eax),%mm0 + punpckldq %mm0,%mm7 + pfadd %mm0,%mm7 + movq %mm7,60(%eax) + psrlq $32,%mm0 + movd 68(%eax),%mm1 + pfadd %mm1,%mm0 + movd %mm0,68(%eax) + movd 4(%eax),%mm0 + movd 12(%eax),%mm1 + punpckldq %mm1,%mm0 + punpckldq 20(%eax),%mm1 + pfadd %mm1,%mm0 + movd %mm0,12(%eax) + psrlq $32,%mm0 + movd %mm0,20(%eax) + psrlq $32,%mm1 + movd 28(%eax),%mm2 + punpckldq %mm2,%mm1 + punpckldq 36(%eax),%mm2 + pfadd %mm2,%mm1 + movd %mm1,28(%eax) + psrlq $32,%mm1 + movd %mm1,36(%eax) + psrlq $32,%mm2 + movd 44(%eax),%mm3 + punpckldq %mm3,%mm2 + punpckldq 52(%eax),%mm3 + pfadd %mm3,%mm2 + movd %mm2,44(%eax) + psrlq $32,%mm2 + movd %mm2,52(%eax) + psrlq $32,%mm3 + movd 60(%eax),%mm4 + punpckldq %mm4,%mm3 + punpckldq 68(%eax),%mm4 + pfadd %mm4,%mm3 + movd %mm3,60(%eax) + psrlq $32,%mm3 + movd %mm3,68(%eax) + + movq 24(%eax),%mm0 + movq 48(%eax),%mm1 + movd COS9+12,%mm2 + punpckldq %mm2,%mm2 + movd COS9+24,%mm3 + punpckldq %mm3,%mm3 + pfmul %mm2,%mm0 + pfmul %mm3,%mm1 + pushl %eax + movl $1,%eax + movd %eax,%mm7 + pi2fd %mm7,%mm7 + popl %eax + movq 8(%eax),%mm2 + movd COS9+4,%mm3 + punpckldq %mm3,%mm3 + pfmul %mm3,%mm2 + pfadd %mm0,%mm2 + movq 40(%eax),%mm3 + movd COS9+20,%mm4 + punpckldq %mm4,%mm4 + pfmul %mm4,%mm3 + pfadd %mm3,%mm2 + movq 56(%eax),%mm3 + movd COS9+28,%mm4 + punpckldq %mm4,%mm4 + pfmul %mm4,%mm3 + pfadd %mm3,%mm2 + movq (%eax),%mm3 + movq 16(%eax),%mm4 + movd COS9+8,%mm5 + punpckldq %mm5,%mm5 + pfmul %mm5,%mm4 + pfadd %mm4,%mm3 + movq 32(%eax),%mm4 + movd COS9+16,%mm5 + punpckldq %mm5,%mm5 + pfmul %mm5,%mm4 + pfadd %mm4,%mm3 + pfadd %mm1,%mm3 + movq 64(%eax),%mm4 + movd COS9+32,%mm5 + punpckldq %mm5,%mm5 + pfmul %mm5,%mm4 + pfadd %mm4,%mm3 + movq %mm2,%mm4 + pfadd %mm3,%mm4 + movq %mm7,%mm5 + punpckldq tfcos36+0,%mm5 + pfmul %mm5,%mm4 + movq %mm4,%mm5 + pfacc %mm5,%mm5 + movd 108(%edx),%mm6 + punpckldq 104(%edx),%mm6 + pfmul %mm6,%mm5 + movd %mm5,36(%ecx) + psrlq $32,%mm5 + movd %mm5,32(%ecx) + movq %mm4,%mm6 + punpckldq %mm6,%mm5 + pfsub %mm6,%mm5 + punpckhdq %mm5,%mm5 + movd 32(%edx),%mm6 + punpckldq 36(%edx),%mm6 + pfmul %mm6,%mm5 + movd 32(%esi),%mm6 + punpckldq 36(%esi),%mm6 + pfadd %mm6,%mm5 + movd %mm5,1024(%ebx) + psrlq $32,%mm5 + movd %mm5,1152(%ebx) + movq %mm3,%mm4 + pfsub %mm2,%mm4 + movq %mm7,%mm5 + punpckldq tfcos36+32,%mm5 + pfmul %mm5,%mm4 + movq %mm4,%mm5 + pfacc %mm5,%mm5 + movd 140(%edx),%mm6 + punpckldq 72(%edx),%mm6 + pfmul %mm6,%mm5 + movd %mm5,68(%ecx) + psrlq $32,%mm5 + movd %mm5,0(%ecx) + movq %mm4,%mm6 + punpckldq %mm6,%mm5 + pfsub %mm6,%mm5 + punpckhdq %mm5,%mm5 + movd 0(%edx),%mm6 + punpckldq 68(%edx),%mm6 + pfmul %mm6,%mm5 + movd 0(%esi),%mm6 + punpckldq 68(%esi),%mm6 + pfadd %mm6,%mm5 + movd %mm5,0(%ebx) + psrlq $32,%mm5 + movd %mm5,2176(%ebx) + movq 8(%eax),%mm2 + movq 40(%eax),%mm3 + pfsub %mm3,%mm2 + movq 56(%eax),%mm3 + pfsub %mm3,%mm2 + movd COS9+12,%mm3 + punpckldq %mm3,%mm3 + pfmul %mm3,%mm2 + movq 16(%eax),%mm3 + movq 32(%eax),%mm4 + pfsub %mm4,%mm3 + movq 64(%eax),%mm4 + pfsub %mm4,%mm3 + movd COS9+24,%mm4 + punpckldq %mm4,%mm4 + pfmul %mm4,%mm3 + movq 48(%eax),%mm4 + pfsub %mm4,%mm3 + movq (%eax),%mm4 + pfadd %mm4,%mm3 + movq %mm2,%mm4 + pfadd %mm3,%mm4 + movq %mm7,%mm5 + punpckldq tfcos36+4,%mm5 + pfmul %mm5,%mm4 + movq %mm4,%mm5 + pfacc %mm5,%mm5 + movd 112(%edx),%mm6 + punpckldq 100(%edx),%mm6 + pfmul %mm6,%mm5 + movd %mm5,40(%ecx) + psrlq $32,%mm5 + movd %mm5,28(%ecx) + movq %mm4,%mm6 + punpckldq %mm6,%mm5 + pfsub %mm6,%mm5 + punpckhdq %mm5,%mm5 + movd 28(%edx),%mm6 + punpckldq 40(%edx),%mm6 + pfmul %mm6,%mm5 + movd 28(%esi),%mm6 + punpckldq 40(%esi),%mm6 + pfadd %mm6,%mm5 + movd %mm5,896(%ebx) + psrlq $32,%mm5 + movd %mm5,1280(%ebx) + movq %mm3,%mm4 + pfsub %mm2,%mm4 + movq %mm7,%mm5 + punpckldq tfcos36+28,%mm5 + pfmul %mm5,%mm4 + movq %mm4,%mm5 + pfacc %mm5,%mm5 + movd 136(%edx),%mm6 + punpckldq 76(%edx),%mm6 + pfmul %mm6,%mm5 + movd %mm5,64(%ecx) + psrlq $32,%mm5 + movd %mm5,4(%ecx) + movq %mm4,%mm6 + punpckldq %mm6,%mm5 + pfsub %mm6,%mm5 + punpckhdq %mm5,%mm5 + movd 4(%edx),%mm6 + punpckldq 64(%edx),%mm6 + pfmul %mm6,%mm5 + movd 4(%esi),%mm6 + punpckldq 64(%esi),%mm6 + pfadd %mm6,%mm5 + movd %mm5,128(%ebx) + psrlq $32,%mm5 + movd %mm5,2048(%ebx) + + movq 8(%eax),%mm2 + movd COS9+20,%mm3 + punpckldq %mm3,%mm3 + pfmul %mm3,%mm2 + pfsub %mm0,%mm2 + movq 40(%eax),%mm3 + movd COS9+28,%mm4 + punpckldq %mm4,%mm4 + pfmul %mm4,%mm3 + pfsub %mm3,%mm2 + movq 56(%eax),%mm3 + movd COS9+4,%mm4 + punpckldq %mm4,%mm4 + pfmul %mm4,%mm3 + pfadd %mm3,%mm2 + movq (%eax),%mm3 + movq 16(%eax),%mm4 + movd COS9+32,%mm5 + punpckldq %mm5,%mm5 + pfmul %mm5,%mm4 + pfsub %mm4,%mm3 + movq 32(%eax),%mm4 + movd COS9+8,%mm5 + punpckldq %mm5,%mm5 + pfmul %mm5,%mm4 + pfsub %mm4,%mm3 + pfadd %mm1,%mm3 + movq 64(%eax),%mm4 + movd COS9+16,%mm5 + punpckldq %mm5,%mm5 + pfmul %mm5,%mm4 + pfadd %mm4,%mm3 + movq %mm2,%mm4 + pfadd %mm3,%mm4 + movq %mm7,%mm5 + punpckldq tfcos36+8,%mm5 + pfmul %mm5,%mm4 + movq %mm4,%mm5 + pfacc %mm5,%mm5 + movd 116(%edx),%mm6 + punpckldq 96(%edx),%mm6 + pfmul %mm6,%mm5 + movd %mm5,44(%ecx) + psrlq $32,%mm5 + movd %mm5,24(%ecx) + movq %mm4,%mm6 + punpckldq %mm6,%mm5 + pfsub %mm6,%mm5 + punpckhdq %mm5,%mm5 + movd 24(%edx),%mm6 + punpckldq 44(%edx),%mm6 + pfmul %mm6,%mm5 + movd 24(%esi),%mm6 + punpckldq 44(%esi),%mm6 + pfadd %mm6,%mm5 + movd %mm5,768(%ebx) + psrlq $32,%mm5 + movd %mm5,1408(%ebx) + movq %mm3,%mm4 + pfsub %mm2,%mm4 + movq %mm7,%mm5 + punpckldq tfcos36+24,%mm5 + pfmul %mm5,%mm4 + movq %mm4,%mm5 + pfacc %mm5,%mm5 + movd 132(%edx),%mm6 + punpckldq 80(%edx),%mm6 + pfmul %mm6,%mm5 + movd %mm5,60(%ecx) + psrlq $32,%mm5 + movd %mm5,8(%ecx) + movq %mm4,%mm6 + punpckldq %mm6,%mm5 + pfsub %mm6,%mm5 + punpckhdq %mm5,%mm5 + movd 8(%edx),%mm6 + punpckldq 60(%edx),%mm6 + pfmul %mm6,%mm5 + movd 8(%esi),%mm6 + punpckldq 60(%esi),%mm6 + pfadd %mm6,%mm5 + movd %mm5,256(%ebx) + psrlq $32,%mm5 + movd %mm5,1920(%ebx) + movq 8(%eax),%mm2 + movd COS9+28,%mm3 + punpckldq %mm3,%mm3 + pfmul %mm3,%mm2 + pfsub %mm0,%mm2 + movq 40(%eax),%mm3 + movd COS9+4,%mm4 + punpckldq %mm4,%mm4 + pfmul %mm4,%mm3 + pfadd %mm3,%mm2 + movq 56(%eax),%mm3 + movd COS9+20,%mm4 + punpckldq %mm4,%mm4 + pfmul %mm4,%mm3 + pfsub %mm3,%mm2 + movq (%eax),%mm3 + movq 16(%eax),%mm4 + movd COS9+16,%mm5 + punpckldq %mm5,%mm5 + pfmul %mm5,%mm4 + pfsub %mm4,%mm3 + movq 32(%eax),%mm4 + movd COS9+32,%mm5 + punpckldq %mm5,%mm5 + pfmul %mm5,%mm4 + pfadd %mm4,%mm3 + pfadd %mm1,%mm3 + movq 64(%eax),%mm4 + movd COS9+8,%mm5 + punpckldq %mm5,%mm5 + pfmul %mm5,%mm4 + pfsub %mm4,%mm3 + movq %mm2,%mm4 + pfadd %mm3,%mm4 + movq %mm7,%mm5 + punpckldq tfcos36+12,%mm5 + pfmul %mm5,%mm4 + movq %mm4,%mm5 + pfacc %mm5,%mm5 + movd 120(%edx),%mm6 + punpckldq 92(%edx),%mm6 + pfmul %mm6,%mm5 + movd %mm5,48(%ecx) + psrlq $32,%mm5 + movd %mm5,20(%ecx) + movq %mm4,%mm6 + punpckldq %mm6,%mm5 + pfsub %mm6,%mm5 + punpckhdq %mm5,%mm5 + movd 20(%edx),%mm6 + punpckldq 48(%edx),%mm6 + pfmul %mm6,%mm5 + movd 20(%esi),%mm6 + punpckldq 48(%esi),%mm6 + pfadd %mm6,%mm5 + movd %mm5,640(%ebx) + psrlq $32,%mm5 + movd %mm5,1536(%ebx) + movq %mm3,%mm4 + pfsub %mm2,%mm4 + movq %mm7,%mm5 + punpckldq tfcos36+20,%mm5 |