diff options
author | alex <alex@b3059339-0415-0410-9bf9-f77b7e298cf2> | 2003-10-22 21:08:46 +0000 |
---|---|---|
committer | alex <alex@b3059339-0415-0410-9bf9-f77b7e298cf2> | 2003-10-22 21:08:46 +0000 |
commit | 44792402b200296e9759faf6b0d4a3fc9b6cdfa0 (patch) | |
tree | 6d9ca8f502058a808267beeaadb481fd87695b76 /mp3lib/dct64_sse.s | |
parent | cf069df98e3ae5a0ee6179ba7f6f734b323ed7f2 (diff) | |
download | mpv-44792402b200296e9759faf6b0d4a3fc9b6cdfa0.tar.bz2 mpv-44792402b200296e9759faf6b0d4a3fc9b6cdfa0.tar.xz |
Removed obsolete (or never finished) code
git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@11242 b3059339-0415-0410-9bf9-f77b7e298cf2
Diffstat (limited to 'mp3lib/dct64_sse.s')
-rw-r--r-- | mp3lib/dct64_sse.s | 2221 |
1 files changed, 0 insertions, 2221 deletions
diff --git a/mp3lib/dct64_sse.s b/mp3lib/dct64_sse.s deleted file mode 100644 index 3bc74cc8c0..0000000000 --- a/mp3lib/dct64_sse.s +++ /dev/null @@ -1,2221 +0,0 @@ -/ This code is a translation of dct64_k7.s from MPlayer. -/ Coded by Felix Buenemann <atmosfear at users.sourceforge.net> -/ -/ TODO: - fix phases 4 and 5 (sse) -/ - optimize scalar FPU code? (interleave with sse code) -/ - fix alignment (prohibits finishing this code) -/ - then use faster insns for aligned data -/ -/ Note: currently code is disabled as I couldn't get input data aligned! -/ - -//.data -// .align 8 -//x_plus_minus_3dnow: .long 0x00000000, 0x80000000 -//plus_1f: .float 1.0 - -.text - - .align 16 - - .global dct64_MMX_sse - -dct64_MMX_sse: - pushl %ebx - pushl %esi - pushl %edi - subl $256,%esp - movl 280(%esp),%eax - - leal 128(%esp),%edx - movl 272(%esp),%esi - movl 276(%esp),%edi - movl $costab_mmx,%ebx - orl %ecx,%ecx - movl %esp,%ecx - -/* Phase 1 (complete, worx) */ - -// [1] Process Block A1 (16 Bytes) -/ movq (%eax), %mm0 -/ movq 8(%eax), %mm4 - movups (%eax), %xmm0 - -// Copy A1 to another register A2 -/ movq %mm0, %mm3 -/ movq %mm4, %mm7 - movaps %xmm0, %xmm2 - -// Process Block B1 (last 16 bytes) -/ movq 120(%eax), %mm1 -/ movq 112(%eax), %mm5 - movups 112(%eax), %xmm1 - -/* The PSWAPD instruction swaps or reverses the upper and lower - * doublewords of the source operand. 
PSWAPD mmreg1, mmreg2 - * performs the following operations: - * temp = mmreg2 - * mmreg1[63:32] = temp[31:0 ] - * mmreg1[31:0 ] = temp[63:32] - */ -/ pswapd %mm1, %mm1 -/ pswapd %mm5, %mm5 -// shufps here exchanges a,b,c,d to b,a,d,c in xmm1 (desc ia32-ref p.752) -//// shufps $177, %xmm1, %xmm1 - shufps $27, %xmm1, %xmm1 - -// Add B1 to A1 -/ pfadd %mm1, %mm0 -/ pfadd %mm5, %mm4 - addps %xmm1, %xmm0 - -// Save Block A1 -/ movq %mm0, (%edx) -/ movq %mm4, 8(%edx) - movups %xmm0, (%edx) - -// Sub B1 from A2 -/ pfsub %mm1, %mm3 -/ pfsub %mm5, %mm7 - subps %xmm1, %xmm2 - -// Mul mem with A2 -/ pfmul (%ebx), %mm3 -/ pfmul 8(%ebx), %mm7 - movups (%ebx), %xmm7 - mulps %xmm7, %xmm2 - -// Shuffle A2 -/ pswapd %mm3, %mm3 -/ pswapd %mm7, %mm7 -// I do a,b,c,d -> d,c,b,a to suit order when writing to mem (saves one shufps) - shufps $27, %xmm2, %xmm2 - -// Save A2 to mem (end) -/ movq %mm3, 120(%edx) -/ movq %mm7, 112(%edx) - movups %xmm2, 112(%edx) - -// [2] Process next data block -/ movq 16(%eax), %mm0 -/ movq 24(%eax), %mm4 - movups 16(%eax), %xmm0 - -/ movq %mm0, %mm3 -/ movq %mm4, %mm7 - movaps %xmm0, %xmm2 - -/ movq 104(%eax), %mm1 -/ movq 96(%eax), %mm5 - movups 96(%eax), %xmm1 - -/ pswapd %mm1, %mm1 -/ pswapd %mm5, %mm5 -//// shufps $177, %xmm1, %xmm1 - shufps $27, %xmm1, %xmm1 - -/ pfadd %mm1, %mm0 -/ pfadd %mm5, %mm4 - addps %xmm1, %xmm0 - -/ movq %mm0, 16(%edx) -/ movq %mm4, 24(%edx) - movups %xmm0, 16(%edx) - -/ pfsub %mm1, %mm3 -/ pfsub %mm5, %mm7 - subps %xmm1, %xmm2 - -/ pfmul 16(%ebx), %mm3 -/ pfmul 24(%ebx), %mm7 - movups 16(%ebx), %xmm7 - mulps %xmm7, %xmm2 - -/ pswapd %mm3, %mm3 -/ pswapd %mm7, %mm7 - shufps $27, %xmm2, %xmm2 - -/ movq %mm3, 104(%edx) -/ movq %mm7, 96(%edx) - movups %xmm2, 96(%edx) - -// [3] -/ movq 32(%eax), %mm0 -/ movq 40(%eax), %mm4 - movups 32(%eax), %xmm0 - -/ movq %mm0, %mm3 -/ movq %mm4, %mm7 - movaps %xmm0, %xmm2 - -/ movq 88(%eax), %mm1 -/ movq 80(%eax), %mm5 - movups 80(%eax), %xmm1 - -/ pswapd %mm1, %mm1 -/ pswapd %mm5, %mm5 
-//// shufps $177, %xmm1, %xmm1 - shufps $27, %xmm1, %xmm1 - -/ pfadd %mm1, %mm0 -/ pfadd %mm5, %mm4 - addps %xmm1, %xmm0 - -/ movq %mm0, 32(%edx) -/ movq %mm4, 40(%edx) - movups %xmm0, 32(%edx) - -/ pfsub %mm1, %mm3 -/ pfsub %mm5, %mm7 - subps %xmm1, %xmm2 - -/ pfmul 32(%ebx), %mm3 -/ pfmul 40(%ebx), %mm7 - movups 32(%ebx), %xmm7 - mulps %xmm7, %xmm2 - -/ pswapd %mm3, %mm3 -/ pswapd %mm7, %mm7 - shufps $27, %xmm2, %xmm2 - -/ movq %mm3, 88(%edx) -/ movq %mm7, 80(%edx) - movups %xmm2, 80(%edx) - -// [4] -/ movq 48(%eax), %mm0 -/ movq 56(%eax), %mm4 - movups 48(%eax), %xmm0 - -/ movq %mm0, %mm3 -/ movq %mm4, %mm7 - movaps %xmm0, %xmm2 - -/ movq 72(%eax), %mm1 -/ movq 64(%eax), %mm5 - movups 64(%eax), %xmm1 - -/ pswapd %mm1, %mm1 -/ pswapd %mm5, %mm5 -//// shufps $177, %xmm1, %xmm1 - shufps $27, %xmm1, %xmm1 - -/ pfadd %mm1, %mm0 -/ pfadd %mm5, %mm4 - addps %xmm1, %xmm0 - -/ movq %mm0, 48(%edx) -/ movq %mm4, 56(%edx) - movups %xmm0, 48(%edx) - -/ pfsub %mm1, %mm3 -/ pfsub %mm5, %mm7 - subps %xmm1, %xmm2 - -/ pfmul 48(%ebx), %mm3 -/ pfmul 56(%ebx), %mm7 - movups 48(%ebx), %xmm7 - mulps %xmm7, %xmm2 - -/ pswapd %mm3, %mm3 -/ pswapd %mm7, %mm7 - shufps $27, %xmm2, %xmm2 - -/ movq %mm3, 72(%edx) -/ movq %mm7, 64(%edx) - movups %xmm2, 64(%edx) - - -// phase 1 fpu code -/* Phase 1*/ -/* - flds (%eax) - leal 128(%esp),%edx - fadds 124(%eax) - movl 272(%esp),%esi - fstps (%edx) - movl 276(%esp),%edi - - flds 4(%eax) - movl $costab_mmx,%ebx - fadds 120(%eax) - orl %ecx,%ecx - fstps 4(%edx) - - flds (%eax) - movl %esp,%ecx - fsubs 124(%eax) - fmuls (%ebx) - fstps 124(%edx) - - flds 4(%eax) - fsubs 120(%eax) - fmuls 4(%ebx) - fstps 120(%edx) - - flds 8(%eax) - fadds 116(%eax) - fstps 8(%edx) - - flds 12(%eax) - fadds 112(%eax) - fstps 12(%edx) - - flds 8(%eax) - fsubs 116(%eax) - fmuls 8(%ebx) - fstps 116(%edx) - - flds 12(%eax) - fsubs 112(%eax) - fmuls 12(%ebx) - fstps 112(%edx) - - flds 16(%eax) - fadds 108(%eax) - fstps 16(%edx) - - flds 20(%eax) - fadds 104(%eax) - fstps 
20(%edx) - - flds 16(%eax) - fsubs 108(%eax) - fmuls 16(%ebx) - fstps 108(%edx) - - flds 20(%eax) - fsubs 104(%eax) - fmuls 20(%ebx) - fstps 104(%edx) - - flds 24(%eax) - fadds 100(%eax) - fstps 24(%edx) - - flds 28(%eax) - fadds 96(%eax) - fstps 28(%edx) - - flds 24(%eax) - fsubs 100(%eax) - fmuls 24(%ebx) - fstps 100(%edx) - - flds 28(%eax) - fsubs 96(%eax) - fmuls 28(%ebx) - fstps 96(%edx) - - flds 32(%eax) - fadds 92(%eax) - fstps 32(%edx) - - flds 36(%eax) - fadds 88(%eax) - fstps 36(%edx) - - flds 32(%eax) - fsubs 92(%eax) - fmuls 32(%ebx) - fstps 92(%edx) - - flds 36(%eax) - fsubs 88(%eax) - fmuls 36(%ebx) - fstps 88(%edx) - - flds 40(%eax) - fadds 84(%eax) - fstps 40(%edx) - - flds 44(%eax) - fadds 80(%eax) - fstps 44(%edx) - - flds 40(%eax) - fsubs 84(%eax) - fmuls 40(%ebx) - fstps 84(%edx) - - flds 44(%eax) - fsubs 80(%eax) - fmuls 44(%ebx) - fstps 80(%edx) - - flds 48(%eax) - fadds 76(%eax) - fstps 48(%edx) - - flds 52(%eax) - fadds 72(%eax) - fstps 52(%edx) - - flds 48(%eax) - fsubs 76(%eax) - fmuls 48(%ebx) - fstps 76(%edx) - - flds 52(%eax) - fsubs 72(%eax) - fmuls 52(%ebx) - fstps 72(%edx) - - flds 56(%eax) - fadds 68(%eax) - fstps 56(%edx) - - flds 60(%eax) - fadds 64(%eax) - fstps 60(%edx) - - flds 56(%eax) - fsubs 68(%eax) - fmuls 56(%ebx) - fstps 68(%edx) - - flds 60(%eax) - fsubs 64(%eax) - fmuls 60(%ebx) - fstps 64(%edx) -*/ -// end phase 1 fpu code - -/* Phase 2 (completed, worx) */ - -/ movq (%edx), %mm0 -/ movq 8(%edx), %mm4 - movups (%edx), %xmm0 - -/ movq %mm0, %mm3 -/ movq %mm4, %mm7 - movaps %xmm0, %xmm2 - -/ movq 56(%edx), %mm1 -/ movq 48(%edx), %mm5 - movups 48(%edx), %xmm1 - -/ pswapd %mm1, %mm1 -/ pswapd %mm5, %mm5 -//// shufps $177, %xmm1, %xmm1 - shufps $27, %xmm1, %xmm1 - -/ pfadd %mm1, %mm0 -/ pfadd %mm5, %mm4 - addps %xmm1, %xmm0 - -/ movq %mm0, (%ecx) -/ movq %mm4, 8(%ecx) - movups %xmm0, (%ecx) - -/ pfsub %mm1, %mm3 -/ pfsub %mm5, %mm7 - subps %xmm1, %xmm2 - -/ pfmul 64(%ebx), %mm3 -/ pfmul 72(%ebx), %mm7 - movups 64(%ebx), 
%xmm7 - mulps %xmm7, %xmm2 - -/ pswapd %mm3, %mm3 -/ pswapd %mm7, %mm7 - shufps $27, %xmm2, %xmm2 - -/ movq %mm3, 56(%ecx) -/ movq %mm7, 48(%ecx) - movups %xmm2, 48(%ecx) - -/ movq 16(%edx), %mm0 -/ movq 24(%edx), %mm4 - movups 16(%edx), %xmm0 - -/ movq %mm0, %mm3 -/ movq %mm4, %mm7 - movaps %xmm0, %xmm2 - -/ movq 40(%edx), %mm1 -/ movq 32(%edx), %mm5 - movups 32(%edx), %xmm1 - -/ pswapd %mm1, %mm1 -/ pswapd %mm5, %mm5 -//// shufps $177, %xmm1, %xmm1 - shufps $27, %xmm1, %xmm1 - -/ pfadd %mm1, %mm0 -/ pfadd %mm5, %mm4 - addps %xmm1, %xmm0 - -/ movq %mm0, 16(%ecx) -/ movq %mm4, 24(%ecx) - movups %xmm0, 16(%ecx) - -/ pfsub %mm1, %mm3 -/ pfsub %mm5, %mm7 - subps %xmm1, %xmm2 - -/ pfmul 80(%ebx), %mm3 -/ pfmul 88(%ebx), %mm7 - movups 80(%ebx), %xmm7 - mulps %xmm7, %xmm2 - -/ pswapd %mm3, %mm3 -/ pswapd %mm7, %mm7 - shufps $27, %xmm2, %xmm2 - -/ movq %mm3, 40(%ecx) -/ movq %mm7, 32(%ecx) - movups %xmm2, 32(%ecx) - - -// phase 2 fpu -/* Phase 2*/ -/* - flds (%edx) - fadds 60(%edx) - fstps (%ecx) - - flds 4(%edx) - fadds 56(%edx) - fstps 4(%ecx) - - flds (%edx) - fsubs 60(%edx) - fmuls 64(%ebx) - fstps 60(%ecx) - - flds 4(%edx) - fsubs 56(%edx) - fmuls 68(%ebx) - fstps 56(%ecx) - - flds 8(%edx) - fadds 52(%edx) - fstps 8(%ecx) - - flds 12(%edx) - fadds 48(%edx) - fstps 12(%ecx) - - flds 8(%edx) - fsubs 52(%edx) - fmuls 72(%ebx) - fstps 52(%ecx) - - flds 12(%edx) - fsubs 48(%edx) - fmuls 76(%ebx) - fstps 48(%ecx) - - flds 16(%edx) - fadds 44(%edx) - fstps 16(%ecx) - - flds 20(%edx) - fadds 40(%edx) - fstps 20(%ecx) - - flds 16(%edx) - fsubs 44(%edx) - fmuls 80(%ebx) - fstps 44(%ecx) - - flds 20(%edx) - fsubs 40(%edx) - fmuls 84(%ebx) - fstps 40(%ecx) - - flds 24(%edx) - fadds 36(%edx) - fstps 24(%ecx) - - flds 28(%edx) - fadds 32(%edx) - fstps 28(%ecx) - - flds 24(%edx) - fsubs 36(%edx) - fmuls 88(%ebx) - fstps 36(%ecx) - - flds 28(%edx) - fsubs 32(%edx) - fmuls 92(%ebx) - fstps 32(%ecx) -*/ -// end phase 2 fpu - -/* Phase 3 (completed, working) */ - -/ movq 64(%edx), %mm0 
-/ movq 72(%edx), %mm4 - movups 64(%edx), %xmm0 - -/ movq %mm0, %mm3 -/ movq %mm4, %mm7 - movaps %xmm0, %xmm2 - -/ movq 120(%edx), %mm1 -/ movq 112(%edx), %mm5 - movups 112(%edx), %xmm1 - -/ pswapd %mm1, %mm1 -/ pswapd %mm5, %mm5 -//// shufps $177, %xmm1, %xmm1 - shufps $27, %xmm1, %xmm1 - -/ pfadd %mm1, %mm0 -/ pfadd %mm5, %mm4 - addps %xmm1, %xmm0 - -/ movq %mm0, 64(%ecx) -/ movq %mm4, 72(%ecx) - movups %xmm0, 64(%ecx) - -/ pfsubr %mm1, %mm3 -/ pfsubr %mm5, %mm7 -// optimized (xmm1<->xmm2) - subps %xmm2, %xmm1 - -/ pfmul 64(%ebx), %mm3 -/ pfmul 72(%ebx), %mm7 - movups 64(%ebx), %xmm7 - mulps %xmm7, %xmm1 - -/ pswapd %mm3, %mm3 -/ pswapd %mm7, %mm7 - shufps $27, %xmm1, %xmm1 - -/ movq %mm3, 120(%ecx) -/ movq %mm7, 112(%ecx) - movups %xmm1, 112(%ecx) - - -/ movq 80(%edx), %mm0 -/ movq 88(%edx), %mm4 - movups 80(%edx), %xmm0 - -/ movq %mm0, %mm3 -/ movq %mm4, %mm7 - movaps %xmm0, %xmm2 - -/ movq 104(%edx), %mm1 -/ movq 96(%edx), %mm5 - movups 96(%edx), %xmm1 - -/ pswapd %mm1, %mm1 -/ pswapd %mm5, %mm5 -//// shufps $177, %xmm1, %xmm1 - shufps $27, %xmm1, %xmm1 - -/ pfadd %mm1, %mm0 -/ pfadd %mm5, %mm4 - addps %xmm1, %xmm0 - -/ movq %mm0, 80(%ecx) -/ movq %mm4, 88(%ecx) - movups %xmm0, 80(%ecx) - -/ pfsubr %mm1, %mm3 -/ pfsubr %mm5, %mm7 -// optimized (xmm1<->xmm2) - subps %xmm2, %xmm1 - -/ pfmul 80(%ebx), %mm3 -/ pfmul 88(%ebx), %mm7 - movups 80(%ebx), %xmm7 - mulps %xmm7, %xmm1 - -/ pswapd %mm3, %mm3 -/ pswapd %mm7, %mm7 - shufps $27, %xmm1, %xmm1 - -/ movq %mm3, 104(%ecx) -/ movq %mm7, 96(%ecx) - movups %xmm1, 96(%ecx) - - -// phase 3 fpu -/* Phase 3*/ -/* - flds 64(%edx) - fadds 124(%edx) - fstps 64(%ecx) - - flds 68(%edx) - fadds 120(%edx) - fstps 68(%ecx) - - flds 124(%edx) - fsubs 64(%edx) - fmuls 64(%ebx) - fstps 124(%ecx) - - flds 120(%edx) - fsubs 68(%edx) - fmuls 68(%ebx) - fstps 120(%ecx) - - flds 72(%edx) - fadds 116(%edx) - fstps 72(%ecx) - - flds 76(%edx) - fadds 112(%edx) - fstps 76(%ecx) - - flds 116(%edx) - fsubs 72(%edx) - fmuls 72(%ebx) - fstps 
116(%ecx) - - flds 112(%edx) - fsubs 76(%edx) - fmuls 76(%ebx) - fstps 112(%ecx) - - flds 80(%edx) - fadds 108(%edx) - fstps 80(%ecx) - - flds 84(%edx) - fadds 104(%edx) - fstps 84(%ecx) - - flds 108(%edx) - fsubs 80(%edx) - fmuls 80(%ebx) - fstps 108(%ecx) - - flds 104(%edx) - fsubs 84(%edx) - fmuls 84(%ebx) - fstps 104(%ecx) - - flds 88(%edx) - fadds 100(%edx) - fstps 88(%ecx) - - flds 92(%edx) - fadds 96(%edx) - fstps 92(%ecx) - - flds 100(%edx) - fsubs 88(%edx) - fmuls 88(%ebx) - fstps 100(%ecx) - - flds 96(%edx) - fsubs 92(%edx) - fmuls 92(%ebx) - fstps 96(%ecx) -*/ -// end phase 3 fpu - - -/* Phase 4 (completed, buggy) */ -/* -/ movq 96(%ebx), %mm2 -/ movq 104(%ebx), %mm6 - movups 96(%ebx), %xmm4 - - -/ movq (%ecx), %mm0 -/ movq 8(%ecx), %mm4 - movups (%ecx), %xmm0 - -/ movq %mm0, %mm3 -/ movq %mm4, %mm7 - movaps %xmm0, %xmm2 - -/ movq 24(%ecx), %mm1 -/ movq 16(%ecx), %mm5 - movups 16(%ecx), %xmm1 - -/ pswapd %mm1, %mm1 -/ pswapd %mm5, %mm5 -//// shufps $177, %xmm1, %xmm1 - shufps $27, %xmm1, %xmm1 - -/ pfadd %mm1, %mm0 -/ pfadd %mm5, %mm4 - addps %xmm1, %xmm0 - -/ movq %mm0, (%edx) -/ movq %mm4, 8(%edx) - movups %xmm0, (%edx) - -/ pfsub %mm1, %mm3 -/ pfsub %mm5, %mm7 - subps %xmm1, %xmm2 - -/ pfmul %mm2, %mm3 -/ pfmul %mm6, %mm7 - mulps %xmm4, %xmm2 - -/ pswapd %mm3, %mm3 -/ pswapd %mm7, %mm7 - shufps $27, %xmm2, %xmm2 - -/ movq %mm3, 24(%edx) -/ movq %mm7, 16(%edx) - movups %xmm2, 16(%edx) - -/ movq 32(%ecx), %mm0 -/ movq 40(%ecx), %mm4 - movups 32(%ecx), %xmm0 - -/ movq %mm0, %mm3 -/ movq %mm4, %mm7 - movaps %xmm0, %xmm2 - -/ movq 56(%ecx), %mm1 -/ movq 48(%ecx), %mm5 - movups 48(%ecx), %xmm1 - -/ pswapd %mm1, %mm1 -/ pswapd %mm5, %mm5 -//// shufps $177, %xmm1, %xmm1 - shufps $27, %xmm1, %xmm1 - -/ pfadd %mm1, %mm0 -/ pfadd %mm5, %mm4 - addps %xmm1, %xmm0 - -/ movq %mm0, 32(%edx) -/ movq %mm4, 40(%edx) - movups %xmm0, 32(%edx) - -/ pfsubr %mm1, %mm3 -/ pfsubr %mm5, %mm7 -// Luckily we can swap this (xmm1<->xmm2) - subps %xmm2, %xmm1 - -/ pfmul %mm2, %mm3 
-/ pfmul %mm6, %mm7 - mulps %xmm4, %xmm1 - -/ pswapd %mm3, %mm3 -/ pswapd %mm7, %mm7 - shufps $27, %xmm1, %xmm1 - -/ movq %mm3, 56(%edx) -/ movq %mm7, 48(%edx) - movups %xmm1, 48(%edx) - - -/ movq 64(%ecx), %mm0 -/ movq 72(%ecx), %mm4 - movups 64(%ecx), %xmm0 - -/ movq %mm0, %mm3 -/ movq %mm4, %mm7 - movaps %xmm0, %xmm2 - -/ movq 88(%ecx), %mm1 -/ movq 80(%ecx), %mm5 - movups 80(%ecx), %xmm1 - -/ pswapd %mm1, %mm1 -/ pswapd %mm5, %mm5 -//// shufps $177, %xmm1, %xmm1 - shufps $27, %xmm1, %xmm1 - -/ pfadd %mm1, %mm0 -/ pfadd %mm5, %mm4 - addps %xmm1, %xmm0 - -/ movq %mm0, 64(%edx) -/ movq %mm4, 72(%edx) - movups %xmm0, 64(%edx) - -/ pfsub %mm1, %mm3 -/ pfsub %mm5, %mm7 - subps %xmm1, %xmm2 - -/ pfmul %mm2, %mm3 -/ pfmul %mm6, %mm7 - mulps %xmm4, %xmm2 - -/ pswapd %mm3, %mm3 -/ pswapd %mm7, %mm7 - shufps $27, %xmm2, %xmm2 - -/ movq %mm3, 88(%edx) -/ movq %mm7, 80(%edx) - movups %xmm2, 80(%edx) - - -/ movq 96(%ecx), %mm0 -/ movq 104(%ecx), %mm4 - movups 96(%ecx), %xmm0 - -/ movq %mm0, %mm3 -/ movq %mm4, %mm7 - movaps %xmm0, %xmm2 - -/ movq 120(%ecx), %mm1 -/ movq 112(%ecx), %mm5 - movups 112(%ecx), %xmm1 - -/ pswapd %mm1, %mm1 -/ pswapd %mm5, %mm5 -//// shufps $177, %xmm1, %xmm1 - shufps $27, %xmm1, %xmm1 - -/ pfadd %mm1, %mm0 -/ pfadd %mm5, %mm4 - addps %xmm1, %xmm0 - -/ movq %mm0, 96(%edx) -/ movq %mm4, 104(%edx) - movups %xmm0, 96(%edx) - -/ pfsubr %mm1, %mm3 -/ pfsubr %mm5, %mm7 -// This is already optimized, so xmm2 must be swapped with xmm1 for rest of phase - subps %xmm2, %xmm1 - -/ pfmul %mm2, %mm3 -/ pfmul %mm6, %mm7 - mulps %xmm4, %xmm1 - -/ pswapd %mm3, %mm3 -/ pswapd %mm7, %mm7 - shufps $27, %xmm1, %xmm1 - -/ movq %mm3, 120(%edx) -/ movq %mm7, 112(%edx) - movups %xmm1, 112(%edx) -*/ - -// phase 4 fpu code -/* Phase 4*/ - - flds (%ecx) - fadds 28(%ecx) - fstps (%edx) - - flds (%ecx) - fsubs 28(%ecx) - fmuls 96(%ebx) - fstps 28(%edx) - - flds 4(%ecx) - fadds 24(%ecx) - fstps 4(%edx) - - flds 4(%ecx) - fsubs 24(%ecx) - fmuls 100(%ebx) - fstps 24(%edx) - - flds 
8(%ecx) - fadds 20(%ecx) - fstps 8(%edx) - - flds 8(%ecx) - fsubs 20(%ecx) - fmuls 104(%ebx) - fstps 20(%edx) - - flds 12(%ecx) - fadds 16(%ecx) - fstps 12(%edx) - - flds 12(%ecx) - fsubs 16(%ecx) - fmuls 108(%ebx) - fstps 16(%edx) - - flds 32(%ecx) - fadds 60(%ecx) - fstps 32(%edx) - - flds 60(%ecx) - fsubs 32(%ecx) - fmuls 96(%ebx) - fstps 60(%edx) - - flds 36(%ecx) - fadds 56(%ecx) - fstps 36(%edx) - - flds 56(%ecx) - fsubs 36(%ecx) - fmuls 100(%ebx) - fstps 56(%edx) - - flds 40(%ecx) - fadds 52(%ecx) - fstps 40(%edx) - - flds 52(%ecx) - fsubs 40(%ecx) - fmuls 104(%ebx) - fstps 52(%edx) - - flds 44(%ecx) - fadds 48(%ecx) - fstps 44(%edx) - - flds 48(%ecx) - fsubs 44(%ecx) - fmuls 108(%ebx) - fstps 48(%edx) - - flds 64(%ecx) - fadds 92(%ecx) - fstps 64(%edx) - - flds 64(%ecx) - fsubs 92(%ecx) - fmuls 96(%ebx) - fstps 92(%edx) - - flds 68(%ecx) - fadds 88(%ecx) - fstps 68(%edx) - - flds 68(%ecx) - fsubs 88(%ecx) - fmuls 100(%ebx) - fstps 88(%edx) - - flds 72(%ecx) - fadds 84(%ecx) - fstps 72(%edx) - - flds 72(%ecx) - fsubs 84(%ecx) - fmuls 104(%ebx) - fstps 84(%edx) - - flds 76(%ecx) - fadds 80(%ecx) - fstps 76(%edx) - - flds 76(%ecx) - fsubs 80(%ecx) - fmuls 108(%ebx) - fstps 80(%edx) - - flds 96(%ecx) - fadds 124(%ecx) - fstps 96(%edx) - - flds 124(%ecx) - fsubs 96(%ecx) - fmuls 96(%ebx) - fstps 124(%edx) - - flds 100(%ecx) - fadds 120(%ecx) - fstps 100(%edx) - - flds 120(%ecx) - fsubs 100(%ecx) - fmuls 100(%ebx) - fstps 120(%edx) - - flds 104(%ecx) - fadds 116(%ecx) - fstps 104(%edx) - - flds 116(%ecx) - fsubs 104(%ecx) - fmuls 104(%ebx) - fstps 116(%edx) - - flds 108(%ecx) - fadds 112(%ecx) - fstps 108(%edx) - - flds 112(%ecx) - fsubs 108(%ecx) - fmuls 108(%ebx) - fstps 112(%edx) - - flds (%edx) - fadds 12(%edx) - fstps (%ecx) - - flds (%edx) - fsubs 12(%edx) - fmuls 112(%ebx) - fstps 12(%ecx) - - flds 4(%edx) - fadds 8(%edx) - fstps 4(%ecx) - - flds 4(%edx) - fsubs 8(%edx) - fmuls 116(%ebx) - fstps 8(%ecx) - - flds 16(%edx) - fadds 28(%edx) - fstps 16(%ecx) - 
- flds 28(%edx) - fsubs 16(%edx) - fmuls 112(%ebx) - fstps 28(%ecx) - - flds 20(%edx) - fadds 24(%edx) - fstps 20(%ecx) - - flds 24(%edx) - fsubs 20(%edx) - fmuls 116(%ebx) - fstps 24(%ecx) - - flds 32(%edx) - fadds 44(%edx) - fstps 32(%ecx) - - flds 32(%edx) - fsubs 44(%edx) - fmuls 112(%ebx) - fstps 44(%ecx) - - flds 36(%edx) - fadds 40(%edx) - fstps 36(%ecx) - - flds 36(%edx) - fsubs 40(%edx) - fmuls 116(%ebx) - fstps 40(%ecx) - - flds 48(%edx) - fadds 60(%edx) - fstps 48(%ecx) - - flds 60(%edx) - fsubs 48(%edx) - fmuls 112(%ebx) - fstps 60(%ecx) - - flds 52(%edx) - fadds 56(%edx) - fstps 52(%ecx) - - flds 56(%edx) - fsubs 52(%edx) - fmuls 116(%ebx) - fstps 56(%ecx) - - flds 64(%edx) - fadds 76(%edx) - fstps 64(%ecx) - - flds 64(%edx) - fsubs 76(%edx) - fmuls 112(%ebx) - fstps 76(%ecx) - - flds 68(%edx) - fadds 72(%edx) - fstps 68(%ecx) - - flds 68(%edx) - fsubs 72(%edx) - fmuls 116(%ebx) - fstps 72(%ecx) - - flds 80(%edx) - fadds 92(%edx) - fstps 80(%ecx) - - flds 92(%edx) - fsubs 80(%edx) - fmuls 112(%ebx) - fstps 92(%ecx) - - flds 84(%edx) - fadds 88(%edx) - fstps 84(%ecx) - - flds 88(%edx) - fsubs 84(%edx) - fmuls 116(%ebx) - fstps 88(%ecx) - - flds 96(%edx) - fadds 108(%edx) - fstps 96(%ecx) - - flds 96(%edx) - fsubs 108(%edx) - fmuls 112(%ebx) - fstps 108(%ecx) - - flds 100(%edx) - fadds 104(%edx) - fstps 100(%ecx) - - flds 100(%edx) - fsubs 104(%edx) - fmuls 116(%ebx) - fstps 104(%ecx) - - flds 112(%edx) - fadds 124(%edx) - fstps 112(%ecx) - - flds 124(%edx) - fsubs 112(%edx) - fmuls 112(%ebx) - fstps 124(%ecx) - - flds 116(%edx) - fadds 120(%edx) - fstps 116(%ecx) - - flds 120(%edx) - fsubs 116(%edx) - fmuls 116(%ebx) - fstps 120(%ecx) - -// end of phase 4 fpu - -// below stuff needs to be finished I use FPU code for first -/* Phase 5 (completed, crashing) */ -/* -/ movq 112(%ebx), %mm2 - // move 8 byte data to (low)high quadword - check this! 
atmos - movlps 112(%ebx), %xmm4 - // maybe I need movhlps too to get data into correct quadword - movlhps %xmm4, %xmm4 - -/ movq (%edx), %mm0 -/ movq 16(%edx), %mm4 - movups (%edx), %xmm0 - -/ movq %mm0, %mm3 -/ movq %mm4, %mm7 - movaps %xmm0, %xmm2 - -// hmm? this is strange -/ movq 8(%edx), %mm1 -/ movq 24(%edx), %mm5 - movlps 8(%edx), %xmm1 - movhps 24(%edx), %xmm1 - -/ pswapd %mm1, %mm1 -/ pswapd %mm5, %mm5 - pshufd $177, %xmm1, %xmm1 - -/ pfadd %mm1, %mm0 -/ pfadd %mm5, %mm4 - addps %xmm1, %xmm0 - -/ movq %mm0, (%ecx) -/ movq %mm4, 16(%ecx) - movlps %xmm0, (%ecx) - movhps %xmm0, 16(%ecx) - -/ pfsub %mm1, %mm3 -/ pfsubr %mm5, %mm7 -// I need to emulate pfsubr here - movaps %xmm1, %xmm3 - subps %xmm2, %xmm3 - subps %xmm1, %xmm2 -// now move correct quadword from reverse substration in xmm3 to correct -// quadword in xmm2 and leave other quadword with non-reversed substration untouched -/// shufpd $2, %xmm3, %xmm2 -// (or $1?) (see ia32-ref p.749) -// optimize - movq %xmm2, %xmm3 - movaps %xmm3, %xmm2 - -/ pfmul %mm2, %mm3 -/ pfmul %mm2, %mm7 - mulps %xmm4, %xmm2 - -/ pswapd %mm3, %mm3 -/ pswapd %mm7, %mm7 - shufps $177, %xmm2, %xmm2 - -/ movq %mm3, 8(%ecx) -/ movq %mm7, 24(%ecx) - movlps %xmm2, 8(%ecx) - movhps %xmm2, 24(%ecx) - -/ movq 32(%edx), %mm0 -/ movq 48(%edx), %mm4 - movlps 32(%edx), %xmm0 - movhps 48(%edx), %xmm0 - -/ movq %mm0, %mm3 -/ movq %mm4, %mm7 - movaps %xmm0, %xmm2 - -/ movq 40(%edx), %mm1 -/ movq 56(%edx), %mm5 - movlps 40(%edx), %xmm1 - movhps 56(%edx), %xmm1 - -/ pswapd %mm1, %mm1 -/ pswapd %mm5, %mm5 - shufps $177, %xmm1, %xmm1 - -/ pfadd %mm1, %mm0 -/ pfadd %mm5, %mm4 - addps %xmm1, %xmm0 - -/ movq %mm0, 32(%ecx) -/ movq %mm4, 48(%ecx) - movlps %xmm0, 32(%ecx) - movhps %xmm0, 48(%ecx) - -/ pfsub %mm1, %mm3 -/ pfsubr %mm5, %mm7 - movaps %xmm1, %xmm3 - subps %xmm2, %xmm3 - subps %xmm1, %xmm2 -/// shufpd $2, %xmm3, %xmm2 -// (or $1?) 
-// optimize - movq %xmm2, %xmm3 - movaps %xmm3, %xmm2 - -/ pfmul %mm2, %mm3 -/ pfmul %mm2, %mm7 - mulps %xmm4, %xmm2 - -/ pswapd %mm3, %mm3 -/ pswapd %mm7, %mm7 - shufps $177, %xmm2, %xmm2 - -/ movq %mm3, 40(%ecx) -/ movq %mm7, 56(%ecx) - movlps %xmm2, 40(%ecx) - movhps %xmm2, 56(%ecx) - - -/ movq 64(%edx), %mm0 -/ movq 80(%edx), %mm4 - movlps 64(%edx), %xmm0 - movhps 80(%edx), %xmm0 - -/ movq %mm0, %mm3 -/ movq %mm4, %mm7 - movaps %xmm0, %xmm2 - -/ movq 72(%edx), %mm1 -/ movq 88(%edx), %mm5 - movlps 72(%edx), %xmm1 - movhps 88(%edx), %xmm1 - -/ pswapd %mm1, %mm1 -/ pswapd %mm5, %mm5 - shufps $177, %xmm1, %xmm1 - -/ pfadd %mm1, %mm0 -/ pfadd %mm5, %mm4 - addps %xmm1, %xmm0 - -/ movq %mm0, 64(%ecx) -/ movq %mm4, 80(%ecx) - movlps %xmm0, 64(%ecx) - movhps %xmm0, 80(%ecx) - -/ pfsub %mm1, %mm3 -/ pfsubr %mm5, %mm7 - movaps %xmm1, %xmm3 - subps %xmm2, %xmm3 - subps %xmm1, %xmm2 -/// shufpd $2, %xmm3, %xmm2 -// (or $1?) -// optimize - movq %xmm2, %xmm3 - movaps %xmm3, %xmm2 - -/ pfmul %mm2, %mm3 -/ pfmul %mm2, %mm7 - mulps %xmm4, %xmm2 - -/ pswapd %mm3, %mm3 -/ pswapd %mm7, %mm7 - shufps $177, %xmm2, %xmm2 - -/ movq %mm3, 72(%ecx) -/ movq %mm7, 88(%ecx) - movlps %xmm2, 72(%ecx) - movhps %xmm2, 88(%ecx) - -/ movq 96(%edx), %mm0 -/ movq 112(%edx), %mm4 - movups 96(%edx), %xmm0 - -/ movq %mm0, %mm3 -/ movq %mm4, %mm7 - movaps %xmm0, %xmm2 - -/ movq 104(%edx), %mm1 -/ movq 120(%edx), %mm5 - movlps 104(%edx), %xmm1 - movhps 120(%edx), %xmm1 - -/ pswapd %mm1, %mm1 -/ pswapd %mm5, %mm5 - shufps $177, %xmm1, %xmm1 - -/ pfadd %mm1, %mm0 -/ pfadd %mm5, %mm4 - addps %xmm1, %xmm0 - -/ movq %mm0, 96(%ecx) -/ movq %mm4, 112(%ecx) - movups %xmm0, 96(%ecx) - -/ pfsub %mm1, %mm3 -/ pfsubr %mm5, %mm7 - movaps %xmm1, %xmm3 - subps %xmm2, %xmm3 - subps %xmm1, %xmm2 -/// shufpd $2, %xmm3, %xmm2 -// (or $1?) 
-// optimize - movq %xmm2, %xmm3 - movaps %xmm3, %xmm2 - -/ pfmul %mm2, %mm3 -/ pfmul %mm2, %mm7 - mulps %xmm4, %xmm2 - -/ pswapd %mm3, %mm3 -/ pswapd %mm7, %mm7 - shufps $177, %xmm2, %xmm2 - -/ movq %mm3, 104(%ecx) -/ movq %mm7, 120(%ecx) - movlps %xmm2, 104(%ecx) - movhps %xmm2, 120(%ecx) -*/ - - -/* Phase 6. This is the end of easy road. */ -/* Code below is coded in scalar mode. Should be optimized */ -// -// movd plus_1f, %mm6 -// punpckldq 120(%ebx), %mm6 /* mm6 = 1.0 | 120(%ebx)*/ -// movq x_plus_minus_3dnow, %mm7 /* mm7 = +1 | -1 */ -/* - movq 32(%ecx), %mm0 - movq 64(%ecx), %mm2 - movq %mm0, %mm1 - movq %mm2, %mm3 - pxor %mm7, %mm1 - pxor %mm7, %mm3 - pfacc %mm1, %mm0 - pfacc %mm3, %mm2 - pfmul %mm6, %mm0 - pfmul %mm6, %mm2 - movq %mm0, 32(%edx) - movq %mm2, 64(%edx) - - movd 44(%ecx), %mm0 - movd 40(%ecx), %mm2 - movd 120(%ebx), %mm3 - punpckldq 76(%ecx), %mm0 - punpckldq 72(%ecx), %mm2 - punpckldq %mm3, %mm3 - movq %mm0, %mm4 - movq %mm2, %mm5 - pfsub %mm2, %mm0 - pfmul %mm3, %mm0 - movq %mm0, %mm1 - pfadd %mm5, %mm0 - pfadd %mm4, %mm0 - movq %mm0, %mm2 - punpckldq %mm1, %mm0 - punpckhdq %mm1, %mm2 - movq %mm0, 40(%edx) - movq %mm2, 72(%edx) - - movd 48(%ecx), %mm3 - movd 60(%ecx), %mm2 |