-rw-r--r--	mp3lib/dct64_sse.s	2217
1 file changed, 2217 insertions, 0 deletions
diff --git a/mp3lib/dct64_sse.s b/mp3lib/dct64_sse.s
new file mode 100644
index 0000000000..922e1c881a
--- /dev/null
+++ b/mp3lib/dct64_sse.s
@@ -0,0 +1,2217 @@
+# This code is a translation of dct64_k7.s from MPlayer.
+# Coded by Felix Buenemann <atmosfear at users.sourceforge.net>
+#
+# TODO: - fix phases 4 and 5 (sse)
+# - optimize scalar FPU code? (interleave with sse code)
+#
+
+//.data
+// .align 8
+//x_plus_minus_3dnow: .long 0x00000000, 0x80000000
+//plus_1f: .float 1.0
+
+.text
+
+ .align 16
+
+ .global dct64_MMX_sse
+
+dct64_MMX_sse:
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ subl $256,%esp
+ movl 280(%esp),%eax
+
+ leal 128(%esp),%edx
+ movl 272(%esp),%esi
+ movl 276(%esp),%edi
+ movl $costab_mmx,%ebx
+ orl %ecx,%ecx
+ movl %esp,%ecx
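+
+// Register layout from here on (argument order presumably as in the other
+// dct64_* implementations in mp3lib): %eax = input samples (3rd arg,
+// 280(%esp)), %esi/%edi = presumably the two output pointers
+// (272/276(%esp)), %ebx = costab_mmx, %ecx = lower and %edx = upper half
+// of the 256-byte scratch area on the stack.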
+
+/* Phase 1 (complete, worx) */
+
+// [1] Process Block A1 (16 Bytes)
+/ movq (%eax), %mm0
+/ movq 8(%eax), %mm4
+ movups (%eax), %xmm0
+
+// Copy A1 to another register A2
+/ movq %mm0, %mm3
+/ movq %mm4, %mm7
+ movaps %xmm0, %xmm2
+
+// Process Block B1 (last 16 bytes)
+/ movq 120(%eax), %mm1
+/ movq 112(%eax), %mm5
+ movups 112(%eax), %xmm1
+
+/* The PSWAPD instruction swaps or reverses the upper and lower
+ * doublewords of the source operand. PSWAPD mmreg1, mmreg2
+ * performs the following operations:
+ * temp = mmreg2
+ * mmreg1[63:32] = temp[31:0 ]
+ * mmreg1[31:0 ] = temp[63:32]
+ */
+/ pswapd %mm1, %mm1
+/ pswapd %mm5, %mm5
+// shufps $177 (disabled below) would exchange a,b,c,d to b,a,d,c in xmm1
+// (desc ia32-ref p.752); the shufps $27 used instead reverses to d,c,b,a
+//// shufps $177, %xmm1, %xmm1
+ shufps $27, %xmm1, %xmm1
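+// (The two disabled pswapd's each reverse one 8-byte half of the MMX loads,
+// which were taken from 120(%eax) and 112(%eax) in swapped order; together
+// that amounts to a full reversal of the 16 bytes at 112(%eax), which the
+// single shufps $27 performs on the one 16-byte load above.)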
+
+// Add B1 to A1
+/ pfadd %mm1, %mm0
+/ pfadd %mm5, %mm4
+ addps %xmm1, %xmm0
+
+// Save Block A1
+/ movq %mm0, (%edx)
+/ movq %mm4, 8(%edx)
+ movups %xmm0, (%edx)
+
+// Sub B1 from A2
+/ pfsub %mm1, %mm3
+/ pfsub %mm5, %mm7
+ subps %xmm1, %xmm2
+
+// Mul mem with A2
+/ pfmul (%ebx), %mm3
+/ pfmul 8(%ebx), %mm7
+ movups (%ebx), %xmm7
+ mulps %xmm7, %xmm2
+
+// Shuffle A2
+/ pswapd %mm3, %mm3
+/ pswapd %mm7, %mm7
+// I do a,b,c,d -> d,c,b,a to suit the store order to mem (saves one shufps)
+ shufps $27, %xmm2, %xmm2
+
+// Save A2 to mem (end)
+/ movq %mm3, 120(%edx)
+/ movq %mm7, 112(%edx)
+ movups %xmm2, 112(%edx)
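+
+// In scalar terms each block of this phase computes, for i = 0..3 in this
+// block and correspondingly in the blocks below (cf. the FPU reference code
+// kept further down):
+//   tmp[i]    = in[i] + in[31-i]
+//   tmp[31-i] = (in[i] - in[31-i]) * costab[i]
+// with the scaled differences stored in the reversed order produced by the
+// shufps $27 above.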
+
+// [2] Process next data block
+/ movq 16(%eax), %mm0
+/ movq 24(%eax), %mm4
+ movups 16(%eax), %xmm0
+
+/ movq %mm0, %mm3
+/ movq %mm4, %mm7
+ movaps %xmm0, %xmm2
+
+/ movq 104(%eax), %mm1
+/ movq 96(%eax), %mm5
+ movups 96(%eax), %xmm1
+
+/ pswapd %mm1, %mm1
+/ pswapd %mm5, %mm5
+//// shufps $177, %xmm1, %xmm1
+ shufps $27, %xmm1, %xmm1
+
+/ pfadd %mm1, %mm0
+/ pfadd %mm5, %mm4
+ addps %xmm1, %xmm0
+
+/ movq %mm0, 16(%edx)
+/ movq %mm4, 24(%edx)
+ movups %xmm0, 16(%edx)
+
+/ pfsub %mm1, %mm3
+/ pfsub %mm5, %mm7
+ subps %xmm1, %xmm2
+
+/ pfmul 16(%ebx), %mm3
+/ pfmul 24(%ebx), %mm7
+ movups 16(%ebx), %xmm7
+ mulps %xmm7, %xmm2
+
+/ pswapd %mm3, %mm3
+/ pswapd %mm7, %mm7
+ shufps $27, %xmm2, %xmm2
+
+/ movq %mm3, 104(%edx)
+/ movq %mm7, 96(%edx)
+ movups %xmm2, 96(%edx)
+
+// [3]
+/ movq 32(%eax), %mm0
+/ movq 40(%eax), %mm4
+ movups 32(%eax), %xmm0
+
+/ movq %mm0, %mm3
+/ movq %mm4, %mm7
+ movaps %xmm0, %xmm2
+
+/ movq 88(%eax), %mm1
+/ movq 80(%eax), %mm5
+ movups 80(%eax), %xmm1
+
+/ pswapd %mm1, %mm1
+/ pswapd %mm5, %mm5
+//// shufps $177, %xmm1, %xmm1
+ shufps $27, %xmm1, %xmm1
+
+/ pfadd %mm1, %mm0
+/ pfadd %mm5, %mm4
+ addps %xmm1, %xmm0
+
+/ movq %mm0, 32(%edx)
+/ movq %mm4, 40(%edx)
+ movups %xmm0, 32(%edx)
+
+/ pfsub %mm1, %mm3
+/ pfsub %mm5, %mm7
+ subps %xmm1, %xmm2
+
+/ pfmul 32(%ebx), %mm3
+/ pfmul 40(%ebx), %mm7
+ movups 32(%ebx), %xmm7
+ mulps %xmm7, %xmm2
+
+/ pswapd %mm3, %mm3
+/ pswapd %mm7, %mm7
+ shufps $27, %xmm2, %xmm2
+
+/ movq %mm3, 88(%edx)
+/ movq %mm7, 80(%edx)
+ movups %xmm2, 80(%edx)
+
+// [4]
+/ movq 48(%eax), %mm0
+/ movq 56(%eax), %mm4
+ movups 48(%eax), %xmm0
+
+/ movq %mm0, %mm3
+/ movq %mm4, %mm7
+ movaps %xmm0, %xmm2
+
+/ movq 72(%eax), %mm1
+/ movq 64(%eax), %mm5
+ movups 64(%eax), %xmm1
+
+/ pswapd %mm1, %mm1
+/ pswapd %mm5, %mm5
+//// shufps $177, %xmm1, %xmm1
+ shufps $27, %xmm1, %xmm1
+
+/ pfadd %mm1, %mm0
+/ pfadd %mm5, %mm4
+ addps %xmm1, %xmm0
+
+/ movq %mm0, 48(%edx)
+/ movq %mm4, 56(%edx)
+ movups %xmm0, 48(%edx)
+
+/ pfsub %mm1, %mm3
+/ pfsub %mm5, %mm7
+ subps %xmm1, %xmm2
+
+/ pfmul 48(%ebx), %mm3
+/ pfmul 56(%ebx), %mm7
+ movups 48(%ebx), %xmm7
+ mulps %xmm7, %xmm2
+
+/ pswapd %mm3, %mm3
+/ pswapd %mm7, %mm7
+ shufps $27, %xmm2, %xmm2
+
+/ movq %mm3, 72(%edx)
+/ movq %mm7, 64(%edx)
+ movups %xmm2, 64(%edx)
+
+
+// phase 1 fpu code
+/* Phase 1*/
+/*
+ flds (%eax)
+ leal 128(%esp),%edx
+ fadds 124(%eax)
+ movl 272(%esp),%esi
+ fstps (%edx)
+ movl 276(%esp),%edi
+
+ flds 4(%eax)
+ movl $costab_mmx,%ebx
+ fadds 120(%eax)
+ orl %ecx,%ecx
+ fstps 4(%edx)
+
+ flds (%eax)
+ movl %esp,%ecx
+ fsubs 124(%eax)
+ fmuls (%ebx)
+ fstps 124(%edx)
+
+ flds 4(%eax)
+ fsubs 120(%eax)
+ fmuls 4(%ebx)
+ fstps 120(%edx)
+
+ flds 8(%eax)
+ fadds 116(%eax)
+ fstps 8(%edx)
+
+ flds 12(%eax)
+ fadds 112(%eax)
+ fstps 12(%edx)
+
+ flds 8(%eax)
+ fsubs 116(%eax)
+ fmuls 8(%ebx)
+ fstps 116(%edx)
+
+ flds 12(%eax)
+ fsubs 112(%eax)
+ fmuls 12(%ebx)
+ fstps 112(%edx)
+
+ flds 16(%eax)
+ fadds 108(%eax)
+ fstps 16(%edx)
+
+ flds 20(%eax)
+ fadds 104(%eax)
+ fstps 20(%edx)
+
+ flds 16(%eax)
+ fsubs 108(%eax)
+ fmuls 16(%ebx)
+ fstps 108(%edx)
+
+ flds 20(%eax)
+ fsubs 104(%eax)
+ fmuls 20(%ebx)
+ fstps 104(%edx)
+
+ flds 24(%eax)
+ fadds 100(%eax)
+ fstps 24(%edx)
+
+ flds 28(%eax)
+ fadds 96(%eax)
+ fstps 28(%edx)
+
+ flds 24(%eax)
+ fsubs 100(%eax)
+ fmuls 24(%ebx)
+ fstps 100(%edx)
+
+ flds 28(%eax)
+ fsubs 96(%eax)
+ fmuls 28(%ebx)
+ fstps 96(%edx)
+
+ flds 32(%eax)
+ fadds 92(%eax)
+ fstps 32(%edx)
+
+ flds 36(%eax)
+ fadds 88(%eax)
+ fstps 36(%edx)
+
+ flds 32(%eax)
+ fsubs 92(%eax)
+ fmuls 32(%ebx)
+ fstps 92(%edx)
+
+ flds 36(%eax)
+ fsubs 88(%eax)
+ fmuls 36(%ebx)
+ fstps 88(%edx)
+
+ flds 40(%eax)
+ fadds 84(%eax)
+ fstps 40(%edx)
+
+ flds 44(%eax)
+ fadds 80(%eax)
+ fstps 44(%edx)
+
+ flds 40(%eax)
+ fsubs 84(%eax)
+ fmuls 40(%ebx)
+ fstps 84(%edx)
+
+ flds 44(%eax)
+ fsubs 80(%eax)
+ fmuls 44(%ebx)
+ fstps 80(%edx)
+
+ flds 48(%eax)
+ fadds 76(%eax)
+ fstps 48(%edx)
+
+ flds 52(%eax)
+ fadds 72(%eax)
+ fstps 52(%edx)
+
+ flds 48(%eax)
+ fsubs 76(%eax)
+ fmuls 48(%ebx)
+ fstps 76(%edx)
+
+ flds 52(%eax)
+ fsubs 72(%eax)
+ fmuls 52(%ebx)
+ fstps 72(%edx)
+
+ flds 56(%eax)
+ fadds 68(%eax)
+ fstps 56(%edx)
+
+ flds 60(%eax)
+ fadds 64(%eax)
+ fstps 60(%edx)
+
+ flds 56(%eax)
+ fsubs 68(%eax)
+ fmuls 56(%ebx)
+ fstps 68(%edx)
+
+ flds 60(%eax)
+ fsubs 64(%eax)
+ fmuls 60(%ebx)
+ fstps 64(%edx)
+*/
+// end phase 1 fpu code
+
+/* Phase 2 (completed, worx) */
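+// Same add/sub/scale butterfly as phase 1, now on the first 16 floats of the
+// scratch block at %edx, writing the results to %ecx and using the costab
+// entries starting at 64(%ebx).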
+
+/ movq (%edx), %mm0
+/ movq 8(%edx), %mm4
+ movups (%edx), %xmm0
+
+/ movq %mm0, %mm3
+/ movq %mm4, %mm7
+ movaps %xmm0, %xmm2
+
+/ movq 56(%edx), %mm1
+/ movq 48(%edx), %mm5
+ movups 48(%edx), %xmm1
+
+/ pswapd %mm1, %mm1
+/ pswapd %mm5, %mm5
+//// shufps $177, %xmm1, %xmm1
+ shufps $27, %xmm1, %xmm1
+
+/ pfadd %mm1, %mm0
+/ pfadd %mm5, %mm4
+ addps %xmm1, %xmm0
+
+/ movq %mm0, (%ecx)
+/ movq %mm4, 8(%ecx)
+ movups %xmm0, (%ecx)
+
+/ pfsub %mm1, %mm3
+/ pfsub %mm5, %mm7
+ subps %xmm1, %xmm2
+
+/ pfmul 64(%ebx), %mm3
+/ pfmul 72(%ebx), %mm7
+ movups 64(%ebx), %xmm7
+ mulps %xmm7, %xmm2
+
+/ pswapd %mm3, %mm3
+/ pswapd %mm7, %mm7
+ shufps $27, %xmm2, %xmm2
+
+/ movq %mm3, 56(%ecx)
+/ movq %mm7, 48(%ecx)
+ movups %xmm2, 48(%ecx)
+
+/ movq 16(%edx), %mm0
+/ movq 24(%edx), %mm4
+ movups 16(%edx), %xmm0
+
+/ movq %mm0, %mm3
+/ movq %mm4, %mm7
+ movaps %xmm0, %xmm2
+
+/ movq 40(%edx), %mm1
+/ movq 32(%edx), %mm5
+ movups 32(%edx), %xmm1
+
+/ pswapd %mm1, %mm1
+/ pswapd %mm5, %mm5
+//// shufps $177, %xmm1, %xmm1
+ shufps $27, %xmm1, %xmm1
+
+/ pfadd %mm1, %mm0
+/ pfadd %mm5, %mm4
+ addps %xmm1, %xmm0
+
+/ movq %mm0, 16(%ecx)
+/ movq %mm4, 24(%ecx)
+ movups %xmm0, 16(%ecx)
+
+/ pfsub %mm1, %mm3
+/ pfsub %mm5, %mm7
+ subps %xmm1, %xmm2
+
+/ pfmul 80(%ebx), %mm3
+/ pfmul 88(%ebx), %mm7
+ movups 80(%ebx), %xmm7
+ mulps %xmm7, %xmm2
+
+/ pswapd %mm3, %mm3
+/ pswapd %mm7, %mm7
+ shufps $27, %xmm2, %xmm2
+
+/ movq %mm3, 40(%ecx)
+/ movq %mm7, 32(%ecx)
+ movups %xmm2, 32(%ecx)
+
+
+// phase 2 fpu
+/* Phase 2*/
+/*
+ flds (%edx)
+ fadds 60(%edx)
+ fstps (%ecx)
+
+ flds 4(%edx)
+ fadds 56(%edx)
+ fstps 4(%ecx)
+
+ flds (%edx)
+ fsubs 60(%edx)
+ fmuls 64(%ebx)
+ fstps 60(%ecx)
+
+ flds 4(%edx)
+ fsubs 56(%edx)
+ fmuls 68(%ebx)
+ fstps 56(%ecx)
+
+ flds 8(%edx)
+ fadds 52(%edx)
+ fstps 8(%ecx)
+
+ flds 12(%edx)
+ fadds 48(%edx)
+ fstps 12(%ecx)
+
+ flds 8(%edx)
+ fsubs 52(%edx)
+ fmuls 72(%ebx)
+ fstps 52(%ecx)
+
+ flds 12(%edx)
+ fsubs 48(%edx)
+ fmuls 76(%ebx)
+ fstps 48(%ecx)
+
+ flds 16(%edx)
+ fadds 44(%edx)
+ fstps 16(%ecx)
+
+ flds 20(%edx)
+ fadds 40(%edx)
+ fstps 20(%ecx)
+
+ flds 16(%edx)
+ fsubs 44(%edx)
+ fmuls 80(%ebx)
+ fstps 44(%ecx)
+
+ flds 20(%edx)
+ fsubs 40(%edx)
+ fmuls 84(%ebx)
+ fstps 40(%ecx)
+
+ flds 24(%edx)
+ fadds 36(%edx)
+ fstps 24(%ecx)
+
+ flds 28(%edx)
+ fadds 32(%edx)
+ fstps 28(%ecx)
+
+ flds 24(%edx)
+ fsubs 36(%edx)
+ fmuls 88(%ebx)
+ fstps 36(%ecx)
+
+ flds 28(%edx)
+ fsubs 32(%edx)
+ fmuls 92(%ebx)
+ fstps 32(%ecx)
+*/
+// end phase 2 fpu
+
+/* Phase 3 (completed, working) */
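+// Like phase 2, but on the upper 16 floats of the scratch block
+// (64..124(%edx)); the 3DNow! original uses pfsubr here, which is handled
+// below by simply swapping the subps operands (xmm1 - xmm2 instead of
+// xmm2 - xmm1).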
+
+/ movq 64(%edx), %mm0
+/ movq 72(%edx), %mm4
+ movups 64(%edx), %xmm0
+
+/ movq %mm0, %mm3
+/ movq %mm4, %mm7
+ movaps %xmm0, %xmm2
+
+/ movq 120(%edx), %mm1
+/ movq 112(%edx), %mm5
+ movups 112(%edx), %xmm1
+
+/ pswapd %mm1, %mm1
+/ pswapd %mm5, %mm5
+//// shufps $177, %xmm1, %xmm1
+ shufps $27, %xmm1, %xmm1
+
+/ pfadd %mm1, %mm0
+/ pfadd %mm5, %mm4
+ addps %xmm1, %xmm0
+
+/ movq %mm0, 64(%ecx)
+/ movq %mm4, 72(%ecx)
+ movups %xmm0, 64(%ecx)
+
+/ pfsubr %mm1, %mm3
+/ pfsubr %mm5, %mm7
+// optimized (xmm1<->xmm2)
+ subps %xmm2, %xmm1
+
+/ pfmul 64(%ebx), %mm3
+/ pfmul 72(%ebx), %mm7
+ movups 64(%ebx), %xmm7
+ mulps %xmm7, %xmm1
+
+/ pswapd %mm3, %mm3
+/ pswapd %mm7, %mm7
+ shufps $27, %xmm1, %xmm1
+
+/ movq %mm3, 120(%ecx)
+/ movq %mm7, 112(%ecx)
+ movups %xmm1, 112(%ecx)
+
+
+/ movq 80(%edx), %mm0
+/ movq 88(%edx), %mm4
+ movups 80(%edx), %xmm0
+
+/ movq %mm0, %mm3
+/ movq %mm4, %mm7
+ movaps %xmm0, %xmm2
+
+/ movq 104(%edx), %mm1
+/ movq 96(%edx), %mm5
+ movups 96(%edx), %xmm1
+
+/ pswapd %mm1, %mm1
+/ pswapd %mm5, %mm5
+//// shufps $177, %xmm1, %xmm1
+ shufps $27, %xmm1, %xmm1
+
+/ pfadd %mm1, %mm0
+/ pfadd %mm5, %mm4
+ addps %xmm1, %xmm0
+
+/ movq %mm0, 80(%ecx)
+/ movq %mm4, 88(%ecx)
+ movups %xmm0, 80(%ecx)
+
+/ pfsubr %mm1, %mm3
+/ pfsubr %mm5, %mm7
+// optimized (xmm1<->xmm2)
+ subps %xmm2, %xmm1
+
+/ pfmul 80(%ebx), %mm3
+/ pfmul 88(%ebx), %mm7
+ movups 80(%ebx), %xmm7
+ mulps %xmm7, %xmm1
+
+/ pswapd %mm3, %mm3
+/ pswapd %mm7, %mm7
+ shufps $27, %xmm1, %xmm1
+
+/ movq %mm3, 104(%ecx)
+/ movq %mm7, 96(%ecx)
+ movups %xmm1, 96(%ecx)
+
+
+// phase 3 fpu
+/* Phase 3*/
+/*
+ flds 64(%edx)
+ fadds 124(%edx)
+ fstps 64(%ecx)
+
+ flds 68(%edx)
+ fadds 120(%edx)
+ fstps 68(%ecx)
+
+ flds 124(%edx)
+ fsubs 64(%edx)
+ fmuls 64(%ebx)
+ fstps 124(%ecx)
+
+ flds 120(%edx)
+ fsubs 68(%edx)
+ fmuls 68(%ebx)
+ fstps 120(%ecx)
+
+ flds 72(%edx)
+ fadds 116(%edx)
+ fstps 72(%ecx)
+
+ flds 76(%edx)
+ fadds 112(%edx)
+ fstps 76(%ecx)
+
+ flds 116(%edx)
+ fsubs 72(%edx)
+ fmuls 72(%ebx)
+ fstps 116(%ecx)
+
+ flds 112(%edx)
+ fsubs 76(%edx)
+ fmuls 76(%ebx)
+ fstps 112(%ecx)
+
+ flds 80(%edx)
+ fadds 108(%edx)
+ fstps 80(%ecx)
+
+ flds 84(%edx)
+ fadds 104(%edx)
+ fstps 84(%ecx)
+
+ flds 108(%edx)
+ fsubs 80(%edx)
+ fmuls 80(%ebx)
+ fstps 108(%ecx)
+
+ flds 104(%edx)
+ fsubs 84(%edx)
+ fmuls 84(%ebx)
+ fstps 104(%ecx)
+
+ flds 88(%edx)
+ fadds 100(%edx)
+ fstps 88(%ecx)
+
+ flds 92(%edx)
+ fadds 96(%edx)
+ fstps 92(%ecx)
+
+ flds 100(%edx)
+ fsubs 88(%edx)
+ fmuls 88(%ebx)
+ fstps 100(%ecx)
+
+ flds 96(%edx)
+ fsubs 92(%edx)
+ fmuls 92(%ebx)
+ fstps 96(%ecx)
+*/
+// end phase 3 fpu
+
+
+/* Phase 4 (completed, buggy) */
+/*
+/ movq 96(%ebx), %mm2
+/ movq 104(%ebx), %mm6
+ movups 96(%ebx), %xmm4
+
+
+/ movq (%ecx), %mm0
+/ movq 8(%ecx), %mm4
+ movups (%ecx), %xmm0
+
+/ movq %mm0, %mm3
+/ movq %mm4, %mm7
+ movaps %xmm0, %xmm2
+
+/ movq 24(%ecx), %mm1
+/ movq 16(%ecx), %mm5
+ movups 16(%ecx), %xmm1
+
+/ pswapd %mm1, %mm1
+/ pswapd %mm5, %mm5
+//// shufps $177, %xmm1, %xmm1
+ shufps $27, %xmm1, %xmm1
+
+/ pfadd %mm1, %mm0
+/ pfadd %mm5, %mm4
+ addps %xmm1, %xmm0
+
+/ movq %mm0, (%edx)
+/ movq %mm4, 8(%edx)
+ movups %xmm0, (%edx)
+
+/ pfsub %mm1, %mm3
+/ pfsub %mm5, %mm7
+ subps %xmm1, %xmm2
+
+/ pfmul %mm2, %mm3
+/ pfmul %mm6, %mm7
+ mulps %xmm4, %xmm2
+
+/ pswapd %mm3, %mm3
+/ pswapd %mm7, %mm7
+ shufps $27, %xmm2, %xmm2
+
+/ movq %mm3, 24(%edx)
+/ movq %mm7, 16(%edx)
+ movups %xmm2, 16(%edx)
+
+/ movq 32(%ecx), %mm0
+/ movq 40(%ecx), %mm4
+ movups 32(%ecx), %xmm0
+
+/ movq %mm0, %mm3
+/ movq %mm4, %mm7
+ movaps %xmm0, %xmm2
+
+/ movq 56(%ecx), %mm1
+/ movq 48(%ecx), %mm5
+ movups 48(%ecx), %xmm1
+
+/ pswapd %mm1, %mm1
+/ pswapd %mm5, %mm5
+//// shufps $177, %xmm1, %xmm1
+ shufps $27, %xmm1, %xmm1
+
+/ pfadd %mm1, %mm0
+/ pfadd %mm5, %mm4
+ addps %xmm1, %xmm0
+
+/ movq %mm0, 32(%edx)
+/ movq %mm4, 40(%edx)
+ movups %xmm0, 32(%edx)
+
+/ pfsubr %mm1, %mm3
+/ pfsubr %mm5, %mm7
+// Luckily we can swap this (xmm1<->xmm2)
+ subps %xmm2, %xmm1
+
+/ pfmul %mm2, %mm3
+/ pfmul %mm6, %mm7
+ mulps %xmm4, %xmm1
+
+/ pswapd %mm3, %mm3
+/ pswapd %mm7, %mm7
+ shufps $27, %xmm1, %xmm1
+
+/ movq %mm3, 56(%edx)
+/ movq %mm7, 48(%edx)
+ movups %xmm1, 48(%edx)
+
+
+/ movq 64(%ecx), %mm0
+/ movq 72(%ecx), %mm4
+ movups 64(%ecx), %xmm0
+
+/ movq %mm0, %mm3
+/ movq %mm4, %mm7
+ movaps %xmm0, %xmm2
+
+/ movq 88(%ecx), %mm1
+/ movq 80(%ecx), %mm5
+ movups 80(%ecx), %xmm1
+
+/ pswapd %mm1, %mm1
+/ pswapd %mm5, %mm5
+//// shufps $177, %xmm1, %xmm1
+ shufps $27, %xmm1, %xmm1
+
+/ pfadd %mm1, %mm0
+/ pfadd %mm5, %mm4
+ addps %xmm1, %xmm0
+
+/ movq %mm0, 64(%edx)
+/ movq %mm4, 72(%edx)
+ movups %xmm0, 64(%edx)
+
+/ pfsub %mm1, %mm3
+/ pfsub %mm5, %mm7
+ subps %xmm1, %xmm2
+
+/ pfmul %mm2, %mm3
+/ pfmul %mm6, %mm7
+ mulps %xmm4, %xmm2
+
+/ pswapd %mm3, %mm3
+/ pswapd %mm7, %mm7
+ shufps $27, %xmm2, %xmm2
+
+/ movq %mm3, 88(%edx)
+/ movq %mm7, 80(%edx)
+ movups %xmm2, 80(%edx)
+
+
+/ movq 96(%ecx), %mm0
+/ movq 104(%ecx), %mm4
+ movups 96(%ecx), %xmm0
+
+/ movq %mm0, %mm3
+/ movq %mm4, %mm7
+ movaps %xmm0, %xmm2
+
+/ movq 120(%ecx), %mm1
+/ movq 112(%ecx), %mm5
+ movups 112(%ecx), %xmm1
+
+/ pswapd %mm1, %mm1
+/ pswapd %mm5, %mm5
+//// shufps $177, %xmm1, %xmm1
+ shufps $27, %xmm1, %xmm1
+
+/ pfadd %mm1, %mm0
+/ pfadd %mm5, %mm4
+ addps %xmm1, %xmm0
+
+/ movq %mm0, 96(%edx)
+/ movq %mm4, 104(%edx)
+ movups %xmm0, 96(%edx)
+
+/ pfsubr %mm1, %mm3
+/ pfsubr %mm5, %mm7
+// The subtraction is already operand-swapped here (emulating pfsubr), so
+// xmm1 stands in for xmm2 for the rest of this block
+ subps %xmm2, %xmm1
+
+/ pfmul %mm2, %mm3
+/ pfmul %mm6, %mm7
+ mulps %xmm4, %xmm1
+
+/ pswapd %mm3, %mm3
+/ pswapd %mm7, %mm7
+ shufps $27, %xmm1, %xmm1
+
+/ movq %mm3, 120(%edx)
+/ movq %mm7, 112(%edx)
+ movups %xmm1, 112(%edx)
+*/
+
+// phase 4 fpu code
+/* Phase 4*/
+
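+// Scalar sketch of the butterflies below: within each group of eight floats
+// in (%ecx) the outer pairs are summed into (%edx) and their differences are
+// scaled by the costab entries at 96..108(%ebx); in every second group the
+// subtraction operands are swapped (these are the pfsubr cases of the
+// disabled SSE code above).
+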
+ flds (%ecx)
+ fadds 28(%ecx)
+ fstps (%edx)
+
+ flds (%ecx)
+ fsubs 28(%ecx)
+ fmuls 96(%ebx)
+ fstps 28(%edx)
+
+ flds 4(%ecx)
+ fadds 24(%ecx)
+ fstps 4(%edx)
+
+ flds 4(%ecx)
+ fsubs 24(%ecx)
+ fmuls 100(%ebx)
+ fstps 24(%edx)
+
+ flds 8(%ecx)
+ fadds 20(%ecx)
+ fstps 8(%edx)
+
+ flds 8(%ecx)
+ fsubs 20(%ecx)
+ fmuls 104(%ebx)
+ fstps 20(%edx)
+
+ flds 12(%ecx)
+ fadds 16(%ecx)
+ fstps 12(%edx)
+
+ flds 12(%ecx)
+ fsubs 16(%ecx)
+ fmuls 108(%ebx)
+ fstps 16(%edx)
+
+ flds 32(%ecx)
+ fadds 60(%ecx)
+ fstps 32(%edx)
+
+ flds 60(%ecx)
+ fsubs 32(%ecx)
+ fmuls 96(%ebx)
+ fstps 60(%edx)
+
+ flds 36(%ecx)
+ fadds 56(%ecx)
+ fstps 36(%edx)
+
+ flds 56(%ecx)
+ fsubs 36(%ecx)
+ fmuls 100(%ebx)
+ fstps 56(%edx)
+
+ flds 40(%ecx)
+ fadds 52(%ecx)
+ fstps 40(%edx)
+
+ flds 52(%ecx)
+ fsubs 40(%ecx)
+ fmuls 104(%ebx)
+ fstps 52(%edx)
+
+ flds 44(%ecx)
+ fadds 48(%ecx)
+ fstps 44(%edx)
+
+ flds 48(%ecx)
+ fsubs 44(%ecx)
+ fmuls 108(%ebx)
+ fstps 48(%edx)
+
+ flds 64(%ecx)
+ fadds 92(%ecx)
+ fstps 64(%edx)
+
+ flds 64(%ecx)
+ fsubs 92(%ecx)
+ fmuls 96(%ebx)
+ fstps 92(%edx)
+
+ flds 68(%ecx)
+ fadds 88(%ecx)
+ fstps 68(%edx)
+
+ flds 68(%ecx)
+ fsubs 88(%ecx)
+ fmuls 100(%ebx)
+ fstps 88(%edx)
+
+ flds 72(%ecx)
+ fadds 84(%ecx)
+ fstps 72(%edx)
+
+ flds 72(%ecx)
+ fsubs 84(%ecx)
+ fmuls 104(%ebx)
+ fstps 84(%edx)
+
+ flds 76(%ecx)
+ fadds 80(%ecx)
+ fstps 76(%edx)
+
+ flds 76(%ecx)
+ fsubs 80(%ecx)
+ fmuls 108(%ebx)
+ fstps 80(%edx)
+
+ flds 96(%ecx)
+ fadds 124(%ecx)
+ fstps 96(%edx)
+
+ flds 124(%ecx)
+ fsubs 96(%ecx)
+ fmuls 96(%ebx)
+ fstps 124(%edx)
+
+ flds 100(%ecx)
+ fadds 120(%ecx)
+ fstps 100(%edx)
+
+ flds 120(%ecx)
+ fsubs 100(%ecx)
+ fmuls 100(%ebx)
+ fstps 120(%edx)
+
+ flds 104(%ecx)
+ fadds 116(%ecx)
+ fstps 104(%edx)
+
+ flds 116(%ecx)
+ fsubs 104(%ecx)
+ fmuls 104(%ebx)
+ fstps 116(%edx)
+
+ flds 108(%ecx)
+ fadds 112(%ecx)
+ fstps 108(%edx)
+
+ flds 112(%ecx)
+ fsubs 108(%ecx)
+ fmuls 108(%ebx)
+ fstps 112(%edx)
+
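+// The scalar code from here on works from (%edx) back into (%ecx) with the
+// coefficients at 112/116(%ebx); it corresponds to what dct64_k7.s labels
+// phase 5 (the disabled SSE phase 5 below operates on the same data) and
+// stays here until that SSE version works.
+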
+ flds (%edx)
+ fadds 12(%edx)
+ fstps (%ecx)
+
+ flds (%edx)
+ fsubs 12(%edx)
+ fmuls 112(%ebx)
+ fstps 12(%ecx)
+
+ flds 4(%edx)
+ fadds 8(%edx)
+ fstps 4(%ecx)
+
+ flds 4(%edx)
+ fsubs 8(%edx)
+ fmuls 116(%ebx)
+ fstps 8(%ecx)
+
+ flds 16(%edx)
+ fadds 28(%edx)
+ fstps 16(%ecx)
+
+ flds 28(%edx)
+ fsubs 16(%edx)
+ fmuls 112(%ebx)
+ fstps 28(%ecx)
+
+ flds 20(%edx)
+ fadds 24(%edx)
+ fstps 20(%ecx)
+
+ flds 24(%edx)
+ fsubs 20(%edx)
+ fmuls 116(%ebx)
+ fstps 24(%ecx)
+
+ flds 32(%edx)
+ fadds 44(%edx)
+ fstps 32(%ecx)
+
+ flds 32(%edx)
+ fsubs 44(%edx)
+ fmuls 112(%ebx)
+ fstps 44(%ecx)
+
+ flds 36(%edx)
+ fadds 40(%edx)
+ fstps 36(%ecx)
+
+ flds 36(%edx)
+ fsubs 40(%edx)
+ fmuls 116(%ebx)
+ fstps 40(%ecx)
+
+ flds 48(%edx)
+ fadds 60(%edx)
+ fstps 48(%ecx)
+
+ flds 60(%edx)
+ fsubs 48(%edx)
+ fmuls 112(%ebx)
+ fstps 60(%ecx)
+
+ flds 52(%edx)
+ fadds 56(%edx)
+ fstps 52(%ecx)
+
+ flds 56(%edx)
+ fsubs 52(%edx)
+ fmuls 116(%ebx)
+ fstps 56(%ecx)
+
+ flds 64(%edx)
+ fadds 76(%edx)
+ fstps 64(%ecx)
+
+ flds 64(%edx)
+ fsubs 76(%edx)
+ fmuls 112(%ebx)
+ fstps 76(%ecx)
+
+ flds 68(%edx)
+ fadds 72(%edx)
+ fstps 68(%ecx)
+
+ flds 68(%edx)
+ fsubs 72(%edx)
+ fmuls 116(%ebx)
+ fstps 72(%ecx)
+
+ flds 80(%edx)
+ fadds 92(%edx)
+ fstps 80(%ecx)
+
+ flds 92(%edx)
+ fsubs 80(%edx)
+ fmuls 112(%ebx)
+ fstps 92(%ecx)
+
+ flds 84(%edx)
+ fadds 88(%edx)
+ fstps 84(%ecx)
+
+ flds 88(%edx)
+ fsubs 84(%edx)
+ fmuls 116(%ebx)
+ fstps 88(%ecx)
+
+ flds 96(%edx)
+ fadds 108(%edx)
+ fstps 96(%ecx)
+
+ flds 96(%edx)
+ fsubs 108(%edx)
+ fmuls 112(%ebx)
+ fstps 108(%ecx)
+
+ flds 100(%edx)
+ fadds 104(%edx)
+ fstps 100(%ecx)
+
+ flds 100(%edx)
+ fsubs 104(%edx)
+ fmuls 116(%ebx)
+ fstps 104(%ecx)
+
+ flds 112(%edx)
+ fadds 124(%edx)
+ fstps 112(%ecx)
+
+ flds 124(%edx)
+ fsubs 112(%edx)
+ fmuls 112(%ebx)
+ fstps 124(%ecx)
+
+ flds 116(%edx)
+ fadds 120(%edx)
+ fstps 116(%ecx)
+
+ flds 120(%edx)
+ fsubs 116(%edx)
+ fmuls 116(%ebx)
+ fstps 120(%ecx)
+
+// end of phase 4 fpu
+
+// The code below still needs to be finished; the scalar FPU code above is used instead for now.
+/* Phase 5 (completed, crashing) */
+/*
+/ movq 112(%ebx), %mm2
+ // move 8 byte data to (low)high quadword - check this! atmos
+ movlps 112(%ebx), %xmm4
+ // maybe I need movhlps too to get data into correct quadword
+ movlhps %xmm4, %xmm4
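+	// movlps fills only the low quadword (the high half of xmm4 is left
+	// unchanged), and movlhps then copies that low quadword into the high
+	// one, so both halves of xmm4 hold the two coefficients from 112(%ebx);
+	// no movhlps should be needed.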
+
+/ movq (%edx), %mm0
+/ movq 16(%edx), %mm4
+ movups (%edx), %xmm0
+
+/ movq %mm0, %mm3
+/ movq %mm4, %mm7
+ movaps %xmm0, %xmm2
+
+// hmm? this is strange
+/ movq 8(%edx), %mm1
+/ movq 24(%edx), %mm5
+ movlps 8(%edx), %xmm1
+ movhps 24(%edx), %xmm1
+
+/ pswapd %mm1, %mm1
+/ pswapd %mm5, %mm5
+ pshufd $177, %xmm1, %xmm1
+
+/ pfadd %mm1, %mm0
+/ pfadd %mm5, %mm4
+ addps %xmm1, %xmm0
+
+/ movq %mm0, (%ecx)
+/ movq %mm4, 16(%ecx)
+ movlps %xmm0, (%ecx)
+ movhps %xmm0, 16(%ecx)
+
+/ pfsub %mm1, %mm3
+/ pfsubr %mm5, %mm7
+// I need to emulate pfsubr here
+ movaps %xmm1, %xmm3
+ subps %xmm2, %xmm3
+ subps %xmm1, %xmm2
+// now move the correct quadword of the reversed subtraction in xmm3 into the
+// corresponding quadword of xmm2, leaving the quadword with the non-reversed
+// subtraction untouched
+/// shufpd $2, %xmm3, %xmm2
+// (or $1?) (see ia32-ref p.749)
+// optimize
+ movq %xmm2, %xmm3
+ movaps %xmm3, %xmm2
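+// Untested sketch: shufpd is an SSE2 instruction, but the blend it describes
+// (low quadword of the normal subtraction kept in xmm2, high quadword taken
+// from the reversed subtraction in xmm3) could be done with plain SSE as
+//	shufps $0xe4, %xmm3, %xmm2
+// the movq/movaps pair above only shuffles the low quadword around and does
+// not appear to perform that merge.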
+
+/ pfmul %mm2, %mm3
+/ pfmul %mm2, %mm7
+ mulps %xmm4, %xmm2
+
+/ pswapd %mm3, %mm3
+/ pswapd %mm7, %mm7
+ shufps $177, %xmm2, %xmm2
+
+/ movq %mm3, 8(%ecx)
+/ movq %mm7, 24(%ecx)
+ movlps %xmm2, 8(%ecx)
+ movhps %xmm2, 24(%ecx)
+
+/ movq 32(%edx), %mm0
+/ movq 48(%edx), %mm4
+ movlps 32(%edx), %xmm0
+ movhps 48(%edx), %xmm0
+
+/ movq %mm0, %mm3
+/ movq %mm4, %mm7
+ movaps %xmm0, %xmm2
+
+/ movq 40(%edx), %mm1
+/ movq 56(%edx), %mm5
+ movlps 40(%edx), %xmm1
+ movhps 56(%edx), %xmm1
+
+/ pswapd %mm1, %mm1
+/ pswapd %mm5, %mm5
+ shufps $177, %xmm1, %xmm1
+
+/ pfadd %mm1, %mm0
+/ pfadd %mm5, %mm4
+ addps %xmm1, %xmm0
+
+/ movq %mm0, 32(%ecx)
+/ movq %mm4, 48(%ecx)
+ movlps %xmm0, 32(%ecx)
+ movhps %xmm0, 48(%ecx)
+
+/ pfsub %mm1, %mm3
+/ pfsubr %mm5, %mm7
+ movaps %xmm1, %xmm3
+ subps %xmm2, %xmm3
+ subps %xmm1, %xmm2
+/// shufpd $2, %xmm3, %xmm2
+// (or $1?)
+// optimize
+ movq %xmm2, %xmm3
+ movaps %xmm3, %xmm2
+
+/ pfmul %mm2, %mm3
+/ pfmul %mm2, %mm7
+ mulps %xmm4, %xmm2
+
+/ pswapd %mm3, %mm3
+/ pswapd %mm7, %mm7
+ shufps $177, %xmm2, %xmm2
+
+/ movq %mm3, 40(%ecx)
+/ movq %mm7, 56(%ecx)
+ movlps %xmm2, 40(%ecx)
+ movhps %xmm2, 56(%ecx)
+
+
+/ movq 64(%edx), %mm0
+/ movq 80(%edx), %mm4
+ movlps 64(%edx), %xmm0
+ movhps 80(%edx), %xmm0
+
+/ movq %mm0, %mm3
+/ movq %mm4, %mm7
+ movaps %xmm0, %xmm2
+
+/ movq 72(%edx), %mm1
+/ movq 88(%edx), %mm5
+ movlps 72(%edx), %xmm1
+ movhps 88(%edx), %xmm1
+
+/ pswapd %mm1, %mm1
+/ pswapd %mm5, %mm5
+ shufps $177, %xmm1, %xmm1
+
+/ pfadd %mm1, %mm0
+/ pfadd %mm5, %mm4
+ addps %xmm1, %xmm0
+
+/ movq %mm0, 64(%ecx)
+/ movq %mm4, 80(%ecx)
+ movlps %xmm0, 64(%ecx)
+ movhps %xmm0, 80(%ecx)
+
+/ pfsub %mm1, %mm3
+/ pfsubr %mm5, %mm7
+ movaps %xmm1, %xmm3
+ subps %xmm2, %xmm3
+ subps %xmm1, %xmm2
+/// shufpd $2, %xmm3, %xmm2
+// (or $1?)
+// optimize
+ movq %xmm2, %xmm3
+ movaps %xmm3, %xmm2
+
+/ pfmul %mm2, %mm3
+/ pfmul %mm2, %mm7
+ mulps %xmm4, %xmm2
+
+/ pswapd %mm3, %mm3
+/ pswapd %mm7, %mm7
+ shufps $177, %xmm2, %xmm2
+
+/ movq %mm3, 72(%ecx)
+/ movq %mm7, 88(%ecx)
+ movlps %xmm2, 72(%ecx)
+ movhps %xmm2, 88(%ecx)
+
+/ movq 96(%edx), %mm0
+/ movq 112(%edx), %mm4
+ movups 96(%edx), %xmm0
+
+/ movq %mm0, %mm3
+/ movq %mm4, %mm7
+ movaps %xmm0, %xmm2
+
+/ movq 104(%edx), %mm1
+/ movq 120(%edx), %mm5
+ movlps 104(%edx), %xmm1
+ movhps 120(%edx), %xmm1
+
+/ pswapd %mm1, %mm1
+/ pswapd %mm5, %mm5
+ shufps $177, %xmm1, %xmm1
+
+/ pfadd %mm1, %mm0
+/ pfadd %mm5, %mm4
+ addps %xmm1, %xmm0
+
+/ movq %mm0, 96(%ecx)
+/ movq %mm4, 112(%ecx)
+ movups %xmm0, 96(%ecx)
+
+/ pfsub %mm1, %mm3
+/ pfsubr %mm5, %mm7
+ movaps %xmm1, %xmm3
+ subps %xmm2, %xmm3
+ subps %xmm1, %xmm2
+/// shufpd $2, %xmm3, %xmm2
+// (or $1?)
+// optimize
+ movq %xmm2, %xmm3
+ movaps %xmm3, %xmm2
+
+/ pfmul %mm2, %mm3
+/ pfmul %mm2, %mm7
+ mulps %xmm4, %xmm2
+
+/ pswapd %mm3, %mm3
+/ pswapd %mm7, %mm7
+ shufps $177, %xmm2, %xmm2
+
+/ movq %mm3, 104(%ecx)
+/ movq %mm7, 120(%ecx)
+ movlps %xmm2, 104(%ecx)
+ movhps %xmm2, 120(%ecx)
+*/
+
+
+/* Phase 6. This is the end of the easy road. */
+/* Code below is coded in scalar mode. Should be optimized */
+//
+// movd plus_1f, %mm6
+// punpckldq 120(%ebx), %mm6 /* mm6 = 1.0 | 120(%ebx)*/
+// movq x_plus_minus_3dnow, %mm7 /* mm7 = +1 | -1 */
+/*
+ movq 32(%ecx), %mm0
+ movq 64(%ecx), %mm2
+ movq %mm0, %mm1
+ movq %mm2, %mm3
+ pxor %mm7, %mm1
+ pxor %mm7, %mm3
+ pfacc %mm1, %mm0
+ pfacc %mm3, %mm2
+ pfmul %mm6, %mm0
+ pfmul %mm6, %mm2
+ movq %mm0, 32(%edx)
+ movq %mm2, 64(%edx)
+
+ movd 44(%ecx), %mm0
+ movd 40(%ecx), %mm2
+ movd 120(%ebx), %mm3
+ punpckldq 76(%ecx), %mm0
+ punpckldq 72(%ecx), %mm2
+ punpckldq %mm3, %mm3
+ movq %mm0, %mm4
+ movq %mm2, %mm5
+ pfsub %mm2, %mm0
+ pfmul %mm3, %mm0
+ movq %mm0, %mm1
+ pfadd %mm5, %mm0
+ pfadd %mm4, %mm0
+ movq %mm0, %mm2
+ punpckldq %mm1, %mm0
+ punpckhdq %mm1, %mm2
+ movq %mm0, 40(%edx)
+ movq %mm2, 72(%edx)
+
+ movd 48(%ecx), %mm3
+ movd 60(%ecx), %mm2
+ pfsub 52(%ecx), %mm3
+ pfsub 56(%ecx), %mm2
+ pfmul 120(%ebx), %mm3
+ pfmul 120(%ebx), %mm2
+ movq %mm2, %mm1
+
+ pfadd 56(%ecx), %mm1
+ pfadd 60(%ecx), %mm1
+ movq %mm1, %mm0
+
+ pfadd 48(%ecx), %mm0
+ pfadd 52(%ecx), %mm0
+ pfadd %mm3, %mm1
+ punpckldq %mm2, %mm1
+ pfadd %mm3, %mm2
+ punpckldq %mm2, %mm0
+ movq %mm1, 56(%edx)
+ movq %mm0, 48(%edx)
+*/
+/*---*/
+/*
+ movd 92(%ecx), %mm1
+ pfsub 88(%ecx), %mm1
+ pfmul 120(%ebx), %mm1
+ movd %mm1, 92(%edx)
+ pfadd 92(%ecx), %mm1
+ pfadd 88(%ecx), %mm1
+ movq %mm1, %mm0
+
+ pfadd 80(%ecx), %mm0
+ pfadd 84(%ecx), %mm0
+ movd %mm0, 80(%edx)
+
+ movd 80(%ecx), %mm0
+ pfsub 84(%ecx), %mm0
+ pfmul 120(%ebx), %mm0
+ pfadd %mm0, %mm1
+ pfadd 92(%edx), %mm0
+ punpckldq %mm1, %mm0
+ movq %mm0, 84(%edx)
+
+ movq 96(%ecx), %mm0
+ movq %mm0, %mm1
+ pxor %mm7, %mm1
+ pfacc %mm1, %mm0
+ pfmul %mm6, %mm0
+ movq %mm0, 96(%edx)
+
+ movd 108(%ecx), %mm0
+ pfsub 104(%ecx), %mm0
+ pfmul 120(%ebx), %mm0
+ movd %mm0, 108(%edx)
+ pfadd 104(%ecx), %mm0
+ pfadd 108(%ecx), %mm0
+ movd %mm0, 104(%edx)
+
+ movd 124(%ecx), %mm1
+ pfsub 120(%ecx), %mm1
+ pfmul 120(%ebx), %mm1
+ movd %mm1, 124(%edx)
+ pfadd 120(%ecx), %mm1
+ pfadd 124(%ecx), %mm1
+ movq %mm1, %mm0
+
+ pfadd 112(%ecx), %mm0
+ pfadd 116(%ecx), %mm0
+ movd %mm0, 112(%edx)
+
+ movd 112(%ecx), %mm0
+	pfsub 116(%ecx), %mm0