Web lists-archives.org

Re: [MPlayer-dev-eng] [PATCH] compiling with Intel C




Hi Dmitry

Dmitry Antipov a écrit :
Hello all,

an attached patch proposes the bits needed for the first successful :-) compilation of svn-22004 with Intel C 9.1.046. It compiles and plays a few movies found on my hard drive, but it's highly experimental.

You did it!!! Weeeh!

Were you able to measure any speed-up?
the option '-benchmark' is your friend.


[...]

A few things about your patch that look very much like cosmetics...



------------------------------------------------------------------------

diff -ur MPlayer-1.0svn22004/configure MPlayer-1.0svn22004-devel/configure
--- MPlayer-1.0svn22004/configure	2007-01-24 18:08:55.000000000 +0300
+++ MPlayer-1.0svn22004-devel/configure	2007-01-24 18:18:17.000000000 +0300
@@ -1597,7 +1597,7 @@
   _stripbinaries=no
 elif test -z "$CFLAGS" ; then
   if test "$cc_vendor" = "intel" ; then
-    CFLAGS="-O2 $_march $_mcpu $_pipe -fomit-frame-pointer"
+    CFLAGS="-w0 -O3 $_march $_mcpu $_pipe -fomit-frame-pointer -funroll-loops -fno-math-errno"
   elif test "$cc_vendor" != "gnu" ; then
     CFLAGS="-O2 $_march $_mcpu $_pipe"
   else
@@ -1609,8 +1609,6 @@
 if test -n "$LDFLAGS" ; then
   _ld_extra="$_ld_extra $LDFLAGS"
   _warn_CFLAGS=yes
-elif test "$cc_vendor" = "intel" ; then
-  _ld_extra="$_ld_extra -i-static"
 fi
 if test -n "$CPPFLAGS" ; then
   _inc_extra="$_inc_extra $CPPFLAGS"
@@ -7236,7 +7234,7 @@
   echores "no"
 fi
-if cc_check -Wdeclaration-after-statement ; then
+if test "$cc_vendor" != "intel" && cc_check -Wdeclaration-after-statement ; then
   CFLAGS="-Wdeclaration-after-statement $CFLAGS"
 fi
@@ -7721,7 +7719,11 @@
 #endif
/* attribute(used) as needed by some compilers */
-#if (__GNUC__ * 100 + __GNUC_MINOR__ >= 300)
+#if __INTEL_COMPILER
+/* Since Intel C (as of 9.1.046) doesn't understand 'used' attribute, here is a hack
+   to force it to emit variable which is referenced only in inline assembly code */
+# define attribute_used __attribute__((section (".data")))
+#elif (__GNUC__ * 100 + __GNUC_MINOR__ >= 300)
 # define attribute_used __attribute__((used))
 #else
 # define attribute_used
diff -ur MPlayer-1.0svn22004/libavcodec/h264.c MPlayer-1.0svn22004-devel/libavcodec/h264.c
--- MPlayer-1.0svn22004/libavcodec/h264.c	2007-01-24 18:08:55.000000000 +0300
+++ MPlayer-1.0svn22004-devel/libavcodec/h264.c	2007-01-24 17:20:49.000000000 +0300
@@ -5982,7 +5982,7 @@
     return ctx + 4 * cat;
 }
-static const __attribute((used)) uint8_t last_coeff_flag_offset_8x8[63] = {
+static const attribute_used uint8_t last_coeff_flag_offset_8x8[63] = {
     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
diff -ur MPlayer-1.0svn22004/libavcodec/i386/dsputil_mmx.c MPlayer-1.0svn22004-devel/libavcodec/i386/dsputil_mmx.c
--- MPlayer-1.0svn22004/libavcodec/i386/dsputil_mmx.c	2007-01-24 18:08:55.000000000 +0300
+++ MPlayer-1.0svn22004-devel/libavcodec/i386/dsputil_mmx.c	2007-01-24 17:45:20.000000000 +0300
@@ -633,6 +633,32 @@
 }
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
+#ifdef __INTEL_COMPILER
+  /* Ugly, but uses a minimal number of operands just to fit into Intel C limits */
+  asm volatile("movd (%1), %%mm0               \n\t"
+	       "mov %3, %%eax                  \n\t"
+	       "movd (%1, %%eax), %%mm1        \n\t"
+	       "add %3, %%eax                  \n\t"
+	       "movd (%1, %%eax), %%mm2        \n\t"
+	       "add %3, %%eax                  \n\t"
+	       "movd (%1, %%eax), %%mm3        \n\t"
+	       "punpcklbw %%mm1, %%mm0         \n\t"
+	       "punpcklbw %%mm3, %%mm2         \n\t"
+	       "movq %%mm0, %%mm1              \n\t"
+	       "punpcklwd %%mm2, %%mm0         \n\t"
+	       "punpckhwd %%mm2, %%mm1         \n\t"
+	       "movd %%mm0, (%0)               \n\t"
+	       "punpckhdq %%mm0, %%mm0         \n\t"
+	       "mov %2, %%eax                  \n\t"
+	       "movd %%mm0, (%0, %%eax)        \n\t"
+	       "add %2, %%eax                  \n\t"
+	       "movd %%mm1, (%0, %%eax)        \n\t"
+	       "punpckhdq %%mm1, %%mm1         \n\t"
+	       "add %2, %%eax                  \n\t"
+	       "movd %%mm1, (%0, %%eax)        \n\t"
+	       :: "r" (dst), "r" (src), "r"(dst_stride), "r"(src_stride)
+	       : "eax");
+#else

Is there a reason why this special version should only be used by ICC? Can GCC compile it too? Is it supposed to be slower? By how much?


static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
diff -ur MPlayer-1.0svn22004/libmpcodecs/vf_fspp.c MPlayer-1.0svn22004-devel/libmpcodecs/vf_fspp.c
--- MPlayer-1.0svn22004/libmpcodecs/vf_fspp.c	2007-01-24 18:08:55.000000000 +0300
+++ MPlayer-1.0svn22004-devel/libmpcodecs/vf_fspp.c	2007-01-24 17:21:20.000000000 +0300
@@ -717,7 +717,7 @@
#ifdef HAVE_MMX -static uint64_t attribute_used __attribute__((aligned(8))) temps[4];//!!
+static uint64_t attribute_used __attribute__((aligned(8))) temps[4] = {0ULL};//!!

why 0ULL in particular?


static uint64_t attribute_used __attribute__((aligned(8))) MM_FIX_0_382683433=FIX64(0.382683433, 14); static uint64_t attribute_used __attribute__((aligned(8))) MM_FIX_0_541196100=FIX64(0.541196100, 14); diff -ur MPlayer-1.0svn22004/loader/dmo/DMO_AudioDecoder.c MPlayer-1.0svn22004-devel/loader/dmo/DMO_AudioDecoder.c
--- MPlayer-1.0svn22004/loader/dmo/DMO_AudioDecoder.c	2007-01-24 18:08:55.000000000 +0300
+++ MPlayer-1.0svn22004-devel/loader/dmo/DMO_AudioDecoder.c	2007-01-24 17:18:31.000000000 +0300
@@ -35,8 +35,6 @@
#include "../../mp_msg.h" -#define __MODULE__ "DirectShow audio decoder"
-

Why ?


diff -ur MPlayer-1.0svn22004/loader/dmo/DMO_VideoDecoder.c MPlayer-1.0svn22004-devel/loader/dmo/DMO_VideoDecoder.c
--- MPlayer-1.0svn22004/loader/dmo/DMO_VideoDecoder.c	2007-01-24 18:08:55.000000000 +0300
+++ MPlayer-1.0svn22004-devel/loader/dmo/DMO_VideoDecoder.c	2007-01-24 17:18:34.000000000 +0300
@@ -58,8 +58,6 @@
 // strcmp((const char*)info.dll,...)  is used instead of  (... == ...)
 // so Arpi could use char* pointer in his simplified DMO_VideoDecoder class
-#define __MODULE__ "DirectShow_VideoDecoder"
-

same here


diff -ur MPlayer-1.0svn22004/loader/dshow/DS_AudioDecoder.c MPlayer-1.0svn22004-devel/loader/dshow/DS_AudioDecoder.c
--- MPlayer-1.0svn22004/loader/dshow/DS_AudioDecoder.c	2007-01-24 18:08:55.000000000 +0300
+++ MPlayer-1.0svn22004-devel/loader/dshow/DS_AudioDecoder.c	2007-01-24 17:18:39.000000000 +0300
@@ -33,8 +33,6 @@
 #include <stdio.h>
 #include <stdlib.h>
-#define __MODULE__ "DirectShow audio decoder"

here too.


diff -ur MPlayer-1.0svn22004/loader/dshow/DS_VideoDecoder.c MPlayer-1.0svn22004-devel/loader/dshow/DS_VideoDecoder.c
--- MPlayer-1.0svn22004/loader/dshow/DS_VideoDecoder.c	2007-01-24 18:08:55.000000000 +0300
+++ MPlayer-1.0svn22004-devel/loader/dshow/DS_VideoDecoder.c	2007-01-24 17:18:41.000000000 +0300
@@ -58,8 +58,6 @@
 // strcmp((const char*)info.dll,...)  is used instead of  (... == ...)
 // so Arpi could use char* pointer in his simplified DS_VideoDecoder class
-#define __MODULE__ "DirectShow_VideoDecoder"

here also


diff -ur MPlayer-1.0svn22004/mp3lib/dct64_sse.c MPlayer-1.0svn22004-devel/mp3lib/dct64_sse.c
--- MPlayer-1.0svn22004/mp3lib/dct64_sse.c	2007-01-24 18:08:55.000000000 +0300
+++ MPlayer-1.0svn22004-devel/mp3lib/dct64_sse.c	2007-01-24 18:14:00.000000000 +0300
@@ -70,30 +70,27 @@
for (i = 0; i < 0x20; i += 0x10)
         {
-            asm(
-                "movaps    %4, %%xmm1\n\t"
-                "movaps    %5, %%xmm3\n\t"
-                "movaps    %6, %%xmm4\n\t"
-                "movaps    %7, %%xmm6\n\t"
-                "movaps    %%xmm1, %%xmm7\n\t"
-                "shufps    $27, %%xmm7, %%xmm7\n\t"
-                "movaps    %%xmm3, %%xmm5\n\t"
-                "shufps    $27, %%xmm5, %%xmm5\n\t"
-                "movaps    %%xmm4, %%xmm2\n\t"
-                "shufps    $27, %%xmm2, %%xmm2\n\t"
-                "movaps    %%xmm6, %%xmm0\n\t"
-                "shufps    $27, %%xmm0, %%xmm0\n\t"
-                "addps     %%xmm0, %%xmm1\n\t"
-                "movaps    %%xmm1, %0\n\t"
-                "addps     %%xmm2, %%xmm3\n\t"
-                "movaps    %%xmm3, %1\n\t"
-                "subps     %%xmm4, %%xmm5\n\t"
-                "movaps    %%xmm5, %2\n\t"
-                "subps     %%xmm6, %%xmm7\n\t"
-                "movaps    %%xmm7, %3\n\t"
-                :"=m"(*(b2 + i)), "=m"(*(b2 + i + 4)), "=m"(*(b2 + i + 8)), "=m"(*(b2 + i + 12))
-                :"m"(*(b1 + i)), "m"(*(b1 + i + 4)), "m"(*(b1 + i + 8)), "m"(*(b1 + i + 12))
-                );
+	    asm("movaps    (%1), %%xmm1\n\t"
+		"movaps    16(%1), %%xmm3\n\t"
+		"movaps    32(%1), %%xmm4\n\t"
+		"movaps    48(%1), %%xmm6\n\t"
+		"movaps    %%xmm1, %%xmm7\n\t"
+		"shufps    $27, %%xmm7, %%xmm7\n\t"
+		"movaps    %%xmm3, %%xmm5\n\t"
+		"shufps    $27, %%xmm5, %%xmm5\n\t"
+		"movaps    %%xmm4, %%xmm2\n\t"
+		"shufps    $27, %%xmm2, %%xmm2\n\t"
+		"movaps    %%xmm6, %%xmm0\n\t"
+		"shufps    $27, %%xmm0, %%xmm0\n\t"
+		"addps     %%xmm0, %%xmm1\n\t"
+		"movaps    %%xmm1, (%0)\n\t"
+		"addps     %%xmm2, %%xmm3\n\t"
+		"movaps    %%xmm3, 16(%0)\n\t"
+		"subps     %%xmm4, %%xmm5\n\t"
+		"movaps    %%xmm5, 32(%0)\n\t"
+		"subps     %%xmm6, %%xmm7\n\t"
+		"movaps    %%xmm7, 48(%0)\n\t"
+		: : "r"(b2 + i), "r"(b1 + i));
         }
     }

Having this re-indenting the code doesn't help tracking down changes. Please split cosmetic and functional changes.

I don't know what others may think, but let me say that it's really a huge achievement you've done here!
Thanks!

Guillaume
_______________________________________________
MPlayer-dev-eng mailing list
MPlayer-dev-eng@xxxxxxxxxxxx
http://lists.mplayerhq.hu/mailman/listinfo/mplayer-dev-eng