Bill Allombert on Mon, 12 Apr 2004 14:31:45 +0200


[Date Prev] [Date Next] [Thread Prev] [Thread Next] [Date Index] [Thread Index]

Inline assembly kernel for x86_64/amd64/ia32e 64-bit mode PARI


Hello PARI-dev,

I am investigating what needs to be done for proper x86_64/amd64/ia32e
optimized assembly support in PARI.

The good news is the attached patch, which should enable an inline assembly
level0 kernel for x86_64/amd64/ia32e in 64-bit mode.

Some basic performance points:
The box is an AMD Opteron(tm) Processor 240 at 1400MHz with 1MB of cache.
'make bench' takes around 400ms instead of 600ms with this patch.
factor(2^256+1) takes around 5s instead of 10s.

The bad news is that I don't have access to an x86_64 box with a standard
installation, so I don't really know how they behave. I know nothing about
x86_64 assembly and I am no better with plain x86 asm. This patch was
done by changing addl to addq, etc.

Before committing this patch, there are some issues to iron out:

--- Given the 3 names for the architecture, how should the kernel be named?
    (I would choose x86_64 to match the Linux kernel uname value).

--- We need to tell apart 32-bit mode from 64-bit mode (easy) and use
    either the ix86 kernel or the x86_64 one. 

--- x86_64 in 64-bit mode seems to need -fPIC to build shared libraries,
    but not in 32-bit mode. 

--- The ix86 non-inline asm kernel is not PIC-aware. This is not a
    problem in 32-bit mode, but just renaming addl to addq will not lead
    to a PIC-aware 64-bit mode non-inline asm kernel...

--- Given the amount of changes to make the x86_64 level0.h from the ix86
    one, I wonder if it could be worthwhile to use only one copy with
    some CPP tricks.

Before I do that, please test the patch on an x86-64 box with a more standard
installation so I get some idea of what is going on.

Cheers,
Bill.
Index: config/get_archos
===================================================================
RCS file: /home/cvs/pari/config/get_archos,v
retrieving revision 1.14
diff -u -r1.14 get_archos
--- config/get_archos	15 Oct 2003 12:34:30 -0000	1.14
+++ config/get_archos	12 Apr 2004 10:49:00 -0000
@@ -52,6 +52,7 @@
         esac ;;
   alpha)         asmarch=$arch;         pretty=Alpha ;;
   ppc)           asmarch=$arch;         pretty='Power PC' ;;
+  x86_64)        asmarch=$arch;         pretty='amd64/ia32e' ;;
   arm*)          asmarch=none;          pretty=$arch ;;
   mips)          asmarch=none;          pretty=MIPS ;;
   sh3)           asmarch=none;          pretty=SH-3 ;;
Index: config/get_cc
===================================================================
RCS file: /home/cvs/pari/config/get_cc,v
retrieving revision 1.22
diff -u -r1.22 get_cc
--- config/get_cc	21 Oct 2003 16:41:54 -0000	1.22
+++ config/get_cc	12 Apr 2004 10:49:00 -0000
@@ -119,7 +119,7 @@
   DBGFLAGS=${DBGFLAGS:-"-g $warn"}
   # Some architectures need -fPIC for building dynamic lib
   case "$osname-$arch" in
-    hpux-*|*-ia64) DLCFLAGS=-fPIC ;;
+    hpux-*|*-ia64|*-x86_64) DLCFLAGS=-fPIC ;;
     darwin-*) DLCFLAGS=-fno-common;;
   esac
   # Specific optimisations for some architectures
--- /dev/null	Wed Apr 10 22:14:05 2002
+++ src/kernel/x86_64/MakeLVL0.SH	Mon Apr 12 02:08:23 2004
@@ -0,0 +1,12 @@
+# Level 0 kernel is "asm inline" if gcc and "asm extern" if not
+
+level0=$src/kernel/$kernlvl0  # directory holding the selected level-0 kernel (asm0.h)
+none=$src/kernel/none         # "none" kernel directory; level0.c is compiled from it below
+
+cat >> $file << EOT
+parilvl0.h: $level0/asm0.h 
+	cat $level0/asm0.h > parilvl0.h
+kernel\$(_O): $none/level0.h pariinl.h
+	\$(CC) -c \$(CFLAGS) \$(CPPFLAGS) -o kernel\$(_O) $none/level0.c
+EOT
+
--- /dev/null	Wed Apr 10 22:14:05 2002
+++ src/kernel/x86_64/asm0.h	Mon Apr 12 02:37:44 2004
@@ -0,0 +1,136 @@
+#line 2 "../src/kernel/x86_64/asm0.h"
+/* $Id: level0.h,v 1.9 2003/03/05 20:17:11 karim Exp $
+
+Copyright (C) 2004  The PARI group.
+
+This file is part of the PARI/GP package.
+
+PARI/GP is free software; you can redistribute it and/or modify it under the
+terms of the GNU General Public License as published by the Free Software
+Foundation. It is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY WHATSOEVER.
+
+Check the License for details. You should have received a copy of it, along
+with the package; see the file 'COPYING'. If not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
+
+/* Written by Bill Allombert from the ix86 version by Bruno Haible. Basically
+ * change the instruction suffix l to q (e.g. addl -> addq). */
+
+#ifndef ASMINLINE
+
+#define LOCAL_OVERFLOW /* expands to nothing: overflow is the extern global declared below */
+#define LOCAL_HIREMAINDER /* expands to nothing: hiremainder is the extern global declared below */
+
+BEGINEXTERN /* prototypes for the out-of-line kernel, used when ASMINLINE is not defined */
+  extern ulong overflow, hiremainder; /* globals here; the ASMINLINE branch uses per-function locals instead */
+  extern long addll(ulong a, ulong b);
+  extern long addllx(ulong a, ulong b);
+  extern long subll(ulong a, ulong b);
+  extern long subllx(ulong a, ulong b);
+  extern long shiftl(ulong x, ulong y);
+  extern long shiftlr(ulong x, ulong y);
+  extern long mulll(ulong x, ulong y);
+  extern long addmul(ulong x, ulong y);
+  extern long divll(ulong x, ulong y);
+  extern long bfffo(ulong x);
+ENDEXTERN
+
+#else /* ASMINLINE */
+
+#define LOCAL_HIREMAINDER  register ulong hiremainder /* declares the local that mulll/addmul/divll/shiftl/shiftlr read or write */
+#define LOCAL_OVERFLOW     register ulong overflow /* declares the local that addll/addllx/subll/subllx read or write */
+
+/* Different assemblers have different syntax for the "shldq" and "shrdq"
+   instructions: SHCL spells out the %cl count operand where it is required. */
+#if defined(__EMX__) || defined(__DJGCC__) || defined(__GO32__) || (defined(linux) && !defined(__ELF__)) || defined(__386BSD__) || defined(__NetBSD__) || defined(__FreeBSD__) || defined(NeXT) || defined(__CYGWIN32__) || defined(__MINGW32__) || defined(COHERENT)
+#  define SHCL "%%cl,"
+#else
+#  define SHCL
+#endif
+
+
+#define addll(a,b) /* returns a + b (64-bit wraparound); the carry out, 0 or 1, is stored in `overflow` */ \
+({ ulong __value, __arg1 = (a), __arg2 = (b); \
+   __asm__ ("addq %3,%0 ; adcq %1,%1" /* %1 enters as 0, so adcq leaves exactly CF in it */ \
+        : "=r" (__value), "=r" (overflow) \
+        : "0" (__arg1), "rme" (__arg2), "1" ((ulong)0) /* "e" not "i": addq only takes a sign-extended 32-bit immediate */ \
+        : "cc"); \
+  __value; \
+})
+
+#define addllx(a,b) /* returns a + b + carry-in (1 iff `overflow` is nonzero); the carry out replaces `overflow` */ \
+({ ulong __value, __arg1 = (a), __arg2 = (b), __temp; \
+   __asm__ ("subq %5,%2 ; adcq %4,%0 ; adcq %1,%1" /* 0 - overflow sets CF iff overflow != 0; %1 starts at 0 and ends as CF */ \
+        : "=r" (__value), "=r" (overflow), "=r" (__temp) \
+        : "0" (__arg1), "rme" (__arg2), "rme" (overflow), "1" ((ulong)0), "2" ((ulong)0) /* "e": immediates must fit 32 bits */ \
+        : "cc"); \
+  __value; \
+})
+
+#define subll(a,b) /* returns a - b (64-bit wraparound); the borrow out, 0 or 1, is stored in `overflow` */ \
+({ ulong __value, __arg1 = (a), __arg2 = (b); \
+   __asm__ ("subq %3,%0 ; adcq %1,%1" /* %1 enters as 0, so adcq leaves exactly CF (the borrow) in it */ \
+        : "=r" (__value), "=r" (overflow) \
+        : "0" (__arg1), "rme" (__arg2), "1" ((ulong)0) /* "e" not "i": subq only takes a sign-extended 32-bit immediate */ \
+        : "cc"); \
+  __value; \
+})
+
+#define subllx(a,b) /* returns a - b - borrow-in (1 iff `overflow` is nonzero); the borrow out replaces `overflow` */ \
+({ ulong __value, __arg1 = (a), __arg2 = (b), __temp; \
+   __asm__ ("subq %5,%2 ; sbbq %4,%0 ; adcq %1,%1" /* 0 - overflow sets CF iff overflow != 0; %1 starts at 0 and ends as CF */ \
+        : "=r" (__value), "=r" (overflow), "=r" (__temp) \
+        : "0" (__arg1), "rme" (__arg2), "rme" (overflow), "1" ((ulong)0), "2" ((ulong)0) /* "e": immediates must fit 32 bits */ \
+        : "cc"); \
+  __value; \
+})
+
+#define shiftl(a,c) /* returns a << c; the bits shifted out of the top are stored in `hiremainder` */ \
+({ ulong __valuelo = (a), __count = (c), __valuehi; \
+   __asm__ ("shldq "SHCL"%2,%0" /* shift %0 left by %cl bits, feeding in %2 from the right */ \
+        : "=q" (__valuehi) \
+        : "0" ((ulong)0), "q" (__valuelo), "c" /* %rcx */ (__count)); \
+   hiremainder = __valuehi; \
+   __valuelo << __count; \
+})
+#define shiftlr(a,c) /* returns a >> c; the bits shifted out of the bottom are stored in `hiremainder` */ \
+({ ulong __valuehi = (a), __count = (c), __valuelo; \
+   __asm__ ("shrdq "SHCL"%2,%0" /* shift %0 right by %cl bits, feeding in %2 from the left */ \
+        : "=q" (__valuelo) \
+        : "0" ((ulong)0), "q" (__valuehi), "c" /* %rcx */ (__count)); \
+   hiremainder = __valuelo; \
+   __valuehi >> __count; \
+})
+
+#define mulll(a,b) /* returns the low word of the unsigned product a*b; the high word is stored in `hiremainder` */ \
+({ ulong __valuelo, __arg1 = (a), __arg2 = (b); \
+   __asm__ ("mulq %3" \
+        : "=a" /* %rax */ (__valuelo), "=d" /* %rdx */ (hiremainder) \
+        : "0" (__arg1), "rm" (__arg2)); \
+   __valuelo; \
+})
+
+#define addmul(a,b) /* returns the low word of a*b + hiremainder; the high word (carry included) replaces `hiremainder` */ \
+({ ulong __valuelo, __arg1 = (a), __arg2 = (b), __temp; \
+   __asm__ ("mulq %4 ; addq %5,%0 ; adcq %6,%1" /* %6 is a zeroed scratch, so adcq only folds CF into %rdx */ \
+        : "=a" /* %rax */ (__valuelo), "=&d" /* %rdx */ (hiremainder), "=r" (__temp) \
+        : "0" (__arg1), "rm" (__arg2), "rme" (hiremainder), "2" ((ulong)0)); /* "e": addq immediates must fit 32 bits */ \
+   __valuelo; \
+})
+
+#define divll(a,b) /* divides the two-word value hiremainder:a by b; returns the quotient, remainder goes to `hiremainder` */ \
+({ ulong __value, __arg1 = (a), __arg2 = (b); \
+   __asm__ ("divq %4" \
+        : "=a" /* %rax */ (__value), "=d" /* %rdx */ (hiremainder) \
+        : "0" /* %rax */ (__arg1), "1" /* %rdx */ (hiremainder), "mr" (__arg2)); \
+   __value; \
+})
+
+#define bfffo(x) /* returns 63 minus the index of the highest set bit of x, i.e. its count of leading zero bits */ \
+({ ulong __arg = (x); \
+   long leading_one_position; \
+  __asm__ ("bsrq %1,%0" : "=r" (leading_one_position) : "rm" (__arg)); /* NOTE(review): bsrq leaves %0 undefined for x == 0 - confirm callers never pass 0 */ \
+  63 - leading_one_position; \
+})
+#endif