\documentclass{beamer}

\mode<presentation> {
  \usetheme{Montpellier}
  \setbeamercovered{transparent}
  \beamertemplatenavigationsymbolsempty
}

\usepackage[french]{babel}
\usepackage[latin1]{inputenc}
\usepackage{times}
\usepackage[T1]{fontenc}
\newcommand{\Cl}{\mathcal{C}\ell}
\newcommand{\N}{\mathcal{N}}
\renewcommand{\P}{\mathcal{P}}
\renewcommand{\O}{\mathcal{O}}
\newcommand{\p}{\mathfrak{p}}
\newcommand{\f}{\mathfrak{f}}
\renewcommand{\S}{\mathfrak{S}}
\newcommand{\CC}{\mathbb{C}}
\newcommand{\PP}{\mathbb{P}}
\newcommand{\ZZ}{\mathbb{Z}}
\newcommand{\QQ}{\mathbb{Q}}
\newcommand{\FF}{\mathbb{F}}
\renewcommand{\d}{\mathrm{d}}
\newcommand{\Gal}{\mathrm{Gal}}
\newcommand{\frob}[2]{\left(\frac{#1}{#2}\right)}
\newcommand{\val}{v}
%\renewcommand{\theorem}{}
\newtheorem{defi}{Définition}
\newtheorem{theorema}{Théorème}
\newtheorem{lemmaa}[theorem]{Lemme}
\newtheorem{remark}[theorem]{Remarque}
\newtheorem{princip}[theorem]{Principe}
\newtheorem{algorithm}[theorem]{Algorithme}
\renewcommand{\theorem}{\theorema}
\renewcommand{\lemma}{\lemmaa}
\newcommand{\kbd}[1]{\texttt{#1}}
\newcommand{\fun}[3]{\texttt{#1 #2(#3)}}

\title{Parallel GP interface tutorial}

\author{B.~Allombert}
\institute
{
  IMB \\
  CNRS/Université Bordeaux 1
}

\date{15/01/2019}

\subject{Talk}
%\logo{\pgfimage[height=1cm]{um2.png}}
%\logo{\pgfuseimage{um2}}
%\logo{\includegraphics[height=1.5cm]{um2.png}}
%\AtBeginSubsection[] {
%  \begin{frame}<beamer>{Lignes directrices}
%    \tableofcontents[currentsection,currentsubsection]
%  \end{frame}
%}

%\beamerdefaultoverlayspecification{<+->}

\begin{document}
\begin{frame}
  \titlepage
\end{frame}

\begin{frame}
  \tableofcontents
  % Vous pouvez, si vous le souhaiter ajouter l'option [pausesections]
\end{frame}

\section{Introduction}
\begin{frame}{Introduction}
PARI now supports two common multi-threading technologies:
\begin{itemize}
\item POSIX thread: run on a single machine, lightweight, flexible, fragile.
\item Message passing interface (MPI): run on as many machine as you want,
robust, rigid, heavyweight. Used by most clusters.
\end{itemize}
However the parallel GP interface does not depend on the multithread interface:
a properly written GP program will work identically with both.
In this tutorial we will focus on POSIX threads.
\end{frame}

\section{POSIX threads}
\begin{frame}[fragile]{POSIX threads}
\begin{itemize}
\item To use POSIX threads, add \verb!--mt=pthread!.
\item \kbd{gp-sta} is generally $20\%$ faster than gp-dyn.
\end{itemize}
  \begin{verbatim}
./Configure --mt=pthread
make test-parallel
make test-rnfkummer
  \end{verbatim}
Compare \kbd{gp-sta} against \kbd{gp-dyn}.
  \begin{verbatim}
ln -s Olinux-x86_64/gp-sta .
./gp-sta
  \end{verbatim}
Check for "threading engine: pthread"
\end{frame}

\section{Resources}
\begin{frame}[fragile]{Resources}
The number of secondary threads to use is controlled by
\kbd{default(nbthreads)}. The default value of nbthreads is the number of CPU
threads (i.e. the number of CPU cores multiplied by the hyperthreading factor).
The default can be freely modified.

The PARI stack size in secondary threads is controlled by
\kbd{default(threadsize)}, so the total memory allocated is equal to
$\kbd{parisize}+\kbd{nbthreads}\times\kbd{threadsize}$.  By default,
$\kbd{threadsize}=\kbd{parisize}$.
\begin{verbatim}
default(nbthreads)
\end{verbatim}
\end{frame}

\begin{frame}[fragile]{Parallel algorithms}
A number of PARI functions will use parallelism when available:
\begin{verbatim}
? default(timer,1);
? isprime(2^600+187)
cpu time = 1,244 ms, real time = 197 ms.
%2 = 1
? nbthreads = default(nbthreads);
? default(nbthreads,1)
? isprime(2^600+187)
time = 660 ms.
%5 = 1
? default(nbthreads,nbthreads);
\end{verbatim}
Here the parallel version is three times faster.
Under pthread, the CPU time is the sum of the time used by all threads.
The realtime is smaller than the CPU time due to parallelism.
\end{frame}

\begin{frame}[fragile]{Simple examples}
\begin{verbatim}
? ismersenne(x)=ispseudoprime(2^x-1);
? apply(ismersenne,primes(400))
cpu time = 1,248 ms, real time = 1,247 ms.
%7 = [1,1,1,1,0,1,1,1,0,0,1,0,0,0,0,0,0,1,0,...
? parapply(ismersenne,primes(400))
cpu time = 2,253 ms, real time = 298 ms.
%8 = [1,1,1,1,0,1,1,1,0,0,1,0,0,0,0,0,0,1,0,...
? select(ismersenne,primes(400))
cpu time = 1,192 ms, real time = 1,199 ms.
%9 = [2,3,5,7,13,17,19,31,61,89,107,127,521,607,1279,2203,2281]
? parselect(ismersenne,primes(400))
cpu time = 2,248 ms, real time = 299 ms.
%10 = [2,3,5,7,13,17,19,31,61,89,107,127,521,607,1279,2203,2281]
\end{verbatim}
Compare the real time.
\end{frame}

\section{Concept}
\begin{frame}[fragile]{Concept}
GP provides functions that allow parallel execution of GP code, subject to
the following limitations: the parallel code
\begin{itemize}
\item must be free of side effect,
\item cannot access global variables or local variables declared with
  \kbd{local()} (but \kbd{my()} is OK),
\item instead access variables exported to the parallel world with
 \kbd{export}.
\end{itemize}
\end{frame}

\begin{frame}[fragile]{The parallel world}
\kbd{export} is used to set values in the parallel world.
\begin{verbatim}
? ismersenne(x)=ispseudoprime(2^x-1);
? fun(V)=parvector(#V,i,ismersenne(V[i]));
? fun(primes(400))
  *** parvector: mt: please use export(ismersenne)
> break
? export(ismersenne)
? fun(primes(400))
\end{verbatim}
\end{frame}

\begin{frame}[fragile]{Silly example}
\begin{verbatim}
? export(f=25);
? f
%2 = f
? parsum(i=1,1,f)
%3 = 25
\end{verbatim}
\end{frame}

\begin{frame}[fragile]{exportall}
\kbd{exportall} exports all current global variables.
\begin{verbatim}
? V=primes(400);
? parvector(#V,i,ispseudoprime(2^V[i]-1))
  *** parvector: mt: please use export(V).
> break
? exportall()
? parvector(#V,i,ispseudoprime(2^V[i]-1))
\end{verbatim}
\end{frame}

\begin{frame}[fragile]{Using polynomial indeterminates}
\begin{verbatim}
? fun(n)=bnfinit(x^n-2).no;
? parapply(fun,[1..30])
  *** parapply: mt: please use export(x).
> break
? fun(n)=bnfinit('x^n-2).no;
? default(timer,1);
? default(parisize,"16M");
? apply(fun,[1..30])
cpu time = 5,284 ms, real time = 5,281 ms.
%26 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
?  parapply(fun,[1..30])
cpu time = 9,084 ms, real time = 2,045 ms.
%27 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
?  parapply(fun,-[-30..-1])
cpu time = 9,281 ms, real time = 1,905 ms.
%28 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
\end{verbatim}
\end{frame}

\section{Grouping small tasks}
\begin{frame}[fragile]{Grouping small tasks}
\begin{verbatim}
? thuemorse(n)= 
 my(V=binary(n)); (-1)^sum(i=1,#V,V[i]);
? export(thuemorse);
? default(timer,1);
? sum(n=1,2*10^6, thuemorse(n)/n*1.)
cpu time = 4,861 ms, real time = 4,862 ms.
%33 = -1.1962837643252564372222916332008191772
? parsum(n=1,2*10^6, thuemorse(n)/n*1.)
cpu time = 17,245 ms, real time = 4,426 ms.
%34 = -1.1962837643252564372222916332008191772
? parsum(N=1,200, \
  sum(n=1+(N-1)*10^4, N*10^4, thuemorse(n)/n*1.))
cpu time = 11,836 ms, real time = 1,526 ms.
%35 = -1.1962837643252564372222916332008191811
\end{verbatim}
\end{frame}

\section{Using parfor/parforprime}
\begin{frame}[fragile]{Using parfor}
\begin{verbatim}
? ismersenne(x)=ispseudoprime(2^x-1);
? export(ismersenne)
? parfor(p=1,999,ismersenne(p),c,if(c,print(p)))
? prodmersenne(N)=
  { my(R=1);
    parforprime(p=1,N,
      ismersenne(p),
      c,
      if(c, R*=p));
    R;
  }
? prodmersenne(1000)
cpu time = 108 ms, real time = 31 ms.
%13 = 637764906056784026430
\end{verbatim}
\end{frame}

\begin{frame}[fragile]{Using parforprime}
\begin{verbatim}
? parforprime(p=1,999,ismersenne(p),c,\
      if(c,print(p)))
? prodmersenne(N)=
{ my(R=1);
  parforprime(p=1,N,
    ismersenne(p),
    c,
    if(c, R*=p));
  R;
}
? prodmersenne(1000)
%15 = 637764906056784026430
\end{verbatim}
\end{frame}

\begin{frame}[fragile]{parforvec/parploth}
\begin{verbatim}
? parforvec(v=[[1,3],[1,3]],factorback(v)\
            ,f,print(v,":",f))
? parploth(x=-4.5,6,\
    intnum(t=0,x,if(t,1/gamma(t))))
\end{verbatim}
\end{frame}


\begin{frame}[fragile]{return}
\begin{verbatim}
? ismersenne(x)=ispseudoprime(2^x-1);
? export(ismersenne)
? findmersenne(a)=
  {
    parforprime(p=a,,ismersenne(p),c,
      if(c,return(p)));
  }
? findmersenne(4000)
cpu time = 2,600 ms, real time = 366 ms.
%17 = 4253
? findmersenne(8)
cpu time = 4 ms, real time = 1 ms.
%18 = 13
? findmersenne(8)
%19 = 13
\end{verbatim}
\end{frame}

\begin{frame}[fragile]{return}
\begin{verbatim}
? parfirst(fun,V)=
  parfor(i=1,#V,fun(V[i]),j,if(j,return([i,j])));
? parfirst(ismersenne,[4001..5000])
cpu time = 3,104 ms, real time = 442 ms.
%21 = [253,1]
\end{verbatim}
\end{frame}

\end{document}
