|
|
@ -0,0 +1,601 @@
|
|
|
|
|
|
|
|
(use-modules (skribilo engine)
|
|
|
|
|
|
|
|
(skribilo engine latex)
|
|
|
|
|
|
|
|
(skribilo ast)
|
|
|
|
|
|
|
|
(skribilo writer)
|
|
|
|
|
|
|
|
(skribilo output)
|
|
|
|
|
|
|
|
(skribilo utils strings)
|
|
|
|
|
|
|
|
(skribilo lib)
|
|
|
|
|
|
|
|
(skribilo evaluator)
|
|
|
|
|
|
|
|
(skribilo biblio author)
|
|
|
|
|
|
|
|
(skribilo source)
|
|
|
|
|
|
|
|
(skribilo source lisp)
|
|
|
|
|
|
|
|
(skribilo source parameters)
|
|
|
|
|
|
|
|
(ice-9 match)
|
|
|
|
|
|
|
|
(rnrs io ports))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(define (---)
  ;; Em dash.  Wraps a `resolve' callback that looks at the engine it is
  ;; handed: when the engine format is "html" the literal "—" character is
  ;; emitted, otherwise the three-hyphen sequence "---".
  (resolve (lambda (node engine env)
             (! (if (engine-format? "html" engine)
                    "—"
                    "---")))))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(define (--)
  ;; En dash.  Same scheme as the em dash: the literal "–" character when the
  ;; engine format is "html", the two-hyphen sequence "--" otherwise.
  (resolve (lambda (node engine env)
             (let ((html? (engine-format? "html" engine)))
               (if html?
                   (! "–")
                   (! "--"))))))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(define (dash-dash)
  ;; Two consecutive hyphens.  When the engine format is "latex" the sequence
  ;; "{-}{-}" is emitted through `!' (presumably so that LaTeX does not merge
  ;; the hyphens into a dash); any other engine gets the plain string "--".
  (resolve (lambda (node engine env)
             (cond ((engine-format? "latex" engine)
                    (! "{-}{-}"))
                   (else
                    "--")))))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(define (url url)
  ;; Build a `ref' pointing at URL whose visible text is URL itself, wrapped
  ;; in `tt'.
  (let ((label (tt url)))
    (ref :url url :text label)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(define (=>)
  ;; Shorthand for the `symbol' markup applied to the name "=>".
  (let ((name "=>"))
    (symbol name)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
;; XXX: Terrible hack to turn hyphens into hyphenation points in 'tt'.
(define latex-tt-encoding
  ;; Two extra character replacements placed in front of the private table of
  ;; the same name taken from (skribilo engine latex).
  (append '((#\- "-\\-")
            (#\h "h\\-"))               ;“authorizations”, “authenticate”
          (@@ (skribilo engine latex) latex-tt-encoding)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
;; LaTeX writer for 'tt': the body is output through a throw-away engine that
;; delegates to the current one but filters strings through
;; `latex-tt-encoding', all wrapped in "{\texttt{" ... "}}".
(markup-writer 'tt (find-engine 'latex)
  :before "{\\texttt{"
  :action (lambda (node engine)
            (let* ((tt-filter (make-string-replace latex-tt-encoding))
                   (tt-engine (make-engine
                               (gensym "latex")
                               :delegate engine
                               :filter tt-filter
                               :custom (engine-customs engine)
                               :symbol-table (engine-symbol-table engine))))
              (output (markup-body node) tt-engine)))
  :after "}}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
;; Customizations of the 'latex' engine: image format, hyperref, and extra
;; preamble material appended to the 'usepackage custom.
(let ((latex (find-engine 'latex)))
  ;; For pdflatex.
  (engine-custom-set! latex 'image-format '("pdf"))

  ;; Avoid "option clash" with acmart.
  ;; NOTE(review): the document class selected further down in this file is
  ;; IEEEcsmag, not acmart -- confirm this setting is still needed.
  (engine-custom-set! latex 'hyperref #f)

  (let ((preamble (engine-custom latex 'usepackage)))
    (engine-custom-set!
     latex 'usepackage
     ;; See <https://en.wikibooks.org/wiki/LaTeX/Labels_and_Cross-referencing>
     ;; and <http://tug.org/pipermail/texhax/2010-September/015596.html>.
     (string-append
      preamble "\n"
      "\\usepackage{inconsolata}\n"
      "\\usepackage{tikz}\n"
      "\\usetikzlibrary{arrows,shapes,shadows}\n"
      "\\definecolor{guixorange1}{RGB}{243,154,38} % guixorange P\n"
      "\\definecolor{guixblue2}{RGB}{10,50,80} % guixblue S\n"
      "\\definecolor{guixred2}{RGB}{230,68,57} % red S\n"
      "\\definecolor{guixdarkgrey}{RGB}{46,47,55} % guixdarkgrey S\n"

      ;; Trick so that ‘…’ is properly
      ;; typeset inside teletype text.
      "\\DeclareUnicodeCharacter{2026}{\\textrm{\\ldots}}\n"

      ;; Improve hyphenation.
      "\\hyphenation{Open-PGP}\n"))))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(let ((latex (find-engine 'latex)))
  ;; Document class.  The engine's own title output is turned off; the
  ;; document body emits \maketitle itself through `!latex'.
  (engine-custom-set! latex 'documentclass
                      "\\documentclass{IEEEcsmag}")
  (engine-custom-set! latex 'maketitle #f)

  ;; Authors: one \author{{NAME}} / \affil{AFFILIATION} pair per author.
  (markup-writer '&latex-author latex
    :action (lambda (node engine)
              (define (emit-author author)
                (display "\\author{{")
                (output (markup-option author :name) engine)
                (display "}}\n\\affil{\n")
                (output (markup-option author :affiliation) engine)
                (display "}\n\n"))

              ;; The body is either a single author or a list of them.
              (let* ((body    (markup-body node))
                     (authors (if (pair? body) body (list body))))
                (for-each emit-author authors))))

  ;; Images: \includegraphics scaled to a fraction of \textwidth, 0.5 when
  ;; no :width is given.
  (markup-writer 'image latex
    :options '(:file :url :width :height :zoom)
    :action (lambda (node engine)
              (let ((width (or (markup-option node :width) 0.5))
                    (file  (markup-option node :file)))
                (format #t "\n\\includegraphics[width=~a\\textwidth]{~a}\n"
                        width file))))

  ;; 'prog' markups of class "small": wrapped in a footnotesize environment.
  (markup-writer 'prog latex
    :class "small"
    :options '(:line :mark)
    :before "\n\n\\vspace{3mm}\n\\begin{footnotesize}\n"
    :action (lambda (node engine)
              ;; Delegate actual work to the "real" 'prog': build a new
              ;; 'prog' without the class and output that one.
              (let ((plain (prog :line (markup-option node :line)
                                 :mark (markup-option node :mark)
                                 (node-body node))))
                (output plain engine)))
    :after "\n\\end{footnotesize}\n")

  ;; Figures: body in scriptsize, followed by a caption that carries the
  ;; label (the canonicalized ident) and the legend.
  (markup-writer 'figure latex
    :options '(:legend :number :multicolumns)
    :action (lambda (node engine)
              (let* ((ident  (markup-ident node))
                     (legend (markup-option node :legend))
                     ;; Looked up but not used below.
                     (mc     (markup-option node :multicolumns)))
                (display "\\begin{figure}[ht]\n\\begin{scriptsize}\n")
                (output (markup-body node) engine)
                (display "\n\\end{scriptsize}\n")
                (format #t "\\caption{\\label{~a}"
                        (string-canonicalize ident))
                (output legend engine)
                (display "}\\end{figure}\n"))))

  ;; Bibliography entry authors: when the body is a string, abbreviate first
  ;; names before evaluating it; otherwise evaluate the body unchanged.
  ;; Note that no engine is passed to `markup-writer' here.
  (markup-writer '&bib-entry-author
    :action (lambda (node engine)
              (let* ((names (markup-body node))
                     (shown (cond ((string? names)
                                   (abbreviate-first-names
                                    names
                                    abbreviate-author-first-names))
                                  (else names))))
                (evaluate-document shown engine)))))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(define (word-count)
  "Emit the word count.

Return a red, bold markup whose content is computed when the node is
resolved: an estimate of the number of words in the enclosing document,
followed by the text \" words.\".  The estimate starts at 500 (abstract plus
biography), adds 250 for each 'figure' markup and for a chapter titled
\"References\", and otherwise adds the words found directly in the body of
each non-container markup."
  (define (body-words body)
    ;; Number of tokens (as split by `string-tokenize') found directly in
    ;; BODY.  Nested AST nodes count for zero here since the fold below
    ;; handles them on its own.
    (match body
      ((? string? str)
       (length (string-tokenize str)))
      ((? ast?) 0)                      ;don’t double-count
      ((things ...)
       (apply + (map body-words things)))
      ;; Anything else (number, symbol, #f, unspecified value, ...) holds no
      ;; words.  Without this clause `match' would raise an error and abort
      ;; the whole count.
      (_ 0)))

  (color :fg "red" (bold
   [,(resolve (lambda (n e env)
                ;; Parameter names follow the (n e env) order used by the
                ;; other `resolve' callbacks in this file; only N is needed.
                (ast-fold (lambda (node count)
                            (cond ((is-markup? node 'figure)
                                   (+ count 250))
                                  ((and (is-markup? node 'chapter)
                                        (equal? (markup-option node :title)
                                                "References"))
                                   (+ count 250))
                                  ((container? node)
                                   count)
                                  ((markup? node)
                                   (+ count
                                      (body-words (markup-body node))))
                                  (else count)))
                          (+ 250 250)     ;abstract + biography
                          (ast-document n)))) words.])))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(define (abstract . body)
  ;; Hand BODY to `!latex' together with a template that places it ("$1")
  ;; between \begin{abstract} and \end{abstract}.
  (let ((template "\n\\begin{abstract}\n$1\n\\end{abstract}\n\n"))
    (!latex template body)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
;; Bibliography databases, passed to `bibliography' one at a time, in this
;; order.
(for-each (lambda (file)
            (bibliography file))
          '("../els-2013/guix.sbib"
            "../reppar-2015/reppar.sbib"
            "../programming-2022/security.sbib"
            "hpc.sbib"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(document :title [Reproducibility and Performance: Why Choose?]
|
|
|
|
|
|
|
|
;;[Conciliating Performance and Reproducibility]
|
|
|
|
|
|
|
|
:author (list (author :name "Ludovic Courtès"
|
|
|
|
|
|
|
|
:affiliation "Inria"
|
|
|
|
|
|
|
|
:address "Bordeaux, France"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(abstract [Research processes often rely on high-performance
|
|
|
|
|
|
|
|
computing (HPC), but HPC is often seen as antithetical to
|
|
|
|
|
|
|
|
“reproducibility”: one would have to choose between software that
|
|
|
|
|
|
|
|
achieves high performance, and software that can be deployed in a
|
|
|
|
|
|
|
|
reproducible fashion. However, by giving up on reproducibility we would
|
|
|
|
|
|
|
|
give up on verifiability, a foundation of the scientific process. How
|
|
|
|
|
|
|
|
can we reconcile performance and reproducibility?  This article looks
|
|
|
|
|
|
|
|
at two performance-critical aspects in HPC: message passing (MPI) and
|
|
|
|
|
|
|
|
CPU micro-architecture tuning. Engineering work that has gone into
|
|
|
|
|
|
|
|
performance portability has already proved fruitful, but some areas
|
|
|
|
|
|
|
|
remain unaddressed when it comes to CPU tuning. We propose package
|
|
|
|
|
|
|
|
multi-versioning, a technique developed for GNU Guix, a tool for
|
|
|
|
|
|
|
|
reproducible software deployment, and show that it allows us to
|
|
|
|
|
|
|
|
implement CPU tuning without compromising on reproducibility and
|
|
|
|
|
|
|
|
provenance tracking.])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(!latex "\n\\maketitle\n")
|
|
|
|
|
|
|
|
(!latex "\n\\chapterinitial{Introduction.}\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
;; (word-count)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [It should come as no surprise that the execution speed of programs is a
|
|
|
|
|
|
|
|
primary concern in high-performance computing (HPC). Many HPC
|
|
|
|
|
|
|
|
practitioners would tell you that among their top concerns are the
|
|
|
|
|
|
|
|
performance of high-speed networks used by the Message Passing Interface
|
|
|
|
|
|
|
|
(MPI) and use of the latest vectorization extensions of modern CPUs.])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [This article focuses on the latter: tuning code for specific CPU
|
|
|
|
|
|
|
|
micro-architectures, to reap the benefits of modern CPUs. This question
|
|
|
|
|
|
|
|
is particularly acute in the context of GNU Guix, a software deployment
|
|
|
|
|
|
|
|
tool with strong support for ,(emph [reproducible deployment]). We like
|
|
|
|
|
|
|
|
to present Guix as a key element of the reproducible research toolbox:
|
|
|
|
|
|
|
|
as more research output is produced by software, the ability to ,(emph
|
|
|
|
|
|
|
|
[verify and validate]) research results depends on the ability to ,(emph
|
|
|
|
|
|
|
|
[re-deploy and re-run]) the software. We present a recently-introduced
|
|
|
|
|
|
|
|
CPU-tuning option for Guix, the design choices we made, and how this
|
|
|
|
|
|
|
|
affects reproducibility.])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [But let us first consider this central question in the HPC and
|
|
|
|
|
|
|
|
scientific community: can “reproducibility” be achieved ,(emph
|
|
|
|
|
|
|
|
[without]) sacrificing performance? Our answer is a resounding “yes”,
|
|
|
|
|
|
|
|
but that deserves clarification.])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(chapter :title [Reproducibility & High Performance]
|
|
|
|
|
|
|
|
:number #f
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [The author remembers advice heard at the beginning of their
|
|
|
|
|
|
|
|
career in HPC—advice still given today—that to get optimal MPI
|
|
|
|
|
|
|
|
performance, you would have to use the vendor-provided MPI library; that
|
|
|
|
|
|
|
|
to get your code to perform well on this new cluster, you would have to
|
|
|
|
|
|
|
|
recompile the complete software stack locally; that using generic,
|
|
|
|
|
|
|
|
pre-built binaries from a GNU/Linux distribution will not give you good
|
|
|
|
|
|
|
|
performance.])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [From a software engineering viewpoint, this looks like a sad
|
|
|
|
|
|
|
|
situation and an inefficient approach, dismissing the benefits of
|
|
|
|
|
|
|
|
automated software deployment as pioneered by Debian, Red Hat, and
|
|
|
|
|
|
|
|
others in the 90’s or, more recently, as popularized with container
|
|
|
|
|
|
|
|
images. It also means doing away with reproducibility, where
|
|
|
|
|
|
|
|
“reproducibility” is to be understood in two different ways: first as
|
|
|
|
|
|
|
|
the ability to re-deploy the same software stack on another machine or
|
|
|
|
|
|
|
|
at a different point in time, and second as the ability to ,(emph [verify]) that
|
|
|
|
|
|
|
|
binaries being run match the source code—the latter is what reproducible
|
|
|
|
|
|
|
|
builds are concerned with ,(ref :bib 'lamb2021:reproducible).])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [But does it really have to be this way? Engineering efforts to
|
|
|
|
|
|
|
|
support ,(emph [performance portability]) suggest otherwise. A mature
|
|
|
|
|
|
|
|
MPI implementation like Open MPI, today, does achieve performance
|
|
|
|
|
|
|
|
portability: it takes advantage of high-speed networking hardware by
|
|
|
|
|
|
|
|
determining, at run-time, which drivers to use to obtain optimal
|
|
|
|
|
|
|
|
performance for the network at hand—no recompilation is needed ,(ref :bib
|
|
|
|
|
|
|
|
'courtes2019:openmpi).])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [Likewise, generic, pre-built binaries can and indeed often do
|
|
|
|
|
|
|
|
take advantage of modern CPUs by selecting at run-time the most
|
|
|
|
|
|
|
|
efficient implementation of performance-sensitive routines for the host
|
|
|
|
|
|
|
|
CPU ,(ref :bib 'courtes2018:prebuilt). There are cases, though, where
|
|
|
|
|
|
|
|
this is ,(emph [not]) the case; these are those we will focus on in the
|
|
|
|
|
|
|
|
remainder of this article.]))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(chapter :title [The Jungle of SIMD Extensions]
|
|
|
|
|
|
|
|
:number #f
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [While major CPU architectures such as x86_64, AArch64, and
|
|
|
|
|
|
|
|
POWER9 were defined years ago, CPU vendors regularly extend them.
|
|
|
|
|
|
|
|
Extensions that matter most in HPC are vector extensions: single
|
|
|
|
|
|
|
|
instruction/multiple data (SIMD) instructions and registers. In this
|
|
|
|
|
|
|
|
area, a ,(emph [lot]) has happened on x86_64 CPUs since the baseline
|
|
|
|
|
|
|
|
instruction set architecture (ISA) was defined. As shown in ,(numref
|
|
|
|
|
|
|
|
:text [Figure] :ident "fig-simd-extensions"), Intel and AMD have been
|
|
|
|
|
|
|
|
tacking ever more powerful SIMD extensions onto their CPUs over the years,
|
|
|
|
|
|
|
|
from SSE3 to AVX-512, leading to a wealth of CPU “micro-architectures”.
|
|
|
|
|
|
|
|
This gives a high-level view, but just looking at generations of Intel
|
|
|
|
|
|
|
|
processors by their code name—from “Nehalem” to “Skylake” ,(it [via])
|
|
|
|
|
|
|
|
“Ivybridge”—shows an already more complicated story.])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(figure :legend [Timeline of x86_64 SIMD extensions]
|
|
|
|
|
|
|
|
:ident "fig-simd-extensions"
|
|
|
|
|
|
|
|
:multicolumns #t
|
|
|
|
|
|
|
|
(!latex (call-with-input-file "images/cpu-simd-extensions.tex"
|
|
|
|
|
|
|
|
get-string-all)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [Linear algebra routines that scientific software relies on
|
|
|
|
|
|
|
|
greatly benefit from SIMD extensions. For example, on a modest Intel
|
|
|
|
|
|
|
|
Core i7 processor (of the Skylake generation), the AVX2-optimized
|
|
|
|
|
|
|
|
version of the dense matrix multiplication routines of Eigen ,(ref :bib
|
|
|
|
|
|
|
|
'guennebaud2022:eigen), built with GCC 10.3, peaks at about 40 Gflops/s,
|
|
|
|
|
|
|
|
compared to 11 Gflops/s for its baseline x86_64 version—four times
|
|
|
|
|
|
|
|
faster!]))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(chapter :title [Portable Performance Through Function Multi-Versioning]
|
|
|
|
|
|
|
|
:number #f
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [How to create binaries that are portable, yet are able to get
|
|
|
|
|
|
|
|
the most out of the CPU on which they are executed? This has been an
|
|
|
|
|
|
|
|
important question for distributors of binaries. Distributions such as
|
|
|
|
|
|
|
|
Debian and CentOS provide the convenience of fast automated deployment,
|
|
|
|
|
|
|
|
thanks to pre-built binaries; asking users to either recompile part of
|
|
|
|
|
|
|
|
their software stack or give up on performance is not a reasonable
|
|
|
|
|
|
|
|
alternative.])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [To address this and achieve performance portability,
|
|
|
|
|
|
|
|
developers have largely adopted ,(emph [function multi-versioning])
|
|
|
|
|
|
|
|
(FMV): the implementation provides multiple versions of “hot” routines,
|
|
|
|
|
|
|
|
one for each relevant CPU micro-architecture, and picks the best one for
|
|
|
|
|
|
|
|
the host CPU at run time. Many pieces of performance-critical software
|
|
|
|
|
|
|
|
already use this technique: the C standard library (libc) contains
|
|
|
|
|
|
|
|
multiple versions of its string handling and math routines, the GMP
|
|
|
|
|
|
|
|
library for multi-precision arithmetic uses FMV, and so do software
|
|
|
|
|
|
|
|
packages ranging from cryptography libraries (Libgcrypt, Nettle) to
|
|
|
|
|
|
|
|
linear algebra (OpenBLAS, FFTW).])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [To make it easier for developers to adopt FMV, the GNU
|
|
|
|
|
|
|
|
compilation tool chain (GCC, the Binary Utilities, and the C Library),
|
|
|
|
|
|
|
|
which is widely used in HPC, provides helpers at different levels.
|
|
|
|
|
|
|
|
Developers can annotate relevant functions with the ,(tt [target_clones])
|
|
|
|
|
|
|
|
attribute to instruct the compiler to generate optimized versions of the
|
|
|
|
|
|
|
|
function for each selected architecture. GCC not only generates these
|
|
|
|
|
|
|
|
versions, but also generates code to choose the right function version
|
|
|
|
|
|
|
|
for the host CPU at load time, with support from the dynamic linker,
|
|
|
|
|
|
|
|
,(tt [ld.so]). That relieves developers from the need to implement
|
|
|
|
|
|
|
|
their own ad-hoc machinery. From that perspective, it would seem that
|
|
|
|
|
|
|
|
performance portability, ,(it [via]) FMV, is a solved problem.])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#;(stuff on auto-fmv commented out!
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [To make the case for FMV, we wanted to see what it would take us to
|
|
|
|
|
|
|
|
actually add FMV support to code that would benefit from it. In the
|
|
|
|
|
|
|
|
spirit of the Clear Linux automatic FMV patch
|
|
|
|
|
|
|
|
generator (https://github.com/clearlinux/make-fmv-patch), we wrote an
|
|
|
|
|
|
|
|
automatic FMV tool for
|
|
|
|
|
|
|
|
Guix (https://gitlab.inria.fr/guix-hpc/function-multi-versioning): you
|
|
|
|
|
|
|
|
would give it a package name, and it would:])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(itemize
|
|
|
|
|
|
|
|
(item [Build the package with the ,(tt [-fopt-info-vec]) compiler flag to gather
|
|
|
|
|
|
|
|
information about vectorization opportunities and their source code
|
|
|
|
|
|
|
|
location.])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(item [Generate a patch that, for each C function with vectorization
|
|
|
|
|
|
|
|
opportunities, adds the ,(tt [target_clones])
|
|
|
|
|
|
|
|
attribute to generate a couple of vectorized versions—generic,
|
|
|
|
|
|
|
|
AVX2, and
|
|
|
|
|
|
|
|
AVX-512.])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(item [Build the package with this FMV patch.]))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [The tool can successfully FMV-patch a variety of packages
|
|
|
|
|
|
|
|
written in C, such as the GNU Scientific Library (FIXME
|
|
|
|
|
|
|
|
https://www.gnu.org/software/gsl), which contains plain sequential
|
|
|
|
|
|
|
|
implementations of a variety of math routines. It was an exciting
|
|
|
|
|
|
|
|
engineering experiment… but we found it to be all too often
|
|
|
|
|
|
|
|
inapplicable, for two reasons: performance-critical software already
|
|
|
|
|
|
|
|
does FMV, or it is not written in C.]))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [There is at least one common pattern though where FMV is not
|
|
|
|
|
|
|
|
applicable, or at least is not applied: C++ header-only libraries.
|
|
|
|
|
|
|
|
These are libraries that provide generic template code in header files;
|
|
|
|
|
|
|
|
that code is specialized ,(emph [at build time]) in software that uses
|
|
|
|
|
|
|
|
them. There is no shortage of C++ header-only math libraries providing
|
|
|
|
|
|
|
|
efficient, optimized SIMD versions of their routines: Eigen, MIPP, xsimd
|
|
|
|
|
|
|
|
and xtensor, SIMD Everywhere (SIMDe), Highway, and many more. All
|
|
|
|
|
|
|
|
these, except Highway, have in common that they do ,(emph [not]) support
|
|
|
|
|
|
|
|
FMV. Since they “just” provide headers, it is up to ,(emph [each])
|
|
|
|
|
|
|
|
package using them to figure out what to do in terms of performance
|
|
|
|
|
|
|
|
portability.])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [In practice though, software using these C++ header-only
|
|
|
|
|
|
|
|
libraries rarely makes provisions for performance portability. Thus,
|
|
|
|
|
|
|
|
when compiling those packages for the baseline ISA, one misses out on
|
|
|
|
|
|
|
|
all the vectorized implementations that libraries like Eigen provide.
|
|
|
|
|
|
|
|
This is a known issue in search of a solution ,(ref :bib
|
|
|
|
|
|
|
|
'larsen2021:eigen-fmv). It can have a very concrete impact on
|
|
|
|
|
|
|
|
performance since many scientific packages—the ARPACK-NG library for
|
|
|
|
|
|
|
|
solving eigenvalue problems, the Ceres solver for optimization problems,
|
|
|
|
|
|
|
|
the FEniCSx platform for solving differential equations, to name a
|
|
|
|
|
|
|
|
few—depend on Eigen.])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#;(p [Fundamentally, run-time dispatch is at odds with the all-compile-time
|
|
|
|
|
|
|
|
approach that header-only C++ template libraries are about.
|
|
|
|
|
|
|
|
Furthermore, Eigen, for example, supports fine-grain vectorization; it
|
|
|
|
|
|
|
|
may be used to operate on small matrices, as is common in computer
|
|
|
|
|
|
|
|
graphics, and in that case inlining matrix operations is key to good
|
|
|
|
|
|
|
|
performance—run-time dispatch would have to be done at a higher
|
|
|
|
|
|
|
|
level.]))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(chapter :title [Reproducible Deployment]
|
|
|
|
|
|
|
|
:number #f
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [Distributions such as Debian and Fedora that provide pre-built
|
|
|
|
|
|
|
|
binaries miss out on SIMD optimizations of C++ header-only libraries
|
|
|
|
|
|
|
|
like Eigen because they provide binaries targeting the baseline CPU
|
|
|
|
|
|
|
|
architecture so that those binaries run on any CPU. The Spack ,(ref
|
|
|
|
|
|
|
|
:bib 'gamblin2015:spack) and EasyBuild ,(ref :bib 'geimer2014:easybuild)
|
|
|
|
|
|
|
|
package managers address that by ,(emph [rebuilding]) software on the
|
|
|
|
|
|
|
|
target computer, which allows them to instruct the compiler to optimize
|
|
|
|
|
|
|
|
for the host CPU.])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [Unfortunately, EasyBuild and Spack both have limited support
|
|
|
|
|
|
|
|
for reproducible deployment—they do not, in general, guarantee that you
|
|
|
|
|
|
|
|
can redeploy the same software environment on different machines, or at
|
|
|
|
|
|
|
|
different points in time. This is because they build upon software
|
|
|
|
|
|
|
|
provided by the host system—the compiler tool chain, “system” libraries,
|
|
|
|
|
|
|
|
etc.—and that foundation differs from one system to another—e.g., CentOS
|
|
|
|
|
|
|
|
might provide some version of GCC, and Ubuntu might provide another.])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [To avoid that, Guix builds software in ,(emph [isolated
|
|
|
|
|
|
|
|
environments]), as pioneered by Nix ,(ref :bib '(dolstra2004:nix
|
|
|
|
|
|
|
|
courtes2013:functional)), and its package collection is ,(emph
|
|
|
|
|
|
|
|
[self-contained])—it does not rely on external software packages. This
|
|
|
|
|
|
|
|
is what makes Guix builds reproducible bit-for-bit—or in other words,
|
|
|
|
|
|
|
|
,(emph [verifiable]) ,(ref :bib 'lamb2021:reproducible). Given binaries
|
|
|
|
|
|
|
|
and provenance data, anyone can independently verify the
|
|
|
|
|
|
|
|
binary/source-code correspondence.])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [Guix provides a command-line interface similar to that of
|
|
|
|
|
|
|
|
other package managers: ,(tt [guix install python]), for instance,
|
|
|
|
|
|
|
|
installs the Python interpreter. Package management is per-user rather
|
|
|
|
|
|
|
|
than system-wide and does not require system administrator privileges,
|
|
|
|
|
|
|
|
which makes it suitable for multi-user HPC clusters ,(ref :bib
|
|
|
|
|
|
|
|
'courtes2015:reproducible). To offer the level of flexibility that HPC
|
|
|
|
|
|
|
|
users expect, Guix lets users customize packages ,(it [via]) ,(emph
|
|
|
|
|
|
|
|
[package transformation options]) on the command line—for instance to
|
|
|
|
|
|
|
|
swap two packages in the dependency graph—or through programming
|
|
|
|
|
|
|
|
interfaces ,(ref :bib 'courtes2015:reproducible).])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [Quite uniquely, Guix supports ,(emph [“time traveling”]): with
|
|
|
|
|
|
|
|
,(tt [guix time-machine]), users can run a specific revision of Guix and
|
|
|
|
|
|
|
|
use it to deploy packages as they were defined in that revision. The
|
|
|
|
|
|
|
|
typical use case is redeploying software that was used to produce
|
|
|
|
|
|
|
|
computational results for a scientific publication ,(ref :bib
|
|
|
|
|
|
|
|
'(hinsen2020:staged-computation courtes2020:storage
|
|
|
|
|
|
|
|
perkel2020:challenge)). The command below deploys Python, NumPy, and
|
|
|
|
|
|
|
|
their dependencies as they were defined in a Guix revision from October
|
|
|
|
|
|
|
|
2021:])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(prog :class "small" :line #f [
|
|
|
|
|
|
|
|
guix time-machine --commit=b0735c79b0d1d341 -- \\
|
|
|
|
|
|
|
|
shell python python-numpy
|
|
|
|
|
|
|
|
])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [Whether you run it today or two years from now, it will deploy
|
|
|
|
|
|
|
|
the ,(emph [exact same binaries]), bit-for-bit, down to the C
|
|
|
|
|
|
|
|
library.]))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(chapter :title [Package Multi-Versioning]
|
|
|
|
|
|
|
|
:number #f
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [With our packaging hammer, one could envision a solution to
|
|
|
|
|
|
|
|
these CPU tuning problems: if we cannot do function multi-versioning,
|
|
|
|
|
|
|
|
what about implementing ,(emph [package]) multi-versioning? Guix makes
|
|
|
|
|
|
|
|
it easy to define package variants, so we can define package variants
|
|
|
|
|
|
|
|
optimized for a specific CPU—compiled with ,(tt [-march=skylake]), for
|
|
|
|
|
|
|
|
instance. What we need is to define those variants “on the fly”.])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [The recently-introduced ,(tt [--tune]) package transformation
|
|
|
|
|
|
|
|
option works along those lines. Users can pass ,(tt [--tune]) to any of
|
|
|
|
|
|
|
|
the command-line tools (,(tt [guix install]), ,(tt [guix shell]), etc.)
|
|
|
|
|
|
|
|
and that causes “tunable” packages to be optimized for the host CPU.
|
|
|
|
|
|
|
|
For example, here is how you would run Eigen’s matrix multiplication
|
|
|
|
|
|
|
|
benchmark from the ,(tt [eigen-benchmarks]) package with
|
|
|
|
|
|
|
|
micro-architecture tuning:])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(prog :class "small" :line #f [
|
|
|
|
|
|
|
|
$ guix shell --tune eigen-benchmarks -- \\
|
|
|
|
|
|
|
|
benchBlasGemm 240 240 240
|
|
|
|
|
|
|
|
guix shell: tuning for CPU skylake
|
|
|
|
|
|
|
|
240 x 240 x 240
|
|
|
|
|
|
|
|
cblas: 0.208547 (15.908 GFlops/s)
|
|
|
|
|
|
|
|
eigen : 0.0720303 (46.06 GFlops/s)
|
|
|
|
|
|
|
|
l1: 32768
|
|
|
|
|
|
|
|
l2: 262144
|
|
|
|
|
|
|
|
])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [,(tt [--tune]) determines the name of the host CPU as
|
|
|
|
|
|
|
|
recognized by GCC’s (and Clang’s) ,(tt [-march]) option. Users can
|
|
|
|
|
|
|
|
override auto-detection by passing a CPU name—e.g., ,(tt
|
|
|
|
|
|
|
|
[--tune=skylake-avx512]). As mentioned earlier, we made the conscious
|
|
|
|
|
|
|
|
choice of letting ,(tt [--tune]) affect solely software that packagers
|
|
|
|
|
|
|
|
explicitly marked as “tunable”. This ensures Guix does not end up
|
|
|
|
|
|
|
|
rebuilding packages that could not possibly benefit from
|
|
|
|
|
|
|
|
micro-architecture-specific optimizations, which would be a waste of
|
|
|
|
|
|
|
|
resources.])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#;(p [(For the same
|
|
|
|
|
|
|
|
reason, we rejected the idea of defining separate system types for the
|
|
|
|
|
|
|
|
various x86_64 CPU micro-architectures the way Nix 2.4 did (FIXME
|
|
|
|
|
|
|
|
https://discourse.nixos.org/t/nix-2-4-released/15822#other-features-2).)])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#;(p [In the spirit of avoiding needless package rebuilds, ,(tt [--tune])
|
|
|
|
|
|
|
|
leverages the “graft” mechanism (XREF
|
|
|
|
|
|
|
|
https://guix.gnu.org/manual/en/html_node/Security-Updates.html): package
|
|
|
|
|
|
|
|
variants are ,(emph [grafted]) to the dependency graph, such that dependents of
|
|
|
|
|
|
|
|
a tuned package do not need to be rebuilt. To illustrate that, consider
|
|
|
|
|
|
|
|
the figure below:])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
;;![Dependency graph of OpenCV, where the tuned variant of VTK is grafted.](/static/images/blog/cpu-tuning-graft.png)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#;(p [OpenCV depends on VTK, which depends on Eigen, as shown by the
|
|
|
|
|
|
|
|
dotted arrows. VTK is marked as tunable so it can benefit from SIMD
|
|
|
|
|
|
|
|
optimizations in Eigen. When ,(tt [--tune]) is passed, the optimized variant
|
|
|
|
|
|
|
|
of VTK built with ,(tt [-march=skylake]) is generated and grafted onto the
|
|
|
|
|
|
|
|
dependency graph, such that OpenCV itself does not need to be recompiled
|
|
|
|
|
|
|
|
and instead is relinked against the optimized VTK variant.])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [This implementation of package multi-versioning does not
|
|
|
|
|
|
|
|
sacrifice reproducibility. When ,(tt [--tune]) is used, from Guix’s
|
|
|
|
|
|
|
|
viewpoint, it is just an alternate, but well-defined dependency graph
|
|
|
|
|
|
|
|
that gets built. Guix records package transformation options that were
|
|
|
|
|
|
|
|
used so it can “replay” them. For example, one can export a ,(emph
|
|
|
|
|
|
|
|
[manifest]) representing packages that have been deployed:])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(prog :class "small" :line #f [
|
|
|
|
|
|
|
|
$ guix shell eigen-benchmarks --tune
|
|
|
|
|
|
|
|
guix shell: tuning for CPU skylake
|
|
|
|
|
|
|
|
\[env\]$ guix package --export-manifest \\
|
|
|
|
|
|
|
|
-p $GUIX_ENVIRONMENT
|
|
|
|
|
|
|
|
(use-modules (guix transformations))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(define transform1
|
|
|
|
|
|
|
|
(options->transformation
|
|
|
|
|
|
|
|
'((tune . "skylake"))))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(packages->manifest
|
|
|
|
|
|
|
|
(list (transform1
|
|
|
|
|
|
|
|
(specification->package
|
|
|
|
|
|
|
|
"eigen-benchmarks"))))
|
|
|
|
|
|
|
|
])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [The manifest above is a code snippet that can be passed to
|
|
|
|
|
|
|
|
,(tt [guix shell]) or ,(tt [guix package]) to redeploy the package with
|
|
|
|
|
|
|
|
the same tuning parameters. Like other transformation options, ,(tt
|
|
|
|
|
|
|
|
[--tune]) is accepted by all the commands; for example, here is how you
|
|
|
|
|
|
|
|
would build a Docker image tuned for a particular CPU:])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(prog :class "small" :line #f [
|
|
|
|
|
|
|
|
guix pack -f docker -S /bin=bin \\
|
|
|
|
|
|
|
|
eigen-benchmarks --tune=skylake
|
|
|
|
|
|
|
|
])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#;(p [This comes in handy if you want to prepare an image to run on
|
|
|
|
|
|
|
|
another cluster, where you know you can rely on a given CPU extension.])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#;(p [The Guix build farm is set up to build a few optimized package
|
|
|
|
|
|
|
|
variants. That way, users of ,(tt [--tune]) are likely to get pre-built
|
|
|
|
|
|
|
|
binaries even for the optimized variants, making deployment just as fast
|
|
|
|
|
|
|
|
as with non-tuned packages. To achieve this, ,(tt [--tune]) skips
|
|
|
|
|
|
|
|
running test suites when building packages: we cannot be sure that build
|
|
|
|
|
|
|
|
machines implement the CPU micro-architecture at hand.]))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(chapter :title [Conclusion and Outlook]
|
|
|
|
|
|
|
|
:number #f
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [We implemented what we call “package multi-versioning” for
|
|
|
|
|
|
|
|
C/C++ software that lacks function multi-versioning and run-time
|
|
|
|
|
|
|
|
dispatch, a notable example of which is optimized C++ header-only
|
|
|
|
|
|
|
|
libraries. It is another way to ensure that users do not have to trade
|
|
|
|
|
|
|
|
reproducibility for performance.])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
;; refs:
|
|
|
|
|
|
|
|
;; (FIXME https://docs.julialang.org/en/v1/devdocs/sysimg/)
|
|
|
|
|
|
|
|
;; (FIXME https://docs.rs/multiversion/0.6.1/multiversion/)
|
|
|
|
|
|
|
|
(p [The scientific programming landscape has been evolving over
|
|
|
|
|
|
|
|
the last few years. It is encouraging to see that Julia offers function
|
|
|
|
|
|
|
|
multi-versioning for its “system image”, and that, similarly, Rust
|
|
|
|
|
|
|
|
supports it with annotations similar to GCC’s ,(tt [target_clones]).
|
|
|
|
|
|
|
|
Hopefully these new development environments will support performance
|
|
|
|
|
|
|
|
portability well enough that users and packagers will not need to worry
|
|
|
|
|
|
|
|
about it.])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(p [But first and foremost, it is up to us, research software
|
|
|
|
|
|
|
|
engineers and scientists, to dispel the myth that performance is a valid
|
|
|
|
|
|
|
|
excuse for non-reproducible computational workflows.]))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(chapter :title "References"
|
|
|
|
|
|
|
|
:number #f
|
|
|
|
|
|
|
|
(flush :side 'left
|
|
|
|
|
|
|
|
(the-bibliography
|
|
|
|
|
|
|
|
:sort bib-sort/first-author-last-name)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(!latex
|
|
|
|
|
|
|
|
"\n\\begin{IEEEbiography}{Ludovic Courtès}\n$1\n\\end{IEEEbiography}\n"
|
|
|
|
|
|
|
|
[is a research software engineer at Inria, France. He has been
|
|
|
|
|
|
|
|
contributing to the development of GNU Guix since its inception in 2012
|
|
|
|
|
|
|
|
and works on its use in support of reproducible research workflows. He
|
|
|
|
|
|
|
|
holds a PhD in computer science from LAAS-CNRS. You can reach him at
|
|
|
|
|
|
|
|
,(it [ludovic.courtes@inria.fr]).]))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
;; Local Variables:
|
|
|
|
|
|
|
|
;; ispell-local-dictionary: "american"
|
|
|
|
|
|
|
|
;; compile-command: "guix shell -m manifest.scm -- make -j5"
|
|
|
|
|
|
|
|
;; eval: (setq indent-tabs-mode nil)
|
|
|
|
|
|
|
|
;; End:
|