$ git clone git@github.com:llvm/llvm-project.git
$ mkdir llvm-project/llvm/build
$ cd !$
$ cmake .. \
-DCMAKE_BUILD_TYPE=Release \
-G Ninja \
-DCMAKE_C_COMPILER=/android1/android-master/prebuilts/clang/host/linux-x86/clang-r353983c/bin/clang \
-DCMAKE_CXX_COMPILER=/android1/android-master/prebuilts/clang/host/linux-x86/clang-r353983c/bin/clang++ \
-DLLVM_ENABLE_LLD=ON \
-DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" \
-DLLVM_TARGETS_TO_BUILD="AArch64;ARM;X86" \
-DLLVM_ENABLE_ASSERTIONS=OFF
$ ninja
Release builds run faster than Debug builds. A Debug build is needed for debug symbols and some debugging features.
Ninja builds faster than GNU Make.
Build Clang with Clang.
Linking Clang with LLD is way faster than BFD.
-DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt"
I don't need libcxx and friends. compiler-rt has ASAN/UBSAN/MSAN runtimes, but I frequently omit that, too.
-DLLVM_TARGETS_TO_BUILD="AArch64;ARM;X86"
Anti-pattern; Clang is distributed with all non-experimental backends enabled. I turn off backends I don't need to speed up builds of Clang.
$ llc --version
LLVM (http://llvm.org/):
LLVM version 11.0.0git
Optimized build.
Default target: x86_64-unknown-linux-gnu
Host CPU: skylake-avx512
Registered Targets:
aarch64 - AArch64 (little endian)
aarch64_32 - AArch64 (little endian ILP32)
aarch64_be - AArch64 (big endian)
arm - ARM
arm64 - ARM64 (little endian)
arm64_32 - ARM64 (little endian ILP32)
armeb - ARM (big endian)
thumb - Thumb
thumbeb - Thumb (big endian)
x86 - 32-bit X86: Pentium-Pro and above
x86-64 - 64-bit X86: EM64T and AMD64
LLVM is full of assert()ions.
OFF == -DNDEBUG
A Release build with assertions turned ON is generally faster than a DEBUG build.
Good idea to start with when debugging compiler crashes.
$ ninja clang
For edit/compile/debug cycle, don't rebuild all of LLVM if you don't need to!
-E: stop before compiling, after preprocessing, produces .i
-S: stop before assembling, produces .s
-c: stop before linking, produces .o
-v: print commands executed and run
-###: print commands executed, don't run
-o -: print to stdout rather than write output to file
#include <stddef.h>
void foo (int* a, int x, int y) {
for (size_t i = 0; i < 100; ++i)
a[i] = x + y;
}
$ clang++ -E foo.cpp
# 1 "foo.cpp"
# 1 "<built-in>" 1
# 1 "<built-in>" 3
# 383 "<built-in>" 3
# 1 "<command line>" 1
# 1 "<built-in>" 2
# 1 "foo.cpp" 2
# 1 "/android0/llvm-project/llvm/build/lib/clang/11.0.0/include/stddef.h" 1 3
# 35 "/android0/llvm-project/llvm/build/lib/clang/11.0.0/include/stddef.h" 3
typedef long int ptrdiff_t;
# 46 "/android0/llvm-project/llvm/build/lib/clang/11.0.0/include/stddef.h" 3
typedef long unsigned int size_t;
# 102 "/android0/llvm-project/llvm/build/lib/clang/11.0.0/include/stddef.h" 3
# 1 "/android0/llvm-project/llvm/build/lib/clang/11.0.0/include/__stddef_max_align_t.h" 1 3
# 19 "/android0/llvm-project/llvm/build/lib/clang/11.0.0/include/__stddef_max_align_t.h" 3
typedef struct {
long long __clang_max_align_nonce1
__attribute__((__aligned__(__alignof__(long long))));
long double __clang_max_align_nonce2
__attribute__((__aligned__(__alignof__(long double))));
} max_align_t;
# 103 "/android0/llvm-project/llvm/build/lib/clang/11.0.0/include/stddef.h" 2 3
# 2 "foo.cpp" 2
void foo (int* a, int x, int y) {
for (size_t i = 0; i < 100; ++i)
a[i] = x + y;
}
$ clang++ -O2 -S foo.cpp
.text
.file "foo.cpp"
.globl _Z3fooPiii # -- Begin function _Z3fooPiii
.p2align 4, 0x90
.type _Z3fooPiii,@function
_Z3fooPiii: # @_Z3fooPiii
.cfi_startproc
# %bb.0:
addl %edx, %esi
movd %esi, %xmm0
pshufd $0, %xmm0, %xmm0 # xmm0 = xmm0[0,0,0,0]
movdqu %xmm0, (%rdi)
movdqu %xmm0, 16(%rdi)
movdqu %xmm0, 32(%rdi)
movdqu %xmm0, 48(%rdi)
movdqu %xmm0, 64(%rdi)
movdqu %xmm0, 80(%rdi)
movdqu %xmm0, 96(%rdi)
movdqu %xmm0, 112(%rdi)
movdqu %xmm0, 128(%rdi)
movdqu %xmm0, 144(%rdi)
movdqu %xmm0, 160(%rdi)
movdqu %xmm0, 176(%rdi)
movdqu %xmm0, 192(%rdi)
movdqu %xmm0, 208(%rdi)
movdqu %xmm0, 224(%rdi)
movdqu %xmm0, 240(%rdi)
movdqu %xmm0, 256(%rdi)
movdqu %xmm0, 272(%rdi)
movdqu %xmm0, 288(%rdi)
movdqu %xmm0, 304(%rdi)
movdqu %xmm0, 320(%rdi)
movdqu %xmm0, 336(%rdi)
movdqu %xmm0, 352(%rdi)
movdqu %xmm0, 368(%rdi)
movdqu %xmm0, 384(%rdi)
retq
.Lfunc_end0:
.size _Z3fooPiii, .Lfunc_end0-_Z3fooPiii
.cfi_endproc
# -- End function
.ident "Nick Desaulniers clang version 11.0.0 (git@github.com:llvm/llvm-project.git 276a6b88898a9847d6b401b769e53fb3af6e7b78)"
.section ".note.GNU-stack","",@progbits
.addrsig
$ clang++ -### foo.cpp
clang version 11.0.0 (git@github.com:llvm/llvm-project.git 9050d0fb593c60628f47caa122c01ea1dc7a1bf5)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /android0/llvm-project/llvm/build/bin
(in-process) "/android0/llvm-project/llvm/build/bin/clang-10" "-cc1" "-triple" "x86_64-unknown-linux-gnu" "-emit-obj" "-mrelax-all" "-disable-free" "-disable-llvm-verifier" "-discard-value-names" "-main-file-name" "foo.cpp" "-mrelocation-model" "static" "-mthread-model" "posix" "-mframe-pointer=all" "-fmath-errno" "-fno-rounding-math" "-masm-verbose" "-mconstructor-aliases" "-munwind-tables" "-target-cpu" "x86-64" "-dwarf-column-info" "-fno-split-dwarf-inlining" "-debugger-tuning=gdb" "-resource-dir" "/android0/llvm-project/llvm/build/lib/clang/11.0.0" "-internal-isystem" "/usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/c++/8" "-internal-isystem" "/usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/x86_64-linux-gnu/c++/8" "-internal-isystem" "/usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/x86_64-linux-gnu/c++/8" "-internal-isystem" "/usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/c++/8/backward" "-internal-isystem" "/usr/local/include" "-internal-isystem" "/android0/llvm-project/llvm/build/lib/clang/11.0.0/include" "-internal-externc-isystem" "/usr/include/x86_64-linux-gnu" "-internal-externc-isystem" "/include" "-internal-externc-isystem" "/usr/include" "-fdeprecated-macro" "-fdebug-compilation-dir" "/tmp" "-ferror-limit" "19" "-fmessage-length" "0" "-fgnuc-version=4.2.1" "-fobjc-runtime=gcc" "-fcxx-exceptions" "-fexceptions" "-fdiagnostics-show-option" "-fcolor-diagnostics" "-faddrsig" "-o" "/tmp/foo-7c7f31.o" "-x" "c++" "foo.cpp"
"/usr/bin/ld" "--eh-frame-hdr" "-m" "elf_x86_64" "-dynamic-linker" "/lib64/ld-linux-x86-64.so.2" "-o" "a.out" "/usr/lib/gcc/x86_64-linux-gnu/8/../../../x86_64-linux-gnu/crt1.o" "/usr/lib/gcc/x86_64-linux-gnu/8/../../../x86_64-linux-gnu/crti.o" "/usr/lib/gcc/x86_64-linux-gnu/8/crtbegin.o" "-L/usr/lib/gcc/x86_64-linux-gnu/8" "-L/usr/lib/gcc/x86_64-linux-gnu/8/../../../x86_64-linux-gnu" "-L/usr/lib/gcc/x86_64-linux-gnu/8/../../../../lib64" "-L/lib/x86_64-linux-gnu" "-L/lib/../lib64" "-L/usr/lib/x86_64-linux-gnu" "-L/usr/lib/../lib64" "-L/usr/lib/x86_64-linux-gnu/../../lib64" "-L/usr/lib/gcc/x86_64-linux-gnu/8/../../.." "-L/android0/llvm-project/llvm/build/bin/../lib" "-L/lib" "-L/usr/lib" "/tmp/foo-7c7f31.o" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "/usr/lib/gcc/x86_64-linux-gnu/8/crtend.o" "/usr/lib/gcc/x86_64-linux-gnu/8/../../../x86_64-linux-gnu/crtn.o"
$ clang++ -v foo.cpp
clang version 11.0.0 (git@github.com:llvm/llvm-project.git 9050d0fb593c60628f47caa122c01ea1dc7a1bf5)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /android0/llvm-project/llvm/build/bin
Found candidate GCC installation: /usr/lib/gcc/i686-linux-gnu/8
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/6.5.0
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/7
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/7.4.0
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/8
Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/8
Candidate multilib: .;@m64
Candidate multilib: 32;@m32
Candidate multilib: x32;@mx32
Selected multilib: .;@m64
(in-process) "/android0/llvm-project/llvm/build/bin/clang-10" -cc1 -triple x86_64-unknown-linux-gnu -emit-obj -mrelax-all -disable-free -disable-llvm-verifier -discard-value-names -main-file-name foo.cpp -mrelocation-model static -mthread-model posix -mframe-pointer=all -fmath-errno -fno-rounding-math -masm-verbose -mconstructor-aliases -munwind-tables -target-cpu x86-64 -dwarf-column-info -fno-split-dwarf-inlining -debugger-tuning=gdb -v -resource-dir /android0/llvm-project/llvm/build/lib/clang/11.0.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/c++/8 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/x86_64-linux-gnu/c++/8 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/x86_64-linux-gnu/c++/8 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/c++/8/backward -internal-isystem /usr/local/include -internal-isystem /android0/llvm-project/llvm/build/lib/clang/11.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fdeprecated-macro -fdebug-compilation-dir /tmp -ferror-limit 19 -fmessage-length 0 -fgnuc-version=4.2.1 -fobjc-runtime=gcc -fcxx-exceptions -fexceptions -fdiagnostics-show-option -fcolor-diagnostics -faddrsig -o /tmp/foo-747b03.o -x c++ foo.cpp
clang -cc1 version 11.0.0 based upon LLVM 11.0.0git default target x86_64-unknown-linux-gnu
ignoring nonexistent directory "/include"
ignoring duplicate directory "/usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/x86_64-linux-gnu/c++/8"
#include "..." search starts here:
#include <...> search starts here:
/usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/c++/8
/usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/x86_64-linux-gnu/c++/8
/usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/c++/8/backward
/usr/local/include
/android0/llvm-project/llvm/build/lib/clang/11.0.0/include
/usr/include/x86_64-linux-gnu
/usr/include
End of search list.
"/usr/bin/ld" --eh-frame-hdr -m elf_x86_64 -dynamic-linker /lib64/ld-linux-x86-64.so.2 -o a.out /usr/lib/gcc/x86_64-linux-gnu/8/../../../x86_64-linux-gnu/crt1.o /usr/lib/gcc/x86_64-linux-gnu/8/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/8/crtbegin.o -L/usr/lib/gcc/x86_64-linux-gnu/8 -L/usr/lib/gcc/x86_64-linux-gnu/8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/8/../../../../lib64 -L/lib/x86_64-linux-gnu -L/lib/../lib64 -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib64 -L/usr/lib/x86_64-linux-gnu/../../lib64 -L/usr/lib/gcc/x86_64-linux-gnu/8/../../.. -L/android0/llvm-project/llvm/build/bin/../lib -L/lib -L/usr/lib /tmp/foo-747b03.o -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /usr/lib/gcc/x86_64-linux-gnu/8/crtend.o /usr/lib/gcc/x86_64-linux-gnu/8/../../../x86_64-linux-gnu/crtn.o
$ llvm-readelf --string-dump=.comment vmlinux
String dump of section '.comment':
[ 0] Linker: LLD 11.0.0 (git@github.com:llvm/llvm-project.git 276a6b88898a9847d6b401b769e53fb3af6e7b78)
[ 64] clang version 11.0.0 (git@github.com:llvm/llvm-project.git 276a6b88898a9847d6b401b769e53fb3af6e7b78)
$ clang -Xclang -dump-tokens -E foo.cpp
...
void 'void' [StartOfLine] Loc=<foo.cpp:2:1>
identifier 'foo' [LeadingSpace] Loc=<foo.cpp:2:6>
l_paren '(' [LeadingSpace] Loc=<foo.cpp:2:10>
int 'int' Loc=<foo.cpp:2:11>
star '*' Loc=<foo.cpp:2:14>
identifier 'a' [LeadingSpace] Loc=<foo.cpp:2:16>
...
$ clang -Xclang -help -E foo.cpp
Prints all the options, for example:
$ clang -Xclang -emit-html -o foo.html foo.cpp
$ clang -Xclang -ast-dump -E foo.cpp
$ sudo apt install xdot
$ clang++ -Xclang -ast-view -E foo.cpp
Requires -DCMAKE_BUILD_TYPE=Debug.
$ clang++ foo.cpp -emit-llvm -S -o - -fno-discard-value-names -g0
# -emit-llvm: produce LLVM IR
# -S: as human readable foo.ll, not binary foo.bc
# -o -: output to stdout, rather than to .ll or .bc file.
# -fno-discard-value-names: keep identifiers from source rather than numbers.
# -g0: less debug info in IR.
; ModuleID = 'foo.cpp'
source_filename = "foo.cpp"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: noinline nounwind optnone uwtable
define dso_local void @_Z3fooPiii(i32* %a, i32 %x, i32 %y) #0 {
entry:
%a.addr = alloca i32*, align 8
%x.addr = alloca i32, align 4
%y.addr = alloca i32, align 4
%i = alloca i64, align 8
store i32* %a, i32** %a.addr, align 8
store i32 %x, i32* %x.addr, align 4
store i32 %y, i32* %y.addr, align 4
store i64 0, i64* %i, align 8
br label %for.cond
for.cond: ; preds = %for.inc, %entry
%0 = load i64, i64* %i, align 8
%cmp = icmp ult i64 %0, 100
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%1 = load i32, i32* %x.addr, align 4
%2 = load i32, i32* %y.addr, align 4
%add = add nsw i32 %1, %2
%3 = load i32*, i32** %a.addr, align 8
%4 = load i64, i64* %i, align 8
%arrayidx = getelementptr inbounds i32, i32* %3, i64 %4
store i32 %add, i32* %arrayidx, align 4
br label %for.inc
for.inc: ; preds = %for.body
%5 = load i64, i64* %i, align 8
%inc = add i64 %5, 1
store i64 %inc, i64* %i, align 8
br label %for.cond
for.end: ; preds = %for.cond
ret void
}
attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"Nick Desaulniers clang version 11.0.0 (git@github.com:llvm/llvm-project.git 9050d0fb593c60628f47caa122c01ea1dc7a1bf5)"}
For IR to feed to opt, dump with -O2 -Xclang -disable-llvm-passes.
For IR to feed to llc, dump with -O2.
-O0 will add the optnone function attribute, preventing any optimizations from running.
You probably don't want that.
$ clang++ foo.cpp -emit-llvm -S -O2 -Xclang -disable-llvm-passes
; ModuleID = 'foo.cpp'
source_filename = "foo.cpp"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: nounwind uwtable
define dso_local void @_Z3fooPiii(i32* %a, i32 %x, i32 %y) #0 {
entry:
%a.addr = alloca i32*, align 8
%x.addr = alloca i32, align 4
%y.addr = alloca i32, align 4
%i = alloca i64, align 8
store i32* %a, i32** %a.addr, align 8, !tbaa !2
store i32 %x, i32* %x.addr, align 4, !tbaa !6
store i32 %y, i32* %y.addr, align 4, !tbaa !6
%0 = bitcast i64* %i to i8*
call void @llvm.lifetime.start.p0i8(i64 8, i8* %0) #2
store i64 0, i64* %i, align 8, !tbaa !8
br label %for.cond
for.cond: ; preds = %for.inc, %entry
%1 = load i64, i64* %i, align 8, !tbaa !8
%cmp = icmp ult i64 %1, 100
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
%2 = bitcast i64* %i to i8*
call void @llvm.lifetime.end.p0i8(i64 8, i8* %2) #2
br label %for.end
for.body: ; preds = %for.cond
%3 = load i32, i32* %x.addr, align 4, !tbaa !6
%4 = load i32, i32* %y.addr, align 4, !tbaa !6
%add = add nsw i32 %3, %4
%5 = load i32*, i32** %a.addr, align 8, !tbaa !2
%6 = load i64, i64* %i, align 8, !tbaa !8
%arrayidx = getelementptr inbounds i32, i32* %5, i64 %6
store i32 %add, i32* %arrayidx, align 4, !tbaa !6
br label %for.inc
for.inc: ; preds = %for.body
%7 = load i64, i64* %i, align 8, !tbaa !8
%inc = add i64 %7, 1
store i64 %inc, i64* %i, align 8, !tbaa !8
br label %for.cond
for.end: ; preds = %for.cond.cleanup
ret void
}
; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1
; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1
attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind willreturn }
attributes #2 = { nounwind }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"Nick Desaulniers clang version 11.0.0 (git@github.com:llvm/llvm-project.git 8ca263577731ced4ba8b69e1d2444676eda2e2e7)"}
!2 = !{!3, !3, i64 0}
!3 = !{!"any pointer", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C++ TBAA"}
!6 = !{!7, !7, i64 0}
!7 = !{!"int", !4, i64 0}
!8 = !{!9, !9, i64 0}
!9 = !{!"long", !4, i64 0}
$ clang++ foo.cpp -emit-llvm -S -O2
; ModuleID = 'foo.cpp'
source_filename = "foo.cpp"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: nofree norecurse nounwind uwtable writeonly
define dso_local void @_Z3fooPiii(i32* nocapture %a, i32 %x, i32 %y) local_unnamed_addr #0 {
entry:
%add = add nsw i32 %y, %x
%broadcast.splatinsert5 = insertelement <4 x i32> undef, i32 %add, i32 0
%broadcast.splat6 = shufflevector <4 x i32> %broadcast.splatinsert5, <4 x i32> undef, <4 x i32> zeroinitializer
%0 = bitcast i32* %a to <4 x i32>*
store <4 x i32> %broadcast.splat6, <4 x i32>* %0, align 4, !tbaa !2
%1 = getelementptr inbounds i32, i32* %a, i64 4
%2 = bitcast i32* %1 to <4 x i32>*
store <4 x i32> %broadcast.splat6, <4 x i32>* %2, align 4, !tbaa !2
%3 = getelementptr inbounds i32, i32* %a, i64 8
%4 = bitcast i32* %3 to <4 x i32>*
store <4 x i32> %broadcast.splat6, <4 x i32>* %4, align 4, !tbaa !2
%5 = getelementptr inbounds i32, i32* %a, i64 12
%6 = bitcast i32* %5 to <4 x i32>*
store <4 x i32> %broadcast.splat6, <4 x i32>* %6, align 4, !tbaa !2
%7 = getelementptr inbounds i32, i32* %a, i64 16
%8 = bitcast i32* %7 to <4 x i32>*
store <4 x i32> %broadcast.splat6, <4 x i32>* %8, align 4, !tbaa !2
%9 = getelementptr inbounds i32, i32* %a, i64 20
%10 = bitcast i32* %9 to <4 x i32>*
store <4 x i32> %broadcast.splat6, <4 x i32>* %10, align 4, !tbaa !2
%11 = getelementptr inbounds i32, i32* %a, i64 24
%12 = bitcast i32* %11 to <4 x i32>*
store <4 x i32> %broadcast.splat6, <4 x i32>* %12, align 4, !tbaa !2
%13 = getelementptr inbounds i32, i32* %a, i64 28
%14 = bitcast i32* %13 to <4 x i32>*
store <4 x i32> %broadcast.splat6, <4 x i32>* %14, align 4, !tbaa !2
%15 = getelementptr inbounds i32, i32* %a, i64 32
%16 = bitcast i32* %15 to <4 x i32>*
store <4 x i32> %broadcast.splat6, <4 x i32>* %16, align 4, !tbaa !2
%17 = getelementptr inbounds i32, i32* %a, i64 36
%18 = bitcast i32* %17 to <4 x i32>*
store <4 x i32> %broadcast.splat6, <4 x i32>* %18, align 4, !tbaa !2
%19 = getelementptr inbounds i32, i32* %a, i64 40
%20 = bitcast i32* %19 to <4 x i32>*
store <4 x i32> %broadcast.splat6, <4 x i32>* %20, align 4, !tbaa !2
%21 = getelementptr inbounds i32, i32* %a, i64 44
%22 = bitcast i32* %21 to <4 x i32>*
store <4 x i32> %broadcast.splat6, <4 x i32>* %22, align 4, !tbaa !2
%23 = getelementptr inbounds i32, i32* %a, i64 48
%24 = bitcast i32* %23 to <4 x i32>*
store <4 x i32> %broadcast.splat6, <4 x i32>* %24, align 4, !tbaa !2
%25 = getelementptr inbounds i32, i32* %a, i64 52
%26 = bitcast i32* %25 to <4 x i32>*
store <4 x i32> %broadcast.splat6, <4 x i32>* %26, align 4, !tbaa !2
%27 = getelementptr inbounds i32, i32* %a, i64 56
%28 = bitcast i32* %27 to <4 x i32>*
store <4 x i32> %broadcast.splat6, <4 x i32>* %28, align 4, !tbaa !2
%29 = getelementptr inbounds i32, i32* %a, i64 60
%30 = bitcast i32* %29 to <4 x i32>*
store <4 x i32> %broadcast.splat6, <4 x i32>* %30, align 4, !tbaa !2
%31 = getelementptr inbounds i32, i32* %a, i64 64
%32 = bitcast i32* %31 to <4 x i32>*
store <4 x i32> %broadcast.splat6, <4 x i32>* %32, align 4, !tbaa !2
%33 = getelementptr inbounds i32, i32* %a, i64 68
%34 = bitcast i32* %33 to <4 x i32>*
store <4 x i32> %broadcast.splat6, <4 x i32>* %34, align 4, !tbaa !2
%35 = getelementptr inbounds i32, i32* %a, i64 72
%36 = bitcast i32* %35 to <4 x i32>*
store <4 x i32> %broadcast.splat6, <4 x i32>* %36, align 4, !tbaa !2
%37 = getelementptr inbounds i32, i32* %a, i64 76
%38 = bitcast i32* %37 to <4 x i32>*
store <4 x i32> %broadcast.splat6, <4 x i32>* %38, align 4, !tbaa !2
%39 = getelementptr inbounds i32, i32* %a, i64 80
%40 = bitcast i32* %39 to <4 x i32>*
store <4 x i32> %broadcast.splat6, <4 x i32>* %40, align 4, !tbaa !2
%41 = getelementptr inbounds i32, i32* %a, i64 84
%42 = bitcast i32* %41 to <4 x i32>*
store <4 x i32> %broadcast.splat6, <4 x i32>* %42, align 4, !tbaa !2
%43 = getelementptr inbounds i32, i32* %a, i64 88
%44 = bitcast i32* %43 to <4 x i32>*
store <4 x i32> %broadcast.splat6, <4 x i32>* %44, align 4, !tbaa !2
%45 = getelementptr inbounds i32, i32* %a, i64 92
%46 = bitcast i32* %45 to <4 x i32>*
store <4 x i32> %broadcast.splat6, <4 x i32>* %46, align 4, !tbaa !2
%47 = getelementptr inbounds i32, i32* %a, i64 96
%48 = bitcast i32* %47 to <4 x i32>*
store <4 x i32> %broadcast.splat6, <4 x i32>* %48, align 4, !tbaa !2
ret void
}
attributes #0 = { nofree norecurse nounwind uwtable writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"Nick Desaulniers clang version 11.0.0 (git@github.com:llvm/llvm-project.git 8ca263577731ced4ba8b69e1d2444676eda2e2e7)"}
!2 = !{!3, !3, i64 0}
!3 = !{!"int", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C++ TBAA"}
$ opt -verify -S foo.ll
$ opt -O2 -S foo.ll
$ opt -O2 -verify-each -S foo.ll
$ clang++ -emit-llvm -S -O2 -Xclang -disable-llvm-passes foo.cpp
$ opt -O2 -S foo.ll
Should generally match
$ clang++ -emit-llvm -S -O2 foo.cpp
Though frontends can choose their own pass pipelines.
$ clang++ -emit-llvm -S -O2 -Xclang -disable-llvm-passes foo.cpp
$ opt -licm -S foo.ll
#include <stddef.h>
void foo (int* a, int x, int y) {
for (size_t i = 0; i < 100; ++i)
a[i] = x + y;
}
Before:
for.body: ; preds = %for.cond
%3 = load i32, i32* %x.addr, align 4, !tbaa !6
%4 = load i32, i32* %y.addr, align 4, !tbaa !6
%add = add nsw i32 %3, %4
%5 = load i32*, i32** %a.addr, align 8, !tbaa !2
%6 = load i64, i64* %i, align 8, !tbaa !8
%arrayidx = getelementptr inbounds i32, i32* %5, i64 %6
store i32 %add, i32* %arrayidx, align 4, !tbaa !6
br label %for.inc
After:
for.body: ; preds = %for.cond
%arrayidx = getelementptr inbounds i32, i32* %3, i64 %inc1
store i32 %add, i32* %arrayidx, align 4, !tbaa !6
br label %for.inc
$ opt -O2 -print-after-all -S foo.ll
# -print-after-all doesn't print a pass if it made no modifications.
$ opt -O2 -print-before-all -S foo.ll
*** IR Dump Before Module Verifier ***
*** IR Dump Before Instrument function entry/exit with calls to e.g. mcount() (pre inlining) ***
*** IR Dump Before Simplify the CFG ***
*** IR Dump Before SROA ***
*** IR Dump Before Early CSE ***
*** IR Dump Before Lower 'expect' Intrinsics ***
*** IR Dump Before Force set function attributes ***
*** IR Dump Before Infer set function attributes ***
*** IR Dump Before Interprocedural Sparse Conditional Constant Propagation ***
*** IR Dump Before Called Value Propagation ***
*** IR Dump Before Deduce and propagate attributes ***
*** IR Dump Before Global Variable Optimizer ***
*** IR Dump Before Promote Memory to Register ***
*** IR Dump Before Dead Argument Elimination ***
*** IR Dump Before Combine redundant instructions ***
*** IR Dump Before Simplify the CFG ***
*** IR Dump Before Remove unused exception handling info ***
*** IR Dump Before Function Integration/Inlining ***
*** IR Dump Before Deduce function attributes ***
*** IR Dump Before SROA ***
*** IR Dump Before Early CSE w/ MemorySSA ***
*** IR Dump Before Speculatively execute instructions if target has divergent branches ***
*** IR Dump Before Jump Threading ***
*** IR Dump Before Value Propagation ***
*** IR Dump Before Simplify the CFG ***
*** IR Dump Before Combine redundant instructions ***
*** IR Dump Before Conditionally eliminate dead library calls ***
*** IR Dump Before PGOMemOPSize ***
*** IR Dump Before Tail Call Elimination ***
*** IR Dump Before Simplify the CFG ***
*** IR Dump Before Reassociate expressions ***
*** IR Dump Before Canonicalize natural loops ***
*** IR Dump Before LCSSA Verifier ***
*** IR Dump Before Loop-Closed SSA Form Pass ***
*** IR Dump Before Rotate Loops ***
*** IR Dump Before Loop Invariant Code Motion ***
*** IR Dump Before Unswitch loops ***
*** IR Dump Before Simplify the CFG ***
*** IR Dump Before Combine redundant instructions ***
*** IR Dump Before Canonicalize natural loops ***
*** IR Dump Before LCSSA Verifier ***
*** IR Dump Before Loop-Closed SSA Form Pass ***
*** IR Dump Before Induction Variable Simplification ***
*** IR Dump Before Recognize loop idioms ***
*** IR Dump Before Delete dead loops ***
*** IR Dump Before Unroll loops ***
*** IR Dump Before MergedLoadStoreMotion ***
*** IR Dump Before Global Value Numbering ***
*** IR Dump Before MemCpy Optimization ***
*** IR Dump Before Sparse Conditional Constant Propagation ***
*** IR Dump Before Demanded bits analysis ***
*** IR Dump Before Bit-Tracking Dead Code Elimination ***
*** IR Dump Before Combine redundant instructions ***
*** IR Dump Before Jump Threading ***
*** IR Dump Before Value Propagation ***
*** IR Dump Before Dead Store Elimination ***
*** IR Dump Before Canonicalize natural loops ***
*** IR Dump Before LCSSA Verifier ***
*** IR Dump Before Loop-Closed SSA Form Pass ***
*** IR Dump Before Loop Invariant Code Motion ***
*** IR Dump Before Aggressive Dead Code Elimination ***
*** IR Dump Before Simplify the CFG ***
*** IR Dump Before Combine redundant instructions ***
*** IR Dump Before Remove unused exception handling info ***
*** IR Dump Before Function Integration/Inlining ***
*** IR Dump Before Deduce function attributes ***
*** IR Dump Before A No-Op Barrier Pass ***
*** IR Dump Before Eliminate Available Externally Globals ***
*** IR Dump Before Deduce function attributes in RPO ***
*** IR Dump Before Global Variable Optimizer ***
*** IR Dump Before Dead Global Elimination ***
*** IR Dump Before Float to int ***
*** IR Dump Before Lower constant intrinsics ***
*** IR Dump Before Canonicalize natural loops ***
*** IR Dump Before LCSSA Verifier ***
*** IR Dump Before Loop-Closed SSA Form Pass ***
*** IR Dump Before Rotate Loops ***
*** IR Dump Before Loop Distribution ***
*** IR Dump Before Demanded bits analysis ***
*** IR Dump Before Inject TLI Mappings ***
*** IR Dump Before Loop Vectorization ***
*** IR Dump Before Canonicalize natural loops ***
*** IR Dump Before Loop Load Elimination ***
*** IR Dump Before Combine redundant instructions ***
*** IR Dump Before Simplify the CFG ***
*** IR Dump Before Demanded bits analysis ***
*** IR Dump Before SLP Vectorizer ***
*** IR Dump Before Combine redundant instructions ***
*** IR Dump Before Canonicalize natural loops ***
*** IR Dump Before LCSSA Verifier ***
*** IR Dump Before Loop-Closed SSA Form Pass ***
*** IR Dump Before Unroll loops ***
*** IR Dump Before Combine redundant instructions ***
*** IR Dump Before Canonicalize natural loops ***
*** IR Dump Before LCSSA Verifier ***
*** IR Dump Before Loop-Closed SSA Form Pass ***
*** IR Dump Before Warn about non-applied transformations ***
*** IR Dump Before Alignment from assumptions ***
*** IR Dump Before Strip Unused Function Prototypes ***
*** IR Dump Before Dead Global Elimination ***
*** IR Dump Before Merge Duplicate Global Constants ***
*** IR Dump Before Canonicalize natural loops ***
*** IR Dump Before LCSSA Verifier ***
*** IR Dump Before Loop-Closed SSA Form Pass ***
*** IR Dump Before Remove redundant instructions ***
*** IR Dump Before Hoist/decompose integer division and remainder ***
*** IR Dump Before Simplify the CFG ***
*** IR Dump Before Module Verifier ***
$ llc foo.ll # produces foo.s
$ llc -filetype=obj foo.ll # produces foo.o
$ llc --print-after-all foo.ll
*** IR Dump After Pre-ISel Intrinsic Lowering ***
*** IR Dump After Expand Atomic instructions ***
*** IR Dump After Module Verifier ***
*** IR Dump After Canonicalize natural loops ***
*** IR Dump After Loop Strength Reduction ***
*** IR Dump After Merge contiguous icmps into a memcmp ***
*** IR Dump After Expand memcmp() to load/stores ***
*** IR Dump After Lower Garbage Collection Instructions ***
*** IR Dump After Shadow Stack GC Lowering ***
*** IR Dump After Lower constant intrinsics ***
*** IR Dump After Remove unreachable blocks from the CFG ***
*** IR Dump After Constant Hoisting ***
*** IR Dump After Partially inline calls to library functions ***
*** IR Dump After Instrument function entry/exit with calls to e.g. mcount() (post inlining) ***
*** IR Dump After Scalarize Masked Memory Intrinsics ***
*** IR Dump After Expand reduction intrinsics ***
*** IR Dump After Interleaved Access Pass ***
*** IR Dump After Expand indirectbr instructions ***
*** IR Dump After CodeGen Prepare ***
*** IR Dump After Rewrite Symbols ***
*** IR Dump After Exception handling preparation ***
*** IR Dump After Safe Stack instrumentation pass ***
*** IR Dump After Module Verifier ***
# *** IR Dump After Finalize ISel and expand pseudo-instructions ***:
# *** IR Dump After X86 Domain Reassignment Pass ***:
# *** IR Dump After Early Tail Duplication ***:
# *** IR Dump After Optimize machine instruction PHIs ***:
# *** IR Dump After Slot index numbering ***:
# *** IR Dump After Merge disjoint stack slots ***:
# *** IR Dump After Local Stack Slot Allocation ***:
# *** IR Dump After Remove dead machine instructions ***:
# *** IR Dump After Early If-Conversion ***:
# *** IR Dump After Machine InstCombiner ***:
# *** IR Dump After X86 cmov Conversion ***:
# *** IR Dump After Early Machine Loop Invariant Code Motion ***:
# *** IR Dump After Machine Common Subexpression Elimination ***:
# *** IR Dump After Machine code sinking ***:
# *** IR Dump After Peephole Optimizations ***:
# *** IR Dump After Remove dead machine instructions ***:
# *** IR Dump After Live Range Shrink ***:
# *** IR Dump After X86 LEA Optimize ***:
# *** IR Dump After X86 Optimize Call Frame ***:
# *** IR Dump After X86 Avoid Store Forwarding Blocks ***:
# *** IR Dump After X86 speculative load hardening ***:
# *** IR Dump After X86 EFLAGS copy lowering ***:
# *** IR Dump After Detect Dead Lanes ***:
# *** IR Dump After Process Implicit Definitions ***:
# *** IR Dump After Remove unreachable machine basic blocks ***:
# *** IR Dump After Live Variable Analysis ***:
# *** IR Dump After Eliminate PHI nodes for register allocation ***:
# *** IR Dump After Two-Address instruction pass ***:
# *** IR Dump After Slot index numbering ***:
# *** IR Dump After Live Interval Analysis ***:
# *** IR Dump After Simple Register Coalescing ***:
# *** IR Dump After Rename Disconnected Subregister Components ***:
# *** IR Dump After Machine Instruction Scheduler ***:
# *** IR Dump After Debug Variable Analysis ***:
# *** IR Dump After Live Stack Slot Analysis ***:
# *** IR Dump After Virtual Register Map ***:
# *** IR Dump After Live Register Matrix ***:
# *** IR Dump After Greedy Register Allocator ***:
# *** IR Dump After Virtual Register Rewriter ***:
# *** IR Dump After Stack Slot Coloring ***:
# *** IR Dump After Machine Copy Propagation Pass ***:
# *** IR Dump After Machine Loop Invariant Code Motion ***:
# *** IR Dump After X86 FP Stackifier ***:
# *** IR Dump After PostRA Machine Sink ***:
# *** IR Dump After Shrink Wrapping analysis ***:
# *** IR Dump After Prologue/Epilogue Insertion & Frame Finalization ***:
# *** IR Dump After Control Flow Optimizer ***:
# *** IR Dump After Tail Duplication ***:
# *** IR Dump After Machine Copy Propagation Pass ***:
# *** IR Dump After Post-RA pseudo instruction expansion pass ***:
# *** IR Dump After X86 pseudo instruction expansion pass ***:
# *** IR Dump After Post RA top-down list latency scheduler ***:
# *** IR Dump After Analyze Machine Code For Garbage Collection ***:
# *** IR Dump After Branch Probability Basic Block Placement ***:
# *** IR Dump After Insert fentry calls ***:
# *** IR Dump After Insert XRay ops ***:
# *** IR Dump After Implement the 'patchable-function' attribute ***:
# *** IR Dump After X86 Execution Dependency Fix ***:
# *** IR Dump After BreakFalseDeps ***:
# *** IR Dump After X86 Byte/Word Instruction Fixup ***:
# *** IR Dump After X86 LEA Fixup ***:
# *** IR Dump After Compressing EVEX instrs to VEX encoding when possible ***:
# *** IR Dump After Contiguously Lay Out Funclets ***:
# *** IR Dump After StackMap Liveness Analysis ***:
# *** IR Dump After Live DEBUG_VALUE analysis ***:
# *** IR Dump After Check CFA info and insert CFI instructions if needed ***:
$ llc -debug-pass=Structure foo.ll
...
Target Library Information
Target Pass Configuration
Machine Module Information
Target Transform Information
Type-Based Alias Analysis
Scoped NoAlias Alias Analysis
Assumption Cache Tracker
Profile summary info
Create Garbage Collector Module Metadata
Machine Branch Probability Analysis
ModulePass Manager
Pre-ISel Intrinsic Lowering
FunctionPass Manager
Expand Atomic instructions
Dominator Tree Construction
Basic Alias Analysis (stateless AA impl)
Module Verifier
Natural Loop Information
Canonicalize natural loops
Scalar Evolution Analysis
Loop Pass Manager
Induction Variable Users
Loop Strength Reduction
Basic Alias Analysis (stateless AA impl)
Function Alias Analysis Results
Merge contiguous icmps into a memcmp
Natural Loop Information
Lazy Branch Probability Analysis
Lazy Block Frequency Analysis
Expand memcmp() to load/stores
Lower Garbage Collection Instructions
Shadow Stack GC Lowering
Lower constant intrinsics
Remove unreachable blocks from the CFG
Dominator Tree Construction
Natural Loop Information
Branch Probability Analysis
Block Frequency Analysis
Constant Hoisting
Partially inline calls to library functions
Instrument function entry/exit with calls to e.g. mcount() (post inlining)
Scalarize Masked Memory Intrinsics
Expand reduction intrinsics
Dominator Tree Construction
Interleaved Access Pass
Expand indirectbr instructions
Dominator Tree Construction
Natural Loop Information
CodeGen Prepare
Rewrite Symbols
FunctionPass Manager
Dominator Tree Construction
Exception handling preparation
Safe Stack instrumentation pass
Insert stack protectors
Module Verifier
Dominator Tree Construction
Basic Alias Analysis (stateless AA impl)
Function Alias Analysis Results
Natural Loop Information
Branch Probability Analysis
Lazy Branch Probability Analysis
Lazy Block Frequency Analysis
X86 DAG->DAG Instruction Selection
MachineDominator Tree Construction
Local Dynamic TLS Access Clean-up
X86 PIC Global Base Reg Initialization
Finalize ISel and expand pseudo-instructions
X86 Domain Reassignment Pass
Lazy Machine Block Frequency Analysis
Early Tail Duplication
Optimize machine instruction PHIs
Slot index numbering
Merge disjoint stack slots
Local Stack Slot Allocation
Remove dead machine instructions
MachineDominator Tree Construction
Machine Natural Loop Construction
Machine Trace Metrics
Early If-Conversion
Lazy Machine Block Frequency Analysis
Machine InstCombiner
X86 cmov Conversion
MachineDominator Tree Construction
Machine Natural Loop Construction
Early Machine Loop Invariant Code Motion
MachineDominator Tree Construction
Machine Block Frequency Analysis
Machine Common Subexpression Elimination
MachinePostDominator Tree Construction
Machine code sinking
Peephole Optimizations
Remove dead machine instructions
Live Range Shrink
X86 Fixup SetCC
Lazy Machine Block Frequency Analysis
X86 LEA Optimize
X86 Optimize Call Frame
X86 Avoid Store Forwarding Blocks
X86 speculative load hardening
MachineDominator Tree Construction
X86 EFLAGS copy lowering
X86 WinAlloca Expander
Detect Dead Lanes
Process Implicit Definitions
Remove unreachable machine basic blocks
Live Variable Analysis
MachineDominator Tree Construction
Machine Natural Loop Construction
Eliminate PHI nodes for register allocation
Two-Address instruction pass
Slot index numbering
Live Interval Analysis
Simple Register Coalescing
Rename Disconnected Subregister Components
Machine Instruction Scheduler
Machine Block Frequency Analysis
Debug Variable Analysis
Live Stack Slot Analysis
Virtual Register Map
Live Register Matrix
Bundle Machine CFG Edges
Spill Code Placement Analysis
Lazy Machine Block Frequency Analysis
Machine Optimization Remark Emitter
Greedy Register Allocator
Virtual Register Rewriter
Stack Slot Coloring
Machine Copy Propagation Pass
Machine Loop Invariant Code Motion
Bundle Machine CFG Edges
X86 FP Stackifier
PostRA Machine Sink
Machine Block Frequency Analysis
MachineDominator Tree Construction
MachinePostDominator Tree Construction
Lazy Machine Block Frequency Analysis
Machine Optimization Remark Emitter
Shrink Wrapping analysis
Prologue/Epilogue Insertion & Frame Finalization
Control Flow Optimizer
Lazy Machine Block Frequency Analysis
Tail Duplication
Machine Copy Propagation Pass
Post-RA pseudo instruction expansion pass
X86 pseudo instruction expansion pass
MachineDominator Tree Construction
Machine Natural Loop Construction
Post RA top-down list latency scheduler
Analyze Machine Code For Garbage Collection
Machine Block Frequency Analysis
MachinePostDominator Tree Construction
Branch Probability Basic Block Placement
Insert fentry calls
Insert XRay ops
Implement the 'patchable-function' attribute
ReachingDefAnalysis
X86 Execution Dependency Fix
BreakFalseDeps
X86 Indirect Branch Tracking
X86 vzeroupper inserter
MachineDominator Tree Construction
Machine Natural Loop Construction
Lazy Machine Block Frequency Analysis
X86 Byte/Word Instruction Fixup
Lazy Machine Block Frequency Analysis
X86 Atom pad short functions
X86 LEA Fixup
Compressing EVEX instrs to VEX encoding when possible
X86 Discriminate Memory Operands
X86 Insert Cache Prefetches
X86 insert wait instruction
Contiguously Lay Out Funclets
StackMap Liveness Analysis
Live DEBUG_VALUE analysis
X86 Retpoline Thunks
Check CFA info and insert CFI instructions if needed
Lazy Machine Block Frequency Analysis
Machine Optimization Remark Emitter
X86 Assembly Printer
Free MachineFunction
Before:
define dso_local void @_Z3fooPiii(i32* nocapture %a, i32 %x, i32 %y) local_unnamed_addr #0 {
entry:
%add = add nsw i32 %y, %x
%broadcast.splatinsert5 = insertelement <4 x i32> undef, i32 %add, i32 0
...
After:
# *** IR Dump After Finalize ISel and expand pseudo-instructions ***:
bb.0.entry:
liveins: $rdi, $esi, $edx
%2:gr32 = COPY $edx
%1:gr32 = COPY $esi
%0:gr64 = COPY $rdi
%3:gr32 = nsw ADD32rr %2:gr32(tied-def 0), %1:gr32, implicit-def dead $eflags
%4:vr128 = MOVDI2PDIrr killed %3:gr32
%5:vr128 = PSHUFDri killed %4:vr128, 0
...
0B bb.0.entry:
liveins: $edx, $esi, $rdi
80B renamable $esi = nsw ADD32rr killed renamable $esi(tied-def 0), killed renamable $edx, implicit-def dead $eflags
96B renamable $xmm0 = MOVDI2PDIrr killed renamable $esi
112B renamable $xmm0 = PSHUFDri killed renamable $xmm0, 0
#include <llvm/Support/raw_ostream.h>
llvm::outs() << object;
llvm::errs() << object;
object->print(llvm::errs());
object->dump();
#include <llvm/Support/Debug.h>
#define DEBUG_TYPE "foo"
LLVM_DEBUG(llvm::dbgs() << object);
// llvm/lib/Transforms/Scalar/LICM.cpp
// ...
#include <utility>
using namespace llvm;
#define DEBUG_TYPE "licm"
STATISTIC(NumCreatedBlocks, "Number of blocks created");
STATISTIC(NumClonedBranches, "Number of branches cloned");
// ...
$ opt -licm -debug-only=licm foo.ll -S -o -
LICM: Using MemorySSA.
LICM sinking instruction: %arrayidx = getelementptr inbounds i32, i32* %5, i64 %6
LICM hoisting to entry: %3 = load i32, i32* %x.addr, align 4, !tbaa !6
LICM hoisting to entry: %4 = load i32, i32* %y.addr, align 4, !tbaa !6
LICM hoisting to entry: %add = add nsw i32 %1, %2
LICM hoisting to entry: %5 = load i32*, i32** %a.addr, align 8, !tbaa !2
LICM: Promoting value stored to in loop: %i = alloca i64, align 8
; ModuleID = 'foo.ll'
...
OptimizationRemark objects are keyed off the same DEBUG_TYPE.
$ clang++ -O2 -Rpass=licm -c foo.cpp
foo.cpp:4:5: remark: sinking getelementptr [-Rpass=licm]
a[i] = x + y;
^
foo.cpp:4:14: remark: hoisting add [-Rpass=licm]
a[i] = x + y;
^
foo.cpp:4:5: remark: sinking getelementptr [-Rpass=licm]
a[i] = x + y;
^
-print-after-all
-print-after=
-print-before-all
-print-before=
-stop-after=
-print-module-scope
-debug-pass=Arguments
You can pass these to clang by prefixing each one with -mllvm.
While opt passes generally can be run in differing order, llc passes that operate on MIR generally cannot.
-print-before= is your friend for MIR tests.
$ cmake -DCMAKE_BUILD_TYPE=debug ...
$ ninja
Most objects have a dump() method.
(lldb) p object.dump()
Personally, I find relinking a Release build with printf's faster. ¯\_(ツ)_/¯
Template-like language for generating C++ code that is fed into the build of LLVM itself.
.td files become .inc or .def files.
$ clang-tblgen clang/include/clang/Basic/Diagnostic.td \
-I clang/include/clang/Basic --gen-clang-diags-defs
$ wc -l clang/include/clang/Basic/Diagnostic.td
152 clang/include/clang/Basic/Diagnostic.td
$ clang-tblgen clang/include/clang/Basic/Diagnostic.td \
-I clang/include/clang/Basic --gen-clang-diags-defs | wc -l
5380
Q: Which backend to run?
A: Check CMakeLists.txt in dir of .td.
ninja check-all to run all tests.
llvm-lit -vv <test> to better understand failures.
Break up tests with multiple RUN lines to understand regressions.
-Weverything.
grep for the "generic" part of the warning.
grep for the def from the .td file.
During development of -fpatchable-function-entry=M,N, Linaro's TCWG reported: "There are 3 files in linux that assert fail on the Implement the 'patchable-function' attribute."
llvm-project: 67c608a9695496cfc9d3fdf9d0b12b554ac6b4df
linux: ccaaaf6fe5a5e1fffca5cca0f3fc4ec84d7ae752
$ ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- make CC=clang \
-j71 allyesconfig
$ ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- make CC=clang \
-j71 mm/kasan/quarantine.o
$ ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- make CC=clang \
-j71 mm/kasan/quarantine.o V=1
Also printed in the stack trace.
$ clang -Wp,-MD,mm/kasan/.quarantine.o.d -nostdinc -isystem /android0/llvm-project/llvm/build/lib/clang/11.0.0/include -I./arch/arm64/include -I./arch/arm64/include/generated -I./include -I./arch/arm64/include/uapi -I./arch/arm64/include/generated/uapi -I./include/uapi -I./include/generated/uapi -include ./include/linux/kconfig.h -include ./include/linux/compiler_types.h -D__KERNEL__ -mlittle-endian -DCC_USING_PATCHABLE_FUNCTION_ENTRY -DKASAN_SHADOW_SCALE_SHIFT=3 -Qunused-arguments -Wall -Wundef -Werror=strict-prototypes -Wno-trigraphs -fno-strict-aliasing -fno-common -fshort-wchar -fno-PIE -Werror=implicit-function-declaration -Werror=implicit-int -Wno-format-security -std=gnu89 --target=aarch64-linux-gnu --prefix=/usr/bin/ --gcc-toolchain=/usr -no-integrated-as -Werror=unknown-warning-option -mgeneral-regs-only -DCONFIG_CC_HAS_K_CONSTRAINT=1 -fno-asynchronous-unwind-tables -DKASAN_SHADOW_SCALE_SHIFT=3 -fno-delete-null-pointer-checks -Wno-address-of-packed-member -O2 -Wframe-larger-than=2048 -fstack-protector-strong -Wno-format-invalid-specifier -Wno-gnu -Wno-tautological-compare -mno-global-merge -Wno-unused-const-variable -fno-omit-frame-pointer -fno-optimize-sibling-calls -ftrivial-auto-var-init=pattern -fpatchable-function-entry=2 -Wdeclaration-after-statement -Wvla -Wno-pointer-sign -fno-strict-overflow -fno-merge-all-constants -fno-stack-check -Werror=date-time -Werror=incompatible-pointer-types -fmacro-prefix-map=./= -Wno-initializer-overrides -Wno-format -Wno-sign-compare -Wno-format-zero-length -fno-builtin -DKBUILD_MODFILE="mm/kasan/quarantine" -DKBUILD_BASENAME="quarantine" -DKBUILD_MODNAME="quarantine" -c -o mm/kasan/quarantine.o mm/kasan/quarantine.c
#!/bin/bash
OUT=$(<compiler invocation> 2>&1)
echo $OUT | grep "PatchableFunction::runOnMachineFunction"
-I flags generally are relative.
Capture stderr along with stdout using 2>&1.
Replace -c -o foo.o with -E -o foo.i in reproducer script to produce a preprocessed .i file.
Then replace -E -o foo.i foo.c with -c foo.i in reproducer script.
$ wc -l mm/kasan/quarantine.c
329 mm/kasan/quarantine.c
$ wc -l quarantine.i
53444 quarantine.i
$ creduce --n 71 ./repo2.sh quarantine.i
$ wc -l quarantine.i
5 quarantine.i
a() {
int b;
while (b);
}
Take a look at cvise for improved concurrency!
# From:
$ clang -Wp,-MD,mm/kasan/.quarantine.o.d -nostdinc -isystem /android0/llvm-project/llvm/build/lib/clang/11.0.0/include -I./arch/arm64/include -I./arch/arm64/include/generated -I./include -I./arch/arm64/include/uapi -I./arch/arm64/include/generated/uapi -I./include/uapi -I./include/generated/uapi -include ./include/linux/kconfig.h -include ./include/linux/compiler_types.h -D__KERNEL__ -mlittle-endian -DCC_USING_PATCHABLE_FUNCTION_ENTRY -DKASAN_SHADOW_SCALE_SHIFT=3 -Qunused-arguments -Wall -Wundef -Werror=strict-prototypes -Wno-trigraphs -fno-strict-aliasing -fno-common -fshort-wchar -fno-PIE -Werror=implicit-function-declaration -Werror=implicit-int -Wno-format-security -std=gnu89 --target=aarch64-linux-gnu --prefix=/usr/bin/ --gcc-toolchain=/usr -no-integrated-as -Werror=unknown-warning-option -mgeneral-regs-only -DCONFIG_CC_HAS_K_CONSTRAINT=1 -fno-asynchronous-unwind-tables -DKASAN_SHADOW_SCALE_SHIFT=3 -fno-delete-null-pointer-checks -Wno-address-of-packed-member -O2 -Wframe-larger-than=2048 -fstack-protector-strong -Wno-format-invalid-specifier -Wno-gnu -Wno-tautological-compare -mno-global-merge -Wno-unused-const-variable -fno-omit-frame-pointer -fno-optimize-sibling-calls -ftrivial-auto-var-init=pattern -fpatchable-function-entry=2 -Wdeclaration-after-statement -Wvla -Wno-pointer-sign -fno-strict-overflow -fno-merge-all-constants -fno-stack-check -Werror=date-time -Werror=incompatible-pointer-types -fmacro-prefix-map=./= -Wno-initializer-overrides -Wno-format -Wno-sign-compare -Wno-format-zero-length -fno-builtin -DKBUILD_MODFILE="mm/kasan/quarantine" -DKBUILD_BASENAME="quarantine" -DKBUILD_MODNAME="quarantine" -c quarantine.i
# To:
$ clang -O2 -ftrivial-auto-var-init=pattern -fpatchable-function-entry=2 -c quarantine.i
$ clang -O2 -ftrivial-auto-var-init=pattern \
-fpatchable-function-entry=2 \
-S -emit-llvm quarantine.i
$ llc -O2 quarantine.ll
# crashes in same frame
$ wc -l quarantine.ll
20 quarantine.ll
$ llvm-extract --func=a -o a.ll quarantine.ll
Won't be able to run creduce once converted to LLVM IR.
$ llvm-reduce --test repo4.sh quarantine.ll
$ wc -l reduced.ll
15 reduced.ll
; ModuleID = 'quarantine.ll'
source_filename = "quarantine.i"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: norecurse noreturn nounwind readnone uwtable
define dso_local i32 @a() local_unnamed_addr #0 {
entry:
ret void
}
attributes #0 = { norecurse noreturn nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "patchable-function-entry"="2" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"wchar_size", i32 4}
Prefer llvm-reduce to bugpoint, though my experience has been that llvm-reduce crashes often for larger inputs.
$ bugpoint -compile-custom -compile-command=./repro5.sh quarantine.ll
$ llvm-dis bugpoint-reduced-simplified.bc
; ModuleID = 'bugpoint-reduced-simplified.bc'
source_filename = "quarantine.i"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
define dso_local void @a() local_unnamed_addr #0 {
entry:
unreachable
}
attributes #0 = { "patchable-function-entry"="2" }
!llvm.ident = !{!0}
!0 = !{!"Nick Desaulniers clang version 11.0.0 (git@github.com:llvm/llvm-project.git 67c608a9695496cfc9d3fdf9d0b12b554ac6b4df)"}
This can be cut down further manually; verify with opt -verify. It is nice to have minimal test cases.
The master has failed more times than the beginner has even tried.