-mllvm -debug-pass=Structure
can tell you which pass
operates on which level.
Transform machinery is likely distinct from policy.
llvm/lib/Transform/Utils/
contains machinery for making changes.
llvm/lib/Transform/*/
contains the policy decisions in passes.
Check out these methods to get a sense of how optimization passes are ordered.
PassBuilder::buildPerModuleDefaultPipeline
PassBuilder::buildModuleSimplificationPipeline
PassBuilder::buildModuleOptimizationPipeline
-print-before-all
to
-print-after-all
.
-print-after-all
doesn't print
anything if the pass made no changes.
$ clang -mllvm -print-before-all --target=aarch64-linux-gnu \
-O2 -c /tmp/x.c 2>&1 | grep Dump
...
*** IR Dump Before CoroEarlyPass on [module] ***
*** IR Dump Before LowerExpectIntrinsicPass on foo ***
*** IR Dump Before SimplifyCFGPass on foo ***
*** IR Dump Before SROAPass on foo ***
*** IR Dump Before EarlyCSEPass on foo ***
...
$ !! | wc -l
205
$ clang ... | grep Dump | sort -u | wc -l
168
These slides are non-exhaustive.
Passes generally aren't commutative.
The examples shown here are barely the tip of the iceberg.
goto foo;
foo:
// poof!
struct foo my_foo {
int x, y;
} = {
.x = 42,
.y = 10
};
return my_foo.x;
int x = 42;
int y = 10;
return x;
SROA invokes promoteMemoryToRegister
, ie. the core of
mem2reg.
Tries to match literally identical subexpressions.
int a = b * c + d;
int e = b * c * e;
int bc = b * c;
int a = bc + d;
int e = bc * e;
static int foo(int x) {
return x;
}
void bar (void) { foo(42); }
static int foo(int x) {
return 42;
}
void bar (void) { foo(42); }
static int baz(void) {
return 42;
}
int quux (void) {
return baz();
}
static int baz(void) {
return 42;
}
int quux (void) {
return 42;
}
int foo(int x) {
return 42;
}
int foo(void) {
return 42;
}
y = x + 1;
z = y + 1;
z = x + 2;
if (strlen(x) != 0)
if (*x)
void foo (int);
void bar (void) {
int x = 42;
foo(x);
}
void baz (void) {
bar();
}
void foo (int);
void bar (void) {
int x = 42;
foo(x);
}
void baz (void) {
{
int x = 42;
foo(x);
}
}
if (x)
y = 42;
if (y < 0)
foo();
else
bar();
if (x) {
y = 42;
bar();
goto end;
}
if (y < 0)
foo();
else
bar();
end:;
void baz (void) {
foo();
}
.globl baz
.type baz,@function
foo:
stp x29, x30, [sp, #-16]!
mov x29, sp
bl foo
ldp x29, x30, [sp], #16
ret
.globl baz
.type baz,@function
foo:
b foo
Ensure that loops are in a canonical form ("Loop Simplify Form")
Loops have 1 backedge.
for (...) {
if (x)
continue;
if (y)
continue;
...
}
for (...) {
if (x)
goto end;
if (y)
goto end;
...
end:;
}
if (...) {
int x = *y;
x = foo(x);
*y = x;
} else {
int z = *y;
z = bar(z);
*y = z;
}
int xz = *y;
if (...)
xz = foo(xz);
else
xz = bar(xz);
*y = xz;
Unlike CSE, tries to compute equivalency of subexpressions. Requires SSA form.
int w = 3;
int x = 3;
int y = x + 4;
int z = w + 4;
int w = 3;
int x = w;
int y = w + 4;
int z = y;
goto foo;
expensive();
foo:;
goto foo;
foo:;
LLVM has Aggressive, Bit-tracking, and Global DCE variants.
if (x)
foo(x ? b : c);
if (x)
foo(b);
struct foo { ... } a, b;
memcpy(&a, &b, sizeof a);
a = b;
a[0] = 0;
a[1] = 0;
a[2] = 0;
a[3] = 0;
memset(a, 0, 4 * sizeof a[0]);
{
int x = 42;
foo(x);
...
x = 0;
}
{
int x = 42;
foo(x);
...
}
int x = 4 + 5;
// only written to
static int y;
static int z;
int foo (void) {
z = bar();
return z;
}
// No callers
void baz(void) {}
int x = 9;
// poof! y
void foo (void) {
int z = bar();
return z;
}
// poof! baz
__attribute__((gnu_inline))
extern inline int foo(void) {
return 42;
}
// After inlining
// poof! foo
// -Wextern-initializer
extern int y = 42;
int x (void) {
return y + 1;
}
int y;
int x (void) {
return y + 1;
}
void foo(int x) {
if (x < 1.1f)
bar();
}
void foo(int x) {
if (x < 1)
bar();
}
__builtin_constant_p();
__builtin_object_size( \
foo, type);
// 0 or 1;
// -1, 0, or min/max size
// of foo;
Setup for loop vectorizor.
void foo (int *a) {
// Assume 4x32b SIMD.
// Unaligned loads.
for (int i = 0; i != 5; ++i)
++a[i];
}
void foo (int *a) {
// Assume 4x32b SIMD.
// Unaligned loads.
for (int i = 0; i != 4; ++i)
++a[i];
++a[5];
}
How might this code have to change if the target doesn't support unaligned loads/stores and had a few more iterations?
void foo (int *a) {
for (int i = 0; i != 4; ++i)
++a[i];
}
#include <arm_neon.h>
void foo (int32x4_t *a) {
*a += 1;
}
for (int i = 0; i < 100; ++i)
a[i+1] = a[i] + b[i];
for (int i = 0, tmp = a[i];
i < 100; ++i) {
a[i + 1] = tmp = tmp + b[i];
}
Many more loop passes
Division and remainder can often times be computed together. Looks for matching denominators and tries to move operations closer together.
static const int x =
0xdeadbeef;
static const int y =
0xdeadbeef;
int foo (void) { return x; }
int bar (void) { baz(y); }
static const int xy =
0xdeadbeef;
int foo (void) { return xy; }
int bar (void) { baz(xy); }
__int128 div (__int128 a,
__int128 b) {
return a / b;
}
__int128 div (__int128 a,
__int128 b) {
// super complicated loop
}
Target may be incapable of large div/rem.
Target may need libcall for _Atomic
operation.
// int a [200];
// int b [200];
if (a[10] == b[20] &&
a[11] == b[21]
a[12] == b[22]
a[13] == b[23])
goto foo;
// int a [200];
// int b [200];
if (memcmp(a + 10, b + 20,
4 * sizeof(a[0])))
goto foo;
Take advantage of target specifics.
// int a [200];
// int b [200];
if (memcmp(a + 10, b + 20,
4 * sizeof(a[0])))
goto foo;
// long a [100];
// long b [100];
if (a[10] == b[10] &&
a[11] == b[11])
goto foo;
if (x)
return 0xdeadbeefdeadbeaf;
if (y)
return 0xdeadbeefdeadbeaf;
unsigned long z =
0xdeadbeefdeadbeaf;
if (x)
return z;
if (y)
return z;
Some constants are too large to encode in one instruction. Reuse (when possible) rather than rebuild repeatedly.
for (char i = 0; i < 50; ++i)
for (int i = 0; i < 50; ++i)
We have to be careful to prove that operation would produce similar results on both types. (Overflow could be a problem.) Instruction selection will later do legalization, but this pass is better suited for cyclic code (inter-block).
Requires (multiple) separate talks.
ISEL does do many optimizations, too.
goto *(v ? &&x : &&y);
x:
k = 5;
goto z;
y:
k = 42;
goto z;
z:
foo();
w:
goto *(v ? &&x : &&y);
x:
k = 5;
foo();
goto w;
y:
k = 42;
foo();
goto w;
z: // unreachable!
foo();
w:
There's pre-RA ("early") and post-RA variants.
goto *(v ? &&x : &&y);
x:
k = 5;
foo();
goto w;
y:
k = 42;
foo();
goto w;
w:
x = v ? 5 : 42;
foo();
mov w0, #1
add w1, w1, w0
add w1, w1, #0
Requires multiple separate talks.
Depending on which register allocator is being used, may involve multiple passes.