2022-08-10

GCC Optimization Levels

Optimization levels are really weird.

Using the function and compiling for an attiny85, ie --mmcu=attiny85.

int main(void) {
    int i = 0;
    while(i < 10000) {
        i++;
    }
}

We can disassemble with avr-objdump -sS -m avr25 led. As a disclaimer, I’m not pretending I know what any of this means.

Using -Os ie optimize for code side, it looks like the entire function is optimized away!

> avr-objdump -sS -m avr25 led
...
00000030 <main>:
{
    int i = 0;
    while(i < 10000) {
        i++;
    }
}
  30:	90 e0       	ldi	r25, 0x00	; 0
  32:	80 e0       	ldi	r24, 0x00	; 0
  34:	08 95       	ret

And by comparison, using -O0, fastest compile time and the gcc default:

00000030 <main>:
// #include "wait.h"

int main(void) {
  30:	cf 93       	push	r28
  32:	df 93       	push	r29
  34:	00 d0       	rcall	.+0      	; 0x36 <L0^A>

00000036 <L0^A>:
  36:	cd b7       	in	r28, 0x3d	; 61
  38:	de b7       	in	r29, 0x3e	; 62

0000003a <.Loc.1>:
    int i = 0;
  3a:	1a 82       	std	Y+2, r1	; 0x02
  3c:	19 82       	std	Y+1, r1	; 0x01

0000003e <.Loc.2>:
    while(i < 10000) {
  3e:	05 c0       	rjmp	.+10     	; 0x4a <.L2>

00000040 <.L3>:
        i++;
  40:	89 81       	ldd	r24, Y+1	; 0x01
  42:	9a 81       	ldd	r25, Y+2	; 0x02
  44:	01 96       	adiw	r24, 0x01	; 1
  46:	9a 83       	std	Y+2, r25	; 0x02
  48:	89 83       	std	Y+1, r24	; 0x01

0000004a <.L2>:
    while(i < 10000) {
  4a:	89 81       	ldd	r24, Y+1	; 0x01
  4c:	9a 81       	ldd	r25, Y+2	; 0x02
  4e:	80 31       	cpi	r24, 0x10	; 16
  50:	97 42       	sbci	r25, 0x27	; 39
  52:	b4 f3       	brlt	.-20     	; 0x40 <.L3>
  54:	80 e0       	ldi	r24, 0x00	; 0
  56:	90 e0       	ldi	r25, 0x00	; 0

00000058 <.Loc.5>:
    }
}
  58:	0f 90       	pop	r0
  5a:	0f 90       	pop	r0
  5c:	df 91       	pop	r29
  5e:	cf 91       	pop	r28
  60:	08 95       	ret
00000030 <main>:
// #include "wait.h"

int main(void) {
  30:	cf 93       	push	r28
  32:	df 93       	push	r29
  34:	00 d0       	rcall	.+0      	; 0x36 <L0^A>

00000036 <L0^A>:
  36:	cd b7       	in	r28, 0x3d	; 61
  38:	de b7       	in	r29, 0x3e	; 62

0000003a <.Loc.1>:
    int i = 0;
  3a:	1a 82       	std	Y+2, r1	; 0x02
  3c:	19 82       	std	Y+1, r1	; 0x01

0000003e <.Loc.2>:
    while(i < 10000) {
  3e:	05 c0       	rjmp	.+10     	; 0x4a <.L2>

00000040 <.L3>:
        i++;
  40:	89 81       	ldd	r24, Y+1	; 0x01
  42:	9a 81       	ldd	r25, Y+2	; 0x02
  44:	01 96       	adiw	r24, 0x01	; 1
  46:	9a 83       	std	Y+2, r25	; 0x02
  48:	89 83       	std	Y+1, r24	; 0x01

0000004a <.L2>:
    while(i < 10000) {
  4a:	89 81       	ldd	r24, Y+1	; 0x01
  4c:	9a 81       	ldd	r25, Y+2	; 0x02
  4e:	80 31       	cpi	r24, 0x10	; 16
  50:	97 42       	sbci	r25, 0x27	; 39
  52:	b4 f3       	brlt	.-20     	; 0x40 <.L3>
  54:	80 e0       	ldi	r24, 0x00	; 0
  56:	90 e0       	ldi	r25, 0x00	; 0

00000058 <.Loc.5>:
    }
}
  58:	0f 90       	pop	r0
  5a:	0f 90       	pop	r0
  5c:	df 91       	pop	r29
  5e:	cf 91       	pop	r28
  60:	08 95       	ret

With -O1:

00000030 <main>:
// #include "wait.h"

int main(void) {
  30:	80 e1       	ldi	r24, 0x10	; 16
  32:	97 e2       	ldi	r25, 0x27	; 39

00000034 <.L2>:
    int i = 0;
    while(i < 10000) {
  34:	01 97       	sbiw	r24, 0x01	; 1

00000036 <.LVL2>:
  36:	f1 f7       	brne	.-4      	; 0x34 <.L2>

00000038 <.Loc.8>:
        i++;
    }
}
  38:	90 e0       	ldi	r25, 0x00	; 0
  3a:	80 e0       	ldi	r24, 0x00	; 0

0000003c <.LVL3>:
  3c:	08 95       	ret

And finally with -O2 and -O3 we are back to optimizing everything away:

00000030 <main>:
int main(void) {
    int i = 0;
    while(i < 10000) {
        i++;
    }
}
  30:	90 e0       	ldi	r25, 0x00	; 0
  32:	80 e0       	ldi	r24, 0x00	; 0
  34:	08 95       	ret

So I guess the TLDR here is try and test them all?

The case where the entire function is optimized away seems funny to me, what if you hand rolled a sleep with a big loop? Wouldn’t that be grand if it just disappeared?