I am trying to make the code below faster by keeping two of the variables (the ones that get reused) in registers, or at least somewhere closer than the cache. The code takes the three adjacent elements of an array around position idx and adds them together:
void stencil(double * input, double * output){
    unsigned int idx = 1;
    output[0] = input[0] + input[1];            /* left boundary */
    for(; idx < SIZE - 1; idx++){
        output[idx] = input[idx-1] + input[idx] + input[idx+1];
    }
    output[idx] = input[idx-1] + input[idx];    /* right boundary */
}
My implementation looks like this:
void stencil(double * input, double * output){
    unsigned int idx = 0;
    double x, y = 0, z;
    z = input[idx];
    for(; idx < SIZE - 1; idx++){
        x = y;                   /* value loaded two iterations ago */
        y = z;                   /* value loaded in the previous iteration */
        z = input[idx + 1];      /* the single new load per iteration */
        output[idx] = x + y + z;
    }
    output[idx] = y + z;         /* right boundary */
}
The idea is to reuse the values loaded in the previous iterations, so that each array element is read from memory only once instead of three times, and thereby make the program faster.
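For reference, I compare the two versions with a small harness roughly like the one below (just a sketch: the initialization values and the clock_gettime timing are stand-ins, not my exact benchmark; note that at this SIZE each array is about 11 GB, and older glibc needs -lrt for clock_gettime):

#define _POSIX_C_SOURCE 199309L   /* expose clock_gettime under -std=c99 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define SIZE 1400000000u   /* inferred from the constants in the assembly below */

void stencil(double * input, double * output);

int main(void){
    double *input  = malloc(SIZE * sizeof *input);
    double *output = malloc(SIZE * sizeof *output);
    if(input == NULL || output == NULL) return 1;
    for(size_t i = 0; i < SIZE; i++) input[i] = 1.0;   /* dummy data */

    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    stencil(input, output);
    clock_gettime(CLOCK_MONOTONIC, &t1);

    double secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
    printf("%.3f s, output[1] = %f\n", secs, output[1]);   /* keep the result live */
    free(input);
    free(output);
    return 0;
}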
However, the program does not seem to have improved in speed at all. I am using gcc on an AMD Opteron(tm) Processor 6320 CPU and compiling with the following flags: -march=native -O3 -Wall -std=c99.
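That is, assuming the source file is called stencil.c (the file name here is just for illustration), the command is something like:

gcc -march=native -O3 -Wall -std=c99 stencil.c -o stencil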
I tried with and without -march=native; the generated assembly is different, but I cannot get better performance either way. The generated assembly WITHOUT the -march=native flag looks like this:
stencil:
.LFB7:
        .cfi_startproc
        subl    $1, %edx
        movsd   (%rdi), %xmm1
        je      .L4
        movq    %rsi, %rcx
        xorpd   %xmm0, %xmm0
        xorl    %eax, %eax
        jmp     .L3
        .p2align 4,,10
        .p2align 3
.L6:
        movapd  %xmm1, %xmm0
        movapd  %xmm2, %xmm1
.L3:
        addl    $1, %eax
        addsd   %xmm1, %xmm0
        addq    $8, %rcx
        movl    %eax, %r8d
        movsd   (%rdi,%r8,8), %xmm2
        leaq    0(,%r8,8), %r9
        addsd   %xmm2, %xmm0
        movsd   %xmm0, -8(%rcx)
        cmpl    %edx, %eax
        jne     .L6
.L2:
        addsd   %xmm2, %xmm1
        movsd   %xmm1, (%rsi,%r9)
        ret
.L4:
        movapd  %xmm1, %xmm2
        xorl    %r9d, %r9d
        xorpd   %xmm1, %xmm1
        jmp     .L2
And WITH the -march=native flag looks like this:
stencil:
.LFB20:
        .cfi_startproc
        vmovsd  (%rdi), %xmm1
        vxorpd  %xmm0, %xmm0, %xmm0
        leaq    144(%rdi), %rdx
        leaq    136(%rsi), %rax
        xorl    %ecx, %ecx
        .p2align 4,,10
        .p2align 3
.L2:
        vaddsd  %xmm1, %xmm0, %xmm0
        vmovsd  -136(%rdx), %xmm4
        prefetcht0  (%rdx)
        addl    $8, %ecx
        prefetchw   (%rax)
        addq    $64, %rdx
        addq    $64, %rax
        vaddsd  %xmm1, %xmm4, %xmm1
        vaddsd  %xmm4, %xmm0, %xmm0
        vmovsd  %xmm0, -200(%rax)
        vmovsd  -192(%rdx), %xmm3
        vaddsd  %xmm3, %xmm1, %xmm1
        vaddsd  %xmm3, %xmm4, %xmm4
        vmovsd  %xmm1, -192(%rax)
        vmovsd  -184(%rdx), %xmm2
        vaddsd  %xmm2, %xmm4, %xmm4
        vaddsd  %xmm2, %xmm3, %xmm3
        vmovsd  %xmm4, -184(%rax)
        vmovsd  -176(%rdx), %xmm0
        vaddsd  %xmm0, %xmm3, %xmm3
        vaddsd  %xmm0, %xmm2, %xmm2
        vmovsd  %xmm3, -176(%rax)
        vmovsd  -168(%rdx), %xmm1
        vaddsd  %xmm1, %xmm2, %xmm2
        vaddsd  %xmm1, %xmm0, %xmm0
        vmovsd  %xmm2, -168(%rax)
        vmovsd  -160(%rdx), %xmm2
        vaddsd  %xmm2, %xmm0, %xmm0
        vaddsd  %xmm2, %xmm1, %xmm1
        vmovsd  %xmm0, -160(%rax)
        vmovsd  -152(%rdx), %xmm0
        vaddsd  %xmm0, %xmm1, %xmm1
        vaddsd  %xmm0, %xmm2, %xmm2
        vmovsd  %xmm1, -152(%rax)
        vmovsd  -144(%rdx), %xmm1
        vaddsd  %xmm1, %xmm2, %xmm2
        vmovsd  %xmm2, -144(%rax)
        cmpl    $1399999992, %ecx
        jne     .L2
        movabsq $11199999944, %rdx
        movabsq $11199999936, %rcx
        addq    %rdi, %rdx
        addq    %rsi, %rcx
        xorl    %eax, %eax
        jmp     .L3
        .p2align 4,,7
        .p2align 3
.L4:
        vmovaps %xmm2, %xmm1
.L3:
        vaddsd  %xmm0, %xmm1, %xmm0
        vmovsd  (%rdx,%rax), %xmm2
        vaddsd  %xmm2, %xmm0, %xmm0
        vmovsd  %xmm0, (%rcx,%rax)
        addq    $8, %rax
        vmovaps %xmm1, %xmm0
        cmpq    $56, %rax
        jne     .L4
        vaddsd  %xmm2, %xmm1, %xmm1
        movabsq $11199999992, %rax
        vmovsd  %xmm1, (%rsi,%rax)
        ret
Does anyone have a suggestion on how to make GCC keep these variables in registers so that the code gets faster? Or is there any other way to make my code effectively bypass the cache?
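One idea I am wondering about, but have not verified helps here: GCC has to assume that input and output may overlap, so in the original version every store to output can invalidate input values the compiler would otherwise keep in registers. Declaring the pointers restrict (valid in C99) promises they never alias, which in principle gives GCC more freedom to hold values in registers and reorder loads and stores. A sketch of what I mean:

void stencil(double * restrict input, double * restrict output){
    unsigned int idx = 0;
    double x = 0, y = 0, z = input[0];
    for(; idx < SIZE - 1; idx++){
        x = y;                   /* value loaded two iterations ago */
        y = z;                   /* value loaded in the previous iteration */
        z = input[idx + 1];      /* the single new load */
        output[idx] = x + y + z;
    }
    output[idx] = y + z;         /* right boundary */
}

Would that, or something else, get the reloads out of the inner loop?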