I am currently working on optimizing a piece of code of math operations, where it loops over a pointer storage and saves the result inplace.
I noticed that at every assignment the compiler emit an memory-store instruction, like this (the vmovaps
to be noticed):
114 [1] top_data_c[pc] += w1 * bottom_data_hwc[o1 + pc];
0x55555558bf70 <+ 2624> c4 c1 78 10 0c 02 vmovups (%r10,%rax,1),%xmm1
0x55555558bf76 <+ 2630> 48 83 c1 01 add $0x1,%rcx
0x55555558bf7a <+ 2634> c4 c3 75 18 4c 02 10 01 vinsertf128 $0x1,0x10(%r10,%rax,1),%ymm1,%ymm1
0x55555558bf82 <+ 2642> c4 c2 25 a8 0c 04 vfmadd213ps (%r12,%rax,1),%ymm11,%ymm1
0x55555558bf88 <+ 2648> c4 c1 7c 29 0c 04 vmovaps %ymm1,(%r12,%rax,1)
115 [1] top_data_c[pc] += w2 * bottom_data_hwc[o2 + pc];
0x55555558bf8e <+ 2654> c4 c1 78 10 04 01 vmovups (%r9,%rax,1),%xmm0
0x55555558bf94 <+ 2660> c4 c3 7d 18 44 01 10 01 vinsertf128 $0x1,0x10(%r9,%rax,1),%ymm0,%ymm0
0x55555558bf9c <+ 2668> c4 c2 7d b8 ca vfmadd231ps %ymm10,%ymm0,%ymm1
0x55555558bfa1 <+ 2673> c4 c1 7c 29 0c 04 vmovaps %ymm1,(%r12,%rax,1)
116 [1] top_data_c[pc] += w3 * bottom_data_hwc[o3 + pc];
0x55555558bfa7 <+ 2679> c4 c1 78 10 04 00 vmovups (%r8,%rax,1),%xmm0
0x55555558bfad <+ 2685> c4 c3 7d 18 44 00 10 01 vinsertf128 $0x1,0x10(%r8,%rax,1),%ymm0,%ymm0
0x55555558bfb5 <+ 2693> c4 c2 75 98 c1 vfmadd132ps %ymm9,%ymm1,%ymm0
0x55555558bfba <+ 2698> c4 c1 7c 29 04 04 vmovaps %ymm0,(%r12,%rax,1)
117 [1] top_data_c[pc] += w4 * bottom_data_hwc[o4 + pc];
0x55555558bfc0 <+ 2704> c5 f8 10 0c 07 vmovups (%rdi,%rax,1),%xmm1
0x55555558bfc5 <+ 2709> c4 e3 75 18 4c 07 10 01 vinsertf128 $0x1,0x10(%rdi,%rax,1),%ymm1,%ymm1
0x55555558bfcd <+ 2717> c4 c2 75 b8 c0 vfmadd231ps %ymm8,%ymm1,%ymm0
0x55555558bfd2 <+ 2722> c4 c1 7c 29 04 04 vmovaps %ymm0,(%r12,%rax,1)
0x55555558bfd8 <+ 2728> 48 83 c0 20 add $0x20,%rax
0x55555558bfdc <+ 2732> 48 39 4d c0 cmp %rcx,-0x40(%rbp)
0x55555558bfe0 <+ 2736> 77 8e ja 0x55555558bf70
however, when I changed the pointer into a local "stack array" variable, i.e. T top_data_c[1024]
, the store instruction appears only at end of the loop:
114 [1] top_data_c[pc] += w1 * bottom_data_hwc[o1 + pc];
0x55555558bbe0 <+ 1712> c5 f8 10 0c 03 vmovups (%rbx,%rax,1),%xmm1
0x55555558bbe5 <+ 1717> 48 83 c1 01 add $0x1,%rcx
0x55555558bbe9 <+ 1721> c4 e3 75 18 4c 03 10 01 vinsertf128 $0x1,0x10(%rbx,%rax,1),%ymm1,%ymm1
0x55555558bbf1 <+ 1729> c4 c2 25 a8 0c 04 vfmadd213ps (%r12,%rax,1),%ymm11,%ymm1
0x55555558bbf7 <+ 1735> c5 fc 28 c1 vmovaps %ymm1,%ymm0
115 [1] top_data_c[pc] += w2 * bottom_data_hwc[o2 + pc];
0x55555558bbfb <+ 1739> c4 c1 78 10 0c 03 vmovups (%r11,%rax,1),%xmm1
0x55555558bc01 <+ 1745> c4 c3 75 18 4c 03 10 01 vinsertf128 $0x1,0x10(%r11,%rax,1),%ymm1,%ymm1
0x55555558bc09 <+ 1753> c4 c2 7d 98 ca vfmadd132ps %ymm10,%ymm0,%ymm1
116 [1] top_data_c[pc] += w3 * bottom_data_hwc[o3 + pc];
0x55555558bc0e <+ 1758> c4 c1 78 10 04 02 vmovups (%r10,%rax,1),%xmm0
0x55555558bc14 <+ 1764> c4 c3 7d 18 44 02 10 01 vinsertf128 $0x1,0x10(%r10,%rax,1),%ymm0,%ymm0
0x55555558bc1c <+ 1772> c4 e2 35 b8 c8 vfmadd231ps %ymm0,%ymm9,%ymm1
117 [1] top_data_c[pc] += w4 * bottom_data_hwc[o4 + pc];
0x55555558bc21 <+ 1777> c4 c1 78 10 04 01 vmovups (%r9,%rax,1),%xmm0
0x55555558bc27 <+ 1783> c4 c3 7d 18 44 01 10 01 vinsertf128 $0x1,0x10(%r9,%rax,1),%ymm0,%ymm0
0x55555558bc2f <+ 1791> c4 c2 75 98 c0 vfmadd132ps %ymm8,%ymm1,%ymm0
0x55555558bc34 <+ 1796> c4 c1 7c 29 04 04 vmovaps %ymm0,(%r12,%rax,1)
0x55555558bc3a <+ 1802> 48 83 c0 20 add $0x20,%rax
0x55555558bc3e <+ 1806> 48 3b 8d c8 fb ff ff cmp -0x438(%rbp),%rcx
0x55555558bc45 <+ 1813> 72 99 jb 0x55555558bbe0
it comes up that the compiler keeps the pointer store action away from optimization due to its thread unsafeness.
Declaring stack array or temporary variable copying from and back looks so dirty in such implementation, is there any way to make such pointer storage thread-safe to compiler's sense? Of course such computation is totally thread-safe (it works quite similar to GPU).