0

I want to use more than one ymm register to speed up copying. Here is a snippet of my code.

        __m256 ymm[2];
        ymm[0] = _mm256_load_ps(_src1);
        ymm[1] = _mm256_load_ps(_src2);
        _mm256_store_ps(_dst1, ymm[0]);
        _mm256_store_ps(_dst2, ymm[1]);

However, the disassembly in Visual Studio shows that only the ymm0 register is used. I want the compiler to use more registers, such as ymm1 and ymm2. Is there any way to specify this, or to give the compiler a hint?

        __m256 ymm[2];
        ymm[0] = _mm256_load_ps(_src1);
00007FF6E417EFA9  mov         rax,qword ptr [rbp+10h]  
00007FF6E417EFAD  vmovups     ymm0,ymmword ptr [rax]  
00007FF6E417EFB1  vmovups     ymmword ptr [rbp+0A0h],ymm0  
00007FF6E417EFB9  mov         eax,20h  
00007FF6E417EFBE  imul        rax,rax,0  
00007FF6E417EFC2  vmovups     ymm0,ymmword ptr [rbp+0A0h]  
00007FF6E417EFCA  vmovups     ymmword ptr [rbp+rax+40h],ymm0  
        ymm[1] = _mm256_load_ps(_src2);
00007FF6E417EFD0  mov         rax,qword ptr [rbp+18h]  
00007FF6E417EFD4  vmovups     ymm0,ymmword ptr [rax]  
00007FF6E417EFD8  vmovups     ymmword ptr [rbp+0C0h],ymm0  
00007FF6E417EFE0  mov         eax,20h  
00007FF6E417EFE5  imul        rax,rax,1  
00007FF6E417EFE9  vmovups     ymm0,ymmword ptr [rbp+0C0h]  
00007FF6E417EFF1  vmovups     ymmword ptr [rbp+rax+40h],ymm0  
        _mm256_store_ps(_dst1, ymm[0]);
00007FF6E417EFF7  mov         eax,20h  
00007FF6E417EFFC  imul        rax,rax,0  
00007FF6E417F000  mov         rcx,qword ptr [rbp+20h]  
00007FF6E417F004  vmovups     ymm0,ymmword ptr [rbp+rax+40h]  
00007FF6E417F00A  vmovups     ymmword ptr [rcx],ymm0  
        _mm256_store_ps(_dst2, ymm[1]);
00007FF6E417F00E  mov         eax,20h  
00007FF6E417F013  imul        rax,rax,1  
00007FF6E417F017  mov         rcx,qword ptr [rbp+28h]  
00007FF6E417F01B  vmovups     ymm0,ymmword ptr [rbp+rax+40h]  
00007FF6E417F021  vmovups     ymmword ptr [rcx],ymm0  
Peter Cordes
  • 328,167
  • 45
  • 605
  • 847
Gordon
  • 396
  • 2
  • 15
  • 1
    Enable optimization if you want the asm to not be garbage. The unoptimized build is using an actual array in memory for `__m256 ymm[2];`, not registers. Also, obviously no sane compiler would ever emit `imul rax,rax,0` with optimization enabled. (And most wouldn't emit that useless way to zero RAX even in debug mode, but MSVC is "special".) – Peter Cordes Feb 01 '21 at 02:40
  • 1
    And BTW, most compilers can inline `memcpy` efficiently for small fixed-size copies. There is maybe sometimes something to gain from writing your own loop, for a small variable-sized copy, especially if the min copy size is known to be >> 1 byte, e.g. at least 32 bytes is very handy. – Peter Cordes Feb 01 '21 at 02:43

0 Answers