I want to use more than one YMM register to speed up copying. Here is a snippet of my code:
__m256 ymm[2];
ymm[0] = _mm256_load_ps(_src1);
ymm[1] = _mm256_load_ps(_src2);
_mm256_store_ps(_dst1, ymm[0]);
_mm256_store_ps(_dst2, ymm[1]);
However, the disassembly in Visual Studio shows that only the ymm0 register is used. I actually want to use more registers, such as ymm1 and ymm2. Is there any way to specify this, or to give the compiler a hint?
__m256 ymm[2];
ymm[0] = _mm256_load_ps(_src1);
00007FF6E417EFA9 mov rax,qword ptr [rbp+10h]
00007FF6E417EFAD vmovups ymm0,ymmword ptr [rax]
00007FF6E417EFB1 vmovups ymmword ptr [rbp+0A0h],ymm0
00007FF6E417EFB9 mov eax,20h
00007FF6E417EFBE imul rax,rax,0
00007FF6E417EFC2 vmovups ymm0,ymmword ptr [rbp+0A0h]
00007FF6E417EFCA vmovups ymmword ptr [rbp+rax+40h],ymm0
ymm[1] = _mm256_load_ps(_src2);
00007FF6E417EFD0 mov rax,qword ptr [rbp+18h]
00007FF6E417EFD4 vmovups ymm0,ymmword ptr [rax]
00007FF6E417EFD8 vmovups ymmword ptr [rbp+0C0h],ymm0
00007FF6E417EFE0 mov eax,20h
00007FF6E417EFE5 imul rax,rax,1
00007FF6E417EFE9 vmovups ymm0,ymmword ptr [rbp+0C0h]
00007FF6E417EFF1 vmovups ymmword ptr [rbp+rax+40h],ymm0
_mm256_store_ps(_dst1, ymm[0]);
00007FF6E417EFF7 mov eax,20h
00007FF6E417EFFC imul rax,rax,0
00007FF6E417F000 mov rcx,qword ptr [rbp+20h]
00007FF6E417F004 vmovups ymm0,ymmword ptr [rbp+rax+40h]
00007FF6E417F00A vmovups ymmword ptr [rcx],ymm0
_mm256_store_ps(_dst2, ymm[1]);
00007FF6E417F00E mov eax,20h
00007FF6E417F013 imul rax,rax,1
00007FF6E417F017 mov rcx,qword ptr [rbp+28h]
00007FF6E417F01B vmovups ymm0,ymmword ptr [rbp+rax+40h]
00007FF6E417F021 vmovups ymmword ptr [rcx],ymm0