0

So, I'm trying to recreate program from this presentation, but it only contains kernel code. I tried to write the "main" code myself, but it seems like cudaMemcpy is not working. Here is my code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <math.h>
#include <cstdlib>
#include <iostream>

#define N 1024
#define blockSize 256
#define numberOfBlocks N/(blockSize*2)

__global__ void reduce(int *g_idata, int *g_odata){
    extern __shared__ int sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(blockDim.x*2) + tid;
    sdata[tid] = g_idata[i]*g_idata[i] + g_idata[i+blockDim.x]*g_idata[i+blockDim.x];
    __syncthreads();
    if (blockSize >= 512){
        if (tid < 256){
            sdata[tid] += sdata[tid + 256];
        }
        __syncthreads();
    }
    if (blockSize >= 256){
        if (tid < 128){
        sdata[tid] += sdata[tid + 128];
        }
        __syncthreads();
    }
    if (blockSize >= 128){
        if (tid < 64){
            sdata[tid] += sdata[tid + 64];
        }
        __syncthreads();
    }
    if (tid < 32) {
        if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
        if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
        if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
        if (blockSize >= 8) sdata[tid] += sdata[tid + 4];
        if (blockSize >= 4) sdata[tid] += sdata[tid + 2];
        if (blockSize >= 2) sdata[tid] += sdata[tid + 1];
    }
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

int main()
{
    int A[N];
    for (int i = 0; i < N; i++){
        A[i] = 1;
    }
    int* devA;
    cudaMalloc((int **)&devA, sizeof(int) * N);
    cudaMemcpy(devA, &A, sizeof(int) * N, cudaMemcpyHostToDevice);
    int output[numberOfBlocks];
    for (int i = 0; i < numberOfBlocks; i++){
        output[i] = 8;
    }
    int* devOutput;
    cudaMalloc((int **)&devOutput, sizeof(int) * numberOfBlocks);
    cudaMemcpy(devOutput, &output, sizeof(int) * numberOfBlocks, cudaMemcpyHostToDevice);
    reduce<<<numberOfBlocks, blockSize>>>(devA, devOutput);
    cudaMemcpy(&output, devOutput, sizeof(int) * numberOfBlocks, cudaMemcpyDeviceToHost);
    int sum=0;
    for (int j = 0; j < numberOfBlocks; j++){
        sum+=output[j];
    }
    printf("output 1=%d\n",output[0]);
    printf("output 2=%d\n",output[1]);
    printf("sum=%d\n",sum);
    printf("sum=%f\n",sqrt(float(sum)));
    system("pause");
}

And this is what I get:

output[0]=8
output[1]=8
sum=16
sqrt ofsum=4.0000

So it seems clear to me that cudaMemcpy does not update "output" values and I have no idea why. So if you have, then please share.

faze
  • 1

0 Answers0