利用分页锁定主机存储器进行主机与设备之间的0拷贝

    技术2022-05-19  23

    /* * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved. * * Please refer to the NVIDIA end user license agreement (EULA) associated * with this source code for terms and conditions that govern your use of * this software. Any use, reproduction, disclosure, or distribution of * this software and related documentation outside the terms of the EULA * is strictly prohibited. * */

    #include <stdio.h>#include <stdlib.h>#include <cutil_inline.h>#include <cuda.h>

    /* Add two vectors on the GPU */

    __global__ void vectorAddGPU(float *a, float *b, float *c, int N){  int idx = blockIdx.x*blockDim.x + threadIdx.x;  if (idx < N)    c[idx] = a[idx] + b[idx];}

    int main(int argc, char **argv){  int n, nelem, idev, deviceCount;  char *device = NULL;  unsigned int flags;  size_t bytes;  float *a, *b, *c;                      // Pinned memory allocated on the CPU  float *d_a, *d_b, *d_c;                // Device pointers for mapped memory    float errorNorm, refNorm, ref, diff;  cudaDeviceProp deviceProp;

      if(cutCheckCmdLineFlag(argc, (const char **)argv, "help"))  {    printf("Usage:  simpleZeroCopy [OPTION]/n/n");    printf("Options:/n");    printf("  --device=[device #]  Specify the device to be used/n");    cutilExit(argc, argv);      }

      /* Get the device selected by the user or default to 0, and then set it. */

      if(cutGetCmdLineArgumentstr(argc, (const char**)argv, "device", &device))  {    cudaGetDeviceCount(&deviceCount);    idev = atoi(device);    if(idev >= deviceCount || idev < 0)    {      fprintf(stderr, "Invalid device number %d, using default device 0./n",              idev);      idev = 0;    }  }  else  {    idev = 0;  }

      cutilSafeCall(cudaSetDevice(idev));

      /* Verify the selected device supports mapped memory and set the device     flags for mapping host memory. */

      cutilSafeCall(cudaGetDeviceProperties(&deviceProp, idev));

    #if CUDART_VERSION >= 2020  printf("CUDART_VERSION >= 2020/n");  if(!deviceProp.canMapHostMemory)  {    fprintf(stderr, "Device %d cannot map host memory!/n", idev);    printf("PASSED");    cutilExit(argc, argv);  }  cutilSafeCall(cudaSetDeviceFlags(cudaDeviceMapHost));#else  fprintf(stderr, "This CUDART version does not support <cudaDeviceProp.canMapHostMemory> field/n");  printf("PASSED");  cutilExit(argc, argv);#endif

      /* Allocate mapped CPU memory. */

      nelem = 1048576;  bytes = nelem*sizeof(float);#if CUDART_VERSION >= 2020  flags = cudaHostAllocMapped;  cutilSafeCall(cudaHostAlloc((void **)&a, bytes, flags));  cutilSafeCall(cudaHostAlloc((void **)&b, bytes, flags));  cutilSafeCall(cudaHostAlloc((void **)&c, bytes, flags));#endif  /* Initialize the vectors. */

      for(n = 0; n < nelem; n++)  {    a[n] = rand() / (float)RAND_MAX;    b[n] = rand() / (float)RAND_MAX;  }

      /* Get the device pointers for the pinned CPU memory mapped into the GPU     memory space. */

    #if CUDART_VERSION >= 2020  cutilSafeCall(cudaHostGetDevicePointer((void **)&d_a, (void *)a, 0));  cutilSafeCall(cudaHostGetDevicePointer((void **)&d_b, (void *)b, 0));  cutilSafeCall(cudaHostGetDevicePointer((void **)&d_c, (void *)c, 0));#endif

      /* Call the GPU kernel using the device pointers for the mapped memory. */

      printf("Adding vectors using mapped CPU memory.../n");  dim3 block(256);  dim3 grid((unsigned int)ceil(nelem/(float)block.x));  vectorAddGPU<<<grid, block>>>(d_a, d_b, d_c, nelem);    cutilSafeCall(cudaThreadSynchronize());  cutilCheckMsg("vectorAddGPU() execution failed");

      /* Compare the results */

      printf("Checking the results.../n");  errorNorm = 0.f;  refNorm = 0.f;  for(n = 0; n < nelem; n++)  {    ref = a[n] + b[n];    diff = c[n] - ref;    errorNorm += diff*diff;    refNorm += ref*ref;  }  errorNorm = (float)sqrt((double)errorNorm);  refNorm = (float)sqrt((double)refNorm);

      /* Memory clean up */

      printf("Releasing CPU memory.../n");#if CUDART_VERSION >= 2020  cutilSafeCall(cudaFreeHost(a));  cutilSafeCall(cudaFreeHost(b));  cutilSafeCall(cudaFreeHost(c));#endif

      cudaThreadExit();

      printf("%s/n", (errorNorm/refNorm < 1.e-6f) ? "PASSED" : "FAILED");

      cutilExit(argc, argv);}


    最新回复(0)