两个矢量相加

    技术2022-05-19  21

    /* * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved. * * Please refer to the NVIDIA end user license agreement (EULA) associated * with this source code for terms and conditions that govern your use of * this software. Any use, reproduction, disclosure, or distribution of * this software and related documentation outside the terms of the EULA * is strictly prohibited. * */

    /*
     * Vector addition: C = A + B.
     *
     * This sample is a very basic sample that implements element by element
     * vector addition. It is the same as the sample illustrating Chapter 3
     * of the programming guide with some additions like error checking.
     */

    // Includes
    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #include <cutil_inline.h>

    // Variables
    //
    // File-scope state shared by main() and Cleanup(). Every pointer starts
    // out null so that Cleanup() can be called at any point, including before
    // the corresponding allocation has been attempted.
    float* h_A = NULL;      // host input vector A
    float* h_B = NULL;      // host input vector B
    float* h_C = NULL;      // host output vector C (receives the device result)
    float* d_A = NULL;      // device copy of A
    float* d_B = NULL;      // device copy of B
    float* d_C = NULL;      // device output vector C
    bool noprompt = false;  // set by ParseArguments(); suppresses the exit prompt

    // Functions
    //
    // Forward declarations for the host helpers defined after main().
    void Cleanup(void);                 // frees all buffers, then exits the process
    void RandomInit(float*, int);       // fills an array with random floats
    void ParseArguments(int, char**);   // scans the command line for -noprompt

    // Device code
    //
    // Element-wise vector addition: C[i] = A[i] + B[i] for every 0 <= i < N.
    //
    // Launch layout: 1D grid of 1D blocks. Only the x dimensions of the grid
    // and block are used for indexing. Each thread starts at its flat global
    // index and then advances by the total number of launched threads, so the
    // result is complete for any grid size, including grids that are smaller
    // than N. No shared memory is used.
    //
    // A, B and C must each point to at least N floats that are accessible
    // from device code. A non-positive N makes the kernel a no-op.
    __global__ void VecAdd(const float* A, const float* B, float* C, int N)
    {
        // 64-bit arithmetic keeps the index from wrapping when N is close to
        // the upper limit of int.
        const long long stride = (long long)gridDim.x * blockDim.x;
        const long long first  = (long long)blockIdx.x * blockDim.x + threadIdx.x;

        for (long long i = first; i < N; i += stride)
            C[i] = A[i] + B[i];
    }

    // Host code
    //
    // Program entry point. Builds two random N-element vectors on the host,
    // adds them on the device with VecAdd, copies the result back and compares
    // it against a sum computed on the host. Prints PASSED or FAILED.
    //
    // Control never returns from this function normally: every path ends in
    // Cleanup(), which releases the buffers and calls exit().
    int main(int argc, char** argv)
    {
        printf("Vector addition\n");
        const int N = 50000;                     // number of elements per vector
        const size_t size = N * sizeof(float);   // size of one vector in bytes
        ParseArguments(argc, argv);

        // Allocate input vectors h_A and h_B (and the output h_C) in host memory
        h_A = (float*)malloc(size);
        if (h_A == 0) {
            fprintf(stderr, "Failed to allocate host vector A\n");
            Cleanup();
        }
        h_B = (float*)malloc(size);
        if (h_B == 0) {
            fprintf(stderr, "Failed to allocate host vector B\n");
            Cleanup();
        }
        h_C = (float*)malloc(size);
        if (h_C == 0) {
            fprintf(stderr, "Failed to allocate host vector C\n");
            Cleanup();
        }

        // Initialize input vectors
        RandomInit(h_A, N);
        RandomInit(h_B, N);

        // Allocate vectors in device memory
        cutilSafeCall( cudaMalloc((void**)&d_A, size) );
        cutilSafeCall( cudaMalloc((void**)&d_B, size) );
        cutilSafeCall( cudaMalloc((void**)&d_C, size) );

        // Copy vectors from host memory to device memory
        cutilSafeCall( cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice) );
        cutilSafeCall( cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice) );

        // Invoke kernel. The block count is rounded up so that the grid covers
        // all N elements even when N is not a multiple of the block size.
        int threadsPerBlock = 256;
        int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
        VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
        cutilCheckMsg("kernel launch failure");
    #ifdef _DEBUG
        // Debug builds wait for the kernel here so that execution errors are
        // reported at this point rather than at the copy below.
        cutilSafeCall( cudaThreadSynchronize() );
    #endif

        // Copy result from device memory to host memory
        // h_C contains the result in host memory
        cutilSafeCall( cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost) );

        // Verify result: stop at the first element that differs from the
        // host-side sum by more than the tolerance.
        int i;
        for (i = 0; i < N; ++i) {
            float sum = h_A[i] + h_B[i];
            if (fabs(h_C[i] - sum) > 1e-5)
                break;
        }
        // i == N only if the loop ran to completion without a mismatch.
        printf("%s \n", (i == N) ? "PASSED" : "FAILED");

        Cleanup();
    }

    // Releases every device and host buffer owned by this file, calls
    // cudaThreadExit(), optionally waits for the user to press ENTER, and then
    // terminates the process with exit status 0. This function never returns.
    //
    // It may be called at any point in main(): buffers that have not been
    // allocated yet are still null and are skipped.
    void Cleanup(void)
    {
        // Free device memory. Return codes are deliberately not checked here:
        // this is best-effort teardown on the way out of the process.
        if (d_A)
            cudaFree(d_A);
        if (d_B)
            cudaFree(d_B);
        if (d_C)
            cudaFree(d_C);

        // Free host memory. free() accepts a null pointer, so no guard is needed.
        free(h_A);
        free(h_B);
        free(h_C);

        cutilSafeCall( cudaThreadExit() );

        if (!noprompt) {
            printf("\nPress ENTER to exit...\n");
            // Flush both streams so the prompt is visible before blocking on input.
            fflush( stdout);
            fflush( stderr);
            getchar();
        }

        exit(0);
    }

    // Fills an array with random float entries.
    //
    // data: destination array with room for at least n floats.
    // n:    number of entries to write; nothing is written when n <= 0.
    //
    // Each entry is rand() scaled by RAND_MAX, so values lie in [0, 1]. The
    // sequence depends on the current rand() seed; this function does not
    // call srand() itself.
    void RandomInit(float* data, int n)
    {
        // Nothing to fill; avoids dereferencing a null pointer.
        if (!data)
            return;

        for (int i = 0; i < n; ++i)
            data[i] = rand() / (float)RAND_MAX;
    }

    // Parses the program arguments.
    //
    // argc, argv: the values received by main().
    //
    // Sets the file-scope flag `noprompt` to true when "--noprompt" or
    // "-noprompt" appears among the arguments. The flag is left unchanged
    // otherwise.
    void ParseArguments(int argc, char** argv)
    {
        // Start at 1: argv[0] is the program name, not a user-supplied argument.
        for (int i = 1; i < argc; ++i) {
            if (strcmp(argv[i], "--noprompt") == 0 ||
                strcmp(argv[i], "-noprompt") == 0)
            {
                noprompt = true;
                break;
            }
        }
    }


    最新回复(0)