1 #include "cuda_runtime.h"
  2 #include "device_launch_parameters.h"
  3 
  4 #include <stdio.h>
  5 #include <time.h>
  6 #include <stdlib.h>
  7 
  8 #define MAX 120
  9 #define MIN 0
 10 cudaError_t addWithCuda(int *c, const int *a, size_t size);
 11 
 12 __global__ void addKernel(int *c, const int *a) {
 13     int i = threadIdx.x;
 14     extern __shared__ int smem[];
 15     smem[i] = a[i];
 16     __syncthreads();
 17     if (i == 0)  // 0号线程做平方和
 18             {
 19         c[0] = 0;
 20         for (int d = 0; d < 5; d++) {
 21             c[0] += smem[d] * smem[d];
 22         }
 23     }
 24     if (i == 1)  //1号线程做累加
 25             {
 26         c[1] = 0;
 27         for (int d = 0; d < 5; d++) {
 28             c[1] += smem[d];
 29         }
 30     }
 31     if (i == 2)  //2号线程做累乘
 32             {
 33         c[2] = 1;
 34         for (int d = 0; d < 5; d++) {
 35             c[2] = smem[d];
 36         }
 37 
 38     }
 39 
 40     if (i == 3)  //3号线程做异或
 41                 {
 42             c[3] = 0;
 43             for (int d = 0; d < 5; d++) {
 44                 c[3] ^= smem[d];
 45             }
 46 
 47         }
 48 }
 49 
 50 int main() {
 51     const int arraySize = 5;
 52     srand((unsigned) time(NULL));
 53     const int a[arraySize] = { rand() % (MAX + 1 - MIN) + MIN, rand()
 54             % (MAX + 1 - MIN) + MIN, rand() % (MAX + 1 - MIN) + MIN, rand()
 55             % (MAX + 1 - MIN) + MIN, rand() % (MAX + 1 - MIN) + MIN };
 56     int c[arraySize] = { 0 };
 57     // Add vectors in parallel.
 58     cudaError_t cudaStatus = addWithCuda(c, a, arraySize);
 59     if (cudaStatus != cudaSuccess) {
 60         fprintf(stderr, "addWithCuda failed!");
 61         return 1;
 62     }
 63     printf(
 64             "\t%d+%d+%d+%d+%d = %d\n\t%d^2+%d^2+%d^2+%d^2+%d^2 = %d\n\t%d*%d*%d*%d*%d = %d\n\t%d^%d^%d^%d^%d = %d\n\n\n\n\n",
 65             a[0], a[1], a[2], a[3], a[4], c[1], a[0], a[1], a[2], a[3], a[4],
 66             c[0], a[0], a[1], a[2], a[3], a[4], c[2],a[0], a[1], a[2], a[3], a[4], c[3]);
 67     // cudaThreadExit must be called before exiting in order for profiling and
 68     // tracing tools such as Nsight and Visual Profiler to show complete traces.
 69     cudaStatus = cudaThreadExit();
 70     if (cudaStatus != cudaSuccess) {
 71         fprintf(stderr, "cudaThreadExit failed!");
 72         return 1;
 73     }
 74     return 0;
 75 }
 76 
 77 // Helper function for using CUDA to add vectors in parallel.
 78 cudaError_t addWithCuda(int *c, const int *a, size_t size) {
 79     int *dev_a = 0;
 80     int *dev_c = 0;
 81     cudaError_t cudaStatus;
 82 
 83     // Choose which GPU to run on, change this on a multi-GPU system.
 84     cudaStatus = cudaSetDevice(0);
 85     if (cudaStatus != cudaSuccess) {
 86         fprintf(stderr,
 87                 "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
 88         goto Error;
 89     }
 90 
 91     // Allocate GPU buffers for three vectors (two input, one output)    .
 92     cudaStatus = cudaMalloc((void**) &dev_c, size * sizeof(int));
 93     if (cudaStatus != cudaSuccess) {
 94         fprintf(stderr, "cudaMalloc failed!");
 95         goto Error;
 96     }
 97 
 98     cudaStatus = cudaMalloc((void**) &dev_a, size * sizeof(int));
 99     if (cudaStatus != cudaSuccess) {
100         fprintf(stderr, "cudaMalloc failed!");
101         goto Error;
102     }
103     // Copy input vectors from host memory to GPU buffers.
104     cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int),
105             cudaMemcpyHostToDevice);
106     if (cudaStatus != cudaSuccess) {
107         fprintf(stderr, "cudaMemcpy failed!");
108         goto Error;
109     }
110     // Launch a kernel on the GPU with one thread for each element.
111     addKernel<<<1, size, size * sizeof(int), 0>>>(dev_c, dev_a);
112 
113     // cudaThreadSynchronize waits for the kernel to finish, and returns
114     // any errors encountered during the launch.
115     cudaStatus = cudaThreadSynchronize();
116     if (cudaStatus != cudaSuccess) {
117         fprintf(stderr,
118                 "cudaThreadSynchronize returned error code %d after launching addKernel!\n",
119                 cudaStatus);
120         goto Error;
121     }
122 
123     // Copy output vector from GPU buffer to host memory.
124     cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int),
125             cudaMemcpyDeviceToHost);
126     if (cudaStatus != cudaSuccess) {
127         fprintf(stderr, "cudaMemcpy failed!");
128         goto Error;
129     }
130 
131     Error: cudaFree(dev_c);
132     cudaFree(dev_a);
133     return cudaStatus;
134 }

 

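/*
 * Sample output from one run of the program as originally posted
 * ("^2" means squared; in the last line '^' is bitwise XOR):
 *
 *   22+103+61+63+17 = 266
 *   22^2+103^2+61^2+63^2+17^2 = 19072
 *   22*103*61*63*17 = 17
 *   22^103^61^63^17 = 98
 *
 * Note: the product line is wrong. The true product is 148040046; the posted
 * kernel assigned each element to c[2] instead of multiplying, so the run
 * printed the last element (17).
 */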

 

// OPTIMISM, PASSION & HARD WORK