#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <time.h>
#include <stdlib.h>

#define MAX 120
#define MIN 0
cudaError_t addWithCuda(int *c, const int *a, size_t size);

/*
 * Computes four different reductions over the same input, one per thread.
 *
 * Launch contract:
 *   - intended for a single block; blockDim.x is taken as the number of
 *     input elements (blockIdx is not used);
 *   - dynamic shared memory of at least blockDim.x * sizeof(int) bytes;
 *   - `a` and `c` must each hold at least blockDim.x ints.
 *
 * Results (only produced for thread indices that exist in the launch):
 *   c[0] = sum of squares of a[0..n-1]
 *   c[1] = sum of a[0..n-1]
 *   c[2] = product of a[0..n-1]
 *   c[3] = bitwise XOR of a[0..n-1]
 *
 * All arithmetic is done in `int`; the product in particular can exceed
 * the range of int for large inputs.
 */
__global__ void addKernel(int *c, const int *a) {
    const int i = threadIdx.x;
    const int n = blockDim.x;  // one thread per input element
    extern __shared__ int smem[];

    // Stage the input in shared memory so every reducing thread can read
    // all elements; the barrier makes the writes visible block-wide.
    smem[i] = a[i];
    __syncthreads();

    if (i == 0) {
        // Thread 0: sum of squares.
        int acc = 0;
        for (int d = 0; d < n; d++) {
            acc += smem[d] * smem[d];
        }
        c[0] = acc;
    } else if (i == 1) {
        // Thread 1: sum.
        int acc = 0;
        for (int d = 0; d < n; d++) {
            acc += smem[d];
        }
        c[1] = acc;
    } else if (i == 2) {
        // Thread 2: product. The multiplicative identity 1 is the seed and
        // each element must be multiplied in (not assigned).
        int acc = 1;
        for (int d = 0; d < n; d++) {
            acc *= smem[d];
        }
        c[2] = acc;
    } else if (i == 3) {
        // Thread 3: bitwise XOR.
        int acc = 0;
        for (int d = 0; d < n; d++) {
            acc ^= smem[d];
        }
        c[3] = acc;
    }
}

/*
 * Fills a 5-element array with random values in [MIN, MAX], runs the
 * reductions on the GPU through addWithCuda and prints the four results.
 * Returns 0 on success and 1 if any CUDA step fails.
 */
int main() {
    const int arraySize = 5;
    srand((unsigned) time(NULL));
    const int a[arraySize] = {
        rand() % (MAX + 1 - MIN) + MIN,
        rand() % (MAX + 1 - MIN) + MIN,
        rand() % (MAX + 1 - MIN) + MIN,
        rand() % (MAX + 1 - MIN) + MIN,
        rand() % (MAX + 1 - MIN) + MIN
    };
    int c[arraySize] = { 0 };

    // Run the reductions on the device.
    cudaError_t cudaStatus = addWithCuda(c, a, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    // Output order: sum (c[1]), sum of squares (c[0]), product (c[2]),
    // XOR (c[3]).
    printf(
            "\t%d+%d+%d+%d+%d = %d\n\t%d^2+%d^2+%d^2+%d^2+%d^2 = %d\n\t%d*%d*%d*%d*%d = %d\n\t%d^%d^%d^%d^%d = %d\n\n\n\n\n",
            a[0], a[1], a[2], a[3], a[4], c[1],
            a[0], a[1], a[2], a[3], a[4], c[0],
            a[0], a[1], a[2], a[3], a[4], c[2],
            a[0], a[1], a[2], a[3], a[4], c[3]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    // (It replaces the deprecated cudaThreadExit.)
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }
    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
/*
 * Copies `size` ints from host array `a` to the device, launches addKernel
 * with one block of `size` threads and `size * sizeof(int)` bytes of dynamic
 * shared memory, then copies `size` ints of results back into host array `c`.
 *
 * The device output buffer is zeroed before the launch, so any element of
 * `c` that the kernel does not write comes back as 0.
 *
 * Returns cudaSuccess, or the first CUDA error encountered (a message is
 * also written to stderr). Device buffers are released on every path.
 */
cudaError_t addWithCuda(int *c, const int *a, size_t size) {
    int *dev_a = 0;
    int *dev_c = 0;
    const size_t bytes = size * sizeof(int);
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr,
                "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for two vectors (one input, one output).
    cudaStatus = cudaMalloc((void**) &dev_c, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**) &dev_a, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Zero the output so elements the kernel never writes are well-defined
    // when copied back to the host.
    cudaStatus = cudaMemset(dev_c, 0, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemset failed!");
        goto Error;
    }

    // Copy the input vector from host memory to the GPU buffer.
    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, (unsigned int) size, bytes, 0>>>(dev_c, dev_a);

    // A kernel launch returns no status itself; configuration errors
    // (e.g. zero or too many threads) are reported via cudaGetLastError.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n",
                cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during execution. (It replaces the deprecated
    // cudaThreadSynchronize.)
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr,
                "cudaDeviceSynchronize returned error code %d after launching addKernel!\n",
                cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // cudaFree accepts a null pointer, so this is safe on early failures.
    cudaFree(dev_c);
    cudaFree(dev_a);
    return cudaStatus;
}
22+103+61+63+17 = 266
22^2+103^2+61^2+63^2+17^2 = 19072
22*103*61*63*17 = 17
22^103^61^63^17 = 98