
关于原理,建议阅读下面这篇文章,文中介绍了 OpenCL 的相关名词和概念: http://opencl.codeplex.com/wikipage?title=OpenCL%20Tutorials%20-%201 (英文版)




cl_context context = 0;
cl_command_queue commandQueue = 0;
cl_program program = 0;
cl_device_id device = 0;
cl_kernel kernel = 0;
cl_mem memObjects[3] = { 0, 0, 0};
cl_int errNum;

// 创建OpenCL上下文
context = CreateContext(&device);
commandQueue = CreateCommandQueue(context, device);
// 创建OpenCL程序
program = CreateProgram(context, device, "device.cl");
// 创建OpenCL内核
kernel = clCreateKernel(program, "vector_add", NULL);
// 创建OpenCL内存对象
float result[ARRAY_SIZE];
float a[ARRAY_SIZE];
float b[ARRAY_SIZE];
for (int i = 0; i < ARRAY_SIZE; i++)
a[i] = (float)i;
b[i] = (float)(i * 2);
if (!CreateMemObjects(context, memObjects, a, b))
return 1;
// 设置内核参数
errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
if (errNum != CL_SUCCESS)
return 1;
// 执行内核
size_t gloabalWorkSize = ARRAY_SIZE;
size_t localWorkSize = 1;
std::cout << "GPU 运行开始:" << time_stamp() << std::endl;
errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, &gloabalWorkSize, &localWorkSize, 0, NULL, NULL);
std::cout << "GPU 运行结束:" << time_stamp() << std::endl;
errNum = clEnqueueReadBuffer(commandQueue, memObjects[2], CL_TRUE, 0, sizeof(float) * ARRAY_SIZE, result, 0, NULL, NULL);
for (int i = 0; i < ARRAY_SIZE; i++)
printf("i = %d:%f\n",i,result[i]);


// Element-wise vector addition: the work-item whose global id (dimension 0)
// is `idx` writes a[idx] + b[idx] into result[idx].
__kernel void vector_add(__global const float *a,
                         __global const float *b,
                         __global float *result)
{
    const int idx = get_global_id(0);
    const float sum = a[idx] + b[idx];
    result[idx] = sum;
}




// Creates an OpenCL context holding a single device of the first platform
// reported by clGetPlatformIDs. A GPU device is tried first; if that query
// fails, a CPU device is tried instead.
//
// device: receives the selected device id.
// Returns the new context, or NULL (after printing a diagnostic) when no
// platform, no usable device, or no context could be obtained.
cl_context CreateContext(cl_device_id *device)
{
    cl_uint numPlatforms = 0;
    cl_platform_id firstPlatformId;
    cl_int errNum = clGetPlatformIDs(1, &firstPlatformId, &numPlatforms);
    if (errNum != CL_SUCCESS || numPlatforms == 0)
    {
        printf("Failed to find any OpenCL platforms.");
        return NULL;
    }
    std::cout << "Number of available platforms: " << numPlatforms << std::endl;

    // count is printed even when the query fails, so it must start from a
    // defined value instead of being left indeterminate.
    cl_uint count = 0;
    errNum = clGetDeviceIDs(firstPlatformId, CL_DEVICE_TYPE_GPU, 1, device, &count);
    std::cout << " " << "CL_DEVICE_TYPE_GPU" << ": " << count << std::endl;
    if (errNum != CL_SUCCESS)
    {
        printf("There is no GPU, trying CPU...");
        count = 0;
        errNum = clGetDeviceIDs(firstPlatformId, CL_DEVICE_TYPE_CPU, 1, device, &count);
        std::cout << " " << "CL_DEVICE_TYPE_CPU" << ": " << count << std::endl;
        if (errNum != CL_SUCCESS)
        {
            printf("There is NO GPU or CPU");
            return NULL;
        }
    }

    cl_context context = clCreateContext(NULL, 1, device, NULL, NULL, &errNum);
    if (errNum != CL_SUCCESS)
    {
        printf("create context error\n");
        return NULL;
    }
    return context;
}


// Creates a command queue with default properties (0) on `device` within
// `context`.
// Returns the queue, or NULL after printing a diagnostic that includes the
// OpenCL error code reported by clCreateCommandQueue.
cl_command_queue CreateCommandQueue(cl_context context, cl_device_id device)
{
    cl_int errNum = CL_SUCCESS;
    cl_command_queue commandQueue = clCreateCommandQueue(context, device, 0, &errNum);
    if (commandQueue == NULL)
    {
        printf("Failed to create commandQueue for device 0");
        // Report the reason as well; the error code used to be discarded.
        printf(" (OpenCL error %d)\n", errNum);
        return NULL;
    }
    return commandQueue;
}


// Reads the whole file `filename` into a freshly malloc'ed, NUL-terminated
// buffer.
//
// filename: path of the file to read (opened in binary mode).
// length:   optional; when non-NULL it receives the number of bytes read,
//           not counting the terminating NUL.
// Returns the buffer, which the caller must release with free(), or NULL
// after printing a diagnostic when the file cannot be opened, measured,
// allocated for, or read. As before, an empty file is reported as a read
// failure because fread() of a zero-sized item yields 0.
char* ReadKernelSourceFile(const char* filename, size_t* length)
{
    FILE *file = fopen(filename, "rb");
    if (file == NULL) {
        printf("%s at %d: can't open %s\n", __FILE__, __LINE__ - 2, filename);
        return NULL;
    }

    char *sourceString = NULL;
    size_t sourceLength = 0;
    bool loaded = false;

    // Measure the file by seeking to its end; every step is checked so a
    // failed ftell() (-1) can never turn into a huge allocation size.
    if (fseek(file, 0, SEEK_END) == 0) {
        const long endPos = ftell(file);
        if (endPos >= 0 && fseek(file, 0, SEEK_SET) == 0) {
            sourceLength = static_cast<size_t>(endPos);
            sourceString = static_cast<char *>(malloc(sourceLength + 1));
        }
    }
    if (sourceString != NULL) {
        loaded = fread(sourceString, sourceLength, 1, file) == 1;
    }
    // The stream is no longer needed on either path; close it exactly once.
    fclose(file);

    if (!loaded) {
        printf("%s at %d: Can't read source %s\n", __FILE__, __LINE__ - 1, filename);
        free(sourceString);  // free(NULL) is a no-op
        return NULL;
    }

    if (length != NULL)
        *length = sourceLength;
    sourceString[sourceLength] = '\0';
    return sourceString;
}

// Loads the kernel source from `filename`, creates an OpenCL program from it
// in `context` and builds it.
//
// device: device whose build log is printed when the build fails.
// Returns the built program, or NULL after printing a diagnostic when the
// source cannot be read, the program cannot be created, or the build fails.
cl_program CreateProgram(cl_context context, cl_device_id device, const char *filename)
{
    size_t program_length = 0;
    char *source = ReadKernelSourceFile(filename, &program_length);
    if (source == NULL)
    {
        // ReadKernelSourceFile has already reported why; keep the summary
        // line that this failure has always produced.
        printf("Failed to create CL program from source.");
        return NULL;
    }

    const char *sources[1] = { source };
    cl_program program = clCreateProgramWithSource(context, 1, sources, &program_length, NULL);
    // The program object keeps its own copy of the source text, so the host
    // buffer can be released right away on both outcomes.
    free(source);
    if (program == NULL)
    {
        printf("Failed to create CL program from source.");
        return NULL;
    }

    cl_int errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
        char buildLog[16384];
        buildLog[0] = '\0';  // stays a valid empty string if the query below fails
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buildLog), buildLog, NULL);
        printf("Error in kernel:%s ", buildLog);
        clReleaseProgram(program);  // do not leak the half-built program
        return NULL;
    }
    return program;
}
// Creates the three device buffers used by the sample, each holding
// ARRAY_SIZE floats:
//   memObjects[0], memObjects[1]: read-only, initialised by copying a and b;
//   memObjects[2]: write-only, left uninitialised.
//
// Returns true when all three buffers exist. On failure a diagnostic is
// printed, any buffer that was created is released, all three slots are
// reset to NULL and false is returned.
bool CreateMemObjects(cl_context context, cl_mem memObjects[3], float *a, float *b)
{
    const size_t bufferBytes = sizeof(float) * ARRAY_SIZE;
    memObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bufferBytes, a, NULL);
    memObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bufferBytes, b, NULL);
    memObjects[2] = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bufferBytes, NULL, NULL);

    if (memObjects[0] == NULL || memObjects[1] == NULL || memObjects[2] == NULL)
    {
        printf("Error creating memory objects.");
        // Give back whatever was created so a partial failure does not leak,
        // and clear the slots so nothing is released a second time later.
        for (int i = 0; i < 3; i++)
        {
            if (memObjects[i] != NULL)
            {
                clReleaseMemObject(memObjects[i]);
                memObjects[i] = NULL;
            }
        }
        return false;
    }
    return true;
}
double time_stamp()
if (
!QueryPerformanceCounter(&curclock) ||
return -1;
return double(curclock.QuadPart) / freq.QuadPart;