win7（amd显卡）安装 pyopencl

原创

琉年 2018-02-02 10:54:28 博主文章分类：教程 ©著作权

©著作权归作者所有：来自51CTO博客作者琉年的原创作品，请联系作者获取转载授权，否则将追究法律责任

　　事情仍然简单，按理就是

pip install pyopencl

　　但是并没有成功，错误提示说有个mako未安装，虽然说不装也没关系，但是想着不费事就装了，继续报错。

　　似乎想要装pyopencl，得先装opencl，于是amd官网下opencl sdk（2.9.1，版本在表格里，一目了然），安装路径似乎没得选，在program files x86 文件夹。继续报错。

　　这次的报错报到VS里了，我是不是该庆幸装了VS社区版……说CL/cl.h 找不到。试图打开的程序也叫cl……

　　这是个头文件啊……cl.exe，莫非是compile + link？不妨写个helloworld编译一下……（masm里的编译连接一体机好像也叫cl）然后编译失败。

　　提示是找不到xxx.h，或者xxx.lib，这个的教程很好找，在环境变量里把lib和include文件夹都包括进去。于是在环境变量里新建一个Include，一个Lib，然后按着教程加进去一堆来自windows的自带库目录（好像多半来自Microsoft sdk 和 windows kit），反正最后helloworld.c 可以编译了，cl果然就是一键编译连接，宛如gcc还不用设定文件名。

　　此时，还记得amd sdk 文件夹（叫 AMD APP SDK）么？打开一看，也有一个include目录一个lib目录。直接把include加进环境变量即可，但lib下还有一层，x86还是x86_64各位自己试试吧，我也搞不明白我的64位机为何要用x86……一个检验的方法是，在helloworld.c 开头加一句 #include<CL/cl.h>，如果仍然可以编译成功，那么……

pip install pyopencl

　　至少我就这么安装成功了，无警告。

　　示例程序：（来自http://ju.outofmemory.cn/entry/106475，这里把py2的print改成了py3的）

# example provided by Eilif Muller

from __future__ import division

# OpenCL C source for a tiled single-precision matrix multiply (C = A * B),
# kept as a %-format template: %(block_size)d, %(w_a)d, %(h_a)d and %(w_b)d
# are filled in from a dict before compilation, so any literal percent sign
# added to the kernel text would have to be written as "%%".
# The kernel demands a BLOCK_SIZE x BLOCK_SIZE work-group and stages one tile
# of A and one tile of B in __local memory per loop iteration; it performs no
# bounds checks, so the matrix dimensions must be multiples of BLOCK_SIZE.
KERNEL_CODE = """

// Thread block size
#define BLOCK_SIZE %(block_size)d

// Matrix dimensions
// (chosen as multiples of the thread block size for simplicity)
#define WA %(w_a)d // Matrix A width
#define HA %(h_a)d // Matrix A height
#define WB %(w_b)d // Matrix B width
#define HB WA  // Matrix B height
#define WC WB  // Matrix C width
#define HC HA  // Matrix C height


/*
 * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */

/* Matrix multiplication: C = A * B.
 * Device code.
 */

#define AS(j, i) As[i + j * BLOCK_SIZE]
#define BS(j, i) Bs[i + j * BLOCK_SIZE]

////////////////////////////////////////////////////////////////////////////////
//! Matrix multiplication on the device: C = A * B
//! WA is A's width and WB is B's width
////////////////////////////////////////////////////////////////////////////////
__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE,BLOCK_SIZE,1)))
void
matrixMul( __global float* C, __global float* A, __global float* B)
{
    __local float As[BLOCK_SIZE*BLOCK_SIZE];
    __local float Bs[BLOCK_SIZE*BLOCK_SIZE];

    // Block index
    int bx = get_group_id(0);
    int by = get_group_id(1);

    // Thread index
    int tx = get_local_id(0);
    int ty = get_local_id(1);

    // Index of the first sub-matrix of A processed by the block
    int aBegin = WA * BLOCK_SIZE * by;

    // Index of the last sub-matrix of A processed by the block
    int aEnd   = aBegin + WA - 1;

    // Step size used to iterate through the sub-matrices of A
    int aStep  = BLOCK_SIZE;

    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;

    // Step size used to iterate through the sub-matrices of B
    int bStep  = BLOCK_SIZE * WB;

    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0.0f;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin;
             a <= aEnd;
             a += aStep, b += bStep) {

        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        AS(ty, tx) = A[a + WA * ty + tx];
        BS(ty, tx) = B[b + WB * ty + tx];

        // Synchronize to make sure the matrices are loaded
        barrier(CLK_LOCAL_MEM_FENCE);

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += AS(ty, k) * BS(k, tx);

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Write the block sub-matrix to device memory;
    // each thread writes one element
    C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = Csub;

}

"""

from time import perf_counter
from time import time

import numpy
import pyopencl as cl

# Edge length of the square work-group / tile; substituted into the kernel
# source as BLOCK_SIZE.
block_size = 16

# May prompt on the console for a platform/device unless the PYOPENCL_CTX
# environment variable already selects one.
ctx = cl.create_some_context()

# The kernel declares two block_size x block_size float tiles in __local
# memory, so every device in the context must provide at least that much.
# An explicit raise is used instead of `assert` so the check is not stripped
# when Python runs with -O.
required_local_bytes = 2 * block_size * block_size * 4  # 4 bytes per float
for dev in ctx.devices:
    if dev.local_mem_size < required_local_bytes:
        raise RuntimeError(
            "device {!r} offers {} bytes of local memory; "
            "the kernel needs {}".format(
                dev.name, dev.local_mem_size, required_local_bytes))

# Profiling must be enabled on the queue because the report later reads
# event.profile.start / event.profile.end.
queue = cl.CommandQueue(
    ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

# Problem-size presets as (a_height, a_width, b_width).  Pick one by key
# instead of editing `if False:` chains.  B's height must equal A's width for
# the product to exist, so it is derived once below; the first preset used to
# leave b_height undefined and would have raised NameError if enabled.
size_presets = {
    "large": (4096, 2048, 4096),
    # like PyCUDA -- note 2516 is not a multiple of 16
    "pycuda": (2516, 1472, 2144),
    # CL SDK
    "cl_sdk": (100 * block_size, 50 * block_size, 50 * block_size),
}
a_height, a_width, b_width = size_presets["cl_sdk"]
b_height = a_width

# C = A * B has A's row count and B's column count.
c_width = b_width
c_height = a_height

# Random single-precision inputs on the host.
h_a = numpy.random.rand(a_height, a_width).astype(numpy.float32)
h_b = numpy.random.rand(b_height, b_width).astype(numpy.float32)
# Allocate the result directly as float32 rather than creating a float64
# array and converting it; contents are uninitialized until the device
# result is copied back.
h_c = numpy.empty((c_height, c_width), dtype=numpy.float32)


# Validate the dimensions before paying for the kernel compile.  The kernel
# indexes A and B without bounds checks, one full tile at a time, so each of
# these extents must be a whole number of tiles.  ValueError is raised
# explicitly because `assert` statements vanish under `python -O`.
for dim_name, dim_value in (("a_width", a_width),
                            ("a_height", a_height),
                            ("b_width", b_width)):
    if dim_value % block_size != 0:
        raise ValueError(
            "{} = {} is not a multiple of block_size = {}".format(
                dim_name, dim_value, block_size))

# Values substituted into the %(name)d placeholders of the kernel template.
kernel_params = {"block_size": block_size,
        "w_a":a_width, "h_a":a_height, "w_b":b_width}

# Fast-math build flags are only passed when the device vendor string
# contains "NVIDIA"; every other vendor gets a default build.
if "NVIDIA" in queue.device.vendor:
    options = "-cl-mad-enable -cl-fast-relaxed-math"
else:
    options = ""
prg = cl.Program(ctx, KERNEL_CODE % kernel_params).build(options=options)
kernel = prg.matrixMul

# transfer host -> device -----------------------------------------------------
mf = cl.mem_flags

# perf_counter() is the high-resolution clock intended for measuring short
# durations; time() is wall-clock and can be too coarse (notably on Windows)
# for a transfer that takes only a few milliseconds.
t1 = perf_counter()

# Inputs are read-only on the device and initialized from the host arrays;
# the output buffer is write-only and merely sized to hold h_c.
d_a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=h_a)
d_b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=h_b)
d_c_buf = cl.Buffer(ctx, mf.WRITE_ONLY, size=h_c.nbytes)

# Covers creation of all three buffers, including the output allocation.
push_time = perf_counter() - t1

# Launch geometry shared by warmup and benchmark.  The global size is
# (width, height) because the kernel treats dimension 0 as the column index
# and dimension 1 as the row index when it writes C.
global_size = h_c.shape[::-1]
local_size = (block_size, block_size)

# warmup ----------------------------------------------------------------------
# Run a few untimed launches, waiting on each one, before measuring.
for _ in range(5):
    event = kernel(queue, global_size, local_size,
            d_c_buf, d_a_buf, d_b_buf)
    event.wait()

queue.finish()

# actual benchmark ------------------------------------------------------------
# perf_counter() is used because time() is wall-clock and may be too coarse
# to resolve the per-launch average reliably.
t1 = perf_counter()

count = 20
for _ in range(count):
    event = kernel(queue, global_size, local_size,
            d_c_buf, d_a_buf, d_b_buf)

# Only the last launch is waited on.
# NOTE(review): this assumes the queue executes in order, so the last event
# completing implies all earlier launches completed -- confirm if the queue
# properties are ever changed.
event.wait()

# Average host-observed time per launch; `event` (the final launch) is kept
# for the event-timed figure in the report.
gpu_time = (perf_counter() - t1) / count

# transfer device -> host -----------------------------------------------------
# Timed with perf_counter(), the high-resolution clock for short durations,
# instead of wall-clock time().
# NOTE(review): the measurement relies on enqueue_copy blocking until h_c is
# filled (pyopencl's default for host transfers) -- confirm against the
# installed pyopencl version.
t1 = perf_counter()
cl.enqueue_copy(queue, h_c, d_c_buf)
pull_time = perf_counter() - t1

# timing output ---------------------------------------------------------------
gpu_total_time = gpu_time + push_time + pull_time

# Device-side duration of the last benchmark launch; the profiling counters
# are scaled by 1e-9 to match the "[s]" label.
event_seconds = (event.profile.end - event.profile.start) * 1e-9

# (label, value) rows printed in order, one per line.
timing_rows = (
    ("GPU push+compute+pull total [s]:", gpu_total_time),
    ("GPU push [s]:", push_time),
    ("GPU pull [s]:", pull_time),
    ("GPU compute (host-timed) [s]:", gpu_time),
    ("GPU compute (event-timed) [s]: ", event_seconds),
)
for row_label, row_value in timing_rows:
    print(row_label, row_value)

# Each output element takes a_width multiplies and a_width adds.
flops_per_element = a_width * 2.
gflop = h_c.size * flops_per_element / (1000**3.)
gflops = gflop / gpu_time

print()
print("GFlops/s:", gflops)

# cpu comparison --------------------------------------------------------------
# Reference product computed by numpy on the host.  It is timed with
# perf_counter() because wall-clock time() can be too coarse for a short
# computation, which would make the speedup ratios below very noisy.
t1 = perf_counter()
h_c_cpu = numpy.dot(h_a, h_b)
cpu_time = perf_counter() - t1

print()
# True when the device result matches the numpy result within allclose's
# default tolerances.
print("GPU==CPU:", numpy.allclose(h_c, h_c_cpu))
print()
print("CPU time (s)", cpu_time)
print()

print("GPU speedup (with transfer): ", cpu_time/gpu_total_time)
print("GPU speedup (without transfer): ", cpu_time/gpu_time)

　　别急，如果你开始看到提示输入，不妨看看选项……方括号里是号码，后面是内容，你负责输入号码回车。比如我输入了两次0，最后会有个提示：

Choose platform:
[0] <pyopencl.Platform 'AMD Accelerated Parallel Processing' at 0x7feee7e3168>
Choice [0]:0
Choose device(s):
[0] <pyopencl.Device 'Capeverde' on 'AMD Accelerated Parallel Processing' at 0x9
f28700>
[1] <pyopencl.Device 'Intel(R) Pentium(R) CPU G4560 @ 3.50GHz' on 'AMD Accelerat
ed Parallel Processing' at 0xa627610>
Choice, comma-separated [0]:0
Set the environment variable PYOPENCL_CTX='0:0' to avoid being asked again.

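　　意思是如果在环境变量里事先说好，就不用选了。我配置环境变量无果，但是把下面的代码加在文件开头起了作用——注意：如果保留了示例里的 from __future__ import division，这两行必须放在它之后（__future__ 导入只能出现在文件最前面，否则会报 SyntaxError），并且要在调用 cl.create_some_context() 之前执行。（感谢 Stack Overflow）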

import os
# Pre-select platform 0 / device 0 (the same answers typed at the prompts) so
# that pyopencl does not ask again; this must run before the context is
# created.
os.environ['PYOPENCL_CTX'] = '0:0'

　　再次运行，上面一段没有了，直接是结果。然而代码没看懂，那个 GPU==CPU 大概是说 GPU 和 CPU 算出来的结果一致吧，还有numpy打印数组中间竟然用省略号……

（2018-2-2 于地球）

上一篇：Windows 下 Python 3.6 下安装 TensorFlow （屡败屡战）

下一篇：一边学，一边写出的人工智能教程（一）

提问和评论都可以，用心的回复会被更多人看到评论

发布评论

相关文章

官方博客	全部文章	热门标签	班级博客
了解我们	网站地图	意见反馈

鸿蒙开发者社区	51CTO学堂
51CTO	软考资讯

win7（amd显卡） 安装 pyopencl

win7（amd显卡） 安装 pyopencl

51CTO博客

win7（amd显卡）安装 pyopencl

win7（amd显卡）安装 pyopencl