Ubuntu下手动设置Nvidia显卡风扇转速

精选原创

wx62830f4b679a4 2023-06-10 06:24:04 ©著作权

文章标签 杂谈 python github ide 文章分类 Python 后端开发

©著作权归作者所有：来自51CTO博客作者wx62830f4b679a4的原创作品，请联系作者获取转载授权，否则将追究法律责任

有显示器(桌面版)

Ubuntu下手动设置Nvidia显卡风扇转速_python

默认情况下可以通过图形界面手动调节风扇转速；但奇怪的是，一旦使用过下文给出的命令进行调节，就无法再通过图形界面进行手动调节了。

=========================================

无显示器(服务器版)

使用工具coolgpus进行调节：

https://github.com/andyljones/coolgpus

这个工具其实就是一个python文件，通过调用nvidia-settings中的设置命令来对显卡风扇转速进行调节：

#!/usr/bin/python
import os
import re
import time
import argparse
from subprocess import TimeoutExpired, check_output, Popen, PIPE, STDOUT
from tempfile import mkdtemp
from contextlib import contextmanager

# Command-line interface. The parsed result is stored in the module-level
# `args`, which the rest of the script reads directly.
parser = argparse.ArgumentParser(description=r'''
GPU fan control for Linux.

By default, this uses a clamped linear fan curve, going from 30% below 55C to 99%
above 80C. There's also a small hysteresis gap, because _changes_ in fan noise 
are a lot more distracting than steady fan noise. 

I can't claim it's optimal, but it Works For My Machine (TM). Full load is about 
75C and 80%.
''')
parser.add_argument('--temp', nargs='+', default=[55, 80], type=float, help='The temperature ranges where the fan speed will increase linearly')
parser.add_argument('--speed', nargs='+', default=[30, 99], type=float, help='The fan speed ranges')
# const=2: with nargs='?' a bare `--hyst` (no value) produces `const`, which
# defaults to None and would later break the `t + T_HYST` arithmetic.
parser.add_argument('--hyst', nargs='?', const=2, default=2, type=float, help='The hysteresis gap. Large gaps will reduce how often the fan speed is changed, but might mean the fan runs faster than necessary')
parser.add_argument('--kill', action='store_true', default=False, help='Whether to kill existing Xorg sessions')
parser.add_argument('--verbose', action='store_true', default=False, help='Whether to print extra debugging information')
parser.add_argument('--debug', action='store_true', default=False, help='Whether to only start the Xorg subprocesses, and not actually alter the fan speed. This can be useful for debugging.')
args = parser.parse_args()

# Hysteresis gap, in the same unit as --temp; added to the temperature when
# computing the upper bound of the acceptable fan speed.
T_HYST = args.hyst

# Validate with parser.error() rather than assert: asserts are stripped when
# Python runs with -O, and parser.error() prints the usage text and exits
# with status 2 instead of dumping a traceback.
if len(args.temp) != len(args.speed):
    parser.error('temp and speed should have the same length')
if len(args.temp) < 2:
    # The lengths are equal at this point, so this also covers --speed.
    parser.error('Please use at least two points for temp and speed')
# The fan curve is interpolated between consecutive points. Equal neighbouring
# temperatures would divide by zero, and unsorted points give a meaningless
# curve, so reject both up front.
if any(low >= high for low, high in zip(args.temp, args.temp[1:])):
    parser.error('temp points must be strictly increasing')
if any(low > high for low, high in zip(args.speed, args.speed[1:])):
    parser.error('speed points must not decrease')

# Raw EDID blob for an arbitrary display (the embedded monitor name reads
# "DELL U2410"). config() writes it to a file that XORG_CONF's CustomEDID
# option points at.
EDID = b'\x00\xff\xff\xff\xff\xff\xff\x00\x10\xac\x15\xf0LTA5.\x13\x01\x03\x804 x\xee\x1e\xc5\xaeO4\xb1&\x0ePT\xa5K\x00\x81\x80\xa9@\xd1\x00qO\x01\x01\x01\x01\x01\x01\x01\x01(<\x80\xa0p\xb0#@0 6\x00\x06D!\x00\x00\x1a\x00\x00\x00\xff\x00C592M9B95ATL\n\x00\x00\x00\xfc\x00DELL U2410\n  \x00\x00\x00\xfd\x008L\x1eQ\x11\x00\n      \x00\x1d'

# Template for a single-screen X server config with a fake monitor attached
# on "DFP-0". It is rendered with str.format(), which fills two placeholders:
#   {edid} - path of the file holding the EDID blob above
#   {bus}  - PCI bus ID of the GPU, in the decimal form decimalize() produces
# NOTE(review): the "Screen" section refers to Device "VideoCard0" while the
# "Device" section is identified as "Videocard0", and "Coolbits" is "20" in one
# section and "29" in the other - confirm both are intentional.
XORG_CONF = """Section "ServerLayout"
    Identifier     "Layout0"
    Screen      0  "Screen0"     0    0
EndSection

Section "Screen"
    Identifier     "Screen0"
    Device         "VideoCard0"
    Monitor        "Monitor0"
    DefaultDepth   8
    Option         "UseDisplayDevice" "DFP-0"
    Option         "ConnectedMonitor" "DFP-0"
    Option         "CustomEDID" "DFP-0:{edid}"
    Option         "Coolbits" "20"
    SubSection "Display"
                Depth   8
                Modes   "160x200"
    EndSubSection
EndSection

Section "ServerFlags"
    Option         "AllowEmptyInput" "on"
    Option         "Xinerama"        "off"
    Option         "SELinux"         "off"
EndSection

Section "Device"
    Identifier  "Videocard0"
    Driver      "nvidia"
        Screen      0
        Option      "UseDisplayDevice" "DFP-0"
        Option      "ConnectedMonitor" "DFP-0"
        Option      "CustomEDID" "DFP-0:{edid}"
        Option      "Coolbits" "29"
        BusID       "PCI:{bus}"
EndSection

Section "Monitor"
    Identifier      "Monitor0"
    Vendorname      "Dummy Display"
    Modelname       "160x200"
    #Modelname       "1024x768"
EndSection
""" 

def log_output(command, ok=(0,)):
    """Run ``command`` and return its combined stdout/stderr as one string.

    Args:
        command: Argument list to execute; passed to ``Popen`` unchanged.
        ok: Return codes that count as success.

    Returns:
        The output with every line stripped, joined back together with
        newlines.

    Raises:
        TimeoutExpired: If the command has not finished after 60 seconds. The
            child process is killed before the exception propagates.
        ValueError: If the command exits with a return code not in ``ok``.
    """
    if args.verbose:
        print('Command launched: ' + ' '.join(command))
    p = Popen(command, stdout=PIPE, stderr=STDOUT)
    try:
        # communicate() drains the pipe while it waits. Calling wait() before
        # reading can deadlock once the child fills the OS pipe buffer.
        raw, _ = p.communicate(timeout=60)
    except TimeoutExpired:
        print('Command timed out: ' + ' '.join(command))
        # Do not leave the stuck child running; reap it, then re-raise.
        p.kill()
        p.communicate()
        raise

    # Split on '\n' only, mirroring line iteration over the binary pipe.
    pieces = raw.split(b'\n')
    if pieces[-1] == b'':
        # Text after the final newline is empty: not a line of its own.
        pieces.pop()
    output = [piece.decode().strip() for piece in pieces]

    if args.verbose:
        for line in output:
            print(line)
        print('Command finished')

    if p.returncode not in ok:
        print('\n'.join(output))
        raise ValueError('Command crashed with return code ' + str(p.returncode) + ': ' + ' '.join(command))
    return '\n'.join(output)

def decimalize(bus):
    """Convert a bus ID into the colon-separated decimal form used in the X config.

    The leading nine characters (the domain prefix) are discarded; what is
    left is split on ':' and '.', and each hexadecimal piece is rewritten as a
    decimal number.
    """
    without_domain = bus[9:]
    decimal_parts = []
    for component in re.split('[:.]', without_domain):
        decimal_parts.append(str(int('0x' + component, 16)))
    return ':'.join(decimal_parts)

def gpu_buses():
    """Return the PCI bus IDs reported by nvidia-smi, one list entry per output line."""
    listing = log_output(['nvidia-smi', '--format=csv,noheader', '--query-gpu=pci.bus_id'])
    return listing.splitlines()

def query(bus, field):
    """Ask nvidia-smi for a single ``field`` of the GPU selected by ``bus``.

    Exactly one line of output is expected; any other number of lines makes
    the unpacking below raise ValueError.
    """
    command = ['nvidia-smi', '--format=csv,noheader', '--query-gpu='+field, '-i', bus]
    rows = log_output(command).splitlines()
    [only_row] = rows
    return only_row

def temperature(bus):
    """Return the 'temperature.gpu' reading of the GPU at ``bus`` as an int."""
    reading = query(bus, 'temperature.gpu')
    return int(reading)

def config(bus):
    """Create a temporary directory holding the EDID file and the rendered
    X server config for the GPU at ``bus``; return the path of the config file."""
    workdir = mkdtemp(prefix='cool-gpu-' + bus)
    edid_path = os.path.join(workdir, 'edid.bin')
    conf_path = os.path.join(workdir, 'xorg.conf')

    # Both files are opened before anything is written, as a single unit.
    with open(edid_path, 'wb') as edid_file:
        with open(conf_path, 'w') as conf_file:
            edid_file.write(EDID)
            rendered = XORG_CONF.format(edid=edid_path, bus=decimalize(bus))
            conf_file.write(rendered)

    return conf_path

def xserver(display, bus):
    """Launch Xorg on ``display`` using a freshly written config for the GPU
    at ``bus``, and return the Popen handle of the new process."""
    command = ['Xorg', display, '-once', '-config', config(bus)]
    print('Starting xserver: '+' '.join(command))
    process = Popen(command)
    if args.verbose:
        print('Started xserver')
    return process

def xserver_pids():
    """Return the PIDs that ``pgrep Xorg`` prints, as a list of ints.

    Return code 1 is accepted alongside 0 (presumably pgrep's "no match"
    status - confirm against the pgrep manual).
    """
    listing = log_output(['pgrep', 'Xorg'], ok=(0, 1))
    return [int(pid) for pid in listing.splitlines()]

def kill_xservers():
    """Make sure no X server is running before we start our own.

    X servers already attached to the GPUs would stop us from setting up ours,
    and existing servers cannot be reused for the reasons detailed in
    https://github.com/andyljones/coolgpus/issues/1

    Raises:
        IOError: If X servers are running and ``--kill`` was not given, or if
            they are still running after ten one-second checks.
    """
    running = xserver_pids()
    if not running:
        print('No existing X servers, we\'re good to go')
        return

    if not args.kill:
        raise IOError('There are already X servers active. Either run the script with the `--kill` switch, or kill them yourself first')

    print('Killing all running X servers, including ' + ", ".join(map(str, running)))
    log_output(['pkill', 'Xorg'], ok=(0, 1))

    # Poll up to ten times, one second apart, for the servers to disappear.
    checks_left = 10
    while checks_left:
        if not xserver_pids():
            print('All X servers killed')
            return
        print('Awaiting X server shutdown')
        time.sleep(1)
        checks_left -= 1
    raise IOError('Failed to kill existing X servers. Try killing them yourself before running this script')

@contextmanager
def xservers(buses):
    """Context manager that starts one X server per GPU in ``buses``.

    Yields a dict mapping each bus ID to its display ID (':0', ':1', ...).
    On exit, every server that was started is sent a terminate request.
    """
    kill_xservers()
    displays = {}
    servers = {}
    try:
        for index, bus in enumerate(buses):
            display = ':' + str(index)
            displays[bus] = display
            servers[bus] = xserver(display, bus)
        yield displays
    finally:
        for bus in servers:
            print('Terminating xserver for display ' + displays[bus])
            servers[bus].terminate()
def determine_segment(t):
    '''Pick the piece of the user-specified piece-wise fan curve that applies to t.

    Consecutive entries of args.temp and args.speed form the segments. With
        args.temp  = [30, 50, 70, 90]
        args.speed = [10, 30, 50, 75]
    the segments are 0: 30-50 / 10-30, 1: 50-70 / 30-50 and 2: 70-90 / 50-75.

    Returns (temp_a, temp_b, speed_a, speed_b) for the chosen segment:
      * the first segment whose upper temperature is above t, which includes
        the first segment when t is below the lowest temperature;
      * the last segment when t is at or above the highest temperature.
    '''
    # TODO: assert temps and speeds are sorted
    lower_temps, upper_temps = args.temp[:-1], args.temp[1:]
    lower_speeds, upper_speeds = args.speed[:-1], args.speed[1:]
    for temp_a, temp_b, speed_a, speed_b in zip(lower_temps, upper_temps, lower_speeds, upper_speeds):
        below_segment = t < temp_a
        inside_segment = temp_a <= t < temp_b
        if below_segment or inside_segment:
            break
    # If no segment matched, the loop variables still hold the last segment.
    return temp_a, temp_b, speed_a, speed_b

def min_speed(t):
    """Return the fan speed for temperature ``t``: a linear interpolation
    within t's segment, clamped to the segment's two speeds and truncated to int."""
    temp_a, temp_b, speed_a, speed_b = determine_segment(t)
    fraction = (t - temp_a)/float(temp_b - temp_a)
    interpolated = speed_a + (speed_b - speed_a)*fraction
    clamped = min(max(interpolated, speed_a), speed_b)
    return int(clamped)

def max_speed(t):
    """Return the speed min_speed() gives for ``t`` raised by the hysteresis gap."""
    shifted = t + T_HYST
    return min_speed(shifted)

def target_speed(s, t):
    """Clamp the current speed ``s`` into the allowed band for temperature ``t``.

    Returns a tuple ``(new_speed, lower_bound, upper_bound)``.
    """
    lower = min_speed(t)
    upper = max_speed(t)
    clamped = min(max(s, lower), upper)
    return clamped, lower, upper

def assign(display, command):
    """Apply one nvidia-settings assignment (``-a command``) on ``display``."""
    # The duct-taped-together xorg.conf triggers innocent but voluminous
    # warnings about failing to authenticate. log_output captures them, so
    # they are only shown in verbose mode or when the command fails.
    settings_call = ['nvidia-settings', '-a', command, '-c', display]
    log_output(settings_call)

def set_speed(display, target):
    """Switch the GPU on ``display`` to manual fan control, then set fan 0 to
    ``target`` (truncated to an int)."""
    assign(display, '[gpu:0]/GPUFanControlState=1')
    percent = int(target)
    assign(display, '[fan:0]/GPUTargetFanSpeed=' + str(percent))

def manage_fans(displays):
    """Continually set each GPU's fan speed from its temperature.

    Every five seconds, each GPU's temperature is read and its fan speed is
    moved into the band allowed for that temperature. The speed is only
    written when it actually changes. The loop runs until an exception
    (such as KeyboardInterrupt) ends it.

    Args:
        displays: Mapping from GPU bus ID to the display ID of the X server
            attached to that GPU.

    On exit, fan control is handed back to the driver for every display. A
    failure to release one GPU is reported and does not stop the others from
    being released.
    """
    try:
        # Start at 0 so the first pass always writes a speed for every GPU
        # whose target is non-zero.
        speeds = {b: 0 for b in displays}
        while True:
            for bus, display in displays.items():
                temp = temperature(bus)
                s, l, u = target_speed(speeds[bus], temp)
                if s != speeds[bus]:
                    print('GPU {}, {}C -> [{}%-{}%]. Setting speed to {}%'.format(display, temp, l, u, s))
                    set_speed(display, s)
                    speeds[bus] = s
                else:
                    print('GPU {}, {}C -> [{}%-{}%]. Leaving speed at {}%'.format(display, temp, l, u, s))
            time.sleep(5)
    finally:
        for display in displays.values():
            # Best effort per GPU: an error here must not leave the remaining
            # GPUs stuck in manual mode at a fixed fan speed.
            try:
                assign(display, '[gpu:0]/GPUFanControlState=0')
            except (ValueError, OSError, TimeoutExpired) as err:
                print('Failed to release fan speed control for GPU at ' + display + ': ' + str(err))
            else:
                print('Released fan speed control for GPU at '+display)

def debug_loop(displays):
    """Print the display/bus pairs once, then idle forever, printing a
    heartbeat line every five seconds. Fan speeds are never touched."""
    rows = []
    for bus, display in displays.items():
        rows.append(str(display) + ' - ' + str(bus))
    listing = '\n'.join(rows)
    print('\n\n\nLOOPING IN DEBUG MODE. DISPLAYS ARE:\n' + listing + '\n\n\n')
    while True:
        print('Looping in debug mode')
        time.sleep(5)


def run():
    """Entry point: start an X server per detected GPU, then run either the
    debug loop (``--debug``) or the fan management loop."""
    with xservers(gpu_buses()) as displays:
        loop = debug_loop if args.debug else manage_fans
        loop(displays)

if __name__ == '__main__':
    run()

需要注意的是，这段代码默认在服务器环境下运行，也就是系统中没有Xorg程序在运行；如果是桌面版系统，则需要先kill掉X11才能成功执行，比如ubuntu22.04 Desktop版系统就需要执行 sudo service gdm3 stop 来关闭桌面，从而关闭X11。

---------------------------------------------------

通过这段代码我们可以知道几个知识点：

查询主板上nvidia显卡的总线号：

nvidia-smi --format=csv,noheader --query-gpu=pci.bus_id

Ubuntu下手动设置Nvidia显卡风扇转速_杂谈_02

查询现在系统正在运行的X11程序的进程号：

pgrep Xorg

Ubuntu下手动设置Nvidia显卡风扇转速_杂谈_03

X11程序启动需要读取xorg.conf文件，默认的nvidia显卡启动的X11程序读取文件位置为 /etc/X11/xorg.conf

网上有很多用命令设置nvidia显卡风扇转速的教程，但普遍反映效果一般，很多人反映不好用。后来发现这些教程都是针对单显卡的情况，对于多显卡电脑并不适用。原因在于nvidia官方的风扇转速调节命令只在显卡承担显示任务时才有效，也就是说，多卡电脑要使用nvidia官方命令调节风扇转速，就必须让每块显卡都进行显示输出，即每块显卡都要运行一个X11进程，这样才能对该显卡进行风扇调速。而众所周知，多卡电脑通常只有一块显卡运行显示任务，其他显卡并不运行显示任务，因此网上的nvidia-settings调风扇转速的命令在多卡电脑上是无效的，会出现各种报错。也正因如此，上面这段代码给出了一个另类的解决方法：把原有的X11进程kill掉，然后针对每一块显卡都模拟出一个屏幕显示配置文件，并以此启动一个X11进程。不过需要注意的是，这种给每块显卡都模拟一个屏幕再运行X11的方法，需要先把之前运行的真实的、可以显示输出的X11进程关掉，这也就导致真实的屏幕没有显示输出了（黑屏）。

由于ubuntu的Desktop版本系统的桌面进程会监控X11的运行情况，一旦发现X11进程被kill掉，桌面进程就会重新启动一个X11进程，最后导致永远关不掉真实的X11进程。这里给出ubuntu22.04系统下的解决方法：先kill掉桌面进程（sudo service gdm3 stop），然后再通过pgrep Xorg获得真实X11进程的进程号，并将其kill掉。

在单显卡电脑上使用nvidia命令调节显卡风扇转速：

开启手动设置模式：

nvidia-settings -a [gpu:0]/GPUFanControlState=1 -c :0

设置转速50%：
nvidia-settings -a [fan:0]/GPUTargetFanSpeed=50 -c :0

关闭手动设置模式，恢复自动模式：

nvidia-settings -a [gpu:0]/GPUFanControlState=0 -c :0

其中，由于该命令只对单显卡情况有效，因此 [gpu:0] 和 [fan:0] 是固定的；-c :0 指定的是X11的display编号，由于是单卡单屏幕，因此这里也是固定不变的。

PS: 实践证明，如果电脑有两个显卡，每个显卡各接一个屏幕，依旧只有一个X11进程：

Ubuntu下手动设置Nvidia显卡风扇转速_杂谈_04

可以看到，上面虽然列出了两个Xorg，其实是同一个进程（pid为1586）在不同显卡上的占用情况：其中0号卡负责显示输出，显存占用111Mb；1号卡不进行显示输出，显存占用4Mb。

结合上面的分析可以知道，目前能够使用官方命令手动控制多卡电脑显卡风扇转速的，也只有这种模拟屏幕的方法了。该方法最大的缺点就是要kill掉真实的X11，导致无法进行桌面显示输出；不过需要手动调节风扇转速的多卡电脑大多用于计算场景，那样的话也就不需要考虑这个问题了。

====================================

根据上面的代码给出一个固定转速的操作，这里假设固定转速为50%：

nvidia-smi --format=csv,noheader --query-gpu=pci.bus_id

00000000:01:00.0
00000000:02:00.0

文件 x.py

Ubuntu下手动设置Nvidia显卡风扇转速_github_05

Ubuntu下手动设置Nvidia显卡风扇转速_ide_06

# Raw EDID blob for an arbitrary display (the embedded monitor name reads
# "DELL U2410"). It is written to a file that XORG_CONF's CustomEDID option
# points at.
EDID = b'\x00\xff\xff\xff\xff\xff\xff\x00\x10\xac\x15\xf0LTA5.\x13\x01\x03\x804 x\xee\x1e\xc5\xaeO4\xb1&\x0ePT\xa5K\x00\x81\x80\xa9@\xd1\x00qO\x01\x01\x01\x01\x01\x01\x01\x01(<\x80\xa0p\xb0#@0 6\x00\x06D!\x00\x00\x1a\x00\x00\x00\xff\x00C592M9B95ATL\n\x00\x00\x00\xfc\x00DELL U2410\n  \x00\x00\x00\xfd\x008L\x1eQ\x11\x00\n      \x00\x1d'

# Template for a single-screen X server config with a fake monitor attached
# on "DFP-0". It is rendered with str.format(), which fills two placeholders:
#   {edid} - path of the file holding the EDID blob above
#   {bus}  - PCI bus ID of the GPU, in the decimal form decimalize() produces
# NOTE(review): the "Screen" section refers to Device "VideoCard0" while the
# "Device" section is identified as "Videocard0", and "Coolbits" is "20" in one
# section and "29" in the other - confirm both are intentional.
XORG_CONF = """Section "ServerLayout"
    Identifier     "Layout0"
    Screen      0  "Screen0"     0    0
EndSection

Section "Screen"
    Identifier     "Screen0"
    Device         "VideoCard0"
    Monitor        "Monitor0"
    DefaultDepth   8
    Option         "UseDisplayDevice" "DFP-0"
    Option         "ConnectedMonitor" "DFP-0"
    Option         "CustomEDID" "DFP-0:{edid}"
    Option         "Coolbits" "20"
    SubSection "Display"
                Depth   8
                Modes   "160x200"
    EndSubSection
EndSection

Section "ServerFlags"
    Option         "AllowEmptyInput" "on"
    Option         "Xinerama"        "off"
    Option         "SELinux"         "off"
EndSection

Section "Device"
    Identifier  "Videocard0"
    Driver      "nvidia"
        Screen      0
        Option      "UseDisplayDevice" "DFP-0"
        Option      "ConnectedMonitor" "DFP-0"
        Option      "CustomEDID" "DFP-0:{edid}"
        Option      "Coolbits" "29"
        BusID       "PCI:{bus}"
EndSection

Section "Monitor"
    Identifier      "Monitor0"
    Vendorname      "Dummy Display"
    Modelname       "160x200"
    #Modelname       "1024x768"
EndSection
""" 


from tempfile import mkdtemp
import os
import re


# Bus ID of the GPU to generate a config for. decimalize() drops the first
# nine characters, so the full form is expected, e.g. "00000000:01:00.0".
bus = input("bus:")


def decimalize(bus):
    """Convert a bus ID into the colon-separated decimal form used in the X config.

    The leading nine characters (the domain prefix) are discarded; what is
    left is split on ':' and '.', and each hexadecimal piece is rewritten as a
    decimal number.
    """
    without_domain = bus[9:]
    decimal_parts = []
    for component in re.split('[:.]', without_domain):
        decimal_parts.append(str(int('0x' + component, 16)))
    return ':'.join(decimal_parts)


# Write the EDID blob and the rendered X server config for the chosen GPU
# into a fresh temporary directory, then print the path of the config file.
tempdir = mkdtemp(prefix='cool-gpu-' + bus)
edid = os.path.join(tempdir, 'edid.bin')
conf = os.path.join(tempdir, 'xorg.conf')

# Both files are opened before anything is written.
with open(edid, 'wb') as e:
    with open(conf, 'w') as c:
        e.write(EDID)
        c.write(XORG_CONF.format(edid=edid, bus=decimalize(bus)))
print(conf)

View Code

Ubuntu下手动设置Nvidia显卡风扇转速_ide_07