pytorch 训练 RuntimeError: Unable to find a valid cuDNN algorithm to run convolution

pytorch 训练 RuntimeError: Unable to find a valid cuDNN algorithm to run convolution


# 问题描述:


python:3.9.5

pytorch:1.10.2

python train.py --img 640 --batch 64 --epochs 600 --data voc.yaml --weights yolov5x.pt  --device 0,1,2,3
Traceback (most recent call last):
  File "/home/yons/mtl/pytorch/yolov5/train.py", line 643, in <module>
    main(opt)
  File "/home/yons/mtl/pytorch/yolov5/train.py", line 539, in main
    train(opt.hyp, opt, device, callbacks)
  File "/home/yons/mtl/pytorch/yolov5/train.py", line 330, in train
    pred = model(imgs)  # forward
  File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 168, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 178, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 86, in parallel_apply
    output.reraise()
  File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/_utils.py", line 434, in reraise
    raise exception
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
    output = module(*input, **kwargs)
  File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/yons/mtl/pytorch/yolov5/models/yolo.py", line 126, in forward
    return self._forward_once(x, profile, visualize)  # single-scale inference, train
  File "/home/yons/mtl/pytorch/yolov5/models/yolo.py", line 149, in _forward_once
    x = m(x)  # run
  File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/yons/mtl/pytorch/yolov5/models/common.py", line 139, in forward
    return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), dim=1))
  File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/modules/container.py", line 141, in forward
    input = module(input)
  File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/yons/mtl/pytorch/yolov5/models/common.py", line 105, in forward
    return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
  File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/yons/mtl/pytorch/yolov5/models/common.py", line 47, in forward
    return self.act(self.bn(self.conv(x)))
  File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 446, in forward
    return self._conv_forward(input, self.weight, self.bias)
  File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 442, in _conv_forward
    return F.conv2d(input, weight, bias, self.stride,
RuntimeError: Unable to find a valid cuDNN algorithm to run convolution

运行nvidia-smi显示

+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.103.01   Driver Version: 470.103.01   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA GeForce ...  Off  | 00000000:04:00.0 Off |                  N/A |
|  0%   35C    P8    34W / 350W |     18MiB / 12051MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:08:00.0 Off |                  N/A |
|  0%   36C    P8    22W / 350W |      5MiB / 12053MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   2  NVIDIA GeForce ...  Off  | 00000000:85:00.0 Off |                  N/A |
|  0%   36C    P8    37W / 350W |      5MiB / 12053MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   3  NVIDIA GeForce ...  Off  | 00000000:89:00.0 Off |                  N/A |
|  0%   34C    P8    27W / 350W |      5MiB / 12053MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|    0   N/A  N/A      1430      G   /usr/lib/xorg/Xorg                  9MiB |
|    0   N/A  N/A      1726      G   /usr/bin/gnome-shell                6MiB |
|    1   N/A  N/A      1430      G   /usr/lib/xorg/Xorg                  4MiB |
|    2   N/A  N/A      1430      G   /usr/lib/xorg/Xorg                  4MiB |
|    3   N/A  N/A      1430      G   /usr/lib/xorg/Xorg                  4MiB |
+-----------------------------------------------------------------------------+

# 原因分析:

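可能是 GPU 显存不足:batch 设为 64 时,cuDNN 无法为卷积运算申请到足够的显存(workspace),于是报出“找不到可用的 cuDNN 卷积算法”,而不是直接报 CUDA out of memory。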


# 解决方案:

减小 batch size,例如把 --batch 从 64 降到 32:

python train.py --img 640 --batch 32 --epochs 600 --data voc.yaml --weights yolov5x.pt  --device 0,1,2,3