pytorch 训练 RuntimeError: Unable to find a valid cuDNN algorithm to run convolution
pytorch 训练 RuntimeError: Unable to find a valid cuDNN algorithm to run convolution
# 问题描述:
python:3.9.5
pytorch:1.10.2
python train.py --img 640 --batch 64 --epochs 600 --data voc.yaml --weights yolov5x.pt --device 0,1,2,3
Traceback (most recent call last):
File "/home/yons/mtl/pytorch/yolov5/train.py", line 643, in <module>
main(opt)
File "/home/yons/mtl/pytorch/yolov5/train.py", line 539, in main
train(opt.hyp, opt, device, callbacks)
File "/home/yons/mtl/pytorch/yolov5/train.py", line 330, in train
pred = model(imgs) # forward
File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 168, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 178, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 86, in parallel_apply
output.reraise()
File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/_utils.py", line 434, in reraise
raise exception
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
output = module(*input, **kwargs)
File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/home/yons/mtl/pytorch/yolov5/models/yolo.py", line 126, in forward
return self._forward_once(x, profile, visualize) # single-scale inference, train
File "/home/yons/mtl/pytorch/yolov5/models/yolo.py", line 149, in _forward_once
x = m(x) # run
File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/home/yons/mtl/pytorch/yolov5/models/common.py", line 139, in forward
return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), dim=1))
File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/modules/container.py", line 141, in forward
input = module(input)
File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/home/yons/mtl/pytorch/yolov5/models/common.py", line 105, in forward
return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/home/yons/mtl/pytorch/yolov5/models/common.py", line 47, in forward
return self.act(self.bn(self.conv(x)))
File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 446, in forward
return self._conv_forward(input, self.weight, self.bias)
File "/home/yons/miniconda3/envs/ptest/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 442, in _conv_forward
return F.conv2d(input, weight, bias, self.stride,
RuntimeError: Unable to find a valid cuDNN algorithm to run convolution
运行nvidia-smi显示
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.103.01 Driver Version: 470.103.01 CUDA Version: 11.4 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA GeForce ... Off | 00000000:04:00.0 Off | N/A |
| 0% 35C P8 34W / 350W | 18MiB / 12051MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 1 NVIDIA GeForce ... Off | 00000000:08:00.0 Off | N/A |
| 0% 36C P8 22W / 350W | 5MiB / 12053MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 2 NVIDIA GeForce ... Off | 00000000:85:00.0 Off | N/A |
| 0% 36C P8 37W / 350W | 5MiB / 12053MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 3 NVIDIA GeForce ... Off | 00000000:89:00.0 Off | N/A |
| 0% 34C P8 27W / 350W | 5MiB / 12053MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 1430 G /usr/lib/xorg/Xorg 9MiB |
| 0 N/A N/A 1726 G /usr/bin/gnome-shell 6MiB |
| 1 N/A N/A 1430 G /usr/lib/xorg/Xorg 4MiB |
| 2 N/A N/A 1430 G /usr/lib/xorg/Xorg 4MiB |
| 3 N/A N/A 1430 G /usr/lib/xorg/Xorg 4MiB |
+-----------------------------------------------------------------------------+
# 原因分析:
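可能是 GPU 显存不足(达到了显存限制):cuDNN 在为卷积选择算法时申请不到所需的工作空间,因此报出 "Unable to find a valid cuDNN algorithm",而不是直接提示 CUDA out of memory。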
# 解决方案:
降低 batch size,例如将 --batch 从 64 降到 32:
python train.py --img 640 --batch 32 --epochs 600 --data voc.yaml --weights yolov5x.pt --device 0,1,2,3