DDP是目前Pytorch推荐的多GPU训练方法,它支持单机多卡、多机多卡等情况。目前pytorch对DDP技术中进程的启动方式又分为launch脚本启动和mp模块启动两种。就目前而言,pytorch更推荐采用mp的方法,但launch的方法使用得也很广,所以下面一并做一些介绍。
1.DataParallel 和 DistributedDataParallel之间的比较
- 首先,DP是单进程多线程,只可以在单机中工作。DDP是多进程的,在单机或多机情况下都可以工作。
- 即使在单机工作的情况下,DP也要比DDP更慢。这是由于DP受到了GIL锁、每次前向传播复制模型、以及额外的输入散射(scatter)和输出收集(gather)等因素的限制。
- 从之前的文章可以知道,当你的模型太大以至于单卡无法加载时,你必须借助“模型并行”来解决。DDP支持结合“模型并行”,但DP却不支持。注意:当DDP结合“模型并行”时,每一个DDP进程内部使用“模型并行”,所有的进程间总体使用“数据并行”。
2.一个Launch示例
此方法借助于torch.distributed.launch模块,使用得较广泛,但现在pytorch官方更推荐mp方法(见下节)
import os
import argparse
import torch
import torch.distributed as dist
import torchvision
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP
class ToyModel(nn.Module):
    """Tiny two-layer network (Linear 10->10, ReLU, Linear 10->5) for the demos."""

    def __init__(self):
        super().__init__()
        self.net1 = nn.Linear(10, 10)
        self.relu = nn.ReLU()
        self.net2 = nn.Linear(10, 5)

    def forward(self, x):
        """Return ``net2(relu(net1(x)))``."""
        return self.net2(self.relu(self.net1(x)))
def setup(rank, world_size):
    """Select this process's GPU and join the default process group.

    Args:
        rank: Rank of this process; also used as the CUDA device index.
        world_size: Total number of processes in the group.
    """
    # Set the default GPU for this process. If you run into
    # "CUDA error: an illegal memory access was encountered",
    # check that this call is present.
    torch.cuda.set_device(rank)
    # The launcher normally exports the rendezvous address and port already;
    # only fall back to these single-machine defaults when they are missing,
    # so the values chosen by the launcher are never overwritten.
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    os.environ.setdefault('MASTER_PORT', '12355')
    # The first argument selects the backend. NCCL is the recommended backend
    # for single-machine multi-GPU training and is much faster than gloo.
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
def cleanup():
    """Destroy the default process group if one is currently initialized."""
    # Guarded so that calling cleanup() before setup(), or calling it twice,
    # is a harmless no-op instead of an error.
    if dist.is_initialized():
        dist.destroy_process_group()
def demo_basic(rank, bs=32):
    """Run one pass of DDP training on this rank and save a checkpoint on rank 0.

    Args:
        rank: Rank of this process; also used as the CUDA device index.
        bs: Batch size of this process (i.e. per GPU), not the global batch.
    """
    print(f"Running basic DDP example on rank {rank}.")
    # `...` is a placeholder: fill in the dataset arguments you need.
    trainset = torchvision.datasets.CIFAR10(...)
    # The dataset must be wrapped in a DistributedSampler.
    train_sampler = torch.utils.data.distributed.DistributedSampler(trainset)
    trainloader = torch.utils.data.DataLoader(
        trainset,
        batch_size=bs,
        num_workers=2,
        sampler=train_sampler,
    )
    # Move the model onto this rank's device first, then wrap it with DDP.
    model = ToyModel().to(rank)
    ddp_model = DDP(model, device_ids=[rank], output_device=rank)
    loss_fn = nn.MSELoss()
    # Build the optimizer only after the model has been wrapped with DDP.
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
    # Each batch from the loader is an (inputs, labels) pair.
    for inputs, labels in trainloader:
        inputs, labels = inputs.to(rank), labels.to(rank)
        # Reset gradients every step so they do not accumulate across batches.
        optimizer.zero_grad()
        outputs = ddp_model(inputs)
        loss_fn(outputs, labels).backward()
        optimizer.step()
    if rank == 0:
        # Note 1: saving on rank 0 only is sufficient.
        # Note 2: save ddp_model.module.state_dict(), which holds the real
        # model parameters; ddp_model itself is only a wrapper.
        torch.save(ddp_model.module.state_dict(), "xxx.ckpt")
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # The launch module automatically appends a local-rank option to each
    # worker's command line, so it must be declared here (we never pass it
    # ourselves); leaving it undefined makes argparse fail. Both spellings
    # are accepted because newer launchers pass "--local-rank".
    parser.add_argument("--local_rank", "--local-rank", default=0, type=int)
    opts = parser.parse_args()
    n_gpus = torch.cuda.device_count()
    assert n_gpus >= 2, f"Requires at least 2 GPUs to run, but got {n_gpus}"
    # Prefer the process count exported by the launcher so that
    # --nproc_per_node may be smaller than the number of visible GPUs.
    world_size = int(os.environ.get("WORLD_SIZE", n_gpus))
    setup(opts.local_rank, world_size)
    demo_basic(opts.local_rank)
    cleanup()
####
##CMD:
#CUDA_VISIBLE_DEVICES="a,b,c" python -m torch.distributed.launch --nproc_per_node n main.py
###
一些注意事项:
A.大部分说明在代码注释里
B.--nproc_per_node指定了每个节点上启动的进程数(通常一个进程对应一张GPU,因此也就是需要用到的GPU数量)
C.此处设置的batch_size是每张卡的batch_size,每个iter所训练的数据量实际为:batch_size x n_gpu。(这和DP不同)
D.torch.cuda.set_device(rank)设置了默认使用的GPU设备号,以防止illegal memory错误。在程序的一些其他地方就可以直接用aa.to('cuda')来使用GPU啦(其实不推荐,pytorch推荐的用法是aa.to(rank),如果rank参数能够传递到对应位置,还是使用官方推荐比较好)。
3.一个mp示例(pytorch推荐方法)
在使用DDP时,首先要做的就是进行一些“进程组”的设置。
import os
import sys
import tempfile
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
# On Windows platform, the torch.distributed package only
# supports Gloo backend, FileStore and TcpStore.
# For FileStore, set init_method parameter in init_process_group
# to a local file. Example as follow:
# init_method="file:///f:/libtmp/some_file"
# dist.init_process_group(
# "gloo",
# rank=rank,
# init_method=init_method,
# world_size=world_size)
# For TcpStore, same way as on Linux.
def setup(rank, world_size):
    """Configure the rendezvous address and join the default process group.

    Args:
        rank: Rank of this process; also used as the CUDA device index.
        world_size: Total number of processes in the group.
    """
    # Optional: add this call or leave it out depending on your situation.
    torch.cuda.set_device(rank)
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    # initialize the process group
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
def cleanup():
    """Destroy the default process group if one is currently initialized."""
    # Guarded so that calling cleanup() before setup(), or calling it twice,
    # is a harmless no-op instead of an error.
    if dist.is_initialized():
        dist.destroy_process_group()
做完上述初始化后,需要对模型和数据做一些包装。需要注意的是,DDP在构建时会由0号进程将模型状态广播到其他所有进程,所以你不用担心各个进程中的模型初始值不同。
class ToyModel(nn.Module):
    """Tiny two-layer network (Linear 10->10, ReLU, Linear 10->5) for the demos."""

    def __init__(self):
        super().__init__()
        self.net1 = nn.Linear(10, 10)
        self.relu = nn.ReLU()
        self.net2 = nn.Linear(10, 5)

    def forward(self, x):
        """Return ``net2(relu(net1(x)))``."""
        return self.net2(self.relu(self.net1(x)))
def demo_basic(rank, world_size):
    """Run a single DDP forward/backward/optimizer step on random data.

    Args:
        rank: Rank of this process; also used as the CUDA device index.
        world_size: Total number of processes in the group.
    """
    print(f"Running basic DDP example on rank {rank}.")
    setup(rank, world_size)

    # create model and move it to GPU with id rank
    model = ToyModel().to(rank)
    ddp_model = DDP(model, device_ids=[rank])

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    optimizer.zero_grad()
    outputs = ddp_model(torch.randn(20, 10))
    labels = torch.randn(20, 5).to(rank)
    loss_fn(outputs, labels).backward()
    optimizer.step()

    cleanup()
def run_demo(demo_fn, world_size):
    """Spawn ``world_size`` processes running ``demo_fn`` and wait for them.

    Args:
        demo_fn: Callable invoked in each process as ``demo_fn(rank, world_size)``;
            the rank is supplied by ``mp.spawn`` as the first argument.
        world_size: Number of processes to start.
    """
    mp.spawn(
        demo_fn,
        args=(world_size,),
        nprocs=world_size,
        join=True,
    )
if __name__ == "__main__":
    n_gpus = torch.cuda.device_count()
    assert n_gpus >= 2, f"Requires at least 2 GPUs to run, but got {n_gpus}"
    # One process per visible GPU.
    world_size = n_gpus
    run_demo(demo_basic, world_size)
####
##CMD:
#CUDA_VISIBLE_DEVICES="a,b,c" python main.py
###
以上就是一个简单的DDP使用示例。正如你所见,DDP拥有很清晰的API接口,从而你无需关心梯度的同步通信等问题。
一些注意事项:
A.这个示例与Getting Started with Distributed Data Parallel 保持一致,没有处理dataset, 具体处理方法可见上一节。
B. torch.cuda.set_device(rank)不被官方推荐,但根据个人情况添加。
C.mp方法存在初始化较慢的现象,但是后面的运行速度不会慢,如果你发现运行速度慢于launch方法,可以参考Why using mp.spawn is slower than using torch.distributed.launch when using multi-GPU training · Issue #47587 · pytorch/pytorch (github.com)
D.mp方法是pytorch官方的推荐方法,可以尝试使用。mp方法具有更灵活的进程控制(如果会用),运行时也不需要调用其他模块。
4.保存和载入Checkpoints
当使用DDP时,模型应该只在一个进程中被保存,然后在载入时由所有进程各自载入。这样能避免重复写入,并且保证各个进程的初始参数相同。你还需确保在保存完成之后,其他进程才开始进行载入操作。此外,在加载模块时,需要提供适当的map_location参数,以防止进程访问到其他进程的设备。如果map_location未设置,torch.load将首先将模块加载到CPU,然后将每个参数复制到保存它时所在的设备,这将导致同一台机器上的所有进程使用相同的设备集。
def demo_checkpoint(rank, world_size):
    """Save a DDP checkpoint on rank 0, load it on every rank, then train a step.

    Args:
        rank: Rank of this process; also used as the CUDA device index.
        world_size: Total number of processes in the group.
    """
    print(f"Running DDP checkpoint example on rank {rank}.")
    setup(rank, world_size)

    model = ToyModel().to(rank)
    ddp_model = DDP(model, device_ids=[rank])

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    CHECKPOINT_PATH = os.path.join(tempfile.gettempdir(), "model.checkpoint")
    if rank == 0:
        # All processes should see same parameters as they all start from same
        # random parameters and gradients are synchronized in backward passes.
        # Therefore, saving it in one process is sufficient.
        torch.save(ddp_model.state_dict(), CHECKPOINT_PATH)

    # Use a barrier() to make sure that process 1 loads the model after process
    # 0 saves it.
    dist.barrier()
    # configure map_location properly: tensors saved from cuda:0 are remapped
    # onto this rank's own device
    map_location = {'cuda:0': f'cuda:{rank}'}
    ddp_model.load_state_dict(
        torch.load(CHECKPOINT_PATH, map_location=map_location))

    optimizer.zero_grad()
    outputs = ddp_model(torch.randn(20, 10))
    labels = torch.randn(20, 5).to(rank)
    loss_fn(outputs, labels).backward()
    optimizer.step()

    # Not necessary to use a dist.barrier() to guard the file deletion below
    # as the AllReduce ops in the backward pass of DDP already served as
    # a synchronization.
    if rank == 0:
        os.remove(CHECKPOINT_PATH)

    cleanup()
5.DDP与“模型并行”相结合
class ToyMpModel(nn.Module):
    """Two-layer toy network whose layers are placed on two different devices.

    Args:
        dev0: Device that holds ``net1``; inputs are moved here first.
        dev1: Device that holds ``net2``; the output is produced here.
    """

    def __init__(self, dev0, dev1):
        super().__init__()
        self.dev0 = dev0
        self.dev1 = dev1
        self.net1 = torch.nn.Linear(10, 10).to(dev0)
        self.relu = torch.nn.ReLU()
        self.net2 = torch.nn.Linear(10, 5).to(dev1)

    def forward(self, x):
        """Run ``net1`` + ReLU on ``dev0``, then ``net2`` on ``dev1``."""
        x = x.to(self.dev0)
        x = self.relu(self.net1(x))
        # Hand the intermediate activation over to the second device.
        x = x.to(self.dev1)
        return self.net2(x)
当DDP结合“模型并行”(即把一个多GPU模型传给DDP)时,不能设置device_ids和output_device。输入和输出数据需要由应用程序或模型的forward()方法负责放置到合适的设备上。
def demo_model_parallel(rank, world_size):
    """Run one DDP training step on a model that is split across two GPUs.

    Args:
        rank: Rank of this process; it uses GPUs ``2 * rank`` and
            ``2 * rank + 1`` (wrapped around the number of visible GPUs).
        world_size: Total number of processes in the group.
    """
    print(f"Running DDP with model parallel example on rank {rank}.")
    setup(rank, world_size)

    # setup mp_model and devices for this process
    # Wrap by the number of visible GPUs rather than by world_size, so that
    # ranks get disjoint device pairs whenever world_size <= n_gpus // 2.
    n_gpus = torch.cuda.device_count()
    dev0 = (rank * 2) % n_gpus
    dev1 = (rank * 2 + 1) % n_gpus
    mp_model = ToyMpModel(dev0, dev1)
    # device_ids / output_device are deliberately left unset for a
    # multi-device module.
    ddp_mp_model = DDP(mp_model)

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_mp_model.parameters(), lr=0.001)

    optimizer.zero_grad()
    # outputs will be on dev1
    outputs = ddp_mp_model(torch.randn(20, 10))
    labels = torch.randn(20, 5).to(dev1)
    loss_fn(outputs, labels).backward()
    optimizer.step()

    cleanup()


if __name__ == "__main__":
    n_gpus = torch.cuda.device_count()
    assert n_gpus >= 2, f"Requires at least 2 GPUs to run, but got {n_gpus}"
    world_size = n_gpus
    run_demo(demo_basic, world_size)
    run_demo(demo_checkpoint, world_size)
    # Every model-parallel process occupies two GPUs, so start half as many.
    world_size = n_gpus // 2
    run_demo(demo_model_parallel, world_size)