DDP (DistributedDataParallel) is currently the multi-GPU training method recommended by PyTorch; it supports single-machine multi-GPU as well as multi-machine multi-GPU setups. PyTorch offers two ways to start the DDP processes: the launch script (torch.distributed.launch) and the mp (torch.multiprocessing) module. At present PyTorch leans toward the mp approach, but the launch approach is still widely used, so both are introduced below.

1. Comparing DataParallel and DistributedDataParallel

  1. First, DP is single-process multi-threaded and works only on a single machine, whereas DDP is multi-process and works both on a single machine and across multiple machines.
  2. Even on a single machine, DP is slower than DDP. DP is held back by the Python GIL, by replicating the model on every forward pass, and by the extra scattering of inputs and gathering of outputs.
  3. As the earlier post showed, when a model is too large to fit on a single GPU you have to resort to model parallelism. DDP can be combined with model parallelism, but DP cannot. Note: when DDP is combined with model parallelism, each DDP process uses model parallelism internally, while all processes together use data parallelism. A minimal sketch contrasting how the two wrappers are applied follows this list.
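
As a rough illustration of the API difference (my own sketch, not part of the original examples; rank and world_size are assumed to come from a launcher such as the ones shown later), the DP wrapper is a single call in a single process, while DDP requires each process to join a process group first:

import torch
import torch.nn as nn

model = nn.Linear(10, 5)

# DP: one process drives all visible GPUs; the wrapper splits each input batch
# across the devices and gathers the outputs back onto GPU 0.
if torch.cuda.is_available():
    dp_model = nn.DataParallel(model.cuda())

# DDP: one process per GPU; every process joins a process group and wraps its
# own replica (rank and world_size come from the launcher, see setup() below):
#   dist.init_process_group("nccl", rank=rank, world_size=world_size)
#   ddp_model = DistributedDataParallel(model.to(rank), device_ids=[rank])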

2. A launch example

This method relies on the torch.distributed.launch module. It is widely used, although the PyTorch team now leans toward the mp method (see the next section).

import os

import argparse

import torch

import torch.distributed as dist

import torchvision

import torch.nn as nn

import torch.optim as optim

from torch.nn.parallel import DistributedDataParallel as DDP

   

class ToyModel(nn.Module):

    def __init__(self):

        super(ToyModel, self).__init__()

        self.net1 = nn.Linear(10, 10)

        self.relu = nn.ReLU()

        self.net2 = nn.Linear(10, 5)

    def forward(self, x):

        return self.net2(self.relu(self.net1(x)))

   

def setup(rank, world_size):

    # Note: set the default GPU device. If you hit "CUDA error: an illegal memory access was encountered",

    #       check whether this line is present.

    torch.cuda.set_device(rank)

   

    os.environ['MASTER_ADDR'] = 'localhost'

    os.environ['MASTER_PORT'] = '12355'

   

    # Note: the first argument selects the backend; nccl is the recommended backend for single-machine multi-GPU and is much faster than gloo.

    dist.init_process_group("nccl", rank=rank, world_size=world_size)

def cleanup():

    dist.destroy_process_group()

def demo_basic(rank):

    print(f"Running basic DDP example on rank {rank}.")

   

    trainset = torchvision.datasets.CIFAR10(...)

    # Note: the dataset must be wrapped with torch.utils.data.distributed.DistributedSampler

    train_sampler = torch.utils.data.distributed.DistributedSampler(trainset)

    bs = 64  # per-GPU batch size (placeholder value; see note C below)

    trainloader = torch.utils.data.DataLoader(trainset,

                                              batch_size=bs,

                                              num_workers=2,

                                              sampler=train_sampler,

                                              )

   

    # Note: move the model to its rank first, then wrap it with DDP

    model = ToyModel().to(rank)

    ddp_model = DDP(model, device_ids=[rank], output_device=rank)

    loss_fn = nn.MSELoss()

    # Note: construct the optimizer only after wrapping the model with DDP

    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    for data in trainloader:

        inputs, labels = data[0].to(rank), data[1].to(rank)

        optimizer.zero_grad()

        outputs = ddp_model(inputs)

        loss_fn(outputs, labels).backward()

        optimizer.step()

   

    if rank == 0:

        # Note 1: save the model only when rank == 0

        # Note 2: save ddp_model.module.state_dict(); those are the actual model parameters, ddp_model is only a wrapper

        torch.save(ddp_model.module.state_dict(), "xxx.ckpt")

   



if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    # Note: an argument named "local_rank" must be defined here; the launch module passes it automatically,

    #       so we do not pass it ourselves. Omitting the definition raises an error.

    parser.add_argument("--local_rank", default=0, type=int)

    opts = parser.parse_args()

   

    n_gpus = torch.cuda.device_count()

    assert n_gpus >= 2, f"Requires at least 2 GPUs to run, but got {n_gpus}"

    world_size = n_gpus

   

    setup(opts.local_rank, world_size)

    demo_basic(opts.local_rank)

    cleanup()

####

##CMD:

#CUDA_VISIBLE_DEVICES="a,b,c" python -m torch.distributed.launch --nproc_per_node n main.py  

###

A few notes:

A. Most of the explanations are in the code comments.

B. --nproc_per_node specifies how many GPUs to use.

C. The batch_size set here is the per-GPU batch size, so the amount of data processed per iteration is actually batch_size x n_gpu (unlike DP). A small sketch after these notes shows one common way to keep the global batch size fixed.

D. torch.cuda.set_device(rank) sets the default GPU device number, which guards against the illegal-memory-access error. Elsewhere in the program you can then simply call aa.to('cuda') to use the GPU (this is not really recommended; PyTorch's preferred form is aa.to(rank), so if the rank can be passed to where it is needed, it is better to follow the official recommendation).
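
To make note C concrete, here is a small sketch (my own illustration, assuming the process group has already been initialized as in setup() above) of deriving the per-GPU batch size from a fixed global batch size, so the amount of data per iteration stays constant no matter how many GPUs are used:

import torch.distributed as dist

global_batch_size = 256                     # total samples per iteration, across all GPUs
world_size = dist.get_world_size()          # number of DDP processes (one per GPU here)
per_gpu_batch_size = global_batch_size // world_size

# each process then builds its DataLoader with the per-GPU value:
# trainloader = DataLoader(trainset, batch_size=per_gpu_batch_size, sampler=train_sampler)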

3. An mp example (PyTorch's recommended method)

When using DDP, the first thing to do is set up the process group.

import os

import sys

import tempfile

import torch

import torch.distributed as dist

import torch.nn as nn

import torch.optim as optim

import torch.multiprocessing as mp

from torch.nn.parallel import DistributedDataParallel as DDP

# On Windows platform, the torch.distributed package only

# supports Gloo backend, FileStore and TcpStore.

# For FileStore, set init_method parameter in init_process_group

# to a local file. Example as follow:

# init_method="file:///f:/libtmp/some_file"

# dist.init_process_group(

#    "gloo",

#    rank=rank,

#    init_method=init_method,

#    world_size=world_size)

# For TcpStore, same way as on Linux.

def setup(rank, world_size):

    # Note: add this depending on your setup

    torch.cuda.set_device(rank)

    os.environ['MASTER_ADDR'] = 'localhost'

    os.environ['MASTER_PORT'] = '12355'

    # initialize the process group

    dist.init_process_group("gloo", rank=rank, world_size=world_size)

def cleanup():

    dist.destroy_process_group()

After the initialization above, the model and data need some wrapping. Note that when DDP is constructed, process 0 broadcasts the model to the other processes, so you do not need to worry about the model being initialized differently in each process.
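
If you want to see that broadcast for yourself, here is a small sketch (my own, reusing setup() and cleanup() from above and meant as an mp.spawn target like the demos below) in which every rank builds a randomly initialized layer, wraps it in DDP, and prints a parameter checksum; the checksums agree because rank 0's weights are broadcast to all ranks during DDP construction:

def check_broadcast(rank, world_size):
    setup(rank, world_size)
    layer = nn.Linear(10, 5).to(rank)           # different random init in every process
    ddp_layer = DDP(layer, device_ids=[rank])   # construction broadcasts rank 0's weights
    checksum = sum(p.sum().item() for p in ddp_layer.parameters())
    print(f"rank {rank}: parameter checksum = {checksum:.6f}")
    cleanup()

# launched the same way as the demos below, e.g.:
# mp.spawn(check_broadcast, args=(world_size,), nprocs=world_size, join=True)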

class ToyModel(nn.Module):

    def __init__(self):

        super(ToyModel, self).__init__()

        self.net1 = nn.Linear(10, 10)

        self.relu = nn.ReLU()

        self.net2 = nn.Linear(10, 5)

    def forward(self, x):

        return self.net2(self.relu(self.net1(x)))



def demo_basic(rank, world_size):

    print(f"Running basic DDP example on rank {rank}.")

    setup(rank, world_size)

    # create model and move it to GPU with id rank

    model = ToyModel().to(rank)

    ddp_model = DDP(model, device_ids=[rank])

    loss_fn = nn.MSELoss()

    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    optimizer.zero_grad()

    outputs = ddp_model(torch.randn(20, 10))

    labels = torch.randn(20, 5).to(rank)

    loss_fn(outputs, labels).backward()

    optimizer.step()

    cleanup()



def run_demo(demo_fn, world_size):

    mp.spawn(demo_fn,

             args=(world_size,),

             nprocs=world_size,

             join=True)

 

if __name__ == "__main__":

    n_gpus = torch.cuda.device_count()

    assert n_gpus >= 2, f"Requires at least 2 GPUs to run, but got {n_gpus}"

    world_size = n_gpus

    run_demo(demo_basic, world_size)

 

####

##CMD:

#CUDA_VISIBLE_DEVICES="a,b,c" python main.py  

###

The above is a simple example of using DDP. As you can see, DDP has a very clean API, so you do not need to worry about synchronizing gradients across processes yourself.

A few notes:

A. This example follows Getting Started with Distributed Data Parallel and does not handle the dataset; see the previous section for how to do that.

B. torch.cuda.set_device(rank) is not officially recommended; add it or not depending on your situation.

C. The mp method can be slow to initialize, but the subsequent training speed is not slower. If you find it running slower than the launch method, see "Why using mp.spawn is slower than using torch.distributed.launch when using multi-GPU training" · Issue #47587 · pytorch/pytorch (github.com).

D. The mp method is the officially recommended approach and is worth trying. It gives more flexible control over the processes (if you know how to use it), and running it does not require invoking any extra module; a minimal sketch of managing the processes by hand follows these notes.
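
As a rough illustration of the "more flexible process control" mentioned in note D, here is a small sketch (my own, assuming demo_basic and world_size are defined as above) that launches the workers with mp.Process instead of mp.spawn, so each process can be started, monitored, or joined individually:

import torch.multiprocessing as mp

def run_demo_manual(demo_fn, world_size):
    ctx = mp.get_context("spawn")   # CUDA requires the "spawn" start method
    processes = []
    for rank in range(world_size):
        p = ctx.Process(target=demo_fn, args=(rank, world_size))
        p.start()
        processes.append(p)
    for p in processes:             # join (or terminate) each worker individually
        p.join()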

4. Saving and loading checkpoints

When using DDP, the model should be saved in only one process, and at load time that one saved copy is loaded into all processes. This avoids redundant saving and guarantees that every process starts from the same parameters. You also need to make sure that no process starts loading before the save has finished. In addition, when loading the module you need to provide an appropriate map_location argument, to prevent a process from stepping onto another process's device: if map_location is not set, torch.load first loads the module to CPU and then copies each parameter to where it was saved, which would cause all processes on the same machine to use the same set of devices.

def demo_checkpoint(rank, world_size):

    print(f"Running DDP checkpoint example on rank {rank}.")

    setup(rank, world_size)

    model = ToyModel().to(rank)

    ddp_model = DDP(model, device_ids=[rank])

    loss_fn = nn.MSELoss()

    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    CHECKPOINT_PATH = tempfile.gettempdir() + "/model.checkpoint"

    if rank == 0:

        # All processes should see same parameters as they all start from same

        # random parameters and gradients are synchronized in backward passes.

        # Therefore, saving it in one process is sufficient.

        torch.save(ddp_model.state_dict(), CHECKPOINT_PATH)

    # Use a barrier() to make sure that process 1 loads the model after process

    # 0 saves it.

    dist.barrier()

    # configure map_location properly

    map_location = {'cuda:%d' % 0: 'cuda:%d' % rank}

    ddp_model.load_state_dict(

        torch.load(CHECKPOINT_PATH, map_location=map_location))

    optimizer.zero_grad()

    outputs = ddp_model(torch.randn(20, 10))

    labels = torch.randn(20, 5).to(rank)

    loss_fn = nn.MSELoss()

    loss_fn(outputs, labels).backward()

    optimizer.step()

    # Not necessary to use a dist.barrier() to guard the file deletion below

    # as the AllReduce ops in the backward pass of DDP already served as

    # a synchronization.

    if rank == 0:

        os.remove(CHECKPOINT_PATH)

    cleanup()

5. Combining DDP with model parallelism

class ToyMpModel(nn.Module):

    def __init__(self, dev0, dev1):

        super(ToyMpModel, self).__init__()

        self.dev0 = dev0

        self.dev1 = dev1

        self.net1 = torch.nn.Linear(10, 10).to(dev0)

        self.relu = torch.nn.ReLU()

        self.net2 = torch.nn.Linear(10, 5).to(dev1)

    def forward(self, x):

        x = x.to(self.dev0)

        x = self.relu(self.net1(x))

        x = x.to(self.dev1)

        return self.net2(x)

When DDP is combined with model parallelism, device_ids and output_device must NOT be set. Input and output data will be placed on the proper devices automatically (here, by the model's forward()).

def demo_model_parallel(rank, world_size):

    print(f"Running DDP with model parallel example on rank {rank}.")

    setup(rank, world_size)

    # setup mp_model and devices for this process

    dev0 = rank * 2        # each process drives two GPUs, so it is spawned with world_size = n_gpus // 2

    dev1 = rank * 2 + 1

    mp_model = ToyMpModel(dev0, dev1)

    ddp_mp_model = DDP(mp_model)

    loss_fn = nn.MSELoss()

    optimizer = optim.SGD(ddp_mp_model.parameters(), lr=0.001)

    optimizer.zero_grad()

    # outputs will be on dev1

    outputs = ddp_mp_model(torch.randn(20, 10))

    labels = torch.randn(20, 5).to(dev1)

    loss_fn(outputs, labels).backward()

    optimizer.step()

    cleanup()



if __name__ == "__main__":

    n_gpus = torch.cuda.device_count()

    assert n_gpus >= 2, f"Requires at least 2 GPUs to run, but got {n_gpus}"

    world_size = n_gpus

    run_demo(demo_basic, world_size)

    run_demo(demo_checkpoint, world_size)

    # demo_model_parallel uses two GPUs per process, so spawn half as many processes

    run_demo(demo_model_parallel, n_gpus // 2)