Table of contents
- TensorFlow 1 usage
- TensorFlow 2 usage
- 2.0 Distribution strategy: `tf.distribute.MirroredStrategy`
- 2.1 Approach 1: mixing `custom training loops` and `keras model`
- 2.2 Approach 2: pure custom training loop
- References
TensorFlow 1 usage
In TensorFlow 1, variables are reusable and are uniquely identified by their full names.
A computation graph is also bound to devices. If a graph needs variable a during computation but a does not live on that device, TensorFlow automatically generates the necessary communication ops to bring a onto the device. The placement of variables therefore does not affect program correctness, only the communication overhead.
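For instance, here is a minimal sketch (TF 1.x API; the variable name is hypothetical) of how a variable is shared purely by its full name:

```python
import tensorflow as tf

with tf.variable_scope('shared'):
    a = tf.get_variable('a', shape=[2], initializer=tf.zeros_initializer())

# Re-opening the scope with reuse=True resolves 'shared/a' to the SAME variable object
with tf.variable_scope('shared', reuse=True):
    a2 = tf.get_variable('a')

assert a is a2
```

The full multi-GPU example below builds on this mechanism.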
# coding=utf-8
'''
Created on Jan 4, 2017
@author: colinliang
TensorFlow single-machine multi-GPU example,
based on the TensorFlow example cifar10_multi_gpu_train.py
'''
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import numpy as np
def _allocate_variable(name, shape, initializer, dtype=tf.float32):
    # Allocate a variable. TensorFlow automatically handles moving variables
    # between devices, so they can live on either GPU or CPU.
    # Single machine, single GPU: keeping everything on the GPU is faster
    # (no explicit device needed, just let TF place it).
    # Single machine, multiple GPUs: the CPU is slightly faster here; possibly
    # because my two GPUs are linked via SLI, inter-GPU bandwidth is decent.
    with tf.device('/cpu:0'):  # force placement in host memory
        # with tf.device(None):  # default: place on the current device
        var = tf.get_variable(name, shape, initializer=initializer, dtype=dtype)
    print('%s: %s' % (var.op.name, var.device))
    return var
# Build the network y = xw + b
def tower(input_tensor, target_tensor, scope, dims=[]):
    for i, d in enumerate(dims):
        with tf.variable_scope('affine%d' % i) as varscope:  # only determines the variables' full names, not their device
            w = _allocate_variable('w', shape=[input_tensor.get_shape()[1], d], initializer=tf.truncated_normal_initializer(0, 1))
            b = _allocate_variable('b', shape=[], initializer=tf.zeros_initializer)
            input_tensor = tf.matmul(input_tensor, w) + b
            input_tensor = tf.nn.relu(input_tensor)
    with tf.variable_scope('affine_last') as varscope:  # only determines the variables' full names, not their device
        # w = _allocate_variable('w', shape=[input_tensor.get_shape()[1], 1], initializer=tf.truncated_normal_initializer(0, 1))
        w = _allocate_variable('w', shape=[input_tensor.get_shape()[1], 1], initializer=tf.constant_initializer(value=1))
        b = _allocate_variable('b', shape=[], initializer=tf.zeros_initializer)
    y = tf.matmul(input_tensor, w) + b
    l = tf.reduce_mean(tf.square(y - target_tensor))
    tf.add_to_collection('losses', l)
    return y, l
# Merge the gradients from all towers and average them;
# for single-machine multi-GPU programs this code is generic.
def average_tower_grads(tower_grads):
    print('towerGrads:')
    idx = 0
    for grads in tower_grads:  # grads is a list of (gradient, variable) tuples
        print('grads---tower_%d' % idx)
        for g_var in grads:
            print(g_var)
            print('\t%s\n\t%s' % (g_var[0].op.name, g_var[1].op.name))
        idx += 1
    if len(tower_grads) == 1:
        return tower_grads[0]
    avgGrad_var_s = []
    for grad_var_s in zip(*tower_grads):
        grads = []
        v = None
        for g, v_ in grad_var_s:
            g = tf.expand_dims(g, 0)
            grads.append(g)
            v = v_
        all_g = tf.concat(grads, 0)  # TF >= 1.0 argument order: values first, then axis
        avg_g = tf.reduce_mean(all_g, 0, keep_dims=False)
        avgGrad_var_s.append((avg_g, v))
    return avgGrad_var_s
# Option 1: each group of inputs gets its own placeholder; untested
def generate_towers_v1(NUM_GPU=2):
input_tensors = []
target_tensors = []
towerGrads = []
lr = 1e-3
opt = tf.train.AdamOptimizer(lr)
for i in range(NUM_GPU):
with tf.device('/gpu:%d' % i):
with tf.name_scope('tower_%d' % i) as scope:
input_tensor = tf.placeholder(tf.float32, shape=[None, 1], name='input_%d' % i);
input_tensors.append(input_tensor)
target_tensor = tf.placeholder(tf.float32, shape=[None, 1], name='target_%d' % i);
target_tensors.append(target_tensor)
y, loss = tower(input_tensor=input_tensor, target_tensor=target_tensor, scope=scope)
# Reuse variables for the next tower.
tf.get_variable_scope().reuse_variables()
grads = opt.compute_gradients(loss)
towerGrads.append(grads)
avgGrad_var_s = average_tower_grads(towerGrads)
apply_gradient_op = opt.apply_gradients(avgGrad_var_s, global_step=None)
loss = tf.Print(loss, data=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES))
return input_tensors, target_tensors, y, loss, apply_gradient_op
# Option 2: one set of placeholders, split into N shards according to the number
# of towers, each shard fed into the corresponding tower
def generate_towers_v2(NUM_GPU=2, dim_in=1, dims=None, batch_size=None):
    if dims is None: dims = []
    input_tensor = tf.placeholder(tf.float32, shape=[batch_size, dim_in], name='input')
    target_tensor = tf.placeholder(tf.float32, shape=[batch_size, dim_in], name='target')
    input_tensors = tf.split(input_tensor, NUM_GPU, axis=0)  # batch_size must be divisible by NUM_GPU
    target_tensors = tf.split(target_tensor, NUM_GPU, axis=0)
    towerGrads = []
    lr = 1e-2
    # opt = tf.train.AdamOptimizer(lr)  # unlike GradientDescentOptimizer, Adam allocates extra slot variables
    opt = tf.train.GradientDescentOptimizer(lr)
for i in range(NUM_GPU):
with tf.device('/gpu:%d' % i):
with tf.name_scope('tower_%d' % i) as scope:
input_sub = input_tensors[i]
print("device:%s" % input_sub.device)
target_sub = target_tensors[i]
y, loss = tower(input_tensor=input_sub, target_tensor=target_sub, scope=scope, dims=dims)
# Reuse variables for the next tower.
tf.get_variable_scope().reuse_variables()
grads = opt.compute_gradients(loss)
towerGrads.append(grads)
avgGrad_var_s = average_tower_grads(towerGrads)
loss = tf.Print(loss, data=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES))
apply_gradient_op = opt.apply_gradients(avgGrad_var_s, global_step=None)
    print('ALL variables:')
    for v in tf.global_variables():  # tf.all_variables() is deprecated
        print('\t%s' % v.op.name)
    return input_tensor, target_tensor, y, loss, apply_gradient_op
if __name__ == '__main__':
    sess = tf.Session()
    NUM_GPU = 1  # with only two GPUs, setting this to 3 fails: Could not satisfy explicit device specification '/device:GPU:2'
    dim_in = 600  # dimension of the input x
    dims = [512, 128, 128]  # hidden layer sizes; [] means the plain linear map y=xw+b, otherwise a multi-layer fully connected net
    batch_size = 2000
    input_tensor, target_tensor, y, loss, apply_gradient_op = generate_towers_v2(NUM_GPU=NUM_GPU, dim_in=dim_in, dims=dims)
    sess.run(tf.global_variables_initializer())  # tf.initialize_all_variables() is deprecated
    inputs = np.random.rand(batch_size, dim_in)
    targets = inputs * 2 + 1
    feed_dict = {input_tensor: inputs, target_tensor: targets}
    import time
    tstart = time.time()
    for i in range(10000):
        # _, l = sess.run([apply_gradient_op, loss], feed_dict=feed_dict)  # will print w, b
        # print(l)
        sess.run([apply_gradient_op], feed_dict=feed_dict)  # do not print w, b
    telapse = time.time() - tstart
    print('%d GPU(s): %.2fs elapsed' % (NUM_GPU, telapse))
Output:
affine0/w: /device:CPU:0
affine0/b: /device:CPU:0
affine1/w: /device:CPU:0
affine1/b: /device:CPU:0
affine2/w: /device:CPU:0
affine2/b: /device:CPU:0
affine_last/w: /device:CPU:0
affine_last/b: /device:CPU:0
towerGrads:
grads---tower_0
(<tf.Tensor 'tower_0/gradients/tower_0/MatMul_grad/tuple/control_dependency_1:0' shape=(600, 512) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x7f8b6c7144d0>)
tower_0/gradients/tower_0/MatMul_grad/tuple/control_dependency_1
affine0/w
(<tf.Tensor 'tower_0/gradients/tower_0/add_grad/tuple/control_dependency_1:0' shape=() dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x7f8b6c7140d0>)
tower_0/gradients/tower_0/add_grad/tuple/control_dependency_1
affine0/b
(<tf.Tensor 'tower_0/gradients/tower_0/MatMul_1_grad/tuple/control_dependency_1:0' shape=(512, 128) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x7f8b6c7146d0>)
tower_0/gradients/tower_0/MatMul_1_grad/tuple/control_dependency_1
affine1/w
(<tf.Tensor 'tower_0/gradients/tower_0/add_1_grad/tuple/control_dependency_1:0' shape=() dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x7f8b6c6cb850>)
tower_0/gradients/tower_0/add_1_grad/tuple/control_dependency_1
affine1/b
(<tf.Tensor 'tower_0/gradients/tower_0/MatMul_2_grad/tuple/control_dependency_1:0' shape=(128, 128) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x7f8b6c6cb750>)
tower_0/gradients/tower_0/MatMul_2_grad/tuple/control_dependency_1
affine2/w
(<tf.Tensor 'tower_0/gradients/tower_0/add_2_grad/tuple/control_dependency_1:0' shape=() dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x7f8b6c6f48d0>)
tower_0/gradients/tower_0/add_2_grad/tuple/control_dependency_1
affine2/b
(<tf.Tensor 'tower_0/gradients/tower_0/MatMul_3_grad/tuple/control_dependency_1:0' shape=(128, 1) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x7f8b6c6f47d0>)
tower_0/gradients/tower_0/MatMul_3_grad/tuple/control_dependency_1
affine_last/w
(<tf.Tensor 'tower_0/gradients/tower_0/add_3_grad/tuple/control_dependency_1:0' shape=() dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x7f8b6c69f950>)
tower_0/gradients/tower_0/add_3_grad/tuple/control_dependency_1
affine_last/b
ALL variables:
affine0/w
affine0/b
affine1/w
affine1/b
affine2/w
affine2/b
affine_last/w
affine_last/b
TensorFlow 2 usage
2.0 Distribution strategy: tf.distribute.MirroredStrategy
tf.distribute.MirroredStrategy is a simple, high-performance, synchronous data-parallel distribution strategy that mainly supports training on multiple GPUs of the same machine. To use it, simply instantiate a MirroredStrategy strategy:
strategy = tf.distribute.MirroredStrategy()
Tip: the devices can be specified explicitly via the constructor, e.g.:
strategy = tf.distribute.MirroredStrategy(devices=["/gpu:0", "/gpu:1"])
# i.e. only GPUs 0 and 1 take part in the distributed strategy.
Then place the model-construction code inside the strategy.scope() context:
with strategy.scope():
    # model-construction code
The following code uses the MirroredStrategy strategy to train MobileNetV2 with Keras on one of the image datasets in TensorFlow Datasets:
import tensorflow as tf
import tensorflow_datasets as tfds
num_epochs = 5
batch_size_per_replica = 64
learning_rate = 0.001
strategy = tf.distribute.MirroredStrategy()
print('Number of devices: %d' % strategy.num_replicas_in_sync)  # print the number of devices
batch_size = batch_size_per_replica * strategy.num_replicas_in_sync
# Load and preprocess the dataset
def resize(image, label):
    image = tf.image.resize(image, [224, 224]) / 255.0
    return image, label

# With as_supervised=True, the dataset yields (image, label) tuples
dataset = tfds.load("cats_vs_dogs", split=tfds.Split.TRAIN, as_supervised=True)
dataset = dataset.map(resize).shuffle(1024).batch(batch_size)
with strategy.scope():
    # Train from scratch with a 2-class head (cats vs. dogs); a bare
    # MobileNetV2() would build a 1000-class ImageNet model instead
    model = tf.keras.applications.MobileNetV2(weights=None, classes=2)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.sparse_categorical_crossentropy,
        metrics=[tf.keras.metrics.sparse_categorical_accuracy]
    )
model.fit(dataset, epochs=num_epochs)
MirroredStrategy proceeds as follows:
- Before training begins, the strategy replicates a complete copy of the model on each of the N compute devices;
- For every incoming batch, the data is split into N shards and dispatched to the N devices (i.e. data parallelism);
- Each of the N devices computes the gradients of its own data shard using its local (mirrored) variables;
- An all-reduce operation efficiently exchanges and sums the gradient data across devices, so that in the end every device holds the sum of all devices' gradients;
- Each device updates its local (mirrored) variables with the summed gradients;
- Once all devices have updated their local variables, the next training step starts (i.e. the strategy is synchronous).
By default, the MirroredStrategy strategy in TensorFlow uses NVIDIA NCCL for the all-reduce operation.
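If NCCL does not work well in a given environment, the all-reduce implementation can be swapped out through the `cross_device_ops` constructor argument; a sketch (which option is best depends on your hardware):

```python
# Fall back to a hierarchical copy all-reduce instead of the default NCCL
strategy = tf.distribute.MirroredStrategy(
    cross_device_ops=tf.distribute.HierarchicalCopyAllReduce())
```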
To understand the MirroredStrategy workflow more deeply, the following shows examples built in two styles, both considerably more involved:
2.1 Approach 1: mixing custom training loops and keras model
This approach actually mixes the custom training loops style with the keras model style; the purer options are to train using only custom training loops or only Keras.
import tensorflow as tf
from nets.single_posenet import singlePosenet
from configs.spn_config import params
from dataset.dataset import get_dataset
import os
import time
if __name__ == '__main__':
    # Control the verbosity of logs printed to the screen; suppress INFO-level logs
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
    # GPU ids visible to this process
    os.environ['CUDA_VISIBLE_DEVICES'] = '0, 1'
visible_gpus = tf.config.experimental.list_physical_devices('GPU')
print('Visible devices : ', visible_gpus)
gpu_ids = [0,1]
devices = ['/device:GPU:{}'.format(i) for i in gpu_ids]
print ('Used devices: ', devices)
gpu_nums = len(gpu_ids)
strategy = tf.distribute.MirroredStrategy(devices=devices)
print (strategy.num_replicas_in_sync)
with strategy.scope():
        if params['finetune'] is not None:
            model = tf.keras.models.load_model(params['finetune'])
            print('Successfully restored pretrained model from {}'.format(params['finetune']))
else:
inputs = tf.keras.Input(shape=(params['height'], params['width'], 3),name='modelInput')
outputs = singlePosenet(inputs, outc=params['num_joints'] + 1, training=True)
model = tf.keras.Model(inputs, outputs)
optimizer = tf.optimizers.Adam(learning_rate=3e-4)
with strategy.scope():
dataset = get_dataset(gpu_nums)
dist_dataset = strategy.experimental_distribute_dataset(dataset)
print(dist_dataset.__dict__['_cloned_datasets'])
epochs = 200
global_batch_size = params['batch_size'] * gpu_nums
def spn_loss(label, preds, mask):
return tf.reduce_sum(tf.nn.l2_loss(label - preds))
def train():
def step_fn(inputs):
img, label4, mask4, label2, mask2 = inputs
with tf.GradientTape() as tape:
preds4, preds2 = model(img)
loss4 = 3 * spn_loss(label4, preds4, mask4)
loss2 = spn_loss(label2, preds2, mask2)
l2_loss = loss4 + loss2
                # Must divide by the GLOBAL batch size so the loss of one step is
                # computed correctly across replicas (see the note after this code block)
                loss = l2_loss * (1.0 / global_batch_size)
grads = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(list(zip(grads, model.trainable_variables)))
return loss, loss2, loss4
def distributed_train_epoch(ds):
total_loss = 0.0
num_train_batches = 0.0
for one_batch in ds:
per_example_loss, loss2, loss4 = strategy.experimental_run_v2(step_fn, args=(one_batch,))
num_train_batches += 1
total_loss += strategy.reduce(tf.distribute.ReduceOp.SUM, per_example_loss, axis=None)
return total_loss, num_train_batches
distributed_train_epoch = tf.function(distributed_train_epoch)
s = time.time()
train_total_loss, num_train_batches = distributed_train_epoch(dist_dataset)
e = time.time()
template = ('Epoch: {}, Train Loss: {}, Cost Time: {}')
print(template.format(epoch, train_total_loss, e-s))
for epoch in range(epochs):
with strategy.scope():
train()
model.save('./models/SpnModel_%s.h5' % (epoch + 1))
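A side note on the global-batch-size scaling above: TF 2.x also exposes `tf.nn.compute_average_loss` for exactly this per-replica scaling. A sketch of a `step_fn` using it, with a hypothetical per-example classification loss (the `model`, `optimizer`, and `global_batch_size` names are as in the code above):

```python
def step_fn(inputs):
    img, labels = inputs  # hypothetical (image, label) pairs for illustration
    with tf.GradientTape() as tape:
        preds = model(img)
        # shape [per_replica_batch_size]: one loss value per example
        per_example_loss = tf.keras.losses.sparse_categorical_crossentropy(labels, preds)
        # sums the per-example losses and divides by the GLOBAL batch size,
        # equivalent to the manual `l2_loss * (1.0 / global_batch_size)` above
        loss = tf.nn.compute_average_loss(per_example_loss,
                                          global_batch_size=global_batch_size)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss
```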
Some issues:
- Why add the line os.environ['CUDA_VISIBLE_DEVICES'] = '0, 1'? According to the official TF tutorial it should be unnecessary: the program should find the GPUs you want to use automatically. In practice, on a server with 7 GPUs, without this line the program only finds the first card (gpu:0) and none of the others. Stranger still, with os.environ['CUDA_VISIBLE_DEVICES'] = '5, 6' the program appears to pick GPUs with id 0 and 1 rather than cards 5 and 6. The reason turned out to be: once CUDA_VISIBLE_DEVICES = '5, 6' is set, the process sees cards 5 and 6 re-indexed internally as cards 0 and 1, so the devices argument must use the re-mapped ids, i.e. devices=['gpu:0', 'gpu:1'] rather than devices=['gpu:5', 'gpu:6'] (see the sketch after this list). As a caveat, making all cards visible via os.environ['CUDA_VISIBLE_DEVICES'] = '0, 1, 2, 3, 4, 5, 6' and then specifying devices=['gpu:5', 'gpu:6'] does run on those two cards, but each of gpu 0-4 still gets some GPU memory allocated (147 MB, to be exact), which is rather odd.
- With distributed training, every tensor in your dataset must have a fully defined shape. The cleanest approach is to call tensor.set_shape() on every tensor returned from the mapping function used in dataset.map() (see the sketch after this list); otherwise you get the error: ValueError: Cannot take the length of shape with unknown rank.
- When saving the model via keras model.save(), every op in the saved model must have been created through tf.keras. For example, you cannot use a function like tf.nn.sigmoid(); use tf.keras.activations.sigmoid() instead, otherwise model.save() fails with an error like: Cannot export Keras model TypeError: ('Not JSON Serializable:', b'\n...'). In addition, model.save() must not be placed inside strategy.scope(), whereas restore must be placed inside strategy.scope().
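A sketch illustrating the first two pitfalls (the shapes and the parse function are hypothetical):

```python
import os
import tensorflow as tf

# GPUs 5 and 6 are re-indexed as 0 and 1 inside this process,
# so MirroredStrategy must be given the re-mapped ids:
os.environ['CUDA_VISIBLE_DEVICES'] = '5, 6'
strategy = tf.distribute.MirroredStrategy(devices=['/gpu:0', '/gpu:1'])

def parse_fn(img, label):
    # Pin down fully defined static shapes so the strategy can split batches;
    # without this: ValueError: Cannot take the length of shape with unknown rank.
    img.set_shape([256, 256, 3])
    label.set_shape([17])
    return img, label

# dataset = dataset.map(parse_fn).batch(batch_size)
```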
2.2 Approach 2: pure custom training loop
import tensorflow as tf
from nets.single_posenet import singlePosenet
from configs.spn_config import params
from dataset.dataset import get_dataset
import os
import numpy as np
if __name__ == '__main__':
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
visible_gpus = tf.config.experimental.list_physical_devices('GPU')
print('Visible devices : ', visible_gpus)
gpu_ids = [0]
devices = ['/device:GPU:{}'.format(i) for i in gpu_ids]
strategy = tf.distribute.MirroredStrategy(devices=devices)
global_batch_size = params['batch_size'] * len(gpu_ids)
dataset = get_dataset(len(gpu_ids))
dist_dataset = strategy.experimental_distribute_dataset(dataset)
checkpoint_dir = './models'
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')
def compute_loss(label, predict, mask):
return tf.reduce_sum(tf.nn.l2_loss(label-predict))
with strategy.scope():
inputs = tf.keras.Input(shape=(params['height'], params['width'], 3), name='modelInput')
outputs = singlePosenet(inputs, outc=params['num_joints'] + 1, training=True)
model = tf.keras.Model(inputs, outputs)
optimizer = tf.optimizers.Adam(learning_rate=3e-4)
checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
        if params['finetune'] is not None:
            checkpoint.restore(params['finetune'])
            print('Successfully restored.')
with strategy.scope():
def train_step(inputs):
img, label4, mask4, label2, mask2 = inputs
with tf.GradientTape() as tape:
preds4, preds2 = model(img)
loss4 = 3 * compute_loss(label4, preds4, mask4)
loss2 = compute_loss(label2, preds2, mask2)
l2_loss = loss4 + loss2
loss = l2_loss * (1.0 / global_batch_size)
grads = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(list(zip(grads, model.trainable_variables)))
return loss, loss2, loss4
with strategy.scope():
@tf.function
def distributed_train_step(dataset_inputs):
per_replica_losses, loss2, loss4 = strategy.experimental_run_v2(train_step, args=(dataset_inputs, ))
return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)
for epoch in range(5):
total_loss = 0.0
num_batches = 0.0
for x in dist_dataset:
total_loss += distributed_train_step(x)
num_batches += 1
train_loss = total_loss / num_batches
template = ('Epoch: {}, Train Loss: {}')
print(template.format(epoch, train_loss))
checkpoint.save(checkpoint_prefix)
Some issues:
- Creating the ckpt and the loss-computation function does not have to happen inside the strategy scope, though doing it inside the scope is also fine.
- The model and the optimizer must be created inside the strategy scope; the ckpt save needs to be inside the scope as well (whether it also works outside has not been verified). See the skeleton after this list.
- The remaining caveats are the same as the issues listed for approach 1.
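Summarized as a skeleton (a sketch; `make_model` is a hypothetical model builder, and whether save works outside the scope is unverified, as noted above):

```python
strategy = tf.distribute.MirroredStrategy()

def compute_loss(label, pred):  # loss function: fine outside the scope
    return tf.reduce_sum(tf.nn.l2_loss(label - pred))

with strategy.scope():
    model = make_model()                  # model: MUST be created in the scope
    optimizer = tf.optimizers.Adam(3e-4)  # optimizer: MUST be created in the scope
    checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)  # either way works

# ... distributed training loop as in the code above ...

with strategy.scope():
    checkpoint.save('./models/ckpt')      # save inside the scope
```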
The benefit of using a custom training loop is that, with only a few simple line changes, a previous single-machine single-GPU training procedure can be turned into single-machine multi-GPU distributed training, and the way the model is stored does not need to change either (provided it was already saved in the TF ckpt format rather than the Keras h5 format or the TF serving format).
Doing distributed training in the fully Keras style should be fairly easy; just pay attention to how the dataset and the loss are constructed. For details, consult the official documentation; the fully Keras style is not covered here.
References
- https://www.iteye.com/blog/cherishlc-2348962
- https://www.bookstack.cn/read/TensorFlow2.0/spilt.1.854b1d92d4364113.md