Rumor has it that the following tricks will make multiple GPUs train together? Not a chance...

# In code
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'  # Train on GPUs 0,1,2,3 together? NO!
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'       # Use the CPU only

# On the command line
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py  # NO!

These settings only control which devices are *visible* to the process; TensorFlow 1.x still places the whole graph on a single GPU unless you distribute the computation yourself. The right way: see tensorflow-models/tutorials/image/cifar10/cifar10_multi_gpu_train.py.
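A quick sanity check, as a minimal sketch (`device_lib` ships with TF 1.x): the environment variable only changes which devices are visible, not where ops are placed.

from tensorflow.python.client import device_lib

# With CUDA_VISIBLE_DEVICES='0,1,2,3' this prints four /device:GPU:N entries,
# yet ops still land on /gpu:0 unless they are assigned elsewhere via tf.device().
print([d.name for d in device_lib.list_local_devices()])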

create_tfrecord.py

import tensorflow as tf
import numpy as np
from tensorflow.examples.tutorials.mnist import input_data

def int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

mnist = input_data.read_data_sets('./MNIST_data', dtype=tf.uint8, one_hot=True)
images = mnist.train.images
labels = mnist.train.labels
size = images.shape[1]
num_examples = mnist.train.num_examples

# Path of the output TFRecord file
filename = './output.tfrecord'

# Create a writer for the TFRecord file
writer = tf.python_io.TFRecordWriter(filename)

for i in range(num_examples):
    # Serialize the image array to a byte string
    image_raw = images[i].tobytes()
    # Wrap one sample in an Example protocol buffer that holds all of its fields
    example = tf.train.Example(features = tf.train.Features(feature={
        'size': int64_feature(size),
        'label': int64_feature(np.argmax(labels[i])),
        'image_raw': bytes_feature(image_raw)
    }))
    # Write the serialized Example to the TFRecord file
    writer.write(example.SerializeToString())
writer.close()
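To double-check what was written, here is a minimal read-back sketch (assuming the ./output.tfrecord path above); tf.python_io.tf_record_iterator is the TF 1.x record reader:

import tensorflow as tf

# Read the first serialized record back and inspect its fields.
record = next(tf.python_io.tf_record_iterator('./output.tfrecord'))
example = tf.train.Example.FromString(record)
print(example.features.feature['size'].int64_list.value)   # [784]
print(example.features.feature['label'].int64_list.value)  # the digit class, e.g. [7]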

multi_gpu_train.py

import tensorflow as tf
import time
from datetime import datetime

INPUT_NODE = 28*28
NODE1 = 500
OUTPUT_NODE = 10
BASE_LEARNING_RATE = 0.001
DECAY_RATE = 0.99  # learning-rate decay rate
BATCH_SIZE = 32
MOVING_AVG_DECAY = 0.99
TRAIN_STEPS = 30000
REGULAR_RATIO = 0.0001
N_GPU = 1  # number of GPU towers to build

def get_weight_variables(shape, regularizer):
    # Create (or reuse) a weight variable and add its L2 penalty to the LOSSES
    # collection; the penalty op is created inside the current tower's name scope.
    w = tf.get_variable('w', shape, dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.1))
    tf.add_to_collection(tf.GraphKeys.LOSSES, regularizer(w))
    return w

def inference(input_tensor, regularizer):
    # A two-layer fully connected network: 784 -> 500 -> 10
    with tf.variable_scope('layer1'):
        weight = get_weight_variables([INPUT_NODE, NODE1], regularizer)
        bias = tf.get_variable('bias', [NODE1], dtype=tf.float32, initializer=tf.constant_initializer(0.0))
        layer1 = tf.nn.relu(tf.matmul(input_tensor, weight) + bias)
    with tf.variable_scope('layer2'):
        weight = get_weight_variables([NODE1, OUTPUT_NODE], regularizer)
        bias = tf.get_variable('bias', [OUTPUT_NODE], dtype=tf.float32, initializer=tf.constant_initializer(0.0))
        layer2 = tf.matmul(layer1, weight) + bias
    return layer2

def _parse_image_function(example_proto):
    # Create a dictionary describing the features.
    image_feature_description = {
        'size': tf.FixedLenFeature([], tf.int64),
        'label': tf.FixedLenFeature([], tf.int64),
        'image_raw': tf.FixedLenFeature([], tf.string),
        }
    # Parse the input tf.Example proto using the dictionary above.
    feat_dict = tf.parse_single_example(example_proto, image_feature_description)  # returns a dict of tensors
    decoded_img = tf.decode_raw(feat_dict['image_raw'], tf.uint8)
    reshaped_img = tf.cast(tf.reshape(decoded_img, [784]), tf.float32)
    label = tf.cast(feat_dict['label'],tf.int32)
    return reshaped_img, label

def get_input():
    # Read the TFRecord file(s)
    input_files = ['output.tfrecord']  # more than one file is allowed
    dataset = tf.data.TFRecordDataset(input_files).map(_parse_image_function)
    dataset = dataset.shuffle(buffer_size=10000).repeat(100).batch(BATCH_SIZE)
    iterator = dataset.make_one_shot_iterator()
    img, label = iterator.get_next()
    return img, label
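
# Note: iterator.get_next() is called only once, so every GPU tower built below
# reads the same batch tensors, i.e. all GPUs see the same batch each step. For
# each tower to train on its own batch, call get_next() once per tower (or split
# a larger batch across the towers).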

# Loss function: given the training data, the regularizer, and a name scope, compute
# the total loss inside that scope. The scope is needed because the regularization
# losses computed on every GPU all land in the same LOSSES collection; without
# filtering by scope, each GPU would also pick up the regularization losses created
# by the other GPUs (so every tower's regularization loss would be the sum over all towers).
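# For example, with scope 'GPU_0/' the call
#   tf.get_collection(tf.GraphKeys.LOSSES, 'GPU_0/')
# returns only the regularization terms created while building tower 0.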
def get_loss(x, y_gt, regularizer, scope, reuse_variable=None):
    with tf.variable_scope(tf.get_variable_scope(), reuse=reuse_variable):
        y_pred = inference(x, regularizer)
    # Cross-entropy loss
    cross_entropy = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_gt, logits=y_pred))
    # Regularization loss computed on this GPU only (this is where the scope matters!)
    regularization_loss = tf.add_n(tf.get_collection(tf.GraphKeys.LOSSES, scope))
    # Total loss
    loss = cross_entropy + regularization_loss
    return loss

# Average every variable's gradient across the GPUs; the result is used to update the variables.
# tower_gradients has the form [gradients from GPU 0, gradients from GPU 1, ..., gradients from GPU N-1].
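# For example, with two towers and variables w and b:
#   tower_gradients = [[(grad_w_gpu0, w), (grad_b_gpu0, b)],
#                      [(grad_w_gpu1, w), (grad_b_gpu1, b)]]
# so zip(*tower_gradients) below yields one variable's per-tower gradients at a
# time: ((grad_w_gpu0, w), (grad_w_gpu1, w)), then ((grad_b_gpu0, b), (grad_b_gpu1, b)).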
def average_gradients(tower_gradients):
    average_grads = []
    # Iterate over the variables; grad_and_vars holds one variable's gradient from every GPU
    for grad_and_vars in zip(*tower_gradients):
        # Average this variable's gradient over all GPUs
        grads = []
        for g, _ in grad_and_vars:
            expanded_g = tf.expand_dims(g, 0)
            grads.append(expanded_g)
        grad = tf.concat(grads, 0)
        grad = tf.reduce_mean(grad, 0)

        v = grad_and_vars[0][1]
        grad_and_var = (grad, v)
        # Pair the variable with its averaged gradient
        average_grads.append(grad_and_var)
    # Return the averaged gradient of every variable, to be used for the update
    return average_grads

def train():
    # Keep simple bookkeeping ops on the CPU; only the network's training computation runs on the GPUs
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        x, y_gt = get_input()
    
        regularizer = tf.contrib.layers.l2_regularizer(REGULAR_RATIO)
    
        global_step = tf.Variable(0,trainable=False)
        learning_rate = tf.train.exponential_decay(BASE_LEARNING_RATE, global_step, 55000/BATCH_SIZE, DECAY_RATE)  # decay roughly once per epoch (55000 training images)
    
        opt = tf.train.GradientDescentOptimizer(learning_rate)
    
        tower_grads = []
        reuse_variables = False
        
        # Run the optimization on every GPU
        for i in range(N_GPU):
            # Pin this tower's computation to one GPU
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('GPU_%d' % i) as scope:
                    cur_loss = get_loss(x, y_gt, regularizer, scope, reuse_variables)
                    # After the first tower has created the variables, turn reuse on so
                    # that inference(x) in the remaining towers uses the same parameters
                    reuse_variables = True
                    grads = opt.compute_gradients(cur_loss)  # list of (gradient, variable) pairs
                    tower_grads.append(grads)
    
        # Average the gradients across the towers
        grads = average_gradients(tower_grads)
        for grad, var in grads:
            if grad is not None:
                tf.summary.histogram('gradients_on_average/%s' % var.op.name, grad)
    
        # Apply the averaged gradients to update the parameters
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)
    
        # Maintain an exponential moving average of the trainable variables
        ema = tf.train.ExponentialMovingAverage(MOVING_AVG_DECAY, global_step)
        variables_to_average = (tf.trainable_variables() + tf.moving_average_variables())
        avg_op = ema.apply(variables_to_average)
    
        # Group the gradient update and the moving-average update into one training op
        with tf.control_dependencies([apply_gradient_op, avg_op]):
            train_op = tf.no_op('train')
    
        saver = tf.train.Saver()
        summary_op = tf.summary.merge_all()
        writer = tf.summary.FileWriter('./log',graph=tf.get_default_graph())
        with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)) as sess:
            sess.run(tf.global_variables_initializer())
            for i in range(TRAIN_STEPS):
                # Run one training step and time it
                start_time = time.time()
                step, loss_,  _ = sess.run([global_step, cur_loss, train_op])
                duration = time.time()-start_time
    
                # Every so often, print the training progress and measure the training speed
                if step != 0 and step % 1000 == 0:
                    # Number of training examples consumed in this step; with one batch
                    # per GPU per step that is batch * N_GPU (but see the note after
                    # get_input about the towers sharing a batch)
                    num_examples_per_step = BATCH_SIZE * N_GPU
                    # num_examples_per_step examples took `duration` seconds,
                    # so the throughput in examples per second is:
                    examples_per_sec = num_examples_per_step / duration
                    # duration is the wall time of this step; since every GPU processes
                    # one batch per step, the time spent on a single batch is:
                    sec_per_batch = duration / N_GPU

                    # Print the training stats
                    format_str = ('%s: step %d, loss=%.2f (%.1f examples/sec; %.3f sec/batch)')
                    print(format_str % (datetime.now(), step, loss_, examples_per_sec, sec_per_batch))
    
                    # Record summaries for TensorBoard
                    summary = sess.run(summary_op)
                    writer.add_summary(summary, step)
    
                if i % 1000 == 0 or (step+1) == TRAIN_STEPS:
                    print('step: %d, loss: %f' % (step, loss_))
                    saver.save(sess,'./model/model.ckpt')
        writer.close()

def main():
    train()

if __name__ == '__main__':
    main()
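
Putting the pieces together: CUDA_VISIBLE_DEVICES decides which physical GPUs the towers can map onto, and N_GPU decides how many towers get built. A minimal launch sketch under those assumptions (overriding the module-level N_GPU here is just for illustration):

import os
# Must be set before TensorFlow initializes CUDA; the two GPUs then appear
# inside the process as /gpu:0 and /gpu:1.
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

import multi_gpu_train
multi_gpu_train.N_GPU = 2  # one tower per visible GPU
multi_gpu_train.main()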