Style Transfer
Style transfer was first proposed in the paper *A Neural Algorithm of Artistic Style*. The authors start from a hypothesis: the content and style of an image are separable. In a converged deep neural network such as VGG19 or Inception, (1) the content information of an image is mainly preserved in the feature responses of individual layers, and (2) the style of an image is jointly represented by the feature responses across multiple layers. The authors support this hypothesis with a simple image-reconstruction experiment. Content reconstruction: images reconstructed from the output matrices of shallow layers reproduce the original almost perfectly, while images reconstructed from deep layers preserve only the overall layout and lose the fine detail. Style reconstruction: when an image is reconstructed from the outputs of several layers (from shallow to deep), the more scales that are included, the more closely the reconstruction matches the style of the original, but the global arrangement of the scene is lost. A minimal sketch of the content-reconstruction experiment follows.
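The content-reconstruction experiment can be reproduced with a few lines of optimization. Below is a minimal sketch under stated assumptions: the layer `block4_conv2`, the step count, and the learning rate are illustrative choices, and `weights='imagenet'` is used only so the snippet is self-contained.

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.applications.vgg19 import VGG19, preprocess_input

# Reconstruct an image from its feature response at a single VGG19 layer.
vgg = VGG19(include_top=False, weights='imagenet')
vgg.trainable = False
feat_model = Model(vgg.input, vgg.get_layer('block4_conv2').output)  # illustrative layer

def reconstruct(target_image, steps=200):
    """target_image: float32 tensor in [0, 1] with shape (1, H, W, 3)."""
    target_feat = feat_model(preprocess_input(target_image * 255.))
    x = tf.Variable(tf.random.uniform(tf.shape(target_image)))  # start from noise
    opt = tf.optimizers.Adam(learning_rate=0.02)
    for _ in range(steps):
        with tf.GradientTape() as tape:
            feat = feat_model(preprocess_input(x * 255.))
            loss = tf.reduce_mean((feat - target_feat) ** 2)
        grad = tape.gradient(loss, x)
        opt.apply_gradients([(grad, x)])
        x.assign(tf.clip_by_value(x, 0.0, 1.0))
    return x

Running this against a shallow layer (e.g. `block1_conv1`) gives a near-perfect reconstruction; against a deep layer, the layout survives but the detail does not.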
Based on this hypothesis and these experiments, the authors propose the following image-synthesis method:
- First, take a pre-trained image-recognition model, e.g. VGG19, using only the feature representations (matrices) produced by its convolution, pooling, and activation layers.
- Feed a noise image $\vec{x}$ through the model to obtain its feature responses; the number of feature maps (matrices) in layer $l$ is denoted $N_l$, and the size of each feature map is denoted $M_l$ (height $\times$ width).
- The feature representation of layer $l$ is written as a matrix $F^l \in \mathbb{R}^{N_l \times M_l}$, where $F^l_{ij}$ is the activation of the $i$-th filter at position $j$ of layer $l$.
- $\vec{p}$ denotes the target content image, and $P^l$ denotes its feature response at layer $l$.
- Gradient descent is then used to find an image whose feature responses match those of the target image, by minimizing the distance (error) between the feature responses of the input image and the target image.
- Content loss: $\mathcal{L}_{content}(\vec{p}, \vec{x}, l) = \frac{1}{2}\sum_{i,j}\left(F^l_{ij} - P^l_{ij}\right)^2$
- Style representation: style is quantified by the correlations between the feature responses within a layer, expressed as a Gram matrix $G^l \in \mathbb{R}^{N_l \times N_l}$.
- The Gram matrix of the feature responses at layer $l$ is the inner product between pairs of vectorized feature maps: $G^l_{ij} = \sum_k F^l_{ik} F^l_{jk}$
- As above, gradient descent finds an image whose style matches the target style image $\vec{a}$, by minimizing the distance between the Gram matrices of the input image and the generated image.
- Style loss: $E_l = \frac{1}{4 N_l^2 M_l^2}\sum_{i,j}\left(G^l_{ij} - A^l_{ij}\right)^2$ and $\mathcal{L}_{style}(\vec{a}, \vec{x}) = \sum_l w_l E_l$, where $A^l$ is the Gram matrix of the style image at layer $l$.
- Mixing content and style: minimize the content loss and the style loss simultaneously, $\mathcal{L}_{total}(\vec{p}, \vec{a}, \vec{x}) = \alpha\,\mathcal{L}_{content}(\vec{p}, \vec{x}) + \beta\,\mathcal{L}_{style}(\vec{a}, \vec{x})$
- $\alpha$ and $\beta$ are the weights of the content and style terms; a sketch of these quantities in code follows this list.
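As a concrete illustration of the formulas above, the sketch below computes a Gram matrix and the two losses for small random tensors (the shapes are arbitrary and only for the example):

import tensorflow as tf

# Toy feature maps: batch=1, 4x4 spatial grid, 3 channels (N_l = 3, M_l = 16).
F = tf.random.uniform((1, 4, 4, 3))  # features of the generated image
P = tf.random.uniform((1, 4, 4, 3))  # features of the content target

# Content loss: mean squared error between the feature maps
# (equal to the paper's 1/2 sum of squares up to a constant factor).
content_loss = tf.reduce_mean((F - P) ** 2)

def gram(x):
    # G_cd = sum over spatial positions k of F_ck * F_dk, normalized by M_l.
    g = tf.linalg.einsum('bijc,bijd->bcd', x, x)
    m = tf.cast(tf.reduce_prod(tf.shape(x)[1:3]), tf.float32)
    return g / m

G = gram(F)  # style of the generated image, shape (1, 3, 3)
A = gram(P)  # style target, shape (1, 3, 3)
style_loss = tf.reduce_mean((G - A) ** 2)

This is the same normalization the `StyleTransferer` class below uses; it differs from the paper's $\frac{1}{4 N_l^2 M_l^2}$ constant only by a fixed scale, which is absorbed into the weights $\alpha$ and $\beta$.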
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm_notebook
from tensorflow.keras import Model
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.vgg19 import VGG19, preprocess_input
Style transfer is implemented by the `StyleTransferer` class below:

- `_gram_matrix`: quantifies the style of an image (Gram matrix)
- `_calc_outputs`: computes the content and style representations of an image
- `_compute_loss`: MSE loss
- `_compute_total_loss`: weighted sum of the content and style losses
- `_train_one_step`: computes the total loss and its gradient, and updates the image
- `_train`: iterates to minimize the loss, collects the losses, and returns the resulting image
- `_post_processing`: post-processes the image
class StyleTransferer:
    def __init__(self, content_layers, style_layers, base_model, optimizer, style_weight=1e-2, content_weight=1e4):
        self.content_layers = content_layers
        self.style_layers = style_layers
        self.model = base_model
        self.model.trainable = False  # the backbone is a fixed feature extractor
        self.content_weight = content_weight
        self.style_weight = style_weight
        # One model that returns the content-layer and style-layer activations together.
        outputs = [self.model.get_layer(name).output for name in (self.content_layers + self.style_layers)]
        self.transfer_model = Model([self.model.input], outputs)
        self.optimizer = optimizer

    def _gram_matrix(self, input_tensor):
        # G[b, c, d] = sum over spatial positions of F[..., c] * F[..., d],
        # normalized by the number of positions M_l.
        result = tf.linalg.einsum('bijc,bijd->bcd', input_tensor, input_tensor)
        input_shape = tf.shape(input_tensor)
        num_locations = tf.reduce_prod(input_shape[1:3])
        num_locations = tf.cast(num_locations, tf.float32)
        result = result / num_locations
        return result

    def _calc_outputs(self, inputs):
        # Expects images in [0, 1]; VGG19 preprocessing works on [0, 255].
        inputs = inputs * 255.
        preprocessed_input = preprocess_input(inputs)
        outputs = self.transfer_model(preprocessed_input)
        content_outputs = outputs[:len(self.content_layers)]
        style_outputs = outputs[len(self.content_layers):]
        # Style is represented by the Gram matrices of the style-layer activations.
        style_outputs = [self._gram_matrix(style_output) for style_output in style_outputs]
        content_dict = {content_name: value for content_name, value in zip(self.content_layers, content_outputs)}
        style_dict = {style_name: value for style_name, value in zip(self.style_layers, style_outputs)}
        return {'content': content_dict, 'style': style_dict}

    def _clip_0_1(self, image):
        return tf.clip_by_value(image, clip_value_min=0.0, clip_value_max=1.0)

    def _compute_loss(self, outputs, targets):
        # Sum of per-layer mean squared errors.
        return tf.add_n([tf.reduce_mean((outputs[key] - targets[key]) ** 2) for key in outputs.keys()])

    def _compute_total_loss(self, outputs, content_targets, style_targets):
        style_outputs = outputs['style']
        content_outputs = outputs['content']
        n_style_layers = len(self.style_layers)
        style_loss = self._compute_loss(style_outputs, style_targets)
        style_loss *= self.style_weight / n_style_layers
        n_content_layers = len(self.content_layers)
        content_loss = self._compute_loss(content_outputs, content_targets)
        content_loss *= self.content_weight / n_content_layers
        return style_loss + content_loss

    @tf.function()
    def _train_one_step(self, image, content_targets, style_targets):
        # One gradient-descent step on the image pixels (not the network weights).
        with tf.GradientTape() as tape:
            outputs = self._calc_outputs(image)
            loss = self._compute_total_loss(outputs, content_targets, style_targets)
        gradient = tape.gradient(loss, image)
        self.optimizer.apply_gradients([(gradient, image)])
        image.assign(self._clip_0_1(image))
        return image, loss

    def _train(self, image, content_targets, style_targets, run_steps):
        loss_history = []
        for _ in tqdm_notebook(range(run_steps)):
            image, loss = self._train_one_step(image, content_targets, style_targets)
            loss_history.append(loss)
        return image, loss_history

    def _post_processing(self, image):
        # Convert the [0, 1] float image back to a uint8 array and drop the batch axis.
        image = image * 255
        image = np.array(image, dtype=np.uint8)
        if np.ndim(image) > 3:
            image = image[0]
        return image

    def transfer(self, content_image, style_image, run_steps=100):
        # Targets: content features of the content image, Gram matrices of the style image.
        content_targets = self._calc_outputs(content_image)['content']
        style_targets = self._calc_outputs(style_image)['style']
        # Start the optimization from the content image itself.
        image = tf.Variable(content_image)
        image, loss_history = self._train(image, content_targets, style_targets, run_steps)
        image = self._post_processing(image)
        return image, loss_history
Load VGG19
vgg19 = VGG19(weights="~/keras_weights/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5", include_top=False)
vgg19.summary()
Model: "vgg19"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_1 (InputLayer) [(None, None, None, 3)] 0
_________________________________________________________________
block1_conv1 (Conv2D) (None, None, None, 64) 1792
_________________________________________________________________
block1_conv2 (Conv2D) (None, None, None, 64) 36928
_________________________________________________________________
block1_pool (MaxPooling2D) (None, None, None, 64) 0
_________________________________________________________________
block2_conv1 (Conv2D) (None, None, None, 128) 73856
_________________________________________________________________
block2_conv2 (Conv2D) (None, None, None, 128) 147584
_________________________________________________________________
block2_pool (MaxPooling2D) (None, None, None, 128) 0
_________________________________________________________________
block3_conv1 (Conv2D) (None, None, None, 256) 295168
_________________________________________________________________
block3_conv2 (Conv2D) (None, None, None, 256) 590080
_________________________________________________________________
block3_conv3 (Conv2D) (None, None, None, 256) 590080
_________________________________________________________________
block3_conv4 (Conv2D) (None, None, None, 256) 590080
_________________________________________________________________
block3_pool (MaxPooling2D) (None, None, None, 256) 0
_________________________________________________________________
block4_conv1 (Conv2D) (None, None, None, 512) 1180160
_________________________________________________________________
block4_conv2 (Conv2D) (None, None, None, 512) 2359808
_________________________________________________________________
block4_conv3 (Conv2D) (None, None, None, 512) 2359808
_________________________________________________________________
block4_conv4 (Conv2D) (None, None, None, 512) 2359808
_________________________________________________________________
block4_pool (MaxPooling2D) (None, None, None, 512) 0
_________________________________________________________________
block5_conv1 (Conv2D) (None, None, None, 512) 2359808
_________________________________________________________________
block5_conv2 (Conv2D) (None, None, None, 512) 2359808
_________________________________________________________________
block5_conv3 (Conv2D) (None, None, None, 512) 2359808
_________________________________________________________________
block5_conv4 (Conv2D) (None, None, None, 512) 2359808
_________________________________________________________________
block5_pool (MaxPooling2D) (None, None, None, 512) 0
=================================================================
Total params: 20,024,384
Trainable params: 20,024,384
Non-trainable params: 0
_________________________________________________________________
Test case

- content image: a still from *The Office*
- style image: Van Gogh's *The Starry Night*
Synthesizing the image

Run in eager mode
tf.config.experimental_run_functions_eagerly(True)
transferer = StyleTransferer(content_layers=['block5_conv2'],
style_layers=['block1_conv1', 'block2_conv1', 'block3_conv1', 'block4_conv1', 'block5_conv1'],
base_model=vgg19,
optimizer= tf.optimizers.Adam(learning_rate=2e-2, beta_1=0.99, epsilon=0.1))
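Before running the full transfer, `_gram_matrix` can be sanity-checked on a random tensor (the shape below is arbitrary): the result should be one symmetric channels × channels matrix per batch element.

x = tf.random.uniform((1, 32, 32, 64))
g = transferer._gram_matrix(x)
print(g.shape)  # (1, 64, 64)
# Symmetric by construction, up to floating-point rounding.
print(bool(tf.reduce_all(tf.abs(g - tf.transpose(g, perm=[0, 2, 1])) < 1e-5)))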
Load and resize the images
def load_image(image_path):
    # Resize so that the longest side is `dimension` pixels, keeping the aspect ratio.
    dimension = 540
    image = tf.io.read_file(image_path)
    # decode_image handles both PNG and JPEG inputs.
    image = tf.image.decode_image(image, channels=3, expand_animations=False)
    image = tf.image.convert_image_dtype(image, tf.float32)  # scale to [0, 1]
    shape = tf.cast(tf.shape(image)[:-1], tf.float32)
    longest_dimension = tf.reduce_max(shape)
    scale = dimension / longest_dimension
    new_shape = tf.cast(shape * scale, tf.int32)
    image = tf.image.resize(image, new_shape)
    return image[tf.newaxis, :]  # add a batch axis
content_img = load_image('./content_image.png')
style_img = load_image('./style_image.jpg')
new_image, loss_history = transferer.transfer(content_img, style_img, run_steps=500)
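To inspect the result and the optimization curve, here is a short plotting snippet (it reuses the matplotlib import from the top; the layout is just one way to display it):

plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.imshow(new_image)  # post-processed uint8 image
plt.axis('off')
plt.title('stylized image')
plt.subplot(1, 2, 2)
plt.plot([float(l) for l in loss_history])  # total loss per step
plt.xlabel('step')
plt.ylabel('total loss')
plt.tight_layout()
plt.show()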
Synthesized video

[embedded video: video_office]
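The embedded video shows the image evolving over the optimization. One way to produce such a video, sketched below, is to run the training in short segments and save a frame after each one; `imageio` and the segment loop are assumptions, not part of the original notebook (writing MP4 requires the imageio-ffmpeg backend; a `.gif` target works with plain imageio).

import imageio

content_targets = transferer._calc_outputs(content_img)['content']
style_targets = transferer._calc_outputs(style_img)['style']
image = tf.Variable(content_img)
frames = []
for _ in range(50):  # 50 segments x 10 steps = 500 steps total
    image, _ = transferer._train(image, content_targets, style_targets, run_steps=10)
    frames.append(transferer._post_processing(image))
imageio.mimsave('style_transfer.mp4', frames, fps=10)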