tensorflow 2.x的模型训练结束后一般保存为.h5或save_model的模型格式(只能是fp32格式),如果模型想要部署在移动端,一般需要将模型转换为.tflite格式,这里又分为动态格式、fp32格式、fp16格式和int8格式的tflite【1】。一般而言,如果直接将fp32的h5转换为int8格式的tflite会存在精度下降,为了尽量减少转换过程中的精度下降,可选的方式是在转换前对模型进行量化感知训练。具体办法是在权重保持fp32的情况下,按照int8量化的需求进行训练,之后再正式进行int8量化,这样可以减少int8的量化掉点。具体int8的量化原理这里不讲,下面是量化感知代码,分别比较了量化感知下fp32的h5、量化感知下fp32的tflite、量化感知下int8的tflite、非量化感知下fp32的h5、非量化感知下fp32的tflite、非量化感知下int8的tflite这六种情况下的精度。
量化感知训练
# Convert a Keras .h5 model to .tflite (fp32 or int8); details (Chinese): https://zhuanlan.zhihu.com/p/165670135
import os
import time
import random
import numpy as np
from tqdm import tqdm
import tensorflow as tf
import tensorflow_model_optimization as tfmot
from tensorflow.keras.optimizers import Adam
############################################################################################################
## 0. Parameters ###########################################################################################
############################################################################################################
IMG_SIZE = (128, 128)
BATCH_SIZE = 32
q_epochs = 1  # number of quantization-aware fine-tuning epochs
learning_rate = 0.00001  # learning rate for quantization-aware fine-tuning
num_test = 10  # tflite CPU inference is slow, so cap the evaluated batches (~num_test*BATCH_SIZE images; ~30 is enough)
train_dir = "./train"
test_dir = "./test"
weight_path = "./XXX.h5"  # FIX: original read `./"XXX.h5"`, which is a syntax error
output_path = "./tflite/"
############################################################################################################
## 1. Load data and preprocess #############################################################################
############################################################################################################
train_dataset = "" # placeholder: supply a tf.data.Dataset of (image, label) batches
test_dataset = "" # placeholder: supply a tf.data.Dataset of (image, label) batches
############################################################################################################
## 2. Quantization-aware training ##########################################################################
############################################################################################################
# Load the fp32 model and measure its baseline accuracy.
model = tf.keras.models.load_model(weight_path)
loss0, accuracy0 = model.evaluate(test_dataset)
# Quantization-aware training prepares the weights for int8 quantization; for fp16 deployment,
# mixed-precision training would be the analogous preparation step instead.
# NOTE: in current TensorFlow versions quantize_model does not support nested models. If model0 is a
# chain of model1 and model2, quantize_model(model0) raises. Workaround: quantize model1 and model2
# separately, then chain them, e.g. model0 = tf.keras.Model(quantize_model(model1), quantize_model(model2)).
quantize_model = tfmot.quantization.keras.quantize_model
q_aware_model = quantize_model(model)
# The wrapped model must be recompiled before training.
q_aware_model.compile(optimizer=Adam(learning_rate=learning_rate),
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])
q_aware_model.summary()
# Fine-tune with fake-quantization nodes in place.
history_q = q_aware_model.fit(train_dataset, epochs=q_epochs)
# Accuracy of the quantization-aware model (weights still fp32).
loss1, accuracy1 = q_aware_model.evaluate(test_dataset)
############################################################################################################
## 3. Convert to tflite ####################################################################################
############################################################################################################
# Calibration data generator required by full-int8 post-training quantization.
def representative_dataset():
    """Yield single-image fp32 samples from train_dataset for int8 calibration."""
    for images, _ in train_dataset.take(32):
        # FIX: iterate the actual batch length rather than BATCH_SIZE, so a
        # final partial batch no longer raises an out-of-range error.
        for i in range(images.shape[0]):
            image = np.expand_dims(images[i].numpy(), axis=0).astype(np.float32)
            yield [image]
# Evaluate a converted tflite model on test_dataset.
def evaluate_model(tflite_model, tfl_int8):
    """Run `tflite_model` over up to `num_test` batches of `test_dataset`.

    Args:
        tflite_model: serialized tflite flatbuffer (bytes).
        tfl_int8: True when the model has int8 input/output, in which case the
            images are quantized on the way in and the logits dequantized on
            the way out using the interpreter's quantization parameters.

    Returns:
        (accuracy, average seconds per image).
    """
    # Build the interpreter and read the I/O tensor handles and quantization params.
    interpreter = tf.lite.Interpreter(model_content=tflite_model)
    interpreter.allocate_tensors()
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    input_index = input_details[0]["index"]
    output_index = output_details[0]["index"]
    scale_in, zero_point_in = input_details[0]['quantization']
    scale_out, zero_point_out = output_details[0]['quantization']
    # Walk the test set one image at a time (tflite CPU inference is slow).
    prediction_labels = []
    test_labels = []
    num = min(num_test, len(test_dataset))
    n = 0
    # num*BATCH_SIZE is only an upper bound: the last batch may be smaller.
    pbar = tqdm(total=num*BATCH_SIZE)
    pbar.set_description("Processing int8" if tfl_int8 else "Processing fp32")
    start = time.time()
    for test_images, labels in test_dataset.take(num):
        for i in range(len(test_images)):
            if tfl_int8:
                # real = (q - zero_point) * scale  =>  q = real/scale + zero_point
                test_image = test_images[i] / scale_in + zero_point_in
                test_image = np.expand_dims(test_image.numpy(), axis=0).astype(np.int8)
            else:
                test_image = np.expand_dims(test_images[i].numpy(), axis=0).astype(np.float32)
            interpreter.set_tensor(input_index, test_image)
            interpreter.invoke()
            output = interpreter.get_tensor(output_index)
            if tfl_int8:
                # Dequantize the int8 logits back to floats before argmax.
                output = output.astype(np.float32)
                output = (output - zero_point_out) * scale_out
            digit = np.argmax(output[0])
            prediction_labels.append(digit)
            test_labels.append(labels[i].numpy())
            n += 1
            pbar.update(1)
    end = time.time()
    pbar.close()  # FIX: close the bar so later prints are not garbled
    if n == 0:
        # FIX: avoid ZeroDivisionError when the test set is empty.
        return 0.0, 0.0
    prediction_labels = np.array(prediction_labels)
    test_labels = np.array(test_labels)
    accuracy = (prediction_labels == test_labels).mean()
    return accuracy, (end-start)/n
# -1 QAT model -> dynamic-range tflite (kept for reference) -----------------------------------------------
# converter = tf.lite.TFLiteConverter.from_keras_model(q_aware_model)
# converter.optimizations = [tf.lite.Optimize.DEFAULT]
# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS]
# quantized_tflite_model = converter.convert()
# 0 QAT model -> float32 tflite ---------------------------------------------------------------------------
converter = tf.lite.TFLiteConverter.from_keras_model(q_aware_model)
quantized_tflite_model = converter.convert()
# Accuracy/speed of the fp32 tflite built from the quantization-aware model.
accuracy2, time2 = evaluate_model(quantized_tflite_model, False)
# FIX: rstrip(".h5") strips any trailing '.', 'h' or '5' characters (e.g. "net_v5.h5" -> "net_v");
# use splitext/basename to drop only the extension.
base_path = output_path + os.path.splitext(os.path.basename(weight_path))[0]
output_model = "{}_{:.2f}_fp32.tflite".format(base_path, accuracy2)
with open(output_model, 'wb') as f:
    f.write(quantized_tflite_model)
# 1 QAT model -> int8 tflite ------------------------------------------------------------------------------
converter = tf.lite.TFLiteConverter.from_keras_model(q_aware_model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8   # default is fp32 input converted to int8 inside the graph
converter.inference_output_type = tf.int8  # default is fp32 output
converter.representative_dataset = representative_dataset  # calibration data required for full-int8
converter.allow_custom_ops = False
converter.experimental_new_converter = True
converter.experimental_new_quantizer = True
quantized_tflite_model = converter.convert()
# Accuracy/speed of the full-int8 tflite built from the quantization-aware model.
accuracy3, time3 = evaluate_model(quantized_tflite_model, True)
output_model = "{}_{:.2f}_int8.tflite".format(base_path, accuracy3)
with open(output_model, 'wb') as f:
    f.write(quantized_tflite_model)
# 2 Plain (non-QAT) model -> float32 tflite ---------------------------------------------------------------
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()
# Accuracy/speed of the fp32 tflite built from the un-quantized model.
accuracy4, time4 = evaluate_model(tflite_model, False)
# 3 Plain (non-QAT) model -> int8 tflite ------------------------------------------------------------------
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_dataset  # calibration data required for full-int8
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8   # default is fp32 input converted to int8 inside the graph
converter.inference_output_type = tf.int8  # default is fp32 output
converter.experimental_new_converter = True
converter.experimental_new_quantizer = True
converter.allow_custom_ops = False
tflite_model = converter.convert()
# Accuracy/speed of the full-int8 tflite built from the un-quantized model.
accuracy5, time5 = evaluate_model(tflite_model, True)
# Report all six configurations.
print("initial accuracy: {:.2f}".format(accuracy0))
print("initial accuracy fp32: {:.2f} speed: {:.3f}s".format(accuracy4, time4))
print("initial accuracy int8: {:.2f} speed: {:.3f}s".format(accuracy5, time5))
print("quantized accuracy: {:.2f}".format(accuracy1))
print("quantized accuracy fp32: {:.2f} speed: {:.3f}s".format(accuracy2, time2))
print("quantized accuracy int8: {:.2f} speed: {:.3f}s".format(accuracy3, time3))
tflite的x86端测评
在得到tflite后,往往需要在x86端进行测评,可是当前tflite在x86端只能运行在CPU上,所以速度很慢,而深度学习的数据集往往很大,如果只是测评一个acc就需要一天时间,这就太不合适了,上面代码针对int8的tflite的测评是只选取了一小部分测试集,下面代码通过多进程实现了针对整体数据集的快速测评,且完成了acc,recall,precision和混淆矩阵的计算。
# 计算tflite模型的acc和混淆矩阵,多进程多batch
import os
import cv2
import numpy as np
import tensorflow as tf
from multiprocessing import Process, Queue
# Letterbox-resize an image onto a square canvas, then optionally normalize it.
def image_preprocess(image, target_length, value=0.0, method=0):
    """Resize `image` (H, W, 3) to (target_length, target_length, 3), keeping
    the aspect ratio and padding the borders with `value` (centered paste).

    method == 0: scale pixels to [0, 1]; method == 1: scale to [-1, 1];
    any other value: return the padded image without scaling.
    """
    image = image.astype("float32")
    src_h, src_w, _ = image.shape
    dst_h, dst_w = target_length, target_length
    ratio = min(dst_w / src_w, dst_h / src_h)  # largest scale that still fits
    new_w, new_h = int(ratio * src_w), int(ratio * src_h)
    resized = cv2.resize(image, (new_w, new_h))
    canvas = np.full(shape=[dst_h, dst_w, 3], fill_value=value)
    off_w, off_h = (dst_w - new_w) // 2, (dst_h - new_h) // 2
    canvas[off_h:new_h + off_h, off_w:new_w + off_w, :] = resized  # paste centered
    if method == 0:
        canvas = canvas / 255.
    elif method == 1:
        canvas = canvas / 127.5 - 1.0
    return canvas
# Worker process: classify one shard of the image list and push count arrays to the queue.
def fun(q, interpreter, image_list, label_name, batch_size, tfl_int8, last_size_b, num_core, n):
    """Run `image_list` (length is a multiple of batch_size) through `interpreter`.

    Puts [num_prs, num_rec, num_mat] on `q`: num_prs[c] counts predictions of
    class c, num_rec[c] counts ground-truth samples of class c, and num_mat is
    the confusion matrix (rows = truth, columns = prediction). Only the final
    batch of the final worker (n == num_core-1) may contain padding images;
    `last_size_b` is its count of genuine images (0 means the batch is full).
    """
    # Interpreter I/O handles and quantization parameters.
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    input_index = input_details[0]["index"]
    output_index = output_details[0]["index"]
    scale_in, zero_point_in = input_details[0]['quantization']
    scale_out, zero_point_out = output_details[0]['quantization']
    # Counters; the tiny epsilon keeps later precision/recall divisions finite.
    num_i = 0  # images processed so far (in whole batches)
    num_j = 0  # position inside the current batch, cycles in [0, batch_size-1]
    num_total = len(image_list)
    num_prs = np.array([0.000000001 for _ in range(len(label_name))])
    num_rec = np.array([0.000000001 for _ in range(len(label_name))])
    num_mat = np.array([[0] * len(label_name) for _ in range(len(label_name))])
    for image_path in image_list:
        if num_j == 0:  # start a fresh batch
            images_list = []
            images_label = []
        image = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
        image = image_preprocess(image, 128, 0, 1)
        images_list.append(image)
        # The parent directory name is the ground-truth label.
        images_label.append(label_name.index(image_path.split("/")[-2]))
        if num_j == batch_size - 1:  # batch complete: run inference
            images = np.array(images_list)
            if tfl_int8:
                images = images / scale_in + zero_point_in
                images = images.astype(np.int8)
            else:
                images = images.astype(np.float32)
            interpreter.set_tensor(input_index, images)
            interpreter.invoke()
            output = interpreter.get_tensor(output_index)
            if tfl_int8:
                # Dequantize the int8 logits back to floats before argmax.
                output = output.astype(np.float32)
                output = (output - zero_point_out) * scale_out
            # The last batch of the last worker may contain padding images;
            # score only the first `last_size_b` of them.
            if n == num_core - 1 and num_i == num_total - batch_size and last_size_b != 0:
                num = last_size_b
            else:
                num = batch_size
            for k in range(num):
                digit = np.argmax(output[k])
                num_prs[int(digit)] += 1
                num_rec[images_label[k]] += 1
                num_mat[images_label[k], int(digit)] += 1
            num_i += batch_size
            if num_i%(4*batch_size) == 0:
                print("Process %0.3d : finish %0.4d | total %0.4d | percentage %0.2f%%" % (n, num_i, num_total, num_i/num_total*100))
        # Cycle num_j through [0, batch_size-1] (equivalent to the original if/else reset).
        num_j = (num_j + 1) % batch_size
    q.put([num_prs, num_rec, num_mat])
# Initialization / configuration
image_size = (128, 128)
batch_size = 32
num_core = 16 # number of worker processes (match the CPU core count)
tfl_int8 = False # True when evaluating an int8-quantized tflite model
model_path = "./xxx.tflite"
path = "./data/cats_and_dogs_filtered/validation/" # dataset root: one sub-folder per class
label_name = ['cats', 'dogs'] # must match the label order the tflite model was trained with
num_prs = np.array([0.000000001 for _ in range(len(label_name))])
num_rec = np.array([0.000000001 for _ in range(len(label_name))])
num_mat = np.array([[0] * len(label_name) for _ in range(len(label_name))])
q = Queue()
# Load the tflite model and resize its input tensor to hold a full batch.
interpreter = tf.lite.Interpreter(model_path)
input_index = interpreter.get_input_details()[0]["index"]
interpreter.resize_tensor_input(input_index, [batch_size, image_size[0], image_size[1], 3], strict=True)
interpreter.allocate_tensors()
# Collect every image path, walking the per-class sub-folders in label order.
image_list = [
    label_dir + image_name
    for label_dir in (path + label + "/" for label in label_name)
    for image_name in os.listdir(label_dir)
]
print("finish read image %d" % len(image_list))
# Partition the work across workers as evenly as possible: pad image_list to a
# whole number of batches, then give each worker delta (or delta+1) batches.
last_size_b = len(image_list) - (len(image_list)//batch_size)*batch_size  # genuine images in the last batch; 0 when it is already full
if last_size_b != 0:
    # Pad with images from the front of the list so every batch is full;
    # workers discard the padding via last_size_b.
    for i in range(batch_size - last_size_b):
        image_list.append(image_list[i])
# FIX: the original mixed float division (len(image_list)/batch_size) into the
# index arithmetic; it only worked because the padded length divides evenly.
# Use integer arithmetic throughout; the resulting indices are identical.
num_batches = len(image_list) // batch_size
delta, rem = divmod(num_batches, num_core)
assert delta > 0  # need at least one batch per worker
# image_index[i]:image_index[i+1] is worker i's slice of image_list;
# the first `rem` workers take one extra batch.
image_index = [0]
for i in range(num_core):
    step = (delta + 1 if i < rem else delta) * batch_size
    image_index.append(image_index[-1] + step)
# Launch one worker per core on its slice of image_list.
process_list = []
for i in range(num_core):
    p = Process(target=fun, args=(q, interpreter, image_list[image_index[i]:image_index[i+1]], label_name, batch_size, tfl_int8, last_size_b, num_core, i))
    p.start()
    process_list.append(p)
# FIX: drain the queue BEFORE joining. The original joined first, which can
# deadlock: a child process cannot exit while its Queue feeder thread still
# holds undelivered data (documented multiprocessing pitfall). q.get() blocks,
# so all results are guaranteed to arrive before the joins below.
for i in range(num_core):
    result = q.get()
    num_prs += result[0]
    num_rec += result[1]
    num_mat += result[2]
# Now the queues are empty, joining is safe.
for p in process_list:
    p.join()
# Overall accuracy: trace of the confusion matrix over the total sample count.
diag = 0
total_pred = 0
total_true = 0
for i in range(len(label_name)):
    diag += num_mat[i, i]
    total_pred += num_prs[i]
    total_true += num_rec[i]
assert int(total_pred) == int(total_true)
# Truncate (not round) to two decimal places, as in the original report.
accuracy = (diag/total_pred*100*100//1)/100
print("total accuracy: "+ str(accuracy))
# Per-class precision: correct predictions of class i / all predictions of class i.
prs = [float(num_mat[i, i])/float(num_prs[i])*100 for i in range(len(label_name))]
result_prs = {name: (p*100//1)/100 for name, p in zip(label_name, prs)}
print("every precision: "+ str(result_prs))
# Per-class recall: correct predictions of class i / all ground-truth samples of class i.
rec = [float(num_mat[i, i])/float(num_rec[i])*100 for i in range(len(label_name))]
result_rec = {name: (r*100//1)/100 for name, r in zip(label_name, rec)}
print("every recall: "+ str(result_rec))
# Confusion matrix: rows = ground truth, columns = prediction.
print(label_name)
print(num_mat)