

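# Environment setup -- run these in a shell, not in the Python interpreter.
# NOTE(review): the index URL / trusted host and the TensorRT version suffix
# are missing from this copy; fill them in for your environment.
#   pip install pycuda -i <index-url> --trusted-host <index-host>
#   pip install tensorrt-<version>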



import torch.nn as nn
import torch
from collections import OrderedDict
import torchvision.models as models
import numpy as np
from torchvision import models
def bilinear_kernel(in_channels, out_channels, kernel_size):
    """Build a bilinear-interpolation weight tensor for a transposed convolution.

    Args:
        in_channels: Number of input channels of the transposed convolution.
        out_channels: Number of output channels. The diagonal fill below
            pairs channel ``i`` with channel ``i``, so this must equal
            ``in_channels`` (which is how every caller in this file uses it).
        kernel_size: Side length of the square kernel.

    Returns:
        A float32 ``torch.Tensor`` of shape
        ``(in_channels, out_channels, kernel_size, kernel_size)`` whose
        ``[i, i]`` slices hold the 2-D bilinear filter and whose other
        slices are zero.
    """
    factor = (kernel_size + 1) // 2
    # The filter peak sits on a pixel for odd kernels and between the two
    # middle pixels for even kernels.
    if kernel_size % 2 == 1:
        center = factor - 1
    else:
        center = factor - 0.5
    og = np.ogrid[:kernel_size, :kernel_size]
    bilinear_filter = (1 - abs(og[0] - center) / factor) * (1 - abs(og[1] - center) / factor)
    weight = np.zeros((in_channels, out_channels, kernel_size, kernel_size), dtype=np.float32)
    weight[range(in_channels), range(out_channels), :, :] = bilinear_filter
    return torch.from_numpy(weight)

# Module-level VGG16-BN backbone, built with pretrained=False (no pretrained
# weights requested). FCN slices its `features` into stages.
pretrained_net = models.vgg16_bn(pretrained=False)

class FCN(nn.Module):
    """Fully convolutional segmentation network on the module-level VGG16-BN.

    The backbone ``pretrained_net.features`` is cut into five stages. The
    stage-5 output is upsampled 2x and added to the stage-4 output, the sum
    is reduced to 256 channels, upsampled 2x and added to the stage-3 output,
    and the result is projected to ``num_classes`` channels and upsampled 8x.
    All transposed convolutions start from bilinear-interpolation weights.

    Args:
        num_classes: Number of output channels (segmentation classes).
    """

    def __init__(self, num_classes):
        super(FCN, self).__init__()

        # Backbone stages, sliced from the shared VGG16-BN feature extractor.
        self.stage1 = pretrained_net.features[:7]
        self.stage2 = pretrained_net.features[7:14]
        self.stage3 = pretrained_net.features[14:24]
        self.stage4 = pretrained_net.features[24:34]
        self.stage5 = pretrained_net.features[34:]

        # 1x1 score heads. forward() does not use them; they are kept so the
        # module's parameter set (and any saved state_dict) stays unchanged.
        self.scores1 = nn.Conv2d(512, num_classes, 1)
        self.scores2 = nn.Conv2d(512, num_classes, 1)
        self.scores3 = nn.Conv2d(128, num_classes, 1)

        # 1x1 convolutions that change the channel count after each fusion.
        self.conv_trans1 = nn.Conv2d(512, 256, 1)
        self.conv_trans2 = nn.Conv2d(256, num_classes, 1)

        # Transposed convolutions initialised as bilinear upsampling.
        self.upsample_8x = nn.ConvTranspose2d(num_classes, num_classes, 16, 8, 4, bias=False)
        self.upsample_8x.weight.data = bilinear_kernel(num_classes, num_classes, 16)

        self.upsample_2x_1 = nn.ConvTranspose2d(512, 512, 4, 2, 1, bias=False)
        self.upsample_2x_1.weight.data = bilinear_kernel(512, 512, 4)

        self.upsample_2x_2 = nn.ConvTranspose2d(256, 256, 4, 2, 1, bias=False)
        self.upsample_2x_2.weight.data = bilinear_kernel(256, 256, 4)

    def forward(self, x):
        """Run the network on ``x`` and return the ``num_classes``-channel map."""
        s1 = self.stage1(x)
        s2 = self.stage2(s1)
        s3 = self.stage3(s2)
        s4 = self.stage4(s3)
        s5 = self.stage5(s4)

        # First 2x upsampling (512 -> 512 channels), fused with stage 4.
        s5 = self.upsample_2x_1(s5)
        add1 = s5 + s4

        # Reduce the fused map to 256 channels, upsample 2x, fuse with stage 3.
        add1 = self.conv_trans1(add1)
        add1 = self.upsample_2x_2(add1)
        add2 = add1 + s3

        # Project to num_classes channels, then upsample 8x with the
        # transposed convolution.
        output = self.conv_trans2(add2)
        output = self.upsample_8x(output)

        return output

# Module-level plain VGG16 (no batch norm), built with pretrained=False.
# VGG16_deconv slices its `features` and mutates its pooling layers in place.
vgg16_pretrained = models.vgg16(pretrained=False)

def decoder(input_channel, output_channel, num=3):
    """Build a stack of 3x3, padding-1 convolutions used as a decoder stage.

    The first ``num - 1`` convolutions keep ``input_channel`` channels and the
    last one maps to ``output_channel``. No activation is inserted between
    them.

    Args:
        input_channel: Channels of the incoming feature map.
        output_channel: Channels produced by the final convolution.
        num: Total number of convolutions in the stack (must be >= 1).

    Returns:
        An ``nn.Sequential`` holding the ``num`` convolutions.

    Raises:
        ValueError: If ``num`` is smaller than 1.
    """
    if num < 1:
        raise ValueError("num must be >= 1, got {}".format(num))

    layers = [
        nn.Conv2d(input_channel, input_channel, 3, padding=1)
        for _ in range(num - 1)
    ]
    layers.append(nn.Conv2d(input_channel, output_channel, 3, padding=1))
    return nn.Sequential(*layers)

class VGG16_deconv(torch.nn.Module):
    """Encoder-decoder built on the module-level ``vgg16_pretrained``.

    The five encoder stages are slices of ``vgg16_pretrained.features``; each
    is followed by its max-pool layer, which is switched to return pooling
    indices. The decoder mirrors the encoder: every stage un-pools with the
    matching indices and output size, then applies a ``decoder(...)`` stack.

    Args:
        num_classes: Channels produced by the last decoder stage.
    """

    def __init__(self, num_classes=8):
        super(VGG16_deconv, self).__init__()

        # Positions of the max-pool layers in vgg16.features; they must hand
        # back indices so MaxUnpool2d can invert them. This mutates the
        # shared module-level network.
        for pool_index in (4, 9, 16, 23, 30):
            vgg16_pretrained.features[pool_index].return_indices = True

        features = vgg16_pretrained.features

        self.encoder1 = features[:4]
        self.pool1 = features[4]

        self.encoder2 = features[5:9]
        self.pool2 = features[9]

        self.encoder3 = features[10:16]
        self.pool3 = features[16]

        self.encoder4 = features[17:23]
        self.pool4 = features[23]

        self.encoder5 = features[24:30]
        self.pool5 = features[30]

        self.decoder5 = decoder(512, 512)
        self.unpool5 = nn.MaxUnpool2d(2, 2)

        self.decoder4 = decoder(512, 256)
        self.unpool4 = nn.MaxUnpool2d(2, 2)

        self.decoder3 = decoder(256, 128)
        self.unpool3 = nn.MaxUnpool2d(2, 2)

        self.decoder2 = decoder(128, 64, 2)
        self.unpool2 = nn.MaxUnpool2d(2, 2)

        self.decoder1 = decoder(64, num_classes, 2)
        self.unpool1 = nn.MaxUnpool2d(2, 2)

    def forward(self, x):
        """Encode ``x`` through five pooled stages, then decode symmetrically."""
        # Encoder: remember each pre-pool size and the pooling indices.
        feat1 = self.encoder1(x)
        size1 = feat1.size()
        down1, idx1 = self.pool1(feat1)

        feat2 = self.encoder2(down1)
        size2 = feat2.size()
        down2, idx2 = self.pool2(feat2)

        feat3 = self.encoder3(down2)
        size3 = feat3.size()
        down3, idx3 = self.pool3(feat3)

        feat4 = self.encoder4(down3)
        size4 = feat4.size()
        down4, idx4 = self.pool4(feat4)

        feat5 = self.encoder5(down4)
        size5 = feat5.size()
        down5, idx5 = self.pool5(feat5)

        # Decoder: un-pool with the stored indices, then convolve.
        up5 = self.unpool5(input=down5, indices=idx5, output_size=size5)
        dec5 = self.decoder5(up5)

        up4 = self.unpool4(input=dec5, indices=idx4, output_size=size4)
        dec4 = self.decoder4(up4)

        up3 = self.unpool3(input=dec4, indices=idx3, output_size=size3)
        dec3 = self.decoder3(up3)

        up2 = self.unpool2(input=dec3, indices=idx2, output_size=size2)
        dec2 = self.decoder2(up2)

        up1 = self.unpool1(input=dec2, indices=idx1, output_size=size1)
        return self.decoder1(up1)

# Build the network instance.
# net = UNet(in_channels=3,num_classes=2)
# net = FCN(3)
net = VGG16_deconv(num_classes=3)
# Switch to evaluation mode so parameters stop updating before tracing.
net.eval()
# NOTE(review): the step that loaded the best checkpoint into `net` is missing
# from this copy (its path was blank); restore it before exporting real weights.
# Trace the network for inference.
trace = torch.jit.trace(net, torch.randn(1, 3, 640, 640))
# NOTE(review): the step that saved the traced .pt model is missing from this
# copy (its target path was blank).
# Load the model.
# NOTE(review): the path below is blank in this copy and must be filled in.
model = torch.load('')
# Evaluation mode, then move the model to CUDA so it matches `input_x`.
model.eval()
model = model.cuda()

input_x = torch.randn(1, 3, 640, 640).cuda()
# torch.onnx.export(model, x, 'UNet_model.onnx', input_names=input_names, output_names=output_names, verbose='True')
# Export with dynamic axes. The key in `dynamic_axes` must match an entry of
# `input_names`. The output file name matches the `onnx_file_path` used by the
# engine-building script later in this file -- TODO confirm both choices.
res = torch.onnx.export(
    model,
    input_x,
    "FCN_model2.onnx",
    input_names=["inputs"],
    output_names=["outputs"],
    dynamic_axes={"inputs": {0: "bs", 1: "channel", 2: "h", 3: "w"}},
    # verbose=True,  # True prints debugging information
)


import tensorrt as trt
import os
import common # 修改后的common文件
# Network-creation flag bit that requests an explicit batch dimension.
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
# TensorRT logger instance shared by the builders below.
TRT_LOGGER = trt.Logger()


mix_size = (1, 3, 128, 128)  # smallest size
common_size = (1, 3, 640, 640)  # typical size
max_size = (1, 3, 2048, 2048)  # largest size

def get_engine(onnx_file_path, engine_file_path=""):
    """Load a serialized TensorRT engine if present, otherwise build one from ONNX.

    Args:
        onnx_file_path: Path of the ONNX model used when an engine must be built.
        engine_file_path: Path of the serialized engine. If the file exists it
            is deserialized; otherwise a newly built engine is written there.
            An empty string disables both loading and saving.

    Returns:
        The deserialized engine, or ``None`` when ONNX parsing or the engine
        build fails.

    Raises:
        FileNotFoundError: If an engine must be built and ``onnx_file_path``
            does not exist.
    """

    def build_engine():
        """Parse the ONNX file, build an engine and optionally save it."""
        if not os.path.exists(onnx_file_path):
            raise FileNotFoundError(
                "ONNX file {} not found, please run first to generate it.".format(onnx_file_path)
            )
        # builder -> network -> config -> ONNX parser -> runtime.
        with trt.Builder(TRT_LOGGER) as builder, \
                builder.create_network(EXPLICIT_BATCH) as network, \
                builder.create_builder_config() as config, \
                trt.OnnxParser(network, TRT_LOGGER) as parser, \
                trt.Runtime(TRT_LOGGER) as runtime:
            config.max_workspace_size = 1 << 32  # 4 GiB
            builder.max_batch_size = 1

            print("Loading ONNX file from path {}...".format(onnx_file_path))
            with open(onnx_file_path, "rb") as model:
                print("Beginning ONNX file parsing")
                if not parser.parse(model.read()):
                    print("ERROR: Failed to parse the ONNX file.")
                    # Report why parsing failed.
                    for error in range(parser.num_errors):
                        print(parser.get_error(error))
                    return None

            # One optimization profile for the dynamic input: minimum, typical
            # and maximum shapes. Inference inputs must lie within this range.
            profile = builder.create_optimization_profile()
            input_name = network.get_input(0).name
            print("network.get_input(0).name:", input_name)
            profile.set_shape(input_name, (1, 3, 32, 32), (1, 3, 512, 512),
                              (1, 3, 648, 648))
            # The profile only takes effect once it is attached to the config.
            config.add_optimization_profile(profile)

            print("Completed parsing of ONNX file")
            print("Building an engine from file {}; this may take a while...".format(onnx_file_path))
            plan = builder.build_serialized_network(network, config)
            if plan is None:
                print("ERROR: Failed to build the engine.")
                return None
            engine = runtime.deserialize_cuda_engine(plan)
            print("Completed creating Engine")
            # Save the serialized engine locally when a path was given.
            if engine_file_path:
                with open(engine_file_path, "wb") as f:
                    f.write(plan)
            return engine

    # Reuse an existing engine file; otherwise build from the ONNX model.
    if os.path.exists(engine_file_path):
        print("Reading engine from file {}".format(engine_file_path))
        with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())
    # NOTE(review): two statements assigning `self.inputs`/`self.context`
    # followed this return in the original. They referenced names that do not
    # exist in this function and could never run, so they were dropped.
    return build_engine()

def get_DynEngine(onnx_file_path, engine_file_path):
    """Load a serialized engine if available, otherwise build and save one.

    The built engine uses one optimization profile so the first network input
    accepts shapes between (1, 3, 32, 32) and (1, 3, 648, 648).

    Args:
        onnx_file_path: Path of the ONNX model used when an engine must be built.
        engine_file_path: Path the serialized engine is read from or written to.

    Returns:
        The TensorRT engine, or ``None`` when ONNX parsing or the build fails.

    Raises:
        FileNotFoundError: If an engine must be built and ``onnx_file_path``
            does not exist.
    """

    def build_engine():
        """Parse the ONNX file, build the engine and write it to disk."""
        if not os.path.exists(onnx_file_path):
            raise FileNotFoundError(
                f'onnx file {onnx_file_path} not found,please run first to generate it')

        # builder -> network -> config -> ONNX parser.
        builder = trt.Builder(TRT_LOGGER)
        network = builder.create_network(common.EXPLICIT_BATCH)
        config = builder.create_builder_config()
        parser = trt.OnnxParser(network, TRT_LOGGER)
        # Upper bound on scratch memory; lower it if the GPU runs out of memory.
        config.max_workspace_size = 10 << 30  # 10 GiB
        builder.max_batch_size = 1  # inference batch size must stay <= this

        print(f'Loading ONNX file from path {onnx_file_path}...')
        with open(onnx_file_path, 'rb') as model:
            print('Beginning ONNX file parsing')
            if not parser.parse(model.read()):
                print('ERROR:Failed to parse the ONNX file')
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                return None

        print("Network Description")
        for index in range(network.num_inputs):
            tensor = network.get_input(index)
            print("Input '{}' with shape {} and dtype {} . ".format(tensor.name, tensor.shape, tensor.dtype))
        for index in range(network.num_outputs):
            tensor = network.get_output(index)
            print("Output '{}' with shape {} and dtype {} . ".format(tensor.name, tensor.shape, tensor.dtype))

        # Dynamic input: minimum, typical and maximum shapes. Inference inputs
        # must lie within this range.
        profile = builder.create_optimization_profile()
        profile.set_shape(network.get_input(0).name, (1, 3, 32, 32), (1, 3, 512, 512),
                          (1, 3, 648, 648))
        # The profile only takes effect once it is attached to the config.
        config.add_optimization_profile(profile)

        print('Completed parsing the ONNX file')
        print(f'Building an engine from file {onnx_file_path}; this may take a while...')
        engine = builder.build_engine(network, config)
        if engine is None:
            print('ERROR:Failed to build the engine')
            return None
        print('Completed creating Engine')
        with open(engine_file_path, 'wb') as f:
            f.write(engine.serialize())
        return engine

    if os.path.exists(engine_file_path):
        print(f'Reading engine from file {engine_file_path}')
        with open(engine_file_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    return build_engine()

if __name__ == "__main__":
    # Build (or load) the TensorRT engine for the segmentation model and
    # report how long that took.
    from datetime import datetime

    startTime = datetime.now()
    onnx_file_path = "FCN_model2.onnx"  # "UNet_model2.onnx" # "HySegNet.onnx"
    engine_file_path = "model222.engine"  # "test.engine" # "model_seg.engine"
    # get_engine(onnx_file_path, engine_file_path,is_dyn=1)
    get_DynEngine(onnx_file_path, engine_file_path)
    endTime = datetime.now()
    duringTime = endTime - startTime
    print(duringTime)


import numpy as np
import os
import pycuda.driver as cuda #GPU CPU之间的数据传输
import pycuda.autoinit #负责数据初始化,内存管理,销毁等
import tensorrt as trt
import torch
import matplotlib.pyplot as plt
from PIL import Image
import pandas as pd
import torch.nn.functional as F

# TensorRT logger used by load_engine below; this rebinds the TRT_LOGGER
# that is already defined earlier in the file.
TRT_LOGGER = trt.Logger()

import cv2
# Filenames of TensorRT plan file and input/output images.
# For torchvision models, input images are loaded in to a range of [0, 1] and
# normalized using mean = [0.485, 0.456, 0.406] and stddev = [0.229, 0.224, 0.225].

def preprocess(image):
    """Normalise an HWC image and return it in CHW order.

    Pixel values are scaled by 1/255, then the per-channel mean is subtracted
    and the result divided by the per-channel standard deviation (both given
    for three channels on the last axis).

    Args:
        image: Array-like with the channel axis last and three channels.

    Returns:
        A float32 array with the channel axis moved to the front.
    """
    channel_mean = np.array([0.485, 0.456, 0.406]).astype('float32')
    channel_std = np.array([0.229, 0.224, 0.225]).astype('float32')

    scaled = np.asarray(image).astype('float32') / float(255.0)
    normalised = (scaled - channel_mean) / channel_std

    # HWC -> CHW.
    return np.moveaxis(normalised, 2, 0)

def postprocess(data, num_classes=21):
    """Turn a class-index map into a palettised ('P' mode) PIL image.

    Args:
        data: 2-D array of class indices; it is cast to uint8.
        num_classes: Number of palette entries to generate (default 21).

    Returns:
        A PIL image in mode 'P' whose palette assigns one colour per class.
    """
    # Create a colour palette, selecting a colour for each class.
    palette = np.array([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1])
    colors = np.array([palette * i % 255 for i in range(num_classes)]).astype("uint8")
    # Show the segmentation predictions in different colours: the palette has
    # to be attached to the image, otherwise `colors` has no effect.
    img = Image.fromarray(data.astype('uint8'), mode='P')
    img.putpalette(colors.tobytes())
    return img

def load_engine(engine_file_path):
    """Deserialize a TensorRT engine from ``engine_file_path``.

    Args:
        engine_file_path: Path of the serialized engine file.

    Returns:
        The engine returned by ``runtime.deserialize_cuda_engine``.

    Raises:
        FileNotFoundError: If ``engine_file_path`` does not exist.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not os.path.exists(engine_file_path):
        raise FileNotFoundError("Engine file {} not found".format(engine_file_path))
    print("Reading engine from file {}".format(engine_file_path))
    with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())

def pred2show(mask, iii):
    """Map a class-index mask to display grey levels using the class table.

    The table ``color2class_table.csv`` is read from the working directory.
    Its first column holds the class id; the class in table row ``k`` is shown
    with grey level ``k * 255``.

    Args:
        mask: 2-D array of class ids.
        iii: Index used to form the file name ``test<iii>.png`` read at the end.

    Returns:
        An integer array with the same height and width as ``mask`` holding
        the grey level of every pixel.

    Raises:
        KeyError: If ``mask`` contains a class id that is not in the table.
    """
    # 1. Read the lookup table and build the class-id -> grey-level mapping.
    #    The RGB columns of the table were read by the original code but then
    #    overwritten by `row * 255`, so only the class ids are needed.
    path_color2class_table = r".\color2class_table.csv"
    dataframe = pd.read_csv(path_color2class_table)
    list_class_id = [int(value) for value in dataframe.iloc[:, 0]]
    dict_color2class = {class_id: row * 255 for row, class_id in enumerate(list_class_id)}

    # 2. Allocate the output with the mask's own height and width, so no
    #    pixel is left uninitialised or out of range.
    mask = np.asarray(mask)
    height = mask.shape[0]
    weight = mask.shape[1]
    pred = np.empty([height, weight], dtype=int)

    # 3. Fill the output class by class instead of pixel by pixel.
    unknown = set(np.unique(mask).tolist()) - set(dict_color2class)
    if unknown:
        raise KeyError("mask contains class ids missing from the table: {}".format(sorted(unknown)))
    for class_id, level in dict_color2class.items():
        pred[mask == class_id] = level

    # NOTE(review): the original reads this image but never uses it, and the
    # step that would have written the file is not present in this copy.
    img_show = cv2.imread("test" + str(iii) + ".png")
    return pred

def infer(engine, input_file, output_file):
    """Run one image through a TensorRT engine using explicit pycuda buffers.

    Args:
        engine: TensorRT engine that has a binding named "input".
        input_file: Path of the image to segment.
        output_file: Unused by the active code (the image-writing step is
            disabled in this version).

    Returns:
        The raw network output reshaped to ``(-1, height, width)``.
    """
    import datetime

    print("Reading input image from file {}".format(input_file))
    with Image.open(input_file) as img:
        # preprocess() needs three channels; convert() is a no-op for RGB.
        # LANCZOS is the same filter as the removed ANTIALIAS alias.
        img = img.convert("RGB").resize((640, 640), Image.LANCZOS)
        input_image = preprocess(img)
        image_width = img.width
        image_height = img.height

    # Step 5: create the execution context and run inference.
    with engine.create_execution_context() as context:
        # Set the input shape for this inference from the image dimensions.
        context.set_binding_shape(engine.get_binding_index("input"), (1, 3, image_height, image_width))
        # Device pointers, in binding order.
        bindings = []

        # Allocate host and device memory for every binding.
        for binding in engine:
            binding_idx = engine.get_binding_index(binding)
            size = trt.volume(context.get_binding_shape(binding_idx))
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            if engine.binding_is_input(binding):
                # Step 3a: host buffer and device memory for the input.
                input_buffer = np.ascontiguousarray(input_image)
                input_memory = cuda.mem_alloc(input_image.nbytes)
                bindings.append(int(input_memory))
            else:
                # Step 3b: page-locked host buffer and device memory for the output.
                output_buffer = cuda.pagelocked_empty(size, dtype)
                output_memory = cuda.mem_alloc(output_buffer.nbytes)
                bindings.append(int(output_memory))

        # Step 4: create the CUDA stream and copy the input to the device.
        stream = cuda.Stream()
        cuda.memcpy_htod_async(input_memory, input_buffer, stream)

        # Run inference. The call is asynchronous, so the interval measured
        # here is taken around the call itself, before the stream is synchronised.
        startTime = datetime.datetime.now()
        context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
        endTime = datetime.datetime.now()
        durTime = 'funtion time use:%dms' % (
            (endTime - startTime).seconds * 1000 + (endTime - startTime).microseconds / 1000)
        print(durTime)

        # Copy the output back from the GPU and wait for the stream to finish.
        cuda.memcpy_dtoh_async(output_buffer, output_memory, stream)
        stream.synchronize()

        # Reshape to (classes, H, W); -1 lets the class count follow the engine.
        res = np.reshape(output_buffer, (-1, image_height, image_width))

    # Disabled post-processing in the original: log_softmax over the class
    # axis, argmax to a label mask, then pred2show()/postprocess() and saving
    # the result to `output_file`.
    return res

import common
def infer2(engine, input_file, output_file):
    """Run one image through a dynamic-shape engine via the `common` helpers.

    Args:
        engine: TensorRT engine built with a dynamic-shape input at binding 0.
        input_file: Path of the image to segment.
        output_file: Unused by the active code (the image-writing step is
            disabled in this version).

    Returns:
        The first network output reshaped to ``(-1, height, width)``.
    """
    with Image.open(input_file) as img:
        # preprocess() needs three channels; convert() is a no-op for RGB.
        # LANCZOS is the same filter as the removed ANTIALIAS alias.
        img = img.convert("RGB").resize((640, 640), Image.LANCZOS)
        input_image = preprocess(img)
        width = img.width
        height = img.height

    # Create the execution context.
    context = engine.create_execution_context()

    # allocate_buffers comes from the modified `common` module and takes the
    # input size to support dynamic shapes.
    inputs, outputs, bindings, stream = common.allocate_buffers(engine, (height, width))

    # Select which optimization profile to use.
    context.active_optimization_profile = 0

    tmpImg = input_image[np.newaxis, :, :, :]  # CHW -> NCHW

    # With a dynamic input the binding shape contains -1. It must be set to
    # the concrete input shape, including any dynamic batch/channel dims.
    origin_inputshape = context.get_binding_shape(0)
    if -1 in tuple(origin_inputshape):
        context.set_binding_shape(0, tuple(tmpImg.shape))

    print(f'Running inference on image {input_file}...')
    inputs[0].host = np.ascontiguousarray(tmpImg)
    trt_outputs = common.do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)[0]
    # Reshape to (classes, H, W); -1 lets the class count follow the engine.
    trt_outputs = np.reshape(trt_outputs, (-1, height, width))
    # Disabled post-processing in the original: invert, min-max normalise to
    # 0..255, clip, and write the result to `output_file` with cv2.imwrite.
    return trt_outputs

if __name__ == '__main__':
flag = 0
if flag:
# 引擎路径
engine_file = "model_seg.engine" # "model_seg.engine"
# 输入图像路径
input_file = r".\liver\train\image\0.png" # r"E0_0_E0_0_Image_20220907142018844.bmp"
# 输出结果保存路径
output_file = "output.png"
# 读取图像数据流
img =
print("Running TensorRT inference for Seg")
# 加载引擎
with load_engine(engine_file) as engine:
# 推理
infer(engine, input_file, output_file)
# infer(engine, input_file, output_file)
# infer(engine, input_file, output_file)
# import datetime
# startTime =
# infer(engine, input_file, output_file)
# endTime =
# durTime = 'funtion time use:%dms' % ((endTime -startTime ).seconds * 1000 + (endTime -startTime ).microseconds / 1000)
# print(durTime)
engine_file = "model222.engine" # "model_seg.engine"
# 输入图像路径
input_file = r".\liver\train\image\0.png" # r"E0_0_E0_0_Image_20220907142018844.bmp"
# 输出结果保存路径
output_file = "output.png"
with load_engine(engine_file) as engine: