## 3.整体网络结构

End-to-End Referring Video Object Segmentation with Multimodal Transformers 的网络结构主要包括两个部分：指代表达理解模块和视频对象分割模块。 <br> 指代表达理解模块：

## 5.损失函数设计

损失函数（loss function）：

其中：

实验细节：

- batch_size = 16
- lr = 0.01，采用多项式衰减
- 损失参数：20, 1, 1, 1
- resolution：800×800

## 代码

``````import torchimport torch.nn.functional as Ffrom torch import nnfrom network import SEresnextfrom network import Resnetfrom network.wider_resnet import wider_resnet38_a2from config import cfgfrom network.mynn import initialize_weights, Norm2dfrom torch.autograd import Variable
from my_functionals import GatedSpatialConv as gsc
import cv2import numpy as np
class Crop(nn.Module):
    """Crop the trailing dimensions of a tensor to the size of a reference.

    Starting at ``axis`` and continuing through the last dimension, every
    dimension of the input is cut down to the reference's size along that
    dimension, keeping the window that starts ``offset`` elements in.
    """

    def __init__(self, axis, offset):
        """
        :param axis: first dimension to crop; all later dimensions are cropped too
        :param offset: start index of the kept window (shared by every cropped dimension)
        """
        super(Crop, self).__init__()
        self.axis = axis
        self.offset = offset

    def forward(self, x, ref):
        """
        :param x: input layer
        :param ref: reference usually data in; only its sizes are read
        :return: ``x`` restricted to ``[offset, offset + ref.size(d))`` along
            every dimension ``d >= axis``
        """
        for axis in range(self.axis, x.dim()):
            ref_size = ref.size(axis)
            # Build int64 indices directly on x's device. Copying them through
            # a buffer of x's dtype (the previous approach) corrupts large
            # indices for narrow dtypes such as uint8 or float16.
            indices = torch.arange(self.offset, self.offset + ref_size,
                                   dtype=torch.long, device=x.device)
            x = x.index_select(axis, indices)
        return x

class MyIdentity(nn.Module):
    """Pass-through module with the same constructor/forward signature as ``Crop``."""

    def __init__(self, axis, offset):
        """Record ``axis`` and ``offset``; ``forward`` never reads them."""
        super().__init__()
        self.axis, self.offset = axis, offset

    def forward(self, x, ref):
        """
        :param x: input layer
        :param ref: reference usually data in (ignored)
        :return: ``x`` itself, unchanged
        """
        return x
class SideOutputCrop(nn.Module):
    """Side-output head: 1x1 conv to one channel, then optional upsample and crop.

    When ``kernel_sz`` is given, the single-channel map is upsampled with a
    ``ConvTranspose2d`` and passed through ``Crop`` (or ``MyIdentity`` when
    ``do_crops`` is false). Without ``kernel_sz`` only the 1x1 conv is applied.
    """

    def __init__(self, num_output, kernel_sz=None, stride=None, upconv_pad=0, do_crops=True):
        """
        :param num_output: number of input channels fed to the 1x1 conv
        :param kernel_sz: transposed-conv kernel size; ``None`` disables upsampling
        :param stride: transposed-conv stride
        :param upconv_pad: transposed-conv padding
        :param do_crops: crop the upsampled map to the reference (offset ``kernel_sz // 4``)
        """
        super().__init__()
        self._do_crops = do_crops
        self.conv = nn.Conv2d(num_output, 1, kernel_size=1, stride=1, padding=0, bias=True)

        self.upsample = kernel_sz is not None
        if not self.upsample:
            # No upsampling requested: neither `upsampled` nor `crops` is created.
            return

        self.upsampled = nn.ConvTranspose2d(
            1, 1, kernel_size=kernel_sz, stride=stride, padding=upconv_pad, bias=False)
        # doing crops
        if self._do_crops:
            self.crops = Crop(2, offset=kernel_sz // 4)
        else:
            self.crops = MyIdentity(None, None)

    def forward(self, res, reference=None):
        """
        :param res: feature map with ``num_output`` channels
        :param reference: tensor whose sizes define the crop; read only when
            upsampling with cropping is enabled
        :return: single-channel side output
        """
        scores = self.conv(res)
        if not self.upsample:
            return scores
        enlarged = self.upsampled(scores)
        return self.crops(enlarged, reference)

class _AtrousSpatialPyramidPoolingModule(nn.Module):
    """Atrous spatial pyramid pooling with an extra edge branch.

    operations performed:
      1x1 x depth
      3x3 x depth dilation 6
      3x3 x depth dilation 12
      3x3 x depth dilation 18
      image pooling
      1x1 conv on the (resized) single-channel edge map
      concatenate all together

    The output has ``reduction_dim * (len(rates) + 3)`` channels; no final
    1x1 conv is applied inside this module.
    """

    def __init__(self, in_dim, reduction_dim=256, output_stride=16, rates=(6, 12, 18)):
        """
        :param in_dim: channels of the incoming feature map
        :param reduction_dim: channels produced by every branch
        :param output_stride: 8 or 16; at stride 8 every dilation rate is doubled
        :param rates: dilation rates of the 3x3 branches (given for stride 16)
        :raises ValueError: if ``output_stride`` is neither 8 nor 16
        """
        super(_AtrousSpatialPyramidPoolingModule, self).__init__()

        if output_stride == 8:
            rates = [2 * r for r in rates]
        elif output_stride != 16:
            # Raising a plain string is itself a TypeError in Python 3, which
            # hid the intended message; raise a real exception instead.
            raise ValueError('output stride of {} not supported'.format(output_stride))

        # 1x1
        branches = [
            nn.Sequential(nn.Conv2d(in_dim, reduction_dim, kernel_size=1, bias=False),
                          Norm2d(reduction_dim), nn.ReLU(inplace=True))
        ]
        # other rates
        for r in rates:
            branches.append(nn.Sequential(
                nn.Conv2d(in_dim, reduction_dim, kernel_size=3,
                          dilation=r, padding=r, bias=False),
                Norm2d(reduction_dim),
                nn.ReLU(inplace=True)
            ))
        self.features = torch.nn.ModuleList(branches)

        # img level features
        self.img_pooling = nn.AdaptiveAvgPool2d(1)
        self.img_conv = nn.Sequential(
            nn.Conv2d(in_dim, reduction_dim, kernel_size=1, bias=False),
            Norm2d(reduction_dim), nn.ReLU(inplace=True))
        # edge features: expects a single-channel edge map
        self.edge_conv = nn.Sequential(
            nn.Conv2d(1, reduction_dim, kernel_size=1, bias=False),
            Norm2d(reduction_dim), nn.ReLU(inplace=True))

    def forward(self, x, edge):
        """
        :param x: feature map with ``in_dim`` channels
        :param edge: single-channel edge map of any spatial size; it is
            resized to the spatial size of ``x``
        :return: channel-wise concatenation of, in order, the image-pooling
            branch, the edge branch, the 1x1 branch and the dilated branches
        """
        spatial = x.size()[2:]

        # Global average pool -> 1x1 conv -> broadcast back to x's spatial size.
        img_features = self.img_conv(self.img_pooling(x))
        img_features = F.interpolate(img_features, spatial,
                                     mode='bilinear', align_corners=True)

        edge_features = F.interpolate(edge, spatial,
                                      mode='bilinear', align_corners=True)
        edge_features = self.edge_conv(edge_features)

        # Single concatenation instead of growing `out` one branch at a time.
        branch_outputs = [f(x) for f in self.features]
        return torch.cat([img_features, edge_features] + branch_outputs, 1)
class GSCNN(nn.Module):
    """Two-branch network: a WiderResNet-38 segmentation trunk plus an edge branch.

    Wide_resnet version of DeepLabV3
    mod1
    pool2
    mod2 str2
    pool3
    mod3-7

      structure: [3, 3, 6, 3, 1, 1]
      channels = [(128, 128), (256, 256), (512, 512), (512, 1024), (512, 1024, 2048),
                  (1024, 2048, 4096)]

    The edge branch (``cs`` in ``forward``) refines full-resolution features
    with gated spatial convolutions driven by side outputs of the trunk. Its
    sigmoid output is fused with a Canny edge map and fed to the ASPP module
    together with the deepest trunk features.
    """

    def __init__(self, num_classes, trunk=None, criterion=None):
        """
        :param num_classes: number of channels of the segmentation output
        :param trunk: accepted for interface compatibility; not read here
        :param criterion: callable invoked as ``criterion((seg_out, edge_out), gts)``
            in training mode; must be set before training
        """
        super(GSCNN, self).__init__()
        self.criterion = criterion
        self.num_classes = num_classes

        wide_resnet = wider_resnet38_a2(classes=1000, dilation=True)
        # NOTE(review): the DataParallel wrap/unwrap is kept as-is because
        # constructing DataParallel may move the module to a GPU as a side
        # effect - confirm before removing.
        wide_resnet = torch.nn.DataParallel(wide_resnet)

        wide_resnet = wide_resnet.module
        self.mod1 = wide_resnet.mod1
        self.mod2 = wide_resnet.mod2
        self.mod3 = wide_resnet.mod3
        self.mod4 = wide_resnet.mod4
        self.mod5 = wide_resnet.mod5
        self.mod6 = wide_resnet.mod6
        self.mod7 = wide_resnet.mod7
        self.pool2 = wide_resnet.pool2
        self.pool3 = wide_resnet.pool3
        self.interpolate = F.interpolate
        del wide_resnet

        # 1x1 side-output heads on trunk features (dsn1 is not used by forward).
        self.dsn1 = nn.Conv2d(64, 1, 1)
        self.dsn3 = nn.Conv2d(256, 1, 1)
        self.dsn4 = nn.Conv2d(512, 1, 1)
        self.dsn7 = nn.Conv2d(4096, 1, 1)

        # Edge branch: residual block -> 1x1 channel reduction, three times.
        self.res1 = Resnet.BasicBlock(64, 64, stride=1, downsample=None)
        self.d1 = nn.Conv2d(64, 32, 1)
        self.res2 = Resnet.BasicBlock(32, 32, stride=1, downsample=None)
        self.d2 = nn.Conv2d(32, 16, 1)
        self.res3 = Resnet.BasicBlock(16, 16, stride=1, downsample=None)
        self.d3 = nn.Conv2d(16, 8, 1)
        self.fuse = nn.Conv2d(8, 1, kernel_size=1, padding=0, bias=False)

        # Fuses the learned edge map with the Canny map (2 channels -> 1).
        self.cw = nn.Conv2d(2, 1, kernel_size=1, padding=0, bias=False)

        self.gate1 = gsc.GatedSpatialConv2d(32, 32)
        self.gate2 = gsc.GatedSpatialConv2d(16, 16)
        self.gate3 = gsc.GatedSpatialConv2d(8, 8)

        self.aspp = _AtrousSpatialPyramidPoolingModule(4096, 256,
                                                       output_stride=8)

        self.bot_fine = nn.Conv2d(128, 48, kernel_size=1, bias=False)
        # 1280 + 256 = 6 ASPP branches x 256 channels each.
        self.bot_aspp = nn.Conv2d(1280 + 256, 256, kernel_size=1, bias=False)

        self.final_seg = nn.Sequential(
            nn.Conv2d(256 + 48, 256, kernel_size=3, padding=1, bias=False),
            Norm2d(256),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1, bias=False),
            Norm2d(256),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, num_classes, kernel_size=1, bias=False))

        self.sigmoid = nn.Sigmoid()
        initialize_weights(self.final_seg)

    @staticmethod
    def _canny_edges(inp):
        """Run OpenCV Canny on each image of the batch; return an (N, 1, H, W) float tensor on inp's device."""
        n, _, h, w = inp.size()
        # detach() lets a grad-tracking input be exported to NumPy.
        # NOTE(review): the uint8 cast truncates values, so this presumably
        # expects pixel values in 0..255 - confirm against the data pipeline.
        im_arr = inp.detach().cpu().numpy().transpose((0, 2, 3, 1)).astype(np.uint8)
        canny = np.zeros((n, 1, h, w))
        for i in range(n):
            canny[i] = cv2.Canny(im_arr[i], 10, 100)
        # Follow the input's device instead of hard-coding .cuda(), so the
        # model also runs on CPU.
        return torch.from_numpy(canny).to(device=inp.device, dtype=torch.float32)

    def forward(self, inp, gts=None):
        """
        :param inp: image batch laid out as (N, C, H, W)
        :param gts: ground truth forwarded to ``self.criterion``; only read in
            training mode
        :return: ``self.criterion((seg_out, edge_out), gts)`` in training mode,
            otherwise the tuple ``(seg_out, edge_out)``, both at the spatial
            size of ``inp``
        """
        x_size = inp.size()

        # res 1
        m1 = self.mod1(inp)

        # res 2
        m2 = self.mod2(self.pool2(m1))

        # res 3
        m3 = self.mod3(self.pool3(m2))

        # res 4-7
        m4 = self.mod4(m3)
        m5 = self.mod5(m4)
        m6 = self.mod6(m5)
        m7 = self.mod7(m6)

        # Single-channel side outputs, resized to the input resolution.
        s3 = F.interpolate(self.dsn3(m3), x_size[2:],
                           mode='bilinear', align_corners=True)
        s4 = F.interpolate(self.dsn4(m4), x_size[2:],
                           mode='bilinear', align_corners=True)
        s7 = F.interpolate(self.dsn7(m7), x_size[2:],
                           mode='bilinear', align_corners=True)

        m1f = F.interpolate(m1, x_size[2:], mode='bilinear', align_corners=True)

        canny = self._canny_edges(inp)

        # Edge branch: each stage is residual block -> resize -> 1x1 reduce -> gate.
        cs = self.res1(m1f)
        cs = F.interpolate(cs, x_size[2:],
                           mode='bilinear', align_corners=True)
        cs = self.d1(cs)
        cs = self.gate1(cs, s3)
        cs = self.res2(cs)
        cs = F.interpolate(cs, x_size[2:],
                           mode='bilinear', align_corners=True)
        cs = self.d2(cs)
        cs = self.gate2(cs, s4)
        cs = self.res3(cs)
        cs = F.interpolate(cs, x_size[2:],
                           mode='bilinear', align_corners=True)
        cs = self.d3(cs)
        cs = self.gate3(cs, s7)
        cs = self.fuse(cs)
        cs = F.interpolate(cs, x_size[2:],
                           mode='bilinear', align_corners=True)
        edge_out = self.sigmoid(cs)
        # Combine the learned edge map with the Canny map into one attention map.
        cat = torch.cat((edge_out, canny), dim=1)
        acts = self.cw(cat)
        acts = self.sigmoid(acts)

        # aspp
        x = self.aspp(m7, acts)
        dec0_up = self.bot_aspp(x)

        dec0_fine = self.bot_fine(m2)
        dec0_up = self.interpolate(dec0_up, m2.size()[2:], mode='bilinear', align_corners=True)
        dec0 = [dec0_fine, dec0_up]
        dec0 = torch.cat(dec0, 1)

        dec1 = self.final_seg(dec0)
        # align_corners=False is the value this call previously used implicitly;
        # stating it keeps the output unchanged and silences the warning.
        # It intentionally differs from the other interpolate calls above.
        seg_out = self.interpolate(dec1, x_size[2:], mode='bilinear', align_corners=False)

        if self.training:
            return self.criterion((seg_out, edge_out), gts)
        else:
            return seg_out, edge_out
``````

## 6.Intel架构使用

``````import cv2
from openvino.inference_engine import IECore

# Load the model and select the target device
model_xml = 'path/to/model.xml'
model_bin = 'path/to/model.bin'
device = 'CPU'  # may also be GPU, MYRIAD, HETERO:FPGA,CPU, etc.

ie = IECore()
net = ie.read_network(model=model_xml, weights=model_bin)
exec_net = ie.load_network(network=net, device_name=device)

# Read the input image
image_path = 'path/to/input_image.jpg'
image = cv2.imread(image_path)
if image is None:
    # cv2.imread reports a missing or unreadable file by returning None
    # rather than raising, which would otherwise surface as an obscure
    # error inside cv2.resize below.
    raise FileNotFoundError('could not read image: {}'.format(image_path))

# Preprocess the input image to the network's input shape
input_blob = next(iter(net.input_info))
n, c, h, w = net.input_info[input_blob].input_data.shape
processed_image = cv2.resize(image, (w, h))
processed_image = processed_image.transpose((2, 0, 1))  # HWC -> CHW
# The reshape only succeeds when n * c equals the image's channel count,
# i.e. a single image per batch.
processed_image = processed_image.reshape((n, c, h, w))

# Run inference
output_blob = next(iter(net.outputs))
result = exec_net.infer(inputs={input_blob: processed_image})

# Post-process the output
output = result[output_blob]

# Display or save the result
# ...
``````