1、CNN+RNN+CTC(CRNN+CTC)
2、CNN+Seq2Seq+Attention
CRNN 介绍
CRNN 模型,即将 CNN 与 RNN 网络结合,共同训练。主要用于在一定程度上实现端到端(end-to-end)地对不定长的文本序列进行识别,不用先对单个文字进行切割,而是将文本识别转化为时序依赖的序列学习问题,就是基于图像的序列识别。(说一定程度是因为虽然输入图像不需要精确给出每个字符的位置信息,但实际上还是需要对原始的图像进行前期的裁切工作)
整个CRNN网络结构包含三部分,从下到上依次为:
1、CNN(卷积层):使用深度 CNN,对输入图像提取特征,得到特征图;
2、RNN(循环层):使用双向 RNN(BLSTM)对特征序列进行预测,对序列中的每个特征向量进行学习,并输出预测标签的概率分布;
3、CTC loss(转录层):使用 CTC 损失,把从循环层获取的一系列标签分布转换成最终的标签序列。
CRNN 网络结构
CRNN 还引入了 Batch Normalization 模块,加速模型收敛,缩短训练过程。
1、输入图像为灰度图像(单通道);
2、高度为32,这是固定的,图片通过 CNN 后,高度就变为 1,这点很重要;
3、宽度为160,宽度也可以为其他的值,但需要统一,所以输入 CNN 的数据尺寸为 (channel, height, width)=(1, 32, 160)。
4、CNN 的输出尺寸为 (512, 1, 40)。即 CNN 最后得到 512 个特征图,每个特征图的高度为 1,宽度为 40。
Map-to-Sequence
CNN 得到的特征图不能直接送入 RNN 进行训练,需要先做一些调整:根据特征图提取 RNN 所需的特征向量序列。
RNN
因为 RNN 有梯度消失的问题,不能获取更多上下文信息,所以 CRNN 中使用的是 LSTM,LSTM 的特殊设计允许它捕获长距离依赖。
单向 LSTM 只使用过去的信息。然而,在基于图像的序列中,两个方向的上下文是相互有用且互补的,因此将一个前向 LSTM 和一个后向 LSTM 组合成一个双向 LSTM。此外,可以堆叠多层双向 LSTM,深层结构能够比浅层结构学习到更高层次的抽象。
这里采用的是两层各 256 单元的双向 LSTM 网络:
通过上面一步,我们得到了 40 个特征向量,每个特征向量长度为 512,在 LSTM 中一个时间步就传入一个特征向量进行分类,这里一共有 40 个时间步。
我们知道一个特征向量就相当于原图中的一个小矩形区域,RNN 的目标就是预测这个矩形区域为哪个字符,即根据输入的特征向量,进行预测,得到所有字符的 softmax 概率分布,这是一个长度为字符类别数的向量,作为 CTC 层的输入。
因为每个时间步都会有一个输入特征向量 $x_t$,输出一个所有字符的概率分布 $y_t$,所以输出为 40 个长度为字符类别数的向量构成的后验概率矩阵。如下图所示:
然后将这个后验概率矩阵传入转录层。
CTC
对于 Recurrent Layers,如果使用常见的 Softmax cross-entropy loss,则每一列输出都需要对应一个字符元素。那么训练时每张样本图片都需要标记出每个字符在图片中的位置,再通过 CNN 感受野对齐到 Feature map 的每一列,获取该列输出对应的 Label 才能进行训练,如图9。这种逐字符的位置标注成本很高;而 CTC 损失不要求输出与标签逐列对齐,只需给出整张图片对应的文本序列即可训练。
CRNN 小结
预测过程中,先使用标准的 CNN 网络提取文本图像的特征,再利用 BLSTM 将特征向量进行融合以提取字符序列的上下文特征,然后得到每列特征的概率分布,最后通过 CTC 进行预测得到文本序列。
利用 BLSTM 和 CTC 学习到文本图像中的上下文关系,从而有效提升文本识别准确率,使得模型更加鲁棒。
在训练阶段,CRNN 将训练图像统一缩放为 $w \times 32$(即 $w \times h$);在测试阶段,针对字符拉伸会导致识别率降低的问题,CRNN 保持输入图像尺寸比例,但是图像高度还是必须统一为 32 个像素,卷积特征图的尺寸动态决定 LSTM 的时序长度(时间步长)。
CRNN 网络模型搭建
import torch.nn as nn
from collections import OrderedDict
class BidirectionalLSTM(nn.Module):
    """Bidirectional LSTM followed by a linear projection at every time step.

    Args:
        nIn: size of each input feature vector.
        nHidden: hidden size of each LSTM direction.
        nOut: size of each projected output vector.
    """

    def __init__(self, nIn, nHidden, nOut):
        super(BidirectionalLSTM, self).__init__()
        self.rnn = nn.LSTM(nIn, nHidden, bidirectional=True)
        # The forward and backward hidden states are concatenated,
        # so the projection reads 2 * nHidden features.
        self.embedding = nn.Linear(nHidden * 2, nOut)

    def forward(self, input):
        """Map a [T, b, nIn] sequence to a [T, b, nOut] sequence."""
        hidden_states, _ = self.rnn(input)
        seq_len, batch, width = hidden_states.size()
        # Fold time and batch together so the linear layer sees one row
        # per (time step, sample) pair, then restore the [T, b, nOut] layout.
        flat = hidden_states.view(seq_len * batch, width)
        projected = self.embedding(flat)  # [T * b, nOut]
        return projected.view(seq_len, batch, -1)
class CRNN(nn.Module):
    """CRNN recogniser: a VGG-style CNN followed by two stacked BiLSTMs.

    Args:
        imgH: input image height. Must be a multiple of 16; in practice the
            conv stack only reduces the height to 1 (as ``forward`` requires)
            when ``imgH`` is 32, because the height becomes ``imgH / 16 - 1``.
        nc: number of channels of the input image.
        nclass: number of output classes per time step.
        nh: hidden size of the LSTM layers.
        leakyRelu: if True, every activation is ``LeakyReLU(0.2)``; otherwise
            plain ``ReLU`` (the default). This flag used to be ignored.

    ``forward`` takes a ``[b, nc, imgH, W]`` batch and returns a
    ``[T, b, nclass]`` tensor, where ``T`` is the width of the final feature
    map (``W // 4 + 1``, because ``pool3`` and ``pool4`` pad the width).
    """

    def __init__(self, imgH, nc, nclass, nh, leakyRelu=False):
        super(CRNN, self).__init__()
        assert imgH % 16 == 0, 'imgH has to be a multiple of 16'

        def activation():
            # Honour the leakyRelu flag instead of silently ignoring it.
            # NOTE(review): slope 0.2 matches the usual CRNN reference
            # implementation - confirm it suits your checkpoints.
            if leakyRelu:
                return nn.LeakyReLU(0.2, inplace=True)
            return nn.ReLU(True)

        # Shape comments below assume a 1x32x128 (C x H x W) input.
        self.conv1 = nn.Conv2d(nc, 64, 3, 1, 1)
        self.relu1 = activation()
        self.pool1 = nn.MaxPool2d(2, 2)
        # 64x16x64
        self.conv2 = nn.Conv2d(64, 128, 3, 1, 1)
        self.relu2 = activation()
        self.pool2 = nn.MaxPool2d(2, 2)
        # 128x8x32
        self.conv3_1 = nn.Conv2d(128, 256, 3, 1, 1)
        self.bn3 = nn.BatchNorm2d(256)
        self.relu3_1 = activation()
        self.conv3_2 = nn.Conv2d(256, 256, 3, 1, 1)
        self.relu3_2 = activation()
        # Halves the height only; stride 1 with padding 1 grows the width by 1.
        self.pool3 = nn.MaxPool2d((2, 2), (2, 1), (0, 1))
        # 256x4x33
        self.conv4_1 = nn.Conv2d(256, 512, 3, 1, 1)
        self.bn4 = nn.BatchNorm2d(512)
        self.relu4_1 = activation()
        self.conv4_2 = nn.Conv2d(512, 512, 3, 1, 1)
        self.relu4_2 = activation()
        self.pool4 = nn.MaxPool2d((2, 2), (2, 1), (0, 1))
        # 512x2x34
        # 2x2 convolution without padding collapses the height from 2 to 1.
        self.conv5 = nn.Conv2d(512, 512, 2, 1, 0)
        self.bn5 = nn.BatchNorm2d(512)
        self.relu5 = activation()
        # 512x1x33
        self.rnn = nn.Sequential(
            BidirectionalLSTM(512, nh, nh),
            BidirectionalLSTM(nh, nh, nclass))

    def forward(self, input):
        """Run the CNN, turn feature-map columns into a sequence, run the RNN."""
        # conv features
        x = self.pool1(self.relu1(self.conv1(input)))
        x = self.pool2(self.relu2(self.conv2(x)))

        x = self.relu3_1(self.bn3(self.conv3_1(x)))
        x = self.pool3(self.relu3_2(self.conv3_2(x)))

        x = self.relu4_1(self.bn4(self.conv4_1(x)))
        x = self.pool4(self.relu4_2(self.conv4_2(x)))

        conv = self.relu5(self.bn5(self.conv5(x)))

        b, c, h, w = conv.size()
        assert h == 1, "the height of conv must be 1"
        # Drop the singleton height and make width the time axis: [w, b, c].
        conv = conv.squeeze(2)
        conv = conv.permute(2, 0, 1)

        # rnn features
        output = self.rnn(conv)
        return output
class CRNN_v2(nn.Module):
    """CRNN variant with BatchNorm after every convolution and no final conv.

    The CNN stops at a feature map of height 2; the two rows are stacked into
    the channel axis (256 * 2 = 512 features) before entering the BiLSTMs.

    Args:
        imgH: input image height. Must be a multiple of 16; ``forward``
            requires the final feature-map height ``imgH / 16`` to be 2,
            i.e. ``imgH`` must be 32.
        nc: number of channels of the input image.
        nclass: number of output classes per time step.
        nh: hidden size of the LSTM layers.
        leakyRelu: if True, every activation is ``LeakyReLU(0.2)``; otherwise
            plain ``ReLU`` (the default). This flag used to be ignored.

    ``forward`` takes a ``[b, nc, imgH, W]`` batch and returns a
    ``[T, b, nclass]`` tensor with ``T = W // 4 + 2`` (``pool3`` and ``pool4``
    each add one column through their width padding).
    """

    def __init__(self, imgH, nc, nclass, nh, leakyRelu=False):
        super(CRNN_v2, self).__init__()
        assert imgH % 16 == 0, 'imgH has to be a multiple of 16'

        def activation():
            # Honour the leakyRelu flag instead of silently ignoring it.
            # NOTE(review): slope 0.2 matches the usual CRNN reference
            # implementation - confirm it suits your checkpoints.
            if leakyRelu:
                return nn.LeakyReLU(0.2, inplace=True)
            return nn.ReLU(True)

        # Shape comments below assume a 1x32x128 (C x H x W) input.
        self.conv1_1 = nn.Conv2d(nc, 32, 3, 1, 1)
        self.bn1_1 = nn.BatchNorm2d(32)
        self.relu1_1 = activation()
        self.conv1_2 = nn.Conv2d(32, 64, 3, 1, 1)
        self.bn1_2 = nn.BatchNorm2d(64)
        self.relu1_2 = activation()
        self.pool1 = nn.MaxPool2d(2, 2)
        # 64x16x64
        self.conv2_1 = nn.Conv2d(64, 64, 3, 1, 1)
        self.bn2_1 = nn.BatchNorm2d(64)
        self.relu2_1 = activation()
        self.conv2_2 = nn.Conv2d(64, 128, 3, 1, 1)
        self.bn2_2 = nn.BatchNorm2d(128)
        self.relu2_2 = activation()
        self.pool2 = nn.MaxPool2d(2, 2)
        # 128x8x32
        self.conv3_1 = nn.Conv2d(128, 96, 3, 1, 1)
        self.bn3_1 = nn.BatchNorm2d(96)
        self.relu3_1 = activation()
        self.conv3_2 = nn.Conv2d(96, 192, 3, 1, 1)
        self.bn3_2 = nn.BatchNorm2d(192)
        self.relu3_2 = activation()
        # Halves the height only; stride 1 with padding 1 grows the width by 1.
        self.pool3 = nn.MaxPool2d((2, 2), (2, 1), (0, 1))
        # 192x4x33
        self.conv4_1 = nn.Conv2d(192, 128, 3, 1, 1)
        self.bn4_1 = nn.BatchNorm2d(128)
        self.relu4_1 = activation()
        self.conv4_2 = nn.Conv2d(128, 256, 3, 1, 1)
        self.bn4_2 = nn.BatchNorm2d(256)
        self.relu4_2 = activation()
        self.pool4 = nn.MaxPool2d((2, 2), (2, 1), (0, 1))
        # 256x2x34
        self.bn5 = nn.BatchNorm2d(256)
        # 256x2x34
        # The RNN input size 512 is the 256 channels times the height of 2.
        self.rnn = nn.Sequential(
            BidirectionalLSTM(512, nh, nh),
            BidirectionalLSTM(nh, nh, nclass))

    def forward(self, input):
        """Run the CNN, fold height into channels, then run the RNN."""
        # conv features
        x = self.relu1_1(self.bn1_1(self.conv1_1(input)))
        x = self.pool1(self.relu1_2(self.bn1_2(self.conv1_2(x))))

        x = self.relu2_1(self.bn2_1(self.conv2_1(x)))
        x = self.pool2(self.relu2_2(self.bn2_2(self.conv2_2(x))))

        x = self.relu3_1(self.bn3_1(self.conv3_1(x)))
        x = self.pool3(self.relu3_2(self.bn3_2(self.conv3_2(x))))

        x = self.relu4_1(self.bn4_1(self.conv4_1(x)))
        x = self.pool4(self.relu4_2(self.bn4_2(self.conv4_2(x))))

        conv = self.bn5(x)

        b, c, h, w = conv.size()
        assert h == 2, "the height of conv must be 2"
        # Stack the two rows into the channel axis, then make width the
        # time axis: [w, b, c * h].
        conv = conv.reshape([b, c * h, w])
        conv = conv.permute(2, 0, 1)

        # rnn features
        output = self.rnn(conv)
        return output
def conv3x3(nIn, nOut, stride=1):
    """Return a bias-free 3x3 convolution with padding 1.

    Args:
        nIn: number of input channels.
        nOut: number of output channels.
        stride: convolution stride (an int or a (height, width) pair).
    """
    return nn.Conv2d(
        in_channels=nIn,
        out_channels=nOut,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias=False,
    )
class basic_res_block(nn.Module):
    """Two-convolution residual block: conv-bn-relu-conv-bn plus a shortcut.

    Args:
        nIn: number of input channels.
        nOut: number of output channels.
        stride: stride of the first 3x3 convolution.
        downsample: optional module applied to the input to build the
            shortcut branch; when None the input itself is the shortcut,
            so it must already have the main branch's output shape.
    """

    def __init__(self, nIn, nOut, stride=1, downsample=None):
        super(basic_res_block, self).__init__()
        # Layer names are kept stable because they become state_dict keys.
        layers = OrderedDict([
            ('conv1', conv3x3(nIn, nOut, stride)),
            ('bn1', nn.BatchNorm2d(nOut)),
            ('relu1', nn.ReLU(inplace=True)),
            ('conv2', conv3x3(nOut, nOut)),
            ('bn2', nn.BatchNorm2d(nOut)),
        ])
        self.group1 = nn.Sequential(layers)
        self.relu = nn.Sequential(nn.ReLU(inplace=True))
        self.downsample = downsample

    def forward(self, x):
        """Return relu(main_branch(x) + shortcut(x))."""
        shortcut = x if self.downsample is None else self.downsample(x)
        summed = self.group1(x) + shortcut
        return self.relu(summed)
class CRNN_res(nn.Module):
    """CRNN recogniser with a residual CNN backbone and two stacked BiLSTMs.

    Args:
        imgH: input image height. Must be a multiple of 16; ``forward``
            additionally requires the final feature-map height
            (``imgH / 16 - 1``) to be 1, which holds for ``imgH`` = 32.
        nc: number of channels of the input image.
        nclass: number of output classes per time step.
        nh: hidden size of the LSTM layers.

    ``forward`` takes a ``[b, nc, imgH, W]`` batch and returns a
    ``[T, b, nclass]`` tensor.
    """

    def __init__(self, imgH, nc, nclass, nh):
        super(CRNN_res, self).__init__()
        assert imgH % 16 == 0, 'imgH has to be a multiple of 16'

        def projection(n_in, n_out, stride):
            # 1x1 strided convolution + BN so the shortcut matches the
            # shape of a block that changes channels and resolution.
            return nn.Sequential(
                nn.Conv2d(n_in, n_out, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(n_out))

        # Shape comments below assume a 1x32x128 (C x H x W) input.
        self.conv1 = nn.Conv2d(nc, 64, 3, 1, 1)
        self.relu1 = nn.ReLU(True)
        self.res1 = basic_res_block(64, 64)
        # 64x32x128
        self.res2_1 = basic_res_block(64, 128, 2, projection(64, 128, 2))
        self.res2_2 = basic_res_block(128, 128)
        # 128x16x64
        self.res3_1 = basic_res_block(128, 256, 2, projection(128, 256, 2))
        self.res3_2 = basic_res_block(256, 256)
        self.res3_3 = basic_res_block(256, 256)
        # 256x8x32
        # Stride (2, 1): halve the height but keep the width.
        self.res4_1 = basic_res_block(256, 512, (2, 1), projection(256, 512, (2, 1)))
        self.res4_2 = basic_res_block(512, 512)
        self.res4_3 = basic_res_block(512, 512)
        # 512x4x32
        self.pool = nn.AvgPool2d((2, 2), (2, 1), (0, 1))
        # 512x2x33
        self.conv5 = nn.Conv2d(512, 512, 2, 1, 0)
        self.bn5 = nn.BatchNorm2d(512)
        self.relu5 = nn.ReLU(True)
        # 512x1x32
        self.rnn = nn.Sequential(
            BidirectionalLSTM(512, nh, nh),
            BidirectionalLSTM(nh, nh, nclass))

    def forward(self, input):
        """Run the residual CNN, turn columns into a sequence, run the RNN."""
        # conv features
        feat = self.relu1(self.conv1(input))
        residual_stages = (
            self.res1,
            self.res2_1, self.res2_2,
            self.res3_1, self.res3_2, self.res3_3,
            self.res4_1, self.res4_2, self.res4_3,
        )
        for stage in residual_stages:
            feat = stage(feat)
        feat = self.pool(feat)
        feat = self.relu5(self.bn5(self.conv5(feat)))

        b, c, h, w = feat.size()
        assert h == 1, "the height of conv must be 1"
        # Drop the singleton height and make width the time axis: [w, b, c].
        sequence = feat.squeeze(2).permute(2, 0, 1)

        # rnn features
        return self.rnn(sequence)
# Script entry point placeholder: running this module directly does nothing.
if __name__ == '__main__':
    pass
- 关于识别数据集的生成,可参看这个项目:https://github.com/Belval/TextRecognitionDataGenerator
随机生成不定长图片数据
# 生成椒盐噪声
def img_salt_pepper_noise(src, percetage):
    """Add salt-and-pepper noise to an image array.

    ``int(percetage * height * width)`` random positions are drawn (with
    replacement, so fewer distinct pixels may change) and each is set to
    0 or 255 with equal probability.

    Note: the noise is written into ``src`` itself; the returned array is
    the same object as the input, not a copy.

    Args:
        src: array indexable as ``src[row, col]`` with a ``shape`` attribute.
        percetage: fraction of the pixel count to use as the number of draws.

    Returns:
        ``src``, modified in place.
    """
    noisy = src
    height, width = src.shape[0], src.shape[1]
    draws = int(percetage * height * width)
    for _ in range(draws):
        row = random.randint(0, height - 1)
        col = random.randint(0, width - 1)
        # Coin flip: 0 -> pepper (black), 1 -> salt (white).
        noisy[row, col] = 0 if random.randint(0, 1) == 0 else 255
    return noisy
# 随机生成不定长图片集
def gen_text(cnt):
    """Generate ``cnt`` noisy images of random variable-length text.

    Each image is a 256x32 RGB canvas with a random string of 1 to 10
    characters drawn from the module-level ``DIGITS``, overlaid with
    salt-and-pepper noise, and written to
    ``data_dir + <text> + '_' + <index> + '.jpg'`` (index starts at 1).

    Args:
        cnt: number of images to generate.
    """
    # Font face and size used to render the text.
    font_path = '/data/work/tensorflow/fonts/arial.ttf'
    font_size = 30
    font = ImageFont.truetype(font_path, font_size)

    for index in range(cnt):
        # Random text length between 1 and 10 characters.
        length = random.randint(1, 10)
        text = ''.join(
            DIGITS[random.randint(0, len(DIGITS) - 1)] for _ in range(length)
        )

        # Create the canvas and draw the text on it in white.
        canvas = Image.new("RGB", (256, 32))
        ImageDraw.Draw(canvas).text((1, 1), text, font=font, fill='white')

        # Overlay salt-and-pepper noise at a random 1%-10% rate, then save.
        pixels = np.array(canvas)
        noise_ratio = float(random.randint(1, 10) / 100.0)
        pixels = img_salt_pepper_noise(pixels, noise_ratio)
        cv2.imwrite(data_dir + text + '_' + str(index + 1) + '.jpg', pixels)