Preface
The competition was not particularly large, a little over two hundred teams, but I racked my brains over it. Giving it everything I had, I ended up with a 95.9% success rate; the first-place team reached 99.7%.
The Competition Task
Captcha recognition looks simple at first glance, but its difficulty varies greatly with the captchas you are given, and this time it was well beyond expectations. Before receiving the dataset I assumed a plain convolutional network would be enough; once I had the data, it was clearly not that simple.
A few details deserve attention. First, the background: on top of the added noise it is actually a gradient, barely perceptible to the human eye, which makes removing it harder. Second, some images are extremely dark, like 99.jpg in the figure above, where even a person can hardly make out the characters, so background removal can easily erase the captcha itself. Third, case matters: with only 5000 training images provided, distinctions such as 0 versus O cannot be learned very reliably. As a beginner my toolbox was limited, so a convolutional neural network was the only approach I could think of; in hindsight there were surely other options.
Approach
Denoise → binarize → a first convolutional network learns character positions and the image is split accordingly (trained on self-generated captchas) → a second convolutional network is trained on the split images.
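To make the flow concrete, here is a high-level sketch of the pipeline; every function name is a placeholder standing in for a piece developed in the sections below:

# Hypothetical end-to-end sketch; each placeholder corresponds to a later section.
def recognize(filename):
    img = denoise(filename)              # section 1: two-stage denoising
    binary = binarize(img)               # threshold into a 0/1 array
    centers = locate_characters(binary)  # section 2: first CNN regresses four (x, y) centers
    crops = split_at(binary, centers)    # one 40x25 patch per character
    return ''.join(classify(c) for c in crops)  # section 4: second CNN, 62 classes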
1. Denoising
For captcha problems, denoising is the first thing to consider: only when the image is reasonably clean can the result be accurate. The input to the network must be binarized, and if the background color and noise are not removed first, recognition precision suffers badly. Another reason is that the dataset is simply too small, only five thousand images, with no large corpus to fall back on. After some searching and experimenting we arrived at a workable denoising method, and I believe much of the accuracy gain is tied to this step; I just could not come up with anything better in the time available. Perhaps a learned denoiser would help?
Method
Denoising runs in two stages. The difficulty is that the background is a gradient, so the first stage samples the surrounding colors to strip away a rough approximation of the background, and the second stage removes suspected noise pixels. The remaining problem is that in some images the characters are too close to the background, so the result there is poor.
First-stage output:
Second-stage output:
A side-by-side comparison shows the effect of this method, although some images do remain dirty. The code follows.
Code
from PIL import Image, ImageFont, ImageDraw
import random
import os
import sys
import numpy as np
import cv2
import pandas as pd

# Stage 1: estimate the background color from the left edge and blank out
# every pixel that is close to it.
def quzao1(filename):
    img = Image.open(filename)
    img = img.convert('RGB')
    pixdata = img.load()
    R = 0
    G = 0
    B = 0
    # Average the 3x40 strip of pixels on the left edge; with a gradient
    # background this is only an approximation, but it is close enough.
    for y in range(0, 40):
        for x in range(0, 3):
            R = R + pixdata[x, y][0]
            G = G + pixdata[x, y][1]
            B = B + pixdata[x, y][2]
    R = R / 120
    G = G / 120
    B = B / 120
    # Any pixel within 15 of the estimated background color is set to white.
    for y in range(1, img.size[1]):
        for x in range(1, img.size[0]):
            if abs(pixdata[x, y][0] - R) < 15 and abs(pixdata[x, y][1] - G) < 15 and abs(pixdata[x, y][2] - B) < 15:
                pixdata[x, y] = (255, 255, 255)
    return img

# Stage 2: binarize with a channel-weighted threshold, then remove isolated
# noise pixels by checking their eight neighbors.
def quzao2(filename):
    im = Image.open(filename)
    (w, h) = im.size
    R = 0
    G = 0
    B = 0
    for i in range(w):
        for j in range(h):
            point = (i, j)
            rgb = im.getpixel(point)
            # Force the one-pixel border to white.
            if i == 0 or i == w - 1:
                im.putpixel(point, (255, 255, 255))
            if j == 0 or j == h - 1:
                im.putpixel(point, (255, 255, 255))
            (r, g, b) = rgb
            R = R + r
            G = G + g
            B = B + b
    # Per-channel weights proportional to each channel's share of the total.
    rate1 = R * 1000 / (R + G + B)
    rate2 = G * 1000 / (R + G + B)
    rate3 = B * 1000 / (R + G + B)
    for x in range(0, w):
        for y in range(0, h):
            point = (x, y)
            L = im.getpixel(point)
            (r, g, b) = L
            n = r * rate1 / 1000 + g * rate2 / 1000 + b * rate3 / 1000
            if n >= 110:
                im.putpixel(point, (255, 255, 255))
            else:
                im.putpixel(point, (0, 0, 0))
    # Despeckle: a pixel with more than five white neighbors is treated as
    # noise and turned white.
    for x in range(1, w - 1):
        for y in range(1, h - 1):
            point = (x, y)
            nearDots = 0
            if im.getpixel((x - 1, y - 1)) == (255, 255, 255):
                nearDots += 1
            if im.getpixel((x - 1, y)) == (255, 255, 255):
                nearDots += 1
            if im.getpixel((x - 1, y + 1)) == (255, 255, 255):
                nearDots += 1
            if im.getpixel((x, y - 1)) == (255, 255, 255):
                nearDots += 1
            if im.getpixel((x, y + 1)) == (255, 255, 255):
                nearDots += 1
            if im.getpixel((x + 1, y - 1)) == (255, 255, 255):
                nearDots += 1
            if im.getpixel((x + 1, y)) == (255, 255, 255):
                nearDots += 1
            if im.getpixel((x + 1, y + 1)) == (255, 255, 255):
                nearDots += 1
            if nearDots > 5:
                im.putpixel(point, (255, 255, 255))
    return im

# Run both stages over the 5000 training images.
for i in range(1, 5001):
    img1 = quzao1('./train/' + str(i) + '.jpg')
    img1.save('./train3/' + str(i) + '.bmp')
    img2 = quzao2('./train3/' + str(i) + '.bmp')
    img2.save('./train2/' + str(i) + '.jpg')
That covers the denoising step.
2. Segmentation
Use deep learning to locate each character of the captcha, then split the image.
Steps
Find matching fonts → generate simulated captchas → train on the coordinates of the self-generated characters → split the image using the returned coordinates.
Code Example
from PIL import Image, ImageFont, ImageDraw
import random
import os
import numpy as np
import cv2

# Captcha generator class
class ImageChar():
    # Initialize default color, image size, font paths, font size, and the
    # number of characters per image.
    def __init__(self, color=(0, 0, 0), size=(120, 40),
                 fontlist=['./fonts/' + i for i in os.listdir('./fonts/') if not i == '.DS_Store'],
                 fontsize=28,
                 num_word=4):
        self.num_word = num_word
        self.color = color
        self.fontlist = fontlist
        # Set the font size before the size computation below uses it.
        self.fontsize = fontsize
        # For exactly four characters use the fixed competition size,
        # otherwise scale the width with the character count.
        if self.num_word == 4:
            self.size = size
        else:
            self.size = ((self.fontsize + 5) * self.num_word, 40)
        # Pick a random font from the font directory.
        self.fontpath = self.fontlist[random.randint(0, len(self.fontlist) - 1)]
        # 62.txt holds the alphabet: digits plus upper- and lowercase letters.
        self.chinese = open('62.txt', 'r').read()
        # Load the chosen font.
        self.font = ImageFont.truetype(self.fontpath, self.fontsize)

    # Salt-and-pepper noise, variant 1.
    def PepperandSalt(self, src, percetage):
        NoiseImg = np.array(src)
        srp = np.asarray(src)
        NoiseNum = int(percetage * srp.shape[0] * srp.shape[1])
        for i in range(NoiseNum):
            randX = random.randint(0, srp.shape[0] - 1)
            randY = random.randint(0, srp.shape[1] - 1)
            if random.randint(0, 1) == 0:
                NoiseImg[randX, randY] = 0
            else:
                NoiseImg[randX, randY] = 255
        Noise = Image.fromarray(NoiseImg)
        return Noise

    # Salt-and-pepper noise, variant 2 (colors swapped).
    def PepperandSalt1(self, src, percetage):
        NoiseImg = np.array(src)
        srp = np.asarray(src)
        NoiseNum = int(percetage * srp.shape[0] * srp.shape[1])
        for i in range(NoiseNum):
            randX = random.randint(0, srp.shape[0] - 1)
            randY = random.randint(0, srp.shape[1] - 1)
            if random.randint(0, 1) == 0:
                NoiseImg[randX, randY] = 255
            else:
                NoiseImg[randX, randY] = 0
        Noise = Image.fromarray(NoiseImg)
        return Noise

    # Draw a random string of num_word characters from the 62-character
    # alphabet; return the string and the index of each character.
    def rand_chinese(self):
        chinese_str = ''
        chinese_num = []
        for i in range(self.num_word):
            temp = random.randint(0, 61)
            chinese_str = chinese_str + self.chinese[temp]
            chinese_num.append(temp)
        return chinese_str, chinese_num

    # Random endpoints for the distractor lines; three ad-hoc modes.
    def rand_line_points(self, mode=0):
        width, height = self.size
        if mode == 0:
            return (random.randint(0, width), random.randint(0, height))
        elif mode == 1:
            return (random.randint(0, 6), random.randint(0, height))
        elif mode == 2:
            return (random.randint(width - 6, width), random.randint(0, height))

    # Generate one random image; step selects whether the labels are for the
    # first (localization) or the second (classification) network.
    def rand_img_label(self, num_lines=3, step=1):
        width, height = self.size
        gap = 2
        start = 10
        self.img1 = Image.new('RGB', self.size, (255, 255, 255))
        self.draw1 = ImageDraw.Draw(self.img1)
        words, chinese_num = self.rand_chinese()
        label_list = []
        # Draw each character at a jittered position.
        for i in range(len(words)):
            x = start + (23 + gap) * i + random.randint(-3, 3)
            if x > 105:
                x = 105
            y = random.randint(5, height - self.fontsize - gap)
            if step == 1:  # labels for the first network: character centers
                temp_list = [0] * 2
                temp_list[0] = (x + 12) / 120  # center x, divided by the width (120) to normalize to 0..1 for training
                temp_list[1] = (y + 15) / 40   # center y, divided by the height (40) for the same reason
            else:  # labels for the second network: one-hot class vectors
                temp_list = [0] * 62
                temp_list[chinese_num[i]] = 1
            # Store this character's label and draw it.
            label_list.append(temp_list)
            self.draw1.text((x, y), words[i], fill=(0, 0, 0), font=self.font)
        # Draw the distractor lines (white, as in the original code).
        for i in range(num_lines // 2):
            self.draw1.line([self.rand_line_points(), self.rand_line_points()], (255, 255, 255), 2)
        for i in range(num_lines // 2):
            self.draw1.line([self.rand_line_points(1), self.rand_line_points(2)], (255, 255, 255), 3)
        # Apply the two noise passes with random strengths.
        o = random.randint(0, 100) / 100
        p = random.randint(0, 30) / 100
        self.img1 = self.PepperandSalt(self.img1, o)
        self.img1 = self.PepperandSalt1(self.img1, p)
        return self.img1, label_list
This is the captcha-generating class. The fonts are ones we hunted down to match the competition captchas; the original code came from another blog post and we modified it, mainly adjusting the noise and the spacing and placement of the characters.
The generated output looks like this:
The returned coordinates are very convenient to train on: denoise the image and feed it into the network, which is CNN-based. We did, however, fall into a classic beginner's trap: assuming that self-generated captchas could be fed directly into training the recognizer. Only later did we realize that the machine's notion of similarity is nothing like ours; differences invisible to the human eye can hurt recognition badly, so generated data alone cannot buy much accuracy.
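Before moving on to training, here is a minimal usage sketch of the generator; it assumes a ./fonts/ directory and the 62.txt alphabet file are in place, and the output file name is illustrative:

# Minimal usage sketch for ImageChar, assuming ./fonts/ and 62.txt exist.
gen = ImageChar()
img, centers = gen.rand_img_label(step=1)   # step=1: four normalized (x, y) centers
img.save('sample_step1.png')
img2, onehots = gen.rand_img_label(step=2)  # step=2: four 62-way one-hot labels
print(centers)                              # four [x, y] pairs in 0..1
print(len(onehots), len(onehots[0]))        # 4 62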
import tensorflow.contrib.slim as slim
import tensorflow as tf
import numpy as np
import random
import time

# Mean squared error on the predicted coordinates, scaled up by 100.
def cal_loss(pre_loca, lab_loca):
    loca_loss = tf.reduce_mean(tf.square(tf.subtract(pre_loca, lab_loca)))
    return loca_loss * 100

def xavier_init(fan_in, fan_out, constant=1):
    low = -constant * np.sqrt(6.0 / (fan_in + fan_out))
    high = constant * np.sqrt(6.0 / (fan_in + fan_out))
    return tf.random_uniform((fan_in, fan_out), minval=low, maxval=high, dtype=tf.float32)

# Localization network: three conv/pool stages followed by fully connected
# layers that regress the four character centers.
def network(in_image, if_is_training):
    batch_norm_params = {
        'is_training': if_is_training,
        'zero_debias_moving_mean': True,
        'decay': 0.99,
        'epsilon': 0.001,
        'scale': True,
        'updates_collections': None
    }
    with slim.arg_scope([slim.conv2d], activation_fn=tf.nn.relu,
                        padding='SAME',
                        weights_initializer=slim.xavier_initializer(),
                        biases_initializer=tf.zeros_initializer(),
                        normalizer_fn=slim.batch_norm,
                        normalizer_params=batch_norm_params,
                        weights_regularizer=slim.l2_regularizer(0.0005)):
        out_1 = 32
        out_2 = 64
        out_3 = 128
        net = slim.conv2d(in_image, num_outputs=out_2, kernel_size=[5, 5], stride=1, scope='conv1')
        print('1_con:\t', net.get_shape())
        net = slim.max_pool2d(net, kernel_size=[2, 2], stride=2, scope='pool1')
        print('1_pool:\t', net.get_shape())
        net = slim.conv2d(net, num_outputs=out_2, kernel_size=[5, 5], stride=1, scope='conv2')
        print('2_con:\t', net.get_shape())
        net = slim.max_pool2d(net, kernel_size=[2, 2], stride=2, scope='pool2')
        print('2_pool:\t', net.get_shape())
        net = slim.conv2d(net, num_outputs=out_3, kernel_size=[3, 3], stride=1, scope='conv3_1')
        net = slim.conv2d(net, num_outputs=out_3, kernel_size=[3, 3], stride=1, scope='conv3_2')
        print('3_con:\t', net.get_shape())
        net = slim.max_pool2d(net, kernel_size=[2, 2], stride=2, scope='pool3')
        print('3_pool:\t', net.get_shape())
        net = slim.flatten(net, scope='flatten')
    with slim.arg_scope([slim.fully_connected],
                        activation_fn=tf.nn.relu,
                        normalizer_fn=slim.batch_norm,
                        normalizer_params=batch_norm_params):
        net = slim.fully_connected(net, 1000,
                                   weights_initializer=slim.xavier_initializer(),
                                   biases_initializer=tf.zeros_initializer(),
                                   scope='fc_total')
        print('fc:\t', net.get_shape())
        pre_loca = slim.fully_connected(net, 2000,
                                        weights_initializer=slim.xavier_initializer(),
                                        biases_initializer=tf.zeros_initializer(),
                                        scope='fc2_1')
        pre_loca = slim.fully_connected(pre_loca, 8,
                                        activation_fn=tf.nn.sigmoid,
                                        weights_initializer=slim.xavier_initializer(),
                                        biases_initializer=tf.zeros_initializer(),
                                        scope='fc2_2')
        # Eight sigmoid outputs reshaped into four (x, y) pairs.
        pre_loca = tf.reshape(pre_loca, shape=[-1, 4, 2])
    return pre_loca

# Measure training accuracy: sample 100 images ten times (1000 in total) and
# count predictions more than 3 pixels away from the label.
def accuracy(sess, pre_loca, in_image, x_image, y_label, if_is_training):
    erro_count = 0
    for i in range(10):
        bt = random.randint(0, 9999 - 100)
        min_x_image = x_image[bt:(bt + 100), :, :]
        min_y_label = y_label[bt:(bt + 100), :, :]
        loca_np = sess.run(pre_loca, feed_dict={in_image: min_x_image, if_is_training: True})
        m, n, l = loca_np.shape
        for j in range(m):
            for k in range(n):
                x = round(loca_np[j, k, 0] * 120)
                y = round(loca_np[j, k, 1] * 40)
                x0 = round(min_y_label[j, k, 0] * 120)
                y0 = round(min_y_label[j, k, 1] * 40)
                # Euclidean distance between the predicted and labeled centers.
                lo = ((x - x0) ** 2 + (y - y0) ** 2) ** 0.5
                if lo > 3:
                    erro_count += 1
    if erro_count > 20:
        return False, erro_count
    else:
        return True, erro_count

# Same check on the validation set, with a stricter error budget.
def accuracy1(sess, pre_loca, in_image, x_image, y_label, if_is_training):
    erro_count = 0
    loca_np = sess.run(pre_loca, feed_dict={in_image: x_image, if_is_training: False})
    m, n, l = loca_np.shape
    for j in range(m):
        for k in range(n):
            x = round(loca_np[j, k, 0] * 120)
            y = round(loca_np[j, k, 1] * 40)
            x0 = round(y_label[j, k, 0] * 120)
            y0 = round(y_label[j, k, 1] * 40)
            lo = ((x - x0) ** 2 + (y - y0) ** 2) ** 0.5
            if lo > 3:
                erro_count += 1
    if erro_count > 2:
        return False, erro_count
    else:
        return True, erro_count

def main():
    tf.reset_default_graph()
    in_image = tf.placeholder(dtype=tf.float32, shape=[None, 40, 120], name='in_image')
    lab_loca = tf.placeholder(dtype=tf.float32, shape=[None, 4, 2], name='lab_loca')
    # Used together with batch normalization: True while training, False when predicting.
    if_is_training = tf.placeholder(dtype=tf.bool, name='if_is_training')
    # Add a channel dimension; note height (40) comes before width (120).
    x_input = tf.reshape(in_image, shape=[-1, 40, 120, 1], name='x_input')
    pre_loca = network(x_input, if_is_training)
    loca_loss = cal_loss(pre_loca, lab_loca)
    # Required update ops for batch normalization.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_op = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9, use_nesterov=True).minimize(loca_loss)
    model_saver = tf.train.Saver()
    tf.add_to_collection('pre_loca', pre_loca)
    x_image = training_images
    y_label = training_label
    maxx = 100  # best (lowest) validation error rate seen so far
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        while True:
            # Ask for the iteration count so training can be controlled and resumed by hand.
            command = input('number of iterations:\t')
            command1 = input('batch size:\t')
            batchs = int(command1)
            if command == 'qq':
                break
            for i in range(int(command)):
                bt = random.randint(0, 9999 - batchs)
                min_x_image = x_image[bt:(bt + batchs), :, :]
                min_y_label = y_label[bt:(bt + batchs), :, :]
                sess.run(train_op, feed_dict={in_image: min_x_image, lab_loca: min_y_label, if_is_training: True})
                if i % 50 == 0:
                    ret, erro_count = accuracy(sess, pre_loca, in_image, x_image, y_label, if_is_training)
                    print('iteration: ', i, '\t\ttraining errors: ', erro_count, '\t\ttraining accuracy: ', 1 - erro_count / 4000)
                    mck, validation_erro_count = accuracy1(sess, pre_loca, in_image, validation_images, validation_label, if_is_training)
                    mck, validation_erro_count1 = accuracy1(sess, pre_loca, in_image, validation_images, validation_label, if_is_training)
                    mck, validation_erro_count2 = accuracy1(sess, pre_loca, in_image, validation_images, validation_label, if_is_training)
                    countn = (validation_erro_count + validation_erro_count1 + validation_erro_count2) / 3 / 200
                    if countn < maxx:
                        maxx = countn
                    print('\t\tvalidation errors: ', validation_erro_count, '\t\tvalidation error rate: ', maxx)
                    if ret:
                        break
        model_saver.save(sess, './modellll/mymodel.ckpt')

if __name__ == '__main__':
    main()
The training and validation code above can serve as a reference.
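One thing the listing leaves out is where training_images, training_label, validation_images, and validation_label come from. A plausible sketch, reusing the ImageChar generator from the previous section; the array shapes and set sizes (10000 training, 50 validation) are assumptions inferred from how main() and the accuracy helpers slice the arrays:

# Hypothetical sketch: build the numpy arrays that main() expects, using the
# ImageChar generator from the previous section. Shapes are inferred from the
# code above: images (N, 40, 120), labels (N, 4, 2).
import numpy as np
import cv2

def build_location_dataset(n_samples):
    images, labels = [], []
    for _ in range(n_samples):
        gen = ImageChar()                        # fresh instance -> random font
        img, label = gen.rand_img_label(step=1)  # step=1 -> center-coordinate labels
        gray = cv2.cvtColor(np.asarray(img), cv2.COLOR_RGB2GRAY)
        _, binary = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY_INV)
        images.append(binary / 255)
        labels.append(label)
    return np.array(images), np.array(labels)

training_images, training_label = build_location_dataset(10000)
validation_images, validation_label = build_location_dataset(50)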
3. Preprocessing Before Training
Now we reach the most crucial part, extracting the captcha characters themselves. The previous stage learned the center coordinates of the characters precisely so that the image can be split easily.
import tensorflow.contrib.slim as slim
import tensorflow as tf
from PIL import Image
import numpy as np
import pandas as pd  # needed for reading train_label.csv below
import random
import time
import cv2
These are the imports.
Naturally, the denoised and binarized images that will be fed into the network must first be saved as numpy tensors. Here is a short example.
images = []
for i in range(1, 5001):
    # Load the denoised image, convert to grayscale, binarize (inverted so
    # strokes become 1), and scale into 0..1.
    img = Image.open('./train2/' + str(i) + '.jpg')
    np_img = np.asarray(img)
    np_img = cv2.cvtColor(np_img, cv2.COLOR_BGR2GRAY)
    ttt, np_img = cv2.threshold(np_img, 127, 255, cv2.THRESH_BINARY_INV)
    np_img = np_img / 255
    images.append(np_img.tolist())
images = np.array(images)
np.save('./train1/RealTrainImg.npy', images)
Likewise, for the labels the important thing is to load them first; they are provided by the organizers. You also need to prepare a txt file containing all the uppercase letters, lowercase letters, and digits; it is used to build the one-hot matrices, which are then appended to form the training labels. Here is my example, followed by a sketch for generating that txt file.
labels = []
onep = []
data = pd.read_csv("train_label.csv")
p = data.values
chinese = open('62.txt', 'r').read()
for m in range(0, 5000):
    lab = p[m, 1]
    # Build a 4x62 one-hot block for the four characters of this image.
    for i in range(0, 4):
        temp_list = [0] * 62
        for j in range(0, 62):
            if lab[i] == chinese[j]:
                temp_list[j] = 1
        onep.append(temp_list)
    labels.append(onep)
    onep = []
labels = np.array(labels)
np.save('./train1/RealTrainLab.npy', labels)
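The 62.txt alphabet file itself is not included in the post. One possible way to create it is with Python's string module; the ordering below (digits, then lowercase, then uppercase) is only an assumption, and whatever ordering is chosen must stay consistent everywhere 62.txt is read:

# Hypothetical sketch: write a 62-character alphabet file. The ordering is an
# assumption; any order works as long as every script reads the same file.
import string

with open('62.txt', 'w') as f:
    f.write(string.digits + string.ascii_lowercase + string.ascii_uppercase)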
With the preparation done, we can get to work. To put the center coordinates from the previous stage to use, we need a routine that crops the captcha.
# Crop one character per predicted center.
def crop_image(data, loca_np, imgshow=False):
    croped_img_list = []
    loca_list = loca_np.tolist()
    if imgshow:
        img = data.copy()
    m, n = loca_np.shape
    for i in range(m):
        # Convert the center x coordinate into a top-left corner for cropping.
        x = round(loca_list[i][0] * 120 - 15)
        # Cropping from the raw coordinate could run past the image border.
        if x < 0:
            x = 0
        elif x > 95:
            x = 95
        temp = data[0:40, x:x + 25]  # crop the character as a 40x25 patch
        croped_img_list.append(temp.tolist())
        if imgshow:
            img = cv2.rectangle(img * 255, (x, 0), (x + 25, 40), (255, 0, 0), 1)  # draw a box around the character
    if imgshow:  # render a quick preview of the crops
        img = Image.fromarray(img)
        img.show()
    # Returns the crops as lists of 0..1 pixel values.
    return croped_img_list
Next, the first model is called to predict the center coordinates.
# Load the first (localization) model into its own graph and session.
def load_model_1():
    graph_1 = tf.Graph()
    sess_1 = tf.Session(graph=graph_1)
    with graph_1.as_default():
        model_saver_1 = tf.train.import_meta_graph("./modellll/mymodel.ckpt.meta")
        model_saver_1.restore(sess_1, './modellll/mymodel.ckpt')
        y_loca = tf.get_collection('pre_loca')[0]  # saved via tf.add_to_collection('pre_loca', pre_loca)
        x_1 = graph_1.get_operation_by_name('in_image').outputs[0]  # the [None, 40, 120] input placeholder
        if_is_training_1 = graph_1.get_operation_by_name('if_is_training').outputs[0]  # batch-norm mode flag
    return x_1, sess_1, if_is_training_1, y_loca  # handles into the restored graph

# Use the first model to predict the centers, then cut out the characters.
def pre_model_1(x_1, sess_1, if_is_training_1, y_loca, in_image_1, y_label):
    loca_np = sess_1.run(y_loca, feed_dict={x_1: in_image_1, if_is_training_1: False})  # feed in the batch in_image_1
    M, N, L = loca_np.shape  # output dimensions
    x_image_2 = []
    y_label_2 = []
    for m in range(M):
        imgCutList = crop_image(in_image_1[m, :, :], loca_np[m, :, :])  # cut the image into single characters
        for im in range(len(imgCutList)):
            try:
                data_2 = np.array(imgCutList[im]).reshape(1000,)  # flatten the 40x25 crop; adjust if the crop size changes
            except Exception as e:
                print('imgCutList reshape error')
                continue
            x_image_2.append(data_2.tolist())  # collect the crops
            y_label_2.append(y_label[m, im, :].tolist())  # and the matching labels
    return np.array(x_image_2), np.array(y_label_2)
With that in place, we can call the functions above to cut each image into four small crops and train on the crops individually. Here is the helper that prepares the crops, in a variant intended for testing.
# Test helper: identical to pre_model_1 except that it previews the crops.
def pre_model_2(x_1, sess_1, if_is_training_1, y_loca, in_image_1, y_label):
    loca_np = sess_1.run(y_loca, feed_dict={x_1: in_image_1, if_is_training_1: False})
    M, N, L = loca_np.shape
    x_image_2 = []
    y_label_2 = []
    for m in range(M):
        imgCutList = crop_image(in_image_1[m, :, :], loca_np[m, :, :], imgshow=True)  # imgshow=True draws the boxes
        for im in range(len(imgCutList)):
            try:
                data_2 = np.array(imgCutList[im]).reshape(1000,)
            except Exception as e:
                print('imgCutList reshape error')
                continue
            x_image_2.append(data_2.tolist())
            y_label_2.append(y_label[m, im, :].tolist())
    return np.array(x_image_2), np.array(y_label_2)
4. Training the Captcha Classifier
Although the final result was only about 95% accuracy, as a beginner I came away understanding much better how neural networks actually behave, which I count as a real gain. I debugged the code end to end myself, and that process was quite satisfying.
Code Example
# Cross-entropy loss for the classifier.
def cal_loss(y_pre, y_label):
    return tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_label, logits=y_pre))

def xavier_init(fan_in, fan_out, constant=1):
    low = -constant * np.sqrt(6.0 / (fan_in + fan_out))
    high = constant * np.sqrt(6.0 / (fan_in + fan_out))
    return tf.random_uniform((fan_in, fan_out), minval=low, maxval=high, dtype=tf.float32)
Defining the loss function is routine.
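One piece is missing from the listings: main() below calls an accuracy() helper for the classifier that never appears in the original post. A plausible sketch, assuming it mirrors the localization version by returning a pass flag plus an error count over 1000 sampled crops:

# Hypothetical sketch of the classifier accuracy helper used by main() below:
# sample 1000 crops, run the network in inference mode, and count argmax
# mismatches against the one-hot labels.
import random
import numpy as np

def accuracy(sess, pre_image, in_image, x_image, y_label, if_is_training):
    bt = random.randint(0, len(x_image) - 1000)
    batch_x = x_image[bt:bt + 1000]
    batch_y = y_label[bt:bt + 1000]
    logits = sess.run(pre_image, feed_dict={in_image: batch_x, if_is_training: False})
    erro_count = int(np.sum(np.argmax(logits, 1) != np.argmax(batch_y, 1)))
    return erro_count < 20, erro_count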
# Classifier network: the same convolutional trunk as before, ending in a
# 62-way fully connected layer.
def network(in_image, if_is_training):
    batch_norm_params = {
        'is_training': if_is_training,
        'zero_debias_moving_mean': True,
        'decay': 0.99,
        'epsilon': 0.001,
        'scale': True,
        'updates_collections': None
    }
    with slim.arg_scope([slim.conv2d], activation_fn=tf.nn.relu,
                        padding='SAME',
                        weights_initializer=slim.xavier_initializer(),
                        biases_initializer=tf.zeros_initializer(),
                        normalizer_fn=slim.batch_norm,
                        normalizer_params=batch_norm_params,
                        weights_regularizer=slim.l2_regularizer(0.0005)):
        out_1 = 32
        out_2 = 64
        out_3 = 128
        net = slim.conv2d(in_image, num_outputs=out_2, kernel_size=[5, 5], stride=1, scope='conv1')
        print('1_con:\t', net.get_shape())
        net = slim.max_pool2d(net, kernel_size=[2, 2], stride=2, scope='pool1')
        print('1_pool:\t', net.get_shape())
        net = slim.conv2d(net, num_outputs=out_2, kernel_size=[5, 5], stride=1, scope='conv2')
        print('2_con:\t', net.get_shape())
        net = slim.max_pool2d(net, kernel_size=[2, 2], stride=2, scope='pool2')
        print('2_pool:\t', net.get_shape())
        net = slim.conv2d(net, num_outputs=out_3, kernel_size=[3, 3], stride=1, scope='conv3_1')
        net = slim.conv2d(net, num_outputs=out_3, kernel_size=[3, 3], stride=1, scope='conv3_2')
        print('3_con:\t', net.get_shape())
        net = slim.max_pool2d(net, kernel_size=[2, 2], stride=2, scope='pool3')
        print('3_pool:\t', net.get_shape())
        net = slim.flatten(net, scope='flatten')
    with slim.arg_scope([slim.fully_connected],
                        activation_fn=tf.nn.relu,
                        normalizer_fn=slim.batch_norm,
                        normalizer_params=batch_norm_params):
        net = slim.fully_connected(net, 1500,
                                   weights_initializer=slim.xavier_initializer(),
                                   biases_initializer=tf.zeros_initializer(),
                                   scope='fc1')
        print('fc1:\t', net.get_shape())
        net = slim.fully_connected(net, 4500,
                                   weights_initializer=slim.xavier_initializer(),
                                   biases_initializer=tf.zeros_initializer(),
                                   scope='fc2')
        print('fc2:\t', net.get_shape())
        # Final 62-way logits; softmax is applied inside the loss.
        net = slim.fully_connected(net, 62,
                                   activation_fn=None,
                                   normalizer_fn=None,
                                   scope='fc3')
        print('soft:\t', net.get_shape())
    return net
This network structure is very much a beginner's design, and it is where I see the most room for improvement: I essentially dreamed it up. Only after reading some other papers later did I realize this; there are actually plenty of problems here. It is a very plain CNN.
def main():
    tf.reset_default_graph()
    # Input: a flattened 40x25 crop; output: a 62-way one-hot vector.
    in_image = tf.placeholder(dtype=tf.float32, shape=[None, 1000], name='in_image')
    out_image = tf.placeholder(dtype=tf.float32, shape=[None, 62], name='out_image')
    maxx = 100  # best error rate during pretraining
    minn = 100  # best error rate during fine-tuning
    # Used together with batch normalization: True while training, False when predicting.
    if_is_training = tf.placeholder(dtype=tf.bool, name='if_is_training')
    # Reshape the flat input back into a 40x25 single-channel image.
    x_input = tf.reshape(in_image, shape=[-1, 40, 25, 1], name='x_input')
    pre_image = network(x_input, if_is_training)
    # Loss and a simple correctness measure.
    cost = cal_loss(pre_image, out_image)
    corr = tf.equal(tf.argmax(pre_image, 1), tf.argmax(out_image, 1))
    loss = tf.reduce_mean(tf.cast(corr, "float"))  # despite the name, this is the accuracy
    # Required update ops for batch normalization.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_op = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9, use_nesterov=True).minimize(cost)
    # Saving
    model_saver = tf.train.Saver()
    tf.add_to_collection('pre_img', pre_image)
    # Load the first network; testImg0/testLab0 are assumed to hold the
    # held-out test captchas and their labels.
    x_1, sess_1, if_is_training_1, y_loca = load_model_1()
    testImg, testLab = pre_model_1(x_1, sess_1, if_is_training_1, y_loca, testImg0, testLab0)
    print('total test images:', testImg.shape)
    print('total test labels:', testLab.shape)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Pretraining loop on self-generated captchas.
        while True:
            # Ask for the iteration count so training can be controlled and resumed.
            command = input('input:\t')
            if command == 'qq':
                break
            for i in range(int(command)):
                x_image_1, y_label = generate_data()  # fetch 30 generated images with one-hot labels
                x_image_2, y_label_2 = pre_model_1(x_1, sess_1, if_is_training_1, y_loca, x_image_1, y_label)  # split into single characters with matching labels
                sess.run(train_op, feed_dict={in_image: x_image_2, out_image: y_label_2, if_is_training: True})
                if i % 50 == 0:
                    ret, erro_count = accuracy(sess, pre_image, in_image, testImg, testLab, if_is_training)
                    ret, erro_count1 = accuracy(sess, pre_image, in_image, testImg, testLab, if_is_training)
                    ret, erro_count2 = accuracy(sess, pre_image, in_image, testImg, testLab, if_is_training)
                    countn = (erro_count + erro_count1 + erro_count2) / 3 / 1000
                    if countn < maxx:
                        maxx = countn
                    print('pretraining count: ', i, '\terror rate 1: ', erro_count / 1000, '\terror rate 2: ', erro_count1 / 1000, '\tmin: ', maxx)
        # Fine-tuning loop on the official training data.
        while True:
            command = input('input:\t')
            if command == 'qq':
                break
            for i in range(int(command)):
                x_image_1, y_label = generate_data1()  # fetch 30 official images with one-hot labels
                x_image_2, y_label_2 = pre_model_1(x_1, sess_1, if_is_training_1, y_loca, x_image_1, y_label)
                sess.run(train_op, feed_dict={in_image: x_image_2, out_image: y_label_2, if_is_training: True})
                if i % 50 == 0:
                    ret, erro_count = accuracy(sess, pre_image, in_image, testImg, testLab, if_is_training)
                    ret, erro_count1 = accuracy(sess, pre_image, in_image, testImg, testLab, if_is_training)
                    ret, erro_count2 = accuracy(sess, pre_image, in_image, testImg, testLab, if_is_training)
                    countn = (erro_count + erro_count1 + erro_count2) / 3 / 1000
                    if countn < minn:
                        minn = countn
                    print('count: ', i, '\terror rate 1: ', erro_count / 1000, '\terror rate 2: ', erro_count1 / 1000, '\tmin: ', minn)
        model_saver.save(sess, "./modellll/model2/mymodel.ckpt")

if __name__ == '__main__':
    main()
Finally, the main function drives the whole training process, with the number of iterations controlled by hand. In practice, once the error rate drops to a certain level it falls only very slowly, so we added a pretraining step: train briefly on our own generated data first, then switch to the official data. In our tests, pretraining raised the model's success rate by about one percentage point.
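Note that generate_data and generate_data1 are never defined in the listings; judging from the comments, the first returns a batch of synthetic captchas and the second a batch of official ones. A hedged sketch of what they might look like, reusing ImageChar and the numpy files written in section 3:

# Hypothetical sketches of the two batch providers used in main(). The
# synthetic variant renders fresh captchas with ImageChar (step=2 gives
# one-hot labels); the official variant samples from the saved numpy arrays.
import random
import numpy as np
import cv2

def generate_data(batch=30):
    images, labels = [], []
    for _ in range(batch):
        img, label = ImageChar().rand_img_label(step=2)
        gray = cv2.cvtColor(np.asarray(img), cv2.COLOR_RGB2GRAY)
        _, binary = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY_INV)
        images.append(binary / 255)
        labels.append(label)
    return np.array(images), np.array(labels)

def generate_data1(batch=30):
    # RealTrainImg.npy / RealTrainLab.npy were written in section 3; loading
    # them once outside the function would be faster, but this keeps the
    # sketch self-contained.
    x = np.load('./train1/RealTrainImg.npy')
    y = np.load('./train1/RealTrainLab.npy')
    bt = random.randint(0, len(x) - batch)
    return x[bt:bt + batch], y[bt:bt + batch]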
class ChineseCodeRecognition():
    """Load both trained models and run end-to-end recognition."""
    def __init__(self):
        self.w3500 = open('62.txt', 'r').read()
        self.x_1, self.sess_1, self.if_is_training_1, self.y_loca = self.load_model_1()
        self.x_2, self.sess_2, self.if_is_training_2, self.y_class = self.load_model_2()

    def load_model_1(self):
        graph_1 = tf.Graph()
        sess_1 = tf.Session(graph=graph_1)
        with graph_1.as_default():
            model_saver_1 = tf.train.import_meta_graph("./modellll/mymodel.ckpt.meta")
            model_saver_1.restore(sess_1, './modellll/mymodel.ckpt')
            y_loca = tf.get_collection('pre_loca')[0]
            x_1 = graph_1.get_operation_by_name('in_image').outputs[0]
            if_is_training_1 = graph_1.get_operation_by_name('if_is_training').outputs[0]
        return x_1, sess_1, if_is_training_1, y_loca

    def load_model_2(self):
        graph_2 = tf.Graph()
        sess_2 = tf.Session(graph=graph_2)
        with graph_2.as_default():
            model_saver_2 = tf.train.import_meta_graph("./modellll/model2/mymodel.ckpt.meta")
            model_saver_2.restore(sess_2, './modellll/model2/mymodel.ckpt')
            y_class = tf.get_collection('pre_img')[0]
            x_2 = graph_2.get_operation_by_name('in_image').outputs[0]
            if_is_training_2 = graph_2.get_operation_by_name('if_is_training').outputs[0]
        return x_2, sess_2, if_is_training_2, y_class

    def readImage(self, filename):
        img = cv2.imread(filename)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        ret, data = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY_INV)
        data = data / 255
        return data

    def crop_image(self, data, loca_np, imgshow=False):
        croped_img_list = []
        loca_list = loca_np.tolist()
        if imgshow:
            img = data.copy()
        m, n = loca_np.shape
        for i in range(m):
            x = round(loca_list[i][0] * 120 - 15)
            if x < 0:
                x = 0
            elif x > 95:
                x = 95
            temp = data[0:40, x:x + 25]
            croped_img_list.append(temp.tolist())
            if imgshow:
                img = cv2.rectangle(img * 255, (x, 0), (x + 25, 40), (255, 0, 0), 1)
        if imgshow:
            img = Image.fromarray(img)
            img.show()
        # Returns the crops as lists of 0..1 pixel values.
        return croped_img_list

    # Predict all 5000 test captchas.
    def predict(self):
        labs = []
        for i in range(1, 5001):
            filename = './test2/' + str(i) + '.jpg'
            data = self.readImage(filename)
            in_image = data.reshape(1, 40, 120)
            loca_np = self.sess_1.run(self.y_loca, feed_dict={self.x_1: in_image, self.if_is_training_1: False})
            loca_np = loca_np.reshape(4, 2)
            imgCutList = self.crop_image(data, loca_np, False)
            chineseCode = ""
            for imList in imgCutList:
                data_2 = np.array(imList).reshape(1, 1000)
                rel = self.sess_2.run(self.y_class, feed_dict={self.x_2: data_2, self.if_is_training_2: False})
                num = np.argmax(rel)
                chineseCode += self.w3500[num]
            if len(chineseCode) == 4:
                labs.append(chineseCode)
        return labs

    # Estimate accuracy on `times` generated captchas. (rand_img_test is a
    # variant of rand_img_label, not shown above, that returns the rendered
    # image together with its ground-truth string.)
    def test(self, times):
        erro = 0
        loss = 0
        for i in range(times):
            i_chr = ImageChar()
            img_PIL, words = i_chr.rand_img_test()
            in_img = np.asarray(img_PIL)
            in_img = cv2.cvtColor(in_img, cv2.COLOR_BGR2GRAY)
            ret, in_img = cv2.threshold(in_img, 127, 255, cv2.THRESH_BINARY_INV)
            data = in_img / 255
            in_image = data.reshape(1, 40, 120)
            loca_np = self.sess_1.run(self.y_loca, feed_dict={self.x_1: in_image, self.if_is_training_1: False})
            loca_np = loca_np.reshape(4, 2)
            imgCutList = self.crop_image(data, loca_np)
            chineseCode = ""
            for imList in imgCutList:
                try:
                    data_2 = np.array(imList).reshape(1, 1000)
                except Exception as e:
                    loss += 1
                    continue
                rel = self.sess_2.run(self.y_class, feed_dict={self.x_2: data_2, self.if_is_training_2: False})
                num = np.argmax(rel)
                chineseCode += self.w3500[num]
            if len(chineseCode) == 4:
                if not chineseCode == words:
                    erro += 1
            print('\r', i, end='\r')
        print('error: ', erro / times * 100, '%', '\tloss: ', loss)

if __name__ == '__main__':
    ccr = ChineseCodeRecognition()
The last thing needed is a class that loads the models and predicts unseen images, plus the code that calls it. The results can then be dropped into Excel and inspected directly.
labels=ccr.predict()
ids = [str(x) + ".jpg" for x in range(1, 5001)]
df = pd.DataFrame([ids, labels]).T
df.columns = ['ID', 'label']
df.to_csv('./test/zuihou.csv', index=None)
And with that, the whole project is complete.