前言:最近想给自己维护的爬虫项目加点功能,摆脱人工操作,实现自动化定时爬取。遇到的第一个难题是:登录获取 cookie 时需要识别点触验证码,于是陷入了无休止的图片相似度判断。经过一周左右的尝试和资料查找,最终解决了这个问题,在此记录一下。
pillow基础:Python图像处理PIL各模块详细介绍 样例图片:
第一步:将图片处理成黑白图片
import numpy
from PIL import Image
def get_bin_table(threshold=140):
    """Build the 256-entry lookup table that maps gray levels to 0 or 1.

    :param threshold: gray levels strictly below this value map to 0,
        every other level maps to 1.
    :return: list of 256 ints, indexed by gray level.
    """
    return [0 if level < threshold else 1 for level in range(256)]
# Load the sample captcha image and binarize it.
image = Image.open('1.jpg')
imgry = image.convert('L') # convert to a grayscale image
table = get_bin_table()
# Map every gray level through the 0/1 table; the result is a mode '1' image.
out = imgry.point(table, '1')
第二步:分割图片上下部分
# Upper part: full width, everything except the bottom 100 rows.
out_up = out.crop((0, 0, out.width, out.height-100))
# Lower part: the box from (1, height-39) to (width-200, height-1).
out_down = out.crop((1, out.height-39, out.width-200, out.height-1))
第三步:图片去噪
def sum_9_region(img, x, y, color):
    """Count the pixels of value ``color`` in the 3x3 window centred on (x, y).

    The window is clipped to the image, so it holds 4 pixels at a corner,
    6 along an edge and 9 in the interior; the centre pixel is included.
    Images that are only one pixel wide or high are handled as well.

    :param img: binary image exposing ``width``, ``height`` and
        ``getpixel((x, y))``; pixel values are expected to be 0 or 1.
    :param x: column of the centre pixel.
    :param y: row of the centre pixel.
    :param color: 1 to count white pixels, 0 to count black pixels.
    :return: 0 when the centre pixel is not ``color``; otherwise the number
        of ``color`` pixels inside the clipped window.
    """
    # Pixels of the other colour are never counted as window centres.
    if img.getpixel((x, y)) != color:
        return 0

    # Clip the 3x3 window to the image bounds.
    x_first, x_last = max(x - 1, 0), min(x + 1, img.width - 1)
    y_first, y_last = max(y - 1, 0), min(y + 1, img.height - 1)

    total = 0  # sum of pixel values, i.e. number of white pixels
    window_size = 0
    for n_x in range(x_first, x_last + 1):
        for n_y in range(y_first, y_last + 1):
            total += img.getpixel((n_x, n_y))
            window_size += 1

    # For black, subtract the white count from the real window size (the
    # corner cases used to subtract from 3 although 4 pixels were summed).
    return total if color else window_size - total
def collect_noise_point(img, color):
    """Collect the coordinates of every isolated ``color`` pixel.

    A pixel counts as noise when it has value ``color`` and
    ``sum_9_region`` reports between 1 and 5 for its neighbourhood.
    The image is scanned column by column.

    :param img: binary image exposing ``width``, ``height`` and ``getpixel``.
    :param color: pixel value whose isolated points are collected.
    :return: list of ``(x, y)`` tuples.
    """
    return [
        (col, row)
        for col in range(img.width)
        for row in range(img.height)
        if 0 < sum_9_region(img, col, row, color) < 6
        and img.getpixel((col, row)) == color
    ]
def remove_noise_pixel(img, noise_point_list, color):
    """Erase the listed noise points by writing the opposite colour.

    :param img: binary image exposing ``putpixel``; modified in place.
    :param noise_point_list: sequence of points indexable as ``p[0], p[1]``.
    :param color: colour of the noise; each point is set to ``1 - color``.
    """
    replacement = 1 - color
    for point in noise_point_list:
        target = (point[0], point[1])
        img.putpixel(target, replacement)
# Collect the isolated white (1) pixels of the upper part, then overwrite them with 0.
noise_point_list = collect_noise_point(out_up, 1)
# print(noise_point_list)
remove_noise_pixel(out_up, noise_point_list, 1)
第四步:分割图片
def judge(fonts_point_list, x, y, color):
    """Decide whether (x, y) may be added to ``fonts_point_list``.

    :param fonts_point_list: iterable of ``(x, y)`` points accepted so far.
    :param x: column of the candidate point.
    :param y: row of the candidate point.
    :param color: truthy (white glyphs) uses an exclusion distance of 70,
        falsy (black glyphs) uses 30.
    :return: False when some accepted point is closer than the exclusion
        distance on both axes, True otherwise.
    """
    min_gap = 70 if color else 30
    return not any(
        abs(x - seen_x) < min_gap and abs(y - seen_y) < min_gap
        for seen_x, seen_y in fonts_point_list
    )
def find_fonts_hang(img, color, distance, max_fonts=4):
    """Scan the image row by row and record one anchor point per glyph.

    A pixel becomes an anchor when ``sum_9_region`` reports at least 7 for
    it and ``judge`` finds it far enough from every anchor recorded so far.
    Scanning stops as soon as ``max_fonts`` anchors have been collected.

    :param img: binary image exposing ``width``, ``height`` and ``getpixel``.
    :param color: 1 for white glyphs, 0 for black glyphs.
    :param distance: integer pixel distance between glyphs (the original
        notes suggest 50 for white glyphs and 15 for black ones); rows
        ``0 .. height - distance`` and columns ``0 .. width - distance - 1``
        are scanned.
    :param max_fonts: maximum number of anchors to return (default 4).
    :return: list of ``(x, y)`` anchor points, at most ``max_fonts`` long.
    """
    fonts_point_list = []
    for y in range(img.height - distance + 1):
        if len(fonts_point_list) >= max_fonts:
            break
        for x in range(img.width - distance):
            if sum_9_region(img, x, y, color) < 7 or img.getpixel((x, y)) != color:
                continue
            # judge() accepts any point when the list is still empty.
            if judge(fonts_point_list, x, y, color):
                fonts_point_list.append((x, y))
                # Enforce the cap inside the row as well; it used to be
                # checked only between rows and only with ``==``.
                if len(fonts_point_list) >= max_fonts:
                    break
    # Trace output kept so the console output matches the original script.
    print(',,', img.width, img.height)
    return fonts_point_list
def copy_fonts(img, fonts_point_list, color, distance):
    """Crop one sub-image per anchor point.

    For an anchor ``(l_x, l_y)`` the crop box is
    ``(max(l_x - distance, 0), l_y, l_x + distance, l_y + distance)``.
    An anchor whose crop raises ``SystemError`` is reported and skipped.

    :param img: image exposing ``crop(box)``.
    :param fonts_point_list: iterable of ``(x, y)`` anchor points.
    :param color: 1 for white glyphs, 0 for black glyphs (unused here, kept
        so existing callers keep working).
    :param distance: half width and full height of the crop box in pixels.
    :return: list of cropped images, one per successfully cropped anchor.
    """
    fonts_img = []
    for l_x, l_y in fonts_point_list:
        left = max(l_x - distance, 0)
        try:
            font_img = img.crop((left, l_y, l_x + distance, l_y + distance))
        except SystemError:
            print('越界')
            # Skip this anchor; appending here would fail on the first
            # iteration or duplicate the previous crop on later ones.
            continue
        fonts_img.append(font_img)
    return fonts_img
# Upper part: locate the white (1) glyph anchors with distance 50, then crop each with distance 40.
fonts_point_list = find_fonts_hang(out_up, 1, 50)
fonts_img1 = copy_fonts(out_up, fonts_point_list, 1, 40) # cropped glyph images, white text (4 expected)
# Lower part: locate the black (0) glyph anchors with distance 15, then crop each with distance 20.
fonts_img_down = find_fonts_hang(out_down, 0, 15)
fonts_img0 = copy_fonts(out_down, fonts_img_down, 0, 20) # cropped glyph images, black text (4 expected)
第五步:将底部图片由白底黑字转换成黑底白字,并将图片左右两侧的黑色部分切割掉
def blacktowhite(fonts_img):
    """Invert every image in place and return them in a new list.

    Each pixel value ``v`` is replaced by ``1 - v``.

    :param fonts_img: iterable of binary images exposing ``width``,
        ``height``, ``getpixel`` and ``putpixel``.
    :return: list holding the same (now inverted) image objects, in order.
    """
    inverted = []
    for glyph in fonts_img:
        for col in range(glyph.width):
            for row in range(glyph.height):
                point = (col, row)
                glyph.putpixel(point, 1 - glyph.getpixel(point))
        inverted.append(glyph)
    return inverted
def cut_black(img):
    """Crop away the black columns to the left and right of the glyph.

    The pixel data is reshaped into rows with ``ont_to_two`` and the
    leftmost/rightmost white column indices come from ``find_min_edge``.

    :param img: binary image exposing ``getdata``, ``width``, ``height``
        and ``crop``.
    :return: the cropped image, full height.
    """
    rows = ont_to_two(list(img.getdata()), img.width)
    edge_left, edge_right = find_min_edge(rows)
    # edge_right is the index of the last white column, while the right
    # bound of crop() is exclusive; add 1 so that column is kept.
    return img.crop((edge_left, 0, edge_right + 1, img.height))
# NOTE(review): cut_black calls find_min_edge and ont_to_two, which appear further down in this
# listing; they must already be defined when these lines run.
# Invert the lower glyph images, then trim the black side columns of both sets.
fonts_img0 = blacktowhite(fonts_img0)
fonts_img0 = [cut_black(img) for img in fonts_img0] # lower part
fonts_img1 = [cut_black(img) for img in fonts_img1] # upper part
第六步:判断图片相似度
def find_min_edge_hang(data1):
    """Return the indices of the first and last 1 in a one-dimensional row.

    :param data1: indexable sequence of pixel values.
    :return: ``(first, last)``; both are 0 when the row contains no 1.
    """
    length = len(data1)
    first = next((pos for pos in range(length) if data1[pos] == 1), 0)
    last = next((pos for pos in range(length - 1, -1, -1) if data1[pos] == 1), 0)
    return first, last
def find_min_edge(data1):
    """Return the leftmost and rightmost column index holding a 1.

    Rows that contain no 1 are ignored: ``find_min_edge_hang`` reports
    ``(0, 0)`` for them, which would otherwise force the left edge to 0.

    :param data1: two-dimensional data, an iterable of rows of pixel values.
    :return: ``(edge_left, edge_right)``, both inclusive; ``(0, 0)`` when no
        row contains a 1 (including empty input).
    """
    edge_left = []
    edge_right = []
    for hang in data1:
        if 1 not in hang:
            continue
        index, index_last = find_min_edge_hang(hang)
        edge_left.append(index)
        edge_right.append(index_last)
    if not edge_left:
        return 0, 0
    return min(edge_left), max(edge_right)
def ont_to_two(data, lie):
    """Split row-major one-dimensional data into rows of ``lie`` elements.

    :param data: sliceable sequence stored row by row.
    :param lie: number of elements per row; must be positive when ``data``
        is not empty.
    :return: list of slices of ``data``; the last one may be shorter when
        ``len(data)`` is not a multiple of ``lie``.
    :raises ValueError: if ``data`` is not empty and ``lie`` is not positive
        (this used to loop forever).
    """
    if len(data) == 0:
        return []
    if lie <= 0:
        raise ValueError('lie must be a positive row length, got %r' % (lie,))
    return [data[start:start + lie] for start in range(0, len(data), lie)]
def image_similarity_vectors_via_numpy(img1, img2):
    """Return the cosine similarity of two images.

    Each image is resized to 32x32, every pixel is reduced to the average
    of its value(s), and the two resulting vectors are normalised and
    multiplied with a dot product.

    :param img1: first image, exposing ``resize`` and ``getdata``.
    :param img2: second image, same requirements.
    :return: the dot product of the two unit vectors, or 0.0 when either
        resized image is entirely zero (this used to produce NaN).
    """
    unit_vectors = []
    for image in (img1, img2):
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
        resized = image.resize((32, 32), Image.LANCZOS)
        vector = numpy.array(
            [numpy.average(pixel) for pixel in resized.getdata()], dtype=float
        )
        norm = numpy.linalg.norm(vector, 2)
        if norm == 0:
            # An all-zero image has no direction; report no similarity.
            return 0.0
        unit_vectors.append(vector / norm)
    first, second = unit_vectors
    return numpy.dot(first, second)
# Greedy pairing: score every (upper, lower) glyph pair, then repeatedly take
# the best remaining pair and discard all pairs that reuse either glyph.
res = []
has_select = []
similarity = {}
for img1_index, upper_img in enumerate(fonts_img1):
    for img2_index, lower_img in enumerate(fonts_img0):
        similarity[(img1_index, img2_index)] = image_similarity_vectors_via_numpy(upper_img, lower_img)
# List of ((index1, index2), score) items, best score first.
similarity = sorted(similarity.items(), key=lambda x: x[1], reverse=True)
for _ in range(4):
    if not similarity:
        # Fewer glyphs were found than expected; stop instead of failing
        # with an IndexError on similarity[0].
        break
    best = similarity[0]
    index1, index2 = best[0]
    res.append(best)
    img1 = fonts_img1[index1]
    img2 = fonts_img0[index2]
    img1.show()
    img2.show()
    # Filtering keeps the descending order, so no re-sort is needed.
    similarity = [
        pair for pair in similarity
        if pair[0][0] != index1 and pair[0][1] != index2
    ]
print(res)
配对效果图: