一、在线下载图片

import os
import sys
import xlrd
import requests
import urllib.request



def read_excel(excel_path):
    """Collect the first-column value of every row of the "Sheet1" worksheet.

    Args:
        excel_path: Path of the workbook passed to ``xlrd.open_workbook``.

    Returns:
        A list with one entry per row (the first row included), holding the
        value found in column 0 of that row.
    """
    sheet = xlrd.open_workbook(excel_path).sheet_by_name("Sheet1")
    # Column 0 of each row carries the image name.
    img_list = [sheet.row_values(row_idx)[0] for row_idx in range(sheet.nrows)]
    print("list1", img_list)
    return img_list


def get_HTML(url='http://www.ivsky.com/tupian/haiyangshijie/', timeout=10):
    """Request a page with a browser-like User-Agent and return the response.

    Args:
        url: Address to request. Defaults to the gallery page the script was
            written for.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The response object returned by ``requests.get``.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36",
    }
    data = {'show_env': '1'}  # sent as the query string
    res_params = requests.get(url, headers=headers, params=data, timeout=timeout)
    return res_params

def _progress(filename, block_num, block_size, total_size):
    """Print a one-line, in-place download progress indicator.

    Args:
        filename: Label shown in the progress line.
        block_num: Number of data blocks transferred so far.
        block_size: Size of one data block.
        total_size: Size of the remote file. May be zero or negative when the
            size is unknown (``urlretrieve`` reports -1 in that case).
    """
    downloaded = block_num * block_size
    if total_size > 0:
        # The final block is usually only partly filled, so cap at 100%.
        percent = min(float(downloaded) / float(total_size) * 100.0, 100.0)
        sys.stdout.write('\r>> Downloading %s %.1f%%' % (filename, percent))
    else:
        # Unknown size: no percentage can be computed, and dividing by zero
        # would raise, so show the amount transferred instead.
        sys.stdout.write('\r>> Downloading %s %d bytes' % (filename, downloaded))
    sys.stdout.flush()


def get_imgName(excel_path, target_dir, url, _progress):
    """Download every image named in the workbook into ``target_dir``.

    Image names come from ``read_excel(excel_path)``; each one is appended to
    ``url`` to form the download address. A failed download is appended to
    ``./download_fail.csv`` (one URL per line), reported, and skipped.

    Args:
        excel_path: Workbook listing the image names.
        target_dir: Local directory to save into; created if it is missing.
        url: Base address that each image name is appended to.
        _progress: Progress callback, invoked as
            ``_progress(filename, block_num, block_size, total_size)``.
    """
    img_list = read_excel(excel_path)
    # Create the destination once instead of re-checking it for every image.
    os.makedirs(target_dir, exist_ok=True)

    for img in img_list:
        image_url = url + img  # file to download
        # Join instead of concatenating so target_dir works with or without
        # a trailing separator.
        filename = os.path.join(target_dir, img.lstrip('/\\'))

        # urlretrieve calls its hook with three arguments only, so bind the
        # file name here to match the four-argument callback.
        def report(block_num, block_size, total_size, _name=img):
            _progress(_name, block_num, block_size, total_size)

        try:
            urllib.request.urlretrieve(image_url, filename, report)
        except Exception as e:
            # Append mode: the log must be writable and must keep the
            # failures recorded by earlier iterations.
            with open('./download_fail.csv', 'a', encoding='utf-8') as download_fail:
                download_fail.write(image_url + '\n')
            print('出错跳过:file=' + img + ' exception:', e)
            continue
    print("全部数据下载完毕!")



if __name__ == '__main__':
    # Script entry point: download every image listed in the workbook.
    get_imgName(
        excel_path='./20190330_31.xls',  # workbook listing the image names
        target_dir='D:/invoice/',        # local directory to save into
        url='http://10.28.11.139/',      # server address
        _progress=_progress,
    )

二、局域网(公司内网)图片下载

由于公司内部的图片是以加密的方式(或者说是以数据的方式)存储的——图片被编码成 base64,而公司内部服务器的数据传输接口又是用 Java 写的,所以我这里用到了 pyhessian。

前期的安装配置也很麻烦(因为公司的电脑无法连接网络,必须离线安装),运行时还出了各种问题。不过网上有很多解决这些问题的方法。

2.1 安装:

在线安装:直接执行 pip install python-hessian
离线安装:下载 six-1.12.0-py2.py3-none-any.whl 和 python_hessian-1.1.0-py2.py3-none-any.whl

# 在下载的目录下打开终端直接执行
pip install six-1.12.0-py2.py3-none-any.whl python_hessian-1.1.0-py2.py3-none-any.whl
2.2 调用
# coding:utf-8
import base64
import xlrd
import json
import time
from pyhessian.client import HessianProxy

# The data to download was obtained by querying the Oracle database.
def read_excel(excel_path):
    """Read the file ids stored in the sixth column of "SQL_Results".

    Args:
        excel_path: Path of the workbook passed to ``xlrd.open_workbook``.

    Returns:
        A list holding the column-5 value of every row except the first.
    """
    table = xlrd.open_workbook(excel_path).sheet_by_name("SQL_Results")
    # Start at row 1: row 0 is skipped.
    fileids = [table.row_values(row_idx)[5] for row_idx in range(1, table.nrows)]
    print("list:", fileids)
    return fileids


def download(request_params):
    """Fetch one file through the Hessian service and save it as a JPEG.

    The image arrives base64-encoded in the ``fileData`` field of the service
    response and is written to ``./download_img_20190402/<fileid>.jpg``.

    Args:
        request_params: JSON string that must contain ``seqNum`` and
            ``fileid``.

    Returns:
        ``None`` once the image has been written. When a required key is
        missing, a ``(json_body, headers)`` tuple describing the error is
        returned instead and no request is made.

    Raises:
        ValueError: If the service response carries no file data.
    """
    import os  # local import: this script's import list does not include os

    j = json.loads(request_params)
    print("type(j):", type(j))
    # Validate in the order seqNum, fileid so the first missing key is reported.
    for required in ('seqNum', 'fileid'):
        if required not in j:
            msg = '缺少参数:' + required
            result_json = {'code': '1', 'msg': msg}
            print(msg)
            return json.dumps(result_json), {'Content-Type': 'application/xxx'}

    systemId = 'XXXX'
    seqNum = j['seqNum']
    sceneY = u'字段一'
    sceneE = u'字段二'
    sceneS = u'字段三'
    fileid = j['fileid']
    url = 'http://10.x.xxx.xx:8000/hessian/xxx/xxx'
    params = {
        'systemId': systemId,
        'sceneY': sceneY,
        'sceneE': sceneE,
        'sceneS': sceneS,
        'seqNum': seqNum,
        'fileId': fileid
    }

    service = HessianProxy(url)
    # downloadFile is implemented by the Java service and described in its
    # interface document.
    result = service.downloadFile(params)
    print("return result:", result)

    resCode = result['resCode']
    resMsg = result['resMsg']
    fileData = result['fileData']
    if not fileData:
        # Do not save an empty .jpg and report success; surface the
        # service's own status so the caller can record the failure.
        raise ValueError(
            'no fileData for fileid %s (resCode=%s, resMsg=%s)'
            % (fileid, resCode, resMsg)
        )

    img_b64decode = base64.b64decode(fileData)
    out_dir = './download_img_20190402/'
    os.makedirs(out_dir, exist_ok=True)  # open() does not create directories
    with open(out_dir + str(fileid) + '.jpg', "wb") as f:
        f.write(img_b64decode)
    print('save successful!')


def get_base64(fileids):
    """Download every file id in turn and collect the requests that failed.

    For each id a JSON request (``seqNum`` plus ``fileid``) is built and
    handed to ``download``; a text progress bar is printed after each
    successful download.

    Args:
        fileids: Iterable of file ids with a known length.

    Returns:
        A list of the JSON request strings whose download raised an exception.
    """
    scale = len(fileids)
    download_fails = []
    for i, fileid in enumerate(fileids):
        time.sleep(0.2)  # short pause between requests

        seqNum = 'invoice_test_data_' + time.strftime("%Y%m%d%H%M%S", time.localtime())
        request_params = json.dumps({'seqNum': seqNum, 'fileid': fileid})
        print('request_params:', request_params)
        try:
            download(request_params)
        except Exception as e:
            # Exception rather than a bare except, so Ctrl-C and SystemExit
            # still stop the run; the reason is printed instead of hidden.
            print('download failed:', request_params, 'exception:', e)
            download_fails.append(request_params)
        else:
            # Progress bar: one '#' per 100 files handled.
            a = '#' * (i // 100)
            b = '.' * (scale // 100 - i // 100)
            c = (i / scale) * 100
            print("{:^3.2f}%[{}->{}]".format(c, a, b))
    return download_fails


if __name__ == '__main__':
    # Script entry point: download every file id listed in the workbook and
    # record the requests that failed.
    excel_path = './20190401_company_product_data.xls'
    fileids = read_excel(excel_path)
    download_fails = get_base64(fileids)

    # Text mode: the entries are str, which a binary-mode file rejects.
    # Each entry is already a complete JSON string, so it is written as one
    # line; joining it with ',' would split it into single characters.
    with open('./download_fail.txt', 'w', encoding='utf-8') as ff:
        for download_fail in download_fails:
            ff.write(download_fail + '\n')

多线程下载

import base64
import xlrd
import sys,csv
import time
from pyhessian.client import HessianProxy
from concurrent.futures import ThreadPoolExecutor



def read_excel(excel_path):
    """Load the file ids from the sixth column of the "SQL_Results" sheet.

    Args:
        excel_path: Path of the workbook passed to ``xlrd.open_workbook``.

    Returns:
        A list holding the column-5 value of every row except the first.
    """
    results_sheet = xlrd.open_workbook(excel_path).sheet_by_name("SQL_Results")
    fileids = []
    # Row 0 is skipped; collection starts at row 1.
    for row_no in range(1, results_sheet.nrows):
        row = results_sheet.row_values(row_no)
        fileids.append(row[5])
    print("fileids:", fileids)
    print("total:", len(fileids))
    return fileids


def Request_Params(fileids):
    """Build one ``(url, params)`` request tuple per file id.

    Args:
        fileids: Iterable of file ids.

    Returns:
        A list of ``(url, params)`` tuples in input order. ``params`` is a
        dict with the keys ``systemId``, ``sceneL``, ``sceneM``, ``sceneS``,
        ``seqNum`` and ``fileId``; ``seqNum`` embeds the local time at which
        the entry was built.
    """
    service_url = 'http://10.XX.XXX.X:XXXX/HDSServ/servlet/hessian/hds/XXXXXXXXXX'

    def build(fileid):
        # The timestamp is taken per entry, second resolution.
        stamp = str(time.strftime("%Y%m%d%H%M%S", time.localtime()))
        payload = {
            'systemId': 'AIOCR',
            'sceneL': u'XXXXX',
            'sceneM': u'XXXXX',
            'sceneS': u'XXXXX',
            'seqNum': 'invoice_test_data_' + stamp,
            'fileId': fileid,
        }
        return (service_url, payload)

    return [build(fileid) for fileid in fileids]


def download_file(request_params):
    """Download one file through the Hessian service and save it as a JPEG.

    Args:
        request_params: ``(url, params)`` pair; ``params`` is the dict sent
            to the service and must contain the key ``fileId``.

    Returns:
        An empty list on success, or a one-element list containing ``params``
        when anything in the download or save step raised.
    """
    import os  # local import: this script's import list does not include os

    out_dir = './download_img_20190402/'
    params = request_params[1]
    download_fails = []
    try:
        # Proxy creation sits inside the try so one bad request is recorded
        # as a failure instead of aborting the whole thread-pool map.
        service = HessianProxy(request_params[0])
        result = service.downloadFile(params)
        fileData = result['fileData']
        img_b64decode = base64.b64decode(fileData)
        os.makedirs(out_dir, exist_ok=True)  # open() does not create directories
        # The params key is 'fileId' (capital I); 'fileid' raised KeyError
        # and marked every download as failed.
        img_name = str(params['fileId']) + '.jpg'
        with open(out_dir + img_name, "wb") as f:
            f.write(img_b64decode)
        print(img_name, '--save successful!')
    except Exception as e:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed.
        print('download failed:', params, 'exception:', e)
        download_fails.append(params)
    return download_fails



if __name__ == '__main__':
    # Script entry point: download all files with a pool of four threads,
    # show progress, and write the failed requests to a CSV file.
    import json  # only needed here, to serialise the failed request params

    excel_path = './20190401_invoice_product_data.xls'
    fileids = read_excel(excel_path)
    request_params = Request_Params(fileids)

    download_fails = []
    scale = len(request_params)
    start = time.time()
    with ThreadPoolExecutor(max_workers=4) as pool:
        # map yields the results in input order as they become available.
        results = pool.map(download_file, request_params)
        for done, r in enumerate(results, start=1):
            a = '#' * (done // 10)
            b = '.' * (scale // 10 - done // 10)
            c = (done / scale) * 100
            # '\r' redraws the bar in place instead of appending each update
            # to one ever-growing line.
            sys.stdout.write("\r{:^3.2f}%[{}->{}]".format(c, a, b))
            sys.stdout.flush()
            download_fails.extend(r)
    end = time.time()
    # Format the number directly; slicing str(float) breaks on exponent
    # notation such as '1.6e-05'.
    print("\nTotal Spend time:", "{:.4f}".format((end - start) / 60) + "分钟")

    # Any failure counts, including a single one.
    if download_fails:
        with open("./download_fail.csv", 'w', newline='') as csvfile:
            write = csv.writer(csvfile)
            write.writerow(('sequence', 'params'))
            # Each failure is a params dict; passing the dict itself as a row
            # would write its key names instead of its values.
            write.writerows(
                (fail['seqNum'], json.dumps(fail)) for fail in download_fails
            )
    else:
        print('download over OK!')

关于进度条,你可以直接使用 tqdm。我是懒得下载了(把数据拷进公司电脑很麻烦)。