在公司购买的OA系统上,很多功能都是软件商开发好的,如果有什么自定义的需求,也很难实现。现实情况下需要将一个工单的各类信息汇总整理为一份Excel,看似简单的需求,却需要在OA系统上反复点击多次,人工汇总。本章我们看看如何使用Python爬虫帮同事解决这个问题的。
点击工单号之后才可以看到更多信息
一、技术路线
requests_html
二、环境准备
from requests_html import HTMLSession
import re
import csv
import datetime
import time
三、问题点
- 脚本如何登录OA系统
cookie_value = input("请输入秘钥:") #让用户手动输入cookie值
session = HTMLSession()
url = "http://crm.xtcrm.com/xcrm/contract/contract/"
cookies={"XTSID" :cookie_value}
r = session.get(url,cookies=cookies)
print(r.text)
首次登录后记录Cookie值,脚本启动时输入。
- 所见非所得
网页中的工单信息并非直接存在于当前HTML代码中,很多内容是通过请求其他URL内容从而进行展示的。所以我们需要进行URL重建。
构造跨站访问的URL需要我们在当前网页上找到目标URL的各种参数
- 脚本如何在其他人的电脑上运行
D:\>pyinstaller -i radish.ico -F 回访内容自动爬取.py
-i 指定打包程序使用的图标(.icon)文件
-F 在dist文件夹中只生成独立的打包文件
四、脚本效果
人工1小时,脚本144秒=减少工作时长=白嫖工资
五、代码
from requests_html import HTMLSession
import re
import csv
import datetime
import time
#获取导表的日期
ExportFrom_date = str(datetime.date.today())
#获取导表前一天的日期
def getYesterday():
today = datetime.date.today()
oneday = datetime.timedelta(days=1)
yesterday = str(today - oneday)
return yesterday
#获取导表前二天的日期:
def getBigYesterday():
today = datetime.date.today()
oneday = datetime.timedelta(days=2)
Bigyesterday = str(today - oneday)
return Bigyesterday
#将昨天,今天和前天格式化
def dataformat(time):
if time == "今天":
time = ExportFrom_date
elif time == "昨天":
time = getYesterday()
elif time == "前天":
time = getBigYesterday()
return time
#本周时间范围,过滤本周的接单日期>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
#本周第一天
today = datetime.date.today()
week_start_day = today - datetime.timedelta(days=today.weekday())
# 本周最后一天
today = datetime.date.today()
week_end_day = today + datetime.timedelta(days=6-today.weekday())
#如果字段内容为空,则添加N/A
def Fillblank(field):
if field == " ":
field = "N/A"
#print("这是一个空格")
elif field:
field = field
#print("这是一个有效值")
else:
field = "N/A"
#print("这是一个空值")
return field
#0)让用户手动输入cookie值
cookie_value = input("请输入秘钥:")
file = open("D:/{}_1回访表.csv".format(ExportFrom_date),"w",newline="",encoding="utf-8-sig")
csvwriter = csv.writer(file)
csvwriter.writerow(["ID号","服务单号","机器名称","关单时间","对应客户","派单人","联系人","工程师","回访联系人","回访电话","回访手机","故障主题","完成时间","处理措施","分类"])
try:
#1)使用cookie的方式登录到http://crm.xtcrm.com/xcrm/contract/contract/
session = HTMLSession()
url = "http://crm.xtcrm.com/xcrm/contract/contract/"
cookies={"XTSID" :cookie_value}
r = session.get(url,cookies=cookies)
#print(r.text)
#2)在上面这个页面查询需要的参数,列出要爬取的30个工单列表
#pre_Global
pre_Global_element = "sid=(.*)&ma"
pre_Global_list = re.findall(pre_Global_element, r.html.html)
pre_Global = "&sid=" + pre_Global_list[0]
#print(pre_Global)
#ma_WorkID
ma_WorkID_element = "&ma=.{30}"
ma_WorkID_list = re.findall(ma_WorkID_element, r.html.html)
ma_WorkID = ma_WorkID_list[0]
#mid_WorkID
mid_element = "&comname=qlist(.*)customer_phone_show=0"
mid_list = re.findall(mid_element, r.html.html)
mid_WordID = "&comname=qlist" + mid_list[0]
#print(mid_WordID)
#modcr_WorkID 应该不需要这段
modcr_WorkID = mid_list[0]
#print(modcr_WorkID)
#!!!爬取工单页面控制>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
base_url = ["http://crm.xtcrm.com/system/runmod.xt?where=&bwhere=df@1^5`@@1^5`@QKywCMoAibpBSbylmZu92YgQmbhByM9UGc5R3dllmd&page=0&orderf=begin_date&orderv=1&pn=30",
"http://crm.xtcrm.com/system/runmod.xt?where=&bwhere=df@1^5`@@1^5`@QKywCMoAibpBSbylmZu92YgQmbhByM9UGc5R3dllmd&page=2&orderf=begin_date&orderv=1&pn=30",
]
#定义全局变量
ID_list_end = []
ID_Number_list = []
ID_Number_list_end = []
#获取工单ID号函数
def getIDNumber():
for page in base_url:
global ID_list_end
global ID_Number_list
global ID_Number_list_end
workID_url = page + pre_Global + ma_WorkID + "&scname=contract" + mid_WordID + "&customer_phone_show=0&cv=,search,qlist,contractmark,statlist,"
#print(workID_url)
session = HTMLSession()
r_workID = session.get(workID_url,cookies=cookies)
ID_element = '>[0-9]{4}</td>'
ID_list = re.findall(ID_element, r_workID.html.html)
ID_list_end = ID_list_end + ID_list
for i in range(0,len(ID_list_end)):
ID_Number_Pre = ID_list_end[i]
ID_Number_element = "[0-9]{4}"
ID_Number = re.findall(ID_Number_element,ID_Number_Pre)
ID_Number_list = ID_Number_list + ID_Number
#去除列表中的重复ID
for s in ID_Number_list:
if s not in ID_Number_list_end:
ID_Number_list_end.append(s)
getIDNumber()
#3)根据工单ID号遍历这个工单列表,抓取mdb这个参数 modcr_Maintenace参数 ma_Maintenace参数
#程序进度
scale = len(ID_Number_list_end)
print("执行开始".center(scale//2, "-"))
start = time.perf_counter()
x = 0
for i in ID_Number_list_end:
#进度条控制
a = '*' * x
b = '.' * (scale - x)
c = (x/scale)*100
dur = time.perf_counter() - start
print("\r{:^3.0f}%[{}->{}]{:.2f}s".format(c,a,b,dur),end='')
x = x + 1
session = HTMLSession()
detail_url = "http://crm.xtcrm.com/xcrm/contract/contract/detail.xt?id=" + i
r_detail = session.get(detail_url,cookies=cookies)
#mdb_Order参数
mdb_element = "&mdb=(.{6})"
mdb_list = re.findall(mdb_element, r_detail.html.html)
mdb_Order = "&mdb=" + mdb_list[0]
#modcr_Maintenace_Order参数
modcr_Maintenace_Order_element = "&comname=qshow(.*)customer_phone_show=0"
modcr_Maintenace_Order_list = re.findall(modcr_Maintenace_Order_element, r_detail.html.html)
modcr_Maintenace_Order = "&comname=qshow" + modcr_Maintenace_Order_list[0] + "customer_phone_show=0"
#ma_Maintenace_Order参数
ma_Maintenace_Order_element = "&ma=.{32}"
ma_Maintenace_Order_list = re.findall(ma_Maintenace_Order_element, r_detail.html.html)
ma_Maintenace_Order = ma_Maintenace_Order_list[0]
#modcr_Maintenace_process参数列表 (不是所有的工单都有这个页面的,如果工程师没有写措施的话就不会有)
modcr_Maintenace_process_element = "&scname=pp_repairinfo&comname=repair(.*)customer_phone_show=0"
modcr_Maintenace_process_list = re.findall(modcr_Maintenace_process_element, r_detail.html.html)
#modcr_dely (不是所有的工单都有这个页面的,如果工程师没有写措施的话就不会有)
modcr_dely_element = "&scname=pp_repairinfo&comname=dely(.*)customer_phone_show=0"
modcr_dely_list = re.findall(modcr_dely_element, r_detail.html.html)
#4)在上面这个网页查询需要的参数,爬取“维修工单”和“维修过程”
#4.1 维修工单URL
Maintenace_base_url = "http://crm.xtcrm.com/system/runmod.xt?id=" + i
Maintenance_Order_url = Maintenace_base_url + "&stype=&where=" + mdb_Order + pre_Global + ma_Maintenace_Order + "&scname=pp_repairinfo" + \
modcr_Maintenace_Order + "&cv=,qshow,"
#print("维修工单URL:" + Maintenance_Order_url)
#4.1.1 爬去维修工单中的指定字段,打印出来(注意有些内容不需要回访)
r_Maintenance_Order_url = session.get(Maintenance_Order_url)
r_Maintenance_Order_url.html.render()
#根据接单日期来判断是否为本周的工单!!!!!!!!!!!!
receive_date = (r_Maintenance_Order_url.html.find("#bg-default > div > div:nth-child(4) > div:nth-child(1) > div > div",first=True)).text
#将接单日期格式化
receive_date_format = dataformat(receive_date)
#将字符串类型的时间转换为datetime.date可以参与比较的类型
receive_data_end = datetime.date(*map(int, receive_date_format.split('-')))
#!!!爬取工单的时间控制(本周)>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
if week_start_day > receive_data_end:
continue
#服务单号
server_id = r_Maintenance_Order_url.html.find("#bg-default > div > div:nth-child(3) > div:nth-child(1) > div > div",first=True)
#机器名称
device = r_Maintenance_Order_url.html.find("#bg-default > div > i > div:nth-child(2) > div:nth-child(1) > div > div",first=True)
#对应客户
client = r_Maintenance_Order_url.html.find("#bg-default > div > div:nth-child(5) > div:nth-child(1) > div > div",first=True)
#派单人
send_man = r_Maintenance_Order_url.html.find("#bg-default > div > div:nth-child(3) > div:nth-child(2) > div > div",first=True)
#联系人
link_man = r_Maintenance_Order_url.html.find("#bg-default > div > div:nth-child(6) > div:nth-child(2) > div > div",first=True)
#工程师
engineer = r_Maintenance_Order_url.html.find("#bg-default > div > i > div:nth-child(7) > div:nth-child(1) > div > div",first=True)
#回访联系人 = 联系人 如果为空,添加暂无联系人
#回访电话
visit_phone = r_Maintenance_Order_url.html.find("#bg-default > div > div:nth-child(7) > div:nth-child(1) > div > div",first=True)
#回访手机
visit_mobile_phone = r_Maintenance_Order_url.html.find("#bg-default > div > div:nth-child(7) > div:nth-child(2) > div > div", first=True)
#故障主题
Fault_title = r_Maintenance_Order_url.html.find("#bg-default > div > i > div:nth-child(5) > div:nth-child(1) > div > div",first=True)
#工单分类
type = r_Maintenance_Order_url.html.find("#bg-default > div > div:nth-child(5) > div:nth-child(2) > div > div",first=True)
#4.2 维修过程URL (存在问题,不是所有的工单都有这个页面的,如果没有完成的话就不会有)
#需要提前定义着两个字段,如果没有写措施的话,就不会执行下面的内容了。也就没有finsh_time和handle_way这两个字段了。
finsh_times = "N/A"
handle_way_end = "N/A"
if modcr_Maintenace_process_list:
modcr_Maintenace_process = "&scname=pp_repairinfo&comname=repair" + modcr_Maintenace_process_list[0] + "customer_phone_show=0"
#print(modcr_Maintenace_process)
Maintenance_process_url = Maintenace_base_url + pre_Global + modcr_Maintenace_process + "&cv=,qshow,svc_goods,svc_money,svc_action,inspection,svc_confirm,repair,"
#print("维修过程URL:" + Maintenance_process_url)
#4.2.1爬取维修过程中的指定字段
r_Maintenance_process_url = session.get(Maintenance_process_url)
r_Maintenance_process_url.html.render()
#完成时间 = 维修完成日期 + 完成时间
data = r_Maintenance_process_url.html.find("body > div.row.form-horizontal.show-box > div:nth-child(5) > div > div",first=True)
times = r_Maintenance_process_url.html.find("body > div.row.form-horizontal.show-box > div:nth-child(7) > div > div",first=True)
#美观时间格式,增加用户体验
finsh_times = dataformat(data.text) + " " + times.text
#处理措施
handle_way = r_Maintenance_process_url.html.find("body > div.row.form-horizontal.show-box > div:nth-child(1) > div > div",first=True)
handle_way_end = handle_way.text
else:
modcr_Maintenace_process_list.append("此工单还没有写措施")
Maintenance_process_url = "此工单未生成维修过程表单"
#print("维修过程URL:" + Maintenance_process_url)
#4.2.2工程师未写措施,发送微信提醒工程师。网页微信无法使用,未实现此功能。
#4.3交付过程URL (存在问题,不是所有的工单都有这个页面的,如果工程师没写措施的话就不会有)
#需要提前定义这个字段,如果没有关单的话,就不会执行下面的内容了。也就没有dely_time.
close_time = "N/A"
if modcr_dely_list:
modcr_dely ="&scname=pp_repairinfo&comname=dely" + modcr_dely_list[0] + "customer_phone_show=0"
dely_url = Maintenace_base_url + pre_Global + ma_Maintenace_Order + modcr_dely + "&cv=,qshow,svc_goods,svc_money,svc_action,inspection,svc_confirm,repair,dely,"
#print("交付URL:" + dely_url)
#4.3.1爬取维修过程中的关单字段.
r_dely_url = session.get(dely_url)
r_dely_url.html.render()
#关单时间
dely_data = r_dely_url.html.find("body > div.form-horizontal.show-box > div:nth-child(2) > div:nth-child(1) > div > div",first=True)
dely_time = r_dely_url.html.find("body > div.form-horizontal.show-box > div:nth-child(2) > div:nth-child(2) > div > div",first=True)
#美化时间格式,增加用户体验
close_time =dataformat(dely_data.text) + " " + dely_time.text
else:
modcr_dely_list.append("此工单销售还未关单")
dely_url = "此工单销售还未关单"
#print("交付URL:" + dely_url)
#4.3.2 如果没有关单则需要发送微信提醒派单人员,网页微信无法使用,未实现此功能。
#5)将文件写入CSV中
#如果字段为空,加入N/A
close_time = Fillblank(close_time)
link_man_end = Fillblank(link_man.text)
visit_phone_end = Fillblank(visit_phone.text)
visit_mobile_phone_end = Fillblank(visit_mobile_phone.text)
Fault_title_end = Fillblank(Fault_title.text)
finsh_times = Fillblank(finsh_times)
handle_way_end = Fillblank(handle_way_end)
csvwriter.writerow([i,server_id.text,device.text,close_time,client.text,send_man.text,link_man_end,engineer.text,link_man_end,visit_phone_end,visit_mobile_phone_end,Fault_title_end,finsh_times,handle_way_end,type.text])
print("\n"+"执行结束".center(scale//2,'-'))
input("报表文件请在D盘根目录下查看(按'Enter'退出程序)")
except :
input("您的输入有误(按'Enter'退出程序)")