本文介绍一个python爬虫小项目,通过tkinter绘制交互界面,根据界面提供的信息爬取网站、筛选数据,最后将获得的数据写入excel表格并用图表显示,达到不打开网站就能获得成果表格的目的。具体分享如下:
程序界面 运行效果:
点击“使用说明”按钮,弹出说明文件:
代码分4块:
1、tkinter设置界面,通过交互得到查询的城市(对应招标网址)、项目类型(分施工、设计、监理三种)、查询起止日期、投资额范围及文件保存地址。将获得的以上7个信息作为参数传入第2、3步使用。
2、requests + .json获得网站每条项目信息网址,再通过 requests + BeautifulSoup访问解析,筛选获取有用信息形成列表。
3、xlsxwriterg根据第1步传递的文件保存地址,创建excel文件,将第2步的信息列表写入文件,生成图表。
4、辅助功能:为便于查看结果文件,在程序界面上设置“打开文件”和“打开文件夹”按钮,点击直接打开,省去逐步打开文件夹的过程。
一、tkinter设置界面
将界面分为标题及1至8行,每行逐步建立,避免混乱。
root = Tk() # 创建窗口root.title('招标信息查询')root.resizable(0, 0) # 0,0表示框体x,y方向均不可调;root.geometry('700x450+200+80')dirpath = biaot#ft0 = tkFont.Font(family='楷体', size=16, weight=tkFont.BOLD)titlelabel = Label(root, text='招标信息查询小助手', font=ft0, anchor='w')titlelabel.grid(row=0, column=1, columnspan=8, pady=10)# 第1行ft = tkFont.Font(family='Fixdsys', size=10, weight=tkFont.BOLD, slant=tkFont.ITALIC)label1 = Label(root, text='选择区域:', width=10, anchor='w') # padx=30, sticky=Wlabel1.grid(row=1, column=1, columnspan=2, pady = 10)label11 = Label(root, text=city['杭州'][3], width=60, anchor='w', font=ft)label11.grid(row=1, column=4, columnspan=6, sticky=W)cl1=ttk.Combobox(root,textvariable=StringVar(), width=10) #初始化cl1["values"]=("杭州","北京","上海","广州")cl1.current(0)cl1.grid(row=1,column=3, sticky=W)cl1.bind("<>",cit)data['city'] = city[cl1.get()]# 第2行label2 = Label(root, text='查询类型:', width=10, anchor='w')label2.grid(row=2, column=1, columnspan=2, pady = 10)h1 = BooleanVar()cb1=Checkbutton(root, text='施工', width=10, anchor='w', variable=h1, command=type)cb1.grid(row=2, column=3, sticky=W)h2 = BooleanVar()cb2 = Checkbutton(root, text='设计', width=10, anchor = 'w', variable=h2, command=type)cb2.grid(row=2, column=4, sticky=W)h3 = BooleanVar()cb3 = Checkbutton(root, text='监理', width=10, anchor = 'w', variable=h3, command=type)cb3.grid(row=2, column=5, sticky=W)label21 = Label(root, text='当前查询:', width=10, anchor='w')label21.grid(row=2, column=6, columnspan=1, sticky=W)label22 = Label(root, text='', width=20, anchor='w', font=ft)label22.grid(row=2, column=7, columnspan=2, sticky=W)# 第3行label3 = Label(root, text='起始日期:', width=10, anchor='w')label3.grid(row=3, column=1, columnspan=2, pady = 10)cl31 = ttk.Combobox(root, textvariable=StringVar(), width=5)cl31["values"] = ("2019", "2020", "2021")cl31.current(1)cl31.grid(row=3, column=3, sticky=W)cl31.bind("<>", begintime)cl32 = ttk.Combobox(root,textvariable=StringVar(), width=5)cl32["values"] = (1,2,3,4,5,6,7,8,9,10,11,12)cl32.current(2)cl32.grid(row=3,column=4, sticky=W)cl32.bind("<>", begintime)day = []for i in range(1,32): day.append(i)cl33 = ttk.Combobox(root,textvariable=StringVar(), width=5)cl33["values"] = daycl33.current(4)cl33.grid(row=3, column=5, sticky=W)cl33.bind("<>", begintime)label31 = Label(root, text='开始日期:', width=10, anchor='w')label31.grid(row=3, column=6, columnspan=1, sticky=W)label32 = Label(root, text=cl31.get()+'-'+cl32.get()+'-'+cl33.get(), width=10, anchor='w', font=ft)label32.grid(row=3, column=7, columnspan=1, sticky=W) # , pady=20data['begintime'] = label32['text']# 第4行label4 = Label(root, text='终止日期:', width=10, anchor='w')label4.grid(row=4, column=1, columnspan=2)cl41 = ttk.Combobox(root, textvariable=StringVar(), width=5)cl41["values"]=("2019","2020")cl41.current(1)cl41.grid(row=4, column=3, sticky=W, pady=10)cl41.bind("<>", endtime)cl42 = ttk.Combobox(root, textvariable=StringVar(), width=5)cl42["values"]=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)cl42.current(2)cl42.grid(row=4, column=4, sticky=W)cl42.bind("<>", endtime)day = []for i in range(1,32): day.append(i)cl43 = ttk.Combobox(root, textvariable=StringVar(), width=5)cl43["values"] = daycl43.current(10)cl43.grid(row=4, column=5, sticky=W)cl43.bind("<>", endtime)label41 = Label(root, text='终止日期:', width=10, anchor='w')label41.grid(row=4, column=6, columnspan=1, sticky=W)label42 = Label(root, text=cl41.get()+'-'+cl42.get()+'-'+cl43.get(), width=10, anchor='w', font=ft)label42.grid(row=4, column=7, columnspan=2, sticky=W)data['endtime'] = label42['text']# 第5行label5 = Label(root, text='造价范围:', width=10, anchor='w')label5.grid(row=5, column=1, columnspan=2, pady=12, sticky=N)label51 = Label(root, text='————————————', width=10, anchor='w')label51.grid(row=5, column=4, columnspan=1, pady=12, sticky=NW)var51 = StringVar()var51.trace("w", lambda name, index, mode, sv=var51: minpri(var51))edit51 = Entry(root, textvariable=var51, width=8)edit51.grid(row=5, column=3, pady=12, sticky=NW)var52 = StringVar()var52.trace("w", lambda name, index, mode, sv=var52: maxpri(var52))edit52 = Entry(root, textvariable=var52, width=8)edit52.grid(row=5, column=5, pady=12, sticky=NW)bt51 = Button(root, text='保存地址', width=8, height=1, command=setpath)bt51.grid(row=5, column=6, pady=10, columnspan=1, sticky=NW)label53 = Label(root, text='E:python爬招标网试验.xlsx', width=23, height=3, wraplength=170, justify='left', anchor="nw")# takefocus='True', wraplength=90, justify='left',label53.grid(row=5, column=7, columnspan=2, pady=12, sticky=NW)data['dir'] = label53.cget('text')# 第6行至第8行,结果显示框,占三行三列result = Label(root, width=36, wraplength = 280, height=10, font=("楷体", 10), fg="blue", borderwidth=2, relief='sunken', anchor='n', justify = 'left')result.grid(row=6, column=3, columnspan=3, rowspan=3, sticky=NW)# 第6行pic_0 = PhotoImage(file='pic/休息.png')button60 = Button(root, width=50, height=62, image=pic_0, command=shuoming)#, anchor='w'button60.grid(row=6, column=1, columnspan=1, sticky=W, padx=20)#, sticky=NW, padx=30pic_1 = PhotoImage(file='pic/1.png')button6 = Button(root, width=50, text='开始查询', height=62, state='disabled', image=pic_1, command=search) # 'normal' btn1['state'] = 'disabled'button6.grid(row=6, column=6, sticky=W, padx=20)pic_2 = PhotoImage(file='pic/91.png')button61 = Button(root, width=50, text='打开文件', height=62, state='disabled', image=pic_2, command=openfile)button61.grid(row=6, column=7, sticky=W, padx=20)pic_3 = PhotoImage(file='pic/8.png')button62 = Button(root, width=50, text='打开路径', height=62, state='disabled', image=pic_3, command=opendir)button62.grid(row=6, column=8, sticky=W, padx=20)# 第7行ft1 = tkFont.Font(family='Fixdsys', size=10)label7= Label(root, text='使用说明', width=8, font=ft1)#, anchor='w'label7.grid(row=7, column=1, columnspan=2, sticky=N)label71 = Label(root, text='开始查询', width=8, font=ft1)label71.grid(row=7, column=6, sticky=N)label72 = Label(root, text='打开文件', width=8, font=ft1)label72.grid(row=7, column=7, sticky=N)label73 = Label(root, text='打开文件夹', width=8, font=ft1)label73.grid(row=7, column=8, sticky=N)#, sticky=S# 第8行ft2 = tkFont.Font(family='Fixdsys', size=8)label8 = Label(root, text='------版本: bmy-001-----', width=20, font=ft2)label8.grid(row=8, column=7, columnspan=2)#, sticky=S
通过上面的代码,界面就绘制完成了。我们需要对在界面上进行的数据操作结果保存下来,作为后续爬虫及文件保存操作的依据。共需传递7个数据,首先定义一个空字典准备用来装数据。
data = { 'city': '', 'type': '', 'begintime': '', 'endtime': '', 'minprice': '', 'maxprice': '', 'dir': ''}
二、通过界面逐个赋值:
1、city,下拉框选择要查询的城市,
def cit(event): global data text = city[cl1.get()] label11.config(text=text[3]) data['city'] = city[cl1.get()] set_state()
第一步绘制界面时,绘制城市选择下拉框时,给其设置了当选择数据后,触发cit函数块。
data['city'] = city[cl1.get()]:将下拉框中选定的城市名写入到data字典里的city项下。此时,若我们选择了杭州,则data的结果就变成:
data = { 'city': '杭州', 'type': '', 'begintime': '', 'endtime': '', 'minprice': '', 'maxprice': '', 'dir': ''}
2、'type',多选框选择要查询的是施工、还是设计、还是监理
def type(): msg = [] if h1.get() == True: msg.append("施工") if h2.get() == True: msg.append("设计") if h3.get() == True: msg.append("监理") label22.config(text = msg) data['type'] = msg set_state()
绘制界面时,也设置了选中复选框后触发的函数
将当前选择结果及时在后面显示出来的代码:
label22.config(text = msg) # abe
3、起止日期设置方法同1
4、'minprice': '', 'maxprice': '',通过输入框entry设置最小金额和最大金额,此时,我们需要控制输入框只能输入数字:
def minpri(var51): try: edit51.get() == '' or float(edit51.get()) # 获取输入框的值,转为浮点数,如果不能转,责捕获异常 data['minprice'] = edit51.get() # 如果输入的是数值,则写入data字典minprice键下 set_state() except: edit51.delete(len(edit51.get())-1) messagebox.showwarning('警告', '请输入数字')def maxpri(var52): try: edit52.get() == '' or float(edit52.get()) # 同上 data['maxprice'] = edit52.get() set_state() except: edit52.delete(len(edit52.get())-1) messagebox.showwarning('警告', '请输入数字')
5、dir:通过按钮点击触发函数,打开选择文件对话框,设置文件保存路径。
def setpath(): path = './' fname = filedialog.asksaveasfilename(title=u'选择文件', filetypes=[("excel07", ".xlsx")], initialdir=(os.path.expanduser(path))) if fname != '': if not fname[-5:] == '.xlsx': fname = fname + '.xlsx' label53.config(text=fname)# 将对话框设置的路径更新在后面的label53里 data['dir'] = label53['text'] # 将标签label53的内容写入data字典的dir键下 set_state()
6、按钮状态控制
def set_state(): bl = 1 for i in data: if data[i] == '' or data[i] == []: # 逐个判断data字典里键下的值,当全部都有值后,按钮状态设置为正常 bl = 0 break if bl == 1: button6['state'] = 'normal' else: button6['state'] = 'disabled'
7处红线位置均有数据,开始查询按钮变为可点击状态
计算出结果后,文件和文件夹按钮变为可点击状态
三、开始查询代码:
def search(): if datetime.strptime(data['endtime'], '%Y-%m-%d') < datetime.strptime(data['begintime'], '%Y-%m-%d'): messagebox.showwarning('警告', '起止日期不合理') else: info = zbw.getinfo(data) #调用爬虫代码,获得数据 if len(info) == 1: messagebox.showwarning('对不起, 没找到信息,请更换范围查找') else: tt = '小助手共为您找到{}条信息:'.format(len(info)-1) + '' for j in range(1, len(info)): if j <= 4: tt += '' + ' ' + str(info[j][0]) + '、' + info[j][1][0:10] + '...: ' + str(info[j][3]) + '' else: break result.config(text=tt, anchor='w') zbw.save(info) #调用操作excel代码,将数据保存在excel文件里 button61['state'] = 'normal' # 设置打开文件按钮状态 button62['state'] = 'normal'# 设置打开文件夹按钮状态 messagebox.showwarning('查询', tt)
用os.startfile打开帮助文件和excel文件
def openfile(): os.startfile(label53['text'])def opendir(): os.startfile(os.path.dirname(label53['text']))def shuoming(): os.startfile('使用说明.txt')
四、爬虫部分
案例将爬取数据功能和保存为excel功能封装在zbw.py文件里。
def getinfo(data): url = data['city'][0] params = data['city'][1] typename = data['type'][0] minprice = data['minprice'] maxprice = data['maxprice'] begintime = datetime.strptime(data['begintime'], '%Y-%m-%d') endtime = datetime.strptime(data['endtime'], '%Y-%m-%d') dir = data['dir'] response = requests.post(url, params=params).json() n = 1 info = [[dir,typename,data['city'][3]]] for i in range(len(response)): url1 = data['city'][2] + response[i]['SEGMENTSHOWID'] response1 = requests.get(url1).content soup = BeautifulSoup(response1, 'lxml', from_encoding='gbk') SegmentName = soup.find('span', id="SegmentName").text BuildUnitName = soup.find('span', id="BuildUnitName").text ProjectTypeName = soup.find('span', id="ProjectTypeName").text Tzze = soup.find('span', id="Tzze").text CreateTime = soup.find('span', id='CreateTime').text Enterprise = soup.find('span', id='Enterprise').text info1 = [] ct = datetime.strptime(CreateTime, '%Y-%m-%d') if typename == ProjectTypeName and float(maxprice) > float(Tzze) >= float(minprice) and endtime > ct > begintime: info1.append((n)) info1.append(SegmentName) info1.append(BuildUnitName) info1.append(float(Tzze)) info1.append(ProjectTypeName) info1.append(CreateTime) info1.append(Enterprise) info1.append(url1) info.append(info1) n += 1 return info
五、数据保存为excel
def save(info): if len(info) == 1: print('noinfo') return '' else: wb = xlsxwriter.Workbook(info[0][0]) sht = wb.add_worksheet(info[0][1]) bold = wb.add_format({'bold': 1, 'font_size':12, 'text_wrap': True, 'align': 'center', 'valign': 'vcenter'}) align = wb.add_format({'align': 'center', 'font_size':10, 'valign': 'vcenter', 'text_wrap': True}) head = ['序号', '项目名称', '建设单位', '投资总额', '类型', '日期', '资质要求', '详情', info[0][2]] sht.write_row('A1' , head, bold) sht.set_row(0, height=30) for row in range(2, len(info) + 1): sht.write_row('A'+ str(row), info[row-1], align) sht.set_row(row-1, height=40) sht.set_column(0, 0, width=6) sht.set_column(1, 2, width=20) sht.set_column(3, 5, width=10) sht.set_column(6, 7, width=11.5) chart = wb.add_chart({'type': 'column'}) chart.add_series({ 'name': [info[0][1], 0, 3], 'categories': [info[0][1], 1, 0, len(info)-1, 0], 'values': [info[0][1], 1, 3, len(info)-1, 3], 'line': {'color': 'red'} }) chart.set_title({'name': '项目投资总额比较'}) chart.set_x_axis({'name': '报建日期'}) chart.set_y_axis({'name': '投资总额'}) chart.set_style(1) sht.insert_chart('I2', chart, {'x_offset':5, 'y_offset':5}) wb.close()
保存的excel文件内容为:
点击excel表中详情列网址,能直接进入下图的网站详细页面
六、后语
本文案例是本人编写的第一个python程序,敬请留言交流。