第三周

转载

mb5fdcad0be2e90 2021-08-02 16:46:00

文章标签 html sql json 用户名 mysql 文章分类 MySQL 数据库

第三周学习内容

1、爬取薄荷网的食物热量数据，并将其写入数据库，供大创 app 的热量计算使用；

youmian.py

第三周_html 第三周_用户名_02

from youGet import youGet

if __name__ == '__main__':
    # Step 1: download the first listing page, then parse it (and the
    # following pages) so that ./data5.json is written.
    data = youGet()
    data.get_date()
    data.parse_date()

    # Step 2: load the scraped records into MySQL.
    # The import is deliberately placed after the scrape because youdaosql
    # reads ./data5.json as soon as it is imported.  Keeping it under the
    # main guard also prevents the database load from running when this
    # module is merely imported.
    import youdaosql
    youdaosql.du_sql()

View Code

youGet.py

要爬取所有食物，只需更改 url 以及每个 next 链接中的页码数字即可；

第三周_html 第三周_用户名_02

import json

import lxml.html
import requests

etree = lxml.html.etree
import time
from requests.adapters import HTTPAdapter

# Module-level accumulator: every parsed food record is appended here.
heatss = []


class youGet():
    """Scraper for the boohee.com food menu listing.

    ``get_date`` downloads the first listing page into ``htmlyou.txt``.
    ``parse_date`` parses that file, fetches the remaining pages and dumps
    all collected records to ``./data5.json``.  Records are dicts with the
    keys ``name``, ``href`` and ``heat`` and are accumulated in the
    module-level ``heatss`` list.
    """

    _MENU_URL = "http://www.boohee.com/food/view_menu"
    _HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/80.0.3987.149 Safari/537.36 '
    }

    def get_date(self):
        """Download the first listing page and save its HTML to ``htmlyou.txt``."""
        response = requests.get(self._MENU_URL, headers=self._HEADERS)
        try:
            with open('htmlyou.txt', 'w', encoding="utf-8") as file:
                file.write(response.text)
        finally:
            # Release the connection even if writing the file fails.
            response.close()

    @staticmethod
    def _collect_foods(tree):
        """Append one record per food item in *tree* to ``heatss``.

        Returns the number of records appended, so callers can tell an
        empty page from a populated one.
        """
        names = tree.xpath('//ul[@class="food-list"]/li/div/h4/a/@title')
        hrefs = tree.xpath('//ul[@class="food-list"]/li/div/h4/a/@href')
        heats = tree.xpath('//ul[@class="food-list"]/li/div/p/text()')
        before = len(heatss)
        # zip stops at the shortest list instead of raising IndexError when
        # the three xpath results differ in length.
        heatss.extend(
            {"name": name, "href": href, "heat": heat}
            for name, href, heat in zip(names, hrefs, heats)
        )
        return len(heatss) - before

    @staticmethod
    def _dump_records(path):
        """Write the accumulated ``heatss`` records to *path* as JSON."""
        with open(path, 'w', encoding='utf-8') as file:
            file.write(json.dumps(heatss, sort_keys=True, indent=2))

    def parse_date(self):
        """Parse ``htmlyou.txt``, crawl the remaining pages and save the result.

        The page count is read from the ``limit_page`` attribute of the
        pagination div.  Pages 2..limit_page are fetched one by one.  On
        success the records are written to ``./data5.json``.

        If a connection error interrupts the crawl, the records collected so
        far are written to ``./data4.json`` and their hrefs to
        ``./htmlhref1.txt`` instead.

        Raises:
            IndexError: the saved page has no pagination ``limit_page``.
            ValueError: ``limit_page`` is not an integer.
        """
        with open('htmlyou.txt', 'r', encoding='utf-8') as file:
            text = file.read()
        tree = etree.HTML(text)

        limit = tree.xpath('//div[@class="pagination"]/@limit_page')
        if not limit:
            raise IndexError(
                'no limit_page attribute found on div.pagination in htmlyou.txt'
            )
        # xpath returns the attribute as a string; convert it once so the
        # comparisons below are numeric (a str never equals the int 0).
        total_pages = int(limit[0])

        try:
            if total_pages != 0:
                # Only walk the following pages when the first one has data.
                if self._collect_foods(tree):
                    # range() is empty when total_pages < 2, so the loop
                    # always terminates and requests each page exactly once.
                    for page_no in range(2, total_pages + 1):
                        page_url = self._MENU_URL + "?page=" + str(page_no)
                        response = requests.get(page_url, headers=self._HEADERS)
                        try:
                            self._collect_foods(etree.HTML(response.text))
                        finally:
                            response.close()
                self._dump_records('./data5.json')
        except requests.exceptions.ConnectionError:
            # Keep whatever was collected before the connection dropped.
            self._dump_records('./data4.json')
            # NOTE(review): the original passed each record dict straight to
            # write(), which raises TypeError.  Writing one href per line is
            # assumed from the file name -- confirm the intended content.
            with open('./htmlhref1.txt', 'w', encoding="utf-8") as file:
                file.writelines(str(record["href"]) + "\n" for record in heatss)

View Code

youdaosql.py

第三周_html 第三周_用户名_02

import mysql.connector
import json
# Load the scraped records once, at import time, into the module-level
# ``data`` list used by savaDataSql().  The file is opened as UTF-8 to match
# the encoding the scraper uses when writing ./data5.json, instead of relying
# on the platform's default encoding.
with open('./data5.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

def du_sql():
    """Insert the loaded records into the ``test1`` MySQL database.

    Opens a connection, delegates the inserts to ``savaDataSql``, commits,
    and prints ``ok`` once the commit has succeeded.  The cursor and the
    connection are always closed.  If an insert or the commit fails, the
    transaction is rolled back and the exception is re-raised.
    """
    # NOTE(review): connection settings and credentials are hard-coded here;
    # consider moving them to configuration or environment variables.
    mydb = mysql.connector.connect(
        host="localhost",
        user="root",
        password="password",
        database="test1",
        auth_plugin="mysql_native_password"
    )
    try:
        dbpath = mydb.cursor()
        try:
            savaDataSql(dbpath)
            mydb.commit()
        except Exception:
            # Do not leave a half-applied batch behind.
            mydb.rollback()
            raise
        finally:
            dbpath.close()
    finally:
        mydb.close()
    # Report success only after the data has actually been committed.
    print("ok")

def savaDataSql(dbpath):
    """Insert every record of the module-level ``data`` list into ``heats``.

    Args:
        dbpath: a database cursor; its ``execute`` method is called once per
            record with a parameterized INSERT statement.

    Each record must provide the keys ``name``, ``href`` and ``heat``.
    Committing is left to the caller.
    """
    insert_stmt = "INSERT INTO heats (name,href,heat) values (%s,%s,%s)"
    for record in data:
        row = (record['name'], record['href'], record['heat'])
        dbpath.execute(insert_stmt, row)