一、爬取湛江天气预报信息
url:http://www.weather.com.cn/weather/10128100101A.shtml
1.思路分析:
①先爬取网页所有数据,div/ul/li,获得7天天气预报的所有信息;
②li下的所有数据进行提取数据;
③保存文件。 --文件操作、json模块。
import requests
import lxml.html
import json
def parse_url(url, header, timeout=10):
    """Fetch *url* and return the page HTML as a str.

    Decodes response.content explicitly as UTF-8 instead of using
    response.text, because requests can mis-guess this site's encoding
    and produce mojibake.

    :param url: page URL to fetch
    :param header: dict of HTTP headers (must include a User-Agent)
    :param timeout: seconds before the request is aborted (new,
        backward-compatible; prevents hanging forever on a dead server)
    :return: decoded HTML text
    """
    response = requests.get(url, headers=header, timeout=timeout)
    # Decode the raw bytes ourselves to avoid garbled characters.
    return response.content.decode("utf-8")
def get_weather_datas(html_content):
    """Extract the 7-day forecast from the weather.com.cn page HTML.

    Returns a list of dicts, one per day, keeping the original keys so
    the saved JSON stays compatible: "date", "天气" (weather),
    "最低温度" (low temp), "最高温度" (high temp).
    """
    etree = lxml.html.etree
    # Build an XPath-capable tree; HTMLParser auto-repairs broken markup.
    parser = etree.HTML(html_content, etree.HTMLParser())
    # One <li> per forecast day (normally 7 of them).
    li_list = parser.xpath("//div[@class='c7d']/ul[@class='t clearfix']/li")

    def _first(node, path, default=""):
        # Safely take the first XPath match. Some fields can be absent —
        # e.g. today's <li> drops the high-temperature <span> after
        # nightfall — and bare [0] indexing would raise IndexError.
        matches = node.xpath(path)
        return matches[0] if matches else default

    data = []
    for ele in li_list:
        item = {
            "date": _first(ele, "./h1/text()"),
            "天气": _first(ele, "./p[@class='wea']/text()"),
            "最低温度": _first(ele, "./p[@class='tem']/i/text()"),
            "最高温度": _first(ele, "./p[@class='tem']/span/text()"),
        }
        data.append(item)
    return data
def save_weather_file(datas):
    """Serialize *datas* to ./file/weather.json as pretty-printed JSON.

    ensure_ascii=False keeps the Chinese text readable in the file
    instead of \\uXXXX escapes.

    :param datas: list of per-day forecast dicts
    """
    import os
    # Ensure the target directory exists instead of crashing with
    # FileNotFoundError on a fresh checkout.
    os.makedirs("./file", exist_ok=True)
    json_strs = json.dumps(datas, ensure_ascii=False, indent=2)
    with open("./file/weather.json", "w", encoding="utf-8") as files:
        files.write(json_strs)
    print("数据保存成功!")
def main():
    """Entry point: fetch, parse, and persist the 7-day forecast."""
    # Target page plus a browser-like User-Agent so the request
    # is not rejected as an obvious bot.
    http_url = "http://www.weather.com.cn/weather/10128100101A.shtml"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
    }
    # Step 1: download the raw HTML of the forecast page.
    html_data = parse_url(http_url, headers)
    # Step 2: extract the per-day forecast records (list of dicts).
    weather_datas = get_weather_datas(html_data)
    # Step 3: write the records out as JSON.
    save_weather_file(weather_datas)


# Standard Python script entry point, conventionally at the end of the file.
if __name__ == '__main__':
    main()
2.爬取的数据:
二 、爬取美女网站信息
url:http://www.xiaohuar.com/list-1-1.html
1.思路分析
①获得所有美女图片的div列表;
②在当前美女的div中获得所有信息;
③保存数据。
import requests
import lxml.html
import json
def parse_url(url, header, timeout=10):
    """Fetch *url* and return the page HTML as a str.

    This site serves GBK-encoded pages, so the raw bytes are decoded
    explicitly rather than trusting requests' encoding guess.

    :param url: page URL to fetch
    :param header: dict of HTTP headers (must include a User-Agent)
    :param timeout: seconds before the request is aborted (new,
        backward-compatible; prevents hanging forever on a dead server)
    :return: decoded HTML text
    """
    response = requests.get(url, headers=header, timeout=timeout)
    return response.content.decode("gbk")
def get_xiaohua_datas(html_content):
    """Extract title/name/school/like-count records from the listing page.

    Returns a list of dicts, one per card on the page.
    """
    etree = lxml.html.etree
    # Build an auto-repairing HTML tree so XPath queries work on
    # imperfect markup.
    tree = etree.HTML(html_content, etree.HTMLParser())
    # Each card on the page is one child <div> of the infinite-scroll list.
    cards = tree.xpath("//div[@class='item_list infinite_scroll']/div")
    # Common prefix shared by the image-area fields of every card.
    img_base = "./div[@class='item_t']/div[@class='img']"
    records = []
    for card in cards:
        records.append({
            "title": card.xpath(img_base + "/a/img/@alt")[0],
            "name": card.xpath(img_base + "/span/text()")[0],
            "school": card.xpath(img_base + "/div[@class='btns']/a/text()")[0],
            "like_count": card.xpath(
                "./div[@class='item_b clearfix']/div[@class='items_likes fl']/em/text()"
            )[0],
        })
    return records
def save_xiaohua_file(datas):
    """Serialize *datas* to ./file/xiaohua.json as pretty-printed JSON.

    ensure_ascii=False keeps the Chinese text readable in the file
    instead of \\uXXXX escapes.

    :param datas: list of record dicts
    """
    import os
    # Ensure the target directory exists instead of crashing with
    # FileNotFoundError on a fresh checkout.
    os.makedirs("./file", exist_ok=True)
    json_strs = json.dumps(datas, ensure_ascii=False, indent=2)
    with open("./file/xiaohua.json", "w", encoding="utf-8") as files:
        files.write(json_strs)
    print("数据保存成功!")
def main():
    """Entry point: fetch the listing page, parse it, and save the data."""
    xiaohua_url = "http://www.xiaohuar.com/list-1-1.html"
    # Browser-like User-Agent so the request is not rejected as a bot.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
    }
    # Download, then extract one record per card on the page.
    html_data = parse_url(xiaohua_url, headers)
    xiaohua_datas = get_xiaohua_datas(html_data)
    # Persist the extracted records as JSON.
    save_xiaohua_file(xiaohua_datas)


if __name__ == '__main__':
    main()
2.爬取的数据
[
{
"title": "大连国际舞蹈学校校花王钰萌",
"name": "王钰萌",
"school": "大连国际舞蹈学校",
"like_count": "159"
},
{
"title": "南昌大学校花曾阳",
"name": "曾阳",
"school": "南昌大学",
"like_count": "220"
},
{
"title": "中国民航大学校花张金玉",
"name": "张金玉",
"school": "中国民航大学",
"like_count": "109"
},
{
"title": "天津财经大学校花卓娅祺",
"name": "卓娅祺",
"school": "天津财经大学",
"like_count": "361"
},
{
"title": "新疆农业大学校花麦合丽娅",
"name": "麦合丽娅",
"school": "新疆农业大学",
"like_count": "53"
},
{
"title": "成都职业技术学院校花杨萍",
"name": "杨萍",
"school": "成都职业技术学院",
"like_count": "108"
},
{
"title": "东北师范大学校花尹思凝",
"name": "尹思凝",
"school": "东北师范大学",
"like_count": "109"
},
{
"title": "北京理工大学珠海学院校花韦若愚",
"name": "韦若愚",
"school": "北京理工大学珠海学院",
"like_count": "122"
},
{
"title": "厦门理工学院校花袁慧",
"name": "袁慧",
"school": "厦门理工学院",
"like_count": "78"
},
{
"title": "湖北艺术学院校花王媛茜",
"name": "王媛茜",
"school": "湖北艺术学院",
"like_count": "96"
},
{
"title": "文光中心校花陈里佳",
"name": "陈里佳",
"school": "文光中心",
"like_count": "48"
},
{
"title": "大连外国语大学校花高梦馨",
"name": "高梦馨",
"school": "大连外国语大学",
"like_count": "115"
},
{
"title": "舟山技师学院校花宋世杰",
"name": "宋世杰",
"school": "舟山技师学院",
"like_count": "99"
},
{
"title": "上海财经大学校花徐逸岑",
"name": "徐逸岑",
"school": "上海财经大学",
"like_count": "123"
},
{
"title": "武汉大学校花丁婷婷",
"name": "丁婷婷",
"school": "武汉大学",
"like_count": "121"
},
{
"title": "行健学院校花徐艳琛",
"name": "徐艳琛",
"school": "行健学院",
"like_count": "149"
},
{
"title": "上海交通大学校花唐雨乔",
"name": "唐雨乔",
"school": "上海交通大学",
"like_count": "105"
},
{
"title": "温州大学校花汤以斯贴",
"name": "汤以斯贴",
"school": "温州大学",
"like_count": "289"
},
{
"title": "华东大学校花赵梦洁",
"name": "赵梦洁",
"school": "华东大学",
"like_count": "604"
},
{
"title": "鄞州职业高级中学校花翁川美",
"name": "翁川美",
"school": "鄞州职业高级中学",
"like_count": "109"
},
{
"title": "中央戏剧学院校花刘垚昕",
"name": "刘垚昕",
"school": "中央戏剧学院",
"like_count": "585"
},
{
"title": "星源初中校花廖炯炅",
"name": "廖炯炅",
"school": "星源初中",
"like_count": "99"
},
{
"title": "广州华夏职业学院校花邓杏琳",
"name": "邓杏琳",
"school": "广州华夏职业学院",
"like_count": "97"
},
{
"title": "芷江师范校花滕之雅",
"name": "滕之雅",
"school": "芷江师范",
"like_count": "208"
},
{
"title": "铁岭师范校花施玉",
"name": "施玉",
"school": "铁岭师范",
"like_count": "186"
}
]
三、总结
爬取数据的基本步骤(以案例二为例)
1.写入对应的url
2.请求头,解析url地址,获取所有网页数据信息:
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"}
html_data = parse_url(xiaohua_url,headers)
3.获取数据后可以获得所有美女图片的div列表
xiaohua_datas = get_xiaohua_datas(html_data)
4.根据需要的内容来保存数据
save_xiaohua_file(xiaohua_datas)