1.解决中文乱码的问题

(1)先判断页面数据是否为动态加载

(2)获取源码数据

彼岸图网:

第一页地址:http://pic.netbian.com/4kmeinv/

第二页:http://pic.netbian.com/4kmeinv/index_2.html

第三页:http://pic.netbian.com/4kmeinv/index_3.html

#第一步:我们写的下面的代码有bug,返回的中文有乱码的问题

import requests
from lxml import etree
# Step 1 demo: fetch the listing pages and print each image's name and src.
# NOTE: response.text is used as-is here, so the Chinese names come out
# garbled -- that is the problem this step is meant to show.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
start_page = int(input('start page num:'))
end_page = int(input('end page num:'))
# Generic URL template for page 2 onwards (never reassign the template
# itself; fill it in per page instead).
url = 'http://pic.netbian.com/4kmeinv/index_%d.html'
# end_page + 1 so that the requested end page is included.
for page in range(start_page, end_page + 1):
    # Page 1 is served from the bare directory URL, not from index_1.html.
    if page == 1:
        new_url = 'http://pic.netbian.com/4kmeinv/'
    else:
        new_url = url % page
    page_text = requests.get(url=new_url, headers=headers).text
    # Parse the name (alt) and the src attribute of every image.
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')
    for li in li_list:
        img_name = li.xpath('./a/img/@alt')[0]
        img_src = li.xpath('./a/img/@src')[0]
        print(img_name, img_src)

#第二步:手动把响应编码设为 utf-8,输出结果会有变化,但中文仍然显示不正确

import requests
from lxml import etree
# Step 2 demo: same crawl as before, but the response is forced to decode
# as UTF-8. Per the tutorial the output changes but is still not correct.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
start_page = int(input('start page num:'))
end_page = int(input('end page num:'))
# Generic URL template for page 2 onwards (never reassign the template
# itself; fill it in per page instead).
url = 'http://pic.netbian.com/4kmeinv/index_%d.html'
# end_page + 1 so that the requested end page is included.
for page in range(start_page, end_page + 1):
    # Page 1 is served from the bare directory URL, not from index_1.html.
    if page == 1:
        new_url = 'http://pic.netbian.com/4kmeinv/'
    else:
        new_url = url % page
    response = requests.get(url=new_url, headers=headers)
    # Override the encoding used to build response.text.
    response.encoding = 'utf-8'
    page_text = response.text
    # Parse the name (alt) and the src attribute of every image.
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')
    for li in li_list:
        img_name = li.xpath('./a/img/@alt')[0]
        img_src = li.xpath('./a/img/@src')[0]
        print(img_name, img_src)

第三步:我们进一步升级——不再修改整个响应的编码,只对出现乱码的字段先按 iso-8859-1 编码还原,再按 gbk 解码

import requests
from lxml import etree
# Step 3 demo: leave the response encoding alone and instead repair only the
# garbled field (the image name) after it has been extracted.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
start_page = int(input('start page num:'))
end_page = int(input('end page num:'))
# Generic URL template for page 2 onwards (never reassign the template
# itself; fill it in per page instead).
url = 'http://pic.netbian.com/4kmeinv/index_%d.html'
# end_page + 1 so that the requested end page is included.
for page in range(start_page, end_page + 1):
    # Page 1 is served from the bare directory URL, not from index_1.html.
    if page == 1:
        new_url = 'http://pic.netbian.com/4kmeinv/'
    else:
        new_url = url % page
    response = requests.get(url=new_url, headers=headers)
    # response.encoding is deliberately NOT set to 'utf-8' here; the name is
    # re-decoded per field below instead.
    page_text = response.text
    # Parse the name (alt) and the src attribute of every image.
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')
    for li in li_list:
        img_name = li.xpath('./a/img/@alt')[0]
        # Turn the text back into bytes via ISO-8859-1, then decode those
        # bytes as GBK. Assumes response.text was decoded as ISO-8859-1 --
        # TODO confirm against response.encoding.
        img_name = img_name.encode('iso-8859-1').decode('gbk')
        img_src = li.xpath('./a/img/@src')[0]
        print(img_name, img_src)

第四步:进一步升级——把图片下载下来并保存到本地文件夹

 

import requests
from urllib import request
from lxml import etree
import os
# Step 4: crawl the requested listing pages and download every image into
# ./meinvs, using the (re-decoded) alt text as the file name.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
start_page = int(input('start page num:'))
end_page = int(input('end page num:'))

# Create the output folder; exist_ok avoids the check-then-create race.
os.makedirs('./meinvs', exist_ok=True)
# Generic URL template for page 2 onwards (never reassign the template
# itself). Check the site's real page-number pattern before changing it.
url = 'http://pic.netbian.com/4kmeinv/index_%d.html'
# end_page + 1 so that the requested end page is included.
for page in range(start_page, end_page + 1):
    # Page 1 is served from the bare directory URL, not from index_1.html.
    if page == 1:
        new_url = 'http://pic.netbian.com/4kmeinv/'
    else:
        new_url = url % page
    response = requests.get(url=new_url, headers=headers)
    # response.encoding is deliberately left untouched; the name is
    # re-decoded per field below instead.
    page_text = response.text
    # Parse the name (alt) and the src attribute of every image.
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')
    for li in li_list:
        alt_values = li.xpath('./a/img/@alt')
        src_values = li.xpath('./a/img/@src')
        # Skip list items without an <a><img> child instead of aborting the
        # whole download with an IndexError.
        if not alt_values or not src_values:
            continue
        # Turn the text back into bytes via ISO-8859-1, then decode those
        # bytes as GBK. Assumes response.text was decoded as ISO-8859-1 --
        # TODO confirm against response.encoding.
        img_name = alt_values[0].encode('iso-8859-1').decode('gbk') + '.jpg'
        # The src attribute is site-relative, so prefix the host.
        img_src = 'http://pic.netbian.com' + src_values[0]
        # Mind the path join: folder prefix + file name.
        img_path = './meinvs/' + img_name
        request.urlretrieve(img_src, img_path)
        print(img_name, '下载成功!!!')

2.XPATH的另一种用法

爬取全国城市名称

url = 'https://www.aqistudy.cn/historydata/'

 

import requests
from lxml import etree
# Fetch the historical-data index page and collect every city name on it.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
url = 'https://www.aqistudy.cn/historydata/'
page_text = requests.get(url=url, headers=headers).text

tree = etree.HTML(page_text)
# Individual queries, for reference (each returns a list of strings):
#   hot cities: tree.xpath('//div[@class="bottom"]/ul/li/a/text()')
#   all cities: tree.xpath('//div[@class="bottom"]/ul/div[2]/li/a/text()')

# Grab both sets in a single query. Inside an XPath expression "|" is the
# union operator (it is not a bitwise OR).
city_names = tree.xpath('//div[@class="bottom"]/ul/div[2]/li/a/text() | //div[@class="bottom"]/ul/li/a/text()')
# Print explicitly so the result is visible when run as a plain script, not
# only when evaluated as the last expression of a notebook cell.
print(city_names)

站长素材里边的"简历模板"

http://sc.chinaz.com/jianli/

(此处原为配图:小爬爬2:中文乱码等问题处理_chrome)

下载地址可以换着用,解析的时候用每个地址.

 

(此处原为配图:小爬爬2:中文乱码等问题处理_.net_02)