import requests
from lxml import etree
import re
# Baidu image-search JSON endpoint.  The query string is no longer baked into
# the URL: it is built from ``data`` via ``params=`` so that the two cannot
# drift apart (the old hard-coded query had a mangled ``©right=`` where
# ``&copyright=`` was meant, and a ``pn`` that disagreed with ``data``).
url = 'https://image.baidu.com/search/acjson'

# Request headers copied from a browser session.
# NOTE(review): the Cookie is a captured session value and will presumably
# expire -- refresh it if the endpoint starts rejecting requests.
headers = {
    'Cookie': 'winWH=%5E6_1920x963; BDIMGISLOGIN=0; BDqhfp=%E8%A1%A8%E6%83%85%E5%8C%85%26%26NaN-1undefined%26%268772%26%2614; BIDUPSID=47D1A97F74FE4D84D9C060A7E9D9623C; PSTM=1688450494; BAIDUID=64354928A148308F322D02D378FB19A4:FG=1; BAIDUID_BFESS=64354928A148308F322D02D378FB19A4:FG=1; ZFY=r0Ch4DZ4vzKkjKsCTr20yTyvBoJZR:BJjX3:AbIpxAvCs:C; BA_HECTOR=05812la52la52l80008505891ieo2c31p; PSINO=1; H_PS_PSSID=36548_39226_39223_39193_39199_39240_39233_26350_39238_39138_39224_39137_22157_39100; delPer=0; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[tox4WRQ4-Km]=mk3SLVN4HKm; BDRCVFR[A24tJn4Wkd_]=mk3SLVN4HKm',
    'Host': 'image.baidu.com',
    'Referer': 'https://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gb18030&word=%B1%ED%C7%E9%B0%FC&fr=ala&ala=1&alatpl=normal&pos=0&',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
}

# Query-string parameters for the search request.
data = {
    'tn': 'resultjson_com',
    'logid': '8700291432374701138',
    'ipn': 'rj',
    'ct': '201326592',
    'is': '',
    'fp': 'result',
    'fr': 'ala',
    'word': '表情包',
    'queryWord': '表情包',
    'cl': '2',
    'lm': '-1',
    'ie': 'utf-8',
    'oe': 'utf-8',
    'adpicid': '',
    'st': '',
    'z': '',
    'ic': '',
    'hd': '',
    'latest': '',
    'copyright': '',
    's': '',
    'se': '',
    'tab': '',
    'width': '',
    'height': '',
    'face': '',
    'istype': '',
    'qc': '',
    'nc': '',
    'expermode': '',
    'nojc': '',
    'isAsync': '',
    # The old URL carried pn=390 while this dict said 330.  Because the dict
    # used to be sent as a GET *body*, the URL value is what the request
    # actually asked for, so it is kept here.
    # NOTE(review): confirm which offset is really wanted.
    'pn': '390',
    'rn': '30',
    'gsm': '186',
}

# Characters removed from page titles before they are printed.
TITLE_JUNK = re.compile(r'[/\*?<>|\n_]')

# Splits a URL into alternating separator / word tokens (the capturing group
# makes ``split`` keep the words as well).
URL_TOKENS = re.compile(r'(\w+)')


def clean_title(title: str) -> str:
    """Return *title* with every character matched by ``TITLE_JUNK`` removed.

    The original call omitted the replacement argument of ``re.sub`` and so
    always raised ``TypeError``; the empty string is used as the replacement.
    """
    return TITLE_JUNK.sub('', title)


def image_format(middle_url: str) -> str:
    """Return the tenth token from the end of *middle_url*.

    The URL is split into word and separator tokens and the token at index
    ``-10`` is returned.
    NOTE(review): for thumbnail URLs ending in ``...&f=JPEG?w=500&h=500`` this
    is the ``f=`` value; other URL shapes have not been verified.

    Raises:
        IndexError: if the URL yields fewer than ten tokens.
    """
    return URL_TOKENS.split(middle_url)[-10]


def parse_item(item: dict):
    """Extract ``(format_token, cleaned_title)`` from one result entry.

    Returns ``None`` when the entry lacks ``fromPageTitle`` / ``middleURL``,
    when either value is not a string, or when the URL is too short -- these
    are the cases the original loop skipped silently.
    """
    try:
        title = clean_title(item['fromPageTitle'])
        token = image_format(item['middleURL'])
    except (KeyError, TypeError, IndexError):
        return None
    return token, title


def main() -> None:
    """Fetch one page of search results and print a line per usable entry."""
    # ``params=`` puts the dict in the query string; ``data=`` on a GET would
    # send it as a request body instead.
    resp = requests.get(url, headers=headers, params=data, timeout=10)
    resp.raise_for_status()
    resp_json = resp.json()
    for resp_url in resp_json['data']:
        parsed = parse_item(resp_url)
        if parsed is None:
            continue
        token, title = parsed
        print(token, title)


if __name__ == '__main__':
    main()
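# NOTE: The Baidu sticker/image search page looks like plain HTML at first, but capturing the network traffic shows that the result data is actually served as JSON.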