你好,我是悦创。
课程源码:
课程源码修改成了多线程
"""
project = 'Code', file_name = 'spider', author = 'AI悦创'
time = '2020/4/11 11:01', product_name = PyCharm
# code is far away from bugs with the god animal protecting
I love animals. They taste delicious.
"""
import time
import httpx
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import threading
# Default HTTP request headers; ShiXiSeng.__init__ stores this dict on each
# instance and requests_func sends it with every GET.
# NOTE(review): the cookie below looks like session state captured from a
# browser (session ids, analytics ids) — presumably it expires and should be
# loaded from configuration rather than committed to source; confirm.
headers ={
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36',
'cookie': '__jsluid_s=cd83b4d69f4ab44cb06be1accb4cc1f0; gr_user_id=0d038710-8b3c-458e-b24e-4dfddf81678d; SXS_XSESSION_ID="2|1:0|10:1586252279|15:SXS_XSESSION_ID|48:NjQwNTlmYzgtN2UwOC00MjVhLTkwYjgtMmVkZTg3ZThmOTlm|177fd855591d1cb18419b934149eac85a5dc019bc405e5766dbcc607d7a76b87"; SXS_XSESSION_ID_EXP="2|1:0|10:1586252279|19:SXS_XSESSION_ID_EXP|16:MTU4NjMzODY3OQ==|f0c98dcc2df131c89b89fcf82003600d2f6d3c5b9bb413d773d5b0ca1af70781"; uuid=557984a3-f1eb-d32a-bbe5-d1a04d15ed79; search=; Hm_lvt_03465902f492a43ee3eb3543d81eba55=1584630605,1586176927,1586252282,1586259541; MEIQIA_TRACK_ID=1Xz59NchxnvhxGfnzWD8u6imsRB; MEIQIA_VISIT_ID=1aNRrWmHJjsaKO5Id2Rk04j1icd; uid1=e7aa7c2d-a147-9dd4-a247-3044c2794179; uid2=97397d2a-5cfa-0894-ad45-85509228befd; Hm_lpvt_03465902f492a43ee3eb3543d81eba55=1586575274; gr_session_id_96145fbb44e87b47=15495963-3f3c-41d0-aeb0-c81e3bce662b; gr_cs1_15495963-3f3c-41d0-aeb0-c81e3bce662b=user_id%3Anull; gr_session_id_96145fbb44e87b47_15495963-3f3c-41d0-aeb0-c81e3bce662b=true; SXS_VISIT_XSESSION_ID_V3.0="2|1:0|10:1586577044|26:SXS_VISIT_XSESSION_ID_V3.0|48:OTc5YjFmM2MtN2U2OS00MTUxLWE5YzYtOTdlNjFlNWQ0ZDhl|ce71895297d0f00342e9d918c44f42c8b26e661c2d4b34ef8f22df843e8b968f"; SXS_VISIT_XSESSION_ID_V3.0_EXP="2|1:0|10:1586577044|30:SXS_VISIT_XSESSION_ID_V3.0_EXP|16:MTU4OTE2OTA0NA==|d2f94075301f75c91a955a0c7bee0ee0bc7e8f734c01d7bd95d228d4ebb83136"'
}
class ShiXiSeng(object):
    """Scrape internship listing pages from shixiseng.com and print each offer.

    Every page is fetched with ``httpx`` using the module-level ``headers``,
    parsed with BeautifulSoup, and each offer's title and decoded salary are
    printed to stdout. ``main`` fans the pages out over one thread per page.
    """

    # Private-use code points (as UTF-8 bytes) found in scraped salary text,
    # paired with the ASCII digit each one is replaced by. Only the glyphs for
    # 0, 2, 3 and 5 are mapped; anything else passes through unchanged.
    # NOTE(review): presumably these come from the site's custom digit font
    # and may change over time — confirm against a live page.
    _GLYPH_DIGITS = (
        (b"\xee\xa9\xb0", b"0"),
        (b"\xee\x90\xa4", b"2"),
        (b"\xee\x94\xaa", b"3"),
        (b"\xef\xa3\xaf", b"5"),
    )

    # Listing URL with a single ``{}`` placeholder for the page number.
    _URL_TEMPLATE = (
        'https://www.shixiseng.com/interns?'
        'page={}&keyword=Java&type=intern&area='
        '&months=&days=&degree=&official=&enterprise=&salary=-0&'
        'publishTime=&sortType=&city=%E8%8E%86%E7%94%B0&internExtend='
    )

    def __init__(self):
        """Store the module-level request headers on the instance."""
        self.headers = headers

    def requests_func(self, url):
        """GET *url* and return the response, or ``None`` on any failure.

        Returns the httpx response only when the status code is 200. A
        non-200 status or an httpx request/transport error yields ``None``.
        """
        try:
            result = httpx.get(url, headers=self.headers)
        except httpx.HTTPError:
            # The request is made with httpx, so httpx's own exception
            # hierarchy is what must be caught here.
            return None
        if result.status_code == 200:
            return result
        return None

    def salary_encode(self, salary):
        """Return *salary* with the known digit glyphs replaced by ASCII digits.

        The replacement is done on the UTF-8 bytes of the text; characters
        that are not in the glyph table are left as they are.
        """
        encoded = salary.encode("utf8")
        for glyph, digit in self._GLYPH_DIGITS:
            encoded = encoded.replace(glyph, digit)
        return encoded.decode("utf-8")

    def parse_page(self, url):
        """Fetch one listing page and print the title and salary of each offer.

        Does nothing when the page could not be fetched. Offer cards that
        lack a title link or a salary span are skipped.
        """
        response = self.requests_func(url)
        if response is None:
            # requests_func signals failure with None; there is nothing to parse.
            return
        soup = BeautifulSoup(response.text, 'lxml')
        offers = soup.select('#__layout .interns .result-list .primary-content div div div.intern-wrap')
        for index, offer in enumerate(offers):
            title_nodes = offer.select('.f-l p a')
            salary_nodes = offer.select('.f-l p span')
            if not title_nodes or not salary_nodes:
                # A card without the expected markup should not abort the page.
                continue
            title = title_nodes[0].text
            numbers = self.salary_encode(salary_nodes[0].text)
            print(f'{index}title:>{title}, salary:>{numbers}')

    def main(self):
        """Process listing pages 1 through 19, one thread per page."""
        urls = [self._URL_TEMPLATE.format(page) for page in range(1, 20)]
        workers = [
            threading.Thread(target=self.parse_page, args=(url,))
            for url in urls
        ]
        # Start every worker before joining any so the page fetches overlap,
        # then wait for all of them so main() returns only when every page
        # has been processed.
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
if __name__ == '__main__':
    # Script entry point: build the scraper and run its main() method.
    spider = ShiXiSeng()
    spider.main()