# 1. Code implementation
#-*- encoding: utf-8 -*-
'''
Created on 2019/12/06 14:46
Copyright (c) 2019/12/06, Google Copy right
@author: com
'''
import urllib2, urllib,cookielib,threading,gzip,sys
from selenium.webdriver import PhantomJS
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
class RequestUtil:
    '''
    HTTP helper that fetches pages with urllib2 or with a PhantomJS browser.

    The urllib2 GET/POST helpers (http_get_request / http_post_request) are
    serialised by a re-entrant lock. Cookies received by a GET are cached on
    the instance (only while nothing is cached yet) and are sent as the
    ``Cookie`` header of later http_post_request calls.

    The GET/POST helpers advertise ``Accept-encoding: gzip``; nothing in this
    class decompresses the response body, so callers must handle that.
    '''

    # User-Agent string sent by every request this class builds.
    __browserAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0'

    def __init__(self):
        '''
        Create a helper with no cached cookies, no browser and a fresh lock.
        '''
        # Cached cookie header value in "name=value;name=value" form.
        self.cookies = ''
        # Guards the urllib2 GET/POST helpers.
        self._lock = threading.RLock()
        # PhantomJS driver; stays None until create_phandomjs() is called.
        self.driver = None

    def _build_headers(self, referer, cache_control):
        '''
        Return the request headers shared by the GET and POST helpers.
        '''
        return {'User-Agent': self.__browserAgent,
                'Referer': referer,
                'Cache-Control': cache_control,
                'Accept': '*/*',
                'Connection': 'Keep-Alive',
                'Accept-encoding': 'gzip'}

    def _urlopen(self, target, data, timeout):
        '''
        Call urllib2.urlopen, passing a timeout only when one was supplied.

        An empty string is this class's "no timeout given" sentinel; in that
        case urllib2's own default applies.
        '''
        if timeout == '':
            return urllib2.urlopen(target, data)
        return urllib2.urlopen(target, data, timeout=timeout)

    def http_get_request(self, url, referer, timeout=''):
        '''
        Issue a GET request and return the response together with a request object.

        :param url: address to fetch
        :param referer: value of the Referer header
        :param timeout: timeout passed to urllib2.urlopen; '' means "not set"
        :return: tuple (response, request). The request targets the final URL
                 when the response URL differs from ``url``.
        '''
        headers = self._build_headers(referer, 'max-age=0')
        # "with" guarantees the lock is released even when the request raises.
        with self._lock:
            jar = cookielib.CookieJar()
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar),
                                          SmartRedirectHandler())
            # Installed globally, so subsequent urllib2.urlopen calls
            # (including the POST helpers below) go through this opener.
            urllib2.install_opener(opener)
            req = urllib2.Request(url=url, headers=headers)
            response = self._urlopen(req, None, timeout)
            # Cache the cookies only while nothing has been cached yet.
            if self.cookies == '':
                self.cookies = ';'.join(item.name + '=' + item.value for item in jar)
            # After a redirect, hand back a request for the final URL.
            if url != response.url:
                req = urllib2.Request(url=response.url, headers=headers)
        return (response, req)

    def http_post_request(self, url, datas, referer, timeout=''):
        '''
        Issue a form-encoded POST and return the response together with a request object.

        :param url: address to post to
        :param datas: form fields, URL-encoded with urllib.urlencode
        :param referer: value of the Referer header
        :param timeout: timeout passed to urllib2.urlopen; '' means "not set"
        :return: tuple (response, request). The request targets the final URL
                 when the response URL differs from ``url``.
        '''
        # "with" guarantees the lock is released even when the request raises.
        with self._lock:
            postdata = urllib.urlencode(datas)
            headers = self._build_headers(referer, 'no-cache')
            headers['Content-Type'] = 'application/x-www-form-urlencoded'
            # Read under the lock because GET requests update it under the lock.
            headers['Cookie'] = self.cookies
            req = urllib2.Request(url=url, data=postdata, headers=headers)
            response = self._urlopen(req, None, timeout)
            if url != response.url:
                req = urllib2.Request(url=response.url, headers=headers)
        return (response, req)

    def http_get(self, url, refer='https://www.baidu.com'):
        '''
        GET ``url`` with a 60 second timeout; see http_get_request.
        '''
        return self.http_get_request(url, refer, 60)

    def http_post(self, url, datas, refer='https://www.baidu.com'):
        '''
        POST ``datas`` to ``url`` with a 60 second timeout; see http_post_request.
        '''
        return self.http_post_request(url, datas, refer, 60)

    def http_post_request2(self, url, datas, timeout=''):
        '''
        POST an already-encoded body without custom headers and return the response body.

        :param url: address to post to
        :param datas: request body, passed to urllib2.urlopen unchanged
        :param timeout: timeout passed to urllib2.urlopen; '' means "not set"
        :return: the raw response body
        '''
        response = self._urlopen(url, datas, timeout)
        try:
            return response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()

    def http_post2(self, url, datas):
        '''
        POST ``datas`` to ``url`` with a 300 second timeout; see http_post_request2.
        '''
        return self.http_post_request2(url, datas, 300)

    def create_phandomjs(self, service_args, caps, timeout=30):
        '''
        Start a PhantomJS driver and store it on ``self.driver``.

        :param service_args: command line arguments for the PhantomJS service
        :param caps: desired capabilities for the session
        :param timeout: value used for the page-load, script and implicit-wait timeouts
        '''
        # Quit any driver left over from an earlier call so its process is not leaked.
        self.close_phandomjs()
        self.driver = PhantomJS(desired_capabilities=caps, service_args=service_args)
        self.driver.set_page_load_timeout(timeout)
        self.driver.set_script_timeout(timeout)
        self.driver.implicitly_wait(timeout)

    def close_phandomjs(self):
        '''
        Quit the PhantomJS driver if one exists; safe to call repeatedly.
        '''
        driver = getattr(self, 'driver', None)
        self.driver = None
        if driver is None:
            return
        try:
            driver.quit()
        except Exception:
            # Deliberate best-effort teardown: a failure to quit must not mask
            # the caller's own result or error.
            pass

    def http_get_phandomjs(self, url, refer='https://www.baidu.com', timeout=1000):
        '''
        Load ``url`` in a new PhantomJS session and return the page source.

        The driver stays open afterwards; call close_phandomjs() when done.

        :param url: address to load
        :param refer: value of the Referer custom header
        :param timeout: used both as PhantomJS ``resourceTimeout`` and as the driver timeouts
        :return: ``driver.page_source`` after the page was loaded
        '''
        # NOTE(review): the same number feeds PhantomJS's resourceTimeout and
        # Selenium's page-load/script/implicit-wait timeouts; these probably
        # use different units (milliseconds vs seconds) -- confirm.
        caps = dict(DesiredCapabilities.PHANTOMJS)
        # Present the session as 'chrome' in the capabilities.
        caps['browserName'] = 'chrome'
        caps["phantomjs.page.settings.resourceTimeout"] = timeout
        caps["phantomjs.page.settings.loadImages"] = False
        caps["phantomjs.page.settings.userAgent"] = (self.__browserAgent)
        caps["phantomjs.page.customHeaders.Referer"] = (refer)
        # NOTE(review): the empty --cookies-file value presumably disables
        # persistent cookie storage -- confirm against the PhantomJS CLI docs.
        service_args = ['--load-images=no',
                        '--disk-cache=yes',
                        '--cookies-file=']
        self.create_phandomjs(timeout=timeout, service_args=service_args, caps=caps)
        self.driver.get(url)
        return self.driver.page_source
class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
    '''
    Redirect handler that records the redirect status code on the result.

    For 301 and 302 responses the parent handler is invoked first and the
    redirect code is then stored on the returned object as ``status``.
    '''

    def _tag_status(self, result, code):
        '''
        Store ``code`` on ``result`` as ``status`` and return ``result``.

        The parent handler can return None (for example when the response
        carries no Location/URI header); None is passed through untouched so
        the opener can fall back to its other handlers instead of failing
        with an AttributeError here.
        '''
        if result is not None:
            result.status = code
        return result

    def http_error_301(self, req, fp, code, msg, headers):
        '''
        Handle a 301 via the parent handler and tag the result with the status code.
        '''
        result = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
        return self._tag_status(result, code)

    def http_error_302(self, req, fp, code, msg, headers):
        '''
        Handle a 302 via the parent handler and tag the result with the status code.
        '''
        result = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
        return self._tag_status(result, code)
if __name__ == '__main__':
    # Demo: render one page with PhantomJS and print its source.
    # NOTE(review): reload(sys) + setdefaultencoding is a Python 2-only
    # workaround that changes the interpreter-wide default encoding. It is
    # kept because other code may rely on it; the decoding below does not.
    reload(sys)
    sys.setdefaultencoding('utf-8')
    r = RequestUtil()
    try:
        html = r.http_get_phandomjs('https://mil.news.sina.com.cn/china/2019-12-06/doc-iihnzahi5616327.shtml')
        # Decode only real byte strings; text needs no implicit round-trip.
        if isinstance(html, bytes):
            html = html.decode('utf-8')
        # getfilesystemencoding() may return None on Python 2; fall back to UTF-8.
        html = html.encode(sys.getfilesystemencoding() or 'utf-8')
        print(html)
    finally:
        # Always stop the PhantomJS process, even when loading the page failed.
        r.close_phandomjs()