1. Crawling valid URLs with urllib.request
# -*- coding: GBK -*-
from urllib import request
import re


# Crawl all valid URLs found on a given homepage
def crawb():
    # 1. Choose the entry URL to crawl
    url = "http://www.baidu.com"
    # 2. Pick a regular expression suited to the page's link markup
    pattern = '<a href=".*?"'
    # 3. Build an opener that sends a browser-like User-Agent header
    headers = ('User-Agent',
               'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36')
    opener = request.build_opener()
    opener.addheaders = [headers]
    # 4. Fetch the page and collect every candidate link
    data = opener.open(url).read().decode('utf8')
    content_href = re.findall(pattern, data, re.I)
    # 5. Remove duplicate links
    sets = set(content_href)
    # 6. Post-processing, e.g. print the links or save them to a file
    file = "url"
    with open(file, 'w') as f:
        for ur in sets:
            # Strip the leading '<a href="' and the trailing '"'
            ur = ur[9:-1]
            try:
                response = request.urlopen(ur)
                f.write(ur + "\n")
            except Exception:
                print(ur + ": is not a valid url")


if __name__ == "__main__":
    crawb()
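The regex above also picks up relative hrefs (for example "/" or "javascript:;"), which urlopen rejects even when they point to real pages. Below is a minimal sketch, not part of the original script, that resolves such links against the base URL with urllib.parse.urljoin before testing them; the helper name resolve_links is purely illustrative.

from urllib.parse import urljoin
from urllib import request
import re


def resolve_links(base_url):
    # Hypothetical helper: yield absolute URLs built from the page's hrefs
    data = request.urlopen(base_url).read().decode('utf8')
    hrefs = re.findall('<a href="(.*?)"', data, re.I)
    for href in set(hrefs):
        full = urljoin(base_url, href)   # e.g. "/more/" -> "http://www.baidu.com/more/"
        if full.startswith('http'):      # skip javascript: and mailto: links
            yield full


if __name__ == "__main__":
    for u in resolve_links("http://www.baidu.com"):
        print(u)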
2. Crawling valid URLs with requests
# -*- coding: GBK -*-
from urllib import request
import re
import requests


def crawb():
    url = "http://www.baidu.com"
    file = "url"
    kv = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER'}
    with open(file, 'w') as f:
        r = requests.get(url, headers=kv)
        r.encoding = r.apparent_encoding
        pagetext = r.text
        # The regex captures whatever sits between <a href=" and " (or between href=' and '),
        # i.e. every link URL on the current page, returned as a list
        pagelinks = re.findall(r'(?<=<a href=\").*?(?=\")|(?<=href=\').*?(?=\')', pagetext)
        for link in pagelinks:
            try:
                response = request.urlopen(link)
                f.write(link + "\n")
            except Exception:
                print(link + ": is not a valid url")


if __name__ == "__main__":
    crawb()
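Opening every candidate link with urlopen downloads the full response body just to confirm the URL works. Since requests is already imported, a lighter check is to send a HEAD request with a timeout. The sketch below assumes that approach; the function name check_url is illustrative, not part of the original script.

import requests


def check_url(link, timeout=5):
    # Hypothetical helper: HEAD request instead of a full GET
    try:
        r = requests.head(link, timeout=timeout, allow_redirects=True)
        return r.status_code < 400   # treat 2xx/3xx responses as reachable URLs
    except requests.RequestException:
        return False


if __name__ == "__main__":
    print(check_url("http://www.baidu.com"))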
3. Using BeautifulSoup to crawl the URLs on a page that start with http:
import re
import requests
from bs4 import BeautifulSoup


def crawb():
    url = 'http://www.baidu.com'
    page = requests.get(url).text
    pagesoup = BeautifulSoup(page, 'lxml')
    # Find every <a> tag whose href attribute starts with http:
    for link in pagesoup.find_all(name='a', attrs={"href": re.compile(r'^http:')}):
        print(link.get('href'))


if __name__ == "__main__":
    crawb()
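The pattern r'^http:' misses https links, which most sites now use. Here is a minimal variant, assuming the same page structure, that matches both schemes and saves the results to a file; the function name crawb_https and the output filename urls are illustrative only.

import re
import requests
from bs4 import BeautifulSoup


def crawb_https():
    url = 'http://www.baidu.com'
    page = requests.get(url).text
    pagesoup = BeautifulSoup(page, 'lxml')
    # ^https?: matches hrefs starting with either http: or https:
    with open('urls', 'w') as f:
        for link in pagesoup.find_all('a', href=re.compile(r'^https?:')):
            f.write(link.get('href') + "\n")


if __name__ == "__main__":
    crawb_https()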