1. Crawling valid URLs with urllib.request

# -*- coding: utf-8 -*-

from urllib import request
import re

# Crawl all valid URLs found on a given homepage
def crawb():
    # 1. Choose the entry URL to crawl
    url = "http://www.baidu.com"
    # Pick a regular expression suited to the page's markup
    pattern = '<a href=".*?"'
    # addheaders takes a list of (header, value) tuples, so headers must be a tuple
    headers = ('User-Agent',
               'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36')
    opener = request.build_opener()
    opener.addheaders = [headers]
    data = opener.open(url).read().decode('utf8')
    content_href = re.findall(pattern, data, re.I)
    # 5. Deduplicate the links
    sets = set(content_href)
    # 6. Post-processing, e.g. print the links or save them to a file
    file = "url"
    with open(file, 'w') as f:
        for ur in sets:
            # Strip the leading '<a href="' and the trailing '"'
            ur = ur[9:-1]
            try:
                # urlopen raises if the link is malformed or unreachable
                request.urlopen(ur)
                f.write(ur + "\n")
            except Exception:
                print(ur + " is not a valid URL")

if __name__ == "__main__":
    crawb()
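A minimal variation on step 6, sketched here rather than taken from the original script: passing a timeout to urlopen and checking the HTTP status keeps a dead or slow server from stalling the loop. The 5-second timeout is an assumed value.

from urllib import request

def is_reachable(url, timeout=5):
    # timeout=5 is an assumption, not a value used in the original script
    try:
        with request.urlopen(url, timeout=timeout) as response:
            # urlopen raises HTTPError for 4xx/5xx, so reaching here means success
            return response.status < 400
    except Exception:
        return False

print(is_reachable("http://www.baidu.com"))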


2. Crawling valid URLs with requests

# -*- coding: utf-8 -*-

from urllib import request
import re
import requests
def crawb():
    url="http://www.baidu.com"
    file = "url"
    kv = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER'}
    with open(file, 'w') as f:
        r = requests.get(url, headers=kv)
        r.encoding = r.apparent_encoding
        pagetext = r.text
        # The regex grabs the text between <a href=" and " (single quotes work too),
        # i.e. every link on the current page; findall returns them as a list
        pagelinks = re.findall(r'(?<=<a href=\").*?(?=\")|(?<=href=\').*?(?=\')', pagetext)
        for link in pagelinks:
            try:
                # urlopen raises if the link is malformed or unreachable
                request.urlopen(link)
                f.write(link + "\n")
            except Exception:
                print(link + " is not a valid URL")

if __name__ == "__main__":
    crawb()
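One caveat with the regex above: it also captures relative hrefs (for example "/more/"), which request.urlopen rejects even though they point at real pages. A small sketch, assuming the same entry page, that resolves such links against the base URL with urllib.parse.urljoin before validating; the sample hrefs are hypothetical:

from urllib.parse import urljoin

base = "http://www.baidu.com"
links = ["/more/", "http://news.baidu.com"]  # hypothetical sample hrefs
absolute = [urljoin(base, link) for link in links]
print(absolute)  # ['http://www.baidu.com/more/', 'http://news.baidu.com']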


3. Crawling URLs that start with http: using BeautifulSoup

import re
import requests
from bs4 import BeautifulSoup
def crawb():
    url = 'http://www.baidu.com'
    page = requests.get(url).text
    # 'lxml' requires the lxml package; the stdlib 'html.parser' also works here
    pagesoup = BeautifulSoup(page, 'lxml')
    # Select every <a> tag whose href attribute starts with "http:"
    for link in pagesoup.find_all(name='a', attrs={"href": re.compile(r'^http:')}):
        print(link.get('href'))

if __name__=="__main__":
    crawb()
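For parity with sections 1 and 2, a sketch that combines the BeautifulSoup selector with file output; the r'^https?:' pattern, which also matches https links, is an extension of the original, and the function name crawl_to_file is hypothetical:

import re
import requests
from bs4 import BeautifulSoup

def crawl_to_file(url="http://www.baidu.com", file="url"):
    page = requests.get(url).text
    soup = BeautifulSoup(page, 'lxml')
    with open(file, 'w') as f:
        # href=re.compile(...) filters <a> tags by their href attribute
        for link in soup.find_all('a', href=re.compile(r'^https?:')):
            f.write(link.get('href') + "\n")

if __name__ == "__main__":
    crawl_to_file()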