A quick analysis of this Aipai video site.

In short: it is an extremely insecure website.

[Screenshot: Python crawler project -- batch downloading Aipai videos]


Getting a logged-in state is trivially easy: just set the cookie and you are in, which was genuinely surprising. From there it is the usual crawling workflow. A note to the site operators: serve the site over HTTPS and properly secure the login.
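Before crawling all 71 pages, it is worth confirming that the copied cookie really produces a logged-in session. The following is a minimal sketch, assuming some marker text (here the log-out label '退出', a hypothetical choice) appears only on an authenticated page:

import requests

LOGIN_COOKIES = 'cookie string copied from a logged-in browser session'
cookies = dict(x.strip().split('=', 1) for x in LOGIN_COOKIES.split(';'))

res = requests.get('http://home.aipai.com/17899407', cookies=cookies)
# '退出' ("log out") is a hypothetical logged-in marker; substitute whatever
# text the site actually renders only for authenticated users
if '退出' in res.text:
    print('cookie accepted: logged in')
else:
    print('cookie rejected: re-copy it from the browser')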

  • Code:
    First, collect the download links and write them to VideoLink.txt:
# Collect Aipai download links
import requests
from bs4 import BeautifulSoup

AIPAI_SPICE_URL = 'http://home.aipai.com/17899407?action=card&sub=&sort=id&total=844&clicks=5617816&flowers=27350&bookTotal=0&page='
LOGIN_COOKIES = 'cookie string -- log in first, then copy it from the browser'
# split(';') / split('=', 1) turns the raw cookie header into a dict;
# splitting on the first '=' only preserves values that themselves contain '='
cookies2 = dict(x.strip().split('=', 1) for x in LOGIN_COOKIES.split(';'))


def DownloadTagToTxt(url):
    res = requests.get(url, cookies=cookies2)
    bs_html = BeautifulSoup(res.text, 'lxml')
    h5_list = bs_html.find_all('h5')
    with open('VideoLink.txt', 'a') as out:
        for h5 in h5_list:
            for a in h5.find_all('a'):
                a_str = a.get('href')
                # guard against <a> tags without an href
                if a_str and a_str.endswith('mp4'):
                    print(a_str)
                    out.write(a_str + '\n')


if __name__ == '__main__':
    for num in range(1, 72):
        DownloadTagToTxt(AIPAI_SPICE_URL + str(num))
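
With total=844 videos spread over the 71 pages covered by range(1, 72), that works out to roughly a dozen entries per page. Each run appends the matching mp4 links to VideoLink.txt (delete the file before re-running to avoid duplicates), and the downloader below reads from that file.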

Next, the download code:

import time
import requests


def downloadFile(name, url):
    '''
    :param name: filename to save as
    :param url: download link
    :return:
    '''
    headers = {'Proxy-Connection': 'keep-alive'}
    r = requests.get(url, stream=True, headers=headers)
    # assumes the server sends Content-Length; it drives the progress percentage
    length = float(r.headers['content-length'])
    count = 0
    count_tmp = 0
    time1 = time.time()
    with open(name, 'wb') as f:
        for chunk in r.iter_content(chunk_size=512):
            if chunk:
                f.write(chunk)
                count += len(chunk)
                # report progress and average speed roughly every 2 seconds
                if time.time() - time1 > 2:
                    p = count / length * 100
                    speed = (count - count_tmp) / 1024 / 1024 / 2
                    count_tmp = count
                    print(name + ': ' + formatFloat(p) + '% Speed: ' + formatFloat(speed) + 'MB/s')
                    time1 = time.time()


def formatFloat(num):
    return '{:.2f}'.format(num)


if __name__ == '__main__':
    for line in open('VideoLink.txt'):
        down_link = line.strip()
        if not down_link:
            continue
        split_list = down_link.split('/')
        # name the file after the last two path segments of the URL
        down_link_name = split_list[-2] + '_' + split_list[-1]
        downloadFile(down_link_name, down_link)
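
One practical weakness of this loop is that an interrupted batch restarts from scratch. Below is a minimal sketch of a guard (not part of the original code) that skips files which already exist locally at their full remote size:

import os
import requests

def alreadyComplete(name, url):
    '''Return True if `name` exists locally and matches the remote file size.'''
    if not os.path.exists(name):
        return False
    head = requests.head(url, allow_redirects=True)
    remote = int(head.headers.get('content-length', -1))
    return remote == os.path.getsize(name)

Wrapping the call in main as "if not alreadyComplete(down_link_name, down_link): downloadFile(down_link_name, down_link)" makes the whole batch safely re-runnable.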