from selenium import webdriver
import os
import time
import pymysql
from bs4 import BeautifulSoup
import requests
import threading
from selenium.webdriver.common.keys import Keys
# MySQL connection settings used by mysql_fetch / mysql_write, in order:
# host, port, user, password, database name.
h, pt, u, p, db = 'localhost', 3306, 'root', '', 'qqzone'
def mysql_fetch(sql, res_type='tuple'):
    """Run ``sql`` on a fresh connection and return all fetched rows.

    The connection is opened with the module-level settings ``h``, ``pt``,
    ``u``, ``p`` and ``db``.

    Args:
        sql: SQL text handed unchanged to ``cursor.execute``.
        res_type: ``'dic'`` selects ``pymysql.cursors.DictCursor``; any other
            value uses the connection's default cursor.

    Returns:
        The result of ``cursor.fetchall()``, or an empty tuple when the
        connection cannot be opened (the error is printed).

    Raises:
        Whatever ``execute``/``commit``/``fetchall`` raise; the cursor and the
        connection are closed before the exception propagates.
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8mb4')
    except Exception as e:
        print(e)
        return ()
    try:
        if res_type == 'dic':
            cursor = conn.cursor(pymysql.cursors.DictCursor)
        else:
            cursor = conn.cursor()
        try:
            cursor.execute(sql)
            conn.commit()
            return cursor.fetchall()
        finally:
            # Close the cursor even when the statement fails.
            cursor.close()
    finally:
        # Never leave the connection open, whatever happened above.
        conn.close()
def mysql_write(sql):
    """Execute ``sql`` on a fresh connection and commit it.

    The connection is opened with the module-level settings ``h``, ``pt``,
    ``u``, ``p`` and ``db``.

    Args:
        sql: SQL text handed unchanged to ``cursor.execute``.

    Returns:
        ``0`` after a successful commit, ``1`` when the connection cannot be
        opened (the error is printed).

    Raises:
        Whatever ``execute``/``commit`` raise; the cursor and the connection
        are closed before the exception propagates.
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8mb4')
    except Exception as e:
        print(e)
        return 1
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql)
            conn.commit()
        finally:
            # Close the cursor even when the statement fails.
            cursor.close()
    finally:
        # Never leave the connection open, whatever happened above.
        conn.close()
    return 0
# D:\pyaction\toutiao_team_win
# The second assignment overrides the first, so only the D: path takes effect.
# NOTE(review): presumably the directory passed as `img_dir` to
# spider_webimg_dl_return_local_img_path - no call is visible in this file.
img_dir = 'C:/Users/Administrator/Desktop/1/toutiao_team/dl_img/'
img_dir = 'D:/pyaction/toutiao_team_win/dl_img/'
import random
def spider_webimg_dl_return_local_img_path(img_dir, img_url, media_type='img',
                                           local_default='default.DONOT_REMOVE.png',
                                           timeout=60):
    """Download one image or mp4 into ``img_dir`` and return its local path.

    Args:
        img_dir: Directory prefix; the file name is appended by plain string
            formatting, so it must already end with a path separator.
        img_url: URL to fetch.
        media_type: ``'img'`` or ``'mp4'``; any other value skips the download.
        local_default: File name inside ``img_dir`` used as the fallback path.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            block the caller forever.

    Returns:
        The path of the file that was written. On any failure (non-200
        status, empty body, exception) the fallback is returned instead:
        ``img_dir + local_default`` for ``'img'`` and for unknown media
        types, ``''`` for ``'mp4'``.
    """
    default_path = '%s%s' % (img_dir, local_default)

    def unique_prefix():
        # Directory + timestamp + thread id: the common start of every
        # generated file name.
        return '%s%s%s' % (
            img_dir, time.strftime('%Y%m%d%H%M%S', time.localtime(time.time())),
            str(threading.get_ident()))

    # NOTE(review): the fixed sleeps are kept from the original; presumably
    # they throttle requests - confirm before shortening them.
    if media_type == 'img':
        try:
            req = requests.get(img_url, timeout=timeout)
            time.sleep(3)
            if req.status_code != 200:
                print('-!=200')
                return default_path
            time.sleep(30)
            print(img_url)
            content = req.content
            # The name contains digits only, so it needs no character
            # filtering (the original filter loop discarded its result).
            local_path = '%s%s%s' % (unique_prefix(), str(random.randrange(1000, 9999)), '.png')
            print(local_path)
            if not content:
                # Nothing to save: do not hand back a path that does not exist.
                return default_path
            with open(local_path, 'wb') as f:
                f.write(content)
            return local_path
        except Exception as e:
            print(e)
            # The file may not have been written; fall back to the default.
            return default_path

    if media_type == 'mp4':
        try:
            time.sleep(30)
            print(img_url)
            stem = img_url.split('.mp4?')[0].split('/')[-1].replace('*', '_')
            local_path = '%s%s%s' % (unique_prefix(), stem, '.mp4')
            print(local_path)
            req = requests.get(img_url, timeout=timeout)
            time.sleep(3)
            if req.status_code != 200:
                print('-!=200')
                return ''
            content = req.content
            time.sleep(210)
            if not content:
                return ''
            with open(local_path, 'wb') as f:
                f.write(content)
            return local_path
        except Exception as e:
            print(e)
            return ''

    # Unknown media type: nothing is downloaded.
    return default_path
# Start Chrome and log in to Weibo through the login page.
driver = webdriver.Chrome()
myurl = 'https://weibo.com/login.php'
driver.get(myurl)
# Spare memory and CPU are needed here so the browser can parse the DOM and
# process this JavaScript-heavy page.
time.sleep(10)
driver.refresh()
time.sleep(10)
# Fill in the login form and click the login button from JavaScript.
# NOTE(review): the credentials are hard-coded in source; consider moving
# them to configuration or environment variables.
js = 'document.getElementById("loginname").value="welcome.cn";' \
     'document.getElementsByName("password")[0].value="yidingok0422";' \
     'document.getElementsByClassName("W_btn_a btn_32px")[0].click();'
try:
    # A keyboard event is needed before the page responds to the values
    # set by the script; failure here is only logged.
    driver.find_element_by_id("loginname").send_keys(Keys.SPACE)
    driver.find_element_by_id("loginname").send_keys(Keys.BACK_SPACE)
except Exception as e:
    print(e)
try:
    driver.execute_script(js)
    time.sleep(30)
except Exception as e:
    print(e)
    # NOTE(review): os._exit skips cleanup (the browser stays open), and an
    # exit code of 1024 is truncated to 0 on POSIX systems.
    os._exit(1024)
# Main loop: poll the database for rows that still have Weibo posts left,
# open a link taken from the home page, and post the row's text there as a
# comment, up to three times per row.
# NOTE(review): the indentation of this script was reconstructed; in
# particular the keep-alive refresh loop is assumed to run once per posted
# row - confirm against the original file.

# The time-of-day filter was disabled in the original by a constant-false
# guard (`1 > 13`); it is kept behind a named flag with the same effect.
TIME_FILTER_ENABLED = False
# Pause between polls when there is nothing to post, so the database is not
# queried in a tight loop.
EMPTY_POLL_SLEEP_SECONDS = 60

while True:
    sql = 'SELECT id, words,imgurls,time_site FROM qqzoneshuoshuo WHERE lefttimes_weibo>0 AND INSTR(imgurls,".mp4")=0 AND id IN ( SELECT MAX(id) FROM qqzoneshuoshuo GROUP BY id_site) ORDER BY time_script DESC,id ASC ;'
    res_content = mysql_fetch(sql, 'dic')
    print(res_content)
    if not res_content:
        # No pending rows, or mysql_fetch could not connect and returned ().
        time.sleep(EMPTY_POLL_SLEEP_SECONDS)
        continue
    for row in res_content:
        dbid, content, time_site = row['id'], row['words'], row['time_site']
        if TIME_FILTER_ENABLED:
            # Skip rows whose site timestamp contains a day/month marker.
            if '天' in time_site or '月' in time_site:
                continue
            lh = int(time.strftime("%H", time.localtime()))
            if lh - int(time_site.split(':')[0]) >= 24:
                continue
            # Skip "good morning" texts from 11:00 on and "good night" texts
            # up to 20:59.
            if '早安' in content and lh >= 11:
                continue
            elif '晚安' in content and lh <= 20:
                continue

        # Cut the text at the first page-UI marker, then replace quotes and
        # newlines with spaces because the text is embedded in a
        # double-quoted JavaScript string below.
        content = content.split('展开全文')[0].split('上传')[0].split('浏览')[0]
        content = content.replace('"', ' ').replace("'", ' ').replace('\n', ' ')
        filter_l = ['密龄素材空间', '评论']
        for fi in filter_l:
            content = content.replace(fi, ' ')

        # Pick one advertised item id, varying with the current time.
        ad_url_l = [554545061378, 567877981760, 567693004121, 567557180229, 545159271159, 546048319163, 565875313425]
        ad_this = ad_url_l[int(time.time()) % len(ad_url_l)]
        ad_url = 'https://item.taobao.com/item.htm?id={}'.format(ad_this)
        print(ad_url)

        # Open the home page and collect candidate links from it.
        hot_topic_list_url = 'https://weibo.com/u/1779073702/home'
        js = 'window.location.href="{}"'.format(hot_topic_list_url)
        driver.execute_script(js)
        time.sleep(20)
        try:
            hot_url_l = [a.get_attribute('href') for a in driver.find_elements_by_css_selector('li>p>a')]
        except Exception as e:
            print(e)
            continue
        if not hot_url_l:
            # The original reached the same `continue` via a modulo-by-zero
            # exception.
            print('no hot topic links found')
            continue
        hot_url_l_index = int(time.time()) % len(hot_url_l)
        js = 'window.location.href="{}"'.format(hot_url_l[hot_url_l_index])
        driver.execute_script(js)
        time.sleep(random.randrange(13, 16))

        comment_l = driver.find_elements_by_css_selector('.WB_row_line>li:nth-child(3)>a>span>span>span')
        if not comment_l:
            # Without comment buttons every attempt below would fail with
            # IndexError; skip the row without touching its counter.
            print('no comment buttons found')
            continue

        comment_l_sq = random.choice([-1, -1, 1, 1, 1, 2, 2, 3])
        mytopic, myname = ' #doaez朵韵诗磁石娃娃燕窝润颜面膜# ', '南京同仁堂密龄白藜芦醇-燕窝美妆-DOAEZ朵韵诗-阿静@ '
        mystr = '{}{}{}{}'.format(mytopic, myname, content, ad_url)
        ele_clickable = False
        rep_times = 3
        # Index of the <textarea> used for the comment text.
        pull_up_index = 1
        isc = 0
        for _ in range(rep_times):
            # Move on to a later comment button, clamped to the last valid
            # index (the original clamped to len(), one past the end).
            comment_l_sq += random.choice([1, 1, 2, 2, 3])
            comment_l_sq = min(comment_l_sq, len(comment_l) - 1)
            scroll_start = isc
            for isc in range(scroll_start, 100):
                # Scroll a little further on every attempt; presumably the
                # click fails while another <div> obscures the button.
                js = 'window.scrollTo(0,{})'.format(isc * 30)
                driver.execute_script(js)
                try:
                    comment_l[comment_l_sq].click()
                    time.sleep(2)
                    # Send a key press and delete it again before the value
                    # is set from JavaScript.
                    driver.find_elements_by_tag_name("textarea")[pull_up_index].send_keys(Keys.SPACE)
                    time.sleep(0.5)
                    driver.find_elements_by_tag_name("textarea")[pull_up_index].send_keys(Keys.BACK_SPACE)
                    js = 'document.getElementsByTagName("textarea")[{}].value="{}"'.format(pull_up_index, mystr)
                    print(js)
                    driver.execute_script(js)
                    time.sleep(2)
                    js = "document.getElementsByName('forward')[0].click();"
                    driver.execute_script(js)
                    time.sleep(2)
                    js = "document.getElementsByClassName('btn W_fr')[0].childNodes[0].click()"
                    driver.execute_script(js)
                    time.sleep(2)
                    ele_clickable = True
                    # Click the comment button again to collapse the box, so
                    # the textarea index stays the same for the next attempt.
                    print('=================>')
                    comment_l[comment_l_sq].click()
                    print('<=================')
                    time.sleep(2)
                    break
                except IndexError as e:
                    # A missing element will not appear by scrolling; the
                    # original compared the exception to a string, which
                    # never matched.
                    print(e)
                    break
                except Exception as e:
                    print(e)
                    continue
        if not ele_clickable:
            continue

        sql = 'UPDATE qqzoneshuoshuo SET lefttimes_weibo=lefttimes_weibo-1 WHERE id={}'.format(dbid)
        print(sql)
        try:
            if mysql_write(sql) != 0:
                print('mysql_write failed for id={}'.format(dbid))
        except Exception as e:
            # Best effort: a failed counter update must not stop the loop,
            # but it should be visible in the log.
            print(e)

        # Wait roughly 15 minutes, refreshing once a minute to keep the page
        # alive.
        for si in range(15):
            try:
                driver.refresh()
                time.sleep(60)
                time.sleep(random.randint(0, 10))
                print(si)
            except Exception as e:
                print(145, e)