import os, sys
# Path separator of the host OS, reused below to split and re-join paths.
os_sep = os.sep
# Project root: drop the last two components of this file's absolute path and
# take the parent of what remains (for /a/b/c/d/script.py this yields /a/b).
# ``__file__`` is required here; the bare name ``file`` is undefined in
# Python 3 and raised NameError before anything else could run.
base_dir = os.path.dirname(os_sep.join(os.path.abspath(__file__).split(os_sep)[0:-2]))
# Appended so that the ``core`` package imported right below can be resolved
# from base_dir.
sys.path.append(base_dir)
from core.utils import MysqlHelper
import time
import logging
import requests
import threading
from selenium import webdriver
# Configure the root logger to append to a per-day, per-script log file:
#   <base_dir>/log/<YYYYMMDD><script file name>.log
# NOTE(review): the log directory is not created here; presumably it already
# exists on the deployment host -- confirm.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s[thread:%(thread)d][process:%(process)d]',
    datefmt='%a, %d %b %Y %H:%M:%S',
    # ``__file__`` supplies the script name; the bare name ``file`` used
    # previously is undefined in Python 3.
    filename='%s/log/%s%s.log' % (
        base_dir, time.strftime('%Y%m%d', time.localtime(time.time())), __file__.split(os_sep)[-1]),
    filemode='a')
class MyThread(threading.Thread):
    """Thread that calls ``func(args)`` when it runs.

    ``args`` is handed to ``func`` as one positional argument; it is not
    unpacked.
    """

    def __init__(self, func, args, name):
        """Store the callable, its single argument and the thread name.

        The constructor has to be spelled ``__init__``: under any other name
        Python never calls it, and ``MyThread(func, args, name)`` would fall
        through to ``Thread.__init__(group=func, target=args, name=name)``,
        which rejects a non-None ``group``.
        """
        threading.Thread.__init__(self)
        self.func, self.args, self.name = func, args, name

    def run(self):
        """Invoke the stored callable with the stored argument."""
        self.func(self.args)
# Number of URL checks completed so far. It is incremented (via ``global``)
# by chk_exception_url inside main() and printed as "<count>/<total>" progress.
# NOTE(review): it is updated from the worker threads without a lock, so the
# progress figure may be approximate.
url_counter = 0
def main():
    """Check every unexpired URL in ``test_tab0`` and record the failing ones.

    Steps:
      1. Select the URLs from ``test_tab0`` whose ``expire_time`` is still in
         the future.
      2. Split them into slices of ``tstep`` rows, one worker thread per slice.
      3. Each worker probes a URL up to ``repeat_times`` times, sleeping
         ``repeat_sleep_times`` seconds before every attempt, and stops at the
         first attempt that finds one of the marker strings in the page.
      4. A URL that still fails is inserted into ``test_tab0_error`` unless a
         row for it already exists there.

    Returns None; results are written to the database, the log and stdout.
    """
    mysql_obj = MysqlHelper()
    # Candidate selection queries kept for reference:
    #   combined check:
    #     SELECT url FROM ( SELECT url FROM test_tab0 WHERE unix_timestamp(now()) - create_time<=3600
    #       UNION ALL SELECT url FROM test_tab0_error where status!=2 ) AS t GROUP BY url;
    #   unexpired URLs (the one in use):
    #     SELECT url FROM test_tab0 WHERE unix_timestamp(now()) <expire_time;
    #   last half hour:
    #     SELECT url FROM test_tab0 WHERE unix_timestamp(now()) - create_time<= 1800;
    # Future work: once test_tab0_error has accumulated data, re-check its URLs.
    q = 'SELECT url FROM test_tab0 WHERE unix_timestamp(now()) <expire_time;'
    tuple_l = mysql_obj.select(q)
    del mysql_obj
    if len(tuple_l) == 0:
        s = '无待检测url,程序退出'
        print(s)
        logging.info(s)
        # The message announces an exit, so actually stop here.
        return

    # Marker strings searched for in the rendered page, the number of attempts
    # per URL, and the pause in seconds before each attempt.
    mycode_l, repeat_times, repeat_sleep_times = ['g3user.com', '51g3.com.cn'], 4, 10
    # Upper bound in seconds for the plain HTTP probe; without it a site that
    # never answers would block its worker thread (and the final join) forever.
    request_timeout = 30

    def get_onerow(url, f_l=('title', 'uid', 'money_total'), tab='test_tab0'):
        """Return the newest row (highest id) for ``url`` from ``tab``.

        ``f_l`` lists the columns to select. The value returned by
        ``select`` is passed through unchanged when it equals -1; otherwise
        its first row is returned (IndexError if there is no row).
        """
        # TODO: move into the DB helper base class with a generic WHERE list.
        # NOTE(review): the URL is interpolated into the SQL text; a double
        # quote in it breaks the statement. Use a parameterized query if
        # MysqlHelper offers one.
        mysql_obj = MysqlHelper()
        f_s = ','.join(f_l)
        q = 'SELECT %s FROM %s WHERE url="%s" ORDER BY id DESC LIMIT 1' % (f_s, tab, url)
        s = '%s%s' % (' DB ', q)
        logging.info(s)
        t = mysql_obj.select(q)
        if t != -1:
            t = t[0]
        del mysql_obj
        return t

    def chk_exception_url(url, sleep_seconds=0, http_tag='http://'):
        """Probe ``url`` once and classify the outcome.

        Sleeps ``sleep_seconds`` first, then issues an HTTP GET; on status 200
        the page is rendered with PhantomJS and searched for the marker
        strings in ``mycode_l``.

        Returns a dict with:
          ``ok``          1 marker found, 0 failure, -1 inconclusive (HTTP 403)
          ``status_code`` HTTP status, or -1 when the request raised
          ``info``        description of the failure ('' when not a failure)
        """
        global url_counter
        time.sleep(sleep_seconds)
        # Error-table status values: 0 cannot be opened, 1 opens but carries
        # no ad code, 2 already handled.
        ret = {'ok': -1, 'status_code': -1, 'info': ''}
        s = time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time()))
        try:
            if url.find('http') == -1:
                url = '%s%s' % (http_tag, url)
            r = requests.get(url, timeout=request_timeout)
            ret['status_code'] = int(r.status_code)
            s = '%s,%s,%s,%s' % (s, ret['status_code'], url, r)
        except Exception as e:
            ret['ok'] = 0
            s = '%s %s %s' % (s, ' SPIDER ', e)
            logging.error(s)
            print(e, url)

        # For now only a 200 response is inspected any further.
        if ret['status_code'] == 200:
            driver = webdriver.PhantomJS(
                executable_path='/usr/local/phantomjs/bin/phantomjs')
            try:
                driver.get(url)
                time.sleep(1)
                page_source = driver.page_source
            finally:
                # Always terminate the browser; otherwise every check leaves
                # a PhantomJS process behind.
                driver.quit()
            if any(page_source.find(code) > -1 for code in mycode_l):
                ret['ok'] = 1
            else:
                s = '%s%s' % (s, '返回200,但是在html中未检测到我公司代码。')
                ret['ok'], ret['info'] = 0, s
        elif ret['status_code'] == 403:
            # Deliberately left inconclusive (ok stays -1),
            # e.g. www.hsdcw.com/fenlei/41668214.html
            pass
        else:
            ret['ok'], ret['info'] = 0, s

        url_counter += 1
        s = '%s/%s%s%s' % (url_counter, len(tuple_l), 'chk-ret', s)
        print(s)
        if ret['ok'] == 0:
            logging.warning(s)
        else:
            logging.info(s)
        return ret

    tn, tl, tstep = len(tuple_l), [], 5000

    def tf(ts):
        """Worker body: check the rows ``tuple_l[ts:ts + tstep]``."""
        te = min(ts + tstep, tn)
        for i in tuple_l[ts:te]:
            url = i[0]
            # Rule for Sina iAsk: never checked.
            if url.find('iask.sina.com') > -1:
                continue
            for _attempt in range(repeat_times):
                print('threadID', threading.get_ident(), url)
                ret = chk_exception_url(url, repeat_sleep_times)
                if ret['ok'] == 1:
                    break
            # Only a definite failure (ok == 0) is recorded. ok == 1 is a
            # healthy page; ok == -1 (HTTP 403) carries no failure details and
            # used to crash the worker with a KeyError on ret['info'].
            if ret['ok'] != 0:
                continue
            try:
                title, uid, money_total = get_onerow(url)
            except Exception as e:
                s = '%s %s %s' % (' DB Exception-去test_tab0查', url, e)
                logging.info(s)
                print(s)
                # Skip this URL only; the rest of the slice is still checked.
                continue
            # The helper is instantiated per use and dropped afterwards
            # because of the DB wrapper's limitations under multithreading.
            mysql_obj = MysqlHelper()
            q = 'SELECT id FROM test_tab0_error WHERE url="%s" LIMIT 1' % (url)
            print(q)
            try:
                r = mysql_obj.select(q)
                s = '%s%s' % ('-SQL-', q)
                logging.info(s)
                print(q)
            except Exception as e:
                s = '%s%s %s' % (' DB Exception-', q, e)
                logging.info(s)
                print(s)
                continue
            if r == -1:
                # get_onerow guards against -1 from select(); here it would
                # make len(r) raise, so treat it as a failed lookup.
                s = '%s%s %s' % (' DB Exception-', q, 'select returned -1')
                logging.error(s)
                print(s)
                continue
            if len(r) != 0:
                # Already recorded. Updating the existing row is left for the
                # future second-pass check of test_tab0_error.
                continue
            ctime = int(time.time())
            db_status = 1 if ret['status_code'] == 200 else 0
            # NOTE(review): values are interpolated straight into the SQL
            # text; a double quote in the title or remarks breaks the
            # statement. Switch to a parameterized query if MysqlHelper
            # supports one.
            q = 'INSERT INTO test_tab0_error (title,url,status,remarks,update_time,create_time,uid,money) VALUES ("%s","%s","%s","%s","%s","%s","%s","%s")' % (
                title, url, db_status, ret['info'], ctime, ctime, uid, money_total)
            try:
                mysql_obj.execute(q)
                mysql_obj.commit()
                del mysql_obj
                s = '%s%s' % (' DB SQL ok ', q)
                logging.info(s)
                print(s)
            except Exception as e:
                s = '%s%s %s' % (' DB Exception-', q, e)
                logging.error(s)
                print(s)

    # One worker per slice of tstep rows; the slice start is the thread's
    # single argument.
    for i in range(0, tn, tstep):
        tl.append(MyThread(tf, (i), tf.__name__))
    for t in tl:
        # Set the attribute; assigning to ``setDaemon`` only overwrote the
        # method and never touched the daemon flag.
        t.daemon = False
        t.start()
    for t in tl:
        t.join()
# Run the checker only when this file is executed as a script. The guard must
# compare ``__name__`` with '__main__'; the bare name ``name`` is undefined.
if __name__ == '__main__':
    main()