第五次更新报告——4月6日
- 一、爬虫模块
- 二、数据库模块
- 三、web服务器
- 四、网站模块
一、爬虫模块
这周尝试编写爬虫的通用模板,但在编写过程中遇到了一些困难:不同网页的链接标签并不相同,并且部分网页具有反爬虫机制,因此开发进展比较缓慢。所以这周决定暂时放缓通用模板的开发,先在编写其他网页和论坛的爬虫时记录各自的网页结构,最后再据此编写通用模板。这周将对主要的5个新闻网页进行爬取,下一周将编写论坛爬虫。
二、数据库模块
- 增加了对用户信息的读写
- 增加了对单条热词数据(包含出现总次数、30日内每日出现次数、12月内每月出现次数、10年内每年出现次数)的初始化、修改、维护
- 增加了被爬取链接的增删查
代码如下:
import pymysql
import MyNLP
# Open the database connection.
# NOTE(review): host, port and credentials are hard-coded here -- consider
# loading them from configuration instead of keeping them in source.
db = pymysql.Connect(host = "localhost", port = 3306, user = "root", passwd = "123456", db = "bs")
# Create a cursor object with cursor(); the functions in this module use it
# for every query.
cursor = db.cursor()
def set_InfoKW(InfoKW):
    """Insert one analysed item into the ``InfoKW`` table and commit.

    Args:
        InfoKW: Object providing ``url``, ``text`` and ``source`` attributes,
            a ``time`` attribute supporting ``strftime`` and a
            ``get_KW_str()`` method (presumably a ``MyNLP.Info_kw`` --
            confirm against MyNLP).
    """
    # Placeholders are left unquoted so the driver escapes every value;
    # building the statement with ``sql % data`` broke on quotes in the text
    # and allowed SQL injection.
    sql = "insert into InfoKW values (%s, %s, %s, %s, %s)"
    data = (
        InfoKW.url,
        InfoKW.text,
        InfoKW.time.strftime("%Y-%m-%d"),
        InfoKW.source,
        InfoKW.get_KW_str(),
    )
    cursor.execute(sql, data)
    # The connection object is ``db``; the name ``connect`` was never defined.
    db.commit()
def get_InfoKW(url):
    """Load the ``InfoKW`` row stored for *url*.

    Args:
        url: Value matched against the ``url`` column.

    Returns:
        A ``MyNLP.Info_kw`` built from the row's five columns, with the last
        column split on commas, or ``None`` when no row matches.
    """
    sql = "select * from InfoKW where url = %s"
    cursor.execute(sql, (url,))
    # fetchone() yields the row itself. fetchall() returns a tuple of rows,
    # so indexing its result selected whole rows rather than columns.
    row = cursor.fetchone()
    if row is None:
        return None
    return MyNLP.Info_kw(row[0], row[1], row[2], row[3], row[4].split(","))
def set_User(mail, pw):
    """Insert a user row into the ``User`` table and commit.

    Args:
        mail: Value stored in the first column of ``User``.
        pw: Value stored in the second column of ``User``.
            NOTE(review): the value is stored exactly as received -- confirm
            that callers hash the password before calling this function.
    """
    # Parameterized query: the driver quotes and escapes both values.
    sql = "insert into User values (%s, %s)"
    cursor.execute(sql, (mail, pw))
    # The connection object is ``db``; the name ``connect`` was never defined.
    db.commit()
def get_User(mail):
    """Look up the ``pw`` column of the ``User`` row whose mail is *mail*.

    Args:
        mail: Value matched against the ``mail`` column.

    Returns:
        The first matching row as returned by the cursor (a one-column row
        holding ``pw``), or ``None`` when no user matches. The row shape is
        kept as before so existing callers that index the result still work.
    """
    # Parameterized query: the driver quotes and escapes the value.
    sql = "select pw from User where mail = %s"
    cursor.execute(sql, (mail,))
    # fetchone() gives the same first row that ``fetchall()[0]`` did, but
    # returns None instead of raising IndexError for an unknown user.
    return cursor.fetchone()
def init_KW(kw):
    """Create the ``KW`` row for a new hot word with every counter at zero.

    The row holds the keyword, a total of 0 and three comma-separated
    counter lists of 31, 13 and 11 zeros.

    Args:
        kw: The keyword to initialise.
    """
    sql = "insert into KW values (%s, 0, %s, %s, %s)"
    data = (
        kw,  # the parameter is ``kw``; the old body read an undefined ``KW``
        ",".join(["0"] * 31),  # per-day counters
        ",".join(["0"] * 13),  # per-month counters
        ",".join(["0"] * 11),  # per-year counters
    )
    # Parameterized query: the driver quotes and escapes the values.
    cursor.execute(sql, data)
    # The connection object is ``db``; the name ``connect`` was never defined.
    db.commit()
def _merge_counts(stored, increments, size):
    """Add *increments* element-wise to the comma-separated counters in *stored*."""
    current = [int(part) for part in stored.split(",")]
    if len(current) != size or len(increments) != size:
        raise ValueError(
            "expected %d counters, got %d stored and %d increments"
            % (size, len(current), len(increments))
        )
    return ",".join(str(old + int(new)) for old, new in zip(current, increments))


def add_KW(KW, num, days, months, years):
    """Add new occurrence counts to the stored counters of keyword *KW*.

    Args:
        KW: The keyword whose ``KW`` row is updated.
        num: Number of occurrences to add to the stored total.
        days: Sequence of 31 increments for the per-day counters.
        months: Sequence of 13 increments for the per-month counters.
        years: Sequence of 11 increments for the per-year counters.

    Raises:
        LookupError: If no row exists for *KW*.
        ValueError: If a counter list does not have the expected length.
    """
    cursor.execute(
        "select num, days, months, years from KW where kw = %s", (KW,)
    )
    # fetchone() yields the row itself; indexing the fetchall() result picked
    # whole rows instead of columns.
    row = cursor.fetchone()
    if row is None:
        raise LookupError("keyword has not been initialised: %r" % (KW,))
    stored_num, stored_days, stored_months, stored_years = row

    # The previous body overwrote the caller's increments with the stored
    # values, so nothing was ever added; the lists are summed here without
    # mutating the arguments.
    total = num + int(stored_num)
    day = _merge_counts(stored_days, days, 31)
    month = _merge_counts(stored_months, months, 13)
    year = _merge_counts(stored_years, years, 11)

    # Parameterized query, and without the stray ")" that made the old
    # statement invalid SQL.
    sql = "update KW set num = %s, days = %s, months = %s, years = %s where kw = %s"
    cursor.execute(sql, (total, day, month, year, KW))
    # The connection object is ``db``; the name ``connect`` was never defined.
    db.commit()
#def maintain_KW():
def get_url():
    """Return every row of the ``url`` table as a list.

    Returns:
        A list with one element per row, each element being the row exactly
        as the cursor returns it.
        NOTE(review): callers that want plain URL strings still have to
        unpack each row -- confirm whether returning the rows is intended.
    """
    cursor.execute("select * from url")
    # list() copies the fetched rows in one step instead of appending them
    # one by one.
    return list(cursor.fetchall())
def sef_url(url):
    """Insert *url* into the ``url`` table and commit.

    NOTE(review): the name looks like a typo for ``set_url``; it is kept
    unchanged so existing callers continue to work.

    Args:
        url: The link to store.
    """
    # Parameterized query: the driver quotes and escapes the value.
    sql = "insert into url values (%s)"
    cursor.execute(sql, (url,))
    # The connection object is ``db``; the name ``connect`` was never defined.
    db.commit()
def del_url(url):
    """Delete the rows of the ``url`` table whose ``url`` equals *url*, then commit.

    Args:
        url: The link to remove.
    """
    # Parameterized query: the driver quotes and escapes the value.
    sql = "delete from url where url = %s"
    cursor.execute(sql, (url,))
    # The connection object is ``db``; the name ``connect`` was never defined.
    db.commit()
# Close the database connection when the interpreter exits. Closing it right
# here would run at import time and leave every function above working on a
# connection that is already closed.
import atexit

atexit.register(db.close)
三、web服务器
实现了基本的静态web服务器,动态web服务器正在尝试开发中。
代码如下:
import socket
import gevent
import re
import sys
# Root directory of the static files
HTML_ROOT_DIR = './html'
# In the visible code this is referenced only from commented-out lines in main().
WSGI_PYTHON_DIR = './wsgipython'
class HTTPServer(object):
    """Minimal HTTP server that hands each request to a WSGI-like application.

    The application is called as ``application(env, start_response)`` and
    must return the response body as a ``str``.
    """

    def __init__(self, application):
        """Create the listening socket bound to localhost:8080.

        Args:
            application: Callable invoked for every request.
        """
        self.server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        self.server_socket.bind(("localhost", 8080))
        self.app = application

    def start(self):
        """Accept connections forever, handling one client at a time."""
        self.server_socket.listen(128)
        while True:
            client_socket, client_address = self.server_socket.accept()
            print("[%s, %s]用户连接上了" % client_address)
            handle_client_process = gevent.spawn(self.handle_socket, client_socket)
            # Joining immediately means the next accept() only happens after
            # this client has been handled, so requests are served serially.
            handle_client_process.join()

    def start_response(self, status, headers):
        """Build the status line and header block for the current response.

        The result is stored in ``self.response_headers``.

        Args:
            status: Status text such as ``"200 OK"``.
            headers: List of ``(name, value)`` pairs, for example
                ``[('Content-Type', 'text/plain')]``.
        """
        # The protocol token is "HTTP/1.1"; the former "HTTP1.1" produced a
        # malformed status line.
        response_headers = "HTTP/1.1 " + status + "\r\n"
        for header in headers:
            response_headers += "%s: %s\r\n" % header
        self.response_headers = response_headers

    def handle_socket(self, client_socket):
        """Read one request from *client_socket*, answer it and close the socket.

        An empty read is dropped silently. A request line that cannot be
        decoded or parsed is answered with ``400 Bad Request``.
        """
        try:
            # Read the client's request data.
            request_data = client_socket.recv(1024)
            if not request_data:
                # The peer connected and closed without sending anything.
                return

            # Parse the request line, e.g. 'GET / HTTP/1.1'.
            try:
                request_start_line = request_data.splitlines()[0].decode('utf-8')
            except UnicodeDecodeError:
                request_start_line = ""
            print(request_start_line)

            # One pattern extracts both the method and the requested path.
            match = re.match(r"(\w+) +(/[^ ]*) ", request_start_line)
            if match is None:
                client_socket.sendall(b"HTTP/1.1 400 Bad Request\r\n\r\nbad request")
                return
            method, file_name = match.groups()
            print(file_name)
            print(method)

            env = {
                "PATH_INFO": file_name,
                'METHOD': method
            }
            response_body = self.app(env, self.start_response)
            response = self.response_headers + '\r\n' + response_body

            # sendall() keeps writing until the whole response is sent;
            # send() may transmit only part of it.
            client_socket.sendall(response.encode('utf-8'))
        finally:
            # Always release the client connection, even when the
            # application or the parsing raised.
            client_socket.close()
            print("socket close")
import time
# Root directory of the static files
HTML_ROOT_DIR = "./html"
class Application(object):
    """Core of the framework: the generic main application.

    Requests whose path starts with ``/static`` are answered from the HTML
    files under ``HTML_ROOT_DIR``; every other path is looked up in the
    routing table.
    """

    def __init__(self, urls):
        """Store the routing table.

        Args:
            urls: Iterable of ``(path, handler)`` pairs; each handler is
                called as ``handler(env, start_response)``.
        """
        self.urls = urls

    def __call__(self, env, start_response):
        """Dispatch one request and return the response body as a ``str``.

        Args:
            env: Request mapping; ``PATH_INFO`` is read, defaulting to ``"/"``.
            start_response: Callable ``(status, headers)`` that receives the
                response status and header list.
        """
        path = env.get("PATH_INFO", "/")
        print(path)
        # e.g. /static/index.html
        if path.startswith("/static"):
            return self._serve_static(path[7:], start_response)
        for url, handler in self.urls:
            # e.g. ("/ctime", show_ctime)
            if path == url:
                return handler(env, start_response)
        # No route matched: 404.
        return self._not_found(start_response)

    @staticmethod
    def _not_found(start_response):
        """Send the 404 status and return the 404 body."""
        start_response("404 Not Found", [])
        return "not found"

    def _serve_static(self, file_name, start_response):
        """Serve ``<file_name>.html`` from ``HTML_ROOT_DIR``, or answer 404."""
        import os

        # The ".html" suffix is optional in the request.
        if file_name.endswith(".html"):
            file_name = file_name[:-5]
        display_name = HTML_ROOT_DIR + file_name + ".html"

        # Resolve the request inside the static root and refuse anything that
        # escapes it, e.g. "/static/../secret".
        root = os.path.abspath(HTML_ROOT_DIR)
        target = os.path.abspath(
            os.path.join(root, file_name.lstrip("/") + ".html")
        )
        try:
            inside_root = os.path.commonpath([root, target]) == root
        except ValueError:
            # Raised when the two paths cannot be compared (different drives).
            inside_root = False
        if not inside_root:
            print(display_name + " Not Found")
            return self._not_found(start_response)

        try:
            with open(target, "rb") as file:
                file_data = file.read()
        except IOError:
            # This message used to sit after the return statements and was
            # never printed.
            print(display_name + " Not Found")
            return self._not_found(start_response)

        print(display_name)
        body = file_data.decode("utf-8")
        start_response("200 OK", [("Content-Type", "text/html; charset=utf-8")])
        return body
def show_ctime(env, start_response):
    """Answer with the current local time as plain text.

    Args:
        env: Request mapping (not read by this handler).
        start_response: Callable ``(status, headers)`` receiving the
            response status and header list.

    Returns:
        The string produced by ``time.ctime()``.
    """
    start_response("200 OK", [("Content-Type", "text/plain")])
    return time.ctime()
def say_hello(env, start_response):
    """Answer with a fixed plain-text greeting.

    Args:
        env: Request mapping (not read by this handler).
        start_response: Callable ``(status, headers)`` receiving the
            response status and header list.

    Returns:
        The greeting text.
    """
    start_response("200 OK", [("Content-Type", "text/plain")])
    return "hello frawework"
def say_haha(env, start_response):
    """Answer with a fixed plain-text message.

    Args:
        env: Request mapping (not read by this handler).
        start_response: Callable ``(status, headers)`` receiving the
            response status and header list.

    Returns:
        The message text.
    """
    start_response("200 OK", [("Content-Type", "text/plain")])
    return "hello haha"
def main():
    """Build the demo routing table and run the HTTP server on it."""
    # Disabled: loading the application named on the command line.
    # sys.path.insert(1, WSGI_PYTHON_DIR)
    # if len(sys.argv) < 2:
    #     sys.exit("python MyWebServer_v1.py Module:app")
    # # python MyWebServer_v1.py MyWebFrameWork:app
    # module_name, app_name = sys.argv[1].split(":")
    # # module_name = "MyWebFrameWork"
    # # app_name = "app"
    # m = __import__(module_name)
    # app = getattr(m, app_name)
    urls = [
        ("/", show_ctime),
        ("/ctime", show_ctime),
        ("/sayhello", say_hello),
        ("/sayhaha", say_haha),
    ]
    app = Application(urls)
    http_server = HTTPServer(app)
    # http_server.set_port
    http_server.start()


if __name__ == '__main__':
    main()
四、网站模块
利用 Vue 创建了前端项目,并对初始生成的项目进行了精简:只保留需要的部分,删除初始页面的内容,以便之后添加我们自己需要的内容。