这个爬虫只是一个简单的爬虫:单线程运行,没有做任务调度、去重队列等复杂的任务管理功能。
一 爬虫数据持久化方式
使用的是mysql数据库。需要设备上先安装mysql数据库,python中使用mysql 数据库的方法是使用pymysql库来操作。
例:
import pymysql
conn = pymysql.connect('127.0.0.1',user="root",passwd="root",port=3306,db="db_book",charset="utf8")
cur = conn.cursor()
#>>>>>>>>>>>>>>>>>>>>>从类型列表里搜索数据
sql = "select cateid,cateUrl from cate_list"
cur.execute(sql)
data = cur.fetchall() #data就是一个结果的集合
#>>>>>>>>>>>>>>>>>>>> 添加数据到表里,就是修改数据
sql = "insert into book_list(bookName,bookAuthor,cateId,bookid) values('%s','%s','%s','%s')" %(bookName,bookAuthor,cateid,bookid)
data = cur.execute(sql) #执行sql语句,data返回的是结果,1:成功
cur.connection.commit() #注意要执行完后要commit一下,不然修改结果不生效
数据库结构:
数据库名称:db_book
表:book_list: //图书表,保存图书的信息
cate_list: //分类信息,保存所有的图书分类,
chapter_list //章节表,保存的是图书章节的信息
建表语句如下:
-- Schema for the crawler's persistence layer (database: db_book).
SET NAMES utf8mb4;
-- Disable FK checks so DROP/CREATE order doesn't matter during import.
SET FOREIGN_KEY_CHECKS = 0;
-- ----------------------------
-- Table structure for book_list
-- ----------------------------
-- One row per crawled book; `bookid` is the crawler-assigned id that
-- chapter_list.bookId refers to (no actual FK constraint is declared).
DROP TABLE IF EXISTS `book_list`;
CREATE TABLE `book_list` (
`id` int(2) NOT NULL AUTO_INCREMENT,
`bookName` varchar(255) DEFAULT NULL,
`bookAuthor` varchar(255) DEFAULT NULL,
`cateId` int(2) NOT NULL,
`bookid` int(2) NOT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=451 DEFAULT CHARSET=utf8;
-- ----------------------------
-- Table structure for cate_list
-- ----------------------------
-- Book categories; the crawler reads only `cateid` and `cateUrl`.
-- NOTE(review): cateName/cateUrl duplicate cate_name/cate_url — the crawler
-- never touches the snake_case pair, and their NOT NULL (no default) would
-- reject inserts that omit them; they look vestigial — confirm before dropping.
DROP TABLE IF EXISTS `cate_list`;
CREATE TABLE `cate_list` (
`id` int(2) NOT NULL AUTO_INCREMENT,
`cateid` varchar(255) DEFAULT NULL,
`cateName` varchar(255) DEFAULT NULL,
`cateUrl` varchar(255) DEFAULT NULL,
`cate_name` varchar(255) NOT NULL,
`cate_url` varchar(255) NOT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=8 DEFAULT CHARSET=utf8;
-- ----------------------------
-- Table structure for chapter_list
-- ----------------------------
-- One row per chapter; full chapter body is stored in `chapterText`.
DROP TABLE IF EXISTS `chapter_list`;
CREATE TABLE `chapter_list` (
`id` int(2) NOT NULL AUTO_INCREMENT,
`chapterid` int(2) NOT NULL,
`chapterName` varchar(100) DEFAULT NULL,
`bookId` int(2) DEFAULT NULL,
`chapterText` text,
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=298410 DEFAULT CHARSET=utf8;
SET FOREIGN_KEY_CHECKS = 1;
二 爬虫代码的编写
1.python 的安装,我使用的是anaconda + pycharm 开发的。
anaconda(简介):内部包含python 和一些常用的python库,而且可以做到python环境管理,类似于如果一个项目使用2.7版本,另一个项目使用3.7版本。开发中项目切换也比较麻烦,使用了anaconda直接可以使用命令来切换。
下载的时候要注意:到官网下载很慢,推荐到清华镜像站下载。安装后再安装别的库有可能失败,需要把源替换为清华源,可以参考以下博客:
anaconda 清华镜像
pycharm直接下载免费版本的就可以。
anaconda+pycharm 使用参考
2. 爬虫库的选择 requests + beautifulSoup
这两个库的使用可以参考这两篇博客:
python requests库使用
python beautifulSoup库使用
三 代码
爬虫全部代码如下
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Module setup: crawl root URL plus a shared MySQL connection/cursor used by
# every function below. (Shebang moved above the coding line — a shebang is
# only honored on the very first line of the file.)
import sys
# Python 2 only: force UTF-8 as the default string encoding. On Python 3
# these names do not exist (reload is NameError) and UTF-8 is already the
# default, so skip gracefully instead of crashing at import time.
try:
    reload(sys)
    sys.setdefaultencoding('utf8')
except NameError:
    pass
import requests
from bs4 import BeautifulSoup
import pymysql

# Site being crawled; category URLs from cate_list are appended to this.
baseUrl = "https://www.9awx.com/"
# Single shared connection/cursor — fine for this single-threaded crawler.
conn = pymysql.connect('127.0.0.1', user="root", passwd="root", port=3306, db="db_book", charset="utf8")
cur = conn.cursor()
def checkContainer(item):
    """Return the left-hand list container (<div class="l">) inside *item*, or None."""
    container = item.find("div", class_="l")
    return container
# Strip the site's boilerplate notices from a chapter's text.
# BUG FIX: the original discarded every str.replace() return value (strings
# are immutable), so the function was a no-op — which is why the author's
# comment said "这方法没用" (this method is useless).
def dealText(text):
    """Return *text* with the site's promotional/boilerplate sentences removed."""
    for junk in (
        "温馨提示:",
        "本站提供《重生之绝世废少》最新章节阅读。",
        "同时提供《重生之绝世废少》全文阅读和全集txt下载。",
        "PC站小说和手机WAP站小说同步更新",
        "请使用手机访问 m.9awx.com 阅读。",
        "如你喜欢本站请将本站放入你的桌面以方便再次访问。",
    ):
        text = text.replace(junk, "")
    return text
# Download and persist one chapter (解析章节详情).
def loadChapterDetile(chapterId, chapterName, bookId, url):
    """Fetch the chapter page at *url*, extract its body text and save it.

    chapterId/chapterName/bookId identify the chapter row to create.
    Errors are logged and swallowed so one bad chapter doesn't stop the crawl,
    but the bare `except:` was narrowed to Exception so Ctrl-C still works.
    """
    try:
        resp = requests.get(url)
        soup = BeautifulSoup(resp.content, "html.parser", from_encoding="utf-8")
        # The chapter body lives in <div id="content"> on this site.
        content = soup.find("div", id="content")
        text = dealText(content.get_text())
        addChapterData(chapterId, chapterName, bookId, text)
    except Exception:
        print(chapterName + "出错了")
# Insert one chapter row (插入章节数据).
def addChapterData(chapterId, chapterName, bookid, chapterText):
    """Insert a chapter into chapter_list and commit.

    Uses a parameterized query: the original %-formatted SQL broke (and was
    injectable) whenever the chapter name or body contained a quote.
    cur.execute returns the affected-row count, 1 on success.
    """
    try:
        sql = ("insert into chapter_list(chapterid,chapterName,bookid,chapterText) "
               "values(%s,%s,%s,%s)")
        data = cur.execute(sql, (chapterId, chapterName, bookid, chapterText))
        # Commit immediately so a later crash doesn't lose saved chapters.
        cur.connection.commit()
        if data == 1:
            print(chapterName + ":保存成功")
        else:
            print(chapterName + "保存失败了>>>>>>")
    except Exception:
        print(chapterName + "出错了")
# Has this chapter already been saved? (是否下载过这章)
def isDownloadChapter(bookid, name):
    """Return True when chapter *name* of book *bookid* exists in chapter_list.

    Parameterized query: chapter titles routinely contain quotes, which broke
    the original %-formatted SQL. On a DB error we report it and return False
    explicitly (the original fell through returning None) so the caller just
    re-downloads the chapter — same best-effort behavior, made explicit.
    """
    try:
        sql = "select * from chapter_list where bookId=%s and chapterName=%s"
        cur.execute(sql, (bookid, name))
        return len(cur.fetchall()) > 0
    except Exception:
        print("检查是否下载过章节出错:bookid:" + str(bookid) + " 章节名:" + name)
        return False
# Crawl one book's chapter index page (解析图书内容).
def loadDetil(bookid, url):
    """Fetch the book index at *url* and download every not-yet-saved chapter.

    Chapter ids are synthesized as 1000000 + position in the index list;
    chapter URLs come from each <a href> relative to the index page.
    """
    try:
        resp = requests.get(url)
        soup = BeautifulSoup(resp.content, "html.parser", from_encoding="utf-8")
        # All chapter links live inside <div id="list"> on this site.
        chapter_links = soup.find("div", id="list").find_all("a")
        for count, item in enumerate(chapter_links, start=1):
            name = item.string
            chapterId = 1000000 + count
            fixurl = url.replace("index.html", item.get("href"))
            if not isDownloadChapter(bookid, name):
                print("未下载过章节:" + name)
                loadChapterDetile(chapterId, name, bookid, fixurl)
            else:
                print("已下载过章节:" + name)
    except Exception:
        # BUG FIX: the original did `print(bookid + "出错了")`, a TypeError
        # (int + str) that masked the real error; also narrowed the bare except.
        print(str(bookid) + "出错了")
# Insert one book row (插入图书数据).
def addBookData(bookid, bookName, bookAuthor, cateid):
    """Insert a book into book_list and commit.

    Parameterized query: the original %-formatted SQL was injectable and
    failed outright on any title/author containing a quote.
    cur.execute returns the affected-row count, 1 on success.
    """
    try:
        sql = ("insert into book_list(bookName,bookAuthor,cateId,bookid) "
               "values(%s,%s,%s,%s)")
        data = cur.execute(sql, (bookName, bookAuthor, cateid, bookid))
        cur.connection.commit()
        if data == 1:
            print(bookName + ":保存成功")
        else:
            print(bookName + "保存失败了>>>>>>")
    except Exception:
        print(bookName + "出错了")
# Has this book been recorded before? (判断是否下载过这本书)
def isDownloadBook(name):
    """Return True when a book named *name* already exists in book_list.

    Parameterized query: book titles with quotes broke the original
    %-formatted SQL. On a DB error, report it and return False explicitly
    (the original returned None) so the caller treats it as not downloaded.
    """
    try:
        sql = "select * from book_list where bookName=%s"
        cur.execute(sql, (name,))
        return len(cur.fetchall()) > 0
    except Exception:
        print("检查图书出错,书名:" + name)
        return False
# Crawl one category listing page (去下载各个分类下的书籍信息).
def mainLoad(url, cateid):
    """Scrape the category page at *url*: record each listed book under
    *cateid*, then crawl its chapters via loadDetil.

    The page layout: each <div id="newscontent"> holds a <div class="l">
    whose <li> rows contain the book link (span.s2 > a) and author (span.s4).
    """
    try:
        resp = requests.get(url)
        soup = BeautifulSoup(resp.content, "html.parser", from_encoding="utf-8")
        for item in soup.find_all("div", id="newscontent"):
            divl = item.find("div", class_="l")
            if divl is None:
                continue
            for li in divl.find_all("li"):
                a = li.find("span", class_="s2").find("a")
                bookName = a.string
                name = li.find("span", class_="s4").string  # author
                if not isDownloadBook(bookName):
                    # Allocate the next crawler-assigned bookid (max + 1);
                    # guard the empty-table case (the original IndexError'd).
                    cur.execute("select bookid from book_list order by bookid desc limit 1")
                    rows = cur.fetchall()
                    bookId = rows[0][0] + 1 if rows else 1
                    print("未下载过:" + name)
                    addBookData(bookId, bookName, name, cateid)
                else:
                    # BUG FIX: reuse the existing book's id. The original
                    # passed max+1 even for known books, so chapter de-dup
                    # checked the wrong bookId and every chapter re-saved.
                    cur.execute("select bookid from book_list where bookName=%s", (bookName,))
                    bookId = cur.fetchall()[0][0]
                    print("已下载过:" + name)
                loadDetil(bookId, a.get("href"))
    except Exception:
        print("出问题了")
# Fetch every category's id and URL, then crawl each category page.
def findAllCateInfo():
    """Read all rows of cate_list and crawl each category's listing page."""
    cur.execute("select cateid,cateUrl from cate_list")
    for cate_id, cate_url in cur.fetchall():
        mainLoad(baseUrl + cate_url + "/", cate_id)
if __name__ == '__main__':
    # Entry point: crawl every category found in cate_list.
    findAllCateInfo()
    pass