Storing the Data in a Database
1. Create the Database
This example uses a MySQL database.
**Note:** open the terminal in administrator mode (required for `net start`).
First run `net start mysql` to start the MySQL service.
You can then try logging in to MySQL. The syntax is:
`mysql -h <host/IP> -u <username> -P <port> -p`
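For example, logging in to a local server as root on MySQL's default port 3306: `mysql -h 127.0.0.1 -u root -P 3306 -p` (the `-p` flag prompts for the password).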
Using the Navicat database GUI: create a new connection; for a local server the host is localhost (127.0.0.1). Once connected, the server appears in Navicat's connection list (screenshot omitted).
Create a new database, choosing the utf8 encoding so Chinese text is stored correctly.
Create a new table whose `id` column is an auto-increment primary key.
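If you would rather script this step than click through Navicat, a minimal pymysql sketch of the same setup might look like the following. The column types are my assumption; the credentials match the ones added later in settings.py.

```python
# One-off setup script: create the mymovie database and movie table.
import pymysql

client = pymysql.connect(host='127.0.0.1', user='root', password='123456')
cursor = client.cursor()
cursor.execute('CREATE DATABASE IF NOT EXISTS mymovie CHARACTER SET utf8')
cursor.execute('''
    CREATE TABLE IF NOT EXISTS mymovie.movie (
        id INT PRIMARY KEY AUTO_INCREMENT,  -- auto-increment primary key
        name VARCHAR(255),                  -- column types are assumptions
        categories VARCHAR(255),
        score FLOAT
    ) CHARACTER SET utf8
''')
client.close()
```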
2. Configure Scrapy to Connect to the Database
- In items.py:
Declare the fields you want to store, so the scraped data is first cleaned into items.
We store three fields: name, categories, and score.
```python
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class MovieItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    categories = scrapy.Field()
    score = scrapy.Field()
```
- Then switch to movie.py and import the MovieItem class from items.py:

```python
# import the MovieItem class from items.py
from ..items import MovieItem
```

Flesh out the get_content callback: for each movie in the JSON response, instantiate a MovieItem, fill in the cleaned name, categories, and score values, and yield it. Every yielded item is automatically handed on to pipelines.py:
```python
def get_content(self, response):
    source = json.loads(response.text)
    for i in source['results']:
        # one fresh item per movie record (avoids reusing one mutable item)
        item = MovieItem()
        # store the data in the fields defined in items.py
        item['name'] = i['name']
        item['categories'] = i['categories']
        item['score'] = i['score']
        # yielding hands the item over to pipelines.py
        yield item
```
- The items are delivered to pipelines.py, so that is where the remaining code goes.
Scrapy's item pipelines are disabled by default, so enable them first:
switch to settings.py, find ITEM_PIPELINES, and uncomment it.
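After uncommenting, the setting looks like this (300 is the pipeline's priority; lower numbers run earlier):

```python
ITEM_PIPELINES = {
    'Movie.pipelines.MoviePipeline': 300,
}
```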
With the pipeline enabled, switch to pipelines.py. The complete code:
```python
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql
from pymysql.converters import escape_string


# Step 4: pull the data back out of the items
class MoviePipeline:
    def __init__(self, mysql_url, mysql_db, mysql_user, mysql_password):
        self.mysql_url = mysql_url
        self.mysql_db = mysql_db
        self.mysql_user = mysql_user
        self.mysql_password = mysql_password

    @classmethod
    def from_crawler(cls, crawler):
        # read the connection settings added at the bottom of settings.py
        return cls(
            mysql_url=crawler.settings.get('MYSQL_URL'),
            mysql_db=crawler.settings.get('MYSQL_DB'),
            mysql_user=crawler.settings.get('MYSQL_USER'),
            mysql_password=crawler.settings.get('MYSQL_PASSWORD'),
        )

    def open_spider(self, spider):
        print(self.mysql_url, self.mysql_db)
        self.client = pymysql.connect(
            user=self.mysql_user,
            password=self.mysql_password,
            host=self.mysql_url,
            db=self.mysql_db,
        )
        self.cursor = self.client.cursor()

    def process_item(self, item, spider):
        # pull the cleaned data out of the item
        name = item['name']
        # categories is a list, so join it into one string with '|'
        categories = '|'.join(item['categories'])
        score = item['score']
        sql = 'insert into movie(name, categories, score) values ("{}", "{}", "{}")'.format(name, categories, score)
        print(sql)
        self.cursor.execute(sql)
        self.client.commit()
        return item

    def close_spider(self, spider):
        self.client.close()
```
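One caveat worth flagging: building the statement with `str.format` breaks on movie names containing double quotes and is open to SQL injection (note that `escape_string` is imported above but never used). A safer sketch of the same insert, using pymysql's own parameter binding:

```python
# Parameterized insert: pymysql quotes and escapes the values itself,
# so a quote character in a movie name cannot break the statement.
sql = 'insert into movie(name, categories, score) values (%s, %s, %s)'
self.cursor.execute(sql, (name, categories, score))
self.client.commit()
```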
Code walkthrough:

- The methods `__init__()`, `from_crawler()`, `open_spider()`, and `close_spider()` are boilerplate you can copy as-is: `from_crawler()` reads the MYSQL_* values from settings.py, and `open_spider()`/`close_spider()` open and close the connection around the crawl.
- In `process_item()`, pull the cleaned data out of the item:

```python
name = item['name']
categories = '|'.join(item['categories'])
score = item['score']
```

- Still in `process_item()`, build the SQL statement and execute it to insert the row:

```python
self.cursor.execute(sql)
self.client.commit()
return item
```

The closing `return item` is required boilerplate.
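To make the '|' join concrete, here is a hypothetical item and the statement `process_item()` would print for it (the movie values are invented purely for illustration):

```python
# Hypothetical item; the values are invented for illustration only
item = {'name': 'Example Movie', 'categories': ['Drama', 'Romance'], 'score': 9.0}
categories = '|'.join(item['categories'])  # -> 'Drama|Romance'
# process_item() would print:
# insert into movie(name, categories, score) values ("Example Movie", "Drama|Romance", "9.0")
```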
We have not finished wiring up the database connection yet; the settings variables the pipeline reads are still unassigned.

- Switch to settings.py and add at the very end:
```python
MYSQL_URL = '127.0.0.1'
MYSQL_DB = 'mymovie'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '123456'
```
- `MYSQL_URL`: the MySQL server address
- `MYSQL_DB`: the name of the database to use
- `MYSQL_USER`: the MySQL username
- `MYSQL_PASSWORD`: the MySQL password
After that, run run.py and the scraped data is written into the database.
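run.py here is the usual launcher script from the earlier part of this series; if you don't have it yet, a minimal sketch (assuming the spider name 'movie' and running from the project root) is:

```python
# run.py: start the 'movie' spider without typing the scrapy CLI command
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'movie'])
```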
Result: the `movie` table now contains the scraped rows (screenshot omitted).
3. Code Summary
In movie.py:
```python
import scrapy
import json
# import the MovieItem class from items.py
from ..items import MovieItem


class MovieSpider(scrapy.Spider):
    name = 'movie'
    allowed_domains = ['spa1.scrape.center']
    # start_urls = ['http://spa1.scrape.center/']

    def start_requests(self):
        # build the API URLs found earlier via demo.py with a list
        # comprehension; offsets 0, 10, ..., 90 give ten pages of ten movies
        web_url = ['https://spa1.scrape.center/api/movie?limit=10&offset={}'.format(page)
                   for page in range(0, 91, 10)]
        for i in web_url:
            yield scrapy.Request(i, self.get_content)

    def get_content(self, response):
        source = json.loads(response.text)
        for i in source['results']:
            # one fresh item per movie record
            item = MovieItem()
            item['name'] = i['name']
            item['categories'] = i['categories']
            item['score'] = i['score']
            # yielding hands the item over to pipelines.py
            yield item
```
In items.py:
```python
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class MovieItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    categories = scrapy.Field()
    score = scrapy.Field()
```
In pipelines.py:
```python
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql
from pymysql.converters import escape_string


# Step 4: pull the data back out of the items
class MoviePipeline:
    def __init__(self, mysql_url, mysql_db, mysql_user, mysql_password):
        self.mysql_url = mysql_url
        self.mysql_db = mysql_db
        self.mysql_user = mysql_user
        self.mysql_password = mysql_password

    @classmethod
    def from_crawler(cls, crawler):
        # read the connection settings added at the bottom of settings.py
        return cls(
            mysql_url=crawler.settings.get('MYSQL_URL'),
            mysql_db=crawler.settings.get('MYSQL_DB'),
            mysql_user=crawler.settings.get('MYSQL_USER'),
            mysql_password=crawler.settings.get('MYSQL_PASSWORD'),
        )

    def open_spider(self, spider):
        print(self.mysql_url, self.mysql_db)
        self.client = pymysql.connect(
            user=self.mysql_user,
            password=self.mysql_password,
            host=self.mysql_url,
            db=self.mysql_db,
        )
        self.cursor = self.client.cursor()

    def process_item(self, item, spider):
        # pull the cleaned data out of the item
        name = item['name']
        # categories is a list, so join it into one string with '|'
        categories = '|'.join(item['categories'])
        score = item['score']
        sql = 'insert into movie(name, categories, score) values ("{}", "{}", "{}")'.format(name, categories, score)
        print(sql)
        self.cursor.execute(sql)
        self.client.commit()
        return item

    def close_spider(self, spider):
        self.client.close()
```
In settings.py:
Uncomment ITEM_PIPELINES:
```python
ITEM_PIPELINES = {
    'Movie.pipelines.MoviePipeline': 300,
}
```
Add the database settings:
```python
MYSQL_URL = '127.0.0.1'
MYSQL_DB = 'mymovie'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '123456'
```
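As a quick sanity check after the crawl, you can query the table directly; a small sketch using the same connection settings as above:

```python
# Verify the crawled rows actually landed in MySQL
import pymysql

client = pymysql.connect(host='127.0.0.1', user='root',
                         password='123456', db='mymovie')
cursor = client.cursor()
cursor.execute('select count(*) from movie')
print('rows stored:', cursor.fetchone()[0])
client.close()
```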