python爬虫爬取安居客数据

转载

编程艺术家 2024-09-08 19:37:04

文章标签 python爬虫爬取安居客数据 ci html 数据 文章分类 Python 后端开发

课前说明：本章节请求的 url 部分用 ** 代替

本章节需要掌握的知识点：

1、如何用 xpath 解析数据；

2、如何用 csv 存储数据（注：Python 3.7 之前字典不保证键的顺序，3.7 起按插入顺序保存；写入 csv 时如果表头的顺序与字典键的顺序不一致，表头就会跟内容搭配不对，需要手动调整表头信息，或改用 csv.DictWriter 按列名写入）；

3、对csv 数据进行分析；

爬虫分三个模块：

1、请求模块：用于构造请求体，并将请求到的网页（数据）返回给解析模块；

2、解析模块：用于提取数据（本章节用xpath提取网页中的数据），并返回数据给存储模块；

3、存储模块：用于csv存储数据。

案例简介：

用于抓取网页 https://%s.lianj**.com/ershoufang/pg%d/ 中二手房的信息，例如：图片url、房价、位置、房型等信息。

简单解析一下爬虫的思路：

1、访问链接：https://%s.lianj**.com/ershoufang/pg%d/ 检查该网站是否为动态加载网站（本网站为静态网站）

2、如何判断网站是否为静态网站：右键检查网页源代码，从页面上随机选取我们要爬取的内容，例如房价信息，然后ctrl+f查找，若找到了，则为静态网页，否则为动态加载网页（多试几次，多用一些我们需要爬取的内容尝试）

3、观察网页url的变化，经过观察，页面的变化规律为：

https://%s.lianj**.com/ershoufang/pg1/

https://%s.lianj**.com/ershoufang/pg2/

https://%s.lianj**.com/ershoufang/pg3/

4、用xpath 解析网页内容：

右击检查 -> 点击（下图带箭头的标志） -> 然后点击 “简装三房望花园景观。。。”（我们要爬取的信息） -> 点击之后控制台右边会出现蓝色块，这个蓝色块就是我们想要的信息 -> 在这蓝色块上右击 -> Copy -> Copy xpath 这样就能得到我们想要的内容的 xpath 路径

python爬虫爬取安居客数据_ci

爬虫代码如下：

import csv
from urllib import request
from time import sleep

from lxml import etree


# 请求模块
def handle_url(url, page, city):
    """Build the request object for one listing page.

    :param url: URL template with two ``%`` placeholders, filled with
        ``(city, page)`` in that order
    :param page: page number substituted into the template
    :param city: city code substituted into the template
    :return: a ``urllib.request.Request`` for the formatted URL that carries
        a desktop-browser ``User-Agent`` header
    """
    user_agent = (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    )
    # Fill the city and page number into the template.
    target = url % (city, page)
    return request.Request(url=target, headers={'User-Agent': user_agent})


def request_data(url, city, start, end):
    """Fetch each listing page in turn and yield its decoded HTML.

    :param url: URL template passed through to ``handle_url``
    :param city: city code passed through to ``handle_url``
    :param start: first page number to fetch (inclusive)
    :param end: last page number to fetch (inclusive)
    :return: generator yielding one UTF-8 decoded HTML string per page;
        yields nothing when ``start`` is greater than ``end``
    """
    for page in range(start, end + 1):
        req = handle_url(url, page, city)
        # The context manager closes the HTTP response even when read() or
        # decode() raises, so the connection is not leaked.
        with request.urlopen(req) as res:
            html = res.read().decode('utf-8')
        # Pause two seconds after every request (presumably to avoid
        # hammering the site).
        sleep(2)
        yield html


# 解析模块
def _first_text(nodes):
    """Return the first XPath result, or '' when the query matched nothing."""
    return nodes[0] if nodes else ''


def analysis_html(html_list, city):
    """Parse listing pages and yield one dict per house entry.

    :param html_list: iterable of HTML strings, one per fetched page
    :param city: city code; for ``'bj'`` the extracted text pieces are
        trimmed with fixed slices, for every other city the full text is kept
    :return: generator of dicts with the keys ``title``, ``houseInfo``,
        ``positionInfo``, ``unitPrice``, ``totalPrice`` and ``pic``
        (inserted in that order)
    """
    for html in html_list:
        # Build an element tree from the raw HTML string.
        html_tree = etree.HTML(html)
        # Every <li> below the result list is a candidate house entry.
        for house in html_tree.xpath('//ul[@class="sellListContent"]//li'):
            # All queries below are relative to the current <li> node.
            title = house.xpath('.//div[@class="title"]/a/text()')
            if not title:
                # Skip <li> nodes without a title link for every city, so
                # such nodes can no longer raise IndexError.
                continue
            house_info = house.xpath('.//div[@class="houseInfo"]//text()')
            position_info = house.xpath('.//div[@class="positionInfo"]//text()')
            unit_price = house.xpath('.//div[@class="unitPrice"]//text()')
            total_price = house.xpath('.//div[@class="totalPrice"]//text()')
            pic = house.xpath('.//img[@class="lj-lazy"]/@data-original')

            item = {'title': title[0]}
            if city == 'bj':
                # 'bj' keeps only the leading text pieces of each field.
                item['houseInfo'] = ''.join(house_info[0:11])
                item['positionInfo'] = ''.join(position_info[0:5])
                item['unitPrice'] = _first_text(unit_price)
                item['totalPrice'] = ''.join(total_price[0:2])
            else:
                item['houseInfo'] = ''.join(house_info)
                item['positionInfo'] = ''.join(position_info)
                item['unitPrice'] = _first_text(unit_price)
                item['totalPrice'] = _first_text(total_price)
            # A missing price or picture becomes '' instead of crashing.
            item['pic'] = _first_text(pic)
            yield item



# 存储模块
def write_to_csv(data):
    """Append house records to ``ershoufang.csv`` in the working directory.

    The header row is written only while the file is still empty, and every
    value is placed under the column named after its dict key, so the header
    always lines up with the data regardless of the dicts' key order.

    :param data: iterable of dicts keyed by the column names listed in
        ``fieldnames``; keys missing from a record are written as empty cells
    :return: None
    :raises ValueError: if a record contains a key that is not a column name
    """
    fieldnames = ['positionInfo', 'pic', 'title', 'unitPrice', 'totalPrice', 'houseInfo']
    # newline='' is what the csv module asks for (otherwise blank rows appear
    # on Windows); an explicit utf-8 encoding keeps the output independent of
    # the platform's default encoding.
    with open('ershoufang.csv', 'a', newline='', encoding='utf-8') as fp:
        writer = csv.DictWriter(fp, fieldnames=fieldnames)
        # The file is opened for appending: emit the header only when there
        # is nothing in it yet, so repeated runs do not duplicate it.
        fp.seek(0, 2)
        if fp.tell() == 0:
            writer.writeheader()
        for item in data:
            writer.writerow(item)


def main():
    """Prompt for a city and a page range, then crawl, parse and store.

    Reads the city code and the first/last page numbers from standard input,
    then chains the request, parsing and storage steps.
    """
    url_template = 'https://%s.lianj**.com/ershoufang/pg%d/'
    city = input('请输入城市：')
    first_page = int(input('请输入起始页：'))
    last_page = int(input('请输入终止页：'))
    # Request -> parse -> store; the first two steps return generators, so
    # pages are fetched as the storage step consumes the records.
    pages = request_data(url_template, city, first_page, last_page)
    records = analysis_html(pages, city)
    write_to_csv(records)


if __name__ == '__main__':
    main()

作者爬取的是 gz (广州) 的所有二手房信息，现对二手房数据进行分析：

代码如下(推荐用Jupyter Notebook工具，方便调试)：

import re
import pandas as pd
import numpy as np
from pandas import Series,DataFrame
%matplotlib inline
import matplotlib.pyplot as plt

# Load the scraped second-hand housing records from the CSV file in the
# current directory.
ershoufang = pd.read_csv('./ershoufang.csv')

# 对二手房中的 houseInfo进行切割
def houseInfo(house):
    """Split one raw ``houseInfo`` string on the ``|`` separator.

    :param house: the raw string to split
    :return: list of the pieces in their original order (no stripping)
    """
    return house.split('|')


# 将切割后的信息整合成二维列表
def h_info(house):
    """Collect every element of an iterable into a new plain list.

    :param house: any iterable (the script passes the mapped ``houseInfo``
        column)
    :return: a new list holding the elements in iteration order
    """
    return list(house)


# 提取单价
def unit_price(price):
    """Extract the numeric unit price from a ``单价<digits>元/平米`` string.

    :param price: text containing the unit price
    :return: the first matched run of digits, converted to ``float``
    :raises IndexError: if the text contains no such pattern
    """
    matches = re.findall(r'单价(\d+)元/平米', price)
    return float(matches[0])


# Split houseInfo.  NOTE: this step really belongs in the crawler, which
# should store houseInfo as separate fields to make later analysis easier!
house_info = ershoufang['houseInfo'].map(houseInfo)
# Collect the per-row field lists into a plain list of lists
house_info = h_info(house_info)
# Turn the split fields into a DataFrame.
# assumes every row splits into the seven fields named here — TODO confirm
# against the scraped data
house_df = DataFrame(house_info,columns=['name','type','size','position','other1','elevator','other2'])
# Join the two DataFrames side by side (column-wise concatenation)
ershoufang = pd.concat([ershoufang,house_df],axis=1)
# Drop the raw column now that its fields have their own columns
del ershoufang['houseInfo']
# Strip the Chinese text from the unit price, leaving a float
ershoufang['unitPrice'] = ershoufang['unitPrice'].map(unit_price)
# Rename two of the column labels.
# NOTE(review): rename() is called without inplace=True and its result is not
# assigned, so this presumably only shows the renamed frame as notebook cell
# output; the groupby statements later in this script still use the original
# names 'unitPrice' and 'totalPrice', so do not assign the result back
# without updating them as well.
ershoufang.rename(columns={'unitPrice':'unitPrice(元/平米)','totalPrice':'totalPrice(万元)'})

# Group by house type and aggregate totalPrice and unitPrice (max/min/mean)
total_by_type = ershoufang.groupby('type')[['totalPrice']]
totalPrice_max = total_by_type.max()
totalPrice_min = total_by_type.min()
totalPrice_mean = total_by_type.mean()

unit_by_type = ershoufang.groupby('type')[['unitPrice']]
unitPrice_max = unit_by_type.max()
unitPrice_min = unit_by_type.min()
unitPrice_mean = unit_by_type.mean()

# Line chart of the unit price per house type (max / min / mean)
plt.figure(figsize=(16,9))
plt.plot(unitPrice_max,marker='*',markersize=10,label='unitPrice_max')
plt.plot(unitPrice_min,marker='*',markersize=10,label='unitPrice_min')
plt.plot(unitPrice_mean,marker='*',markersize=10,label='unitPrice_mean')
plt.grid(alpha=0.8)
# x axis: house types (label spelling corrected from 'tyeps')
plt.xlabel('types')
plt.ylabel('yuan/pingfang',rotation=0,horizontalalignment='right')
# Legend above the axes, laid out in three columns
plt.legend(loc=(0,1),ncol=3)
# y ticks every 10000 from 10000 up to 100000
plt.yticks(range(10000,110000,10000))

# Line chart of the total price per house type (max / min / mean)
plt.figure(figsize=(16,9))
plt.plot(totalPrice_max,marker='*',markersize=10,label='totalPrice_max')
plt.plot(totalPrice_min,marker='*',markersize=10,label='totalPrice_min')
plt.plot(totalPrice_mean,marker='*',markersize=10,label='totalPrice_mean')
plt.grid(alpha=0.8)
# x axis: house types (label spelling corrected from 'tyeps')
plt.xlabel('types')
plt.ylabel('totalPrice(w)',rotation=0,horizontalalignment='right')
# Legend above the axes, laid out in three columns
plt.legend(loc=(0,1),ncol=3)
# y ticks every 100 from 100 up to 2900
plt.yticks(range(100,3000,100))

单价线形图如下：

（注：由于matplotlib默认字体不包含中文字形，故横坐标中文显示不出来（可通过 plt.rcParams['font.sans-serif'] 指定中文字体解决），内容为几室几厅，最后两个为:双拼别墅，独栋别墅）

python爬虫爬取安居客数据_ci_02