一、选题背景
1998年我国实行住房制度改革,停止了长期实行的住房实物福利分配制度,房地产业由此逐渐活跃起来,随之出现了住房短缺的现象;加之社会快速城镇化、居民收入不断增长以及银行住房消费信贷的支持,住房的潜在需求不断增大,推动着房价持续快速上涨。随着国内经济水平的不断发展,物价和房价也在不断地增长。本文以广州为例,通过对广州二手房价的分析,可以直观地看出广州各个小区的房价差异,同时折射出各个区域的经济水平。
二、主题式网络爬虫设计方案
1.主题式网络爬虫名称
《Python爬虫之爬取广州二手房价》
2.主题式网络爬虫爬取的内容与数据特征分析
爬取内容:“广州” ,“二手房价”
数据特征分析:“二手房价”随着地区的不同也呈现着不同的价格,越接近市中心房价越贵,可通过后续的散点图,直方图等观察
3.主题式网络爬虫设计方案概述
实现思路:在浏览器中通过F12查看网页源代码,分析网站源代码,找到自己所需要的数据所在的位置,提取数据,并将数据保存到相同路径下的csv文件中;然后读取该文件,进行数据清洗、数据模型分析和数据可视化处理,绘制分布图、直方图、散点图。
技术难点:对库的使用和库中函数的运用;对爬取内容的结构进行分析处理并做数据分析,即求回归系数。此外,运行时经常会提示超出列表范围(IndexError: list index out of range),原因尚未完全查明,可能是部分li节点缺少对应字段,xpath返回空列表后直接取[0]所致。
三、主题页面的结构特征分析
1.主题页面的结构与特征分析:先寻找到链家对应的网页代码,紧接着寻找广州地区,二手房价,房价数据所对应的class标签。
2.页面解析
1 import requests
2 import threading
3 import pandas as pd
4 from lxml import etree
5 import csv
# Module-level accumulator: url_parse() appends one dict per listing here and
# run() later turns the whole list into a CSV file.
count = []


def url_creat(pages=100):
    """Build the list of Lianjia second-hand-housing result-page URLs.

    The original pointed at ``xm.lianjia.com`` although the report and the
    output file ('广州房价信息.csv') are about Guangzhou, so the Guangzhou
    sub-domain ``gz.lianjia.com`` is used here.

    Args:
        pages: Number of result pages to generate, starting from page 1.
            Defaults to 100, the number the original generated.

    Returns:
        A list of ``pages`` URL strings, for page 1 .. page ``pages``.
    """
    base_url = 'https://gz.lianjia.com/ershoufang/pg{}/'
    return [base_url.format(page) for page in range(1, pages + 1)]
16
17 #对url进行解析
# One lock shared by every worker thread.  The original created a new RLock
# inside each url_parse() call, so every thread locked its own private object
# and nothing was actually serialised.
_count_lock = threading.Lock()


def url_parse(url):
    """Fetch one result page and append its listings to ``count``.

    Every listing ``<li>`` under the ``sellListContent`` element becomes a
    dict with the keys 标题, 房屋类型, 区, 位置, 面积(平米), 单价(元/平米),
    总价(万), 介绍 and 网址.  Each dict is printed and then added to the
    module-level ``count`` list.

    Changes from the original request headers:
      * ``Host`` / ``Referer`` / ``Sec-Fetch-*`` named qd.lianjia.com and so
        contradicted the URL actually requested; requests derives ``Host``
        from the URL on its own.
      * The hard-coded ``Cookie`` was a captured browser session (it carried
        ``select_city=370200``) and its literal was broken across two lines.
      * ``Accept-Encoding`` is left to requests.
        NOTE(review): the original advertised ``br``, which presumably needs
        an optional brotli package to decode - confirm before re-adding.

    Args:
        url: Address of a single result page, e.g. one item of url_creat().

    Returns:
        None.  A page that cannot be downloaded is reported and skipped.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36',
    }
    try:
        # A timeout keeps one stalled connection from hanging the crawl.
        response = requests.get(url=url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        print('request failed, page skipped: {} ({!r})'.format(url, exc))
        return

    html = response.text
    if not html.strip():
        return
    tree = etree.HTML(html)
    if tree is None:
        return

    records = []
    # All <li> children of the listing container.
    for li in tree.xpath("//*[@class='sellListContent']/li"):
        try:
            title = li.xpath('./div/div/a/text()')[0]
            link = li.xpath('./div/div/a/@href')[0]
            # First / second link of the position row.
            # NOTE(review): presumably community name and district - the
            # original stores them under '区' and '位置' respectively.
            position = li.xpath('./div/div[2]/div/a/text()')[0]
            address = li.xpath('./div/div[2]/div/a[2]/text()')[0]
            # Free-text description, fields separated by ' | '.
            house_info = li.xpath('./div/div[3]/div/text()')[0]
            parts = house_info.split(' | ')
            types = parts[0]
            area = parts[1].replace('平米', '')
            count_price = li.xpath('.//div/div[6]/div/span/text()')[0]
            angle_price = li.xpath('.//div/div[6]/div[2]/span/text()')[0].replace('元/平', '')
        except IndexError:
            # An <li> without the expected fields makes xpath() return an
            # empty list; indexing it was the source of the
            # "list index out of range" crashes.  Skip such items.
            continue

        # Kept exactly as the original: drop the first two characters and the
        # last one.  The later cleaning step relies on this shape of '介绍'.
        # NOTE(review): this also truncates the final field - confirm intent.
        info = house_info[2:-1]

        dic = {'标题': title, '房屋类型': types, "区": position, '位置': address,
               '面积(平米)': area, "单价(元/平米)": angle_price,
               '总价(万)': count_price, '介绍': info, "网址": link}
        print(dic)
        records.append(dic)

    # Publish this page's rows in one short critical section.
    with _count_lock:
        count.extend(records)
def run(max_workers=8):
    """Crawl every result page concurrently and save the listings as CSV.

    The original started a thread and joined it immediately inside the same
    loop, so pages were downloaded strictly one after another.  A bounded
    thread pool overlaps the network waits while limiting how many requests
    hit the site at once.  Rows therefore arrive in ``count`` in completion
    order rather than page order.

    Args:
        max_workers: Maximum number of pages fetched at the same time.

    Returns:
        None.  Writes '广州房价信息.csv' into the current working directory.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    links = url_creat()
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {pool.submit(url_parse, link): link for link in links}
        for future in as_completed(pending):
            try:
                future.result()
            except Exception as exc:
                # Top-level boundary: one bad page must not abort the crawl.
                print('page skipped: {} ({!r})'.format(pending[future], exc))

    # Dump everything collected in ``count`` to a CSV file.
    data = pd.DataFrame(count)
    data.to_csv('广州房价信息.csv', index=False)


if __name__ == '__main__':
    run()
2.
# Load the scraped Guangzhou second-hand listings.
# run() writes the CSV with a relative path, so it is read back the same way
# instead of through an absolute path that exists on one machine only.
df = pd.read_csv('广州房价信息.csv')
df
3.
# Split the free-text '介绍' column into separate feature columns.
df = pd.read_csv('广州房价信息.csv')

# Split once and reuse; the separator in the raw text is ' | ', so each
# piece is stripped of the padding spaces left by splitting on '|'.
fields = df['介绍'].str.split('|')

# Number of halls.  Series.replace("厅", "") only matches cells whose whole
# value is "厅", so the original never removed the suffix; .str.replace
# performs the intended substring replacement.
df['厅'] = fields.str[0].str.replace('厅', '', regex=False).str.strip()
# Orientation.
df['朝向'] = fields.str[2].str.strip()
# Decoration type.
df['装修类型'] = fields.str[3].str.strip()
# Floor.
df['楼层'] = fields.str[4].str.strip()
# Construction year (same substring-replacement fix as above).
df['年份'] = fields.str[5].str.replace('年建', '', regex=False).str.strip()

# The raw description and the listing URL are not needed for the analysis.
df = df.drop(columns=['介绍', '网址'])
df
4.
# Visualisation: which orientation has the most listings?
import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.rcParams['font.sans-serif'] = ['SimHei']   # font able to render Chinese labels
mpl.rcParams['axes.unicode_minus'] = False

# Keep the counts in their own Series instead of overwriting ``df``.
# Plotting the Series directly also avoids looking the column up by name
# after to_frame(), which breaks on pandas >= 2.0 where it is called 'count'.
orientation_counts = df['朝向'].value_counts()[:16]

plt.figure(figsize=(15, 15))
plt.pie(orientation_counts.values, labels=orientation_counts.index, autopct='%.1f%%')
# The original title said "中国租房信息" (China rental info), which does not
# describe this Guangzhou second-hand-housing chart.
plt.title('广州二手房朝向分布', fontsize=20)
plt.show()
5.
# Which decoration type has the most listings?
import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.rcParams['font.sans-serif'] = ['SimHei']   # font able to render Chinese labels
mpl.rcParams['axes.unicode_minus'] = False

df = pd.read_csv('广州房价信息.csv')
df['装修类型'] = df['介绍'].str.split('|').str[3].str.strip()

# Counts live in a separate Series so ``df`` stays the full table for the
# cells that follow.  Plotting the Series directly is also independent of
# the column name to_frame() would produce ('count' on pandas >= 2.0).
decoration_counts = df['装修类型'].value_counts()[:16]

plt.figure(figsize=(15, 15))
plt.pie(decoration_counts.values, labels=decoration_counts.index, autopct='%.1f%%')
# Title corrected: the data are Guangzhou second-hand listings, not rentals.
plt.title('广州二手房装修类型分布', fontsize=20)
plt.show()
6.
# Which house type (room layout) has the most listings?
import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.rcParams['font.sans-serif'] = ['SimHei']   # font able to render Chinese labels
mpl.rcParams['axes.unicode_minus'] = False

# Reload the table: the previous cell replaced ``df`` with a counts frame
# that has no '房屋类型' column, so reading it from there raised KeyError.
df = pd.read_csv('广州房价信息.csv')
house_type_counts = df['房屋类型'].value_counts()[:16]

plt.figure(figsize=(15, 15))
plt.pie(house_type_counts.values, labels=house_type_counts.index, autopct='%.1f%%')
# Title corrected: the data are Guangzhou second-hand listings, not rentals.
plt.title('广州二手房房屋类型分布', fontsize=20)
plt.show()
7.
# House type versus total price, plus a count of listings per house type.
import matplotlib as mpl
import numpy as np   # not used in this cell; kept so later cells that relied on pylab's ``np`` still find it
import pandas as pd
from matplotlib import pyplot as plt
import jieba       # NOTE(review): unused in the visible code; kept from the original
import wordcloud   # NOTE(review): unused in the visible code; kept from the original

mpl.rcParams['font.sans-serif'] = ['SimHei']   # font able to render Chinese labels
mpl.rcParams['axes.unicode_minus'] = False

df = pd.read_csv('广州房价信息.csv')
# Named house_type rather than ``type`` so the builtin is not shadowed.
house_type = df['房屋类型']
price = df['总价(万)']

plt.figure(figsize=(30, 10))

# Left panel (1 row, 2 columns, 1st plot): price of every listing by type.
plt.subplot(1, 2, 1)
plt.scatter(house_type, price)
plt.xlabel('房屋类型')
plt.ylabel('价格')

# Right panel (1 row, 2 columns, 2nd plot): number of listings per type.
plt.subplot(1, 2, 2)
plt.title('类型统计', fontsize=20)
house_type.value_counts().plot(kind='bar')
plt.xlabel('房屋类型')
plt.show()
8.
# Count listings per value of the '位置' column; value_counts() returns the
# counts sorted from most to fewest listings.
# The CSV is read with the same relative path run() wrote it with, not an
# absolute path that exists on one machine only.
df = pd.read_csv('广州房价信息.csv')
region_num = df['位置'].value_counts()
region_num
9.
# Bar chart of the number of listings per region (uses ``region_num`` from
# the previous step).
import matplotlib.pyplot as plt

x = region_num.index.tolist()
y = region_num.values.tolist()

# [region, count] pairs as plain Python lists.
data_region_pair = [list(pair) for pair in zip(x, y)]

plt.rcParams['font.sans-serif'] = ['SimHei']   # SimHei is a Chinese font name
plt.rcParams['axes.unicode_minus'] = False

fig = plt.figure(figsize=(100, 50))
plt.title('广州区域房源数量条形图')
plt.xlabel('位置', fontsize=18, color='red')
plt.ylabel('房源数量', fontsize=18, color='red')

rects = plt.bar(x, y)

for rect in rects:
    height = rect.get_height()
    # Anchor each label at the top of its bar.  The original added a fixed
    # 20 data units, which pushes labels far off short bars when counts are
    # small.
    plt.text(rect.get_x() + rect.get_width() / 2, height, str(height), ha="center", va="bottom")

# Tick rotation does not depend on the bar, so it is set once.
plt.xticks(rotation=270)
plt.show()
9.
# Number of listings per value of the '区' column, as a horizontal bar chart.
import matplotlib.pyplot as plt

df = pd.read_csv('广州房价信息.csv')
fig, ax = plt.subplots()
fig.set_size_inches(10, 300)

# Ascending order, so the largest group ends up at the top of the chart.
xq = df['区'].value_counts().sort_values()
index = list(xq.index)
value = list(xq.values)

# The count column was labelled '城市' (city); it holds listing counts, so
# name it accordingly - the name also appears in the legend.
qx = pd.DataFrame({'区': index, '房源数量': value})
qx.plot.barh(x='区', y='房源数量', ax=ax, color='blue', fontsize=12)
plt.legend(loc='right')

# Label every bar.  The original zipped against np.arange(0, 14, 1), which
# silently stopped after the first 14 bars.
for row, listings in enumerate(value):
    plt.text(listings + 0.5, row, listings, fontsize=12)
plt.show()
10.
# Scatter plot of floor area against total price.
import matplotlib.pyplot as plt

df = pd.read_csv('广州房价信息.csv')
# Vectorised conversion instead of a per-row lambda.
home_area = df['面积(平米)'].astype(float)
total_price = df['总价(万)']

plt.scatter(home_area, total_price, s=3)
plt.title('广州房价情况', fontsize=15)
plt.xlabel('房屋面积', fontsize=15)
plt.ylabel('房价', fontsize=15)
plt.grid(linestyle=":", color="r")
# The listing was cut off at "plt.sho"; the call that displays the figure
# is restored here.
plt.show()
四、完整代码
1 import requests
2 import threading
3 import pandas as pd
4 from lxml import etree
5 import csv
# Module-level accumulator: url_parse() appends one dict per listing here and
# run() later turns the whole list into a CSV file.
count = []


def url_creat(pages=100):
    """Build the list of Lianjia second-hand-housing result-page URLs.

    The original pointed at ``xm.lianjia.com`` although the report and the
    output file ('广州房价信息.csv') are about Guangzhou, so the Guangzhou
    sub-domain ``gz.lianjia.com`` is used here.

    Args:
        pages: Number of result pages to generate, starting from page 1.
            Defaults to 100, the number the original generated.

    Returns:
        A list of ``pages`` URL strings, for page 1 .. page ``pages``.
    """
    base_url = 'https://gz.lianjia.com/ershoufang/pg{}/'
    return [base_url.format(page) for page in range(1, pages + 1)]
16
17 #对url进行解析
# One lock shared by every worker thread.  The original created a new RLock
# inside each url_parse() call, so every thread locked its own private object
# and nothing was actually serialised.
_count_lock = threading.Lock()


def url_parse(url):
    """Fetch one result page and append its listings to ``count``.

    Every listing ``<li>`` under the ``sellListContent`` element becomes a
    dict with the keys 标题, 房屋类型, 区, 位置, 面积(平米), 单价(元/平米),
    总价(万), 介绍 and 网址.  Each dict is printed and then added to the
    module-level ``count`` list.

    Changes from the original request headers:
      * ``Host`` / ``Referer`` / ``Sec-Fetch-*`` named qd.lianjia.com and so
        contradicted the URL actually requested; requests derives ``Host``
        from the URL on its own.
      * The hard-coded ``Cookie`` was a captured browser session (it carried
        ``select_city=370200``) and its literal was broken across two lines.
      * ``Accept-Encoding`` is left to requests.
        NOTE(review): the original advertised ``br``, which presumably needs
        an optional brotli package to decode - confirm before re-adding.

    Args:
        url: Address of a single result page, e.g. one item of url_creat().

    Returns:
        None.  A page that cannot be downloaded is reported and skipped.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36',
    }
    try:
        # A timeout keeps one stalled connection from hanging the crawl.
        response = requests.get(url=url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        print('request failed, page skipped: {} ({!r})'.format(url, exc))
        return

    html = response.text
    if not html.strip():
        return
    tree = etree.HTML(html)
    if tree is None:
        return

    records = []
    # All <li> children of the listing container.
    for li in tree.xpath("//*[@class='sellListContent']/li"):
        try:
            title = li.xpath('./div/div/a/text()')[0]
            link = li.xpath('./div/div/a/@href')[0]
            # First / second link of the position row.
            # NOTE(review): presumably community name and district - the
            # original stores them under '区' and '位置' respectively.
            position = li.xpath('./div/div[2]/div/a/text()')[0]
            address = li.xpath('./div/div[2]/div/a[2]/text()')[0]
            # Free-text description, fields separated by ' | '.
            house_info = li.xpath('./div/div[3]/div/text()')[0]
            parts = house_info.split(' | ')
            types = parts[0]
            area = parts[1].replace('平米', '')
            count_price = li.xpath('.//div/div[6]/div/span/text()')[0]
            angle_price = li.xpath('.//div/div[6]/div[2]/span/text()')[0].replace('元/平', '')
        except IndexError:
            # An <li> without the expected fields makes xpath() return an
            # empty list; indexing it was the source of the
            # "list index out of range" crashes.  Skip such items.
            continue

        # Kept exactly as the original: drop the first two characters and the
        # last one.  The later cleaning step relies on this shape of '介绍'.
        # NOTE(review): this also truncates the final field - confirm intent.
        info = house_info[2:-1]

        dic = {'标题': title, '房屋类型': types, "区": position, '位置': address,
               '面积(平米)': area, "单价(元/平米)": angle_price,
               '总价(万)': count_price, '介绍': info, "网址": link}
        print(dic)
        records.append(dic)

    # Publish this page's rows in one short critical section.
    with _count_lock:
        count.extend(records)
def run(max_workers=8):
    """Crawl every result page concurrently and save the listings as CSV.

    The original started a thread and joined it immediately inside the same
    loop, so pages were downloaded strictly one after another.  A bounded
    thread pool overlaps the network waits while limiting how many requests
    hit the site at once.  Rows therefore arrive in ``count`` in completion
    order rather than page order.

    Args:
        max_workers: Maximum number of pages fetched at the same time.

    Returns:
        None.  Writes '广州房价信息.csv' into the current working directory.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    links = url_creat()
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {pool.submit(url_parse, link): link for link in links}
        for future in as_completed(pending):
            try:
                future.result()
            except Exception as exc:
                # Top-level boundary: one bad page must not abort the crawl.
                print('page skipped: {} ({!r})'.format(pending[future], exc))

    # Dump everything collected in ``count`` to a CSV file.
    data = pd.DataFrame(count)
    data.to_csv('广州房价信息.csv', index=False)


if __name__ == '__main__':
    run()
83
84
# ---------------------------------------------------------------------------
# Data cleaning and visualisation of the scraped Guangzhou listings.
# ---------------------------------------------------------------------------
import matplotlib as mpl
import matplotlib.pyplot as plt
import jieba       # NOTE(review): unused in the visible code; kept from the original
import wordcloud   # NOTE(review): unused in the visible code; kept from the original

# Single source of truth for the data file.  run() writes it with a relative
# path; the original read it back through two different absolute paths
# (C:\Users\Administrator\... and C:/Users/bb/...).
CSV_PATH = '广州房价信息.csv'

mpl.rcParams['font.sans-serif'] = ['SimHei']   # font able to render Chinese labels
mpl.rcParams['axes.unicode_minus'] = False


def _plot_pie(counts, title):
    """Draw a pie chart of the 16 most frequent values in ``counts``."""
    top = counts[:16]
    plt.figure(figsize=(15, 15))
    # Plot the Series directly: looking the column up by name after
    # to_frame() breaks on pandas >= 2.0, where it is called 'count'.
    plt.pie(top.values, labels=top.index, autopct='%.1f%%')
    plt.title(title, fontsize=20)
    plt.show()


# --- Load and clean --------------------------------------------------------
listings = pd.read_csv(CSV_PATH)

# Split the free-text '介绍' column once; strip the padding left around '|'.
fields = listings['介绍'].str.split('|')
df = listings.copy()
# Series.replace("厅", "") only matches whole cell values, so the original
# never removed the suffix; .str.replace does the substring replacement.
df['厅'] = fields.str[0].str.replace('厅', '', regex=False).str.strip()
df['朝向'] = fields.str[2].str.strip()
df['装修类型'] = fields.str[3].str.strip()
df['楼层'] = fields.str[4].str.strip()
df['年份'] = fields.str[5].str.replace('年建', '', regex=False).str.strip()
df = df.drop(columns=['介绍', '网址'])

# --- Pie charts ------------------------------------------------------------
# Titles corrected: the original said "中国租房信息" (China rental info) on
# all three charts.  ``df`` is no longer overwritten by each chart's counts,
# which made the house-type chart fail with KeyError.
_plot_pie(df['朝向'].value_counts(), '广州二手房朝向分布')
_plot_pie(df['装修类型'].value_counts(), '广州二手房装修类型分布')
_plot_pie(df['房屋类型'].value_counts(), '广州二手房房屋类型分布')

# --- House type versus total price -----------------------------------------
# Named house_type rather than ``type`` so the builtin is not shadowed.
house_type = df['房屋类型']
price = df['总价(万)']

plt.figure(figsize=(30, 10))
plt.subplot(1, 2, 1)   # 1 row, 2 columns, 1st plot
plt.scatter(house_type, price)
plt.xlabel('房屋类型')
plt.ylabel('价格')

plt.subplot(1, 2, 2)   # 1 row, 2 columns, 2nd plot
plt.title('类型统计', fontsize=20)
house_type.value_counts().plot(kind='bar')
plt.xlabel('房屋类型')
plt.show()

# --- Listings per region ('位置') -------------------------------------------
# value_counts() returns the counts sorted from most to fewest listings.
region_num = df['位置'].value_counts()
x = region_num.index.tolist()
y = region_num.values.tolist()
# [region, count] pairs as plain Python lists.
data_region_pair = [list(pair) for pair in zip(x, y)]

fig = plt.figure(figsize=(100, 50))
plt.title('广州区域房源数量条形图')
plt.xlabel('位置', fontsize=18, color='red')
plt.ylabel('房源数量', fontsize=18, color='red')
rects = plt.bar(x, y)
for rect in rects:
    height = rect.get_height()
    # Anchor the label at the bar top; the original's fixed +20 data units
    # pushed labels far off short bars.
    plt.text(rect.get_x() + rect.get_width() / 2, height, str(height), ha="center", va="bottom")
plt.xticks(rotation=270)   # loop-invariant, set once
plt.show()

# --- Listings per '区' value -------------------------------------------------
fig, ax = plt.subplots()
fig.set_size_inches(10, 300)
xq = df['区'].value_counts().sort_values()
index = list(xq.index)
value = list(xq.values)
# The count column was labelled '城市' (city); it holds listing counts.
qx = pd.DataFrame({'区': index, '房源数量': value})
qx.plot.barh(x='区', y='房源数量', ax=ax, color='blue', fontsize=12)
plt.legend(loc='right')
# Label every bar; zipping against np.arange(0, 14, 1) stopped after 14.
for row, listings_in_group in enumerate(value):
    plt.text(listings_in_group + 0.5, row, listings_in_group, fontsize=12)
plt.show()

# --- Floor area versus total price ------------------------------------------
home_area = df['面积(平米)'].astype(float)
total_price = df['总价(万)']
plt.scatter(home_area, total_price, s=3)
plt.title('广州房价情况', fontsize=15)
plt.xlabel('房屋面积', fontsize=15)
plt.ylabel('房价', fontsize=15)
plt.grid(linestyle=":", color="r")
plt.show()
五、总结
1.广州不管是全新的房源或者是二手的房源价格都普遍超过大部分城市,位居全国第四
2.写完此课程设计让我对python的运用更进一步,也更加深刻的理解了数据可视化。近几周的python程序设计是我做过最有意义的一件事