A 7-Day Baidu PaddlePaddle Introductory Course Experience
I stumbled on Baidu PaddlePaddle's "from beginner to expert" course through a WeChat official account. Now that the course is over, it was well worth the time.
The 7-day course is completely free, the content is rich, the assignments build on one another, and experienced mentors provide guidance in the WeChat group throughout!

Day 1: Python basics
Assignment: file search and storage
Topic: getting familiar with Python's os module

# Import the os module
import os

# Directory path to search
path = "Day1-homework"
# Keyword to search for in file paths
filename = "2020"
# List that holds the search results
result = []
index = 0

def findfiles(path):
    # Alternative approach using os.walk:
    # for root, dirs, files in os.walk(path):
    #     for filename in files:
    #         print(os.path.join(root, filename))
    global index
    files = os.listdir(path)
    for file in files:
        file_path = os.path.join(path, file)
        if os.path.isdir(file_path):
            findfiles(file_path)
        else:
            if filename in file_path:
                index = index + 1
                result.append(index)
                result.append(file_path)
                print(result)
                result.clear()

if __name__ == '__main__':
    findfiles(path)
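For comparison, the same search can also be written with os.walk, which handles the directory recursion itself (this is the approach hinted at in the commented-out lines above). A minimal sketch, with findfiles_walk and the keyword parameter being my own names rather than the course's:

import os

def findfiles_walk(path, keyword="2020"):
    # Recursively visit every file under path and print those whose path contains keyword
    index = 0
    for root, dirs, files in os.walk(path):
        for name in files:
            file_path = os.path.join(root, name)
            if keyword in file_path:
                index += 1
                print([index, file_path])

if __name__ == '__main__':
    findfiles_walk("Day1-homework")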

Day 2:
Main topic: web scraping with the requests and BeautifulSoup4 libraries
The scraping workflow (a minimal sketch of these four steps follows the list):
1. Send a request (requests module)
2. Get the response data (returned by the server)
3. Parse and extract the data (BeautifulSoup lookups or re regular expressions)
4. Save the data
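As a hedged illustration of the four steps, here is a minimal sketch. The URL https://httpbin.org/html is only a stand-in test page, not something from the course:

import requests
from bs4 import BeautifulSoup

# 1. Send a request
response = requests.get('https://httpbin.org/html', timeout=10)

# 2. Get the response data returned by the server
html = response.text

# 3. Parse and extract data with BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
title = soup.find('h1').text

# 4. Save the data
with open('result.txt', 'w', encoding='UTF-8') as f:
    f.write(title)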
Part 1: Crawl the information of all contestants of 《青春有你2》 from Baidu Baike and return the page data

import json
import re
import requests
import datetime
from bs4 import BeautifulSoup
import os

# Get today's date formatted as e.g. 20200420; used later for file naming
today = datetime.date.today().strftime('%Y%m%d')

def crawl_wiki_data():
    """
    Crawl the contestant information of 《青春有你2》 from Baidu Baike and return the html table
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    url = 'https://baike.baidu.com/item/青春有你第二季'

    try:
        response = requests.get(url, headers=headers)
        print(response.status_code)

        # Pass the document to the BeautifulSoup constructor to get a document object
        soup = BeautifulSoup(response.text, 'lxml')
        # Return all <table> tags whose class is "table-view log-set-param"
        tables = soup.find_all('table', {'class': 'table-view log-set-param'})

        crawl_table_title = "参赛学员"
        for table in tables:
            # Search the tags and strings that precede the current node
            table_titles = table.find_previous('div').find_all('h3')
            for title in table_titles:
                if crawl_table_title in title:
                    return table
    except Exception as e:
        print(e)
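Called on its own, the function returns the bs4 Tag for the contestant table, or None if the request fails or the table is not found. A quick sanity-check sketch (the row count depends on the live Baike page):

table = crawl_wiki_data()
if table is not None:
    rows = table.find_all('tr')
    print('got contestant table with %d rows' % len(rows))
else:
    print('contestant table not found')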

Part 2: Parse the crawled page data and save it as a JSON file

def parse_wiki_data(table_html):
    '''
    Parse the contestant information from the html returned by Baidu Baike,
    and save it as a JSON file named after today's date under the work directory
    '''
    bs = BeautifulSoup(str(table_html), 'lxml')
    all_trs = bs.find_all('tr')

    error_list = ['\'', '\"']
    stars = []
    for tr in all_trs[1:]:
        all_tds = tr.find_all('td')
        star = {}
        # Name
        star["name"] = all_tds[0].text
        # Link to the contestant's Baidu Baike page
        star["link"] = 'https://baike.baidu.com' + all_tds[0].find('a').get('href')
        # Hometown
        star["zone"] = all_tds[1].text
        # Constellation
        star["constellation"] = all_tds[2].text
        # Height
        star["height"] = all_tds[3].text
        # Weight
        star["weight"] = all_tds[4].text

        # Flower word: strip any single or double quotes
        flower_word = all_tds[5].text
        for c in flower_word:
            if c in error_list:
                flower_word = flower_word.replace(c, '')
        star["flower_word"] = flower_word

        # Company
        if not all_tds[6].find('a') is None:
            star["company"] = all_tds[6].find('a').text
        else:
            star["company"] = all_tds[6].text

        stars.append(star)

    json_data = json.loads(str(stars).replace("\'", "\""))
    with open('work/' + today + '.json', 'w', encoding='UTF-8') as f:
        json.dump(json_data, f, ensure_ascii=False)
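The str(stars).replace(...) round-trip only exists to turn Python's repr into valid JSON. Since stars is already a list of dicts, json.dump can serialize it directly; a minimal alternative for the last three lines (my own simplification, not the course solution):

# Serialize the list of dicts directly; ensure_ascii=False keeps the Chinese text readable
with open('work/' + today + '.json', 'w', encoding='UTF-8') as f:
    json.dump(stars, f, ensure_ascii=False)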

Part 3: Crawl each contestant's pictures from Baidu Baike and save them

def crawl_pic_urls():
    '''
    Crawl each contestant's pictures from Baidu Baike and save them
    '''
    with open('work/' + today + '.json', 'r', encoding='UTF-8') as file:
        json_array = json.loads(file.read())

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }

    for star in json_array:
        name = star['name']
        link = star['link']

        # !!! Crawl every picture of this contestant and collect all picture urls in the list pic_urls !!!
        response = requests.get(link, headers=headers)
        bs = BeautifulSoup(response.text, 'lxml')
        pic_list_url = bs.select('.summary-pic a')[0].get('href')
        pic_list_url = 'https://baike.baidu.com' + pic_list_url  # page that lists the pictures
        pic_list_response = requests.get(pic_list_url, headers=headers)
        bs = BeautifulSoup(pic_list_response.text, 'lxml')
        pic_list_html = bs.select('.pic-list img')
        pic_urls = []
        for pic_html in pic_list_html:
            pic_url = pic_html.get('src')
            pic_urls.append(pic_url)

        down_pic(name, pic_urls)

def down_pic(name, pic_urls):
    '''
    Download every picture in pic_urls and save them in a folder named after the contestant
    '''
    path = 'work/' + 'pics/' + name + '/'
    if not os.path.exists(path):
        os.makedirs(path)

    for i, pic_url in enumerate(pic_urls):
        try:
            pic = requests.get(pic_url, timeout=15)
            string = str(i + 1) + '.jpg'
            with open(path + string, 'wb') as f:
                f.write(pic.content)
                print('成功下载第%s张图片: %s' % (str(i + 1), str(pic_url)))
        except Exception as e:
            print('下载第%s张图片时失败: %s' % (str(i + 1), str(pic_url)))
            print(e)
            continue
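down_pic can also be exercised on its own. A minimal sketch of the call shape, with a hypothetical name and placeholder urls rather than real course data (the placeholders will simply hit the except branch):

# Hypothetical inputs just to show how down_pic is invoked
sample_name = 'test_star'
sample_urls = [
    'https://example.com/pic1.jpg',
    'https://example.com/pic2.jpg',
]
down_pic(sample_name, sample_urls)  # would save work/pics/test_star/1.jpg, 2.jpg, ...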
Part 4: Print the paths of all crawled pictures
def show_pic_path(path):
    '''
    Walk through every crawled picture and print its absolute path
    '''
    pic_num = 0
    for (dirpath, dirnames, filenames) in os.walk(path):
        for filename in filenames:
            pic_num += 1
            print("第%d张照片:%s" % (pic_num, os.path.join(dirpath, filename)))
    print("共爬取《青春有你2》选手的%d照片" % pic_num)

if __name__ == '__main__':
    # Crawl the contestant information of 《青春有你2》 from Baidu Baike and return the html
    html = crawl_wiki_data()

    # Parse the html to get the contestant information and save it as a json file
    parse_wiki_data(html)

    # Crawl pictures from each contestant's Baidu Baike page and save them
    crawl_pic_urls()

    # Print the paths of the crawled contestant pictures
    show_pic_path('/home/aistudio/work/pics/')

    print("所有信息爬取完成!")

Day 3: Contestant data analysis
This lesson focuses on using libraries such as numpy, matplotlib, PIL, and pandas (a small PIL/numpy sketch follows below).
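PIL is not shown in the plotting snippets that follow, so here is a minimal, hedged sketch of how it fits in. The file path is only an assumption that one of the Day 2 downloads exists there:

import numpy as np
from PIL import Image

# Assumed path to one of the pictures downloaded on Day 2; adjust to a file that actually exists
img = Image.open('work/pics/test_star/1.jpg')
arr = np.array(img)   # convert the image into a numpy array
print(arr.shape)      # (height, width, channels)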

Bar chart of the contestants' regional distribution

import matplotlib.pyplot as plt
import numpy as np
import json
import matplotlib.font_manager as font_manager

# Display the figures generated by matplotlib
%matplotlib inline

with open('data/data31557/20200422.json', 'r', encoding='UTF-8') as file:
    json_array = json.loads(file.read())

# Draw a bar chart of the regional distribution: x axis is the region, y axis is the number of contestants from that region
zones = []
for star in json_array:
    zone = star['zone']
    zones.append(zone)
print(len(zones))
print(zones)

# Alternative counting approach:
# zone_list = []
# count_list = []
# for zone in zones:
#     if zone not in zone_list:
#         count = zones.count(zone)
#         zone_list.append(zone)
#         count_list.append(count)

count_dic = {}
for zone in zones:
    count_dic[zone] = count_dic.get(zone, 0) + 1
zone_list = list(count_dic.keys())
count_list = list(count_dic.values())
print(zone_list)
print(count_list)

# Enable Chinese text in the plot
plt.rcParams['font.sans-serif'] = ['SimHei']  # set the default font

plt.figure(figsize=(20, 15))
plt.bar(range(len(count_list)), count_list, color='r', tick_label=zone_list, facecolor='#9999ff', edgecolor='white')

# Rotate the x tick labels (rotation is in degrees) and set the tick font sizes
plt.xticks(rotation=45, fontsize=20)
plt.yticks(fontsize=20)
plt.legend()
plt.title('''《青春有你2》参赛选手''', fontsize=24)
plt.savefig('/home/aistudio/work/result/bar_result.jpg')
plt.show()
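The manual count_dic loop above does exactly what collections.Counter does in the standard library; a hedged, shorter alternative (not the course code):

from collections import Counter

# Same counts as count_dic, built by the standard library
counter = Counter(zones)
zone_list = list(counter.keys())
count_list = list(counter.values())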


Pie chart of the contestants' weights
import matplotlib.pyplot as plt
import numpy as np
import json
import matplotlib.font_manager as font_manager
import pandas as pd

# Display the figures generated by matplotlib
%matplotlib inline

df = pd.read_json(r'data/data31557/20200422.json')
weights = df['weight']
arrs = weights.values

# Strip the trailing "kg" and convert to float
for i in range(len(arrs)):
    arrs[i] = float(arrs[i][0:-2])

# Cut the data into the weight bins
bin = [0, 45, 50, 55, 100]
se1 = pd.cut(arrs, bin)
# Count the values in each bin (sorted by frequency by default)
sizes = pd.value_counts(se1)

labels = '<=45kg', '45kg~50kg', '50kg~55kg', '>55kg'
explode = (0.1, 0.1, 0.1, 0)

# fig1, ax1 = plt.subplots()
ax1 = plt.subplot(111)
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', shadow=True, startangle=165)
ax1.axis('equal')
plt.savefig('/home/aistudio/work/result/pie_result01.jpg')
plt.show()
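One thing to watch: pd.value_counts sorts by frequency by default, so the slices may not line up with the hard-coded labels tuple. A hedged adjustment (my own fix, not the course solution) is to keep the counts in bin order:

# Keep the counts in bin order so they match the labels '<=45kg', '45kg~50kg', '50kg~55kg', '>55kg'
sizes = pd.value_counts(se1, sort=False)
print(sizes)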