# coding: utf-8

# In[63]:

import requests
from pyquery import PyQuery as pq

def get_article(news_url):
    """Download one article page and pull out the fields of interest.

    Args:
        news_url: URL of the article page to fetch.

    Returns:
        A dict with the keys 'title', 'content', 'time' and 'keyword',
        each holding the text of the matching element(s) on the page.
    """
    # NOTE(review): no timeout is passed, so a stalled server blocks forever.
    response = requests.get(url=news_url)
    # Decode the body as UTF-8 regardless of what the response headers claim.
    response.encoding = 'utf-8'
    doc = pq(response.text)
    return {
        'title': doc.find('#artibodyTitle').text(),
        # split() + join() strips every whitespace character from the text.
        'content': ''.join(doc.find('#artibody > p').text().split()),
        'time': doc.find('#navtimeSource').text(),
        'keyword': doc.find('.article-keywords > a').text(),
    }


# get_article must be defined before this loop runs: the crawl below calls it
# immediately, so defining it afterwards raises NameError in a fresh run.
response = requests.get(url='http://news.sina.com.cn/china/')
response.encoding = 'utf-8'
html = response.text
doc = pq(html)
news_list = []
for item in doc('.news-item').items():
    news_url = item.find('h2 > a').attr('href')
    # Skip list entries that carry no (or an empty) link.
    if news_url:
        news_list.append(get_article(news_url))
news_list


# In[64]:

import pandas as pd
# One row per scraped article; the columns come from the article dict keys.
df = pd.DataFrame(news_list)
df.head()


# In[65]:

# Turn the whitespace-separated keyword text into a list of keywords.
keyword_lists = df['keyword'].map(lambda text: text.split())
df['keyword'] = keyword_lists
df.head()


# In[66]:

# Preview only: the raw 'time' text split on whitespace (nothing is stored).
time_parts = df['time'].map(lambda text: text.split())
time_parts.head()


# In[67]:

# Pattern applied to the raw 'time' text:
#   group 1 - the timestamp, written as <digits>年<digits>月<digits>日<digits>:<digits>
#   group 2 - the run of word characters that follows it after whitespace
# Written as a raw string so \d, \s and \w are passed to the regex engine
# verbatim instead of being invalid string escapes (a warning on current
# Python versions), and defined once so both cells use the same pattern.
TIME_SOURCE_PATTERN = r'(\d+年\d+月\d+日\d+:\d+)\s+(\w+)'

# Preview only: show the two captured groups without modifying df.
df['time'].str.extract(TIME_SOURCE_PATTERN).head()


# In[68]:

# Store the two captured groups as the new 'datetime' and 'from' columns.
df[['datetime', 'from']] = df['time'].str.extract(TIME_SOURCE_PATTERN)
df.head()


# In[69]:

# Parse the extracted timestamp text into pandas datetime values.
parsed_datetimes = pd.to_datetime(df['datetime'], format='%Y年%m月%d日%H:%M')
df['datetime'] = parsed_datetimes
df['datetime'].head()


# In[70]:

# Preview only: (year, month, day) tuple for each timestamp; nothing is stored.
df['datetime'].map(lambda stamp: (stamp.year, stamp.month, stamp.day)).head()


# In[71]:

# Drop the raw 'time' text column.
del df['time']
df.head()


# In[72]:

# Keep only the final columns, in the order they should appear in the output.
column_order = ['from', 'title', 'content', 'keyword', 'datetime']
df = df[column_order]


# In[73]:

# Write the table to an Excel workbook (relative path, so the current
# working directory).
df.to_excel('news.xlsx')


# python - Scrape Sina News (domestic, latest news) and convert to xlsx_html