遇到的问题:
1.用户昵称的编码问题。
请求https://m.weibo.cn/api/container/getSecond?containerid=1005055232655493_-_FOLLOWERS&page=4时,cookie中有一个参数H5_INDEX_TITLE,它是由userName经过urlencode编码得来的。因此要获得用户的userName,可以在登录微博后,在m.weibo.cn的网页源代码中查看获得。
userName是经过编码处理的:中文等非ASCII字符会转成Unicode转义序列(如\u4e36),而英文字符不会。我是通过正则表达式匹配得到的,匹配结果Meeeeeeeeeeeeee\u4e36中的\u4e36是字面的六个字符,不会自动转义成中文。这就存在一个问题:将匹配到的userName直接经过urlencode得到的H5_INDEX_TITLE不是我们想要的值,因为\u4e36被当作普通字符串进行了urlencode。
解决办法:使用decode
username.encode('utf-8').decode('unicode_escape'),decode('unicode_escape')将转义字符\u读取出来
参考地址:Python读取文件中unicode编码转成中文显示问题
2.urlencode
from urllib import parse

query = {
    'name': 'username',
    'age': 20,
}
parse.urlencode(query)  # 结果: 'name=username&age=20'
在线解密工具: https://tool.lu/encdec/
代码如下:
1 #!/usr/bin/env python3
2 # -*- coding:utf-8 -*-
3 import requests
4 import time
5 import random
6 import re
7 import csv
8 from urllib import parse
9 from user_agent import getUserAgent
10
11
class GetMweiboFollow(object):
    """Log in to m.weibo.cn and scrape the logged-in user's follow list.

    Workflow: login_mweibo() -> get_cookies() -> get_follow_url() ->
    get_follow() / write_to_csv().
    """

    def __init__(self, username, password):
        """
        Bind the credentials and create a requests.Session() so the
        logged-in state (cookies) persists across subsequent requests.
        :param username: Sina Weibo login account (e-mail, phone number, etc.;
                         QQ login not supported)
        :param password: account password
        """
        self.__username = username
        self.__password = password
        self.request = requests.Session()

    def login_mweibo(self):
        """Log in to Weibo through the passport.weibo.cn SSO endpoint.

        On success stores self.uid (the user id) and self.cookie_info (the raw
        Set-Cookie header, used later to build cookies) and returns
        (True, uid, cookie_info); returns None on failure or error.
        """
        print('登录前请关闭微博的登录保护!!!')
        user_agent = getUserAgent()
        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Content-Length': '286',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Host': 'passport.weibo.cn',
            'Origin': 'https://passport.weibo.cn',
            'Pragma': 'no-cache',
            # http%3A%2F%2Fm.weibo.cn%2F urldecodes to http://m.weibo.cn/
            'Referer': 'https://passport.weibo.cn/signin/welcome?entry=mweibo&r=http%3A%2F%2Fm.weibo.cn%2F',
            'User-Agent': user_agent
        }
        data = {
            'username': self.__username,
            'password': self.__password,
            'savestate': '1',
            'r': 'http://m.weibo.cn/',
            'ec': '0',
            'pagerefer': 'https://passport.weibo.cn/signin/welcome?entry=mweibo&r=http%3A%2F%2Fm.weibo.cn%2F',
            'entry': 'mweibo',
            'wentry': '',
            'loginfrom': '',
            'client_id': '',
            'code': '',
            'qq': '',
            'mainpageflag': '1',
            'hff': '',
            'hfp': ''
        }
        login_url = 'https://passport.weibo.cn/sso/login'
        try:
            time.sleep(random.uniform(1.0, 2.5))
            login_response = self.request.post(login_url, headers=headers, data=data)
            # Parse the JSON body once instead of re-parsing on every access.
            resp_json = login_response.json()
            login_status = resp_json['msg']  # login status message, used to detect failure
            if login_response.status_code == 200 and login_status == '用户名或密码错误':
                print('{}登录失败!'.format(login_status))
            else:
                print("{}成功登录微博!".format(data['username']))
                # A successful login is indicated by msg == '' or
                # retcode == 20000000; the HTTP status is 200 either way.
                self.uid = resp_json['data']['uid']
                # Keep the Set-Cookie header for get_cookies() below.
                self.cookie_info = login_response.headers['Set-Cookie']
                return True, self.uid, self.cookie_info
        except Exception as e:
            print('Error:', e.args)

    def get_cookies(self):
        """Build the cookie string used by get_follow_url() requests.

        Extracts SUB/SUHB/SCF/SSOLoginState from the login Set-Cookie header,
        then requests https://m.weibo.cn/ to obtain _T_WM and the user
        nickname (urlencoded into H5_INDEX_TITLE).  The full cookie string is
        stored in self.build_weibo_cookie.
        """
        # Pull SUB, SUHB, SCF, SSOLoginState (and ALF, unused) out of the
        # Set-Cookie header captured at login time.
        comp = re.compile(r'SUB=(.*?);.*?SUHB=(.*?);.*?SCF=(.*?);.*?SSOLoginState=(.*?);.*?ALF=(.*?);.*?')
        reg_info = re.findall(comp, self.cookie_info)[0]
        SUB, SUHB, SCF, SSOLoginState = reg_info[0], reg_info[1], reg_info[2], reg_info[3]
        # ALF = reg_info[4]
        # BUG FIX: the cookie parameter name is SUHB (exactly as the regex
        # above captures it), not SHUB as the original string concatenation
        # sent.
        m_weibo_cookie = ('SUB=' + SUB + ';'
                          + 'SUHB=' + SUHB + ';'
                          + 'SCF=' + SCF + ';'
                          + 'SSOLoginState=' + SSOLoginState)
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Cookie': m_weibo_cookie,
            'Host': 'm.weibo.cn',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': getUserAgent()
        }
        # Request m.weibo.cn for the remaining cookie parameters: _T_WM and
        # H5_INDEX_TITLE (the urlencoded nickname).  MLOGIN, H5_INDEX and
        # WEIBOCN_FROM carry fixed values.
        m_weibo_resp = self.request.get('https://m.weibo.cn/', headers=headers)
        username = re.findall(r'"userName":"(.*?)"', m_weibo_resp.text)[0]
        # The nickname arrives with non-ASCII characters as literal \uXXXX
        # escape sequences (ASCII text), so decode with 'unicode_escape' to
        # recover the real characters before urlencoding.
        username_unicode = username.encode('utf-8').decode('unicode_escape')
        _T_WM = re.findall(r'_T_WM=(.*?);', m_weibo_resp.headers['Set-Cookie'])[0]
        H5_INDEX_TITLE = parse.urlencode({'H5_INDEX_TITLE': username_unicode})
        self.build_weibo_cookie = (m_weibo_cookie + ';'
                                   + '_T_WM=' + _T_WM + ';'
                                   + 'MLOGIN=1;'
                                   + 'H5_INDEX=3;'
                                   + H5_INDEX_TITLE + ';'
                                   + 'WEIBOCN_FROM=1110006030')

    def get_follow_url(self, page=1, *args):
        """Request one page of the follow-list API (Ajax, max 10 users/page).

        :param page: when <= 1 the first page is requested; when > 1, args[0]
                     is the page number actually requested (interface kept for
                     caller compatibility)
        :return: (response, maxPage) when maxPage >= 1, the bare response when
                 maxPage < 1, or None on error / non-200 status
        """
        user_agent = getUserAgent()
        contain_uid = str(100505) + self.uid
        if page <= 1:
            params = {'containerid': '{}_-_FOLLOWERS'.format(contain_uid)}
            cookie = self.build_weibo_cookie
        else:
            params = {'containerid': '{}_-_FOLLOWERS'.format(contain_uid),
                      'page': args[0]}
            # Pages after the first additionally require M_WEIBOCN_PARAMS.
            cookie = self.build_weibo_cookie + ';' \
                + 'M_WEIBOCN_PARAMS=fid%3D{}_-_FOLLOWERS%26uicode%3D10000012'.format(contain_uid)
        headers = {
            'Accept': 'application/json, text/plain, */*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Cookie': cookie,
            'Host': 'm.weibo.cn',
            'Referer': 'https://m.weibo.cn/p/second?containerid={}'.format(params['containerid']),
            'User-Agent': user_agent,
            'X-Requested-With': 'XMLHttpRequest'
        }
        follow_url = 'https://m.weibo.cn/api/container/getSecond?'
        try:
            time.sleep(random.uniform(0.5, 2.7))
            resp = self.request.get(follow_url, headers=headers, params=params)
            if resp.status_code == 200:
                follow_maxPage = int(resp.json()['data']['maxPage'])
                if follow_maxPage >= 1:
                    return resp, follow_maxPage
                return resp
        except Exception as e:
            print(e.args)
            return None

    def get_follow(self, response):
        """Yield one info dict per followed user from an API response.

        :param response: a response object whose .json() body contains
                         data.cards as returned by the getSecond API
        """
        follow_info = response.json()['data']['cards']
        for info in follow_info:
            user = info['user']
            follow = {'id': user['id'],
                      'screen_name': user['screen_name'],
                      'gender': user['gender'],
                      'description': user['description'],
                      'followers_count': user['followers_count'],
                      'follow_count': user['follow_count'],
                      'statuses_count': user['statuses_count'],
                      'scheme': info['scheme']
                      }
            # BUG FIX: 'verified' is a JSON boolean, so the original
            # comparison == 'true' was never True and verified_reason came
            # out as 'None' for everyone.  Test its truthiness instead.
            if user['verified']:
                follow['verified_reason'] = user.get('verified_reason', 'None')
            else:
                follow['verified_reason'] = 'None'
            yield follow

    def write_to_csv(self, *args, has_title=True):
        """Write the follow list to follow.csv.

        :param args: when has_title is False, args[0] is the API response
                     whose rows get appended
        :param has_title: True -> (re)create the file and write only the
                          header row (done once); False -> append data rows
        """
        fieldnames = ['id', 'screen_name', 'gender', 'description', 'follow_count', 'followers_count',
                      'statuses_count', 'scheme', 'verified_reason']
        if has_title is True:
            # 'w' truncates, so the header is written to a fresh file.
            with open('follow.csv', 'w', encoding='utf-8', newline='') as file:
                csv.writer(file, delimiter=',').writerow(fieldnames)
        if has_title is False:
            with open('follow.csv', 'a+', encoding='utf-8', newline='') as file:
                writer = csv.writer(file, delimiter=',')
                for data in self.get_follow(args[0]):
                    print(data)
                    writer.writerow([data[key] for key in fieldnames])
195
def main():
    """Prompt for credentials, log in, and dump the follow list to follow.csv."""
    user = input('user:')
    passwd = input('password:')
    start_time = time.time()
    gkp = GetMweiboFollow(user, passwd)
    gkp.login_mweibo()
    gkp.get_cookies()
    # BUG FIX: the original called gkp.get_follow_url() three times in a row
    # (three identical HTTP requests) just to inspect one result.  Call it
    # once and reuse the value.
    first = gkp.get_follow_url()
    if first is not None:  # a non-None result means the API request succeeded
        gkp.write_to_csv(has_title=True)
        if isinstance(first, tuple):
            # A tuple means the follow list spans 2+ pages (more than 10 users).
            follow_maxPage = first[1]  # maximum page count
            # Fetch every page's API response and append its users to the CSV.
            for page in range(1, follow_maxPage + 1):
                response = gkp.get_follow_url(follow_maxPage, page)[0]
                gkp.write_to_csv(response, has_title=False)
        else:
            # Single page: `first` is already the response for page 1.
            gkp.write_to_csv(first, has_title=False)
        print('耗费时间:', time.time() - start_time)
    else:
        print('获取关注列表失败!')
        print('耗费时间:', time.time() - start_time)
        exit()


if __name__ == '__main__':
    main()