Python + Selenium: Scraping Taobao Product Listings and Reviews [2021-08-26]

  • Overview
  • Logging in to Taobao
  • Fetching the product list
  • Fetching review data
  • Saving to the database
  • Caveats


Overview

This post uses Python 3.8 + Selenium to drive Chrome and scrape Taobao product listings and their reviews.
Two known limitations remain:
A human still has to scan a QR code to log in, which sidesteps the anti-scraping checks (to be automated later).
Scraping reviews is slow, because they can only be parsed after the page has fully loaded, and detail pages carry very different numbers of images, so load times vary; some pages take 1-2 minutes (possibly also because my office network throttles shopping sites).

Overall approach:
Log in to Taobao by scanning a QR code, bypassing the anti-scraping checks.
Search by keyword and collect the product listing data.
Visit each product detail page and collect its reviews.
Convert the reviews to a DataFrame and write them to the database (flushed once every 10 products), as sketched below.
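For orientation, here is a minimal sketch of how the pieces below chain together (my reconstruction; the keyword is a placeholder, and the browser setup appears at the end of the post):

if __name__ == '__main__':
	loginTB('your search keyword')   # scan the QR code, then the keyword is searched
	data_list = get_TB_data()        # walk the result pages, collecting item info
	get_comment(data_list)           # visit each detail page and persist the reviews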

Logging in to Taobao

There are two common ways to log in to Taobao with Selenium. One is to hard-code the account and password into the script and simulate the slider CAPTCHA to get past the anti-bot check; that felt unreliable to me, and when I first tried it my account got locked. So the approach I use now is to go through Alipay's login page and scan a QR code, which logs in to Taobao indirectly and skips the slider verification entirely. So far it has held up.
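For completeness, the snippets below assume roughly these imports (my reconstruction; the original post omits them):

import json
import random
import re
import time
import datetime as dt

import pandas as pd
from lxml import etree
from sqlalchemy import create_engine
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException, TimeoutException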

def loginTB(item):  # item is the keyword to search for on Taobao
	browser.get(
		'https://auth.alipay.com/login/index.htm?loginScene=7&goto=https%3A%2F%2Fauth.alipay.com%2Flogin%2Ftaobao_trust_login.htm%3Ftarget%3Dhttps%253A%252F%252Flogin.taobao.com%252Fmember%252Falipay_sign_dispatcher.jhtml%253Ftg%253Dhttps%25253A%25252F%25252Fwww.taobao.com%25252F&params=VFBMX3JlZGlyZWN0X3VybD1odHRwcyUzQSUyRiUyRnd3dy50YW9iYW8uY29tJTJG')

	# Explicit wait (up to 180 s) for the search box to appear --
	# this is the window in which you scan the QR code by hand
	wait = WebDriverWait(browser, 180)
	wait.until(EC.presence_of_element_located((By.ID, 'q')))

	# Find the search box, type the keyword, and click search
	text_input = browser.find_element_by_id('q')
	text_input.send_keys(item)
	btn = browser.find_element_by_xpath('//*[@id="J_TSearchForm"]/div[1]/button')
	btn.click()

Fetching the product list

Two functions: one turns the result pages, the other parses the listing data from each page; the pager calls the parser on every page.

def get_TB_data():
	page_index = 1
	data_list = []
	while page_index > 0:
		print("=================== Scraping page {} ===================".format(page_index))
		print("Current page URL: " + browser.current_url)
		# Parse the current results page
		data_list += get_item_list(browser.page_source)
		# Explicit wait for the pagination controls to appear
		wait = WebDriverWait(browser, 60)
		try:
			wait.until(EC.presence_of_element_located((By.XPATH, '//a[@class="J_Ajax num icon-tag"]')))
			time.sleep(1)
			try:
				# Scroll to the "next page" button via an action chain
				next_btn = browser.find_element_by_xpath('//li[@class="item next"]')
				ActionChains(browser).move_to_element(next_btn).perform()
			except NoSuchElementException:
				print("Scraping finished!")
				page_index = 0
				break
			time.sleep(2)
			ActionChains(browser).move_to_element(next_btn).click(next_btn).perform()
			page_index += 1
		except TimeoutException:
			# Pagination never showed up; stop rather than loop forever
			print("Pagination controls not found, stopping.")
			break
	return data_list

This returns a list containing one dict per product, which is later converted to a DataFrame.
Note that shop_info = {} must sit inside the loop; otherwise, because Python stores references, every element of the list would point at the same dict and end up holding the last product's values.
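A minimal illustration of that pitfall:

rows = []
shared = {}
for i in range(3):
	shared['n'] = i      # mutates the same dict on every iteration
	rows.append(shared)
print(rows)              # [{'n': 2}, {'n': 2}, {'n': 2}] -- all three entries alias one dict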

def get_item_list(data):
	xml = etree.HTML(data)
	product_names = xml.xpath('//img[@class="J_ItemPic img"]/@alt')
	prices = xml.xpath('//div[@class="price g_price g_price-highlight"]/strong/text()')
	shop_names = xml.xpath('//div[@class="shop"]/a/span[last()]/text()')
	detail_urls = xml.xpath('//div[@class="pic"]/a/@href')
	sales_volumes = xml.xpath('//div[@class="deal-cnt"]/text()')
	addresses = xml.xpath('//div[@class="location"]/text()')

	data_list = []
	for i in range(len(product_names)):
		shop_info = {}  # must be a fresh dict on every iteration (see note above)
		shop_info['item_name'] = product_names[i]
		shop_info['price'] = prices[i]
		shop_info['shop_name'] = shop_names[i]
		shop_info['sales_volume'] = sales_volumes[i]
		shop_info['address'] = addresses[i]
		shop_info['item_url'] = detail_urls[i]
		# Append each item to a JSON-lines file as a crash-safe backup
		with open('shop_data.json', 'a', encoding='utf-8') as f:
			f.write(json.dumps(shop_info, ensure_ascii=False) + '\n')
		data_list.append(shop_info)
		print('Scraping item %s' % (i + 1))
		print('Item name: %s' % product_names[i])
		print('Unit price: %s' % prices[i])
		print('Shop name: %s' % shop_names[i])
		print('Units sold: %s' % sales_volumes[i])
		print("-" * 30)
	return data_list
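The returned list of dicts can then be turned into a DataFrame and, if you like, stored with the data2mysql helper shown later (a sketch of mine; 'item_list' is a hypothetical table name, the original post only writes the item data to the JSON file):

item_df = pd.DataFrame(get_TB_data())
data2mysql(item_df, 'item_list')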

Fetching review data

Again two functions: one parses the review data, the other is the controller (it walks through the product detail pages one by one and turns the review pages).

def get_comment(data_list):
	comment_dic = {}

	def flush_comments():
		# Convert the accumulated {item_name: comments} mapping to a DataFrame and persist it
		comment_df = pd.DataFrame(columns=('user_name', 'comment', 'com_time', 'com_add', 'item_name', 'insert_time'))
		for item_name, comments in comment_dic.items():
			comment_tmp = pd.DataFrame(comments)
			comment_tmp['item_name'] = item_name
			comment_tmp['insert_time'] = dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
			comment_df = pd.concat([comment_df, comment_tmp])
		data2mysql(comment_df, 'comment_list')

	for i in range(len(data_list)):
		comment_list = []
		time.sleep(1)
		print('About to scrape the reviews of item %s' % (i + 1))
		z = 1
		while z == 1:
			try:
				# Listing URLs are sometimes protocol-relative
				if data_list[i]['item_url'][0] == '/':
					browser.get('https:' + data_list[i]['item_url'])
				else:
					browser.get(data_list[i]['item_url'])
				time.sleep(3)
				# Scroll a little (randomized) and open the reviews tab
				browser.execute_script('window.scrollTo(0,' + str(100 + random.random() * 30) + ')')
				browser.find_element_by_xpath('//div[@id="J_TabBarBox"]/ul/li[2]/a').click()
				comment_list = get_comment_info(browser.page_source)
				time.sleep(1)
				# Page through the reviews until there is no "next page" link
				while True:
					try:
						next_page = browser.find_element_by_xpath('//div[@class="rate-page"]/div[@class="rate-paginator"]//a[contains(text(),"下一页>>")]')
						browser.execute_script("arguments[0].click();", next_page)
						comment_list += get_comment_info(browser.page_source)
					except NoSuchElementException:
						z = 0
						break
			except Exception:
				# Detail page failed to load or has an unexpected layout; move on
				break
		comment_dic[data_list[i]['item_name']] = comment_list
		# Flush to the database once every 10 items so a crash loses little work
		if i > 0 and i % 10 == 0:
			flush_comments()
			comment_dic = {}
	# Flush whatever is left over after the loop ends
	if comment_dic:
		flush_comments()

Parsing the reviews: if a product has no reviews, it is skipped.
Watch out: whether a review has an appended follow-up (追评) changes the XPath, so both layouts must be handled.
Also note that emoji in review text break the database insert (MySQL's utf8 charset stores at most 3 bytes per character, while emoji need 4), so they are stripped via a gbk round-trip; switching the table to utf8mb4 would also work.
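The stripping works because gbk has no code points for emoji, so encoding with errors='ignore' simply drops them. A quick standalone check:

s = '很好用👍'
print(s.encode('gbk', 'ignore').decode('gbk'))  # -> 很好用 (the emoji is gone)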

def get_comment_info(text):
	source = etree.HTML(text)
	com_list = []  # initialized up front so the no-review case still returns a list
	user_name = re.findall('<div class="rate-user-info">(.*?)</div>', text)
	if len(user_name) > 0:
		info_list = source.xpath('//div[@class="rate-grid"]/table/tbody/tr')
		for i in range(len(info_list)):
			item = {}
			item['user_name'] = user_name[i].replace('<span>', '').replace('</span>', '')
			# Reviews with an appended follow-up use a different layout
			if info_list[i].xpath('./td[1]/div[@class="tm-rate-premiere"]'):
				item['comment'] = info_list[i].xpath('./td[1]/div[@class="tm-rate-premiere"]//div[@class="tm-rate-content"]/div[@class="tm-rate-fulltxt"]/text()')[0]
				item['com_time'] = info_list[i].xpath('./td[1]/div[@class="tm-rate-premiere"]/div[@class="tm-rate-tag"]//div[@class="tm-rate-date"]/text()')[0]
				item['com_add'] = info_list[i].xpath('./td[1]/div[@class="tm-rate-append"]//div[@class="tm-rate-content"]/div[@class="tm-rate-fulltxt"]/text()')[0]
			else:
				item['comment'] = info_list[i].xpath('./td[1]/div[@class="tm-rate-content"]/div[@class="tm-rate-fulltxt"]/text()')[0]
				item['com_time'] = info_list[i].xpath('./td[1]/div[@class="tm-rate-date"]/text()')[0]
				item['com_add'] = ''
			# Strip emoji via a gbk round-trip, then drop spaces
			item['comment'] = item['comment'].encode('gbk', 'ignore').decode('gbk')
			item['comment'] = item['comment'].replace(' ', '')
			print('Scraped a review')
			print('User: %s' % item['user_name'])
			print('Time: %s' % item['com_time'])
			print('Review: %s' % item['comment'])
			print('Follow-up: %s' % item['com_add'])
			print("-" * 30)
			com_list.append(item)
	else:
		print('This item has no reviews')
	return com_list

Saving to the database

def data2mysql(df, table_name):
	# Build a SQLAlchemy engine; to_sql expects a connectable, not a bare URI string
	engine = create_engine('mysql+pymysql://root:xxxxx@localhost:3306/selenium_taobao_pachong?charset=utf8')
	df = df.applymap(str)
	df.to_sql(name=table_name, con=engine, if_exists='append', index=False)
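To sanity-check an insert, you can read a few rows back (my addition; credentials and table name as above):

engine = create_engine('mysql+pymysql://root:xxxxx@localhost:3306/selenium_taobao_pachong?charset=utf8')
print(pd.read_sql('SELECT * FROM comment_list LIMIT 5', engine))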

Caveats

If the anti-scraping system locks you out, first try launching Chrome with its automation flags disabled (see below); if that is still not enough, you would have to patch the ChromeDriver binary itself, which is awkward on Windows. This is also why I log in via QR code and sprinkle sleep() calls everywhere to slow the script down.
To be fair, Taobao's anti-bot engineering is solid.

chrome_options = webdriver.ChromeOptions()
# Hide the "Chrome is being controlled by automated software" hints
chrome_options.add_experimental_option("excludeSwitches", ['enable-automation'])
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
browser = webdriver.Chrome(options=chrome_options)
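One further trick that often helps (my addition, not from the original post): use the Chrome DevTools Protocol to hide navigator.webdriver before any page script runs, so pages that probe it see undefined.

# Assumption: Selenium 3.141+ with a Chromium-based driver, which exposes execute_cdp_cmd
browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
	'source': "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
})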