from urllib import request
import urllib.request
import json
import re,uuid
from lxml import etree
class peihuaImg(object):
    """Download the slider images referenced by a locally saved page.

    The page markup is read from ``txt/nihao/maji.txt`` (decoded as GBK),
    every ``img/@src`` found under the top-slider ``tab-pannel`` elements is
    downloaded, and each image is written to ``txt/<uuid1>.<extension>``.

    Attributes:
        pn: Paging counter. It starts at 5, the scrape loop runs while it is
            ``<= 5`` and each pass adds 30, so one call to :meth:`request`
            on a fresh instance performs exactly one pass.
    """

    def __init__(self):
        super().__init__()
        self.pn = 5

    @staticmethod
    def _absolute_url(src):
        """Return *src* with an ``http:`` scheme added only when it has none.

        URLs that already start with ``http:`` or ``https:`` are returned
        unchanged; anything else (typically a protocol-relative ``//host/…``
        reference) gets ``http:`` prepended.
        """
        if src.lower().startswith(('http:', 'https:')):
            return src
        return "http:" + src

    @staticmethod
    def _file_extension(url):
        """Return the extension of the last path segment of *url*.

        The query string and fragment are ignored.  An empty string is
        returned when the last segment contains no dot.
        """
        path = url.split('?', 1)[0].split('#', 1)[0]
        last_segment = path.rsplit('/', 1)[-1]
        _, dot, extension = last_segment.rpartition('.')
        return extension if dot else ''

    def request(self):
        """Run the scrape loop and save every slider image under ``txt/``.

        Raises:
            urllib.error.URLError: if the search request or an image
                download fails.
            OSError: if the saved page cannot be read or an image file
                cannot be written.
        """
        # Loop-invariant request parameters.
        request_url = "https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; W…) Gecko/20100101 Firefox/59.0',
            'Referer': 'https://mm.taobao.com/search_t…126488.640745.2.1b545b81ziYw0u',
        }

        while self.pn <= 5:
            res = request.Request(request_url, headers=headers)
            # NOTE(review): the live response is fetched and decoded but not
            # parsed; parsing uses the locally saved copy below.  The call is
            # kept so the method's network behaviour is unchanged.
            with request.urlopen(res) as response:
                response.read().decode('gbk')

            with open('txt/nihao/maji.txt', 'r', encoding="gbk", errors='ignore') as f:
                page_source = f.read()

            tree = etree.HTML(page_source)
            sliders = tree.xpath('//div[@class="ladyIndex-top-slider J_LadyIndexTopSlider"]/div[@class="tab-content cleafix"]')
            for slider in sliders:
                for src in slider.xpath('div[@class="tab-pannel"]/img/@src'):
                    image_url = self._absolute_url(str(src))
                    # Close the HTTP response as soon as the bytes are read.
                    with request.urlopen(image_url) as image_response:
                        image_bytes = image_response.read()

                    extension = self._file_extension(image_url)
                    suffix = '.' + extension if extension else ''
                    # uuid1 gives each downloaded image a unique file name.
                    file_name = 'txt/' + str(uuid.uuid1()) + suffix
                    with open(file_name, 'wb') as out:
                        out.write(image_bytes)

            self.pn += 30
# with request.urlopen(res) as f:
# htm = f.read().decode('gbk')
# htm=re.sub(r"(,?)(\w+?)\s*?:",r"\1'\2':",htm)
# htm = htm.replace("'", '\"')
# # doubleQuotedJsonStr=addedSing
# html = json.loads(htm)
# print(html)
# self.maji(html['all_items'])
# res_q=urllib.request.urlopen(res).read().decode('gbk')
# charset = chardet.detect(res)
# print(charset)
# print(res_q)
# res_new=json.loads(res)
# ww=self.maji1(res_new['items'])
# print(ww)
# self.pn+=30
if __name__ == '__main__':
    # Script entry point: build the scraper and run its scrape loop once.
    scraper = peihuaImg()
    scraper.request()
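# Program description: scrape model photos and profile details from mm.taobao.com, and store the scraped data in a SQLite database.
# Images are saved using the scraped model's userId as the file name.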
# '''
# '''
# import requests
# import os
# import uuid
# import re
# import sqlite3
# from time import sleep
# from PIL import Image # 处理图片
#
# # 当前项目路径
# BASE_PATH = os.getcwd()
# DB = 'taobao_mm.db' # 数据保存的数据库名
# TABLE = 'mm_table' # 数据保存的表名
# # 缩略图尺寸:
# IMG_W = 600
# IMG_H = 600
#
#
# # 创建数据sqlite数据库和表
# def create_db_table():
# create_table_sql = '''
# create table IF NOT EXISTS %s(userid integer,
# realName varchar(50),
# city varchar(50),
# height varchar(10),
# weight varchar(10),
# totalFavorNum int(11),
# img varchar(225),
# content text
# )
# ''' % (TABLE)
# # 链接数据库
# db = sqlite3.connect(DB)
# # 创建游标
# cur = db.cursor()
# # 执行sql语句
# cur.execute(create_table_sql)
# # 提交结果
# db.commit()
# # 关闭数据库
# db.close()
#
#
# # 插入数据
# def insert_res(data):
# # 确定数据库已经创建
# create_db_table()
#
# sql = '''insert into %s(userid,realName,city,height,weight,totalFavorNum,img,content)
# values('%s','%s','%s','%s','%s','%s','%s','%s')
# ''' % (TABLE, data.get('userid'), data.get('realName'), data.get('city'), data.get('height'), data.get('weight'),
# data.get('totalFavorNum'), data.get('img'), data.get('content'))
# # 链接数据库
# db = sqlite3.connect(DB)
# # 创建游标
# cur = db.cursor()
# # 执行sql语句
# cur.execute(sql)
# # 提交事务
# db.commit()
# # 关闭数据库
# db.close()
# return True
#
#
# # 处理图片的函数,2个参数:1)图片地址 2)模特id
# def upload_img(url, userId):
# # 保存图片已采集模特的userId为文件名
# relative_addr = 'upload' + '/%d/' % (userId)
# # 判断绝对地址文件夹是否存在:os.path.join(BASE_PATH,relative_addr)
# absolut_addr = os.path.join(BASE_PATH, relative_addr)
# # 如果不存在。则创建文件夹
# if not os.path.exists(absolut_addr):
# os.makedirs(absolut_addr) # 创建文件夹
# # 取得当前图片的后缀
# postfix = url.split('.')[-1] # 文件名已.号分割。
# # 已当前userId为当前的图片名
# file_name = str(userId) + '.' + postfix
# # 文件相对地址:
# file_url = relative_addr + file_name
# # 请求url图片地址。
# img_source = requests.get('http:' + url)
# # 将响应的结果保存到文件中file_url中:
# with open(file_url, 'wb') as f:
# f.write(img_source.content)
# # 生成缩略图
# img = Image.open(file_url)
# img.thumbnail((IMG_W, IMG_H))
# img.save(file_url, img.format)
# # 返回相对地址:
# return file_url
#
#
# # 爬取用户详细信息
# def get_content(userId):
# headers = {
# 'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Mobile Safari/537.36',
# 'referer': 'https://mm.taobao.com/search_tstar_model.htm'}
# url = 'https://mm.taobao.com/self/aiShow.htm?userId=' + str(userId)
# img_source = requests.get(url=url, headers=headers)
# html = img_source.text
# # 正则匹配文章内容
# content = re.search('<div.*?id="J_ScaleImg">(.*?)</div>', html, re.S).group(1)
# # 正则匹配文章中的所有图片
# img_list = re.findall('src="(.*?)"', content, re.S)
# # 确定文件的保存路径
# relative_addr = 'upload' + '/%d/' % (userId)
# print(relative_addr)
# # 判断绝对地址文件夹是否存在:os.path.join(BASE_PATH,upload_addr)
# absolut_addr = os.path.join(BASE_PATH, relative_addr)
# if not os.path.exists(absolut_addr):
# os.makedirs(absolut_addr) # 创建文件夹
# # 循环处理图片
# temp_num = 0
# for url_img in img_list:
# # 取得原文件后缀
# postfix = url_img.split('.')[-1] # 文件名已.号分割。
# # 新文件文件名,格式为 : userID_数字.后缀
# file_name = '%s_%s.%s' % (str(userId), str(temp_num), postfix)
# # temp_num累加1
# temp_num = temp_num + 1
# # 文件相对地址:
# file_url = relative_addr + file_name
# # 请求url图片地址。
# img_source = requests.get('http:' + url_img)
# # 将响应的结果保存到文件中:
# with open(file_url, 'wb') as f:
# f.write(img_source.content)
# # 生成缩略图
# try:
# img = Image.open(file_url)
# img.thumbnail((IMG_W, IMG_H))
# img.save(file_url, img.format)
# except Exception as e:
# print(file_name, '')
# else:
# continue
# # 替换采集内容中的图片地址
# content = content.replace(url_img, '/' + file_url)
# return content
#
#
# # 请求列表分页数据,url:采集地址,page:页数
# def main(url, page):
# headers = {
# 'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Mobile Safari/537.36',
# 'referer': 'https://mm.taobao.com/search_tstar_model.htm?spm=5679.126488.640745.2.1b545b81UeJRlv'}
# print('开始:')
# for i in range(1, page + 1):
# data = {'viewFlag': 'A',
# 'sortType': 'default',
# 'currentPage': i,
# 'pageSize': 100}
# r = requests.post(url=url, data=data, headers=headers)
# # 判断请求状态
# # 根据请求状态进行处理,请求次数过多出错,就需要使用代理
# print('第一页状态:', r.status_code, end='')
# if r.status_code == 200:
# res = r.json() # res 保存请求响应的结果
# res_list = res['data']['searchDOList'] # 保存响应的数据列表
# # 循环列表,将数据插入到数据库中
# for item in res_list:
# # 判断图片文件夹是否存在:
# relative_addr = 'upload' + '/%d/' % (item['userId']) # 相对地址
# # 判断绝对地址文件夹是否存在:os.path.join(BASE_PATH,upload_addr)
# absolut_addr = os.path.join(BASE_PATH, relative_addr)
# if os.path.exists(absolut_addr):
# # 跳出本次循环,进行下次循环
# continue
# insert_data = {'realName': item['realName'],
# 'userid': item['userId'],
# 'city': item['city'],
# 'height': item['height'],
# 'weight': item['weight'],
# 'totalFavorNum': item['totalFavorNum'],
# 'img': upload_img(item['avatarUrl'], item['userId']),
# 'content': get_content(item['userId']) # 详细内容图片过多。只做参考
# }
# # 将数据插入到数据库中
# print('\r\n', item['realName'], '-', item['userId'], ':已采集', '\r\n', end='')
# insert_res(insert_data)
# break; # 注意 ,需要采集所有需要将这句删除。
# sleep(1) # 每次插入操作间隔1秒,方式sqlite被锁
# else:
# print('请求是失败,页数:', i)
# print('结束!', )
#
#
# # print(res_list)
# # print(res)
# if __name__ == '__main__':
# url = 'https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8'
# main(url, 1) # 第一个参数为采集地址。参数2:采集页数