目的
获取1688工厂名片的相关信息
详细需求
一、进入1688网站
https://www.1688.com/
二、使用“工厂”这个搜索框
三、输入工厂名称进行搜索,如“深圳市杰之美时装有限公司”
四、返回搜索结果,并逐个获取店铺/工厂的链接
五、获取有关数据
思路解析
一、搜索关键词,获取返回网页中的工厂ID
二、链接拼接-进入工厂名片详情页
三、目标信息定位
四、模拟构建请求
这里的难点就是sign值的获取
五、思路汇总
1.请求工厂关键词-解析得到工厂ID
2.需要进行三个json请求
2.1 json1获取工厂基本信息
2.2 json7获取粉丝数
2.3 json2获取生产实力
3.获取cookie(这里是直接从浏览器复制后使用的,具有时效性,过期后需要重新复制)
4.对每个json请求的data进行处理
5.拼接完成后,交给本地js进行处理得到sign值
6.构建请求
7.信息提取与打印
源码
1688.js
function u(e) {
function t(e, t) {
return e << t | e >>> 32 - t
}
function n(e, t) {
var n, o, r, i, s;
return r = 2147483648 & e,
i = 2147483648 & t,
n = 1073741824 & e,
o = 1073741824 & t,
s = (1073741823 & e) + (1073741823 & t),
n & o ? 2147483648 ^ s ^ r ^ i : n | o ? 1073741824 & s ? 3221225472 ^ s ^ r ^ i : 1073741824 ^ s ^ r ^ i : s ^ r ^ i
}
function o(e, t, n) {
return e & t | ~e & n
}
function r(e, t, n) {
return e & n | t & ~n
}
function i(e, t, n) {
return e ^ t ^ n
}
function s(e, t, n) {
return t ^ (e | ~n)
}
function a(e, r, i, s, a, p, u) {
return e = n(e, n(n(o(r, i, s), a), u)),
n(t(e, p), r)
}
function p(e, o, i, s, a, p, u) {
return e = n(e, n(n(r(o, i, s), a), u)),
n(t(e, p), o)
}
function u(e, o, r, s, a, p, u) {
return e = n(e, n(n(i(o, r, s), a), u)),
n(t(e, p), o)
}
function c(e, o, r, i, a, p, u) {
return e = n(e, n(n(s(o, r, i), a), u)),
n(t(e, p), o)
}
function d(e) {
for (var t, n = e.length, o = n + 8, r = (o - o % 64) / 64, i = 16 * (r + 1), s = new Array(i - 1), a = 0, p = 0; n > p;)
t = (p - p % 4) / 4,
a = p % 4 * 8,
s[t] = s[t] | e.charCodeAt(p) << a,
p++;
return t = (p - p % 4) / 4,
a = p % 4 * 8,
s[t] = s[t] | 128 << a,
s[i - 2] = n << 3,
s[i - 1] = n >>> 29,
s
}
function l(e) {
var t, n, o = "", r = "";
for (n = 0; 3 >= n; n++)
t = e >>> 8 * n & 255,
r = "0" + t.toString(16),
o += r.substr(r.length - 2, 2);
return o
}
function f(e) {
e = e.replace(/\r\n/g, "\n");
for (var t = "", n = 0; n < e.length; n++) {
var o = e.charCodeAt(n);
128 > o ? t += String.fromCharCode(o) : o > 127 && 2048 > o ? (t += String.fromCharCode(o >> 6 | 192),
t += String.fromCharCode(63 & o | 128)) : (t += String.fromCharCode(o >> 12 | 224),
t += String.fromCharCode(o >> 6 & 63 | 128),
t += String.fromCharCode(63 & o | 128))
}
return t
}
var m, h, g, _, y, v, R, S, w, O = [], E = 7, A = 12, q = 17, b = 22, T = 5, x = 9, N = 14, C = 20, k = 4, J = 11,
P = 16, L = 23, I = 6, D = 10, j = 15, W = 21;
for (e = f(e),
O = d(e),
v = 1732584193,
R = 4023233417,
S = 2562383102,
w = 271733878,
m = 0; m < O.length; m += 16)
h = v,
g = R,
_ = S,
y = w,
v = a(v, R, S, w, O[m + 0], E, 3614090360),
w = a(w, v, R, S, O[m + 1], A, 3905402710),
S = a(S, w, v, R, O[m + 2], q, 606105819),
R = a(R, S, w, v, O[m + 3], b, 3250441966),
v = a(v, R, S, w, O[m + 4], E, 4118548399),
w = a(w, v, R, S, O[m + 5], A, 1200080426),
S = a(S, w, v, R, O[m + 6], q, 2821735955),
R = a(R, S, w, v, O[m + 7], b, 4249261313),
v = a(v, R, S, w, O[m + 8], E, 1770035416),
w = a(w, v, R, S, O[m + 9], A, 2336552879),
S = a(S, w, v, R, O[m + 10], q, 4294925233),
R = a(R, S, w, v, O[m + 11], b, 2304563134),
v = a(v, R, S, w, O[m + 12], E, 1804603682),
w = a(w, v, R, S, O[m + 13], A, 4254626195),
S = a(S, w, v, R, O[m + 14], q, 2792965006),
R = a(R, S, w, v, O[m + 15], b, 1236535329),
v = p(v, R, S, w, O[m + 1], T, 4129170786),
w = p(w, v, R, S, O[m + 6], x, 3225465664),
S = p(S, w, v, R, O[m + 11], N, 643717713),
R = p(R, S, w, v, O[m + 0], C, 3921069994),
v = p(v, R, S, w, O[m + 5], T, 3593408605),
w = p(w, v, R, S, O[m + 10], x, 38016083),
S = p(S, w, v, R, O[m + 15], N, 3634488961),
R = p(R, S, w, v, O[m + 4], C, 3889429448),
v = p(v, R, S, w, O[m + 9], T, 568446438),
w = p(w, v, R, S, O[m + 14], x, 3275163606),
S = p(S, w, v, R, O[m + 3], N, 4107603335),
R = p(R, S, w, v, O[m + 8], C, 1163531501),
v = p(v, R, S, w, O[m + 13], T, 2850285829),
w = p(w, v, R, S, O[m + 2], x, 4243563512),
S = p(S, w, v, R, O[m + 7], N, 1735328473),
R = p(R, S, w, v, O[m + 12], C, 2368359562),
v = u(v, R, S, w, O[m + 5], k, 4294588738),
w = u(w, v, R, S, O[m + 8], J, 2272392833),
S = u(S, w, v, R, O[m + 11], P, 1839030562),
R = u(R, S, w, v, O[m + 14], L, 4259657740),
v = u(v, R, S, w, O[m + 1], k, 2763975236),
w = u(w, v, R, S, O[m + 4], J, 1272893353),
S = u(S, w, v, R, O[m + 7], P, 4139469664),
R = u(R, S, w, v, O[m + 10], L, 3200236656),
v = u(v, R, S, w, O[m + 13], k, 681279174),
w = u(w, v, R, S, O[m + 0], J, 3936430074),
S = u(S, w, v, R, O[m + 3], P, 3572445317),
R = u(R, S, w, v, O[m + 6], L, 76029189),
v = u(v, R, S, w, O[m + 9], k, 3654602809),
w = u(w, v, R, S, O[m + 12], J, 3873151461),
S = u(S, w, v, R, O[m + 15], P, 530742520),
R = u(R, S, w, v, O[m + 2], L, 3299628645),
v = c(v, R, S, w, O[m + 0], I, 4096336452),
w = c(w, v, R, S, O[m + 7], D, 1126891415),
S = c(S, w, v, R, O[m + 14], j, 2878612391),
R = c(R, S, w, v, O[m + 5], W, 4237533241),
v = c(v, R, S, w, O[m + 12], I, 1700485571),
w = c(w, v, R, S, O[m + 3], D, 2399980690),
S = c(S, w, v, R, O[m + 10], j, 4293915773),
R = c(R, S, w, v, O[m + 1], W, 2240044497),
v = c(v, R, S, w, O[m + 8], I, 1873313359),
w = c(w, v, R, S, O[m + 15], D, 4264355552),
S = c(S, w, v, R, O[m + 6], j, 2734768916),
R = c(R, S, w, v, O[m + 13], W, 1309151649),
v = c(v, R, S, w, O[m + 4], I, 4149444226),
w = c(w, v, R, S, O[m + 11], D, 3174756917),
S = c(S, w, v, R, O[m + 2], j, 718787259),
R = c(R, S, w, v, O[m + 9], W, 3951481745),
v = n(v, h),
R = n(R, g),
S = n(S, _),
w = n(w, y);
var H = l(v) + l(R) + l(S) + l(w);
return H.toLowerCase()
}
// console.log(a)
// console.log(p)
// var e='5970c860dcff67864c7b1912bd984ee9&1602670696727&12574478&{"cid":"TpFacCoreInfosService:TpFacCoreInfosService","methodName":"execute","params":"{\"facAliId\":\"2019106328\"}"}'
// var e='87ce3da8b6f5218713fc35e8b9d9d7de&1602738209823&12574478&{"cid":"TpFacCoreInfosService:TpFacCoreInfosService","methodName":"execute","params":"{\\"facAliId\\":\\"2019106328\\"}"}'
// var a = (new Date).getTime()
// // var a='1602810824704'
// var s='12574478'
// var token='8a0539ff99e9241f36538eee5f490e48'
// var data='{"cid":"TpFacCoreInfosService:TpFacCoreInfosService","methodName":"execute","params":"{\\"facAliId\\":\\"2019106328\\"}"}'
//
// p = (token + "&" + a + "&" + s + "&" + data)
// console.log(a)
// console.log(u(p))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author : jia666
# @Time : 2020/10/15 17:07
import json
import re
import time
from urllib import parse

import execjs
import requests
class S1688(object):
    """Look up one factory on 1688.com and print its factory-card details.

    Workflow (see ``Main``): search the keyword to obtain the factory id,
    issue three signed JSONP requests (basic info, production strength,
    follower count) and print the fields extracted from the responses.

    The request signature is produced by calling ``u`` in the local
    JavaScript file ``JS_FILE`` through execjs.
    """

    # Cookie copied from a browser session. It expires, so pass a fresh one
    # to the constructor when the server reports an expired token.
    DEFAULT_COOKIE = 'cookie2=15b9e05276e7c4893f44f1509e7846f6; t=2189917eb616063e3da62aa4f41673d9; _tb_token_=7b657778a3376; __cn_logon__=false; cna=dFkQGGsGrnACARsm+oOaXAW/; xlly_s=1; h_keys="%u6df1%u5733%u5e02%u6770%u4e4b%u7f8e%u65f6%u88c5%u6709%u9650%u516c%u53f8"; _csrf_token=1603072773208; alicnweb=touch_tb_at%3D1603088557128; _m_h5_tk=bd327dc391fc8b112121750e88805f27_1603099001009; _m_h5_tk_enc=7095c3c810519260432599239fa2c840; ad_prefer="2020/10/19 14:23:12"; isg=BCcnARTANLN3qbA-Gyksu_xDtlvxrPuOwp-EYfmU17bd6EeqAX-p3mWpCuj2ANMG; l=eBEQxAnqOmPZIkKZBO5CFurza779uIRb4sPzaNbMiInca10FaF6eCNQVOwXJudtjgtCUIetybUMLyRLHR3fRwxDDB5JEV7vS3xvO.; tfstk=cJ6cBRAuSsRjwbAlO-9fv3snaPbcaGoykdJPUT8LyYvDAYBJ0s4xaXPjD0YAoOh1.'
    USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"
    APP_KEY = '12574478'  # fixed parameter that is part of the signed string
    JS_FILE = '1688.js'   # file that defines the signing function ``u``
    TIMEOUT = 10          # seconds; keeps a stalled request from hanging forever

    def __init__(self, word, cookie=None):
        """Store the search keyword and build the request headers.

        :param word: factory keyword to search for.
        :param cookie: optional cookie header value; defaults to
            ``DEFAULT_COOKIE``.
        """
        self.head = {
            'cookie': self.DEFAULT_COOKIE if cookie is None else cookie,
            "user-agent": self.USER_AGENT,
        }
        self.word = word  # factory keyword
        self.api = 'https://h5api.m.1688.com/h5/mtop.taobao.widgetservice.getjsoncomponent/1.0/?'
        self._js_ctx = None  # compiled signing script, created on first use

    def Main(self):
        """Run the whole lookup and print the result."""
        self.Get_word()         # 1. keyword search -> factory id
        self.Get_mtopjsonp1()   # 2. basic factory info (name, address, ...)
        self.Get_mtopjsonp2()   # 3. production strength
        self.Get_mtopjsonp7()   # 4. follower count
        self.RE_html()          # 5. regex extraction and printing

    def Get_html(self, url):
        """Request ``url`` and return the response text with all whitespace removed."""
        req = requests.get(url, headers=self.head, timeout=self.TIMEOUT)
        return re.sub(r'\s', '', req.text)

    def Init_js(self, data):
        """Compute the request signature for ``data``.

        Sets ``self.a`` (millisecond timestamp string) and ``self.sign``
        (result of the JavaScript function ``u``).

        :raises IndexError: if the cookie carries no ``_m_h5_tk`` token.
        """
        tokens = re.findall('_m_h5_tk=(.*?)_', self.head['cookie'], re.S)
        if not tokens:
            raise IndexError('no _m_h5_tk token found in the cookie')
        self.a = str(int(time.time() * 1000))
        p = '&'.join((tokens[0], self.a, self.APP_KEY, data))
        # Read and compile the script once instead of on every signature.
        if getattr(self, '_js_ctx', None) is None:
            with open(self.JS_FILE, 'r', encoding='utf-8') as f:
                self._js_ctx = execjs.compile(f.read())
        self.sign = self._js_ctx.call('u', p)

    def Get_word(self):
        """Search for the keyword and store the first factory id in ``self.factory_id``.

        :raises IndexError: if the search page contains no ``realUserId``.
        """
        # Percent-encode the keyword so characters such as '&', '#' or spaces
        # cannot break the query string.
        url = 'https://s.1688.com/company/company_search.htm?keywords={}&charset=utf8'.format(
            parse.quote(self.word, safe=''))
        req = self.Get_html(url)
        self.factory_id = self._first_match('"realUserId":"(.*?)"', req, 'realUserId')

    def Get_mtopjsonp1(self):
        """Fetch the basic factory info response into ``self.html1``."""
        data = '{"cid":"TpFacCoreInfosService:TpFacCoreInfosService","methodName":"execute","params":"{\\"facAliId\\":\\"' + str(
            self.factory_id) + '\\"}"}'
        url = self.Get_url(data, 1)
        self.html1 = self.Get_html(url)
        self.Check_html(self.html1)

    def Check_html(self, html):
        """Report an expired token.

        :returns: ``False`` (after printing a hint) when the response says the
            token has expired, ``True`` otherwise.
        """
        if '令牌过期' in html:
            print('令牌过期,更新cookie后重试')
            return False
        return True

    def Get_url(self, data, n):
        """Build the signed request URL for one JSONP call.

        :param data: request payload, either an already serialised string or
            a dict. A dict is serialised as compact JSON, so the signed text
            and the transmitted ``data`` parameter are both valid JSON.
        :param n: number appended to the ``mtopjsonp`` callback name.
        :returns: the complete URL.
        """
        if isinstance(data, str):
            payload = data
        else:
            payload = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
        self.Init_js(payload)
        parms = 'jsv=2.6.0&appKey=12574478&t={}&sign={}&api=mtop.taobao.widgetService.getJsonComponent&v=1.0&type=jsonp&timeout=5000&dataType=jsonp&callback=mtopjsonp{}&'.format(
            self.a, self.sign, n)
        return self.api + parms + 'data=' + parse.quote(payload)

    def Get_mtopjsonp2(self):
        """Fetch the production strength response into ``self.html2``."""
        data = {"cid": "FactoryStrengthServiceWidget:FactoryStrengthServiceWidget", "methodName": "execute"}
        data['params'] = "{\"extParam\":{\"factoryUserId\":\"%s\"}}" % (self.factory_id)
        url = self.Get_url(data, 2)
        self.html2 = self.Get_html(url)
        self.Check_html(self.html2)

    def Get_mtopjsonp7(self):
        """Fetch the follower count response into ``self.html7``."""
        data = {'cid': 'ShopFavouriteServiceWidget:ShopFavouriteServiceWidget', 'methodName': 'execute'}
        data['params'] = '{"extParams":{"method":"readFavourite","targetUserId":"%s"}}' % (self.factory_id)
        url = self.Get_url(data, 7)
        self.html7 = self.Get_html(url)
        self.Check_html(self.html7)

    @staticmethod
    def _first_match(pattern, html, field):
        """Return the first capture of ``pattern`` in ``html``.

        :raises IndexError: naming ``field`` when nothing matches, instead of
            a bare "list index out of range".
        """
        found = re.findall(pattern, html, re.S)
        if not found:
            raise IndexError('field "{}" not found in the response'.format(field))
        return found[0]

    def RE_html(self):
        """Extract the fields from the three responses and print them.

        Description/value pairs and production-strength values are printed
        for as many entries as the responses actually contain.

        :raises IndexError: if a mandatory single-value field is missing.
        """
        facName = self._first_match('"facName":"(.*?)"', self.html1, 'facName')  # factory name
        data = re.findall('"data":"(.*?)"', self.html1, re.S)  # values
        comment = re.findall('"desc":"(.*?)"', self.html1, re.S)  # value descriptions
        factoryPv = self._first_match('"factoryPv":"(.*?)"', self.html1, 'factoryPv')  # page views
        address = self._first_match('"factoryDetailedAddress":"(.*?)"', self.html1,
                                    'factoryDetailedAddress')  # address
        favCount = self._first_match('"favCount":"(.*?)"', self.html7, 'favCount')  # followers
        k = re.findall('"value":"(.*?)"', self.html2, re.S)  # production strength

        # Every entry after the first gets a '%' suffix, as in the original output.
        pairs = []
        for i, (com, value) in enumerate(zip(comment, data)):
            pairs.append(str(com) + ':' + str(value) + ('\t' if i == 0 else '%\t'))
        s = ''.join(pairs) + '粉丝数:' + favCount + '\t' + '浏览数:' + factoryPv + '\t'

        # (label, unit) in the order the values appear in the response.
        strength_fields = (
            ('厂房面积', '平方'),
            ('生产人数', '人'),
            ('设备总数', '台'),
            ('仓储类型', ''),
            ('加工方式', ''),
            ('代工模式', ''),
            ('质检类型', ''),
            ('售后服务', ''),
        )
        h = ''.join(label + value + unit + '\t' for (label, unit), value in zip(strength_fields, k))
        print(facName + '\n' + '*' * 50 + '\n' + s + '\n' + address + '\n' + '-' * 50 + '\n' + h)
if __name__ == '__main__':
    # Demo run: search one factory by its full company name and print its card.
    S1688('深圳市杰之美时装有限公司').Main()
实现效果
注意:cookie需要手动更新