python和nodejs交互 nodejs python交互

转载

mob64ca141a683a 2023-10-12 00:24:14

文章标签 python和nodejs交互 python nodejs开发web 数据 html python脚本 文章分类 Python 后端开发

最近研究了一下网站爬虫，觉得python和nodejs都有优点，所以我决定实现一个爬虫，用python来抓取网页的源代码，用nodejs的cheerio模块来获取源代码内的数据。正好我有明年换工作的打算，于是我选择爬智联招聘的网站。

代码地址：https://github.com/duan602728596/ZhiLianUrllib

1.用python进行一个http请求

# coding: utf-8
# http.py
import sys
import types
import urllib
import urllib2

# Collect the command-line arguments passed by the caller.
#
# argv[0] {string}: script name
# argv[1] {string}: request method, "get" or "post"
# argv[2] {string}: request URL
# argv[3] {string}: request data (optional)
#
# Missing trailing arguments default to '' instead of raising IndexError,
# so the script can be started without a data argument.
_cli_args = (sys.argv[1:4] + ['', '', ''])[:3]

argv = {
    'filename': sys.argv[0] if sys.argv else '',
    'method': _cli_args[0],
    'url': _cli_args[1],
    'data': _cli_args[2],
}

class Http:
    """Minimal GET/POST client built on urllib2 (Python 2).

    Usage: construct with a method, URL and optional data, call ``init()``,
    then read the response body from ``self.result``.
    """

    def __init__(self, method, url, data=''):
        """Store the request description.

        method -- 'get' or 'post'; any other value makes init() yield ''.
        url    -- request URL.
        data   -- request data: an already-encoded string or a dict.
        """
        self.method = method  # request type
        self.url = url  # request URL
        self.data = self.getData(data)  # request data as an encoded string
        # Headers sent with every request (GET and POST alike).
        self.header = {
            'Accept-Encoding': 'deflate',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
            'cache-control': 'no-cache',
        }

    def getData(self, data):
        """Normalize request data to a string.

        Strings are returned unchanged, dicts are URL-encoded, and anything
        else becomes ''.
        """
        if isinstance(data, str):
            return data
        if isinstance(data, dict):
            return urllib.urlencode(data)
        return ''

    def _read(self, request):
        """Open the request and return the whole body, always closing the response."""
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            response.close()

    def get(self):
        """Send a GET request, appending self.data as the query string, and return the body."""
        if self.data == '':
            u = self.url
        else:
            # The URL may already carry a query string; extend it instead of
            # adding a second '?'.
            separator = '&' if '?' in self.url else '?'
            u = self.url + separator + self.data
        # Pass data=None so urllib2 issues a GET, and send the same headers as post().
        request = urllib2.Request(u, None, self.header)
        return self._read(request)

    def post(self):
        """Send a POST request with self.data as the body and return the response body."""
        request = urllib2.Request(self.url, self.data, self.header)
        return self._read(request)

    def init(self):
        """Run the request selected by self.method and store the body in self.result."""
        if self.method == 'get':
            self.result = self.get()
        elif self.method == 'post':
            self.result = self.post()
        else:
            # Unknown method: no request is made.
            self.result = ''
# Build the client from the parsed command-line arguments and run the request.
http = Http(method=argv['method'], url=argv['url'], data=argv['data'])
http.init()
# Print the response body to stdout.
text = http.result
print(text)

在该脚本中，使用sys库获取命令行传递的各种参数，使用types库进行数据类型的判断，使用urllib库和urllib2库进行网页内容的抓取。传递的参数有请求的方式、请求的url地址、请求的数据。初始化后，根据传递的请求方式决定执行get请求还是post请求，执行请求后将结果输出出来，传回nodejs程序中。

2.nodejs和python实现通信

/**
* pyhttp.js
*
* 与python脚本通信，进行一个请求
* @param info{object}：与python脚本通信的配置
* @param callback{function}：通信完成后执行的事件，传递参数为返回的数据
*/
const childProcess = require('child_process');
/**
 * Run the Python request script as a child process and hand its stdout to `callback`.
 *
 * @param {object} info - Child-process configuration.
 * @param {string} info.file - Script path, passed as the first argument.
 * @param {string} info.method - Request method, passed as the second argument.
 * @param {string} info.url - Request URL, passed as the third argument.
 * @param {string} info.data - Request data, passed as the fourth argument.
 * @param {string} [info.command='python'] - Interpreter executable to spawn.
 * @param {function} [callback] - Called with the complete stdout text on success.
 * @returns {Promise} Resolves with the callback's result (or the text when no
 *   callback is given). Failures are logged and the promise resolves with `undefined`.
 */
function pyhttp(info, callback){
    return new Promise((resolve, reject)=>{
        const cps = childProcess.spawn(info.command || 'python', [
            info.file,
            info.method,
            info.url,
            info.data
        ]);
        // Keep raw Buffers and decode once at the end, so a multi-byte UTF-8
        // character split across two chunks is not corrupted.
        const stdoutChunks = [];
        const stderrChunks = [];
        // Emitted when the process cannot be spawned (e.g. interpreter not found);
        // without a listener this would be an uncaught exception.
        cps.on('error', reject);
        cps.stdout.on('data', (chunk)=>{
            stdoutChunks.push(chunk);
        });
        cps.stderr.on('data', (chunk)=>{
            stderrChunks.push(chunk);
        });
        // 'close' fires after the stdio streams have ended; 'exit' may fire while
        // output is still buffered, which would truncate the result.
        cps.on('close', (code, signal)=>{
            if(code === 0){
                resolve(Buffer.concat(stdoutChunks).toString('utf8'));
                return;
            }
            const reason = signal ? `signal ${signal}` : `code ${code}`;
            const detail = Buffer.concat(stderrChunks).toString('utf8').trim();
            reject(new Error(`python script exited with ${reason}${detail ? `: ${detail}` : ''}`));
        });
    }).then(callback).catch((error)=>{
        console.log(error);
    });
}
module.exports = pyhttp;

在nodejs脚本中执行其他脚本并返回执行结果，使用child_process模块，语法为 **child_process.spawn(command, [args], [options])**，command是命令，args是参数。在这里我遇到了一个小小的坑：我之前用的是child_process.exec(command, [options], callback)，但是exec会把子进程的输出全部缓存起来，缓存大小有上限（由options.maxBuffer控制），而网站的源代码比较大，超出上限后就会报错。改用 **child_process.spawn(command, [args], [options])**，或者调大maxBuffer，都可以解决。调用pyhttp.js需要传递两个参数，第一个参数是运行python脚本的命令配置，第二个参数是回调函数，用于接收脚本的运行结果。

3.对源代码进行处理

/**
* deal.js
*
* 处理数据
* @param dealText{string}：获取到的页面源代码
* @param ishref{boolean}：是否获取下一页的地址，默认为false，不获取
*/
const cheerio = require('cheerio');
/**
 * Return the text after the full-width colon ("：").
 *
 * The leading label and the colon are removed; text without a colon is
 * returned unchanged. A null/undefined input (cheerio's `.html()` returns null
 * for an empty selection) yields '' instead of throwing.
 *
 * @param {?string} text - Raw text such as "label：value".
 * @returns {string} The text with the label prefix removed.
 */
const mhtext = text => (text == null ? '' : text.replace(/.+：/, ''));
/**
 * Extract the job postings (and optionally the pagination URL) from a loaded page.
 *
 * @param {function} $ - cheerio instance loaded with the page source.
 * @param {boolean} [ishref=false] - Whether to also extract the next-page URL.
 * @returns {{list: object[], href: (string|null|undefined)}} `list` holds one
 *   record per posting. `href` is only set when `ishref` is truthy: the
 *   pagination URL without its `&p=<page>` parameter, or null when the
 *   pagination link is not present.
 */
function each($, ishref = false){
    const a = [];
    // Each posting is its own <table> directly under the list container.
    const $table = $('#newlist_list_content_table').children('table');
    for(let i = 0, j = $table.length; i < j; i++){
        const $tr = $table.eq(i).children('tr');
        // A table containing <th> cells is the column header, not a posting.
        if($tr.children('th').length > 0){
            continue;
        }
        const $tr0 = $tr.eq(0);
        const $tr1 = $tr.eq(1);
        const $span = $tr1.children('td').children('div').children('div').children('ul').children('li').children('span');
        const $jobLink = $tr0.children('.zwmc').children('div').children('a');
        const $companyLink = $tr0.children('.gsmc').children('a');
        a.push({
            // job title
            'zwzp': $jobLink.html(),
            // job posting URL
            'zpdz': $jobLink.prop('href'),
            // feedback rate
            'fklv': $tr0.children('.fk_lv').children('span').html(),
            // company name
            'gsmc': $companyLink.html(),
            // work location
            'gzdd': $tr0.children('.gzdd').html(),
            // company page URL
            'zldz': $companyLink.prop('href'),
            // company type
            'gsxz': mhtext($span.eq(1).html()),
            // company size
            'gsgm': mhtext($span.eq(2).html())
        });
    }
    const r = {};
    r['list'] = a;
    if(ishref){
        const nextHref = $('.pagesDown').children('ul').children('li').children('a').eq(2).prop('href');
        // Strip the whole page number (\d+), not just its first digit, and
        // tolerate pages that have no pagination link.
        r['href'] = typeof nextHref === 'string' ? nextHref.replace(/&p=\d+/, '') : null;
    }
    return r;
}
/**
 * Parse page source with cheerio and extract the job data.
 *
 * @param {string} dealText - Page source to parse.
 * @param {boolean} [ishref=false] - Whether to also extract the next-page URL.
 * @returns {object} The result of `each` for the parsed document.
 */
function deal(dealText, ishref = false){
    // Entity decoding is switched off via the cheerio load option.
    const loadOptions = { decodeEntities: false };
    const $ = cheerio.load(dealText, loadOptions);
    return each($, ishref);
}
module.exports = deal;

deal.js用cheerio模块来对抓取到的源代码进行处理。传递参数dealText为源代码，ishref 为是否抓取分页的地址。

注意，在用cheerio模块来获取数据时有一个问题，

const cheerio = require('cheerio');
// NOTE(review): the markup inside this template literal was stripped when the
// article was extracted; it is reconstructed from the selectors used below
// (#demo > ul > li) — confirm against the original post.
const html = `
<div id="demo">
    <ul>
        <li>1</li>
        <li>2</li>
        <li>3</li>
    </ul>
</div>
`;
const $ = cheerio.load(html);
/* Select the <li> elements */
$('#demo').children('li'); // matches nothing: the <li> elements are not direct children of #demo
$('#demo').children('ul').children('li'); // matches the three <li> elements
cheerio的语法和jquery基本一致：.children()只匹配直接子节点（在jquery中同样如此），所以上面必须通过子节点一级一级向下查找，不能跳级。如果需要跨层级查找后代节点，应该使用.find()，例如$('#demo').find('li')。
数据处理：公司性质和公司规模删除掉了：和：前面的文字，下一页的url地址删除掉&p=\d参数，该参数是分页参数。
4.主程序：发起请求并汇总数据
/* app.js */
const fs = require('fs');
const pyhttp = require('./pyhttp');
const deal = require('./deal');
// Aggregated scrape output; its `list` property is filled in once the result pages have been parsed.
const _result = {};
/**
* 请求地址和参数
*
* jl：地点
* kw：职位关键字
* sf：工资范围下限
* st：工资范围上限
* el：学历
* et：职位类型
* pd：发布时间
* p: 分页page
* ct：公司性质
* sb：相关度
* we: 工作经验
*
*/
/**
 * Build the configuration object handed to pyhttp.
 *
 * @param {string} url - Request URL.
 * @param {string} [method='get'] - Request method.
 * @param {string} [data=''] - Request data.
 * @returns {{file: string, method: string, url: string, data: string}}
 */
const info = (url, method = 'get', data = '')=>({
    file: 'http.py', // Python script to run
    method,
    url,
    data
});
// Number of additional result pages requested after the first one.
const page = 4;
// Pass-through handler: returns a promise resolved with the text it was given.
const callback = (text)=>Promise.resolve(text);
// Fetch the first result page, then request the following pages in parallel,
// merge every page's postings into _result.list and write the result to disk.
pyhttp(info(encodeURI('http://sou.zhaopin.com/jobs/searchresult.ashx?' +
    'jl=北京&kw=web前端&sm=0&sf=10001&st=15000&el=4&we=0103&isfilter=1&p=1&et=2')), function(text){
    const p0 = deal(text, true);
    _result.list = p0.list;
    const n = [];
    // Without a pagination URL there is nothing further to request.
    const extraPages = p0.href ? page : 0;
    for(let i = 0; i < extraPages; i++){
        // `callback` belongs to pyhttp(); pushing it into `n` as a second
        // element would put a function into the Promise.all results.
        n.push(pyhttp(info(`${p0.href}&p=${i + 2}`), callback));
    }
    // Return the chain so the caller's promise also covers failures in here.
    return Promise.all(n).then((result)=>{
        for(const pageText of result){
            // Skip results that are not page text (e.g. a request that failed).
            if(typeof pageText !== 'string'){
                continue;
            }
            _result.list = _result.list.concat(deal(pageText).list);
        }
    }).then(()=>{
        fs.writeFile('./result/result.js', `window._result = ${JSON.stringify(_result, null, 4)};`, (error)=>{
            if(error){
                console.log(error);
            }else{
                console.log('写入数据成功！');
            }
        });
    });
});

将pyhttp.js和deal.js包含进来后，首先对智联的搜索页进行一次请求，回调函数内处理返回的源代码，将第一页数据添加到数组，并且获取到了分页的地址，使用Promise.all并行请求第2页到第n页，回调函数内对数据进行处理并添加到数组中，将数据写入result.js里面（选择js而不是json是为了便于数据在html上展现）。

获取到的数据：

1.jpg

5.页面上展现数据

/* 渲染单个数据 */
const Group = React.createClass({
// 处理a标签
dela: str => str.replace(/.*<\/a>/g, ''),
// 处理多出来的标签
delb: str => str.replace(/<\/?[^<>]>/g, '),
render: function(){
return (
 
{this.delb(this.props.obj.zwzp)} 
{this.props.obj.fklv} 
{this.dela(this.props.obj.gsmc)} 
{this.props.obj.gzdd} 
 
{decodeURI(this.props.obj.zldz)} 
{this.props.obj.gsxz} 
{this.props.obj.gsgm} 
); 
}
});
/* 表格类 */
const Table = React.createClass({
// 渲染组
group: function(){
return window._result.list.map((object, index)=>{
return ();
});
},
render: function(){
return (
职位 
反馈率 
公司名称 
工作地点 
智联地址 
公司性质 
公司规模 
);
}
});
ReactDOM.render(
, 
document.getElementById('result')
);

在页面上展示数据，使用react和bootstrap。其中在展示时，公司名称发现有无用a标签，职位内有b标签，使用正则表达式删除它们。

页面结果：

2.jpg

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。