browserless 对 ws 的处理实际上是一个 proxy:它对实际启动的无头浏览器的 ws 服务进行了代理,同时为了安全,browserless 还进行了 token 的处理
以下对于内部实现进行一个简单说明
参考处理
- ws route 注册
browserless 模块中的start 方法
wsRoutes 是实现WebSocketRoute 或者BrowserWebsocketRoute 的子类
const wsRoutes: Array<WebSocketRoute | BrowserWebsocketRoute> = [];
...
httpRoutes.forEach((r) => this.router.registerHTTPRoute(r));
wsRoutes.forEach((r) => this.router.registerWebSocketRoute(r));
...
...
BrowserWebsocketRoute 子类如下
- ChromiumCDPWebSocketRoute ws route 定义
可以看到路由需要一个 browser 对象,此对象是懒加载的,只有在需要的时候才会创建。每个路由 Route 都会通过 BrowserManager 对象实现浏览器的管理;对于不同浏览器的实现,路由直接传递实际的 browser 实现类,比如 ChromiumCDPWebSocketRoute 使用的是 ChromiumCDP
/**
 * WebSocket route that hands an incoming connection to a `ChromiumCDP`
 * browser instance.
 *
 * The route is declared on both `WebsocketRoutes['/']` and
 * `WebsocketRoutes.chromium`. The `browser` field holds the browser class
 * (not an instance); the instance is supplied to `handler` by the caller.
 */
export default class ChromiumCDPWebSocketRoute extends BrowserWebsocketRoute {
  name = BrowserlessRoutes.ChromiumCDPWebSocketRoute;
  auth = true;
  // Browser implementation class associated with this route.
  browser = ChromiumCDP;
  concurrency = true;
  description = `Launch and connect to Chromium with a library like puppeteer or others that work over chrome-devtools-protocol.`;
  path = [WebsocketRoutes['/'], WebsocketRoutes.chromium];
  tags = [APITags.browserWS];

  /**
   * Forwards the upgrade request to the browser's WebSocket proxy.
   * The logger argument is accepted but not used here.
   */
  handler = async (
    request: Request,
    duplex: Duplex,
    headBuffer: Buffer,
    _unusedLogger: Logger,
    chromium: ChromiumCDP,
  ): Promise<void> => {
    return chromium.proxyWebSocket(request, duplex, headBuffer);
  };
}
ChromiumCDP 实现的功能
ChromiumCDP 实现了实际通过websocket 访问浏览器的能力,同时也包含了对无头浏览器的启动管理,详细的可以查看ChromiumCDP 类
浏览器的启动
核心是 router 中对 websocket handler 的一个包装方法
/**
 * Registers a WebSocket route with the router.
 *
 * The route's handler is bound to the route, wrapped by
 * `wrapWebSocketHandler`, and then written back onto `route.handler`.
 * When `route.concurrency` is truthy the wrapped handler is additionally
 * passed through `this.limiter.limit` together with the queue-full and
 * timeout callbacks.
 *
 * @param route - The route to register; its `handler` property is replaced in place.
 * @returns The route that was passed in.
 */
public registerWebSocketRoute(
  route: WebSocketRoute | BrowserWebsocketRoute,
): WebSocketRoute | BrowserWebsocketRoute {
  this.log.trace(`Registering WebSocket "${route.path}"`);

  const bound = route.handler.bind(route);
  const wrapped = this.wrapWebSocketHandler(route, bound);

  // Concurrency control: routes that opt in run through the limiter,
  // all other routes use the wrapped handler directly.
  route.handler = route.concurrency
    ? this.limiter.limit(
        wrapped,
        this.onQueueFullWebSocket,
        this.onWebsocketTimeout,
        this.getTimeout,
      )
    : wrapped;

  // NOTE(review): the quoted excerpt stops after the assignment above; any
  // further registration bookkeeping in the real method is not shown here.
  // Returning `route` is what the declared return type calls for — confirm
  // against src/router.ts.
  return route;
}
wrapWebSocketHandler 的处理
/**
 * Builds the function that actually services a WebSocket upgrade for `route`.
 *
 * The returned async function:
 *  - does nothing (besides a warning) if the socket is no longer connected;
 *  - for routes without a `browser`, calls `handler` with a fresh logger;
 *  - for routes with a `browser`, asks `this.browserManager` for a browser,
 *    re-checks the socket, runs `handler` with that browser, and always
 *    calls `this.browserManager.complete` once the handler settles.
 *
 * @param route - The route whose handler is being wrapped.
 * @param handler - The route handler (already bound by the caller).
 */
protected wrapWebSocketHandler =
  (
    route: WebSocketRoute | BrowserWebsocketRoute,
    handler: WebSocketRoute['handler'] | BrowserWebsocketRoute['handler'],
  ) =>
  async (req: Request, socket: stream.Duplex, head: Buffer) => {
    if (!isConnected(socket)) {
      this.log.warn(`WebSocket Request has closed prior to running`);
      return Promise.resolve();
    }

    const requestLogger = new this.logger(route.name, req);

    // Plain WebSocket routes need no browser: run the handler right away.
    if (!('browser' in route && route.browser)) {
      return (handler as WebSocketRoute['handler'])(
        req,
        socket,
        head,
        requestLogger,
      );
    }

    // Obtain (or create) the browser for this request via the browser manager.
    const instance = await this.browserManager.getBrowserForRequest(
      req,
      route,
      requestLogger,
    );

    // The client may have gone away while the browser was being prepared.
    if (!isConnected(socket)) {
      this.log.warn(`WebSocket Request has closed prior to running`);
      this.browserManager.complete(instance);
      return Promise.resolve();
    }

    if (!instance) {
      return writeResponse(socket, 500, `Error loading the browser.`);
    }

    try {
      this.log.trace(`Running found WebSocket handler.`);
      await handler(req, socket, head, requestLogger, instance);
    } finally {
      // Runs whether the handler resolved or threw.
      this.log.trace(`WebSocket Request handler has finished.`);
      this.browserManager.complete(instance);
    }

    return;
  };
getBrowserForRequest 处理
整个代码还是比较长的,核心是基于参数进行浏览器的获取或者创建
/**
 * Resolves the browser that should serve `req`, launching a new one when
 * the request does not target an existing browser or page.
 *
 * Resolution order:
 *  1. Path contains `/devtools/browser`: look for a tracked browser whose
 *     `wsEndpoint()` contains the request path, bump its `numbConnected`
 *     and return it; otherwise throw `NotFound`.
 *  2. Path contains `/devtools/page` and the trailing id does not contain
 *     `BLESS_PAGE_IDENTIFIER`: query every tracked browser's
 *     `/json/list` endpoint for a page with that id, bump the owning
 *     session's `numbConnected` and return that browser; otherwise throw
 *     `NotFound`.
 *  3. Anything else: parse the `launch` query parameter, merge it over the
 *     route's default launch options, create and launch a new browser,
 *     register its session, run the browser hooks and return it.
 *
 * @param req - Incoming request; `req.parsed` supplies the pathname and search params.
 * @param router - Route being served; supplies the browser class and default launch options.
 * @param logger - Logger handed to the newly created browser.
 * @returns The located or newly launched browser.
 * @throws NotFound when a reconnect/page request matches no tracked browser.
 * @throws BadRequest when the `launch` parameter is not valid JSON.
 */
public getBrowserForRequest = async (
  req: Request,
  router: BrowserHTTPRoute | BrowserWebsocketRoute,
  logger: Logger,
): Promise<BrowserInstance> => {
  const { browser: Browser } = router;
  const blockAds = parseBooleanParam(
    req.parsed.searchParams,
    'blockAds',
    false,
  );
  const decodedLaunchOptions = convertIfBase64(
    req.parsed.searchParams.get('launch') || '{}',
  );
  let parsedLaunchOptions: BrowserServerOptions | CDPLaunchOptions;

  // Handle browser re-connects here
  if (req.parsed.pathname.includes('/devtools/browser')) {
    const sessions = Array.from(this.browsers);
    const id = req.parsed.pathname.split('/').pop() as string;
    const found = sessions.find(([b]) =>
      b.wsEndpoint()?.includes(req.parsed.pathname),
    );

    if (found) {
      const [browser, session] = found;
      ++session.numbConnected;
      this.log.debug(`Located browser with ID ${id}`);
      return browser;
    }

    throw new NotFound(
      `Couldn't locate browser "${id}" for request "${req.parsed.pathname}"`,
    );
  }

  // Handle page connections here
  if (req.parsed.pathname.includes('/devtools/page')) {
    const id = req.parsed.pathname.split('/').pop() as string;
    if (!id.includes(BLESS_PAGE_IDENTIFIER)) {
      const browsers = Array.from(this.browsers).map(([browser]) => browser);
      const allPages = await Promise.all(
        browsers
          .filter((b) => !!b.wsEndpoint())
          .map(async (browser) => {
            const { port } = new URL(
              browser.wsEndpoint() as unknown as string,
            );
            // A failed fetch is mapped to a non-ok stand-in so one
            // unreachable browser does not reject the whole lookup.
            const response = await fetch(
              `http://127.0.0.1:${port}/json/list`,
              {
                headers: {
                  Host: '127.0.0.1',
                },
              },
            ).catch(() => ({
              json: () => Promise.resolve([]),
              ok: false,
            }));

            if (response.ok) {
              // Only `id` is read below; other fields are carried along by the spread.
              const pages: Array<{ id: string }> = await response.json();
              return pages.map((page) => ({ ...page, browser }));
            }

            // Contribute no pages for this browser. Returning `null` here
            // would survive `.flat()` and crash the `find` below on
            // `null.id` instead of producing the NotFound error.
            return [];
          }),
      );
      const found = allPages.flat().find((page) => page.id === id);

      if (found) {
        const session = this.browsers.get(found.browser)!;
        ++session.numbConnected;
        return found.browser;
      }

      throw new NotFound(
        `Couldn't locate browser "${id}" for request "${req.parsed.pathname}"`,
      );
    }
  }

  try {
    parsedLaunchOptions = JSON.parse(decodedLaunchOptions);
  } catch (err) {
    throw new BadRequest(
      `Error parsing launch-options: ${err}. Launch options must be a JSON or base64-encoded JSON object`,
    );
  }

  // Route defaults may be a static object or computed per request.
  const routerOptions =
    typeof router.defaultLaunchOptions === 'function'
      ? router.defaultLaunchOptions(req)
      : router.defaultLaunchOptions;

  // Options supplied on the request override the route defaults.
  const launchOptions = {
    ...routerOptions,
    ...parsedLaunchOptions,
  };

  const manualUserDataDir =
    launchOptions.args
      ?.find((arg) => arg.includes('--user-data-dir='))
      ?.split('=')[1] || (launchOptions as CDPLaunchOptions).userDataDir;

  // Always specify a user-data-dir since plugins can "inject" their own
  // unless it's playwright which takes care of its own data-dirs
  const userDataDir =
    manualUserDataDir ||
    (!this.playwrightBrowserNames.includes(Browser.name)
      ? await generateDataDir(undefined, this.config)
      : null);

  const proxyServerArg = launchOptions.args?.find((arg) =>
    arg.includes('--proxy-server='),
  );

  /**
   * If it is a playwright request, move the `--proxy-server=` argument
   * into the `proxy` launch option and remove it from `args`.
   */
  if (
    launchOptions.args &&
    proxyServerArg &&
    req.parsed.pathname.startsWith('/playwright')
  ) {
    (launchOptions as BrowserServerOptions).proxy = {
      server: proxyServerArg.split('=')[1],
    };
    const argIndex = launchOptions.args.indexOf(proxyServerArg);
    launchOptions.args.splice(argIndex, 1);
  }

  const browser = new Browser({
    blockAds,
    config: this.config,
    logger,
    userDataDir,
  });

  const session: BrowserlessSession = {
    id: null,
    initialConnectURL:
      path.join(req.parsed.pathname, req.parsed.search) || '',
    // The data dir counts as temporary only when the caller did not supply one.
    isTempDataDir: !manualUserDataDir,
    launchOptions,
    numbConnected: 1,
    resolver: noop,
    routePath: router.path,
    startedOn: Date.now(),
    ttl: 0,
    userDataDir,
  };

  this.browsers.set(browser, session);

  // Version captured from the user-agent via `pwVersionRegex`, else 'default'.
  const match = (req.headers['user-agent'] || '').match(pwVersionRegex);
  const pwVersion = match ? match[1] : 'default';

  // Launch the browser
  await browser.launch(launchOptions as object, pwVersion);

  // Run the browser hooks
  await this.hooks.browser({ browser, meta: req.parsed });

  browser.on('newPage', async (page) => {
    await this.onNewPage(req, page);
    (router.onNewPage || noop)(req.parsed || '', page);
  });

  return browser;
};
说明
browserless 对于 ws 的处理实际上就是一个 ws proxy,对浏览器的管理基于 BrowserManager;同时为了确保稳定,基于 queue 队列实现了限速处理。
以上是一个简单说明,通过此可以简单了解内部处理
参考资料
src/browsers/chrome.cdp.ts
src/browsers/chromium.cdp.ts
src/router.ts
src/browsers/index.ts
src/browserless.ts
https://docs.browserless.io/open-api#tag/Browser-WebSocket-APIs
https://github.com/berstend/puppeteer-extra/tree/master/packages/playwright-extra
https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra