docfast/dist/services/browser.js
OpenClaw Subagent 70eb6908e3
Some checks failed
Build & Deploy to Staging / Build & Deploy to Staging (push) Has been cancelled
Document rate limit headers in OpenAPI spec
- Add reusable header components (X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Reset, Retry-After)
- Reference headers in 200 responses on all conversion and demo endpoints
- Add Retry-After header to 429 responses
- Update Rate Limits section in API description to mention response headers
- Add comprehensive tests for header documentation (21 new tests)
- All 809 tests passing
2026-03-18 11:06:22 +01:00

311 lines
12 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import puppeteer from "puppeteer";
import logger from "./logger.js";
/**
 * Parse a positive integer setting from an environment variable.
 *
 * A missing, empty, non-numeric, zero or negative value yields `fallback`
 * instead of NaN/0, which would otherwise produce an empty pool in which
 * every request queues until it times out.
 *
 * @param {string | undefined} raw - Raw environment value.
 * @param {number} fallback - Value used when `raw` is not a positive integer.
 * @returns {number} The parsed positive integer, or `fallback`.
 */
function readPositiveInt(raw, fallback) {
  const parsed = Number.parseInt(raw || "", 10);
  return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
}

// Number of browser processes in the pool (env BROWSER_COUNT, default 2).
const BROWSER_COUNT = readPositiveInt(process.env.BROWSER_COUNT, 2);
// Number of pages opened per browser (env PAGES_PER_BROWSER, default 8).
const PAGES_PER_BROWSER = readPositiveInt(process.env.PAGES_PER_BROWSER, 8);
// A browser is scheduled for restart once it has rendered this many PDFs...
const RESTART_AFTER_PDFS = 1000;
// ...or once it has been up this long.
const RESTART_AFTER_MS = 60 * 60 * 1000; // 1 hour
// One entry per launched browser: { browser, availablePages, pdfCount, lastRestartTime, restarting, id }.
const instances = [];
// Callers waiting for a page; each entry exposes a `resolve({ page, instance })` callback.
const waitingQueue = [];
// Index of the instance the next round-robin pick starts from.
let roundRobinIndex = 0;
/**
 * Snapshot of the browser pool for monitoring.
 *
 * The clock is read once so `uptimeMs` can never be negative (two separate
 * `Date.now()` reads could differ by a tick when the pool is empty).
 *
 * @returns {{
 *   poolSize: number, totalPages: number, availablePages: number,
 *   queueDepth: number, pdfCount: number, restarting: boolean,
 *   uptimeMs: number,
 *   browsers: Array<{ id: *, available: number, pdfCount: number, restarting: boolean }>
 * }} Aggregate and per-browser counters. `uptimeMs` is measured from the
 *   first instance's last restart and is 0 when no instance exists.
 */
export function getPoolStats() {
  const now = Date.now();
  const totalAvailable = instances.reduce((sum, inst) => sum + inst.availablePages.length, 0);
  const totalPages = instances.length * PAGES_PER_BROWSER;
  const totalPdfs = instances.reduce((sum, inst) => sum + inst.pdfCount, 0);
  const startedAt = instances[0]?.lastRestartTime || now;
  return {
    poolSize: totalPages,
    totalPages,
    availablePages: totalAvailable,
    queueDepth: waitingQueue.length,
    pdfCount: totalPdfs,
    restarting: instances.some((inst) => inst.restarting),
    uptimeMs: now - startedAt,
    browsers: instances.map((inst) => ({
      id: inst.id,
      available: inst.availablePages.length,
      pdfCount: inst.pdfCount,
      restarting: inst.restarting,
    })),
  };
}
/**
 * Reset a page so it can be handed to the next render.
 *
 * Every step is best-effort and isolated: a failure in one step (for example
 * the CDP session cannot be created) no longer skips the remaining steps, so
 * the request listener and interception installed by renderUrlPdf are always
 * attempted to be removed before the page is reused.
 *
 * Steps, in order: clear the browser cache over CDP, remove "request"
 * listeners, disable request interception, delete the page's cookies, and
 * navigate to about:blank.
 *
 * @param {object} page - Puppeteer page to clean up.
 * @returns {Promise<void>} Always resolves; cleanup errors are ignored.
 */
export async function recyclePage(page) {
  // Run one cleanup step and ignore its failure so later steps still run.
  const attempt = async (step) => {
    try {
      await step();
    }
    catch {
      // ignore: recycling is best-effort
    }
  };

  await attempt(async () => {
    const client = await page.createCDPSession();
    try {
      await client.send("Network.clearBrowserCache");
    }
    finally {
      // Detach even when clearing the cache failed.
      await client.detach();
    }
  });
  // Clean up request interception (set by renderUrlPdf for SSRF protection)
  await attempt(() => {
    page.removeAllListeners("request");
  });
  await attempt(() => page.setRequestInterception(false));
  await attempt(async () => {
    const cookies = await page.cookies();
    if (cookies.length > 0) {
      await page.deleteCookie(...cookies);
    }
  });
  await attempt(() => page.goto("about:blank", { timeout: 5000 }));
}
/**
 * Open `count` pages on browser `b`, one after another.
 *
 * @param {object} b - Browser exposing an async `newPage()`.
 * @param {number} count - How many pages to open.
 * @returns {Promise<Array>} The opened pages, in creation order.
 */
async function createPages(b, count) {
  const opened = [];
  // Each page is awaited before the next one is requested.
  while (opened.length < count) {
    const fresh = await b.newPage();
    opened.push(fresh);
  }
  return opened;
}
/**
 * Choose the next browser instance that can serve a page.
 *
 * Scans the instances starting at `roundRobinIndex` and returns the first one
 * that is not restarting and has at least one available page, then advances
 * `roundRobinIndex` to the slot after it.
 *
 * @returns {object | null} The chosen instance, or null when none qualifies.
 */
function pickInstance() {
  const total = instances.length;
  let offset = 0;
  while (offset < total) {
    const slot = (roundRobinIndex + offset) % total;
    const candidate = instances[slot];
    const usable = candidate.availablePages.length > 0 && !candidate.restarting;
    if (usable) {
      // The next pick starts right after the instance just used.
      roundRobinIndex = (slot + 1) % total;
      return candidate;
    }
    offset += 1;
  }
  return null;
}
/**
 * Obtain a page from the pool, waiting up to 30 seconds if all are busy.
 *
 * Before picking, every instance that has reached RESTART_AFTER_PDFS renders
 * or RESTART_AFTER_MS of uptime (and is not already restarting) is scheduled
 * for a restart.
 *
 * When no page is free the caller is appended to `waitingQueue`. If the wait
 * times out, the caller's own queue entry is removed before rejecting, so a
 * later release never hands a page to a waiter that has already given up
 * (which would drop that page from the pool).
 *
 * @returns {Promise<{ page: object, instance: object }>} The page and the
 *   instance it belongs to.
 * @throws {Error} With message "QUEUE_FULL" when no page frees up within 30 s.
 */
async function acquirePage() {
  // Check restarts
  for (const inst of instances) {
    if (!inst.restarting && (inst.pdfCount >= RESTART_AFTER_PDFS || Date.now() - inst.lastRestartTime >= RESTART_AFTER_MS)) {
      // Deliberately not awaited: the restart proceeds in the background.
      scheduleRestart(inst);
    }
  }
  const inst = pickInstance();
  if (inst) {
    const page = inst.availablePages.pop();
    return { page, instance: inst };
  }
  // All pages busy, queue with 30s timeout
  return new Promise((resolve, reject) => {
    const waiter = {
      resolve: (v) => {
        clearTimeout(timer);
        resolve(v);
      },
    };
    const timer = setTimeout(() => {
      // Look the entry up by identity: its `resolve` is a wrapper, so it can
      // never be found by comparing against the promise's own `resolve`.
      const idx = waitingQueue.indexOf(waiter);
      if (idx >= 0)
        waitingQueue.splice(idx, 1);
      reject(new Error("QUEUE_FULL"));
    }, 30_000);
    waitingQueue.push(waiter);
  });
}
/**
 * Give a page back after a render.
 *
 * Increments the instance's PDF counter, then recycles the page in the
 * background. If a caller is waiting, the recycled page goes straight to it;
 * otherwise the page is returned to the instance's available list.
 *
 * If recycling rejects, a replacement page is requested from the instance's
 * browser (only when it has a browser and is not restarting). A waiter that
 * cannot be served is put back at the front of the queue.
 *
 * @param {object} page - The page that was used for the render.
 * @param {object} inst - The instance the page was acquired from.
 * @returns {void}
 */
function releasePage(page, inst) {
  inst.pdfCount++;
  const canReplace = () => Boolean(inst.browser) && !inst.restarting;
  const next = waitingQueue.shift();

  if (!next) {
    // Nobody is waiting: the page goes back to the idle list.
    const park = (p) => inst.availablePages.push(p);
    recyclePage(page)
      .then(() => {
        park(page);
      })
      .catch(() => {
        if (canReplace()) {
          inst.browser.newPage().then(park).catch(() => { });
        }
      });
    return;
  }

  // Someone is waiting: hand the page over directly.
  const deliver = (p) => next.resolve({ page: p, instance: inst });
  const requeue = () => {
    waitingQueue.unshift(next);
  };
  recyclePage(page)
    .then(() => deliver(page))
    .catch(() => {
      if (canReplace()) {
        inst.browser.newPage().then(deliver).catch(requeue);
      }
      else {
        requeue();
      }
    });
}
/**
 * Restart one browser instance in place.
 *
 * Marks the instance as restarting, waits up to 30 seconds for it to drain
 * (all of its pages idle and no callers queued), closes its idle pages and
 * browser, launches a new browser with a fresh set of pages, resets the
 * counters, and finally serves queued callers from the new pages.
 *
 * If the relaunch fails, the error is logged and the `restarting` flag is
 * cleared instead of leaving the instance flagged forever; the function then
 * resolves without throwing, because its caller does not await it. The
 * counters are left untouched in that case, so the restart condition checked
 * by acquirePage still holds and a later call will retry.
 *
 * The drain wait polls every 100 ms and leaves no timer behind once it ends.
 *
 * @param {object} inst - Pool instance to restart (mutated in place).
 * @returns {Promise<void>} Resolves when the restart finished or failed.
 */
async function scheduleRestart(inst) {
  if (inst.restarting)
    return;
  inst.restarting = true;
  logger.info(`Scheduling browser ${inst.id} restart (pdfs=${inst.pdfCount}, uptime=${Math.round((Date.now() - inst.lastRestartTime) / 1000)}s)`);
  try {
    // Wait (bounded) until every page is back and nobody is queued.
    const drainDeadline = Date.now() + 30_000;
    const isDrained = () => inst.availablePages.length === PAGES_PER_BROWSER && waitingQueue.length === 0;
    while (!isDrained() && Date.now() < drainDeadline) {
      await new Promise((wake) => setTimeout(wake, 100));
    }
    for (const page of inst.availablePages) {
      await page.close().catch(() => { });
    }
    inst.availablePages.length = 0;
    try {
      await inst.browser.close().catch(() => { });
    }
    catch { }
    const execPath = process.env.PUPPETEER_EXECUTABLE_PATH || undefined;
    inst.browser = await puppeteer.launch({
      headless: true,
      executablePath: execPath,
      args: ["--no-sandbox", "--disable-setuid-sandbox", "--disable-gpu", "--disable-dev-shm-usage"],
    });
    const pages = await createPages(inst.browser, PAGES_PER_BROWSER);
    inst.availablePages.push(...pages);
    inst.pdfCount = 0;
    inst.lastRestartTime = Date.now();
  }
  catch (err) {
    // Clear the flag first so the instance is never stuck in "restarting".
    inst.restarting = false;
    logger.error(`Browser ${inst.id} restart failed: ${err?.message ?? err}`);
    return;
  }
  inst.restarting = false;
  logger.info(`Browser ${inst.id} restarted successfully`);
  // Serve callers that queued up while the browser was down.
  while (waitingQueue.length > 0 && inst.availablePages.length > 0) {
    const waiter = waitingQueue.shift();
    const p = inst.availablePages.pop();
    if (waiter && p)
      waiter.resolve({ page: p, instance: inst });
  }
}
/**
 * Launch one browser and open its pages.
 *
 * If opening the pages fails, the freshly launched browser is closed before
 * the error is rethrown, so a failed start does not leave a browser process
 * behind.
 *
 * @param {*} id - Identifier stored on the instance (used in logs and stats).
 * @returns {Promise<{ browser: object, availablePages: Array, pdfCount: number,
 *   lastRestartTime: number, restarting: boolean, id: * }>} The new instance.
 * @throws Whatever `puppeteer.launch` or page creation throws.
 */
async function launchInstance(id) {
  const execPath = process.env.PUPPETEER_EXECUTABLE_PATH || undefined;
  const browser = await puppeteer.launch({
    headless: true,
    executablePath: execPath,
    args: ["--no-sandbox", "--disable-setuid-sandbox", "--disable-gpu", "--disable-dev-shm-usage"],
  });
  let pages;
  try {
    pages = await createPages(browser, PAGES_PER_BROWSER);
  }
  catch (err) {
    try {
      await browser.close();
    }
    catch {
      // ignore: the original error below is the one worth reporting
    }
    throw err;
  }
  const inst = {
    browser,
    availablePages: pages,
    pdfCount: 0,
    lastRestartTime: Date.now(),
    restarting: false,
    id,
  };
  return inst;
}
/**
 * Launch BROWSER_COUNT browser instances, one at a time, and register each
 * in `instances`; logs the resulting pool size when done.
 *
 * @returns {Promise<void>}
 */
export async function initBrowser() {
  let nextId = 0;
  while (nextId < BROWSER_COUNT) {
    const launched = await launchInstance(nextId);
    instances.push(launched);
    nextId += 1;
  }
  const poolTotal = BROWSER_COUNT * PAGES_PER_BROWSER;
  logger.info(`Browser pool ready (${BROWSER_COUNT} browsers × ${PAGES_PER_BROWSER} pages = ${poolTotal} total)`);
}
/**
 * Shut the pool down: for every instance, close its idle pages, then its
 * browser, ignoring close errors; finally empty the `instances` list.
 *
 * @returns {Promise<void>}
 */
export async function closeBrowser() {
  for (const entry of instances) {
    const idle = entry.availablePages;
    for (let k = 0; k < idle.length; k += 1) {
      // A page that fails to close must not stop the shutdown.
      await idle[k].close().catch(() => { });
    }
    entry.availablePages.length = 0;
    await entry.browser.close().catch(() => { });
  }
  instances.splice(0, instances.length);
}
/**
 * Build a Puppeteer-compatible PDFOptions object from user-supplied render options.
 *
 * Defaults: landscape off, backgrounds printed unless `printBackground` is
 * exactly `false`, and zero margins.
 *
 * Paper size: Puppeteer gives `format` priority over `width`/`height`, so the
 * "A4" default is applied only when the caller supplied neither a format nor
 * a custom dimension. Otherwise a caller's `width`/`height` would be ignored.
 * An explicit `format` is always passed through.
 *
 * @param {object} [options] - User-supplied render options.
 * @returns {object} Options object to pass to `page.pdf()`.
 */
export function buildPdfOptions(options = {}) {
  const hasCustomSize = Boolean(options.width || options.height);
  const result = {};
  if (options.format || !hasCustomSize)
    result.format = options.format || "A4";
  result.landscape = options.landscape || false;
  result.printBackground = options.printBackground !== false;
  result.margin = options.margin || { top: "0", right: "0", bottom: "0", left: "0" };
  if (options.headerTemplate !== undefined)
    result.headerTemplate = options.headerTemplate;
  if (options.footerTemplate !== undefined)
    result.footerTemplate = options.footerTemplate;
  if (options.displayHeaderFooter !== undefined)
    result.displayHeaderFooter = options.displayHeaderFooter;
  if (options.scale !== undefined)
    result.scale = options.scale;
  if (options.pageRanges)
    result.pageRanges = options.pageRanges;
  if (options.preferCSSPageSize !== undefined)
    result.preferCSSPageSize = options.preferCSSPageSize;
  if (options.width)
    result.width = options.width;
  if (options.height)
    result.height = options.height;
  return result;
}
/**
 * Render an HTML string to a PDF using a pooled page.
 *
 * JavaScript is disabled on the page, the HTML is loaded, a margin/padding
 * reset stylesheet is injected, and the PDF is produced with the options
 * from buildPdfOptions. The whole render races against a 30 second timer.
 * The page is always released back to the pool, on success or failure.
 *
 * @param {string} html - Markup to render.
 * @param {object} [options] - Render options forwarded to buildPdfOptions.
 * @returns {Promise<{ pdf: Buffer, durationMs: number }>} The PDF bytes and
 *   the render time in milliseconds.
 * @throws {Error} "PDF_TIMEOUT" when the render exceeds 30 s; otherwise any
 *   error raised while acquiring the page or rendering.
 */
export async function renderPdf(html, options = {}) {
  const { page, instance } = await acquirePage();
  try {
    await page.setJavaScriptEnabled(false);
    const startTime = Date.now();

    const produce = async () => {
      await page.setContent(html, { waitUntil: "domcontentloaded", timeout: 15_000 });
      await page.addStyleTag({ content: "* { margin: 0; padding: 0; } body { margin: 0; }" });
      const bytes = await page.pdf(buildPdfOptions(options));
      return Buffer.from(bytes);
    };

    const work = produce();
    let watchdog;
    const expiry = new Promise((_, reject) => {
      watchdog = setTimeout(() => reject(new Error("PDF_TIMEOUT")), 30_000);
    });

    let result;
    try {
      result = await Promise.race([work, expiry]);
    }
    finally {
      // Stop the timer whichever side of the race finished first.
      clearTimeout(watchdog);
    }

    const durationMs = Date.now() - startTime;
    logger.info(`PDF rendered in ${durationMs}ms (html, ${result.length} bytes)`);
    return { pdf: result, durationMs };
  }
  finally {
    releasePage(page, instance);
  }
}
/**
 * Render the document at `url` to a PDF using a pooled page.
 *
 * JavaScript is disabled on the page. When `options.hostResolverRules` has
 * the form `MAP <hostname> <ip>`, request interception is enabled and:
 *   - plain-HTTP requests to `<hostname>` are rewritten to `<ip>` with the
 *     original name in the Host header (a bare IPv6 address is bracketed,
 *     because the URL hostname setter rejects an unbracketed one and the
 *     request would otherwise go out unpinned);
 *   - HTTPS requests to `<hostname>` continue unchanged;
 *   - requests to any other host are aborted.
 *
 * The navigation plus PDF generation race against a 30 second timer. The CDP
 * session opened for the pinning setup is detached afterwards so sessions do
 * not pile up on a long-lived pooled page, and the page is always released.
 *
 * @param {string} url - Address to render.
 * @param {object} [options] - Render options; also read here:
 *   `hostResolverRules` (string) and `waitUntil` (default "domcontentloaded").
 * @returns {Promise<{ pdf: Buffer, durationMs: number }>} The PDF bytes and
 *   the render time in milliseconds.
 * @throws {Error} "PDF_TIMEOUT" when the render exceeds 30 s; otherwise any
 *   error raised while acquiring the page, setting it up, or rendering.
 */
export async function renderUrlPdf(url, options = {}) {
  const { page, instance } = await acquirePage();
  let client;
  try {
    await page.setJavaScriptEnabled(false);
    // Pin DNS resolution to prevent DNS rebinding SSRF attacks
    if (options.hostResolverRules) {
      client = await page.createCDPSession();
      await client.send("Network.enable");
      // Extract hostname and IP from rules like "MAP hostname ip"
      // NOTE(review): rules that do not match this pattern leave interception
      // off, so neither pinning nor host blocking is applied — confirm that
      // callers always pass `MAP <hostname> <ip>`.
      const match = options.hostResolverRules.match(/^MAP\s+(\S+)\s+(\S+)$/);
      if (match) {
        const [, hostname, ip] = match;
        // URL.hostname only accepts IPv6 literals in bracketed form.
        const pinnedHost = ip.includes(":") && !ip.startsWith("[") ? `[${ip}]` : ip;
        await page.setRequestInterception(true);
        page.on("request", (request) => {
          const reqUrl = new URL(request.url());
          if (reqUrl.hostname === hostname) {
            // For HTTP, rewrite to IP with Host header
            if (reqUrl.protocol === "http:") {
              reqUrl.hostname = pinnedHost;
              request.continue({
                url: reqUrl.toString(),
                headers: { ...request.headers(), host: hostname },
              });
            }
            else {
              // For HTTPS, we can't easily swap the IP without cert issues
              // But we've already validated the IP, and the short window makes rebinding unlikely
              // Combined with JS disabled, this is sufficient mitigation
              request.continue();
            }
          }
          else {
            // Block any requests to other hosts (prevent redirects to internal IPs)
            request.abort("blockedbyclient");
          }
        });
      }
    }
    const startTime = Date.now();
    let timeoutId;
    const result = await Promise.race([
      (async () => {
        await page.goto(url, {
          waitUntil: options.waitUntil || "domcontentloaded",
          timeout: 30_000,
        });
        const pdf = await page.pdf(buildPdfOptions(options));
        return Buffer.from(pdf);
      })(),
      new Promise((_, reject) => {
        timeoutId = setTimeout(() => reject(new Error("PDF_TIMEOUT")), 30_000);
      }),
    ]).finally(() => clearTimeout(timeoutId));
    const durationMs = Date.now() - startTime;
    logger.info(`PDF rendered in ${durationMs}ms (url, ${result.length} bytes)`);
    return { pdf: result, durationMs };
  }
  finally {
    if (client) {
      // Best-effort and not awaited, so a slow or failing detach can never
      // delay or prevent the page from being released.
      const session = client;
      Promise.resolve().then(() => session.detach()).catch(() => { });
    }
    releasePage(page, instance);
  }
}