Loading...
Loading...
Search and scrape public web content with headless Chrome and DuckDuckGo using safe practices.
npx skill4agent add besoeasy/open-skills using-web-scraping
/robots.txt, User-Agent, login_required, title, meta description, main, canonical
const { chromium } = require('playwright');
/**
 * Search DuckDuckGo for `query`, open the first result, and extract basic
 * page metadata from it.
 *
 * The browser is always closed before this function settles, including when
 * a navigation, selector wait, or extraction step rejects.
 *
 * @param {string} query - Text typed into the DuckDuckGo search box.
 * @returns {Promise<Array<{url: string, title: string, description: (string|null), text: (string|null)}>>}
 *   A one-element array describing the top result, or an empty array when the
 *   first result link has no `href`.
 * @throws Rejects with whatever error the underlying browser calls raise
 *   (launch, navigation, waiting for the results selector, reading the title).
 */
async function ddgSearchAndScrape(query) {
  const browser = await chromium.launch({ headless: true });
  try {
    const page = await browser.newPage({ userAgent: 'open-skills-bot/1.0' });

    // DuckDuckGo search
    await page.goto('https://duckduckgo.com/');
    await page.fill('input[name="q"]', query);
    await page.keyboard.press('Enter');
    await page.waitForSelector('.result__title a');

    // Collect the top result URL; nothing to scrape without one.
    const href = await page.getAttribute('.result__title a', 'href');
    if (!href) {
      return [];
    }

    // Visit the result and extract its metadata.
    await page.goto(href, { waitUntil: 'domcontentloaded' });
    const title = await page.title();
    // Description and body text are best-effort: a failed lookup yields null
    // rather than failing the whole scrape.
    const description = await page
      .locator('meta[name="description"]')
      .getAttribute('content')
      .catch(() => null);
    const article = await page
      .locator('article, main, #content')
      .first()
      .innerText()
      .catch(() => null);

    return [{ url: href, title, description, text: article }];
  } finally {
    // Runs on every exit path so a failed step never leaks the browser.
    await browser.close();
  }
}
// usage
// ddgSearchAndScrape('open-source agent runtimes').then(console.log);

You are an agent with a web-scraping skill. For any `search:` task, use DuckDuckGo to find relevant pages, then open each page in a headless Chrome instance (Playwright/Puppeteer) and extract the `title`, `meta description`, `main text`, and `canonical` URL. Always:
- Check and respect robots.txt
- Rate-limit requests (<=1 req/sec)
- Use a clear `User-Agent` and do not execute arbitrary page JS
Return results as JSON: [{url,title,description,text}] or `login_required` if a page needs authentication.

npm i playwright
npx playwright install
pip install playwright
playwright install
page.route