const puppeteer = require('puppeteer');
const fs = require('fs');

// Output CSV of verified sites, and the JSON file holding resumable crawl state.
const CSV_FILE = './casino_affiliate_sites.csv';
const CP_FILE = './scraper_checkpoint.json';

// Casino brand keywords for link detection.
// Each keyword is matched as a substring of an outbound hostname, so it must
// be spelled the way the brand appears in a domain name: no spaces, and no
// punctuation that the real domain does not contain.
const CASINO_KW = [
  'bet365', '888casino', 'skyvegas', 'pokerstars', 'partypoker',
  'unibet', 'bwin', 'betway', 'williamhill', 'ladbrokes',
  'betfair', 'draftkings', 'fanduel', 'betmgm', 'caesars',
  'barstoolsports', 'leovegas', 'bovada', 'ignitioncasino',
  'mrplay', 'jackpotcity', 'casumo', 'playojo', '22bet',
  'paddypower', '1xbet', 'betonline', 'intertops', 'reddog',
  'luckystrike', 'betclic', 'betsson', 'hardrock', 'betano',
  'grosvenor', 'coral', 'skybet', 'mrgreen', 'betsafe',
];

// Domains to skip (social, news, etc.). Entries are substring patterns, so
// 'google.' also covers country-specific Google hosts.
const SKIP = [
  'youtube.com', 'facebook.com', 'twitter.com', 'linkedin.com', 'reddit.com',
  'instagram.com', 'tiktok.com', 'pinterest.com', 'medium.com', 'github.com',
  'wikipedia.org', 'google.', 'amazon.', 'web.archive.org',
  'www.googletagmanager.com', 'www.google-analytics.com',
  'fonts.googleapis.com', 'cdnjs.cloudflare.com', 'static.',
];

/**
 * Pause for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Resolves once the delay has elapsed.
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

// Seed sites: known casino review portals to start crawling from
const SEED_URLS = [
  'https://www.casino.org/reviews/',
  'https://casino.guru/casino-reviews',
  'https://www.askgamblers.com/online-casinos/reviews',
  'https://chipy.com/casinos',
  'https://www.racingpost.com/online-casino/best-sites/',
  'https://slotcatalog.com/en/best-online-casinos',
  'https://www.gambling.com/uk/online-casinos',
  'https://next.io/online-casinos-uk/',
  'https://first.com/casino/best-casinos',
  'https://www.oddschecker.com/casino-bonus',
];
/**
 * Reduce raw anchor hrefs to the distinct external hostnames they point at.
 *
 * Only http/https links are kept, a leading "www." is dropped from every
 * hostname, and links back to the page's own host are ignored.
 *
 * @param {Iterable<string>} hrefs - Absolute hrefs collected from the page.
 * @param {string} selfHost - Hostname of the page the hrefs came from.
 * @returns {string[]} Unique outbound hostnames in first-seen order.
 */
function extractOutboundDomains(hrefs, selfHost) {
  const stripWww = (host) => host.replace(/^www\./, '');
  const ownDomain = stripWww(String(selfHost).toLowerCase());
  const domains = new Set();

  for (const href of hrefs) {
    let parsed;
    try {
      parsed = new URL(href);
    } catch {
      continue; // Not an absolute URL; nothing to record.
    }
    // URL.protocol includes the trailing colon ("https:").
    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') continue;

    const domain = stripWww(parsed.hostname);
    if (domain === ownDomain) continue;
    domains.add(domain);
  }
  return [...domains];
}

/**
 * Fetch a page with puppeteer and extract all outbound link domains.
 *
 * The skip list is NOT applied here; callers filter the returned domains.
 * The page opened for the crawl is always closed, including on failure.
 *
 * @param {object} browser - Puppeteer browser (anything exposing newPage()).
 * @param {string} url - Page to load.
 * @returns {Promise<{url: string, title: string, outboundLinks: string[]}|null>}
 *   Crawl result, or null if the page could not be loaded or read.
 */
async function crawlSite(browser, url) {
  let page;
  try {
    page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120');
    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 15000 });

    // Wait briefly for lazy-loaded links
    await sleep(2000);

    // Read the host after navigation so redirects are taken into account.
    const selfHost = await page.evaluate(() => window.location.hostname);
    // Only collect raw hrefs in the browser; filtering happens in Node.
    const hrefs = await page.evaluate(() =>
      Array.from(document.querySelectorAll('a[href]'), (a) => a.href),
    );
    const title = await page.title();

    return { url, title, outboundLinks: extractOutboundDomains(hrefs, selfHost) };
  } catch (err) {
    console.log(` [fail] ${url}: ${err.message}`);
    return null;
  } finally {
    if (page) {
      try {
        await page.close();
      } catch {
        // Best-effort cleanup: the page may already be gone with the browser.
      }
    }
  }
}
/**
 * Count how many different casino brands a page links to.
 *
 * Every outbound hostname is checked against the whole keyword list and
 * contributes at most one brand (the first keyword it contains). Keywords are
 * compared case-insensitively, both verbatim and with spaces/punctuation
 * removed, so a keyword written as "paddy power" still matches "paddypower.com".
 *
 * @param {string[]} outboundLinks - Outbound hostnames found on a page.
 * @returns {number} Number of distinct brand keywords matched.
 */
function countCasinoBrands(outboundLinks) {
  const needles = CASINO_KW.map((kw) => {
    const raw = kw.toLowerCase();
    return { kw, raw, compact: raw.replace(/[^a-z0-9]/g, '') };
  });

  const matched = new Set();
  for (const domain of outboundLinks) {
    const host = domain.toLowerCase();
    const hit = needles.find(
      ({ raw, compact }) => host.includes(raw) || (compact !== '' && host.includes(compact)),
    );
    if (hit) matched.add(hit.kw);
  }
  return matched.size;
}

/**
 * Load the crawl checkpoint from CP_FILE.
 *
 * @returns {object|null} Parsed checkpoint, or null when there is none or it
 *   cannot be used (a warning is printed unless the file simply does not exist).
 */
function loadCP() {
  try {
    const parsed = JSON.parse(fs.readFileSync(CP_FILE, 'utf8'));
    return parsed !== null && typeof parsed === 'object' ? parsed : null;
  } catch (err) {
    if (err.code !== 'ENOENT') {
      console.warn(`Ignoring unreadable checkpoint ${CP_FILE}: ${err.message}`);
    }
    return null;
  }
}

/**
 * Persist the crawl checkpoint to CP_FILE.
 *
 * The data is written to a temporary sibling file and renamed into place so
 * an interrupted run cannot leave a half-written checkpoint behind.
 *
 * @param {object} data - JSON-serializable checkpoint state.
 */
function saveCP(data) {
  const tmpFile = `${CP_FILE}.tmp`;
  fs.writeFileSync(tmpFile, JSON.stringify(data));
  fs.renameSync(tmpFile, CP_FILE);
}

/** Quote a value as a CSV field, doubling embedded quotes. */
function csvField(value) {
  return `"${String(value ?? '').replace(/"/g, '""')}"`;
}

/**
 * Normalize a stored site entry to the shape the CSV writer reads.
 * Accepts the older `brands` / `casinoBrands` keys so existing checkpoint
 * files keep working.
 */
function toSiteRecord(domain, site) {
  return {
    url: site.url,
    title: site.title || '',
    domain: site.domain || domain,
    casino_brands: site.casino_brands ?? site.brands ?? site.casinoBrands ?? 0,
  };
}

/**
 * Crawl a candidate's home page (https first, then http) and count the casino
 * brands it links to. Every attempted URL is recorded in `visited`.
 *
 * @returns {Promise<object|null>} Crawl summary, or null if the candidate was
 *   already visited or could not be loaded.
 */
async function verifyCandidate(browser, domain, visited) {
  for (const u of [`https://${domain}`, `http://${domain}`]) {
    if (visited.has(u)) break;
    visited.add(u);
    const data = await crawlSite(browser, u);
    if (!data) continue;
    return {
      domain,
      url: u,
      title: data.title,
      brands: countCasinoBrands(data.outboundLinks),
      outboundLinks: data.outboundLinks,
    };
  }
  return null;
}

/**
 * Run the crawl: seed pages first, then candidate verification, then write
 * the CSV of every known affiliate site.
 */
async function main() {
  console.log('══════════ Casino Affiliate Crawler v2 ══════════');
  console.log(`Seeds : ${SEED_URLS.length}`);
  console.log(`Goal : 1000+ sites with ≥5 casino brand links\n`);

  const CONC = 3; // concurrent pages in puppeteer
  const TARGET = 1200; // stop verifying once this many sites are known
  const MIN_BRANDS = 5; // brand links required to count as an affiliate site

  const allSites = new Map(); // domain → {url, title, domain, casino_brands}
  const visited = new Set(); // URLs we already crawled for outbound links
  const candidates = []; // Queue of candidate DOMAINS to discover
  const queued = new Set(); // domains already present in `candidates`

  // Queue outbound domains that are neither skipped, known, nor already queued.
  const enqueue = (domains, from) => {
    for (const domain of domains) {
      if (SKIP.some((k) => domain.includes(k))) continue;
      if (allSites.has(domain) || queued.has(domain)) continue;
      queued.add(domain);
      candidates.push({ domain, from });
    }
  };

  const checkpoint = () =>
    saveCP({
      sites: Object.fromEntries(allSites),
      visited: [...visited],
      candidates: candidates.filter((c) => !allSites.has(c.domain)),
    });

  const cp = loadCP();
  if (cp) {
    console.log(`Resuming checkpoint...`);
    for (const [domain, site] of Object.entries(cp.sites ?? {})) {
      allSites.set(domain, toSiteRecord(domain, site));
    }
    for (const u of cp.visited ?? []) visited.add(u);
    for (const c of cp.candidates ?? []) {
      if (queued.has(c.domain)) continue;
      queued.add(c.domain);
      candidates.push(c);
    }
    console.log(` ${allSites.size} known sites, ${visited.size} visited, ${candidates.length} candidates\n`);
  }

  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-dev-shm-usage', '--disable-setuid-sandbox'],
  });

  try {
    // ── Phase 1: Crawl seed sites & extract outbound links ────────
    console.log('▶ Phase 1: Crawling seed sites...\n');
    for (let i = 0; i < SEED_URLS.length; i++) {
      const sUrl = SEED_URLS[i];
      if (visited.has(sUrl)) continue;
      visited.add(sUrl);

      console.log(`[${i + 1}/${SEED_URLS.length}] ${sUrl}`);
      const data = await crawlSite(browser, sUrl);
      if (!data) {
        await sleep(2000);
        continue;
      }

      // The seed itself is a confirmed affiliate site; 99 is a sentinel count.
      const dName = new URL(sUrl).hostname.replace(/^www\./, '');
      allSites.set(dName, { url: sUrl, title: data.title, domain: dName, casino_brands: 99 });

      // Add outbound domains to candidates
      enqueue(data.outboundLinks, dName);
      console.log(` → Found ${data.outboundLinks.length} outbound links\n`);

      checkpoint();
      await sleep(3000);
    }

    // ── Phase 2: Verify candidates by checking their outbound links ───────
    console.log(`▶ Phase 2: Verifying ${candidates.length} candidate sites...\n`);
    // `candidates` grows while we iterate, so re-read its length every pass.
    for (let i = 0; i < candidates.length && allSites.size < TARGET; i += CONC) {
      const batch = candidates.slice(i, i + CONC);
      const checks = await Promise.all(
        batch.map((cand) =>
          allSites.has(cand.domain) ? null : verifyCandidate(browser, cand.domain, visited),
        ),
      );

      let crawled = false;
      for (const result of checks) {
        if (!result) continue;
        crawled = true;
        // Must link to ≥5 different casino brands to count as an affiliate/review site
        if (result.brands < MIN_BRANDS) continue;

        allSites.set(result.domain, {
          url: result.url,
          title: result.title,
          domain: result.domain,
          casino_brands: result.brands,
        });
        console.log(` ✓ ${result.domain} (${result.brands} brands)`);

        // Discover more candidates from the links gathered by the same crawl.
        enqueue(result.outboundLinks, result.domain);
      }

      // Batches made up only of already-visited candidates (typical right
      // after a resume) changed nothing: skip the save and the delay.
      if (!crawled) continue;

      checkpoint();
      const remaining = Math.max(0, candidates.length - (i + CONC));
      console.log(` ${allSites.size} verified, ${remaining} remaining\n`);
      await sleep(4000); // Rate limit to avoid IP bans
    }
  } finally {
    await browser.close();
  }

  // ── Write CSV ───────
  const header = 'url,title,domain,casino_brands';
  const rows = [...allSites.values()]
    .sort((a, b) => a.url.localeCompare(b.url))
    .map((v) => [csvField(v.url), csvField(v.title), csvField(v.domain), v.casino_brands].join(','));
  fs.writeFileSync(CSV_FILE, [header, ...rows].join('\n'), 'utf8');
  console.log(`\n══════ ${allSites.size} verified sites → ${CSV_FILE} ══════`);
}

// Start the crawl only when this file is executed directly, so loading it
// from elsewhere does not launch a browser.
if (typeof require !== 'undefined' && require.main === module) {
  main().catch((err) => {
    console.error(`Crawler aborted: ${err.stack || err.message}`);
    process.exitCode = 1;
  });
}