'use strict';

const { execFile } = require('child_process');
const fs = require('fs');

// Output CSV written at the end of the crawl.
const CSV_FILE = './casino_affiliates_final.csv';
// JSON checkpoint used to resume an interrupted crawl.
const CHECKPOINT = './crawlsave.json';
// User-Agent header sent with every curl request.
const UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4) AppleWebKit/605.1.15 Version/17 Safari/605.1';

// Casino keywords for brand detection in outbound links
const CASINO_KEYWORDS = [
  'bet365', '888casino', 'skyvegas', 'pokerstars', 'partypoker',
  'unibet', 'bwin', 'betway', 'williamhill', 'ladbrokes', 'betfair',
  'draftkings', 'fanduel', 'betmgm', 'caesars', 'barstool',
  'leovegas', 'bovada', 'ignitioncasino', 'mrplay', 'jackpotcity',
  'casumo', 'playojo', '22bet', 'paddypower', '1xbet',
  'betonline', 'intertops', 'reddogcas', 'luckystick',
  'betclic', 'betsson', 'hardrock', 'betano', 'grosvenor',
  // 'mrgreen' added: the dotted spelling cannot match the host name mrgreen.com.
  'coral', 'skybet', 'mr.green', 'mrgreen', 'betsafe', 'comeon.se',
  'slots.lv', 'bc.game', 'pin-up', 'stake.cas',
  '/casino/', 'casinos.', 'online-casino', 'gambling-sites',
];

// Domains to skip (social, CDN, analytics)
const SKIP = [
  // 'instagram.' has no leading dot: host names are compared after "www." is
  // stripped, so "instagram.com" never contained the old '.instagram.' entry.
  'youtube.', 'facebook.', 'twitter', 'instagram.',
  'linkedin', 'tiktok.', 'wikipedia.', 'pinterest',
  'medium.', 'forbes.', 'google.', 'amazon.', 'static.',
  'cdn.', 'fonts.googleapis.', 'flickr.', 'imgur.',
  'github.com', 't.co', 'duckduckgo',
];

// Seed review portal URLs to start crawling from
const SEED_URLS = [
  'https://www.casino.org/reviews/',
  'https://casino.guru/casino-reviews',
  'https://chipy.com/casinos',
  'https://next.io/online-casinos-uk/',
  'https://first.com/casino/best-casinos',
  'https://slotcatalog.com/en/best-online-casinos',
  'https://www.whichbingo.co.uk/casino-sites/',
  'https://gg.co.uk/online-casinos/top-20/',
  'https://www.oddschecker.com/casino-bonus',
  'https://www.racingpost.com/online-casino/best-sites/',
  'https://www.livecasinocomparer.com/online-casino/',
  'https://www.freep.com/story/sports/online-casino-rankings/',
  'https://gamingamerica.com/online-casinos',
  'https://www.legalsportsreport.com/online-casinos/',
  'https://www.sportsline.com/casinos/',
  'https://deadspin.com/sweepstakes-casinos/social-casinos/',
];
// Additional pages per seed to crawl (multi-page review lists).
// Each key must equal a SEED_URLS entry; each value is a suffix appended to that seed URL.
const MULTI_PAGES = {
  'https://www.casino.org/reviews/': ['page2/', 'page3/'],
  'https://casino.guru/casino-reviews': ['?p=2', '?p=3', '?p=4', '?p=5'],
};

/**
 * Resolve after `ms` milliseconds.
 * @param {number} ms
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Run curl against `url` and resolve with whatever it wrote to stdout.
 * curl errors are deliberately ignored: partial output is still returned and
 * a total failure yields ''.
 * @param {string} url
 * @param {string[]} [extraArgs] extra curl arguments inserted before the URL
 * @returns {Promise<string>}
 */
function runCurl(url, extraArgs = []) {
  return new Promise((resolve) => {
    // NOTE(review): --max-filesize presumably makes curl refuse bodies it knows
    // exceed 50 kB, so large pages may come back empty — confirm this cap is intended.
    const args = ['-sL', '--max-time', '8', '-A', UA, '--max-filesize', '50000', ...extraArgs, url];
    execFile('curl', args, { timeout: 12000 }, (_err, output) => resolve(output || ''));
  });
}

/**
 * Fetch a page via curl (reliable against bot protection).
 * @param {string} url
 * @returns {Promise<string>} response body, or '' on any failure
 */
async function fetchPage(url) {
  try {
    return await runCurl(url);
  } catch (e) {
    // execFile can throw synchronously (e.g. invalid arguments).
    return '';
  }
}

// Separator curl is asked to print between the body and the effective URL.
const FINAL_URL_MARKER = '\n@@CRAWL_FINAL_URL@@';

/**
 * Fetch a page and also report the URL curl ended up on after redirects,
 * using curl's `-w %{url_effective}` write-out variable.
 * @param {string} url
 * @returns {Promise<{html: string, finalUrl: string}>} finalUrl falls back to `url`
 */
async function fetchPageWithFinalUrl(url) {
  let raw = '';
  try {
    raw = await runCurl(url, ['-w', `${FINAL_URL_MARKER}%{url_effective}`]);
  } catch (e) {
    return { html: '', finalUrl: url };
  }
  const cut = raw.lastIndexOf(FINAL_URL_MARKER);
  if (cut === -1) return { html: raw, finalUrl: url }; // output truncated before the trailer
  const reported = raw.slice(cut + FINAL_URL_MARKER.length).trim();
  return {
    html: raw.slice(0, cut),
    finalUrl: reported.startsWith('http') ? reported : url,
  };
}

/**
 * Drop a leading "www." label from a host name.
 * @param {string} hostname
 * @returns {string}
 */
function stripWww(hostname) {
  return hostname.replace(/^www\./, '');
}

/**
 * Decide whether a domain is on the SKIP list.
 * Entries that are complete host names (e.g. 't.co', 'github.com') match the
 * host itself or its subdomains only; as substrings they would also discard
 * unrelated hosts such as "unibet.com". All other entries stay substring matches.
 * @param {string} domain lower-case host name without "www."
 * @returns {boolean}
 */
function isSkippedDomain(domain) {
  return SKIP.some((entry) => {
    const isFullHost = /^[a-z0-9-]+(\.[a-z0-9-]+)+$/.test(entry);
    if (isFullHost) return domain === entry || domain.endsWith(`.${entry}`);
    return domain.includes(entry);
  });
}

/**
 * Parse HTML: extract unique outbound domains + page title.
 * @param {string} html raw page markup
 * @param {string} skipDomain the page's own domain, excluded from the result
 * @returns {{doms: string[], title: string}}
 */
function parseOutboundLinks(html, skipDomain) {
  if (typeof html !== 'string' || html === '') return { doms: [], title: '' };

  // NOTE(review): the two patterns here were empty in the source as received
  // (which is a syntax error); stripping <script>/<style> blocks is the
  // presumed original intent — confirm.
  const clean = html
    .replace(/<script\b[\s\S]*?<\/script>/gi, '')
    .replace(/<style\b[\s\S]*?<\/style>/gi, '');

  const doms = new Set();
  for (const m of clean.matchAll(/href=["'](https?:\/\/[^"'\s>]+)["']/gi)) {
    let host;
    try {
      host = stripWww(new URL(m[1]).hostname);
    } catch {
      continue; // ignore broken hrefs
    }
    if (host === skipDomain || isSkippedDomain(host)) continue;
    doms.add(host);
  }

  // Try to extract page title from the <title> tag
  const tt = clean.match(/<title[^>]*>([^<]{10,250})<\/title>/i);
  const title = tt ? tt[1].trim() : '';
  return { doms: [...doms], title };
}

/**
 * Count how many entries of `domainList` look like casino sites: the entry
 * contains "casino", "/casin" or "/gambl", or any CASINO_KEYWORDS entry
 * (compared case-insensitively, with and without the keyword's slashes).
 * @param {string[]} domainList
 * @returns {number} number of distinct matching entries
 */
function countCasinoBrands(domainList) {
  // Prepare each keyword once instead of once per domain.
  const needles = CASINO_KEYWORDS
    .filter((kw) => kw.length >= 3)
    .map((kw) => {
      const lower = kw.toLowerCase();
      return [lower, lower.replace(/\//g, '')];
    });

  const hits = new Set();
  for (const d of domainList) {
    const lo = d.toLowerCase();
    const generic = lo.includes('casino') || lo.includes('/casin') || lo.includes('/gambl');
    if (generic || needles.some(([full, bare]) => lo.includes(full) || lo.includes(bare))) {
      hits.add(d);
    }
  }
  return hits.size;
}

/**
 * Persist the crawl state (survives crashes/timeouts mid-run).
 * Writes to a temp file and renames it so an interrupted write cannot leave a
 * half-written checkpoint behind.
 * @param {object} obj JSON-serialisable state
 */
function checkSave(obj) {
  const tmp = `${CHECKPOINT}.tmp`;
  fs.writeFileSync(tmp, JSON.stringify(obj));
  fs.renameSync(tmp, CHECKPOINT);
}

/**
 * Load the last checkpoint.
 * @returns {object|null} parsed state, or null when missing or unreadable
 */
function checkLoad() {
  if (!fs.existsSync(CHECKPOINT)) return null;
  try {
    return JSON.parse(fs.readFileSync(CHECKPOINT, 'utf8'));
  } catch (err) {
    console.warn('  [checkpoint unreadable, starting fresh] ' + err.message);
    return null;
  }
}

/**
 * Quote one CSV field: whitespace runs (including newlines) are collapsed and
 * embedded double quotes are doubled.
 * @param {*} value
 * @returns {string}
 */
function csvField(value) {
  const text = String(value == null ? '' : value).replace(/\s+/g, ' ').trim();
  return '"' + text.replace(/"/g, '""') + '"';
}

/**
 * Fetch one candidate domain (HTTPS first, HTTP as fallback) and score it.
 * @param {string} dom candidate host name
 * @returns {Promise<null|{isAffiliate: boolean, url: string, title: string, brands: number, doms: string[]}>}
 *   null when neither protocol returned usable content
 */
async function verifyCandidate(dom) {
  for (const proto of ['https://', 'http://']) {
    const url = proto + dom;
    const page = await fetchPageWithFinalUrl(url);
    if (!page.html || page.html.length < 400) continue;

    // After redirects, use the real final URL (normalised to its origin).
    let actualUrl = url;
    let selfD = dom;
    try {
      const finalUrl = new URL(page.finalUrl);
      actualUrl = finalUrl.origin;
      selfD = stripWww(finalUrl.hostname) || dom;
    } catch {
      // keep the requested URL and domain
    }

    const data = parseOutboundLinks(page.html, selfD);
    const brandCount = countCasinoBrands(data.doms);
    // Count outbound links too — review portals often have lots of casino brand refs
    const linkScore = data.doms.length > 10 ? Math.min(data.doms.length / 2, 8) : 0;
    // Link volume alone reaches 6 at twelve outbound domains, so at least one
    // brand hit is required; otherwise any link-heavy page would be accepted.
    // NOTE(review): this tightens the original heuristic — confirm it is wanted.
    const isAffiliate = brandCount >= 3 || (brandCount >= 1 && brandCount + linkScore >= 6);
    return { isAffiliate, url: actualUrl, title: data.title, brands: brandCount, doms: data.doms };
  }
  return null;
}

// ═══════ MAIN CRAWL LOGIC ═══════
/**
 * Crawl the seed portals, verify the domains they link to, and write the CSV.
 * State is checkpointed so an interrupted run resumes where it stopped.
 * @returns {Promise<void>}
 */
async function main() {
  const CONC = 4; // simultaneous fetches per batch
  const MAX_VERIFIED = 1300; // stop verifying once this many sites are confirmed
  const MIN_HTML_LENGTH = 400; // shorter responses are treated as "no content"

  console.log('═══ Casino Affiliate Scraper v3 ═══\n');
  const cp = checkLoad();

  // Verified affiliate sites: domain -> entry object with url, title, brands count
  const VERIFIED = new Map(Object.entries((cp && cp.verified) || {}));
  // Candidate domains still needing verification. Kept duplicate-free in memory
  // so the saved checkIdx always indexes the same array that is saved.
  const CANDIDATES = [...new Set(cp && Array.isArray(cp.candidates) ? cp.candidates : [])];
  const queued = new Set(CANDIDATES);
  let seedsDone = (cp && cp.seedsDone) || 0;
  let candIdx = Math.min((cp && cp.checkIdx) || 0, CANDIDATES.length);

  const enqueue = (domains) => {
    for (const d of domains) {
      if (VERIFIED.has(d) || queued.has(d)) continue;
      queued.add(d);
      CANDIDATES.push(d);
    }
  };
  const saveCheckpoint = () => checkSave({
    verified: Object.fromEntries(VERIFIED),
    candidates: CANDIDATES,
    seedsDone,
    checkIdx: candIdx,
  });

  console.log(`${VERIFIED.size} already verified, ${CANDIDATES.length} candidates pending\n`);

  // ── PHASE 1: Crawl seed review portals for outbound casino links ───────────────
  console.log('Phase-1: Scraping seed pages...\n');
  for (let si = seedsDone; si < SEED_URLS.length; si++) {
    const sUrl = SEED_URLS[si];
    let skipD;
    try {
      skipD = stripWww(new URL(sUrl).hostname);
    } catch {
      continue;
    }

    const pages = [sUrl, ...(MULTI_PAGES[sUrl] || []).map((suffix) => sUrl + suffix)];
    let seedFetched = false;
    for (let pi = 0; pi < pages.length; pi++) {
      const pageUrl = pages[pi];
      console.log(`[${si + 1}/${SEED_URLS.length}] ${pageUrl}`);
      const html = await fetchPage(pageUrl);
      if (!html || html.length < MIN_HTML_LENGTH) {
        console.log('  [no content]\n');
        await sleep(3e3);
        if (pi === 0) break; // seed itself unreachable: skip its extra pages too
        continue;
      }

      if (pi === 0) {
        // Seed itself is confirmed affiliate site
        VERIFIED.set(skipD, { url: sUrl, title: '', brands: 99 });
        seedFetched = true;
      }
      try {
        const data = parseOutboundLinks(html, skipD);
        if (pi === 0) VERIFIED.get(skipD).title = data.title;
        console.log(`  → ${data.doms.length} outbound domains found\n`);
        // Add seed's outbound links to candidate pool for Phase-2 verification
        enqueue(data.doms);
      } catch (e) {
        console.log('  [parse error] ' + e.message);
      }
      await sleep(2500); // rate limit between seed crawls
    }

    if (!seedFetched) continue;
    seedsDone = si + 1;
    saveCheckpoint();
  }

  console.log(`\nSeeds done. Found ${CANDIDATES.length} domains to verify.` + `\nphase-2: verifying candidates...\n`);

  // ── PHASE 2: Verify candidate sites (concurrent curl batches) ───────
  while (candIdx < CANDIDATES.length && VERIFIED.size < MAX_VERIFIED) {
    const batch = CANDIDATES.slice(candIdx, candIdx + CONC);
    const outcomes = await Promise.all(
      batch
        .filter((dom) => !VERIFIED.has(dom)) // e.g. a seed that was also queued earlier
        .map((dom) => verifyCandidate(dom)
          .then((result) => ({ dom, result }))
          .catch((err) => {
            console.error('batch error', err);
            return { dom, result: null };
          })),
    );

    for (const { dom, result } of outcomes) {
      if (!result || !result.isAffiliate) continue;
      VERIFIED.set(dom, { url: result.url, title: result.title, brands: result.brands });
      console.log('  ✓ ' + VERIFIED.size + ': ' + dom + ' -> ' + result.brands + ' brands');
      // Recursive discovery from this newly-verified affiliate site!
      enqueue(result.doms);
    }

    // Advance exactly once per batch, then checkpoint (cheap next to the sleep below).
    candIdx += batch.length;
    saveCheckpoint();
    console.log('  [CKPT] VERIF:' + VERIFIED.size + ' pending:' + (CANDIDATES.length - candIdx) + '\n');
    await sleep(3500); // throttle between batches to avoid IP ban
  }

  // ── PHASE 3: write CSV file ───────────────
  const sortKey = (v) => v.title || v.url || '';
  const out = [...VERIFIED.entries()]
    .sort(([, a], [, b]) => sortKey(a).localeCompare(sortKey(b)))
    // The domain is the Map key; the stored entries carry no `domain` field.
    .map(([domain, v]) => [csvField(v.url), csvField(v.title), csvField(domain), v.brands].join(','));
  fs.writeFileSync(CSV_FILE, ['url,title,domain,casino_brands_linked', ...out].join('\n'), 'utf8');
  console.log('\n═══ DONE: ' + VERIFIED.size + ' sites saved → ' + CSV_FILE + ' ═══\n');
}

main().catch((err) => {
  console.error('fatal:', err);
  process.exitCode = 1;
});