const { execFile } = require('child_process');
const fs = require('fs');

const CP_FILE = './.cp.json';
const CSV_OUT = './casino_affiliate_sites.csv';
const OUTFILE = './seed_expand.json';

// Casino brand / keyword fingerprints
const CASINO_KW = [
  "casino", "bet365", "betfair", "888.", "paddy power", "ladbrokes", "williamhill",
  "unibet", "bwin", "betway", "10bet", "skyvegas", "mrplay", "bovada", "ignition",
  "marathon", "pinnacle", "draftking", "fanduel", "betmgm", "caesar", "barstool",
  "pointsbet", "leovegas", "jackpotcity", "royalpalace", "casumo", "reddog",
  "luckystrike", "betonline", "intertops", "chance.com", "betsson", "betclic",
  "22bet", "1xbet", "stake.", "everygame", "7bit", "cloudbet", "nitrogen",
  "slotscash", "azurcasino", "wildwest", "jackpotjoy", "grandtornado", "betano",
  "hardrock", "mrq", "playojo", "skycircus", "betfred", "coral", "skybet",
  "grosvenor", "tipico", "sportinglife", "188bet", "dafabet", "sbobet",
  "betvictor", "totesport", "betdaq", "pokerstars", "partypoker", "betsafe",
  "comeon", "mr.green", "red dog", "spinplanet", "casinozilla", "bonusboss",
  "slotsup", "slotcatalog", "pin-up casino", "bitsler", "playtech",
  "evolution gaming", "microgaming", "netent", "pragmatic play", "play'n go",
  "red tiger", "yggdrasil", "push gaming", "relax gaming", "foxium",
].map(k => k.toLowerCase());

// Only keywords longer than 3 characters are used when matching link URLs
// (shorter ones such as "mrq" are ignored); filtered once instead of per link.
const LINK_KW = CASINO_KW.filter(kw => kw.length > 3);

// Hosts that are never treated as outbound targets. An entry ending in "."
// is a label prefix ("google." covers google.com and maps.google.co.uk);
// any other entry is a full domain that also covers its subdomains.
const SKIP_PATS = [
  "youtube.com", "youtu.be", "reddit.com", "facebook.com", "twitter.com", "x.com",
  "linkedin.com", "tiktok.com", "wikipedia.", "pinterest.", "instagram.",
  "medium.com", "forbes.com", "nytimes.com", "amazon.", "ebay.", "microsoft.",
  "play.google.com", "web.archive.org", "duckduckgo.", "startpage.", "brave.com",
  "t.co", "imgur.com", "flickr.com", "goodreads.", "steamcommunity.",
  "github.com", "stackoverflow.", "stackexchange.", "apple.com", "google.",
  "cdn.", "cdnjs.cloudflare.com", "fonts.googleapis.", "ajax.googleapis.",
  "gravatar.com", "facebook.com", "open.graph.facebook.", "platform.twitter.",
];

// Absolute http(s) href attributes; group 1 is the URL.
const HREF_RE = /href=["'](https?:\/\/[^"'\s>]+)["']/gi;

// Upper bound on distinct outbound domains collected from a single page.
const MAX_OUTBOUND = 200;

// A page needs at least this many distinct casino-linked domains to be verified.
const MIN_CASINO_LINKS = 5;

/**
 * Resolve after `ms` milliseconds.
 * @param {number} ms
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise(r => setTimeout(r, ms));
}

/**
 * Lower-cased hostname of `url` with a leading "www." removed.
 * If `url` cannot be parsed, the lower-cased input string is returned instead.
 * @param {string} url
 * @returns {string}
 */
function getDomain(url) {
  try {
    const u = new URL(url);
    let h = u.hostname;
    if (h.startsWith('www.')) h = h.slice(4);
    return h.toLowerCase();
  } catch {
    return url.toLowerCase();
  }
}

// True when `domain` matches one SKIP_PATS entry on a label boundary.
function matchesSkipPattern(domain, pattern) {
  if (pattern.endsWith('.')) {
    return domain.startsWith(pattern) || domain.includes(`.${pattern}`);
  }
  return domain === pattern || domain.endsWith(`.${pattern}`);
}

/**
 * Whether domain `d` is on the skip list.
 *
 * Matching is anchored on label boundaries. Plain substring matching made
 * "t.co" swallow every host ending in "t.com" (unibet.com, 22bet.com, ...)
 * and "x.com" every host ending in "x.com".
 * @param {string} d - domain as produced by getDomain
 * @returns {boolean}
 */
function isSkip(d) {
  return SKIP_PATS.some(p => matchesSkipPattern(d, p));
}

// Yields { link, domain } for each absolute href in `html` that points away
// from `selfDomain` and is not on the skip list.
function* iterOutboundLinks(html, selfDomain) {
  // NOTE(review): both tag-stripping patterns were empty (`//gi`) in the
  // recovered source, presumably eaten by an HTML sanitizer. Reconstructed as
  // removal of <script> and <style> blocks -- confirm against file history.
  const stripped = (html || '')
    .replace(/<script[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style[\s\S]*?<\/style>/gi, ' ');
  for (const m of stripped.matchAll(HREF_RE)) {
    const link = m[1];
    // HREF_RE is case-insensitive; only lower-case schemes are accepted here.
    if (!link.startsWith('http')) continue;
    const domain = getDomain(link);
    if (domain === selfDomain || isSkip(domain)) continue;
    yield { link, domain };
  }
}

/**
 * Deep count: how many DISTINCT casino-linked outbound domains does a page have?
 *
 * Only the first link seen for each domain is tested against the keywords, so
 * a domain whose first link carries no keyword is not counted even if a later
 * link to it would match.
 * @param {string} html - page markup
 * @param {string} selfDomain - the page's own domain, excluded from the count
 * @returns {number}
 */
function deepCountCasinoLinks(html, selfDomain) {
  const seen = new Set();
  const casinoDomains = new Set();
  for (const { link, domain } of iterOutboundLinks(html, selfDomain)) {
    if (seen.has(domain)) continue;
    seen.add(domain);
    const lo = link.toLowerCase();
    if (LINK_KW.some(kw => lo.includes(kw))) casinoDomains.add(domain);
  }
  return casinoDomains.size;
}

/**
 * Extract all outbound link domains from a page (not just casino ones).
 * @param {string} html - page markup
 * @param {string} selfDomain - the page's own domain, excluded from the result
 * @returns {string[]} up to MAX_OUTBOUND distinct domains, in first-seen order
 */
function extractOutboundLinks(html, selfDomain) {
  const outbounds = new Set();
  for (const { domain } of iterOutboundLinks(html, selfDomain)) {
    outbounds.add(domain);
    if (outbounds.size >= MAX_OUTBOUND) break;
  }
  return [...outbounds];
}

/**
 * Fetch page HTML via curl.
 *
 * Best-effort: whatever curl wrote to stdout is returned even when curl exits
 * with an error; any failure to get output yields ''.
 * @param {string} url
 * @returns {Promise<string>}
 */
async function fetchHtml(url) {
  const ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124 Safari/537.36';
  try {
    return await new Promise((resolve) => {
      execFile(
        'curl',
        ['-sL', '--max-time', '10', '-A', ua, '--max-filesize', '50000', url],
        { timeout: 15000 },
        (_err, stdout) => resolve(stdout || '')
      );
    });
  } catch {
    // execFile can throw synchronously on invalid arguments.
    return '';
  }
}

// Fetches `candidates` in concurrent batches and appends those with at least
// MIN_CASINO_LINKS casino-linked domains to state.verified. Every candidate's
// domain is recorded in state.attemptedDomains. `onBatch(start, size)` is
// called after each batch, before the pause.
async function verifyInBatches(candidates, state, { batchSize, pauseMs, onBatch }) {
  for (let start = 0; start < candidates.length; start += batchSize) {
    const batch = candidates.slice(start, start + batchSize);
    const results = await Promise.all(batch.map(async (site) => {
      state.attemptedDomains.add(site.domain);
      const html = await fetchHtml(site.url || `https://${site.domain}`);
      return { site, count: deepCountCasinoLinks(html, site.domain) };
    }));
    for (const { site, count } of results) {
      if (count >= MIN_CASINO_LINKS) {
        state.verified.push({
          url: site.url || `https://${site.domain}`,
          title: site.title,
          domain: site.domain,
          casinoLinks: count,
        });
        state.verifiedDomains.add(site.domain);
      }
    }
    onBatch(start, batch.length);
    await sleep(pauseMs);
  }
}

// ═══════ MAIN ═══════
/**
 * Verify the seed sites from the checkpoint, expand transitively through the
 * outbound links of verified sites, and write the verified set to CSV_OUT.
 * @returns {Promise<void>}
 */
async function main() {
  console.log('═══ Transitive Crawler v11 ── expand seed sites ═══\n');

  // Load checkpoint from v9
  let cp = null;
  try {
    if (fs.existsSync(CP_FILE)) cp = JSON.parse(fs.readFileSync(CP_FILE, 'utf8'));
  } catch (err) {
    console.warn(`Could not read ${CP_FILE}: ${err.message}`);
  }
  if (!cp || !cp.sites || cp.sites.length === 0) {
    console.log('No checkpoint found. Exiting.');
    return;
  }

  const allSites = [...cp.sites];
  const verified = [...(cp.verified || [])];
  const verifiedDomains = new Set(verified.map(v => v.domain));
  // Domains already fetched for verification, so that rejected domains are
  // not downloaded again each time a later round rediscovers them.
  const attemptedDomains = new Set();
  const state = { verified, verifiedDomains, attemptedDomains };

  console.log(`Loaded ${allSites.length} seeds. Already verified: ${verified.length}`);
  console.log(`Goal: verify each, then expand via outbound links.\n`);

  // ── Step 1: Verify all seed sites (concurrent BATCH of 5) ───────
  const toCheck = allSites.filter(s => !verifiedDomains.has(s.domain));
  let totalChecked = 0;
  await verifyInBatches(toCheck, state, {
    batchSize: 5,
    pauseMs: 400,
    onBatch: (_start, size) => {
      totalChecked += size;
      if (totalChecked % 30 === 0) {
        console.log(`Verify: ${totalChecked}/${toCheck.length} → ${verified.length} pass ≥5`);
      }
    },
  });
  console.log(`\nStep 1 done: ${verified.length} verified from ${allSites.length} seeds\n`);

  // ── Step 2: Transitive expansion via outbound links ────────────
  const checkedOutbounds = new Set();
  const MAX_ROUNDS = 5;
  for (let expansionRound = 1; expansionRound <= MAX_ROUNDS; expansionRound++) {
    // Which sites to scan for outbound links this round?
    const verifiedSitesList = [...new Map(verified.map(v => [v.domain, v])).values()];
    const sitesToScan = verifiedSitesList.filter(s => !checkedOutbounds.has(s.domain));
    if (sitesToScan.length === 0) {
      console.log(`Round ${expansionRound}: nothing new to scan. Stopping.`);
      break;
    }
    console.log(`══ Expansion round ${expansionRound}: scanning ${sitesToScan.length} verified sites ══`);

    // Newly discovered domains, keyed by domain so each appears once.
    const discovered = new Map();
    for (let si = 0; si < sitesToScan.length; si++) {
      const site = sitesToScan[si];
      checkedOutbounds.add(site.domain);
      const html = await fetchHtml(site.url ? site.url : `https://${site.domain}`);
      // Failed or near-empty fetches move straight to the next site.
      if (!html || html.length < 500) continue;
      for (const od of extractOutboundLinks(html, site.domain)) {
        if (verifiedDomains.has(od) || attemptedDomains.has(od) || discovered.has(od)) continue;
        discovered.set(od, { url: `https://${od}`, title: od, domain: od });
      }
      if (si % 30 === 0) {
        console.log(` Scanned ${si}/${sitesToScan.length}, found ${discovered.size} new`);
      }
      await sleep(300);
    }

    const uniqueNew = [...discovered.values()];
    console.log(` → ${uniqueNew.length} unique new domains to verify\n`);

    // Verify the newly discovered domains
    await verifyInBatches(uniqueNew, state, {
      batchSize: 5,
      pauseMs: 350,
      onBatch: (start) => {
        if (start % 100 === 0) console.log(` verified ${verified.length} total`);
      },
    });
    console.log(` Round ${expansionRound} done: ${verified.length} total verified\n`);
    await sleep(2000);
  }

  // ── Write final CSV ───────────────
  const header = 'url,title,domain,casino_links';
  const allData = [...new Map(verified.map(v => [v.domain, v])).values()];
  const rows = allData.map(v => {
    const t = (v.title || '').replace(/"/g, "'");
    return `"${v.url}","${t}","${v.domain}",${v.casinoLinks}`;
  });
  // Was `CSV_OUTPUT`, an undeclared name that threw ReferenceError here.
  fs.writeFileSync(CSV_OUT, [header, ...rows].join('\n'), 'utf8');
  console.log(`\n══════════ ${allData.length} verified site → ${CSV_OUT} ══════════`);
}

main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});