'use strict';

const { execFile } = require('child_process');
const fs = require('fs');

// ───────────────────────────── Configuration ─────────────────────────────

/** CSV report written at the end of a run. */
const OUTFILE = './casino_affiliate_1000.csv';

/** JSON checkpoint ({ allSites, candidates }) used to resume an interrupted run. */
const CHECKPOINT_FILE = './crawlsave.json';

/** User-Agent header sent with every curl request. */
const UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4) AppleWebKit/605.1.5 Version/17 Safari/605.1';

/**
 * Brand keywords for link detection: an outbound domain containing one of
 * these substrings counts as a casino-brand link. The generic markers
 * "casino" and ".bet" are checked separately in countCasinoRefs().
 */
const CKW = [
  'bet365', '888casino', 'pokerstars', 'partypoker', 'unibet', 'bwin',
  'betway', 'williamhill', 'ladbrokes', 'betfair', 'draftkings', 'fanduel',
  'betmgm', 'caesars', 'barstool', 'leovegas', 'bovada', 'ignitioncasino',
  'jackpotcity', '10bet', '22bet', 'paddypower', '1xbet', 'coral',
];

/**
 * Substrings of hostnames that are never interesting (social networks, CDNs,
 * search engines ...). Matched with String#includes against the hostname
 * after the leading "www." has been removed, which is why the Instagram entry
 * must not start with a dot.
 */
const SKIPD = [
  'youtube.', 'facebook.', 'twitter', 'instagram.',
  'linkedin.', 'tiktok.', 'wikipedia.', 'pinterest', 'medium.', 'forbes.',
  'google.', 'amazon.', 'static.', 'cdn.', 'fonts.googleapis.', 'flickr.',
  'github.com', 'duckduckgo',
];

/** Review portals crawled in phase 1; their outbound links seed phase 2. */
const SEEDS = [
  'https://www.casino.org/reviews/',    // lots of casino brand outbound links
  'https://casino.guru/casino-reviews', // returns many regional subdomains
  'https://chipy.com/casinos',
  // TODO: the original source had a placeholder here announcing "5 more
  // high-yield seeds" from earlier search data; add them once known.
];

/** Responses shorter than this are treated as empty / blocked. */
const MIN_HTML_LENGTH = 400;

/**
 * Value passed to curl --max-filesize (bytes).
 * NOTE(review): 40 kB is small for modern pages and makes curl give up on
 * larger ones - confirm this limit is intentional.
 */
const MAX_PAGE_BYTES = 40000;

/** Phase 2 stops once this many sites have been collected. */
const MAX_SITES = 1300;

/** Number of candidates fetched in parallel per batch. */
const CONCURRENCY = 4;

/** Print a progress line every this many batches. */
const CHECKPOINT_LOG_EVERY = 5;

/**
 * "brands" value recorded for a seed that links to five or fewer brand
 * domains; seeds are hand-picked, so they are kept regardless of the count.
 */
const SEED_TRUSTED_BRANDS = 99;

/** Separator curl prints between the page body and the post-redirect URL. */
const FINAL_URL_MARKER = '\n__FINAL_URL__:';

// ──────────────────────────────── Helpers ────────────────────────────────

/**
 * Resolve after the given delay.
 * @param {number} ms - delay in milliseconds
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Remove a leading "www." label from a hostname (and only a leading one).
 * @param {string} host
 * @returns {string}
 */
function stripWww(host) {
  return host.replace(/^www\./, '');
}

/**
 * Fetch a page with the system curl binary, following redirects.
 *
 * Never rejects: a failed or timed-out curl run resolves with whatever curl
 * printed (usually an empty body), and callers detect that by length.
 *
 * @param {string} url - absolute http(s) URL
 * @returns {Promise<{html: string, finalUrl: string}>} the response body and
 *   the URL curl ended up at after redirects (falls back to `url`).
 */
function curlPage(url) {
  const args = [
    '-sL',
    '--max-time', '8',
    '-A', UA,
    '--max-filesize', String(MAX_PAGE_BYTES),
    // The body alone does not reveal where redirects led, so ask curl to
    // append the effective URL after a marker we can split on.
    '-w', `${FINAL_URL_MARKER}%{url_effective}`,
    url,
  ];
  return new Promise((resolve) => {
    execFile('curl', args, { timeout: 12e3 }, (_err, stdout) => {
      const out = stdout || '';
      const at = out.lastIndexOf(FINAL_URL_MARKER);
      if (at === -1) {
        resolve({ html: out, finalUrl: url });
        return;
      }
      const reported = out.slice(at + FINAL_URL_MARKER.length).trim();
      resolve({ html: out.slice(0, at), finalUrl: reported || url });
    });
  });
}

/**
 * Extract the distinct outbound domains and the <title> text from a page.
 *
 * @param {string} html - raw page source
 * @param {string} skipHost - the page's own hostname (without "www."); links
 *   to it are not reported
 * @returns {{doms: string[], title: string}} unique outbound hostnames
 *   (leading "www." removed, SKIPD entries filtered out) and the trimmed
 *   title, or '' when no title of 10-250 characters is present.
 */
function parseLinks(html, skipHost) {
  // Drop inline scripts/styles first so URLs inside them are not harvested.
  const cleaned = html
    .replace(/<script\b[\s\S]*?<\/script>/gi, '')
    .replace(/<style\b[\s\S]*?<\/style>/gi, '');

  const domains = new Set();
  for (const match of cleaned.matchAll(/href=["'](https?:\/\/[^"'\s>]+)["']/gi)) {
    let host;
    try {
      host = new URL(match[1]).hostname;
    } catch {
      continue; // malformed href - nothing to record
    }
    const domain = stripWww(host);
    if (domain === skipHost || SKIPD.some((part) => domain.includes(part))) continue;
    domains.add(domain);
  }

  const titleMatch = cleaned.match(/<title[^>]*>([^<]{10,250})<\/title>/i);
  return { doms: [...domains], title: titleMatch ? titleMatch[1].trim() : '' };
}

/**
 * Count how many of the given domains look like casino / betting brands.
 *
 * A domain counts when it contains "casino", ".bet", or any CKW keyword.
 *
 * @param {string[]} dList - hostnames
 * @returns {number} number of distinct matching domains
 */
function countCasinoRefs(dList) {
  const hits = new Set();
  for (const domain of dList) {
    const lower = domain.toLowerCase();
    const isBrand =
      lower.includes('casino') ||
      lower.includes('.bet') ||
      CKW.some((kw) => lower.includes(kw));
    if (isBrand) hits.add(domain);
  }
  return hits.size;
}

/**
 * Decide whether a page behaves like an affiliate / review site.
 *
 * Pages with more than 10 outbound domains earn a volume bonus of
 * min(outbound / 2, 8) * 1.5 on top of the brand count. A page is accepted
 * when the total is at least 7 or it links to at least 4 brand domains; with
 * these constants any page with more than 10 outbound domains is accepted.
 *
 * @param {number} brandCount - result of countCasinoRefs()
 * @param {number} outboundCount - number of distinct outbound domains
 * @returns {{finalScore: number, accepted: boolean}}
 */
function scoreSite(brandCount, outboundCount) {
  const linkScore = outboundCount > 10 ? Math.min(outboundCount / 2, 8) : 0;
  const finalScore = brandCount + linkScore * 1.5;
  return { finalScore, accepted: finalScore >= 7 || brandCount >= 4 };
}

// ────────────────────────────── Checkpointing ────────────────────────────

/**
 * Load a previous run's state into the given collections, if a checkpoint
 * file exists. A missing or unreadable checkpoint just means a fresh start.
 *
 * @param {Map<string, object>} allSites - filled with saved site records
 * @param {Set<string>} candidates - filled with saved pending domains
 */
function loadCheckpoint(allSites, candidates) {
  if (!fs.existsSync(CHECKPOINT_FILE)) {
    console.log('No checkpoint, starting fresh...\n');
    return;
  }
  try {
    const cp = JSON.parse(fs.readFileSync(CHECKPOINT_FILE, 'utf8'));
    if (cp.allSites && typeof cp.allSites === 'object') {
      for (const [domain, record] of Object.entries(cp.allSites)) allSites.set(domain, record);
    }
    if (Array.isArray(cp.candidates)) {
      for (const c of cp.candidates) candidates.add(c);
    }
    console.log('Loaded checkpoint:', allSites.size, 'sites,', candidates.size, 'pending\n');
  } catch (err) {
    console.log('bad checkpoint, fresh start');
  }
}

/**
 * Persist the current state so an interrupted run can resume. Failures are
 * logged but never abort the crawl.
 *
 * @param {Map<string, object>} allSites - accepted sites so far
 * @param {string[]} pending - candidate domains not yet checked
 */
function saveCheckpoint(allSites, pending) {
  try {
    // Write to a temp file and rename so a crash cannot leave a half-written
    // checkpoint behind.
    const tmp = `${CHECKPOINT_FILE}.tmp`;
    const payload = { allSites: Object.fromEntries(allSites), candidates: pending };
    fs.writeFileSync(tmp, JSON.stringify(payload), 'utf8');
    fs.renameSync(tmp, CHECKPOINT_FILE);
  } catch (err) {
    console.log('  [checkpoint save failed]', err.message);
  }
}

// ─────────────────────────────── Phase 1 ─────────────────────────────────

/**
 * Phase 1: fetch each seed portal, record it as a site, and collect its
 * outbound domains as candidates for phase 2.
 *
 * @param {string[]} seedList - seed URLs
 * @param {Map<string, object>} allSites - domain -> {url, title, brands}; mutated
 * @param {Set<string>} candidateSet - pending candidate domains; mutated
 * @returns {Promise<void>}
 */
async function crawlSeeds(seedList, allSites, candidateSet) {
  console.log('▶ Phase-1: crawling seed portals\n');

  for (const seedUrl of seedList) {
    console.log(`[${allSites.size + 1}] ${seedUrl}`);

    let seedHost;
    try {
      seedHost = stripWww(new URL(seedUrl).hostname);
    } catch {
      continue; // not a valid URL - skip this seed
    }

    const { html } = await curlPage(seedUrl);
    if (html.length < MIN_HTML_LENGTH) {
      console.log('   [failed/empty]\n');
      await sleep(3e3);
      continue;
    }

    // The seed itself is a known-good site even if parsing below fails.
    let record = { url: seedUrl, title: seedHost, brands: SEED_TRUSTED_BRANDS };
    try {
      const data = parseLinks(html, seedHost);
      console.log(`   → ${data.doms.length} outbound domains\n`);
      const brandCount = countCasinoRefs(data.doms);
      record = {
        url: seedUrl,
        title: data.title || seedHost,
        brands: brandCount > 5 ? brandCount : SEED_TRUSTED_BRANDS,
      };
      for (const domain of data.doms) candidateSet.add(domain);
    } catch (err) {
      console.log('   [parse error]', err.message);
    }
    allSites.set(seedHost, record);

    await sleep(2500); // throttle between seed crawls
  }

  console.log(`\n✓ Seeds done. Found ${candidateSet.size} candidate domains.\n`);
}

// ─────────────────────────────── Phase 2 ─────────────────────────────────

/**
 * Fetch one candidate domain (HTTPS first, then HTTP) and score it.
 *
 * Only the first protocol that yields a usable page is scored. The function
 * touches no shared state, so several calls can safely run concurrently.
 *
 * @param {string} dom - candidate hostname
 * @returns {Promise<null | {dom: string, brandCount: number, record: object,
 *   outbound: string[]}>} null when the site is unreachable or rejected.
 */
async function checkCandidate(dom) {
  for (const proto of ['https://', 'http://']) {
    try {
      const url = proto + dom;
      const { html, finalUrl } = await curlPage(url);
      if (html.length < MIN_HTML_LENGTH) continue; // empty / blocked response

      // After redirects, report and self-filter on the real final host.
      let actualUrl = url;
      let selfHost = dom;
      try {
        const final = new URL(finalUrl);
        actualUrl = final.origin;
        selfHost = stripWww(final.hostname) || dom;
      } catch {
        // curl reported something unparsable; keep the requested URL
      }

      const parsed = parseLinks(html, selfHost);
      const brandCount = countCasinoRefs(parsed.doms);
      const { finalScore, accepted } = scoreSite(brandCount, parsed.doms.length);
      if (!accepted) return null;

      return {
        dom,
        brandCount,
        // NOTE(review): "brands" stores the rounded score, not the raw brand
        // count, although the CSV column is named casino_brands_linked.
        record: { url: actualUrl, title: parsed.title || dom, brands: Math.round(finalScore) },
        outbound: parsed.doms,
      };
    } catch (err) {
      console.log(`   [error] ${proto}${dom}: ${err.message}`);
    }
  }
  return null;
}

/**
 * Phase 2: verify candidates in parallel batches. Accepted sites are added
 * to `allSites`, and their outbound domains are queued as new candidates.
 * A checkpoint is saved after every batch.
 *
 * @param {Set<string>} candidateSet - every domain ever queued; mutated
 * @param {Map<string, object>} allSites - domain -> {url, title, brands}; mutated
 * @returns {Promise<void>}
 */
async function verifyCandidates(candidateSet, allSites) {
  console.log('▶ Phase-2: verifying candidate sites...\n');

  // The Set answers "already queued?" in O(1); the array gives a stable
  // processing order that can grow while we walk it.
  const queue = [...candidateSet];
  let next = 0;
  let batchNo = 0;

  while (next < queue.length && allSites.size < MAX_SITES) {
    const batch = [];
    while (batch.length < CONCURRENCY && next < queue.length) {
      const dom = queue[next++];
      if (!allSites.has(dom)) batch.push(dom); // skip seeds / restored sites
    }
    if (batch.length === 0) continue;

    const results = await Promise.all(batch.map(checkCandidate));

    // Shared state is only mutated here, after the concurrent fetches are done.
    for (const result of results) {
      if (!result) continue;
      allSites.set(result.dom, result.record);
      console.log('  ✓ ' + allSites.size + ': ' + result.dom + ' (' + result.brandCount + ' brands)');
      for (const od of result.outbound) {
        if (allSites.has(od) || candidateSet.has(od)) continue;
        candidateSet.add(od);
        queue.push(od);
      }
    }

    saveCheckpoint(allSites, queue.slice(next));
    batchNo += 1;
    if (batchNo % CHECKPOINT_LOG_EVERY === 0) {
      console.log('\n  [CHECKPOINT] Verified:' + allSites.size + ' remaining:' + (queue.length - next));
    }

    await sleep(4e3); // throttle between batches to avoid IP ban
  }
}

// ─────────────────────────────── CSV output ──────────────────────────────

/**
 * Quote a value for CSV: wrap it in double quotes and double any embedded
 * quote characters.
 * @param {*} value
 * @returns {string}
 */
function csvField(value) {
  return `"${String(value).replace(/"/g, '""')}"`;
}

/**
 * Render one site as a CSV row: url, title, domain, casino_brands_linked.
 * @param {string} dom - site domain (map key)
 * @param {{url?: string, title?: string, brands?: *}} entry
 * @returns {string}
 */
function toCsvRow(dom, entry) {
  // Titles may span several lines in the HTML; keep each row on one line.
  const title = String(entry.title || dom || '').replace(/\s+/g, ' ').trim();
  const brands = typeof entry.brands === 'number' ? entry.brands : 0;
  return [csvField(entry.url ?? ''), csvField(title), csvField(dom), brands].join(',');
}

/**
 * Write all collected sites to OUTFILE, sorted by title.
 * @param {Map<string, object>} allSites
 */
function writeCsv(allSites) {
  const header = 'url,title,domain,casino_brands_linked';
  const rows = [...allSites.entries()]
    // Restored checkpoint records may be malformed; never let one break the report.
    .map(([dom, entry]) => [dom, entry && typeof entry === 'object' ? entry : {}])
    .sort((a, b) => String(a[1].title || '').localeCompare(String(b[1].title || '')))
    .map(([dom, entry]) => toCsvRow(dom, entry));
  fs.writeFileSync(OUTFILE, [header, ...rows].join('\n'), 'utf8');
}

// ══════ MAIN ══════

/**
 * Entry point: restore any checkpoint, crawl the seed portals, verify the
 * outbound domains they link to, and write the CSV report.
 * @returns {Promise<void>}
 */
async function main() {
  console.log('═══ Casino Affiliate Crawler v4 ═══\n');

  const allSites = new Map();   // domain -> {url, title, brands}
  const candidates = new Set(); // pending candidate domains needing verification

  loadCheckpoint(allSites, candidates);

  await crawlSeeds(SEEDS, allSites, candidates);    // phase-1: scrape review portals
  await verifyCandidates(candidates, allSites);     // phase-2: verify outbound domains

  writeCsv(allSites);
  console.log(`\n═══ DONE: Saved ${allSites.size} sites →`, OUTFILE, '\n');
}

main().catch((err) => {
  console.error('fatal:', err);
  process.exitCode = 1;
});