'use strict';

const { execFile } = require('child_process');
const fs = require('fs');

const CSV_IN = './casino_affiliate_sites.csv'; // current merged dataset
// NOTE(review): the original wrote to undefined names (CSV_PATH / CSV_FILE).
// A separate output file is used so the input dataset is never clobbered —
// confirm the intended destination.
const CSV_OUT = './casino_affiliate_sites_verified.csv';
const CP_PATH = './run.json'; // checkpoint for resumability

const MAX_VERIFIED = 1300; // stop crawling once this many sites are verified
const MIN_CASINO_LINKS = 5; // outbound casino domains needed to count as an affiliate
const MIN_HTML_LENGTH = 350; // shorter responses are treated as empty/blocked
const CHECKPOINT_EVERY = 20; // save progress every N processed queue entries
const CRAWL_DELAY_MS = 2800; // pause after a successful page load
const UNREACHABLE_DELAY_MS = 800; // shorter pause after a failed load

// Casino brand keywords — an outbound hostname containing one of these counts
// as a casino link.
// NOTE(review): countCasinoLinks only ever sees hostnames, so the path-style
// entries ('/casino/', '/casinos/') can never match there.
const BRANDS = [
  'bet365','888casino','skyvegas','pokerstars','partypoker',
  'unibet','bwin.com','betway','williamhill','ladbrokes','betfair',
  'draftkings','fanduel','betmgm','caesars','barstool',
  'leovegas','bovada','ignition','mr.play','jackpotcity',
  'casumo','playojo','22bet.com','paddypower','1xbet',
  'betonline.ag','intertops.com','reddogcasino','luckystick',
  'betclic','betsson','hardrock.bet','betano','grosvenor',
  'coral.co.uk','skybet','mr.green','safedat','comeon.se',
  'slots.lv','bc.game','pin-up.casino','stake.com',
  '/casino/','/casinos/','.casino.','online-casino.','best-casinos.'
];

// Domains definitely to exclude from results (non-affiliate noise)
const NEVER_INCLUDE = [
  'gov.au','ananda.org','wikipedia.org','forbes.com','nytimes.com','medium.com',
  'amazon.','ebay.','apple.com','microsoft.','github.','stackoverflow.',
  'linkedin','duckduckgo','startpage','web.archive.org'
];

// Extra high-yield seed portals whose outbound links we'll extract + verify
const EXTRA_SEEDS = [
  'https://timesofmalta.com/article/10-best-online-casino-sites-malta-a2.1108064',
  'https://www.casinoreviews.net/',
  'https://www.gambling-affiliation.com/en/index',
];

/**
 * Persist crawl progress to CP_PATH.
 * Written to a temp file first and then renamed, so a crash mid-write cannot
 * leave a truncated checkpoint behind.
 * @param {object} obj - JSON-serialisable checkpoint state.
 */
function saveCheckpoint(obj) {
  const tmpPath = `${CP_PATH}.tmp`;
  fs.writeFileSync(tmpPath, JSON.stringify(obj));
  fs.renameSync(tmpPath, CP_PATH);
}

/**
 * Load the checkpoint from CP_PATH.
 * @returns {object|null} Parsed checkpoint, or null when none exists or it is unreadable.
 */
function loadCheckpoint() {
  try {
    return JSON.parse(fs.readFileSync(CP_PATH, 'utf8'));
  } catch (err) {
    // A missing file is the normal first-run case; anything else is worth a warning.
    if (err.code !== 'ENOENT') {
      console.warn(`Ignoring unreadable checkpoint ${CP_PATH}: ${err.message}`);
    }
    return null;
  }
}

/**
 * Resolve after `ms` milliseconds.
 * @param {number} ms
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Fetch a URL by shelling out to curl.
 * Never rejects: any curl failure resolves to whatever stdout was captured,
 * or '' when there was none.
 * @param {string} url
 * @returns {Promise<string>} Response body (possibly partial or empty).
 */
function fetchViaCurl(url) {
  const ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4) AppleWebKit/605.1.15 Version/17 Safari/605.1';
  const args = ['-sL', '--max-time', '8', '-A', ua, '--max-filesize', '40000', url];
  return new Promise((resolve) => {
    execFile('curl', args, { timeout: 12000 }, (_err, output) => resolve(output || ''));
  });
}

/**
 * Strip a single leading "www." label from a hostname.
 * @param {string} host
 * @returns {string}
 */
function stripWww(host) {
  return host.replace(/^www\./, '');
}

/**
 * Extract the page title and the unique outbound domains from an HTML string.
 * @param {string} html - Raw page markup.
 * @param {string} skipDomain - The page's own domain (without "www."), excluded from the result.
 * @returns {{domains: string[], title: string}} Outbound hostnames in first-seen
 *   order and the <title> text ('' unless it is 10–250 characters long).
 */
function parseHTML(html, skipDomain) {
  // Drop inline scripts and styles so URLs embedded in them are not counted as links.
  const clean = html
    .replace(/<script[\s\S]*?<\/script>/gi, '')
    .replace(/<style[\s\S]*?<\/style>/gi, '');

  const foundDomains = new Set();
  for (const m of clean.matchAll(/href=["'](https?:\/\/[^"'\s>]+)["']/gi)) {
    let host;
    try {
      host = new URL(m[1]).hostname;
    } catch {
      continue; // malformed URL — not a usable outbound link
    }
    const d = stripWww(host); // canonical form for matching
    // exclude same site + known non-relevant domains
    if (d === skipDomain || NEVER_INCLUDE.some((k) => d.includes(k))) continue;
    foundDomains.add(d);
  }

  const tt = clean.match(/<title[^>]*>([^<]{10,250})<\/title>/i);
  const title = tt ? tt[1].trim() : '';
  return { domains: [...foundDomains], title };
}

/**
 * Count how many distinct domains in the list look like casino sites.
 * A domain matches when its hostname contains "casino" or any BRANDS keyword.
 * @param {string[]} domainList - Outbound hostnames.
 * @returns {number} Number of distinct matching domains.
 */
function countCasinoLinks(domainList) {
  const keywords = BRANDS.filter((brand) => brand.length >= 3).map((brand) => brand.toLowerCase());
  const matched = new Set();
  for (const d of domainList) {
    const host = d.toLowerCase();
    if (host.includes('casino') || keywords.some((keyword) => host.includes(keyword))) {
      matched.add(d);
    }
  }
  console.log(`  → matches: ${matched.size}`);
  return matched.size;
}

/**
 * Split one CSV line into fields, honouring double-quoted fields and ""
 * escapes. Empty fields are preserved so columns never shift.
 * @param {string} line - A single CSV record (no embedded newlines).
 * @returns {string[]}
 */
function parseCsvLine(line) {
  const fields = [];
  let field = '';
  let inQuotes = false;
  for (let i = 0; i < line.length; i++) {
    const ch = line[i];
    if (inQuotes) {
      if (ch !== '"') {
        field += ch;
      } else if (line[i + 1] === '"') {
        field += '"'; // escaped quote
        i++;
      } else {
        inQuotes = false;
      }
    } else if (ch === '"') {
      inQuotes = true;
    } else if (ch === ',') {
      fields.push(field);
      field = '';
    } else {
      field += ch;
    }
  }
  fields.push(field);
  return fields;
}

/**
 * Serialise one verified entry as a CSV row: url, title, domain, casino_links.
 * Whitespace runs (including newlines inside titles) are collapsed so every
 * record stays on one line.
 * @param {{url?: string, title?: string, domain?: string, casinoLinks?: number}} entry
 * @returns {string}
 */
function toCsvRow(entry) {
  const quote = (value) => `"${String(value ?? '').replace(/\s+/g, ' ').replace(/"/g, '""')}"`;
  const casinoLinks = Number(entry.casinoLinks) || 0;
  return `${quote(entry.url)},${quote(entry.title)},${quote(entry.domain)},${casinoLinks}`;
}

/**
 * Load the existing dataset into a Map keyed by domain.
 * @param {string} csvPath
 * @returns {Map<string, {url: string, title: string, domain: string}>}
 *   Empty when the file cannot be read.
 */
function loadExistingCsv(csvPath) {
  const byDomain = new Map();
  let content;
  try {
    content = fs.readFileSync(csvPath, 'utf8');
  } catch (err) {
    console.log('No CSV loaded:', err.message);
    return byDomain;
  }
  const rows = content.split(/\r?\n/).slice(1); // skip header row
  for (const row of rows) {
    if (!row.trim()) continue;
    const [url = '', title = '', rawDomain = ''] = parseCsvLine(row);
    const domain = rawDomain.trim();
    if (!domain) continue; // a row without a domain cannot be keyed or crawled
    byDomain.set(domain, { url, title, domain });
  }
  return byDomain;
}

/**
 * Build a placeholder entry for a domain discovered during the crawl.
 * @param {string} domain
 */
function makeEntry(domain) {
  return { url: `https://${domain}`, title: domain, domain };
}

/**
 * Fetch a domain's home page (https first, then http) and analyse it.
 * @param {string} domain
 * @returns {Promise<{domains: string[], title: string, casinoCount: number}|null>}
 *   null when neither protocol returned a usable page.
 */
async function inspectDomain(domain) {
  for (const attemptUrl of [`https://${domain}`, `http://${domain}`]) {
    const html = await fetchViaCurl(attemptUrl);
    if (!html || html.length < MIN_HTML_LENGTH) continue; // empty/blocked → try next protocol
    const page = parseHTML(html, stripWww(domain));
    console.log(`  ${page.domains.length} outbound domains`);
    return { ...page, casinoCount: countCasinoLinks(page.domains) };
  }
  return null;
}

// ═══════ MAIN CRAWL PIPELINE ═══════
/**
 * Verify every domain in the dataset, follow outbound links of verified
 * affiliates, crawl the extra seed portals, and write the verified set to CSV_OUT.
 * Progress is checkpointed to CP_PATH so an interrupted run can resume.
 */
async function main() {
  const checkpoint = loadCheckpoint();
  console.log('═══ Casino Affiliate Validation + Expansion ═══\n');

  const existingMap = loadExistingCsv(CSV_IN);

  // Domains found while crawling (not in the CSV). They are checkpointed so the
  // queue order — and therefore checkedIndex — means the same thing after a restart.
  const discovered = new Set();
  for (const d of checkpoint?.discovered ?? []) {
    if (existingMap.has(d)) continue;
    existingMap.set(d, makeEntry(d));
    discovered.add(d);
  }

  // Restore previously verified entries from the checkpoint.
  const VERIFIED = new Map(
    Object.entries(checkpoint?.verified ?? {}).map(([k, v]) => [
      k,
      { url: v?.url || `https://${k}`, title: v?.title || '', domain: k, casinoLinks: v?.casinoLinks ?? 0 },
    ]),
  );

  console.log(`Existing: ${existingMap.size} domains loaded`);
  if (VERIFIED.size) console.log('Verified before:', VERIFIED.size);

  const queue = [...existingMap.keys()];
  let domainIndex = Number.isInteger(checkpoint?.checkedIndex) ? checkpoint.checkedIndex : 0;

  const persist = () => {
    saveCheckpoint({
      verified: Object.fromEntries(VERIFIED),
      checkedIndex: domainIndex,
      discovered: [...discovered],
    });
    console.log(`\x1b[36m  [CHECKPOINT] VER:${VERIFIED.size}\x1b[0m\n`);
  };

  try {
    // ── FILTER PASS: verify each queued entry is an actual casino affiliate
    for (let i = domainIndex; i < queue.length && VERIFIED.size < MAX_VERIFIED; i++) {
      const domain = queue[i];
      if (!VERIFIED.has(domain)) {
        const entry = existingMap.get(domain);
        console.log(`\x1b[2m[${VERIFIED.size}/${existingMap.size}]\x1b[0m ${domain}`);

        const page = await inspectDomain(domain);
        if (!page) {
          // Couldn't load. Entries from the curated CSV are assumed valid;
          // merely discovered domains are not promoted without evidence.
          if (!discovered.has(domain)) VERIFIED.set(domain, { ...entry, casinoLinks: 0 });
          console.log(`\x1b[33m  ⚠️ ${domain} unreachable\x1b[0m`);
          await sleep(UNREACHABLE_DELAY_MS);
        } else {
          if (page.casinoCount >= MIN_CASINO_LINKS) {
            VERIFIED.set(domain, { ...entry, title: page.title || entry.title, casinoLinks: page.casinoCount });
            console.log(`\x1b[32m  ✓\x1b[0m ${VERIFIED.size}: ${domain} → links to ${page.casinoCount} casinos\n`);
            // Queue this affiliate's outbound domains for verification too.
            for (const od of page.domains) {
              if (VERIFIED.has(od) || existingMap.has(od)) continue;
              existingMap.set(od, makeEntry(od));
              discovered.add(od);
              queue.push(od);
            }
          }
          await sleep(CRAWL_DELAY_MS);
        }
      }
      // Advance only after the entry is fully handled so a crash retries it.
      domainIndex = i + 1;
      if (domainIndex % CHECKPOINT_EVERY === 0) persist();
    }
  } catch (e) {
    console.error('Verification crash caught:', e.message, 'at idx', domainIndex);
  }
  persist();

  // ── EXPAND PASS: crawl extra seeds for more outbound affiliates
  // NOTE(review): the original body of this pass was lost; this is a
  // reconstruction from the EXTRA_SEEDS comment ("extract + verify") — confirm.
  for (const seedUrl of EXTRA_SEEDS) {
    if (VERIFIED.size >= MAX_VERIFIED) break;
    console.log(`Seed: ${seedUrl}`);
    const html = await fetchViaCurl(seedUrl);
    if (!html) {
      console.log(`\x1b[33m  ⚠️ seed unreachable\x1b[0m`);
      continue;
    }
    const { domains } = parseHTML(html, stripWww(new URL(seedUrl).hostname));
    for (const candidate of domains) {
      if (VERIFIED.size >= MAX_VERIFIED) break;
      if (VERIFIED.has(candidate) || existingMap.has(candidate)) continue; // already judged
      existingMap.set(candidate, makeEntry(candidate)); // dedupe across seeds
      const page = await inspectDomain(candidate);
      if (page && page.casinoCount >= MIN_CASINO_LINKS) {
        VERIFIED.set(candidate, {
          ...makeEntry(candidate),
          title: page.title || candidate,
          casinoLinks: page.casinoCount,
        });
        console.log(`\x1b[32m  ✓\x1b[0m ${VERIFIED.size}: ${candidate} → links to ${page.casinoCount} casinos\n`);
      }
      await sleep(page ? CRAWL_DELAY_MS : UNREACHABLE_DELAY_MS);
    }
    // Only the verified set is saved here; candidates are not added to
    // `discovered`, so the filter-pass queue order is unaffected.
    persist();
  }

  // ── OUTPUT
  const sorted = [...VERIFIED.values()].sort((a, b) =>
    (a.title || a.url).localeCompare(b.title || b.url));
  const csv = ['url,title,domain,casino_links', ...sorted.map(toCsvRow)].join('\n');
  fs.writeFileSync(CSV_OUT, `${csv}\n`, 'utf8');
  console.log(`\n══════ Done: ${VERIFIED.size} verified affiliate sites saved → ${CSV_OUT} ════`);
}

// Start the crawl only when this file is executed directly, so the helpers
// above can be loaded (e.g. for testing) without triggering network traffic.
if (typeof module !== 'undefined' && require.main === module) {
  main().catch((err) => {
    console.error('Fatal:', err);
    process.exitCode = 1;
  });
}