Files
crawler/validate-expand.js
2026-06-26 14:30:45 +02:00

216 lines
9.1 KiB
JavaScript

const { execFile } = require('child_process');
const fs = require('fs');
// Input dataset: the current merged CSV of candidate sites.
const CSV_IN = './casino_affiliate_sites.csv';
// Checkpoint file used to resume an interrupted run.
const CP_PATH = './run.json';

// Casino brand markers: a string containing any of these is treated as a casino link.
// The final row holds generic path/host fragments rather than brand names.
const BRANDS = [
  'bet365', '888casino', 'skyvegas', 'pokerstars', 'partypoker',
  'unibet', 'bwin.com', 'betway', 'williamhill', 'ladbrokes', 'betfair',
  'draftkings', 'fanduel', 'betmgm', 'caesars', 'barstool',
  'leovegas', 'bovada', 'ignition', 'mr.play', 'jackpotcity',
  'casumo', 'playojo', '22bet.com', 'paddypower', '1xbet',
  'betonline.ag', 'intertops.com', 'reddogcasino', 'luckystick',
  'betclic', 'betsson', 'hardrock.bet', 'betano', 'grosvenor',
  'coral.co.uk', 'skybet', 'mr.green', 'safedat', 'comeon.se',
  'slots.lv', 'bc.game', 'pin-up.casino', 'stake.com',
  '/casino/', '/casinos/', '.casino.', 'online-casino.', 'best-casinos.',
];

// Substrings of domains that must never appear in the results (non-affiliate noise).
const NEVER_INCLUDE = [
  'gov.au', 'ananda.org', 'wikipedia.org', 'forbes.com', 'nytimes.com', 'medium.com',
  'amazon.', 'ebay.', 'apple.com', 'microsoft.', 'github.', 'stackoverflow.',
  'linkedin', 'duckduckgo', 'startpage', 'web.archive.org',
];

// Additional seed portals whose outbound links are extracted and then verified.
const EXTRA_SEEDS = [
  'https://timesofmalta.com/article/10-best-online-casino-sites-malta-a2.1108064',
  'https://www.casinoreviews.net/',
  'https://www.gambling-affiliation.com/en/index',
];
// Checkpoint helpers so a run survives crashes and timeouts and can resume mid-way
/**
 * Persist the crawl state as JSON at CP_PATH.
 *
 * The JSON is written to a sibling temp file and then renamed over CP_PATH,
 * so an interruption during the write cannot leave a truncated checkpoint
 * that would be discarded on the next start.
 *
 * @param {*} obj - JSON-serialisable state to store.
 */
function saveCheckpoint(obj) {
  const tmpPath = `${CP_PATH}.tmp`;
  fs.writeFileSync(tmpPath, JSON.stringify(obj));
  fs.renameSync(tmpPath, CP_PATH);
}
/**
 * Read the checkpoint stored at CP_PATH.
 *
 * A missing file is the normal first-run case and is silent. Any other read
 * failure, or content that is not valid JSON, is reported with a warning so a
 * lost checkpoint is visible; the run then starts fresh.
 *
 * @returns {*} The parsed checkpoint, or null when none is usable.
 */
function loadCheckpoint() {
  let raw;
  try {
    raw = fs.readFileSync(CP_PATH, 'utf8');
  } catch (err) {
    if (err.code !== 'ENOENT') {
      console.warn(`Checkpoint unreadable (${err.message}); starting fresh`);
    }
    return null;
  }
  try {
    return JSON.parse(raw);
  } catch (err) {
    console.warn(`Checkpoint is not valid JSON (${err.message}); starting fresh`);
    return null;
  }
}
/**
 * Pause for the given number of milliseconds.
 *
 * @param {number} ms - Delay passed to setTimeout.
 * @returns {Promise<undefined>} Settles once the timer has fired.
 */
async function sleep(ms) {
  await new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Download a URL by shelling out to the `curl` binary.
 *
 * Per the original author's note, curl is used instead of Node's fetch to
 * avoid Cloudflare blocks. The promise never rejects: whatever curl printed on
 * stdout is returned, or an empty string when nothing was captured.
 *
 * @param {string} url - Address handed to curl as the last argument.
 * @returns {Promise<string>} Captured stdout, or '' if there was none.
 */
async function fetchViaCurl(url) {
  const userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4) AppleWebKit/605.1.15 Version/17 Safari/605.1';
  // -s silent, -L follow redirects, 8 s transfer cap, 40000-byte size cap.
  const curlArgs = ['-sL', '--max-time', '8', '-A', userAgent, '--max-filesize', '40000', url];
  // Kill the child process after 12 s if curl's own timeout did not end it.
  const execOptions = { timeout: 12000 };

  return new Promise((resolve) => {
    const onExit = (_error, stdout) => {
      resolve(stdout || '');
    };
    execFile('curl', curlArgs, execOptions, onExit);
  });
}
/**
 * Extract the page title and the unique outbound domains from an HTML string.
 *
 * Only absolute http(s) links inside `href="…"` / `href='…'` attributes are
 * considered, after <script> and <style> blocks have been removed. A leading
 * "www." is stripped from each hostname; domains equal to `skipDomain` or
 * containing any NEVER_INCLUDE substring are dropped.
 *
 * @param {string} html - Raw page markup; non-string input is treated as empty.
 * @param {string} skipDomain - The page's own domain (without "www."), excluded from the result.
 * @returns {{domains: string[], title: string}} Unique outbound domains in first-seen
 *   order, and the <title> text ('' unless it is 10-250 characters long).
 */
function parseHTML(html, skipDomain) {
  const source = typeof html === 'string' ? html : '';
  const clean = source
    .replace(/<script[\s\S]*?<\/script>/gi, '')
    .replace(/<style[\s\S]*?<\/style>/gi, '');

  const foundDomains = new Set();
  // The pattern only admits http:// and https:// targets, so no separate protocol check is needed.
  for (const match of clean.matchAll(/href=["'](https?:\/\/[^"'\s>]+)["']/gi)) {
    let hostname;
    try {
      hostname = new URL(match[1]).hostname;
    } catch {
      continue; // malformed URL: nothing to record
    }
    // Strip "www." only as a prefix; an unanchored replace would also mangle hosts like "awww.example.com".
    const domain = hostname.replace(/^www\./, '');
    if (!domain || domain === skipDomain) continue;
    if (NEVER_INCLUDE.some((blocked) => domain.includes(blocked))) continue;
    foundDomains.add(domain);
  }

  const titleMatch = clean.match(/<title[^>]*>([^<]{10,250})<\/title>/i);
  const title = titleMatch ? titleMatch[1].trim() : '';
  return { domains: [...foundDomains], title };
}
/**
 * Count how many distinct entries in a domain list look like casino links.
 *
 * An entry counts when, compared case-insensitively, it contains the word
 * "casino" or any BRANDS keyword of at least three characters. Entries that
 * differ only in letter case are counted once. The match count is also logged.
 *
 * @param {string[]} domainList - Outbound domains (or URLs) to classify.
 * @returns {number} Number of distinct matching entries.
 */
function countCasinoLinks(domainList) {
  // Very short keywords are skipped because they would match almost anything.
  const brandKeys = BRANDS.map((brand) => brand.toLowerCase()).filter((brand) => brand.length >= 3);
  const matched = new Set();
  for (const entry of domainList ?? []) {
    const host = String(entry).toLowerCase();
    // "casino" alone covers the "/casino", "-casino" and ".casino" variants.
    if (host.includes('casino') || brandKeys.some((brand) => host.includes(brand))) {
      matched.add(host);
    }
  }
  console.log(` → matches: ${matched.size}`);
  return matched.size;
}
// ═══════ MAIN CRAWL PIPELINE ═══════
/**
 * Script entry point. Steps:
 *   1. Load the checkpoint (if any) and the candidate sites from CSV_IN.
 *   2. Filter pass: fetch each candidate and keep it when its page links to at
 *      least MIN_CASINO_LINKS casino domains; outbound domains of kept pages
 *      are queued as new candidates.
 *   3. Expand pass: fetch each EXTRA_SEEDS page, mark the seed as verified and
 *      queue its outbound domains.
 *   4. Run the filter pass again so the seed-derived candidates are verified.
 *   5. Write the verified set to CSV_OUT.
 *
 * NOTE(review): the original referenced an undefined output path; CSV_OUT below
 * is a new name chosen so the input dataset is never overwritten. Confirm it.
 */
(async function main() {
  const CSV_OUT = './casino_affiliate_sites_verified.csv';
  const MAX_VERIFIED = 1300; // stop verifying once this many sites are accepted
  const MIN_CASINO_LINKS = 5; // casino links a page needs to count as an affiliate
  const CHECKPOINT_EVERY = 20; // processed candidates between checkpoints

  const stripWww = (host) => host.replace(/^www\./i, '');

  // Split one CSV record into fields; supports quoted fields, "" escapes and empty fields.
  function parseCsvLine(line) {
    const fields = [];
    let field = '';
    let inQuotes = false;
    for (let i = 0; i < line.length; i++) {
      const ch = line[i];
      if (inQuotes) {
        if (ch !== '"') {
          field += ch;
        } else if (line[i + 1] === '"') {
          field += '"';
          i++;
        } else {
          inQuotes = false;
        }
      } else if (ch === '"') {
        inQuotes = true;
      } else if (ch === ',') {
        fields.push(field);
        field = '';
      } else {
        field += ch;
      }
    }
    fields.push(field);
    return fields;
  }

  // Quote a value for CSV output; whitespace runs are collapsed so each record stays on one line.
  const toCsvField = (value) =>
    `"${String(value ?? '').replace(/\s+/g, ' ').trim().replace(/"/g, '""')}"`;

  const checkpoint = loadCheckpoint();
  console.log('═══ Casino Affiliate Validation + Expansion ═══\n');

  // `candidates` maps domain → entry; `queue` keeps the same keys in insertion
  // order so the verification loop can index into it without rebuilding an
  // array on every iteration.
  const candidates = new Map();
  const queue = [];
  const verified = new Map();

  const enqueue = (domain, entry) => {
    if (!candidates.has(domain)) queue.push(domain);
    candidates.set(domain, entry);
  };
  // Queue a newly discovered outbound domain unless it is already known.
  const addDiscovered = (domain) => {
    if (verified.has(domain) || candidates.has(domain)) return;
    enqueue(domain, { url: `https://${domain}`, title: domain, domain });
  };

  // ── LOAD: existing CSV (columns: url, title, domain)
  try {
    const csvContent = fs.readFileSync(CSV_IN, 'utf8');
    for (const row of csvContent.split(/\r?\n/).slice(1)) { // slice(1) skips the header row
      if (!row.trim()) continue;
      const [url = '', title = '', rawDomain = ''] = parseCsvLine(row).map((f) => f.trim());
      let domain = stripWww(rawDomain.toLowerCase());
      if (!domain) {
        // No domain column: fall back to the URL's hostname, or skip the row.
        try {
          domain = stripWww(new URL(url).hostname);
        } catch {
          continue;
        }
      }
      if (!domain) continue;
      enqueue(domain, { url, title, domain });
    }
  } catch (err) {
    console.log('No CSV loaded:', err.message);
  }

  // ── RESTORE: previously verified entries from the checkpoint
  for (const [domain, saved] of Object.entries(checkpoint?.verified ?? {})) {
    verified.set(domain, {
      url: saved?.url || `https://${domain}`,
      title: saved?.title || '',
      domain,
      brands: saved?.brands ?? 0,
    });
  }
  console.log(`Existing: ${candidates.size} domains loaded`);
  if (verified.size) console.log('Verified before:', verified.size);

  // Older checkpoints stored the position under `checkedIdx`; accept both keys.
  const savedIndex = checkpoint?.checkedIndex ?? checkpoint?.checkedIdx;
  let nextIndex = Number.isInteger(savedIndex) && savedIndex >= 0 ? savedIndex : 0;

  const persist = (checkedIndex) => {
    saveCheckpoint({ verified: Object.fromEntries(verified), checkedIndex });
    console.log(`\x1b[36m [CHECKPOINT] VER:${verified.size}\x1b[0m\n`);
  };

  // Fetch a candidate's home page (https with "www." first, then plain http) and parse it.
  // Returns null when neither attempt yields usable HTML.
  async function fetchAndParse(domain) {
    for (const attemptUrl of [`https://www.${domain}`, `http://${domain}`]) {
      const html = await fetchViaCurl(attemptUrl);
      if (!html || html.length < 350) continue; // empty/blocked → try next protocol
      try {
        return parseHTML(html, domain);
      } catch (err) {
        console.log(`  parse failed for ${attemptUrl}: ${err.message}`);
      }
    }
    return null;
  }

  // ── FILTER PASS: verify queued candidates starting at `nextIndex`
  async function verifyQueue() {
    try {
      for (; nextIndex < queue.length && verified.size < MAX_VERIFIED; nextIndex++) {
        const domain = queue[nextIndex];
        if (verified.has(domain)) continue; // already verified (checkpoint or seed)
        const entry = candidates.get(domain);
        console.log(`\x1b[2m[${verified.size}/${queue.length}]\x1b[0m ${domain}`);

        const page = await fetchAndParse(domain);
        let pauseMs = 2800;
        if (!page) {
          // Could not load the site → assume it is a valid affiliate and move on.
          verified.set(domain, entry);
          console.log(`\x1b[33m ⚠️ ${domain} ⚠️ \x1b[0m`);
          pauseMs = 800;
        } else {
          console.log(` ${page.domains.length} outbound domains\n`);
          const casinoCount = countCasinoLinks(page.domains);
          if (casinoCount >= MIN_CASINO_LINKS) {
            verified.set(domain, { ...entry, title: page.title || entry.title, brands: casinoCount });
            console.log(`\x1b[32m ✓\x1b[0m ${verified.size}: ${domain} → links to ${casinoCount} casinos\n`);
            for (const outbound of page.domains) addDiscovered(outbound);
          }
        }

        const processed = nextIndex + 1;
        if (processed % CHECKPOINT_EVERY === 0 || processed >= queue.length) persist(processed);
        await sleep(pauseMs);
      }
    } catch (err) {
      console.error('Verification crash caught:', err.message, 'at idx', nextIndex);
    }
  }

  // ── EXPAND PASS: crawl extra seeds for more outbound candidates
  async function expandFromSeeds() {
    for (const [seedNo, seedUrl] of EXTRA_SEEDS.entries()) {
      console.log(`\n[Extra seed ${seedNo + 1}/${EXTRA_SEEDS.length}] ${seedUrl}`);
      let seedDomain;
      try {
        seedDomain = stripWww(new URL(seedUrl).hostname);
      } catch (err) {
        console.log(' failed to parse extra seeds:', err.message);
        continue;
      }
      if (verified.has(seedDomain)) continue; // skip before fetching, not after

      const html = await fetchViaCurl(seedUrl);
      // Seeds are hand-picked, so they are accepted without counting links;
      // 99 is the placeholder link count carried over from the original code.
      const seedEntry = { url: seedUrl, title: seedDomain, domain: seedDomain, brands: 99 };
      verified.set(seedDomain, seedEntry);
      if (!html || html.length < 400) {
        console.log(' [no content]');
        await sleep(1800);
        continue;
      }
      try {
        const page = parseHTML(html, seedDomain);
        if (page.title) seedEntry.title = page.title;
        console.log(` → ${page.domains.length} outbound domains\n`);
        for (const outbound of page.domains) addDiscovered(outbound);
      } catch (err) {
        console.log(' failed to parse extra seeds:', err.message);
      }
      await sleep(3500); // throttle between seeds to reduce the chance of an IP ban
    }
  }

  await verifyQueue();
  await expandFromSeeds();
  await verifyQueue(); // verify the candidates the seeds contributed
  persist(nextIndex);

  // ── WRITE CSV FILE
  console.log('\nWriting final CSV...\n');
  const sorted = [...verified.values()].sort((a, b) =>
    (a.title || a.url || '').localeCompare(b.title || b.url || ''));
  const records = sorted.map((v) =>
    [toCsvField(v.url), toCsvField(v.title), toCsvField(v.domain), v.brands ?? 0].join(','));
  fs.writeFileSync(CSV_OUT, `${['url,title,domain,casino_links', ...records].join('\n')}\n`, 'utf8');
  console.log(`\n══════ Done: ${verified.size} verified affiliate sites saved → ${CSV_OUT} ════`);
})().catch((err) => {
  // Anything that escapes the pipeline is reported and reflected in the exit status.
  console.error('Fatal:', err);
  process.exitCode = 1;
});