Files
2026-06-26 14:30:45 +02:00

205 lines
9.5 KiB
JavaScript

const { execFile } = require('child_process');
const fs = require('fs');
// Path of the CSV report written at the end of the run (one row per verified site).
const CSV_OUTPUT = './casino_affiliate_sites.csv';
// Path of the JSON checkpoint that saveCP() writes and loadCP() reads to resume a crawl.
const CP_FILE = './.dir_crawl.json';
// Seed list: known casino affiliate review / directory sites.
// (Per the original author each was confirmed to link 5+ casinos; that is not re-checked here.)
// Phase 1 of the main routine fetches every entry and harvests its outbound links
// to discover further affiliate candidates.
// Entry shape: { url: page to fetch, title: label used in the logs and in the CSV }.
const SEED_SITES = [
// UK/Global major affiliates
{ url: 'https://www.casino.org/', title: 'Casino.org' },
{ url: 'https://www.casinoreviews.net/', title: 'CasinoReviews.net' },
{ url: 'https://www.gambling.com/', title: 'Gambling.com' },
{ url: 'https://casino.guru/', title: 'Casino Guru' },
{ url: 'https://chipy.com/', title: 'Chipy' },
{ url: 'https://www.racingpost.com/online-casino/best-sites/', title: 'Racing Post Casinos' },
{ url: 'https://www.betting.co.uk/casino/', title: 'Betting.co.uk Casino' },
{ url: 'https://www.pokerlistings.com/casino-sites', title: 'PokerListings' },
{ url: 'https://slotcatalog.com/en/best-online-casinos', title: 'SlotCatalog' },
{ url: 'https://www.whichbingo.co.uk/casino-sites/', title: 'WhichBingo Casino' },
{ url: 'https://www.livecasinocomparer.com/online-casino/', title: 'LiveCasinoComparer' },
{ url: 'https://first.com/casino/best-casinos', title: 'FIRST.com Casinos' },
{ url: 'https://www.oddschecker.com/casino-bonus', title: 'Oddschecker Casino' },
{ url: 'https://next.io/online-casinos-uk/', title: 'NEXT.io UK Casinos' },
{ url: 'https://www.askgamblers.com/online-casinos/reviews', title: 'AskGamblers' },
{ url: 'https://www.freebets.com/casino/', title: 'FreeBets Casino' },
{ url: 'https://www.gamblinginsider.com/uk/online-casinos', title: 'GamblingInsider UK' },
{ url: 'https://gg.co.uk/online-casinos/top-20/', title: 'GG.co.uk Top 20' },
{ url: 'https://www.casino.com/uk/', title: 'Casino.com UK' },
{ url: 'https://www.telegraph.co.uk/betting/casino/', title: 'Telegraph Casino' },
// US affiliates
{ url: 'https://www.reeluxcasino.com/', title: 'Reelux Casino' },
{ url: 'https://www.casinotopo.com/', title: 'CasinoTopo' },
]
// Domains that are ignored when harvesting outbound links (social networks, search
// engines, CDNs, big retailers, and seed sites that are already covered).
// Entries are compared with the lower-cased, "www."-less domain produced by getDomain(),
// so they must not start with "www.". A trailing "." (e.g. 'wikipedia.') leaves the
// TLD open so every country variant is covered.
const SKIP = [
  'youtube.com', 'youtu.be', 'reddit.com', 'facebook.com', 'twitter.com', 'x.com',
  'linkedin.com', 'tiktok.com', 'wikipedia.', 'pinterest.', 'instagram.', 'medium.',
  'forbes.com', 'nytimes.com', 'amazon.', 'ebay.', 'google.', 'play.google.com',
  'web.archive.org', 'duckduckgo.', 'startpage.', 't.co', 'imgur.', 'flickr.',
  'github.', 'stackoverflow.', 'apple.com', 'microsoft.', 'cdnjs.cloudflare.',
  'fonts.googleapis.', 'ajax.googleapis.', 'gravatar.', 'open.graph.facebook.',
  'casino.org', 'casinoreviews.net', 'casino.guru', 'chipy.com',
];
// Promise-based delay: the returned promise resolves (with no value) after `ms` milliseconds.
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
// Reduce a URL to its lower-cased hostname without a leading "www.".
// Input that cannot be parsed as a URL is returned lower-cased as-is.
function getDomain(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return url.toLowerCase();
  }
  const host = parsed.hostname;
  const bare = host.startsWith('www.') ? host.slice(4) : host;
  return bare.toLowerCase();
}
// Return true when domain `d` is covered by the skip list.
//
// Matching is done on whole DNS labels rather than raw substrings, so 't.co' no longer
// swallows every domain ending in "t.com" (unibet.com, skybet.com, ...) and 'x.com' no
// longer swallows "betfax.com".
//   - a pattern ending in "." (e.g. 'wikipedia.') matches that label sequence under any
//     TLD, including subdomains: wikipedia.org, en.wikipedia.org
//   - any other pattern is a full domain and matches itself and its subdomains:
//     'x.com' matches x.com and mobile.x.com
// A leading "www." on a pattern is ignored because callers pass www-less domains.
//
// d        - domain to test (case-insensitive); empty / nullish is never skipped
// patterns - optional pattern list, defaults to the module-level SKIP list
function isSkip(d, patterns = SKIP) {
  const host = `.${String(d == null ? '' : d).toLowerCase()}`;
  if (host === '.') return false;
  return patterns.some((raw) => {
    const pattern = String(raw).toLowerCase().replace(/^www\./, '');
    if (pattern === '') return false;
    // Prefixing both sides with "." anchors the comparison at a label boundary.
    if (pattern.endsWith('.')) return host.includes(`.${pattern}`);
    return host.endsWith(`.${pattern}`);
  });
}
// Casino keyword patterns for outbound link detection.
// analyzePage() looks for these as substrings of the lower-cased link URL and ignores
// any keyword of 3 characters or fewer, so every entry must be lower-case, free of
// whitespace (the href pattern never captures whitespace) and at least 4 characters
// long. Very short brand names therefore carry a trailing "." (e.g. '888.', 'mrq.').
const CASINO_KW = [
  'casino', 'bet365', 'betfair', '888.', 'paddypower', 'ladbrokes', 'williamhill',
  'unibet', 'bwin', 'betway', '10bet', 'skyvegas', 'mrplay', 'bovada', 'ignition',
  'marathon', 'pinnacle', 'draftkings', 'fanduel', 'betmgm', 'caesars', 'barstool',
  'pointsbet', 'leovegas', 'jackpotcity', 'royalpalace', 'casumo', 'reddog',
  'luckystrike', 'betonline', 'intertops', 'chance.com', 'betsson', 'betclic',
  '22bet', '1xbet', 'stake.', 'everygame', '7bit', 'cloudbet', 'nitrogen',
  'slotscash', 'azurcasino', 'wildwest', 'jackpotjoy', 'grandtornado', 'betano',
  'hardrock', 'mrq.', 'playojo', 'skycircus', 'betfred', 'coral', 'skybet', 'grosvenor',
  'tipico', 'sportinglife', '188bet', 'dafabet', 'sbobet', 'betvictor', 'totesport',
  'betdaq', 'pokerstars', 'partypoker', 'betsafe', 'comeon', 'mrgreen',
];
// Download a page by shelling out to curl and resolve with whatever curl printed.
// Never rejects: a curl failure, a timeout or any thrown error yields '' (or the
// partial output curl produced before stopping).
// NOTE(review): '--max-filesize 50000' makes curl give up on larger responses — confirm
// that limit is intended, since such pages come back empty or truncated.
async function fetchHtml(url) {
  const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124 Safari/537.36';
  const curlArgs = ['-sL', '--max-time', '8', '-A', userAgent, '--max-filesize', '50000', url];
  const download = new Promise((resolve) => {
    // The error argument is ignored on purpose; only stdout matters to callers.
    execFile('curl', curlArgs, { timeout: 12000 }, (_err, stdout) => {
      resolve(stdout || '');
    });
  });
  // A synchronous throw inside the executor surfaces as a rejection; map it to ''.
  return download.catch(() => '');
}
// Scan a page's HTML for absolute, quoted href links and classify their domains.
// Returns { allOutbound, casinoLinks }, both Sets of domains:
//   allOutbound - every linked domain other than selfDomain and skip-listed ones
//   casinoLinks - the subset whose link URL contains a casino keyword (> 3 chars)
function analyzePage(html, selfDomain) {
  // Drop inline scripts and styles first so URLs embedded in them are not counted.
  const withoutScripts = html.replace(/<script[\s\S]*?<\/script>/gi, ' ');
  const markup = withoutScripts.replace(/<style[\s\S]*?<\/style>/gi, ' ');
  const hrefPattern = /href=["'](https?:\/\/[^"'\s>]+)["']/gi;
  const usableKeywords = CASINO_KW
    .filter((kw) => kw.length > 3)
    .map((kw) => kw.toLowerCase());

  const allOutbound = new Set();
  const casinoLinks = new Set();
  for (const [, link] of markup.matchAll(hrefPattern)) {
    // The pattern is case-insensitive; only links with a lower-case scheme are kept.
    if (!link.startsWith('http')) continue;
    const domain = getDomain(link);
    if (domain === selfDomain || isSkip(domain)) continue;
    allOutbound.add(domain);
    // Keywords are matched against the whole link (path included), not just the host.
    const lowered = link.toLowerCase();
    if (usableKeywords.some((kw) => lowered.includes(kw))) {
      casinoLinks.add(domain);
    }
  }
  return { allOutbound, casinoLinks };
}
// Heuristic: does the domain name itself contain a gambling-related term?
// Returns true on the first term found in the lower-cased domain, false otherwise.
function looksLikeCasinoAffiliate(domain) {
  const terms = [
    'casino', 'gambl', 'bet',
    'poker', 'slot', 'gambler',
    'oddscheck', 'racing', 'bets',
    'askgamblers', 'casumo', 'chipy',
    'freebet', 'bonus', 'jackpot',
  ];
  const haystack = domain.toLowerCase();
  return terms.some((term) => haystack.includes(term));
}
// Write the checkpoint object to CP_FILE as JSON, replacing any previous checkpoint.
function saveCP(data) {
  const serialized = JSON.stringify(data);
  fs.writeFileSync(CP_FILE, serialized);
}
// Read the checkpoint back from CP_FILE.
// Returns the parsed JSON value, or null when the file is missing, unreadable or not
// valid JSON.
function loadCP() {
  try {
    if (!fs.existsSync(CP_FILE)) return null;
    const raw = fs.readFileSync(CP_FILE, 'utf8');
    return JSON.parse(raw);
  } catch {
    return null;
  }
}
// ═══════ MAIN ═══════
// Runs the whole crawl:
//   Phase 1 - fetch every seed page, record it as verified, collect its outbound domains.
//   Phase 2 - fetch each collected domain and keep those linking to enough casino URLs.
//   Finally - write all verified sites to CSV_OUTPUT.
// The checkpoint is saved after every seed and every Phase 2 batch so an interrupted
// run can resume without repeating finished work. Any unexpected error is logged and
// turns into a non-zero exit code instead of an unhandled rejection.
(async () => {
  const BATCH_SIZE = 3;               // concurrent fetches per Phase 2 batch
  const PROGRESS_EVERY = 50;          // Phase 2 progress line roughly every N domains
  const MIN_LINKS = 5;                // casino links required for an arbitrary domain
  const MIN_LINKS_CANDIDATE = 2;      // lower bar when the domain name looks gambling-related

  // Quote a value as a CSV field, doubling embedded quotes (RFC 4180).
  const csvField = (value) => `"${String(value == null ? '' : value).replace(/"/g, '""')}"`;

  console.log('═══ Directory Scraper — scrape known affiliates, expand transitively ═══\n');

  let cp = loadCP();
  if (!cp || typeof cp !== 'object' || Array.isArray(cp)) {
    cp = { seedsDone: 0, allSites: [], verified: [], checked: [] };
    console.log('Fresh start\n');
  } else {
    // Tolerate checkpoints that are incomplete or were written before `checked` existed.
    if (!Number.isInteger(cp.seedsDone) || cp.seedsDone < 0) cp.seedsDone = 0;
    if (!Array.isArray(cp.allSites)) cp.allSites = [];
    if (!Array.isArray(cp.verified)) cp.verified = [];
    if (!Array.isArray(cp.checked)) cp.checked = [];
    console.log(`Resume: seed ${cp.seedsDone}/${SEED_SITES.length}, collected ${cp.allSites.length} domains, verified ${cp.verified.length}\n`);
  }

  // ── Phase 1: Scrape each seed site + extract outbound casino links ──────
  const knownDomains = new Set(cp.allSites.map((s) => s.domain));
  for (let si = cp.seedsDone; si < SEED_SITES.length; si++) {
    const seed = SEED_SITES[si];
    const seedDomain = getDomain(seed.url);
    console.log(`[${si + 1}/${SEED_SITES.length}] Scraping: ${seed.title} (${seedDomain})`);

    const html = await fetchHtml(seed.url);
    if (!html || html.length < 2000) {
      console.log(` Skipped — too little content\n`);
      cp.seedsDone = si + 1;
      saveCP(cp);
      await sleep(1500);
      continue;
    }

    const result = analyzePage(html, seedDomain);
    const count = result.casinoLinks.size;
    console.log(` Outbound casino links: ${count}`);

    // Record this seed as verified (it's a known affiliate)
    cp.verified.push({
      url: seed.url,
      title: seed.title || seedDomain,
      domain: seedDomain,
      casinoLinks: count,
    });

    // Queue every outbound domain not seen before for verification in Phase 2.
    for (const d of result.allOutbound) {
      if (knownDomains.has(d)) continue;
      knownDomains.add(d);
      cp.allSites.push({
        url: `https://${d}`,
        title: d,
        domain: d,
        isAffiliateCandidate: looksLikeCasinoAffiliate(d),
      });
    }

    cp.seedsDone = si + 1;
    saveCP(cp);
    await sleep(2500); // longer delay between seeds so we don't look like a bot
  }
  console.log(`\nPhase 1: scraped ${SEED_SITES.length} seeds → found ${cp.allSites.length} outbound domains\n`);

  // ── Phase 2: Now check those outbound domains — fetch & verify ────────
  // Domains that are already verified, or were fetched and judged in an earlier run,
  // are not fetched again.
  const verifiedDomains = new Set(cp.verified.map((v) => v.domain));
  const checkedDomains = new Set(cp.checked);
  const toVerify = cp.allSites.filter(
    (s) => !verifiedDomains.has(s.domain) && !checkedDomains.has(s.domain),
  );

  let checked = 0;
  let nextProgressAt = PROGRESS_EVERY;
  for (let i = 0; i < toVerify.length; i += BATCH_SIZE) {
    const batch = toVerify.slice(i, i + BATCH_SIZE);
    const results = await Promise.all(batch.map(async (site) => {
      const html = await fetchHtml(site.url);
      if (!html || html.length < 500) return { site, casino: 0, fetched: false };
      return { site, casino: analyzePage(html, site.domain).casinoLinks.size, fetched: true };
    }));

    for (const { site, casino, fetched } of results) {
      // Lower bar for domains whose name already looks like a gambling affiliate.
      const required = site.isAffiliateCandidate ? MIN_LINKS_CANDIDATE : MIN_LINKS;
      if (casino >= required) {
        // Store the measured link count as-is so the CSV reflects what was found.
        cp.verified.push({
          url: site.url,
          title: site.title || site.domain,
          domain: site.domain,
          casinoLinks: casino,
        });
      }
      // Failed or near-empty fetches are left unrecorded so a later run retries them.
      if (fetched) cp.checked.push(site.domain);
    }

    checked += batch.length;
    if (checked >= nextProgressAt) {
      console.log(`Phase 2: ${checked}/${toVerify.length} checked, ${cp.verified.length} verified`);
      while (nextProgressAt <= checked) nextProgressAt += PROGRESS_EVERY;
    }
    saveCP(cp);
    await sleep(1000); // throttle between batches
  }
  console.log(`\nPhase 2 done: ${checked} checked, ${cp.verified.length} passed\n`);

  // ── Write CSV ───────
  const header = 'url,title,domain,casino_links';
  const rows = cp.verified.map((v) => [
    csvField(v.url),
    csvField(v.title || ''),
    csvField(v.domain),
    v.casinoLinks,
  ].join(','));
  fs.writeFileSync(CSV_OUTPUT, [header, ...rows].join('\n'), 'utf8');
  console.log(`══════════ ${cp.verified.length} sites → ${CSV_OUTPUT} ══════════`);
})().catch((err) => {
  console.error('Directory scraper failed:', err);
  process.exitCode = 1;
});