Initial commit
This commit is contained in:
+204
@@ -0,0 +1,204 @@
|
||||
const { execFile } = require('child_process');
|
||||
const fs = require('fs');
|
||||
|
||||
// Path of the CSV report written at the end of the run (one row per verified site).
const CSV_OUTPUT = './casino_affiliate_sites.csv';
|
||||
// Path of the JSON checkpoint that saveCP()/loadCP() use to persist crawl progress between runs.
const CP_FILE = './.dir_crawl.json';
|
||||
|
||||
// Known casino affiliate review / directory sites (confirmed to link 5+ casinos each).
// We'll scrape these for their content AND extract all outbound links to find more affiliates.
// Each entry is { url, title }: `url` is the page that gets fetched, `title` is the
// human-readable name used in log output and in the CSV report.
const SEED_SITES = [
  // UK/Global major affiliates
  { url: 'https://www.casino.org/', title: 'Casino.org' },
  { url: 'https://www.casinoreviews.net/', title: 'CasinoReviews.net' },
  { url: 'https://www.gambling.com/', title: 'Gambling.com' },
  { url: 'https://casino.guru/', title: 'Casino Guru' },
  { url: 'https://chipy.com/', title: 'Chipy' },
  { url: 'https://www.racingpost.com/online-casino/best-sites/', title: 'Racing Post Casinos' },
  { url: 'https://www.betting.co.uk/casino/', title: 'Betting.co.uk Casino' },
  { url: 'https://www.pokerlistings.com/casino-sites', title: 'PokerListings' },
  { url: 'https://slotcatalog.com/en/best-online-casinos', title: 'SlotCatalog' },
  { url: 'https://www.whichbingo.co.uk/casino-sites/', title: 'WhichBingo Casino' },
  { url: 'https://www.livecasinocomparer.com/online-casino/', title: 'LiveCasinoComparer' },
  { url: 'https://first.com/casino/best-casinos', title: 'FIRST.com Casinos' },
  { url: 'https://www.oddschecker.com/casino-bonus', title: 'Oddschecker Casino' },
  { url: 'https://next.io/online-casinos-uk/', title: 'NEXT.io UK Casinos' },
  { url: 'https://www.askgamblers.com/online-casinos/reviews', title: 'AskGamblers' },
  { url: 'https://www.freebets.com/casino/', title: 'FreeBets Casino' },
  { url: 'https://www.gamblinginsider.com/uk/online-casinos', title: 'GamblingInsider UK' },
  { url: 'https://gg.co.uk/online-casinos/top-20/', title: 'GG.co.uk Top 20' },
  { url: 'https://www.casino.com/uk/', title: 'Casino.com UK' },
  { url: 'https://www.telegraph.co.uk/betting/casino/', title: 'Telegraph Casino' },
  // US affiliates
  { url: 'https://www.reeluxcasino.com/', title: 'Reelux Casino' },
  { url: 'https://www.casinotopo.com/', title: 'CasinoTopo' },
];
|
||||
|
||||
// Skip non-relevant domains when extracting outbound links (social networks,
// search engines, CDNs, and seed sites that are already tracked).
//
// Pattern syntax understood by isSkip():
//   'example.com'  -> that exact host and any subdomain of it (foo.example.com)
//   'example.'     -> (trailing dot) the label "example" under any TLD, with or
//                     without a subdomain (example.org, en.example.co.uk)
//
// Entries are written WITHOUT a leading "www." because getDomain() strips it
// before a domain ever reaches isSkip().
const SKIP = [
  'youtube.com', 'youtu.be', 'reddit.com', 'facebook.com', 'twitter.com', 'x.com',
  'linkedin.com', 'tiktok.com', 'wikipedia.', 'pinterest.', 'instagram.', 'medium.',
  'forbes.com', 'nytimes.com', 'amazon.', 'ebay.', 'google.', 'play.google.com',
  'web.archive.org', 'duckduckgo.', 'startpage.', 't.co', 'imgur.', 'flickr.',
  'github.', 'stackoverflow.', 'apple.com', 'microsoft.', 'cdnjs.cloudflare.',
  'fonts.googleapis.', 'ajax.googleapis.', 'gravatar.', 'open.graph.facebook.',
  'casino.org', 'casinoreviews.net', 'casino.guru', 'chipy.com',
];

/**
 * Resolve after `ms` milliseconds; used to throttle requests.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Extract a normalised host name from a URL: lower-cased and without a
 * leading "www.".
 *
 * @param {string} url - Absolute URL to parse.
 * @returns {string} The host name, or the lower-cased input itself when the
 *   input cannot be parsed as a URL.
 */
function getDomain(url) {
  let host;
  try {
    host = new URL(url).hostname;
  } catch {
    // Not a parseable URL: fall back to the raw text so callers still get a key.
    return url.toLowerCase();
  }
  const bare = host.startsWith('www.') ? host.slice(4) : host;
  return bare.toLowerCase();
}

/**
 * Tell whether a domain is on the SKIP list.
 *
 * Matching is done on host-label boundaries rather than by raw substring, so
 * that short patterns such as 't.co' or 'x.com' do not accidentally swallow
 * unrelated hosts like 'unibet.com' or 'netflix.com'.
 *
 * @param {string} d - Normalised domain as returned by getDomain().
 * @returns {boolean} true when the domain should be ignored.
 */
function isSkip(d) {
  return SKIP.some((pattern) => {
    if (pattern.endsWith('.')) {
      // Label pattern: must start the host or follow a dot.
      return d.startsWith(pattern) || d.includes(`.${pattern}`);
    }
    // Full-host pattern: the host itself or any of its subdomains.
    return d === pattern || d.endsWith(`.${pattern}`);
  });
}
|
||||
|
||||
// Casino keyword patterns for outbound link detection.
// analyzePage() lower-cases each link URL and checks whether it contains any of
// these substrings; it only uses keywords longer than 3 characters, so every
// entry here is at least 4 characters long. Link URLs never contain whitespace,
// so keywords must not contain spaces either.
const CASINO_KW = [
  'casino', 'bet365', 'betfair', '888.', 'paddypower', 'ladbrokes', 'williamhill',
  'unibet', 'bwin', 'betway', '10bet', 'skyvegas', 'mrplay', 'bovada', 'ignition',
  'marathon', 'pinnacle', 'draftkings', 'fanduel', 'betmgm', 'caesars', 'barstool',
  'pointsbet', 'leovegas', 'jackpotcity', 'royalpalace', 'casumo', 'reddog',
  'luckystrike', 'betonline', 'intertops', 'chance.com', 'betsson', 'betclic',
  '22bet', '1xbet', 'stake.', 'everygame', '7bit', 'cloudbet', 'nitrogen',
  'slotscash', 'azurcasino', 'wildwest', 'jackpotjoy', 'grandtornado', 'betano',
  'hardrock', 'mrq.', 'playojo', 'skycircus', 'betfred', 'coral', 'skybet', 'grosvenor',
  'tipico', 'sportinglife', '188bet', 'dafabet', 'sbobet', 'betvictor', 'totesport',
  'betdaq', 'pokerstars', 'partypoker', 'betsafe', 'comeon', 'mr.green', 'mrgreen',
];
|
||||
|
||||
/**
 * Fetch page HTML via curl.
 *
 * curl is run silently (-s), follows redirects (-L), is given 8 seconds
 * (--max-time) and a 50000-byte size limit (--max-filesize), and sends a
 * desktop-browser User-Agent. The child process itself is killed by Node after
 * 12 seconds. Any curl error is ignored on purpose: the function is
 * best-effort and callers treat a short or empty body as "nothing fetched".
 *
 * @param {string} url - Absolute URL to download.
 * @returns {Promise<string>} Whatever curl wrote to stdout, or '' when there
 *   was no output or the process could not be started.
 */
async function fetchHtml(url) {
  const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124 Safari/537.36';
  const curlArgs = ['-sL', '--max-time', '8', '-A', userAgent, '--max-filesize', '50000', url];
  const spawnOptions = { timeout: 12000 };

  try {
    const body = await new Promise((done) => {
      execFile('curl', curlArgs, spawnOptions, (_error, stdout) => {
        done(stdout || '');
      });
    });
    return body;
  } catch {
    // execFile can throw synchronously (e.g. invalid arguments); treat as "no content".
    return '';
  }
}
|
||||
|
||||
/**
 * Analyze HTML: extract outbound casino-linked domains + all outbound domains.
 *
 * <script> and <style> elements are removed first so that URLs embedded in
 * code are not counted. Only quoted, absolute http(s) href values are
 * considered. A link is ignored when it points at the page's own domain (or a
 * subdomain of it) or at a domain rejected by isSkip().
 *
 * @param {string} html - Raw page markup.
 * @param {string} selfDomain - Normalised domain of the page being analysed
 *   (as returned by getDomain()).
 * @returns {{allOutbound: Set<string>, casinoLinks: Set<string>}}
 *   `allOutbound` holds every distinct external domain linked from the page;
 *   `casinoLinks` is the subset whose link URL contains a CASINO_KW keyword.
 */
function analyzePage(html, selfDomain) {
  const markup = html
    .replace(/<script[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style[\s\S]*?<\/style>/gi, ' ');

  const allOutbound = new Set();
  const casinoLinks = new Set();

  // Keywords of 3 characters or fewer are too ambiguous to match as substrings.
  const keywords = CASINO_KW
    .filter((kw) => kw.length > 3)
    .map((kw) => kw.toLowerCase());

  // Subdomains of the page's own site (uk.example.com on example.com) are internal links.
  const selfSuffix = selfDomain ? `.${selfDomain}` : null;
  const isInternal = (domain) =>
    domain === selfDomain || (selfSuffix !== null && domain.endsWith(selfSuffix));

  for (const [, link] of markup.matchAll(/href=["'](https?:\/\/[^"'\s>]+)["']/gi)) {
    const domain = getDomain(link);
    if (isInternal(domain) || isSkip(domain)) continue;

    allOutbound.add(domain);

    // Check the whole URL (not just the host) against the casino keywords.
    const lowerLink = link.toLowerCase();
    if (keywords.some((kw) => lowerLink.includes(kw))) {
      casinoLinks.add(domain);
    }
  }

  return { allOutbound, casinoLinks };
}
|
||||
|
||||
// Substrings whose presence in a domain name marks it as a likely gambling
// affiliate. Broader stems already cover their longer variants: 'gambl' covers
// 'gambler'/'askgamblers', and 'bet' covers 'bets'/'freebet'.
const AFFILIATE_DOMAIN_HINTS = [
  'casino', 'gambl', 'bet', 'poker', 'slot', 'oddscheck',
  'racing', 'casumo', 'chipy', 'bonus', 'jackpot',
];

/**
 * Heuristic: is a domain likely a casino affiliate/review site?
 *
 * This is a plain case-insensitive substring test on the domain name, so it
 * also accepts unrelated names that happen to contain a hint (for example any
 * domain containing "bet").
 *
 * @param {string} domain - Domain name to classify.
 * @returns {boolean} true when the domain contains at least one hint.
 */
function looksLikeCasinoAffiliate(domain) {
  const lower = domain.toLowerCase();
  return AFFILIATE_DOMAIN_HINTS.some((hint) => lower.includes(hint));
}
|
||||
|
||||
/**
 * Persist the crawl checkpoint to CP_FILE as JSON.
 *
 * The data is written to a sibling temporary file first and then renamed over
 * CP_FILE, so an interruption in the middle of a write cannot leave a
 * truncated checkpoint behind.
 *
 * @param {object} data - Checkpoint state; must be JSON-serialisable.
 * @returns {void}
 */
function saveCP(data) {
  const tmpFile = `${CP_FILE}.tmp`;
  fs.writeFileSync(tmpFile, JSON.stringify(data));
  fs.renameSync(tmpFile, CP_FILE);
}
|
||||
/**
 * Load the crawl checkpoint from CP_FILE.
 *
 * A missing file is the normal "first run" case and is silent. An unreadable,
 * unparseable or wrongly shaped checkpoint is reported with a warning and
 * treated as absent, so the caller starts fresh instead of crashing later.
 *
 * @returns {{seedsDone: number, allSites: Array, verified: Array}|null}
 *   The stored checkpoint, or null when there is none that can be used.
 */
function loadCP() {
  let raw;
  try {
    raw = fs.readFileSync(CP_FILE, 'utf8');
  } catch (err) {
    if (err.code !== 'ENOENT') {
      console.warn(`Ignoring checkpoint ${CP_FILE}: cannot read (${err.message})`);
    }
    return null;
  }

  let cp;
  try {
    cp = JSON.parse(raw);
  } catch (err) {
    console.warn(`Ignoring checkpoint ${CP_FILE}: invalid JSON (${err.message})`);
    return null;
  }

  // The main loop indexes into these fields directly, so verify them up front.
  const hasExpectedShape =
    cp !== null &&
    typeof cp === 'object' &&
    Number.isInteger(cp.seedsDone) &&
    Array.isArray(cp.allSites) &&
    Array.isArray(cp.verified);
  if (!hasExpectedShape) {
    console.warn(`Ignoring checkpoint ${CP_FILE}: unexpected structure`);
    return null;
  }
  return cp;
}
|
||||
|
||||
// ═══════ MAIN ═══════
|
||||
// Entry point. Runs in three steps, checkpointing to CP_FILE as it goes:
//   1. Fetch every seed site, record it as verified, and collect the distinct
//      outbound domains it links to.
//   2. Fetch each collected domain (3 at a time) and keep those whose page
//      links to enough casino-keyword URLs.
//   3. Write all verified sites to CSV_OUTPUT.
(async () => {
  console.log('═══ Directory Scraper — scrape known affiliates, expand transitively ═══\n');

  let cp = loadCP();
  if (!cp) {
    cp = { seedsDone: 0, allSites: [], verified: [] };
    console.log('Fresh start\n');
  } else {
    console.log(`Resume: seed ${cp.seedsDone}/${SEED_SITES.length}, collected ${cp.allSites.length} domains, verified ${cp.verified.length}\n`);
  }

  // ── Phase 1: Scrape each seed site + extract outbound casino links ──────
  // Domains already collected (including those restored from the checkpoint).
  const knownDomains = new Set(cp.allSites.map((s) => s.domain));

  for (let si = cp.seedsDone; si < SEED_SITES.length; si++) {
    const seed = SEED_SITES[si];
    const seedDomain = getDomain(seed.url);
    console.log(`[${si + 1}/${SEED_SITES.length}] Scraping: ${seed.title} (${seedDomain})`);

    const html = await fetchHtml(seed.url);
    if (!html || html.length < 2000) {
      console.log(` Skipped — too little content\n`);
      cp.seedsDone = si + 1;
      saveCP(cp);
      await sleep(1500);
      continue;
    }

    const result = analyzePage(html, seedDomain);
    const count = result.casinoLinks.size;
    console.log(` Outbound casino links: ${count}`);

    // Record this seed as verified (it's a known affiliate)
    cp.verified.push({
      url: seed.url,
      title: seed.title || seedDomain,
      domain: seedDomain,
      casinoLinks: count,
    });

    // Add all outbound domains that look like they could be casinos or affiliates
    for (const d of result.allOutbound) {
      if (knownDomains.has(d)) continue;
      knownDomains.add(d);
      cp.allSites.push({
        url: `https://${d}`,
        title: d,
        domain: d,
        isAffiliateCandidate: looksLikeCasinoAffiliate(d),
      });
    }

    cp.seedsDone = si + 1;
    saveCP(cp);
    await sleep(2500); // longer delay between seeds so we don't look like a bot
  }

  console.log(`\nPhase 1: scraped ${SEED_SITES.length} seeds → found ${cp.allSites.length} outbound domains\n`);

  // ── Phase 2: Now check those outbound domains — fetch & verify ────────
  const BATCH_SIZE = 3;
  const PROGRESS_EVERY = 50;

  const verifiedDomains = new Set(cp.verified.map((v) => v.domain));
  const toVerify = cp.allSites.filter((s) => !verifiedDomains.has(s.domain));
  let checked = 0;

  for (let i = 0; i < toVerify.length; i += BATCH_SIZE) {
    const batch = toVerify.slice(i, i + BATCH_SIZE);
    const jobs = batch.map(async (site) => {
      const html = await fetchHtml(site.url);
      if (!html || html.length < 500) return { site, casino: 0 };
      const result = analyzePage(html, site.domain);
      return { site, casino: result.casinoLinks.size };
    });

    const results = await Promise.all(jobs);
    for (const { site, casino } of results) {
      if (casino >= 5) {
        cp.verified.push({ url: site.url, title: site.title || site.domain, domain: site.domain, casinoLinks: casino });
      } else if (site.isAffiliateCandidate && casino >= 2) {
        // Lower bar for known affiliate-structured domains
        // NOTE(review): the recorded count is the measured count plus 2, so the
        // CSV value is not the real number of links — confirm this is intended.
        cp.verified.push({ url: site.url, title: site.title || site.domain, domain: site.domain, casinoLinks: casino + 2 });
      }
    }

    const previouslyChecked = checked;
    checked += batch.length;
    // Log whenever a multiple of PROGRESS_EVERY is crossed; `checked` advances in
    // steps of BATCH_SIZE, so it rarely lands exactly on a multiple.
    if (Math.floor(checked / PROGRESS_EVERY) > Math.floor(previouslyChecked / PROGRESS_EVERY)) {
      console.log(`Phase 2: ${checked}/${toVerify.length} → ${cp.verified.length}`);
    }
    saveCP(cp);
    await sleep(1000); // throttle between batches
  }

  console.log(`\nPhase 2 done: ${checked} checked, ${cp.verified.length} passed\n`);

  // ── Write CSV ───────
  const header = 'url,title,domain,casino_links';
  const rows = cp.verified.map((v) => {
    // Double quotes in titles are swapped for single quotes so they cannot end the quoted field.
    const t = (v.title || '').replace(/"/g, "'");
    return `"${v.url}","${t}","${v.domain}",${v.casinoLinks}`;
  });

  fs.writeFileSync(CSV_OUTPUT, [header, ...rows].join('\n'), 'utf8');
  console.log(`══════════ ${cp.verified.length} sites → ${CSV_OUTPUT} ══════════`);
})().catch((err) => {
  // Surface failures (e.g. an unwritable checkpoint or CSV) instead of an unhandled rejection.
  console.error('Fatal error:', err);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user