Initial commit

This commit is contained in:
Joe
2026-06-26 14:12:10 +02:00
commit 12518b259c
5258 changed files with 732924 additions and 0 deletions
+236
View File
@@ -0,0 +1,236 @@
const puppeteer = require('puppeteer');
const fs = require('fs');
// Output CSV listing the collected sites; written once at the end of the run.
const CSV_FILE = './casino_affiliate_sites.csv';
// JSON checkpoint file read by loadCP() and written by saveCP().
const CP_FILE = './scraper_checkpoint.json';
// Casino brand keywords for link detection.
// Each keyword is tested as a lowercase substring of an outbound hostname
// (see countCasinoBrands), so it must be spelled the way the brand appears
// inside its domain name: no spaces and no punctuation the domain lacks
// (e.g. 'paddypower' for paddypower.com, 'mrgreen' for mrgreen.com).
const CASINO_KW = [
  'bet365','888casino','skyvegas','pokerstars','partypoker',
  'unibet','bwin','betway','williamhill','ladbrokes',
  'betfair','draftkings','fanduel','betmgm','caesars',
  'barstoolsports','leovegas','bovada','ignitioncasino',
  'mrplay','jackpotcity','casumo','playojo','22bet',
  'paddypower','1xbet','betonline','intertops','reddog',
  'luckystrike','betclic','betsson','hardrock','betano',
  'grosvenor','coral','skybet','mrgreen','betsafe',
];
// Domains to skip (social, news, analytics/CDN hosts, etc.).
// Entries are matched as substrings of the outbound hostname, and those
// hostnames have their "www." prefix stripped before the check, so entries
// must NOT carry a "www." prefix themselves. Entries ending in "." (such as
// 'google.' or 'static.') deliberately match any host containing that label.
const SKIP = [
  'youtube.com','facebook.com','twitter.com','linkedin.com','reddit.com',
  'instagram.com','tiktok.com','pinterest.com','medium.com','github.com',
  'wikipedia.org','google.','amazon.','web.archive.org',
  'googletagmanager.com','google-analytics.com',
  'fonts.googleapis.com','cdnjs.cloudflare.com','static.',
];
/**
 * Pause for the given number of milliseconds.
 * @param {number} ms - Delay handed to setTimeout.
 * @returns {Promise<void>} Settles once the timer fires.
 */
async function sleep(ms) {
  await new Promise((done) => {
    setTimeout(done, ms);
  });
}
// Starting points for the crawl: casino review portals whose outbound links
// are harvested to build the initial candidate list.
const SEED_URLS = [
  'https://www.casino.org/reviews/',
  'https://casino.guru/casino-reviews',
  'https://www.askgamblers.com/online-casinos/reviews',
  'https://chipy.com/casinos',
  'https://www.racingpost.com/online-casino/best-sites/',
  'https://slotcatalog.com/en/best-online-casinos',
  'https://www.gambling.com/uk/online-casinos',
  'https://next.io/online-casinos-uk/',
  'https://first.com/casino/best-casinos',
  'https://www.oddschecker.com/casino-bonus',
];
/**
 * Fetch a page with puppeteer and collect the domains it links out to.
 *
 * Only http/https anchors are considered. Hostnames are reported with a
 * leading "www." removed, de-duplicated, and links pointing back at the
 * page's own (final, post-redirect) hostname are dropped. Skip-list
 * filtering is NOT done here; callers apply it.
 *
 * @param {object} browser - Puppeteer browser; only `newPage()` is used.
 * @param {string} url - Absolute URL to load.
 * @returns {Promise<{url: string, title: string, outboundLinks: string[]}|null>}
 *   The crawl result, or null if the page could not be loaded or evaluated
 *   (the failure is logged, never thrown).
 */
async function crawlSite(browser, url) {
  let page = null;
  try {
    page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120');
    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 15000 });
    // Wait briefly for lazy-loaded links
    await sleep(2000);
    // Read the hostname from the page itself so redirects are accounted for.
    const selfHost = await page.evaluate(() => window.location.hostname);
    // This callback is serialized and run inside the page, so it must be
    // self-contained: it cannot reference anything from this module.
    const outboundLinks = await page.evaluate((pageHost) => {
      const stripWww = (host) => host.replace(/^www\./, '');
      const selfDomain = stripWww(pageHost);
      const links = new Set();
      for (const a of document.querySelectorAll('a[href]')) {
        let u;
        try {
          u = new URL(a.href);
        } catch {
          continue; // malformed href: nothing to record
        }
        // URL.protocol includes the trailing colon ("https:").
        if (u.protocol !== 'http:' && u.protocol !== 'https:') continue;
        const d = stripWww(u.hostname);
        if (d === selfDomain) continue;
        links.add(d);
      }
      return [...links];
    }, selfHost);
    const title = await page.title();
    return { url, title, outboundLinks };
  } catch (err) {
    console.log(` [fail] ${url}: ${err.message}`);
    return null;
  } finally {
    // Always release the tab, including on failure; a close error is
    // irrelevant to the caller, so it is deliberately ignored.
    if (page) await page.close().catch(() => {});
  }
}
/**
 * Count how many different casino brands a page links to.
 *
 * A domain counts towards a brand when the brand keyword occurs anywhere in
 * the (lowercased) hostname. Each domain is attributed to at most one brand:
 * the first keyword that matches, in list order.
 *
 * @param {string[]} outboundLinks - Outbound hostnames; null/undefined is treated as empty.
 * @param {string[]} [keywords=CASINO_KW] - Brand keywords to look for.
 * @returns {number} Number of distinct keywords matched by at least one domain.
 */
function countCasinoBrands(outboundLinks, keywords = CASINO_KW) {
  // Lowercase the keywords once instead of once per domain.
  const needles = keywords.map((kw) => kw.toLowerCase());
  const matched = new Set();
  for (const domain of outboundLinks || []) {
    const lo = String(domain).toLowerCase();
    for (const kw of needles) {
      if (lo.includes(kw)) {
        matched.add(kw);
        break; // stop at the first brand this domain matches
      }
    }
  }
  return matched.size;
}
// Load/save checkpoint
/**
 * Read the checkpoint written by a previous run.
 *
 * A missing file is the normal first-run case and is silent. Any other
 * problem (unreadable file, invalid JSON, JSON without a `sites` object) is
 * logged so that a discarded checkpoint does not go unnoticed, and the run
 * starts fresh.
 *
 * @returns {object|null} Parsed checkpoint with at least a `sites` object, or null.
 */
function loadCP() {
  let raw;
  try {
    raw = fs.readFileSync(CP_FILE, 'utf8');
  } catch (err) {
    if (err.code !== 'ENOENT') {
      console.log(` [warn] cannot read checkpoint ${CP_FILE}: ${err.message}`);
    }
    return null;
  }
  try {
    const data = JSON.parse(raw);
    if (data && typeof data === 'object' && data.sites && typeof data.sites === 'object') {
      return data;
    }
    console.log(` [warn] checkpoint ${CP_FILE} has no "sites" object; ignoring it`);
  } catch (err) {
    console.log(` [warn] checkpoint ${CP_FILE} is not valid JSON: ${err.message}`);
  }
  return null;
}
/**
 * Persist crawl state to the checkpoint file.
 *
 * The JSON is written to a sibling temp file and then renamed over the real
 * checkpoint, so an interrupted write cannot leave a truncated checkpoint
 * behind.
 *
 * @param {object} data - JSON-serializable crawl state.
 */
function saveCP(data) {
  const tmpFile = `${CP_FILE}.tmp`;
  fs.writeFileSync(tmpFile, JSON.stringify(data));
  fs.renameSync(tmpFile, CP_FILE);
}
/**
 * Script entry point.
 *
 * Phase 1 crawls every seed URL, records the seed as a known site and queues
 * the domains it links to. Phase 2 visits queued candidates in small
 * concurrent batches, keeps those linking to enough casino brands, and queues
 * their outbound domains in turn. State is checkpointed as the crawl
 * progresses, and the collected sites are written to CSV_FILE at the end.
 */
(async () => {
  console.log('══════════ Casino Affiliate Crawler v2 ══════════');
  console.log(`Seeds : ${SEED_URLS.length}`);
  console.log(`Goal : 1000+ sites with ≥5 casino brand links\n`);

  const MIN_BRANDS = 5; // brand links required to accept a candidate
  const MAX_SITES = 1200; // stop verifying once this many sites are known
  const CONC = 3; // concurrent pages in puppeteer

  const allSites = new Map(); // domain → {url, title, casinoBrands: n}
  const visited = new Set(); // URLs we already crawled for outbound links
  const candidates = []; // Queue of candidate DOMAINS to discover
  const queued = new Set(); // domains present in `candidates` (O(1) dedupe)

  // Queue a domain for verification unless it is skip-listed or already known.
  const enqueue = (domain, from) => {
    if (SKIP.some(k => domain.includes(k))) return;
    if (allSites.has(domain) || queued.has(domain)) return;
    queued.add(domain);
    candidates.push({ domain, from });
  };

  // Persist progress; only candidates that still need a visit are stored.
  const checkpoint = () => saveCP({
    sites: Object.fromEntries(allSites),
    visited: [...visited],
    candidates: candidates.filter(
      c => !allSites.has(c.domain) && !visited.has(`https://${c.domain}`)
    ),
  });

  const cp = loadCP();
  if (cp) {
    console.log(`Resuming checkpoint...`);
    for (const [d, site] of Object.entries(cp.sites || {})) allSites.set(d, site);
    for (const u of cp.visited || []) visited.add(u);
    for (const c of cp.candidates || []) {
      if (!c || !c.domain || queued.has(c.domain)) continue;
      queued.add(c.domain);
      candidates.push(c);
    }
    console.log(` ${allSites.size} known sites, ${visited.size} visited, ${candidates.length} candidates\n`);
  }

  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox','--disable-dev-shm-usage','--disable-setuid-sandbox'],
  });

  try {
    // ── Phase 1: Crawl seed sites & extract outbound links ────────
    console.log('▶ Phase 1: Crawling seed sites...\n');
    for (let i = 0; i < SEED_URLS.length; i++) {
      const sUrl = SEED_URLS[i];
      if (visited.has(sUrl)) continue;
      visited.add(sUrl);
      console.log(`[${i+1}/${SEED_URLS.length}] ${sUrl}`);
      const data = await crawlSite(browser, sUrl);
      if (!data) { await sleep(2000); continue; }
      // The seed itself is a confirmed affiliate site; 99 is a placeholder
      // brand count because seeds are not measured.
      const dName = new URL(sUrl).hostname.replace(/^www\./, '');
      allSites.set(dName, { url: sUrl, title: data.title, casinoBrands: 99 });
      // Add outbound domains to candidates
      for (const od of data.outboundLinks) enqueue(od, dName);
      console.log(` → Found ${data.outboundLinks.length} outbound links\n`);
      checkpoint();
      await sleep(3000);
    }

    // ── Phase 2: Verify candidates by checking their outbound links ───────
    console.log(`▶ Phase 2: Verifying ${candidates.length} candidate sites...\n`);
    // `candidates` grows while we iterate: verified sites contribute new domains.
    for (let i = 0; i < candidates.length && allSites.size < MAX_SITES; i += CONC) {
      const batch = candidates.slice(i, i + CONC);
      let crawledAny = false;
      const checks = await Promise.all(
        batch.map(async (cand) => {
          // Already handled in this or an earlier (resumed) run.
          if (visited.has(`https://${cand.domain}`)) return null;
          // Prefer https, fall back to plain http if the https load fails.
          const tryUrls = [`https://${cand.domain}`, `http://${cand.domain}`];
          for (const u of tryUrls) {
            if (visited.has(u)) continue;
            visited.add(u);
            crawledAny = true;
            const data = await crawlSite(browser, u);
            if (!data) continue;
            return {
              domain: cand.domain,
              url: u,
              title: data.title,
              brands: countCasinoBrands(data.outboundLinks),
              outboundLinks: data.outboundLinks || [],
            };
          }
          return null;
        })
      );

      for (const result of checks) {
        // Must link to ≥5 different casino brands to count as an affiliate/review site
        if (!result || result.brands < MIN_BRANDS) continue;
        allSites.set(result.domain, {
          url: result.url,
          title: result.title,
          casinoBrands: result.brands,
        });
        console.log(`${result.domain} (${result.brands} brands)`);
        // Discover more candidates from this newly-verified site, reusing the
        // links from the crawl we just did instead of loading the page again.
        for (const od of result.outboundLinks) enqueue(od, result.domain);
      }

      // Nothing was fetched (e.g. a resumed batch): no state change, no need
      // to checkpoint or to rate-limit.
      if (!crawledAny) continue;

      checkpoint();
      const remaining = Math.max(0, candidates.length - (i + CONC));
      console.log(` ${allSites.size} verified, ${remaining} remaining\n`);
      await sleep(4000); // Rate limit to avoid IP bans
    }
  } finally {
    // Release the browser process even if a phase throws.
    await browser.close();
  }

  // ── Write CSV ───────
  // Quote every text field and double embedded quotes, per CSV convention.
  const csvField = (value) => `"${String(value === undefined || value === null ? '' : value).replace(/"/g, '""')}"`;
  const header = 'url,title,domain,casino_brands';
  const rows = [...allSites.entries()]
    .sort(([, a], [, b]) => String(a.url || '').localeCompare(String(b.url || '')))
    .map(([domain, site]) => {
      // Older checkpoints stored the count under `brands`; accept both keys.
      let brands = site.casinoBrands;
      if (brands === undefined) brands = site.brands;
      if (brands === undefined || brands === null) brands = '';
      return `${csvField(site.url)},${csvField(site.title)},${csvField(domain)},${brands}`;
    });
  fs.writeFileSync(CSV_FILE, [header, ...rows].join('\n'), 'utf8');
  console.log(`\n══════ ${allSites.size} verified sites → ${CSV_FILE} ══════`);
})().catch((err) => {
  // Surface fatal errors explicitly instead of as an unhandled rejection.
  console.error(`Crawler aborted: ${err && err.stack ? err.stack : err}`);
  process.exitCode = 1;
});