// Listing metadata: 237 lines · 8.4 KiB · JavaScript
const puppeteer = require('puppeteer');
|
|
const fs = require('fs');
|
|
|
|
// Output locations, relative to the process working directory.
// CSV_FILE receives the final report; CP_FILE holds the resumable crawl state.
const CSV_FILE = './casino_affiliate_sites.csv';
const CP_FILE = './scraper_checkpoint.json';
|
|
|
|
// Casino brand keywords for link detection.
// Each keyword is looked for as a lowercase substring of an outbound hostname,
// so it must be spelled the way the brand appears in a domain name: no spaces
// and no dots ("paddypower", not "paddy power").
const CASINO_KW = [
  'bet365', '888casino', 'skyvegas', 'pokerstars', 'partypoker',
  'unibet', 'bwin', 'betway', 'williamhill', 'ladbrokes',
  'betfair', 'draftkings', 'fanduel', 'betmgm', 'caesars',
  'barstoolsports', 'leovegas', 'bovada', 'ignitioncasino',
  'mrplay', 'jackpotcity', 'casumo', 'playojo', '22bet',
  'paddypower', '1xbet', 'betonline', 'intertops', 'reddog',
  'luckystrike', 'betclic', 'betsson', 'hardrock', 'betano',
  'grosvenor', 'coral', 'skybet', 'mrgreen', 'betsafe',
];
|
|
|
|
// Domains to skip (social, news, analytics/CDN hosts, etc.).
// Entries are substring-matched against outbound hostnames. They are written
// as bare domains (no "www." prefix) so that they match both the www and the
// non-www form of a host.
const SKIP = [
  'youtube.com', 'facebook.com', 'twitter.com', 'linkedin.com', 'reddit.com',
  'instagram.com', 'tiktok.com', 'pinterest.com', 'medium.com', 'github.com',
  'wikipedia.org', 'google.', 'amazon.', 'web.archive.org',
  'googletagmanager.com', 'google-analytics.com',
  'fonts.googleapis.com', 'cdnjs.cloudflare.com', 'static.',
];
|
|
|
|
/**
 * Pause the calling async function.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Settles once the timer has fired.
 */
async function sleep(ms) {
  await new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
|
|
// Seed sites: known casino review portals to start crawling from.
// Each seed is crawled once and recorded as a confirmed affiliate site; the
// domains it links out to become the first verification candidates.
const SEED_URLS = [
  'https://www.casino.org/reviews/',
  'https://casino.guru/casino-reviews',
  'https://www.askgamblers.com/online-casinos/reviews',
  'https://chipy.com/casinos',
  'https://www.racingpost.com/online-casino/best-sites/',
  'https://slotcatalog.com/en/best-online-casinos',
  'https://www.gambling.com/uk/online-casinos',
  'https://next.io/online-casinos-uk/',
  'https://first.com/casino/best-casinos',
  'https://www.oddschecker.com/casino-bonus',
];
|
|
|
|
/**
 * Fetch a page with puppeteer and extract the hostnames it links out to.
 *
 * @param {object} browser - Puppeteer browser; only `newPage()` is called on it.
 * @param {string} url - Absolute URL of the page to load.
 * @returns {Promise<{url: string, title: string, outboundLinks: string[]}|null>}
 *   `outboundLinks` holds each external http(s) hostname once, with a leading
 *   "www." removed. Resolves to null (after logging) on any failure; never rejects.
 */
async function crawlSite(browser, url) {
  let page = null;
  try {
    page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120');
    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 15000 });

    // Wait briefly for lazy-loaded links
    await sleep(2000);

    // This callback is executed inside the page, so it must be self-contained:
    // it cannot see any variable or helper from this file.
    const outboundLinks = await page.evaluate(() => {
      const stripWww = (host) => host.replace(/^www\./, '');
      // Normalise our own host the same way as link hosts, otherwise
      // "www.site.com" would never equal "site.com" and self-links leak through.
      const selfDomain = stripWww(window.location.hostname);
      const links = new Set();

      for (const a of document.querySelectorAll('a[href]')) {
        let u;
        try {
          u = new URL(a.href);
        } catch {
          continue; // href that is not a parsable URL
        }
        // URL.protocol keeps its trailing colon ("https:").
        if (!['http:', 'https:'].includes(u.protocol)) continue;

        const d = stripWww(u.hostname);
        if (d === selfDomain) continue;
        links.add(d);
      }
      return [...links];
    });

    const title = await page.title();
    return { url, title, outboundLinks };
  } catch (err) {
    console.log(` [fail] ${url}: ${err.message}`);
    return null;
  } finally {
    // Close the tab on every path; a failed navigation used to leak it.
    if (page) {
      try {
        await page.close();
      } catch {
        // Page or browser is already gone; nothing left to release.
      }
    }
  }
}
|
|
|
|
/**
 * Count how many different casino brands a page links to.
 *
 * @param {Iterable<string>} outboundLinks - Hostnames of the page's outbound links.
 * @param {string[]} [keywords=CASINO_KW] - Brand keywords to search for.
 * @returns {number} Number of distinct keywords found in at least one hostname
 *   (case-insensitive substring match); 0 for an empty or missing list.
 */
function countCasinoBrands(outboundLinks, keywords = CASINO_KW) {
  const needles = keywords.map((kw) => kw.toLowerCase());
  const matched = new Set();

  for (const domain of outboundLinks || []) {
    const lo = String(domain).toLowerCase();
    // Scan every keyword and stop at the first hit, so a single hostname
    // contributes at most one brand.
    const hit = needles.find((kw) => lo.includes(kw));
    if (hit !== undefined) matched.add(hit);
  }
  return matched.size;
}
|
|
|
|
// Load/save checkpoint
|
|
/**
 * Read the saved crawl state from CP_FILE.
 *
 * @returns {*} The parsed JSON checkpoint, or null when the file is missing,
 *   unreadable, or not valid JSON.
 */
function loadCP() {
  try {
    return JSON.parse(fs.readFileSync(CP_FILE, 'utf8'));
  } catch (err) {
    // A missing file simply means a fresh run. Anything else (corrupt JSON,
    // permissions) is reported, so a silent restart from scratch is noticed.
    if (err.code !== 'ENOENT') {
      console.log(` [warn] ignoring checkpoint ${CP_FILE}: ${err.message}`);
    }
    return null;
  }
}
|
|
/**
 * Persist crawl state to CP_FILE as JSON.
 *
 * The data is written to a sibling temp file and then renamed over CP_FILE,
 * so a crash in the middle of a write cannot leave a truncated checkpoint.
 *
 * @param {object} data - JSON-serialisable crawl state.
 */
function saveCP(data) {
  const tmpFile = `${CP_FILE}.tmp`;
  fs.writeFileSync(tmpFile, JSON.stringify(data));
  fs.renameSync(tmpFile, CP_FILE);
}
|
|
|
|
/*
 * Entry point.
 *   Phase 1 crawls every seed URL, records the seed as a confirmed site and
 *           queues the domains it links to as candidates.
 *   Phase 2 crawls candidates CONC at a time; a candidate linking to at least
 *           MIN_BRANDS casino brands is recorded, and its own outbound domains
 *           are queued in turn.
 * Progress is checkpointed through saveCP() so an interrupted run can resume,
 * and the confirmed sites are finally written to CSV_FILE.
 */
(async () => {
  const CONC = 3; // concurrent pages in puppeteer
  const MIN_BRANDS = 5; // distinct brand links needed to count as an affiliate site
  const MAX_SITES = 1200; // stop verifying once this many sites are known
  const SEED_BRANDS = 99; // placeholder score: seeds are trusted, not measured

  console.log('══════════ Casino Affiliate Crawler v2 ══════════');
  console.log(`Seeds : ${SEED_URLS.length}`);
  console.log(`Goal : 1000+ sites with ≥5 casino brand links\n`);

  const allSites = new Map(); // domain → {url, title, casinoBrands: n}
  const visited = new Set(); // URLs we already crawled for outbound links
  const candidates = []; // Queue of candidate DOMAINS to discover
  const queued = new Set(); // domains already in `candidates` (O(1) de-duplication)

  // Queue every outbound domain that is neither skipped, known, nor queued.
  const enqueue = (domains, from) => {
    for (const od of domains || []) {
      if (SKIP.some((k) => od.includes(k))) continue;
      if (allSites.has(od) || queued.has(od)) continue;
      queued.add(od);
      candidates.push({ domain: od, from });
    }
  };

  // Persist state; candidates before `firstPending` were already processed.
  const saveProgress = (firstPending = 0) => saveCP({
    sites: Object.fromEntries(allSites),
    visited: [...visited],
    candidates: candidates.slice(firstPending).filter((c) => !allSites.has(c.domain)),
  });

  const cp = loadCP();
  if (cp) {
    console.log(`Resuming checkpoint...`);
    for (const [d, site] of Object.entries(cp.sites || {})) allSites.set(d, site);
    for (const u of cp.visited || []) visited.add(u);
    for (const c of cp.candidates || []) {
      queued.add(c.domain);
      candidates.push(c);
    }
    console.log(` ${allSites.size} known sites, ${visited.size} visited, ${candidates.length} candidates\n`);
  }

  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-dev-shm-usage', '--disable-setuid-sandbox'],
  });

  try {
    // ── Phase 1: Crawl seed sites & extract outbound links ────────
    console.log('▶ Phase 1: Crawling seed sites...\n');

    for (let i = 0; i < SEED_URLS.length; i++) {
      const sUrl = SEED_URLS[i];
      if (visited.has(sUrl)) continue;

      visited.add(sUrl);
      console.log(`[${i + 1}/${SEED_URLS.length}] ${sUrl}`);

      const data = await crawlSite(browser, sUrl);
      if (!data) {
        await sleep(2000);
        continue;
      }

      // The seed itself is a confirmed affiliate site
      const dName = new URL(sUrl).hostname.replace(/^www\./, '');
      allSites.set(dName, { url: sUrl, title: data.title, casinoBrands: SEED_BRANDS });

      enqueue(data.outboundLinks, dName);

      console.log(` → Found ${data.outboundLinks.length} outbound links\n`);
      saveProgress();
      await sleep(3000);
    }

    // ── Phase 2: Verify candidates by checking their outbound links ───────
    console.log(`▶ Phase 2: Verifying ${candidates.length} candidate sites...\n`);

    let next = 0; // index of the first candidate not yet handed to a batch
    while (next < candidates.length && allSites.size < MAX_SITES) {
      // Gather up to CONC candidates that were not crawled in an earlier run,
      // so a resumed run does not sleep through batches of known URLs.
      const batch = [];
      while (batch.length < CONC && next < candidates.length) {
        const cand = candidates[next++];
        if (!visited.has(`https://${cand.domain}`)) batch.push(cand);
      }
      if (batch.length === 0) break;

      const checks = await Promise.all(
        batch.map(async (cand) => {
          // Prefer https, fall back to plain http.
          for (const u of [`https://${cand.domain}`, `http://${cand.domain}`]) {
            if (visited.has(u)) break;
            visited.add(u);

            const data = await crawlSite(browser, u);
            if (!data) continue;

            return {
              domain: cand.domain,
              url: u,
              title: data.title,
              casinoBrands: countCasinoBrands(data.outboundLinks),
              outboundLinks: data.outboundLinks,
            };
          }
          return null;
        })
      );

      for (const result of checks) {
        if (!result || result.casinoBrands < MIN_BRANDS) continue;

        allSites.set(result.domain, {
          url: result.url,
          title: result.title,
          casinoBrands: result.casinoBrands,
        });
        console.log(` ✓ ${result.domain} (${result.casinoBrands} brands)`);

        // Discover more candidates from the links already collected while
        // verifying this site; no second crawl is needed.
        enqueue(result.outboundLinks, result.domain);
      }

      // Checkpoint after every batch so a crash loses at most CONC sites.
      saveProgress(next);
      console.log(` ${allSites.size} verified, ${candidates.length - next} remaining\n`);

      await sleep(4000); // Rate limit to avoid IP bans
    }
  } finally {
    // Always shut Chromium down, even when a crawl phase throws.
    await browser.close();
  }

  // ── Write CSV ───────
  // Quote every text cell; embedded quotes are doubled as CSV requires.
  const csvCell = (value) => {
    const text = value === undefined || value === null ? '' : String(value);
    return `"${text.replace(/"/g, '""')}"`;
  };

  const header = 'url,title,domain,casino_brands';
  const rows = [...allSites.entries()]
    .sort(([, a], [, b]) => String(a.url).localeCompare(String(b.url)))
    .map(([domain, site]) => {
      // Entries saved by older checkpoints store the count under `brands`.
      const brands = site.casinoBrands !== undefined ? site.casinoBrands : site.brands;
      return [
        csvCell(site.url),
        csvCell(site.title),
        csvCell(domain),
        brands === undefined ? '' : brands,
      ].join(',');
    });

  fs.writeFileSync(CSV_FILE, [header, ...rows].join('\n'), 'utf8');
  console.log(`\n══════ ${allSites.size} verified sites → ${CSV_FILE} ══════`);
})().catch((err) => {
  // Surface fatal errors instead of leaving an unhandled rejection.
  console.error(`Crawler aborted: ${err.stack || err.message}`);
  process.exitCode = 1;
});
|