// File: crawler/crawl-affiliates-v4.js
// 2026-06-26 14:30:45 +02:00 · 210 lines · 8.5 KiB · JavaScript

'use strict';
const { execFile } = require('child_process');
const fs = require('fs');
// NOTE: a stray `(async () => {` wrapper used to open here. It was never
// closed (a syntax error) and was redundant, because the entry point at the
// bottom of the file is already an async IIFE. Configuration and helpers
// therefore live at module scope.

/** Path of the CSV report written at the end of a run. */
const OUTFILE = './casino_affiliate_1000.csv';

/** User-Agent string handed to curl (`-A`) for every page fetch. */
const UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4) AppleWebKit/605.1.5 Version/17 Safari/605.1';

/**
 * Brand keywords used to decide whether an outbound domain points at a
 * casino / betting operator. Matched as lowercase substrings.
 */
const CKW = [
  'bet365', '888casino', 'pokerstars', 'partypoker', 'unibet', 'bwin',
  'betway', 'williamhill', 'ladbrokes', 'betfair', 'draftkings', 'fanduel',
  'betmgm', 'caesars', 'barstool', 'leovegas', 'bovada', 'ignitioncasino',
  'jackpotcity', '10bet', '22bet', 'paddypower', '1xbet', 'coral',
  '/casino/',
]; // brand keywords for link detection

/**
 * Host fragments that mark an outbound link as irrelevant (social networks,
 * CDNs, search engines, ...). A domain is skipped when it *contains* any of
 * these fragments, so entries must match the host AFTER a leading "www." has
 * been stripped: 'instagram.' matches "instagram.com", whereas the previous
 * '.instagram.' only matched sub-domains such as "help.instagram.com".
 */
const SKIPD = [
  'youtube.', 'facebook.', 'twitter', 'instagram.',
  'linkedin.', 'tiktok.', 'wikipedia.', 'pinterest', 'medium.', 'forbes.',
  'google.', 'amazon.', 'static.', 'cdn.', 'fonts.googleapis.', 'flickr.', 'github.com', 'duckduckgo',
];
/**
 * Seed portals crawled in phase 1. Each entry must be an absolute URL; the
 * outbound domains found on these pages become the initial candidate pool.
 */
const SEEDS = [
  'https://www.casino.org/reviews/', // lots of casino brand outbound links
  'https://casino.guru/casino-reviews', // returns many regional subdomains
  'https://chipy.com/casinos',
  // TODO: add the remaining high-yield seeds. The line that used to sit here
  // was free-text placeholder prose inside an unterminated string literal,
  // which made the whole file fail to parse.
];
/**
 * Pause the calling async flow.
 *
 * @param {number} ms - Delay in milliseconds, passed straight to setTimeout.
 * @returns {Promise<void>} Settles (with no value) once the timer fires.
 */
async function sleep(ms) {
  await new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Download a page by spawning the system `curl` binary.
 *
 * curl is invoked with `-sL` (silent, follow redirects), `--max-time 8`,
 * the shared User-Agent (`-A UA`) and `--max-filesize 40000`; the child
 * process itself is given a 12 s timeout by execFile.
 *
 * This is deliberately best-effort: the error argument of the execFile
 * callback is ignored, and the promise never rejects from that callback.
 *
 * @param {string} url - Absolute URL to fetch.
 * @returns {Promise<string>} curl's stdout, or '' when nothing was captured.
 */
function curlPage(url) {
  return new Promise((resolve) => {
    const curlArgs = ['-sL', '--max-time', '8', '-A', UA, '--max-filesize', '40000', url];
    const spawnOpts = { timeout: 12e3 };
    execFile('curl', curlArgs, spawnOpts, (_err, stdout) => {
      resolve(stdout || '');
    });
  });
}
/**
 * Extract the distinct outbound domains and the page title from raw HTML.
 *
 * Only absolute http(s) links inside quoted `href` attributes are considered.
 * `<script>` and `<style>` blocks are removed first so URLs embedded in code
 * are not counted. A leading "www." is stripped from each host; the strip is
 * anchored to the start of the host so names that merely contain "www."
 * (e.g. "awww.example.com") are left intact.
 *
 * @param {string} html - Page markup; non-string input is treated as empty.
 * @param {string} skipHost - The page's own (www-stripped) host; links to it
 *   are ignored.
 * @returns {{doms: string[], title: string}} Unique domains in first-seen
 *   order, and the trimmed <title> text ('' unless it is 10-250 characters).
 */
function parseLinks(html, skipHost) {
  const source = typeof html === 'string' ? html : '';
  // strip scripts/styles first
  const cleaned = source
    .replace(/<script[\s\S]*?<\/script>/gi, '')
    .replace(/<style[\s\S]*?<\/style>/gi, '');

  const domains = new Set();
  for (const [, href] of cleaned.matchAll(/href=["'](https?:\/\/[^"'\s>]+)["']/gi)) {
    let host;
    try {
      host = new URL(href).hostname;
    } catch {
      continue; // malformed URL: ignore this link only
    }
    const domain = host.replace(/^www\./, '');
    // skip same host + known non-relevant
    if (domain === skipHost || SKIPD.some((fragment) => domain.includes(fragment))) continue;
    domains.add(domain);
  }

  const titleMatch = cleaned.match(/<title[^>]*>([^<]{10,250})<\/title>/i);
  return {
    doms: [...domains],
    title: titleMatch ? titleMatch[1].trim() : '',
  };
}
/**
 * Count how many distinct entries of a domain list look casino-related.
 *
 * An entry counts when its lowercase form (checked with a ':' prefix, as the
 * original did) contains one of the generic fragments 'casino', '/casin' or
 * '.bet', or when it contains any CKW keyword of at least three characters
 * (the keyword is also tried with its first '/' removed).
 *
 * @param {Iterable<string>} dList - Domains to inspect.
 * @returns {number} Number of unique matching entries (case-sensitive
 *   uniqueness, exactly as supplied).
 */
function countCasinoRefs(dList) {
  const genericFragments = ['casino', '/casin', '.bet'];

  const looksLikeCasino = (domain) => {
    const lowered = domain.toLowerCase();
    const prefixed = `:${lowered}`;
    if (genericFragments.some((fragment) => prefixed.includes(fragment))) {
      return true;
    }
    return CKW.some((keyword) => {
      // ignore short keywords that match too often
      if (keyword.length < 3) return false;
      return prefixed.includes(keyword.toLowerCase()) || lowered.includes(keyword.replace('/', ''));
    });
  };

  const matched = new Set();
  [...dList].forEach((domain) => {
    if (looksLikeCasino(domain)) matched.add(domain);
  });
  return matched.size;
}
/**
 * Phase 1: fetch every seed portal, register it as a verified site and
 * collect its outbound domains as candidates for phase 2.
 *
 * Seeds are fetched sequentially with a pause between requests. A seed that
 * is not a parseable absolute URL is skipped; a seed whose response is empty
 * or shorter than 400 characters is reported as failed and skipped.
 *
 * @param {string[]} seedList - Absolute URLs of the seed pages.
 * @param {Object<string, {url: string, title: string, brands: number}>} allSites
 *   Verified sites keyed by domain; mutated in place.
 * @param {Set<string>} candidateSet - Candidate domain pool; mutated in place.
 * @returns {Promise<void>}
 */
async function crawlSeeds(seedList, allSites, candidateSet) {
  console.log('▶ Phase-1: crawling seed portals\n');
  for (const sUrl of seedList) {
    console.log(`[${Object.keys(allSites).length + 1}] ${sUrl}`);

    let seedHost;
    try {
      // Anchor the strip so only a *leading* "www." is removed; an unanchored
      // replace would turn "awww.example.com" into "aexample.com".
      seedHost = new URL(sUrl).hostname.replace(/^www\./, '');
    } catch {
      continue; // not a valid absolute URL
    }

    const html = await curlPage(sUrl);
    if (!html || html.length < 400) {
      console.log(' [failed/empty]\n');
      await sleep(3e3);
      continue;
    }

    // Register the seed itself as a verified site straight away so it is
    // kept even if parsing its markup fails below.
    allSites[seedHost] = { url: sUrl, title: '', brands: 99 };
    try {
      const data = parseLinks(html, seedHost);
      console.log(` → ${data.doms.length} outbound domains\n`);
      // count brands this seed page links to; seeds with 5 or fewer detected
      // brands keep the sentinel value 99
      const bc = countCasinoRefs(data.doms);
      allSites[seedHost] = { url: sUrl, title: data.title || seedHost, brands: bc > 5 ? bc : 99 };
      for (const od of data.doms) candidateSet.add(od);
    } catch (e) {
      console.log(' [parse error]', e.message);
    }

    await sleep(2500); // throttle between seed crawls
  }
  console.log(`\n✓ Seeds done. Found ${candidateSet.size} candidate domains.\n`);
} // crawlSeeds()
/**
 * Fetch one candidate domain (HTTPS first, HTTP as fallback) and score it.
 *
 * The first protocol that yields a response of at least 400 characters is
 * parsed and scored; no further protocol is tried after that.
 *
 * Scoring: `score = brandCount + 1.5 * linkScore`, where `linkScore` is
 * `min(outboundDomains / 2, 8)` when the page links to more than 10 distinct
 * domains and 0 otherwise. NOTE: with these weights any page with more than
 * 10 outbound domains already scores >= 8.25.
 *
 * @param {string} dom - Bare candidate host name.
 * @returns {Promise<null | {url: string, title: string, doms: string[],
 *   brandCount: number, score: number}>} `null` when neither protocol
 *   produced a usable page.
 */
async function inspectCandidate(dom) {
  for (const proto of ['https://', 'http://']) {
    const url = proto + dom;
    const html = await curlPage(url);
    if (!html || html.length < 400) continue; // skip empty / blocked responses

    let origin;
    let selfHost;
    try {
      const parsed = new URL(url);
      origin = parsed.origin; // normalized scheme + host
      selfHost = parsed.hostname.replace(/^www\./, '');
    } catch {
      continue;
    }
    // curlPage returns only the response body (no headers), so the final
    // post-redirect URL is not available here; the requested origin is used.
    const { doms, title } = parseLinks(html, selfHost || dom);
    const brandCount = countCasinoRefs(doms);
    const linkScore = doms.length > 10 ? Math.min(doms.length / 2, 8) : 0;
    return { url: origin, title, doms, brandCount, score: brandCount + linkScore * 1.5 };
  }
  return null;
}

/**
 * Phase 2: verify candidate domains in small concurrent batches and
 * promote the ones that look like affiliate / review sites.
 *
 * A candidate is accepted when its score is >= 7 or it references at least
 * 4 casino brands (see inspectCandidate for the score). Accepted domains are
 * written to `allSites`, removed from `candidateSet`, and their own outbound
 * domains are appended to the work queue. Rejected or unreachable domains
 * stay in `candidateSet` so they are never queued a second time.
 *
 * @param {Set<string>} candidateSet - Candidate pool; mutated in place.
 * @param {Object<string, {url: string, title: string, brands: number}>} allSites
 *   Verified sites keyed by domain; mutated in place.
 * @param {Object} [options]
 * @param {number} [options.concurrency=4] - Parallel fetches per batch.
 * @param {number} [options.maxSites=1300] - Stop once this many sites exist.
 * @param {number} [options.batchDelayMs=4000] - Pause between batches.
 * @param {number} [options.progressEvery=10] - Log progress every N batches.
 * @returns {Promise<void>}
 */
async function verifyCandidates(candidateSet, allSites, options = {}) {
  const { concurrency = 4, maxSites = 1300, batchDelayMs = 4e3, progressEvery = 10 } = options;
  console.log('▶ Phase-2: verifying candidate sites...\n');

  // Work through an append-only queue with a cursor. Slicing the Set by
  // index while also deleting from it would shift positions and silently
  // skip candidates.
  const queue = [...candidateSet];
  let cursor = 0;
  let batchNo = 0;

  while (cursor < queue.length && Object.keys(allSites).length < maxSites) {
    const batch = [];
    while (batch.length < concurrency && cursor < queue.length) {
      const dom = queue[cursor];
      cursor += 1;
      if (allSites[dom]) {
        candidateSet.delete(dom); // already verified (seed or checkpoint)
        continue;
      }
      batch.push(dom);
    }
    if (batch.length === 0) continue;
    batchNo += 1;

    // fetch all candidates in this batch concurrently; one failure must not
    // abort its siblings
    const outcomes = await Promise.allSettled(batch.map((dom) => inspectCandidate(dom)));

    outcomes.forEach((outcome, i) => {
      const dom = batch[i];
      if (outcome.status === 'rejected') {
        const reason = outcome.reason;
        console.error(` [error] ${dom}: ${(reason && reason.message) || reason}`);
        return;
      }
      const result = outcome.value;
      if (!result) return; // unreachable on both protocols
      if (result.score < 7 && result.brandCount < 4) return; // not review-like

      allSites[dom] = { url: result.url, title: result.title || dom, brands: Math.round(result.score) };
      candidateSet.delete(dom); // prune verified domain from candidate pool
      console.log(` ✓ ${Object.keys(allSites).length}: ${dom} (${result.brandCount} brands)`);

      // add this site's outbound links as NEW candidates too
      for (const od of result.doms) {
        if (allSites[od] || candidateSet.has(od)) continue;
        candidateSet.add(od);
        queue.push(od);
      }
    });

    if (batchNo % progressEvery === 0) {
      console.log(`\n [PROGRESS] Verified:${Object.keys(allSites).length} remaining:${queue.length - cursor}`);
    }
    if (cursor < queue.length) {
      await sleep(batchDelayMs); // throttle between batches to avoid IP ban
    }
  }
} // verifyCandidates()
// ══════ MAIN ══════

/**
 * Render one value as a double-quoted CSV field: embedded double quotes are
 * doubled and line breaks are collapsed to a space so each site stays on a
 * single row.
 *
 * @param {*} value - Field value; null/undefined become an empty field.
 * @returns {string} The quoted field.
 */
function csvField(value) {
  const text = String(value == null ? '' : value);
  return `"${text.replace(/\r?\n/g, ' ').replace(/"/g, '""')}"`;
}

/**
 * Entry point: optionally resume from ./crawlsave.json, run both crawl
 * phases, then write the verified sites to OUTFILE as CSV sorted by title.
 */
(async function main() {
  console.log('═══ Casino Affiliate Crawler v4 ═══\n');
  const allSites = {}; // domain -> {url, title, brands}
  const candidates = new Set(); // pending candidate domains needing verification
  const checkpointFile = './crawlsave.json';

  if (fs.existsSync(checkpointFile)) {
    try {
      const cp = JSON.parse(fs.readFileSync(checkpointFile, 'utf8'));
      if (cp.allSites) Object.assign(allSites, cp.allSites);
      if (Array.isArray(cp.candidates)) {
        for (const c of cp.candidates) candidates.add(c);
      }
      console.log('Loaded checkpoint:', Object.keys(allSites).length, 'sites,', candidates.size, 'pending\n');
    } catch (e) {
      console.log('bad checkpoint, fresh start');
    }
  } else {
    console.log('No checkpoint, starting fresh...\n');
  }

  await crawlSeeds(SEEDS, allSites, candidates); // phase-1: scrape high-value review portals for seeds+outbound links
  await verifyCandidates(candidates, allSites); // phase-2: recursively verify those outbound domains

  // ── Write CSV file ─────
  const hdr = 'url,title,domain,casino_brands_linked';
  // Entries restored from a checkpoint may lack a title, so default both
  // sides before comparing.
  const sortedEntries = Object.entries(allSites)
    .sort(([, a], [, b]) => (a.title || '').localeCompare(b.title || ''));
  const rows = sortedEntries.map(([dom, entry]) => {
    // Resolve the brand count on its own line: inlined after a chain of `+`
    // the `=== ... ? :` test applied to the whole concatenated string, which
    // made every row evaluate to just 0.
    const brands = typeof entry.brands === 'number' ? entry.brands : 0;
    return [csvField(entry.url), csvField(entry.title || dom), csvField(dom), brands].join(',');
  });
  fs.writeFileSync(OUTFILE, [hdr, ...rows].join('\n'), 'utf8');
  console.log('\n═══ DONE: Saved ' + Object.keys(allSites).length + ' sites →', OUTFILE, '\n');
})().catch((err) => {
  // Surface crashes instead of leaving an unhandled rejection.
  console.error('fatal:', err);
  process.exitCode = 1;
});