Files
crawler/transitive-v12.js
2026-06-26 14:30:45 +02:00

228 lines
9.3 KiB
JavaScript

const { execFile } = require('child_process');
const fs = require('fs');
// Checkpoint file, parsed as JSON at startup; the main routine reads its
// `sites` array (required) and its `verified` array (optional).
const CP_FILE = './.cp.json';
// Path the final CSV report is written to at the end of a run.
const CSV_OUTPUT = './casino_affiliate_sites.csv';
// Gambling fingerprints. analyzeHtml() lower-cases each outbound link and tests
// these entries against it as plain substrings, so every entry is stored in
// lower case. Order is irrelevant to matching but kept stable for readability.
const CASINO_KW = [
  // ── Operators and sportsbooks ──
  'casino', 'bet365', 'betfair', '888.', 'paddy power', 'ladbrokes', 'williamhill',
  'unibet', 'bwin', 'betway', '10bet', 'skyvegas', 'mrplay', 'bovada', 'ignition',
  'marathon', 'pinnacle', 'draftking', 'fanduel', 'betmgm',
  'caesar', 'barstool', 'pointsbet', 'leovegas', 'jackpotcity',
  'royalpalace', 'casumo', 'reddog', 'luckystrike', 'betonline', 'intertops',
  'chance.com', 'betsson', 'betclic', '22bet', '1xbet', 'stake.',
  'everygame', '7bit', 'cloudbet', 'nitrogen', 'slotscash',
  'azurcasino', 'wildwest', 'jackpotjoy', 'grandtornado', 'betano', 'hardrock',
  'mrq', 'playojo', 'skycircus', 'betfred', 'coral', 'skybet', 'grosvenor',
  'tipico', 'sportinglife', '188bet', 'dafabet', 'sbobet', 'betvictor',
  'totesport', 'betdaq', 'pokerstars', 'partypoker', 'betsafe', 'comeon',
  // ── Review portals and game studios ──
  'mr.green', 'red dog', 'spinplanet', 'casinozilla', 'bonusboss',
  'slotsup', 'slotcatalog', 'pin-up casino', 'bitsler', 'playtech',
  'evolution gaming', 'microgaming', 'netent', 'pragmatic play', "play'n go",
  'red tiger', 'yggdrasil', 'push gaming', 'relax gaming', 'foxium',
  // ── Generic gambling terms that show up in URL paths ──
  '/casino/', '/casinos/', '/gambling/', '/betting/', '/bets/',
  'slot', 'pokies', 'rtp', 'jackpot', 'bonus',
].map((keyword) => keyword.toLowerCase());
// Hostname patterns for destinations that are never interesting to the crawler
// (social networks, search engines, big retailers, CDNs, widget hosts).
// isSkip() decides how a pattern is compared against a hostname. Entries ending
// in "." are meant to cover every TLD of that name (e.g. "amazon." for
// amazon.com, amazon.co.uk, ...).
// Each pattern appears exactly once ("facebook.com" used to be listed twice).
const SKIP_PATS = [
  // Social, video and community sites
  'youtube.com', 'youtu.be', 'reddit.com', 'facebook.com', 'twitter.com', 'x.com',
  'linkedin.com', 'tiktok.com', 'wikipedia.', 'pinterest.', 'instagram.',
  // Publishers, retailers and big tech
  'medium.com', 'forbes.com', 'nytimes.com', 'amazon.', 'ebay.', 'microsoft.',
  // App stores, archives and search engines
  'play.google.com', 'web.archive.org', 'duckduckgo.', 'startpage.', 'brave.com',
  // Link shorteners, image hosts and hobby communities
  't.co', 'imgur.com', 'flickr.com', 'goodreads.', 'steamcommunity.',
  // Developer sites and platform vendors
  'github.com', 'stackoverflow.', 'stackexchange.', 'apple.com', 'google.',
  // CDNs, font and script hosts
  'cdn.', 'cdnjs.cloudflare.com', 'fonts.googleapis.', 'ajax.googleapis.',
  // Avatar and social-widget hosts
  'gravatar.com', 'open.graph.facebook.', 'platform.twitter.',
];
/**
 * Pauses an async flow.
 * @param {number} ms - delay in milliseconds, handed straight to setTimeout.
 * @returns {Promise<void>} promise that fulfils (with no value) once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Reduces a URL to the key used everywhere else in this file to identify a
 * site: the lower-cased hostname without a leading "www.".
 *
 * Input the WHATWG URL parser rejects (no scheme, stray characters, ...) goes
 * through a best-effort fallback that yields the same kind of key: scheme,
 * path, query, fragment and port are removed and "www." is stripped. The
 * previous fallback kept all of those, so "www.example.com/x" and
 * "https://www.example.com/x" produced different keys for the same site.
 *
 * @param {string} url - absolute URL, or a bare "host/path" string.
 * @returns {string} normalised hostname; "" when nothing usable is left.
 */
function getDomain(url) {
  const stripWww = (host) => (host.startsWith('www.') ? host.slice(4) : host);
  try {
    // Lower-case before stripping so "WWW.Example.com" is normalised as well.
    return stripWww(new URL(url).hostname.toLowerCase());
  } catch {
    const text = String(url ?? '').trim().toLowerCase();
    const host = text
      .replace(/^https?:\/\//, '') // scheme
      .split(/[/?#]/, 1)[0]        // path, query, fragment
      .replace(/:\d*$/, '');       // port
    return stripWww(host);
  }
}
/**
 * Tells whether a hostname belongs to a site the crawler should ignore.
 *
 * Patterns are matched on hostname label boundaries, never as raw substrings.
 * Raw substring matching made short patterns swallow unrelated domains:
 * "t.co" matched "unibet.com" / "sbobet.com", "x.com" matched "netflix.com",
 * "apple.com" matched "pineapple.com" — hiding exactly the casino domains this
 * crawler is looking for.
 *
 *  - a pattern ending in "." (e.g. "wikipedia.", "cdn.") matches when a label
 *    sequence of the host starts with it: "wikipedia.org", "en.wikipedia.org";
 *  - any other pattern (e.g. "facebook.com") matches that host and all of its
 *    subdomains: "facebook.com", "m.facebook.com".
 *
 * @param {string} d - hostname as produced by getDomain().
 * @param {string[]} [patterns=SKIP_PATS] - patterns to test against.
 * @returns {boolean} true when the host matches at least one pattern.
 */
function isSkip(d, patterns = SKIP_PATS) {
  // Tolerate a stray path/port/query after the host.
  const host = String(d).toLowerCase().split(/[/:?#]/, 1)[0];
  return patterns.some((p) => {
    if (p.endsWith('.')) return host.startsWith(p) || host.includes(`.${p}`);
    return host === p || host.endsWith(`.${p}`);
  });
}
/**
 * Counts the distinct external domains a page links to and how many of them
 * look gambling-related.
 *
 * Only absolute http(s) links in quoted href attributes are considered, after
 * <script> and <style> blocks have been removed. Links to the page's own
 * domain and to domains rejected by isSkip() are ignored.
 *
 * A domain counts as a casino domain when ANY link to it contains one of the
 * CASINO_KW entries. Earlier only the first link to each domain was inspected,
 * so a domain first seen through a neutral URL was never counted even if later
 * links pointed at e.g. "/casino/...", and the result depended on link order.
 *
 * NOTE(review): the href capture excludes whitespace, so keywords containing a
 * space ("paddy power", "red dog", ...) cannot match a raw URL — confirm
 * whether space-less or hyphenated variants should be added to CASINO_KW.
 *
 * @param {string} html - raw page markup.
 * @param {string} selfDomain - domain of the page itself, in getDomain() form.
 * @returns {{casinoDomains: number, allOutbound: number}} number of distinct
 *   casino-flagged domains and of distinct outbound domains overall.
 */
function analyzeHtml(html, selfDomain) {
  if (typeof html !== 'string' || html.length === 0) {
    return { casinoDomains: 0, allOutbound: 0 };
  }
  const stripped = html
    .replace(/<script[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style[\s\S]*?<\/style>/gi, ' ');
  const seenDomains = new Set();
  const casinoDomains = new Set();
  for (const m of stripped.matchAll(/href=["'](https?:\/\/[^"'\s>]+)["']/gi)) {
    const link = m[1];
    // The pattern is case-insensitive, but only lower-case schemes are kept
    // (same rule as extractAllOutbound).
    if (!link.startsWith('http')) continue;
    const d = getDomain(link);
    if (d === selfDomain) continue;
    if (!seenDomains.has(d)) {
      if (isSkip(d)) continue;
      seenDomains.add(d);
    }
    if (casinoDomains.has(d)) continue; // already classified, nothing to gain
    const lo = link.toLowerCase();
    // Keywords of 1-2 characters are ignored: far too noisy as substrings.
    if (CASINO_KW.some((kw) => kw.length > 2 && lo.includes(kw))) casinoDomains.add(d);
  }
  return { casinoDomains: casinoDomains.size, allOutbound: seenDomains.size };
}
/**
 * Downloads a page by spawning the system `curl` binary.
 *
 * Best-effort by design: any failure (curl missing, network error, timeout,
 * bad argument) yields whatever curl printed to stdout, or "" — this function
 * never rejects.
 *
 * curl flags: -s silent, -L follow redirects, --max-time 8 (seconds),
 * -A custom User-Agent, --max-filesize 40000 (bytes). The child process is
 * additionally given a 12 s timeout by execFile.
 *
 * NOTE(review): according to the curl manual --max-filesize makes curl refuse
 * or abort a transfer larger than the limit rather than truncate it, so pages
 * over ~40 KB may come back empty or partial — confirm this is intended.
 *
 * @param {string} url - address to fetch.
 * @returns {Promise<string>} response body, or "" on failure / rejected input.
 */
async function fetchHtml(url) {
  // URLs come from checkpoint data and scraped pages. A value starting with
  // "-" would be parsed by curl as a command-line option (e.g. "-o file"), so
  // it is refused, together with empty or non-string input.
  if (typeof url !== 'string' || url === '' || url.startsWith('-')) return '';

  const ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124 Safari/537.36';
  const args = ['-sL', '--max-time', '8', '-A', ua, '--max-filesize', '40000', url];
  try {
    return await new Promise((resolve) => {
      // The error argument is ignored on purpose: partial stdout is still useful.
      execFile('curl', args, { timeout: 12000 }, (_err, stdout) => resolve(stdout || ''));
    });
  } catch {
    // execFile throws synchronously on invalid arguments.
    return '';
  }
}
/**
 * Lists the distinct external domains a page links to, in first-seen order.
 *
 * Script and style blocks are removed first; only absolute http(s) links in
 * quoted href attributes are looked at. The page's own domain and anything
 * isSkip() rejects are left out. Collection stops at 300 domains.
 *
 * @param {string} html - raw page markup.
 * @param {string} selfDomain - domain of the page itself, in getDomain() form.
 * @returns {string[]} up to 300 unique outbound domains.
 */
function extractAllOutbound(html, selfDomain) {
  const LIMIT = 300;
  const hrefPattern = /href=["'](https?:\/\/[^"'\s>]+)["']/gi;

  const withoutScripts = html.replace(/<script[\s\S]*?<\/script>/gi, ' ');
  const markup = withoutScripts.replace(/<style[\s\S]*?<\/style>/gi, ' ');

  const found = new Set();
  for (const [, target] of markup.matchAll(hrefPattern)) {
    // Once the cap is reached nothing more can be added, so stop scanning.
    if (found.size >= LIMIT) break;
    if (!target.startsWith('http')) continue;
    const host = getDomain(target);
    if (host === selfDomain || isSkip(host)) continue;
    found.add(host);
  }
  return Array.from(found);
}
// ═══════ MAIN ═══════
// Pipeline: load the checkpoint → verify the seed sites → expand transitively
// through the outbound links of verified sites → de-duplicate, rank and write
// the result as CSV. A site is "verified" when its page links to at least
// MIN_CASINO_LINKS distinct casino-flagged domains (see analyzeHtml).
(async () => {
  const MIN_CASINO_LINKS = 3; // verification threshold
  const BATCH_SIZE = 5;       // pages fetched concurrently while verifying
  const MAX_ROUNDS = 4;       // upper bound on expansion rounds
  const MIN_HTML_LENGTH = 300; // shorter responses are treated as failed fetches

  console.log('═══ Casino Affiliate Crawler v12 (transitive, low threshold) ═══\n');

  let cp = null;
  try {
    if (fs.existsSync(CP_FILE)) cp = JSON.parse(fs.readFileSync(CP_FILE, 'utf8'));
  } catch (err) {
    // Report the cause instead of failing with only the generic message below.
    console.error(`Could not read ${CP_FILE}: ${err.message}`);
  }
  if (!cp || !Array.isArray(cp.sites) || cp.sites.length === 0) {
    console.error('No checkpoint. Run find-casino-affiliates-v9.js first.');
    return;
  }

  const allSites = [...cp.sites];
  // Entries without a domain cannot be keyed or fetched, so they are dropped.
  const verified = (Array.isArray(cp.verified) ? cp.verified : []).filter((v) => v && v.domain);
  const verifiedMap = new Map(verified.map((v) => [v.domain, v]));
  // Domains that were fetched successfully but stayed below the threshold;
  // remembered so later rounds do not download them again.
  const rejectedDomains = new Set();
  console.log(`Loaded ${allSites.length} seeds, ${verified.length} pre-verified\n`);

  // Fetches the candidates in batches of BATCH_SIZE and records every site
  // that reaches the threshold in `verified` / `verifiedMap`.
  // `onBatch(startIndex, doneCount)` is called after each batch for progress output.
  const verifySites = async (candidates, onBatch) => {
    for (let start = 0; start < candidates.length; start += BATCH_SIZE) {
      const batch = candidates.slice(start, start + BATCH_SIZE);
      const results = await Promise.all(batch.map(async (site) => {
        const html = await fetchHtml(site.url || `https://${site.domain}`);
        if (!html || html.length < MIN_HTML_LENGTH) return { site, casino: 0, fetched: false };
        return { site, casino: analyzeHtml(html, site.domain).casinoDomains, fetched: true };
      }));
      for (const { site, casino, fetched } of results) {
        if (casino >= MIN_CASINO_LINKS) {
          if (!verifiedMap.has(site.domain)) {
            const entry = {
              url: site.url || `https://${site.domain}`,
              title: site.title || site.domain,
              domain: site.domain,
              casinoLinks: casino,
            };
            verified.push(entry);
            verifiedMap.set(site.domain, entry);
          }
        } else if (fetched) {
          // Failed fetches are not recorded, so they may be retried if rediscovered.
          rejectedDomains.add(site.domain);
        }
      }
      onBatch(start, start + batch.length);
      await sleep(350);
    }
  };

  // ── Step 1: Verify all seed sites that are not already verified ────────
  const queuedSeeds = new Set();
  const toVerifySeeds = [];
  for (const seed of allSites) {
    if (!seed) continue;
    // Seeds that only carry a URL get their domain derived from it.
    const domain = seed.domain || (seed.url ? getDomain(seed.url) : '');
    if (!domain || verifiedMap.has(domain) || queuedSeeds.has(domain)) continue;
    queuedSeeds.add(domain);
    toVerifySeeds.push({ ...seed, domain });
  }
  await verifySites(toVerifySeeds, (_start, done) => {
    if (done % 40 === 0) {
      console.log(`Seed verify: ${done}/${toVerifySeeds.length} — ${verified.length} pass ≥${MIN_CASINO_LINKS}`);
    }
  });
  console.log(`\nStep 1 done: ${verified.length} verified from ${allSites.length} seeds\n`);

  // ── Step 2: Transitive expansion via outbound links (up to MAX_ROUNDS) ─
  // Each round scans only verified sites that have not been scanned before;
  // re-scanning old sites would rediscover the same domains and repeat the
  // same downloads without changing the outcome.
  const scannedDomains = new Set();
  for (let roundNum = 1; roundNum <= MAX_ROUNDS; roundNum++) {
    const scanPool = [...verifiedMap.values()].filter((v) => !scannedDomains.has(v.domain));
    if (scanPool.length === 0) {
      console.log('No unscanned verified sites left — expansion finished early.\n');
      break;
    }
    console.log(`══ Expansion Round ${roundNum}: scanning for new domains ══`);

    // Candidate domains found this round, keyed by domain (so already unique).
    const candidates = new Map();
    for (const site of scanPool) {
      scannedDomains.add(site.domain);
      try {
        const html = await fetchHtml(site.url || `https://${site.domain}`);
        if (html && html.length >= MIN_HTML_LENGTH) {
          for (const od of extractAllOutbound(html, site.domain)) {
            if (verifiedMap.has(od) || rejectedDomains.has(od) || candidates.has(od)) continue;
            candidates.set(od, { url: `https://${od}`, title: od, domain: od });
          }
        }
      } catch (err) {
        console.error(` Scan failed for ${site.domain}: ${err.message}`);
      }
      if (scannedDomains.size % 30 === 0) console.log(` Scanned ${scannedDomains.size} domains`);
      await sleep(250);
    }
    console.log(` Found ${candidates.size} unique new → verifying...\n`);

    await verifySites([...candidates.values()], (start) => {
      if (start % 200 === 0) console.log(` new verified: ${verified.length} total`);
    });
    console.log(` Round ${roundNum} done → ${verified.length} total\n`);
    await sleep(2500);
  }

  // ── Deduplicate final results by domain, keeping the highest link count ──
  const finalMap = new Map();
  for (const v of verified) {
    // Pre-verified checkpoint entries may lack casinoLinks; treat that as 0 so
    // sorting and the CSV column stay numeric.
    const casinoLinks = Number(v.casinoLinks) || 0;
    const best = finalMap.get(v.domain);
    if (!best || casinoLinks > best.casinoLinks) finalMap.set(v.domain, { ...v, casinoLinks });
  }
  const finalData = [...finalMap.values()].sort((a, b) => b.casinoLinks - a.casinoLinks);

  // ── Write CSV ──
  // Quoted fields with embedded quotes doubled (RFC 4180); line breaks are
  // flattened so every record stays on a single line.
  const csvField = (value) => `"${String(value ?? '').replace(/[\r\n]+/g, ' ').replace(/"/g, '""')}"`;
  const header = 'url,title,domain,casino_links';
  const rows = finalData.map((v) => [csvField(v.url), csvField(v.title), csvField(v.domain), v.casinoLinks].join(','));
  fs.writeFileSync(CSV_OUTPUT, [header, ...rows].join('\n'), 'utf8');
  console.log(`\n══════════ ${finalData.length} verified → ${CSV_OUTPUT} ══════════`);
})().catch((err) => {
  // Without this handler a failure would surface as an unhandled rejection.
  console.error('Crawler failed:', err);
  process.exitCode = 1;
});