Initial commit

This commit is contained in:
Joe
2026-06-26 14:12:10 +02:00
commit 12518b259c
5258 changed files with 732924 additions and 0 deletions
+229
View File
@@ -0,0 +1,229 @@
'use strict';
const { execFile } = require('child_process');
const fs = require('fs');
// Output path for the final CSV report; written once at the end of the run.
const CSV_FILE = './casino_affiliates_final.csv';
// Path of the JSON checkpoint that is read at startup and rewritten during the crawl.
// Both paths are relative to the process working directory.
const CHECKPOINT = './crawlsave.json';
// User-Agent string handed to curl (-A) for every page fetch.
const UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4) AppleWebKit/605.1.15 Version/17 Safari/605.1';
// Casino keywords for brand detection in outbound links.
// countCasinoBrands lower-cases each entry and tests it as a plain substring of
// every outbound hostname; a hostname containing 'casino' is already counted
// before this list is consulted.
// NOTE(review): the values passed in are hostnames (URL.hostname), which never
// contain '/', so path-style entries such as '/casino/' cannot match as written,
// and dotted fragments such as 'mr.green' only match hosts spelled with that
// exact dot — confirm these entries are intended.
const CASINO_KEYWORDS = [
'bet365','888casino','skyvegas','pokerstars','partypoker',
'unibet','bwin','betway','williamhill','ladbrokes','betfair',
'draftkings','fanduel','betmgm','caesars','barstool',
'leovegas','bovada','ignitioncasino','mrplay','jackpotcity',
'casumo','playojo','22bet','paddypower','1xbet',
'betonline','intertops','reddogcas','luckystick',
'betclic','betsson','hardrock','betano','grosvenor',
'coral','skybet','mr.green','betsafe','comeon.se',
'slots.lv','bc.game','pin-up','stake.cas',
'/casino/','casinos.','online-casino','gambling-sites'
];
// Domains to skip (social, CDN, analytics).
// Each entry is a hostname fragment; parseOutboundLinks drops any outbound
// hostname that matches one of them. Hostnames are compared after "www." has
// been stripped and never begin with a dot, so entries must not start with '.':
// the former '.instagram.' could not match the bare domain instagram.com.
const SKIP = [
  'youtube.', 'facebook.', 'twitter', 'instagram.',
  'linkedin', 'tiktok.', 'wikipedia.', 'pinterest',
  'medium.', 'forbes.', 'google.', 'amazon.', 'static.',
  'cdn.', 'fonts.googleapis.', 'flickr.', 'imgur.',
  'github.com', 't.co', 'duckduckgo',
];
// Seed review portal URLs to start crawling from.
// Phase 1 walks this array by index and stores the next index in the checkpoint
// (seedsDone), so inserting or reordering entries makes an existing checkpoint
// resume at the wrong seed — delete the checkpoint file after editing this list.
// Every seed that returns content is recorded as a verified site (brands: 99)
// and its outbound hostnames become Phase-2 candidates.
const SEED_URLS = [
'https://www.casino.org/reviews/',
'https://casino.guru/casino-reviews',
'https://chipy.com/casinos',
'https://next.io/online-casinos-uk/',
'https://first.com/casino/best-casinos',
'https://slotcatalog.com/en/best-online-casinos',
'https://www.whichbingo.co.uk/casino-sites/',
'https://gg.co.uk/online-casinos/top-20/',
'https://www.oddschecker.com/casino-bonus',
'https://www.racingpost.com/online-casino/best-sites/',
'https://www.livecasinocomparer.com/online-casino/',
'https://www.freep.com/story/sports/online-casino-rankings/',
'https://gamingamerica.com/online-casinos',
'https://www.legalsportsreport.com/online-casinos/',
'https://www.sportsline.com/casinos/',
'https://deadspin.com/sweepstakes-casinos/social-casinos/',
];
// Additional pages per seed to crawl (multi-page review lists).
// Keys must be spelled exactly like the matching SEED_URLS entry. The first key
// previously read 'https://www.poke.org/reviews/', which corresponds to no seed,
// and the last casino.guru suffix was '?p5' (missing '=').
// NOTE(review): nothing in the visible code reads MULTI_PAGES yet; the values
// look like suffixes to append to the seed URL — confirm when wiring it in.
const MULTI_PAGES = {
  'https://www.casino.org/reviews/': ['page2/', 'page3/'],
  'https://casino.guru/casino-reviews': ['?p=2', '?p=3', '?p=4', '?p=5'],
};
/**
 * Pause for roughly `ms` milliseconds.
 * @param {number} ms - delay handed to setTimeout.
 * @returns {Promise<undefined>} promise that fulfils once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
// Fetch a page via curl
/**
 * Download `url` with the system `curl` binary and return the response body.
 *
 * curl runs silently, follows redirects (-sL), sends the module's UA string and
 * is limited to 8 s by curl itself plus a 12 s hard timeout from execFile.
 * This is best-effort: any failure (curl missing, non-zero exit, timeout)
 * yields whatever was written to stdout so far, or '' — it never rejects.
 *
 * NOTE(review): `--max-filesize 50000` makes curl refuse responses whose
 * declared size exceeds ~50 KB, so larger pages come back empty — confirm this
 * cap is intended.
 *
 * @param {string} url - absolute http(s) URL to fetch.
 * @returns {Promise<string>} response body, or '' when nothing was fetched.
 */
async function fetchPage(url) {
  // The value ends up in curl's argv. Anything that is not an absolute http(s)
  // URL is refused so that a string starting with '-' can never be parsed by
  // curl as a command-line option.
  if (typeof url !== 'string' || !/^https?:\/\//i.test(url)) return '';
  try {
    return await new Promise((resolve) => {
      execFile(
        'curl',
        ['-sL', '--max-time', '8', '-A', UA, '--max-filesize', '50000', url],
        { timeout: 12000 },
        // The error argument is ignored on purpose: partial output is still useful.
        (_err, stdout) => resolve(stdout || ''),
      );
    });
  } catch (e) {
    // execFile can throw synchronously on invalid arguments; treat as "no content".
    return '';
  }
}
// Parse HTML: extract unique outbound domains + page title
/**
 * Decide whether `host` is covered by the SKIP list.
 *
 * Entries are matched on whole dot-separated labels rather than as raw
 * substrings: leading/trailing dots of an entry are ignored and the remaining
 * label sequence must appear in `host` on label boundaries. Raw substring
 * matching made 't.co' discard hosts such as unibet.com or skybet.com.
 *
 * @param {string} host - lower-case hostname without a leading "www.".
 * @returns {boolean} true when the host should be ignored.
 */
function isSkippedHost(host) {
  const dotted = `.${host}.`;
  return SKIP.some((entry) => {
    const core = String(entry).toLowerCase().replace(/^\.+|\.+$/g, '');
    return core !== '' && dotted.includes(`.${core}.`);
  });
}

/**
 * Extract the distinct outbound hostnames and the page title from raw HTML.
 *
 * <script> and <style> elements are removed first so URLs inside inline code
 * are not counted. Only quoted absolute http(s) href values are considered.
 *
 * @param {string} html - page markup; anything that is not a non-empty string
 *   yields an empty result.
 * @param {string} skipDomain - the page's own hostname (without "www."), which
 *   is excluded from the result.
 * @returns {{doms: string[], title: string}} hostnames in first-seen order with
 *   a leading "www." removed, and the <title> text (10–250 characters,
 *   whitespace collapsed) or '' when none is found.
 */
function parseOutboundLinks(html, skipDomain) {
  if (typeof html !== 'string' || html === '') return { doms: [], title: '' };

  const clean = html
    .replace(/<script[\s\S]*?<\/script>/gi, '')
    .replace(/<style[\s\S]*?<\/style>/gi, '');

  const doms = new Set();
  for (const m of clean.matchAll(/href=["'](https?:\/\/[^"'\s>]+)["']/gi)) {
    let host;
    try {
      // Anchored so only a leading "www." label is dropped.
      host = new URL(m[1]).hostname.replace(/^www\./, '');
    } catch {
      continue; // malformed href — ignore it
    }
    if (host === '' || host === skipDomain || isSkippedHost(host)) continue;
    doms.add(host);
  }

  const tt = clean.match(/<title[^>]*>([^<]{10,250})<\/title>/i);
  // Collapse line breaks/indentation so the title stays on a single line.
  const title = tt ? tt[1].replace(/\s+/g, ' ').trim() : '';
  return { doms: [...doms], title };
}
// Count how many distinct entries of a domain list look casino-related
/**
 * An entry counts when its lower-cased form contains 'casino', '/casin' or
 * '/gambl', or contains any CASINO_KEYWORDS item of at least three characters
 * (compared lower-case; the keyword is also tried with its first '/' removed).
 * Entries are de-duplicated by their exact original spelling.
 *
 * @param {Iterable<string>} domainList - outbound hostnames to inspect.
 * @returns {number} number of distinct matching entries.
 */
function countCasinoBrands(domainList) {
  const genericMarkers = ['casino', '/casin', '/gambl'];
  const matched = new Set();

  for (const domain of domainList) {
    const lowered = domain.toLowerCase();
    // The ':'-prefixed copy is what markers and keywords are searched in; no
    // marker and no keyword in the current list contains ':', so the prefix
    // does not change any result for that list.
    const prefixed = `:${lowered}`;

    const isGeneric = genericMarkers.some((marker) => prefixed.includes(marker));
    const isBrand = !isGeneric && CASINO_KEYWORDS.some((keyword) => {
      if (keyword.length < 3) return false;
      const needle = keyword.toLowerCase();
      return prefixed.includes(needle) || lowered.includes(needle.replace('/', ''));
    });

    if (isGeneric || isBrand) matched.add(domain);
  }

  return matched.size;
}
// Checkpoint helpers (survives crashes/timeouts mid-run)
/**
 * Persist crawl progress to the CHECKPOINT file as JSON.
 *
 * The data is written to a sibling "<CHECKPOINT>.tmp" file first and then
 * renamed over the real checkpoint, so a crash or kill in the middle of the
 * write cannot leave a truncated checkpoint behind.
 *
 * @param {object} obj - JSON-serialisable progress snapshot.
 * @throws if serialisation or either file operation fails.
 */
function checkSave(obj) {
  const tmpPath = `${CHECKPOINT}.tmp`;
  fs.writeFileSync(tmpPath, JSON.stringify(obj));
  fs.renameSync(tmpPath, CHECKPOINT);
}
/**
 * Load the progress snapshot stored in the CHECKPOINT file.
 *
 * A missing file is the normal first-run case and is silent. A file that
 * cannot be read or does not contain valid JSON is reported with a warning
 * instead of being ignored silently, because the crawl then restarts from
 * scratch.
 *
 * @returns {*} the parsed JSON value, or null when no usable checkpoint exists.
 */
function checkLoad() {
  let raw;
  try {
    raw = fs.readFileSync(CHECKPOINT, 'utf8');
  } catch (err) {
    if (err.code !== 'ENOENT') {
      console.warn(`[checkpoint] cannot read ${CHECKPOINT}: ${err.message}`);
    }
    return null;
  }
  try {
    return JSON.parse(raw);
  } catch (err) {
    console.warn(`[checkpoint] ${CHECKPOINT} is not valid JSON (${err.message}); starting fresh`);
    return null;
  }
}
// ═══════ MAIN CRAWL LOGIC ═══════

/**
 * Hostname of an absolute URL with a single leading "www." label removed.
 * @param {string} url - absolute URL.
 * @returns {string} bare hostname.
 * @throws {TypeError} when `url` cannot be parsed by `new URL`.
 */
function bareHostOf(url) {
  return new URL(url).hostname.replace(/^www\./, '');
}

/**
 * Render one value as a quoted CSV field: whitespace runs (including line
 * breaks) collapse to one space and embedded quotes are doubled.
 * @param {*} value - field value; null/undefined become an empty field.
 * @returns {string} the field including its surrounding quotes.
 */
function csvField(value) {
  const text = String(value ?? '').replace(/\s+/g, ' ').trim();
  return `"${text.replace(/"/g, '""')}"`;
}

/**
 * Fetch a candidate domain's home page (HTTPS first, HTTP as fallback) and
 * score it. The first protocol that returns at least 400 characters decides.
 *
 * fetchPage returns the response body only (curl is run without header
 * output), so no redirect target can be read from it; the recorded URL is the
 * origin that was requested.
 *
 * @param {string} dom - bare hostname to probe.
 * @returns {Promise<null | {accepted: boolean, url: string, title: string,
 *   brands: number, doms: string[]}>} null when neither protocol gave content.
 */
async function probeCandidate(dom) {
  for (const proto of ['https://', 'http://']) {
    try {
      const html = await fetchPage(proto + dom);
      if (!html || html.length < 400) continue;

      const origin = new URL(proto + dom).origin;
      const selfD = bareHostOf(origin) || dom;
      const data = parseOutboundLinks(html, selfD);
      const brandCount = countCasinoBrands(data.doms);

      // Link volume is a second signal next to the brand count.
      // NOTE(review): with 12 or more outbound domains linkScore alone reaches
      // 6, so such a page is accepted even with zero brand hits — confirm that
      // this threshold is intended.
      const linkScore = data.doms.length > 10 ? Math.min(data.doms.length / 2, 8) : 0;
      const finalScore = brandCount + linkScore;

      return {
        accepted: finalScore >= 6 || brandCount >= 3,
        url: origin,
        title: data.title,
        brands: brandCount,
        doms: data.doms,
      };
    } catch (e) {
      // This attempt failed (bad hostname, parse error); try the next protocol.
    }
  }
  return null;
}

/**
 * Run the crawl: (1) scrape the seed portals for outbound domains, (2) verify
 * those candidates in small concurrent batches, queueing the outbound domains
 * of every accepted site, (3) write all verified sites to CSV_FILE.
 * Progress is checkpointed so an interrupted run resumes where it stopped.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const MAX_VERIFIED = 1300; // Phase 2 stops once this many sites are verified
  const CONC = 4; // simultaneous fetches per batch
  const CKPT_EVERY_BATCHES = 5; // checkpoint cadence during Phase 2

  console.log('═══ Casino Affiliate Scraper v3 ═══\n');
  const cp = checkLoad();

  // Verified affiliate sites: domain -> { url, title, brands }
  const VERIFIED = new Map();
  if (cp && cp.verified) {
    for (const [k, v] of Object.entries(cp.verified)) VERIFIED.set(k, v);
  }

  // Candidate queue. It is kept free of duplicates in memory so that candIdx,
  // which is stored in the checkpoint, indexes exactly the array that is saved.
  const CANDIDATES = (cp && Array.isArray(cp.candidates)) ? [...new Set(cp.candidates)] : [];
  const queued = new Set(CANDIDATES);
  let seedsDone = (cp && cp.seedsDone) || 0;
  let candIdx = (cp && cp.checkIdx) || 0;

  const enqueue = (dom) => {
    if (VERIFIED.has(dom) || queued.has(dom)) return;
    queued.add(dom);
    CANDIDATES.push(dom);
  };
  const saveProgress = () => checkSave({
    verified: Object.fromEntries(VERIFIED),
    candidates: CANDIDATES,
    seedsDone,
    checkIdx: candIdx,
  });

  console.log(`${VERIFIED.size} already verified, ${CANDIDATES.length} candidates pending\n`);

  // ── PHASE 1: Crawl seed review portals for outbound casino links ───────────────
  console.log('Phase-1: Scraping seed pages...\n');
  for (let si = seedsDone; si < SEED_URLS.length; si++) {
    const sUrl = SEED_URLS[si];
    let skipD;
    try { skipD = bareHostOf(sUrl); } catch { continue; }
    console.log(`[${si + 1}/${SEED_URLS.length}] ${sUrl}`);

    const html = await fetchPage(sUrl);
    if (!html || html.length < 400) {
      console.log(' [no content]\n');
      await sleep(3e3);
      continue;
    }

    // A seed is treated as a confirmed affiliate site; 99 marks it as a seed.
    const seedEntry = { url: sUrl, title: '', brands: 99 };
    VERIFIED.set(skipD, seedEntry);
    try {
      const data = parseOutboundLinks(html, skipD);
      seedEntry.title = data.title; // keep the parsed title for the CSV
      console.log(`${data.doms.length} outbound domains found\n`);
      for (const od of data.doms) enqueue(od);
    } catch (e) {
      console.log(' [parse error] ' + e.message);
    }

    seedsDone = si + 1;
    saveProgress();
    await sleep(2500); // rate limit between seed crawls
  }
  console.log(`\nSeeds done. Found ${CANDIDATES.length} domains to verify.` + `\nphase-2: verifying candidates...\n`);

  // ── PHASE 2: Verify candidate sites (concurrent curl batches) ───────
  let batchesSinceSave = 0;
  while (candIdx < CANDIDATES.length && VERIFIED.size < MAX_VERIFIED) {
    const batch = CANDIDATES.slice(candIdx, candIdx + CONC);
    try {
      // Domains that became verified meanwhile (e.g. a later seed) are not
      // fetched again, so their existing entry is not overwritten.
      const outcomes = await Promise.all(
        batch.map((dom) => (VERIFIED.has(dom) ? null : probeCandidate(dom))),
      );
      outcomes.forEach((hit, i) => {
        if (!hit || !hit.accepted) return;
        const dom = batch[i];
        VERIFIED.set(dom, { url: hit.url, title: hit.title, brands: hit.brands });
        console.log(' ✓ ' + VERIFIED.size + ': ' + dom + ' -> ' + hit.brands + ' brands');
        // Recursive discovery from this newly-verified affiliate site.
        for (const od of hit.doms) enqueue(od);
      });
    } catch (e) {
      console.error('batch error', e);
    }

    // Advance exactly once per batch so no candidate is skipped.
    candIdx += batch.length;
    batchesSinceSave += 1;

    if (batchesSinceSave >= CKPT_EVERY_BATCHES || candIdx >= CANDIDATES.length) {
      saveProgress();
      batchesSinceSave = 0;
      console.log(' [CKPT] VERIF:' + VERIFIED.size + ' pending:' + (CANDIDATES.length - candIdx) + '\n');
    }
    if (candIdx < CANDIDATES.length) await sleep(3500); // throttle between batches to avoid IP ban
  }
  // The loop may have ended on the MAX_VERIFIED cap between checkpoints.
  if (batchesSinceSave > 0) saveProgress();

  // ── PHASE 3: write CSV file ───────────────
  const sortKey = (entry, domain) => String(entry.title || entry.url || domain);
  const out = [...VERIFIED]
    .sort(([da, a], [db, b]) => sortKey(a, da).localeCompare(sortKey(b, db)))
    // The domain is the map key; the stored entries carry no `domain` field.
    .map(([domain, v]) => [csvField(v.url), csvField(v.title), csvField(domain), v.brands ?? 0].join(','));
  fs.writeFileSync(CSV_FILE, ['url,title,domain,casino_brands_linked', ...out].join('\n'), 'utf8');
  console.log('\n═══ DONE: ' + VERIFIED.size + ' sites saved → ' + CSV_FILE + ' ═══\n');
}

main().catch((err) => {
  // Surface unexpected failures instead of leaving an unhandled rejection.
  console.error('fatal:', err);
  process.exitCode = 1;
});