Initial commit
This commit is contained in:
@@ -0,0 +1,229 @@
|
||||
'use strict';
|
||||
const { execFile } = require('child_process');
|
||||
const fs = require('fs');
|
||||
|
||||
/** Path of the CSV report written once the crawl has finished. */
const CSV_FILE = './casino_affiliates_final.csv';

/** Path of the JSON checkpoint file used to resume an interrupted run. */
const CHECKPOINT = './crawlsave.json';

/** User-Agent string handed to curl (`-A`) for every request. */
const UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4) AppleWebKit/605.1.15 Version/17 Safari/605.1';
|
||||
|
||||
/**
 * Lower-case fragments that mark an outbound link as casino-related.
 * countCasinoBrands() tests each fragment as a substring of a linked
 * domain. Most entries are operator brand names; the last four are
 * generic gambling fragments.
 */
const CASINO_KEYWORDS = [
  'bet365', '888casino', 'skyvegas', 'pokerstars',
  'partypoker', 'unibet', 'bwin', 'betway',
  'williamhill', 'ladbrokes', 'betfair', 'draftkings',
  'fanduel', 'betmgm', 'caesars', 'barstool',
  'leovegas', 'bovada', 'ignitioncasino', 'mrplay',
  'jackpotcity', 'casumo', 'playojo', '22bet',
  'paddypower', '1xbet', 'betonline', 'intertops',
  'reddogcas', 'luckystick', 'betclic', 'betsson',
  'hardrock', 'betano', 'grosvenor', 'coral',
  'skybet', 'mr.green', 'betsafe', 'comeon.se',
  'slots.lv', 'bc.game', 'pin-up', 'stake.cas',
  // Generic (non-brand) fragments.
  '/casino/', 'casinos.', 'online-casino', 'gambling-sites',
];
|
||||
|
||||
/**
 * Hostname patterns for links that are never affiliate candidates
 * (social networks, CDNs, search engines, general media).
 * Each pattern is tested against the hostname of an outbound link after a
 * leading "www." has been removed, so no pattern may start with a dot that
 * would only be present in the "www." form: '.instagram.' could never match
 * "instagram.com", which is why it is spelled 'instagram.' here.
 */
const SKIP = [
  'youtube.', 'facebook.', 'twitter', 'instagram.',
  'linkedin', 'tiktok.', 'wikipedia.', 'pinterest',
  'medium.', 'forbes.', 'google.', 'amazon.', 'static.',
  'cdn.', 'fonts.googleapis.', 'flickr.', 'imgur.',
  'github.com', 't.co', 'duckduckgo',
];
|
||||
|
||||
/**
 * Review-portal pages the crawl starts from. Phase 1 fetches each one,
 * records its host as verified and queues every outbound domain it links to.
 */
const SEED_URLS = [
  'https://www.casino.org/reviews/',
  'https://casino.guru/casino-reviews',
  'https://chipy.com/casinos',
  'https://next.io/online-casinos-uk/',
  'https://first.com/casino/best-casinos',
  'https://slotcatalog.com/en/best-online-casinos',
  'https://www.whichbingo.co.uk/casino-sites/',
  'https://gg.co.uk/online-casinos/top-20/',
  'https://www.oddschecker.com/casino-bonus',
  'https://www.racingpost.com/online-casino/best-sites/',
  'https://www.livecasinocomparer.com/online-casino/',
  'https://www.freep.com/story/sports/online-casino-rankings/',
  'https://gamingamerica.com/online-casinos',
  'https://www.legalsportsreport.com/online-casinos/',
  'https://www.sportsline.com/casinos/',
  'https://deadspin.com/sweepstakes-casinos/social-casinos/',
];
|
||||
|
||||
/**
 * Extra listing pages per seed, keyed by the exact seed URL. Each value is a
 * list of suffixes to append to that seed URL (path segments or query
 * strings) to reach the following pages of a paginated review list.
 *
 * Keys must be spelled exactly like the corresponding seed URL, and query
 * suffixes must be complete ("?p=5", not "?p5").
 *
 * NOTE(review): nothing in the visible part of this file reads MULTI_PAGES
 * yet — confirm whether the seed crawl is supposed to consume it.
 */
const MULTI_PAGES = {
  'https://www.casino.org/reviews/': ['page2/', 'page3/'],
  'https://casino.guru/casino-reviews': ['?p=2', '?p=3', '?p=4', '?p=5'],
};
|
||||
|
||||
/**
 * Pause helper used to pace requests.
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
||||
|
||||
/**
 * Download a page by shelling out to the `curl` binary.
 *
 * curl runs silently (`-s`), follows redirects (`-L`), sends the UA string,
 * gives up after 8 seconds and is limited with `--max-filesize 50000`; the
 * child process itself is killed after 12 seconds.
 *
 * Errors are deliberately not reported: whatever curl printed to stdout is
 * returned, and any failure yields an empty string.
 *
 * NOTE(review): per the curl manual, `--max-filesize` refuses a transfer
 * whose size is known to exceed the limit rather than truncating it, so
 * pages larger than ~50 kB may come back empty — confirm this is intended.
 *
 * @param {string} url - Absolute URL to fetch.
 * @returns {Promise<string>} Response body, or '' when nothing was received.
 */
function fetchPage(url) {
  const curlArgs = ['-sL', '--max-time', '8', '-A', UA, '--max-filesize', '50000', url];
  const spawnOptions = { timeout: 12000 };

  return new Promise((resolve) => {
    try {
      execFile('curl', curlArgs, spawnOptions, (_err, stdout) => {
        resolve(stdout || '');
      });
    } catch {
      // execFile itself threw (e.g. invalid arguments): treat as "no content".
      resolve('');
    }
  });
}
|
||||
|
||||
/**
 * Test one hostname against one SKIP pattern.
 *
 * Patterns that look like a complete domain (contain a dot and neither start
 * nor end with one, e.g. 't.co', 'github.com') must equal the host or be a
 * dot-separated suffix of it. A plain substring test would make 't.co' match
 * every host ending in "t.com" (unibet.com, sportsbet.com, ...).
 * All other patterns ('youtube.', 'twitter', ...) keep substring semantics.
 *
 * @param {string} host - Hostname without a leading "www.".
 * @param {string} pattern - One entry of the SKIP list.
 * @returns {boolean} True when the host should be skipped.
 */
function matchesSkipPattern(host, pattern) {
  const isFullDomain =
    pattern.includes('.') && !pattern.startsWith('.') && !pattern.endsWith('.');
  if (isFullDomain) {
    return host === pattern || host.endsWith(`.${pattern}`);
  }
  return host.includes(pattern);
}

/**
 * Extract the unique outbound domains and the page title from an HTML page.
 *
 * <script> and <style> elements are removed first so URLs embedded in code
 * are not counted. Only quoted, absolute http(s) `href` values are
 * considered. A leading "www." is removed from each hostname; hosts equal to
 * `skipDomain` or matching a SKIP pattern are dropped.
 *
 * @param {string} html - Raw HTML of the page.
 * @param {string} skipDomain - The page's own domain (without "www."),
 *   excluded from the result.
 * @returns {{doms: string[], title: string}} Unique outbound domains in
 *   first-seen order, and the <title> text ('' unless it is 10-250
 *   characters long).
 */
function parseOutboundLinks(html, skipDomain) {
  if (typeof html !== 'string' || html === '') return { doms: [], title: '' };

  const clean = html
    .replace(/<script[\s\S]*?<\/script>/gi, '')
    .replace(/<style[\s\S]*?<\/style>/gi, '');

  const doms = new Set();
  for (const m of clean.matchAll(/href=["'](https?:\/\/[^"'\s>]+)["']/gi)) {
    let host;
    try {
      host = new URL(m[1]).hostname;
    } catch {
      continue; // malformed href: ignore this link
    }
    // Anchor the strip: a bare replace('www.', '') would also mangle hosts
    // such as "awww.example.org".
    const d = host.replace(/^www\./, '');
    if (d === skipDomain || SKIP.some((k) => matchesSkipPattern(d, k))) continue;
    doms.add(d);
  }

  const tt = clean.match(/<title[^>]*>([^<]{10,250})<\/title>/i);
  const title = tt ? tt[1].trim() : '';

  return { doms: [...doms], title };
}
|
||||
|
||||
/**
 * Count how many entries of a domain list look casino-related.
 *
 * An entry counts when its lower-cased form (tested with a ':' prefix)
 * contains one of the generic markers, or when it contains any
 * CASINO_KEYWORDS entry of at least three characters. Keywords are also
 * tried with their first '/' removed. Each distinct entry is counted once.
 *
 * @param {Iterable<string>} domainList - Outbound domains of one page.
 * @returns {number} Number of distinct matching entries.
 */
function countCasinoBrands(domainList) {
  const genericMarkers = ['casino', '/casin', '/gambl'];
  const matched = new Set();

  for (const domain of domainList) {
    const lower = domain.toLowerCase();
    const prefixed = `:${lower}`;

    const looksGeneric = genericMarkers.some((marker) => prefixed.includes(marker));
    const mentionsKeyword = () =>
      CASINO_KEYWORDS.some((keyword) => {
        if (keyword.length < 3) return false;
        const needle = keyword.toLowerCase();
        return prefixed.includes(needle) || lower.includes(needle.replace('/', ''));
      });

    if (looksGeneric || mentionsKeyword()) matched.add(domain);
  }

  return matched.size;
}
|
||||
|
||||
/**
 * Persist crawl state to the CHECKPOINT file so a run can resume after a
 * crash or timeout.
 *
 * The JSON is written to a sibling temp file and then renamed over the
 * checkpoint, so an interruption mid-write cannot leave a truncated
 * checkpoint behind.
 *
 * @param {object} obj - JSON-serialisable crawl state.
 * @throws {Error} If the file cannot be written or renamed.
 */
function checkSave(obj) {
  const tmpPath = `${CHECKPOINT}.tmp`;
  fs.writeFileSync(tmpPath, JSON.stringify(obj));
  fs.renameSync(tmpPath, CHECKPOINT);
}
|
||||
/**
 * Load the crawl state saved in the CHECKPOINT file.
 *
 * A missing file is the normal first-run case and returns null quietly.
 * An unreadable or corrupt file also returns null (the crawl restarts from
 * scratch) but prints a warning, so lost progress is not silent.
 *
 * @returns {object|null} Parsed checkpoint, or null when none is usable.
 */
function checkLoad() {
  let raw;
  try {
    raw = fs.readFileSync(CHECKPOINT, 'utf8');
  } catch (err) {
    if (err.code !== 'ENOENT') {
      console.warn(`[checkpoint] cannot read ${CHECKPOINT}: ${err.message}`);
    }
    return null;
  }

  try {
    return JSON.parse(raw);
  } catch (err) {
    console.warn(`[checkpoint] ignoring corrupt ${CHECKPOINT}: ${err.message}`);
    return null;
  }
}
|
||||
|
||||
// ═══════ MAIN CRAWL LOGIC ═══════
/**
 * Entry point. Runs three phases:
 *   1. fetch every seed URL and queue the domains it links to;
 *   2. fetch each queued domain in small concurrent batches, mark it as
 *      verified when it links to enough casino-related domains, and queue
 *      the domains it links to in turn;
 *   3. write all verified sites to CSV_FILE.
 * State is checkpointed after every seed and every batch so an interrupted
 * run resumes where it stopped.
 */
(async function main() {
  console.log('═══ Casino Affiliate Scraper v3 ═══\n');

  const MAX_VERIFIED = 1300; // stop verifying once this many sites are collected
  const CONC = 4;            // simultaneous fetches per batch
  const SEED_BRANDS = 99;    // brand count recorded for seed sites, which are not scored

  /** Remove a leading "www." label from a hostname. */
  const stripWww = (host) => host.replace(/^www\./, '');

  /** Quote one CSV field: collapse whitespace/newlines and double embedded quotes. */
  const csvField = (value) => {
    const text = String(value == null ? '' : value).replace(/\s+/g, ' ').trim();
    return `"${text.replace(/"/g, '""')}"`;
  };

  const cp = checkLoad();

  // Verified affiliate sites: domain -> { url, title, brands }
  const VERIFIED = new Map(Object.entries((cp && cp.verified) || {}));

  // Candidate domains in discovery order. `queued` mirrors the array so no
  // domain is ever appended twice: the checkpointed index (checkIdx) refers
  // to positions in this exact array, so it must never be re-deduplicated.
  const CANDIDATES = [];
  const queued = new Set();
  for (const dom of (cp && cp.candidates) || []) {
    if (!queued.has(dom)) { queued.add(dom); CANDIDATES.push(dom); }
  }
  const enqueue = (dom) => {
    if (VERIFIED.has(dom) || queued.has(dom)) return;
    queued.add(dom);
    CANDIDATES.push(dom);
  };

  let seedsDone = (cp && cp.seedsDone) || 0;
  let candIdx = Math.min((cp && cp.checkIdx) || 0, CANDIDATES.length);

  const saveCheckpoint = () => checkSave({
    verified: Object.fromEntries(VERIFIED),
    candidates: CANDIDATES,
    seedsDone,
    checkIdx: candIdx,
  });

  console.log(`${VERIFIED.size} already verified, ${CANDIDATES.length - candIdx} candidates pending\n`);

  // ── PHASE 1: Crawl seed review portals for outbound casino links ───────────────
  console.log('Phase-1: Scraping seed pages...\n');
  for (let si = seedsDone; si < SEED_URLS.length; si++) {
    const sUrl = SEED_URLS[si];
    let skipD;
    try { skipD = stripWww(new URL(sUrl).hostname); } catch { continue; }

    console.log(`[${si + 1}/${SEED_URLS.length}] ${sUrl}`);
    const html = await fetchPage(sUrl);
    if (!html || html.length < 400) {
      console.log(' [no content]\n');
      await sleep(3e3);
      continue;
    }

    // A seed that responds is recorded as verified without being scored.
    VERIFIED.set(skipD, { url: sUrl, title: '', brands: SEED_BRANDS });

    try {
      const data = parseOutboundLinks(html, skipD);
      console.log(` → ${data.doms.length} outbound domains found\n`);
      // The seed's outbound links become candidates for Phase-2 verification.
      data.doms.forEach(enqueue);
    } catch (e) {
      console.log(' [parse error] ' + e.message);
    }

    seedsDone = si + 1;
    saveCheckpoint();
    await sleep(2500); // rate limit between seed crawls
  }

  console.log(`\nSeeds done. Found ${CANDIDATES.length} domains to verify.` + `\nphase-2: verifying candidates...\n`);

  // ── PHASE 2: Verify candidate sites (concurrent curl batches) ───────

  /**
   * Fetch one candidate (HTTPS first, HTTP as fallback) and score it.
   * Resolves to { ok } where ok tells whether any page content was obtained.
   */
  const verifyCandidate = async (dom) => {
    for (const proto of ['https://', 'http://']) {
      try {
        const url = proto + dom;
        const html = await fetchPage(url);
        if (!html || html.length < 400) continue;

        // fetchPage returns only the body, so the post-redirect URL is not
        // known here; the site is recorded under the origin that was requested.
        const target = new URL(url);
        const selfD = stripWww(target.hostname) || dom;
        const data = parseOutboundLinks(html, selfD);

        const brandCount = countCasinoBrands(data.doms);
        // Link volume is a secondary signal next to the brand count.
        // NOTE(review): with 12+ outbound domains linkScore alone reaches 6,
        // so such a page is accepted even with zero brand hits — confirm the
        // threshold is intended.
        const linkScore = data.doms.length > 10 ? Math.min(data.doms.length / 2, 8) : 0;
        const finalScore = brandCount + linkScore;

        if (finalScore >= 6 || brandCount >= 3) {
          VERIFIED.set(dom, { url: target.origin, title: data.title, brands: brandCount });
          console.log(' ✓ ' + VERIFIED.size + ': ' + dom + ' -> ' + brandCount + ' brands');
          // Recursive discovery from this newly verified site.
          data.doms.forEach(enqueue);
        }
        return { ok: true }; // content was obtained: no need to try the other protocol
      } catch {
        continue; // this attempt failed, try the next protocol
      }
    }
    return { ok: false };
  };

  while (candIdx < CANDIDATES.length && VERIFIED.size < MAX_VERIFIED) {
    const batch = CANDIDATES.slice(candIdx, candIdx + CONC);
    try {
      await Promise.all(batch.map(verifyCandidate));
    } catch (e) {
      console.error('batch error', e);
    }
    // Advance exactly once per batch; advancing both here and in a loop
    // header would skip every other batch.
    candIdx += batch.length;

    // Checkpoint after every batch so a resume never re-fetches more than
    // the batch that was in flight.
    saveCheckpoint();
    console.log(' [CKPT] VERIF:' + VERIFIED.size + ' pending:' + (CANDIDATES.length - candIdx) + '\n');

    await sleep(3500); // throttle between batches to avoid IP ban
  }

  // ── PHASE 3: write CSV file ───────────────
  // The domain is the Map key; it is not stored on the entry object.
  const out = [...VERIFIED.entries()]
    .sort(([, a], [, b]) => (a.title || a.url || '').localeCompare(b.title || b.url || ''))
    .map(([domain, v]) => [csvField(v.url), csvField(v.title), csvField(domain), v.brands].join(','));

  fs.writeFileSync(CSV_FILE, ['url,title,domain,casino_brands_linked', ...out].join('\n'), 'utf8');

  console.log('\n═══ DONE: ' + VERIFIED.size + ' sites saved → ' + CSV_FILE + ' ═══\n');
})().catch((err) => {
  // Surface unexpected failures instead of leaving a floating rejection.
  console.error('fatal:', err);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user