Initial commit
This commit is contained in:
@@ -0,0 +1,209 @@
|
||||
'use strict';
|
||||
const { execFile } = require('child_process');
|
||||
const fs = require('fs');
|
||||
// ── Crawler configuration ──────────────────────────────────────────────

/** Path of the CSV report written at the end of the run. */
const OUTFILE = './casino_affiliate_1000.csv';

/** User-Agent string passed to curl via `-A` for every request. */
const UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4) AppleWebKit/605.1.5 Version/17 Safari/605.1';

/** Brand keywords for link detection (matched as substrings by countCasinoRefs). */
const CKW = [
  'bet365', '888casino', 'pokerstars', 'partypoker', 'unibet', 'bwin',
  'betway', 'williamhill', 'ladbrokes', 'betfair', 'draftkings', 'fanduel',
  'betmgm', 'caesars', 'barstool', 'leovegas', 'bovada', 'ignitioncasino',
  'jackpotcity', '10bet', '22bet', 'paddypower', '1xbet', 'coral',
  '/casino/',
];

/** Hostname fragments; parseLinks drops any outbound domain containing one of them. */
const SKIPD = [
  'youtube.', 'facebook.', 'twitter', '.instagram.',
  'linkedin.', 'tiktok.', 'wikipedia.', 'pinterest', 'medium.', 'forbes.',
  'google.', 'amazon.', 'static.', 'cdn.', 'fonts.googleapis.', 'flickr.', 'github.com', 'duckduckgo',
];

/** Seed portals crawled in phase 1; their outbound links become candidates. */
const SEEDS = [
  'https://www.casino.org/reviews/', // lots of casino brand outbound links
  'https://casino.guru/casino-reviews', // returns many regional subdomains
  'https://chipy.com/casinos',
  // TODO: five more high-yield seeds were planned (to be derived from earlier
  // search data); none are defined in this file yet.
];
|
||||
|
||||
/**
 * Pause for a while; used to throttle requests between crawls.
 *
 * @param {number} ms - Delay in milliseconds before the promise settles.
 * @returns {Promise<undefined>} Promise that fulfils (with no value) once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
||||
|
||||
/**
 * Fetch a page by spawning the `curl` binary (no shell involved).
 *
 * curl is invoked with `-sL`, `--max-time 8`, the shared `UA` string via `-A`
 * and `--max-filesize 40000`; the child process itself is given a 12 s timeout.
 * This is a best-effort fetch: any error reported by execFile is ignored and
 * the promise never rejects.
 *
 * @param {string} url - Address to download.
 * @returns {Promise<string>} Whatever curl wrote to stdout, or '' when nothing was captured.
 */
function curlPage(url) {
  const curlArgs = ['-sL', '--max-time', '8', '-A', UA, '--max-filesize', '40000', url];
  const spawnOptions = { timeout: 12e3 };

  return new Promise((resolve) => {
    execFile('curl', curlArgs, spawnOptions, (_err, stdout) => {
      // Errors are deliberately dropped; callers treat an empty body as a failed fetch.
      resolve(stdout || '');
    });
  });
}
|
||||
|
||||
/**
 * Extract the distinct outbound domains and the page title from an HTML string.
 *
 * `<script>` and `<style>` elements are removed first so that URLs embedded in
 * code or CSS are not picked up. Only absolute http(s) `href` values are
 * considered. A single leading `www.` label is removed from each hostname.
 *
 * @param {string} html - Raw page markup. Non-string or empty input yields an empty result.
 * @param {string} skipHost - The page's own domain (without `www.`); links to it are ignored.
 * @returns {{doms: string[], title: string}} Unique domains in first-seen order, and the
 *   trimmed `<title>` text ('' unless the title is 10–250 characters long).
 */
function parseLinks(html, skipHost) {
  if (typeof html !== 'string' || html.length === 0) {
    return { doms: [], title: '' };
  }

  // Strip scripts/styles first.
  const cleaned = html
    .replace(/<script[\s\S]*?<\/script>/gi, '')
    .replace(/<style[\s\S]*?<\/style>/gi, '');

  const domains = new Set();
  // The pattern only admits http:// and https:// URLs, so no further protocol check is needed.
  for (const match of cleaned.matchAll(/href=["'](https?:\/\/[^"'\s>]+)["']/gi)) {
    let hostname;
    try {
      hostname = new URL(match[1]).hostname;
    } catch {
      continue; // ignore broken URLs
    }

    // Anchored so that only a leading "www." label is removed; an unanchored
    // replace would corrupt hosts such as "newwww.site.com".
    const domain = hostname.replace(/^www\./, '');

    // Skip same host + known non-relevant domains.
    if (domain === skipHost || SKIPD.some((fragment) => domain.includes(fragment))) continue;
    domains.add(domain);
  }

  const titleMatch = cleaned.match(/<title[^>]*>([^<]{10,250})<\/title>/i);
  const title = titleMatch ? titleMatch[1].trim() : '';

  return { doms: [...domains], title };
}
|
||||
|
||||
/**
 * Count how many distinct entries in a domain list look casino-related.
 *
 * An entry is a hit when, compared case-insensitively, it contains 'casino',
 * '/casin' or '.bet', or any keyword from `CKW` (tried both verbatim and with
 * its slashes removed). Hits are deduplicated on the lower-cased entry, so the
 * same domain in different letter case is counted once. Non-string entries
 * are ignored.
 *
 * @param {Iterable<string>} dList - Domains (or URL-like strings) to inspect.
 * @returns {number} Number of distinct matching entries.
 */
function countCasinoRefs(dList) {
  // Normalise the keyword list once instead of once per domain.
  const needles = [];
  for (const kw of CKW) {
    if (kw.length < 3) continue; // ignore short keywords that match too often
    const lowered = kw.toLowerCase();
    needles.push(lowered);
    // Path-style keywords such as '/casino/' should also match bare hostnames.
    const bare = lowered.replace(/\//g, '');
    if (bare !== lowered && bare.length >= 3) needles.push(bare);
  }

  const hits = new Set();
  for (const entry of dList || []) {
    if (typeof entry !== 'string') continue;
    const host = entry.toLowerCase();

    const looksGeneric = host.includes('casino') || host.includes('/casin') || host.includes('.bet');
    if (looksGeneric || needles.some((needle) => host.includes(needle))) {
      hits.add(host);
    }
  }
  return hits.size;
}
|
||||
|
||||
/**
 * Phase 1: fetch each seed portal, register it as a verified site and collect
 * its outbound domains as candidates for phase 2.
 *
 * Seeds are fetched one at a time with a pause between requests. A seed whose
 * URL cannot be parsed, or whose response is empty or shorter than 400
 * characters, is skipped.
 *
 * @param {string[]} seedList - Seed page URLs.
 * @param {Object<string, {url: string, title: string, brands: number}>} allSites -
 *   Verified sites keyed by domain; mutated in place.
 * @param {Set<string>} candidateSet - Pending candidate domains; mutated in place.
 * @returns {Promise<void>}
 */
async function crawlSeeds(seedList, allSites, candidateSet) {
  console.log('▶ Phase-1: crawling seed portals\n');

  for (const seedUrl of seedList) {
    console.log(`[${Object.keys(allSites).length + 1}] ${seedUrl}`);

    let seedHost;
    try {
      // Anchored so only a leading "www." label is removed.
      seedHost = new URL(seedUrl).hostname.replace(/^www\./, '');
    } catch {
      console.log(' [invalid seed URL, skipped]\n');
      continue;
    }

    const html = await curlPage(seedUrl);
    if (!html || html.length < 400) {
      console.log(' [failed/empty]\n');
      await sleep(3e3);
      continue;
    }

    // Add the seed itself as a verified site; 99 is the placeholder brand
    // count used when no better figure is available.
    allSites[seedHost] = { url: seedUrl, title: '', brands: 99 };

    try {
      const data = parseLinks(html, seedHost);
      console.log(` → ${data.doms.length} outbound domains\n`);

      // Count brands this seed page links to; counts of 5 or fewer keep the placeholder.
      const brandCount = countCasinoRefs(data.doms);
      allSites[seedHost] = {
        url: seedUrl,
        title: data.title || seedHost,
        brands: brandCount > 5 ? brandCount : 99,
      };

      for (const outbound of data.doms) candidateSet.add(outbound);
    } catch (e) {
      // The seed stays registered with its placeholder entry.
      console.log(' [parse error]', e.message);
    }

    await sleep(2500); // throttle between seed crawls
  }

  console.log(`\n✓ Seeds done. Found ${candidateSet.size} candidate domains.\n`);
} // crawlSeeds()
|
||||
|
||||
/**
 * Fetch and score a single candidate domain (helper for verifyCandidates).
 *
 * HTTPS is tried first, then HTTP; the first protocol that returns a usable
 * body (at least 400 characters) decides the outcome.
 *
 * Scoring: linkScore is min(outboundDomains / 2, 8) when the page links to
 * more than 10 distinct domains, otherwise 0; finalScore is
 * brandCount + 1.5 * linkScore. A site is accepted when finalScore >= 7 or
 * brandCount >= 4. In effect every page with more than 10 outbound domains is
 * accepted, because its link score alone contributes at least 8.25.
 *
 * @param {string} dom - Candidate domain without protocol.
 * @returns {Promise<{ok: boolean, dom: string, accepted: boolean, brandCount: number,
 *   entry: ?{url: string, title: string, brands: number}, outbound: string[]}>}
 *   `ok` tells whether any protocol returned a usable page.
 */
async function probeCandidate(dom) {
  for (const proto of ['https://', 'http://']) {
    try {
      const html = await curlPage(proto + dom);
      if (!html || html.length < 400) continue; // skip empty / blocked responses

      // curlPage returns only the response body, so the post-redirect URL is
      // not available here; record the normalised origin that was requested.
      const origin = new URL(proto + dom).origin;
      const selfHost = new URL(origin).hostname.replace(/^www\./, '');

      const parsed = parseLinks(html, selfHost || dom);
      const brandCount = countCasinoRefs(parsed.doms);
      const outboundCount = parsed.doms.length;
      const linkScore = outboundCount > 10 ? Math.min(outboundCount / 2, 8) : 0;
      const finalScore = brandCount + linkScore * 1.5; // weight volume more
      const accepted = finalScore >= 7 || brandCount >= 4;

      return {
        ok: true,
        dom,
        accepted,
        brandCount,
        entry: accepted
          ? { url: origin, title: parsed.title || dom, brands: Math.round(finalScore) }
          : null,
        outbound: parsed.doms,
      };
    } catch (e) {
      // Best effort: this attempt failed, fall through to the next protocol.
    }
  }
  return { ok: false, dom, accepted: false, brandCount: 0, entry: null, outbound: [] };
}

/**
 * Phase 2: verify candidate domains in throttled batches and record the
 * accepted ones in `allSites`.
 *
 * Candidates are taken from `candidateSet` in insertion order and removed from
 * it as they are picked, so the set always holds the still-pending pool.
 * Outbound domains of every accepted site are queued as new candidates unless
 * they were already checked in this call or are already verified. The loop
 * ends when the pool is empty or `allSites` reaches `maxSites` entries; a
 * batch that is already in flight may push the total slightly past the cap.
 *
 * @param {Set<string>} candidateSet - Pending candidate domains; drained in place.
 * @param {Object<string, {url: string, title: string, brands: number}>} allSites -
 *   Verified sites keyed by domain; mutated in place.
 * @param {{maxSites?: number, concurrency?: number, batchDelayMs?: number}} [options]
 *   Optional tuning: stop threshold (default 1300), parallel fetches per batch
 *   (default 4) and pause between batches in milliseconds (default 4000).
 * @returns {Promise<number>} Number of candidate domains that were checked.
 */
async function verifyCandidates(candidateSet, allSites, options = {}) {
  const { maxSites = 1300, concurrency = 4, batchDelayMs = 4e3 } = options;
  const CHECKPOINT_EVERY = 15; // log progress each time this many new sites were verified

  console.log('▶ Phase-2: verifying candidate sites...\n');

  // hasOwnProperty guards against hostnames that collide with Object.prototype keys.
  const isVerified = (dom) => Object.prototype.hasOwnProperty.call(allSites, dom);
  const siteCount = () => Object.keys(allSites).length;

  const checked = new Set(); // domains already probed during this call
  let checkedTotal = 0;
  let lastCheckpoint = siteCount();

  while (candidateSet.size > 0 && siteCount() < maxSites) {
    // Pull the next batch. Deleting the current element while iterating a Set is safe.
    const batch = [];
    for (const dom of candidateSet) {
      candidateSet.delete(dom);
      if (checked.has(dom) || isVerified(dom)) continue; // prune known domains
      checked.add(dom);
      batch.push(dom);
      if (batch.length >= concurrency) break;
    }
    if (batch.length === 0) continue; // pool held only known domains and is now empty

    try {
      // Fetch all candidates in this batch concurrently.
      const results = await Promise.all(batch.map((dom) => probeCandidate(dom)));
      checkedTotal += results.length;

      for (const result of results) {
        if (!result.accepted) continue;

        allSites[result.dom] = result.entry;
        console.log(` ✓ ${siteCount()}: ${result.dom} (${result.brandCount} brands)`);

        // Queue this site's outbound links as new candidates too.
        for (const outbound of result.outbound) {
          if (!checked.has(outbound) && !isVerified(outbound)) candidateSet.add(outbound);
        }
      }
    } catch (e) {
      console.error('batch error', e);
      checkedTotal += batch.length;
    }

    if (siteCount() - lastCheckpoint >= CHECKPOINT_EVERY) {
      lastCheckpoint = siteCount();
      console.log(`\n [CHECKPOINT] Verified:${siteCount()} remaining:${candidateSet.size}`);
    }

    // Throttle between batches to avoid IP bans; no need to wait after the last one.
    if (candidateSet.size > 0) await sleep(batchDelayMs);
  }

  return checkedTotal;
} // verifyCandidates()
|
||||
|
||||
// ══════ MAIN ═════==
|
||||
/**
 * Entry point: optionally restore state from './crawlsave.json', run both
 * crawl phases and write the verified sites to OUTFILE as CSV
 * (columns: url, title, domain, casino_brands_linked), sorted by title.
 *
 * NOTE(review): the checkpoint file is only ever read in this block; nothing
 * visible here writes it.
 */
(async function main() {
  console.log('═══ Casino Affiliate Crawler v4 ═══\n');

  const allSites = {}; // domain -> {url, title, brands}
  const candidates = new Set(); // pending candidate domains needing verification

  const checkpointFile = './crawlsave.json';
  if (fs.existsSync(checkpointFile)) {
    try {
      const cp = JSON.parse(fs.readFileSync(checkpointFile, 'utf8'));
      if (cp.allSites) Object.assign(allSites, cp.allSites);
      if (Array.isArray(cp.candidates)) {
        for (const c of cp.candidates) candidates.add(c);
      }
      console.log('Loaded checkpoint:', Object.keys(allSites).length, 'sites,', candidates.size, 'pending\n');
    } catch (e) {
      console.log('bad checkpoint, fresh start');
    }
  } else {
    console.log('No checkpoint, starting fresh...\n');
  }

  await crawlSeeds(SEEDS, allSites, candidates); // phase-1: scrape high-value review portals for seeds+outbound links
  await verifyCandidates(candidates, allSites); // phase-2: recursively verify those outbound domains

  // ── Write CSV file ─────
  // Quote every text field, double embedded quotes and flatten whitespace
  // (titles may contain newlines) so that each site stays on one row.
  const csvField = (value) => `"${String(value ?? '').replace(/\s+/g, ' ').trim().replace(/"/g, '""')}"`;

  const hdr = 'url,title,domain,casino_brands_linked';
  const sortedEntries = Object.entries(allSites).sort((a, b) =>
    // Entries restored from a checkpoint may lack a title.
    (a[1].title || '').localeCompare(b[1].title || ''));

  const rows = sortedEntries.map(([dom, entry]) => {
    // Computed separately: inline, `+` binds tighter than `===`/`?:` and the
    // whole row collapsed to 0.
    const brands = typeof entry.brands === 'number' ? entry.brands : 0;
    return [csvField(entry.url), csvField(entry.title || dom), csvField(dom), brands].join(',');
  });

  fs.writeFileSync(OUTFILE, [hdr, ...rows].join('\n'), 'utf8');

  console.log('\n═══ DONE: Saved ' + Object.keys(allSites).length + ' sites →', OUTFILE, '\n');
})().catch((err) => {
  // Surface unexpected failures instead of leaving an unhandled rejection.
  console.error('fatal:', err);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user