// Files
// crawler/hybrid-v16.js
// 2026-06-26 14:30:45 +02:00
//
// 236 lines
// 17 KiB
// JavaScript

const { execFile } = require('child_process');
const fs = require('fs');
const BASE_URL = 'https://search.griffin.pm';
const CSV_OUTPUT = './casino_affiliate_sites.csv';
const CP_FILE = './.hybrid.json';
// ─══ Queries ═─────────────────────────────────────
// Search queries for the SearXNG collection phase: the main loop passes each
// entry, page by page, to apiSearch().
// NOTE(review): apart from the first entry, every query carries a long tail of
// loosely related filler words, and several end in what look like truncated
// fragments ("impres", "imp", "im"). This looks unintentional — confirm whether
// the tails should be trimmed down to the leading, meaningful keywords.
const Q_COLLECT = [
"best online casino review site",
"top casinos compared website list all compared rated listed tested analyzed evaluated assessed best good better excellent outstanding superior supreme magnificent splendid wonderful fantastic incredibly remarkable phenomenally prodigiously staggeringly astonishingly surprisingly unexpectedly extraordinarily impressively remarkably notably significantly considerably substantially materially essentially fundamentally primarily principally mainly mostly largely chiefs overwhelmingly preponderantly excessively extremely exceedingly highly incredible remarkably",
"online casino bonus comparison rated portal listed compiled gathered collected assembled curated selected hand-picked carefully meticulously thoroughly comprehensively exhaustively completely fully entirely wholesome integral inherently intrinsically essential fundamental substantial material considerable significant noticeable remarkable conspicuous visible apparent plain evident distinct perceptible tangible sensible observably discernible identifiable recognizable distinguishable telltale indicative demonstrative confirmatory corroborative supportive reinforcing bolstering buttressing fortifying strengthening empowering enabling facilitating aiding helping assisting promoting furthering advancing propelling driving pushing urging pressings",
"licensed gambling watchdog reviewed site portal highest greatest largest massive enormous gigantic huge immense vast expansive sweeping extensive comprehensive thorough detailed in-depth full complete entire whole broad wide far-reaching inclusive encompassing covering including containing comprising incorporating integrating blending fusing combining merging uniting joining linking connecting associating relating correlating corresponding matching similar alike comparable analogous parallel equivalent equal identical same uniform consistent",
"real money internet gambling casino USA reviewed portal list all compared ranked tested analyzed evaluated assessed best highest greatest largest massive enormous gigantic huge immense vast sweeping extensive comprehensive thorough detailed in-depth full complete entire whole broad far reaching inclusive covering including containing comprising incorporating integrating blending fusing combining merging uniting joining linking connecting associating relating correlating correspond matching similar",
"New Jersey licensed gambling site tested reviewed rate list best top highest greatest largest massive enormous gigantic huge immense vast sweeping extensive comprehensive thorough detailed in-depth full complete entire whole broad wide far-reaching inclusive encompassing covering including containing comprising incorporating integrating blending fusing combining merging uniting joining linking connecting associating relating correlating corresponding matching similar alike comparable analogous",
"UK online casino review websites GC lic compared tested analyzed best good better excellent outstanding superior supreme magnificent splendid wonderful fantastic incredibly remarkable phenomenally prodigiously staggeringly astonishingly surprising unexpectedly extraordinarily impressively remarkably notably significantly considerably substantially material essentially fundamentally primarily principally mainly mostly largely chief predominantly overwhelmingly preponderantly excessively extremely exceedingly highly impres",
"Irish recommended gambling website review portal ranked listed rated compared analyzed evaluated assessed best good better excellent outstanding superior supreme magnificent splendid wonderful fantastic incredibly remarkable phenomenally prodigiously staggeringly astonishingly surprising unexpectedly extraordinarily impressively remarkably notably significantly considerably substantially material essentially fundamental primarily principally mainly mostly chief predominantly overwhelmingly preponderantly excessively extremely exceedingly highly imp",
"Canadian online gambling review website listed rated compare tested analyzed evaluated assessed inspected examined scrutinized investigated researched studied explored probed delved searched scoured hunted tracked pursued chase followed monitored watched observe best good better excellent outstanding superior supreme magnificent splendid wonderful fantastic incredibly remarkable phenomenally prodigiously staggering astonishingly surprising unexpectedly extraordinarily impressed remarkably notably significantly considerably substantially material essentially fundamental primarily principally main mostly chief predominantly overwhelmingly preponderantly excessively extremely exceedingly highly imp",
"smaller independent niche gambling blog personal honest website opinion tested evaluate rank list analyzed compared contrast assessed inspect examine study explore probe delve search scour hunt track chase follow monitor watch observe best good better excellent outstanding superior supreme magnificent splendid wonderful fantastic incredibly remark phenomenally prodigiously staggering astonishingly surprising unexpectedly extraordinarily impressively remarkably notably significant considerably substantially materially essentially fundamentally primarily principally mainly mostly largely chiefly predominantly overwhelmingly preponderantly excessively extremely exceedingly highly im",
"trusted third party independent unbiased gambling watchdog verified licensed rated reviewed tested analyzed compared evaluated inspected examined scrutinized investigated researched studied explored probed delved searched scoured hunted tracked pursued chased followed monitored watched observed approve certifi accredite license regulate compliant safe secure protect defend safeguard shield screen guard watch monitor surve scout discover uncover unearth expose reveal disclose announce proclaim declared state affirm confirm validate verify authenticate substantiate corroborate support reinforce strengthen fortify bolster undergirt shore guarantee ensure security protection defense safeguard shield screen guard watch monitor survey scout spot discovery unearthing exposure revealed disclosed announced proclaimed declared stated affirmed confirmed validated verified authenticated substantiated corroborated supported reinforced strengthened fortified bolted secured fastened locked locked up sealed closed shut stopped blocked barred restricted limited confined contained bounded enclosed circumscribed surrounded encompass envelop wrap cover shield shelter protect guard defend safeguard secure preserve conserve save keep retain hold",
"igaming marketing agency partner affiliate network website compared directory best reviewed analyzed tested checked inspected examined evaluated assessed ranked compiled gathered collected assembled curated selected hand-picked carefully meticulously thoroughly comprehensively exhaustively completely fully entire wholesome integral inherent intrinsically essentially fundamentally substantially materially considerably significantly notably remarkably conspicuously noticeably visibly apparently obviously clearly plainly evidently distinctly perceptibly tangibly palpably sensibly observably discernibly identifiably recognizably distinguishably telltale indicatively",
"complete gambling review aggregator database platform comparison tool website portal list catalog registry register record ledger journal account report diary log chronicle historical historic historically history entry item line row column field data information detail particularity specificity explicitness precision exactness accuracy rightness proper appropriateness suitability fitness aptnes propriety decorum decency respectabil creditab estimab worthiness desirablenes acceptab preferabl recommendab advisab counsell suggested proposed offered presented supplied provided furnished equipped fitted appointed assigned designated selected elected chosen picked opted preferred favored liked enjoyed adored beloved cherished treasured valued prized esteemed regarded respected honored praised lauded commended applauded cheered celebrated acclaim hail glorify",
];
// ─ Domain pattern candidates (generate & check existence)
// Word fragments intended as leading parts of candidate domain names.
// NOTE(review): nothing in the visible part of this file reads this list — the
// enumeration phase builds its own local `prefixes` array; confirm whether the
// two are meant to be the same list.
const CASINO_PREFIXES = [
  // trust / ranking words
  'best',
  'top',
  'trusted',
  'safe',
  'verified',
  'rated',
  'reviewed',
  'licensed',
  'legal',
  // gambling vocabulary
  'casino',
  'gambl',
  'poker',
  'betting',
  'wagering',
  'real-money',
  'online-casino',
  // hyphen-terminated variants
  'casino-',
  'best-',
];
// ─ Casino keyword patterns ──
// Lower-case keyword fragments (mostly operator brand names).
// NOTE(review): not referenced anywhere in the visible part of this file —
// confirm it is still needed.
const CASINO_KW = [
  'casino', 'bet365', 'betfair', '888.', 'paddy power',
  'ladbrokes', 'williamhill', 'unibet', 'bwin', 'betway',
  '10bet', 'skyvegas', 'mrplay', 'bovada', 'ignition',
  'marathon', 'pinnacle', 'draftking', 'fanduel', 'betmgm',
  'caesar', 'barstool', 'leovegas', 'jackpotcity', 'royalpalace',
  'casumo', 'reddog', 'luckystrike', 'betonline', 'intertops',
  'chance.com', 'betsson', 'betclic', '22bet', '1xbet',
  'stake.', 'cloudbet', 'nitrogen', 'slotscash', 'azurcasino',
  'wildwest', 'jackpotjoy', 'grandtornado', 'betano', 'hardrock',
  'mrq', 'playojo',
];
// Domains that isSkip() filters out of the results. An entry is either a full
// host name ('youtube.com') or a name with a trailing dot ('wikipedia.').
const SKIP = [
  // social / video
  'youtube.com', 'youtu.be', 'reddit.com', 'facebook.com', 'twitter.com',
  'x.com', 'linkedin.com', 'tiktok.com',
  // reference, media, shopping
  'wikipedia.', 'pinterest.', 'instagram.', 'medium.', 'forbes.',
  'nytimes.', 'amazon.', 'ebay.', 'microsoft.', 'play.google.com',
  // archives, search engines, link shorteners
  'web.archive.org', 'duckduckgo.', 'startpage.', 'brave.com', 't.co',
  // image hosts, developer sites, platforms
  'imgur.', 'flickr.', 'github.', 'stackoverflow.', 'apple.com',
  'google.', 'gravatar.',
];
/**
 * Pause for a number of milliseconds.
 * @param {number} ms - Delay handed to setTimeout.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Extract a normalised host name from a URL.
 *
 * The host is lower-cased and a leading "www." is removed. Input that cannot be
 * parsed as a URL is returned lower-cased as-is; non-string input (for example a
 * search result with no `url` field) yields an empty string instead of throwing.
 *
 * @param {string} url - Absolute URL, or any string.
 * @returns {string} Normalised host name, the lower-cased input, or ''.
 */
function getDomain(url) {
  if (typeof url !== 'string') return '';
  try {
    const host = new URL(url).hostname.toLowerCase();
    return host.startsWith('www.') ? host.slice(4) : host;
  } catch {
    // Not a parseable URL: fall back to the raw text so the caller still gets a key.
    return url.toLowerCase();
  }
}
/**
 * Decide whether a domain is on the skip list.
 *
 * Entries are matched on label boundaries rather than as raw substrings, so
 * 't.co' no longer matches 'bet.com' and 'x.com' no longer matches 'netflix.com':
 *   - 'name.tld' matches that host and any of its subdomains;
 *   - 'name.' (trailing dot) matches the label `name` followed by any suffix,
 *     at the start of the host or right after a dot.
 *
 * @param {string} d - Host name to test.
 * @param {string[]} [skipList=SKIP] - Entries to match against.
 * @returns {boolean} true when the domain should be skipped.
 */
function isSkip(d, skipList = SKIP) {
  const host = String(d ?? '').toLowerCase();
  if (host === '') return false;
  return skipList.some((entry) => {
    if (entry.endsWith('.')) {
      return host.startsWith(entry) || host.includes(`.${entry}`);
    }
    return host === entry || host.endsWith(`.${entry}`);
  });
}
// SearXNG via curl
/**
 * Query the SearXNG JSON API through curl and return one page of results.
 *
 * Never rejects: a transport failure, empty body or unparseable JSON yields [].
 *
 * @param {string} query - Search terms.
 * @param {number} pg - 1-based result page.
 * @returns {Promise<object[]>} The `results` array of the response, or [].
 */
async function apiSearch(query, pg) {
  try {
    const ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4) AppleWebKit/605.1';
    // SearXNG's page parameter is `pageno`; an unknown name is ignored and
    // every request would return page 1.
    const url = `${BASE_URL}/search?q=${encodeURIComponent(query)}&format=json&pageno=${pg}&language=all`;
    return await new Promise((resolve) => {
      execFile('curl', ['-s', '-A', ua, '--max-time', '12', url], { timeout: 15000 }, (err, stdout) => {
        if (!stdout) {
          if (err) console.warn(`apiSearch: curl failed on page ${pg}: ${err.message}`);
          resolve([]);
          return;
        }
        try {
          const payload = JSON.parse(stdout);
          resolve(Array.isArray(payload.results) ? payload.results : []);
        } catch {
          // Body was not JSON (e.g. an HTML error page): treat it as no results.
          resolve([]);
        }
      });
    });
  } catch {
    // execFile threw synchronously; keep the best-effort contract.
    return [];
  }
}
// Heuristic classifier (no page fetch)
/**
 * Heuristic classifier (no page fetch): count how many gambling/affiliate
 * signal phrases occur in the URL and title.
 *
 * @param {string} url - Page or domain URL.
 * @param {string} [title=''] - Page title, if known.
 * @returns {number} Number of matching signals; 0 means no signal matched, so
 *   the result can be used directly as a truthy/falsy test.
 */
function classify(url, title = '') {
  const haystack = [url, title].join(' ').toLowerCase();
  const sigs = ['casin', 'gambl', 'best casino', 'top casino', 'compare casino', 'casino review',
    'online gambling', 'legal casino', 'trusted casino', 'real money casino', 'casino bonus', 'casino site',
    'casino rating', 'casino list', 'casino guide', 'poker', 'betting', 'wager', 'affiliat'];
  let score = 0;
  for (const sig of sigs) {
    if (haystack.includes(sig)) score++;
  }
  // The score was previously computed but never returned, so callers always saw undefined.
  return score;
}
// Check domain existence via DNS (dig → A record)
/**
 * Check whether a domain resolves to at least one IPv4 address, using
 * `dig +short <domain> A`.
 *
 * Only output lines that look like an IPv4 address count. Other non-empty
 * output — dig diagnostics beginning with ";;" or a bare CNAME target — is not
 * treated as proof that the domain exists.
 *
 * @param {string} domain - Host name to look up.
 * @returns {Promise<boolean>} true when an A record was returned; false otherwise,
 *   including when dig fails or times out.
 */
async function checkDomain(domain) {
  const ipv4Line = /^\d{1,3}(\.\d{1,3}){3}$/;
  try {
    const output = await new Promise((resolve) => {
      execFile('dig', ['+short', '+time=2', '+tries=1', domain, 'A'], { timeout: 5000 },
        (_err, stdout) => resolve(typeof stdout === 'string' ? stdout : ''));
    });
    return output.split('\n').some((line) => ipv4Line.test(line.trim()));
  } catch {
    // execFile threw synchronously; report the domain as not found.
    return false;
  }
}
/**
 * Persist the checkpoint object as JSON at CP_FILE.
 *
 * The data is written to a sibling temp file and then renamed over CP_FILE, so
 * an interrupted write cannot leave a half-written checkpoint behind.
 *
 * @param {object} data - Checkpoint state; must be JSON-serialisable.
 */
function saveCP(data) {
  const tmpFile = `${CP_FILE}.tmp`;
  fs.writeFileSync(tmpFile, JSON.stringify(data));
  fs.renameSync(tmpFile, CP_FILE);
}
/**
 * Load the checkpoint from CP_FILE.
 *
 * A missing file is the normal first-run case and is silent. An unreadable
 * file, invalid JSON, or JSON that is not an object is reported with a warning.
 *
 * @returns {object|null} The parsed checkpoint object, or null when there is
 *   no usable checkpoint.
 */
function loadCP() {
  let raw;
  try {
    raw = fs.readFileSync(CP_FILE, 'utf8');
  } catch (err) {
    if (err.code !== 'ENOENT') {
      console.warn(`Cannot read checkpoint ${CP_FILE}: ${err.message}`);
    }
    return null;
  }
  try {
    const parsed = JSON.parse(raw);
    if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
      return parsed;
    }
    console.warn(`Checkpoint ${CP_FILE} does not contain an object; ignoring it`);
  } catch (err) {
    console.warn(`Checkpoint ${CP_FILE} is not valid JSON; ignoring it: ${err.message}`);
  }
  return null;
}
// ═══════ MAIN ═══════
// Entry point. Phase A collects domains from SearXNG result pages; phase B
// generates candidate domain names and keeps those that resolve in DNS and pass
// classify(); finally every known domain is written to CSV_OUTPUT.
// Progress is checkpointed through saveCP() so an interrupted run can resume.
(async () => {
  console.log('═══ Hybrid Crawler — SearXNG + domain enumeration ═══\n');

  let cp = loadCP();
  if (!cp) {
    cp = { searxngPhase: 1, qi: 0, pg: 1, enumPhase: 0, results: [], verified: [] };
    console.log('Fresh\n');
  } else {
    const known = new Set((cp.results || []).map((r) => r.domain));
    console.log(`Resume: phase=${cp.searxngPhase ? 'sx' : 'enum'} sxDone=${cp.sxDomainCount || 0} enumDone=${cp.enumPhase || 0} all=${known.size}\n`);
  }

  // domain → {url, title, domain}; seeded from the checkpoint.
  const seenDomains = new Map();
  for (const r of cp.results || []) seenDomains.set(r.domain, r);

  // Store the collected domains with every checkpoint; otherwise a resume would
  // continue at the saved position with an empty result set.
  const checkpoint = () => {
    cp.results = Array.from(seenDomains.values());
    saveCP(cp);
  };

  // ── Phase A: Collect from SearXNG ────────────────
  if (cp.searxngPhase) {
    const firstQuery = cp.qi || 0;
    if (firstQuery < Q_COLLECT.length) console.log('▶ SearXNG collection phase\n');
    for (let qi = firstQuery; qi < Q_COLLECT.length; qi++) {
      // Only the query that was in progress resumes mid-way; later ones start at page 1.
      const startPg = qi === firstQuery ? (cp.pg || 1) : 1;
      for (let pg = startPg; pg <= 20; pg++) {
        const results = await apiSearch(Q_COLLECT[qi], pg);
        if (!results.length) break;
        let newCount = 0;
        for (const r of results) {
          if (!r || typeof r.url !== 'string') continue; // malformed result entry
          const d = getDomain(r.url);
          if (!d || seenDomains.has(d) || isSkip(d)) continue;
          seenDomains.set(d, { url: r.url, title: String(r.title || '').substring(0, 300), domain: d });
          newCount++;
        }
        cp.qi = qi;
        cp.pg = pg + 1;
        cp.sxDomainCount = seenDomains.size;
        checkpoint();
        if (pg <= 2 || pg % 5 === 0) console.log(`[q${qi} p${pg}] ${seenDomains.size} domains`, newCount > 0 ? `(+${newCount})` : '');
        await sleep(1800);
      }
      cp.qi = qi + 1;
      cp.pg = 1;
      checkpoint();
      await sleep(3500);
    }
    cp.searxngPhase = false;
    checkpoint();
    console.log(`\nSearXNG done: ${seenDomains.size} domains\n`);
  }

  // ── Phase B: Domain enumeration ────────────────
  const prefixes = [
    'best', 'top', 'trusted', 'verified', 'rated', 'reviewed', 'legal', 'licensed',
    'casino', 'gamble', 'poker', 'betting', 'wagering', 'real-money', 'online-casino',
    'casino-', 'best-',
  ];
  // Generate candidate domains: prefix + casino word + tld
  const TLD = ['.com', '.co', '.net', '.org', '.io'];
  const CASINO_WORDS = ['online-casino', 'casinos', 'casinoreviews', 'casino-review', 'top-casinos',
    'best-casino', 'casino-guide', 'casino-rates', 'casino-ranking', 'casino-list',
    'casino-rankings', 'casino-reviews', 'casino-rating', 'casino-directory', 'casino-hub',
    'casino-world', 'casino-bonus', 'casino-compare', 'casino-top', 'casino-listed',
    'gambling-sites', 'gambling-guide', 'gambling-review', 'gambling-rates', 'gambling-hub',
    'online-casinos', 'casinoguru', 'casino-guru', 'casino-portal', 'casino-news',
  ];
  // Candidates are DNS-checked in batches of BATCH_SIZE.
  const BATCH_SIZE = 200;
  const allCandidates = [];
  for (const prefix of prefixes) {
    for (const word of CASINO_WORDS) {
      for (const ext of TLD) {
        allCandidates.push(`${prefix}${word}${ext}`);
      }
    }
  }
  // Also try: {number}-best-{casino-word}.com and top-{number}-{casino-word}.com
  for (let n = 1; n <= 50; n++) {
    for (const word of CASINO_WORDS.slice(0, 10)) {
      allCandidates.push(`${n}-best-${word}.com`);
      allCandidates.push(`top-${n}-${word}.com`);
    }
  }
  console.log(`▶ Domain enumeration: ${allCandidates.length} candidates\n`);

  // DNS-check one batch; domains that resolve and pass classify() are recorded.
  const batchCheck = async (startIdx) => {
    const batch = allCandidates.slice(startIdx, startIdx + BATCH_SIZE);
    await Promise.all(batch.map(async (domain) => {
      if (seenDomains.has(domain) || isSkip(domain)) return;
      try {
        const exists = await checkDomain(domain);
        // Quick heuristic check — does the domain name look like a casino affiliate?
        if (exists && classify(`https://${domain}`, '')) {
          seenDomains.set(domain, { url: `https://${domain}`, title: domain, domain });
        }
      } catch (err) {
        // One failing lookup must not abort the whole batch.
        console.warn(`Check failed for ${domain}: ${err.message}`);
      }
    }));
  };

  // Run up to CONC batches at once (3 * 200 = 600 DNS checks in flight).
  const CONC = 3;
  // cp.enumPhase counts completed batches, so a resume starts at the first unchecked one.
  for (let i = (cp.enumPhase || 0) * BATCH_SIZE; i < allCandidates.length; i += CONC * BATCH_SIZE) {
    const batches = [];
    for (let c = 0; c < CONC; c++) {
      const start = i + c * BATCH_SIZE;
      if (start < allCandidates.length) batches.push(batchCheck(start));
    }
    await Promise.all(batches);
    cp.enumPhase = (cp.enumPhase || 0) + batches.length;
    checkpoint();
    const done = Math.min(i + CONC * BATCH_SIZE, allCandidates.length);
    console.log(`Enum: ${done}/${allCandidates.length} → ${seenDomains.size} domains so far`);
    await sleep(500); // throttle between batches
  }
  console.log(`\nEnumeration done: ${seenDomains.size} total domains\n`);

  // ── Write CSV ────────
  // Quote every field, double embedded quotes and flatten line breaks so each
  // record stays on one line.
  const csvField = (value) => `"${String(value ?? '').replace(/[\r\n]+/g, ' ').replace(/"/g, '""')}"`;
  const finalData = Array.from(seenDomains.values())
    .sort((a, b) => String(a.domain).localeCompare(String(b.domain)));
  const header = 'url,title,domain';
  const rows = finalData.map((r) => [r.url, r.title, r.domain].map(csvField).join(','));
  fs.writeFileSync(CSV_OUTPUT, [header, ...rows].join('\n'), 'utf8');
  console.log(`══════ ${finalData.length} ${CSV_OUTPUT} ════`);
})().catch((err) => {
  // Report the failure and signal it through the exit code.
  console.error('Crawler failed:', err);
  process.exitCode = 1;
});