Files
crawler/final-crawler.js
2026-06-26 14:30:45 +02:00

225 lines
10 KiB
JavaScript

#!/usr/bin/env node
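/**
* Casino affiliate site crawler: SearXNG search collection + massive DNS
* enumeration of generated candidate domains, written out as a CSV.
* (The transitive outbound extraction mentioned in earlier notes is not
* implemented in this script.)
*/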
const {execFile} = require('child_process');
const fs = require('fs');
const BASE_URL = 'https://search.griffin.pm';
const CSV_OUT = './casino_affiliate_sites.csv';
const CP_FILE = './.final.json';
/**
 * Pause for a number of milliseconds.
 *
 * @param {number} ms - Delay passed to setTimeout.
 * @returns {Promise<void>} Promise that resolves when the timer fires.
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Extract a normalized host name from a URL-like string.
 *
 * The host is lower-cased and a leading "www." is removed. Input that the
 * WHATWG URL parser rejects (typically a bare "host/path" with no scheme) is
 * handled by a best-effort fallback that strips an http(s) scheme and
 * everything from the first "/", "?" or "#".
 *
 * @param {string} url - Absolute URL or bare host/path string.
 * @returns {string} Normalized host, or '' when `url` is not a string.
 */
function getDomain(url) {
  // Search results without a usable `url` must not throw from inside the
  // fallback branch and abort the whole crawl.
  if (typeof url !== 'string') return '';

  const stripWww = (host) => (host.startsWith('www.') ? host.slice(4) : host);

  try {
    return stripWww(new URL(url).hostname.toLowerCase());
  } catch {
    const host = url
      .trim()
      .toLowerCase()
      .replace(/^https?:\/\//, '')
      .split(/[/?#]/)[0];
    // Apply the same "www." normalization as the parsed branch so that both
    // branches produce the same key for the same site.
    return stripWww(host);
  }
}
/**
 * Substrings that mark a domain as not worth collecting (social networks,
 * search engines, CDNs, large general-purpose sites). Matched with a plain
 * substring test against the domain by isSkip().
 *
 * NOTE(review): '.com/x.' contains a slash; if it is only ever compared with
 * host names it cannot match — confirm whether it is still needed.
 */
const SKIP_PAT = [
  // Video / social
  'youtube.',
  'youtu.be',
  'reddit.',
  'facebook.',
  'twitter',
  '.com/x.',
  'linkedin.',
  'tiktok.',
  // Reference / publishing
  'wikipedia.',
  'pinterest.',
  'instagram.',
  'medium.com',
  'forbes.',
  'nytimes.',
  // Commerce / search
  'amazon.',
  'ebay.',
  'google.',
  '.play.google.',
  'web.archive.org',
  'duckduckgo.',
  'startpage.',
  // Asset hosts / developer sites
  'gravatar.',
  'cdnjs.cloudflare.',
  'fonts.googleapis.',
  'ajax.googleapis.',
  'github.',
  'stackover',
];
// ─── Generate 15,000+ candidate domains via combinatorial patterns ───
/**
 * Build the list of candidate domain names to probe via DNS.
 *
 * Four naming patterns are combined, in this order, and de-duplicated while
 * keeping first-seen order:
 *   1. {prefix}-{casinoWord}{tld}
 *   2. {prefix}-casino-{suffix}{tld}
 *   3. {n}-best-{casinoWord}{tld} and top-{n}-{casinoWord}{tld} for n = 1..50,
 *      using only the first 15 casino words and .com/.net/.org
 *   4. a few fixed names for every TLD
 *
 * @returns {string[]} Unique candidate domains.
 */
function genCandidates() {
  const prefixes = [
    'best', 'top', 'trusted', 'verified', 'rated', 'legal', 'licensed', 'pro',
    'ultimate', 'exclusive', 'premium', 'mega', 'super', 'ultra', 'fast', 'safe', 'true',
    'global', 'worldwide', 'prime', 'elite', 'chosen', 'select', 'choice', 'first',
    'number1', 'nr1', 'no1', 'one', 'theonly', 'themost', 'hottest', 'coolest', 'latest',
  ];
  const casinoWords = [
    'casinoreviews', 'casino-reviews', 'online-casinos', 'best-casinos', 'top-casinos',
    'casino-guide', 'casino-ratings', 'casino-ranking', 'casino-directory', 'casino-list',
    'casino-hub', 'casino-bonus', 'casino-compare', 'casino-rated', 'new-casinos',
    'safe-casino', 'safecasino', 'realmoney-casino', 'gambling-sites', 'gambling-guide',
    'gambling-review', 'gambling-ratings', 'online-gambling', 'real-casino',
    'trusted-casinos', 'compare-casinos', 'casinobonus', 'casinooffers', 'slot-casino',
    'live-casino', 'mobile-casino', 'crypto-casino', 'bitcoin-casino', 'bestcasino',
    'topcasino', 'mycasino', 'yourcasino', 'playcasino', 'win-at-casino',
  ];
  const tlds = ['.com', '.net', '.org', '.info', '.site', '.xyz', '.club', '.top', '.online', '.co'];
  const casinoSuffixes = ['online', 'best', 'top', 'hub', 'world', 'list'];
  const numberedTlds = ['.com', '.net', '.org'];
  const numberedWords = casinoWords.slice(0, 15);
  const fixedNames = ['casinoreviews', 'online-casinoreviews', 'safecasinos', 'bestcasinobonus'];
  const numbers = Array.from({ length: 50 }, (_, i) => String(i + 1));

  const seen = new Set();

  // Pattern 1: {prefix}-{casinoWord}{tld}
  prefixes.forEach((prefix) => {
    casinoWords.forEach((word) => {
      tlds.forEach((tld) => seen.add(`${prefix}-${word}${tld}`));
    });
  });

  // Pattern 2: {prefix}-casino-{suffix}{tld}
  prefixes.forEach((prefix) => {
    casinoSuffixes.forEach((suffix) => {
      tlds.forEach((tld) => seen.add(`${prefix}-casino-${suffix}${tld}`));
    });
  });

  // Pattern 3: numbered lists, e.g. 7-best-casino-guide.net / top-7-casino-guide.net
  numbers.forEach((n) => {
    numberedWords.forEach((word) => {
      numberedTlds.forEach((tld) => {
        seen.add(`${n}-best-${word}${tld}`);
        seen.add(`top-${n}-${word}${tld}`);
      });
    });
  });

  // Pattern 4: fixed names on every TLD
  tlds.forEach((tld) => {
    fixedNames.forEach((name) => seen.add(`${name}${tld}`));
  });

  // Every generated name is non-empty and ends in a TLD, so nothing needs
  // filtering out here.
  return Array.from(seen);
}
// ─── DNS check via dig ──────────────────────
/**
 * Check whether a domain resolves to at least one IPv4 address, using `dig`.
 *
 * Runs `dig +short +time=1 +tries=1 <domain> A` and inspects its output line
 * by line. Only lines that look like an IPv4 address count as a hit: `+short`
 * output can also contain CNAME targets and ";;" diagnostic lines, neither of
 * which proves that an A record exists.
 *
 * @param {string} domain - Host name to resolve.
 * @returns {Promise<boolean>} true when dig printed at least one IPv4
 *   address; false otherwise, including when dig fails or times out.
 */
async function dnsCheck(domain) {
  const ipv4Line = /^(?:\d{1,3}\.){3}\d{1,3}$/;
  try {
    const stdout = await new Promise((resolve) => {
      execFile(
        'dig',
        ['+short', '+time=1', '+tries=1', domain, 'A'],
        { timeout: 2500 },
        // Errors (missing binary, timeout, non-zero exit) are treated as
        // "no answer": resolve with whatever output there is.
        (_err, out) => resolve(out || ''),
      );
    });
    return String(stdout)
      .split('\n')
      .some((line) => ipv4Line.test(line.trim()));
  } catch {
    // execFile can throw synchronously on invalid arguments.
    return false;
  }
}
// ─── SearXNG search via curl ────────────────
/**
 * Run one search against the SearXNG JSON API (via `curl`) and return the
 * raw result objects.
 *
 * The page is sent as `pageno`, which is the parameter name the SearXNG
 * search API defines; an unrecognized name would make every request return
 * the first page.
 *
 * @param {string} query - Search terms.
 * @param {number} [pg=1] - 1-based result page.
 * @returns {Promise<object[]>} The `results` array of the response, or []
 *   when curl fails, the body is empty or not JSON, or `results` is missing.
 */
async function apiSearch(query, pg = 1) {
  const ua = 'Mozilla/5.0 (Macintosh;Intel Mac OS X 14_4)AppleWebKit/605.1';
  try {
    const params = new URLSearchParams({
      q: String(query),
      format: 'json',
      pageno: String(pg),
      language: 'all',
    });
    const url = `${BASE_URL}/search?${params}`;

    return await new Promise((resolve) => {
      execFile(
        'curl',
        ['-s', '-A', ua, '--max-time', '12', url],
        { timeout: 15000 },
        (_err, stdout) => {
          if (!stdout) {
            resolve([]);
            return;
          }
          try {
            const data = JSON.parse(stdout);
            resolve(Array.isArray(data?.results) ? data.results : []);
          } catch {
            // Non-JSON body (HTML error page, rate-limit notice, ...).
            resolve([]);
          }
        },
      );
    });
  } catch {
    // Synchronous failure while building the request or spawning curl.
    return [];
  }
}
// ─── Classify result as casino affiliate ────
/**
 * Heuristically decide whether a search result looks like a casino
 * affiliate / review site.
 *
 * Scoring: +1 for every keyword found anywhere in the lower-cased
 * "url title content" text, +3 when the domain contains a gambling term,
 * +2 when it contains a review-type term. A total of 4 or more qualifies.
 *
 * @param {string} url - Result URL.
 * @param {string} [title] - Result title.
 * @param {string} [content] - Result snippet.
 * @returns {boolean} true when the score reaches the threshold.
 */
function isAffiliate(url, title, content) {
  const haystack = [url, title || '', content || ''].join(' ').toLowerCase();
  const keywords = [
    'review', 'rated', 'rating', 'ranking', 'best', 'top rated', 'compare', 'comparison',
    'list', 'guide', 'casino', 'gambling', 'gaming', 'bonus', 'payout', 'affiliat', 'partner',
    'online casino', 'real money', 'gambl', 'betting', 'wager',
  ];
  const keywordHits = keywords.filter((kw) => haystack.includes(kw)).length;

  const host = getDomain(url);
  const hostHasAny = (needles) => needles.some((needle) => host.includes(needle));
  const gamblingBonus = hostHasAny(['casino', 'gambl', 'bet']) ? 3 : 0;
  const reviewBonus = hostHasAny(['review', 'rate', 'poker']) ? 2 : 0;

  return keywordHits + gamblingBonus + reviewBonus >= 4;
}
/**
 * Tell whether a domain matches any of the SKIP_PAT substrings.
 *
 * @param {string} d - Domain to test.
 * @returns {boolean} true when at least one pattern occurs in `d`.
 */
function isSkip(d) {
  return SKIP_PAT.some((pattern) => d.includes(pattern));
}
// ─── SearXNG engine-specific queries to maximize unique domains ──
/**
 * Search plan: one [baseQuery, engineName] pair per entry.
 *
 * NOTE(review): 'ecossia' looks like a misspelling — confirm the intended
 * engine name against the target SearXNG instance before relying on it.
 */
const ENGINE_QUERIES = [
  [
    "casino review site list compared rated",
    "brave",
  ],
  [
    "online casino ratings directory reviewed tested best",
    "startpage",
  ],
  [
    "best gambling websites reviewed rated listed analyzed evaluated",
    "duckduckgo",
  ],
  [
    "licensed safe trusted casino comparison website portal all",
    "ecossia",
  ],
  [
    "top online casinos rated ranked listed compiled curated selected hand-picked verified",
    "qwant",
  ],
  [
    "casino affiliate content publisher media platform outlet review aggregated directory database list catalog registry",
    "mojeek",
  ],
];
// ═══════ MAIN ═══════
/** Suffixes appended to each base query to form its variations (base + 5 = 6 queries). */
const QUERY_VARIATION_SUFFIXES = [
  'comprehensive complete extensive thorough detailed in-depth full coverage whole broad wide sweeping',
  'independent unbiased trusted reliable safe secure verified certified accredited licensed approved regulated',
  'highest greatest largest massive enormous gigantic huge immense vast sweeping expansive inclusive covering',
  'professional expert tested evaluated assessed analyzed investigated inspected examined researched studied explored',
  'small niche specialized boutique alternative underrated lesser-known offbeat unconventional unique distinctive',
];

/**
 * Read the checkpoint from CP_FILE.
 * @returns {object} Parsed checkpoint, or a fresh one when the file is
 *   missing, unreadable or does not contain a JSON object.
 */
function loadCheckpoint() {
  try {
    const parsed = JSON.parse(fs.readFileSync(CP_FILE, 'utf8'));
    if (parsed && typeof parsed === 'object') return parsed;
  } catch {
    // No usable checkpoint: fall through to a fresh start.
  }
  return { phase: 'engage', eIdx: 0, qi: 0, pg: 1, dnsDone: 0, domains: {} };
}

/**
 * Persist the checkpoint together with the current domain map, so that
 * domains found in either phase survive a restart.
 * @param {object} cp - Checkpoint object (its `domains` field is overwritten).
 * @param {Map<string, object>} dm - Collected sites keyed by domain.
 */
function saveCheckpoint(cp, dm) {
  cp.domains = Object.fromEntries(dm);
  fs.writeFileSync(CP_FILE, JSON.stringify(cp));
}

/**
 * Best-effort import of results from checkpoint files written by other
 * crawler variants. Missing or malformed files are skipped.
 * @param {Map<string, object>} dm - Collected sites keyed by domain (mutated).
 */
function mergeLegacyCheckpoints(dm) {
  for (const file of ['.mega_crawl.json', '.fast_crawl.json', '.cp.json']) {
    let list;
    try {
      const data = JSON.parse(fs.readFileSync(file, 'utf8'));
      list = data.results || data.sites || data.collected || [];
    } catch {
      continue; // file absent or not JSON — nothing to merge
    }
    if (!Array.isArray(list)) continue;
    for (const r of list) {
      if (!r || typeof r.url !== 'string' || !r.url) continue;
      const domain = getDomain(r.url);
      if (!domain || isSkip(domain) || dm.has(domain)) continue;
      dm.set(domain, { url: r.url, title: String(r.title || domain).substring(0, 250), domain });
    }
  }
}

/**
 * Phase A: page through the search API for every query variation of every
 * ENGINE_QUERIES entry, keeping results that isAffiliate() accepts.
 *
 * Resumes at engine index `cp.eIdx`; `cp.qi` / `cp.pg` are recorded for
 * inspection only, so an interrupted engine restarts from its first query.
 *
 * NOTE(review): the engine name is only used in log output here; apiSearch
 * is called with the query and page only, so results are not restricted to
 * that engine.
 *
 * @param {object} cp - Checkpoint (mutated and saved).
 * @param {Map<string, object>} dm - Collected sites keyed by domain (mutated).
 */
async function runSearchPhase(cp, dm) {
  console.log(`▶ Engine-filtered SearXNG collection\n`);
  for (let ei = cp.eIdx ?? 0; ei < ENGINE_QUERIES.length; ei++) {
    const [baseQuery, engine] = ENGINE_QUERIES[ei];
    console.log(`\n═══ Engine: ${engine} ═══`);
    const variations = [baseQuery, ...QUERY_VARIATION_SUFFIXES.map((s) => `${baseQuery} ${s}`)];

    for (let vi = 0; vi < variations.length; vi++) {
      const q = variations[vi];
      for (let pg = 1; pg <= 25; pg++) {
        const results = await apiSearch(q, pg);
        if (!results.length) break; // no more pages (or the request failed)

        let newCount = 0;
        for (const r of results) {
          // Skip malformed results instead of crashing the crawl.
          if (!r || typeof r.url !== 'string' || !r.url) continue;
          const d = getDomain(r.url);
          if (!d || isSkip(d) || dm.has(d)) continue;
          const title = String(r.title || '');
          const snippet = String(r.content || '').substring(0, 400);
          if (isAffiliate(r.url, title, snippet)) {
            dm.set(d, { url: r.url, title: title.substring(0, 250), domain: d });
            newCount++;
          }
        }

        cp.eIdx = ei;
        cp.qi = vi;
        cp.pg = pg;
        saveCheckpoint(cp, dm);
        if (pg <= 3 || pg % 5 === 0) {
          console.log(` [${engine} v${vi} p${pg}] ${dm.size}`, newCount > 0 ? `(+${newCount})` : '');
        }
        await sleep(1600);
      }
      await sleep(2500);
    }

    cp.eIdx = ei + 1;
    saveCheckpoint(cp, dm);
    console.log(`Engine done: ${dm.size}`);
    await sleep(4000);
  }
  console.log(`\nSearXNG → ${dm.size} domains\n`);
}

/**
 * Phase B: resolve every generated candidate domain, in batches, and add
 * the ones that resolve.
 *
 * Progress is tracked by the set of names already checked
 * (`cp._dnsChecked`), so a resumed run checks each remaining candidate
 * exactly once regardless of batch boundaries.
 *
 * @param {object} cp - Checkpoint (mutated and saved).
 * @param {Map<string, object>} dm - Collected sites keyed by domain (mutated).
 */
async function runDnsPhase(cp, dm) {
  console.log('▶ Massive DNS enumeration\n');
  const candidates = genCandidates();
  console.log(`Generated ${candidates.length} candidate domains`);

  const CONC = 12; // concurrent dig lookups per batch
  const SAVE_EVERY = 500; // candidates between checkpoint writes

  const checked = new Set(Array.isArray(cp._dnsChecked) ? cp._dnsChecked : []);
  const pending = candidates.filter((d) => !checked.has(d));
  let done = candidates.length - pending.length;
  let sinceSave = 0;

  for (let start = 0; start < pending.length; start += CONC) {
    const batch = pending.slice(start, start + CONC);
    const results = await Promise.all(
      batch.map(async (domain) => ({ domain, exists: await dnsCheck(domain) })),
    );
    for (const { domain, exists } of results) {
      checked.add(domain);
      if (exists && !dm.has(domain)) {
        dm.set(domain, { url: `https://${domain}`, title: domain, domain });
      }
    }
    done += batch.length;
    sinceSave += batch.length;

    const isLastBatch = start + CONC >= pending.length;
    if (sinceSave >= SAVE_EVERY || isLastBatch) {
      sinceSave = 0;
      cp.dnsDone = done;
      cp._dnsChecked = [...checked];
      saveCheckpoint(cp, dm);
      console.log(`DNS: ${done}/${candidates.length}, ${dm.size} total`);
    }
    await sleep(80);
  }

  console.log(`\nDNS done: ${dm.size} total domains\n`);
  cp.dnsDone = done;
  cp._dnsChecked = [...checked];
  cp.phase = 'done';
  saveCheckpoint(cp, dm);
}

/**
 * Quote one CSV field: line breaks become spaces (one record per line) and
 * embedded double quotes are doubled.
 * @param {*} value - Field value; null/undefined become an empty field.
 * @returns {string} Quoted field.
 */
function csvField(value) {
  const text = String(value ?? '').replace(/[\r\n]+/g, ' ').replace(/"/g, '""');
  return `"${text}"`;
}

/**
 * Write all collected sites, sorted by domain, to CSV_OUT.
 * @param {Map<string, object>} dm - Collected sites keyed by domain.
 */
function writeCsv(dm) {
  const finalData = [...dm.values()].sort((a, b) => String(a.domain).localeCompare(String(b.domain)));
  const header = 'url,title,domain';
  const rows = finalData.map((v) => [v.url, v.title, v.domain].map(csvField).join(','));
  fs.writeFileSync(CSV_OUT, [header, ...rows].join('\n'), 'utf8');
  console.log(`\n══════ ${finalData.length} sites → ${CSV_OUT} ════`);
}

/**
 * Entry point: restore state, run the search phase, then the DNS phase,
 * then write the CSV. Phase names stored in the checkpoint: 'engage'
 * (search), 'dns' (also accepts 'engine'), 'done'.
 */
(async () => {
  console.log('═══ Casino Affiliate Crawler: SearXNG multi-engine + DNS ═══\n');
  const cp = loadCheckpoint();
  const dm = new Map(Object.entries(cp.domains || {}));

  // Merge previously collected data from other checkpoints
  mergeLegacyCheckpoints(dm);

  // ─── Phase A: SearXNG search collection ──────────────
  if (cp.phase === 'engage') {
    if ((cp.eIdx ?? 0) < ENGINE_QUERIES.length) {
      await runSearchPhase(cp, dm);
    }
    // Hand over to the DNS phase and persist that, so a restart does not
    // get stuck between phases.
    cp.phase = 'dns';
    cp.dnsDone = 0;
    saveCheckpoint(cp, dm);
  }

  // ─── Phase B: massive DNS enumeration ──────
  if (cp.phase === 'dns' || cp.phase === 'engine') {
    await runDnsPhase(cp, dm);
  }

  // ─── Write final CSV ──────
  writeCsv(dm);
})().catch((err) => {
  console.error('Crawler failed:', err);
  process.exitCode = 1;
});