// Files
// crawler/verify-all.js
// T
// 2026-06-26 14:30:45 +02:00
//
// 68 lines
// 3.0 KiB
// JavaScript

const {execFile}=require('child_process'), fs=require('fs');
/** User-Agent header sent with every fetch. */
const USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4) AppleWebKit/605.1';

/** Substrings that mark an outbound hostname as a casino brand. */
const CASINO_KEYWORDS = ['bet365', '888casino', 'skyvegas', 'pokerstars', 'partypoker',
  'unibet', 'bwin', 'betway', 'williamhill', 'ladbrokes', 'betfair',
  'draftkings', 'fanduel', 'betmgm', 'caesars', 'leovegas', 'bovada',
  'jackpotcity', '22bet', '1xbet', 'coral', 'skybet', 'casino'];

/** Hostname fragments that are never counted as outbound brand links (social sites, CDNs). */
const IGNORED_HOST_PARTS = ['youtube', 'facebook', 'twitter', 'github', 'linkedin', 'tiktok',
  'wikipedia', 'google.', 'static.', 'cdn.'];

/** Merged CSV of candidate sites; columns are url, title, domain (first row is a header). */
const CSV_PATH = './casino_affiliate_sites.csv';

/** A domain is accepted when it links to at least this many distinct brand hosts. */
const MIN_BRAND_LINKS = 5;

/** Responses this short (in characters) or shorter are treated as failed fetches. */
const MIN_PAGE_LENGTH = 400;

/** Pause between domains, in milliseconds, to throttle outgoing requests. */
const THROTTLE_MS = 1800;

/**
 * Parse one CSV row into its first three fields.
 *
 * Rows with at least three double-quoted fields are read as quoted CSV
 * (a doubled `""` inside a field is an escaped quote). Anything else, or a
 * quoted row whose third field is empty, falls back to a plain comma split.
 *
 * @param {string} line - One raw line of the CSV file.
 * @returns {{url: string, title: string, domain: string} | null}
 *   The parsed row, or `null` when the line is blank or has no domain.
 */
function parseCsvLine(line) {
  const text = line.trim(); // also drops the '\r' left over from CRLF files
  if (!text) return null;

  const quoted = text.match(/"(?:[^"]|"")*"/g) ?? [];
  let fields = quoted.length >= 3
    ? quoted.map((field) => field.slice(1, -1).replace(/""/g, '"'))
    : [];
  if (!fields[2]) {
    fields = text.split(',').map((field) => field.replace(/"/g, '').trim());
  }

  const [url = '', title = '', domain = ''] = fields;
  return domain ? { url, title, domain } : null;
}

/**
 * Load the existing CSV into a map keyed by domain.
 *
 * A missing file is not an error: the script simply starts with no domains.
 * Any other read failure is reported and also yields an empty map.
 *
 * @param {string} csvPath - Path of the CSV file to read.
 * @returns {Map<string, {url: string, title: string, domain: string}>}
 */
function loadExistingDomains(csvPath) {
  const existing = new Map();
  let csv;
  try {
    csv = fs.readFileSync(csvPath, 'utf8');
  } catch (err) {
    if (err.code !== 'ENOENT') {
      console.warn(`Could not read ${csvPath}: ${err.message}`);
    }
    return existing;
  }
  // Split once (not once per row) and skip the header row.
  for (const line of csv.split(/\r?\n/).slice(1)) {
    const row = parseCsvLine(line);
    if (row) existing.set(row.domain, row);
  }
  return existing;
}

/**
 * Collect the distinct external hostnames linked from an HTML page.
 *
 * Script and style blocks are removed first so that URLs embedded in code are
 * not counted. A leading `www.` is stripped from every hostname. Links to the
 * page's own domain and to hosts matching IGNORED_HOST_PARTS are dropped.
 *
 * @param {string} html - Raw HTML of the page.
 * @param {string} selfDomain - Domain the page was fetched from.
 * @returns {Set<string>} Distinct outbound hostnames.
 */
function extractOutboundDomains(html, selfDomain) {
  const own = selfDomain.toLowerCase().replace(/^www\./, '');
  // Both patterns are lazy: a greedy match would swallow everything between
  // the first and the last block of the same kind.
  const markup = html
    .replace(/<script[\s\S]*?<\/script>/gi, '')
    .replace(/<style[\s\S]*?<\/style>/gi, '');

  const hosts = new Set();
  for (const match of markup.matchAll(/href=["'](https?:\/\/[^"'\s>]+)["']/gi)) {
    let host;
    try {
      host = new URL(match[1]).hostname.replace(/^www\./, '');
    } catch {
      continue; // malformed href; nothing to count
    }
    if (host === own) continue;
    if (IGNORED_HOST_PARTS.some((part) => host.includes(part))) continue;
    hosts.add(host);
  }
  return hosts;
}

/**
 * Select the hostnames that contain any casino brand keyword.
 *
 * @param {Iterable<string>} hosts - Candidate hostnames.
 * @returns {Set<string>} The subset of hosts matching CASINO_KEYWORDS.
 */
function findBrandDomains(hosts) {
  const hits = new Set();
  for (const host of hosts) {
    const lower = host.toLowerCase();
    if (CASINO_KEYWORDS.some((keyword) => lower.includes(keyword))) hits.add(host);
  }
  return hits;
}

/**
 * Fetch a URL with curl and resolve with the response body.
 *
 * Never rejects: any curl failure (timeout, size limit, curl missing)
 * resolves with whatever was written to stdout, usually an empty string.
 *
 * @param {string} url - Absolute URL to fetch.
 * @returns {Promise<string>} Response body, or '' on failure.
 */
function fetchPage(url) {
  const args = ['-sL', '--max-time', '6', '-A', USER_AGENT, '--max-filesize', '25000', url];
  return new Promise((resolve) => {
    execFile('curl', args, { timeout: 10e3 }, (_err, stdout) => resolve(stdout || ''));
  });
}

/**
 * Try HTTPS first, then plain HTTP, and return the first usable page.
 *
 * @param {string} domain - Bare domain name, without a scheme.
 * @returns {Promise<{url: string, html: string} | null>}
 *   The URL that answered and its body, or `null` when neither scheme
 *   returned more than MIN_PAGE_LENGTH characters.
 */
async function fetchFirstAvailable(domain) {
  for (const scheme of ['https', 'http']) {
    const url = `${scheme}://${domain}`;
    const html = await fetchPage(url);
    if (html.length > MIN_PAGE_LENGTH) return { url, html };
  }
  return null;
}

// Entry point: verify every domain from the CSV by fetching its home page and
// counting links to distinct casino brand hosts.
(async () => {
  const existing = loadExistingDomains(CSV_PATH);
  console.log(`Verifying ${existing.size} domains...\n`);

  // Only domains that pass the check are recorded here. Pre-filling the map
  // with placeholder entries would make every domain count as verified.
  const VERIFIED = new Map();
  for (const domain of existing.keys()) {
    try {
      const page = await fetchFirstAvailable(domain);
      if (page) {
        const brands = findBrandDomains(extractOutboundDomains(page.html, domain));
        if (brands.size >= MIN_BRAND_LINKS) {
          VERIFIED.set(domain, { url: page.url, title: 'Verified affiliate', brands: brands.size });
        }
      }
    } catch (err) {
      // Best effort: report the failure and move on to the next domain.
      console.warn(`Skipping ${domain}: ${err.message}`);
    }
    await new Promise((resolve) => setTimeout(resolve, THROTTLE_MS)); // throttle between requests
  }

  // NOTE(review): nothing visible here persists VERIFIED to disk even though
  // the message says "Written" — confirm whether an output step is missing.
  console.log(`Written ${VERIFIED.size} verified affiliates\n`);
})().catch((err) => {
  console.error('verify-all failed:', err);
  process.exitCode = 1;
});