'use strict';

const {execFile}=require('child_process');
const fs=require('fs');

// Output CSV path and the checkpoint file used to resume an interrupted run.
const CSV = './casino_affiliates.csv';
const CP = './cp.json';

// User-Agent string handed to curl via `-A` for every request.
const UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4) AppleWebKit/605.1.15 Version/17 Safari/605.1';

// Hostname fragments (substring match) that are never recorded as outbound
// domains: social networks, big publishers, CDNs and asset hosts.
const SKIP = [
  'youtube.', 'facebook.', 'twitter.', 'instagram.', 'linkedin.', 'tiktok.',
  'wikipedia.', 'pinterest', 'medium.', 'forbes.', 'google.', 'amazon.',
  'static.', 'cdn.', 'fonts.googleapis.', 'flickr.', 'imgur.', 'github.com',
];

// Casino brand keywords to detect in outbound URLs
// NOTE(review): 'betonline.intertops' looks like two brands fused by a typo and
// 'safe' looks too generic to be a brand; both are kept verbatim - confirm.
const CKW = [
  ...'bet365 888casino skyvegas pokerstars partypoker unibet bwin betway williamhill ladbrokes betfair draftkings fanduel betmgm caesars barstool leovegas bovada ignitioncasino mrplay jackpotcity casumo playojo 22bet paddypower 1xbet betonline.intertops reddogcas luckystick betclix betsson hardrock betano grosvenor coral skybet mr.green safe comeon.se slots.lv bc.game pin-up stake.cas'
    .split(' '),
  // Path-style keywords. In the original these were fused into the single
  // token '/casino/-/casinos/-/online-casino/', which could never match.
  '/casino/',
  '/casinos/',
  '/online-casino/',
];

// Review portals crawled in phase 1 to discover candidate domains.
const SEEDS = [
  'https://www.casino.org/reviews/',
  'https://casino.guru/casino-reviews',
  'https://www.askgamblers.com/online-casinos/reviews',
  'https://chipy.com/casinos',
  'https://www.racingpost.com/online-casino/best-sites/',
  'https://slotcatalog.com/en/best-online-casinos',
  'https://www.whichbingo.co.uk/casino-sites/',
  'https://next.io/online-casinos-uk/',
  'https://first.com/casino/best-casinos',
  'https://gg.co.uk/online-casinos/top-20/',
];

/** Pause the calling async flow: the returned promise settles (with no value) once `ms` milliseconds have elapsed. */
async function sleep(ms) {
  await new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Download `url` by running the `curl` binary and return whatever it wrote to
 * stdout.
 *
 * curl is invoked through `execFile` (no shell), with `-sL`, `--max-time 8`,
 * the module's `UA` as `-A`, and `--max-filesize 50000`. The child process
 * itself is given a 12 s `timeout` by `execFile`.
 *
 * This is deliberately best-effort: the error argument of the callback is
 * ignored, so a failed or timed-out request yields whatever partial output
 * exists (usually the empty string) instead of rejecting.
 *
 * @param {string} url - Absolute URL to fetch.
 * @returns {Promise<string>} Response body as captured from stdout, or `''`.
 */
async function fetchPage(url) {
  const args = ['-sL', '--max-time', '8', '-A', UA, '--max-filesize', '50000', url];
  try {
    return await new Promise((resolve) => {
      execFile('curl', args, { timeout: 12e3 }, (_err, stdout) => resolve(stdout || ''));
    });
  } catch {
    // A synchronous throw from execFile (e.g. an invalid argument) surfaces
    // here as a rejection; treat it like any other failed download.
    return '';
  }
}

/**
 * Pull the distinct outbound hostnames and the page title out of raw HTML.
 *
 * `<script>` and `<style>` elements are removed first, so URLs that only occur
 * inside inline JS/CSS are not counted. Only absolute `http(s)://` values of
 * quoted `href` attributes are considered.
 *
 * @param {string} html - Page source. Anything that is not a non-empty string
 *   yields an empty result.
 * @param {string} myDomain - Hostname of the page itself (without "www.");
 *   links back to it are ignored.
 * @param {string[]} [skip=SKIP] - Hostname fragments to ignore (substring
 *   match). Defaults to the module-level `SKIP` list.
 * @returns {{doms: string[], title: string}} Unique hostnames in first-seen
 *   order with a leading "www." removed, and the trimmed `<title>` text
 *   (`''` when there is none or it is outside 10-250 characters).
 */
function extract(html, myDomain, skip = SKIP) {
  if (typeof html !== 'string' || html === '') {
    return { doms: [], title: '' };
  }

  const cleaned = html
    .replace(/<script[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*?>[\s\S]*?<\/style>/gi, '');

  const doms = new Set();
  for (const m of cleaned.matchAll(/href=["'](https?:\/\/[^"'\s>]+)["']/gi)) {
    let host;
    try {
      // Anchor the strip to the start of the host: a plain replace('www.', '')
      // would also mangle hosts such as "mywww.example.org".
      host = new URL(m[1]).hostname.replace(/^www\./, '');
    } catch {
      continue; // not a parseable URL - ignore this link
    }
    if (host === '' || host === myDomain || skip.some((k) => host.includes(k))) {
      continue;
    }
    doms.add(host);
  }

  const tt = cleaned.match(/<title[^>]*>([^<]{10,250})<\/title>/i);
  return { doms: [...doms], title: tt ? tt[1].trim() : '' };
}

/**
 * Count how many distinct entries of `domList` look like casino brands.
 *
 * An entry is a hit when, compared case-insensitively, it contains "casino",
 * "/casin" or "/gambl", or any keyword of at least three characters. Keywords
 * written as path fragments (e.g. "/casino/") are additionally tried with all
 * slashes removed, so they can match bare hostnames too.
 *
 * @param {Iterable<string>} domList - Hostnames (or URL-like strings) to test.
 * @param {string[]} [keywords=CKW] - Brand keywords. Defaults to the
 *   module-level `CKW` list.
 * @returns {number} Number of distinct matching entries (case-insensitive).
 */
function countBrands(domList, keywords = CKW) {
  // Normalise the keyword list once instead of once per domain.
  const needles = [];
  for (const k of keywords) {
    const raw = String(k).toLowerCase();
    if (raw.length < 3) continue; // too short to be a meaningful match
    needles.push(raw);
    const bare = raw.replace(/\//g, '');
    if (bare !== raw && bare.length >= 3) needles.push(bare);
  }

  const hits = new Set();
  for (const d of domList) {
    const lo = String(d).toLowerCase();
    const generic = lo.includes('casino') || lo.includes('/casin') || lo.includes('/gambl');
    if (generic || needles.some((k) => lo.includes(k))) {
      hits.add(lo);
    }
  }
  return hits.size;
}

// Save checkpoint for resumability
/**
 * Serialise `obj` as JSON and store it as the checkpoint file.
 *
 * The data is written to a sibling "<file>.tmp" first and then renamed over
 * the target, so an interruption during the write cannot leave a truncated
 * checkpoint behind.
 *
 * @param {*} obj - JSON-serialisable checkpoint state.
 * @param {string} [file=CP] - Destination path; defaults to the module-level
 *   checkpoint path `CP`.
 * @returns {void}
 */
function save(obj, file = CP) {
  const tmp = `${file}.tmp`;
  fs.writeFileSync(tmp, JSON.stringify(obj));
  fs.renameSync(tmp, file);
}

/**
 * Read and parse the checkpoint file.
 *
 * A missing file is the normal first-run case and is silent. Any other read
 * failure, or content that is not valid JSON, is reported with a warning so a
 * lost checkpoint does not go unnoticed; in every failure case the result is
 * `null` and the caller starts from scratch.
 *
 * @param {string} [file=CP] - Path to read; defaults to the module-level
 *   checkpoint path `CP`.
 * @returns {*} The parsed checkpoint, or `null` when unavailable.
 */
function load(file = CP) {
  let raw;
  try {
    raw = fs.readFileSync(file, 'utf8');
  } catch (err) {
    if (err.code !== 'ENOENT') {
      console.warn(`[checkpoint] cannot read ${file}: ${err.message}`);
    }
    return null;
  }

  try {
    return JSON.parse(raw);
  } catch (err) {
    console.warn(`[checkpoint] ignoring corrupt ${file}: ${err.message}`);
    return null;
  }
}

/**
 * Hostname of `url` with one leading "www." removed.
 * Throws the TypeError raised by `new URL` when `url` cannot be parsed.
 */
function hostOf(url) {
  return new URL(url).hostname.replace(/^www\./, '');
}

/** Render one value as a quoted CSV field, doubling embedded double quotes. */
function csvField(value) {
  return `"${String(value ?? '').replace(/"/g, '""')}"`;
}

/**
 * Fetch a candidate domain's home page and count the casino brands it links to.
 * https is tried first; http is only tried when https returned no usable page
 * (empty or shorter than 400 characters) or its parsing failed.
 * Never rejects: failures produce a result with `brands: 0`.
 */
async function checkCandidate(dom) {
  for (const scheme of ['https://', 'http://']) {
    const url = scheme + dom;
    const html = await fetchPage(url);
    if (!html || html.length < 400) continue;
    try {
      let selfDomain = dom;
      try {
        selfDomain = hostOf(url) || dom;
      } catch {
        // unparseable URL: fall back to the candidate name itself
      }
      const { doms, title } = extract(html, selfDomain);
      return { dom, url, title, doms, brands: countBrands(doms) };
    } catch (e) {
      console.log(`[check err] ${url}: ${e.message}`);
    }
  }
  return { dom, url: '', title: '', doms: [], brands: 0 };
}

/**
 * Entry point: crawl the seed portals, verify the discovered candidate domains
 * in small concurrent batches (checkpointing after each batch), then write the
 * verified sites to the CSV file.
 */
async function main() {
  console.log('═══ Casino Affiliate Scraper ═══\n');
  const cp = load();
  // Domain → {url,title,brands}
  const VERIFIED = new Map(Object.entries(cp?.verified || {}));
  // Candidate domains needing verification (insertion order doubles as the
  // processing queue, so the saved index stays valid across restarts)
  const CAND = new Set(cp?.cand || []);

  // ── PHASE 1: crawl seed review portals ──
  console.log('phase-1: scraping seed sites...\n');

  for (const [si, sUrl] of SEEDS.entries()) {
    let seedDomain;
    try {
      seedDomain = hostOf(sUrl);
    } catch {
      console.log(`[bad seed url] ${sUrl}`);
      continue;
    }
    console.log(`[${si + 1}/${SEEDS.length}] ${sUrl}`);

    const html = await fetchPage(sUrl);
    if (!html || html.length < 400) {
      console.log('[no content]');
      await sleep(3000);
      continue;
    }

    // seed itself verified; 99 is the placeholder brand count used whenever
    // the page yields no count above 5
    VERIFIED.set(seedDomain, { url: sUrl, title: '', brands: 99 });
    try {
      const data = extract(html, seedDomain);
      const bcount = countBrands(data.doms);
      VERIFIED.set(seedDomain, { url: sUrl, title: data.title, brands: bcount > 5 ? bcount : 99 });
      for (const od of data.doms) {
        if (!VERIFIED.has(od)) CAND.add(od);
      }
    } catch (e) {
      console.log('[parse err]', e.message);
    }
    await sleep(2500); // throttle between seeds
  }

  // ── PHASE 2: verify candidate sites ──
  const CONC = 4;
  const MAX_VERIFIED = 1300;
  const MIN_BRANDS = 5;

  let idx = Number.isInteger(cp?.checkIdx) && cp.checkIdx > 0 ? cp.checkIdx : 0;

  while (idx < CAND.size && VERIFIED.size < MAX_VERIFIED) {
    // build batch from remaining candidates
    const batch = [...CAND].slice(idx, idx + CONC);

    // For each candidate in batch, fetch + check for ≥ 5 casino brand links.
    // Shared state is only touched after the whole batch has settled.
    const results = await Promise.all(batch.map((d) => checkCandidate(d)));
    for (const r of results) {
      if (r.brands < MIN_BRANDS) continue;
      VERIFIED.set(r.dom, { url: r.url, title: r.title, brands: r.brands });
      console.log(`✓ [${VERIFIED.size}] ${r.dom} → ${r.brands} brands`);
      // recursive discovery
      for (const nd of r.doms) {
        if (!VERIFIED.has(nd)) CAND.add(nd);
      }
    }

    idx += batch.length; // advance index through candidates
    save({ verified: Object.fromEntries(VERIFIED), cand: [...CAND], checkIdx: idx });

    if (idx % 100 === 0 || idx >= CAND.size) {
      console.log(`[CKPT] VERIF:${VERIFIED.size} pending:${CAND.size - idx}\n`);
    }
    await sleep(3500); // throttle between batches
  }

  // ── Write CSV file
  // The domain is the map key, so it is attached here rather than stored twice.
  const out = [...VERIFIED.entries()]
    .map(([domain, v]) => ({ ...v, domain }))
    .sort((a, b) => (a.title || a.url || '').localeCompare(b.title || b.url || ''));
  const hdr = 'url,title,domain,casino_brands_linked';
  const rows = out.map((v) =>
    [csvField(v.url), csvField(v.title), csvField(v.domain), Number(v.brands) || 0].join(','),
  );

  fs.writeFileSync(CSV, [hdr, ...rows].join('\n'), 'utf8');
  console.log(`DONE: ${VERIFIED.size} → ${CSV}\n`);
}

main().catch((err) => {
  console.error('[fatal]', err);
  process.exitCode = 1;
});