/**
 * Site-config validation script.
 *
 * Loads every `*.json` config from `src/sites/`, opens each config's `url` in a
 * stealth Puppeteer browser, classifies the page (parked / blocked / not casino /
 * extractable) and finally deletes the config files judged to be junk.
 *
 * Deletion is destructive, so a file is only marked for deletion when the page
 * itself gives a reason (parked, off-topic, nothing extracted) or navigation to
 * it fails. Unexpected errors (browser crash, crawler exception) keep the file.
 */
const path = require('path');
const fs = require('fs');
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');

puppeteer.use(StealthPlugin());

// Default browser binary; can be overridden with PUPPETEER_EXECUTABLE_PATH.
const DEFAULT_CHROME_PATH = '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome';
const USER_AGENT =
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36';
const NAV_TIMEOUT_MS = 8000; // quick timeout to skip slow/bad sites
const SETTLE_DELAY_MS = 2000; // pause after DOMContentLoaded before sampling the page
const SAMPLE_CHARS = 800; // how much body text is sampled for classification
const MIN_CASINOS_OK = 5; // at or above this count a site is reported as OK rather than FEW
const CASINO_KEYWORDS = ['casino', 'gambl', 'poker', 'bonus', 'review', 'real money', 'free spin', 'bookie'];

// Load all site configs from src/sites/
const sitesDir = path.join(__dirname, 'src', 'sites');
const jsonFiles = fs.readdirSync(sitesDir).filter((f) => f.endsWith('.json'));

// Each config is kept together with the file it came from, so the file is read
// and parsed exactly once and results are always attributed to the right file.
const siteEntries = [];
for (const filename of jsonFiles) {
  try {
    const config = JSON.parse(fs.readFileSync(path.join(sitesDir, filename), 'utf8'));
    if (config) {
      siteEntries.push({ filename, config });
    }
  } catch (e) {
    console.warn('Failed to parse:', filename);
  }
}
const allSites = siteEntries.map((entry) => entry.config);

/**
 * Find the filename of the first loaded config whose `url` equals the given one.
 *
 * Retained helper: the main loop already knows each config's filename, so it
 * does not need this lookup. It searches the in-memory entries, not the disk.
 *
 * @param {string} url - URL to look up.
 * @returns {string|null} Matching filename, or null when no config has that URL.
 */
function findFilenameForSite(url) {
  const match = siteEntries.find((entry) => entry.config.url === url);
  return match ? match.filename : null;
}

/**
 * Decide whether the page text looks like a casino review / affiliate site.
 *
 * @param {string} title - Page title.
 * @param {string} sample - Leading sample of the page's body text.
 * @returns {boolean} True when any keyword occurs (case-insensitively) in title or sample.
 */
function isCasinoRelated(title, sample) {
  const haystack = (title + ' ' + sample).toLowerCase();
  return CASINO_KEYWORDS.some((keyword) => haystack.includes(keyword));
}

/**
 * Reduce an arbitrary thrown value to a short single-line description for logging.
 *
 * @param {*} err - Caught value (usually an Error).
 * @returns {string} First line of the message, at most 60 characters.
 */
function describeError(err) {
  const message = String(err && err.message ? err.message : err);
  return message.split('\n')[0].substring(0, 60);
}

console.log('Loaded', allSites.length, 'site configs to validate\n');

(async function () {
  const browser = await puppeteer.launch({
    headless: 'new',
    executablePath: process.env.PUPPETEER_EXECUTABLE_PATH || DEFAULT_CHROME_PATH,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  const goodSites = [];
  const badFilenames = [];

  try {
    const CasinoCrawlerClass = require('./src/services/crawler.js');
    const crawler = new CasinoCrawlerClass();

    for (const { filename, config: siteConfig } of siteEntries) {
      let page = null;
      try {
        page = await browser.newPage();
        await page.setUserAgent(USER_AGENT);

        // Navigate — only a failure of this step counts as a dead/bad URL.
        try {
          await page.goto(siteConfig.url, { waitUntil: 'domcontentloaded', timeout: NAV_TIMEOUT_MS });
        } catch (navError) {
          console.log('⏱️ TIMEOUT-DEL|', filename.padEnd(35), '| ' + describeError(navError));
          badFilenames.push(filename);
          continue; // the outer `finally` still closes the page
        }
        await new Promise((resolve) => setTimeout(resolve, SETTLE_DELAY_MS));

        // Get title + body text for checks
        const title = await page.title();
        const sampleText = await page
          .evaluate((limit) => document.body.innerText.substring(0, limit), SAMPLE_CHARS)
          .catch(() => '');
        const sampleLower = sampleText.toLowerCase();

        if (sampleLower.includes('this domain is for sale') || sampleLower.includes('godaddy parking')) {
          // Parked domain
          console.log('🚫 PARKED |', filename.padEnd(35), '| ' + title.substring(0, 60));
          badFilenames.push(filename);
        } else if (title.includes('Cloudflare') || sampleLower.includes('checking your browser')) {
          // Blocked by anti-bot — keep for now but skip extraction. This must be
          // tested before the keyword check: a challenge page normally contains
          // no casino keywords and would otherwise be deleted as "not casino".
          console.log('⛔ BLOCKED |', filename.padEnd(35), '| ' + title.substring(0, 60));
        } else if (!isCasinoRelated(title, sampleText)) {
          // Not actually a casino review / affiliate site
          console.log('🚫 NOT CASINO|', filename.padEnd(35), '| ' + title.substring(0, 60));
          badFilenames.push(filename);
        } else {
          const extracted = await crawler.extractCasinoData(page);
          if (extracted.length >= MIN_CASINOS_OK) {
            console.log('✅ OK |', filename.padEnd(35), '| Found:', extracted.length, 'casinos');
            goodSites.push(filename);
          } else if (extracted.length > 0) {
            // Few casinos found — keep but note it
            const names = extracted.map((casino) => casino.name || '?');
            console.log(
              '⚠️ FEW |',
              filename.padEnd(35),
              '| Found:',
              extracted.length,
              '| First3:',
              names.join(', ').substring(0, 40)
            );
            goodSites.push(filename);
          } else {
            // Casino-related site but couldn't extract anything — needs different extraction
            console.log('❌ NO DATA |', filename.padEnd(35), '| ' + title.substring(0, 60));
            badFilenames.push(filename);
          }
        }
      } catch (e) {
        // Anything other than a navigation failure (browser crash, crawler bug, ...)
        // says nothing about the site itself, so the config file is kept.
        console.log('⚠️ ERROR-KEEP |', filename.padEnd(35), '| ' + describeError(e));
      } finally {
        if (page) await page.close().catch(() => {});
      }
    }
  } finally {
    // Always release the browser, even when the run aborts early.
    await browser.close().catch(() => {});
  }

  // Clean up dead JSON files from src/sites/
  let deletedCount = 0;
  for (const badFilename of badFilenames) {
    try {
      fs.unlinkSync(path.join(sitesDir, badFilename));
      deletedCount++;
    } catch (e) {
      console.warn('Failed to delete', badFilename);
    }
  }

  console.log('\n========== VALIDATION SUMMARY ==========');
  console.log('✅ Kept:', goodSites.length, 'valid casino review/affiliate sites');
  console.log('🚫 Deleted:', deletedCount, 'junk/parked/timeout files from src/sites/');
})().catch((err) => {
  // A fatal error aborts before the cleanup step, so nothing is deleted.
  console.error('Validation run failed:', err);
  process.exitCode = 1;
});