// Files
// crawler/validate-sites.js
// 2026-06-26 14:30:45 +02:00
//
// 145 lines
// 5.3 KiB
// JavaScript

const path = require('path');
const fs = require('fs');
const puppeteer = require('puppeteer-extra');
var StealthPlugin = require('puppeteer-extra-plugin-stealth');
puppeteer.use(StealthPlugin());
// Load all site configs from src/sites/.
// Every entry in that directory whose name ends in ".json" is read and parsed.
// A file that cannot be read or parsed is reported (with the reason) and left
// out of allSites; jsonFiles still lists it.
const sitesDir = path.join(__dirname, 'src', 'sites');
const jsonFiles = fs.readdirSync(sitesDir).filter((f) => f.endsWith('.json'));
const allSites = jsonFiles
  .map((fp) => {
    try {
      return JSON.parse(fs.readFileSync(path.join(sitesDir, fp), 'utf8'));
    } catch (e) {
      // Include the underlying message so a broken config can be fixed
      // without re-running the parse by hand.
      console.warn('Failed to parse:', fp, '-', e && e.message ? e.message : e);
      return null;
    }
  })
  // Drops the nulls produced above (and any other falsy parsed value).
  .filter(Boolean);
// Lazily-built index: parsed config `url` -> filename. Null until first lookup.
let siteFilenameByUrlIndex = null;

/**
 * Find the matching filename for a site config.
 *
 * On the first call every file listed in `jsonFiles` is read from `sitesDir`
 * and parsed once, and an index from each config's `url` to its filename is
 * built. Later calls are answered from that index instead of re-reading the
 * whole directory for every lookup.
 *
 * When several files share the same `url`, the one that comes first in
 * `jsonFiles` wins.
 *
 * @param {string} url - The `url` value of the site config to look up.
 * @returns {string|null} The filename of the matching config, or null if no
 *   readable config has that url.
 */
function findFilenameForSite(url) {
  if (siteFilenameByUrlIndex === null) {
    const index = new Map();
    for (const f of jsonFiles) {
      try {
        const c = JSON.parse(fs.readFileSync(path.join(sitesDir, f), 'utf8'));
        // Keep the first file seen for a url (first-match semantics).
        if (!index.has(c.url)) index.set(c.url, f);
      } catch (e) {
        // Unreadable/unparsable files simply cannot match any url; the
        // config loading step already warns about parse failures.
      }
    }
    siteFilenameByUrlIndex = index;
  }
  const match = siteFilenameByUrlIndex.get(url);
  return match === undefined ? null : match;
}
console.log('Loaded', allSites.length, 'site configs to validate\n');

/**
 * Validation entry point.
 *
 * Opens each loaded site config's url in a headless browser, classifies the
 * page (parked / blocked by anti-bot / not casino-related / extractable) and
 * records the config filename as good or bad. After all sites are visited,
 * the files recorded as bad are deleted from src/sites/ and a summary is
 * printed.
 *
 * The Chrome binary defaults to the macOS install path and can be overridden
 * with the CHROME_PATH environment variable.
 */
(async function() {
  const chromePath = process.env.CHROME_PATH ||
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome';
  const browser = await puppeteer.launch({
    headless: 'new',
    executablePath: chromePath,
    args: ['--no-sandbox', '--disable-setuid-sandbox']
  });

  // try/finally so the browser process is released even if validation throws.
  try {
    const CasinoCrawlerClass = require('./src/services/crawler.js');
    const crawler = new CasinoCrawlerClass();
    const goodSites = [];
    const badFilenames = [];
    let blockedCount = 0;

    for (let i = 0; i < allSites.length; i++) {
      const siteConfig = allSites[i];
      const filename = findFilenameForSite(siteConfig.url);
      if (!filename) {
        // String(i): padEnd exists on strings only; calling it on the number
        // threw a TypeError and aborted the whole run.
        console.log('[i]' + String(i).padEnd(5), '| Missing file for:', siteConfig.name, '| SKIP');
        continue;
      }

      let page = null;
      try {
        page = await browser.newPage();
        await page.setUserAgent(
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
        );

        // Navigate — quick timeout to skip slow/bad sites
        await page.goto(siteConfig.url, { waitUntil: 'domcontentloaded', timeout: 8000 });
        // Fixed 2s pause after DOMContentLoaded before sampling the page.
        await new Promise((resolve) => setTimeout(resolve, 2000));

        // Get title + body text for checks
        const title = await page.title();
        const sampleText = await page.evaluate(
          () => document.body.innerText.substring(0, 800)
        ).catch(() => '');
        const sampleLower = sampleText.toLowerCase();

        const isParked = sampleLower.includes('this domain is for sale') ||
          sampleLower.includes('godaddy parking');
        const isBlocked = title.includes('Cloudflare') ||
          sampleLower.includes('checking your browser');

        if (isParked) {
          // Rule out parked domains
          console.log('🚫 PARKED |', filename.padEnd(35), '| ' + title.substring(0, 60));
          badFilenames.push(filename);
        } else if (isBlocked) {
          // Blocked by anti-bot — keep for now but skip extraction.
          // This must be tested before the casino-keyword check: a challenge
          // page carries none of those keywords and would otherwise be
          // deleted as "NOT CASINO".
          console.log('⛔ BLOCKED |', filename.padEnd(35), '| ' + title.substring(0, 60));
          blockedCount++;
        } else if (!isCasinoRelated(title, sampleText)) {
          // Not actually a casino review / affiliate site
          console.log('🚫 NOT CASINO|', filename.padEnd(35), '| ' + title.substring(0, 60));
          badFilenames.push(filename);
        } else {
          // presumably returns an array of records with a `name` field —
          // verify against src/services/crawler.js
          const extracted = await crawler.extractCasinoData(page);
          if (extracted.length >= 5) {
            console.log('✅ OK |', filename.padEnd(35), '| Found:', extracted.length, 'casinos');
            goodSites.push(filename);
          } else if (extracted.length > 0) {
            // Few casinos found — keep but note it (show at most three names)
            const firstNames = extracted.slice(0, 3).map((entry) => entry.name || '?');
            console.log('⚠️ FEW |', filename.padEnd(35), '| Found:', extracted.length, '| First3:', firstNames.join(', ').substring(0, 40));
            goodSites.push(filename);
          } else {
            // Casino-related site but couldn't extract anything — needs different extraction
            console.log('❌ NO DATA |', filename.padEnd(35), '| ' + title.substring(0, 60));
            badFilenames.push(filename);
          }
        }
      } catch (e) {
        // Timeout or nav failure — delete since this is likely a dead/bad URL.
        // NOTE(review): an exception thrown by the extraction step also lands
        // here and marks the config for deletion — confirm that is intended.
        const reason = String((e && e.message) || e).split('\n')[0].substring(0, 60);
        console.log('⏱️ TIMEOUT-DEL|', filename.padEnd(35), '| ' + reason);
        badFilenames.push(filename);
      } finally {
        if (page) await page.close().catch(() => {});
      }
    }

    // Clean up dead JSON files from src/sites/
    let deletedCount = 0;
    for (const badFile of badFilenames) {
      try {
        fs.unlinkSync(path.join(sitesDir, badFile));
        deletedCount++;
      } catch (e) {
        console.warn('Failed to delete', badFile);
      }
    }

    console.log('\n========== VALIDATION SUMMARY ==========');
    console.log('✅ Kept:', goodSites.length, 'valid casino review/affiliate sites');
    console.log('⛔ Blocked:', blockedCount, 'sites behind anti-bot checks (kept, not extracted)');
    console.log('🚫 Deleted:', deletedCount, 'junk/parked/timeout files from src/sites/');
  } finally {
    await browser.close().catch(() => {});
  }
})().catch(function(err) {
  // Surface unexpected failures instead of leaving an unhandled rejection.
  console.error('Validation run failed:', err);
  process.exitCode = 1;
});
/**
 * Decide whether a page looks casino-related.
 *
 * The title and the text sample are joined with a single space, lower-cased,
 * and searched for a fixed list of keyword substrings.
 *
 * @param {string} title - Page title.
 * @param {string} sample - Sample of the page's body text.
 * @returns {boolean} true if at least one keyword occurs, false otherwise.
 */
function isCasinoRelated(title, sample) {
  const haystack = (title + ' ' + sample).toLowerCase();
  const casinoTerms = [
    'casino',
    'gambl',
    'poker',
    'bonus',
    'review',
    'real money',
    'free spin',
    'bookie'
  ];
  return casinoTerms.some((term) => haystack.includes(term));
}