// Viewer metadata: 145 lines · 5.3 KiB · JavaScript
const path = require('path');
const fs = require('fs');
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');

// Register the stealth plugin with puppeteer-extra before any browser is launched.
puppeteer.use(StealthPlugin());

// Load all site configs from src/sites/.
// `jsonFiles` lists every *.json filename in that directory; `allSites` holds
// the parsed contents of the files that parsed successfully. Files that fail
// to read or parse are reported and dropped, as is any file whose parsed
// value is falsy (e.g. a literal `null`).
const sitesDir = path.join(__dirname, 'src', 'sites');
const jsonFiles = fs.readdirSync(sitesDir).filter((f) => f.endsWith('.json'));
const allSites = jsonFiles
  .map((fp) => {
    try {
      return JSON.parse(fs.readFileSync(path.join(sitesDir, fp), 'utf8'));
    } catch (e) {
      // Include the reason so a malformed file can be fixed without re-running by hand.
      console.warn('Failed to parse:', fp, '-', e && e.message ? e.message : e);
      return null;
    }
  })
  .filter(Boolean);

/**
 * Lazily built lookup from a config's `url` value to the first filename in
 * `jsonFiles` whose parsed content declares that value. Stays `null` until
 * the first call to `findFilenameForSite`.
 */
let filenameByUrl = null;

/**
 * Finds the JSON file in `sitesDir` whose parsed config has the given `url`.
 *
 * The directory is read and parsed once, on the first call, instead of once
 * per lookup; later calls are answered from the in-memory index. When several
 * files declare the same `url`, the first one in `jsonFiles` order wins.
 *
 * @param {*} url - Value compared against each config's `url` property.
 * @returns {string|null} The matching filename, or `null` when no file matches.
 */
function findFilenameForSite(url) {
  if (filenameByUrl === null) {
    filenameByUrl = new Map();
    for (const f of jsonFiles) {
      try {
        const config = JSON.parse(fs.readFileSync(path.join(sitesDir, f), 'utf8'));
        if (!filenameByUrl.has(config.url)) {
          filenameByUrl.set(config.url, f);
        }
      } catch (e) {
        // Deliberately ignored: a file that cannot be read or parsed can
        // never match a URL, so it is simply left out of the index.
      }
    }
  }
  return filenameByUrl.has(url) ? filenameByUrl.get(url) : null;
}

console.log('Loaded', allSites.length, 'site configs to validate\n');

/**
 * Validation run.
 *
 * Launches Chrome, visits every loaded site config in turn, classifies the
 * page, and finally deletes the config files judged bad from `sitesDir`.
 *
 * Classification order for a page that loaded:
 *   1. parked-domain text           -> queued for deletion
 *   2. anti-bot / challenge page    -> kept, extraction skipped
 *   3. no casino-related keywords   -> queued for deletion
 *   4. otherwise run the crawler's extraction:
 *        >= 5 results -> kept, 1-4 results -> kept with a note,
 *        0 results    -> queued for deletion
 *
 * A navigation failure (timeout, unreachable host) queues the config for
 * deletion. Any other unexpected error while handling a site is logged and
 * the config is kept, so a browser or crawler fault cannot wipe out configs.
 */
(async function validateSites() {
  const browser = await puppeteer.launch({
    headless: 'new',
    executablePath: '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  // First line of an error's message, capped at 60 chars for the log table.
  const summarizeError = (err) => {
    const message = err && err.message ? err.message : String(err);
    return String(message).split('\n')[0].substring(0, 60);
  };

  try {
    const CasinoCrawlerClass = require('./src/services/crawler.js');
    const crawler = new CasinoCrawlerClass();

    const goodSites = [];
    const badFilenames = [];

    for (let i = 0; i < allSites.length; i++) {
      const siteConfig = allSites[i];
      const filename = findFilenameForSite(siteConfig.url);

      if (!filename) {
        // `i` is a number: it must be converted before using the string-only padEnd.
        console.log(('[' + String(i) + ']').padEnd(7), '| Missing file for:', siteConfig.name, '| SKIP');
        continue;
      }

      let page = null;

      try {
        page = await browser.newPage();
        await page.setUserAgent(
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
        );

        // Navigate — quick timeout to skip slow/bad sites.
        try {
          await page.goto(siteConfig.url, { waitUntil: 'domcontentloaded', timeout: 8000 });
        } catch (navError) {
          // Timeout or nav failure — delete since this is likely a dead/bad URL.
          console.log('⏱️ TIMEOUT-DEL|', filename.padEnd(35), '| ' + summarizeError(navError));
          badFilenames.push(filename);
          continue; // the finally below still closes the page
        }

        // Fixed 2s pause after DOMContentLoaded before sampling the page.
        await new Promise((resolve) => setTimeout(resolve, 2000));

        // Get title + body text for checks (empty text if the page has no readable body).
        const title = await page.title();
        const sampleText = await page
          .evaluate(() => document.body.innerText.substring(0, 800))
          .catch(() => '');
        const sampleLower = sampleText.toLowerCase();

        const isParked =
          sampleLower.includes('this domain is for sale') ||
          sampleLower.includes('godaddy parking');
        const isBlocked =
          title.includes('Cloudflare') ||
          sampleLower.includes('checking your browser');

        if (isParked) {
          // Rule out parked domains.
          console.log('🚫 PARKED |', filename.padEnd(35), '| ' + title.substring(0, 60));
          badFilenames.push(filename);
        } else if (isBlocked) {
          // Blocked by anti-bot — keep for now but skip extraction. This check
          // must precede the keyword check: a challenge page rarely contains
          // casino keywords and would otherwise be deleted as "not casino".
          console.log('⛔ BLOCKED |', filename.padEnd(35), '| ' + title.substring(0, 60));
        } else if (!isCasinoRelated(title, sampleText)) {
          // Not actually a casino review / affiliate site.
          console.log('🚫 NOT CASINO|', filename.padEnd(35), '| ' + title.substring(0, 60));
          badFilenames.push(filename);
        } else {
          // NOTE(review): assumes extractCasinoData resolves to an array of
          // objects with an optional `name` — confirm against the crawler.
          const extracted = await crawler.extractCasinoData(page);

          if (extracted.length >= 5) {
            console.log('✅ OK |', filename.padEnd(35), '| Found:', extracted.length, 'casinos');
            goodSites.push(filename);
          } else if (extracted.length > 0) {
            // Few casinos found — keep but note it, showing at most the first three names.
            const names = Array.from(extracted)
              .slice(0, 3)
              .map((entry) => entry.name || '?');
            console.log('⚠️ FEW |', filename.padEnd(35), '| Found:', extracted.length, '| First3:', names.join(', ').substring(0, 40));
            goodSites.push(filename);
          } else {
            // Casino-related site but couldn't extract anything — needs different extraction.
            console.log('❌ NO DATA |', filename.padEnd(35), '| ' + title.substring(0, 60));
            badFilenames.push(filename);
          }
        }
      } catch (e) {
        // Not a navigation failure (e.g. the browser or the crawler threw):
        // says nothing about the site itself, so the config is kept.
        console.log('💥 ERROR-KEPT |', filename.padEnd(35), '| ' + summarizeError(e));
      } finally {
        if (page) await page.close().catch(() => {});
      }
    }

    // Clean up dead JSON files from src/sites/. De-duplicated because several
    // configs sharing one URL resolve to the same filename.
    let deletedCount = 0;
    for (const badName of new Set(badFilenames)) {
      try {
        fs.unlinkSync(path.join(sitesDir, badName));
        deletedCount++;
      } catch (e) {
        console.warn('Failed to delete', badName);
      }
    }

    console.log('\n========== VALIDATION SUMMARY ==========');
    console.log('✅ Kept:', goodSites.length, 'valid casino review/affiliate sites');
    console.log('🚫 Deleted:', deletedCount, 'junk/parked/timeout files from src/sites/');
  } finally {
    // Always release the browser process, even when the run aborts early.
    await browser.close().catch((closeError) => {
      console.warn('Failed to close browser:', summarizeError(closeError));
    });
  }
})().catch((err) => {
  // Surface launch/setup failures instead of leaving an unhandled rejection.
  console.error('Validation run failed:', err);
  process.exitCode = 1;
});

/**
 * Heuristic check for whether a page looks like a casino / gambling site.
 *
 * The title and the text sample are joined with a single space, lower-cased,
 * and searched for a fixed list of keyword fragments; one hit is enough.
 *
 * @param {string} title - Page title.
 * @param {string} sample - Sample of the page's visible text.
 * @returns {boolean} `true` if any keyword occurs in the combined text.
 */
function isCasinoRelated(title, sample) {
  const haystack = (title + ' ' + sample).toLowerCase();
  // Fragments, not whole words: 'gambl' matches both "gambling" and "gamble".
  const keywords = [
    'casino', 'gambl', 'poker', 'bonus', 'review',
    'real money', 'free spin', 'bookie',
  ];
  return keywords.some((keyword) => haystack.includes(keyword));
}