Files
crawler/clean-sites.js
T
2026-06-26 14:30:45 +02:00

75 lines
2.7 KiB
JavaScript

const path = require('path');
const fs = require('fs');
// Directory holding one JSON config per crawled site (resolved against the
// current working directory, as before).
const sitesDir = './src/sites';

// Config files flagged by the validation run; these are always deleted,
// without even looking at their contents.
const BAD_CONFIG_FILES = new Set([
  'best-7-online-casinos.com.json', // Timeout
  'best-9-online-casinos.com.json', // DNS fail
  'best-casino-list.com.json', // Timeout
  'best-casino.xyz.json', // Afternic parked
  'best-casinos.com.json', // Afternic parked
  'best-online-casinos.info.json', // Timeout
  'best.unitestatesonlinecasino.net.json', // Not casino content
  'casadelalmendro.com.json', // Non-casino, timeout
  'cabinjohn.org.json', // Not a casino site
  'delawarepark.betrivers.com.json', // Cloudflare block
  'exclusive-casino.com.json', // No data extracted
]);

// URL fragments of domain-parking / for-sale landing pages. These contain a
// path component, so they are matched against the whole URL.
const PARKED_URL_MARKERS = ['forsale.godaddy', 'afternic.com/forsale'];

// TLDs whose configs are treated as junk. Matched against the END of the
// hostname only, so "www.sitejabber.com" or ".../file.xyz" are not hit.
const JUNK_TLDS = ['.xyz', '.site'];

// Generic casino directory domains with no real content. Matched against the
// hostname (exact or subdomain). 'safe-casino.xyz' is also covered by
// JUNK_TLDS; it is kept here so the list stays complete on its own.
const GENERIC_SITES = [
  'best-casino.net',
  'safe-casino.xyz',
  'ultimate-casinos.com',
  'super-online-casinos.net',
];

/**
 * Lists the `.json` file names directly inside `dir`.
 * @param {string} dir - Directory to scan.
 * @returns {string[]} File names (not full paths).
 */
function listJsonFiles(dir) {
  return fs.readdirSync(dir).filter((name) => name.endsWith('.json'));
}

/**
 * Extracts the lower-cased hostname from a URL string.
 * Scheme-less values such as "example.com/path" are accepted.
 * @param {string} url - URL as stored in a site config.
 * @returns {string} Hostname without trailing dot, or '' if unparseable.
 */
function hostnameOf(url) {
  const hasScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(url);
  try {
    const parsed = new URL(hasScheme ? url : 'http://' + url);
    return parsed.hostname.toLowerCase().replace(/\.$/, '');
  } catch (err) {
    // Not a URL at all: report "no hostname" so nothing is deleted for it.
    return '';
  }
}

/**
 * Decides whether a config URL identifies a junk site.
 * Checks run in the same order as the log labels: parked, junk TLD, generic.
 * @param {string} rawUrl - The `url` field of a site config ('' if absent).
 * @returns {?string} 'PARKED', 'JUNK TLD', 'GENERIC', or null to keep it.
 */
function junkLabelForUrl(rawUrl) {
  const url = rawUrl.toLowerCase();
  if (PARKED_URL_MARKERS.some((marker) => url.includes(marker))) {
    return 'PARKED';
  }

  const host = hostnameOf(url);
  if (JUNK_TLDS.some((tld) => host.endsWith(tld))) {
    return 'JUNK TLD';
  }
  if (GENERIC_SITES.some((site) => host === site || host.endsWith('.' + site))) {
    return 'GENERIC';
  }
  return null;
}

/**
 * Reads a config file and returns its `url` field.
 * @param {string} filePath - Path of the JSON config.
 * @returns {string} The URL, or '' when the field is missing or not a string.
 * @throws {Error} If the file cannot be read or is not valid JSON.
 */
function readConfigUrl(filePath) {
  const data = JSON.parse(fs.readFileSync(filePath, 'utf8'));
  const hasUrl = data !== null && typeof data === 'object' && typeof data.url === 'string';
  return hasUrl ? data.url : '';
}

/**
 * Deletes one config file and logs the outcome; a failed deletion is
 * reported but never aborts the run.
 * @param {string} filePath - Path of the file to delete.
 * @param {string} fileName - Name shown in the log line.
 * @param {string} label - Reason tag ('DELETE', 'PARKED', 'JUNK TLD', 'GENERIC').
 */
function removeConfig(filePath, fileName, label) {
  try {
    fs.unlinkSync(filePath);
    console.log('🚫 ' + label + ':', fileName);
  } catch (err) {
    console.warn('Failed to delete', fileName, '-', err.message);
  }
}

/**
 * Removes junk/parked/dead site configs from `dir` and prints a summary.
 * Configs that cannot be read or parsed are reported and left in place.
 * @param {string} dir - Directory containing the `.json` site configs.
 */
function cleanSites(dir) {
  const jsonFiles = listJsonFiles(dir);
  console.log('Loaded', jsonFiles.length, 'configs\n');

  for (const fileName of jsonFiles) {
    const filePath = path.join(dir, fileName);

    // Explicitly listed files go first so that a corrupt body cannot
    // protect them from deletion.
    if (BAD_CONFIG_FILES.has(fileName)) {
      removeConfig(filePath, fileName, 'DELETE');
      continue;
    }

    let url;
    try {
      url = readConfigUrl(filePath);
    } catch (err) {
      console.warn('Skipping unreadable config', fileName, '-', err.message);
      continue;
    }

    const label = junkLabelForUrl(url);
    if (label !== null) {
      removeConfig(filePath, fileName, label);
    }
  }

  // Count what is actually left on disk, so failed deletions are not
  // reported as deleted.
  const finalFiles = listJsonFiles(dir);
  console.log('\n========== SUMMARY ==========');
  console.log('✅ Kept:', finalFiles.length, 'valid site configs in src/sites/');
  console.log('🚫 Deleted:', jsonFiles.length - finalFiles.length, 'junk/parked configs');
}

if (fs.existsSync(sitesDir)) {
  cleanSites(sitesDir);
} else {
  // Fail with a readable message (and a non-zero exit code) instead of an
  // ENOENT stack trace when the script is run from the wrong directory.
  console.error('Sites directory not found:', sitesDir);
  process.exitCode = 1;
}