Files
crawler/cleanup-bad-names.js
2026-06-26 14:30:45 +02:00

93 lines
3.1 KiB
JavaScript

const { pool } = require('./src/db');
(async () => {
console.log('Cleaning junk entries from casinos table...\n');
// Step 1: Delete rows whose name matches exact known junk patterns
var toDeletePatterns = [
'', 'menu', 'nav', 'home', 'login', 'signup', 'register', 'account',
'support', 'help', 'contact us', 'about us', 'terms', 'privacy policy',
'top online casinos review list',
'gambleaware', 'casino bonus codes not working',
'free spins no deposit', 'welcome bonus',
'crypto casino', 'bitcoin', 'best online casinos'
];
for (const pattern of toDeletePatterns) {
try {
const r = await pool.query(
"DELETE FROM casinos WHERE LOWER(TRIM(casino_name)) = $1",
[pattern.toLowerCase()]
);
if (r.rowCount > 0) {
console.log(' Deleted', r.rowCount, 'matching:', JSON.stringify(pattern));
}
} catch(e) {}
}
// Step 2: Delete rows where name matches LIKE patterns for junk text
var likePatterns = [
'%casino bonus codes not working% ',
'%online casinos & casino sites',
'dmca%',
'crypto bonus%',
'%willkommenspaket%',
'lizenz und spielerschutz bei exclusive casino',
'krypto bonus bis%',
'%deposit match up to%',
'%bonus und freispiele%',
'%exclusive bonuses%',
'%casino reps%',
'%casino bonus codes%',
'get bonus%',
'visit%casino%'
];
for (const pat of likePatterns) {
try {
const r = await pool.query(
"DELETE FROM casinos WHERE LOWER(casino_name) LIKE $1",
[pat.toLowerCase()]
);
if (r.rowCount > 0) {
console.log(' Deleted', r.rowCount, 'matching:', JSON.stringify(pat));
}
} catch(e) {}
}
// Step 3: Delete rows where name looks like phone numbers or prices or is too long
await pool.query(
"DELETE FROM casinos WHERE casino_name ~* '^\\d+\\s?\\d+' OR LOWER(casino_name) LIKE '%trustpilot%' OR LENGTH(casino_name) > 100"
);
// Step 4: Delete rows starting with symbols/emoji/special chars (not real brand names)
await pool.query(
'DELETE FROM casinos WHERE casino_name ~* \'^[\\+\\-\\>\\.]+\' OR casino_name ~ \'^\\\\s+$\''
);
// Count what remains per crawl
const countResult = await pool.query(
"SELECT c.id, c.site_name, COUNT(*) as cnt FROM crawls c LEFT JOIN casinos ca ON ca.crawl_id = c.id GROUP BY c.id, c.site_name ORDER BY c.id DESC LIMIT 30"
);
console.log('\n=== Remaining entries per recent crawl ===');
for (const row of countResult.rows) {
const name = row.site_name || '[failed]';
console.log('Crawl#', String(row.id).padEnd(4), '|', name.padEnd(25), '|', row.cnt, 'entries saved');
}
// Show sample of kept names after cleanup
const keptNames = await pool.query(
"SELECT DISTINCT casino_name FROM casinos WHERE LENGTH(TRIM(casino_name)) > 3 ORDER BY casino_name LIMIT 100"
);
console.log('\n=== Sample of valid names remaining in DB ===');
for (const row of keptNames.rows) {
const n = row.casino_name;
if (!n || !/[a-zA-Z]/.test(n[0])) continue; // skip remaining junk
console.log(' ', n);
}
await pool.end();
})();