Initial commit
This commit is contained in:
@@ -0,0 +1,92 @@
|
||||
const { pool } = require('./src/db');
|
||||
|
||||
// Names (compared after LOWER(TRIM(...))) that are deleted on exact match.
const EXACT_JUNK_NAMES = [
  '', 'menu', 'nav', 'home', 'login', 'signup', 'register', 'account',
  'support', 'help', 'contact us', 'about us', 'terms', 'privacy policy',
  'top online casinos review list',
  'gambleaware', 'casino bonus codes not working',
  'free spins no deposit', 'welcome bonus',
  'crypto casino', 'bitcoin', 'best online casinos',
];

// SQL LIKE patterns (compared against LOWER(casino_name)) that mark junk text.
const LIKE_JUNK_PATTERNS = [
  // Fixed: the original had a stray trailing space after the closing '%',
  // which restricted the match to names that end with a space.
  '%casino bonus codes not working%',
  '%online casinos & casino sites',
  'dmca%',
  'crypto bonus%',
  '%willkommenspaket%',
  'lizenz und spielerschutz bei exclusive casino',
  'krypto bonus bis%',
  '%deposit match up to%',
  '%bonus und freispiele%',
  '%exclusive bonuses%',
  '%casino reps%',
  '%casino bonus codes%',
  'get bonus%',
  'visit%casino%',
];

/**
 * Runs one parameterised DELETE per pattern and logs how many rows each removed.
 *
 * A failing statement is reported and the loop moves on to the next pattern,
 * so one bad pattern does not abort the rest of the cleanup.
 *
 * @param {string} sql - DELETE statement with a single `$1` placeholder.
 * @param {string[]} patterns - Values bound to `$1`, lower-cased before use.
 * @returns {Promise<void>}
 */
async function deleteByPatterns(sql, patterns) {
  for (const pattern of patterns) {
    try {
      const result = await pool.query(sql, [pattern.toLowerCase()]);
      if (result.rowCount > 0) {
        console.log('  Deleted', result.rowCount, 'matching:', JSON.stringify(pattern));
      }
    } catch (err) {
      // Previously swallowed silently; surface the failure but keep going.
      console.error('  Failed to delete rows matching', JSON.stringify(pattern) + ':', err.message);
    }
  }
}

/**
 * Cleanup script: removes junk rows from the `casinos` table, then prints
 * the number of remaining entries per recent crawl and a sample of the
 * names that were kept. The pool is always closed, even on failure.
 */
(async () => {
  try {
    console.log('Cleaning junk entries from casinos table...\n');

    // Step 1: Delete rows whose name matches exact known junk patterns
    await deleteByPatterns(
      'DELETE FROM casinos WHERE LOWER(TRIM(casino_name)) = $1',
      EXACT_JUNK_NAMES
    );

    // Step 2: Delete rows where name matches LIKE patterns for junk text
    await deleteByPatterns(
      'DELETE FROM casinos WHERE LOWER(casino_name) LIKE $1',
      LIKE_JUNK_PATTERNS
    );

    // Step 3: Delete rows where name looks like phone numbers or prices,
    // mentions trustpilot, or is longer than 100 characters.
    // NOTE(review): '^\d+\s?\d+' matches any name that starts with two or more
    // digits, which would also remove names such as "888 Casino" - confirm
    // this is intended before running against production data.
    const numericResult = await pool.query(
      "DELETE FROM casinos WHERE casino_name ~* '^\\d+\\s?\\d+' OR LOWER(casino_name) LIKE '%trustpilot%' OR LENGTH(casino_name) > 100"
    );
    if (numericResult.rowCount > 0) {
      console.log('  Deleted', numericResult.rowCount, 'numeric / trustpilot / over-long names');
    }

    // Step 4: Delete rows whose name starts with '+', '-', '>' or '.', or
    // consists only of whitespace.
    // Fixed: the whitespace regex was double-escaped and reached PostgreSQL as
    // '^\\s+$' (a literal backslash followed by "s" characters) instead of '^\s+$'.
    const symbolResult = await pool.query(
      "DELETE FROM casinos WHERE casino_name ~* '^[\\+\\-\\>\\.]+' OR casino_name ~ '^\\s+$'"
    );
    if (symbolResult.rowCount > 0) {
      console.log('  Deleted', symbolResult.rowCount, 'symbol-prefixed / whitespace-only names');
    }

    // Count what remains per crawl.
    // COUNT(ca.crawl_id) rather than COUNT(*): with a LEFT JOIN, COUNT(*) counts
    // the NULL-extended row and reports 1 for a crawl that has no casinos left.
    const countResult = await pool.query(
      'SELECT c.id, c.site_name, COUNT(ca.crawl_id) as cnt FROM crawls c LEFT JOIN casinos ca ON ca.crawl_id = c.id GROUP BY c.id, c.site_name ORDER BY c.id DESC LIMIT 30'
    );

    console.log('\n=== Remaining entries per recent crawl ===');
    for (const row of countResult.rows) {
      const name = row.site_name || '[failed]';
      console.log('Crawl#', String(row.id).padEnd(4), '|', name.padEnd(25), '|', row.cnt, 'entries saved');
    }

    // Show sample of kept names after cleanup
    const keptNames = await pool.query(
      'SELECT DISTINCT casino_name FROM casinos WHERE LENGTH(TRIM(casino_name)) > 3 ORDER BY casino_name LIMIT 100'
    );

    console.log('\n=== Sample of valid names remaining in DB ===');
    for (const row of keptNames.rows) {
      const n = row.casino_name;
      // Only print names whose first character is an ASCII letter.
      if (!n || !/[a-zA-Z]/.test(n[0])) continue; // skip remaining junk
      console.log('  ', n);
    }
  } catch (err) {
    // Without this handler a rejected query became an unhandled rejection.
    console.error('Cleanup failed:', err);
    process.exitCode = 1;
  } finally {
    // Always release the connections so the process can exit.
    await pool.end();
  }
})();
|
||||
Reference in New Issue
Block a user