Files
crawler/db-cleanup-3.js
2026-06-26 14:30:45 +02:00

99 lines
4.7 KiB
JavaScript

const { pool } = require('./src/db');
/**
 * One-shot maintenance script: removes rows from the `casinos` table whose
 * `casino_name` is scraped junk (icon labels, payment methods, regulators,
 * navigation text, parked-domain text), then prints the remaining row count
 * and a classified sample of surviving names.
 *
 * Uses `pool` from './src/db' (presumably a node-postgres Pool — the `$1`
 * placeholders and `rowCount` results suggest so; confirm against src/db).
 *
 * The pool is always closed, even when a query fails, and any failure is
 * logged and reported through a non-zero process exit code.
 */
(async () => {
  // Every delete compares against the same normalised form of the name.
  const NAME_EXPR = 'LOWER(TRIM(casino_name))';

  /**
   * Runs one DELETE per entry, sequentially, logging each entry that removed rows.
   * @param {string} comparison - SQL operator placed between the normalised
   *   name and `$1`. Only ever an internal constant ('=' or 'LIKE'); never
   *   pass external input here because it is interpolated into the statement.
   * @param {string[]} values - Names or LIKE patterns to delete.
   * @returns {Promise<void>}
   */
  const deleteEach = async (comparison, values) => {
    const sql = `DELETE FROM casinos WHERE ${NAME_EXPR} ${comparison} $1`;
    for (const value of values) {
      // Lower-casing guards against a mixed-case entry being added to a list later.
      const { rowCount } = await pool.query(sql, [value.toLowerCase()]);
      if (rowCount) console.log('Deleted', rowCount, value);
    }
  };

  /**
   * Heuristic used only for the final report (nothing is deleted based on it).
   * Callers guarantee `name` is non-empty and starts with an ASCII letter.
   * @param {string} name
   * @returns {boolean} true when the name still looks like scraped junk.
   */
  const looksLikeJunk = (name) => {
    const letterCount = (name.match(/[a-zA-Z]/g) || []).length;
    return (
      name.length < 3 ||
      name.length > 40 ||
      letterCount / name.length < 0.5 ||
      /(icon|dmca)/i.test(name)
    );
  };

  try {
    console.log('=== Running targeted cleanup passes ===\n');

    // Pass 1: Delete icon SVG text scraped as brand names.
    // Pattern requires a space somewhere before "icon" (e.g. "facebook icon").
    const iconResult = await pool.query(
      `DELETE FROM casinos WHERE ${NAME_EXPR} LIKE $1`,
      ['% %icon%'],
    );
    if (iconResult.rowCount) console.log('Deleted', iconResult.rowCount, 'icon-text junk');

    // Pass 2: Things over 40 chars are page text, not brands.
    const longResult = await pool.query(
      'DELETE FROM casinos WHERE LENGTH(TRIM(LOWER(casino_name))) > 40',
    );
    if (longResult.rowCount) console.log('Deleted', longResult.rowCount, 'over-length entries');

    // Pass 3: Payment methods/banking that scraped as "casino names".
    // NOTE(review): odd spellings ('bitcoiin', 'neousuf') are kept verbatim —
    // presumably they mirror the scraped text; confirm before "correcting" them.
    const payments = [
      'american express', 'amex', 'applepay', 'apple pay', 'astropay',
      'bank deposit', 'bank transfer', 'bank wire transfer', 'bitcoiin',
      'credit card casinos', 'dogecoin', 'eth', 'ethereum', 'euronext', 'idcredit',
      'immediate payment', 'interac', 'litecoin', 'monero', 'neousuf',
      'online bank transfer', 'paysafe', 'paysafecard', 'phone pe deposit',
      'rapidtransfer', 'rupee deposit', 'skrill', 'solana', 'swish', 'tether',
      'truelayer', 'upi deposit',
    ];
    await deleteEach('=', payments);

    // Pass 4: Gambling regulators/authorities (not casinos).
    // Entries containing % are LIKE patterns; the rest match the whole name.
    const regulators = [
      '%gaming commission%', '%gambling authority%', 'belgian gaming commission',
      'bulgarian national revenue agency', 'cyprus', 'curacao', 'curacao gaming control board',
      'czech', 'danish gambling authority', 'estonian tax and customs board', 'hungary',
      'hellenic licensing and gambling commission', 'irish department of justice',
      'isle of man gambling supervision commission', 'ltv', 'malta',
      'malta gaming authority', 'the netherlands',
    ];
    await deleteEach('LIKE', regulators);

    // Pass 5: Page-structure/junk text that keeps getting scraped.
    // None of these entries contain LIKE wildcards, so exact equality removes
    // the same rows as a LIKE comparison would, without treating a future
    // '_' or '%' in an entry as a wildcard.
    const navJunk = [
      'about us', 'arrow right', 'best online casino sites', 'best online casino usa guide overview',
      'bonus icons', 'bonuses & promos', 'claim bonus', 'contact',
      'dmca protected', 'dmca.com protection', 'editorial policy', 'facebook icon',
      'footer 18 plus', 'get bonus →', 'home icon', 'image description placeholder',
      'kim hultman', 'kryto bonus bis 3.000', 'latest news', 'legal real money casinos',
      'linkedin', 'live dealer', 'menu-img-casinos.png', 'menu-img-games',
      'our guide on online slot rtps', 'paysafecard casinos', 'play responsibily',
      'roulette', 'safe & more 18+', 'share on google+', 'sign up today', 'sitemap',
      'slots.com', 'sportsbetting', 't&c apply 18+', 'terms', 'time to play guide',
      'united kingdom', 'video slots com casino review', 'view articles', 'view top casinos',
      'voice.com z.com rocket.com nfts.com porn.com', 'welcome to betanews',
    ];
    await deleteEach('=', navJunk);

    // Pass 6: Domain-parking/generic text scraped from junk sites.
    // The column is already lower-cased and the entries are wildcard-free, so a
    // single equality test covers what "LIKE ... OR ... ILIKE" matched before.
    const domainJunk = [
      'a godaddybranded afternic', 'atsio', 'bestonlinecasinos',
      'betn.com', 'diamond.com', 'gold.com', 'green.com', 'icon.com',
      'nft.com', 'porn.com', 'rocket.com', 'we.com',
    ];
    await deleteEach('=', domainJunk);

    // Show total remaining and a sample of what's left.
    const total = await pool.query('SELECT COUNT(*) FROM casinos');
    console.log('\nTotal entries in DB after cleanup:', total.rows[0].count);

    const samples = await pool.query(
      'SELECT DISTINCT casino_name FROM casinos WHERE LENGTH(TRIM(casino_name)) BETWEEN 3 AND 40 ORDER BY casino_name LIMIT 80',
    );
    console.log('\n=== Sample of surviving entries ===\n');

    let goodCount = 0;
    let junkCount = 0;
    for (const row of samples.rows) {
      const name = row.casino_name || '';
      // Names that do not start with an ASCII letter are left out of the
      // report entirely (counted as neither good nor bad).
      if (!/^[a-zA-Z]/.test(name)) continue;

      if (looksLikeJunk(name)) {
        console.log('❌', JSON.stringify(name));
        junkCount += 1;
      } else {
        // Strip punctuation for display only; the stored value is untouched.
        console.log('✅', name.replace(/[^a-zA-Z0-9\s&]/g, '').trim());
        goodCount += 1;
      }
    }
    console.log(`\nGood: ${goodCount}, Still bad: ${junkCount}`);
  } finally {
    // Close the pool on success and on failure alike.
    await pool.end();
  }
})().catch((err) => {
  // Surface the failure explicitly instead of leaving a floating rejection.
  console.error('Cleanup failed:', err);
  process.exitCode = 1;
});