Initial commit
This commit is contained in:
@@ -0,0 +1,98 @@
|
||||
const { pool } = require('./src/db');
|
||||
|
||||
// One-off maintenance script: removes rows from the `casinos` table whose
// `casino_name` is scraped junk (icon text, page copy, payment methods,
// regulators, navigation text, parked-domain names), then prints a count and
// a sample of what is left.
//
// NOTE(review): `pool` comes from ./src/db and is used here as if it exposes
// `query(sql, params)` resolving to `{ rows, rowCount }` plus `end()` —
// presumably a node-postgres Pool; confirm against ./src/db.
// NOTE(review): the deletes are destructive and are not wrapped in a
// transaction; a failure part-way leaves earlier passes applied.
(async () => {
  // Every pass compares against the trimmed, lower-cased name.
  const DELETE_EXACT = "DELETE FROM casinos WHERE LOWER(TRIM(casino_name)) = $1";
  const DELETE_LIKE = "DELETE FROM casinos WHERE LOWER(TRIM(casino_name)) LIKE $1";

  // Runs `sql` once per entry (entry lower-cased and bound as $1) and logs
  // the entry whenever rows were removed. Kept sequential so the log lines
  // appear in list order.
  const deleteEach = async (sql, entries) => {
    for (const entry of entries) {
      const result = await pool.query(sql, [entry.toLowerCase()]);
      if (result.rowCount) console.log('Deleted', result.rowCount, entry);
    }
  };

  try {
    console.log('=== Running targeted cleanup passes ===\n');

    // Pass 1: Delete icon SVG text scraped as brand names
    const iconPattern = '% %icon%';
    const iconResult = await pool.query(DELETE_LIKE, [iconPattern]);
    if (iconResult.rowCount) console.log('Deleted', iconResult.rowCount, 'icon-text junk');

    // Pass 2: Things over 40 chars are page text not brands
    const longResult = await pool.query(
      "DELETE FROM casinos WHERE LENGTH(TRIM(LOWER(casino_name))) > 40"
    );
    // Logged like every other pass so the run output accounts for all deletions.
    if (longResult.rowCount) console.log('Deleted', longResult.rowCount, 'over-length entries');

    // Pass 3: Payment methods/banking that scraped as "casino names"
    const payments = ['american express', 'amex', 'applepay', 'apple pay', 'astropay',
      'bank deposit', 'bank transfer', 'bank wire transfer', 'bitcoiin',
      'credit card casinos', 'dogecoin', 'eth', 'ethereum', 'euronext', 'idcredit',
      'immediate payment', 'interac', 'litecoin', 'monero', 'neousuf',
      'online bank transfer', 'paysafe', 'paysafecard', 'phone pe deposit',
      'rapidtransfer', 'rupee deposit', 'skrill', 'solana', 'swish', 'tether',
      'truelayer', 'upi deposit'];
    await deleteEach(DELETE_EXACT, payments);

    // Pass 4: Gambling regulators/authorities (not casinos).
    // Entries containing % are LIKE patterns; the rest match the whole name.
    const regs = ['%gaming commission%', '%gambling authority%', 'belgian gaming commission',
      'bulgarian national revenue agency', 'cyprus', 'curacao', 'curacao gaming control board',
      'czech', 'danish gambling authority', 'estonian tax and customs board', 'hungary',
      'hellenic licensing and gambling commission', 'irish department of justice',
      'isle of man gambling supervision commission', 'ltv', 'malta',
      'malta gaming authority', 'the netherlands'];
    await deleteEach(DELETE_LIKE, regs);

    // Pass 5: Page-structure/junk text that keeps getting scraped
    const navJunk = ['about us', 'arrow right', 'best online casino sites', 'best online casino usa guide overview',
      'bonus icons', 'bonuses & promos', 'claim bonus', 'contact',
      'dmca protected', 'dmca.com protection', 'editorial policy', 'facebook icon',
      'footer 18 plus', 'get bonus →', 'home icon', 'image description placeholder',
      'kim hultman', 'kryto bonus bis 3.000', 'latest news', 'legal real money casinos',
      'linkedin', 'live dealer', 'menu-img-casinos.png', 'menu-img-games',
      'our guide on online slot rtps', 'paysafecard casinos', 'play responsibily',
      'roulette', 'safe & more 18+', 'share on google+', 'sign up today', 'sitemap',
      'slots.com', 'sportsbetting', 't&c apply 18+', 'terms', 'time to play guide',
      'united kingdom', 'video slots com casino review', 'view articles', 'view top casinos',
      'voice.com z.com rocket.com nfts.com porn.com', 'welcome to betanews'];
    await deleteEach(DELETE_LIKE, navJunk);

    // Pass 6: Domain-parking/generic text scraped from junk sites.
    // The column is already lower-cased and so is the bound value, so a single
    // LIKE covers what the former "LIKE $1 OR ... ILIKE $2" pair matched.
    const domainJunk = ['a godaddybranded afternic', 'atsio', 'bestonlinecasinos',
      'betn.com', 'diamond.com', 'gold.com', 'green.com', 'icon.com',
      'nft.com', 'porn.com', 'rocket.com', 'we.com'];
    await deleteEach(DELETE_LIKE, domainJunk);

    // Show total remaining and sample of what's left
    const total = await pool.query('SELECT COUNT(*) FROM casinos');
    console.log('\nTotal entries in DB after cleanup:', total.rows[0].count);

    const samples = await pool.query(
      "SELECT DISTINCT casino_name FROM casinos WHERE LENGTH(TRIM(casino_name)) BETWEEN 3 AND 40 ORDER BY casino_name LIMIT 80"
    );

    console.log('\n=== Sample of surviving entries ===\n');
    let goodN = 0;
    let junkN = 0;

    for (const row of samples.rows) {
      const name = row.casino_name || '';

      // Names that do not start with an ASCII letter are skipped entirely
      // (neither printed nor counted).
      if (!/^[a-zA-Z]/.test(name)) continue;

      // The length is re-checked on the untrimmed value; the query above only
      // bounds the trimmed length.
      const letterCount = (name.match(/[a-zA-Z]/g) || []).length;
      const stillJunk =
        name.length < 3 || name.length > 40 ||
        letterCount / name.length < 0.5 ||
        /(icon|dmca)/i.test(name);

      if (stillJunk) {
        console.log('❌', JSON.stringify(name));
        junkN++;
      } else {
        console.log('✅', name.replace(/[^a-zA-Z0-9\s&]/g, '').trim());
        goodN++;
      }
    }

    console.log(`\nGood: ${goodN}, Still bad: ${junkN}`);
  } finally {
    // Always release the pool, even when a query above fails.
    await pool.end();
  }
})().catch((err) => {
  // Report the failure and signal it through the exit status instead of
  // leaving the rejection unhandled.
  console.error('Cleanup failed:', err);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user