38 lines
1.9 KiB
JavaScript
38 lines
1.9 KiB
JavaScript
const { pool } = require('./src/db');
|
|
|
|
async function run() {
|
|
// Keep each junk-name pattern in its own variable and pass it as a bound query parameter, so the SQL strings contain no nested quotes for the Write tool to mangle.
|
|
var p1 = '%best online casino%';
|
|
var p2 = '%icon%';
|
|
var p3 = '%deposit match up to%';
|
|
var p4 = '%exclusive bonus%';
|
|
var p5 = 'best welcome bonus';
|
|
|
|
console.log('Step 1 - long page titles/headers');
|
|
await pool.query("DELETE FROM casinos WHERE LOWER(TRIM(casino_name)) LIKE $1 AND LENGTH(TRIM(casino_name)) > 15", [p1]);
|
|
|
|
console.log('Step 2 - icon SVG text scraped as brand names');
|
|
await pool.query("DELETE FROM casinos WHERE LOWER(TRIM(casino_name)) LIKE $1", [p2]);
|
|
|
|
console.log('Step 3 - pricing/promotional bonus text');
|
|
await pool.query("DELETE FROM casinos WHERE LOWER(TRIM(casino_name)) LIKE $1 OR LOWER(TRIM(casino_name)) LIKE $2", [p3, p4]);
|
|
|
|
console.log('Step 4 - specific known junk strings from DB sample');
|
|
await pool.query("DELETE FROM casinos WHERE LOWER(TRIM(casino_name)) = $1", [p5]);
|
|
|
|
console.log('Step 5 - single emoji/symbol entries and paragraphs that are NEVER brands');
|
|
await pool.query("DELETE FROM casinos WHERE LENGTH(TRIM(casino_name)) < 3 OR LENGTH(TRIM(casino_name)) > 40");
|
|
|
|
// Step 6 - content section headers such as "Best Paying Casinos" stored as casino names
|
|
var payingPat = '%paying casinos%';
|
|
await pool.query("DELETE FROM casinos WHERE LOWER(TRIM(casino_name)) LIKE $1", [payingPat]);
|
|
|
|
// Count the casino entries that survived the clean-up, grouped per crawl (the 30 highest crawl ids)
|
|
var countResult = await pool.query(
|
|
"SELECT c.id, c.site_name, COUNT(ca.casino_name) as cnt FROM crawls c JOIN casinos ca ON ca.crawl_id = c.id GROUP BY c.id, c.site_name ORDER BY c.id DESC LIMIT 30"
|
|
);
|
|
|
|
console.log('\n=== Entries per recent crawl ===');
|
|
for (const row of countResult.rows) {
|
|
var name = row.site_name || '[failed]';
|
|
console.log('Crawl#', String(row.id).padEnd(4), '|', name.padEnd(25), '|', row.cnt, 'entries' |