Files
crawler/test-extractors-fix.js
T
2026-06-26 14:30:45 +02:00

70 lines
2.7 KiB
JavaScript

const puppeteer = require('puppeteer-extra');
var StealthPlugin = require('puppeteer-extra-plugin-stealth');
puppeteer.use(StealthPlugin());
// Test the fixed extractor against 4 real casino review sites from src/sites/
// and see if it actually extracts proper brand names or still grabs page junk.
/**
 * Manual smoke test for the casino extractor.
 *
 * Launches Chrome through puppeteer-extra, visits each URL in `testUrls`,
 * runs `crawler.extractCasinoData(page)` on the loaded page and prints the
 * first few extracted entries together with a ✅/⚠️ verdict from
 * `looksLikeBrand`, so a human can eyeball whether real brand names or page
 * junk were extracted.
 *
 * A failure on one site is logged and the run continues with the next site.
 * The browser is always closed, even when setup or a site test throws.
 */
(async () => {
  // How many extracted entries to print per site.
  const MAX_ROWS = 15;

  const browser = await puppeteer.launch({
    headless: 'new',
    // Allow overriding the Chrome binary (e.g. on Linux/CI); falls back to the
    // original macOS location so existing local runs behave as before.
    executablePath:
      process.env.PUPPETEER_EXECUTABLE_PATH ||
      '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    args: ['--no-sandbox', '--disable-setuid-sandbox']
  });

  try {
    // Loaded inside the try so a broken import/constructor cannot leave a
    // Chrome process running.
    const CasinoCrawlerClass = require('./src/services/crawler');
    const crawler = new CasinoCrawlerClass();

    const testUrls = [
      // Known-good affiliate review sites from previous crawls
      'https://chipy.com/casinos', // Was: ✅ working fine before
      'https://gamezinger.com/online-casinos/', // Was: ✅ good extraction
      'https://vegasinsider.com/casinos/', // Has table + card layouts on review pages
      'https://casino.guru/casino-reviews', // Large multi-country casino listing site
      'https://bettergambling.ie/casino-sites/', // Irish affiliate with proper structure
    ];

    for (const url of testUrls) {
      // Host name without a leading "www." — used only as a label in the output.
      const siteName = new URL(url).hostname.replace(/^www\./, '');
      let page = null;
      try {
        page = await browser.newPage();
        await page.setUserAgent(
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
        );
        // Go to site — quick timeout since we're just testing extraction, not full crawl
        await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 8000 });
        // Short grace period after DOMContentLoaded before extracting.
        await new Promise((resolve) => setTimeout(resolve, 2000));
        const title = await page.title();

        // Run the extractor directly on the Puppeteer page.
        // NOTE(review): the original comment says this does no database
        // saves — confirm against the crawler service.
        const extracted = await crawler.extractCasinoData(page);
        console.log('\n===', siteName + '===' + '| Found:', extracted.length);
        console.log('  Title:', title);

        for (const [index, entry] of extracted.slice(0, MAX_ROWS).entries()) {
          // Normalise to strings so one malformed entry cannot abort the
          // whole site report via padEnd/substring on a non-string.
          const name = entry && entry.name != null ? String(entry.name) : '';
          const link = entry && typeof entry.link === 'string' ? entry.link : '(no link)';
          const nameOk = looksLikeBrand(name);
          console.log(
            ' #' + String(index + 1).padStart(3),
            '|',
            nameOk ? '✅' : '⚠️',
            '| Brand:',
            name.padEnd(35),
            '| Link:',
            link.substring(0, 60)
          );
        }
      } catch (err) {
        // Only the first line of the message: Puppeteer errors can be multi-line.
        const message = String(err && err.message ? err.message : err);
        console.error('Test of', siteName + ':', message.split('\n')[0]);
      } finally {
        // Best-effort close; a failure to close a page must not mask the test result.
        if (page) await page.close().catch(() => {});
      }
    }
  } finally {
    await browser.close();
  }
})().catch((err) => {
  // Setup failures (e.g. Chrome failed to launch) would otherwise surface as
  // an unhandled rejection.
  console.error('Extractor test aborted:', err);
  process.exitCode = 1;
});
/**
 * Heuristic check that an extracted string looks like a casino brand name
 * rather than page junk.
 *
 * Rules (applied to the whitespace-trimmed value):
 *  - must be a string of 3-50 characters,
 *  - must start with an ASCII letter,
 *  - at least half of its characters must be ASCII letters, so names such as
 *    "Bet365" pass while strings dominated by digits/symbols fail.
 *
 * Known limitation: brands that start with a digit (e.g. "888 ...") or use
 * non-ASCII letters are reported as not brand-like.
 *
 * @param {*} name - Candidate brand name; non-string values are rejected.
 * @returns {boolean} true when the value passes all rules above.
 */
function looksLikeBrand(name) {
  if (typeof name !== 'string') return false;

  const candidate = name.trim();
  if (candidate.length < 3 || candidate.length > 50) return false;
  if (!/^[a-zA-Z]/.test(candidate)) return false;

  const letterCount = (candidate.match(/[a-zA-Z]/g) || []).length;
  return letterCount / candidate.length >= 0.5;
}