Initial commit
This commit is contained in:
@@ -0,0 +1,69 @@
|
||||
// puppeteer-extra wraps Puppeteer so that plugins can be registered on it.
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');

// Register the stealth plugin before any browser is launched.
puppeteer.use(StealthPlugin());
|
||||
|
||||
// Test the fixed extractor against 5 real casino review sites from src/sites/
|
||||
// and see if it actually extracts proper brand names or still grabs page junk.
|
||||
|
||||
/**
 * Smoke test for the crawler's extractor.
 *
 * Launches a local Chrome, visits each URL in `testUrls`, runs
 * `crawler.extractCasinoData(page)` on it and prints the first 15 extracted
 * entries, flagging each name with the result of `looksLikeBrand`.
 * Nothing is returned; all output goes to the console.
 */
(async () => {
  const browser = await puppeteer.launch({
    headless: 'new',
    executablePath: '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  // Everything after the launch is wrapped so the browser is closed even when
  // loading the crawler or an unexpected error aborts the run.
  try {
    const CasinoCrawlerClass = require('./src/services/crawler');
    const crawler = new CasinoCrawlerClass();

    const testUrls = [
      // Known-good affiliate review sites from previous crawls
      'https://chipy.com/casinos', // Was: ✅ working fine before
      'https://gamezinger.com/online-casinos/', // Was: ✅ good extraction
      'https://vegasinsider.com/casinos/', // Has table + card layouts on review pages
      'https://casino.guru/casino-reviews', // Large multi-country casino listing site
      'https://bettergambling.ie/casino-sites/', // Irish affiliate with proper structure
    ];

    // How many extracted entries to print per site.
    const maxShown = 15;

    for (const url of testUrls) {
      // Host name without a leading "www." is used as the label in the output.
      const siteName = new URL(url).hostname.replace(/^www\./, '');
      let page = null;

      try {
        page = await browser.newPage();
        await page.setUserAgent(
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
        );

        // Go to site — quick timeout since we're just testing extraction, not full crawl
        await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 8000 });
        // Fixed pause after DOMContentLoaded before reading the page.
        await new Promise((resolve) => setTimeout(resolve, 2000));

        // The title is printed so block pages / captchas are easy to spot in the output.
        const title = await page.title();

        // Run extractors via Puppeteer directly (no database saves)
        const extracted = await crawler.extractCasinoData(page);
        console.log(`\n=== ${siteName} === | Title: ${title} | Found:`, extracted.length);

        for (const [index, entry] of extracted.slice(0, maxShown).entries()) {
          // Entries may lack a usable name or link; print placeholders instead of throwing.
          const name = typeof entry.name === 'string' ? entry.name : String(entry.name);
          const link = typeof entry.link === 'string' ? entry.link : '(no link)';
          const nameOk = looksLikeBrand(name);

          console.log(
            ' #' + String(index + 1).padStart(3),
            '|',
            nameOk ? '✅' : '⚠️',
            '| Brand:',
            name.padEnd(35),
            '| Link:',
            link.substring(0, 60)
          );
        }
      } catch (err) {
        // Only the first line of the message is shown to keep the report compact.
        const message = String(err && err.message ? err.message : err).split('\n')[0];
        console.error('Test of', siteName + ':', message);
      } finally {
        // Best-effort cleanup: a failure to close one page must not stop the remaining sites.
        if (page) await page.close().catch(() => {});
      }
    }
  } finally {
    await browser.close();
  }
})().catch((err) => {
  // Reached when the browser cannot be launched/closed or the crawler cannot be loaded.
  console.error('Extractor test aborted:', err);
  process.exitCode = 1;
});
|
||||
|
||||
/**
 * Heuristic check that an extracted string looks like a brand name rather
 * than page junk.
 *
 * A name passes when it is a string of 3-50 characters, starts with an ASCII
 * letter, and more than half of its characters are ASCII letters.
 * NOTE(review): "mostly letters" is interpreted as a ratio above 0.5 — confirm
 * the intended threshold.
 *
 * @param {*} name - Candidate brand name; non-string values are rejected.
 * @returns {boolean} True when the name passes every check.
 */
function looksLikeBrand(name) {
  if (typeof name !== 'string') return false;
  if (name.length < 3 || name.length > 50) return false;
  if (!/^[a-zA-Z]/.test(name)) return false;

  // Spaces, digits and symbols all count against the letter ratio.
  const letterCount = (name.match(/[a-zA-Z]/g) || []).length;
  return letterCount / name.length > 0.5;
}
|
||||
Reference in New Issue
Block a user