// File-viewer listing metadata at time of capture: 88 lines, 3.0 KiB, JavaScript.
// Load puppeteer-extra and register the stealth plugin on it before any
// browser is launched by the script below.
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');

puppeteer.use(StealthPlugin());

/**
 * Diagnostic entry point.
 *
 * 1. Launches Chrome, visits each URL in `urlsToTest`, runs the crawler's
 *    `extractCasinoData(page)` on it and prints the first few results with a
 *    marker showing whether each name passes `looksLikeBrand`.
 * 2. Queries the DB for the casino names stored for the same two sites and
 *    prints them with the same marker, for side-by-side comparison.
 *
 * The browser and the DB pool are always released, and the process exits with
 * code 0 on success or 1 if anything outside the per-site try/catch fails.
 */
(async () => {
  const SETTLE_DELAY_MS = 3000; // pause after navigation before reading the page
  const MAX_ROWS_SHOWN = 15;    // cap on extractor results printed per site
  const MAX_RAW_LINK_CHARS = 60; // cap on an unparsable link shown verbatim

  const browser = await puppeteer.launch({
    headless: 'new',
    executablePath: '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  let pool;
  try {
    const CasinoCrawlerClass = require('./src/services/crawler');
    const crawler = new CasinoCrawlerClass();

    const urlsToTest = [
      'https://www.askgamblers.com/online-casinos/reviews',
      'https://www.actionnetwork.com/casino',
    ];

    for (const url of urlsToTest) {
      const siteHost = new URL(url).host;
      console.log('\n=== Testing:', siteHost, '===');

      let page;
      try {
        page = await browser.newPage();
        await page.setUserAgent(
          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        );

        await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 12000 });
        // The delay must be awaited, otherwise it has no effect at all.
        await new Promise((resolve) => setTimeout(resolve, SETTLE_DELAY_MS));

        console.log('Page title:', (await page.title()).substring(0, 120));

        // Run all extractors
        const extractedCasinos = await crawler.extractCasinoData(page);
        console.log('Total found by extractors:', extractedCasinos.length);

        const shown = extractedCasinos.slice(0, MAX_ROWS_SHOWN);
        for (const [idx, casino] of shown.entries()) {
          const brandName = casino.name || '[null]';

          let linkDest = '[no link]';
          if (casino.link) {
            try {
              // Resolve against the page URL so relative hrefs are accepted.
              linkDest = new URL(casino.link, url).origin;
            } catch (parseErr) {
              // Unparsable link: show a truncated raw value instead.
              linkDest = String(casino.link).substring(0, MAX_RAW_LINK_CHARS);
            }
          }

          // Same heuristic as the DB listing below, so both are comparable.
          const brandOk = looksLikeBrand(brandName);

          console.log(
            ' #' + String(idx + 1).padEnd(4),
            '|',
            brandOk ? '✅' : '⚠️',
            'Brand:',
            brandName.padEnd(35),
            '|',
            linkDest
          );
        }
      } catch (err) {
        // One failing site must not abort the remaining ones.
        console.error(siteHost + ':', String(err && err.message).split('\n')[0]);
      } finally {
        // Best-effort close; the page may already be gone.
        if (page) await page.close().catch(() => {});
      }
    }

    // Show what the DB actually has for comparison
    pool = require('./src/db').pool;
    console.log('\n\n=== What\'s currently saved in DB ===');
    const result = await pool.query(
      'SELECT DISTINCT ca.casino_name FROM casinos ca JOIN crawls c ON ca.crawl_id = c.id WHERE c.site_name = $1 OR c.site_name = $2 ORDER BY casino_name LIMIT 40',
      ['askgamblers.com', 'actionnetwork.com']
    );

    console.log('Unique "casino names" from the DB for these two recent crawls:');
    for (const row of result.rows) {
      const name = row.casino_name || '[null]';
      // Print every row with a marker. Skipping non-brand rows would make the
      // warning marker unreachable and hide the bad names being looked for.
      console.log(' ', looksLikeBrand(name) ? '✅' : '⚠️', '|', name);
    }
  } finally {
    // Release external resources even when a step above threw.
    if (pool) {
      await pool.end().catch((err) => console.error('pool.end failed:', err.message));
    }
    await browser.close().catch((err) => console.error('browser.close failed:', err.message));
  }
})().then(
  () => process.exit(0),
  (err) => {
    console.error('Fatal:', err && err.message ? err.message : err);
    process.exit(1);
  }
);

/**
 * Heuristic check for whether a string plausibly is a brand name.
 *
 * A value passes when it is a string of at least 3 characters, starts with an
 * ASCII letter, and more than 60% of its characters are ASCII letters.
 *
 * @param {*} s - Candidate value; anything that is not a string yields false.
 * @returns {boolean} True when `s` passes all three checks.
 */
function looksLikeBrand(s) {
  // Guard so null/undefined/non-string input returns false instead of throwing.
  if (typeof s !== 'string' || s.length < 3) {
    return false;
  }
  if (!/^[a-zA-Z]/.test(s)) {
    return false;
  }
  const letterCount = (s.match(/[a-zA-Z]/g) || []).length;
  return letterCount / s.length > 0.6;
}