Initial commit
This commit is contained in:
@@ -0,0 +1,87 @@
|
||||
const puppeteer = require('puppeteer-extra');
|
||||
var StealthPlugin = require('puppeteer-extra-plugin-stealth');
|
||||
puppeteer.use(StealthPlugin());
|
||||
|
||||
/**
 * Debug script: runs the project's casino extractors against a fixed list of
 * review sites and prints the first results, then prints the casino names
 * currently stored in the database for the same sites so the two can be
 * compared by eye.
 *
 * Side effects: launches a local Chrome via puppeteer, performs network
 * requests, queries the database, writes to stdout/stderr, and terminates the
 * process (exit code 0 on success, 1 on an unrecoverable error).
 */
(async () => {
  const MAX_ROWS_SHOWN = 15;
  const NAV_TIMEOUT_MS = 12000;
  const SETTLE_DELAY_MS = 3000;

  const browser = await puppeteer.launch({
    headless: 'new',
    executablePath: '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    args: ['--no-sandbox', '--disable-setuid-sandbox']
  });

  let pool;
  try {
    const CasinoCrawlerClass = require('./src/services/crawler');
    const crawler = new CasinoCrawlerClass();

    const urlsToTest = [
      'https://www.askgamblers.com/online-casinos/reviews',
      'https://www.actionnetwork.com/casino'
    ];

    for (const url of urlsToTest) {
      const siteHost = url.split('/')[2];
      console.log('\n=== Testing:', siteHost, '===');

      let page;
      try {
        page = await browser.newPage();
        await page.setUserAgent(
          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        );

        await page.goto(url, { waitUntil: 'domcontentloaded', timeout: NAV_TIMEOUT_MS });
        // Give client-side rendering a moment to populate the page. The delay
        // must be awaited, otherwise extraction starts immediately.
        await new Promise((resolve) => setTimeout(resolve, SETTLE_DELAY_MS));

        console.log('Page title:', (await page.title()).substring(0, 120));

        // Run all extractors.
        // NOTE(review): assumes extractCasinoData resolves to an array of
        // objects with optional `name` and `link` fields - confirm against
        // ./src/services/crawler.
        const extracted = await crawler.extractCasinoData(page);
        console.log('Total found by extractors:', extracted.length);

        const shown = Math.min(extracted.length, MAX_ROWS_SHOWN);
        for (let i = 0; i < shown; i++) {
          const casino = extracted[i];
          const brandName = casino.name || '[null]';

          let linkDest = '[no link]';
          if (casino.link) {
            try {
              // Resolve against the page URL so relative links also yield an origin.
              linkDest = new URL(casino.link, url).origin;
            } catch (parseErr) {
              // Not parseable as a URL: show a truncated raw value instead.
              linkDest = String(casino.link).substring(0, 60);
            }
          }

          // Same heuristic as the DB section below, so both listings agree.
          const brandOk = looksLikeBrand(brandName);

          console.log(
            ' #' + (i + 1).toString().padEnd(4),
            '|',
            brandOk ? '✅' : '⚠️',
            'Brand:',
            brandName.padEnd(35),
            '| Link:',
            linkDest
          );
        }
      } catch (err) {
        // One failing site must not abort the run; report the first line only.
        const message = String((err && err.message) || err).split('\n')[0];
        console.error(siteHost + ':', message);
      } finally {
        // Best-effort: the page may already be closed or crashed.
        if (page) await page.close().catch(() => {});
      }
    }

    // Show what the DB actually has for comparison
    ({ pool } = require('./src/db'));
    console.log('\n\n=== What\'s currently saved in DB ===');
    const r = await pool.query(
      'SELECT DISTINCT ca.casino_name FROM casinos ca JOIN crawls c ON ca.crawl_id = c.id WHERE c.site_name = $1 OR c.site_name = $2 ORDER BY casino_name LIMIT 40',
      ['askgamblers.com', 'actionnetwork.com']
    );

    console.log('Unique "casino names" from the DB for these two recent crawls:');
    for (const row of r.rows) {
      const name = row.casino_name || '[null]';
      // Only show ones that look like real casino brands
      if (!looksLikeBrand(name)) continue;
      console.log(' ', '✅', '|', name);
    }
  } finally {
    // Release external resources even when a step above throws.
    if (pool) {
      await pool.end().catch((endErr) => {
        console.warn('Failed to close DB pool:', endErr && endErr.message);
      });
    }
    await browser.close().catch((closeErr) => {
      console.warn('Failed to close browser:', closeErr && closeErr.message);
    });
  }

  process.exit(0);
})().catch((err) => {
  // Surface unexpected failures instead of leaving an unhandled rejection.
  console.error('Fatal:', err);
  process.exit(1);
});
|
||||
|
||||
/**
 * Heuristic test for whether a scraped string looks like a brand name.
 *
 * A string passes when all of the following hold:
 *  - it is at least 3 characters long,
 *  - its first character is an ASCII letter,
 *  - more than 60% of its characters are ASCII letters.
 *
 * @param {string} s - Candidate name.
 * @returns {boolean} True when the string satisfies all three conditions.
 */
function looksLikeBrand(s) {
  // Reject anything shorter than three characters.
  if (!(s.length >= 3)) {
    return false;
  }

  const startsWithLetter = /[a-zA-Z]/.test(s[0]);
  if (!startsWithLetter) {
    return false;
  }

  // Share of ASCII letters in the whole string.
  const letters = s.match(/[a-zA-Z]/g);
  const letterCount = letters === null ? 0 : letters.length;
  return letterCount / s.length > 0.6;
}
|
||||
Reference in New Issue
Block a user