Files
crawler/site-inspector.js
T
2026-06-26 14:30:45 +02:00

139 lines
5.0 KiB
JavaScript

#!/usr/bin/env node
// Site Inspector - compact DOM pattern discovery for casino listing pages
// Usage: node site-inspector.js "<url>"
const puppeteer = require('puppeteer-extra');
puppeteer.use(require('puppeteer-extra-plugin-stealth')());
// Chrome binary to drive. Defaults to the stock macOS install location; set the
// CHROME_PATH environment variable to run elsewhere or against another build.
const CHROME_PATH =
  process.env.CHROME_PATH || '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome';
// Upper bound for the initial navigation.
const NAV_TIMEOUT_MS = 60000;
// Fixed pause after DOMContentLoaded so client-side rendering can populate the page.
const SETTLE_DELAY_MS = 4000;

/**
 * Parse the command-line argument as an absolute URL.
 *
 * @param {string} raw - Value passed on the command line.
 * @returns {URL|null} Parsed URL, or null when `raw` is not an absolute URL.
 */
function parseTargetUrl(raw) {
  try {
    return new URL(raw);
  } catch (err) {
    return null;
  }
}

/**
 * Runs INSIDE the page (serialized by `page.evaluate`), so it must stay fully
 * self-contained: no references to Node-side variables or helpers.
 *
 * Scans candidate containers and reports those that hold at least one <img>
 * with a usable alt text, together with the off-site origins they link to.
 *
 * @param {string} siteHost - Hostname of the inspected site, leading "www." removed.
 * @returns {Array<{cls: string, tag: string, alts: string[], ext: string[], bonus: boolean}>}
 *   At most 20 card descriptors, in document order of the candidates.
 */
function collectCards(siteHost) {
  const MAX_CARDS = 20;
  const MAX_ORIGINS = 5;

  const root =
    document.querySelector('main, article, .content, .container, [class*="wrapper"]') ||
    document.body;

  // A Set keeps first-seen order and prevents the fallback scan below from
  // re-adding the direct children a second time (which duplicated cards).
  const candidates = new Set(root.querySelectorAll(':scope > [class]'));
  if (candidates.size < 3) {
    document.querySelectorAll('[class]').forEach(function (el) {
      candidates.add(el);
    });
  }

  const cards = [];
  for (const card of candidates) {
    if (cards.length >= MAX_CARDS) break;

    const alts = [];
    card.querySelectorAll('img[alt]').forEach(function (img) {
      const alt = (img.alt || '').trim();
      if (alt.length >= 2 && alt.length < 80) alts.push(alt);
    });
    if (!alts.length) continue;

    // Collect distinct off-site origins linked from inside the card.
    const origins = [];
    const seenHosts = new Set();
    for (const link of card.querySelectorAll('a[href]')) {
      if (origins.length >= MAX_ORIGINS) break;
      let target;
      try {
        target = new URL(link.href, document.baseURI);
      } catch (err) {
        continue; // unparsable href: best-effort scan, just skip it
      }
      // mailto:, tel:, javascript: etc. have an empty hostname and a "null"
      // origin; they are not off-site destinations.
      if (target.protocol !== 'http:' && target.protocol !== 'https:') continue;
      // Strip only a leading "www." (a plain string replace also hit "www."
      // in the middle of a hostname).
      const host = target.hostname.replace(/^www\./, '');
      if (host === siteHost || seenHosts.has(host)) continue;
      if (/google|facebook|twitter|instagram/i.test(host)) continue;
      seenHosts.add(host);
      origins.push(target.origin);
    }

    // "free spins" is matched case-insensitively ("50 Free Spins" is the usual
    // casing); "FS" and currency amounts keep their original case-sensitive form.
    const text = card.textContent || '';
    const hasBonusText = /\d+\s*free\s*spins/i.test(text) || /FS|[£€$]\d+/.test(text);

    cards.push({
      // getAttribute returns the raw string even for SVG elements, whose
      // className is an object rather than a string.
      cls: (card.getAttribute('class') || '').substring(0, 80),
      tag: card.tagName,
      alts: alts.slice(0, 6),
      ext: origins.slice(0, 3),
      bonus: hasBonusText
    });
  }
  return cards;
}

/**
 * Runs INSIDE the page (serialized by `page.evaluate`); must stay self-contained.
 *
 * Counts how often each usable alt text occurs across all images on the page.
 *
 * @returns {Array<[string, number]>} Up to 25 [altText, count] pairs, most frequent first.
 */
function countAltTexts() {
  const counts = {};
  document.querySelectorAll('img[alt]').forEach(function (img) {
    const alt = (img.alt || '').trim();
    if (alt.length >= 2 && alt.length < 80) counts[alt] = (counts[alt] || 0) + 1;
  });
  return Object.entries(counts)
    .sort(function (left, right) {
      return right[1] - left[1];
    })
    .slice(0, 25);
}

/** Print the detected card containers, or a placeholder when there are none. */
function printCards(cards) {
  if (!cards.length) {
    console.log('(no card containers detected)\n');
    return;
  }
  console.log('=== CARD CONTAINERS ===\n');
  cards.forEach(function (card, index) {
    console.log(`Card ${index + 1}: <${card.tag} class="${card.cls}">`);
    console.log(` imgs: ${JSON.stringify(card.alts)}`);
    if (card.ext.length) console.log(` ext: ${card.ext.join(', ')}`);
    if (card.bonus) console.log(' bonus: text found');
    console.log('');
  });
}

/** Print the alt-text frequency table, flagging entries that mention "casino". */
function printAltPatterns(altCounts) {
  if (!altCounts.length) return;
  console.log('=== ALT TEXT PATTERNS ===\n');
  for (const [text, count] of altCounts) {
    const mark = /casino/i.test(text) ? ' *CASINO*' : '';
    console.log(` x${String(count).padStart(3, ' ')} "${text}"${mark}`);
  }
}

/** Print selector and brand-extraction hints derived from the first casino-like card. */
function printHints(cards) {
  console.log('\n=== HINTS ===\n');
  const isCasinoAlt = function (alt) {
    return /casino/i.test(alt);
  };
  const first = cards.find(function (card) {
    return card.alts.some(isCasinoAlt);
  });
  if (!first) {
    console.log('No casino-card containers found.');
    console.log('Check alt-text list for patterns.');
    return;
  }

  // An empty class attribute would otherwise yield the useless selector ".".
  const classNames = first.cls.split(/\s+/).filter(Boolean);
  if (classNames.length) console.log(`Card class hint: .${classNames[0]}`);
  if (classNames.length > 1) console.log(`Full class: ${first.cls}`);
  console.log(`Has ext links: ${first.ext.length > 0}`);

  const sampleAlt = first.alts.find(isCasinoAlt) || '';
  if (sampleAlt) {
    console.log(`Sample alt: "${sampleAlt}"`);
    // Suggested brand: everything before the first " casino" word in the alt text.
    const brand = sampleAlt.replace(/\s+casino\b.*/i, '');
    console.log(`Brand would be: "${brand}"`);
  }
}

/**
 * Entry point: load the URL given as the first CLI argument, inspect its DOM
 * and print card containers, alt-text patterns and selector hints to stdout.
 * Progress and error messages go to stderr.
 *
 * Exits with status 1 when the argument is missing or not an absolute URL.
 * Any failure after the browser has started rejects the returned promise; the
 * browser is closed either way.
 */
async function main() {
  const url = process.argv[2];
  if (!url) {
    console.log('Usage: node site-inspector.js "<url>"');
    process.exit(1);
  }
  const target = parseTargetUrl(url);
  if (!target) {
    // Fail before launching Chrome instead of crashing inside page.goto().
    console.error('Invalid URL (include the scheme, e.g. https://): ' + url);
    process.exit(1);
  }
  // Use the parsed hostname (no port, no credentials) so it is comparable with
  // the `URL.hostname` values computed inside the page.
  const siteHost = target.hostname.replace(/^www\./, '');

  const browser = await puppeteer.launch({
    headless: 'new',
    executablePath: CHROME_PATH,
    // NOTE(review): --no-sandbox disables Chrome's sandbox; kept from the
    // original configuration - confirm it is still required.
    args: ['--no-sandbox']
  });

  try {
    const page = await browser.newPage();
    // NOTE(review): this UA string carries no Chrome/Safari token; kept as-is,
    // but confirm it does not interfere with the stealth plugin's own override.
    await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36');
    console.error('\n>> Loading: ' + url);
    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: NAV_TIMEOUT_MS });
    await new Promise(function (resolve) {
      setTimeout(resolve, SETTLE_DELAY_MS);
    });

    const title = await page.title();
    console.log(`\n=== Title: ${title.substring(0, 80)}\n`);

    // --- Phase 1: find card-like containers with imgs + off-site links ---
    const cards = await page.evaluate(collectCards, siteHost);
    // --- Phase 2: top alt-text patterns across page ---
    const altCounts = await page.evaluate(countAltTexts);

    // --- Output ---
    printCards(cards);
    printAltPatterns(altCounts);
    // --- recommended selector hints ---
    printHints(cards);
  } finally {
    // Always release the Chrome process, including on navigation timeouts or
    // evaluation errors; closing the browser also closes its pages.
    await browser.close();
  }
}

main().catch(function (err) {
  console.error('\n!! Inspection failed: ' + (err && err.message ? err.message : err));
  process.exitCode = 1;
});