Files
crawler/fix-generic-extractor.js
2026-06-26 14:30:45 +02:00

125 lines
5.0 KiB
JavaScript

async extractGeneric(page) {
console.log(' Using generic extractor');
return page.evaluate(() => {
var casinos = [];
var seenNames = {};
function safeText(el, maxLen) {
if (!el) return '';
var t = el.textContent.trim().replace(/\s+/g, ' ');
return maxLen && t.length > maxLen ? t.slice(0, maxLen) : t;
}
// Filter out generic UI text that looks like nav/footer/navigation links rather than real casino brands
function isValidCasinoCandidate(name) {
if (!name || name.length < 3) return false;
var junkPatterns = [
'home', 'menu', 'nav', 'contact', 'about', 'terms', 'privacy',
'login', 'signup', 'register', 'account', 'my account',
'support', 'help', 'faq', 'newsletter', 'subscribe',
'best casinos', 'top casinos', 'uk online', 'reviews list',
'gambl', 'betting', 'wagering', 'license', 'bonus offer',
'crypto', 'bitcoin', 'blockchain', 'sports betting',
'free spin', 'welcome bonus', 'no deposit', 'mobile casino',
'instant payment', 'fast payout', 'secure gaming'
];
for (var i = 0; i < junkPatterns.length; i++) {
if (name.toLowerCase().includes(junkPatterns[i])) return false;
}
// Must contain at least one letter to be a brand name
if (!/[a-zA-Z]/.test(name.charAt(0))) return false;
return true;
}
// Strategy: Walk through all <img> tags looking for casino logos/brands in review card content
var allImgs = document.querySelectorAll('img[alt]');
for (var i = 0; i < allImgs.length && casinos.length < 20; i++) {
var img = allImgs[i];
var altText = img.alt.trim();
if (!isValidCasinoCandidate(altText)) continue;
var cleanName = altText.replace(/[Cc]asino $/i, '').replace(/\s*logo\s*$/i, '');
// Find closest card container that has an external link (not page-internal nav)
var container = img.closest('div[class]');
if (!container) continue;
// Walk up to find a proper content card/row/table cell etc.
var parentCard = null;
while (container && container !== document.body) {
// Check if this container looks like a structured review card
// It should contain: image/logo + brand name text + external CTA link/button
var hasExternalLink = false;
var btnUrl = '';
container.querySelectorAll('a[href]').forEach(function(a) {
try {
var url = new URL(absoluteURL(url.href, document.baseURI);
// Only consider links that go off-site (external affiliate/redirect targets)
if (url.hostname !== location.hostname && !btnUrl) btnUrl = url.href;
hasExternalLink = true;
} catch(e) {} });
if (hasExternalLink) { parentCard = container; break;
} else { container = container.parentElement; }
}
if (!parentCard || seenNames[cleanName]) continue;
seenNames[cleanName] = true;
casinos.push({
position: casinos.length + 1,
name: cleanName.replace(/[^a-zA-Z0-9\s&.]/g, '').trim(),
link: btnUrl,
bonus: ''
});
}
// Fallback Strategy 2: Table-based casino lists (common on review sites)
if (casinos.length === 0) {
var tables = document.querySelectorAll('table');
for (var t = 0; t < Math.min(tables.length, 3); t++) {
var rows = tables[t].querySelectorAll('tr');
for (var r = 0; r < rows.length && casinos.length < 20; r++) {
var cells = rows[r].querySelectorAll('td, th');
if (!cells[ci].querySelector('img')) hasImg = true;
if (!cells[ci].querySelector('a')) hasLink = true;
}
if (!hasImg && !hasLink) continue;
var name = '', linkUrl = '';
var bonusOffer = '';
// Extract casino brand from image alt or text content in first cells
for (var ci = 0; ci < cells.length; ci++) {
var img2 = cells[ci].querySelector('img');
var anchor = cells[ci].querySelector('a');
// Prefer logo alt over raw cell text
if (img2 && img2.alt) name = safeText(img2, 100);
else if (!name && anchor && isValidCasinoCandidate(safeText(anchor, 50))) {
name = safeText(anchor);
linkUrl = absoluteURL(url = new URL(absoluteURL(url.absoluteURL(url.href, document.baseURI).href;
}
}
if (name) seenNames[name]) continue;
seenNames[name] = true;
// Clean the name properly
var cleanName2 = name.replace(/[^a-zA-Z0-9\s&.]/g, '').trim();
casinos.push({
position: casinos.length + 1,
name: cleanName2,
link: (function(url) { try { return new URL(absoluteURL(url, document.baseURI).href; } catch(e) { return url || ''; } })(linkUrl),
bonus: bonusOffer.trim()
});
}
}
return casinos.slice(0, 20);
});
}