Initial commit
This commit is contained in:
@@ -0,0 +1,233 @@
|
||||
const { execFile } = require('child_process');
|
||||
const fs = require('fs');
|
||||
|
||||
// File-system locations used by the crawler (all relative to the working directory).

// Checkpoint JSON read at start-up; the main routine expects `sites` and,
// optionally, `verified` arrays inside it.
const CP_FILE = './.cp.json';

// CSV report path (presumably the destination of the verified-site table —
// confirm against the code that writes the report).
const CSV_OUT = './casino_affiliate_sites.csv';

// NOTE(review): not referenced anywhere in the visible code — confirm whether
// a JSON export is still planned before removing it.
const OUTFILE = './seed_expand.json';
|
||||
|
||||
// Casino brand / keyword fingerprints.
//
// deepCountCasinoLinks() looks for these as substrings of lower-cased link
// URLs, and skips any entry of three characters or fewer.
//
// NOTE(review): entries containing a space (e.g. "paddy power") cannot occur
// in the whitespace-free URLs that function extracts, and "mrq" falls under
// its length cut-off — confirm whether those entries are meant for some other
// matcher.
const CASINO_KW = [
  "casino", "bet365", "betfair", "888.", "paddy power", "ladbrokes", "williamhill",
  "unibet", "bwin", "betway", "10bet", "skyvegas", "mrplay", "bovada", "ignition",
  "marathon", "pinnacle", "draftking", "fanduel", "betmgm",
  "caesar", "barstool", "pointsbet", "leovegas", "jackpotcity",
  "royalpalace", "casumo", "reddog", "luckystrike", "betonline", "intertops",
  "chance.com", "betsson", "betclic", "22bet", "1xbet", "stake.",
  "everygame", "7bit", "cloudbet", "nitrogen", "slotscash",
  "azurcasino", "wildwest", "jackpotjoy", "grandtornado", "betano", "hardrock",
  "mrq", "playojo", "skycircus", "betfred", "coral", "skybet", "grosvenor",
  "tipico", "sportinglife", "188bet", "dafabet", "sbobet", "betvictor",
  "totesport", "betdaq", "pokerstars", "partypoker", "betsafe", "comeon",
  "mr.green", "red dog", "spinplanet", "casinozilla", "bonusboss",
  "slotsup", "slotcatalog", "pin-up casino", "bitsler", "playtech",
  "evolution gaming", "microgaming", "netent", "pragmatic play", "play'n go",
  "red tiger", "yggdrasil", "push gaming", "relax gaming", "foxium",
].map((k) => k.toLowerCase()); // keeps the list safe for case-insensitive matching if mixed-case entries are added
|
||||
|
||||
// Host-name patterns for domains that are never worth crawling (social
// networks, search engines, CDNs, app stores, ...). Consumed by isSkip().
//
// Entries ending in "." leave the TLD off, presumably so one entry covers
// every regional variant (e.g. "amazon." for amazon.com / amazon.de).
const SKIP_PATS = [
  "youtube.com", "youtu.be", "reddit.com", "facebook.com", "twitter.com", "x.com",
  "linkedin.com", "tiktok.com", "wikipedia.", "pinterest.", "instagram.",
  "medium.com", "forbes.com", "nytimes.com", "amazon.", "ebay.", "microsoft.",
  "play.google.com", "web.archive.org", "duckduckgo.", "startpage.", "brave.com",
  "t.co", "imgur.com", "flickr.com", "goodreads.", "steamcommunity.",
  "github.com", "stackoverflow.", "stackexchange.", "apple.com", "google.",
  "cdn.", "cdnjs.cloudflare.com", "fonts.googleapis.", "ajax.googleapis.",
  "gravatar.com", "open.graph.facebook.", "platform.twitter.",
];
|
||||
|
||||
/**
 * Pause helper for `await`-based throttling.
 *
 * @param {number} ms - Delay handed to `setTimeout`, in milliseconds.
 * @returns {Promise<void>} Promise that fulfils (with no value) once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
||||
|
||||
/**
 * Reduce a URL to its bare host name: lower-cased, without a leading "www."
 * and without a trailing root dot.
 *
 * Input that has no scheme (e.g. "Example.com/path") is retried with an
 * "https://" prefix so it still resolves to a host. If it cannot be parsed
 * either way, the trimmed, lower-cased input is returned as a last resort.
 *
 * @param {string} url - Absolute URL or scheme-less host/path.
 * @returns {string} Normalised host name; '' for empty, null or undefined input.
 */
function getDomain(url) {
  const raw = String(url ?? '').trim();
  if (raw === '') return '';

  let host;
  try {
    host = new URL(raw).hostname;
  } catch {
    try {
      // `new URL` rejects scheme-less input; give it one and try again.
      host = new URL(`https://${raw}`).hostname;
    } catch {
      return raw.toLowerCase();
    }
  }

  // Lower-case first so the "www." test is case-insensitive for every scheme.
  host = host.toLowerCase().replace(/\.$/, '');
  return host.startsWith('www.') ? host.slice(4) : host;
}
|
||||
|
||||
/**
 * Decide whether a host name belongs to the skip list.
 *
 * Matching respects label boundaries instead of raw substrings, so "t.co"
 * no longer swallows "unibet.com" and "apple.com" no longer swallows
 * "pineapple.com":
 *   - a pattern ending in "." (e.g. "wikipedia.") matches when the host
 *     starts with it or contains it right after a dot;
 *   - any other pattern matches the host itself or any of its sub-domains.
 *
 * @param {string} d - Host name to test (case-insensitive).
 * @param {string[]} [patterns=SKIP_PATS] - Patterns to test against.
 * @returns {boolean} True when the host should be skipped.
 */
function isSkip(d, patterns = SKIP_PATS) {
  if (typeof d !== 'string' || d === '') return false;
  const host = d.toLowerCase();

  return patterns.some((pat) => {
    if (pat.endsWith('.')) {
      return host.startsWith(pat) || host.includes(`.${pat}`);
    }
    return host === pat || host.endsWith(`.${pat}`);
  });
}
|
||||
|
||||
/**
 * Deep count: how many DISTINCT outbound domains does a page link to with a
 * casino keyword somewhere in the link URL?
 *
 * Only absolute http(s) `href` values are considered. `<script>` and
 * `<style>` blocks are removed first so markup embedded in them is not
 * counted. A domain counts as soon as ANY of its links carries a keyword,
 * regardless of the order in which its links appear on the page.
 *
 * @param {string} html - Raw page markup; anything but a non-empty string yields 0.
 * @param {string} selfDomain - The page's own domain; links to it are ignored.
 *   Compared case-insensitively and without a leading "www.".
 * @returns {number} Number of distinct casino-linked outbound domains.
 */
function deepCountCasinoLinks(html, selfDomain) {
  if (typeof html !== 'string' || html === '') return 0;

  // Bring selfDomain into the same shape getDomain() produces for links.
  const own = String(selfDomain ?? '').toLowerCase().replace(/^www\./, '');
  // Very short keywords (3 characters or fewer) are ignored.
  const keywords = CASINO_KW.filter((kw) => kw.length > 3);

  const stripped = html
    .replace(/<script[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style[\s\S]*?<\/style>/gi, ' ');

  const ignored = new Set(); // own / skip-listed domains, remembered to avoid re-testing
  const casinoDomains = new Set();

  for (const [, link] of stripped.matchAll(/href=["'](https?:\/\/[^"'\s>]+)["']/gi)) {
    const d = getDomain(link);
    if (casinoDomains.has(d) || ignored.has(d)) continue;
    if (d === own || isSkip(d)) {
      ignored.add(d);
      continue;
    }

    const lo = link.toLowerCase();
    if (keywords.some((kw) => lo.includes(kw))) casinoDomains.add(d);
  }

  return casinoDomains.size;
}
|
||||
|
||||
/**
 * Extract all outbound link domains from a page (not just casino ones).
 *
 * Only absolute http(s) `href` values are considered, after `<script>` and
 * `<style>` blocks have been removed. The page's own domain and skip-listed
 * domains are left out. Scanning stops as soon as `limit` domains are found.
 *
 * @param {string} html - Raw page markup; anything but a non-empty string yields [].
 * @param {string} selfDomain - The page's own domain. Compared
 *   case-insensitively and without a leading "www.".
 * @param {number} [limit=200] - Maximum number of domains to return.
 * @returns {string[]} Distinct outbound domains in order of first appearance.
 */
function extractOutboundLinks(html, selfDomain, limit = 200) {
  if (typeof html !== 'string' || html === '' || !(limit > 0)) return [];

  // Bring selfDomain into the same shape getDomain() produces for links.
  const own = String(selfDomain ?? '').toLowerCase().replace(/^www\./, '');

  const stripped = html
    .replace(/<script[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style[\s\S]*?<\/style>/gi, ' ');

  const outbounds = new Set();

  for (const [, link] of stripped.matchAll(/href=["'](https?:\/\/[^"'\s>]+)["']/gi)) {
    const d = getDomain(link);
    if (d === own || outbounds.has(d) || isSkip(d)) continue;

    outbounds.add(d);
    if (outbounds.size >= limit) break; // cap reached — no point scanning further
  }

  return [...outbounds];
}
|
||||
|
||||
/**
 * Fetch page HTML via curl (best effort).
 *
 * curl runs silently, follows redirects, sends a desktop-browser user agent,
 * is given a 10 s transfer limit and a 50000-byte `--max-filesize`, and is
 * restricted to http/https. The child process itself is killed after 15 s.
 * Failures are deliberately not reported: the promise resolves with whatever
 * stdout was captured, or '' when there is none.
 *
 * @param {string} url - Address to download.
 * @returns {Promise<string>} Page body, or '' on any failure or when `url`
 *   is empty / not a string (curl is not started in that case).
 */
async function fetchHtml(url) {
  if (typeof url !== 'string' || url.trim() === '') return '';

  const ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124 Safari/537.36';
  const args = [
    '-sL',
    '--max-time', '10',
    '-A', ua,
    '--max-filesize', '50000',
    '--proto', '=http,https', // crawled input must not select file://, ftp://, ...
    '--',                     // end of options: the URL can never be read as a flag
    url,
  ];

  try {
    return await new Promise((resolve) => {
      execFile('curl', args, { timeout: 15000 }, (_err, stdout) => resolve(stdout || ''));
    });
  } catch {
    // execFile can throw synchronously on invalid arguments; treat as "no page".
    return '';
  }
}
|
||||
|
||||
// ═══════ MAIN ═══════
// 1. Load the seed list from the checkpoint file.
// 2. Verify every seed not yet verified: a site passes when its page links to
//    at least MIN_CASINO_LINKS distinct casino domains.
// 3. For up to MAX_ROUNDS rounds, collect the outbound domains of verified
//    sites that have not been scanned yet and verify those in turn.
// 4. Write all verified sites (one row per domain) to the CSV report.
(async () => {
  console.log('═══ Transitive Crawler v11 ── expand seed sites ═══\n');

  const MIN_CASINO_LINKS = 5; // pass threshold for verification
  const CONC = 5;             // concurrent fetches per batch (seed verification)
  const BATCH_V = 5;          // concurrent fetches per batch (expansion verification)
  const MAX_ROUNDS = 5;

  // Load checkpoint from v9
  let cp = null;
  try {
    if (fs.existsSync(CP_FILE)) cp = JSON.parse(fs.readFileSync(CP_FILE, 'utf8'));
  } catch (err) {
    // An unreadable or corrupt checkpoint is reported, then treated as missing.
    console.warn(`Could not read checkpoint: ${err.message}`);
  }

  if (!cp || !Array.isArray(cp.sites) || cp.sites.length === 0) {
    console.log('No checkpoint found. Exiting.');
    return;
  }

  const allSites = [...cp.sites];
  const verified = [...(cp.verified || [])];
  const verifiedDomains = new Set(verified.map((v) => v.domain));
  // Domains whose page was fetched but failed the threshold. They are not
  // fetched again in later rounds. Domains whose fetch returned nothing are
  // NOT recorded, so a transient failure can still be retried.
  const rejectedDomains = new Set();

  const siteUrl = (site) => site.url || `https://${site.domain}`;

  // Fetch and score one batch concurrently, then record each outcome.
  async function verifyBatch(batch) {
    const results = await Promise.all(batch.map(async (site) => {
      const html = await fetchHtml(siteUrl(site));
      return { site, fetched: Boolean(html), count: deepCountCasinoLinks(html, site.domain) };
    }));

    for (const { site, fetched, count } of results) {
      if (count >= MIN_CASINO_LINKS) {
        verified.push({ url: siteUrl(site), title: site.title, domain: site.domain, casinoLinks: count });
        verifiedDomains.add(site.domain);
      } else if (fetched) {
        rejectedDomains.add(site.domain);
      }
    }
  }

  console.log(`Loaded ${allSites.length} seeds. Already verified: ${verified.length}`);
  console.log(`Goal: verify each, then expand via outbound links.\n`);

  // ── Step 1: Verify all seed sites (concurrent BATCH of 5) ───────
  const toCheck = allSites.filter((s) => !verifiedDomains.has(s.domain));
  let totalChecked = 0;

  for (let i = 0; i < toCheck.length; i += CONC) {
    const batch = toCheck.slice(i, i + CONC);
    await verifyBatch(batch);
    totalChecked += batch.length;

    if (totalChecked % 30 === 0) console.log(`Verify: ${totalChecked}/${toCheck.length} → ${verified.length} pass ≥5`);
    await sleep(400);
  }

  console.log(`\nStep 1 done: ${verified.length} verified from ${allSites.length} seeds\n`);

  // ── Step 2: Transitive expansion via outbound links ────────────
  const checkedOutbounds = new Set(); // verified domains whose outbound links were already scanned

  for (let expansionRound = 1; expansionRound <= MAX_ROUNDS; expansionRound++) {
    // Which sites to scan for outbound links this round?
    const verifiedSitesList = [...new Map(verified.map((v) => [v.domain, v])).values()];
    const sitesToScan = verifiedSitesList.filter((s) => !checkedOutbounds.has(s.domain));

    if (sitesToScan.length === 0) {
      console.log(`Round ${expansionRound}: nothing new to scan. Stopping.`);
      break;
    }

    console.log(`══ Expansion round ${expansionRound}: scanning ${sitesToScan.length} verified sites ══`);

    // domain → candidate record; the Map keeps candidates unique as they are found.
    const candidates = new Map();

    for (let si = 0; si < sitesToScan.length; si++) {
      const site = sitesToScan[si];
      checkedOutbounds.add(site.domain);

      try {
        const html = await fetchHtml(siteUrl(site));
        // Pages shorter than 500 characters are treated as failed fetches.
        if (html && html.length >= 500) {
          for (const od of extractOutboundLinks(html, site.domain)) {
            if (verifiedDomains.has(od) || rejectedDomains.has(od) || candidates.has(od)) continue;
            candidates.set(od, { url: `https://${od}`, title: od, domain: od });
          }
        }
      } catch (err) {
        console.warn(` Scan failed for ${site.domain}: ${err.message}`);
      }

      if (si % 30 === 0) console.log(` Scanned ${si}/${sitesToScan.length}, found ${candidates.size} new`);
      await sleep(300);
    }

    const uniqueNew = [...candidates.values()];
    console.log(` → ${uniqueNew.length} unique new domains to verify\n`);

    // Verify the newly discovered domains
    for (let i = 0; i < uniqueNew.length; i += BATCH_V) {
      await verifyBatch(uniqueNew.slice(i, i + BATCH_V));

      if (i % 100 === 0) console.log(` verified ${verified.length} total`);
      await sleep(350);
    }

    console.log(` Round ${expansionRound} done: ${verified.length} total verified\n`);
    await sleep(2000);
  }

  // ── Write final CSV ───────────────
  // Quote every text field and double embedded quotes (RFC 4180).
  const csvField = (value) => `"${String(value ?? '').replace(/"/g, '""')}"`;

  const header = 'url,title,domain,casino_links';
  const allData = [...new Map(verified.map((v) => [v.domain, v])).values()];
  const rows = allData.map((v) =>
    [csvField(v.url), csvField(v.title), csvField(v.domain), v.casinoLinks ?? ''].join(','));

  fs.writeFileSync(CSV_OUT, [header, ...rows].join('\n'), 'utf8');

  console.log(`\n══════════ ${allData.length} verified site → ${CSV_OUT} ══════════`);
})().catch((err) => {
  // Surface unexpected failures instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user