/**
 * Merges the crawler checkpoint files found in the current working directory
 * into one de-duplicated CSV (one row per domain, sorted by domain).
 *
 * Each checkpoint is a JSON object that may keep its site records under any of
 * several array-valued keys (see ENTRY_KEYS). The first record seen for a
 * domain wins; later duplicates are ignored.
 */
const fs = require('fs');

const OUT_CSV = './casino_affiliate_sites.csv';

// Checkpoint files are processed in this order, so earlier files take
// precedence when the same domain appears in more than one of them.
const CP_NAMES = [
  '.cp.json',
  '.mega_crawl.json',
  '.fast_crawl.json',
  'scraper_checkpoint.json',
  '.search_checkpoint.json',
  '.hybrid.json',
  '.dir_crawl.json',
];

// Keys under which a checkpoint may store its array of site records.
const ENTRY_KEYS = ['results', 'sites', 'collected', 'verified', 'found'];

// Titles taken from the regular checkpoints are capped at this many characters.
const MAX_TITLE_LENGTH = 250;

/**
 * Extracts a normalized domain (lower-case, without a leading "www.") from a URL.
 *
 * @param {*} url - Absolute URL or bare host/path such as "example.com/page".
 * @returns {string} The normalized host, or '' when none can be derived.
 */
function getDomain(url) {
  if (!url || typeof url !== 'string') return '';
  let host;
  try {
    host = new URL(url).hostname || '';
  } catch {
    // Not an absolute URL (e.g. "www.example.com/page"): drop any scheme and
    // keep what precedes the first path, query or fragment delimiter.
    host = url.trim().replace(/^https?:\/\//i, '').split(/[/?#]/)[0] || '';
  }
  host = host.toLowerCase();
  // Strip "www." on both paths so "www.example.com" and
  // "https://www.example.com" map to the same key.
  return host.startsWith('www.') ? host.slice(4) : host;
}

/**
 * Returns the value unchanged when it is a string, otherwise ''.
 * Guards against checkpoints that store numbers/objects where text is expected.
 *
 * @param {*} value
 * @returns {string}
 */
function asString(value) {
  return typeof value === 'string' ? value : '';
}

/**
 * Gathers the record arrays stored under the given keys of a parsed checkpoint.
 *
 * @param {*} raw - Parsed checkpoint JSON (anything JSON.parse can return).
 * @param {string[]} keys - Property names to look for; non-array values are skipped.
 * @returns {Array} All records found, in key order.
 */
function collectEntries(raw, keys) {
  if (!raw || typeof raw !== 'object') return [];
  let entries = [];
  for (const key of keys) {
    // concat instead of push(...list): spreading a very large array into
    // function arguments can exceed the engine's argument limit.
    if (Array.isArray(raw[key])) entries = entries.concat(raw[key]);
  }
  return entries;
}

/**
 * Adds every record with a not-yet-seen domain to `merged`.
 *
 * @param {Array} entries - Candidate records; non-object items are ignored.
 * @param {Map<string, {url: string, title: string, domain: string}>} merged -
 *   Accumulator keyed by normalized domain; mutated in place.
 * @param {{allowDomainField?: boolean}} [options] - When `allowDomainField` is
 *   true (the .dns.json format), a record without a usable `url` may supply
 *   its host via `domain`, and an empty title defaults to the domain instead
 *   of being truncated.
 * @returns {number} How many new domains were added.
 */
function mergeEntries(entries, merged, options = {}) {
  const { allowDomainField = false } = options;
  let added = 0;
  for (const entry of entries) {
    if (!entry || typeof entry !== 'object') continue;
    const domain =
      getDomain(entry.url) || (allowDomainField ? getDomain(entry.domain) : '');
    if (!domain || merged.has(domain)) continue;
    const title = asString(entry.title);
    merged.set(domain, {
      url: asString(entry.url) || `https://${domain}`,
      title: allowDomainField
        ? title || domain
        : title.substring(0, MAX_TITLE_LENGTH),
      domain,
    });
    added += 1;
  }
  return added;
}

/**
 * Reads and parses a checkpoint file from the current working directory.
 *
 * @param {string} fileName - File name relative to process.cwd().
 * @returns {*} The parsed JSON, or `undefined` when the file does not exist.
 * @throws {Error} When the file cannot be read or is not valid JSON.
 */
function loadCheckpoint(fileName) {
  const fullPath = process.cwd() + '/' + fileName;
  if (!fs.existsSync(fullPath)) return undefined;
  return JSON.parse(fs.readFileSync(fullPath, 'utf8'));
}

/**
 * Quotes a value as a CSV field, doubling embedded double quotes (RFC 4180).
 *
 * @param {*} value
 * @returns {string}
 */
function toCsvField(value) {
  const text = value == null ? '' : String(value);
  return `"${text.replace(/"/g, '""')}"`;
}

/**
 * Formats one merged record as a CSV row in `url,title,domain` order.
 *
 * @param {{url: string, title: string, domain: string}} record
 * @returns {string}
 */
function toCsvRow(record) {
  // Titles keep the existing convention of turning double quotes into
  // apostrophes, so output for previously valid rows is unchanged.
  const title = asString(record.title).replace(/"/g, "'");
  return [toCsvField(record.url), toCsvField(title), toCsvField(record.domain)].join(',');
}

const MERGED = new Map();

console.log('═══ Merging all checkpoint data... ═══\n');

for (const cpFile of CP_NAMES) {
  try {
    const raw = loadCheckpoint(cpFile);
    if (raw === undefined) {
      console.log(`- ${cpFile}: missing`);
      continue;
    }
    // Different checkpoints store data under different keys - handle all of them.
    const added = mergeEntries(collectEntries(raw, ENTRY_KEYS), MERGED);
    if (added > 0) console.log(`+ ${cpFile}: +${added} new`);
  } catch (e) {
    console.log(`! ${cpFile}: ${e.message}`);
  }
}

// .dns.json keeps its records under `found` and may identify a site by
// `domain` instead of `url`. A missing file is skipped silently; read or
// parse failures are reported like those of the other checkpoints.
try {
  const dnsData = loadCheckpoint('.dns.json');
  if (dnsData !== undefined) {
    const added = mergeEntries(collectEntries(dnsData, ['found']), MERGED, {
      allowDomainField: true,
    });
    if (added > 0) console.log(`+ .dns.json: +${added} new`);
  }
} catch (e) {
  console.log(`! .dns.json: ${e.message}`);
}

console.log(`\nTotal unique domains collected: ${MERGED.size}`);

// Write the final CSV, sorted by domain.
const header = 'url,title,domain';
const sorted = [...MERGED.values()].sort((a, b) => a.domain.localeCompare(b.domain));
const rows = sorted.map(toCsvRow);
fs.writeFileSync(OUT_CSV, [header, ...rows].join('\n'), 'utf8');
console.log(`Written to: ${OUT_CSV}`);