Initial commit

This commit is contained in:
Joe
2026-06-26 14:12:10 +02:00
commit 12518b259c
5258 changed files with 732924 additions and 0 deletions
+81
View File
@@ -0,0 +1,81 @@
const fs = require('fs');
// Destination of the merged CSV, relative to the current working directory.
const OUT_CSV = './casino_affiliate_sites.csv';

// Checkpoint files to merge, looked up in the current working directory.
// Order matters: files are read in this sequence and the first one to supply
// a given domain wins.
const CP_NAMES = [
  '.cp.json',
  '.mega_crawl.json',
  '.fast_crawl.json',
  'scraper_checkpoint.json',
  '.search_checkpoint.json',
  '.hybrid.json',
  '.dir_crawl.json',
];
/**
 * Extract a normalized host name from a URL-like string, for use as a
 * de-duplication key.
 *
 * The result is lower-cased and has a single leading "www." removed.
 * Input that the URL parser rejects (typically because it has no scheme, such
 * as "www.example.com/path") is handled by a best-effort fallback that strips
 * an optional http(s) scheme, any path / query / fragment, and a trailing
 * numeric port, so it normalizes to the same key as its fully-qualified form.
 *
 * Note: a string the URL parser accepts but that has no host (for example
 * "mailto:a@b.c", or "host.com:8080" where "host.com" is read as a scheme)
 * yields ''.
 *
 * @param {*} url - Candidate URL; anything that is not a non-empty string yields ''.
 * @returns {string} Normalized host name, or '' when none can be derived.
 */
function getDomain(url) {
  if (!url || typeof url !== 'string') return '';

  // Lower-case first so that "WWW." is recognised as well.
  const stripWww = (host) => {
    const lowered = host.toLowerCase();
    return lowered.startsWith('www.') ? lowered.slice(4) : lowered;
  };

  try {
    return stripWww(new URL(url).hostname || '');
  } catch {
    // new URL() throws on scheme-less or malformed input; salvage the host part.
    const host = url
      .trim()
      .replace(/^https?:\/\//i, '')
      .split(/[/?#]/)[0]
      .replace(/:\d+$/, '');
    return stripWww(host);
  }
}
// Normalized domain -> { url, title, domain }. Entries are never overwritten:
// the first checkpoint to supply a domain wins.
const MERGED = new Map();
console.log('═══ Merging all checkpoint data... ═══\n');
for (const cpFile of CP_NAMES) {
  const fullP = process.cwd() + '/' + cpFile;
  if (!fs.existsSync(fullP)) { console.log('- ' + cpFile + ': missing'); continue; }
  try {
    const raw = JSON.parse(fs.readFileSync(fullP, 'utf8'));
    const before = MERGED.size;
    // Different checkpoints store their entries under different top-level keys;
    // visit every array-valued one, in this fixed order. Iterating each list
    // directly (instead of spreading it into push()) avoids the engine's
    // argument-count limit on very large crawl results.
    for (const key of ['results', 'sites', 'collected', 'verified', 'found']) {
      const list = raw[key];
      if (!Array.isArray(list)) continue;
      for (const r of list) {
        if (!r || typeof r !== 'object') continue;
        const dUrl = getDomain(r.url);
        if (!dUrl || MERGED.has(dUrl)) continue;
        MERGED.set(dUrl, {
          // getDomain() returns '' for anything but a non-empty string, so
          // r.url is a non-empty string whenever this point is reached.
          url: r.url,
          // A non-string title must not throw here: that would abort the rest
          // of this checkpoint via the surrounding catch.
          title: typeof r.title === 'string' ? r.title.substring(0, 250) : '',
          domain: dUrl
        });
      }
    }
    const added = MERGED.size - before;
    if (added > 0) console.log('+ ' + cpFile + ': +' + added + ' new');
  } catch (e) {
    // Unreadable or malformed checkpoint: report it and move on to the next file.
    console.log('! ' + cpFile + ': ' + e.message);
  }
}
// Also merge .dns.json, whose `found` entries may carry a bare `domain`
// instead of (or in addition to) a `url`.
try {
  const dnsPath = process.cwd() + '/.dns.json';
  if (fs.existsSync(dnsPath)) {
    const dnsData = JSON.parse(fs.readFileSync(dnsPath, 'utf8'));
    if (Array.isArray(dnsData.found)) {
      const before = MERGED.size;
      for (const r of dnsData.found) {
        if (!r || typeof r !== 'object') continue;
        // Normalize the bare domain like URL-derived keys (trimmed, lower-case,
        // no leading "www.") so it de-duplicates against them; ignore
        // non-string values, which would break the later sort by domain.
        const bareDomain = typeof r.domain === 'string'
          ? r.domain.trim().toLowerCase().replace(/^www\./, '')
          : '';
        const dUrl = getDomain(r.url) || bareDomain;
        if (!dUrl || MERGED.has(dUrl)) continue;
        const hasUrl = typeof r.url === 'string' && r.url !== '';
        const hasTitle = typeof r.title === 'string' && r.title !== '';
        MERGED.set(dUrl, {
          url: hasUrl ? r.url : 'https://' + dUrl,
          // Same 250-character cap as the other checkpoints; fall back to the domain.
          title: hasTitle ? r.title.substring(0, 250) : dUrl,
          domain: dUrl
        });
      }
      const added = MERGED.size - before;
      if (added > 0) console.log('+ .dns.json: +' + added + ' new');
    }
  }
} catch (e) {
  // A missing file is already handled by existsSync above; anything reaching
  // here is a read or parse failure, so report it instead of hiding it.
  console.log('! .dns.json: ' + e.message);
}
console.log('\nTotal unique domains collected: ' + MERGED.size);
// Write the final CSV, sorted by domain.
const header = 'url,title,domain';
const sorted = [...MERGED.values()].sort((a, b) => a.domain.localeCompare(b.domain));
// Every field is quoted, with embedded double quotes doubled (RFC 4180), so a
// quote inside a URL or title cannot terminate the field early and corrupt
// the row. Missing values become empty fields; other values are stringified.
const rows = sorted.map((v) => [v.url, v.title, v.domain]
  .map((field) => '"' + String(field == null ? '' : field).replace(/"/g, '""') + '"')
  .join(','));
fs.writeFileSync(OUT_CSV, [header].concat(rows).join('\n'), 'utf8');
console.log('Written to: ' + OUT_CSV);