Initial commit
This commit is contained in:
@@ -0,0 +1,81 @@
|
||||
const fs = require('fs');
|
||||
// Path of the CSV file this script writes. It is relative, so it resolves
// against the current working directory of the process.
const OUT_CSV = './casino_affiliate_sites.csv';
|
||||
|
||||
// File names of the crawler checkpoint files to merge. Each name is looked up
// in the current working directory. The merge keeps the first record seen for
// a domain, so files listed earlier take precedence over later ones.
const CP_NAMES = [
  '.cp.json',
  '.mega_crawl.json',
  '.fast_crawl.json',
  'scraper_checkpoint.json',
  '.search_checkpoint.json',
  '.hybrid.json',
  '.dir_crawl.json',
];
|
||||
|
||||
/**
 * Reduce a URL (or a bare host string) to a lower-case hostname without a
 * leading "www.".
 *
 * The result is used as the de-duplication key for merged records, so every
 * spelling of the same site has to normalise to the same string.
 *
 * @param {*} url - Absolute URL, or a scheme-less string such as
 *   "www.example.com/page". Anything that is not a non-empty string yields ''.
 * @returns {string} Normalised hostname, or '' when none can be derived.
 */
function getDomain(url) {
  if (!url || typeof url !== 'string') return '';

  const text = url.trim();
  let host;
  try {
    host = new URL(text).hostname;
  } catch {
    // Not an absolute URL (typically a missing scheme): drop an http(s)
    // prefix if present and keep what precedes the first path, query or
    // fragment delimiter.
    host = text.replace(/^https?:\/\//i, '').split(/[/?#]/)[0];
  }

  // Lower-case first, then strip "www." on BOTH paths; otherwise
  // "www.example.com" and "https://www.example.com" would produce different
  // keys and the same site would be merged twice.
  host = (host || '').toLowerCase();
  return host.startsWith('www.') ? host.slice(4) : host;
}
|
||||
|
||||
// Merged records keyed by normalised domain: domain -> { url, title, domain }.
// The first record seen for a domain wins, so the order of CP_NAMES decides
// which checkpoint supplies the url/title that is kept.
const MERGED = new Map();

console.log('═══ Merging all checkpoint data... ═══\n');

// Top-level keys under which a checkpoint file may store its array of records.
const CP_ENTRY_KEYS = ['results', 'sites', 'collected', 'verified', 'found'];

for (const cpFile of CP_NAMES) {
  const fullP = process.cwd() + '/' + cpFile;
  if (!fs.existsSync(fullP)) {
    console.log('- ' + cpFile + ': missing');
    continue;
  }

  try {
    const raw = JSON.parse(fs.readFileSync(fullP, 'utf8'));

    // Valid JSON can still be null or a scalar; say so explicitly instead of
    // surfacing an opaque "cannot read properties of null" TypeError.
    if (!raw || typeof raw !== 'object') {
      console.log('! ' + cpFile + ': unexpected JSON content (expected an object)');
      continue;
    }

    const before = MERGED.size;

    // Different checkpoints store data differently - handle all formats.
    // The arrays are walked in place: copying them with push(...array) passes
    // every element as a call argument and throws RangeError on very large
    // checkpoints.
    for (const key of CP_ENTRY_KEYS) {
      const list = raw[key];
      if (!Array.isArray(list)) continue;

      for (const r of list) {
        if (!r || typeof r !== 'object') continue;

        const dUrl = getDomain(r.url);
        if (!dUrl || MERGED.has(dUrl)) continue;

        MERGED.set(dUrl, {
          // getDomain only yields a non-empty key for a non-empty string url,
          // so r.url is always usable here.
          url: r.url,
          // A non-string title must not throw: the catch below is per file,
          // so one bad record would discard the rest of the file.
          title: typeof r.title === 'string' ? r.title.substring(0, 250) : '',
          domain: dUrl,
        });
      }
    }

    const added = MERGED.size - before;
    if (added > 0) console.log('+ ' + cpFile + ': +' + added + ' new');
  } catch (e) {
    // Unreadable file or malformed JSON: report it and carry on with the
    // remaining checkpoints.
    console.log('! ' + cpFile + ': ' + e.message);
  }
}
|
||||
|
||||
// Also merge .dns.json. Its records live under `found`, and a record may carry
// only a `domain` field instead of a `url`.
try {
  const dnsPath = process.cwd() + '/.dns.json';
  if (fs.existsSync(dnsPath)) {
    const dnsData = JSON.parse(fs.readFileSync(dnsPath, 'utf8'));
    if (dnsData && Array.isArray(dnsData.found)) {
      const before = MERGED.size;

      for (const r of dnsData.found) {
        if (!r || typeof r !== 'object') continue;

        // Pass the `domain` fallback through getDomain as well, so the key is
        // always a normalised string and matches keys derived from URLs.
        const fromUrl = getDomain(r.url);
        const dUrl = fromUrl || getDomain(r.domain);
        if (!dUrl || MERGED.has(dUrl)) continue;

        // Only string titles are kept (the CSV writer calls string methods on
        // them); they are capped at 250 characters like checkpoint titles.
        const title = typeof r.title === 'string' && r.title !== '' ? r.title : dUrl;

        MERGED.set(dUrl, {
          // Keep the record's own url only when it produced the key;
          // otherwise build one from the domain.
          url: fromUrl ? r.url : 'https://' + dUrl,
          title: title.substring(0, 250),
          domain: dUrl,
        });
      }

      const added = MERGED.size - before;
      if (added > 0) console.log('+ .dns.json: +' + added + ' new');
    }
  }
} catch (e) {
  // A missing file is handled by existsSync above, so reaching this point
  // means the file is unreadable or not valid JSON. Report it rather than
  // hide it, then continue with what was merged so far.
  console.log('! .dns.json: ' + e.message);
}

console.log('\nTotal unique domains collected: ' + MERGED.size);
|
||||
|
||||
/**
 * Format one value as a quoted CSV field (RFC 4180 style).
 *
 * The field is wrapped in double quotes and embedded double quotes are
 * doubled, so any CSV parser recovers the original text unchanged.
 *
 * @param {*} value - Field value; null/undefined become an empty field and
 *   other non-strings are converted with String().
 * @returns {string} The quoted, escaped field.
 */
function csvField(value) {
  const text = value === undefined || value === null ? '' : String(value);
  return '"' + text.replace(/"/g, '""') + '"';
}

// Write the final CSV: one header line, then one row per merged record,
// sorted by domain.
const header = 'url,title,domain';

// String() keeps the comparator from throwing if a record ever carries a
// non-string domain.
const sorted = [...MERGED.values()].sort((a, b) =>
  String(a.domain).localeCompare(String(b.domain))
);

// Every column is escaped the same way; a double quote inside a url or domain
// would otherwise end the field early and corrupt the row.
const rows = sorted.map((v) => [v.url, v.title, v.domain].map((f) => csvField(f)).join(','));

fs.writeFileSync(OUT_CSV, [header].concat(rows).join('\n'), 'utf8');

console.log('Written to: ' + OUT_CSV);
|
||||
Reference in New Issue
Block a user