mirror of
https://github.com/pezkuwichain/phishing.git
synced 2026-04-21 23:48:01 +00:00
a4bb93cb33
- Update phishing URL to phishing.pezkuwichain.io - Update author to Dijital Kurdistan Tech Inst - Update README with Pezkuwi project references - Update IPFS reference to pezkuwichain.io - Rebrand all package references from @polkadot to @pezkuwi
279 lines
6.2 KiB
JavaScript
279 lines
6.2 KiB
JavaScript
// Copyright 2020-2026 @pezkuwi/phishing authors & contributors
|
|
// SPDX-License-Identifier: Apache-2.0
|
|
|
|
import fs from 'node:fs';
|
|
|
|
// @ts-expect-error @pezkuwi/dev scripts don't have .d.ts files
|
|
import { mkdirpSync, rimrafSync } from '@pezkuwi/dev/scripts/util.mjs';
|
|
|
|
/** @typedef {{ allow: string[]; deny: string[]; denySub: string[] }} AllList */
|
|
|
|
const KNOWN_URLS = ['telegra.ph', 'twitter.com', 'youtube.com', 'x.com'];
|
|
|
|
/**
 * Normalizes a url for list storage: the protocol is always dropped,
 * then either the host-plus-path is kept (with empty segments, and thus
 * any trailing /, removed) or only the bare host is returned.
 *
 * @param {string} url
 * @param {boolean} [allowSub] when true, keep the path part as well
 * @returns {string}
 */
function sanitizeUrl (url, allowSub) {
  // anything before the first :// is the protocol; when there is no
  // protocol at all, the url is already naked
  const [prefix, suffix] = url.split('://');
  const naked = suffix === undefined
    ? prefix
    : suffix;
  const segments = naked.split('/');

  if (!allowSub) {
    // host part only, the path is dropped entirely
    return segments[0];
  }

  // host + path, with empty segments (double or trailing /) stripped
  return segments.filter((s) => !!s).join('/');
}
|
|
|
|
/**
 * Sanitizes every entry in the list and removes duplicates, keeping the
 * first-seen order of the sanitized values.
 *
 * @param {string[]} list
 * @param {boolean} [allowSub] passed through to sanitizeUrl
 * @returns {string[]}
 */
function filterSection (list, allowSub) {
  // a Set preserves insertion order, so this matches a first-wins dedupe
  const unique = new Set();

  for (const entry of list) {
    unique.add(sanitizeUrl(entry, allowSub));
  }

  return [...unique];
}
|
|
|
|
/**
 * Sanitizes, dedupes and then alphabetically sorts the supplied list.
 *
 * @param {string[]} list
 * @param {boolean} [allowSub] passed through to sanitizeUrl
 * @returns {string[]}
 */
function sortSection (list, allowSub) {
  // filterSection always returns a fresh array, so in-place sort is safe
  const result = filterSection(list, allowSub);

  result.sort((a, b) => a.localeCompare(b));

  return result;
}
|
|
|
|
/**
 * Checks whether the url is a sub-domain of any entry already in the list.
 * The full url itself and the bare top-level label are never tested.
 *
 * @param {string[]} list
 * @param {string} url
 * @returns {boolean}
 */
function isSubdomain (list, url) {
  let suffix = url.split('.');

  // drop the leading label one step at a time, testing each remaining
  // suffix of at least two labels against the known list
  while (suffix.length > 2) {
    suffix = suffix.slice(1);

    if (list.includes(suffix.join('.'))) {
      // a parent domain is already listed
      return true;
    }
  }

  return false;
}
|
|
|
|
/**
 * Collapses throw-away wildcard hosts down to a stable parent. Currently
 * only *.plesk.page hosts are flattened; everything else passes through.
 *
 * @param {string} url
 * @returns {string}
 */
function flattenUrl (url) {
  // currently we only check for plesk-page to flatten
  if (url.endsWith('plesk.page')) {
    const labels = url.split('.');

    // keep at most the last three labels, e.g. a.b.x.plesk.page -> x.plesk.page
    if (labels.length > 3) {
      return labels.slice(-3).join('.');
    }
  }

  return url;
}
|
|
|
|
/**
 * Removes entries that are sub-domains of other entries in the same list,
 * flattens known wildcard hosts and dedupes the result.
 *
 * @param {string[]} list
 * @returns {string[]}
 */
function rewriteSubs (list) {
  /** @type {string[]} */
  const kept = [];

  for (const url of list) {
    // a parent entry already covers any of its sub-domains
    if (!isSubdomain(list, url)) {
      kept.push(flattenUrl(url));
    }
  }

  // flattening may have produced duplicates, so filter once more
  return filterSection(kept);
}
|
|
|
|
/**
 * Rebuilds the site -> addresses map with sanitized site keys (sorted
 * alphabetically) and sorted, deduped address lists. Entries that
 * sanitize to the same key are merged.
 *
 * @param {Record<string, string[]>} values
 * @returns {Record<string, string[]>}
 */
function sortAddresses (values) {
  /** @type {[string, string[]][]} */
  const entries = Object
    .entries(values)
    .map(([key, addresses]) =>
      /** @type {[string, string[]]} */ ([sanitizeUrl(key), addresses])
    );

  // sort by the sanitized site key
  entries.sort(([a], [b]) => a.localeCompare(b));

  /** @type {Record<string, string[]>} */
  const all = {};

  for (const [key, addresses] of entries) {
    if (!all[key]) {
      all[key] = [];
    }

    // merge, keeping each address only once per site
    for (const addr of sortSection(addresses)) {
      if (!all[key].includes(addr)) {
        all[key].push(addr);
      }
    }
  }

  return all;
}
|
|
|
|
/**
 * Adds any site urls found in the address book to the deny list.
 *
 * A url qualifies when it looks like a bare hostname (contains a dot, no
 * spaces, no path separator) and is not already allowed, already denied
 * or one of the well-known non-phishing hosts.
 *
 * @param {AllList} param0
 * @param {Record<string, string>} values
 * @returns {string[]} a new array combining deny and the additions; the
 * caller-owned deny array is left untouched
 */
function addSites ({ allow, deny }, values) {
  // work on a copy so we never mutate the caller's deny array
  // (the original used deny itself as the reduce accumulator)
  const result = [...deny];
  // Set membership is O(1) vs the previous repeated Array#includes scans
  const seen = new Set(result);

  for (const url of Object.keys(values)) {
    const isHostLike = url.includes('.') && !url.includes(' ') && !url.includes('/');

    if (isHostLike && !seen.has(url) && !allow.includes(url) && !KNOWN_URLS.includes(url)) {
      seen.add(url);
      result.push(url);
    }
  }

  return result;
}
|
|
|
|
/**
 * Reads and parses a JSON file from disk.
 *
 * @param {string} file
 * @returns {any}
 */
function readJson (file) {
  const raw = fs.readFileSync(file, 'utf-8');

  return JSON.parse(raw);
}
|
|
|
|
/**
 * Serializes the contents as tab-indented JSON (with a trailing newline)
 * and writes it to the given file.
 *
 * @param {string} file
 * @param {unknown} contents
 */
function writeJson (file, contents) {
  const json = JSON.stringify(contents, null, '\t');

  fs.writeFileSync(file, `${json}\n`);
}
|
|
|
|
/**
 * Loads the full meta history: the index file lists the known months and
 * each month file holds its entries, flattened here into a single list.
 *
 * @returns {{ date: string; url: string; }[]}
 */
function readMeta () {
  /** @type {string[]} */
  const months = readJson('meta/index.json');

  return months.flatMap((month) => readJson(`meta/${month}.json`));
}
|
|
|
|
/**
 * Splits the meta entries by calendar month (the YYYY-MM prefix of each
 * date), writes one file per month plus an index of all months sorted
 * newest-first.
 *
 * @param {{ date: string; url: string; }[]} meta
 */
export function writeMeta (meta) {
  /** @type {Record<string, { date: string; url: string; }[]>} */
  const byMonth = {};

  for (const item of meta) {
    // YYYY-MM portion of the YYYY-MM-DD date
    const month = item.date.split('-').slice(0, 2).join('-');

    if (!byMonth[month]) {
      byMonth[month] = [];
    }

    byMonth[month].push(item);
  }

  for (const [month, items] of Object.entries(byMonth)) {
    writeJson(`meta/${month}.json`, items);
  }

  // index is sorted with the most recent month first
  writeJson('meta/index.json', Object.keys(byMonth).sort((a, b) => b.localeCompare(a)));
}
|
|
|
|
/**
 * Rebuilds the all/ directory tree: one sub-directory per top-level
 * domain label, each containing an all.json with the matching deny urls.
 *
 * @param {string[]} deny
 */
function writeAllList (deny) {
  // start from a clean output tree
  rimrafSync('all');
  mkdirpSync('all');

  /** @type {Record<string, string[]>} */
  const byTld = {};

  for (const url of deny) {
    const labels = url.split('.');
    // the last label is the top-level domain
    const tld = labels[labels.length - 1];

    if (byTld[tld]) {
      byTld[tld].push(url);
    } else {
      byTld[tld] = [url];
    }
  }

  for (const [tld, urls] of Object.entries(byTld)) {
    mkdirpSync(`all/${tld}`);
    writeJson(`all/${tld}/all.json`, urls);
  }
}
|
|
|
|
// load the current address book, combined lists and meta history
const addr = readJson('address.json');
const all = readJson('all.json');
const meta = readMeta();
// merge address-book sites into the deny list; note that addSites uses
// all.deny as its accumulator, so all.deny itself grows here
const deny = sortSection(addSites(all, addr));
const allJson = {
  allow: sortSection(all.allow),
  // deny has sub-domains of listed parents removed and wildcards flattened
  deny: rewriteSubs(deny),
  // denySub keeps full paths, hence the allowSub flag
  denySub: sortSection(all.denySub, true)
};

// rewrite with all our entries (newline included)
writeJson('address.json', sortAddresses(addr));
writeJson('all.json', allJson);

// add the specific alphabetical list
writeAllList(allJson.deny);

// find out what we don't have
const urls = meta.map(({ url }) => url);
const now = new Date();
// zero-padded UTC YYYY-MM and YYYY-MM-DD stamps for new meta entries
const ym = `${now.getUTCFullYear()}-${`00${now.getUTCMonth() + 1}`.slice(-2)}`;
const ymd = `${ym}-${`00${now.getUTCDate()}`.slice(-2)}`;

// helper for parts (split labels of every url already in meta)
const urlParts = urls.map((u) => u.split('.'));

// rewrite with all our entries (newline included):
// 1. back-fill deny entries missing from meta, dated today, skipping any
//    that are parents of an already-recorded sub-domain
// 2. drop meta entries no longer on (or under) the deny list
// 3. order by date descending, then url ascending
writeMeta(
  meta
    .concat(
      deny
        .filter((url) => {
          if (urls.includes(url)) {
            // already recorded as-is
            return false;
          }

          const len = url.split('.').length;

          // skip when url is a strict suffix (parent domain) of a
          // longer url that meta already tracks
          return !urlParts.some((p) =>
            len < p.length &&
            url === p.slice(-len).join('.')
          );
        })
        .map((url) => ({ date: ymd, url }))
    )
    .filter(({ url }) =>
      deny.includes(url) ||
      isSubdomain(deny, url)
    )
    .sort((a, b) => b.date.localeCompare(a.date) || a.url.localeCompare(b.url))
);
|