Add new phishing feed / speed up domains sort

This commit is contained in:
SukkaW
2023-12-12 17:10:55 +08:00
parent e56f601fbc
commit e970006445
8 changed files with 72 additions and 89 deletions

View File

@@ -3,7 +3,7 @@ import path from 'path';
import * as tldts from 'tldts'; import * as tldts from 'tldts';
import { processLine } from './lib/process-line'; import { processLine } from './lib/process-line';
import { readFileByLine } from './lib/fetch-text-by-line'; import { readFileByLine } from './lib/fetch-text-by-line';
import { createDomainSorter } from './lib/stable-sort-domain'; import { sortDomains } from './lib/stable-sort-domain';
import { task } from './lib/trace-runner'; import { task } from './lib/trace-runner';
import { compareAndWriteFile } from './lib/create-file'; import { compareAndWriteFile } from './lib/create-file';
import { getGorhillPublicSuffixPromise } from './lib/get-gorhill-publicsuffix'; import { getGorhillPublicSuffixPromise } from './lib/get-gorhill-publicsuffix';
@@ -58,8 +58,8 @@ export const buildInternalCDNDomains = task(import.meta.path, async () => {
} }
}; };
const [domainSorter] = await Promise.all([ const [gorhill] = await Promise.all([
getGorhillPublicSuffixPromise().then(createDomainSorter), getGorhillPublicSuffixPromise(),
processLocalRuleSet(path.resolve(import.meta.dir, '../List/non_ip/cdn.conf')), processLocalRuleSet(path.resolve(import.meta.dir, '../List/non_ip/cdn.conf')),
processLocalRuleSet(path.resolve(import.meta.dir, '../List/non_ip/global.conf')), processLocalRuleSet(path.resolve(import.meta.dir, '../List/non_ip/global.conf')),
processLocalRuleSet(path.resolve(import.meta.dir, '../List/non_ip/global_plus.conf')), processLocalRuleSet(path.resolve(import.meta.dir, '../List/non_ip/global_plus.conf')),
@@ -74,7 +74,7 @@ export const buildInternalCDNDomains = task(import.meta.path, async () => {
return compareAndWriteFile( return compareAndWriteFile(
[ [
...Array.from(set).sort(domainSorter).map(i => `SUFFIX,${i}`), ...sortDomains(Array.from(set), gorhill).map(i => `SUFFIX,${i}`),
...Array.from(keywords).sort().map(i => `REGEX,${i}`) ...Array.from(keywords).sort().map(i => `REGEX,${i}`)
], ],
path.resolve(import.meta.dir, '../List/internal/cdn.txt') path.resolve(import.meta.dir, '../List/internal/cdn.txt')

View File

@@ -1,17 +1,16 @@
// @ts-check // @ts-check
import fsp from 'fs/promises';
import path from 'path'; import path from 'path';
import { processHosts, processFilterRules } from './lib/parse-filter'; import { processHosts, processFilterRules, processDomainLists } from './lib/parse-filter';
import { createTrie } from './lib/trie'; import { createTrie } from './lib/trie';
import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, PREDEFINED_ENFORCED_BACKLIST } from './lib/reject-data-source'; import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, PREDEFINED_ENFORCED_BACKLIST, DOMAIN_LISTS } from './lib/reject-data-source';
import { createRuleset, compareAndWriteFile } from './lib/create-file'; import { createRuleset, compareAndWriteFile } from './lib/create-file';
import { processLine } from './lib/process-line'; import { processLine } from './lib/process-line';
import { domainDeduper } from './lib/domain-deduper'; import { domainDeduper } from './lib/domain-deduper';
import createKeywordFilter from './lib/aho-corasick'; import createKeywordFilter from './lib/aho-corasick';
import { readFileByLine } from './lib/fetch-text-by-line'; import { readFileByLine } from './lib/fetch-text-by-line';
import { createDomainSorter } from './lib/stable-sort-domain'; import { sortDomains } from './lib/stable-sort-domain';
import { traceSync, task, traceAsync } from './lib/trace-runner'; import { traceSync, task, traceAsync } from './lib/trace-runner';
import { getGorhillPublicSuffixPromise } from './lib/get-gorhill-publicsuffix'; import { getGorhillPublicSuffixPromise } from './lib/get-gorhill-publicsuffix';
import * as tldts from 'tldts'; import * as tldts from 'tldts';
@@ -38,6 +37,7 @@ export const buildRejectDomainSet = task(import.meta.path, async () => {
domainSets.add(host); domainSets.add(host);
}); });
})), })),
...DOMAIN_LISTS.map(entry => processDomainLists(entry[0], entry[1])),
...ADGUARD_FILTERS.map(input => { ...ADGUARD_FILTERS.map(input => {
const promise = typeof input === 'string' const promise = typeof input === 'string'
? processFilterRules(input) ? processFilterRules(input)
@@ -144,14 +144,15 @@ export const buildRejectDomainSet = task(import.meta.path, async () => {
// Dedupe domainSets // Dedupe domainSets
const dudupedDominArray = traceSync('* Dedupe from covered subdomain', () => domainDeduper(Array.from(domainSets))); const dudupedDominArray = traceSync('* Dedupe from covered subdomain', () => domainDeduper(Array.from(domainSets)));
console.log(`Deduped ${previousSize - dudupedDominArray.length} rules!`); console.log(`Deduped ${previousSize - dudupedDominArray.length} rules from covered subdomain!`);
console.log(`Final size ${dudupedDominArray.length}`);
// Create reject stats // Create reject stats
const rejectDomainsStats: Array<[string, number]> = traceSync( const rejectDomainsStats: Array<[string, number]> = traceSync(
'* Collect reject domain stats', '* Collect reject domain stats',
() => Object.entries( () => Object.entries(
dudupedDominArray.reduce<Record<string, number>>((acc, cur) => { dudupedDominArray.reduce<Record<string, number>>((acc, cur) => {
const suffix = tldts.getDomain(cur, { allowPrivateDomains: false, detectIp: false }); const suffix = tldts.getDomain(cur, { allowPrivateDomains: false, detectIp: false, validateHostname: false });
if (suffix) { if (suffix) {
acc[suffix] = (acc[suffix] ?? 0) + 1; acc[suffix] = (acc[suffix] ?? 0) + 1;
} }
@@ -174,7 +175,10 @@ export const buildRejectDomainSet = task(import.meta.path, async () => {
'', '',
'Build from:', 'Build from:',
...HOSTS.map(host => ` - ${host[0]}`), ...HOSTS.map(host => ` - ${host[0]}`),
...ADGUARD_FILTERS.map(filter => ` - ${Array.isArray(filter) ? filter[0] : filter}`) ...DOMAIN_LISTS.map(domainList => ` - ${domainList[0]}`),
...ADGUARD_FILTERS.map(filter => ` - ${Array.isArray(filter) ? filter[0] : filter}`),
' - https://curbengh.github.io/phishing-filter/phishing-filter-hosts.txt',
' - https://phishing.army/download/phishing_army_blocklist.txt'
]; ];
return Promise.all([ return Promise.all([
@@ -182,7 +186,7 @@ export const buildRejectDomainSet = task(import.meta.path, async () => {
'Sukka\'s Ruleset - Reject Base', 'Sukka\'s Ruleset - Reject Base',
description, description,
new Date(), new Date(),
traceSync('* Sort reject domainset', () => dudupedDominArray.sort(createDomainSorter(gorhill))), traceSync('* Sort reject domainset', () => sortDomains(dudupedDominArray, gorhill)),
'domainset', 'domainset',
path.resolve(import.meta.dir, '../List/domainset/reject.conf'), path.resolve(import.meta.dir, '../List/domainset/reject.conf'),
path.resolve(import.meta.dir, '../Clash/domainset/reject.txt') path.resolve(import.meta.dir, '../Clash/domainset/reject.txt')

View File

@@ -1,13 +1,14 @@
import { domainDeduper } from './lib/domain-deduper'; import { domainDeduper } from './lib/domain-deduper';
import path from 'path'; import path from 'path';
import { createRuleset } from './lib/create-file'; import { createRuleset } from './lib/create-file';
import domainSorter from './lib/stable-sort-domain'; import { sortDomains } from './lib/stable-sort-domain';
import { Sema } from 'async-sema'; import { Sema } from 'async-sema';
import * as tldts from 'tldts'; import * as tldts from 'tldts';
import { task } from './lib/trace-runner'; import { task } from './lib/trace-runner';
import { fetchWithRetry } from './lib/fetch-retry'; import { fetchWithRetry } from './lib/fetch-retry';
import { SHARED_DESCRIPTION } from './lib/constants'; import { SHARED_DESCRIPTION } from './lib/constants';
import { getGorhillPublicSuffixPromise } from './lib/get-gorhill-publicsuffix';
const s = new Sema(3); const s = new Sema(3);
@@ -140,7 +141,9 @@ export const buildSpeedtestDomainSet = task(import.meta.path, async () => {
} }
} }
const deduped = domainDeduper(Array.from(domains)).sort(domainSorter); const gorhill = await getGorhillPublicSuffixPromise();
const deduped = sortDomains(domainDeduper(Array.from(domains)), gorhill);
const description = [ const description = [
...SHARED_DESCRIPTION, ...SHARED_DESCRIPTION,
'', '',

View File

@@ -1,7 +1,7 @@
import fsp from 'fs/promises'; import fsp from 'fs/promises';
import path from 'path'; import path from 'path';
import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix'; import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
import { processHosts } from './parse-filter'; import { processDomainLists, processHosts } from './parse-filter';
import { traceAsync, traceSync } from './trace-runner'; import { traceAsync, traceSync } from './trace-runner';
import * as tldts from 'tldts'; import * as tldts from 'tldts';
import { createTrie } from './trie'; import { createTrie } from './trie';
@@ -33,7 +33,12 @@ const BLACK_TLD = new Set([
'club', 'club',
'cn', 'cn',
'codes', 'codes',
'co.uk',
'co.in',
'com.br',
'com.cn', 'com.cn',
'com.pl',
'com.vn',
'cool', 'cool',
'cyou', 'cyou',
'fit', 'fit',
@@ -53,6 +58,7 @@ const BLACK_TLD = new Set([
'ltd', 'ltd',
'ml', 'ml',
'mobi', 'mobi',
'net.pl',
'one', 'one',
'online', 'online',
'pro', 'pro',
@@ -79,19 +85,12 @@ const BLACK_TLD = new Set([
]); ]);
export const getPhishingDomains = () => traceAsync('get phishing domains', async () => { export const getPhishingDomains = () => traceAsync('get phishing domains', async () => {
const [domainSet, gorhill] = await Promise.all([ const [domainSet, domainSet2, gorhill] = await Promise.all([
processHosts('https://curbengh.github.io/phishing-filter/phishing-filter-hosts.txt', true, true), processHosts('https://curbengh.github.io/phishing-filter/phishing-filter-hosts.txt', true, true),
// processDomainLists('https://phishing.army/download/phishing_army_blocklist.txt', true), processDomainLists('https://phishing.army/download/phishing_army_blocklist.txt', true),
// processFilterRules(
// 'https://curbengh.github.io/phishing-filter/phishing-filter-agh.txt',
// [
// 'https://phishing-filter.pages.dev/phishing-filter-agh.txt'
// // Prefer mirror, since malware-filter.gitlab.io has not been updated for a while
// // 'https://malware-filter.gitlab.io/malware-filter/phishing-filter-agh.txt'
// ]
// ),
getGorhillPublicSuffixPromise() getGorhillPublicSuffixPromise()
]); ]);
domainSet2.forEach((domain) => domainSet.add(domain));
traceSync.skip('* whitelisting phishing domains', () => { traceSync.skip('* whitelisting phishing domains', () => {
const trieForRemovingWhiteListed = createTrie(domainSet); const trieForRemovingWhiteListed = createTrie(domainSet);

View File

@@ -37,14 +37,13 @@ const normalizeDomain = (domain: string) => {
return h[0] === '.' ? h.slice(1) : h; return h[0] === '.' ? h.slice(1) : h;
}; };
export async function processDomainLists(domainListsUrl: string, includeAllSubDomain = false) { export function processDomainLists(domainListsUrl: string, includeAllSubDomain = false) {
return traceAsync(`- processDomainLists: ${domainListsUrl}`, async () => {
const domainSets = new Set<string>(); const domainSets = new Set<string>();
for await (const line of await fetchRemoteTextAndReadByLine(domainListsUrl)) { for await (const line of await fetchRemoteTextAndReadByLine(domainListsUrl)) {
const domainToAdd = processLine(line); const domainToAdd = processLine(line);
if (!domainToAdd) { if (!domainToAdd) continue;
continue;
}
if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) { if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) {
warnOnce(domainListsUrl, false, DEBUG_DOMAIN_TO_FIND); warnOnce(domainListsUrl, false, DEBUG_DOMAIN_TO_FIND);
@@ -55,9 +54,10 @@ export async function processDomainLists(domainListsUrl: string, includeAllSubDo
} }
return domainSets; return domainSets;
});
} }
export async function processHosts(hostsUrl: string, includeAllSubDomain = false, skipDomainCheck = false) { export function processHosts(hostsUrl: string, includeAllSubDomain = false, skipDomainCheck = false) {
return traceAsync(`- processHosts: ${hostsUrl}`, async () => { return traceAsync(`- processHosts: ${hostsUrl}`, async () => {
const domainSets = new Set<string>(); const domainSets = new Set<string>();

View File

@@ -26,6 +26,11 @@ export const HOSTS = [
['https://paulgb.github.io/BarbBlock/blacklists/hosts-file.txt', true, true] ['https://paulgb.github.io/BarbBlock/blacklists/hosts-file.txt', true, true]
] as const; ] as const;
export const DOMAIN_LISTS = [
// DigitalSide Threat-Intel - OSINT Hub
['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true]
] as const;
export const ADGUARD_FILTERS = [ export const ADGUARD_FILTERS = [
// EasyList // EasyList
[ [

View File

@@ -1,13 +0,0 @@
import domainSorter from './stable-sort-domain';
// eslint-disable-next-line import/no-unresolved -- fuck eslint-import
import { describe, it, expect } from 'bun:test';
describe('stable-sort-domain', () => {
it('.ks.cn, .tag.unclaimedproperty.ks.gov', () => {
expect(domainSorter('.ks.cn', '.tag.unclaimedproperty.ks.gov')).toBe(-1);
});
it('.fgnzdb.xyz, .hub.fghtem.com', () => {
expect(domainSorter('.fgnzdb.xyz', '.hub.fghtem.com')).toBe(1);
});
});

View File

@@ -10,18 +10,16 @@ const compare = (a: string | null, b: string | null) => {
return -1; return -1;
} }
if (a.length !== b.length) { const aLen = a.length;
const r = a.length - b.length; const r = aLen - b.length;
if (r > 0) { if (r > 0) {
return 1; return 1;
} }
if (r < 0) { if (r < 0) {
return -1; return -1;
} }
return 0;
}
for (let i = 0; i < a.length; i++) { for (let i = 0; i < aLen; i++) {
if (b[i] == null) { if (b[i] == null) {
return 1; return 1;
} }
@@ -35,34 +33,21 @@ const compare = (a: string | null, b: string | null) => {
return 0; return 0;
}; };
const createDomainSorter = (gorhill: PublicSuffixList | null = null) => { export const sortDomains = (inputs: string[], gorhill: PublicSuffixList) => {
if (gorhill) {
const getDomain = createCachedGorhillGetDomain(gorhill); const getDomain = createCachedGorhillGetDomain(gorhill);
const domains = inputs.reduce<Record<string, string>>((acc, cur) => {
acc[cur] ||= getDomain(cur);
return acc;
}, {});
return (a: string, b: string) => { const sorter = (a: string, b: string) => {
if (a === b) return 0; if (a === b) return 0;
const aDomain = getDomain(a); const aDomain = domains[a];
const bDomain = getDomain(b); const bDomain = domains[b];
const resultDomain = compare(aDomain, bDomain); return compare(aDomain, bDomain) || compare(a, b);
return resultDomain !== 0 ? resultDomain : compare(a, b);
}; };
}
// eslint-disable-next-line @typescript-eslint/no-var-requires -- fuck return inputs.sort(sorter);
const tldts = require('./cached-tld-parse');
return (a: string, b: string) => {
if (a === b) return 0;
const aDomain = tldts.parse(a).domain;
const bDomain = tldts.parse(b).domain;
const resultDomain = compare(aDomain, bDomain);
return resultDomain !== 0 ? resultDomain : compare(a, b);
};
}; };
export default createDomainSorter();
export { createDomainSorter };