From e97000644520f6ad32882eb16e3e2871032390f4 Mon Sep 17 00:00:00 2001 From: SukkaW Date: Tue, 12 Dec 2023 17:10:55 +0800 Subject: [PATCH] Add new phishing feed / speed up domains sort --- Build/build-internal-cdn-rules.ts | 8 ++-- Build/build-reject-domainset.ts | 20 ++++++---- Build/build-speedtest-domainset.ts | 7 +++- Build/lib/get-phishing-domains.ts | 21 +++++------ Build/lib/parse-filter.ts | 32 ++++++++-------- Build/lib/reject-data-source.ts | 5 +++ Build/lib/stable-sort-domain.test.ts | 13 ------- Build/lib/stable-sort-domain.ts | 55 ++++++++++------------------ 8 files changed, 72 insertions(+), 89 deletions(-) delete mode 100644 Build/lib/stable-sort-domain.test.ts diff --git a/Build/build-internal-cdn-rules.ts b/Build/build-internal-cdn-rules.ts index 8ca8cb48..a6c2a408 100644 --- a/Build/build-internal-cdn-rules.ts +++ b/Build/build-internal-cdn-rules.ts @@ -3,7 +3,7 @@ import path from 'path'; import * as tldts from 'tldts'; import { processLine } from './lib/process-line'; import { readFileByLine } from './lib/fetch-text-by-line'; -import { createDomainSorter } from './lib/stable-sort-domain'; +import { sortDomains } from './lib/stable-sort-domain'; import { task } from './lib/trace-runner'; import { compareAndWriteFile } from './lib/create-file'; import { getGorhillPublicSuffixPromise } from './lib/get-gorhill-publicsuffix'; @@ -58,8 +58,8 @@ export const buildInternalCDNDomains = task(import.meta.path, async () => { } }; - const [domainSorter] = await Promise.all([ - getGorhillPublicSuffixPromise().then(createDomainSorter), + const [gorhill] = await Promise.all([ + getGorhillPublicSuffixPromise(), processLocalRuleSet(path.resolve(import.meta.dir, '../List/non_ip/cdn.conf')), processLocalRuleSet(path.resolve(import.meta.dir, '../List/non_ip/global.conf')), processLocalRuleSet(path.resolve(import.meta.dir, '../List/non_ip/global_plus.conf')), @@ -74,7 +74,7 @@ export const buildInternalCDNDomains = task(import.meta.path, async () => { return compareAndWriteFile( [ - ...Array.from(set).sort(domainSorter).map(i => `SUFFIX,${i}`), + ...sortDomains(Array.from(set), gorhill).map(i => `SUFFIX,${i}`), ...Array.from(keywords).sort().map(i => `REGEX,${i}`) ], path.resolve(import.meta.dir, '../List/internal/cdn.txt') diff --git a/Build/build-reject-domainset.ts b/Build/build-reject-domainset.ts index 94a08dd3..351970e2 100644 --- a/Build/build-reject-domainset.ts +++ b/Build/build-reject-domainset.ts @@ -1,17 +1,16 @@ // @ts-check -import fsp from 'fs/promises'; import path from 'path'; -import { processHosts, processFilterRules } from './lib/parse-filter'; +import { processHosts, processFilterRules, processDomainLists } from './lib/parse-filter'; import { createTrie } from './lib/trie'; -import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, PREDEFINED_ENFORCED_BACKLIST } from './lib/reject-data-source'; +import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, PREDEFINED_ENFORCED_BACKLIST, DOMAIN_LISTS } from './lib/reject-data-source'; import { createRuleset, compareAndWriteFile } from './lib/create-file'; import { processLine } from './lib/process-line'; import { domainDeduper } from './lib/domain-deduper'; import createKeywordFilter from './lib/aho-corasick'; import { readFileByLine } from './lib/fetch-text-by-line'; -import { createDomainSorter } from './lib/stable-sort-domain'; +import { sortDomains } from './lib/stable-sort-domain'; import { traceSync, task, traceAsync } from './lib/trace-runner'; import { getGorhillPublicSuffixPromise } from './lib/get-gorhill-publicsuffix'; import * as tldts from 'tldts'; @@ -38,6 +37,7 @@ export const buildRejectDomainSet = task(import.meta.path, async () => { domainSets.add(host); }); })), + ...DOMAIN_LISTS.map(entry => processDomainLists(entry[0], entry[1])), ...ADGUARD_FILTERS.map(input => { const promise = typeof input === 'string' ? processFilterRules(input) @@ -144,14 +144,15 @@ export const buildRejectDomainSet = task(import.meta.path, async () => { // Dedupe domainSets const dudupedDominArray = traceSync('* Dedupe from covered subdomain', () => domainDeduper(Array.from(domainSets))); - console.log(`Deduped ${previousSize - dudupedDominArray.length} rules!`); + console.log(`Deduped ${previousSize - dudupedDominArray.length} rules from covered subdomain!`); + console.log(`Final size ${dudupedDominArray.length}`); // Create reject stats const rejectDomainsStats: Array<[string, number]> = traceSync( '* Collect reject domain stats', () => Object.entries( dudupedDominArray.reduce>((acc, cur) => { - const suffix = tldts.getDomain(cur, { allowPrivateDomains: false, detectIp: false }); + const suffix = tldts.getDomain(cur, { allowPrivateDomains: false, detectIp: false, validateHostname: false }); if (suffix) { acc[suffix] = (acc[suffix] ?? 0) + 1; } @@ -174,7 +175,10 @@ export const buildRejectDomainSet = task(import.meta.path, async () => { '', 'Build from:', ...HOSTS.map(host => ` - ${host[0]}`), - ...ADGUARD_FILTERS.map(filter => ` - ${Array.isArray(filter) ? filter[0] : filter}`) + ...DOMAIN_LISTS.map(domainList => ` - ${domainList[0]}`), + ...ADGUARD_FILTERS.map(filter => ` - ${Array.isArray(filter) ? filter[0] : filter}`), + ' - https://curbengh.github.io/phishing-filter/phishing-filter-hosts.txt', + ' - https://phishing.army/download/phishing_army_blocklist.txt' ]; return Promise.all([ @@ -182,7 +186,7 @@ export const buildRejectDomainSet = task(import.meta.path, async () => { 'Sukka\'s Ruleset - Reject Base', description, new Date(), - traceSync('* Sort reject domainset', () => dudupedDominArray.sort(createDomainSorter(gorhill))), + traceSync('* Sort reject domainset', () => sortDomains(dudupedDominArray, gorhill)), 'domainset', path.resolve(import.meta.dir, '../List/domainset/reject.conf'), path.resolve(import.meta.dir, '../Clash/domainset/reject.txt') diff --git a/Build/build-speedtest-domainset.ts b/Build/build-speedtest-domainset.ts index c1623068..ef7b5782 100644 --- a/Build/build-speedtest-domainset.ts +++ b/Build/build-speedtest-domainset.ts @@ -1,13 +1,14 @@ import { domainDeduper } from './lib/domain-deduper'; import path from 'path'; import { createRuleset } from './lib/create-file'; -import domainSorter from './lib/stable-sort-domain'; +import { sortDomains } from './lib/stable-sort-domain'; import { Sema } from 'async-sema'; import * as tldts from 'tldts'; import { task } from './lib/trace-runner'; import { fetchWithRetry } from './lib/fetch-retry'; import { SHARED_DESCRIPTION } from './lib/constants'; +import { getGorhillPublicSuffixPromise } from './lib/get-gorhill-publicsuffix'; const s = new Sema(3); @@ -140,7 +141,9 @@ export const buildSpeedtestDomainSet = task(import.meta.path, async () => { } } - const deduped = domainDeduper(Array.from(domains)).sort(domainSorter); + const gorhill = await getGorhillPublicSuffixPromise(); + const deduped = sortDomains(domainDeduper(Array.from(domains)), gorhill); + const description = [ ...SHARED_DESCRIPTION, '', diff --git a/Build/lib/get-phishing-domains.ts b/Build/lib/get-phishing-domains.ts index a3d775d5..e8c978c7 100644 --- a/Build/lib/get-phishing-domains.ts +++ b/Build/lib/get-phishing-domains.ts @@ -1,7 +1,7 @@ import fsp from 'fs/promises'; import path from 'path'; import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix'; -import { processHosts } from './parse-filter'; +import { processDomainLists, processHosts } from './parse-filter'; import { traceAsync, traceSync } from './trace-runner'; import * as tldts from 'tldts'; import { createTrie } from './trie'; @@ -33,7 +33,12 @@ const BLACK_TLD = new Set([ 'club', 'cn', 'codes', + 'co.uk', + 'co.in', + 'com.br', 'com.cn', + 'com.pl', + 'com.vn', 'cool', 'cyou', 'fit', @@ -53,6 +58,7 @@ const BLACK_TLD = new Set([ 'ltd', 'ml', 'mobi', + 'net.pl', 'one', 'online', 'pro', @@ -79,19 +85,12 @@ const BLACK_TLD = new Set([ ]); export const getPhishingDomains = () => traceAsync('get phishing domains', async () => { - const [domainSet, gorhill] = await Promise.all([ + const [domainSet, domainSet2, gorhill] = await Promise.all([ processHosts('https://curbengh.github.io/phishing-filter/phishing-filter-hosts.txt', true, true), - // processDomainLists('https://phishing.army/download/phishing_army_blocklist.txt', true), - // processFilterRules( - // 'https://curbengh.github.io/phishing-filter/phishing-filter-agh.txt', - // [ - // 'https://phishing-filter.pages.dev/phishing-filter-agh.txt' - // // Prefer mirror, since malware-filter.gitlab.io has not been updated for a while - // // 'https://malware-filter.gitlab.io/malware-filter/phishing-filter-agh.txt' - // ] - // ), + processDomainLists('https://phishing.army/download/phishing_army_blocklist.txt', true), getGorhillPublicSuffixPromise() ]); + domainSet2.forEach((domain) => domainSet.add(domain)); traceSync.skip('* whitelisting phishing domains', () => { const trieForRemovingWhiteListed = createTrie(domainSet); diff --git a/Build/lib/parse-filter.ts b/Build/lib/parse-filter.ts index 45958c11..5c9bb77a 100644 --- a/Build/lib/parse-filter.ts +++ b/Build/lib/parse-filter.ts @@ -37,27 +37,27 @@ const normalizeDomain = (domain: string) => { return h[0] === '.' ? h.slice(1) : h; }; -export async function processDomainLists(domainListsUrl: string, includeAllSubDomain = false) { - const domainSets = new Set(); +export function processDomainLists(domainListsUrl: string, includeAllSubDomain = false) { + return traceAsync(`- processDomainLists: ${domainListsUrl}`, async () => { + const domainSets = new Set(); - for await (const line of await fetchRemoteTextAndReadByLine(domainListsUrl)) { - const domainToAdd = processLine(line); - if (!domainToAdd) { - continue; + for await (const line of await fetchRemoteTextAndReadByLine(domainListsUrl)) { + const domainToAdd = processLine(line); + if (!domainToAdd) continue; + + if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) { + warnOnce(domainListsUrl, false, DEBUG_DOMAIN_TO_FIND); + foundDebugDomain = true; + } + + domainSets.add(includeAllSubDomain ? `.${domainToAdd}` : domainToAdd); } - if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) { - warnOnce(domainListsUrl, false, DEBUG_DOMAIN_TO_FIND); - foundDebugDomain = true; - } - - domainSets.add(includeAllSubDomain ? `.${domainToAdd}` : domainToAdd); - } - - return domainSets; + return domainSets; + }); } -export async function processHosts(hostsUrl: string, includeAllSubDomain = false, skipDomainCheck = false) { +export function processHosts(hostsUrl: string, includeAllSubDomain = false, skipDomainCheck = false) { return traceAsync(`- processHosts: ${hostsUrl}`, async () => { const domainSets = new Set(); diff --git a/Build/lib/reject-data-source.ts b/Build/lib/reject-data-source.ts index 5f5609e6..f774a250 100644 --- a/Build/lib/reject-data-source.ts +++ b/Build/lib/reject-data-source.ts @@ -26,6 +26,11 @@ export const HOSTS = [ ['https://paulgb.github.io/BarbBlock/blacklists/hosts-file.txt', true, true] ] as const; +export const DOMAIN_LISTS = [ + // DigitalSide Threat-Intel - OSINT Hub + ['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true] +] as const; + export const ADGUARD_FILTERS = [ // EasyList [ diff --git a/Build/lib/stable-sort-domain.test.ts b/Build/lib/stable-sort-domain.test.ts deleted file mode 100644 index ca18b30c..00000000 --- a/Build/lib/stable-sort-domain.test.ts +++ /dev/null @@ -1,13 +0,0 @@ -import domainSorter from './stable-sort-domain'; -// eslint-disable-next-line import/no-unresolved -- fuck eslint-import -import { describe, it, expect } from 'bun:test'; - -describe('stable-sort-domain', () => { - it('.ks.cn, .tag.unclaimedproperty.ks.gov', () => { - expect(domainSorter('.ks.cn', '.tag.unclaimedproperty.ks.gov')).toBe(-1); - }); - - it('.fgnzdb.xyz, .hub.fghtem.com', () => { - expect(domainSorter('.fgnzdb.xyz', '.hub.fghtem.com')).toBe(1); - }); -}); diff --git a/Build/lib/stable-sort-domain.ts b/Build/lib/stable-sort-domain.ts index 7aafac29..d8516922 100644 --- a/Build/lib/stable-sort-domain.ts +++ b/Build/lib/stable-sort-domain.ts @@ -10,18 +10,16 @@ const compare = (a: string | null, b: string | null) => { return -1; } - if (a.length !== b.length) { - const r = a.length - b.length; - if (r > 0) { - return 1; - } - if (r < 0) { - return -1; - } - return 0; + const aLen = a.length; + const r = aLen - b.length; + if (r > 0) { + return 1; + } + if (r < 0) { + return -1; } - for (let i = 0; i < a.length; i++) { + for (let i = 0; i < aLen; i++) { if (b[i] == null) { return 1; } @@ -35,34 +33,21 @@ const compare = (a: string | null, b: string | null) => { return 0; }; -const createDomainSorter = (gorhill: PublicSuffixList | null = null) => { - if (gorhill) { - const getDomain = createCachedGorhillGetDomain(gorhill); +export const sortDomains = (inputs: string[], gorhill: PublicSuffixList) => { + const getDomain = createCachedGorhillGetDomain(gorhill); + const domains = inputs.reduce>((acc, cur) => { + acc[cur] ||= getDomain(cur); + return acc; + }, {}); - return (a: string, b: string) => { - if (a === b) return 0; - - const aDomain = getDomain(a); - const bDomain = getDomain(b); - - const resultDomain = compare(aDomain, bDomain); - return resultDomain !== 0 ? resultDomain : compare(a, b); - }; - } - - // eslint-disable-next-line @typescript-eslint/no-var-requires -- fuck - const tldts = require('./cached-tld-parse'); - - return (a: string, b: string) => { + const sorter = (a: string, b: string) => { if (a === b) return 0; - const aDomain = tldts.parse(a).domain; - const bDomain = tldts.parse(b).domain; + const aDomain = domains[a]; + const bDomain = domains[b]; - const resultDomain = compare(aDomain, bDomain); - return resultDomain !== 0 ? resultDomain : compare(a, b); + return compare(aDomain, bDomain) || compare(a, b); }; -}; -export default createDomainSorter(); -export { createDomainSorter }; + return inputs.sort(sorter); +};