From efa34399b070d66232c8f401a4617df8fa929776 Mon Sep 17 00:00:00 2001 From: SukkaW Date: Mon, 27 May 2024 02:42:56 +0800 Subject: [PATCH] Chore: minor changes --- Build/build-reject-domainset.ts | 50 ++++++++++++++++-------------- Build/constants/loose-tldts-opt.ts | 9 ++++++ Build/lib/aho-corasick.ts | 4 +-- Build/lib/get-phishing-domains.ts | 13 ++------ Build/lib/normalize-domain.ts | 1 + Build/lib/stable-sort-domain.ts | 41 +++++++++++++++++------- Build/lib/trie.ts | 44 +++++++++++++------------- 7 files changed, 94 insertions(+), 68 deletions(-) create mode 100644 Build/constants/loose-tldts-opt.ts diff --git a/Build/build-reject-domainset.ts b/Build/build-reject-domainset.ts index d62dc02a..2cf88ab7 100644 --- a/Build/build-reject-domainset.ts +++ b/Build/build-reject-domainset.ts @@ -9,7 +9,7 @@ import { createRuleset, compareAndWriteFile } from './lib/create-file'; import { domainDeduper } from './lib/domain-deduper'; import createKeywordFilter from './lib/aho-corasick'; import { readFileByLine, readFileIntoProcessedArray } from './lib/fetch-text-by-line'; -import { sortDomains } from './lib/stable-sort-domain'; +import { buildParseDomainMap, sortDomains } from './lib/stable-sort-domain'; import { task } from './trace'; // tldts-experimental is way faster than tldts, but very little bit inaccurate // (since it is hashes based). But the result is still deterministic, which is @@ -21,6 +21,10 @@ import { getPhishingDomains } from './lib/get-phishing-domains'; import { subtract as SetSubstract } from 'mnemonist/set'; import { setAddFromArray, setAddFromArrayCurried } from './lib/set-add-from-array'; import { sort } from './lib/timsort'; +import { looseTldtsOpt } from './constants/loose-tldts-opt'; +import { build } from 'bun'; + +const getRejectSukkaConfPromise = readFileIntoProcessedArray(path.resolve(import.meta.dir, '../Source/domainset/reject_sukka.conf')); export const buildRejectDomainSet = task(import.meta.path, async (span) => { /** Whitelists */ @@ -37,11 +41,9 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => { let shouldStop = false; await Promise.all([ // Parse from remote hosts & domain lists - ...HOSTS.map(entry => processHosts(childSpan, ...entry).then(appendArrayToDomainSets)), - - ...DOMAIN_LISTS.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToDomainSets)), - - ...ADGUARD_FILTERS.map( + HOSTS.map(entry => processHosts(childSpan, ...entry).then(appendArrayToDomainSets)), + DOMAIN_LISTS.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToDomainSets)), + ADGUARD_FILTERS.map( input => processFilterRules(childSpan, ...input) .then(({ white, black, foundDebugDomain }) => { if (foundDebugDomain) { @@ -53,7 +55,7 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => { setAddFromArray(domainSets, black); }) ), - ...([ + ([ 'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exceptions.txt', 'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exclusions.txt' ].map( @@ -64,9 +66,8 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => { }) )), getPhishingDomains(childSpan).then(appendArrayToDomainSets), - childSpan.traceChildAsync('process reject_sukka.conf', () => readFileIntoProcessedArray(path.resolve(import.meta.dir, '../Source/domainset/reject_sukka.conf')) - .then(appendArrayToDomainSets)) - ]); + getRejectSukkaConfPromise.then(appendArrayToDomainSets) + ].flat()); // eslint-disable-next-line sukka/no-single-return -- not single return return shouldStop; }); @@ -107,30 +108,31 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => { }); }); - const trie = span.traceChildSync('dedupe from white suffixes', () => { - const trie = createTrie(domainSets, true, true); - filterRuleWhitelistDomainSets.forEach(trie.whitelist); - return trie; - }); + const trie = span.traceChildSync('create smol trie', () => createTrie(domainSets, true, true)); + + span.traceChildSync('dedupe from white suffixes', () => filterRuleWhitelistDomainSets.forEach(trie.whitelist)); // Dedupe domainSets const dudupedDominArray = span.traceChildSync('dedupe from covered subdomain', () => domainDeduper(trie)); console.log(`Final size ${dudupedDominArray.length}`); + const { + domainMap: domainArrayMainDomainMap, + subdomainMap: domainArraySubdomainMap + } = span.traceChildSync( + 'build map for stat and sort', + () => buildParseDomainMap(dudupedDominArray) + ); + // Create reject stats const rejectDomainsStats: Array<[string, number]> = span .traceChild('create reject stats') .traceSyncFn(() => { - const tldtsOpt = { allowPrivateDomains: false, detectIp: false, validateHostname: false }; const statMap = dudupedDominArray.reduce>((acc, cur) => { - const suffix = tldts.getDomain(cur, tldtsOpt); - if (!suffix) return acc; - - if (acc.has(suffix)) { - acc.set(suffix, acc.get(suffix)! + 1); - } else { - acc.set(suffix, 1); + const suffix = domainArrayMainDomainMap.get(cur); + if (suffix) { + acc.set(suffix, (acc.get(suffix) ?? 0) + 1); } return acc; }, new Map()); @@ -157,7 +159,7 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => { 'Sukka\'s Ruleset - Reject Base', description, new Date(), - span.traceChildSync('sort reject domainset', () => sortDomains(dudupedDominArray)), + span.traceChildSync('sort reject domainset', () => sortDomains(dudupedDominArray, domainArrayMainDomainMap, domainArraySubdomainMap)), 'domainset', path.resolve(import.meta.dir, '../List/domainset/reject.conf'), path.resolve(import.meta.dir, '../Clash/domainset/reject.txt') diff --git a/Build/constants/loose-tldts-opt.ts b/Build/constants/loose-tldts-opt.ts new file mode 100644 index 00000000..f794128c --- /dev/null +++ b/Build/constants/loose-tldts-opt.ts @@ -0,0 +1,9 @@ +import type * as tldts from 'tldts'; + +export const looseTldtsOpt: Parameters[1] = { + allowPrivateDomains: false, + extractHostname: false, + validateHostname: false, + detectIp: false, + mixedInputs: false +}; diff --git a/Build/lib/aho-corasick.ts b/Build/lib/aho-corasick.ts index cebbc5ad..80b25751 100644 --- a/Build/lib/aho-corasick.ts +++ b/Build/lib/aho-corasick.ts @@ -1,8 +1,8 @@ interface Node { /** @default false */ - wordEnd?: boolean, + wordEnd: boolean, children: Map, - fail?: Node + fail: Node | undefined } const createNode = (): Node => ({ diff --git a/Build/lib/get-phishing-domains.ts b/Build/lib/get-phishing-domains.ts index 13fd6e3e..391e5b29 100644 --- a/Build/lib/get-phishing-domains.ts +++ b/Build/lib/get-phishing-domains.ts @@ -5,6 +5,7 @@ import { getSubdomain, getPublicSuffix } from 'tldts-experimental'; import type { Span } from '../trace'; import { appendArrayInPlaceCurried } from './append-array-in-place'; import { PHISHING_DOMAIN_LISTS } from './reject-data-source'; +import { looseTldtsOpt } from '../constants/loose-tldts-opt'; const BLACK_TLD = new Set([ 'accountant', @@ -99,14 +100,6 @@ export const WHITELIST_MAIN_DOMAINS = new Set([ 'notion.site' ]); -const tldtsOpt: Parameters[1] = { - allowPrivateDomains: false, - extractHostname: false, - validateHostname: false, - detectIp: false, - mixedInputs: false -}; - export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('get phishing domains').traceAsyncFn(async (span) => { const gorhill = await getGorhillPublicSuffixPromise(); @@ -132,7 +125,7 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g continue; } - const tld = getPublicSuffix(safeGorhillLine, tldtsOpt); + const tld = getPublicSuffix(safeGorhillLine, looseTldtsOpt); if (!tld || !BLACK_TLD.has(tld)) continue; domainCountMap[apexDomain] ||= 0; @@ -187,7 +180,7 @@ export function calcDomainAbuseScore(line: string) { } } - const subdomain = getSubdomain(line, tldtsOpt); + const subdomain = getSubdomain(line, looseTldtsOpt); if (subdomain) { if (subdomain.slice(1).includes('.')) { diff --git a/Build/lib/normalize-domain.ts b/Build/lib/normalize-domain.ts index bb27083c..ac30829d 100644 --- a/Build/lib/normalize-domain.ts +++ b/Build/lib/normalize-domain.ts @@ -7,6 +7,7 @@ export const normalizeDomain = (domain: string) => { const parsed = tldtsParse(domain, { allowPrivateDomains: true, detectIp: false }); // if (parsed.isIp) return null; if (!parsed.hostname) return null; + // Private invalid domain (things like .tor, .dn42, etc) if (!parsed.isIcann && !parsed.isPrivate) return null; let h = parsed.hostname; diff --git a/Build/lib/stable-sort-domain.ts b/Build/lib/stable-sort-domain.ts index 1f844ec1..ebd1b0c2 100644 --- a/Build/lib/stable-sort-domain.ts +++ b/Build/lib/stable-sort-domain.ts @@ -3,32 +3,51 @@ // enough when sorting. import { getDomain, getSubdomain } from 'tldts-experimental'; import { sort } from './timsort'; +import { looseTldtsOpt } from '../constants/loose-tldts-opt'; export const compare = (a: string, b: string) => { if (a === b) return 0; return (a.length - b.length) || a.localeCompare(b); }; -const tldtsOpt: Parameters[1] = { - allowPrivateDomains: false, - extractHostname: false, - validateHostname: false, - detectIp: false, - mixedInputs: false -}; - -export const sortDomains = (inputs: string[]) => { +export const buildParseDomainMap = (inputs: string[]) => { const domainMap = new Map(); const subdomainMap = new Map(); for (let i = 0, len = inputs.length; i < len; i++) { const cur = inputs[i]; if (!domainMap.has(cur)) { - const topD = getDomain(cur, tldtsOpt); + const topD = getDomain(cur, looseTldtsOpt); domainMap.set(cur, topD ?? cur); } if (!subdomainMap.has(cur)) { - const subD = getSubdomain(cur, tldtsOpt); + const subD = getSubdomain(cur, looseTldtsOpt); + subdomainMap.set(cur, subD ?? cur); + } + } + + return { domainMap, subdomainMap }; +}; + +export const sortDomains = ( + inputs: string[], + domainMap?: Map, + subdomainMap?: Map +) => { + if (!domainMap || !subdomainMap) { + const { domainMap: dm, subdomainMap: sm } = buildParseDomainMap(inputs); + domainMap = dm; + subdomainMap = sm; + } + + for (let i = 0, len = inputs.length; i < len; i++) { + const cur = inputs[i]; + if (!domainMap.has(cur)) { + const topD = getDomain(cur, looseTldtsOpt); + domainMap.set(cur, topD ?? cur); + } + if (!subdomainMap.has(cur)) { + const subD = getSubdomain(cur, looseTldtsOpt); subdomainMap.set(cur, subD ?? cur); } } diff --git a/Build/lib/trie.ts b/Build/lib/trie.ts index ae9a8cdb..a3740e00 100644 --- a/Build/lib/trie.ts +++ b/Build/lib/trie.ts @@ -36,32 +36,34 @@ const createNode = (parent: TrieNode | null = null): TrieNode => { return node; }; +const hostnameToTokens = (hostname: string): string[] => { + let buf = ''; + const tokens: string[] = []; + for (let i = 0, l = hostname.length; i < l; i++) { + const c = hostname[i]; + if (c === '.') { + if (buf) { + tokens.push(buf, /* . */ c); + buf = ''; + } else { + tokens.push(/* . */ c); + } + } else { + buf += c; + } + } + if (buf) { + tokens.push(buf); + } + return tokens; +}; + export const createTrie = (from?: string[] | Set | null, hostnameMode = false, smolTree = false) => { let size = 0; const root: TrieNode = createNode(); const suffixToTokens = hostnameMode - ? (suffix: string) => { - let buf = ''; - const tokens: string[] = []; - for (let i = 0, l = suffix.length; i < l; i++) { - const c = suffix[i]; - if (c === '.') { - if (buf) { - tokens.push(buf, /* . */ c); - buf = ''; - } else { - tokens.push(/* . */ c); - } - } else { - buf += c; - } - } - if (buf) { - tokens.push(buf); - } - return tokens; - } + ? hostnameToTokens : (suffix: string) => suffix; /**