From eaf993deca4ffbd73379c57168f2158bd999477b Mon Sep 17 00:00:00 2001 From: SukkaW Date: Sun, 14 Jan 2024 22:58:53 +0800 Subject: [PATCH] Perf: minor optimization here and there --- Build/build-microsoft-cdn.ts | 4 +- Build/build-reject-domainset.ts | 69 +++++++++++++------------------ Build/lib/get-phishing-domains.ts | 19 +++++---- Build/lib/trie.ts | 2 +- 4 files changed, 44 insertions(+), 50 deletions(-) diff --git a/Build/build-microsoft-cdn.ts b/Build/build-microsoft-cdn.ts index 70a7e3f0..75005ae4 100644 --- a/Build/build-microsoft-cdn.ts +++ b/Build/build-microsoft-cdn.ts @@ -7,6 +7,8 @@ import { createTrie } from './lib/trie'; import { SHARED_DESCRIPTION } from './lib/constants'; import { createMemoizedPromise } from './lib/memo-promise'; +const PROBE_DOMAINS = ['.microsoft.com', '.windows.net', '.windows.com', '.windowsupdate.com', '.windowssearch.com', '.office.net']; + const WHITELIST = [ 'DOMAIN-SUFFIX,download.prss.microsoft.com', 'DOMAIN,res.cdn.office.net' @@ -29,7 +31,7 @@ export const getMicrosoftCdnRulesetPromise = createMemoizedPromise(async () => { trie.add(domain); } } - return new Set(['.microsoft.com', '.windows.net', '.windows.com', '.windowsupdate.com', '.windowssearch.com', '.office.net'].flatMap(domain => trie.find(domain, false))); + return new Set(PROBE_DOMAINS.flatMap(domain => trie.find(domain, false))); }); const trie2 = createTrie(set); diff --git a/Build/build-reject-domainset.ts b/Build/build-reject-domainset.ts index 802fcddc..b885a031 100644 --- a/Build/build-reject-domainset.ts +++ b/Build/build-reject-domainset.ts @@ -21,38 +21,36 @@ import * as SetHelpers from 'mnemonist/set'; import { setAddFromArray } from './lib/set-add-from-array'; export const buildRejectDomainSet = task(import.meta.path, async (span) => { + const gorhill = await getGorhillPublicSuffixPromise(); + /** Whitelists */ const filterRuleWhitelistDomainSets = new Set(PREDEFINED_WHITELIST); const domainSets = new Set(); + let shouldStop = false; // Parse from AdGuard Filters - const [gorhill, shouldStop] = await span + await span .traceChild('download and process hosts / adblock filter rules') .traceAsyncFn(async (childSpan) => { - let shouldStop = false; - - const [gorhill] = await Promise.all([ - getGorhillPublicSuffixPromise(), + await Promise.all([ // Parse from remote hosts & domain lists - ...HOSTS.map(entry => processHosts(childSpan, entry[0], entry[1], entry[2]).then(hosts => { - SetHelpers.add(domainSets, hosts); - })), - ...DOMAIN_LISTS.map(entry => processDomainLists(childSpan, entry[0], entry[1], entry[2])), - ...ADGUARD_FILTERS.map(input => { - const promise = typeof input === 'string' - ? processFilterRules(childSpan, input) - : processFilterRules(childSpan, input[0], input[1], input[2]); + ...HOSTS.map(entry => processHosts(childSpan, entry[0], entry[1], entry[2]).then(hosts => SetHelpers.add(domainSets, hosts))), - return promise.then(({ white, black, foundDebugDomain }) => { - if (foundDebugDomain) { - shouldStop = true; + ...DOMAIN_LISTS.map(entry => processDomainLists(childSpan, entry[0], entry[1], entry[2]).then(hosts => SetHelpers.add(domainSets, hosts))), + + ...ADGUARD_FILTERS.map(input => ( + typeof input === 'string' + ? processFilterRules(childSpan, input) + : processFilterRules(childSpan, input[0], input[1], input[2]) + ).then(({ white, black, foundDebugDomain }) => { + if (foundDebugDomain) { + shouldStop = true; // we should not break here, as we want to see full matches from all data source - } - setAddFromArray(filterRuleWhitelistDomainSets, white); - setAddFromArray(domainSets, black); - }); - }), + } + setAddFromArray(filterRuleWhitelistDomainSets, white); + setAddFromArray(domainSets, black); + })), ...([ 'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exceptions.txt', 'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exclusions.txt' @@ -67,9 +65,8 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => { childSpan.traceChild('process reject_sukka.conf').traceAsyncFn(async () => { for await (const l of readFileByLine(path.resolve(import.meta.dir, '../Source/domainset/reject_sukka.conf'))) { const line = processLine(l); - if (line) { - domainSets.add(line); - } + if (!line) continue; + domainSets.add(line); } }) ]); @@ -85,7 +82,7 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => { } } - return [gorhill, shouldStop] as const; + return shouldStop; }); if (shouldStop) { @@ -124,15 +121,14 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => { // remove pre-defined enforced blacklist from whitelist const kwfilter = createKeywordFilter(domainKeywordsSet); - // Build whitelist trie, to handle case like removing `g.msn.com` due to white `.g.msn.com` (`@@||g.msn.com`) - const trieWhite = createTrie(filterRuleWhitelistDomainSets); + // handle case like removing `g.msn.com` due to white `.g.msn.com` (`@@||g.msn.com`) for (const domain of domainSets) { if (domain[0] === '.') { - if (trieWhite.contains(domain)) { + if (filterRuleWhitelistDomainSets.has(domain)) { domainSets.delete(domain); continue; } - } else if (trieWhite.has(`.${domain}`)) { + } else if (filterRuleWhitelistDomainSets.has(`.${domain}`)) { domainSets.delete(domain); continue; } @@ -154,8 +150,9 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => { console.log(`Final size ${dudupedDominArray.length}`); // Create reject stats - const rejectDomainsStats: Array<[string, number]> = span.traceChild('create reject stats').traceSyncFn( - () => Object.entries( + const rejectDomainsStats: Array<[string, number]> = span + .traceChild('create reject stats') + .traceSyncFn(() => Object.entries( dudupedDominArray.reduce>((acc, cur) => { const suffix = tldts.getDomain(cur, { allowPrivateDomains: false, detectIp: false, validateHostname: false }); if (suffix) { @@ -163,15 +160,7 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => { } return acc; }, {}) - ).filter(a => a[1] > 10).sort((a, b) => { - const t = b[1] - a[1]; - if (t !== 0) { - return t; - } - - return a[0].localeCompare(b[0]); - }) - ); + ).filter(a => a[1] > 5).sort((a, b) => (b[1] - a[1]) || a[0].localeCompare(b[0]))); const description = [ ...SHARED_DESCRIPTION, diff --git a/Build/lib/get-phishing-domains.ts b/Build/lib/get-phishing-domains.ts index 5d0f7b16..40837aff 100644 --- a/Build/lib/get-phishing-domains.ts +++ b/Build/lib/get-phishing-domains.ts @@ -11,7 +11,7 @@ import { isCI } from 'ci-info'; import { add as SetAdd } from 'mnemonist/set'; import type { Span } from '../trace'; -const WHITELIST_DOMAIN = new Set([ +const WHITELIST_DOMAIN = [ 'w3s.link', 'dweb.link', 'nftstorage.link', @@ -20,7 +20,7 @@ const WHITELIST_DOMAIN = new Set([ 'page.link', // Firebase URL Shortener 'fleek.cool', 'notion.site' -]); +]; const BLACK_TLD = new Set([ 'autos', 'bar', @@ -101,12 +101,15 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g span.traceChild('whitelisting phishing domains').traceSyncFn(() => { const trieForRemovingWhiteListed = createTrie(domainSet); - for (const white of WHITELIST_DOMAIN) { - const found = trieForRemovingWhiteListed.find(`.${white}`, false); - for (let i = 0, len = found.length; i < len; i++) { - domainSet.delete(found[i]); - } - domainSet.delete(white); + + const needToBeWhite = WHITELIST_DOMAIN.flatMap(white => { + const found = trieForRemovingWhiteListed.find(`.${white}`, true); + found.push(white); + return found; + }); + + for (let i = 0, len = needToBeWhite.length; i < len; i++) { + domainSet.delete(needToBeWhite[i]); } }); diff --git a/Build/lib/trie.ts b/Build/lib/trie.ts index bbdc0892..099792d2 100644 --- a/Build/lib/trie.ts +++ b/Build/lib/trie.ts @@ -58,7 +58,7 @@ export const createTrie = (from?: string[] | Set) => { /** * Method used to retrieve every item in the trie with the given prefix. */ - const find = (suffix: string, includeEqualWithSuffix = true): string[] => { + const find = (suffix: string, /** @default true */ includeEqualWithSuffix = true): string[] => { let node: TrieNode = root; let token: string;