From e8f35194794339f6115d00825b95121d9a7046fe Mon Sep 17 00:00:00 2001
From: SukkaW
Date: Thu, 11 Jan 2024 11:56:15 +0800
Subject: [PATCH] Perf: minor optimization here and there

---
 Build/build-microsoft-cdn.ts      |  5 ++++-
 Build/build-reject-domainset.ts   | 37 ++++++++++++++++++++-----------------
 Build/lib/aho-corasick.ts         |  4 +---
 Build/lib/cache-filesystem.ts     | 19 +++++++++++--------
 Build/lib/domain-deduper.ts       | 11 ++++++-----
 Build/lib/get-phishing-domains.ts | 17 +++++++++++------
 Build/lib/normalize-domain.ts     | 12 ++++++++++--
 Build/lib/parse-filter.ts         |  6 +++---
 Build/lib/reject-data-source.ts   | 12 ++++++------
 Build/lib/set-add-from-array.ts   |  8 ++++++++
 Build/validate-gfwlist.ts         |  9 ---------
 11 files changed, 80 insertions(+), 60 deletions(-)
 create mode 100644 Build/lib/set-add-from-array.ts

diff --git a/Build/build-microsoft-cdn.ts b/Build/build-microsoft-cdn.ts
index b1b21777..6eda4c54 100644
--- a/Build/build-microsoft-cdn.ts
+++ b/Build/build-microsoft-cdn.ts
@@ -32,7 +32,10 @@ export const getMicrosoftCdnRulesetPromise = createMemoizedPromise(async () => {
   });
 
   const trie2 = createTrie(set);
-  BLACKLIST.flatMap(domain => trie2.find(domain, true)).forEach(d => set.delete(d));
+  const black = BLACKLIST.flatMap(domain => trie2.find(domain, true));
+  for (let i = 0, len = black.length; i < len; i++) {
+    set.delete(black[i]);
+  }
 
   return Array.from(set).map(d => `DOMAIN-SUFFIX,${d}`).concat(WHITELIST);
 });
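The build-microsoft-cdn.ts hunk above swaps a chained `BLACKLIST.flatMap(...).forEach(...)` for a hoisted array plus an indexed `for` loop, so no per-element callback is allocated or invoked while draining matches from the set. A self-contained sketch of that bulk-delete pattern; the `setDeleteFromArray` name and the sample domains are illustrative, not part of this patch:

    // Bulk-delete an array of keys from a Set without a per-element callback.
    function setDeleteFromArray<T>(set: Set<T>, keys: readonly T[]): void {
      for (let i = 0, len = keys.length; i < len; i++) {
        set.delete(keys[i]);
      }
    }

    const set = new Set(['a.cdn.example', 'b.cdn.example', 'c.cdn.example']);
    setDeleteFromArray(set, ['b.cdn.example', 'c.cdn.example']);
    console.log(set); // Set(1) { 'a.cdn.example' }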
diff --git a/Build/build-reject-domainset.ts b/Build/build-reject-domainset.ts
index d109c1ab..8853e89c 100644
--- a/Build/build-reject-domainset.ts
+++ b/Build/build-reject-domainset.ts
@@ -17,6 +17,9 @@ import * as tldts from 'tldts';
 import { SHARED_DESCRIPTION } from './lib/constants';
 import { getPhishingDomains } from './lib/get-phishing-domains';
 
+import * as SetHelpers from 'mnemonist/set';
+import { setAddFromArray } from './lib/set-add-from-array';
+
 export const buildRejectDomainSet = task(import.meta.path, async () => {
   /** Whitelists */
   const filterRuleWhitelistDomainSets = new Set(PREDEFINED_WHITELIST);
@@ -33,9 +36,7 @@ export const buildRejectDomainSet = task(import.meta.path, async () => {
     getGorhillPublicSuffixPromise(),
     // Parse from remote hosts & domain lists
     ...HOSTS.map(entry => processHosts(entry[0], entry[1], entry[2]).then(hosts => {
-      hosts.forEach(host => {
-        domainSets.add(host);
-      });
+      SetHelpers.add(domainSets, hosts);
     })),
     ...DOMAIN_LISTS.map(entry => processDomainLists(entry[0], entry[1], entry[2])),
     ...ADGUARD_FILTERS.map(input => {
@@ -48,24 +49,20 @@ export const buildRejectDomainSet = task(import.meta.path, async () => {
           shouldStop = true;
           // we should not break here, as we want to see full matches from all data source
         }
-        white.forEach(i => filterRuleWhitelistDomainSets.add(i));
-        black.forEach(i => domainSets.add(i));
+        setAddFromArray(filterRuleWhitelistDomainSets, white);
+        setAddFromArray(domainSets, black);
       });
     }),
     ...([
       'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exceptions.txt',
       'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exclusions.txt'
     ].map(input => processFilterRules(input).then(({ white, black }) => {
-      white.forEach(i => filterRuleWhitelistDomainSets.add(i));
-      black.forEach(i => filterRuleWhitelistDomainSets.add(i));
+      setAddFromArray(filterRuleWhitelistDomainSets, white);
+      setAddFromArray(filterRuleWhitelistDomainSets, black);
     }))),
-    getPhishingDomains().then(([purePhishingDomains, fullDomainSet]) => {
-      fullDomainSet.forEach(host => {
-        if (host) {
-          domainSets.add(host);
-        }
-      });
-      purePhishingDomains.forEach(suffix => domainSets.add(`.${suffix}`));
+    getPhishingDomains().then(([purePhishingDomains, fullPhishingDomainSet]) => {
+      SetHelpers.add(domainSets, fullPhishingDomainSet);
+      setAddFromArray(domainSets, purePhishingDomains);
     }),
     (async () => {
       for await (const l of readFileByLine(path.resolve(import.meta.dir, '../Source/domainset/reject_sukka.conf'))) {
@@ -79,9 +76,14 @@ export const buildRejectDomainSet = task(import.meta.path, async () => {
 
   // remove pre-defined enforced blacklist from whitelist
   const trie0 = createTrie(filterRuleWhitelistDomainSets);
-  PREDEFINED_ENFORCED_BACKLIST.forEach(enforcedBlack => {
-    trie0.find(enforcedBlack).forEach(found => filterRuleWhitelistDomainSets.delete(found));
-  });
+
+  for (let i = 0, len1 = PREDEFINED_ENFORCED_BACKLIST.length; i < len1; i++) {
+    const enforcedBlack = PREDEFINED_ENFORCED_BACKLIST[i];
+    const found = trie0.find(enforcedBlack);
+    for (let j = 0, len2 = found.length; j < len2; j++) {
+      filterRuleWhitelistDomainSets.delete(found[j]);
+    }
+  }
 
   return [gorhill, shouldStop] as const;
 });
@@ -109,6 +111,7 @@ export const buildRejectDomainSet = task(import.meta.path, async () => {
   // Dedupe domainSets
   traceSync('* Dedupe from black keywords/suffixes', () => {
     const trie1 = createTrie(domainSets);
+
     domainSuffixSet.forEach(suffix => {
       trie1.find(suffix, true).forEach(f => domainSets.delete(f));
     });
diff --git a/Build/lib/aho-corasick.ts b/Build/lib/aho-corasick.ts
index da07ff42..e713d97b 100644
--- a/Build/lib/aho-corasick.ts
+++ b/Build/lib/aho-corasick.ts
@@ -73,9 +73,7 @@ const createKeywordFilter = (keys: string[] | Set<string>) => {
     }
   };
 
-  keys.forEach(k => {
-    put(k, k.length);
-  });
+  keys.forEach(k => put(k, k.length));
 
   build();
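The build-reject-domainset.ts rewrite above routes every bulk insertion through two in-place helpers: mnemonist's `add(A, B)` when the incoming collection is already a `Set`, and the new `setAddFromArray` when it is a plain array. A minimal usage sketch with the same imports the patch uses (the sample domains are made up):

    import { add } from 'mnemonist/set';
    import { setAddFromArray } from './lib/set-add-from-array';

    const domainSets = new Set(['example.com']);

    // Set-into-Set merge, in place (mnemonist).
    add(domainSets, new Set(['tracker.example', 'ads.example']));

    // Array-into-Set merge, in place (the helper introduced by this patch).
    setAddFromArray(domainSets, ['cdn.example', 'ads.example']);

    console.log(domainSets.size); // 4 -- the Set collapses the duplicate

Both variants avoid allocating the closures that the replaced `forEach` callbacks required.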
diff --git a/Build/lib/cache-filesystem.ts b/Build/lib/cache-filesystem.ts
index 1065b127..bed3cbd8 100644
--- a/Build/lib/cache-filesystem.ts
+++ b/Build/lib/cache-filesystem.ts
@@ -34,10 +34,13 @@ interface CacheApplyStringOption {
 type CacheApplyOption<T> = T extends string ? CacheApplyStringOption : CacheApplyNonStringOption;
 
 const randomInt = (min: number, max: number) => Math.floor(Math.random() * (max - min + 1)) + min;
+
+const ONE_HOUR = 60 * 60 * 1000;
+const ONE_DAY = 24 * ONE_HOUR;
 // Add some randomness to the cache ttl to avoid thundering herd
 export const TTL = {
   humanReadable(ttl: number) {
-    if (ttl >= 24 * 60 * 60 * 1000) {
+    if (ttl >= ONE_DAY) {
       return `${Math.round(ttl / 24 / 60 / 60 / 1000)}d`;
     }
     if (ttl >= 60 * 60 * 1000) {
@@ -45,13 +48,13 @@ export const TTL = {
     }
     return `${Math.round(ttl / 1000)}s`;
   },
-  THREE_HOURS: () => randomInt(1, 3) * 60 * 60 * 1000,
-  TWLVE_HOURS: () => randomInt(8, 12) * 60 * 60 * 1000,
-  ONE_DAY: () => randomInt(23, 25) * 60 * 60 * 1000,
-  THREE_DAYS: () => randomInt(1, 3) * 24 * 60 * 60 * 1000,
-  ONE_WEEK: () => randomInt(4, 7) * 24 * 60 * 60 * 1000,
-  TWO_WEEKS: () => randomInt(10, 14) * 24 * 60 * 60 * 1000,
-  TEN_DAYS: () => randomInt(7, 10) * 24 * 60 * 60 * 1000
+  THREE_HOURS: () => randomInt(1, 3) * ONE_HOUR,
+  TWLVE_HOURS: () => randomInt(8, 12) * ONE_HOUR,
+  ONE_DAY: () => randomInt(23, 25) * ONE_HOUR,
+  THREE_DAYS: () => randomInt(1, 3) * ONE_DAY,
+  ONE_WEEK: () => randomInt(4, 7) * ONE_DAY,
+  TEN_DAYS: () => randomInt(7, 10) * ONE_DAY,
+  TWO_WEEKS: () => randomInt(10, 14) * ONE_DAY
 };
 
 export class Cache {
diff --git a/Build/lib/domain-deduper.ts b/Build/lib/domain-deduper.ts
index df1bc4cc..541caefb 100644
--- a/Build/lib/domain-deduper.ts
+++ b/Build/lib/domain-deduper.ts
@@ -6,13 +6,16 @@ export function domainDeduper(inputDomains: string[], toArray = true): string[]
   const trie = createTrie(inputDomains);
   const sets = new Set(inputDomains);
 
-  for (let j = 0, len = inputDomains.length; j < len; j++) {
-    const d = inputDomains[j];
+  for (let i = 0, len = inputDomains.length; i < len; i++) {
+    const d = inputDomains[i];
     if (d[0] !== '.') {
       continue;
     }
 
-    trie.find(d, false).forEach(f => sets.delete(f));
+    const found = trie.find(d, true);
+    for (let j = 0, len2 = found.length; j < len2; j++) {
+      sets.delete(found[j]);
+    }
 
     const a: string = d.slice(1);
 
@@ -27,5 +30,3 @@ export function domainDeduper(inputDomains: string[], toArray = true): string[]
 
   return sets;
 }
-
-export default domainDeduper;
diff --git a/Build/lib/get-phishing-domains.ts b/Build/lib/get-phishing-domains.ts
index 3ca9a54d..742ece1f 100644
--- a/Build/lib/get-phishing-domains.ts
+++ b/Build/lib/get-phishing-domains.ts
@@ -8,6 +8,8 @@ import { processLine } from './process-line';
 import { TTL } from './cache-filesystem';
 import { isCI } from 'ci-info';
 
+import { add as SetAdd } from 'mnemonist/set';
+
 const WHITELIST_DOMAIN = new Set([
   'w3s.link',
   'dweb.link',
@@ -92,16 +94,19 @@ export const getPhishingDomains = () => traceAsync('get phishing domains', async () => {
       : null,
     getGorhillPublicSuffixPromise()
   ]);
-  domainSet2?.forEach((domain) => domainSet.add(domain));
+  if (domainSet2) {
+    SetAdd(domainSet, domainSet2);
+  }
 
   traceSync.skip('* whitelisting phishing domains', () => {
     const trieForRemovingWhiteListed = createTrie(domainSet);
-    WHITELIST_DOMAIN.forEach(white => {
-      trieForRemovingWhiteListed.find(`.${white}`, false).forEach(f => domainSet.delete(f));
-      // if (trieForRemovingWhiteListed.has(white)) {
+    for (const white of WHITELIST_DOMAIN) {
+      const found = trieForRemovingWhiteListed.find(`.${white}`, false);
+      for (let i = 0, len = found.length; i < len; i++) {
+        domainSet.delete(found[i]);
+      }
       domainSet.delete(white);
-      // }
-    });
+    }
   });
 
   const domainCountMap: Record<string, number> = {};
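In the cache-filesystem.ts hunks above, the repeated millisecond literals are folded into `ONE_HOUR` and `ONE_DAY`, while each `TTL` entry remains a function returning a jittered duration, so cache entries written at the same time do not all expire, and re-fetch, at the same time. A self-contained sketch of that thundering-herd mitigation, reusing the patch's own definitions:

    const randomInt = (min: number, max: number) =>
      Math.floor(Math.random() * (max - min + 1)) + min;

    const ONE_HOUR = 60 * 60 * 1000;
    const ONE_DAY = 24 * ONE_HOUR;

    // Each call yields 23h, 24h or 25h in milliseconds, spreading expiry
    // times out instead of landing them all on the same tick.
    const ONE_DAY_JITTERED = () => randomInt(23, 25) * ONE_HOUR;

    console.log(ONE_DAY_JITTERED()); // e.g. 86400000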
diff --git a/Build/lib/normalize-domain.ts b/Build/lib/normalize-domain.ts
index 52202c2c..5971bc7a 100644
--- a/Build/lib/normalize-domain.ts
+++ b/Build/lib/normalize-domain.ts
@@ -10,8 +10,16 @@ export const normalizeDomain = (domain: string) => {
   if (!parsed.isIcann && !parsed.isPrivate) return null;
 
   let h = parsed.hostname;
-  if (h[0] === '.') h = h.slice(1);
-  if (h.endsWith('.')) h = h.slice(0, -1);
+
+  let sliceStart = 0;
+  let sliceEnd = h.length;
+
+  if (h[0] === '.') sliceStart = 1;
+  if (h.endsWith('.')) sliceEnd = -1;
+
+  if (sliceStart !== 0 || sliceEnd !== h.length) {
+    h = h.slice(sliceStart, sliceEnd);
+  }
 
   if (h) return h;
   return null;
diff --git a/Build/lib/parse-filter.ts b/Build/lib/parse-filter.ts
index b92057d3..c352c897 100644
--- a/Build/lib/parse-filter.ts
+++ b/Build/lib/parse-filter.ts
@@ -208,12 +208,12 @@ export async function processFilterRules(
     }
   ));
 
-  warningMessages.forEach(msg => {
+  for (let i = 0, len = warningMessages.length; i < len; i++) {
     console.warn(
-      picocolors.yellow(msg),
+      picocolors.yellow(warningMessages[i]),
       picocolors.gray(picocolors.underline(filterRulesUrl))
     );
-  });
+  }
 
   console.log(
     picocolors.gray('[process filter]'),
diff --git a/Build/lib/reject-data-source.ts b/Build/lib/reject-data-source.ts
index 03687295..933a4f01 100644
--- a/Build/lib/reject-data-source.ts
+++ b/Build/lib/reject-data-source.ts
@@ -15,17 +15,12 @@ export const HOSTS = [
   // Curben's UrlHaus Malicious URL Blocklist
   // 'https://curbengh.github.io/urlhaus-filter/urlhaus-filter-agh-online.txt',
   // 'https://urlhaus-filter.pages.dev/urlhaus-filter-agh-online.txt',
-  ['https://curbengh.github.io/urlhaus-filter/urlhaus-filter-hosts.txt', true, TTL.THREE_HOURS()],
+  ['https://curbengh.github.io/urlhaus-filter/urlhaus-filter-hosts.txt', true, TTL.THREE_HOURS()]
   // Curben's Phishing URL Blocklist
   // Covered by lib/get-phishing-domains.ts
   // 'https://curbengh.github.io/phishing-filter/phishing-filter-agh.txt'
   // 'https://phishing-filter.pages.dev/phishing-filter-agh.txt'
   // ['https://curbengh.github.io/phishing-filter/phishing-filter-hosts.txt', true, true],
-  // Curben's PUP Domains Blocklist
-  // 'https://curbengh.github.io/pup-filter/pup-filter-agh.txt'
-  // 'https://pup-filter.pages.dev/pup-filter-agh.txt'
-  // The PUP filter has paused the update since 2023-05, so we set a 14 days cache ttl
-  ['https://curbengh.github.io/pup-filter/pup-filter-hosts.txt', true, TTL.TWO_WEEKS()]
 ] as const;
 
 export const DOMAIN_LISTS = [
@@ -38,6 +33,11 @@ export const DOMAIN_LISTS = [
   // DigitalSide Threat-Intel - OSINT Hub
   // Update once per day
   ['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true, TTL.ONE_DAY()],
+  // Curben's PUP Domains Blocklist
+  // 'https://curbengh.github.io/pup-filter/pup-filter-agh.txt'
+  // 'https://pup-filter.pages.dev/pup-filter-agh.txt'
+  // The PUP filter has not been updated since 2023-05, so we set a 14-day cache ttl
+  ['https://curbengh.github.io/pup-filter/pup-filter-domains.txt', true, TTL.TWO_WEEKS()],
   // AdGuard CNAME Filter Combined
   // Update on a 7 days basis, so we add a 3 hours cache ttl
   ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', true, TTL.THREE_DAYS()],
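The normalize-domain.ts hunk above computes both slice bounds first and calls `String#slice` at most once: a negative end index counts from the end of the string, so a hostname with both a leading and a trailing dot is trimmed with one intermediate string instead of two, and an already-clean hostname allocates nothing. A worked sketch; the `trimDots` name is illustrative only:

    const trimDots = (h: string): string => {
      let sliceStart = 0;
      let sliceEnd = h.length;

      if (h[0] === '.') sliceStart = 1;    // leading dot
      if (h.endsWith('.')) sliceEnd = -1;  // trailing dot, counted from the end

      // Only allocate a new string when something actually needs trimming.
      return sliceStart !== 0 || sliceEnd !== h.length
        ? h.slice(sliceStart, sliceEnd)
        : h;
    };

    console.log(trimDots('.example.com.')); // 'example.com' (single slice)
    console.log(trimDots('example.com'));   // 'example.com' (no slice taken)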
diff --git a/Build/lib/set-add-from-array.ts b/Build/lib/set-add-from-array.ts
new file mode 100644
index 00000000..bf025f5b
--- /dev/null
+++ b/Build/lib/set-add-from-array.ts
@@ -0,0 +1,8 @@
+/**
+ * In-place adding of elements from an array to a set.
+ */
+export function setAddFromArray<T>(set: Set<T>, arr: T[]): void {
+  for (let i = 0, len = arr.length; i < len; i++) {
+    set.add(arr[i]);
+  }
+}
diff --git a/Build/validate-gfwlist.ts b/Build/validate-gfwlist.ts
index 1f6cca75..72c25b4a 100644
--- a/Build/validate-gfwlist.ts
+++ b/Build/validate-gfwlist.ts
@@ -93,15 +93,6 @@ export const parseGfwList = async () => {
     runAgainstRuleset(path.resolve(import.meta.dir, '../List/non_ip/stream.conf'))
   ]);
 
-  // for await (const l of readFileByLine(path.resolve(import.meta.dir, '../List/non_ip/stream.conf'))) {
-  //   const line = processLine(l);
-  //   if (!line) continue;
-  //   const domain = line[0] === '.' ? line.slice(1) : line;
-  //   if (top500Gfwed.has(domain)) {
-  //     notIncludedTop500Gfwed.delete(domain);
-  //   }
-  // }
-
   console.log(notIncludedTop500Gfwed);
 
   return [