From 35aa11f36137226fcf5f02f2570cd7e528ba759d Mon Sep 17 00:00:00 2001 From: SukkaW Date: Sun, 12 May 2024 00:50:50 +0800 Subject: [PATCH] Perf: remove cached tld parse --- Build/lib/cached-tld-parse.ts | 9 ----- Build/lib/get-phishing-domains.ts | 4 +-- Build/lib/trie.ts | 59 +++++++++++++------------------ 3 files changed, 26 insertions(+), 46 deletions(-) delete mode 100644 Build/lib/cached-tld-parse.ts diff --git a/Build/lib/cached-tld-parse.ts b/Build/lib/cached-tld-parse.ts deleted file mode 100644 index a09fa809..00000000 --- a/Build/lib/cached-tld-parse.ts +++ /dev/null @@ -1,9 +0,0 @@ -import { createCache } from './cache-apply'; -import type { PublicSuffixList } from '@gorhill/publicsuffixlist'; - -let gorhillGetDomainCache: ReturnType | null = null; -export const createCachedGorhillGetDomain = (gorhill: PublicSuffixList) => { - gorhillGetDomainCache ??= createCache('cached-gorhill-get-domain', true); - return (domain: string) => gorhillGetDomainCache! // we do know gothillGetDomainCache exists here - .sync(domain, () => gorhill.getDomain(domain[0] === '.' ? domain.slice(1) : domain)); -}; diff --git a/Build/lib/get-phishing-domains.ts b/Build/lib/get-phishing-domains.ts index a5644555..4c1254b8 100644 --- a/Build/lib/get-phishing-domains.ts +++ b/Build/lib/get-phishing-domains.ts @@ -2,7 +2,6 @@ import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix'; import { processDomainLists } from './parse-filter'; import * as tldts from 'tldts'; import { createTrie } from './trie'; -import { createCachedGorhillGetDomain } from './cached-tld-parse'; import { processLine } from './process-line'; import { TTL } from './cache-filesystem'; import { isCI } from 'ci-info'; @@ -130,7 +129,6 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g }); const domainCountMap: Record = {}; - const getDomain = createCachedGorhillGetDomain(gorhill); span.traceChildSync('process phishing domain set', () => { const domainArr = Array.from(domainSet); @@ -139,7 +137,7 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g const line = processLine(domainArr[i]); if (!line) continue; - const apexDomain = getDomain(line); + const apexDomain = gorhill.getDomain(line); if (!apexDomain) continue; domainCountMap[apexDomain] ||= 0; diff --git a/Build/lib/trie.ts b/Build/lib/trie.ts index f81bc117..74f34306 100644 --- a/Build/lib/trie.ts +++ b/Build/lib/trie.ts @@ -79,11 +79,10 @@ export const createTrie = (from?: string[] | Set | null, hostnameMode = if (node.has(token)) { node = node.get(token)!; - if (smolTree) { - if (node.get('.')?.[SENTINEL] === true) { - return; - } - // return; + // During the adding of `[start]blog.skk.moe` and find out that there is a `[start].skk.moe` in the trie + // Dedupe the covered subdomain by skipping + if (smolTree && (node.get('.')?.[SENTINEL])) { + return; } } else { const newNode = createNode(node); @@ -92,9 +91,12 @@ export const createTrie = (from?: string[] | Set | null, hostnameMode = } if (smolTree) { + // Trying to add `[start].sub.example.com` where there is already a `[start]blog.sub.example.com` in the trie if (i === 1 && tokens[0] === '.') { + // If there is a `[start]sub.example.com` here, remove it node[SENTINEL] = false; - // Trying to add `.sub.example.com` where there is already a `blog.sub.example.com` in the trie + + // Removing the rest of the child nodes by creating a new node and disconnecting the old one const newNode = createNode(node); node.set('.', newNode); node = newNode; @@ -225,13 +227,11 @@ export const createTrie = (from?: string[] | Set | null, hostnameMode = node = nodeStack.pop()!; if (node[SENTINEL]) { - if (suffix !== inputTokens) { - // found match, delete it from set - if (hostnameMode) { - set.delete((suffix as string[]).join('')); - } else { - set.delete(suffix as string); - } + // found match, delete it from set + if (hostnameMode) { + set.delete((suffix as string[]).join('')); + } else if (suffix !== inputTokens) { + set.delete(suffix as string); } } @@ -317,37 +317,22 @@ export const createTrie = (from?: string[] | Set | null, hostnameMode = return node[SENTINEL]; }; - if (Array.isArray(from)) { - for (let i = 0, l = from.length; i < l; i++) { - add(from[i]); - } - } else if (from) { - from.forEach(add); - } - const dump = () => { const nodeStack: TrieNode[] = []; const suffixStack: Array = []; - // Resolving initial string - const suffix = hostnameMode ? [] : ''; nodeStack.push(root); - suffixStack.push(suffix); + // Resolving initial string (begin the start of the stack) + suffixStack.push(hostnameMode ? [] : ''); const results: string[] = []; let node: TrieNode; do { - let hasValue = false; - node = nodeStack.pop()!; const suffix = suffixStack.pop()!; - if (node[SENTINEL]) { - hasValue = true; - } - node.forEach((childNode, k) => { nodeStack.push(childNode); @@ -358,16 +343,22 @@ export const createTrie = (from?: string[] | Set | null, hostnameMode = } }); - if (hasValue) { - results.push( - hostnameMode ? (suffix as string[]).join('') : (suffix as string) - ); + if (node[SENTINEL]) { + results.push(hostnameMode ? (suffix as string[]).join('') : (suffix as string)); } } while (nodeStack.length); return results; }; + if (Array.isArray(from)) { + for (let i = 0, l = from.length; i < l; i++) { + add(from[i]); + } + } else if (from) { + from.forEach(add); + } + return { add, contains,