From a7e7c19a518472bf2d7fcf4a7d3020d7c4f004f3 Mon Sep 17 00:00:00 2001 From: SukkaW Date: Wed, 2 Oct 2024 22:01:38 +0800 Subject: [PATCH] Rrefactor: rewrite trie in class --- Build/build-cdn-download-conf.ts | 23 +-- Build/build-microsoft-cdn.ts | 8 +- Build/lib/get-phishing-domains.ts | 2 +- Build/lib/trie.test.ts | 25 +-- Build/lib/trie.ts | 310 +++++++++++++++--------------- Build/validate-domestic.ts | 2 +- Build/validate-gfwlist.ts | 2 +- 7 files changed, 187 insertions(+), 185 deletions(-) diff --git a/Build/build-cdn-download-conf.ts b/Build/build-cdn-download-conf.ts index 93863503..c7739821 100644 --- a/Build/build-cdn-download-conf.ts +++ b/Build/build-cdn-download-conf.ts @@ -10,19 +10,16 @@ import { processLine } from './lib/process-line'; import { DomainsetOutput } from './lib/create-file'; const getS3OSSDomainsPromise = (async (): Promise => { - const trie = createTrie( - (await getPublicSuffixListTextPromise()).reduce( - (acc, cur) => { - const tmp = processLine(cur); - if (tmp) { - acc.push(tmp); - } - return acc; - }, - [] - ), - true - ); + const trie = createTrie((await getPublicSuffixListTextPromise()).reduce( + (acc, cur) => { + const tmp = processLine(cur); + if (tmp) { + acc.push(tmp); + } + return acc; + }, + [] + )); /** * Extract OSS domain from publicsuffix list diff --git a/Build/build-microsoft-cdn.ts b/Build/build-microsoft-cdn.ts index 6501d7f5..1d6cdb65 100644 --- a/Build/build-microsoft-cdn.ts +++ b/Build/build-microsoft-cdn.ts @@ -1,6 +1,6 @@ import { task } from './trace'; import { fetchRemoteTextByLine } from './lib/fetch-text-by-line'; -import { createTrie } from './lib/trie'; +import { HostnameSmolTrie } from './lib/trie'; import { SHARED_DESCRIPTION } from './lib/constants'; import { createMemoizedPromise } from './lib/memo-promise'; import { extractDomainsFromFelixDnsmasq } from './lib/parse-dnsmasq'; @@ -27,7 +27,7 @@ const BLACKLIST = [ export const getMicrosoftCdnRulesetPromise = createMemoizedPromise<[domains: string[], domainSuffixes: string[]]>(async () => { // First trie is to find the microsoft domains that matches probe domains - const trie = createTrie(null, true); + const trie = new HostnameSmolTrie(); for await (const line of await fetchRemoteTextByLine('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf')) { const domain = extractDomainsFromFelixDnsmasq(line); if (domain) { @@ -37,8 +37,8 @@ export const getMicrosoftCdnRulesetPromise = createMemoizedPromise<[domains: str const foundMicrosoftCdnDomains = PROBE_DOMAINS.flatMap(domain => trie.find(domain)); // Second trie is to remove blacklisted domains - const trie2 = createTrie(foundMicrosoftCdnDomains, true); - BLACKLIST.forEach(trie2.whitelist); + const trie2 = new HostnameSmolTrie(foundMicrosoftCdnDomains); + BLACKLIST.forEach(black => trie2.whitelist(black)); const domains: string[] = DOMAINS; const domainSuffixes: string[] = DOMAIN_SUFFIXES; diff --git a/Build/lib/get-phishing-domains.ts b/Build/lib/get-phishing-domains.ts index 938bc5c0..0c39bf37 100644 --- a/Build/lib/get-phishing-domains.ts +++ b/Build/lib/get-phishing-domains.ts @@ -191,7 +191,7 @@ async function processPhihsingDomains(domainArr: string[]) { ); } -export function calcDomainAbuseScore(subdomain: string, fullDomain: string) { +export function calcDomainAbuseScore(subdomain: string, fullDomain: string = subdomain) { let weight = 0; const hitLowKeywords = lowKeywords(fullDomain); diff --git a/Build/lib/trie.test.ts b/Build/lib/trie.test.ts index bf0a4663..514eacff 100644 --- a/Build/lib/trie.test.ts +++ b/Build/lib/trie.test.ts @@ -56,6 +56,7 @@ describe('Trie', () => { trie.add('skk.moe'); trie.add('blog.skk.moe'); + // eslint-disable-next-line sukka/no-element-overwrite -- deliberately do testing trie.add('skk.moe'); expect(trie.size).to.equal(2); @@ -63,18 +64,18 @@ describe('Trie', () => { }); it('should be possible to set the null sequence.', () => { - let trie = createTrie(null, false); + const trie = createTrie(null, false); trie.add(''); expect(trie.has('')).to.equal(true); - trie = createTrie(null, true); - trie.add(''); - expect(trie.has('')).to.equal(true); + const trie2 = createTrie(null, true); + trie2.add(''); + expect(trie2.has('')).to.equal(true); }); it('should be possible to delete items.', () => { - const trie = createTrie(null); + const trie = createTrie(null, false); trie.add('skk.moe'); trie.add('example.com'); @@ -108,7 +109,7 @@ describe('Trie', () => { }); it('should be possible to retrieve items matching the given prefix.', () => { - const trie = createTrie(null); + const trie = createTrie(null, false); trie.add('example.com'); trie.add('blog.example.com'); @@ -141,12 +142,12 @@ describe('Trie', () => { }); it('should be possible to create a trie from an arbitrary iterable.', () => { - let trie = createTrie(['skk.moe', 'blog.skk.moe']); + let trie = createTrie(['skk.moe', 'blog.skk.moe'], false); expect(trie.size).to.equal(2); expect(trie.has('skk.moe')).to.equal(true); - trie = createTrie(new Set(['skk.moe', 'example.com'])); + trie = createTrie(new Set(['skk.moe', 'example.com']), false); expect(trie.size).to.equal(2); expect(trie.has('skk.moe')).to.equal(true); }); @@ -154,28 +155,28 @@ describe('Trie', () => { describe('surge domainset dedupe', () => { it('should not remove same entry', () => { - const trie = createTrie(['.skk.moe', 'noc.one']); + const trie = createTrie(['.skk.moe', 'noc.one'], false); expect(trie.find('.skk.moe')).to.deep.equal(['.skk.moe']); expect(trie.find('noc.one')).to.deep.equal(['noc.one']); }); it('should match subdomain - 1', () => { - const trie = createTrie(['www.noc.one', 'www.sukkaw.com', 'blog.skk.moe', 'image.cdn.skk.moe', 'cdn.sukkaw.net']); + const trie = createTrie(['www.noc.one', 'www.sukkaw.com', 'blog.skk.moe', 'image.cdn.skk.moe', 'cdn.sukkaw.net'], false); expect(trie.find('.skk.moe')).to.deep.equal(['image.cdn.skk.moe', 'blog.skk.moe']); expect(trie.find('.sukkaw.com')).to.deep.equal(['www.sukkaw.com']); }); it('should match subdomain - 2', () => { - const trie = createTrie(['www.noc.one', 'www.sukkaw.com', '.skk.moe', 'blog.skk.moe', 'image.cdn.skk.moe', 'cdn.sukkaw.net']); + const trie = createTrie(['www.noc.one', 'www.sukkaw.com', '.skk.moe', 'blog.skk.moe', 'image.cdn.skk.moe', 'cdn.sukkaw.net'], false); expect(trie.find('.skk.moe')).to.deep.equal(['.skk.moe', 'image.cdn.skk.moe', 'blog.skk.moe']); expect(trie.find('.sukkaw.com')).to.deep.equal(['www.sukkaw.com']); }); it('should not remove non-subdomain', () => { - const trie = createTrie(['skk.moe', 'sukkaskk.moe']); + const trie = createTrie(['skk.moe', 'sukkaskk.moe'], false); expect(trie.find('.skk.moe')).to.deep.equal([]); }); }); diff --git a/Build/lib/trie.ts b/Build/lib/trie.ts index 2b0cdb0a..88ac72e4 100644 --- a/Build/lib/trie.ts +++ b/Build/lib/trie.ts @@ -80,94 +80,39 @@ const walkHostnameTokens = (hostname: string, onToken: (token: string) => boolea return false; }; -export const createTrie = (from?: string[] | Set | null, smolTree = false) => { - let size = 0; - const root: TrieNode = createNode(); +interface FindSingleChildLeafResult { + node: TrieNode, + toPrune: TrieNode | null, + tokenToPrune: string | null, + parent: TrieNode +} - /** - * Method used to add the given suffix to the trie. - */ - const add = smolTree - ? (suffix: string, meta?: Meta): void => { - let node: TrieNode = root; - let curNodeChildren: Map> = node[2]; +abstract class Triebase { + protected readonly $root: TrieNode = createNode(); + protected $size = 0; - const onToken = (token: string) => { - curNodeChildren = node[2]; - if (curNodeChildren.has(token)) { - node = curNodeChildren.get(token)!; + get root() { + return this.$root; + } - // During the adding of `[start]blog|.skk.moe` and find out that there is a `[start].skk.moe` in the trie, skip adding the rest of the node - if (node[0] && token === '.') { - return true; - } - } else { - const newNode = createNode(node); - curNodeChildren.set(token, newNode); - node = newNode; - } - - return false; - }; - - // When walkHostnameTokens returns true, we should skip the rest - if (walkHostnameTokens(suffix, onToken)) { - return; + constructor(from?: string[] | Set | null) { + // Actually build trie + if (Array.isArray(from)) { + for (let i = 0, l = from.length; i < l; i++) { + this.add(from[i]); } - - // If we are in smolTree mode, we need to do something at the end of the loop - if (suffix[0] === '.') { - // Trying to add `[start].sub.example.com` where there is already a `[start]blog.sub.example.com` in the trie - - // Make sure parent `[start]sub.example.com` (without dot) is removed (SETINEL to false) - (/** parent */ node[1]!)[0] = false; - - // Removing the rest of the parent's child nodes - node[2].clear(); - // The SENTINEL of this node will be set to true at the end of the function, so we don't need to set it here - - // we can use else-if here, because the children is now empty, we don't need to check the leading "." - } else if (node[2].get('.')?.[0] === true) { - // Trying to add `example.com` when there is already a `.example.com` in the trie - // No need to increment size and set SENTINEL to true (skip this "new" item) - return; - } - - node[0] = true; - node[3] = meta!; + } else if (from) { + from.forEach((value) => this.add(value)); } - : (suffix: string, meta?: Meta): void => { - let node: TrieNode = root; + } - const onToken = (token: string) => { - if (node[2].has(token)) { - node = node[2].get(token)!; - } else { - const newNode = createNode(node); - node[2].set(token, newNode); - node = newNode; - } + public abstract add(suffix: string, meta?: Meta): void; - return false; - }; - - // When walkHostnameTokens returns true, we should skip the rest - if (walkHostnameTokens(suffix, onToken)) { - return; - } - - if (!node[0]) { - size++; - node[0] = true; - node[3] = meta!; - } - }; - - const walkIntoLeafWithTokens = ( + protected walkIntoLeafWithTokens( tokens: string[], onLoop: (node: TrieNode, parent: TrieNode, token: string) => void = noop - ) => { - let node: TrieNode = root; + ) { + let node: TrieNode = this.$root; let parent: TrieNode = node; let token: string; @@ -193,11 +138,11 @@ export const createTrie = (from?: string[] | Set | null, smo return { node, parent }; }; - const walkIntoLeafWithSuffix = ( + protected walkIntoLeafWithSuffix( suffix: string, onLoop: (node: TrieNode, parent: TrieNode, token: string) => void = noop - ) => { - let node: TrieNode = root; + ) { + let node: TrieNode = this.$root; let parent: TrieNode = node; const onToken = (token: string) => { @@ -225,18 +170,18 @@ export const createTrie = (from?: string[] | Set | null, smo return { node, parent }; }; - const contains = (suffix: string): boolean => walkIntoLeafWithSuffix(suffix) !== null; + public contains(suffix: string): boolean { return this.walkIntoLeafWithSuffix(suffix) !== null; }; - const walk = ( + private walk( onMatches: (suffix: string[], meta: Meta) => void, - initialNode = root, + initialNode = this.$root, initialSuffix: string[] = [] - ) => { + ) { const nodeStack: Array> = [initialNode]; // Resolving initial string (begin the start of the stack) const suffixStack: string[][] = [initialSuffix]; - let node: TrieNode = root; + let node: TrieNode = initialNode; do { node = nodeStack.pop()!; @@ -256,14 +201,7 @@ export const createTrie = (from?: string[] | Set | null, smo } while (nodeStack.length); }; - interface FindSingleChildLeafResult { - node: TrieNode, - toPrune: TrieNode | null, - tokenToPrune: string | null, - parent: TrieNode - } - - const getSingleChildLeaf = (tokens: string[]): FindSingleChildLeafResult | null => { + protected getSingleChildLeaf(tokens: string[]): FindSingleChildLeafResult | null { let toPrune: TrieNode | null = null; let tokenToPrune: string | null = null; @@ -289,7 +227,7 @@ export const createTrie = (from?: string[] | Set | null, smo } }; - const res = walkIntoLeafWithTokens(tokens, onLoop); + const res = this.walkIntoLeafWithTokens(tokens, onLoop); if (res === null) return null; return { node: res.node, toPrune, tokenToPrune, parent: res.parent }; @@ -298,16 +236,16 @@ export const createTrie = (from?: string[] | Set | null, smo /** * Method used to retrieve every item in the trie with the given prefix. */ - const find = ( + public find( inputSuffix: string, /** @default true */ includeEqualWithSuffix = true - ): string[] => { + ): string[] { // if (smolTree) { // throw new Error('A Trie with smolTree enabled cannot perform find!'); // } const inputTokens = hostnameToTokens(inputSuffix); - const res = walkIntoLeafWithTokens(inputTokens); + const res = this.walkIntoLeafWithTokens(inputTokens); if (res === null) return []; const matches: string[][] = []; @@ -322,7 +260,7 @@ export const createTrie = (from?: string[] | Set | null, smo } }; - walk( + this.walk( onMatches, res.node, // Performing DFS from prefix inputTokens @@ -334,13 +272,13 @@ export const createTrie = (from?: string[] | Set | null, smo /** * Method used to delete a prefix from the trie. */ - const remove = (suffix: string): boolean => { - const res = getSingleChildLeaf(hostnameToTokens(suffix)); + public remove(suffix: string): boolean { + const res = this.getSingleChildLeaf(hostnameToTokens(suffix)); if (res === null) return false; if (!res.node[0]) return false; - size--; + this.$size--; const { node, toPrune, tokenToPrune } = res; if (tokenToPrune && toPrune) { @@ -352,58 +290,121 @@ export const createTrie = (from?: string[] | Set | null, smo return true; }; + // eslint-disable-next-line @typescript-eslint/unbound-method -- alias class methods + public delete = this.remove; + /** - * Method used to assert whether the given prefix exists in the Trie. - */ - const has = (suffix: string): boolean => { - const res = walkIntoLeafWithSuffix(suffix); + * Method used to assert whether the given prefix exists in the Trie. + */ + public has(suffix: string): boolean { + const res = this.walkIntoLeafWithSuffix(suffix); return res ? res.node[0] : false; }; - function dump(onSuffix: (suffix: string) => void): void; - function dump(): string[]; - function dump(onSuffix?: (suffix: string) => void): string[] | void { + public dump(onSuffix: (suffix: string) => void): void; + public dump(): string[]; + public dump(onSuffix?: (suffix: string) => void): string[] | void { const results: string[] = []; const handleSuffix = onSuffix ? (suffix: string[]) => onSuffix(fastStringArrayJoin(suffix, '')) : (suffix: string[]) => results.push(fastStringArrayJoin(suffix, '')); - walk(handleSuffix); + this.walk(handleSuffix); return results; }; - const dumpMeta = () => { + public dumpMeta() { const results: Meta[] = []; - walk((suffix, meta) => { + this.walk((_suffix, meta) => { results.push(meta); }); return results; }; - const dumpWithMeta = () => { + public dumpWithMeta() { const results: Array<[string, Meta]> = []; - walk((suffix, meta) => { + this.walk((suffix, meta) => { results.push([fastStringArrayJoin(suffix, ''), meta]); }); return results; }; - const whitelist = (suffix: string) => { - if (!smolTree) { - throw new Error('whitelist method is only available in smolTree mode.'); + public inspect(depth: number, unpackMeta?: (meta?: Meta) => any) { + return fastStringArrayJoin( + JSON.stringify(deepTrieNodeToJSON(this.$root, unpackMeta), null, 2).split('\n').map((line) => ' '.repeat(depth) + line), + '\n' + ); + } + + public [util.inspect.custom](depth: number) { + return this.inspect(depth); + }; +} + +export class HostnameSmolTrie extends Triebase { + public smolTree = true; + + add(suffix: string, meta?: Meta): void { + let node: TrieNode = this.$root; + let curNodeChildren: Map> = node[2]; + + const onToken = (token: string) => { + curNodeChildren = node[2]; + if (curNodeChildren.has(token)) { + node = curNodeChildren.get(token)!; + + // During the adding of `[start]blog|.skk.moe` and find out that there is a `[start].skk.moe` in the trie, skip adding the rest of the node + if (node[0] && token === '.') { + return true; + } + } else { + const newNode = createNode(node); + curNodeChildren.set(token, newNode); + node = newNode; + } + + return false; + }; + + // When walkHostnameTokens returns true, we should skip the rest + if (walkHostnameTokens(suffix, onToken)) { + return; } + // If we are in smolTree mode, we need to do something at the end of the loop + if (suffix[0] === '.') { + // Trying to add `[start].sub.example.com` where there is already a `[start]blog.sub.example.com` in the trie + + // Make sure parent `[start]sub.example.com` (without dot) is removed (SETINEL to false) + (/** parent */ node[1]!)[0] = false; + + // Removing the rest of the parent's child nodes + node[2].clear(); + // The SENTINEL of this node will be set to true at the end of the function, so we don't need to set it here + + // we can use else-if here, because the children is now empty, we don't need to check the leading "." + } else if (node[2].get('.')?.[0] === true) { + // Trying to add `example.com` when there is already a `.example.com` in the trie + // No need to increment size and set SENTINEL to true (skip this "new" item) + return; + } + + node[0] = true; + node[3] = meta!; + } + + public whitelist(suffix: string) { const tokens = hostnameToTokens(suffix); - const res = getSingleChildLeaf(tokens); + const res = this.getSingleChildLeaf(tokens); if (res === null) return; @@ -433,45 +434,48 @@ export const createTrie = (from?: string[] | Set | null, smo node[0] = false; } }; +} - // Actually build trie - if (Array.isArray(from)) { - for (let i = 0, l = from.length; i < l; i++) { - add(from[i]); - } - } else if (from) { - from.forEach((value) => add(value)); +export class HostnameTrie extends Triebase { + get size() { + return this.$size; } - const inspect = (depth: number, unpackMeta?: (meta?: Meta) => any) => fastStringArrayJoin( - JSON.stringify(deepTrieNodeToJSON(root, unpackMeta), null, 2).split('\n').map((line) => ' '.repeat(depth) + line), - '\n' - ); + add(suffix: string, meta?: Meta): void { + let node: TrieNode = this.$root; - return { - add, - contains, - find, - remove, - delete: remove, - has, - dump, - dumpMeta, - dumpWithMeta, - get size() { - if (smolTree) { - throw new Error('A Trie with smolTree enabled cannot have correct size!'); + const onToken = (token: string) => { + if (node[2].has(token)) { + node = node[2].get(token)!; + } else { + const newNode = createNode(node); + node[2].set(token, newNode); + node = newNode; } - return size; - }, - get root() { - return root; - }, - whitelist, - inspect, - [util.inspect.custom]: inspect, - smolTree - }; + + return false; + }; + + // When walkHostnameTokens returns true, we should skip the rest + if (walkHostnameTokens(suffix, onToken)) { + return; + } + + if (!node[0]) { + this.$size++; + node[0] = true; + node[3] = meta!; + } + } +} + +export function createTrie(from: string[] | Set | null, smolTree: true): HostnameSmolTrie; +export function createTrie(from?: string[] | Set | null, smolTree?: false): HostnameTrie; +export function createTrie<_Meta = any>(from?: string[] | Set | null, smolTree = true) { + if (smolTree) { + return new HostnameSmolTrie(from); + } + return new HostnameTrie(from); }; export type Trie = ReturnType; diff --git a/Build/validate-domestic.ts b/Build/validate-domestic.ts index 158ea3c1..8e2831a4 100644 --- a/Build/validate-domestic.ts +++ b/Build/validate-domestic.ts @@ -7,7 +7,7 @@ import { parseFelixDnsmasq } from './lib/parse-dnsmasq'; import { SOURCE_DIR } from './constants/dir'; export const parseDomesticList = async () => { - const trie = createTrie(await parseFelixDnsmasq('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf'), true); + const trie = createTrie(await parseFelixDnsmasq('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf')); const top5000 = new Set(); diff --git a/Build/validate-gfwlist.ts b/Build/validate-gfwlist.ts index 72f97657..d07f314a 100644 --- a/Build/validate-gfwlist.ts +++ b/Build/validate-gfwlist.ts @@ -76,7 +76,7 @@ export const parseGfwList = async () => { })).text(); const topDomains = parse(res); - const trie = createTrie(blackSet, true); + const trie = createTrie(blackSet); for await (const [domain] of topDomains) { if (trie.has(domain)) {