diff --git a/Build/build-reject-domainset.ts b/Build/build-reject-domainset.ts index 7172858f..4fb51457 100644 --- a/Build/build-reject-domainset.ts +++ b/Build/build-reject-domainset.ts @@ -94,9 +94,6 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => { } }); - // Remove as many domains as possible from domainSets before creating trie - SetSubstract(domainSets, filterRuleWhitelistDomainSets); - // Perform kwfilter to remove as many domains as possible from domainSets before creating trie childSpan.traceChildSync('dedupe from black keywords', () => { const kwfilter = createKeywordFilter(domainKeywordsSet); @@ -110,11 +107,14 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => { }); }); - const trie = createTrie(domainSets, true, true); - span.traceChildSync('dedupe from white suffixes', () => { + const trie = span.traceChildSync('dedupe from white suffixes', () => { + const trie = createTrie(domainSets, true, true); + filterRuleWhitelistDomainSets.forEach(suffix => { trie.whitelist(suffix); }); + + return trie; }); // Dedupe domainSets diff --git a/Build/lib/reject-data-source.ts b/Build/lib/reject-data-source.ts index 92747bf3..222d5cc6 100644 --- a/Build/lib/reject-data-source.ts +++ b/Build/lib/reject-data-source.ts @@ -251,7 +251,7 @@ export const PREDEFINED_WHITELIST = [ 'business.site', // Drag'n'Drop site building platform 'page.link', // Firebase URL Shortener 'notion.site' -]; +].map(suffix => `.${suffix}`); export const PREDEFINED_ENFORCED_WHITELIST = [ 'r2.dev', diff --git a/Build/lib/trie.test.ts b/Build/lib/trie.test.ts index fd21d513..af8b0fd4 100644 --- a/Build/lib/trie.test.ts +++ b/Build/lib/trie.test.ts @@ -208,6 +208,26 @@ describe('smol tree', () => { ]); }); + it('should whitelist trie correctly', () => { + const trie = createTrie([ + '.t.co', + 't.co', + 'example.t.co', + '.skk.moe' + ], true, true); + + expect(trie.dump()).toStrictEqual([ + '.skk.moe', + '.t.co' + ]); + + 
trie.whitelist('.t.co'); + expect(trie.dump()).toStrictEqual(['.skk.moe']); + + trie.whitelist('skk.moe'); + expect(trie.dump()).toStrictEqual([]); + }); + it('should efficiently whitelist domains', () => { const trie = createTrie([ 'skk.moe', @@ -231,7 +251,6 @@ describe('smol tree', () => { ]); trie.whitelist('anotherskk.moe'); - expect(trie.dump()).toStrictEqual([ 'blog.anotherskk.moe' ]); diff --git a/Build/lib/trie.ts b/Build/lib/trie.ts index 84646fbe..ae9a8cdb 100644 --- a/Build/lib/trie.ts +++ b/Build/lib/trie.ts @@ -370,27 +370,11 @@ export const createTrie = (from?: string[] | Set | null, hostnameMode = parent = node; node = node.get(token); - if (!node) { - return; - } - - // Keeping track of a potential branch to prune - // If the node is to be pruned, but they are more than one token child in it, we can't prune it - // If there is only one token child, or no child at all, we can prune it safely - - const onlyChild = node.size === 1 && node.has(token); - - if (onlyChild) { - toPrune = parent; - tokenToPrune = token; - } else if (toPrune !== null) { // not only child, retain the branch - toPrune = null; - tokenToPrune = null; - } + if (!node) return; // During the whitelist of `[start]blog.skk.moe` and find out that there is a `[start].skk.moe` in the trie // Dedupe the covered subdomain by skipping - if (node.get('.')?.[SENTINEL]) { + if (i > 1 && node.get('.')?.[SENTINEL] === true) { return; } @@ -399,21 +383,35 @@ export const createTrie = (from?: string[] | Set | null, hostnameMode = // If there is a `[start]sub.example.com` here, remove it node[SENTINEL] = false; - // Removing the rest of the child nodes by creating a new node and disconnecting the old one - const newNode = createNode(node); - node.set('.', newNode); - node = newNode; - break; - } - if (i === 0) { - // Trying to add `example.com` when there is already a `.example.com` in the trie - if (node.get('.')?.[SENTINEL] === true) { - return; + // Removing all the child nodes by disconnecting 
"." + node.delete('.'); + } else if (i === 0) { + // Trying to whitelist `example.com` when there is already a `.example.com` in the trie + const dotNode = node.get('.'); + if (dotNode?.[SENTINEL] === true) { + dotNode[SENTINEL] = false; } } + + // Keeping track of a potential branch to prune + // If the node is to be pruned, but there is more than one token child in it, we can't prune it + // If there is only one token child, or no child at all, we can prune it safely + + if (toPrune != null) { // the first branch that could potentially be pruned + if (node.size > 1 || node.has('.')) { + // not only child, retain the branch. + // And we need to abort pruning the parent, so we set it to null + toPrune = null; + tokenToPrune = null; + } + } else if (node.size < 2 && !node.has('.')) { + toPrune = parent; + tokenToPrune = token; + } } if (!node[SENTINEL]) return false; + if (tokenToPrune && toPrune) { toPrune.delete(tokenToPrune); } else {