Fix/Perf: more efficient and correct whitelisting

This commit is contained in:
SukkaW 2024-05-27 01:09:11 +08:00
parent 8b1eeb1c14
commit 2f329a4144
4 changed files with 52 additions and 35 deletions

View File

@ -94,9 +94,6 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
}
});
// Remove as many domains as possible from domainSets before creating trie
SetSubstract(domainSets, filterRuleWhitelistDomainSets);
// Perform kwfilter to remove as many domains as possible from domainSets before creating trie
childSpan.traceChildSync('dedupe from black keywords', () => {
const kwfilter = createKeywordFilter(domainKeywordsSet);
@ -110,11 +107,14 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
});
});
const trie = span.traceChildSync('dedupe from white suffixes', () => {
const trie = createTrie(domainSets, true, true);
span.traceChildSync('dedupe from white suffixes', () => {
filterRuleWhitelistDomainSets.forEach(suffix => {
trie.whitelist(suffix);
});
return trie;
});
// Dedupe domainSets

View File

@ -251,7 +251,7 @@ export const PREDEFINED_WHITELIST = [
'business.site', // Drag'n'Drop site building platform
'page.link', // Firebase URL Shortener
'notion.site'
];
].map(suffix => `.${suffix}`);
export const PREDEFINED_ENFORCED_WHITELIST = [
'r2.dev',

View File

@ -208,6 +208,26 @@ describe('smol tree', () => {
]);
});
// Verifies that whitelisting removes the matching entries from the trie,
// both for a dot-prefixed suffix and for the bare form of a domain.
it('should whitelist trie correctly', () => {
// Seed the trie with overlapping entries for `t.co` plus one unrelated suffix.
// NOTE(review): the two `true` flags are passed positionally; the first maps to
// `hostnameMode`, the second is presumably the "smol tree" mode — confirm against createTrie.
const trie = createTrie([
'.t.co',
't.co',
'example.t.co',
'.skk.moe'
], true, true);
// Of the three `t.co` inputs only `.t.co` is expected in the dump, next to `.skk.moe`.
expect(trie.dump()).toStrictEqual([
'.skk.moe',
'.t.co'
]);
// Whitelisting the dot-prefixed suffix is expected to drop `.t.co` and leave `.skk.moe` alone.
trie.whitelist('.t.co');
expect(trie.dump()).toStrictEqual(['.skk.moe']);
// Whitelisting the bare `skk.moe` (no leading dot) is expected to drop the stored `.skk.moe` too.
trie.whitelist('skk.moe');
expect(trie.dump()).toStrictEqual([]);
});
it('should efficiently whitelist domains', () => {
const trie = createTrie([
'skk.moe',
@ -231,7 +251,6 @@ describe('smol tree', () => {
]);
trie.whitelist('anotherskk.moe');
expect(trie.dump()).toStrictEqual([
'blog.anotherskk.moe'
]);

View File

@ -370,27 +370,11 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
parent = node;
node = node.get(token);
if (!node) {
return;
}
// Keeping track of a potential branch to prune
// If the node is to be pruned, but there is more than one token child in it, we can't prune it
// If there is only one token child, or no child at all, we can prune it safely
const onlyChild = node.size === 1 && node.has(token);
if (onlyChild) {
toPrune = parent;
tokenToPrune = token;
} else if (toPrune !== null) { // not only child, retain the branch
toPrune = null;
tokenToPrune = null;
}
if (!node) return;
// During the whitelist of `[start]blog.skk.moe` and find out that there is a `[start].skk.moe` in the trie
// Dedupe the covered subdomain by skipping
if (node.get('.')?.[SENTINEL]) {
if (i > 1 && node.get('.')?.[SENTINEL] === true) {
return;
}
@ -399,21 +383,35 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
// If there is a `[start]sub.example.com` here, remove it
node[SENTINEL] = false;
// Removing the rest of the child nodes by creating a new node and disconnecting the old one
const newNode = createNode(node);
node.set('.', newNode);
node = newNode;
break;
// Removing all the child nodes by disconnecting "."
node.delete('.');
} else if (i === 0) {
// Trying to whitelist `example.com` when there is already a `.example.com` in the trie
const dotNode = node.get('.');
if (dotNode?.[SENTINEL] === true) {
dotNode[SENTINEL] = false;
}
if (i === 0) {
// Trying to add `example.com` when there is already a `.example.com` in the trie
if (node.get('.')?.[SENTINEL] === true) {
return;
}
// Keeping track of a potential branch to prune
// If the node is to be pruned, but there is more than one token child in it, we can't prune it
// If there is only one token child, or no child at all, we can prune it safely
if (toPrune != null) { // the first branch that could potentially being pruned
if (node.size > 1 || node.has('.')) {
// not only child, retain the branch.
// And we need to abort pruning the parent, so we set it to null
toPrune = null;
tokenToPrune = null;
}
} else if (node.size < 2 && !node.has('.')) {
toPrune = parent;
tokenToPrune = token;
}
}
if (!node[SENTINEL]) return false;
if (tokenToPrune && toPrune) {
toPrune.delete(tokenToPrune);
} else {