Perf: domain deduper using only trie + DFS

This commit is contained in:
SukkaW 2024-05-11 16:49:06 +08:00
parent 1b116637d2
commit 160e7bfab7
3 changed files with 164 additions and 42 deletions

View File

@ -3,22 +3,29 @@ import { createTrie } from './trie';
/**
 * Deduplicate a list of domains using a hostname-mode trie built with the
 * `smolTree` option enabled.
 *
 * All of the work happens while the trie is being built: `createTrie` is
 * called with both `hostnameMode` and `smolTree` set to `true`, and the
 * result is whatever `trie.dump()` returns afterwards. No second pass over
 * `inputDomains` is made here.
 *
 * @param inputDomains Domains to deduplicate. Entries may start with `.`.
 * @param toArray When `true` (the default) the dumped array is returned
 * as-is; when `false` the dumped entries are wrapped in a new `Set`.
 * @returns The entries dumped from the trie, as `string[]` or `Set<string>`.
 */
export function domainDeduper(inputDomains: string[], toArray?: true): string[];
export function domainDeduper(inputDomains: string[], toArray: false): Set<string>;
export function domainDeduper(inputDomains: string[], toArray = true): string[] | Set<string> {
  // hostnameMode = true, smolTree = true
  const trie = createTrie(inputDomains, true, true);
  const dumped = trie.dump();

  if (toArray) {
    return dumped;
  }

  return new Set(dumped);
}

View File

@ -112,7 +112,7 @@ describe.each([
expect(trie.find('noc.one')).toStrictEqual(['noc.one']); expect(trie.find('noc.one')).toStrictEqual(['noc.one']);
}); });
it('should remove subdomain', () => { it('should match subdomain - 1', () => {
const trie = createTrie(['www.noc.one', 'www.sukkaw.com', 'blog.skk.moe', 'image.cdn.skk.moe', 'cdn.sukkaw.net'], hostnameMode); const trie = createTrie(['www.noc.one', 'www.sukkaw.com', 'blog.skk.moe', 'image.cdn.skk.moe', 'cdn.sukkaw.net'], hostnameMode);
console.log(trie); console.log(trie);
@ -121,8 +121,80 @@ describe.each([
expect(trie.find('.sukkaw.com')).toStrictEqual(['www.sukkaw.com']); expect(trie.find('.sukkaw.com')).toStrictEqual(['www.sukkaw.com']);
}); });
// The input contains `.skk.moe` itself next to two of its subdomains; find()
// is expected to report the queried entry together with both subdomains,
// and to leave unrelated domains out.
it('should match subdomain - 2', () => {
  const domains = [
    'www.noc.one',
    'www.sukkaw.com',
    '.skk.moe',
    'blog.skk.moe',
    'image.cdn.skk.moe',
    'cdn.sukkaw.net'
  ];
  const trie = createTrie(domains, hostnameMode);

  console.log(trie);

  const underSkkMoe = trie.find('.skk.moe');
  expect(underSkkMoe).toStrictEqual(['.skk.moe', 'image.cdn.skk.moe', 'blog.skk.moe']);

  const underSukkawCom = trie.find('.sukkaw.com');
  expect(underSukkawCom).toStrictEqual(['www.sukkaw.com']);
});
it('should not remove non-subdomain', () => { it('should not remove non-subdomain', () => {
const trie = createTrie(['skk.moe', 'sukkaskk.moe'], hostnameMode); const trie = createTrie(['skk.moe', 'sukkaskk.moe'], hostnameMode);
expect(trie.find('.skk.moe')).toStrictEqual([]); expect(trie.find('.skk.moe')).toStrictEqual([]);
}); });
}); });
// Tests for tries created with `createTrie(domains, true, true)`, i.e. with
// both hostname mode and the `smolTree` flag enabled. Each case checks the
// exact array returned by `dump()`.
describe('smol tree', () => {
  it('should create simple tree - 1', () => {
    const trie = createTrie([
      '.skk.moe', 'blog.skk.moe', '.cdn.skk.moe', 'skk.moe',
      'www.noc.one', 'cdn.noc.one',
      '.blog.sub.example.com', 'sub.example.com', 'cdn.sub.example.com', '.sub.example.com'
    ], true, true);

    console.log(trie);

    // Only the leading-dot entries remain for skk.moe and sub.example.com;
    // noc.one has no leading-dot entry so both of its hosts are kept.
    expect(trie.dump()).toStrictEqual([
      '.sub.example.com',
      'cdn.noc.one', 'www.noc.one',
      '.skk.moe'
    ]);
  });

  // No `.only` here: a focus marker left in a committed test can cause the
  // rest of the suite to be skipped.
  it('should create simple tree - 2', () => {
    const trie = createTrie([
      '.skk.moe', 'blog.skk.moe', '.cdn.skk.moe', 'skk.moe'
    ], true, true);

    console.log({ trie });

    expect(trie.dump()).toStrictEqual([
      '.skk.moe'
    ]);
  });

  // Title renumbered so that every test in this suite has a unique name.
  it('should create simple tree - 3', () => {
    const trie = createTrie([
      '.blog.sub.example.com', 'cdn.sub.example.com', '.sub.example.com'
    ], true, true);

    console.log(trie);

    expect(trie.dump()).toStrictEqual([
      '.sub.example.com'
    ]);

    // Adding an entry that is already present must not change the dump.
    trie.add('.sub.example.com');
    expect(trie.dump()).toStrictEqual([
      '.sub.example.com'
    ]);
  });

  it('should create simple tree - 4', () => {
    // None of these entries start with `.`, so a host and its own subdomain
    // are both expected to be kept.
    const trie = createTrie([
      'commercial.shouji.360.cn',
      'act.commercial.shouji.360.cn',
      'cdn.creative.medialytics.com',
      'px.cdn.creative.medialytics.com'
    ], true, true);

    expect(trie.dump()).toStrictEqual([
      'cdn.creative.medialytics.com',
      'px.cdn.creative.medialytics.com',
      'commercial.shouji.360.cn',
      'act.commercial.shouji.360.cn'
    ]);
  });
});

View File

@ -5,9 +5,11 @@
// import { Trie } from 'mnemonist'; // import { Trie } from 'mnemonist';
export const SENTINEL = Symbol('SENTINEL'); export const SENTINEL = Symbol('SENTINEL');
const PARENT = Symbol('Parent Node');
type TrieNode = { type TrieNode = {
[SENTINEL]: boolean, [SENTINEL]: boolean,
[PARENT]: TrieNode | null,
[Bun.inspect.custom]: () => string [Bun.inspect.custom]: () => string
} & Map<string, TrieNode>; } & Map<string, TrieNode>;
@ -26,14 +28,15 @@ function trieNodeInspectCustom(this: TrieNode) {
return JSON.stringify(deepTrieNodeToJSON(this), null, 2); return JSON.stringify(deepTrieNodeToJSON(this), null, 2);
} }
/**
 * Create an empty trie node.
 *
 * A node is a `Map` from token to child node, carrying three extra
 * symbol-keyed properties: `SENTINEL` (initialised to `false`), `PARENT`
 * (the node passed in, or `null`) and Bun's custom inspect hook.
 *
 * @param parent Node to record under `PARENT`. Defaults to `null`, which is
 * what a call with no arguments (as done for the root) produces.
 * @returns The newly created node, with no children.
 */
const createNode = (parent: TrieNode | null = null): TrieNode => {
  const node = new Map<string, TrieNode>() as TrieNode;
  node[SENTINEL] = false;
  node[PARENT] = parent;
  // Makes console.log / Bun.inspect print the node via trieNodeInspectCustom.
  node[Bun.inspect.custom] = trieNodeInspectCustom;
  return node;
};
export const createTrie = (from?: string[] | Set<string> | null, hostnameMode = false) => { export const createTrie = (from?: string[] | Set<string> | null, hostnameMode = false, smolTree = false) => {
let size = 0; let size = 0;
const root: TrieNode = createNode(); const root: TrieNode = createNode();
@ -75,11 +78,35 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
if (node.has(token)) { if (node.has(token)) {
node = node.get(token)!; node = node.get(token)!;
if (smolTree) {
if (node.get('.')?.[SENTINEL] === true) {
return;
}
// return;
}
} else { } else {
const newNode = createNode(); const newNode = createNode(node);
node.set(token, newNode); node.set(token, newNode);
node = newNode; node = newNode;
} }
if (smolTree) {
if (i === 1 && tokens[0] === '.') {
node[SENTINEL] = false;
// Trying to add `.sub.example.com` where there is already a `blog.sub.example.com` in the trie
const newNode = createNode(node);
node.set('.', newNode);
node = newNode;
break;
}
if (i === 0) {
// Trying to add `example.com` when there is already a `.example.com` in the trie
if (node.get('.')?.[SENTINEL] === true) {
return;
}
}
}
} }
// Do we need to increase size? // Do we need to increase size?
@ -107,10 +134,15 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
return true; return true;
}; };
/** /**
* Method used to retrieve every item in the trie with the given prefix. * Method used to retrieve every item in the trie with the given prefix.
*/ */
const find = (inputSuffix: string, /** @default true */ includeEqualWithSuffix = true): string[] => { const find = (inputSuffix: string, /** @default true */ includeEqualWithSuffix = true): string[] => {
if (smolTree) {
throw new Error('A Trie with smolTree enabled cannot perform find!');
}
let node: TrieNode | undefined = root; let node: TrieNode | undefined = root;
let token: string; let token: string;
@ -153,10 +185,7 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
nodeStack.push(childNode); nodeStack.push(childNode);
if (hostnameMode) { if (hostnameMode) {
const stack = (suffix as string[]).slice(); suffixStack.push([k, ...suffix]);
stack.unshift(k);
suffixStack.push(stack);
} else { } else {
suffixStack.push(k + (suffix as string)); suffixStack.push(k + (suffix as string));
} }
@ -167,9 +196,13 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
}; };
/** /**
* Works like trie.find, but instead of returning the matches as an array, it removes them from the given set in-place. * Works like trie.find, but instead of returning the matches as an array, it removes them from the given set in-place.
*/ */
const substractSetInPlaceFromFound = (inputSuffix: string, set: Set<string>) => { const substractSetInPlaceFromFound = (inputSuffix: string, set: Set<string>) => {
if (smolTree) {
throw new Error('A Trie with smolTree enabled cannot perform substractSetInPlaceFromFound!');
}
let node: TrieNode | undefined = root; let node: TrieNode | undefined = root;
let token: string; let token: string;
@ -193,7 +226,7 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
if (node[SENTINEL]) { if (node[SENTINEL]) {
if (suffix !== inputTokens) { if (suffix !== inputTokens) {
// found match, delete it from set // found match, delete it from set
if (hostnameMode) { if (hostnameMode) {
set.delete((suffix as string[]).join('')); set.delete((suffix as string[]).join(''));
} else { } else {
@ -205,8 +238,7 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
node.forEach((childNode, k) => { node.forEach((childNode, k) => {
nodeStack.push(childNode); nodeStack.push(childNode);
if (hostnameMode) { if (hostnameMode) {
const stack = (suffix as string[]).slice(); const stack = [k, ...suffix];
stack.unshift(k);
suffixStack.push(stack); suffixStack.push(stack);
} else { } else {
suffixStack.push(k + (suffix as string)); suffixStack.push(k + (suffix as string));
@ -216,8 +248,8 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
}; };
/** /**
* Method used to delete a prefix from the trie. * Method used to delete a prefix from the trie.
*/ */
const remove = (suffix: string): boolean => { const remove = (suffix: string): boolean => {
let node: TrieNode | undefined = root; let node: TrieNode | undefined = root;
let toPrune: TrieNode | null = null; let toPrune: TrieNode | null = null;
@ -294,35 +326,43 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
} }
const dump = () => { const dump = () => {
const node = root;
const nodeStack: TrieNode[] = []; const nodeStack: TrieNode[] = [];
const suffixStack: string[] = []; const suffixStack: Array<string | string[]> = [];
// Resolving initial string // Resolving initial string
const suffix = ''; const suffix = hostnameMode ? [] : '';
nodeStack.push(node); nodeStack.push(root);
suffixStack.push(suffix); suffixStack.push(suffix);
const results: string[] = []; const results: string[] = [];
let currentNode: TrieNode; let node: TrieNode;
let currentPrefix: string;
let hasValue = false;
do { do {
currentNode = nodeStack.pop()!; let hasValue = false;
currentPrefix = suffixStack.pop()!;
if (currentNode[SENTINEL]) { node = nodeStack.pop()!;
const suffix = suffixStack.pop()!;
if (node[SENTINEL]) {
hasValue = true; hasValue = true;
} }
node.forEach((childNode, k) => { node.forEach((childNode, k) => {
nodeStack.push(childNode); nodeStack.push(childNode);
suffixStack.push(k + suffix);
if (hostnameMode) {
suffixStack.push([k, ...suffix]);
} else {
suffixStack.push(k + (suffix as string));
}
}); });
if (hasValue) results.push(currentPrefix); if (hasValue) {
results.push(
hostnameMode ? (suffix as string[]).join('') : (suffix as string)
);
}
} while (nodeStack.length); } while (nodeStack.length);
return results; return results;
@ -338,6 +378,9 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
has, has,
dump, dump,
get size() { get size() {
if (smolTree) {
throw new Error('A Trie with smolTree enabled cannot have correct size!');
}
return size; return size;
}, },
get root() { get root() {