mirror of
https://github.com/SukkaW/Surge.git
synced 2025-12-13 01:30:37 +08:00
Perf: domain deduper using only trie + DFS
This commit is contained in:
parent
1b116637d2
commit
160e7bfab7
@ -3,22 +3,29 @@ import { createTrie } from './trie';
|
|||||||
export function domainDeduper(inputDomains: string[], toArray?: true): string[];
|
export function domainDeduper(inputDomains: string[], toArray?: true): string[];
|
||||||
export function domainDeduper(inputDomains: string[], toArray: false): Set<string>;
|
export function domainDeduper(inputDomains: string[], toArray: false): Set<string>;
|
||||||
export function domainDeduper(inputDomains: string[], toArray = true): string[] | Set<string> {
|
export function domainDeduper(inputDomains: string[], toArray = true): string[] | Set<string> {
|
||||||
const trie = createTrie(inputDomains, true);
|
const trie = createTrie(inputDomains, true, true);
|
||||||
const sets = new Set(inputDomains);
|
const dumped = trie.dump();
|
||||||
|
|
||||||
for (let i = 0, len1 = inputDomains.length; i < len1; i++) {
|
|
||||||
const d = inputDomains[i];
|
|
||||||
if (d[0] !== '.') {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
trie.substractSetInPlaceFromFound(d, sets);
|
|
||||||
sets.delete(d.slice(1));
|
|
||||||
}
|
|
||||||
|
|
||||||
if (toArray) {
|
if (toArray) {
|
||||||
return Array.from(sets);
|
return dumped;
|
||||||
}
|
}
|
||||||
|
return new Set(dumped);
|
||||||
|
|
||||||
return sets;
|
// const trie = createTrie(inputDomains, true);
|
||||||
|
// const sets = new Set(inputDomains);
|
||||||
|
|
||||||
|
// for (let i = 0, len1 = inputDomains.length; i < len1; i++) {
|
||||||
|
// const d = inputDomains[i];
|
||||||
|
// if (d[0] !== '.') {
|
||||||
|
// continue;
|
||||||
|
// }
|
||||||
|
|
||||||
|
// trie.substractSetInPlaceFromFound(d, sets);
|
||||||
|
// sets.delete(d.slice(1));
|
||||||
|
// }
|
||||||
|
|
||||||
|
// if (toArray) {
|
||||||
|
// return Array.from(sets);
|
||||||
|
// }
|
||||||
|
|
||||||
|
// return sets;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -112,7 +112,7 @@ describe.each([
|
|||||||
expect(trie.find('noc.one')).toStrictEqual(['noc.one']);
|
expect(trie.find('noc.one')).toStrictEqual(['noc.one']);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should remove subdomain', () => {
|
it('should match subdomain - 1', () => {
|
||||||
const trie = createTrie(['www.noc.one', 'www.sukkaw.com', 'blog.skk.moe', 'image.cdn.skk.moe', 'cdn.sukkaw.net'], hostnameMode);
|
const trie = createTrie(['www.noc.one', 'www.sukkaw.com', 'blog.skk.moe', 'image.cdn.skk.moe', 'cdn.sukkaw.net'], hostnameMode);
|
||||||
|
|
||||||
console.log(trie);
|
console.log(trie);
|
||||||
@ -121,8 +121,80 @@ describe.each([
|
|||||||
expect(trie.find('.sukkaw.com')).toStrictEqual(['www.sukkaw.com']);
|
expect(trie.find('.sukkaw.com')).toStrictEqual(['www.sukkaw.com']);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('should match subdomain - 2', () => {
|
||||||
|
const trie = createTrie(['www.noc.one', 'www.sukkaw.com', '.skk.moe', 'blog.skk.moe', 'image.cdn.skk.moe', 'cdn.sukkaw.net'], hostnameMode);
|
||||||
|
|
||||||
|
console.log(trie);
|
||||||
|
|
||||||
|
expect(trie.find('.skk.moe')).toStrictEqual(['.skk.moe', 'image.cdn.skk.moe', 'blog.skk.moe']);
|
||||||
|
expect(trie.find('.sukkaw.com')).toStrictEqual(['www.sukkaw.com']);
|
||||||
|
});
|
||||||
|
|
||||||
it('should not remove non-subdomain', () => {
|
it('should not remove non-subdomain', () => {
|
||||||
const trie = createTrie(['skk.moe', 'sukkaskk.moe'], hostnameMode);
|
const trie = createTrie(['skk.moe', 'sukkaskk.moe'], hostnameMode);
|
||||||
expect(trie.find('.skk.moe')).toStrictEqual([]);
|
expect(trie.find('.skk.moe')).toStrictEqual([]);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
describe('smol tree', () => {
|
||||||
|
it('should create simple tree - 1', () => {
|
||||||
|
const trie = createTrie([
|
||||||
|
'.skk.moe', 'blog.skk.moe', '.cdn.skk.moe', 'skk.moe',
|
||||||
|
'www.noc.one', 'cdn.noc.one',
|
||||||
|
'.blog.sub.example.com', 'sub.example.com', 'cdn.sub.example.com', '.sub.example.com'
|
||||||
|
], true, true);
|
||||||
|
|
||||||
|
console.log(trie);
|
||||||
|
|
||||||
|
expect(trie.dump()).toStrictEqual([
|
||||||
|
'.sub.example.com',
|
||||||
|
'cdn.noc.one', 'www.noc.one',
|
||||||
|
'.skk.moe'
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
|
it.only('should create simple tree - 2', () => {
|
||||||
|
const trie = createTrie([
|
||||||
|
'.skk.moe', 'blog.skk.moe', '.cdn.skk.moe', 'skk.moe'
|
||||||
|
], true, true);
|
||||||
|
|
||||||
|
console.log({ trie });
|
||||||
|
|
||||||
|
expect(trie.dump()).toStrictEqual([
|
||||||
|
'.skk.moe'
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should create simple tree - 2', () => {
|
||||||
|
const trie = createTrie([
|
||||||
|
'.blog.sub.example.com', 'cdn.sub.example.com', '.sub.example.com'
|
||||||
|
], true, true);
|
||||||
|
|
||||||
|
console.log(trie);
|
||||||
|
|
||||||
|
expect(trie.dump()).toStrictEqual([
|
||||||
|
'.sub.example.com'
|
||||||
|
]);
|
||||||
|
|
||||||
|
trie.add('.sub.example.com');
|
||||||
|
expect(trie.dump()).toStrictEqual([
|
||||||
|
'.sub.example.com'
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should create simple tree - 3', () => {
|
||||||
|
const trie = createTrie([
|
||||||
|
'commercial.shouji.360.cn',
|
||||||
|
'act.commercial.shouji.360.cn',
|
||||||
|
'cdn.creative.medialytics.com',
|
||||||
|
'px.cdn.creative.medialytics.com'
|
||||||
|
], true, true);
|
||||||
|
|
||||||
|
expect(trie.dump()).toStrictEqual([
|
||||||
|
'cdn.creative.medialytics.com',
|
||||||
|
'px.cdn.creative.medialytics.com',
|
||||||
|
'commercial.shouji.360.cn',
|
||||||
|
'act.commercial.shouji.360.cn'
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|||||||
@ -5,9 +5,11 @@
|
|||||||
// import { Trie } from 'mnemonist';
|
// import { Trie } from 'mnemonist';
|
||||||
|
|
||||||
export const SENTINEL = Symbol('SENTINEL');
|
export const SENTINEL = Symbol('SENTINEL');
|
||||||
|
const PARENT = Symbol('Parent Node');
|
||||||
|
|
||||||
type TrieNode = {
|
type TrieNode = {
|
||||||
[SENTINEL]: boolean,
|
[SENTINEL]: boolean,
|
||||||
|
[PARENT]: TrieNode | null,
|
||||||
[Bun.inspect.custom]: () => string
|
[Bun.inspect.custom]: () => string
|
||||||
} & Map<string, TrieNode>;
|
} & Map<string, TrieNode>;
|
||||||
|
|
||||||
@ -26,14 +28,15 @@ function trieNodeInspectCustom(this: TrieNode) {
|
|||||||
return JSON.stringify(deepTrieNodeToJSON(this), null, 2);
|
return JSON.stringify(deepTrieNodeToJSON(this), null, 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
const createNode = (): TrieNode => {
|
const createNode = (parent: TrieNode | null = null): TrieNode => {
|
||||||
const node = new Map<string, TrieNode>() as TrieNode;
|
const node = new Map<string, TrieNode>() as TrieNode;
|
||||||
node[SENTINEL] = false;
|
node[SENTINEL] = false;
|
||||||
|
node[PARENT] = parent;
|
||||||
node[Bun.inspect.custom] = trieNodeInspectCustom;
|
node[Bun.inspect.custom] = trieNodeInspectCustom;
|
||||||
return node;
|
return node;
|
||||||
};
|
};
|
||||||
|
|
||||||
export const createTrie = (from?: string[] | Set<string> | null, hostnameMode = false) => {
|
export const createTrie = (from?: string[] | Set<string> | null, hostnameMode = false, smolTree = false) => {
|
||||||
let size = 0;
|
let size = 0;
|
||||||
const root: TrieNode = createNode();
|
const root: TrieNode = createNode();
|
||||||
|
|
||||||
@ -75,11 +78,35 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
|
|||||||
|
|
||||||
if (node.has(token)) {
|
if (node.has(token)) {
|
||||||
node = node.get(token)!;
|
node = node.get(token)!;
|
||||||
|
|
||||||
|
if (smolTree) {
|
||||||
|
if (node.get('.')?.[SENTINEL] === true) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
// return;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
const newNode = createNode();
|
const newNode = createNode(node);
|
||||||
node.set(token, newNode);
|
node.set(token, newNode);
|
||||||
node = newNode;
|
node = newNode;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (smolTree) {
|
||||||
|
if (i === 1 && tokens[0] === '.') {
|
||||||
|
node[SENTINEL] = false;
|
||||||
|
// Trying to add `.sub.example.com` when there is already a `blog.sub.example.com` in the trie
|
||||||
|
const newNode = createNode(node);
|
||||||
|
node.set('.', newNode);
|
||||||
|
node = newNode;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (i === 0) {
|
||||||
|
// Trying to add `example.com` when there is already a `.example.com` in the trie
|
||||||
|
if (node.get('.')?.[SENTINEL] === true) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Do we need to increase size?
|
// Do we need to increase size?
|
||||||
@ -107,10 +134,15 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
|
|||||||
|
|
||||||
return true;
|
return true;
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Method used to retrieve every item in the trie with the given suffix.
|
* Method used to retrieve every item in the trie with the given suffix.
|
||||||
*/
|
*/
|
||||||
const find = (inputSuffix: string, /** @default true */ includeEqualWithSuffix = true): string[] => {
|
const find = (inputSuffix: string, /** @default true */ includeEqualWithSuffix = true): string[] => {
|
||||||
|
if (smolTree) {
|
||||||
|
throw new Error('A Trie with smolTree enabled cannot perform find!');
|
||||||
|
}
|
||||||
|
|
||||||
let node: TrieNode | undefined = root;
|
let node: TrieNode | undefined = root;
|
||||||
let token: string;
|
let token: string;
|
||||||
|
|
||||||
@ -153,10 +185,7 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
|
|||||||
nodeStack.push(childNode);
|
nodeStack.push(childNode);
|
||||||
|
|
||||||
if (hostnameMode) {
|
if (hostnameMode) {
|
||||||
const stack = (suffix as string[]).slice();
|
suffixStack.push([k, ...suffix]);
|
||||||
stack.unshift(k);
|
|
||||||
|
|
||||||
suffixStack.push(stack);
|
|
||||||
} else {
|
} else {
|
||||||
suffixStack.push(k + (suffix as string));
|
suffixStack.push(k + (suffix as string));
|
||||||
}
|
}
|
||||||
@ -170,6 +199,10 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
|
|||||||
* Works like trie.find, but instead of returning the matches as an array, it removes them from the given set in-place.
|
* Works like trie.find, but instead of returning the matches as an array, it removes them from the given set in-place.
|
||||||
*/
|
*/
|
||||||
const substractSetInPlaceFromFound = (inputSuffix: string, set: Set<string>) => {
|
const substractSetInPlaceFromFound = (inputSuffix: string, set: Set<string>) => {
|
||||||
|
if (smolTree) {
|
||||||
|
throw new Error('A Trie with smolTree enabled cannot perform substractSetInPlaceFromFound!');
|
||||||
|
}
|
||||||
|
|
||||||
let node: TrieNode | undefined = root;
|
let node: TrieNode | undefined = root;
|
||||||
let token: string;
|
let token: string;
|
||||||
|
|
||||||
@ -205,8 +238,7 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
|
|||||||
node.forEach((childNode, k) => {
|
node.forEach((childNode, k) => {
|
||||||
nodeStack.push(childNode);
|
nodeStack.push(childNode);
|
||||||
if (hostnameMode) {
|
if (hostnameMode) {
|
||||||
const stack = (suffix as string[]).slice();
|
const stack = [k, ...suffix];
|
||||||
stack.unshift(k);
|
|
||||||
suffixStack.push(stack);
|
suffixStack.push(stack);
|
||||||
} else {
|
} else {
|
||||||
suffixStack.push(k + (suffix as string));
|
suffixStack.push(k + (suffix as string));
|
||||||
@ -294,35 +326,43 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
|
|||||||
}
|
}
|
||||||
|
|
||||||
const dump = () => {
|
const dump = () => {
|
||||||
const node = root;
|
|
||||||
const nodeStack: TrieNode[] = [];
|
const nodeStack: TrieNode[] = [];
|
||||||
const suffixStack: string[] = [];
|
const suffixStack: Array<string | string[]> = [];
|
||||||
// Resolving initial string
|
// Resolving initial string
|
||||||
const suffix = '';
|
const suffix = hostnameMode ? [] : '';
|
||||||
|
|
||||||
nodeStack.push(node);
|
nodeStack.push(root);
|
||||||
suffixStack.push(suffix);
|
suffixStack.push(suffix);
|
||||||
|
|
||||||
const results: string[] = [];
|
const results: string[] = [];
|
||||||
|
|
||||||
let currentNode: TrieNode;
|
let node: TrieNode;
|
||||||
let currentPrefix: string;
|
|
||||||
let hasValue = false;
|
|
||||||
|
|
||||||
do {
|
do {
|
||||||
currentNode = nodeStack.pop()!;
|
let hasValue = false;
|
||||||
currentPrefix = suffixStack.pop()!;
|
|
||||||
|
|
||||||
if (currentNode[SENTINEL]) {
|
node = nodeStack.pop()!;
|
||||||
|
const suffix = suffixStack.pop()!;
|
||||||
|
|
||||||
|
if (node[SENTINEL]) {
|
||||||
hasValue = true;
|
hasValue = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
node.forEach((childNode, k) => {
|
node.forEach((childNode, k) => {
|
||||||
nodeStack.push(childNode);
|
nodeStack.push(childNode);
|
||||||
suffixStack.push(k + suffix);
|
|
||||||
|
if (hostnameMode) {
|
||||||
|
suffixStack.push([k, ...suffix]);
|
||||||
|
} else {
|
||||||
|
suffixStack.push(k + (suffix as string));
|
||||||
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
if (hasValue) results.push(currentPrefix);
|
if (hasValue) {
|
||||||
|
results.push(
|
||||||
|
hostnameMode ? (suffix as string[]).join('') : (suffix as string)
|
||||||
|
);
|
||||||
|
}
|
||||||
} while (nodeStack.length);
|
} while (nodeStack.length);
|
||||||
|
|
||||||
return results;
|
return results;
|
||||||
@ -338,6 +378,9 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
|
|||||||
has,
|
has,
|
||||||
dump,
|
dump,
|
||||||
get size() {
|
get size() {
|
||||||
|
if (smolTree) {
|
||||||
|
throw new Error('A Trie with smolTree enabled cannot have correct size!');
|
||||||
|
}
|
||||||
return size;
|
return size;
|
||||||
},
|
},
|
||||||
get root() {
|
get root() {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user