Chore: update source deduping tool

This commit is contained in:
SukkaW 2025-11-11 00:07:43 +08:00
parent 4d9c2a5154
commit bb6c7cb3fa
3 changed files with 93 additions and 8 deletions

View File

@ -365,4 +365,47 @@ describe('smol tree', () => {
trie.whitelist('cdn.example.com');
expect(trie.dump()).toStrictEqual(['blog.cdn.example.com']);
});
// Exact entries: each inserted hostname is reported as contained, while unrelated
// domains, a truncated TLD ('skk.mo') and a never-inserted sibling subdomain
// ('cdn.skk.moe') are not.
it('contains - normal', () => {
  const trie = createTrie([
    'skk.moe',
    'anotherskk.moe',
    'blog.anotherskk.moe',
    'blog.skk.moe'
  ], true);

  // [hostname, expected result of trie.contains(hostname)], checked in order
  const expectations: Array<[string, boolean]> = [
    ['skk.moe', true],
    ['blog.skk.moe', true],
    ['anotherskk.moe', true],
    ['blog.anotherskk.moe', true],
    ['example.com', false],
    ['blog.example.com', false],
    ['skk.mo', false],
    ['cdn.skk.moe', false]
  ];

  for (const [hostname, expected] of expectations) {
    expect(trie.contains(hostname)).toBe(expected);
  }
});
// A single exact subdomain entry matches only itself: neither its parent domain
// nor a deeper subdomain is reported as contained.
it('contains - subdomain', () => {
  const trie = createTrie([
    'index.rubygems.org'
  ], true);

  // [hostname, expected result of trie.contains(hostname)], checked in order
  const expectations: Array<[string, boolean]> = [
    ['rubygems.org', false],
    ['index.rubygems.org', true],
    ['sub.index.rubygems.org', false]
  ];

  for (const [hostname, expected] of expectations) {
    expect(trie.contains(hostname)).toBe(expected);
  }
});
// A dot-prefixed entry ('.skk.moe') covers the domain itself and subdomains at any
// depth, but not unrelated domains or a truncated TLD.
it('contains - include subdomains', () => {
  const trie = createTrie([
    '.skk.moe'
  ], true);

  // [hostname, expected result of trie.contains(hostname)], checked in order
  const expectations: Array<[string, boolean]> = [
    ['skk.moe', true],
    ['blog.skk.moe', true],
    ['image.cdn.skk.moe', true],
    ['example.com', false],
    ['blog.example.com', false],
    ['skk.mo', false]
  ];

  for (const [hostname, expected] of expectations) {
    expect(trie.contains(hostname)).toBe(expected);
  }
});
});

View File

@ -186,10 +186,44 @@ abstract class Triebase<Meta = unknown> {
/**
 * Reports whether `suffix` is covered by this trie.
 *
 * The hostname is split into tokens by `walkHostnameTokens`, and each token is
 * used to descend one level from `$root` through the node's child map (`node[2]`).
 *
 * @param suffix hostname to look up. A leading `.` sets the start index handed to
 * `walkHostnameTokens` to 1 (presumably to skip the dot -- confirm against that helper)
 * and makes `includeAllSubdomain` default to `true`.
 * @param includeAllSubdomain when every token is found, selects which flag of the
 * final node is returned: `INCLUDE_ALL_SUBDOMAIN` if true, `START` otherwise.
 * @returns `true` when the walk ends on a node carrying the required flag, or when
 * the walk stops early at a node flagged `INCLUDE_ALL_SUBDOMAIN`; `false` otherwise.
 */
public contains(suffix: string, includeAllSubdomain = suffix[0] === '.'): boolean {
const hostnameFromIndex = suffix[0] === '.' ? 1 : 0;
// NOTE(review): this text comes from a diff hunk; the next four lines look like the
// removed pre-change implementation. Taken literally they always return, making the
// token walk below unreachable -- confirm against the real file.
const res = this.walkIntoLeafWithSuffix(suffix, hostnameFromIndex);
if (!res) return false;
if (includeAllSubdomain) return getBit(res.node[0], INCLUDE_ALL_SUBDOMAIN);
return true;
// Cursor for the descent; starts at the root and is advanced by onToken.
let node: TrieNode = this.$root;
// let parent: TrieNode = node;
let child: Map<string, TrieNode<Meta>> = node[2];
// Set to true only when the descent stops at a node flagged INCLUDE_ALL_SUBDOMAIN.
let result = false;
// Per-token callback: returns false after a successful step and null on a miss.
// Presumably walkHostnameTokens stops and returns null when the callback returns
// null -- the check after this callback relies on that; verify against the helper.
const onToken = (token: string) => {
// if (token === '') {
// return true;
// }
// parent = node;
child = node[2];
if (child.has(token)) {
node = child.get(token)!;
} else {
// No child for this token: the lookup only succeeds if the deepest node reached
// so far is flagged as covering all of its subdomains.
// NOTE(review): only the node at the miss point is checked, not its ancestors.
// If an ancestor can carry INCLUDE_ALL_SUBDOMAIN while still having children,
// such a lookup would report false -- confirm the trie invariants.
if (getBit(node[0], INCLUDE_ALL_SUBDOMAIN)) {
result = true;
}
return null;
}
return false;
};
// A null result means the walk was cut short by onToken; `result` holds the answer.
if (walkHostnameTokens(suffix, onToken, hostnameFromIndex) === null) {
return result;
}
// Every token matched: the answer depends on the flags of the final node.
if (includeAllSubdomain) return getBit(node[0], INCLUDE_ALL_SUBDOMAIN);
// Presumably START marks a node that terminates an inserted hostname -- TODO confirm.
return getBit(node[0], START);
// if (res === null) return false;
// if (includeAllSubdomain) return getBit(res.node[0], INCLUDE_ALL_SUBDOMAIN);
// return true;
};
private static bfsResults: [node: TrieNode | null, suffix: string[]] = [null, []];

View File

@ -4,7 +4,7 @@ import fsp from 'node:fs/promises';
import { SOURCE_DIR } from './constants/dir';
import { readFileByLine } from './lib/fetch-text-by-line';
import { processLine } from './lib/process-line';
import { HostnameSmolTrie, HostnameTrie } from './lib/trie';
import { HostnameSmolTrie } from './lib/trie';
import { task } from './trace';
const ENFORCED_WHITELIST = [
@ -21,7 +21,8 @@ const ENFORCED_WHITELIST = [
'samsungqbe.com',
'ntp.api.bz',
'cdn.tuk.dev',
'vocadb-analytics.fly.dev'
'vocadb-analytics.fly.dev',
'img.vim-cn.com'
];
const WHITELIST: string[] = ['httpdns.bilivideo.com', 'ntp.api.bz', 'httpdns-v6.gslb.yy.com', 'img.vim-cn.com', 'img.jjbb.me', 'thingproxy.freeboard.io', 'assets.chess24.com', 'cdn.chess24.com', 'static-assets.freeanimehentai.net', 'static.javcdn.info', 'cdn.vidible.tv', 'it.apache.contactlab.it', 'mirror.netinch.com', 'de.freedif.org', 'league1.maoyuncloud.cn', 'spl.ztvx8.com', 'zls.xz6d.com', 'iadmatapk.nosdn.127.net', 'show.buzzcity.net', 'click.buzzcity.net', 'apps.buzzcity.net', 'content-cdn.y2mate.com', 'images.voguehk.com', 'cdn.amh.moe', 'statics.mnnews.tw'];
@ -51,10 +52,13 @@ task(require.main === module, __filename)(async (span) => {
async function dedupeFile(file: string, whitelist: HostnameSmolTrie) {
const result: string[] = [];
const trie = new HostnameTrie();
const trie = new HostnameSmolTrie();
let line: string | null = '';
// eslint-disable-next-line @typescript-eslint/unbound-method -- .call
let trieHasOrContains = HostnameSmolTrie.prototype.has;
for await (const l of readFileByLine(file)) {
line = processLine(l);
@ -62,12 +66,16 @@ async function dedupeFile(file: string, whitelist: HostnameSmolTrie) {
if (l.startsWith('# $ skip_dedupe_src')) {
return;
}
if (l.startsWith('# $ dedupe_use_trie_contains')) {
// eslint-disable-next-line @typescript-eslint/unbound-method -- .call
trieHasOrContains = HostnameSmolTrie.prototype.contains;
}
result.push(l); // keep all comments and blank lines
continue;
}
if (trie.has(line)) {
if (trieHasOrContains.call(trie, line)) {
continue; // drop duplicate
}