From bb6c7cb3fae78ae3104920e1274e42b16e2990c9 Mon Sep 17 00:00:00 2001 From: SukkaW Date: Tue, 11 Nov 2025 00:07:43 +0800 Subject: [PATCH] Chore: update source deduping tool --- Build/lib/trie.test.ts | 43 +++++++++++++++++++++++++++++++++++++++ Build/lib/trie.ts | 42 ++++++++++++++++++++++++++++++++++---- Build/tools-dedupe-src.ts | 16 +++++++++++---- 3 files changed, 93 insertions(+), 8 deletions(-) diff --git a/Build/lib/trie.test.ts b/Build/lib/trie.test.ts index 5fd64f24..ce4b991e 100644 --- a/Build/lib/trie.test.ts +++ b/Build/lib/trie.test.ts @@ -365,4 +365,47 @@ describe('smol tree', () => { trie.whitelist('cdn.example.com'); expect(trie.dump()).toStrictEqual(['blog.cdn.example.com']); }); + + it('contains - normal', () => { + const trie = createTrie([ + 'skk.moe', + 'anotherskk.moe', + 'blog.anotherskk.moe', + 'blog.skk.moe' + ], true); + + expect(trie.contains('skk.moe')).toBe(true); + expect(trie.contains('blog.skk.moe')).toBe(true); + expect(trie.contains('anotherskk.moe')).toBe(true); + expect(trie.contains('blog.anotherskk.moe')).toBe(true); + + expect(trie.contains('example.com')).toBe(false); + expect(trie.contains('blog.example.com')).toBe(false); + expect(trie.contains('skk.mo')).toBe(false); + expect(trie.contains('cdn.skk.moe')).toBe(false); + }); + + it('contains - subdomain', () => { + const trie = createTrie([ + 'index.rubygems.org' + ], true); + + expect(trie.contains('rubygems.org')).toBe(false); + expect(trie.contains('index.rubygems.org')).toBe(true); + expect(trie.contains('sub.index.rubygems.org')).toBe(false); + }); + + it('contains - include subdomains', () => { + const trie = createTrie([ + '.skk.moe' + ], true); + + expect(trie.contains('skk.moe')).toBe(true); + expect(trie.contains('blog.skk.moe')).toBe(true); + expect(trie.contains('image.cdn.skk.moe')).toBe(true); + + expect(trie.contains('example.com')).toBe(false); + expect(trie.contains('blog.example.com')).toBe(false); + expect(trie.contains('skk.mo')).toBe(false); + }); }); 
diff --git a/Build/lib/trie.ts b/Build/lib/trie.ts index 92d03552..6285a780 100644 --- a/Build/lib/trie.ts +++ b/Build/lib/trie.ts @@ -186,10 +186,44 @@ abstract class Triebase { public contains(suffix: string, includeAllSubdomain = suffix[0] === '.'): boolean { const hostnameFromIndex = suffix[0] === '.' ? 1 : 0; - const res = this.walkIntoLeafWithSuffix(suffix, hostnameFromIndex); - if (!res) return false; - if (includeAllSubdomain) return getBit(res.node[0], INCLUDE_ALL_SUBDOMAIN); - return true; + let node: TrieNode = this.$root; + // let parent: TrieNode = node; + + let child: Map<string, TrieNode<Meta>> = node[2]; + + let result = false; + + const onToken = (token: string) => { + // if (token === '') { + // return true; + // } + + // parent = node; + + child = node[2]; + + if (child.has(token)) { + node = child.get(token)!; + } else { + if (getBit(node[0], INCLUDE_ALL_SUBDOMAIN)) { + result = true; + } + return null; + } + + return false; + }; + + if (walkHostnameTokens(suffix, onToken, hostnameFromIndex) === null) { + return result; + } + + if (includeAllSubdomain) return getBit(node[0], INCLUDE_ALL_SUBDOMAIN); + return getBit(node[0], START); + + // if (res === null) return false; + // if (includeAllSubdomain) return getBit(res.node[0], INCLUDE_ALL_SUBDOMAIN); + // return true; }; private static bfsResults: [node: TrieNode | null, suffix: string[]] = [null, []]; diff --git a/Build/tools-dedupe-src.ts b/Build/tools-dedupe-src.ts index 10d951a5..ff60cbbd 100644 --- a/Build/tools-dedupe-src.ts +++ b/Build/tools-dedupe-src.ts @@ -4,7 +4,7 @@ import fsp from 'node:fs/promises'; import { SOURCE_DIR } from './constants/dir'; import { readFileByLine } from './lib/fetch-text-by-line'; import { processLine } from './lib/process-line'; -import { HostnameSmolTrie, HostnameTrie } from './lib/trie'; +import { HostnameSmolTrie } from './lib/trie'; import { task } from './trace'; const ENFORCED_WHITELIST = [ @@ -21,7 +21,8 @@ const ENFORCED_WHITELIST = [ 'samsungqbe.com', 'ntp.api.bz', 
'cdn.tuk.dev', - 'vocadb-analytics.fly.dev' + 'vocadb-analytics.fly.dev', + 'img.vim-cn.com' ]; const WHITELIST: string[] = ['httpdns.bilivideo.com', 'ntp.api.bz', 'httpdns-v6.gslb.yy.com', 'img.vim-cn.com', 'img.jjbb.me', 'thingproxy.freeboard.io', 'assets.chess24.com', 'cdn.chess24.com', 'static-assets.freeanimehentai.net', 'static.javcdn.info', 'cdn.vidible.tv', 'it.apache.contactlab.it', 'mirror.netinch.com', 'de.freedif.org', 'league1.maoyuncloud.cn', 'spl.ztvx8.com', 'zls.xz6d.com', 'iadmatapk.nosdn.127.net', 'show.buzzcity.net', 'click.buzzcity.net', 'apps.buzzcity.net', 'content-cdn.y2mate.com', 'images.voguehk.com', 'cdn.amh.moe', 'statics.mnnews.tw']; @@ -51,10 +52,13 @@ task(require.main === module, __filename)(async (span) => { async function dedupeFile(file: string, whitelist: HostnameSmolTrie) { const result: string[] = []; - const trie = new HostnameTrie(); + const trie = new HostnameSmolTrie(); let line: string | null = ''; + // eslint-disable-next-line @typescript-eslint/unbound-method -- .call + let trieHasOrContains = HostnameSmolTrie.prototype.has; + for await (const l of readFileByLine(file)) { line = processLine(l); @@ -62,12 +66,16 @@ async function dedupeFile(file: string, whitelist: HostnameSmolTrie) { if (l.startsWith('# $ skip_dedupe_src')) { return; } + if (l.startsWith('# $ dedupe_use_trie_contains')) { + // eslint-disable-next-line @typescript-eslint/unbound-method -- .call + trieHasOrContains = HostnameSmolTrie.prototype.contains; + } result.push(l); // keep all comments and blank lines continue; } - if (trie.has(line)) { + if (trieHasOrContains.call(trie, line)) { continue; // drop duplicate }