diff --git a/Build/build-microsoft-cdn.ts b/Build/build-microsoft-cdn.ts index 5d8adce0..7aaf3d0b 100644 --- a/Build/build-microsoft-cdn.ts +++ b/Build/build-microsoft-cdn.ts @@ -33,7 +33,7 @@ export const getMicrosoftCdnRulesetPromise = createMemoizedPromise(async () => { trie.add(domain); } } - return new Set(PROBE_DOMAINS.flatMap(domain => trie.find(domain, false))); + return new Set(PROBE_DOMAINS.flatMap(domain => trie.find(domain))); }); // Second trie is to remove blacklisted domains diff --git a/Build/build-reject-domainset.ts b/Build/build-reject-domainset.ts index 4a747091..2e29f3c9 100644 --- a/Build/build-reject-domainset.ts +++ b/Build/build-reject-domainset.ts @@ -79,48 +79,58 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => { console.log(`Import ${previousSize} rules from Hosts / AdBlock Filter Rules & reject_sukka.conf!`); // Dedupe domainSets - await span.traceChild('dedupe from black keywords/suffixes').traceAsyncFn(async () => { - /** Collect DOMAIN-SUFFIX from non_ip/reject.conf for deduplication */ + await span.traceChild('dedupe from black keywords/suffixes').traceAsyncFn(async (childSpan) => { + /** Collect DOMAIN-SUFFIX from non_ip/reject.conf for deduplication */ const domainSuffixSet = new Set(); /** Collect DOMAIN-KEYWORD from non_ip/reject.conf for deduplication */ const domainKeywordsSet = new Set(); - for await (const line of readFileByLine(path.resolve(import.meta.dir, '../Source/non_ip/reject.conf'))) { - const [type, keyword] = line.split(','); + await childSpan.traceChild('collect keywords/suffixes').traceAsyncFn(async () => { + for await (const line of readFileByLine(path.resolve(import.meta.dir, '../Source/non_ip/reject.conf'))) { + const [type, value] = line.split(','); - if (type === 'DOMAIN-KEYWORD') { - domainKeywordsSet.add(keyword.trim()); - } else if (type === 'DOMAIN-SUFFIX') { - domainSuffixSet.add(keyword.trim()); - } - } - - const trie1 = createTrie(domainSets); - - 
domainSuffixSet.forEach(suffix => { - trie1.find(suffix, true).forEach(f => domainSets.delete(f)); - }); - filterRuleWhitelistDomainSets.forEach(suffix => { - trie1.find(suffix, true).forEach(f => domainSets.delete(f)); - - if (suffix[0] === '.') { - // handle case like removing `g.msn.com` due to white `.g.msn.com` (`@@||g.msn.com`) - domainSets.delete(suffix.slice(1)); - } else { - // If `g.msn.com` is whitelisted, then `.g.msn.com` should be removed from domain set - domainSets.delete(`.${suffix}`); + if (type === 'DOMAIN-KEYWORD') { + domainKeywordsSet.add(value.trim()); + } else if (type === 'DOMAIN-SUFFIX') { + domainSuffixSet.add(value.trim()); + } } }); - // remove pre-defined enforced blacklist from whitelist - const kwfilter = createKeywordFilter(domainKeywordsSet); + // Remove as many domains as possible from domainSets before creating trie + SetHelpers.subtract(domainSets, domainSuffixSet); + SetHelpers.subtract(domainSets, filterRuleWhitelistDomainSets); - for (const domain of domainSets) { + childSpan.traceChild('dedupe from white/suffixes').traceSyncFn(() => { + const trie = createTrie(domainSets); + + domainSuffixSet.forEach(suffix => { + trie.remove(suffix); + trie.substractSetInPlaceFromFound(suffix, domainSets); + }); + filterRuleWhitelistDomainSets.forEach(suffix => { + trie.substractSetInPlaceFromFound(suffix, domainSets); + + if (suffix[0] === '.') { + // handle case like removing `g.msn.com` due to white `.g.msn.com` (`@@||g.msn.com`) + domainSets.delete(suffix.slice(1)); + } else { + // If `g.msn.com` is whitelisted, then `.g.msn.com` should be removed from domain set + domainSets.delete(`.${suffix}`); + } + }); + }); + + childSpan.traceChild('dedupe from black keywords').traceSyncFn(() => { + const kwfilter = createKeywordFilter(domainKeywordsSet); + + for (const domain of domainSets) { // Remove keyword - if (kwfilter(domain)) { - domainSets.delete(domain); + if (kwfilter(domain)) { + domainSets.delete(domain); + } } - } + }); 
console.log(`Deduped ${previousSize} - ${domainSets.size} = ${previousSize - domainSets.size} from black keywords and suffixes!`); }); diff --git a/Build/lib/cache-filesystem.ts b/Build/lib/cache-filesystem.ts index 5f8fe0af..3235b46d 100644 --- a/Build/lib/cache-filesystem.ts +++ b/Build/lib/cache-filesystem.ts @@ -16,8 +16,12 @@ const enum CacheStatus { } export interface CacheOptions { + /** Path to sqlite file dir */ cachePath?: string, - tbd?: number + /** Time before deletion */ + tbd?: number, + /** Cache table name */ + tableName?: string } interface CacheApplyNonStringOption { @@ -60,13 +64,18 @@ export const TTL = { export class Cache { db: Database; - tbd = 60 * 1000; // time before deletion + /** Time before deletion */ + tbd = 60 * 1000; + /** SQLite file path */ cachePath: string; + /** Table name */ + tableName: string; - constructor({ cachePath = path.join(os.tmpdir() || '/tmp', 'hdc'), tbd }: CacheOptions = {}) { + constructor({ cachePath = path.join(os.tmpdir() || '/tmp', 'hdc'), tbd, tableName = 'cache' }: CacheOptions = {}) { this.cachePath = cachePath; mkdirSync(this.cachePath, { recursive: true }); if (tbd != null) this.tbd = tbd; + this.tableName = tableName; const db = new Database(path.join(this.cachePath, 'cache.db')); @@ -75,8 +84,8 @@ export class Cache { db.exec('PRAGMA temp_store = memory;'); db.exec('PRAGMA optimize;'); - db.prepare('CREATE TABLE IF NOT EXISTS cache (key TEXT PRIMARY KEY, value TEXT, ttl REAL NOT NULL);').run(); - db.prepare('CREATE INDEX IF NOT EXISTS cache_ttl ON cache (ttl);').run(); + db.prepare(`CREATE TABLE IF NOT EXISTS ${this.tableName} (key TEXT PRIMARY KEY, value TEXT, ttl REAL NOT NULL);`).run(); + db.prepare(`CREATE INDEX IF NOT EXISTS cache_ttl ON ${this.tableName} (ttl);`).run(); const date = new Date(); @@ -84,7 +93,7 @@ export class Cache { // ttl + tbd < now => ttl < now - tbd const now = date.getTime() - this.tbd; - db.prepare('DELETE FROM cache WHERE ttl < ?').run(now); + db.prepare(`DELETE FROM 
${this.tableName} WHERE ttl < ?`).run(now); this.db = db; @@ -100,7 +109,7 @@ export class Cache { set(key: string, value: string, ttl = 60 * 1000): void { const insert = this.db.prepare( - 'INSERT INTO cache (key, value, ttl) VALUES ($key, $value, $valid) ON CONFLICT(key) DO UPDATE SET value = $value, ttl = $valid' + `INSERT INTO ${this.tableName} (key, value, ttl) VALUES ($key, $value, $valid) ON CONFLICT(key) DO UPDATE SET value = $value, ttl = $valid` ); insert.run({ @@ -112,7 +121,7 @@ export class Cache { get(key: string, defaultValue?: string): string | undefined { const rv = this.db.prepare<{ value: string }, string>( - 'SELECT value FROM cache WHERE key = ?' + `SELECT value FROM ${this.tableName} WHERE key = ?` ).get(key); if (!rv) return defaultValue; @@ -121,13 +130,13 @@ export class Cache { has(key: string): CacheStatus { const now = Date.now(); - const rv = this.db.prepare<{ ttl: number }, string>('SELECT ttl FROM cache WHERE key = ?').get(key); + const rv = this.db.prepare<{ ttl: number }, string>(`SELECT ttl FROM ${this.tableName} WHERE key = ?`).get(key); return !rv ? CacheStatus.Miss : (rv.ttl > now ? 
CacheStatus.Hit : CacheStatus.Stale); } del(key: string): void { - this.db.prepare('DELETE FROM cache WHERE key = ?').run(key); + this.db.prepare(`DELETE FROM ${this.tableName} WHERE key = ?`).run(key); } async apply( @@ -167,9 +176,9 @@ export class Cache { } } -export const fsCache = traceSync('initializing filesystem cache', () => new Cache({ cachePath: path.resolve(import.meta.dir, '../../.cache') })); +export const fsFetchCache = traceSync('initializing filesystem cache for fetch', () => new Cache({ cachePath: path.resolve(import.meta.dir, '../../.cache') })); // process.on('exit', () => { -// fsCache.destroy(); +// fsFetchCache.destroy(); // }); const separator = '\u0000'; diff --git a/Build/lib/create-file.ts b/Build/lib/create-file.ts index c9b58186..8f566114 100644 --- a/Build/lib/create-file.ts +++ b/Build/lib/create-file.ts @@ -35,11 +35,11 @@ export async function compareAndWriteFile(span: Span, linesA: string[], filePath } if ( lineA[0] === '/' - && lineA[1] === '/' - && lineA[3] === '#' - && lineB[0] === '/' - && lineB[1] === '/' - && lineB[3] === '#' + && lineA[1] === '/' + && lineB[0] === '/' + && lineB[1] === '/' + && lineA[3] === '#' + && lineB[3] === '#' ) { continue; } diff --git a/Build/lib/domain-deduper.ts b/Build/lib/domain-deduper.ts index 9c9da2f3..9865ea89 100644 --- a/Build/lib/domain-deduper.ts +++ b/Build/lib/domain-deduper.ts @@ -12,12 +12,7 @@ export function domainDeduper(inputDomains: string[], toArray = true): string[] continue; } - const found = trie.find(d, false); - - for (let j = 0, len2 = found.length; j < len2; j++) { - sets.delete(found[j]); - } - + trie.substractSetInPlaceFromFound(d, sets); sets.delete(d.slice(1)); } diff --git a/Build/lib/get-phishing-domains.ts b/Build/lib/get-phishing-domains.ts index 91359719..d614a013 100644 --- a/Build/lib/get-phishing-domains.ts +++ b/Build/lib/get-phishing-domains.ts @@ -104,11 +104,10 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g return 
parentSpan.traceChild('delete whitelisted from domainset').traceSyncFn(() => { for (let i = 0, len = WHITELIST_DOMAIN.length; i < len; i++) { const white = WHITELIST_DOMAIN[i]; - const found = trieForRemovingWhiteListed.find(`.${white}`, true); - for (let j = 0, len2 = found.length; j < len2; j++) { - domainSet.delete(found[j]); - } domainSet.delete(white); + domainSet.delete(`.${white}`); + + trieForRemovingWhiteListed.substractSetInPlaceFromFound(`.${white}`, domainSet); } }); }); diff --git a/Build/lib/parse-filter.ts b/Build/lib/parse-filter.ts index 7c3d24df..1b2df9c9 100644 --- a/Build/lib/parse-filter.ts +++ b/Build/lib/parse-filter.ts @@ -8,14 +8,14 @@ import type { PublicSuffixList } from '@gorhill/publicsuffixlist'; import picocolors from 'picocolors'; import { normalizeDomain } from './normalize-domain'; import { fetchAssets } from './fetch-assets'; -import { deserializeSet, fsCache, serializeSet } from './cache-filesystem'; +import { deserializeSet, fsFetchCache, serializeSet } from './cache-filesystem'; import type { Span } from '../trace'; const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null let foundDebugDomain = false; export function processDomainLists(span: Span, domainListsUrl: string, includeAllSubDomain = false, ttl: number | null = null) { - return span.traceChild(`process domainlist: ${domainListsUrl}`).traceAsyncFn(() => fsCache.apply( + return span.traceChild(`process domainlist: ${domainListsUrl}`).traceAsyncFn(() => fsFetchCache.apply( domainListsUrl, async () => { const domainSets = new Set(); @@ -45,7 +45,7 @@ export function processDomainLists(span: Span, domainListsUrl: string, includeAl )); } export function processHosts(span: Span, hostsUrl: string, mirrors: string[] | null, includeAllSubDomain = false, ttl: number | null = null) { - return span.traceChild(`processhosts: ${hostsUrl}`).traceAsyncFn((childSpan) => fsCache.apply( + return span.traceChild(`processhosts: ${hostsUrl}`).traceAsyncFn((childSpan) => 
fsFetchCache.apply(
    hostsUrl,
    async () => {
      const domainSets = new Set();
@@ -119,7 +119,7 @@ export async function processFilterRules(
   fallbackUrls?: readonly string[] | undefined | null,
   ttl: number | null = null
 ): Promise<{ white: string[], black: string[], foundDebugDomain: boolean }> {
-  const [white, black, warningMessages] = await parentSpan.traceChild(`process filter rules: ${filterRulesUrl}`).traceAsyncFn((span) => fsCache.apply(
+  const [white, black, warningMessages] = await parentSpan.traceChild(`process filter rules: ${filterRulesUrl}`).traceAsyncFn((span) => fsFetchCache.apply(
diff --git a/Build/bench/random-int.bench.ts b/Build/bench/random-int.bench.ts
deleted file mode 100644
@@ NOTE(review): the index line, hunk header and first deleted lines of this benchmark (mitata imports, nativeRandomInt) were lost in a corrupted splice here — restore them from the original commit @@
-const randomInt = (min: number, max: number) => Math.floor(Math.random() * (max - min + 1)) + min;
-
-group('random-int', () => {
-  bench('crypto.randomInt', () => {
-    nativeRandomInt(3, 7);
-  });
-
-  bench('Math.random', () => {
-    randomInt(3, 7);
-  });
-});
-
-run();
diff --git a/Build/lib/trie.test.ts b/Build/lib/trie.test.ts
index d5b22a30..ecbe6301 100644
--- a/Build/lib/trie.test.ts
+++ b/Build/lib/trie.test.ts
@@ -11,6 +11,7 @@ describe('Trie', () => {
     trie.add('akku');
     expect(trie.size).toBe(3);
+    expect(trie.has('sukka')).toBeTrue();
     expect(trie.has('ukka')).toBeTrue();
     expect(trie.has('akku')).toBeTrue();
@@ -86,59 +87,6 @@ describe('Trie', () => {
     expect(trie.find('')).toEqual(['greek', 'roman', 'esqueroman', 'sesqueroman']);
   });
 
-  // it('should work with custom tokens.', () => {
-  //   const trie = new Trie(Array);
-
-  //   trie.add(['the', 'cat', 'eats', 'the', 'mouse']);
-  //   trie.add(['the', 'mouse', 'eats', 'cheese']);
-  //   trie.add(['hello', 'world']);
-
-  //   assert.strictEqual(trie.size, 3);
-
-  //   assert.strictEqual(trie.has(['the', 'mouse', 'eats', 'cheese']), true);
-  //   assert.strictEqual(trie.has(['the', 'mouse', 'eats']), false);
-
-  //   assert.strictEqual(trie.delete(['hello']), false);
-  //   assert.strictEqual(trie.delete(['hello', 'world']), true);
-
-  //   assert.strictEqual(trie.size, 2);
-  // });
-
-  // it('should be possible to iterate over the trie\'s prefixes.', () => {
-  //   const trie = new Trie();
-
-  //   trie.add('rat');
-  //   trie.add('rate');
-
-  //   let prefixes = take(trie.prefixes());
-
-  //   assert.deepStrictEqual(prefixes,
['rat', 'rate']); - - // trie.add('rater'); - // trie.add('rates'); - - // prefixes = take(trie.keys('rate')); - - // assert.deepStrictEqual(prefixes, ['rate', 'rates', 'rater']); - // }); - - // it('should be possible to iterate over the trie\'s prefixes using for...of.', () => { - // const trie = new Trie(); - - // trie.add('rat'); - // trie.add('rate'); - - // const tests = [ - // 'rat', - // 'rate' - // ]; - - // let i = 0; - - // for (const prefix of trie) - // assert.deepStrictEqual(prefix, tests[i++]); - // }); - it('should be possible to create a trie from an arbitrary iterable.', () => { const words = ['roman', 'esqueroman']; @@ -159,9 +107,10 @@ describe('surge domainset dedupe', () => { it('should remove subdomain', () => { const trie = createTrie(['www.noc.one', 'www.sukkaw.com', 'blog.skk.moe', 'image.cdn.skk.moe', 'cdn.sukkaw.net']); - // trie.find('noc.one').toBe(['www.noc.one']); + + console.log(trie); + expect(trie.find('.skk.moe')).toStrictEqual(['image.cdn.skk.moe', 'blog.skk.moe']); - // trie.find('sukkaw.net').toBe(['cdn.sukkaw.net']); expect(trie.find('.sukkaw.com')).toStrictEqual(['www.sukkaw.com']); }); diff --git a/Build/lib/trie.ts b/Build/lib/trie.ts index 2d50e52c..1f96800f 100644 --- a/Build/lib/trie.ts +++ b/Build/lib/trie.ts @@ -2,20 +2,34 @@ * Suffix Trie based on Mnemonist Trie */ +// import { Trie } from 'mnemonist'; + export const SENTINEL = Symbol('SENTINEL'); type TrieNode = { - [SENTINEL]: boolean + [SENTINEL]: boolean, + [Bun.inspect.custom]: () => string } & Map; +const deepTrieNodeToJSON = (node: TrieNode) => { + const obj: Record = {}; + if (node[SENTINEL]) { + obj['[start]'] = node[SENTINEL]; + } + node.forEach((value, key) => { + obj[key] = deepTrieNodeToJSON(value); + }); + return obj; +}; + const createNode = (): TrieNode => { - const map = new Map(); - const node = map as TrieNode; + const node = new Map() as TrieNode; node[SENTINEL] = false; + node[Bun.inspect.custom] = () => JSON.stringify(deepTrieNodeToJSON(node), 
null, 2); return node; }; -export const createTrie = (from?: string[] | Set) => { +export const createTrie = (from?: string[] | Set | null) => { let size = 0; const root: TrieNode = createNode(); @@ -25,6 +39,7 @@ export const createTrie = (from?: string[] | Set) => { const add = (suffix: string): void => { let node: TrieNode = root; let token: string; + for (let i = suffix.length - 1; i >= 0; i--) { token = suffix[i]; @@ -40,8 +55,8 @@ export const createTrie = (from?: string[] | Set) => { // Do we need to increase size? if (!node[SENTINEL]) { size++; - node[SENTINEL] = true; } + node[SENTINEL] = true; }; /** @@ -84,8 +99,8 @@ export const createTrie = (from?: string[] | Set) => { const nodeStack: TrieNode[] = [node]; const suffixStack: string[] = [inputSuffix]; - while (nodeStack.length) { - const suffix = suffixStack.pop()!; + do { + const suffix: string = suffixStack.pop()!; node = nodeStack.pop()!; if (node[SENTINEL]) { @@ -98,11 +113,50 @@ export const createTrie = (from?: string[] | Set) => { nodeStack.push(childNode); suffixStack.push(k + suffix); }); - } + } while (nodeStack.length); return matches; }; + /** + * Works like trie.find, but instead of returning the matches as an array, it removes them from the given set in-place. 
+   */
+  const substractSetInPlaceFromFound = (inputSuffix: string, set: Set) => {
+    let node: TrieNode | undefined = root;
+    let token: string;
+
+    // Find the leaf-est node, and early return if not any
+    for (let i = inputSuffix.length - 1; i >= 0; i--) {
+      token = inputSuffix[i];
+
+      node = node.get(token);
+      if (!node) {
+        return;
+      }
+    }
+
+    // Performing DFS from prefix
+    const nodeStack: TrieNode[] = [node];
+    const suffixStack: string[] = [inputSuffix];
+
+    do {
+      const suffix = suffixStack.pop()!;
+      node = nodeStack.pop()!;
+
+      if (node[SENTINEL]) {
+        if (suffix !== inputSuffix) {
+          // found match, delete it from set
+          set.delete(suffix);
+        }
+      }
+
+      node.forEach((childNode, k) => {
+        nodeStack.push(childNode);
+        suffixStack.push(k + suffix);
+      });
+    } while (nodeStack.length);
+  };
+
   /**
    * Method used to delete a prefix from the trie.
    */
@@ -169,23 +223,65 @@
     return node[SENTINEL];
   };
 
-  if (from) {
+  if (Array.isArray(from)) {
+    for (let i = 0, l = from.length; i < l; i++) {
+      add(from[i]);
+    }
+  } else if (from) {
     from.forEach(add);
   }
 
+  const dump = () => {
+    const node = root;
+    const nodeStack: TrieNode[] = [];
+    const suffixStack: string[] = [];
+    // Resolving initial string
+    const suffix = '';
+
+    nodeStack.push(node);
+    suffixStack.push(suffix);
+
+    const results: string[] = [];
+
+    let currentNode: TrieNode;
+    let currentPrefix: string;
+    let hasValue = false;
+
+    do {
+      currentNode = nodeStack.pop()!;
+      currentPrefix = suffixStack.pop()!;
+
+      // FIX(review): reset per node (was latched true after the first match)
+      hasValue = currentNode[SENTINEL];
+
+      // FIX(review): was `node.forEach`/`k + suffix` (constant root/'') — never terminates
+      currentNode.forEach((childNode, k) => {
+        nodeStack.push(childNode);
+        suffixStack.push(k + currentPrefix);
+      });
+
+      if (hasValue) results.push(currentPrefix);
+    } while (nodeStack.length);
+
+    return results;
+  };
+
   return {
     add,
     contains,
     find,
+    substractSetInPlaceFromFound,
     remove,
     delete: remove,
     has,
+    dump,
     get size() {
       return size;
     },
     get root() {
       return root;
-    }
+    },
+    [Bun.inspect.custom]: () =>
JSON.stringify(deepTrieNodeToJSON(root), null, 2) }; };