mirror of
https://github.com/SukkaW/Surge.git
synced 2026-01-29 01:51:52 +08:00
Chore: refine reject domainset building
This commit is contained in:
@@ -33,7 +33,7 @@ export const getMicrosoftCdnRulesetPromise = createMemoizedPromise(async () => {
|
|||||||
trie.add(domain);
|
trie.add(domain);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return new Set(PROBE_DOMAINS.flatMap(domain => trie.find(domain, false)));
|
return new Set(PROBE_DOMAINS.flatMap(domain => trie.find(domain)));
|
||||||
});
|
});
|
||||||
|
|
||||||
// Second trie is to remove blacklisted domains
|
// Second trie is to remove blacklisted domains
|
||||||
|
|||||||
@@ -79,48 +79,58 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
|
|||||||
console.log(`Import ${previousSize} rules from Hosts / AdBlock Filter Rules & reject_sukka.conf!`);
|
console.log(`Import ${previousSize} rules from Hosts / AdBlock Filter Rules & reject_sukka.conf!`);
|
||||||
|
|
||||||
// Dedupe domainSets
|
// Dedupe domainSets
|
||||||
await span.traceChild('dedupe from black keywords/suffixes').traceAsyncFn(async () => {
|
await span.traceChild('dedupe from black keywords/suffixes').traceAsyncFn(async (childSpan) => {
|
||||||
/** Collect DOMAIN-SUFFIX from non_ip/reject.conf for deduplication */
|
/** Collect DOMAIN-SUFFIX from non_ip/reject.conf for deduplication */
|
||||||
const domainSuffixSet = new Set<string>();
|
const domainSuffixSet = new Set<string>();
|
||||||
/** Collect DOMAIN-KEYWORD from non_ip/reject.conf for deduplication */
|
/** Collect DOMAIN-KEYWORD from non_ip/reject.conf for deduplication */
|
||||||
const domainKeywordsSet = new Set<string>();
|
const domainKeywordsSet = new Set<string>();
|
||||||
|
|
||||||
for await (const line of readFileByLine(path.resolve(import.meta.dir, '../Source/non_ip/reject.conf'))) {
|
await childSpan.traceChild('collect keywords/suffixes').traceAsyncFn(async () => {
|
||||||
const [type, keyword] = line.split(',');
|
for await (const line of readFileByLine(path.resolve(import.meta.dir, '../Source/non_ip/reject.conf'))) {
|
||||||
|
const [type, value] = line.split(',');
|
||||||
|
|
||||||
if (type === 'DOMAIN-KEYWORD') {
|
if (type === 'DOMAIN-KEYWORD') {
|
||||||
domainKeywordsSet.add(keyword.trim());
|
domainKeywordsSet.add(value.trim());
|
||||||
} else if (type === 'DOMAIN-SUFFIX') {
|
} else if (type === 'DOMAIN-SUFFIX') {
|
||||||
domainSuffixSet.add(keyword.trim());
|
domainSuffixSet.add(value.trim());
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
const trie1 = createTrie(domainSets);
|
|
||||||
|
|
||||||
domainSuffixSet.forEach(suffix => {
|
|
||||||
trie1.find(suffix, true).forEach(f => domainSets.delete(f));
|
|
||||||
});
|
|
||||||
filterRuleWhitelistDomainSets.forEach(suffix => {
|
|
||||||
trie1.find(suffix, true).forEach(f => domainSets.delete(f));
|
|
||||||
|
|
||||||
if (suffix[0] === '.') {
|
|
||||||
// handle case like removing `g.msn.com` due to white `.g.msn.com` (`@@||g.msn.com`)
|
|
||||||
domainSets.delete(suffix.slice(1));
|
|
||||||
} else {
|
|
||||||
// If `g.msn.com` is whitelisted, then `.g.msn.com` should be removed from domain set
|
|
||||||
domainSets.delete(`.${suffix}`);
|
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// remove pre-defined enforced blacklist from whitelist
|
// Remove as many domains as possible from domainSets before creating trie
|
||||||
const kwfilter = createKeywordFilter(domainKeywordsSet);
|
SetHelpers.subtract(domainSets, domainSuffixSet);
|
||||||
|
SetHelpers.subtract(domainSets, filterRuleWhitelistDomainSets);
|
||||||
|
|
||||||
for (const domain of domainSets) {
|
childSpan.traceChild('dedupe from white/suffixes').traceSyncFn(() => {
|
||||||
|
const trie = createTrie(domainSets);
|
||||||
|
|
||||||
|
domainSuffixSet.forEach(suffix => {
|
||||||
|
trie.remove(suffix);
|
||||||
|
trie.substractSetInPlaceFromFound(suffix, domainSets);
|
||||||
|
});
|
||||||
|
filterRuleWhitelistDomainSets.forEach(suffix => {
|
||||||
|
trie.substractSetInPlaceFromFound(suffix, domainSets);
|
||||||
|
|
||||||
|
if (suffix[0] === '.') {
|
||||||
|
// handle case like removing `g.msn.com` due to white `.g.msn.com` (`@@||g.msn.com`)
|
||||||
|
domainSets.delete(suffix.slice(1));
|
||||||
|
} else {
|
||||||
|
// If `g.msn.com` is whitelisted, then `.g.msn.com` should be removed from domain set
|
||||||
|
domainSets.delete(`.${suffix}`);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
childSpan.traceChild('dedupe from black keywords').traceSyncFn(() => {
|
||||||
|
const kwfilter = createKeywordFilter(domainKeywordsSet);
|
||||||
|
|
||||||
|
for (const domain of domainSets) {
|
||||||
// Remove keyword
|
// Remove keyword
|
||||||
if (kwfilter(domain)) {
|
if (kwfilter(domain)) {
|
||||||
domainSets.delete(domain);
|
domainSets.delete(domain);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
|
|
||||||
console.log(`Deduped ${previousSize} - ${domainSets.size} = ${previousSize - domainSets.size} from black keywords and suffixes!`);
|
console.log(`Deduped ${previousSize} - ${domainSets.size} = ${previousSize - domainSets.size} from black keywords and suffixes!`);
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -16,8 +16,12 @@ const enum CacheStatus {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export interface CacheOptions {
|
export interface CacheOptions {
|
||||||
|
/** Path to sqlite file dir */
|
||||||
cachePath?: string,
|
cachePath?: string,
|
||||||
tbd?: number
|
/** Time before deletion */
|
||||||
|
tbd?: number,
|
||||||
|
/** Cache table name */
|
||||||
|
tableName?: string
|
||||||
}
|
}
|
||||||
|
|
||||||
interface CacheApplyNonStringOption<T> {
|
interface CacheApplyNonStringOption<T> {
|
||||||
@@ -60,13 +64,18 @@ export const TTL = {
|
|||||||
|
|
||||||
export class Cache {
|
export class Cache {
|
||||||
db: Database;
|
db: Database;
|
||||||
tbd = 60 * 1000; // time before deletion
|
/** Time before deletion */
|
||||||
|
tbd = 60 * 1000;
|
||||||
|
/** SQLite file path */
|
||||||
cachePath: string;
|
cachePath: string;
|
||||||
|
/** Table name */
|
||||||
|
tableName: string;
|
||||||
|
|
||||||
constructor({ cachePath = path.join(os.tmpdir() || '/tmp', 'hdc'), tbd }: CacheOptions = {}) {
|
constructor({ cachePath = path.join(os.tmpdir() || '/tmp', 'hdc'), tbd, tableName = 'cache' }: CacheOptions = {}) {
|
||||||
this.cachePath = cachePath;
|
this.cachePath = cachePath;
|
||||||
mkdirSync(this.cachePath, { recursive: true });
|
mkdirSync(this.cachePath, { recursive: true });
|
||||||
if (tbd != null) this.tbd = tbd;
|
if (tbd != null) this.tbd = tbd;
|
||||||
|
this.tableName = tableName;
|
||||||
|
|
||||||
const db = new Database(path.join(this.cachePath, 'cache.db'));
|
const db = new Database(path.join(this.cachePath, 'cache.db'));
|
||||||
|
|
||||||
@@ -75,8 +84,8 @@ export class Cache {
|
|||||||
db.exec('PRAGMA temp_store = memory;');
|
db.exec('PRAGMA temp_store = memory;');
|
||||||
db.exec('PRAGMA optimize;');
|
db.exec('PRAGMA optimize;');
|
||||||
|
|
||||||
db.prepare('CREATE TABLE IF NOT EXISTS cache (key TEXT PRIMARY KEY, value TEXT, ttl REAL NOT NULL);').run();
|
db.prepare(`CREATE TABLE IF NOT EXISTS ${this.tableName} (key TEXT PRIMARY KEY, value TEXT, ttl REAL NOT NULL);`).run();
|
||||||
db.prepare('CREATE INDEX IF NOT EXISTS cache_ttl ON cache (ttl);').run();
|
db.prepare(`CREATE INDEX IF NOT EXISTS cache_ttl ON ${this.tableName} (ttl);`).run();
|
||||||
|
|
||||||
const date = new Date();
|
const date = new Date();
|
||||||
|
|
||||||
@@ -84,7 +93,7 @@ export class Cache {
|
|||||||
|
|
||||||
// ttl + tbd < now => ttl < now - tbd
|
// ttl + tbd < now => ttl < now - tbd
|
||||||
const now = date.getTime() - this.tbd;
|
const now = date.getTime() - this.tbd;
|
||||||
db.prepare('DELETE FROM cache WHERE ttl < ?').run(now);
|
db.prepare(`DELETE FROM ${this.tableName} WHERE ttl < ?`).run(now);
|
||||||
|
|
||||||
this.db = db;
|
this.db = db;
|
||||||
|
|
||||||
@@ -100,7 +109,7 @@ export class Cache {
|
|||||||
|
|
||||||
set(key: string, value: string, ttl = 60 * 1000): void {
|
set(key: string, value: string, ttl = 60 * 1000): void {
|
||||||
const insert = this.db.prepare(
|
const insert = this.db.prepare(
|
||||||
'INSERT INTO cache (key, value, ttl) VALUES ($key, $value, $valid) ON CONFLICT(key) DO UPDATE SET value = $value, ttl = $valid'
|
`INSERT INTO ${this.tableName} (key, value, ttl) VALUES ($key, $value, $valid) ON CONFLICT(key) DO UPDATE SET value = $value, ttl = $valid`
|
||||||
);
|
);
|
||||||
|
|
||||||
insert.run({
|
insert.run({
|
||||||
@@ -112,7 +121,7 @@ export class Cache {
|
|||||||
|
|
||||||
get(key: string, defaultValue?: string): string | undefined {
|
get(key: string, defaultValue?: string): string | undefined {
|
||||||
const rv = this.db.prepare<{ value: string }, string>(
|
const rv = this.db.prepare<{ value: string }, string>(
|
||||||
'SELECT value FROM cache WHERE key = ?'
|
`SELECT value FROM ${this.tableName} WHERE key = ?`
|
||||||
).get(key);
|
).get(key);
|
||||||
|
|
||||||
if (!rv) return defaultValue;
|
if (!rv) return defaultValue;
|
||||||
@@ -121,13 +130,13 @@ export class Cache {
|
|||||||
|
|
||||||
has(key: string): CacheStatus {
|
has(key: string): CacheStatus {
|
||||||
const now = Date.now();
|
const now = Date.now();
|
||||||
const rv = this.db.prepare<{ ttl: number }, string>('SELECT ttl FROM cache WHERE key = ?').get(key);
|
const rv = this.db.prepare<{ ttl: number }, string>(`SELECT ttl FROM ${this.tableName} WHERE key = ?`).get(key);
|
||||||
|
|
||||||
return !rv ? CacheStatus.Miss : (rv.ttl > now ? CacheStatus.Hit : CacheStatus.Stale);
|
return !rv ? CacheStatus.Miss : (rv.ttl > now ? CacheStatus.Hit : CacheStatus.Stale);
|
||||||
}
|
}
|
||||||
|
|
||||||
del(key: string): void {
|
del(key: string): void {
|
||||||
this.db.prepare('DELETE FROM cache WHERE key = ?').run(key);
|
this.db.prepare(`DELETE FROM ${this.tableName} WHERE key = ?`).run(key);
|
||||||
}
|
}
|
||||||
|
|
||||||
async apply<T>(
|
async apply<T>(
|
||||||
@@ -167,9 +176,9 @@ export class Cache {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export const fsCache = traceSync('initializing filesystem cache', () => new Cache({ cachePath: path.resolve(import.meta.dir, '../../.cache') }));
|
export const fsFetchCache = traceSync('initializing filesystem cache for fetch', () => new Cache({ cachePath: path.resolve(import.meta.dir, '../../.cache') }));
|
||||||
// process.on('exit', () => {
|
// process.on('exit', () => {
|
||||||
// fsCache.destroy();
|
// fsFetchCache.destroy();
|
||||||
// });
|
// });
|
||||||
|
|
||||||
const separator = '\u0000';
|
const separator = '\u0000';
|
||||||
|
|||||||
@@ -35,11 +35,11 @@ export async function compareAndWriteFile(span: Span, linesA: string[], filePath
|
|||||||
}
|
}
|
||||||
if (
|
if (
|
||||||
lineA[0] === '/'
|
lineA[0] === '/'
|
||||||
&& lineA[1] === '/'
|
&& lineA[1] === '/'
|
||||||
&& lineA[3] === '#'
|
&& lineB[0] === '/'
|
||||||
&& lineB[0] === '/'
|
&& lineB[1] === '/'
|
||||||
&& lineB[1] === '/'
|
&& lineA[3] === '#'
|
||||||
&& lineB[3] === '#'
|
&& lineB[3] === '#'
|
||||||
) {
|
) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -12,12 +12,7 @@ export function domainDeduper(inputDomains: string[], toArray = true): string[]
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const found = trie.find(d, false);
|
trie.substractSetInPlaceFromFound(d, sets);
|
||||||
|
|
||||||
for (let j = 0, len2 = found.length; j < len2; j++) {
|
|
||||||
sets.delete(found[j]);
|
|
||||||
}
|
|
||||||
|
|
||||||
sets.delete(d.slice(1));
|
sets.delete(d.slice(1));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -104,11 +104,10 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
|
|||||||
return parentSpan.traceChild('delete whitelisted from domainset').traceSyncFn(() => {
|
return parentSpan.traceChild('delete whitelisted from domainset').traceSyncFn(() => {
|
||||||
for (let i = 0, len = WHITELIST_DOMAIN.length; i < len; i++) {
|
for (let i = 0, len = WHITELIST_DOMAIN.length; i < len; i++) {
|
||||||
const white = WHITELIST_DOMAIN[i];
|
const white = WHITELIST_DOMAIN[i];
|
||||||
const found = trieForRemovingWhiteListed.find(`.${white}`, true);
|
|
||||||
for (let j = 0, len2 = found.length; j < len2; j++) {
|
|
||||||
domainSet.delete(found[j]);
|
|
||||||
}
|
|
||||||
domainSet.delete(white);
|
domainSet.delete(white);
|
||||||
|
domainSet.delete(`.${white}`);
|
||||||
|
|
||||||
|
trieForRemovingWhiteListed.substractSetInPlaceFromFound(`.${white}`, domainSet);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -8,14 +8,14 @@ import type { PublicSuffixList } from '@gorhill/publicsuffixlist';
|
|||||||
import picocolors from 'picocolors';
|
import picocolors from 'picocolors';
|
||||||
import { normalizeDomain } from './normalize-domain';
|
import { normalizeDomain } from './normalize-domain';
|
||||||
import { fetchAssets } from './fetch-assets';
|
import { fetchAssets } from './fetch-assets';
|
||||||
import { deserializeSet, fsCache, serializeSet } from './cache-filesystem';
|
import { deserializeSet, fsFetchCache, serializeSet } from './cache-filesystem';
|
||||||
import type { Span } from '../trace';
|
import type { Span } from '../trace';
|
||||||
|
|
||||||
const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null
|
const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null
|
||||||
let foundDebugDomain = false;
|
let foundDebugDomain = false;
|
||||||
|
|
||||||
export function processDomainLists(span: Span, domainListsUrl: string, includeAllSubDomain = false, ttl: number | null = null) {
|
export function processDomainLists(span: Span, domainListsUrl: string, includeAllSubDomain = false, ttl: number | null = null) {
|
||||||
return span.traceChild(`process domainlist: ${domainListsUrl}`).traceAsyncFn(() => fsCache.apply(
|
return span.traceChild(`process domainlist: ${domainListsUrl}`).traceAsyncFn(() => fsFetchCache.apply(
|
||||||
domainListsUrl,
|
domainListsUrl,
|
||||||
async () => {
|
async () => {
|
||||||
const domainSets = new Set<string>();
|
const domainSets = new Set<string>();
|
||||||
@@ -45,7 +45,7 @@ export function processDomainLists(span: Span, domainListsUrl: string, includeAl
|
|||||||
));
|
));
|
||||||
}
|
}
|
||||||
export function processHosts(span: Span, hostsUrl: string, mirrors: string[] | null, includeAllSubDomain = false, ttl: number | null = null) {
|
export function processHosts(span: Span, hostsUrl: string, mirrors: string[] | null, includeAllSubDomain = false, ttl: number | null = null) {
|
||||||
return span.traceChild(`processhosts: ${hostsUrl}`).traceAsyncFn((childSpan) => fsCache.apply(
|
return span.traceChild(`processhosts: ${hostsUrl}`).traceAsyncFn((childSpan) => fsFetchCache.apply(
|
||||||
hostsUrl,
|
hostsUrl,
|
||||||
async () => {
|
async () => {
|
||||||
const domainSets = new Set<string>();
|
const domainSets = new Set<string>();
|
||||||
@@ -119,7 +119,7 @@ export async function processFilterRules(
|
|||||||
fallbackUrls?: readonly string[] | undefined | null,
|
fallbackUrls?: readonly string[] | undefined | null,
|
||||||
ttl: number | null = null
|
ttl: number | null = null
|
||||||
): Promise<{ white: string[], black: string[], foundDebugDomain: boolean }> {
|
): Promise<{ white: string[], black: string[], foundDebugDomain: boolean }> {
|
||||||
const [white, black, warningMessages] = await parentSpan.traceChild(`process filter rules: ${filterRulesUrl}`).traceAsyncFn((span) => fsCache.apply<Readonly<[
|
const [white, black, warningMessages] = await parentSpan.traceChild(`process filter rules: ${filterRulesUrl}`).traceAsyncFn((span) => fsFetchCache.apply<Readonly<[
|
||||||
white: string[],
|
white: string[],
|
||||||
black: string[],
|
black: string[],
|
||||||
warningMessages: string[]
|
warningMessages: string[]
|
||||||
@@ -187,7 +187,6 @@ export async function processFilterRules(
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// TODO-SUKKA: add cache here
|
|
||||||
if (!fallbackUrls || fallbackUrls.length === 0) {
|
if (!fallbackUrls || fallbackUrls.length === 0) {
|
||||||
for await (const line of await fetchRemoteTextByLine(filterRulesUrl)) {
|
for await (const line of await fetchRemoteTextByLine(filterRulesUrl)) {
|
||||||
// don't trim here
|
// don't trim here
|
||||||
|
|||||||
@@ -1,16 +0,0 @@
|
|||||||
import { bench, group, run } from 'mitata';
|
|
||||||
import { randomInt as nativeRandomInt } from 'crypto';
|
|
||||||
|
|
||||||
const randomInt = (min: number, max: number) => Math.floor(Math.random() * (max - min + 1)) + min;
|
|
||||||
|
|
||||||
group('random-int', () => {
|
|
||||||
bench('crypto.randomInt', () => {
|
|
||||||
nativeRandomInt(3, 7);
|
|
||||||
});
|
|
||||||
|
|
||||||
bench('Math.random', () => {
|
|
||||||
randomInt(3, 7);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
run();
|
|
||||||
@@ -11,6 +11,7 @@ describe('Trie', () => {
|
|||||||
trie.add('akku');
|
trie.add('akku');
|
||||||
|
|
||||||
expect(trie.size).toBe(3);
|
expect(trie.size).toBe(3);
|
||||||
|
|
||||||
expect(trie.has('sukka')).toBeTrue();
|
expect(trie.has('sukka')).toBeTrue();
|
||||||
expect(trie.has('ukka')).toBeTrue();
|
expect(trie.has('ukka')).toBeTrue();
|
||||||
expect(trie.has('akku')).toBeTrue();
|
expect(trie.has('akku')).toBeTrue();
|
||||||
@@ -86,59 +87,6 @@ describe('Trie', () => {
|
|||||||
expect(trie.find('')).toEqual(['greek', 'roman', 'esqueroman', 'sesqueroman']);
|
expect(trie.find('')).toEqual(['greek', 'roman', 'esqueroman', 'sesqueroman']);
|
||||||
});
|
});
|
||||||
|
|
||||||
// it('should work with custom tokens.', () => {
|
|
||||||
// const trie = new Trie(Array);
|
|
||||||
|
|
||||||
// trie.add(['the', 'cat', 'eats', 'the', 'mouse']);
|
|
||||||
// trie.add(['the', 'mouse', 'eats', 'cheese']);
|
|
||||||
// trie.add(['hello', 'world']);
|
|
||||||
|
|
||||||
// assert.strictEqual(trie.size, 3);
|
|
||||||
|
|
||||||
// assert.strictEqual(trie.has(['the', 'mouse', 'eats', 'cheese']), true);
|
|
||||||
// assert.strictEqual(trie.has(['the', 'mouse', 'eats']), false);
|
|
||||||
|
|
||||||
// assert.strictEqual(trie.delete(['hello']), false);
|
|
||||||
// assert.strictEqual(trie.delete(['hello', 'world']), true);
|
|
||||||
|
|
||||||
// assert.strictEqual(trie.size, 2);
|
|
||||||
// });
|
|
||||||
|
|
||||||
// it('should be possible to iterate over the trie\'s prefixes.', () => {
|
|
||||||
// const trie = new Trie();
|
|
||||||
|
|
||||||
// trie.add('rat');
|
|
||||||
// trie.add('rate');
|
|
||||||
|
|
||||||
// let prefixes = take(trie.prefixes());
|
|
||||||
|
|
||||||
// assert.deepStrictEqual(prefixes, ['rat', 'rate']);
|
|
||||||
|
|
||||||
// trie.add('rater');
|
|
||||||
// trie.add('rates');
|
|
||||||
|
|
||||||
// prefixes = take(trie.keys('rate'));
|
|
||||||
|
|
||||||
// assert.deepStrictEqual(prefixes, ['rate', 'rates', 'rater']);
|
|
||||||
// });
|
|
||||||
|
|
||||||
// it('should be possible to iterate over the trie\'s prefixes using for...of.', () => {
|
|
||||||
// const trie = new Trie();
|
|
||||||
|
|
||||||
// trie.add('rat');
|
|
||||||
// trie.add('rate');
|
|
||||||
|
|
||||||
// const tests = [
|
|
||||||
// 'rat',
|
|
||||||
// 'rate'
|
|
||||||
// ];
|
|
||||||
|
|
||||||
// let i = 0;
|
|
||||||
|
|
||||||
// for (const prefix of trie)
|
|
||||||
// assert.deepStrictEqual(prefix, tests[i++]);
|
|
||||||
// });
|
|
||||||
|
|
||||||
it('should be possible to create a trie from an arbitrary iterable.', () => {
|
it('should be possible to create a trie from an arbitrary iterable.', () => {
|
||||||
const words = ['roman', 'esqueroman'];
|
const words = ['roman', 'esqueroman'];
|
||||||
|
|
||||||
@@ -159,9 +107,10 @@ describe('surge domainset dedupe', () => {
|
|||||||
|
|
||||||
it('should remove subdomain', () => {
|
it('should remove subdomain', () => {
|
||||||
const trie = createTrie(['www.noc.one', 'www.sukkaw.com', 'blog.skk.moe', 'image.cdn.skk.moe', 'cdn.sukkaw.net']);
|
const trie = createTrie(['www.noc.one', 'www.sukkaw.com', 'blog.skk.moe', 'image.cdn.skk.moe', 'cdn.sukkaw.net']);
|
||||||
// trie.find('noc.one').toBe(['www.noc.one']);
|
|
||||||
|
console.log(trie);
|
||||||
|
|
||||||
expect(trie.find('.skk.moe')).toStrictEqual(['image.cdn.skk.moe', 'blog.skk.moe']);
|
expect(trie.find('.skk.moe')).toStrictEqual(['image.cdn.skk.moe', 'blog.skk.moe']);
|
||||||
// trie.find('sukkaw.net').toBe(['cdn.sukkaw.net']);
|
|
||||||
expect(trie.find('.sukkaw.com')).toStrictEqual(['www.sukkaw.com']);
|
expect(trie.find('.sukkaw.com')).toStrictEqual(['www.sukkaw.com']);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -2,20 +2,34 @@
|
|||||||
* Suffix Trie based on Mnemonist Trie
|
* Suffix Trie based on Mnemonist Trie
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
// import { Trie } from 'mnemonist';
|
||||||
|
|
||||||
export const SENTINEL = Symbol('SENTINEL');
|
export const SENTINEL = Symbol('SENTINEL');
|
||||||
|
|
||||||
type TrieNode = {
|
type TrieNode = {
|
||||||
[SENTINEL]: boolean
|
[SENTINEL]: boolean,
|
||||||
|
[Bun.inspect.custom]: () => string
|
||||||
} & Map<string, TrieNode>;
|
} & Map<string, TrieNode>;
|
||||||
|
|
||||||
|
const deepTrieNodeToJSON = (node: TrieNode) => {
|
||||||
|
const obj: Record<string, any> = {};
|
||||||
|
if (node[SENTINEL]) {
|
||||||
|
obj['[start]'] = node[SENTINEL];
|
||||||
|
}
|
||||||
|
node.forEach((value, key) => {
|
||||||
|
obj[key] = deepTrieNodeToJSON(value);
|
||||||
|
});
|
||||||
|
return obj;
|
||||||
|
};
|
||||||
|
|
||||||
const createNode = (): TrieNode => {
|
const createNode = (): TrieNode => {
|
||||||
const map = new Map<string, TrieNode>();
|
const node = new Map<string, TrieNode>() as TrieNode;
|
||||||
const node = map as TrieNode;
|
|
||||||
node[SENTINEL] = false;
|
node[SENTINEL] = false;
|
||||||
|
node[Bun.inspect.custom] = () => JSON.stringify(deepTrieNodeToJSON(node), null, 2);
|
||||||
return node;
|
return node;
|
||||||
};
|
};
|
||||||
|
|
||||||
export const createTrie = (from?: string[] | Set<string>) => {
|
export const createTrie = (from?: string[] | Set<string> | null) => {
|
||||||
let size = 0;
|
let size = 0;
|
||||||
const root: TrieNode = createNode();
|
const root: TrieNode = createNode();
|
||||||
|
|
||||||
@@ -25,6 +39,7 @@ export const createTrie = (from?: string[] | Set<string>) => {
|
|||||||
const add = (suffix: string): void => {
|
const add = (suffix: string): void => {
|
||||||
let node: TrieNode = root;
|
let node: TrieNode = root;
|
||||||
let token: string;
|
let token: string;
|
||||||
|
|
||||||
for (let i = suffix.length - 1; i >= 0; i--) {
|
for (let i = suffix.length - 1; i >= 0; i--) {
|
||||||
token = suffix[i];
|
token = suffix[i];
|
||||||
|
|
||||||
@@ -40,8 +55,8 @@ export const createTrie = (from?: string[] | Set<string>) => {
|
|||||||
// Do we need to increase size?
|
// Do we need to increase size?
|
||||||
if (!node[SENTINEL]) {
|
if (!node[SENTINEL]) {
|
||||||
size++;
|
size++;
|
||||||
node[SENTINEL] = true;
|
|
||||||
}
|
}
|
||||||
|
node[SENTINEL] = true;
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -84,8 +99,8 @@ export const createTrie = (from?: string[] | Set<string>) => {
|
|||||||
const nodeStack: TrieNode[] = [node];
|
const nodeStack: TrieNode[] = [node];
|
||||||
const suffixStack: string[] = [inputSuffix];
|
const suffixStack: string[] = [inputSuffix];
|
||||||
|
|
||||||
while (nodeStack.length) {
|
do {
|
||||||
const suffix = suffixStack.pop()!;
|
const suffix: string = suffixStack.pop()!;
|
||||||
node = nodeStack.pop()!;
|
node = nodeStack.pop()!;
|
||||||
|
|
||||||
if (node[SENTINEL]) {
|
if (node[SENTINEL]) {
|
||||||
@@ -98,11 +113,50 @@ export const createTrie = (from?: string[] | Set<string>) => {
|
|||||||
nodeStack.push(childNode);
|
nodeStack.push(childNode);
|
||||||
suffixStack.push(k + suffix);
|
suffixStack.push(k + suffix);
|
||||||
});
|
});
|
||||||
}
|
} while (nodeStack.length);
|
||||||
|
|
||||||
return matches;
|
return matches;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Works like trie.find, but instead of returning the matches as an array, it removes them from the given set in-place.
|
||||||
|
*/
|
||||||
|
const substractSetInPlaceFromFound = (inputSuffix: string, set: Set<string>) => {
|
||||||
|
let node: TrieNode | undefined = root;
|
||||||
|
let token: string;
|
||||||
|
|
||||||
|
// Find the leaf-est node, and early return if not any
|
||||||
|
for (let i = inputSuffix.length - 1; i >= 0; i--) {
|
||||||
|
token = inputSuffix[i];
|
||||||
|
|
||||||
|
node = node.get(token);
|
||||||
|
if (!node) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Performing DFS from prefix
|
||||||
|
const nodeStack: TrieNode[] = [node];
|
||||||
|
const suffixStack: string[] = [inputSuffix];
|
||||||
|
|
||||||
|
do {
|
||||||
|
const suffix = suffixStack.pop()!;
|
||||||
|
node = nodeStack.pop()!;
|
||||||
|
|
||||||
|
if (node[SENTINEL]) {
|
||||||
|
if (suffix !== inputSuffix) {
|
||||||
|
// found match, delete it from set
|
||||||
|
set.delete(suffix);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
node.forEach((childNode, k) => {
|
||||||
|
nodeStack.push(childNode);
|
||||||
|
suffixStack.push(k + suffix);
|
||||||
|
});
|
||||||
|
} while (nodeStack.length);
|
||||||
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Method used to delete a prefix from the trie.
|
* Method used to delete a prefix from the trie.
|
||||||
*/
|
*/
|
||||||
@@ -169,23 +223,65 @@ export const createTrie = (from?: string[] | Set<string>) => {
|
|||||||
return node[SENTINEL];
|
return node[SENTINEL];
|
||||||
};
|
};
|
||||||
|
|
||||||
if (from) {
|
if (Array.isArray(from)) {
|
||||||
|
for (let i = 0, l = from.length; i < l; i++) {
|
||||||
|
add(from[i]);
|
||||||
|
}
|
||||||
|
} else if (from) {
|
||||||
from.forEach(add);
|
from.forEach(add);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const dump = () => {
|
||||||
|
const node = root;
|
||||||
|
const nodeStack: TrieNode[] = [];
|
||||||
|
const suffixStack: string[] = [];
|
||||||
|
// Resolving initial string
|
||||||
|
const suffix = '';
|
||||||
|
|
||||||
|
nodeStack.push(node);
|
||||||
|
suffixStack.push(suffix);
|
||||||
|
|
||||||
|
const results: string[] = [];
|
||||||
|
|
||||||
|
let currentNode: TrieNode;
|
||||||
|
let currentPrefix: string;
|
||||||
|
let hasValue = false;
|
||||||
|
|
||||||
|
do {
|
||||||
|
currentNode = nodeStack.pop()!;
|
||||||
|
currentPrefix = suffixStack.pop()!;
|
||||||
|
|
||||||
|
if (currentNode[SENTINEL]) {
|
||||||
|
hasValue = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
node.forEach((childNode, k) => {
|
||||||
|
nodeStack.push(childNode);
|
||||||
|
suffixStack.push(k + suffix);
|
||||||
|
});
|
||||||
|
|
||||||
|
if (hasValue) results.push(currentPrefix);
|
||||||
|
} while (nodeStack.length);
|
||||||
|
|
||||||
|
return results;
|
||||||
|
};
|
||||||
|
|
||||||
return {
|
return {
|
||||||
add,
|
add,
|
||||||
contains,
|
contains,
|
||||||
find,
|
find,
|
||||||
|
substractSetInPlaceFromFound,
|
||||||
remove,
|
remove,
|
||||||
delete: remove,
|
delete: remove,
|
||||||
has,
|
has,
|
||||||
|
dump,
|
||||||
get size() {
|
get size() {
|
||||||
return size;
|
return size;
|
||||||
},
|
},
|
||||||
get root() {
|
get root() {
|
||||||
return root;
|
return root;
|
||||||
}
|
},
|
||||||
|
[Bun.inspect.custom]: () => JSON.stringify(deepTrieNodeToJSON(root), null, 2)
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user