Fix: avoid inserting non-domainlist lines into trie

This commit is contained in:
SukkaW 2024-09-18 13:57:00 +08:00
parent c5513ef363
commit a004ffb960
4 changed files with 15 additions and 6 deletions

View File

@ -10,10 +10,20 @@ import { appendArrayInPlace } from './lib/append-array-in-place';
import { sortDomains } from './lib/stable-sort-domain'; import { sortDomains } from './lib/stable-sort-domain';
import { output } from './lib/misc'; import { output } from './lib/misc';
import { SOURCE_DIR } from './constants/dir'; import { SOURCE_DIR } from './constants/dir';
import { processLine } from './lib/process-line';
const getS3OSSDomainsPromise = (async (): Promise<string[]> => { const getS3OSSDomainsPromise = (async (): Promise<string[]> => {
const trie = createTrie( const trie = createTrie(
await getPublicSuffixListTextPromise(), (await getPublicSuffixListTextPromise()).reduce<string[]>(
(acc, cur) => {
const tmp = processLine(cur);
if (tmp) {
acc.push(tmp);
}
return acc;
},
[]
),
false false
); );

View File

@ -7,7 +7,6 @@ import { createTrie } from './lib/trie';
import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS, HOSTS_EXTRA, DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_EXTRA, PHISHING_DOMAIN_LISTS_EXTRA } from './constants/reject-data-source'; import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS, HOSTS_EXTRA, DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_EXTRA, PHISHING_DOMAIN_LISTS_EXTRA } from './constants/reject-data-source';
import { createRuleset, compareAndWriteFile } from './lib/create-file'; import { createRuleset, compareAndWriteFile } from './lib/create-file';
import { domainsetDeduper } from './lib/domain-deduper';
import createKeywordFilter from './lib/aho-corasick'; import createKeywordFilter from './lib/aho-corasick';
import { readFileByLine, readFileIntoProcessedArray } from './lib/fetch-text-by-line'; import { readFileByLine, readFileIntoProcessedArray } from './lib/fetch-text-by-line';
import { buildParseDomainMap, sortDomains } from './lib/stable-sort-domain'; import { buildParseDomainMap, sortDomains } from './lib/stable-sort-domain';
@ -148,8 +147,8 @@ export const buildRejectDomainSet = task(require.main === module, __filename)(as
}); });
// Dedupe domainSets // Dedupe domainSets
const dedupedDominArray = span.traceChildSync('dedupe from covered subdomain (base)', () => domainsetDeduper(baseTrie)); const dedupedDominArray = span.traceChildSync('dedupe from covered subdomain (base)', () => baseTrie.dump());
const dudupedDominArrayExtra = span.traceChildSync('dedupe from covered subdomain (extra)', () => domainsetDeduper(extraTrie)); const dudupedDominArrayExtra = span.traceChildSync('dedupe from covered subdomain (extra)', () => extraTrie.dump());
console.log(`Final size ${dedupedDominArray.length} + ${dudupedDominArrayExtra.length}`); console.log(`Final size ${dedupedDominArray.length} + ${dudupedDominArrayExtra.length}`);

View File

@ -1,4 +1,3 @@
import { domainsetDeduper } from './lib/domain-deduper';
import path from 'node:path'; import path from 'node:path';
import { createRuleset } from './lib/create-file'; import { createRuleset } from './lib/create-file';
import { sortDomains } from './lib/stable-sort-domain'; import { sortDomains } from './lib/stable-sort-domain';
@ -235,7 +234,7 @@ export const buildSpeedtestDomainSet = task(require.main === module, __filename)
} }
})))); }))));
const deduped = span.traceChildSync('sort result', () => sortDomains(domainsetDeduper(domainTrie))); const deduped = span.traceChildSync('sort result', () => sortDomains(domainTrie.dump()));
const description = [ const description = [
...SHARED_DESCRIPTION, ...SHARED_DESCRIPTION,

View File

@ -16,6 +16,7 @@ export const processLine = (line: string): string | null => {
|| line_0 === '\r' || line_0 === '\r'
|| line_0 === '\n' || line_0 === '\n'
|| line_0 === '!' || line_0 === '!'
|| (line_0 === '/' && trimmed[1] === '/')
) { ) {
return null; return null;
} }