From 90079b99879803582646f247ef8b477f0200ef9f Mon Sep 17 00:00:00 2001 From: SukkaW Date: Sun, 8 Sep 2024 01:28:54 +0800 Subject: [PATCH] Chore: dedupe and sort other rulesets --- Build/build-cdn-download-conf.ts | 6 +- Build/build-common.ts | 4 +- Build/build-reject-domainset.ts | 6 +- Build/build-speedtest-domainset.ts | 4 +- Build/lib/bitwise.ts | 9 +++ Build/lib/create-file.ts | 94 ++++++++++++++++++------------ Build/lib/domain-deduper.ts | 29 +-------- Build/lib/trie.ts | 49 ++++++++++------ 8 files changed, 110 insertions(+), 91 deletions(-) create mode 100644 Build/lib/bitwise.ts diff --git a/Build/build-cdn-download-conf.ts b/Build/build-cdn-download-conf.ts index 6eaa5c4b..377e8488 100644 --- a/Build/build-cdn-download-conf.ts +++ b/Build/build-cdn-download-conf.ts @@ -5,7 +5,7 @@ import { createTrie } from './lib/trie'; import { task } from './trace'; import { SHARED_DESCRIPTION } from './lib/constants'; import { getPublicSuffixListTextPromise } from './lib/download-publicsuffixlist'; -import { domainDeduper } from './lib/domain-deduper'; +import { domainsetDeduper } from './lib/domain-deduper'; import { appendArrayInPlace } from './lib/append-array-in-place'; import { sortDomains } from './lib/stable-sort-domain'; import { output } from './lib/misc'; @@ -76,7 +76,7 @@ export const buildCdnDownloadConf = task(require.main === module, __filename)(as 'This file contains object storage and static assets CDN domains.' ], new Date(), - sortDomains(domainDeduper(cdnDomainsList)), + sortDomains(domainsetDeduper(cdnDomainsList)), 'domainset', output('cdn', 'domainset') ), @@ -89,7 +89,7 @@ export const buildCdnDownloadConf = task(require.main === module, __filename)(as 'This file contains domains for software updating & large file hosting.' ], new Date(), - sortDomains(domainDeduper(downloadDomainSet)), + sortDomains(domainsetDeduper(downloadDomainSet)), 'domainset', output('download', 'domainset') ) diff --git a/Build/build-common.ts b/Build/build-common.ts index c075f524..0945323c 100644 --- a/Build/build-common.ts +++ b/Build/build-common.ts @@ -4,7 +4,7 @@ import * as path from 'node:path'; import { readFileByLine } from './lib/fetch-text-by-line'; import { processLine } from './lib/process-line'; import { createRuleset } from './lib/create-file'; -import { domainDeduper } from './lib/domain-deduper'; +import { domainsetDeduper } from './lib/domain-deduper'; import type { Span } from './trace'; import { task } from './trace'; import { SHARED_DESCRIPTION } from './lib/constants'; @@ -116,7 +116,7 @@ function transformDomainset(parentSpan: Span, sourcePath: string, relativePath: const clashFileBasename = relativePath.slice(0, -path.extname(relativePath).length); const [title, descriptions, lines] = res; - const deduped = domainDeduper(lines); + const deduped = domainsetDeduper(lines); let description: string[]; if (descriptions.length) { diff --git a/Build/build-reject-domainset.ts b/Build/build-reject-domainset.ts index c5f7fd48..1f2f7e6d 100644 --- a/Build/build-reject-domainset.ts +++ b/Build/build-reject-domainset.ts @@ -7,7 +7,7 @@ import { createTrie } from './lib/trie'; import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS, HOSTS_EXTRA, DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_EXTRA, PHISHING_DOMAIN_LISTS_EXTRA } from './constants/reject-data-source'; import { createRuleset, compareAndWriteFile } from './lib/create-file'; -import { domainDeduper } from './lib/domain-deduper'; +import { domainsetDeduper } from './lib/domain-deduper'; import createKeywordFilter from './lib/aho-corasick'; import { readFileByLine, readFileIntoProcessedArray } from './lib/fetch-text-by-line'; import { buildParseDomainMap, sortDomains } from './lib/stable-sort-domain'; @@ -149,8 +149,8 @@ export const buildRejectDomainSet = task(require.main === module, __filename)(as }); // Dedupe domainSets - const dudupedDominArray = span.traceChildSync('dedupe from covered subdomain (base)', () => domainDeduper(baseTrie)); - const dudupedDominArrayExtra = span.traceChildSync('dedupe from covered subdomain (extra)', () => domainDeduper(extraTrie)); + const dudupedDominArray = span.traceChildSync('dedupe from covered subdomain (base)', () => domainsetDeduper(baseTrie)); + const dudupedDominArrayExtra = span.traceChildSync('dedupe from covered subdomain (extra)', () => domainsetDeduper(extraTrie)); console.log(`Final size ${dudupedDominArray.length} + ${dudupedDominArrayExtra.length}`); diff --git a/Build/build-speedtest-domainset.ts b/Build/build-speedtest-domainset.ts index afab4157..5704f02d 100644 --- a/Build/build-speedtest-domainset.ts +++ b/Build/build-speedtest-domainset.ts @@ -1,4 +1,4 @@ -import { domainDeduper } from './lib/domain-deduper'; +import { domainsetDeduper } from './lib/domain-deduper'; import path from 'node:path'; import { createRuleset } from './lib/create-file'; import { sortDomains } from './lib/stable-sort-domain'; @@ -235,7 +235,7 @@ export const buildSpeedtestDomainSet = task(require.main === module, __filename) } })))); - const deduped = span.traceChildSync('sort result', () => sortDomains(domainDeduper(domainTrie))); + const deduped = span.traceChildSync('sort result', () => sortDomains(domainsetDeduper(domainTrie))); const description = [ ...SHARED_DESCRIPTION, diff --git a/Build/lib/bitwise.ts b/Build/lib/bitwise.ts new file mode 100644 index 00000000..0ff90436 --- /dev/null +++ b/Build/lib/bitwise.ts @@ -0,0 +1,9 @@ +/** Packs two 16-bit integers into one 32-bit integer */ +export const pack = (a: number, b: number): number => { + return (a << 16) | b; +}; + +/** Unpacks two 16-bit integers from one 32-bit integer */ +export const unpack = (value: number): [a: number, b: number] => { + return [(value >> 16) & 0xFFFF, value & 0xFFFF]; +}; diff --git a/Build/lib/create-file.ts b/Build/lib/create-file.ts index f2a6c13a..5602cd6e 100644 --- a/Build/lib/create-file.ts +++ b/Build/lib/create-file.ts @@ -8,6 +8,8 @@ import { fastStringArrayJoin, writeFile } from './misc'; import { readFileByLine } from './fetch-text-by-line'; import stringify from 'json-stringify-pretty-compact'; import { ipCidrListToSingbox, surgeDomainsetToSingbox, surgeRulesetToSingbox } from './singbox'; +import { createTrie } from './trie'; +import { pack, unpack } from './bitwise'; export async function compareAndWriteFile(span: Span, linesA: string[], filePath: string) { let isEqual = true; @@ -92,17 +94,6 @@ const withBannerArray = (title: string, description: string[] | readonly string[ ]; }; -const collectType = (rule: string) => { - let buf = ''; - for (let i = 0, len = rule.length; i < len; i++) { - if (rule[i] === ',') { - return buf; - } - buf += rule[i]; - } - return null; -}; - const defaultSortTypeOrder = Symbol('defaultSortTypeOrder'); const sortTypeOrder: Record = { DOMAIN: 1, @@ -120,33 +111,62 @@ const sortTypeOrder: Record = { 'IP-CIDR': 400, 'IP-CIDR6': 400 }; -// sort DOMAIN-SUFFIX and DOMAIN first, then DOMAIN-KEYWORD, then IP-CIDR and IP-CIDR6 if any -export const sortRuleSet = (ruleSet: string[]) => { - return ruleSet.map((rule) => { - const type = collectType(rule); - if (!type) { - return [10, rule] as const; - } - if (!(type in sortTypeOrder)) { - return [sortTypeOrder[defaultSortTypeOrder], rule] as const; - } - if (type === 'URL-REGEX') { - let extraWeight = 0; - if (rule.includes('.+') || rule.includes('.*')) { - extraWeight += 10; - } - if (rule.includes('|')) { - extraWeight += 1; - } - return [ - sortTypeOrder[type] + extraWeight, - rule - ] as const; +const flagDomain = 1 << 2; +const flagDomainSuffix = 1 << 3; + +// dedupe and sort based on rule type +const processRuleSet = (ruleSet: string[]) => { + const trie = createTrie(null, true); + + const sortMap: Array<[value: number, weight: number]> = []; + for (let i = 0, len = ruleSet.length; i < len; i++) { + const line = ruleSet[i]; + const [type, value] = line.split(','); + + let extraWeight = 0; + + switch (type) { + case 'DOMAIN': + trie.add(value, pack(i, flagDomain)); + break; + case 'DOMAIN-SUFFIX': + trie.add('.' + value, pack(i, flagDomainSuffix)); + break; + case 'URL-REGEX': + if (value.includes('.+') || value.includes('.*')) { + extraWeight += 10; + } + if (value.includes('|')) { + extraWeight += 1; + } + sortMap.push([i, sortTypeOrder[type] + extraWeight]); + break; + case null: + sortMap.push([i, 10]); + break; + default: + if (type in sortTypeOrder) { + sortMap.push([i, sortTypeOrder[type]]); + } else { + sortMap.push([i, sortTypeOrder[defaultSortTypeOrder]]); + } } - return [sortTypeOrder[type], rule] as const; - }).sort((a, b) => a[0] - b[0]) - .map(c => c[1]); + } + + const dumped = trie.dumpWithMeta(); + for (let i = 0, len = dumped.length; i < len; i++) { + const [originalIndex, flag] = unpack(dumped[i][1]); + console.log(dumped[i][0], ruleSet[originalIndex]); + + const type = flag === flagDomain ? 'DOMAIN' : 'DOMAIN-SUFFIX'; + + sortMap.push([originalIndex, sortTypeOrder[type]]); + } + + return sortMap + .sort((a, b) => a[1] - b[1]) + .map(c => ruleSet[c[0]]); }; const MARK = 'this_ruleset_is_made_by_sukkaw.ruleset.skk.moe'; @@ -162,7 +182,7 @@ export const createRuleset = ( _clashMrsPath?: string ] ) => parentSpan.traceChild(`create ruleset: ${path.basename(surgePath, path.extname(surgePath))}`).traceAsyncFn(async (childSpan) => { - content = sortRuleSet(content); + content = processRuleSet(content); const surgeContent = childSpan.traceChildSync('process surge ruleset', () => { let _surgeContent; switch (type) { diff --git a/Build/lib/domain-deduper.ts b/Build/lib/domain-deduper.ts index c0d78d47..1a6cabe0 100644 --- a/Build/lib/domain-deduper.ts +++ b/Build/lib/domain-deduper.ts @@ -1,8 +1,6 @@ import { createTrie, type Trie } from './trie'; -export function domainDeduper(inputDomains: string[] | Trie, toArray?: true): string[]; -export function domainDeduper(inputDomains: string[] | Trie, toArray: false): Set; -export function domainDeduper(inputDomains: string[] | Trie, toArray = true): string[] | Set { +export function domainsetDeduper(inputDomains: string[] | Trie): string[] { let trie: Trie; if (Array.isArray(inputDomains)) { trie = createTrie(inputDomains, true); @@ -12,28 +10,5 @@ export function domainDeduper(inputDomains: string[] | Trie, toArray = true): st throw new Error('Invalid trie'); } - const dumped = trie.dump(); - if (toArray) { - return dumped; - } - return new Set(dumped); - - // const trie = createTrie(inputDomains, true); - // const sets = new Set(inputDomains); - - // for (let i = 0, len1 = inputDomains.length; i < len1; i++) { - // const d = inputDomains[i]; - // if (d[0] !== '.') { - // continue; - // } - - // trie.substractSetInPlaceFromFound(d, sets); - // sets.delete(d.slice(1)); - // } - - // if (toArray) { - // return Array.from(sets); - // } - - // return sets; + return trie.dump(); } diff --git a/Build/lib/trie.ts b/Build/lib/trie.ts index 57523538..bc47fecf 100644 --- a/Build/lib/trie.ts +++ b/Build/lib/trie.ts @@ -7,10 +7,11 @@ import { inspect } from 'node:util'; const noop = () => { /** noop */ }; -type TrieNode = [ +type TrieNode = [ boolean, /** sentinel */ TrieNode | null, /** parent */ - Map /** children */ + Map, /** children */ + Meta /** meta */ ]; const deepTrieNodeToJSON = (node: TrieNode) => { @@ -18,14 +19,17 @@ const deepTrieNodeToJSON = (node: TrieNode) => { if (node[0]) { obj['[start]'] = node[0]; } + if (node[3] !== undefined) { + obj['[meta]'] = node[3]; + } node[2].forEach((value, key) => { obj[key] = deepTrieNodeToJSON(value); }); return obj; }; -const createNode = (parent: TrieNode | null = null): TrieNode => { - return [false, parent, new Map()] as TrieNode; +const createNode = (parent: TrieNode | null = null, meta: Meta | null = null): TrieNode => { + return [false, parent, new Map(), meta] as TrieNode; }; export const hostnameToTokens = (hostname: string): string[] => { @@ -72,16 +76,16 @@ const walkHostnameTokens = (hostname: string, onToken: (token: string) => boolea return false; }; -export const createTrie = (from?: string[] | Set | null, smolTree = false) => { +export const createTrie = (from?: string[] | Set | null, smolTree = false) => { let size = 0; - const root: TrieNode = createNode(); + const root: TrieNode = createNode(); /** * Method used to add the given suffix to the trie. */ const add = smolTree - ? (suffix: string): void => { - let node: TrieNode = root; + ? (suffix: string, meta?: Meta): void => { + let node: TrieNode = root; const onToken = (token: string) => { if (node[2].has(token)) { @@ -98,6 +102,7 @@ export const createTrie = (from?: string[] | Set | null, smolTree = fals node = newNode; } + node[3] = meta!; return false; }; @@ -128,8 +133,8 @@ export const createTrie = (from?: string[] | Set | null, smolTree = fals node[0] = true; } - : (suffix: string): void => { - let node: TrieNode = root; + : (suffix: string, meta?: Meta): void => { + let node: TrieNode = root; const onToken = (token: string) => { if (node[2].has(token)) { @@ -140,6 +145,7 @@ export const createTrie = (from?: string[] | Set | null, smolTree = fals node = newNode; } + node[3] = meta!; return false; }; @@ -221,15 +227,15 @@ export const createTrie = (from?: string[] | Set | null, smolTree = fals }; const walk = ( - onMatches: (suffix: string[]) => void, + onMatches: (suffix: string[], meta: Meta) => void, initialNode = root, initialSuffix: string[] = [] ) => { - const nodeStack: TrieNode[] = [initialNode]; + const nodeStack: Array> = [initialNode]; // Resolving initial string (begin the start of the stack) const suffixStack: string[][] = [initialSuffix]; - let node: TrieNode = root; + let node: TrieNode = root; do { node = nodeStack.pop()!; @@ -244,7 +250,7 @@ export const createTrie = (from?: string[] | Set | null, smolTree = fals // If the node is a sentinel, we push the suffix to the results if (node[0]) { - onMatches(suffix); + onMatches(suffix, node[3]); } } while (nodeStack.length); }; @@ -383,6 +389,16 @@ export const createTrie = (from?: string[] | Set | null, smolTree = fals return results; }; + const dumpWithMeta = () => { + const results: Array<[string, Meta]> = []; + + walk((suffix, meta) => { + results.push([fastStringArrayJoin(suffix, ''), meta]); + }); + + return results; + }; + const whitelist = (suffix: string) => { if (!smolTree) { throw new Error('whitelist method is only available in smolTree mode.'); @@ -428,7 +444,7 @@ export const createTrie = (from?: string[] | Set | null, smolTree = fals add(from[i]); } } else if (from) { - from.forEach(add); + from.forEach((value) => add(value)); } return { @@ -440,6 +456,7 @@ export const createTrie = (from?: string[] | Set | null, smolTree = fals delete: remove, has, dump, + dumpWithMeta, get size() { if (smolTree) { throw new Error('A Trie with smolTree enabled cannot have correct size!'); @@ -460,5 +477,3 @@ export const createTrie = (from?: string[] | Set | null, smolTree = fals }; export type Trie = ReturnType; - -export default createTrie;