Chore: dedupe and sort other rulesets

This commit is contained in:
SukkaW 2024-09-08 01:28:54 +08:00
parent d4ee25e75a
commit 90079b9987
8 changed files with 110 additions and 91 deletions

View File

@ -5,7 +5,7 @@ import { createTrie } from './lib/trie';
import { task } from './trace'; import { task } from './trace';
import { SHARED_DESCRIPTION } from './lib/constants'; import { SHARED_DESCRIPTION } from './lib/constants';
import { getPublicSuffixListTextPromise } from './lib/download-publicsuffixlist'; import { getPublicSuffixListTextPromise } from './lib/download-publicsuffixlist';
import { domainDeduper } from './lib/domain-deduper'; import { domainsetDeduper } from './lib/domain-deduper';
import { appendArrayInPlace } from './lib/append-array-in-place'; import { appendArrayInPlace } from './lib/append-array-in-place';
import { sortDomains } from './lib/stable-sort-domain'; import { sortDomains } from './lib/stable-sort-domain';
import { output } from './lib/misc'; import { output } from './lib/misc';
@ -76,7 +76,7 @@ export const buildCdnDownloadConf = task(require.main === module, __filename)(as
'This file contains object storage and static assets CDN domains.' 'This file contains object storage and static assets CDN domains.'
], ],
new Date(), new Date(),
sortDomains(domainDeduper(cdnDomainsList)), sortDomains(domainsetDeduper(cdnDomainsList)),
'domainset', 'domainset',
output('cdn', 'domainset') output('cdn', 'domainset')
), ),
@ -89,7 +89,7 @@ export const buildCdnDownloadConf = task(require.main === module, __filename)(as
'This file contains domains for software updating & large file hosting.' 'This file contains domains for software updating & large file hosting.'
], ],
new Date(), new Date(),
sortDomains(domainDeduper(downloadDomainSet)), sortDomains(domainsetDeduper(downloadDomainSet)),
'domainset', 'domainset',
output('download', 'domainset') output('download', 'domainset')
) )

View File

@ -4,7 +4,7 @@ import * as path from 'node:path';
import { readFileByLine } from './lib/fetch-text-by-line'; import { readFileByLine } from './lib/fetch-text-by-line';
import { processLine } from './lib/process-line'; import { processLine } from './lib/process-line';
import { createRuleset } from './lib/create-file'; import { createRuleset } from './lib/create-file';
import { domainDeduper } from './lib/domain-deduper'; import { domainsetDeduper } from './lib/domain-deduper';
import type { Span } from './trace'; import type { Span } from './trace';
import { task } from './trace'; import { task } from './trace';
import { SHARED_DESCRIPTION } from './lib/constants'; import { SHARED_DESCRIPTION } from './lib/constants';
@ -116,7 +116,7 @@ function transformDomainset(parentSpan: Span, sourcePath: string, relativePath:
const clashFileBasename = relativePath.slice(0, -path.extname(relativePath).length); const clashFileBasename = relativePath.slice(0, -path.extname(relativePath).length);
const [title, descriptions, lines] = res; const [title, descriptions, lines] = res;
const deduped = domainDeduper(lines); const deduped = domainsetDeduper(lines);
let description: string[]; let description: string[];
if (descriptions.length) { if (descriptions.length) {

View File

@ -7,7 +7,7 @@ import { createTrie } from './lib/trie';
import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS, HOSTS_EXTRA, DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_EXTRA, PHISHING_DOMAIN_LISTS_EXTRA } from './constants/reject-data-source'; import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS, HOSTS_EXTRA, DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_EXTRA, PHISHING_DOMAIN_LISTS_EXTRA } from './constants/reject-data-source';
import { createRuleset, compareAndWriteFile } from './lib/create-file'; import { createRuleset, compareAndWriteFile } from './lib/create-file';
import { domainDeduper } from './lib/domain-deduper'; import { domainsetDeduper } from './lib/domain-deduper';
import createKeywordFilter from './lib/aho-corasick'; import createKeywordFilter from './lib/aho-corasick';
import { readFileByLine, readFileIntoProcessedArray } from './lib/fetch-text-by-line'; import { readFileByLine, readFileIntoProcessedArray } from './lib/fetch-text-by-line';
import { buildParseDomainMap, sortDomains } from './lib/stable-sort-domain'; import { buildParseDomainMap, sortDomains } from './lib/stable-sort-domain';
@ -149,8 +149,8 @@ export const buildRejectDomainSet = task(require.main === module, __filename)(as
}); });
// Dedupe domainSets // Dedupe domainSets
const dudupedDominArray = span.traceChildSync('dedupe from covered subdomain (base)', () => domainDeduper(baseTrie)); const dudupedDominArray = span.traceChildSync('dedupe from covered subdomain (base)', () => domainsetDeduper(baseTrie));
const dudupedDominArrayExtra = span.traceChildSync('dedupe from covered subdomain (extra)', () => domainDeduper(extraTrie)); const dudupedDominArrayExtra = span.traceChildSync('dedupe from covered subdomain (extra)', () => domainsetDeduper(extraTrie));
console.log(`Final size ${dudupedDominArray.length} + ${dudupedDominArrayExtra.length}`); console.log(`Final size ${dudupedDominArray.length} + ${dudupedDominArrayExtra.length}`);

View File

@ -1,4 +1,4 @@
import { domainDeduper } from './lib/domain-deduper'; import { domainsetDeduper } from './lib/domain-deduper';
import path from 'node:path'; import path from 'node:path';
import { createRuleset } from './lib/create-file'; import { createRuleset } from './lib/create-file';
import { sortDomains } from './lib/stable-sort-domain'; import { sortDomains } from './lib/stable-sort-domain';
@ -235,7 +235,7 @@ export const buildSpeedtestDomainSet = task(require.main === module, __filename)
} }
})))); }))));
const deduped = span.traceChildSync('sort result', () => sortDomains(domainDeduper(domainTrie))); const deduped = span.traceChildSync('sort result', () => sortDomains(domainsetDeduper(domainTrie)));
const description = [ const description = [
...SHARED_DESCRIPTION, ...SHARED_DESCRIPTION,

9
Build/lib/bitwise.ts Normal file
View File

@ -0,0 +1,9 @@
/**
 * Packs two unsigned 16-bit integers into one 32-bit integer.
 *
 * `a` is stored in the upper 16 bits and `b` in the lower 16 bits. Each input
 * is truncated to its low 16 bits, so a negative or out-of-range `b` cannot
 * overwrite the bits that belong to `a`.
 *
 * The result is a signed 32-bit integer, so it is negative whenever
 * `a >= 0x8000`.
 *
 * @param a value for the high half (expected range 0..0xFFFF)
 * @param b value for the low half (expected range 0..0xFFFF)
 * @returns the packed 32-bit integer
 */
export const pack = (a: number, b: number): number => {
  // `a << 16` already drops everything above a's low 16 bits. `b` must be
  // masked explicitly, otherwise its high bits would be OR-ed into a's half.
  return (a << 16) | (b & 0xFFFF);
};
/**
 * Splits a packed 32-bit integer back into its two 16-bit halves.
 *
 * @param value the packed integer
 * @returns `[a, b]`, where `a` is the upper 16 bits and `b` the lower 16 bits
 *          of `value`; both are in the range 0..0xFFFF
 */
export const unpack = (value: number): [a: number, b: number] => {
  // The unsigned shift brings the upper half down without sign extension.
  const high = value >>> 16;
  const low = value & 0xFFFF;
  return [high, low];
};

View File

@ -8,6 +8,8 @@ import { fastStringArrayJoin, writeFile } from './misc';
import { readFileByLine } from './fetch-text-by-line'; import { readFileByLine } from './fetch-text-by-line';
import stringify from 'json-stringify-pretty-compact'; import stringify from 'json-stringify-pretty-compact';
import { ipCidrListToSingbox, surgeDomainsetToSingbox, surgeRulesetToSingbox } from './singbox'; import { ipCidrListToSingbox, surgeDomainsetToSingbox, surgeRulesetToSingbox } from './singbox';
import { createTrie } from './trie';
import { pack, unpack } from './bitwise';
export async function compareAndWriteFile(span: Span, linesA: string[], filePath: string) { export async function compareAndWriteFile(span: Span, linesA: string[], filePath: string) {
let isEqual = true; let isEqual = true;
@ -92,17 +94,6 @@ const withBannerArray = (title: string, description: string[] | readonly string[
]; ];
}; };
/**
 * Extracts the leading rule type from a rule line.
 *
 * @param rule a rule line such as `TYPE,value`
 * @returns the text before the first comma (possibly an empty string), or
 *          `null` when the line contains no comma at all
 */
const collectType = (rule: string) => {
  const commaIndex = rule.indexOf(',');
  if (commaIndex === -1) {
    return null;
  }
  return rule.slice(0, commaIndex);
};
const defaultSortTypeOrder = Symbol('defaultSortTypeOrder'); const defaultSortTypeOrder = Symbol('defaultSortTypeOrder');
const sortTypeOrder: Record<string | typeof defaultSortTypeOrder, number> = { const sortTypeOrder: Record<string | typeof defaultSortTypeOrder, number> = {
DOMAIN: 1, DOMAIN: 1,
@ -120,33 +111,62 @@ const sortTypeOrder: Record<string | typeof defaultSortTypeOrder, number> = {
'IP-CIDR': 400, 'IP-CIDR': 400,
'IP-CIDR6': 400 'IP-CIDR6': 400
}; };
// sort DOMAIN-SUFFIX and DOMAIN first, then DOMAIN-KEYWORD, then IP-CIDR and IP-CIDR6 if any
export const sortRuleSet = (ruleSet: string[]) => { const flagDomain = 1 << 2;
return ruleSet.map((rule) => { const flagDomainSuffix = 1 << 3;
const type = collectType(rule);
if (!type) { // dedupe and sort based on rule type
return [10, rule] as const; const processRuleSet = (ruleSet: string[]) => {
} const trie = createTrie<number>(null, true);
if (!(type in sortTypeOrder)) {
return [sortTypeOrder[defaultSortTypeOrder], rule] as const; const sortMap: Array<[value: number, weight: number]> = [];
} for (let i = 0, len = ruleSet.length; i < len; i++) {
if (type === 'URL-REGEX') { const line = ruleSet[i];
const [type, value] = line.split(',');
let extraWeight = 0; let extraWeight = 0;
if (rule.includes('.+') || rule.includes('.*')) {
switch (type) {
case 'DOMAIN':
trie.add(value, pack(i, flagDomain));
break;
case 'DOMAIN-SUFFIX':
trie.add('.' + value, pack(i, flagDomainSuffix));
break;
case 'URL-REGEX':
if (value.includes('.+') || value.includes('.*')) {
extraWeight += 10; extraWeight += 10;
} }
if (rule.includes('|')) { if (value.includes('|')) {
extraWeight += 1; extraWeight += 1;
} }
sortMap.push([i, sortTypeOrder[type] + extraWeight]);
return [ break;
sortTypeOrder[type] + extraWeight, case null:
rule sortMap.push([i, 10]);
] as const; break;
default:
if (type in sortTypeOrder) {
sortMap.push([i, sortTypeOrder[type]]);
} else {
sortMap.push([i, sortTypeOrder[defaultSortTypeOrder]]);
} }
return [sortTypeOrder[type], rule] as const; }
}).sort((a, b) => a[0] - b[0]) }
.map(c => c[1]);
const dumped = trie.dumpWithMeta();
for (let i = 0, len = dumped.length; i < len; i++) {
const [originalIndex, flag] = unpack(dumped[i][1]);
console.log(dumped[i][0], ruleSet[originalIndex]);
const type = flag === flagDomain ? 'DOMAIN' : 'DOMAIN-SUFFIX';
sortMap.push([originalIndex, sortTypeOrder[type]]);
}
return sortMap
.sort((a, b) => a[1] - b[1])
.map(c => ruleSet[c[0]]);
}; };
const MARK = 'this_ruleset_is_made_by_sukkaw.ruleset.skk.moe'; const MARK = 'this_ruleset_is_made_by_sukkaw.ruleset.skk.moe';
@ -162,7 +182,7 @@ export const createRuleset = (
_clashMrsPath?: string _clashMrsPath?: string
] ]
) => parentSpan.traceChild(`create ruleset: ${path.basename(surgePath, path.extname(surgePath))}`).traceAsyncFn(async (childSpan) => { ) => parentSpan.traceChild(`create ruleset: ${path.basename(surgePath, path.extname(surgePath))}`).traceAsyncFn(async (childSpan) => {
content = sortRuleSet(content); content = processRuleSet(content);
const surgeContent = childSpan.traceChildSync('process surge ruleset', () => { const surgeContent = childSpan.traceChildSync('process surge ruleset', () => {
let _surgeContent; let _surgeContent;
switch (type) { switch (type) {

View File

@ -1,8 +1,6 @@
import { createTrie, type Trie } from './trie'; import { createTrie, type Trie } from './trie';
export function domainDeduper(inputDomains: string[] | Trie, toArray?: true): string[]; export function domainsetDeduper(inputDomains: string[] | Trie): string[] {
export function domainDeduper(inputDomains: string[] | Trie, toArray: false): Set<string>;
export function domainDeduper(inputDomains: string[] | Trie, toArray = true): string[] | Set<string> {
let trie: Trie; let trie: Trie;
if (Array.isArray(inputDomains)) { if (Array.isArray(inputDomains)) {
trie = createTrie(inputDomains, true); trie = createTrie(inputDomains, true);
@ -12,28 +10,5 @@ export function domainDeduper(inputDomains: string[] | Trie, toArray = true): st
throw new Error('Invalid trie'); throw new Error('Invalid trie');
} }
const dumped = trie.dump(); return trie.dump();
if (toArray) {
return dumped;
}
return new Set(dumped);
// const trie = createTrie(inputDomains, true);
// const sets = new Set(inputDomains);
// for (let i = 0, len1 = inputDomains.length; i < len1; i++) {
// const d = inputDomains[i];
// if (d[0] !== '.') {
// continue;
// }
// trie.substractSetInPlaceFromFound(d, sets);
// sets.delete(d.slice(1));
// }
// if (toArray) {
// return Array.from(sets);
// }
// return sets;
} }

View File

@ -7,10 +7,11 @@ import { inspect } from 'node:util';
const noop = () => { /** noop */ }; const noop = () => { /** noop */ };
type TrieNode = [ type TrieNode<Meta = any> = [
boolean, /** sentinel */ boolean, /** sentinel */
TrieNode | null, /** parent */ TrieNode | null, /** parent */
Map<string, TrieNode> /** children */ Map<string, TrieNode>, /** children */
Meta /** meta */
]; ];
const deepTrieNodeToJSON = (node: TrieNode) => { const deepTrieNodeToJSON = (node: TrieNode) => {
@ -18,14 +19,17 @@ const deepTrieNodeToJSON = (node: TrieNode) => {
if (node[0]) { if (node[0]) {
obj['[start]'] = node[0]; obj['[start]'] = node[0];
} }
if (node[3] !== undefined) {
obj['[meta]'] = node[3];
}
node[2].forEach((value, key) => { node[2].forEach((value, key) => {
obj[key] = deepTrieNodeToJSON(value); obj[key] = deepTrieNodeToJSON(value);
}); });
return obj; return obj;
}; };
const createNode = (parent: TrieNode | null = null): TrieNode => { const createNode = <Meta = any>(parent: TrieNode | null = null, meta: Meta | null = null): TrieNode => {
return [false, parent, new Map<string, TrieNode>()] as TrieNode; return [false, parent, new Map<string, TrieNode>(), meta] as TrieNode<Meta>;
}; };
export const hostnameToTokens = (hostname: string): string[] => { export const hostnameToTokens = (hostname: string): string[] => {
@ -72,16 +76,16 @@ const walkHostnameTokens = (hostname: string, onToken: (token: string) => boolea
return false; return false;
}; };
export const createTrie = (from?: string[] | Set<string> | null, smolTree = false) => { export const createTrie = <Meta = any>(from?: string[] | Set<string> | null, smolTree = false) => {
let size = 0; let size = 0;
const root: TrieNode = createNode(); const root: TrieNode<Meta> = createNode();
/** /**
* Method used to add the given suffix to the trie. * Method used to add the given suffix to the trie.
*/ */
const add = smolTree const add = smolTree
? (suffix: string): void => { ? (suffix: string, meta?: Meta): void => {
let node: TrieNode = root; let node: TrieNode<Meta> = root;
const onToken = (token: string) => { const onToken = (token: string) => {
if (node[2].has(token)) { if (node[2].has(token)) {
@ -98,6 +102,7 @@ export const createTrie = (from?: string[] | Set<string> | null, smolTree = fals
node = newNode; node = newNode;
} }
node[3] = meta!;
return false; return false;
}; };
@ -128,8 +133,8 @@ export const createTrie = (from?: string[] | Set<string> | null, smolTree = fals
node[0] = true; node[0] = true;
} }
: (suffix: string): void => { : (suffix: string, meta?: Meta): void => {
let node: TrieNode = root; let node: TrieNode<Meta> = root;
const onToken = (token: string) => { const onToken = (token: string) => {
if (node[2].has(token)) { if (node[2].has(token)) {
@ -140,6 +145,7 @@ export const createTrie = (from?: string[] | Set<string> | null, smolTree = fals
node = newNode; node = newNode;
} }
node[3] = meta!;
return false; return false;
}; };
@ -221,15 +227,15 @@ export const createTrie = (from?: string[] | Set<string> | null, smolTree = fals
}; };
const walk = ( const walk = (
onMatches: (suffix: string[]) => void, onMatches: (suffix: string[], meta: Meta) => void,
initialNode = root, initialNode = root,
initialSuffix: string[] = [] initialSuffix: string[] = []
) => { ) => {
const nodeStack: TrieNode[] = [initialNode]; const nodeStack: Array<TrieNode<Meta>> = [initialNode];
// Resolving initial string (begin the start of the stack) // Resolving initial string (begin the start of the stack)
const suffixStack: string[][] = [initialSuffix]; const suffixStack: string[][] = [initialSuffix];
let node: TrieNode = root; let node: TrieNode<Meta> = root;
do { do {
node = nodeStack.pop()!; node = nodeStack.pop()!;
@ -244,7 +250,7 @@ export const createTrie = (from?: string[] | Set<string> | null, smolTree = fals
// If the node is a sentinel, we push the suffix to the results // If the node is a sentinel, we push the suffix to the results
if (node[0]) { if (node[0]) {
onMatches(suffix); onMatches(suffix, node[3]);
} }
} while (nodeStack.length); } while (nodeStack.length);
}; };
@ -383,6 +389,16 @@ export const createTrie = (from?: string[] | Set<string> | null, smolTree = fals
return results; return results;
}; };
/**
 * Collects every match reported by `walk` as a `[joined suffix, meta]` pair.
 *
 * The suffix tokens passed by `walk` are concatenated with an empty separator;
 * the meta value is forwarded unchanged.
 */
const dumpWithMeta = () => {
  const entries: Array<[string, Meta]> = [];

  const collect = (tokens: string[], meta: Meta): void => {
    entries.push([fastStringArrayJoin(tokens, ''), meta]);
  };
  walk(collect);

  return entries;
};
const whitelist = (suffix: string) => { const whitelist = (suffix: string) => {
if (!smolTree) { if (!smolTree) {
throw new Error('whitelist method is only available in smolTree mode.'); throw new Error('whitelist method is only available in smolTree mode.');
@ -428,7 +444,7 @@ export const createTrie = (from?: string[] | Set<string> | null, smolTree = fals
add(from[i]); add(from[i]);
} }
} else if (from) { } else if (from) {
from.forEach(add); from.forEach((value) => add(value));
} }
return { return {
@ -440,6 +456,7 @@ export const createTrie = (from?: string[] | Set<string> | null, smolTree = fals
delete: remove, delete: remove,
has, has,
dump, dump,
dumpWithMeta,
get size() { get size() {
if (smolTree) { if (smolTree) {
throw new Error('A Trie with smolTree enabled cannot have correct size!'); throw new Error('A Trie with smolTree enabled cannot have correct size!');
@ -460,5 +477,3 @@ export const createTrie = (from?: string[] | Set<string> | null, smolTree = fals
}; };
export type Trie = ReturnType<typeof createTrie>; export type Trie = ReturnType<typeof createTrie>;
export default createTrie;