Chore: dedupe and sort other rulesets

This commit is contained in:
SukkaW 2024-09-08 01:28:54 +08:00
parent d4ee25e75a
commit 90079b9987
8 changed files with 110 additions and 91 deletions

View File

@ -5,7 +5,7 @@ import { createTrie } from './lib/trie';
import { task } from './trace';
import { SHARED_DESCRIPTION } from './lib/constants';
import { getPublicSuffixListTextPromise } from './lib/download-publicsuffixlist';
import { domainDeduper } from './lib/domain-deduper';
import { domainsetDeduper } from './lib/domain-deduper';
import { appendArrayInPlace } from './lib/append-array-in-place';
import { sortDomains } from './lib/stable-sort-domain';
import { output } from './lib/misc';
@ -76,7 +76,7 @@ export const buildCdnDownloadConf = task(require.main === module, __filename)(as
'This file contains object storage and static assets CDN domains.'
],
new Date(),
sortDomains(domainDeduper(cdnDomainsList)),
sortDomains(domainsetDeduper(cdnDomainsList)),
'domainset',
output('cdn', 'domainset')
),
@ -89,7 +89,7 @@ export const buildCdnDownloadConf = task(require.main === module, __filename)(as
'This file contains domains for software updating & large file hosting.'
],
new Date(),
sortDomains(domainDeduper(downloadDomainSet)),
sortDomains(domainsetDeduper(downloadDomainSet)),
'domainset',
output('download', 'domainset')
)

View File

@ -4,7 +4,7 @@ import * as path from 'node:path';
import { readFileByLine } from './lib/fetch-text-by-line';
import { processLine } from './lib/process-line';
import { createRuleset } from './lib/create-file';
import { domainDeduper } from './lib/domain-deduper';
import { domainsetDeduper } from './lib/domain-deduper';
import type { Span } from './trace';
import { task } from './trace';
import { SHARED_DESCRIPTION } from './lib/constants';
@ -116,7 +116,7 @@ function transformDomainset(parentSpan: Span, sourcePath: string, relativePath:
const clashFileBasename = relativePath.slice(0, -path.extname(relativePath).length);
const [title, descriptions, lines] = res;
const deduped = domainDeduper(lines);
const deduped = domainsetDeduper(lines);
let description: string[];
if (descriptions.length) {

View File

@ -7,7 +7,7 @@ import { createTrie } from './lib/trie';
import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS, HOSTS_EXTRA, DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_EXTRA, PHISHING_DOMAIN_LISTS_EXTRA } from './constants/reject-data-source';
import { createRuleset, compareAndWriteFile } from './lib/create-file';
import { domainDeduper } from './lib/domain-deduper';
import { domainsetDeduper } from './lib/domain-deduper';
import createKeywordFilter from './lib/aho-corasick';
import { readFileByLine, readFileIntoProcessedArray } from './lib/fetch-text-by-line';
import { buildParseDomainMap, sortDomains } from './lib/stable-sort-domain';
@ -149,8 +149,8 @@ export const buildRejectDomainSet = task(require.main === module, __filename)(as
});
// Dedupe domainSets
const dudupedDominArray = span.traceChildSync('dedupe from covered subdomain (base)', () => domainDeduper(baseTrie));
const dudupedDominArrayExtra = span.traceChildSync('dedupe from covered subdomain (extra)', () => domainDeduper(extraTrie));
const dudupedDominArray = span.traceChildSync('dedupe from covered subdomain (base)', () => domainsetDeduper(baseTrie));
const dudupedDominArrayExtra = span.traceChildSync('dedupe from covered subdomain (extra)', () => domainsetDeduper(extraTrie));
console.log(`Final size ${dudupedDominArray.length} + ${dudupedDominArrayExtra.length}`);

View File

@ -1,4 +1,4 @@
import { domainDeduper } from './lib/domain-deduper';
import { domainsetDeduper } from './lib/domain-deduper';
import path from 'node:path';
import { createRuleset } from './lib/create-file';
import { sortDomains } from './lib/stable-sort-domain';
@ -235,7 +235,7 @@ export const buildSpeedtestDomainSet = task(require.main === module, __filename)
}
}))));
const deduped = span.traceChildSync('sort result', () => sortDomains(domainDeduper(domainTrie)));
const deduped = span.traceChildSync('sort result', () => sortDomains(domainsetDeduper(domainTrie)));
const description = [
...SHARED_DESCRIPTION,

9
Build/lib/bitwise.ts Normal file
View File

@ -0,0 +1,9 @@
/**
 * Packs two 16-bit integers into one 32-bit integer.
 *
 * `a` is stored in the upper 16 bits and `b` in the lower 16 bits. Both
 * operands are truncated to their low 16 bits, so an out-of-range (or
 * negative) `b` can no longer spill into, and corrupt, the bits owned by `a`.
 *
 * The result is a signed 32-bit integer: it is negative whenever bit 15 of
 * `a` is set, but the underlying bit pattern is still the packed pair.
 *
 * @param a value placed in the upper 16 bits (expected range 0..0xFFFF)
 * @param b value placed in the lower 16 bits (expected range 0..0xFFFF)
 * @returns the packed signed 32-bit integer
 */
export const pack = (a: number, b: number): number => {
  // Mask each half explicitly; without the mask on `b`, any bit above bit 15
  // would be OR-ed straight into the half that belongs to `a`.
  return ((a & 0xFFFF) << 16) | (b & 0xFFFF);
};
/**
 * Unpacks two 16-bit integers from one 32-bit integer.
 *
 * @param value a packed 32-bit integer (upper half first, lower half second)
 * @returns `[a, b]` where `a` is the upper 16 bits and `b` the lower 16 bits,
 * each in the range 0..0xFFFF
 */
export const unpack = (value: number): [a: number, b: number] => {
  // Unsigned shift zero-fills from the left, so the upper half needs no mask.
  const upperHalf = value >>> 16;
  const lowerHalf = value & 0xFFFF;
  return [upperHalf, lowerHalf];
};

View File

@ -8,6 +8,8 @@ import { fastStringArrayJoin, writeFile } from './misc';
import { readFileByLine } from './fetch-text-by-line';
import stringify from 'json-stringify-pretty-compact';
import { ipCidrListToSingbox, surgeDomainsetToSingbox, surgeRulesetToSingbox } from './singbox';
import { createTrie } from './trie';
import { pack, unpack } from './bitwise';
export async function compareAndWriteFile(span: Span, linesA: string[], filePath: string) {
let isEqual = true;
@ -92,17 +94,6 @@ const withBannerArray = (title: string, description: string[] | readonly string[
];
};
/**
 * Extracts the rule type from a rule line, i.e. everything that precedes the
 * first comma.
 *
 * @param rule a single rule line such as `DOMAIN-SUFFIX,example.com`
 * @returns the text before the first comma (possibly an empty string), or
 * `null` when the line contains no comma at all
 */
const collectType = (rule: string) => {
  const firstComma = rule.indexOf(',');
  return firstComma === -1 ? null : rule.slice(0, firstComma);
};
const defaultSortTypeOrder = Symbol('defaultSortTypeOrder');
const sortTypeOrder: Record<string | typeof defaultSortTypeOrder, number> = {
DOMAIN: 1,
@ -120,33 +111,62 @@ const sortTypeOrder: Record<string | typeof defaultSortTypeOrder, number> = {
'IP-CIDR': 400,
'IP-CIDR6': 400
};
// sort DOMAIN-SUFFIX and DOMAIN first, then DOMAIN-KEYWORD, then IP-CIDR and IP-CIDR6 if any
export const sortRuleSet = (ruleSet: string[]) => {
return ruleSet.map((rule) => {
const type = collectType(rule);
if (!type) {
return [10, rule] as const;
}
if (!(type in sortTypeOrder)) {
return [sortTypeOrder[defaultSortTypeOrder], rule] as const;
}
if (type === 'URL-REGEX') {
let extraWeight = 0;
if (rule.includes('.+') || rule.includes('.*')) {
extraWeight += 10;
}
if (rule.includes('|')) {
extraWeight += 1;
}
return [
sortTypeOrder[type] + extraWeight,
rule
] as const;
const flagDomain = 1 << 2;
const flagDomainSuffix = 1 << 3;
// dedupe and sort based on rule type
const processRuleSet = (ruleSet: string[]) => {
const trie = createTrie<number>(null, true);
const sortMap: Array<[value: number, weight: number]> = [];
for (let i = 0, len = ruleSet.length; i < len; i++) {
const line = ruleSet[i];
const [type, value] = line.split(',');
let extraWeight = 0;
switch (type) {
case 'DOMAIN':
trie.add(value, pack(i, flagDomain));
break;
case 'DOMAIN-SUFFIX':
trie.add('.' + value, pack(i, flagDomainSuffix));
break;
case 'URL-REGEX':
if (value.includes('.+') || value.includes('.*')) {
extraWeight += 10;
}
if (value.includes('|')) {
extraWeight += 1;
}
sortMap.push([i, sortTypeOrder[type] + extraWeight]);
break;
case null:
sortMap.push([i, 10]);
break;
default:
if (type in sortTypeOrder) {
sortMap.push([i, sortTypeOrder[type]]);
} else {
sortMap.push([i, sortTypeOrder[defaultSortTypeOrder]]);
}
}
return [sortTypeOrder[type], rule] as const;
}).sort((a, b) => a[0] - b[0])
.map(c => c[1]);
}
const dumped = trie.dumpWithMeta();
for (let i = 0, len = dumped.length; i < len; i++) {
const [originalIndex, flag] = unpack(dumped[i][1]);
console.log(dumped[i][0], ruleSet[originalIndex]);
const type = flag === flagDomain ? 'DOMAIN' : 'DOMAIN-SUFFIX';
sortMap.push([originalIndex, sortTypeOrder[type]]);
}
return sortMap
.sort((a, b) => a[1] - b[1])
.map(c => ruleSet[c[0]]);
};
const MARK = 'this_ruleset_is_made_by_sukkaw.ruleset.skk.moe';
@ -162,7 +182,7 @@ export const createRuleset = (
_clashMrsPath?: string
]
) => parentSpan.traceChild(`create ruleset: ${path.basename(surgePath, path.extname(surgePath))}`).traceAsyncFn(async (childSpan) => {
content = sortRuleSet(content);
content = processRuleSet(content);
const surgeContent = childSpan.traceChildSync('process surge ruleset', () => {
let _surgeContent;
switch (type) {

View File

@ -1,8 +1,6 @@
import { createTrie, type Trie } from './trie';
export function domainDeduper(inputDomains: string[] | Trie, toArray?: true): string[];
export function domainDeduper(inputDomains: string[] | Trie, toArray: false): Set<string>;
export function domainDeduper(inputDomains: string[] | Trie, toArray = true): string[] | Set<string> {
export function domainsetDeduper(inputDomains: string[] | Trie): string[] {
let trie: Trie;
if (Array.isArray(inputDomains)) {
trie = createTrie(inputDomains, true);
@ -12,28 +10,5 @@ export function domainDeduper(inputDomains: string[] | Trie, toArray = true): st
throw new Error('Invalid trie');
}
const dumped = trie.dump();
if (toArray) {
return dumped;
}
return new Set(dumped);
// const trie = createTrie(inputDomains, true);
// const sets = new Set(inputDomains);
// for (let i = 0, len1 = inputDomains.length; i < len1; i++) {
// const d = inputDomains[i];
// if (d[0] !== '.') {
// continue;
// }
// trie.substractSetInPlaceFromFound(d, sets);
// sets.delete(d.slice(1));
// }
// if (toArray) {
// return Array.from(sets);
// }
// return sets;
return trie.dump();
}

View File

@ -7,10 +7,11 @@ import { inspect } from 'node:util';
const noop = () => { /** noop */ };
type TrieNode = [
type TrieNode<Meta = any> = [
boolean, /** sentinel */
TrieNode | null, /** parent */
Map<string, TrieNode> /** children */
Map<string, TrieNode>, /** children */
Meta /** meta */
];
const deepTrieNodeToJSON = (node: TrieNode) => {
@ -18,14 +19,17 @@ const deepTrieNodeToJSON = (node: TrieNode) => {
if (node[0]) {
obj['[start]'] = node[0];
}
if (node[3] !== undefined) {
obj['[meta]'] = node[3];
}
node[2].forEach((value, key) => {
obj[key] = deepTrieNodeToJSON(value);
});
return obj;
};
const createNode = (parent: TrieNode | null = null): TrieNode => {
return [false, parent, new Map<string, TrieNode>()] as TrieNode;
const createNode = <Meta = any>(parent: TrieNode | null = null, meta: Meta | null = null): TrieNode => {
return [false, parent, new Map<string, TrieNode>(), meta] as TrieNode<Meta>;
};
export const hostnameToTokens = (hostname: string): string[] => {
@ -72,16 +76,16 @@ const walkHostnameTokens = (hostname: string, onToken: (token: string) => boolea
return false;
};
export const createTrie = (from?: string[] | Set<string> | null, smolTree = false) => {
export const createTrie = <Meta = any>(from?: string[] | Set<string> | null, smolTree = false) => {
let size = 0;
const root: TrieNode = createNode();
const root: TrieNode<Meta> = createNode();
/**
* Method used to add the given suffix to the trie.
*/
const add = smolTree
? (suffix: string): void => {
let node: TrieNode = root;
? (suffix: string, meta?: Meta): void => {
let node: TrieNode<Meta> = root;
const onToken = (token: string) => {
if (node[2].has(token)) {
@ -98,6 +102,7 @@ export const createTrie = (from?: string[] | Set<string> | null, smolTree = fals
node = newNode;
}
node[3] = meta!;
return false;
};
@ -128,8 +133,8 @@ export const createTrie = (from?: string[] | Set<string> | null, smolTree = fals
node[0] = true;
}
: (suffix: string): void => {
let node: TrieNode = root;
: (suffix: string, meta?: Meta): void => {
let node: TrieNode<Meta> = root;
const onToken = (token: string) => {
if (node[2].has(token)) {
@ -140,6 +145,7 @@ export const createTrie = (from?: string[] | Set<string> | null, smolTree = fals
node = newNode;
}
node[3] = meta!;
return false;
};
@ -221,15 +227,15 @@ export const createTrie = (from?: string[] | Set<string> | null, smolTree = fals
};
const walk = (
onMatches: (suffix: string[]) => void,
onMatches: (suffix: string[], meta: Meta) => void,
initialNode = root,
initialSuffix: string[] = []
) => {
const nodeStack: TrieNode[] = [initialNode];
const nodeStack: Array<TrieNode<Meta>> = [initialNode];
// Seed the suffix stack with the initial suffix (the starting point of the walk)
const suffixStack: string[][] = [initialSuffix];
let node: TrieNode = root;
let node: TrieNode<Meta> = root;
do {
node = nodeStack.pop()!;
@ -244,7 +250,7 @@ export const createTrie = (from?: string[] | Set<string> | null, smolTree = fals
// If the node is a sentinel, we push the suffix to the results
if (node[0]) {
onMatches(suffix);
onMatches(suffix, node[3]);
}
} while (nodeStack.length);
};
@ -383,6 +389,16 @@ export const createTrie = (from?: string[] | Set<string> | null, smolTree = fals
return results;
};
/**
 * Collects every entry reported by `walk` together with its metadata.
 *
 * @returns an array of `[entry, meta]` pairs, where `entry` is the suffix
 * token list joined without any separator and `meta` is the value `walk`
 * reported alongside it
 */
const dumpWithMeta = () => {
  const entries: Array<[string, Meta]> = [];
  const collect = (tokens: string[], meta: Meta): void => {
    entries.push([fastStringArrayJoin(tokens, ''), meta]);
  };
  walk(collect);
  return entries;
};
const whitelist = (suffix: string) => {
if (!smolTree) {
throw new Error('whitelist method is only available in smolTree mode.');
@ -428,7 +444,7 @@ export const createTrie = (from?: string[] | Set<string> | null, smolTree = fals
add(from[i]);
}
} else if (from) {
from.forEach(add);
from.forEach((value) => add(value));
}
return {
@ -440,6 +456,7 @@ export const createTrie = (from?: string[] | Set<string> | null, smolTree = fals
delete: remove,
has,
dump,
dumpWithMeta,
get size() {
if (smolTree) {
throw new Error('A Trie with smolTree enabled cannot have correct size!');
@ -460,5 +477,3 @@ export const createTrie = (from?: string[] | Set<string> | null, smolTree = fals
};
export type Trie = ReturnType<typeof createTrie>;
export default createTrie;