Chore: minor changes

This commit is contained in:
SukkaW 2024-05-27 02:42:56 +08:00
parent eb0623c1a9
commit efa34399b0
7 changed files with 94 additions and 68 deletions

View File

@ -9,7 +9,7 @@ import { createRuleset, compareAndWriteFile } from './lib/create-file';
import { domainDeduper } from './lib/domain-deduper'; import { domainDeduper } from './lib/domain-deduper';
import createKeywordFilter from './lib/aho-corasick'; import createKeywordFilter from './lib/aho-corasick';
import { readFileByLine, readFileIntoProcessedArray } from './lib/fetch-text-by-line'; import { readFileByLine, readFileIntoProcessedArray } from './lib/fetch-text-by-line';
import { sortDomains } from './lib/stable-sort-domain'; import { buildParseDomainMap, sortDomains } from './lib/stable-sort-domain';
import { task } from './trace'; import { task } from './trace';
// tldts-experimental is way faster than tldts, but very little bit inaccurate // tldts-experimental is way faster than tldts, but very little bit inaccurate
// (since it is hashes based). But the result is still deterministic, which is // (since it is hashes based). But the result is still deterministic, which is
@ -21,6 +21,10 @@ import { getPhishingDomains } from './lib/get-phishing-domains';
import { subtract as SetSubstract } from 'mnemonist/set'; import { subtract as SetSubstract } from 'mnemonist/set';
import { setAddFromArray, setAddFromArrayCurried } from './lib/set-add-from-array'; import { setAddFromArray, setAddFromArrayCurried } from './lib/set-add-from-array';
import { sort } from './lib/timsort'; import { sort } from './lib/timsort';
import { looseTldtsOpt } from './constants/loose-tldts-opt';
import { build } from 'bun';
// Kick off the read of reject_sukka.conf as soon as this module is evaluated,
// instead of when the build task runs; the resulting promise is consumed later
// inside buildRejectDomainSet via `.then(appendArrayToDomainSets)`.
const getRejectSukkaConfPromise = readFileIntoProcessedArray(path.resolve(import.meta.dir, '../Source/domainset/reject_sukka.conf'));
export const buildRejectDomainSet = task(import.meta.path, async (span) => { export const buildRejectDomainSet = task(import.meta.path, async (span) => {
/** Whitelists */ /** Whitelists */
@ -37,11 +41,9 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
let shouldStop = false; let shouldStop = false;
await Promise.all([ await Promise.all([
// Parse from remote hosts & domain lists // Parse from remote hosts & domain lists
...HOSTS.map(entry => processHosts(childSpan, ...entry).then(appendArrayToDomainSets)), HOSTS.map(entry => processHosts(childSpan, ...entry).then(appendArrayToDomainSets)),
DOMAIN_LISTS.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToDomainSets)),
...DOMAIN_LISTS.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToDomainSets)), ADGUARD_FILTERS.map(
...ADGUARD_FILTERS.map(
input => processFilterRules(childSpan, ...input) input => processFilterRules(childSpan, ...input)
.then(({ white, black, foundDebugDomain }) => { .then(({ white, black, foundDebugDomain }) => {
if (foundDebugDomain) { if (foundDebugDomain) {
@ -53,7 +55,7 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
setAddFromArray(domainSets, black); setAddFromArray(domainSets, black);
}) })
), ),
...([ ([
'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exceptions.txt', 'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exceptions.txt',
'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exclusions.txt' 'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exclusions.txt'
].map( ].map(
@ -64,9 +66,8 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
}) })
)), )),
getPhishingDomains(childSpan).then(appendArrayToDomainSets), getPhishingDomains(childSpan).then(appendArrayToDomainSets),
childSpan.traceChildAsync('process reject_sukka.conf', () => readFileIntoProcessedArray(path.resolve(import.meta.dir, '../Source/domainset/reject_sukka.conf')) getRejectSukkaConfPromise.then(appendArrayToDomainSets)
.then(appendArrayToDomainSets)) ].flat());
]);
// eslint-disable-next-line sukka/no-single-return -- not single return // eslint-disable-next-line sukka/no-single-return -- not single return
return shouldStop; return shouldStop;
}); });
@ -107,30 +108,31 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
}); });
}); });
const trie = span.traceChildSync('dedupe from white suffixes', () => { const trie = span.traceChildSync('create smol trie', () => createTrie(domainSets, true, true));
const trie = createTrie(domainSets, true, true);
filterRuleWhitelistDomainSets.forEach(trie.whitelist); span.traceChildSync('dedupe from white suffixes', () => filterRuleWhitelistDomainSets.forEach(trie.whitelist));
return trie;
});
// Dedupe domainSets // Dedupe domainSets
const dudupedDominArray = span.traceChildSync('dedupe from covered subdomain', () => domainDeduper(trie)); const dudupedDominArray = span.traceChildSync('dedupe from covered subdomain', () => domainDeduper(trie));
console.log(`Final size ${dudupedDominArray.length}`); console.log(`Final size ${dudupedDominArray.length}`);
const {
domainMap: domainArrayMainDomainMap,
subdomainMap: domainArraySubdomainMap
} = span.traceChildSync(
'build map for stat and sort',
() => buildParseDomainMap(dudupedDominArray)
);
// Create reject stats // Create reject stats
const rejectDomainsStats: Array<[string, number]> = span const rejectDomainsStats: Array<[string, number]> = span
.traceChild('create reject stats') .traceChild('create reject stats')
.traceSyncFn(() => { .traceSyncFn(() => {
const tldtsOpt = { allowPrivateDomains: false, detectIp: false, validateHostname: false };
const statMap = dudupedDominArray.reduce<Map<string, number>>((acc, cur) => { const statMap = dudupedDominArray.reduce<Map<string, number>>((acc, cur) => {
const suffix = tldts.getDomain(cur, tldtsOpt); const suffix = domainArrayMainDomainMap.get(cur);
if (!suffix) return acc; if (suffix) {
acc.set(suffix, (acc.get(suffix) ?? 0) + 1);
if (acc.has(suffix)) {
acc.set(suffix, acc.get(suffix)! + 1);
} else {
acc.set(suffix, 1);
} }
return acc; return acc;
}, new Map()); }, new Map());
@ -157,7 +159,7 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
'Sukka\'s Ruleset - Reject Base', 'Sukka\'s Ruleset - Reject Base',
description, description,
new Date(), new Date(),
span.traceChildSync('sort reject domainset', () => sortDomains(dudupedDominArray)), span.traceChildSync('sort reject domainset', () => sortDomains(dudupedDominArray, domainArrayMainDomainMap, domainArraySubdomainMap)),
'domainset', 'domainset',
path.resolve(import.meta.dir, '../List/domainset/reject.conf'), path.resolve(import.meta.dir, '../List/domainset/reject.conf'),
path.resolve(import.meta.dir, '../Clash/domainset/reject.txt') path.resolve(import.meta.dir, '../Clash/domainset/reject.txt')

View File

@ -0,0 +1,9 @@
import type * as tldts from 'tldts';
/**
 * Shared "loose" option bag handed to the tldts / tldts-experimental helpers
 * (`getDomain`, `getSubdomain`, `getPublicSuffix`) so that every caller parses
 * hostnames with the same settings.
 *
 * Every flag is switched off.
 * NOTE(review): presumably this relies on inputs already being bare,
 * normalized hostnames rather than URLs or IPs; confirm against the tldts
 * option documentation before feeding it untrusted input.
 */
export const looseTldtsOpt: Parameters<typeof tldts.getSubdomain>[1] = {
  detectIp: false,
  mixedInputs: false,
  extractHostname: false,
  validateHostname: false,
  allowPrivateDomains: false
};

View File

@ -1,8 +1,8 @@
interface Node { interface Node {
/** @default false */ /** @default false */
wordEnd?: boolean, wordEnd: boolean,
children: Map<string, Node | undefined>, children: Map<string, Node | undefined>,
fail?: Node fail: Node | undefined
} }
const createNode = (): Node => ({ const createNode = (): Node => ({

View File

@ -5,6 +5,7 @@ import { getSubdomain, getPublicSuffix } from 'tldts-experimental';
import type { Span } from '../trace'; import type { Span } from '../trace';
import { appendArrayInPlaceCurried } from './append-array-in-place'; import { appendArrayInPlaceCurried } from './append-array-in-place';
import { PHISHING_DOMAIN_LISTS } from './reject-data-source'; import { PHISHING_DOMAIN_LISTS } from './reject-data-source';
import { looseTldtsOpt } from '../constants/loose-tldts-opt';
const BLACK_TLD = new Set([ const BLACK_TLD = new Set([
'accountant', 'accountant',
@ -99,14 +100,6 @@ export const WHITELIST_MAIN_DOMAINS = new Set([
'notion.site' 'notion.site'
]); ]);
const tldtsOpt: Parameters<typeof getSubdomain>[1] = {
allowPrivateDomains: false,
extractHostname: false,
validateHostname: false,
detectIp: false,
mixedInputs: false
};
export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('get phishing domains').traceAsyncFn(async (span) => { export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('get phishing domains').traceAsyncFn(async (span) => {
const gorhill = await getGorhillPublicSuffixPromise(); const gorhill = await getGorhillPublicSuffixPromise();
@ -132,7 +125,7 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
continue; continue;
} }
const tld = getPublicSuffix(safeGorhillLine, tldtsOpt); const tld = getPublicSuffix(safeGorhillLine, looseTldtsOpt);
if (!tld || !BLACK_TLD.has(tld)) continue; if (!tld || !BLACK_TLD.has(tld)) continue;
domainCountMap[apexDomain] ||= 0; domainCountMap[apexDomain] ||= 0;
@ -187,7 +180,7 @@ export function calcDomainAbuseScore(line: string) {
} }
} }
const subdomain = getSubdomain(line, tldtsOpt); const subdomain = getSubdomain(line, looseTldtsOpt);
if (subdomain) { if (subdomain) {
if (subdomain.slice(1).includes('.')) { if (subdomain.slice(1).includes('.')) {

View File

@ -7,6 +7,7 @@ export const normalizeDomain = (domain: string) => {
const parsed = tldtsParse(domain, { allowPrivateDomains: true, detectIp: false }); const parsed = tldtsParse(domain, { allowPrivateDomains: true, detectIp: false });
// if (parsed.isIp) return null; // if (parsed.isIp) return null;
if (!parsed.hostname) return null; if (!parsed.hostname) return null;
// Private invalid domain (things like .tor, .dn42, etc)
if (!parsed.isIcann && !parsed.isPrivate) return null; if (!parsed.isIcann && !parsed.isPrivate) return null;
let h = parsed.hostname; let h = parsed.hostname;

View File

@ -3,32 +3,51 @@
// enough when sorting. // enough when sorting.
import { getDomain, getSubdomain } from 'tldts-experimental'; import { getDomain, getSubdomain } from 'tldts-experimental';
import { sort } from './timsort'; import { sort } from './timsort';
import { looseTldtsOpt } from '../constants/loose-tldts-opt';
export const compare = (a: string, b: string) => { export const compare = (a: string, b: string) => {
if (a === b) return 0; if (a === b) return 0;
return (a.length - b.length) || a.localeCompare(b); return (a.length - b.length) || a.localeCompare(b);
}; };
const tldtsOpt: Parameters<typeof getDomain>[1] = { export const buildParseDomainMap = (inputs: string[]) => {
allowPrivateDomains: false,
extractHostname: false,
validateHostname: false,
detectIp: false,
mixedInputs: false
};
export const sortDomains = (inputs: string[]) => {
const domainMap = new Map<string, string>(); const domainMap = new Map<string, string>();
const subdomainMap = new Map<string, string>(); const subdomainMap = new Map<string, string>();
for (let i = 0, len = inputs.length; i < len; i++) { for (let i = 0, len = inputs.length; i < len; i++) {
const cur = inputs[i]; const cur = inputs[i];
if (!domainMap.has(cur)) { if (!domainMap.has(cur)) {
const topD = getDomain(cur, tldtsOpt); const topD = getDomain(cur, looseTldtsOpt);
domainMap.set(cur, topD ?? cur); domainMap.set(cur, topD ?? cur);
} }
if (!subdomainMap.has(cur)) { if (!subdomainMap.has(cur)) {
const subD = getSubdomain(cur, tldtsOpt); const subD = getSubdomain(cur, looseTldtsOpt);
subdomainMap.set(cur, subD ?? cur);
}
}
return { domainMap, subdomainMap };
};
export const sortDomains = (
inputs: string[],
domainMap?: Map<string, string>,
subdomainMap?: Map<string, string>
) => {
if (!domainMap || !subdomainMap) {
const { domainMap: dm, subdomainMap: sm } = buildParseDomainMap(inputs);
domainMap = dm;
subdomainMap = sm;
}
for (let i = 0, len = inputs.length; i < len; i++) {
const cur = inputs[i];
if (!domainMap.has(cur)) {
const topD = getDomain(cur, looseTldtsOpt);
domainMap.set(cur, topD ?? cur);
}
if (!subdomainMap.has(cur)) {
const subD = getSubdomain(cur, looseTldtsOpt);
subdomainMap.set(cur, subD ?? cur); subdomainMap.set(cur, subD ?? cur);
} }
} }

View File

@ -36,32 +36,34 @@ const createNode = (parent: TrieNode | null = null): TrieNode => {
return node; return node;
}; };
/**
 * Splits a hostname into the token sequence used as trie keys in hostname mode.
 *
 * Each label becomes one token and each `.` becomes its own `'.'` token.
 * Empty labels are never emitted, so leading, trailing and consecutive dots
 * produce only `'.'` tokens.
 *
 * @param hostname - the hostname (or suffix) to tokenize
 * @returns labels and dot separators, in their original left-to-right order
 */
const hostnameToTokens = (hostname: string): string[] => {
  const tokens: string[] = [];
  // Index of the first character of the label currently being scanned.
  let labelStart = 0;
  let dotAt = hostname.indexOf('.', labelStart);

  while (dotAt !== -1) {
    // Only emit the label when there is at least one character before the dot.
    if (dotAt > labelStart) {
      tokens.push(hostname.slice(labelStart, dotAt));
    }
    tokens.push('.');
    labelStart = dotAt + 1;
    dotAt = hostname.indexOf('.', labelStart);
  }

  // Trailing label after the last dot (or the whole input when it has no dot).
  if (labelStart < hostname.length) {
    tokens.push(hostname.slice(labelStart));
  }

  return tokens;
};
export const createTrie = (from?: string[] | Set<string> | null, hostnameMode = false, smolTree = false) => { export const createTrie = (from?: string[] | Set<string> | null, hostnameMode = false, smolTree = false) => {
let size = 0; let size = 0;
const root: TrieNode = createNode(); const root: TrieNode = createNode();
const suffixToTokens = hostnameMode const suffixToTokens = hostnameMode
? (suffix: string) => { ? hostnameToTokens
let buf = '';
const tokens: string[] = [];
for (let i = 0, l = suffix.length; i < l; i++) {
const c = suffix[i];
if (c === '.') {
if (buf) {
tokens.push(buf, /* . */ c);
buf = '';
} else {
tokens.push(/* . */ c);
}
} else {
buf += c;
}
}
if (buf) {
tokens.push(buf);
}
return tokens;
}
: (suffix: string) => suffix; : (suffix: string) => suffix;
/** /**