Perf: speed-up tldts

This commit is contained in:
SukkaW 2024-05-26 17:42:48 +08:00
parent 21a31e6c1f
commit aa3cb9e586
7 changed files with 74 additions and 15 deletions

View File

@ -59,7 +59,8 @@ export const buildDomesticRuleset = task(import.meta.path, async (span) => {
`${domain} = server:${dns}`,
`*.${domain} = server:${dns}`
])
])
]
)
],
path.resolve(import.meta.dir, '../Modules/sukka_local_dns_mapping.sgmodule')
)

View File

@ -4,7 +4,6 @@ import { readFileByLine } from './lib/fetch-text-by-line';
import { sortDomains } from './lib/stable-sort-domain';
import { task } from './trace';
import { compareAndWriteFile } from './lib/create-file';
import { getGorhillPublicSuffixPromise } from './lib/get-gorhill-publicsuffix';
import { domainDeduper } from './lib/domain-deduper';
import { sort } from './lib/timsort';

View File

@ -1,5 +1,5 @@
// eslint-disable-next-line import-x/no-unresolved -- bun
import { describe, expect, it } from 'bun:test';
import { describe, it } from 'bun:test';
import { calcDomainAbuseScore } from './get-phishing-domains';

View File

@ -1,9 +1,8 @@
import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
import { processDomainLists } from './parse-filter';
import { getSubdomain } from 'tldts';
import { getSubdomain, getPublicSuffix } from 'tldts-experimental';
import { TTL } from './cache-filesystem';
import { add as SetAdd } from 'mnemonist/set';
import type { Span } from '../trace';
import { appendArrayInPlace } from './append-array-in-place';
@ -90,6 +89,14 @@ const BLACK_TLD = new Set([
'design'
]);
const tldtsOpt: Parameters<typeof getSubdomain>[1] = {
allowPrivateDomains: false,
extractHostname: false,
validateHostname: false,
detectIp: false,
mixedInputs: false
};
export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('get phishing domains').traceAsyncFn(async (span) => {
const gorhill = await getGorhillPublicSuffixPromise();
@ -117,7 +124,7 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
continue;
}
const tld = gorhill.getPublicSuffix(safeGorhillLine);
const tld = getPublicSuffix(safeGorhillLine, tldtsOpt);
if (!tld || !BLACK_TLD.has(tld)) continue;
domainCountMap[apexDomain] ||= 0;
@ -174,7 +181,7 @@ export function calcDomainAbuseScore(line: string) {
}
}
const subdomain = getSubdomain(line, { detectIp: false });
const subdomain = getSubdomain(line, tldtsOpt);
if (subdomain) {
if (subdomain.slice(1).includes('.')) {

View File

@ -1,7 +1,7 @@
// tldts-experimental is much faster than tldts, but very slightly inaccurate
// (since it is hash-based). The result is still deterministic, though, which
// is enough for sorting.
import * as tldts from 'tldts-experimental';
import { getDomain, getSubdomain } from 'tldts-experimental';
import { sort } from './timsort';
export const compare = (a: string, b: string) => {
@ -9,7 +9,7 @@ export const compare = (a: string, b: string) => {
return (a.length - b.length) || a.localeCompare(b);
};
const tldtsOpt: Parameters<typeof tldts.getDomain>[1] = {
const tldtsOpt: Parameters<typeof getDomain>[1] = {
allowPrivateDomains: false,
extractHostname: false,
validateHostname: false,
@ -24,11 +24,11 @@ export const sortDomains = (inputs: string[]) => {
for (let i = 0, len = inputs.length; i < len; i++) {
const cur = inputs[i];
if (!domainMap.has(cur)) {
const topD = tldts.getDomain(cur, tldtsOpt);
const topD = getDomain(cur, tldtsOpt);
domainMap.set(cur, topD ?? cur);
}
if (!subdomainMap.has(cur)) {
const subD = tldts.getSubdomain(cur, tldtsOpt);
const subD = getSubdomain(cur, tldtsOpt);
subdomainMap.set(cur, subD ?? cur);
}
}

53
Build/lib/tldts.bench.ts Normal file
View File

@ -0,0 +1,53 @@
import { fetchRemoteTextByLine } from './fetch-text-by-line';
import { processLineFromReadline } from './process-line';
import { bench, group, run } from 'mitata';
import * as tldts from 'tldts';
import * as tldtsExperimental from 'tldts-experimental';
import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
/**
 * Benchmark script comparing `getDomain`, `getPublicSuffix` and `getSubdomain`
 * across the gorhill public suffix implementation, `tldts`, and
 * `tldts-experimental`.
 *
 * The input is a remote domain list that is downloaded once, before any
 * benchmark is registered, so that fetching is not part of the measured work.
 */
(async () => {
  const data = await processLineFromReadline(await fetchRemoteTextByLine('https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt'));

  const gorhill = await getGorhillPublicSuffixPromise();

  // The same options object is passed to both tldts and tldts-experimental.
  const tldtsOpt: Parameters<typeof tldts.getDomain>[1] = {
    allowPrivateDomains: false,
    extractHostname: false,
    validateHostname: false,
    detectIp: false,
    mixedInputs: false
  };

  (['getDomain', 'getPublicSuffix', 'getSubdomain'] as const).forEach(methodName => {
    group(methodName, () => {
      // Only benchmark gorhill when it exposes a method with the same name.
      if (methodName in gorhill) {
        bench('gorhill', () => {
          for (let i = 0, len = data.length; i < len; i++) {
            const line = data[i];
            // gorhill is given the line with a single leading "." removed
            // NOTE(review): presumably gorhill does not accept a leading dot — confirm
            const safeGorhillLine = line[0] === '.' ? line.slice(1) : line;

            // @ts-expect-error -- type guarded
            gorhill[methodName](safeGorhillLine);
          }
        });
      }

      bench('tldts', () => {
        for (let i = 0, len = data.length; i < len; i++) {
          // eslint-disable-next-line import-x/namespace -- safe
          tldts[methodName](data[i], tldtsOpt);
        }
      });

      bench('tldts-experimental', () => {
        for (let i = 0, len = data.length; i < len; i++) {
          // eslint-disable-next-line import-x/namespace -- safe
          tldtsExperimental[methodName](data[i], tldtsOpt);
        }
      });
    });
  });

  // `run()` returns a promise; await it so a failure surfaces through this
  // async function instead of being left as a floating promise.
  await run();
})();

View File

@ -3,7 +3,6 @@ import { parse } from 'csv-parse/sync';
import { createTrie } from './lib/trie';
import path from 'path';
import { processLine } from './lib/process-line';
import { extract } from 'tar-stream';
import { extractDomainsFromFelixDnsmasq } from './lib/parse-dnsmasq';
export const parseDomesticList = async () => {