Perf: use tldts-experimental when possible

This commit is contained in:
SukkaW 2024-05-26 04:32:08 +08:00
parent db6786a0d6
commit 1288460c48
9 changed files with 32 additions and 24 deletions

View File

@ -11,7 +11,10 @@ import createKeywordFilter from './lib/aho-corasick';
import { readFileByLine, readFileIntoProcessedArray } from './lib/fetch-text-by-line';
import { sortDomains } from './lib/stable-sort-domain';
import { task } from './trace';
import * as tldts from 'tldts';
// tldts-experimental is much faster than tldts, but slightly less accurate
// (since it is hash-based). The result is still deterministic, though, which
// is enough when creating a simple stat of reject hosts.
import * as tldts from 'tldts-experimental';
import { SHARED_DESCRIPTION } from './lib/constants';
import { getPhishingDomains } from './lib/get-phishing-domains';

View File

@ -1,7 +1,7 @@
import path from 'path';
import { task } from './trace';
import { compareAndWriteFile } from './lib/create-file';
import * as tldts from 'tldts';
import { getHostname } from 'tldts';
function escapeRegExp(string = '') {
const reRegExpChar = /[$()*+.?[\\\]^{|}]/g;
@ -122,7 +122,7 @@ export const buildRedirectModule = task(import.meta.path, async (span) => {
const domains = Array.from(
new Set(
[
...REDIRECT_MIRROR.map(([from]) => tldts.getHostname(from, { detectIp: false })),
...REDIRECT_MIRROR.map(([from]) => getHostname(from, { detectIp: false })),
...REDIRECT_FAKEWEBSITES.flatMap(([from]) => [from, `www.${from}`])
]
)

View File

@ -4,11 +4,10 @@ import { createRuleset } from './lib/create-file';
import { sortDomains } from './lib/stable-sort-domain';
import { Sema } from 'async-sema';
import * as tldts from 'tldts';
import { getHostname } from 'tldts';
import { task } from './trace';
import { fetchWithRetry } from './lib/fetch-retry';
import { SHARED_DESCRIPTION } from './lib/constants';
import { getGorhillPublicSuffixPromise } from './lib/get-gorhill-publicsuffix';
import picocolors from 'picocolors';
import { fetchRemoteTextByLine } from './lib/fetch-text-by-line';
import { processLine } from './lib/process-line';
@ -64,7 +63,7 @@ const querySpeedtestApi = async (keyword: string): Promise<Array<string | null>>
}
})).then(r => r.json()).then((data: Array<{ url: string }>) => data.reduce<string[]>(
(prev, cur) => {
const hn = tldts.getHostname(cur.url, { detectIp: false });
const hn = getHostname(cur.url, { detectIp: false });
if (hn) {
prev.push(hn);
}

View File

@ -1,6 +1,6 @@
import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
import { processDomainLists } from './parse-filter';
import * as tldts from 'tldts';
import { getSubdomain } from 'tldts';
import { TTL } from './cache-filesystem';
import { add as SetAdd } from 'mnemonist/set';
@ -177,7 +177,7 @@ export function calcDomainAbuseScore(line: string) {
}
}
const subdomain = tldts.getSubdomain(line, { detectIp: false });
const subdomain = getSubdomain(line, { detectIp: false });
if (subdomain) {
if (subdomain.slice(1).includes('.')) {

View File

@ -1,10 +1,10 @@
import * as tldts from 'tldts';
import { parse as tldtsParse } from 'tldts';
import { isProbablyIpv4 } from './is-fast-ip';
export const normalizeDomain = (domain: string) => {
if (!domain) return null;
if (isProbablyIpv4(domain)) return null;
const parsed = tldts.parse(domain, { allowPrivateDomains: true, detectIp: false });
const parsed = tldtsParse(domain, { allowPrivateDomains: true, detectIp: false });
// if (parsed.isIp) return null;
if (!parsed.hostname) return null;
if (!parsed.isIcann && !parsed.isPrivate) return null;

View File

@ -1,8 +1,8 @@
import { fetchRemoteTextByLine } from './fetch-text-by-line';
import { parse } from 'tldts';
import { parse as tldtsParse } from 'tldts';
const isDomainLoose = (domain: string): boolean => {
const { isIcann, isPrivate, isIp } = parse(domain);
const { isIcann, isPrivate, isIp } = tldtsParse(domain);
return !!(!isIp && (isIcann || isPrivate));
};

View File

@ -1,4 +1,7 @@
import * as tldts from 'tldts';
// tldts-experimental is much faster than tldts, but slightly less accurate
// (since it is hash-based). The result is still deterministic, though, which
// is enough when sorting.
import * as tldts from 'tldts-experimental';
import { sort } from './timsort';
export const compare = (a: string, b: string) => {
@ -6,11 +9,11 @@ export const compare = (a: string, b: string) => {
return (a.length - b.length) || a.localeCompare(b);
};
const tldtsOpt = {
extractHostname: false,
const tldtsOpt: Parameters<typeof tldts.getDomain>[1] = {
allowPrivateDomains: false,
detectIp: false,
extractHostname: false,
validateHostname: false,
detectIp: false,
mixedInputs: false
};
@ -36,14 +39,16 @@ export const sortDomains = (inputs: string[]) => {
const main_domain_a = domainMap.get(a)!;
const main_domain_b = domainMap.get(b)!;
let t = compare(main_domain_a, main_domain_b);
let t = compare(
main_domain_a,
main_domain_b
) || compare(
/** subdomain_a */ subdomainMap.get(a)!,
/** subdomain_b */ subdomainMap.get(b)!
);
if (t !== 0) return t;
if (t === 0) {
const subdomain_a = subdomainMap.get(a)!;
const subdomain_b = subdomainMap.get(b)!;
t = compare(subdomain_a, subdomain_b);
}
if (t === 0 && (a !== main_domain_a || b !== main_domain_b)) {
if (a !== main_domain_a || b !== main_domain_b) {
t = compare(a, b);
}

BIN
bun.lockb

Binary file not shown.

View File

@ -29,7 +29,8 @@
"punycode": "^2.3.1",
"table": "^6.8.2",
"tar-stream": "^3.1.7",
"tldts": "^6.1.19"
"tldts": "^6.1.19",
"tldts-experimental": "^6.1.21"
},
"devDependencies": {
"@eslint-sukka/node": "6.0.0-beta.3",