Perf: use tldts-experimental when possible

This commit is contained in:
SukkaW 2024-05-26 04:32:08 +08:00
parent db6786a0d6
commit 1288460c48
9 changed files with 32 additions and 24 deletions

View File

@ -11,7 +11,10 @@ import createKeywordFilter from './lib/aho-corasick';
import { readFileByLine, readFileIntoProcessedArray } from './lib/fetch-text-by-line'; import { readFileByLine, readFileIntoProcessedArray } from './lib/fetch-text-by-line';
import { sortDomains } from './lib/stable-sort-domain'; import { sortDomains } from './lib/stable-sort-domain';
import { task } from './trace'; import { task } from './trace';
import * as tldts from 'tldts'; // tldts-experimental is much faster than tldts, but slightly less accurate
// (since it is hash-based). The result is still deterministic, though, which is
// enough when creating simple stats of reject hosts.
import * as tldts from 'tldts-experimental';
import { SHARED_DESCRIPTION } from './lib/constants'; import { SHARED_DESCRIPTION } from './lib/constants';
import { getPhishingDomains } from './lib/get-phishing-domains'; import { getPhishingDomains } from './lib/get-phishing-domains';

View File

@ -1,7 +1,7 @@
import path from 'path'; import path from 'path';
import { task } from './trace'; import { task } from './trace';
import { compareAndWriteFile } from './lib/create-file'; import { compareAndWriteFile } from './lib/create-file';
import * as tldts from 'tldts'; import { getHostname } from 'tldts';
function escapeRegExp(string = '') { function escapeRegExp(string = '') {
const reRegExpChar = /[$()*+.?[\\\]^{|}]/g; const reRegExpChar = /[$()*+.?[\\\]^{|}]/g;
@ -122,7 +122,7 @@ export const buildRedirectModule = task(import.meta.path, async (span) => {
const domains = Array.from( const domains = Array.from(
new Set( new Set(
[ [
...REDIRECT_MIRROR.map(([from]) => tldts.getHostname(from, { detectIp: false })), ...REDIRECT_MIRROR.map(([from]) => getHostname(from, { detectIp: false })),
...REDIRECT_FAKEWEBSITES.flatMap(([from]) => [from, `www.${from}`]) ...REDIRECT_FAKEWEBSITES.flatMap(([from]) => [from, `www.${from}`])
] ]
) )

View File

@ -4,11 +4,10 @@ import { createRuleset } from './lib/create-file';
import { sortDomains } from './lib/stable-sort-domain'; import { sortDomains } from './lib/stable-sort-domain';
import { Sema } from 'async-sema'; import { Sema } from 'async-sema';
import * as tldts from 'tldts'; import { getHostname } from 'tldts';
import { task } from './trace'; import { task } from './trace';
import { fetchWithRetry } from './lib/fetch-retry'; import { fetchWithRetry } from './lib/fetch-retry';
import { SHARED_DESCRIPTION } from './lib/constants'; import { SHARED_DESCRIPTION } from './lib/constants';
import { getGorhillPublicSuffixPromise } from './lib/get-gorhill-publicsuffix';
import picocolors from 'picocolors'; import picocolors from 'picocolors';
import { fetchRemoteTextByLine } from './lib/fetch-text-by-line'; import { fetchRemoteTextByLine } from './lib/fetch-text-by-line';
import { processLine } from './lib/process-line'; import { processLine } from './lib/process-line';
@ -64,7 +63,7 @@ const querySpeedtestApi = async (keyword: string): Promise<Array<string | null>>
} }
})).then(r => r.json()).then((data: Array<{ url: string }>) => data.reduce<string[]>( })).then(r => r.json()).then((data: Array<{ url: string }>) => data.reduce<string[]>(
(prev, cur) => { (prev, cur) => {
const hn = tldts.getHostname(cur.url, { detectIp: false }); const hn = getHostname(cur.url, { detectIp: false });
if (hn) { if (hn) {
prev.push(hn); prev.push(hn);
} }

View File

@ -1,6 +1,6 @@
import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix'; import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
import { processDomainLists } from './parse-filter'; import { processDomainLists } from './parse-filter';
import * as tldts from 'tldts'; import { getSubdomain } from 'tldts';
import { TTL } from './cache-filesystem'; import { TTL } from './cache-filesystem';
import { add as SetAdd } from 'mnemonist/set'; import { add as SetAdd } from 'mnemonist/set';
@ -177,7 +177,7 @@ export function calcDomainAbuseScore(line: string) {
} }
} }
const subdomain = tldts.getSubdomain(line, { detectIp: false }); const subdomain = getSubdomain(line, { detectIp: false });
if (subdomain) { if (subdomain) {
if (subdomain.slice(1).includes('.')) { if (subdomain.slice(1).includes('.')) {

View File

@ -1,10 +1,10 @@
import * as tldts from 'tldts'; import { parse as tldtsParse } from 'tldts';
import { isProbablyIpv4 } from './is-fast-ip'; import { isProbablyIpv4 } from './is-fast-ip';
export const normalizeDomain = (domain: string) => { export const normalizeDomain = (domain: string) => {
if (!domain) return null; if (!domain) return null;
if (isProbablyIpv4(domain)) return null; if (isProbablyIpv4(domain)) return null;
const parsed = tldts.parse(domain, { allowPrivateDomains: true, detectIp: false }); const parsed = tldtsParse(domain, { allowPrivateDomains: true, detectIp: false });
// if (parsed.isIp) return null; // if (parsed.isIp) return null;
if (!parsed.hostname) return null; if (!parsed.hostname) return null;
if (!parsed.isIcann && !parsed.isPrivate) return null; if (!parsed.isIcann && !parsed.isPrivate) return null;

View File

@ -1,8 +1,8 @@
import { fetchRemoteTextByLine } from './fetch-text-by-line'; import { fetchRemoteTextByLine } from './fetch-text-by-line';
import { parse } from 'tldts'; import { parse as tldtsParse } from 'tldts';
const isDomainLoose = (domain: string): boolean => { const isDomainLoose = (domain: string): boolean => {
const { isIcann, isPrivate, isIp } = parse(domain); const { isIcann, isPrivate, isIp } = tldtsParse(domain);
return !!(!isIp && (isIcann || isPrivate)); return !!(!isIp && (isIcann || isPrivate));
}; };

View File

@ -1,4 +1,7 @@
import * as tldts from 'tldts'; // tldts-experimental is much faster than tldts, but slightly less accurate
// (since it is hash-based). The result is still deterministic, though, which is
// enough when sorting.
import * as tldts from 'tldts-experimental';
import { sort } from './timsort'; import { sort } from './timsort';
export const compare = (a: string, b: string) => { export const compare = (a: string, b: string) => {
@ -6,11 +9,11 @@ export const compare = (a: string, b: string) => {
return (a.length - b.length) || a.localeCompare(b); return (a.length - b.length) || a.localeCompare(b);
}; };
const tldtsOpt = { const tldtsOpt: Parameters<typeof tldts.getDomain>[1] = {
extractHostname: false,
allowPrivateDomains: false, allowPrivateDomains: false,
detectIp: false, extractHostname: false,
validateHostname: false, validateHostname: false,
detectIp: false,
mixedInputs: false mixedInputs: false
}; };
@ -36,14 +39,16 @@ export const sortDomains = (inputs: string[]) => {
const main_domain_a = domainMap.get(a)!; const main_domain_a = domainMap.get(a)!;
const main_domain_b = domainMap.get(b)!; const main_domain_b = domainMap.get(b)!;
let t = compare(main_domain_a, main_domain_b); let t = compare(
main_domain_a,
main_domain_b
) || compare(
/** subdomain_a */ subdomainMap.get(a)!,
/** subdomain_b */ subdomainMap.get(b)!
);
if (t !== 0) return t;
if (t === 0) { if (a !== main_domain_a || b !== main_domain_b) {
const subdomain_a = subdomainMap.get(a)!;
const subdomain_b = subdomainMap.get(b)!;
t = compare(subdomain_a, subdomain_b);
}
if (t === 0 && (a !== main_domain_a || b !== main_domain_b)) {
t = compare(a, b); t = compare(a, b);
} }

BIN
bun.lockb

Binary file not shown.

View File

@ -29,7 +29,8 @@
"punycode": "^2.3.1", "punycode": "^2.3.1",
"table": "^6.8.2", "table": "^6.8.2",
"tar-stream": "^3.1.7", "tar-stream": "^3.1.7",
"tldts": "^6.1.19" "tldts": "^6.1.19",
"tldts-experimental": "^6.1.21"
}, },
"devDependencies": { "devDependencies": {
"@eslint-sukka/node": "6.0.0-beta.3", "@eslint-sukka/node": "6.0.0-beta.3",