Perf: 1-pass domain parse

This commit is contained in:
SukkaW 2024-06-04 17:36:17 +08:00
parent 4438d3494f
commit 3b655f34aa
3 changed files with 10 additions and 29 deletions

View File

@ -101,9 +101,7 @@ process.on('unhandledRejection', (reason) => {
downloadMockAssetsPromise downloadMockAssetsPromise
]); ]);
await Promise.all([ await buildPublic(rootSpan);
buildPublic(rootSpan)
]);
rootSpan.stop(); rootSpan.stop();

View File

@ -1,6 +1,5 @@
import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
import { processDomainLists } from './parse-filter'; import { processDomainLists } from './parse-filter';
import { getSubdomain, getPublicSuffix } from 'tldts-experimental'; import { parse } from 'tldts-experimental';
import type { Span } from '../trace'; import type { Span } from '../trace';
import { appendArrayInPlaceCurried } from './append-array-in-place'; import { appendArrayInPlaceCurried } from './append-array-in-place';
@ -103,8 +102,6 @@ export const WHITELIST_MAIN_DOMAINS = new Set([
]); ]);
export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('get phishing domains').traceAsyncFn(async (span) => { export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('get phishing domains').traceAsyncFn(async (span) => {
const gorhill = await getGorhillPublicSuffixPromise();
const domainArr = await span.traceChildAsync('download/parse/merge phishing domains', async (curSpan) => { const domainArr = await span.traceChildAsync('download/parse/merge phishing domains', async (curSpan) => {
const domainSet: string[] = []; const domainSet: string[] = [];
@ -122,16 +119,16 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
const safeGorhillLine = line[0] === '.' ? line.slice(1) : line; const safeGorhillLine = line[0] === '.' ? line.slice(1) : line;
const apexDomain = gorhill.getDomain(safeGorhillLine); const {
if (!apexDomain) { publicSuffix: tld,
continue; domain: apexDomain,
} subdomain
} = parse(safeGorhillLine, looseTldtsOpt);
const tld = getPublicSuffix(safeGorhillLine, looseTldtsOpt); if (!tld || !apexDomain || (!BLACK_TLD.has(tld) && tld.length < 7)) continue;
if (!tld || (!BLACK_TLD.has(tld) && tld.length < 7)) continue;
domainCountMap[apexDomain] ||= 0; domainCountMap[apexDomain] ||= 0;
domainCountMap[apexDomain] += calcDomainAbuseScore(line); domainCountMap[apexDomain] += calcDomainAbuseScore(line, subdomain);
} }
}); });
@ -144,7 +141,7 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
return domainArr; return domainArr;
}); });
export function calcDomainAbuseScore(line: string) { export function calcDomainAbuseScore(line: string, subdomain: string | null) {
let weight = 1; let weight = 1;
const isPhishingDomainMockingCoJp = line.includes('-co-jp'); const isPhishingDomainMockingCoJp = line.includes('-co-jp');
@ -183,8 +180,6 @@ export function calcDomainAbuseScore(line: string) {
} }
} }
const subdomain = getSubdomain(line, looseTldtsOpt);
if (subdomain) { if (subdomain) {
if (subdomain.slice(1).includes('.')) { if (subdomain.slice(1).includes('.')) {
weight += 1; weight += 1;

View File

@ -39,18 +39,6 @@ export const sortDomains = (
subdomainMap = sm; subdomainMap = sm;
} }
for (let i = 0, len = inputs.length; i < len; i++) {
const cur = inputs[i];
if (!domainMap.has(cur)) {
const topD = getDomain(cur, looseTldtsOpt);
domainMap.set(cur, topD ?? cur);
}
if (!subdomainMap.has(cur)) {
const subD = getSubdomain(cur, looseTldtsOpt);
subdomainMap.set(cur, subD ?? cur);
}
}
const sorter = (a: string, b: string) => { const sorter = (a: string, b: string) => {
if (a === b) return 0; if (a === b) return 0;