From 3b655f34aabe96c6ddaf9fda362ab943dab6377f Mon Sep 17 00:00:00 2001 From: SukkaW Date: Tue, 4 Jun 2024 17:36:17 +0800 Subject: [PATCH] Perf: 1-pass domain parse --- Build/index.ts | 4 +--- Build/lib/get-phishing-domains.ts | 23 +++++++++-------------- Build/lib/stable-sort-domain.ts | 12 ------------ 3 files changed, 10 insertions(+), 29 deletions(-) diff --git a/Build/index.ts b/Build/index.ts index fd22e511..9711cef0 100644 --- a/Build/index.ts +++ b/Build/index.ts @@ -101,9 +101,7 @@ process.on('unhandledRejection', (reason) => { downloadMockAssetsPromise ]); - await Promise.all([ - buildPublic(rootSpan) - ]); + await buildPublic(rootSpan); rootSpan.stop(); diff --git a/Build/lib/get-phishing-domains.ts b/Build/lib/get-phishing-domains.ts index 16023383..32c825ab 100644 --- a/Build/lib/get-phishing-domains.ts +++ b/Build/lib/get-phishing-domains.ts @@ -1,6 +1,5 @@ -import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix'; import { processDomainLists } from './parse-filter'; -import { getSubdomain, getPublicSuffix } from 'tldts-experimental'; +import { parse } from 'tldts-experimental'; import type { Span } from '../trace'; import { appendArrayInPlaceCurried } from './append-array-in-place'; @@ -103,8 +102,6 @@ export const WHITELIST_MAIN_DOMAINS = new Set([ ]); export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('get phishing domains').traceAsyncFn(async (span) => { - const gorhill = await getGorhillPublicSuffixPromise(); - const domainArr = await span.traceChildAsync('download/parse/merge phishing domains', async (curSpan) => { const domainSet: string[] = []; @@ -122,16 +119,16 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g const safeGorhillLine = line[0] === '.' ? line.slice(1) : line; - const apexDomain = gorhill.getDomain(safeGorhillLine); - if (!apexDomain) { - continue; - } + const { + publicSuffix: tld, + domain: apexDomain, + subdomain + } = parse(safeGorhillLine, looseTldtsOpt); - const tld = getPublicSuffix(safeGorhillLine, looseTldtsOpt); - if (!tld || (!BLACK_TLD.has(tld) && tld.length < 7)) continue; + if (!tld || !apexDomain || (!BLACK_TLD.has(tld) && tld.length < 7)) continue; domainCountMap[apexDomain] ||= 0; - domainCountMap[apexDomain] += calcDomainAbuseScore(line); + domainCountMap[apexDomain] += calcDomainAbuseScore(line, subdomain); } }); @@ -144,7 +141,7 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g return domainArr; }); -export function calcDomainAbuseScore(line: string) { +export function calcDomainAbuseScore(line: string, subdomain: string | null) { let weight = 1; const isPhishingDomainMockingCoJp = line.includes('-co-jp'); @@ -183,8 +180,6 @@ export function calcDomainAbuseScore(line: string) { } } - const subdomain = getSubdomain(line, looseTldtsOpt); - if (subdomain) { if (subdomain.slice(1).includes('.')) { weight += 1; diff --git a/Build/lib/stable-sort-domain.ts b/Build/lib/stable-sort-domain.ts index 66cf85c5..9b4952bd 100644 --- a/Build/lib/stable-sort-domain.ts +++ b/Build/lib/stable-sort-domain.ts @@ -39,18 +39,6 @@ export const sortDomains = ( subdomainMap = sm; } - for (let i = 0, len = inputs.length; i < len; i++) { - const cur = inputs[i]; - if (!domainMap.has(cur)) { - const topD = getDomain(cur, looseTldtsOpt); - domainMap.set(cur, topD ?? cur); - } - if (!subdomainMap.has(cur)) { - const subD = getSubdomain(cur, looseTldtsOpt); - subdomainMap.set(cur, subD ?? cur); - } - } - const sorter = (a: string, b: string) => { if (a === b) return 0;