From c9b53b1c1abe3cd8849039067530d1b914298c3c Mon Sep 17 00:00:00 2001
From: SukkaW
Date: Sun, 2 Mar 2025 23:03:52 +0800
Subject: [PATCH] Perf: move phishing hosts process into a worker

---
 Build/constants/phishing-score-source.ts | 115 ++++++
 Build/lib/get-phishing-domains.ts        | 473 ++++++++++-------------
 2 files changed, 318 insertions(+), 270 deletions(-)
 create mode 100644 Build/constants/phishing-score-source.ts

diff --git a/Build/constants/phishing-score-source.ts b/Build/constants/phishing-score-source.ts
new file mode 100644
index 00000000..55a22ea9
--- /dev/null
+++ b/Build/constants/phishing-score-source.ts
@@ -0,0 +1,115 @@
+import { createRetrieKeywordFilter as createKeywordFilter } from 'foxts/retrie';
+
+/** Public suffixes treated as abuse-prone; a hit adds 3 to an apex domain's base score. */
+export const BLACK_TLD = new Set([
+  'accountant', 'art', 'autos',
+  'bar', 'beauty', 'bid', 'bio', 'biz', 'bond', 'business', 'buzz',
+  'cc', 'cf', 'cfd', 'click', 'cloud', 'club', 'cn', 'codes',
+  'co.uk', 'co.in', 'com.br', 'com.cn', 'com.pl', 'com.vn',
+  'cool', 'cricket', 'cyou',
+  'date', 'design', 'digital', 'download',
+  'faith', 'fit', 'fun',
+  'ga', 'gd', 'gives', 'gq', 'group', 'host',
+  'icu', 'id', 'info', 'ink',
+  'lat', 'life', 'live', 'link', 'loan', 'lol', 'ltd',
+  'me', 'men', 'ml', 'mobi', 'mom', 'monster',
+  'net.pl',
+  'one', 'online',
+  'party', 'pro', 'pl', 'pw',
+  'racing', 'rest', 'review', 'rf.gd',
+  'sa.com', 'sbs', 'science', 'shop', 'site', 'skin', 'space', 'store', 'stream', 'su', 'surf',
+  'tech', 'tk', 'tokyo', 'top', 'trade',
+  'vip', 'vn',
+  'webcam', 'website', 'win',
+  'xyz',
+  'za.com'
+]);
+
+/** Apex domains whose subdomains do not add to the abuse score (shared hosting / site builders). */
+export const WHITELIST_MAIN_DOMAINS = new Set([
+  // 'w3s.link', // ipfs gateway
+  // 'dweb.link', // ipfs gateway
+  // 'nftstorage.link', // ipfs gateway
+  'fleek.cool', // ipfs gateway
+  'flk-ipfs.xyz', // ipfs gateway
+  'business.site', // Drag'n'Drop site building platform
+  'page.link', // Firebase URL Shortener
+  // 'notion.site',
+  // 'vercel.app',
+  'gitbook.io',
+  'zendesk.com',
+  'ipfs.eth.aragon.network',
+  'wordpress.com'
+]);
+
+/** Keywords that are decisive on their own: any hit scores the host 100. */
+export const leathalKeywords = createKeywordFilter([
+  'vinted-',
+  'inpost-pl',
+  'vlnted-',
+  'allegrolokalnie',
+  'thetollroads'
+]);
+
+/** Brand-impersonation keywords: a hit adds 15, more when combined with the other keyword sets. */
+export const sensitiveKeywords = createKeywordFilter([
+  '.amazon-',
+  '-amazon',
+  'fb-com',
+  'facebook-com',
+  '-facebook',
+  'facebook-',
+  'focebaak',
+  '.facebook.',
+  'metamask',
+  'www.apple',
+  '-coinbase',
+  'coinbase-',
+  'booking-com',
+  'booking.com-',
+  'booking-eu',
+  'vinted-',
+  'inpost-pl',
+  'login.microsoft',
+  'login-microsoft',
+  'microsoftonline',
+  'google.com-',
+  'minecraft',
+  'staemco',
+  'oferta'
+]);
+
+/** Fake-TLD markers inside a host; only scored together with a sensitive or low keyword hit. */
+export const fakeTldKeywords = createKeywordFilter([
+  '.pl-',
+  '.com-',
+  '.net-'
+]);
+
+/** Weak signals: a hit adds 1.8 on its own, or 10 on top of a sensitive keyword hit. */
+export const lowKeywords = createKeywordFilter([
+  'transactions-',
+  'payment',
+  'wallet',
+  '-transactions',
+  '-faceb', // facebook fake
+  '.faceb', // facebook fake
+  'facebook',
+  'virus-',
+  'icloud-',
+  'apple-',
+  '-roblox',
+  '-co-jp',
+  'customer.',
+  'customer-',
+  '.www-',
+  '.www.',
+  '.www2',
+  'instagram',
+  'microsof',
+  'passwordreset',
+  '.google-',
+  'recover',
+  'banking',
+  'shop'
+]);
diff --git a/Build/lib/get-phishing-domains.ts b/Build/lib/get-phishing-domains.ts
index 98af9795..132228c6 100644
--- a/Build/lib/get-phishing-domains.ts
+++ b/Build/lib/get-phishing-domains.ts
@@ -1,227 +1,202 @@
+import Worktank from 'worktank';
+
 import { processHostsWithPreload } from './parse-filter/hosts';
 import { processDomainListsWithPreload } from './parse-filter/domainlists';
-
-import * as tldts from 'tldts-experimental';
-
 import { dummySpan, printTraceResult } from '../trace';
 import type { Span } from '../trace';
 import { appendArrayInPlaceCurried } from './append-array-in-place';
-import { DEBUG_DOMAIN_TO_FIND, PHISHING_DOMAIN_LISTS_EXTRA, PHISHING_HOSTS_EXTRA } from '../constants/reject-data-source';
-import { loosTldOptWithPrivateDomains } from '../constants/loose-tldts-opt';
-import picocolors from 'picocolors';
-import { createRetrieKeywordFilter as createKeywordFilter } from 'foxts/retrie';
-import { deserializeArray, serializeArray } from './cache-filesystem';
-import { cache } from './fs-memo';
-import { isCI } from 'ci-info';
-
-const BLACK_TLD = new Set([
-  'accountant', 'art', 'autos',
-  'bar', 'beauty', 'bid', 'bio', 'biz', 'bond', 'business', 'buzz',
-  'cc', 'cf', 'cfd', 'click', 'cloud', 'club', 'cn', 'codes',
-  'co.uk', 'co.in', 'com.br', 'com.cn', 'com.pl', 'com.vn',
-  'cool', 'cricket', 'cyou',
-  'date', 'design', 'digital', 'download',
-  'faith', 'fit', 'fun',
-  'ga', 'gd', 'gives', 'gq', 'group', 'host',
-  'icu', 'id', 'info', 'ink',
-  'lat', 'life', 'live', 'link', 'loan', 'lol', 'ltd',
-  'me', 'men', 'ml', 'mobi', 'mom', 'monster',
-  'net.pl',
-  'one', 'online',
-  'party', 'pro', 'pl', 'pw',
-  'racing', 'rest', 'review', 'rf.gd',
-  'sa.com', 'sbs', 'science', 'shop', 'site', 'skin', 'space', 'store', 'stream', 'su', 'surf',
-  'tech', 'tk', 'tokyo', 'top', 'trade',
-  'vip', 'vn',
-  'webcam', 'website', 'win',
-  'xyz',
-  'za.com'
-]);
-
-const WHITELIST_MAIN_DOMAINS = new Set([
-  // 'w3s.link', // ipfs gateway
-  // 'dweb.link', // ipfs gateway
-  // 'nftstorage.link', // ipfs gateway
-  'fleek.cool', // ipfs gateway
-  'flk-ipfs.xyz', // ipfs gateway
-  'business.site', // Drag'n'Drop site building platform
-  'page.link', // Firebase URL Shortener
-  // 'notion.site',
-  // 'vercel.app',
-  'gitbook.io',
-  'zendesk.com',
-  'ipfs.eth.aragon.network',
-  'wordpress.com'
-]);
-
-const leathalKeywords = createKeywordFilter([
-  'vinted-',
-  'inpost-pl',
-  'vlnted-'
-]);
-
-const sensitiveKeywords = createKeywordFilter([
-  '.amazon-',
-  '-amazon',
-  'fb-com',
-  'facebook-com',
-  '-facebook',
-  'facebook-',
-  'focebaak',
-  '.facebook.',
-  'metamask',
-  'www.apple',
-  '-coinbase',
-  'coinbase-',
-  'booking-com',
-  'booking.com-',
-  'booking-eu',
-  'vinted-',
-  'inpost-pl',
-  'login.microsoft',
-  'login-microsoft',
-  'microsoftonline',
-  'google.com-',
-  'minecraft',
-  'staemco',
-  'oferta',
-  'allegrolokalnie',
-  'thetollroads'
-]);
-const fakeTldKeywords = createKeywordFilter([
-  '.pl-',
-  '.com-',
-  '.net-'
-]);
-const lowKeywords = createKeywordFilter([
-  'transactions-',
-  'payment',
-  'wallet',
-  '-transactions',
-  '-faceb', // facebook fake
-  '.faceb', // facebook fake
-  'facebook',
-  'virus-',
-  'icloud-',
-  'apple-',
-  '-roblox',
-  '-co-jp',
-  'customer.',
-  'customer-',
-  '.www-',
-  '.www.',
-  '.www2',
-  'instagram',
-  'microsof',
-  'passwordreset',
-  '.google-',
-  'recover',
-  'banking',
-  'shop'
-]);
-
-const processPhihsingDomains = cache(function processPhihsingDomains(domainArr: string[]): string[] {
-  const domainCountMap = new Map<string, number>();
-  const domainScoreMap: Record<string, number> = {};
-
-  let line = '';
-  let tld: string | null = '';
-  let apexDomain: string | null = '';
-  let subdomain: string | null = '';
-
-  // const set = new Set();
-  // let duplicateCount = 0;
-
-  for (let i = 0, len = domainArr.length; i < len; i++) {
-    line = domainArr[i];
-
-    // if (set.has(line)) {
-    //   duplicateCount++;
-    // } else {
-    //   set.add(line);
-    // }
-
-    const parsed = tldts.parse(line, loosTldOptWithPrivateDomains);
-    if (parsed.isPrivate) {
-      continue;
-    }
-
-    tld = parsed.publicSuffix;
-    apexDomain = parsed.domain;
-
-    if (!tld) {
-      console.log(picocolors.yellow('[phishing domains] E0001'), 'missing tld', { line, tld });
-      continue;
-    }
-    if (!apexDomain) {
-      console.log(picocolors.yellow('[phishing domains] E0002'), 'missing domain', { line, apexDomain });
-      continue;
-    }
-
-    domainCountMap.set(
-      apexDomain,
-      domainCountMap.has(apexDomain)
-        ? domainCountMap.get(apexDomain)! + 1
-        : 1
-    );
-
-    if (!(apexDomain in domainScoreMap)) {
-      domainScoreMap[apexDomain] = 0;
-      if (BLACK_TLD.has(tld)) {
-        domainScoreMap[apexDomain] += 3;
-      } else if (tld.length > 6) {
-        domainScoreMap[apexDomain] += 2;
-      }
-      if (apexDomain.length >= 18) {
-        domainScoreMap[apexDomain] += 0.5;
-      }
-    }
-
-    subdomain = parsed.subdomain;
-
-    if (
-      subdomain
-      && !WHITELIST_MAIN_DOMAINS.has(apexDomain)
-    ) {
-      domainScoreMap[apexDomain] += calcDomainAbuseScore(subdomain, line);
-    }
-  }
-
-  domainCountMap.forEach((count, apexDomain) => {
-    if (
-      // !WHITELIST_MAIN_DOMAINS.has(apexDomain)
-      (domainScoreMap[apexDomain] >= 24)
-      || (domainScoreMap[apexDomain] >= 16 && count >= 7)
-      || (domainScoreMap[apexDomain] >= 13 && count >= 11)
-      || (domainScoreMap[apexDomain] >= 5 && count >= 14)
-      || (domainScoreMap[apexDomain] >= 3 && count >= 21)
-      || (domainScoreMap[apexDomain] >= 1 && count >= 60)
-    ) {
-      domainArr.push('.' + apexDomain);
-    }
-  });
-
-  if (require.main === module) {
-    console.log({
-      v: 1,
-      score: domainScoreMap['com-paytollbydv.world'],
-      count: domainCountMap.get('com-paytollbydv.worldx'),
-      domainArrLen: domainArr.length
-    });
-  }
-
-  return domainArr;
-}, {
-  serializer: serializeArray,
-  deserializer: deserializeArray,
-  temporaryBypass: !isCI || DEBUG_DOMAIN_TO_FIND !== null
-});
+import { PHISHING_DOMAIN_LISTS_EXTRA, PHISHING_HOSTS_EXTRA } from '../constants/reject-data-source';
 
 const downloads = [
   ...PHISHING_DOMAIN_LISTS_EXTRA.map(entry => processDomainListsWithPreload(...entry)),
   ...PHISHING_HOSTS_EXTRA.map(entry => processHostsWithPreload(...entry))
 ];
 
+/**
+ * Single-worker pool that scores phishing domains off the main thread.
+ *
+ * `processPhihsingDomains` does not use any module-level binding of this file:
+ * every dependency is loaded inside the method through a `require` created
+ * from the `import.meta.url` that the caller passes in.
+ */
+const pool = new Worktank({
+  name: 'process-phishing-domains',
+  size: 1,
+  timeout: 10000, // The maximum number of milliseconds to wait for the result from the worker, if exceeded the worker is terminated and the execution promise rejects
+  warmup: true,
+  autoterminate: 30000, // The interval of milliseconds at which to check if the pool can be automatically terminated, to free up resources, workers will be spawned up again if needed
+  env: {},
+  methods: {
+    // eslint-disable-next-line object-shorthand -- workertank
+    processPhihsingDomains: async function (
+      domainArr: string[],
+      importMetaUrl: string,
+      /** require.main === module */ isDebug = false
+    ): Promise<string[]> {
+      // TODO: createRequire is a temporary workaround for https://github.com/nodejs/node/issues/51956
+      const { default: module } = await import('node:module');
+      const __require = module.createRequire(importMetaUrl);
+
+      const picocolors = __require('picocolors') as typeof import('picocolors');
+      const tldts = __require('tldts-experimental') as typeof import('tldts-experimental');
+
+      const { loosTldOptWithPrivateDomains } = __require('../constants/loose-tldts-opt') as typeof import('../constants/loose-tldts-opt');
+      const { BLACK_TLD, WHITELIST_MAIN_DOMAINS, leathalKeywords, lowKeywords, fakeTldKeywords, sensitiveKeywords } = __require('../constants/phishing-score-source') as typeof import('../constants/phishing-score-source');
+
+      const domainCountMap = new Map<string, number>();
+      const domainScoreMap: Record<string, number> = {};
+
+      let line = '';
+      let tld: string | null = '';
+      let apexDomain: string | null = '';
+      let subdomain: string | null = '';
+
+      // const set = new Set();
+      // let duplicateCount = 0;
+
+      for (let i = 0, len = domainArr.length; i < len; i++) {
+        line = domainArr[i];
+
+        // if (set.has(line)) {
+        //   duplicateCount++;
+        // } else {
+        //   set.add(line);
+        // }
+
+        const parsed = tldts.parse(line, loosTldOptWithPrivateDomains);
+        if (parsed.isPrivate) {
+          continue;
+        }
+
+        tld = parsed.publicSuffix;
+        apexDomain = parsed.domain;
+
+        if (!tld) {
+          console.log(picocolors.yellow('[phishing domains] E0001'), 'missing tld', { line, tld });
+          continue;
+        }
+        if (!apexDomain) {
+          console.log(picocolors.yellow('[phishing domains] E0002'), 'missing domain', { line, apexDomain });
+          continue;
+        }
+
+        domainCountMap.set(
+          apexDomain,
+          domainCountMap.has(apexDomain)
+            ? domainCountMap.get(apexDomain)! + 1
+            : 1
+        );
+
+        if (!(apexDomain in domainScoreMap)) {
+          domainScoreMap[apexDomain] = 0;
+          if (BLACK_TLD.has(tld)) {
+            domainScoreMap[apexDomain] += 3;
+          } else if (tld.length > 6) {
+            domainScoreMap[apexDomain] += 2;
+          }
+          if (apexDomain.length >= 18) {
+            domainScoreMap[apexDomain] += 0.5;
+          }
+        }
+
+        subdomain = parsed.subdomain;
+
+        if (
+          subdomain
+          && !WHITELIST_MAIN_DOMAINS.has(apexDomain)
+        ) {
+          domainScoreMap[apexDomain] += calcDomainAbuseScore(subdomain, line);
+        }
+      }
+
+      domainCountMap.forEach((count, apexDomain) => {
+        if (
+          // !WHITELIST_MAIN_DOMAINS.has(apexDomain)
+          (domainScoreMap[apexDomain] >= 24)
+          || (domainScoreMap[apexDomain] >= 16 && count >= 7)
+          || (domainScoreMap[apexDomain] >= 13 && count >= 11)
+          || (domainScoreMap[apexDomain] >= 5 && count >= 14)
+          || (domainScoreMap[apexDomain] >= 3 && count >= 21)
+          || (domainScoreMap[apexDomain] >= 1 && count >= 60)
+        ) {
+          domainArr.push('.' + apexDomain);
+        }
+      });
+
+      if (isDebug) {
+        console.log({
+          v: 1,
+          score: domainScoreMap['com-paytollbydv.world'],
+          count: domainCountMap.get('com-paytollbydv.world'),
+          domainArrLen: domainArr.length
+        });
+      }
+
+      return domainArr;
+
+      /**
+       * Scores one host: 100 on a lethal keyword hit, otherwise the sum of the
+       * keyword weights and the subdomain length/depth bonuses.
+       */
+      function calcDomainAbuseScore(subdomain: string, fullDomain: string = subdomain) {
+        if (leathalKeywords(fullDomain)) {
+          return 100;
+        }
+
+        let weight = 0;
+
+        const hitLowKeywords = lowKeywords(fullDomain);
+        const sensitiveKeywordsHit = sensitiveKeywords(fullDomain);
+        const fakeTldKeywordsHit = fakeTldKeywords(fullDomain);
+
+        if (sensitiveKeywordsHit) {
+          weight += 15;
+          if (hitLowKeywords) {
+            weight += 10;
+            if (fakeTldKeywordsHit) {
+              weight += 8;
+            }
+          }
+          // besides add for low hit, always add extra here
+          if (fakeTldKeywordsHit) {
+            weight += 10;
+          }
+        } else if (hitLowKeywords) {
+          weight += 1.8;
+          if (fakeTldKeywordsHit) {
+            weight += 5;
+          }
+        }
+
+        const subdomainLength = subdomain.length;
+
+        if (subdomainLength > 6) {
+          weight += 0.015;
+
+          if (subdomainLength > 13) {
+            weight += 0.2;
+            if (subdomainLength > 20) {
+              weight += 1;
+              if (subdomainLength > 30) {
+                weight += 5;
+                if (subdomainLength > 40) {
+                  weight += 10;
+                }
+              }
+            }
+
+            if (subdomain.indexOf('.', 1) > 1) {
+              weight += 1;
+            }
+          }
+        }
+
+        return weight;
+      }
+    }
+  }
+});
+
 export function getPhishingDomains(parentSpan: Span) {
   return parentSpan.traceChild('get phishing domains').traceAsyncFn(async (span) => {
     const domainArr = await span.traceChildAsync('download/parse/merge phishing domains', async (curSpan) => {
@@ -235,67 +210,25 @@ export function getPhishingDomains(parentSpan: Span) {
 
     return span.traceChildAsync(
       'process phishing domain set',
-      () => processPhihsingDomains(domainArr)
+      async () => {
+        try {
+          return await pool.exec(
+            'processPhihsingDomains',
+            [
+              domainArr,
+              import.meta.url,
+              require.main === module
+            ]
+          );
+        } finally {
+          // Always release the pool, including when exec() rejects (e.g. on timeout)
+          pool.terminate();
+        }
+      }
     );
   });
 }
 
-export function calcDomainAbuseScore(subdomain: string, fullDomain: string = subdomain) {
-  if (leathalKeywords(fullDomain)) {
-    return 100;
-  }
-
-  let weight = 0;
-
-  const hitLowKeywords = lowKeywords(fullDomain);
-  const sensitiveKeywordsHit = sensitiveKeywords(fullDomain);
-  const fakeTldKeywordsHit = fakeTldKeywords(fullDomain);
-
-  if (sensitiveKeywordsHit) {
-    weight += 15;
-    if (hitLowKeywords) {
-      weight += 10;
-      if (fakeTldKeywordsHit) {
-        weight += 8;
-      }
-    }
-    // besides add for low hit, always add extra here
-    if (fakeTldKeywordsHit) {
-      weight += 10;
-    }
-  } else if (hitLowKeywords) {
-    weight += 1.8;
-    if (fakeTldKeywordsHit) {
-      weight += 5;
-    }
-  }
-
-  const subdomainLength = subdomain.length;
-
-  if (subdomainLength > 6) {
-    weight += 0.015;
-
-    if (subdomainLength > 13) {
-      weight += 0.2;
-      if (subdomainLength > 20) {
-        weight += 1;
-        if (subdomainLength > 30) {
-          weight += 5;
-          if (subdomainLength > 40) {
-            weight += 10;
-          }
-        }
-      }
-
-      if (subdomain.indexOf('.', 1) > 1) {
-        weight += 1;
-      }
-    }
-  }
-
-  return weight;
-}
-
 if (require.main === module) {
   getPhishingDomains(dummySpan)
     .catch(console.error)