From 19228a8216a093067152ab2ccb0a73eaf6557e60 Mon Sep 17 00:00:00 2001 From: SukkaW Date: Sun, 10 Nov 2024 17:58:44 +0800 Subject: [PATCH] Chore: improve phishing hosts --- Build/lib/fs-memo.ts | 28 +++++++----- Build/lib/get-phishing-domains.ts | 72 ++++++++++++++++++------------- Build/lib/misc.ts | 2 +- 3 files changed, 59 insertions(+), 43 deletions(-) diff --git a/Build/lib/fs-memo.ts b/Build/lib/fs-memo.ts index dd03e3c5..19efb41a 100644 --- a/Build/lib/fs-memo.ts +++ b/Build/lib/fs-memo.ts @@ -6,7 +6,7 @@ import { isCI } from 'ci-info'; import { xxhash64 } from 'hash-wasm'; import picocolors from 'picocolors'; -import { identity } from './misc'; +import { fastStringArrayJoin, identity } from './misc'; const fsMemoCache = new Cache({ cachePath: path.resolve(__dirname, '../../.cache'), tableName: 'fs_memo_cache' }); @@ -49,29 +49,35 @@ function createCache(onlyUseCachedIfFail: boolean) { fn: (...args: Args) => Promise, opt: FsMemoCacheOptions ): (...args: Args) => Promise { - const fixedKey = fn.toString(); - if (opt.temporaryBypass) { return fn; } + const serializer = 'serializer' in opt ? opt.serializer : identity; + const deserializer = 'deserializer' in opt ? opt.deserializer : identity; + + const fixedKey = fn.toString(); + + const fixedKeyHashPromise = xxhash64(fixedKey); + const devalueModulePromise = import('devalue'); + return async function cachedCb(...args: Args) { - const { stringify: devalueStringify } = await import('devalue'); + const devalueStringify = (await devalueModulePromise).stringify; // Construct the complete cache key for this function invocation // typeson.stringify is still limited. For now we uses typescript to guard the args. - const cacheKey = (await Promise.all([ - xxhash64(fixedKey), - xxhash64(devalueStringify(args)) - ])).join('|'); + const cacheKey = fastStringArrayJoin( + await Promise.all([ + fixedKeyHashPromise, + xxhash64(devalueStringify(args)) + ]), + '|' + ); const cacheName = picocolors.gray(fn.name || fixedKey || cacheKey); const cached = fsMemoCache.get(cacheKey); - const serializer = 'serializer' in opt ? opt.serializer : identity as any; - const deserializer = 'deserializer' in opt ? opt.deserializer : identity as any; - if (onlyUseCachedIfFail) { try { const value = await fn(...args); diff --git a/Build/lib/get-phishing-domains.ts b/Build/lib/get-phishing-domains.ts index e10b940f..0e18baa2 100644 --- a/Build/lib/get-phishing-domains.ts +++ b/Build/lib/get-phishing-domains.ts @@ -1,7 +1,7 @@ import { processDomainLists, processHosts } from './parse-filter'; import * as tldts from 'tldts-experimental'; -import { dummySpan } from '../trace'; +import { dummySpan, printTraceResult } from '../trace'; import type { Span } from '../trace'; import { appendArrayInPlaceCurried } from './append-array-in-place'; import { PHISHING_DOMAIN_LISTS_EXTRA, PHISHING_HOSTS_EXTRA } from '../constants/reject-data-source'; @@ -108,26 +108,25 @@ const lowKeywords = createKeywordFilter([ 'banking' ]); -const cacheKey = createCacheKey(__filename); - const processPhihsingDomains = cache(function processPhihsingDomains(domainArr: string[]): Promise { - const domainCountMap: Record = {}; + const domainCountMap = new Map(); const domainScoreMap: Record = {}; + let tld: string | null = ''; + let apexDomain: string | null = ''; + let subdomain: string | null = ''; + for (let i = 0, len = domainArr.length; i < len; i++) { const line = domainArr[i]; - const { - publicSuffix: tld, - domain: apexDomain, - subdomain, - isPrivate - } = tldts.parse(line, loosTldOptWithPrivateDomains); - - if (isPrivate) { + const parsed = tldts.parse(line, loosTldOptWithPrivateDomains); + if (parsed.isPrivate) { continue; } + tld = parsed.publicSuffix; + apexDomain = parsed.domain; + if (!tld) { console.log(picocolors.yellow('[phishing domains] E0001'), 'missing tld', { line, tld }); continue; @@ -137,8 +136,12 @@ const processPhihsingDomains = cache(function processPhihsingDomains(domainArr: continue; } - domainCountMap[apexDomain] ||= 0; - domainCountMap[apexDomain] += 1; + domainCountMap.set( + apexDomain, + domainCountMap.has(apexDomain) + ? domainCountMap.get(apexDomain)! + 1 + : 1 + ); if (!(apexDomain in domainScoreMap)) { domainScoreMap[apexDomain] = 0; @@ -151,6 +154,9 @@ const processPhihsingDomains = cache(function processPhihsingDomains(domainArr: domainScoreMap[apexDomain] += 0.5; } } + + subdomain = parsed.subdomain; + if ( subdomain && !WHITELIST_MAIN_DOMAINS.has(apexDomain) @@ -159,30 +165,33 @@ const processPhihsingDomains = cache(function processPhihsingDomains(domainArr: } } - for (const apexDomain in domainCountMap) { + domainCountMap.forEach((count, apexDomain) => { if ( // !WHITELIST_MAIN_DOMAINS.has(apexDomain) (domainScoreMap[apexDomain] >= 24) - || (domainScoreMap[apexDomain] >= 16 && domainCountMap[apexDomain] >= 7) - || (domainScoreMap[apexDomain] >= 13 && domainCountMap[apexDomain] >= 11) - || (domainScoreMap[apexDomain] >= 5 && domainCountMap[apexDomain] >= 14) - || (domainScoreMap[apexDomain] >= 3 && domainCountMap[apexDomain] >= 21) + || (domainScoreMap[apexDomain] >= 16 && count >= 7) + || (domainScoreMap[apexDomain] >= 13 && count >= 11) + || (domainScoreMap[apexDomain] >= 5 && count >= 14) + || (domainScoreMap[apexDomain] >= 3 && count >= 21) ) { domainArr.push('.' + apexDomain); } - } - - console.log({ - score: domainScoreMap['flk-ipfs.xyz'], - count: domainCountMap['flk-ipfs.xyz'] }); + // console.log({ + // score: domainScoreMap['flk-ipfs.xyz'], + // count: domainCountMap.get('flk-ipfs.xyz') + // }); + return Promise.resolve(domainArr); }, { serializer: serializeArray, - deserializer: deserializeArray + deserializer: deserializeArray, + temporaryBypass: true }); +const cacheKey = createCacheKey(__filename); + export function getPhishingDomains(parentSpan: Span) { return parentSpan.traceChild('get phishing domains').traceAsyncFn(async (span) => { const domainArr = await span.traceChildAsync('download/parse/merge phishing domains', async (curSpan) => { @@ -219,7 +228,7 @@ export function calcDomainAbuseScore(subdomain: string, fullDomain: string = sub weight += 6; } } else if (hitLowKeywords) { - weight += 1.5; + weight += 1.7; } const subdomainLength = subdomain.length; @@ -236,11 +245,8 @@ export function calcDomainAbuseScore(subdomain: string, fullDomain: string = sub } } - if (subdomain.slice(1).includes('.')) { + if (subdomain.indexOf('.', 1) > 1) { weight += 1; - if (subdomain.includes('www.')) { - weight += 1; - } } } @@ -249,5 +255,9 @@ export function calcDomainAbuseScore(subdomain: string, fullDomain: string = sub if (require.main === module) { getPhishingDomains(dummySpan) - .catch(console.error); + .catch(console.error) + .finally(() => { + dummySpan.stop(); + printTraceResult(dummySpan.traceResult); + }); } diff --git a/Build/lib/misc.ts b/Build/lib/misc.ts index 9cb13ccb..d2a106e3 100644 --- a/Build/lib/misc.ts +++ b/Build/lib/misc.ts @@ -66,7 +66,7 @@ export function domainWildCardToRegex(domain: string) { return result; } -export const identity = (x: T): T => x; +export const identity = (x: T): R => x as any; export function appendArrayFromSet(dest: T[], source: Set | Array>, transformer: (item: T) => T = identity) { const casted = Array.isArray(source) ? source : [source];