diff --git a/Build/lib/cache-filesystem.ts b/Build/lib/cache-filesystem.ts index f6658b71..5f163354 100644 --- a/Build/lib/cache-filesystem.ts +++ b/Build/lib/cache-filesystem.ts @@ -28,7 +28,6 @@ export interface CacheOptions { interface CacheApplyRawOption { ttl?: number | null, - cacheName?: string, temporaryBypass?: boolean, incrementTtlWhenHit?: boolean } @@ -187,45 +186,6 @@ export class Cache { this.db.prepare(`DELETE FROM ${this.tableName} WHERE key = ?`).run(key); } - async apply( - key: string, - fn: () => Promise, - opt: CacheApplyOption - ): Promise { - const { ttl, temporaryBypass, incrementTtlWhenHit, cacheName } = opt; - - if (temporaryBypass) { - return fn(); - } - if (ttl == null) { - this.del(key); - return fn(); - } - - const cached = this.get(key); - if (cached == null) { - console.log(picocolors.yellow('[cache] miss'), picocolors.gray(cacheName || key), picocolors.gray(`ttl: ${TTL.humanReadable(ttl)}`)); - - const serializer = 'serializer' in opt ? opt.serializer : identity as any; - - const promise = fn(); - - return promise.then((value) => { - this.set(key, serializer(value), ttl); - return value; - }); - } - - console.log(picocolors.green('[cache] hit'), picocolors.gray(cacheName || key)); - - if (incrementTtlWhenHit) { - this.updateTtl(key, ttl); - } - - const deserializer = 'deserializer' in opt ? opt.deserializer : identity as any; - return deserializer(cached); - } - async applyWithHttp304( url: string, extraCacheKey: string, diff --git a/Build/lib/fs-memo.ts b/Build/lib/fs-memo.ts index 4d497263..3f626e8c 100644 --- a/Build/lib/fs-memo.ts +++ b/Build/lib/fs-memo.ts @@ -3,7 +3,17 @@ import { Cache } from './cache-filesystem'; import type { CacheApplyOption } from './cache-filesystem'; import { isCI } from 'ci-info'; -const fsMemoCache = new Cache({ cachePath: path.resolve(__dirname, '../../.cache') }); +import { Typeson, set, map, typedArrays } from 'typeson-registry'; +import picocolors from 'picocolors'; +import { identity } from './misc'; + +const typeson = new Typeson().register([ + typedArrays, + set, + map +]); + +const fsMemoCache = new Cache({ cachePath: path.resolve(__dirname, '../../.cache'), tableName: 'fs_memo_cache' }); const TTL = isCI // We run CI daily, so 1.5 days TTL is enough to persist the cache across runs @@ -11,41 +21,64 @@ const TTL = isCI // We run locally less frequently, so we need to persist the cache for longer, 7 days : 7 * 86400 * 1000; - type JSONValue = - | string - | number - | boolean - | null - | JSONObject - | JSONArray; +type TypesonValue = + | string + | number + | boolean + | null + | Set + | Map + | TypesonObject + | TypesonArray; -interface JSONObject { - [key: string]: JSONValue +interface TypesonObject { + [key: string]: TypesonValue } -interface JSONArray extends Array {} +interface TypesonArray extends Array { } -export function cache( - cb: (...args: Args) => Promise, - opt: Omit, 'ttl'> +export type FsMemoCacheOptions = CacheApplyOption & { + ttl?: undefined | never +}; + +export function cache( + fn: (...args: Args) => Promise, + opt: FsMemoCacheOptions ): (...args: Args) => Promise { // TODO if cb.toString() is long we should hash it - const fixedKey = cb.toString(); + const fixedKey = fn.toString(); return async function cachedCb(...args: Args) { // Construct the complete cache key for this function invocation - // TODO stringify is limited. For now we uses typescript to guard the args. - const cacheKey = `${fixedKey}|${JSON.stringify(args)}`; - const cacheName = cb.name || cacheKey; + // typeson.stringify is still limited. For now we uses typescript to guard the args. + const cacheKey = `${fixedKey}|${typeson.stringifySync(args)}`; + const cacheName = fn.name || cacheKey; - return fsMemoCache.apply( - cacheKey, - cb, - { - cacheName, - ...opt, - ttl: TTL - } as CacheApplyOption - ); + const { temporaryBypass, incrementTtlWhenHit } = opt; + + if (temporaryBypass) { + return fn(...args); + } + + const cached = fsMemoCache.get(cacheKey); + if (cached == null) { + console.log(picocolors.yellow('[cache] miss'), picocolors.gray(cacheName || cacheKey)); + + const serializer = 'serializer' in opt ? opt.serializer : identity as any; + + const value = await fn(...args); + + fsMemoCache.set(cacheKey, serializer(value), TTL); + return value; + } + + console.log(picocolors.green('[cache] hit'), picocolors.gray(cacheName || cacheKey)); + + if (incrementTtlWhenHit) { + fsMemoCache.updateTtl(cacheKey, TTL); + } + + const deserializer = 'deserializer' in opt ? opt.deserializer : identity as any; + return deserializer(cached); }; } diff --git a/Build/lib/get-phishing-domains.ts b/Build/lib/get-phishing-domains.ts index ebd8075b..6e7ac720 100644 --- a/Build/lib/get-phishing-domains.ts +++ b/Build/lib/get-phishing-domains.ts @@ -8,9 +8,8 @@ import { PHISHING_DOMAIN_LISTS_EXTRA, PHISHING_HOSTS_EXTRA } from '../constants/ import { loosTldOptWithPrivateDomains } from '../constants/loose-tldts-opt'; import picocolors from 'picocolors'; import createKeywordFilter from './aho-corasick'; -import { createCacheKey, deserializeArray, fsFetchCache, serializeArray } from './cache-filesystem'; -import { fastStringArrayJoin } from './misc'; -import { stringHash } from './string-hash'; +import { createCacheKey, deserializeArray, serializeArray } from './cache-filesystem'; +import { cache } from './fs-memo'; const BLACK_TLD = new Set([ 'accountant', 'art', 'autos', @@ -102,6 +101,73 @@ const lowKeywords = createKeywordFilter([ const cacheKey = createCacheKey(__filename); +const processPhihsingDomains = cache(function processPhihsingDomains(domainArr: string[]): Promise { + const domainCountMap: Record = {}; + const domainScoreMap: Record = {}; + + for (let i = 0, len = domainArr.length; i < len; i++) { + const line = domainArr[i]; + + const { + publicSuffix: tld, + domain: apexDomain, + subdomain, + isPrivate + } = tldts.parse(line, loosTldOptWithPrivateDomains); + + if (isPrivate) { + continue; + } + + if (!tld) { + console.log(picocolors.yellow('[phishing domains] E0001'), 'missing tld', { line, tld }); + continue; + } + if (!apexDomain) { + console.log(picocolors.yellow('[phishing domains] E0002'), 'missing domain', { line, apexDomain }); + continue; + } + + domainCountMap[apexDomain] ||= 0; + domainCountMap[apexDomain] += 1; + + if (!(apexDomain in domainScoreMap)) { + domainScoreMap[apexDomain] = 0; + if (BLACK_TLD.has(tld)) { + domainScoreMap[apexDomain] += 4; + } else if (tld.length > 6) { + domainScoreMap[apexDomain] += 2; + } + if (apexDomain.length >= 18) { + domainScoreMap[apexDomain] += 0.5; + } + } + if ( + subdomain + && !WHITELIST_MAIN_DOMAINS.has(apexDomain) + ) { + domainScoreMap[apexDomain] += calcDomainAbuseScore(subdomain, line); + } + } + + for (const apexDomain in domainCountMap) { + if ( + // !WHITELIST_MAIN_DOMAINS.has(apexDomain) + domainScoreMap[apexDomain] >= 16 + || (domainScoreMap[apexDomain] >= 13 && domainCountMap[apexDomain] >= 7) + || (domainScoreMap[apexDomain] >= 5 && domainCountMap[apexDomain] >= 10) + || (domainScoreMap[apexDomain] >= 3 && domainCountMap[apexDomain] >= 16) + ) { + domainArr.push('.' + apexDomain); + } + } + + return Promise.resolve(domainArr); +}, { + serializer: serializeArray, + deserializer: deserializeArray +}); + export function getPhishingDomains(parentSpan: Span) { return parentSpan.traceChild('get phishing domains').traceAsyncFn(async (span) => { const domainArr = await span.traceChildAsync('download/parse/merge phishing domains', async (curSpan) => { @@ -115,90 +181,13 @@ export function getPhishingDomains(parentSpan: Span) { return domainArr; }); - const cacheHash = span.traceChildSync('get hash', () => stringHash(fastStringArrayJoin(domainArr, '|'))); - return span.traceChildAsync( 'process phishing domain set', - () => processPhihsingDomains(domainArr, cacheHash) + () => processPhihsingDomains(domainArr) ); }); } -async function processPhihsingDomains(domainArr: string[], cacheHash = '') { - return fsFetchCache.apply( - cacheKey('processPhihsingDomains|' + cacheHash), - () => { - const domainCountMap: Record = {}; - const domainScoreMap: Record = {}; - - for (let i = 0, len = domainArr.length; i < len; i++) { - const line = domainArr[i]; - - const { - publicSuffix: tld, - domain: apexDomain, - subdomain, - isPrivate - } = tldts.parse(line, loosTldOptWithPrivateDomains); - - if (isPrivate) { - continue; - } - - if (!tld) { - console.log(picocolors.yellow('[phishing domains] E0001'), 'missing tld', { line, tld }); - continue; - } - if (!apexDomain) { - console.log(picocolors.yellow('[phishing domains] E0002'), 'missing domain', { line, apexDomain }); - continue; - } - - domainCountMap[apexDomain] ||= 0; - domainCountMap[apexDomain] += 1; - - if (!(apexDomain in domainScoreMap)) { - domainScoreMap[apexDomain] = 0; - if (BLACK_TLD.has(tld)) { - domainScoreMap[apexDomain] += 4; - } else if (tld.length > 6) { - domainScoreMap[apexDomain] += 2; - } - if (apexDomain.length >= 18) { - domainScoreMap[apexDomain] += 0.5; - } - } - if ( - subdomain - && !WHITELIST_MAIN_DOMAINS.has(apexDomain) - ) { - domainScoreMap[apexDomain] += calcDomainAbuseScore(subdomain, line); - } - } - - for (const apexDomain in domainCountMap) { - if ( - // !WHITELIST_MAIN_DOMAINS.has(apexDomain) - domainScoreMap[apexDomain] >= 16 - || (domainScoreMap[apexDomain] >= 13 && domainCountMap[apexDomain] >= 7) - || (domainScoreMap[apexDomain] >= 5 && domainCountMap[apexDomain] >= 10) - || (domainScoreMap[apexDomain] >= 3 && domainCountMap[apexDomain] >= 16) - ) { - domainArr.push('.' + apexDomain); - } - } - - return Promise.resolve(domainArr); - }, - { - ttl: 2 * 86400 * 1000, - serializer: serializeArray, - deserializer: deserializeArray, - incrementTtlWhenHit: true - } - ); -} - export function calcDomainAbuseScore(subdomain: string, fullDomain: string = subdomain) { let weight = 0;