Minor changes to fs memo implementation / Adapt fs memo

This commit is contained in:
SukkaW 2024-10-18 01:16:12 +08:00
parent 24b928dc32
commit a8c9cc5ac5
3 changed files with 130 additions and 148 deletions

View File

@ -28,7 +28,6 @@ export interface CacheOptions<S = string> {
interface CacheApplyRawOption { interface CacheApplyRawOption {
ttl?: number | null, ttl?: number | null,
cacheName?: string,
temporaryBypass?: boolean, temporaryBypass?: boolean,
incrementTtlWhenHit?: boolean incrementTtlWhenHit?: boolean
} }
@ -187,45 +186,6 @@ export class Cache<S = string> {
this.db.prepare(`DELETE FROM ${this.tableName} WHERE key = ?`).run(key); this.db.prepare(`DELETE FROM ${this.tableName} WHERE key = ?`).run(key);
} }
/**
 * Memoizes the result of `fn` under `key`.
 *
 * - `opt.temporaryBypass`: run `fn` directly; the cache is neither read nor written.
 * - `opt.ttl == null`: delete any entry stored under `key`, then run `fn` without storing.
 * - Otherwise: return the stored value (passed through `opt.deserializer` when provided),
 *   or run `fn`, store its result (passed through `opt.serializer` when provided) with
 *   the given TTL, and return the fresh value.
 *
 * @param key Cache key to look up and store under.
 * @param fn Producer invoked when the cache cannot be used or has no entry.
 * @param opt TTL, bypass flag, display name and optional (de)serializer.
 * @returns The cached or freshly produced value.
 */
async apply<T>(
  key: string,
  fn: () => Promise<T>,
  opt: CacheApplyOption<T, S>
): Promise<T> {
  if (opt.temporaryBypass) {
    return fn();
  }

  const ttl = opt.ttl;
  if (ttl == null) {
    // No TTL means "do not cache": drop whatever is stored and compute directly.
    this.del(key);
    return fn();
  }

  const stored = this.get(key);
  const label = picocolors.gray(opt.cacheName || key);

  if (stored != null) {
    console.log(picocolors.green('[cache] hit'), label);

    if (opt.incrementTtlWhenHit) {
      this.updateTtl(key, ttl);
    }

    // Without a custom deserializer the stored value is returned as-is.
    const decode = 'deserializer' in opt ? opt.deserializer : identity as any;
    return decode(stored);
  }

  console.log(picocolors.yellow('[cache] miss'), label, picocolors.gray(`ttl: ${TTL.humanReadable(ttl)}`));

  // Without a custom serializer the value is stored as-is.
  const encode = 'serializer' in opt ? opt.serializer : identity as any;

  const fresh = await fn();
  this.set(key, encode(fresh), ttl);
  return fresh;
}
async applyWithHttp304<T>( async applyWithHttp304<T>(
url: string, url: string,
extraCacheKey: string, extraCacheKey: string,

View File

@ -3,7 +3,17 @@ import { Cache } from './cache-filesystem';
import type { CacheApplyOption } from './cache-filesystem'; import type { CacheApplyOption } from './cache-filesystem';
import { isCI } from 'ci-info'; import { isCI } from 'ci-info';
const fsMemoCache = new Cache({ cachePath: path.resolve(__dirname, '../../.cache') }); import { Typeson, set, map, typedArrays } from 'typeson-registry';
import picocolors from 'picocolors';
import { identity } from './misc';
const typeson = new Typeson().register([
typedArrays,
set,
map
]);
const fsMemoCache = new Cache({ cachePath: path.resolve(__dirname, '../../.cache'), tableName: 'fs_memo_cache' });
const TTL = isCI const TTL = isCI
// We run CI daily, so 1.5 days TTL is enough to persist the cache across runs // We run CI daily, so 1.5 days TTL is enough to persist the cache across runs
@ -11,41 +21,64 @@ const TTL = isCI
// We run locally less frequently, so we need to persist the cache for longer, 7 days // We run locally less frequently, so we need to persist the cache for longer, 7 days
: 7 * 86400 * 1000; : 7 * 86400 * 1000;
type JSONValue = type TypesonValue =
| string | string
| number | number
| boolean | boolean
| null | null
| JSONObject | Set<any>
| JSONArray; | Map<any, any>
| TypesonObject
| TypesonArray;
interface JSONObject { interface TypesonObject {
[key: string]: JSONValue [key: string]: TypesonValue
} }
interface JSONArray extends Array<JSONValue> {} interface TypesonArray extends Array<TypesonValue> { }
export function cache<Args extends JSONValue[], T>( export type FsMemoCacheOptions<T> = CacheApplyOption<T, string> & {
cb: (...args: Args) => Promise<T>, ttl?: undefined | never
opt: Omit<CacheApplyOption<T, string>, 'ttl'> };
export function cache<Args extends TypesonValue[], T>(
fn: (...args: Args) => Promise<T>,
opt: FsMemoCacheOptions<T>
): (...args: Args) => Promise<T> { ): (...args: Args) => Promise<T> {
// TODO if cb.toString() is long we should hash it // TODO if cb.toString() is long we should hash it
const fixedKey = cb.toString(); const fixedKey = fn.toString();
return async function cachedCb(...args: Args) { return async function cachedCb(...args: Args) {
// Construct the complete cache key for this function invocation // Construct the complete cache key for this function invocation
// TODO stringify is limited. For now we uses typescript to guard the args. // typeson.stringify is still limited. For now we uses typescript to guard the args.
const cacheKey = `${fixedKey}|${JSON.stringify(args)}`; const cacheKey = `${fixedKey}|${typeson.stringifySync(args)}`;
const cacheName = cb.name || cacheKey; const cacheName = fn.name || cacheKey;
return fsMemoCache.apply( const { temporaryBypass, incrementTtlWhenHit } = opt;
cacheKey,
cb, if (temporaryBypass) {
{ return fn(...args);
cacheName, }
...opt,
ttl: TTL const cached = fsMemoCache.get(cacheKey);
} as CacheApplyOption<T, string> if (cached == null) {
); console.log(picocolors.yellow('[cache] miss'), picocolors.gray(cacheName || cacheKey));
const serializer = 'serializer' in opt ? opt.serializer : identity as any;
const value = await fn(...args);
fsMemoCache.set(cacheKey, serializer(value), TTL);
return value;
}
console.log(picocolors.green('[cache] hit'), picocolors.gray(cacheName || cacheKey));
if (incrementTtlWhenHit) {
fsMemoCache.updateTtl(cacheKey, TTL);
}
const deserializer = 'deserializer' in opt ? opt.deserializer : identity as any;
return deserializer(cached);
}; };
} }

View File

@ -8,9 +8,8 @@ import { PHISHING_DOMAIN_LISTS_EXTRA, PHISHING_HOSTS_EXTRA } from '../constants/
import { loosTldOptWithPrivateDomains } from '../constants/loose-tldts-opt'; import { loosTldOptWithPrivateDomains } from '../constants/loose-tldts-opt';
import picocolors from 'picocolors'; import picocolors from 'picocolors';
import createKeywordFilter from './aho-corasick'; import createKeywordFilter from './aho-corasick';
import { createCacheKey, deserializeArray, fsFetchCache, serializeArray } from './cache-filesystem'; import { createCacheKey, deserializeArray, serializeArray } from './cache-filesystem';
import { fastStringArrayJoin } from './misc'; import { cache } from './fs-memo';
import { stringHash } from './string-hash';
const BLACK_TLD = new Set([ const BLACK_TLD = new Set([
'accountant', 'art', 'autos', 'accountant', 'art', 'autos',
@ -102,6 +101,73 @@ const lowKeywords = createKeywordFilter([
const cacheKey = createCacheKey(__filename); const cacheKey = createCacheKey(__filename);
/**
 * Memoized phishing-domain post-processing.
 *
 * Every entry of `domainArr` is parsed with `tldts`. Entries on a private suffix are
 * skipped silently; entries without a public suffix or without an apex domain are
 * logged and skipped. For each remaining apex domain the function tracks how many
 * entries map to it and an accumulated score:
 *
 * - on first sight: +4 if the TLD is in `BLACK_TLD`, otherwise +2 if the TLD is longer
 *   than 6 characters; plus +0.5 if the apex domain is at least 18 characters long;
 * - per entry with a subdomain (apex not in `WHITELIST_MAIN_DOMAINS`):
 *   + `calcDomainAbuseScore(subdomain, line)`.
 *
 * An apex domain whose score/count pair crosses one of the thresholds below is
 * appended to the result as `'.' + apexDomain`.
 *
 * The input array is NOT mutated: a new array is returned. Mutating the argument would
 * only happen on a cache miss (on a hit the function body never runs), so callers
 * would observe different side effects depending on cache state.
 *
 * @param domainArr Hostnames to analyse.
 * @returns The original entries followed by the flagged `.apex` entries.
 */
const processPhihsingDomains = cache(async function processPhihsingDomains(domainArr: string[]): Promise<string[]> {
  // A Map (rather than a plain object probed with `in`) cannot collide with keys
  // inherited from Object.prototype, and keeps insertion order for the final pass.
  const statsByApex = new Map<string, { count: number, score: number }>();

  for (let i = 0, len = domainArr.length; i < len; i++) {
    const line = domainArr[i];

    const {
      publicSuffix: tld,
      domain: apexDomain,
      subdomain,
      isPrivate
    } = tldts.parse(line, loosTldOptWithPrivateDomains);

    if (isPrivate) {
      continue;
    }

    if (!tld) {
      console.log(picocolors.yellow('[phishing domains] E0001'), 'missing tld', { line, tld });
      continue;
    }
    if (!apexDomain) {
      console.log(picocolors.yellow('[phishing domains] E0002'), 'missing domain', { line, apexDomain });
      continue;
    }

    let stats = statsByApex.get(apexDomain);
    if (stats === undefined) {
      // The TLD / length based part of the score is applied once per apex domain.
      let baseScore = 0;
      if (BLACK_TLD.has(tld)) {
        baseScore += 4;
      } else if (tld.length > 6) {
        baseScore += 2;
      }
      if (apexDomain.length >= 18) {
        baseScore += 0.5;
      }

      stats = { count: 0, score: baseScore };
      statsByApex.set(apexDomain, stats);
    }

    stats.count += 1;

    if (
      subdomain
      && !WHITELIST_MAIN_DOMAINS.has(apexDomain)
    ) {
      stats.score += calcDomainAbuseScore(subdomain, line);
    }
  }

  // Collected separately so the caller's array is left untouched.
  const flagged: string[] = [];

  for (const [apexDomain, { count, score }] of statsByApex) {
    if (
      // !WHITELIST_MAIN_DOMAINS.has(apexDomain)
      score >= 16
      || (score >= 13 && count >= 7)
      || (score >= 5 && count >= 10)
      || (score >= 3 && count >= 16)
    ) {
      flagged.push('.' + apexDomain);
    }
  }

  return domainArr.concat(flagged);
}, {
  serializer: serializeArray,
  deserializer: deserializeArray
});
export function getPhishingDomains(parentSpan: Span) { export function getPhishingDomains(parentSpan: Span) {
return parentSpan.traceChild('get phishing domains').traceAsyncFn(async (span) => { return parentSpan.traceChild('get phishing domains').traceAsyncFn(async (span) => {
const domainArr = await span.traceChildAsync('download/parse/merge phishing domains', async (curSpan) => { const domainArr = await span.traceChildAsync('download/parse/merge phishing domains', async (curSpan) => {
@ -115,90 +181,13 @@ export function getPhishingDomains(parentSpan: Span) {
return domainArr; return domainArr;
}); });
const cacheHash = span.traceChildSync('get hash', () => stringHash(fastStringArrayJoin(domainArr, '|')));
return span.traceChildAsync( return span.traceChildAsync(
'process phishing domain set', 'process phishing domain set',
() => processPhihsingDomains(domainArr, cacheHash) () => processPhihsingDomains(domainArr)
); );
}); });
} }
/**
 * Scores the given hostnames and appends the apex domains that look abusive,
 * caching the result through `fsFetchCache.apply`.
 *
 * The cache key is built from `cacheHash`; the entry is requested with a 2 day TTL,
 * `incrementTtlWhenHit` enabled, and array (de)serialization.
 *
 * Entries on a private suffix are skipped silently; entries without a public suffix
 * or apex domain are logged and skipped. Per apex domain the function tracks the
 * number of entries and a score (+4 for a `BLACK_TLD` TLD, else +2 for a TLD longer
 * than 6 characters, +0.5 for an apex domain of 18+ characters, plus
 * `calcDomainAbuseScore` for every entry with a subdomain whose apex is not in
 * `WHITELIST_MAIN_DOMAINS`). Apex domains crossing one of the score/count thresholds
 * are appended as `'.' + apexDomain`.
 *
 * The input array is NOT mutated: the producer returns a new array. Mutation would
 * only occur when the producer actually runs, so callers would see different side
 * effects depending on whether the cache was used.
 *
 * @param domainArr Hostnames to analyse.
 * @param cacheHash Caller-provided hash of the input, used as part of the cache key.
 * @returns The original entries followed by the flagged `.apex` entries.
 */
async function processPhihsingDomains(domainArr: string[], cacheHash = '') {
  return fsFetchCache.apply(
    cacheKey('processPhihsingDomains|' + cacheHash),
    () => {
      // A Map (rather than a plain object probed with `in`) cannot collide with keys
      // inherited from Object.prototype, and keeps insertion order for the final pass.
      const statsByApex = new Map<string, { count: number, score: number }>();

      for (let i = 0, len = domainArr.length; i < len; i++) {
        const line = domainArr[i];

        const {
          publicSuffix: tld,
          domain: apexDomain,
          subdomain,
          isPrivate
        } = tldts.parse(line, loosTldOptWithPrivateDomains);

        if (isPrivate) {
          continue;
        }

        if (!tld) {
          console.log(picocolors.yellow('[phishing domains] E0001'), 'missing tld', { line, tld });
          continue;
        }
        if (!apexDomain) {
          console.log(picocolors.yellow('[phishing domains] E0002'), 'missing domain', { line, apexDomain });
          continue;
        }

        let stats = statsByApex.get(apexDomain);
        if (stats === undefined) {
          // The TLD / length based part of the score is applied once per apex domain.
          let baseScore = 0;
          if (BLACK_TLD.has(tld)) {
            baseScore += 4;
          } else if (tld.length > 6) {
            baseScore += 2;
          }
          if (apexDomain.length >= 18) {
            baseScore += 0.5;
          }

          stats = { count: 0, score: baseScore };
          statsByApex.set(apexDomain, stats);
        }

        stats.count += 1;

        if (
          subdomain
          && !WHITELIST_MAIN_DOMAINS.has(apexDomain)
        ) {
          stats.score += calcDomainAbuseScore(subdomain, line);
        }
      }

      // Collected separately so the caller's array is left untouched.
      const flagged: string[] = [];

      for (const [apexDomain, { count, score }] of statsByApex) {
        if (
          // !WHITELIST_MAIN_DOMAINS.has(apexDomain)
          score >= 16
          || (score >= 13 && count >= 7)
          || (score >= 5 && count >= 10)
          || (score >= 3 && count >= 16)
        ) {
          flagged.push('.' + apexDomain);
        }
      }

      return Promise.resolve(domainArr.concat(flagged));
    },
    {
      ttl: 2 * 86400 * 1000,
      serializer: serializeArray,
      deserializer: deserializeArray,
      incrementTtlWhenHit: true
    }
  );
}
export function calcDomainAbuseScore(subdomain: string, fullDomain: string = subdomain) { export function calcDomainAbuseScore(subdomain: string, fullDomain: string = subdomain) {
let weight = 0; let weight = 0;