Minor changes to fs memo implementation / Adapt fs memo

This commit is contained in:
SukkaW 2024-10-18 01:16:12 +08:00
parent 24b928dc32
commit a8c9cc5ac5
3 changed files with 130 additions and 148 deletions

View File

@ -28,7 +28,6 @@ export interface CacheOptions<S = string> {
/**
 * Base options that control a single cached call.
 *
 * The field notes below describe how `Cache.apply` reads these values from
 * its `opt` argument (typed `CacheApplyOption`, which presumably builds on
 * this interface — confirm against the full file).
 */
interface CacheApplyRawOption {
// Time-to-live handed to `set` / `updateTtl`. When `null` or `undefined`,
// `apply` deletes the existing entry and runs the function uncached.
ttl?: number | null,
// Label printed in the "[cache] hit" / "[cache] miss" log lines; the raw key is used when omitted.
cacheName?: string,
// When truthy, `apply` returns `fn()` directly without reading or writing the cache.
temporaryBypass?: boolean,
// When truthy, a cache hit also calls `updateTtl(key, ttl)` for that entry.
incrementTtlWhenHit?: boolean
}
@ -187,45 +186,6 @@ export class Cache<S = string> {
this.db.prepare(`DELETE FROM ${this.tableName} WHERE key = ?`).run(key);
}
/**
 * Run `fn` through the cache stored under `key`.
 *
 * - `opt.temporaryBypass` truthy: call `fn` and return its result; the cache
 *   is neither read nor written.
 * - `opt.ttl` is `null`/`undefined`: delete any entry stored under `key`,
 *   then call `fn` without storing the result.
 * - Hit: optionally refresh the entry's TTL (`opt.incrementTtlWhenHit`) and
 *   return the stored value passed through `opt.deserializer` (or `identity`
 *   when the option object has no `deserializer` property).
 * - Miss: call `fn`, store the result passed through `opt.serializer` (or
 *   `identity`) with the given TTL, and return the original result.
 *
 * @param key cache key
 * @param fn producer invoked when the cache cannot answer
 * @param opt TTL, logging label, bypass flag and optional (de)serializers
 * @returns the value produced by `fn` or restored from the cache
 */
async apply<T>(
  key: string,
  fn: () => Promise<T>,
  opt: CacheApplyOption<T, S>
): Promise<T> {
  const {
    ttl: timeToLive,
    temporaryBypass: skipCache,
    incrementTtlWhenHit: refreshOnHit,
    cacheName: label
  } = opt;

  // Explicit bypass: behave as if there were no cache at all.
  if (skipCache) {
    return fn();
  }

  // No TTL means "do not cache": drop whatever is stored and compute fresh.
  if (timeToLive == null) {
    this.del(key);
    return fn();
  }

  const stored = this.get(key);

  if (stored != null) {
    console.log(picocolors.green('[cache] hit'), picocolors.gray(label || key));

    if (refreshOnHit) {
      this.updateTtl(key, timeToLive);
    }

    const decode = 'deserializer' in opt ? opt.deserializer : identity as any;
    return decode(stored);
  }

  console.log(picocolors.yellow('[cache] miss'), picocolors.gray(label || key), picocolors.gray(`ttl: ${TTL.humanReadable(timeToLive)}`));

  // Resolve the serializer before invoking `fn`, matching the original evaluation order.
  const encode = 'serializer' in opt ? opt.serializer : identity as any;
  const fresh = await fn();
  this.set(key, encode(fresh), timeToLive);
  return fresh;
}
async applyWithHttp304<T>(
url: string,
extraCacheKey: string,

View File

@ -3,7 +3,17 @@ import { Cache } from './cache-filesystem';
import type { CacheApplyOption } from './cache-filesystem';
import { isCI } from 'ci-info';
const fsMemoCache = new Cache({ cachePath: path.resolve(__dirname, '../../.cache') });
import { Typeson, set, map, typedArrays } from 'typeson-registry';
import picocolors from 'picocolors';
import { identity } from './misc';
// Typeson instance with the typed-array, Set and Map type specs registered.
// In this file it is used by `cache` to stringify the memoized function's
// call arguments when building the cache key.
const typeson = new Typeson().register([
typedArrays,
set,
map
]);
const fsMemoCache = new Cache({ cachePath: path.resolve(__dirname, '../../.cache'), tableName: 'fs_memo_cache' });
const TTL = isCI
// We run CI daily, so 1.5 days TTL is enough to persist the cache across runs
@ -11,41 +21,64 @@ const TTL = isCI
// We run locally less frequently, so we need to persist the cache for longer, 7 days
: 7 * 86400 * 1000;
// Value types allowed as arguments of a function wrapped by `cache`
// (see the `Args extends …[]` constraint on `cache`).
// NOTE(review): both the JSON* and the Typeson* declarations appear here with
// their lines interleaved; this looks like the old and new sides of a diff
// rendered without +/- markers — confirm which set is current before editing.
// The Typeson* variant additionally admits `Set` and `Map` members.
type JSONValue =
type TypesonValue =
| string
| number
| boolean
| null
| JSONObject
| JSONArray;
| Set<any>
| Map<any, any>
| TypesonObject
| TypesonArray;
// Plain object whose property values are themselves allowed values.
interface JSONObject {
[key: string]: JSONValue
interface TypesonObject {
[key: string]: TypesonValue
}
// Array whose elements are themselves allowed values.
interface JSONArray extends Array<JSONValue> {}
interface TypesonArray extends Array<TypesonValue> { }
/**
 * Wrap an async function so that its results are memoized in `fsMemoCache`.
 *
 * The returned function builds a cache key from the wrapped function's source
 * text (`toString()`) plus the stringified call arguments, then either returns
 * the stored value or calls the wrapped function and stores its result with
 * the module-level `TTL`.
 *
 * NOTE(review): this span contains two interleaved variants of the function —
 * one using `cb` / `JSON.stringify` / `fsMemoCache.apply`, the other using
 * `fn` / `typeson.stringifySync` and inline get/set calls. It looks like the
 * old and new sides of a diff rendered without +/- markers; confirm which
 * lines are current before changing any code here.
 */
export function cache<Args extends JSONValue[], T>(
cb: (...args: Args) => Promise<T>,
opt: Omit<CacheApplyOption<T, string>, 'ttl'>
// Options accepted by `cache`: `CacheApplyOption` with `ttl` constrained to be
// absent/undefined, since the TTL always comes from the module-level `TTL`.
export type FsMemoCacheOptions<T> = CacheApplyOption<T, string> & {
ttl?: undefined | never
};
export function cache<Args extends TypesonValue[], T>(
fn: (...args: Args) => Promise<T>,
opt: FsMemoCacheOptions<T>
): (...args: Args) => Promise<T> {
// TODO if cb.toString() is long we should hash it
// The function's full source text is the invocation-independent part of the key.
const fixedKey = cb.toString();
const fixedKey = fn.toString();
return async function cachedCb(...args: Args) {
// Construct the complete cache key for this function invocation
// TODO stringify is limited. For now we uses typescript to guard the args.
const cacheKey = `${fixedKey}|${JSON.stringify(args)}`;
const cacheName = cb.name || cacheKey;
// typeson.stringify is still limited. For now we uses typescript to guard the args.
const cacheKey = `${fixedKey}|${typeson.stringifySync(args)}`;
// Label for log output: the function's name, or the whole key for anonymous functions.
const cacheName = fn.name || cacheKey;
return fsMemoCache.apply(
cacheKey,
cb,
{
cacheName,
...opt,
ttl: TTL
} as CacheApplyOption<T, string>
);
const { temporaryBypass, incrementTtlWhenHit } = opt;
// Bypass: call through without reading or writing the cache.
if (temporaryBypass) {
return fn(...args);
}
const cached = fsMemoCache.get(cacheKey);
// Miss: compute, store the serialized value with the module TTL, return the raw value.
if (cached == null) {
console.log(picocolors.yellow('[cache] miss'), picocolors.gray(cacheName || cacheKey));
// Falls back to `identity` only when `opt` has no `serializer` property at all.
const serializer = 'serializer' in opt ? opt.serializer : identity as any;
const value = await fn(...args);
fsMemoCache.set(cacheKey, serializer(value), TTL);
return value;
}
// Hit: optionally refresh the entry's TTL, then return the deserialized stored value.
console.log(picocolors.green('[cache] hit'), picocolors.gray(cacheName || cacheKey));
if (incrementTtlWhenHit) {
fsMemoCache.updateTtl(cacheKey, TTL);
}
// Falls back to `identity` only when `opt` has no `deserializer` property at all.
const deserializer = 'deserializer' in opt ? opt.deserializer : identity as any;
return deserializer(cached);
};
}

View File

@ -8,9 +8,8 @@ import { PHISHING_DOMAIN_LISTS_EXTRA, PHISHING_HOSTS_EXTRA } from '../constants/
import { loosTldOptWithPrivateDomains } from '../constants/loose-tldts-opt';
import picocolors from 'picocolors';
import createKeywordFilter from './aho-corasick';
import { createCacheKey, deserializeArray, fsFetchCache, serializeArray } from './cache-filesystem';
import { fastStringArrayJoin } from './misc';
import { stringHash } from './string-hash';
import { createCacheKey, deserializeArray, serializeArray } from './cache-filesystem';
import { cache } from './fs-memo';
const BLACK_TLD = new Set([
'accountant', 'art', 'autos',
@ -102,32 +101,7 @@ const lowKeywords = createKeywordFilter([
const cacheKey = createCacheKey(__filename);
/**
 * Collect the phishing domain candidates from every configured source and
 * hand them to `processPhihsingDomains`, together with a hash of the merged
 * list.
 *
 * Work is traced under a 'get phishing domains' child span of `parentSpan`.
 * The domain lists are fetched and merged first; the hosts lists are only
 * started after that, and their results are appended after the domain lists.
 *
 * @param parentSpan span under which the child spans are created
 * @returns whatever the traced async function yields (the result of `processPhihsingDomains`)
 */
export function getPhishingDomains(parentSpan: Span) {
  const rootSpan = parentSpan.traceChild('get phishing domains');

  return rootSpan.traceAsyncFn(async (span) => {
    const domainArr = await span.traceChildAsync('download/parse/merge phishing domains', async (curSpan) => {
      const merged: string[] = [];

      const fromDomainLists = await Promise.all(
        PHISHING_DOMAIN_LISTS_EXTRA.map(entry => processDomainLists(curSpan, ...entry, cacheKey))
      );
      fromDomainLists.forEach(appendArrayInPlaceCurried(merged));

      const fromHosts = await Promise.all(
        PHISHING_HOSTS_EXTRA.map(entry => processHosts(curSpan, ...entry, cacheKey))
      );
      fromHosts.forEach(appendArrayInPlaceCurried(merged));

      return merged;
    });

    // Hash of the '|'-joined domain list, forwarded to the processing step.
    const cacheHash = span.traceChildSync('get hash', () => {
      const joined = fastStringArrayJoin(domainArr, '|');
      return stringHash(joined);
    });

    return span.traceChildAsync(
      'process phishing domain set',
      () => processPhihsingDomains(domainArr, cacheHash)
    );
  });
}
async function processPhihsingDomains(domainArr: string[], cacheHash = '') {
return fsFetchCache.apply(
cacheKey('processPhihsingDomains|' + cacheHash),
() => {
const processPhihsingDomains = cache(function processPhihsingDomains(domainArr: string[]): Promise<string[]> {
const domainCountMap: Record<string, number> = {};
const domainScoreMap: Record<string, number> = {};
@ -189,14 +163,29 @@ async function processPhihsingDomains(domainArr: string[], cacheHash = '') {
}
return Promise.resolve(domainArr);
},
{
ttl: 2 * 86400 * 1000,
}, {
serializer: serializeArray,
deserializer: deserializeArray,
incrementTtlWhenHit: true
}
deserializer: deserializeArray
});
/**
 * Collect the phishing domain candidates from every configured source and
 * pass the merged list to `processPhihsingDomains`.
 *
 * Work is traced under a 'get phishing domains' child span of `parentSpan`.
 * The domain lists are fetched and merged first; the hosts lists are only
 * started after that, and their results are appended after the domain lists.
 *
 * @param parentSpan span under which the child spans are created
 * @returns whatever the traced async function yields (the result of `processPhihsingDomains`)
 */
export function getPhishingDomains(parentSpan: Span) {
  const rootSpan = parentSpan.traceChild('get phishing domains');

  return rootSpan.traceAsyncFn(async (span) => {
    const domainArr = await span.traceChildAsync('download/parse/merge phishing domains', async (curSpan) => {
      const merged: string[] = [];

      const fromDomainLists = await Promise.all(
        PHISHING_DOMAIN_LISTS_EXTRA.map(entry => processDomainLists(curSpan, ...entry, cacheKey))
      );
      fromDomainLists.forEach(appendArrayInPlaceCurried(merged));

      const fromHosts = await Promise.all(
        PHISHING_HOSTS_EXTRA.map(entry => processHosts(curSpan, ...entry, cacheKey))
      );
      fromHosts.forEach(appendArrayInPlaceCurried(merged));

      return merged;
    });

    return span.traceChildAsync(
      'process phishing domain set',
      () => processPhihsingDomains(domainArr)
    );
  });
}
export function calcDomainAbuseScore(subdomain: string, fullDomain: string = subdomain) {