Update Reject Hosts / Add mirror support for Hosts Source

This commit is contained in:
SukkaW 2024-01-22 12:09:08 +08:00
parent 41b2f543f8
commit af8cce4f45
7 changed files with 88 additions and 57 deletions

View File

@ -91,34 +91,36 @@ const processFile = (span: Span, sourcePath: string) => {
}); });
}; };
async function transformDomainset(parentSpan: Span, sourcePath: string, relativePath: string) { function transformDomainset(parentSpan: Span, sourcePath: string, relativePath: string) {
const span = parentSpan.traceChild(`transform domainset: ${path.basename(sourcePath, path.extname(sourcePath))}`); return parentSpan
.traceChild(`transform domainset: ${path.basename(sourcePath, path.extname(sourcePath))}`)
.traceAsyncFn(async (span) => {
const res = await processFile(span, sourcePath);
if (!res) return;
const res = await processFile(span, sourcePath); const [title, descriptions, lines] = res;
if (!res) return;
const [title, descriptions, lines] = res; const deduped = domainDeduper(lines);
const description = [
...SHARED_DESCRIPTION,
...(
descriptions.length
? ['', ...descriptions]
: []
)
];
const deduped = domainDeduper(lines); return createRuleset(
const description = [ span,
...SHARED_DESCRIPTION, title,
...( description,
descriptions.length new Date(),
? ['', ...descriptions] deduped,
: [] 'domainset',
) path.resolve(outputSurgeDir, relativePath),
]; path.resolve(outputClashDir, `${relativePath.slice(0, -path.extname(relativePath).length)}.txt`)
);
return span.traceAsyncFn(() => createRuleset( });
span,
title,
description,
new Date(),
deduped,
'domainset',
path.resolve(outputSurgeDir, relativePath),
path.resolve(outputClashDir, `${relativePath.slice(0, -path.extname(relativePath).length)}.txt`)
));
} }
/** /**

View File

@ -27,14 +27,15 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
const domainSets = new Set<string>(); const domainSets = new Set<string>();
let shouldStop = false;
// Parse from AdGuard Filters // Parse from AdGuard Filters
await span const shouldStop = await span
.traceChild('download and process hosts / adblock filter rules') .traceChild('download and process hosts / adblock filter rules')
.traceAsyncFn(async (childSpan) => { .traceAsyncFn(async (childSpan) => {
// eslint-disable-next-line sukka/no-single-return -- not single return
let shouldStop = false;
await Promise.all([ await Promise.all([
// Parse from remote hosts & domain lists // Parse from remote hosts & domain lists
...HOSTS.map(entry => processHosts(childSpan, entry[0], entry[1], entry[2]).then(hosts => SetHelpers.add(domainSets, hosts))), ...HOSTS.map(entry => processHosts(childSpan, entry[0], entry[1], entry[2], entry[3]).then(hosts => SetHelpers.add(domainSets, hosts))),
...DOMAIN_LISTS.map(entry => processDomainLists(childSpan, entry[0], entry[1], entry[2]).then(hosts => SetHelpers.add(domainSets, hosts))), ...DOMAIN_LISTS.map(entry => processDomainLists(childSpan, entry[0], entry[1], entry[2]).then(hosts => SetHelpers.add(domainSets, hosts))),
@ -44,6 +45,7 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
: processFilterRules(childSpan, input[0], input[1], input[2]) : processFilterRules(childSpan, input[0], input[1], input[2])
).then(({ white, black, foundDebugDomain }) => { ).then(({ white, black, foundDebugDomain }) => {
if (foundDebugDomain) { if (foundDebugDomain) {
// eslint-disable-next-line sukka/no-single-return -- not single return
shouldStop = true; shouldStop = true;
// we should not break here, as we want to see full matches from all data source // we should not break here, as we want to see full matches from all data source
} }
@ -65,7 +67,7 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
setAddFromArray(domainSets, await readFileIntoProcessedArray(path.resolve(import.meta.dir, '../Source/domainset/reject_sukka.conf'))); setAddFromArray(domainSets, await readFileIntoProcessedArray(path.resolve(import.meta.dir, '../Source/domainset/reject_sukka.conf')));
}) })
]); ]);
// eslint-disable-next-line sukka/no-single-return -- not single return
return shouldStop; return shouldStop;
}); });

View File

@ -44,25 +44,25 @@ export function processDomainLists(span: Span, domainListsUrl: string, includeAl
} }
)); ));
} }
export function processHosts(span: Span, hostsUrl: string, includeAllSubDomain = false, ttl: number | null = null) { export function processHosts(span: Span, hostsUrl: string, mirrors: string[] | null, includeAllSubDomain = false, ttl: number | null = null) {
return span.traceChild(`processhosts: ${hostsUrl}`).traceAsyncFn(() => fsCache.apply( return span.traceChild(`processhosts: ${hostsUrl}`).traceAsyncFn((childSpan) => fsCache.apply(
hostsUrl, hostsUrl,
async () => { async () => {
const domainSets = new Set<string>(); const domainSets = new Set<string>();
for await (const l of await fetchRemoteTextByLine(hostsUrl)) { const lineCb = (l: string) => {
const line = processLine(l); const line = processLine(l);
if (!line) { if (!line) {
continue; return;
} }
const _domain = line.split(/\s/)[1]?.trim(); const _domain = line.split(/\s/)[1]?.trim();
if (!_domain) { if (!_domain) {
continue; return;
} }
const domain = normalizeDomain(_domain); const domain = normalizeDomain(_domain);
if (!domain) { if (!domain) {
continue; return;
} }
if (DEBUG_DOMAIN_TO_FIND && domain.includes(DEBUG_DOMAIN_TO_FIND)) { if (DEBUG_DOMAIN_TO_FIND && domain.includes(DEBUG_DOMAIN_TO_FIND)) {
console.warn(picocolors.red(hostsUrl), '(black)', domain.replaceAll(DEBUG_DOMAIN_TO_FIND, picocolors.bold(DEBUG_DOMAIN_TO_FIND))); console.warn(picocolors.red(hostsUrl), '(black)', domain.replaceAll(DEBUG_DOMAIN_TO_FIND, picocolors.bold(DEBUG_DOMAIN_TO_FIND)));
@ -70,6 +70,25 @@ export function processHosts(span: Span, hostsUrl: string, includeAllSubDomain =
} }
domainSets.add(includeAllSubDomain ? `.${domain}` : domain); domainSets.add(includeAllSubDomain ? `.${domain}` : domain);
};
if (mirrors == null || mirrors.length === 0) {
for await (const l of await fetchRemoteTextByLine(hostsUrl)) {
lineCb(l);
}
} else {
// Avoid event loop starvation: yield once before we start fetching.
// NOTE: `await Promise.resolve()` only defers by a microtask tick, not a macrotask.
await Promise.resolve();
const filterRules = await childSpan.traceChild('download hosts').traceAsyncFn(() => {
return fetchAssets(hostsUrl, mirrors).then(text => text.split('\n'));
});
childSpan.traceChild('parse hosts').traceSyncFn(() => {
for (let i = 0, len = filterRules.length; i < len; i++) {
lineCb(filterRules[i]);
}
});
} }
console.log(picocolors.gray('[process hosts]'), picocolors.gray(hostsUrl), picocolors.gray(domainSets.size)); console.log(picocolors.gray('[process hosts]'), picocolors.gray(hostsUrl), picocolors.gray(domainSets.size));

View File

@ -1,21 +1,34 @@
import { TTL } from './cache-filesystem'; import { TTL } from './cache-filesystem';
export const HOSTS = [ type HostsSource = [main: string, mirrors: string[] | null, includeAllSubDomain: boolean, ttl: number];
['https://pgl.yoyo.org/adservers/serverlist.php?hostformat=hosts&showintro=0&mimetype=plaintext', true, TTL.THREE_HOURS()],
['https://someonewhocares.org/hosts/hosts', true, TTL.THREE_HOURS()], export const HOSTS: HostsSource[] = [
[
'https://pgl.yoyo.org/adservers/serverlist.php?hostformat=hosts&showintro=0&mimetype=plaintext',
['https://raw.githubusercontent.com/uBlockOrigin/uAssets/master/thirdparties/pgl.yoyo.org/as/serverlist'],
true,
TTL.THREE_HOURS()
],
['https://someonewhocares.org/hosts/hosts', null, true, TTL.THREE_HOURS()],
// no coin list is not actively maintained, but it updates daily when being maintained, so we set a 3 days cache ttl // no coin list is not actively maintained, but it updates daily when being maintained, so we set a 3 days cache ttl
['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt', true, TTL.THREE_DAYS()], ['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt', null, true, TTL.THREE_DAYS()],
// have not been updated for more than a year, so we set a 14 days cache ttl // have not been updated for more than a year, so we set a 14 days cache ttl
['https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hosts/spy.txt', true, TTL.TWO_WEEKS()], ['https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hosts/spy.txt', null, true, TTL.TWO_WEEKS()],
['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Xiaomi-Extension.txt', false, TTL.THREE_DAYS()], ['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Xiaomi-Extension.txt', null, false, TTL.THREE_DAYS()],
['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Huawei-AdBlock.txt', false, TTL.THREE_DAYS()], ['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Huawei-AdBlock.txt', null, false, TTL.THREE_DAYS()],
// ad-wars is not actively maintained, so we set a 7 days cache ttl // ad-wars is not actively maintained, so we set a 7 days cache ttl
['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', false, TTL.ONE_WEEK()], ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', null, false, TTL.ONE_WEEK()],
['https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', true, TTL.THREE_HOURS()], ['https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', null, true, TTL.THREE_HOURS()],
// Curben's UrlHaus Malicious URL Blocklist // Curben's UrlHaus Malicious URL Blocklist
// 'https://curbengh.github.io/urlhaus-filter/urlhaus-filter-agh-online.txt', [
// 'https://urlhaus-filter.pages.dev/urlhaus-filter-agh-online.txt', 'https://curbengh.github.io/urlhaus-filter/urlhaus-filter-hosts.txt',
['https://curbengh.github.io/urlhaus-filter/urlhaus-filter-hosts.txt', true, TTL.THREE_HOURS()] [
'https://urlhaus-filter.pages.dev/urlhaus-filter-hosts.txt',
'https://malware-filter.gitlab.io/urlhaus-filter/urlhaus-filter-hosts.txt'
],
true,
TTL.THREE_HOURS()
]
// Curben's Phishing URL Blocklist // Curben's Phishing URL Blocklist
// Covered by lib/get-phishing-domains.ts // Covered by lib/get-phishing-domains.ts
// 'https://curbengh.github.io/phishing-filter/phishing-filter-agh.txt' // 'https://curbengh.github.io/phishing-filter/phishing-filter-agh.txt'

View File

@ -9,7 +9,7 @@ export function traceSync<T>(prefix: string, fn: () => T, timeFormatter: Formatt
console.log(`${timeFormatter(`[${((end - start) / 1e6).toFixed(3)}ms]`)} ${prefix}`); console.log(`${timeFormatter(`[${((end - start) / 1e6).toFixed(3)}ms]`)} ${prefix}`);
return result; return result;
} }
traceSync.skip = <T>(_prefix: string, fn: () => T): T => fn(); // traceSync.skip = <T>(_prefix: string, fn: () => T): T => fn();
export const traceAsync = async <T>(prefix: string, fn: () => Promise<T>, timeFormatter: Formatter = picocolors.blue): Promise<T> => { export const traceAsync = async <T>(prefix: string, fn: () => Promise<T>, timeFormatter: Formatter = picocolors.blue): Promise<T> => {
const start = Bun.nanoseconds(); const start = Bun.nanoseconds();
@ -18,9 +18,3 @@ export const traceAsync = async <T>(prefix: string, fn: () => Promise<T>, timeFo
console.log(`${timeFormatter(`[${((end - start) / 1e6).toFixed(3)}ms]`)} ${prefix}`); console.log(`${timeFormatter(`[${((end - start) / 1e6).toFixed(3)}ms]`)} ${prefix}`);
return result; return result;
}; };
export interface TaskResult {
readonly start: number,
readonly end: number,
readonly taskName: string
}

View File

@ -52,7 +52,7 @@ export const createSpan = (name: string, parentTraceResult?: TraceResult): Span
const stop = (time?: number) => { const stop = (time?: number) => {
if (status === SPAN_STATUS_END) { if (status === SPAN_STATUS_END) {
throw new Error('span already stopped'); throw new Error(`span already stopped: ${name}`);
} }
const end = time ?? Bun.nanoseconds(); const end = time ?? Bun.nanoseconds();

View File

@ -108,9 +108,7 @@ DOMAIN-KEYWORD,adjust.
DOMAIN-KEYWORD,appsflyer DOMAIN-KEYWORD,appsflyer
DOMAIN-KEYWORD,dnserror DOMAIN-KEYWORD,dnserror
DOMAIN-KEYWORD,marketing.net DOMAIN-KEYWORD,marketing.net
AND,((DOMAIN-KEYWORD,f-log), (DOMAIN-SUFFIX,grammarly.io))
DOMAIN,stun.smartgslb.com DOMAIN,stun.smartgslb.com
AND,((DOMAIN-SUFFIX,prod.hosts.ooklaserver.net), (DOMAIN-KEYWORD,.ad.))
DOMAIN-KEYWORD,_vmind.qqvideo.tc.qq.com DOMAIN-KEYWORD,_vmind.qqvideo.tc.qq.com
DOMAIN-KEYWORD,-logging.nextmedia.com DOMAIN-KEYWORD,-logging.nextmedia.com
@ -120,7 +118,10 @@ DOMAIN-KEYWORD,.engage.3m.
# -telemetry.officeapps.live.com # -telemetry.officeapps.live.com
DOMAIN-KEYWORD,telemetry.officeapps.live.com DOMAIN-KEYWORD,telemetry.officeapps.live.com
DOMAIN-KEYWORD,-launches.appsflyersdk.com DOMAIN-KEYWORD,-launches.appsflyersdk.com
DOMAIN-KEYWORD,-s2s.sensic.net
AND,((DOMAIN-KEYWORD,f-log), (DOMAIN-SUFFIX,grammarly.io))
AND,((DOMAIN-SUFFIX,prod.hosts.ooklaserver.net), (DOMAIN-KEYWORD,.ad.))
AND,((DOMAIN-KEYWORD,genuine), (DOMAIN-KEYWORD,autodesk)) AND,((DOMAIN-KEYWORD,genuine), (DOMAIN-KEYWORD,autodesk))
# Important: Force add the following domains without whitelisting # Important: Force add the following domains without whitelisting