Refactor: separate modules

This commit is contained in:
SukkaW 2025-01-11 23:16:39 +08:00
parent eca2949062
commit 29410eb1c3
7 changed files with 175 additions and 177 deletions

View File

@ -2,7 +2,9 @@
import path from 'node:path';
import process from 'node:process';
import { processHosts, processFilterRules, processDomainLists } from './lib/parse-filter';
import { processHosts } from './lib/parse-filter/hosts';
import { processDomainLists } from './lib/parse-filter/domainlists';
import { processFilterRules } from './lib/parse-filter/filters';
import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS, HOSTS_EXTRA, DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_EXTRA, PHISHING_DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_WHITELIST } from './constants/reject-data-source';
import { compareAndWriteFile } from './lib/create-file';
@ -18,6 +20,7 @@ import { addArrayElementsToSet } from 'foxts/add-array-elements-to-set';
import { appendArrayInPlace } from './lib/append-array-in-place';
import { OUTPUT_INTERNAL_DIR, SOURCE_DIR } from './constants/dir';
import { DomainsetOutput } from './lib/create-file';
import { foundDebugDomain } from './lib/parse-filter/shared';
const readLocalRejectDomainsetPromise = readFileIntoProcessedArray(path.join(SOURCE_DIR, 'domainset/reject_sukka.conf'));
const readLocalRejectExtraDomainsetPromise = readFileIntoProcessedArray(path.join(SOURCE_DIR, 'domainset/reject_sukka_extra.conf'));
@ -63,65 +66,49 @@ export const buildRejectDomainSet = task(require.main === module, __filename)(as
const filterRuleWhitelistDomainSets = new Set(PREDEFINED_WHITELIST);
// Parse from AdGuard Filters
const shouldStop = await span
await span
.traceChild('download and process hosts / adblock filter rules')
.traceAsyncFn(async (childSpan) => {
// eslint-disable-next-line sukka/no-single-return -- not single return
let shouldStop = false;
await Promise.all([
// Parse from remote hosts & domain lists
HOSTS.map(entry => processHosts(childSpan, ...entry).then(appendArrayToRejectOutput)),
HOSTS_EXTRA.map(entry => processHosts(childSpan, ...entry).then(appendArrayToRejectExtraOutput)),
.traceAsyncFn((childSpan) => Promise.all([
// Parse from remote hosts & domain lists
HOSTS.map(entry => processHosts(childSpan, ...entry).then(appendArrayToRejectOutput)),
HOSTS_EXTRA.map(entry => processHosts(childSpan, ...entry).then(appendArrayToRejectExtraOutput)),
DOMAIN_LISTS.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToRejectOutput)),
DOMAIN_LISTS_EXTRA.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToRejectExtraOutput)),
DOMAIN_LISTS.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToRejectOutput)),
DOMAIN_LISTS_EXTRA.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToRejectExtraOutput)),
ADGUARD_FILTERS.map(
entry => processFilterRules(childSpan, ...entry)
.then(({ white, black, foundDebugDomain }) => {
if (foundDebugDomain) {
// eslint-disable-next-line sukka/no-single-return -- not single return
shouldStop = true;
// we should not break here, as we want to see full matches from all data source
}
addArrayElementsToSet(filterRuleWhitelistDomainSets, white);
appendArrayToRejectOutput(black);
})
),
ADGUARD_FILTERS_EXTRA.map(
entry => processFilterRules(childSpan, ...entry)
.then(({ white, black, foundDebugDomain }) => {
if (foundDebugDomain) {
// eslint-disable-next-line sukka/no-single-return -- not single return
shouldStop = true;
// we should not break here, as we want to see full matches from all data source
}
addArrayElementsToSet(filterRuleWhitelistDomainSets, white);
appendArrayToRejectExtraOutput(black);
})
),
ADGUARD_FILTERS_WHITELIST.map(entry => processFilterRules(childSpan, ...entry).then(({ white, black }) => {
addArrayElementsToSet(filterRuleWhitelistDomainSets, white);
addArrayElementsToSet(filterRuleWhitelistDomainSets, black);
})),
getPhishingDomains(childSpan).then(appendArrayToRejectExtraOutput),
readLocalRejectDomainsetPromise.then(appendArrayToRejectOutput),
readLocalRejectDomainsetPromise.then(appendArrayToRejectExtraOutput),
readLocalRejectExtraDomainsetPromise.then(appendArrayToRejectExtraOutput),
// Dedupe domainSets
// span.traceChildAsync('collect black keywords/suffixes', async () =>
/**
ADGUARD_FILTERS.map(
entry => processFilterRules(childSpan, ...entry)
.then(({ white, black }) => {
addArrayElementsToSet(filterRuleWhitelistDomainSets, white);
appendArrayToRejectOutput(black);
})
),
ADGUARD_FILTERS_EXTRA.map(
entry => processFilterRules(childSpan, ...entry)
.then(({ white, black }) => {
addArrayElementsToSet(filterRuleWhitelistDomainSets, white);
appendArrayToRejectExtraOutput(black);
})
),
ADGUARD_FILTERS_WHITELIST.map(entry => processFilterRules(childSpan, ...entry).then(({ white, black }) => {
addArrayElementsToSet(filterRuleWhitelistDomainSets, white);
addArrayElementsToSet(filterRuleWhitelistDomainSets, black);
})),
getPhishingDomains(childSpan).then(appendArrayToRejectExtraOutput),
readLocalRejectDomainsetPromise.then(appendArrayToRejectOutput),
readLocalRejectDomainsetPromise.then(appendArrayToRejectExtraOutput),
readLocalRejectExtraDomainsetPromise.then(appendArrayToRejectExtraOutput),
// Dedupe domainSets
// span.traceChildAsync('collect black keywords/suffixes', async () =>
/**
* Collect DOMAIN, DOMAIN-SUFFIX, and DOMAIN-KEYWORD from non_ip/reject.conf for deduplication
* DOMAIN-WILDCARD is not really useful for deduplication, it is only included in AdGuardHome output
*/
rejectOutput.addFromRuleset(readLocalRejectRulesetPromise),
rejectExtraOutput.addFromRuleset(readLocalRejectRulesetPromise)
].flat());
// eslint-disable-next-line sukka/no-single-return -- not single return
return shouldStop;
});
rejectOutput.addFromRuleset(readLocalRejectRulesetPromise),
rejectExtraOutput.addFromRuleset(readLocalRejectRulesetPromise)
].flat()));
if (shouldStop) {
if (foundDebugDomain.value) {
process.exit(1);
}

View File

@ -1,4 +1,6 @@
import { processDomainLists, processHosts } from './parse-filter';
import { processHosts } from './parse-filter/hosts';
import { processDomainLists } from './parse-filter/domainlists';
import * as tldts from 'tldts-experimental';
import { dummySpan, printTraceResult } from '../trace';

View File

@ -1,7 +1,7 @@
import { describe, it } from 'mocha';
import { parse, processFilterRules } from './parse-filter';
import type { ParseType } from './parse-filter';
import { parse, processFilterRules } from './parse-filter/filters';
import type { ParseType } from './parse-filter/filters';
import { createCacheKey } from './cache-filesystem';
import { createSpan } from '../trace';
@ -20,8 +20,7 @@ describe.skip('processFilterRules', () => {
console.log(processFilterRules(
createSpan('noop'),
cacheKey('https://filters.adtidy.org/extension/ublock/filters/18_optimized.txt'),
[],
7_200_000
[]
));
});
});

View File

@ -0,0 +1,51 @@
import picocolors from 'picocolors';
import { normalizeDomain } from '../normalize-domain';
import { processLine } from '../process-line';
import { onBlackFound } from './shared';
import { fetchAssetsWithout304 } from '../fetch-assets';
import type { Span } from '../../trace';
/**
 * Handles a single raw line of a plain domain list.
 *
 * The line is cleaned with `processLine`, lower-cased and passed through
 * `normalizeDomain`. It is only accepted when normalization leaves it unchanged;
 * otherwise the mismatch is logged and the line is dropped.
 *
 * @param l raw line taken from the downloaded list
 * @param set output array, mutated in place
 * @param includeAllSubDomain when true the entry is pushed with a leading `.`
 * @param meta label used in log output and passed to the debug hook (callers pass the list URL)
 */
function domainListLineCb(l: string, set: string[], includeAllSubDomain: boolean, meta: string) {
  const processed = processLine(l);
  if (!processed) return;

  const lowered = processed.toLowerCase();
  const normalized = normalizeDomain(lowered);
  if (!normalized) return;

  if (normalized === lowered) {
    onBlackFound(normalized, meta);
    set.push(includeAllSubDomain ? `.${lowered}` : lowered);
    return;
  }

  // Normalization changed the text, so the line was not a bare domain: report and skip it.
  console.log(
    picocolors.red('[process domain list]'),
    picocolors.gray(`line: ${lowered}`),
    picocolors.gray(`domain: ${normalized}`),
    picocolors.gray(meta)
  );
}
/**
 * Downloads a plain-text domain list and turns each line into a domainset entry.
 *
 * The work is traced under a child span named `process domainlist: <url>`, with
 * nested `download` and `parse domain list` spans (the same layout `processHosts`
 * uses), so the two phases can be told apart in trace output.
 *
 * @param span parent trace span
 * @param domainListsUrl primary URL of the list; also used as the log/debug label
 * @param mirrors fallback URLs handed to `fetchAssetsWithout304`, or `null`
 * @param includeAllSubDomain when true every entry is emitted with a leading `.`
 * @returns whatever `traceChildAsync` yields for the callback, which produces the collected entries
 */
export function processDomainLists(
  span: Span,
  domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false
) {
  return span.traceChildAsync(`process domainlist: ${domainListsUrl}`, async (childSpan) => {
    // The download span previously reused the parent's label, which made the
    // parent and child indistinguishable in traces.
    const text = await childSpan.traceChildAsync('download', () => fetchAssetsWithout304(
      domainListsUrl,
      mirrors
    ));
    const domainSets: string[] = [];
    const filterRules = text.split('\n');

    childSpan.traceChildSync('parse domain list', () => {
      for (let i = 0, len = filterRules.length; i < len; i++) {
        domainListLineCb(filterRules[i], domainSets, includeAllSubDomain, domainListsUrl);
      }
    });

    return domainSets;
  });
}

View File

@ -1,121 +1,12 @@
import { NetworkFilter } from '@ghostery/adblocker';
import { processLine } from './process-line';
import tldts from 'tldts-experimental';
import picocolors from 'picocolors';
import { normalizeDomain } from './normalize-domain';
import type { Span } from '../trace';
import type { Span } from '../../trace';
import { fetchAssetsWithout304 } from '../fetch-assets';
import { onBlackFound, onWhiteFound } from './shared';
import { createRetrieKeywordFilter as createKeywordFilter } from 'foxts/retrie';
import { looseTldtsOpt } from '../constants/loose-tldts-opt';
import { DEBUG_DOMAIN_TO_FIND } from '../constants/reject-data-source';
import { noop } from 'foxts/noop';
import { fetchAssetsWithout304 } from './fetch-assets';
// Pre-refactor debug state: set to true once any processed line contains DEBUG_DOMAIN_TO_FIND.
let foundDebugDomain = false;
// Debug hook for blacklist hits. When DEBUG_DOMAIN_TO_FIND is falsy the hook is `noop`;
// otherwise it warns with the source label (`meta`), highlights the match in bold and
// sets the flag above.
const onBlackFound = DEBUG_DOMAIN_TO_FIND
? (line: string, meta: string) => {
if (line.includes(DEBUG_DOMAIN_TO_FIND!)) {
console.warn(picocolors.red(meta), '(black)', line.replaceAll(DEBUG_DOMAIN_TO_FIND!, picocolors.bold(DEBUG_DOMAIN_TO_FIND)));
foundDebugDomain = true;
}
}
: noop;
// Same hook for whitelist hits; only the '(white)' tag in the message differs.
const onWhiteFound = DEBUG_DOMAIN_TO_FIND
? (line: string, meta: string) => {
if (line.includes(DEBUG_DOMAIN_TO_FIND!)) {
console.warn(picocolors.red(meta), '(white)', line.replaceAll(DEBUG_DOMAIN_TO_FIND!, picocolors.bold(DEBUG_DOMAIN_TO_FIND)));
foundDebugDomain = true;
}
}
: noop;
/**
 * Pre-refactor copy: handles one raw line of a plain domain list.
 * Pushes the lower-cased line into `set` (with a leading `.` when
 * `includeAllSubDomain` is true) if `normalizeDomain` returns it unchanged.
 */
function domainListLineCb(l: string, set: string[], includeAllSubDomain: boolean, meta: string) {
let line = processLine(l);
// Nothing usable left after processLine.
if (!line) return;
line = line.toLowerCase();
const domain = normalizeDomain(line);
if (!domain) return;
// Normalization altered the text: log the mismatch and skip the line.
if (domain !== line) {
console.log(
picocolors.red('[process domain list]'),
picocolors.gray(`line: ${line}`),
picocolors.gray(`domain: ${domain}`),
picocolors.gray(meta)
);
return;
}
// Notify the debug hook before recording the entry.
onBlackFound(domain, meta);
set.push(includeAllSubDomain ? `.${line}` : line);
}
/**
 * Pre-refactor copy: fetches a domain list via `fetchAssetsWithout304` and
 * feeds every line to `domainListLineCb`, collecting the accepted entries.
 */
export function processDomainLists(
span: Span,
domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false
) {
return span.traceChildAsync(`process domainlist: ${domainListsUrl}`, async (span) => {
// NOTE(review): this inner child span reuses exactly the same label as the outer one.
const text = await span.traceChildAsync(`process domainlist: ${domainListsUrl}`, () => fetchAssetsWithout304(
domainListsUrl,
mirrors
));
const domainSets: string[] = [];
const filterRules = text.split('\n');
// Parsing runs synchronously inside its own child span.
span.traceChildSync('parse domain list', () => {
for (let i = 0, len = filterRules.length; i < len; i++) {
domainListLineCb(filterRules[i], domainSets, includeAllSubDomain, domainListsUrl);
}
});
return domainSets;
});
}
/**
 * Pre-refactor copy: handles one raw line of a hosts file.
 * Takes the second whitespace-separated token, normalizes it and pushes it
 * into `set` (with a leading `.` when `includeAllSubDomain` is true).
 */
function hostsLineCb(l: string, set: string[], includeAllSubDomain: boolean, meta: string) {
const line = processLine(l);
if (!line) {
return;
}
// NOTE(review): splitting on a single whitespace character means two consecutive
// separators produce an empty second token, so such a line is skipped below —
// confirm this is intended.
const _domain = line.split(/\s/)[1]?.trim();
if (!_domain) {
return;
}
const domain = normalizeDomain(_domain);
if (!domain) {
return;
}
// Notify the debug hook before recording the entry.
onBlackFound(domain, meta);
set.push(includeAllSubDomain ? `.${domain}` : domain);
}
/**
 * Pre-refactor copy: fetches a hosts file via `fetchAssetsWithout304` and
 * feeds every line to `hostsLineCb`, collecting the accepted entries.
 */
export function processHosts(
span: Span,
hostsUrl: string, mirrors: string[] | null, includeAllSubDomain = false
) {
return span.traceChildAsync(`process hosts: ${hostsUrl}`, async (span) => {
// Download and parsing are traced as separate child spans.
const text = await span.traceChild('download').traceAsyncFn(() => fetchAssetsWithout304(hostsUrl, mirrors));
const domainSets: string[] = [];
const filterRules = text.split('\n');
span.traceChild('parse hosts').traceSyncFn(() => {
for (let i = 0, len = filterRules.length; i < len; i++) {
hostsLineCb(filterRules[i], domainSets, includeAllSubDomain, hostsUrl);
}
});
return domainSets;
});
}
import { normalizeDomain } from '../normalize-domain';
import { looseTldtsOpt } from '../../constants/loose-tldts-opt';
import tldts from 'tldts-experimental';
import { NetworkFilter } from '@ghostery/adblocker';
const enum ParseType {
WhiteIncludeSubdomain = 0,
@ -134,7 +25,7 @@ export async function processFilterRules(
filterRulesUrl: string,
fallbackUrls?: string[] | null,
allowThirdParty = false
): Promise<{ white: string[], black: string[], foundDebugDomain: boolean }> {
): Promise<{ white: string[], black: string[] }> {
const [white, black, warningMessages] = await parentSpan.traceChild(`process filter rules: ${filterRulesUrl}`).traceAsyncFn(async (span) => {
const text = await fetchAssetsWithout304(filterRulesUrl, fallbackUrls);
@ -226,8 +117,7 @@ export async function processFilterRules(
return {
white,
black,
foundDebugDomain
black
};
}

View File

@ -0,0 +1,46 @@
import type { Span } from '../../trace';
import { fetchAssetsWithout304 } from '../fetch-assets';
import { normalizeDomain } from '../normalize-domain';
import { processLine } from '../process-line';
import { onBlackFound } from './shared';
/**
 * Handles a single raw line of a hosts file (`<address> <hostname> ...`).
 *
 * The second whitespace-separated field is taken as the hostname, normalized
 * with `normalizeDomain`, reported to the debug hook and appended to `set`.
 *
 * @param l raw line taken from the downloaded hosts file
 * @param set output array, mutated in place
 * @param includeAllSubDomain when true the entry is pushed with a leading `.`
 * @param meta label passed to the debug hook (callers pass the hosts URL)
 */
function hostsLineCb(l: string, set: string[], includeAllSubDomain: boolean, meta: string) {
  const line = processLine(l);
  if (!line) {
    return;
  }

  // Split on whitespace *runs*. Hosts files are often column-aligned with several
  // spaces or tab+space between the address and the hostname; splitting on a single
  // whitespace character makes the second field an empty string for those lines,
  // which silently dropped the entry.
  const _domain = line.trim().split(/\s+/)[1];
  if (!_domain) {
    return;
  }

  const domain = normalizeDomain(_domain);
  if (!domain) {
    return;
  }

  onBlackFound(domain, meta);
  set.push(includeAllSubDomain ? `.${domain}` : domain);
}
/**
 * Downloads a hosts file and converts every line into a domainset entry.
 *
 * Tracing layout: a child span `process hosts: <url>` containing a `download`
 * span (the fetch) and a `parse hosts` span (the synchronous line loop).
 *
 * @param span parent trace span
 * @param hostsUrl primary URL of the hosts file; also used as the debug label
 * @param mirrors fallback URLs handed to `fetchAssetsWithout304`, or `null`
 * @param includeAllSubDomain when true every entry is emitted with a leading `.`
 * @returns whatever `traceChildAsync` yields for the callback, which produces the collected entries
 */
export function processHosts(
  span: Span,
  hostsUrl: string, mirrors: string[] | null, includeAllSubDomain = false
) {
  return span.traceChildAsync(`process hosts: ${hostsUrl}`, async (childSpan) => {
    const download = () => fetchAssetsWithout304(hostsUrl, mirrors);
    const body = await childSpan.traceChild('download').traceAsyncFn(download);

    const collected: string[] = [];
    const rows = body.split('\n');

    childSpan.traceChild('parse hosts').traceSyncFn(() => {
      for (const row of rows) {
        hostsLineCb(row, collected, includeAllSubDomain, hostsUrl);
      }
    });

    return collected;
  });
}

View File

@ -0,0 +1,23 @@
import picocolors from 'picocolors';
import { DEBUG_DOMAIN_TO_FIND } from '../../constants/reject-data-source';
import { noop } from 'foxts/noop';
/**
 * Shared debug state: `value` becomes `true` once any reported line contains
 * `DEBUG_DOMAIN_TO_FIND`. It is an object wrapper rather than an exported `let`,
 * so consumers read the current state through `.value`.
 */
export const foundDebugDomain = { value: false };

/**
 * Builds a debug hook that warns whenever a line contains `needle`.
 *
 * Taking the needle as a plain `string` parameter keeps it narrowed inside the
 * closure, so no non-null assertions are needed, and lets the black and white
 * hooks share one implementation.
 *
 * @param needle the debug domain to look for
 * @param kind which list the hook reports for; printed as `(black)` / `(white)`
 */
function createDebugDomainReporter(needle: string, kind: 'black' | 'white') {
  const tag = `(${kind})`;

  return (line: string, meta: string): void => {
    if (!line.includes(needle)) return;

    console.warn(picocolors.red(meta), tag, line.replaceAll(needle, picocolors.bold(needle)));
    foundDebugDomain.value = true;
  };
}

/** Debug hook for blacklist entries; `noop` when no debug domain is configured. */
export const onBlackFound = DEBUG_DOMAIN_TO_FIND
  ? createDebugDomainReporter(DEBUG_DOMAIN_TO_FIND, 'black')
  : noop;

/** Debug hook for whitelist entries; `noop` when no debug domain is configured. */
export const onWhiteFound = DEBUG_DOMAIN_TO_FIND
  ? createDebugDomainReporter(DEBUG_DOMAIN_TO_FIND, 'white')
  : noop;