From 2d706f47752cb7f87f2b18a7912a2f29aafbca10 Mon Sep 17 00:00:00 2001 From: SukkaW Date: Sun, 27 Apr 2025 23:33:56 +0800 Subject: [PATCH] Chore: new util run against source file --- Build/lib/run-against-source-file.ts | 41 +++++++++++++++++ Build/tools-lum-apex-domains.ts | 27 ++++-------- Build/tools-migrate-domains.ts | 29 +++--------- Build/validate-domain-alive.ts | 66 +++++++++++----------------- Build/validate-domestic.ts | 32 ++++++-------- Build/validate-gfwlist.ts | 51 ++++++--------------- Build/validate-global-tld.ts | 30 ++++--------- Build/validate-reject-stats.ts | 33 +++++++------- 8 files changed, 132 insertions(+), 177 deletions(-) create mode 100644 Build/lib/run-against-source-file.ts diff --git a/Build/lib/run-against-source-file.ts b/Build/lib/run-against-source-file.ts new file mode 100644 index 00000000..6d5b78dc --- /dev/null +++ b/Build/lib/run-against-source-file.ts @@ -0,0 +1,41 @@ +import { never } from 'foxts/guard'; +import { readFileByLine } from './fetch-text-by-line'; +import { processLine } from './process-line'; + +export default async function runAgainstSourceFile( + filePath: string, + callback: (domain: string, includeAllSubDomain: boolean) => void, + type?: 'ruleset' | 'domainset' +) { + for await (const line of readFileByLine(filePath)) { + const l = processLine(line); + if (!l) { + continue; + } + if (type == null) { + if (l.includes(',')) { + type = 'ruleset'; + } else { + type = 'domainset'; + } + } + + if (type === 'ruleset') { + const [ruleType, domain] = l.split(',', 3); + if (ruleType === 'DOMAIN') { + callback(domain, false); + } else if (ruleType === 'DOMAIN-SUFFIX') { + callback(domain, true); + } + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition -- exhaus options + } else if (type === 'domainset') { + if (l[0] === '.') { + callback(l.slice(1), true); + } else { + callback(l, false); + } + } else { + never(type); + } + } +} diff --git a/Build/tools-lum-apex-domains.ts b/Build/tools-lum-apex-domains.ts index 1894b1c5..921711ab 100644 --- a/Build/tools-lum-apex-domains.ts +++ b/Build/tools-lum-apex-domains.ts @@ -1,9 +1,9 @@ -import { fetchRemoteTextByLine, readFileByLine } from './lib/fetch-text-by-line'; +import { fetchRemoteTextByLine } from './lib/fetch-text-by-line'; import tldts from 'tldts'; import { HostnameSmolTrie } from './lib/trie'; import path from 'node:path'; import { SOURCE_DIR } from './constants/dir'; -import { processLine } from './lib/process-line'; +import runAgainstSourceFile from './lib/run-against-source-file'; (async () => { const lines1 = await Array.fromAsync(await fetchRemoteTextByLine('https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', true)); @@ -31,23 +31,12 @@ import { processLine } from './lib/process-line'; }); } - for await (const line of readFileByLine(path.join(SOURCE_DIR, 'domainset', 'reject.conf'))) { - const l = processLine(line); - if (l) { - trie.whitelist(l); - } - } - for await (const line of readFileByLine(path.join(SOURCE_DIR, 'non_ip', 'reject.conf'))) { - const l = processLine(line); - if (l) { - const [type, domain] = l.split(',', 3); - if (type === 'DOMAIN') { - trie.whitelist(domain, false); - } else if (type === 'DOMAIN-SUFFIX') { - trie.whitelist(domain, true); - } - } - } + await runAgainstSourceFile(path.join(SOURCE_DIR, 'domainset', 'reject.conf'), (domain, includeAllSubDomain) => { + trie.whitelist(domain, includeAllSubDomain); + }, 'domainset'); + await runAgainstSourceFile(path.join(SOURCE_DIR, 'non_ip', 'reject.conf'), (domain, includeAllSubDomain) => { + trie.whitelist(domain, includeAllSubDomain); + }, 'ruleset'); console.log(trie.dump().map(i => '.' + i).join('\n')); })(); diff --git a/Build/tools-migrate-domains.ts b/Build/tools-migrate-domains.ts index e43ba1e0..00d02792 100644 --- a/Build/tools-migrate-domains.ts +++ b/Build/tools-migrate-domains.ts @@ -1,20 +1,23 @@ import path from 'node:path'; -import { readFileByLine } from './lib/fetch-text-by-line'; import { processFilterRulesWithPreload } from './lib/parse-filter/filters'; import { processHosts } from './lib/parse-filter/hosts'; -import { processLine } from './lib/process-line'; import { HostnameSmolTrie } from './lib/trie'; import { dummySpan } from './trace'; import { SOURCE_DIR } from './constants/dir'; import { PREDEFINED_WHITELIST } from './constants/reject-data-source'; +import runAgainstSourceFile from './lib/run-against-source-file'; (async () => { const trie = new HostnameSmolTrie(); await writeHostsToTrie(trie, 'https://cdn.jsdelivr.net/gh/jerryn70/GoodbyeAds@master/Extension/GoodbyeAds-Xiaomi-Extension.txt', true); - await runWhiteOnSource(path.join(SOURCE_DIR, 'domainset', 'reject.conf'), trie); - await runWhiteOnSource(path.join(SOURCE_DIR, 'non_ip', 'reject.conf'), trie); + const callback = (domain: string, includeAllSubDomain: boolean) => { + trie.whitelist(domain, includeAllSubDomain); + }; + + await runAgainstSourceFile(path.join(SOURCE_DIR, 'domainset', 'reject.conf'), callback, 'domainset'); + await runAgainstSourceFile(path.join(SOURCE_DIR, 'non_ip', 'reject.conf'), callback, 'ruleset'); for (let i = 0, len = PREDEFINED_WHITELIST.length; i < len; i++) { trie.whitelist(PREDEFINED_WHITELIST[i]); @@ -25,24 +28,6 @@ import { PREDEFINED_WHITELIST } from './constants/reject-data-source'; console.log('---------------------------'); })(); -async function runWhiteOnSource(sourceFile: string, trie: HostnameSmolTrie) { - for await (const line of readFileByLine(sourceFile)) { - const l = processLine(line); - if (l) { - if (l.includes(',')) { - const [type, domain] = l.split(',', 3); - if (type === 'DOMAIN') { - trie.whitelist(domain, false); - } else if (type === 'DOMAIN-SUFFIX') { - trie.whitelist(domain, true); - } - } else { - trie.whitelist(l); - } - } - } -} - async function writeHostsToTrie(trie: HostnameSmolTrie, hostsUrl: string, includeAllSubDomain = false) { const hosts = await processHosts(dummySpan, hostsUrl, [], includeAllSubDomain); diff --git a/Build/validate-domain-alive.ts b/Build/validate-domain-alive.ts index 008c72fc..5d98f2aa 100644 --- a/Build/validate-domain-alive.ts +++ b/Build/validate-domain-alive.ts @@ -1,11 +1,9 @@ -import { readFileByLine } from './lib/fetch-text-by-line'; -import { processLine } from './lib/process-line'; - import { SOURCE_DIR } from './constants/dir'; import path from 'node:path'; import { newQueue } from '@henrygd/queue'; import { isDomainAlive, keyedAsyncMutexWithQueue } from './lib/is-domain-alive'; import { fdir as Fdir } from 'fdir'; +import runAgainstSourceFile from './lib/run-against-source-file'; const queue = newQueue(24); @@ -19,10 +17,20 @@ function onDomain(args: [string, boolean]) { (async () => { const domainSets = await new Fdir() .withFullPaths() + .filter((filePath, isDirectory) => { + if (isDirectory) return false; + const extname = path.extname(filePath); + return extname === '.txt' || extname === '.conf'; + }) .crawl(SOURCE_DIR + path.sep + 'domainset') .withPromise(); const domainRules = await new Fdir() .withFullPaths() + .filter((filePath, isDirectory) => { + if (isDirectory) return false; + const extname = path.extname(filePath); + return extname === '.txt' || extname === '.conf'; + }) .crawl(SOURCE_DIR + path.sep + 'non_ip') .withPromise(); @@ -37,53 +45,29 @@ function onDomain(args: [string, boolean]) { })(); export async function runAgainstRuleset(filepath: string) { - const extname = path.extname(filepath); - if (extname !== '.conf') { - console.log('[skip]', filepath); - return; - } - const promises: Array> = []; - - for await (const l of readFileByLine(filepath)) { - const line = processLine(l); - if (!line) continue; - const [type, domain] = line.split(','); - switch (type) { - case 'DOMAIN-SUFFIX': - case 'DOMAIN': { - promises.push( - queue.add(() => keyedAsyncMutexWithQueue(domain, () => isDomainAlive(domain, type === 'DOMAIN-SUFFIX'))) - .then(onDomain) - ); - break; - } - // no default - } - } + await runAgainstSourceFile( + filepath, + (domain: string, includeAllSubdomain: boolean) => queue.add(() => keyedAsyncMutexWithQueue( + domain, + () => isDomainAlive(domain, includeAllSubdomain) + ).then(onDomain)) + ); await Promise.all(promises); console.log('[done]', filepath); } export async function runAgainstDomainset(filepath: string) { - const extname = path.extname(filepath); - if (extname !== '.conf') { - console.log('[skip]', filepath); - return; - } - const promises: Array> = []; - for await (const l of readFileByLine(filepath)) { - const line = processLine(l); - if (!line) continue; - promises.push( - queue.add(() => keyedAsyncMutexWithQueue(line, () => isDomainAlive(line, line[0] === '.'))) - .then(onDomain) - ); - } - + await runAgainstSourceFile( + filepath, + (domain: string, includeAllSubdomain: boolean) => queue.add(() => keyedAsyncMutexWithQueue( + domain, + () => isDomainAlive(domain, includeAllSubdomain) + ).then(onDomain)) + ); await Promise.all(promises); console.log('[done]', filepath); } diff --git a/Build/validate-domestic.ts b/Build/validate-domestic.ts index 2e5c0a95..b2247cd9 100644 --- a/Build/validate-domestic.ts +++ b/Build/validate-domestic.ts @@ -1,11 +1,10 @@ -import { readFileByLine } from './lib/fetch-text-by-line'; import { parse } from 'csv-parse/sync'; import { HostnameSmolTrie } from './lib/trie'; import path from 'node:path'; -import { processLine } from './lib/process-line'; import { SOURCE_DIR } from './constants/dir'; import { parseFelixDnsmasqFromResp } from './lib/parse-dnsmasq'; import { $$fetch } from './lib/fetch-retry'; +import runAgainstSourceFile from './lib/run-against-source-file'; export async function parseDomesticList() { const trie = new HostnameSmolTrie(await parseFelixDnsmasqFromResp(await $$fetch('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf'))); @@ -36,27 +35,24 @@ export async function parseDomesticList() { const notIncludedDomestic = new Set(top5000); - const runAgainstRuleset = async (ruleset: string) => { - for await (const l of readFileByLine(ruleset)) { - const line = processLine(l); - if (!line) continue; - const [type, domain] = line.split(','); - if (type === 'DOMAIN-SUFFIX') { + // await Promise.all([ + await runAgainstSourceFile( + path.resolve(SOURCE_DIR, 'non_ip/domestic.conf'), + (domain, includeAllSubdomain) => { + if (includeAllSubdomain) { if (top5000.has(domain)) { notIncludedDomestic.delete(domain); } - } else if (type === 'DOMAIN-KEYWORD') { - for (const d of top5000) { - if (d.includes(domain)) { - notIncludedDomestic.delete(d); - } - } + } else { + // noop, DOMAIN-KEYWORD handing + // for (const d of top5000) { + // if (d.includes(domain)) { + // notIncludedDomestic.delete(d); + // } + // } } } - }; - - // await Promise.all([ - await runAgainstRuleset(path.resolve(SOURCE_DIR, 'non_ip/domestic.conf')); + ); // ]); console.log(notIncludedDomestic.size, notIncludedDomestic); diff --git a/Build/validate-gfwlist.ts b/Build/validate-gfwlist.ts index 1bf0c1a3..3fc280f3 100644 --- a/Build/validate-gfwlist.ts +++ b/Build/validate-gfwlist.ts @@ -3,11 +3,12 @@ import { fastNormalizeDomain } from './lib/normalize-domain'; import { HostnameSmolTrie } from './lib/trie'; // import { Readable } from 'stream'; import { parse } from 'csv-parse/sync'; -import { fetchRemoteTextByLine, readFileByLine } from './lib/fetch-text-by-line'; +import { fetchRemoteTextByLine } from './lib/fetch-text-by-line'; import path from 'node:path'; import { OUTPUT_SURGE_DIR } from './constants/dir'; import { createRetrieKeywordFilter as createKeywordFilter } from 'foxts/retrie'; import { $$fetch } from './lib/fetch-retry'; +import runAgainstSourceFile from './lib/run-against-source-file'; export async function parseGfwList() { const whiteSet = new Set(); @@ -77,46 +78,20 @@ export async function parseGfwList() { const keywordSet = new Set(); - const runAgainstRuleset = async (ruleset: string) => { - for await (const l of readFileByLine(ruleset)) { - const line = processLine(l); - if (!line) continue; - const [type, domain] = line.split(','); - switch (type) { - case 'DOMAIN-SUFFIX': { - trie.whitelist('.' + domain); - break; - } - case 'DOMAIN': { - trie.whitelist(domain); - break; - } - case 'DOMAIN-KEYWORD': { - keywordSet.add(domain); - break; - } - // no default - } - } + const callback = (domain: string, includeAllSubdomain: boolean) => { + trie.whitelist(domain, includeAllSubdomain); }; - const runAgainstDomainset = async (ruleset: string) => { - for await (const l of readFileByLine(ruleset)) { - const line = processLine(l); - if (!line) continue; - trie.whitelist(line); - } - }; await Promise.all([ - runAgainstRuleset(path.join(OUTPUT_SURGE_DIR, 'non_ip/global.conf')), - runAgainstRuleset(path.join(OUTPUT_SURGE_DIR, 'non_ip/reject.conf')), - runAgainstRuleset(path.join(OUTPUT_SURGE_DIR, 'non_ip/telegram.conf')), - runAgainstRuleset(path.resolve(OUTPUT_SURGE_DIR, 'non_ip/stream.conf')), - runAgainstRuleset(path.resolve(OUTPUT_SURGE_DIR, 'non_ip/ai.conf')), - runAgainstRuleset(path.resolve(OUTPUT_SURGE_DIR, 'non_ip/microsoft.conf')), - runAgainstDomainset(path.resolve(OUTPUT_SURGE_DIR, 'domainset/reject.conf')), - runAgainstDomainset(path.resolve(OUTPUT_SURGE_DIR, 'domainset/reject_extra.conf')), - runAgainstDomainset(path.resolve(OUTPUT_SURGE_DIR, 'domainset/cdn.conf')) + runAgainstSourceFile(path.join(OUTPUT_SURGE_DIR, 'non_ip/global.conf'), callback, 'ruleset'), + runAgainstSourceFile(path.join(OUTPUT_SURGE_DIR, 'non_ip/reject.conf'), callback, 'ruleset'), + runAgainstSourceFile(path.join(OUTPUT_SURGE_DIR, 'non_ip/telegram.conf'), callback, 'ruleset'), + runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'non_ip/stream.conf'), callback, 'ruleset'), + runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'non_ip/ai.conf'), callback, 'ruleset'), + runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'non_ip/microsoft.conf'), callback, 'ruleset'), + runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'domainset/reject.conf'), callback, 'domainset'), + runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'domainset/reject_extra.conf'), callback, 'domainset'), + runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'domainset/cdn.conf'), callback, 'domainset') ]); whiteSet.forEach(domain => trie.whitelist(domain)); diff --git a/Build/validate-global-tld.ts b/Build/validate-global-tld.ts index c32a9bbc..b7d07ae0 100644 --- a/Build/validate-global-tld.ts +++ b/Build/validate-global-tld.ts @@ -1,42 +1,28 @@ import path from 'node:path'; -import { readFileByLine } from './lib/fetch-text-by-line'; import { HostnameSmolTrie } from './lib/trie'; -import { OUTPUT_SURGE_DIR, SOURCE_DIR } from './constants/dir'; +import { OUTPUT_SURGE_DIR } from './constants/dir'; import { ICP_TLD } from './constants/domains'; import tldts from 'tldts-experimental'; import { looseTldtsOpt } from './constants/loose-tldts-opt'; +import runAgainstSourceFile from './lib/run-against-source-file'; (async () => { const trie = new HostnameSmolTrie(); const extraWhiteTLDs = new Set(); - for await (const line of readFileByLine(path.join(OUTPUT_SURGE_DIR, 'non_ip', 'domestic.conf'))) { - const [type, domain] = line.split(','); - if (type !== 'DOMAIN' && type !== 'DOMAIN-SUFFIX') { - continue; - } + await runAgainstSourceFile(path.join(OUTPUT_SURGE_DIR, 'non_ip', 'domestic.conf'), (domain) => { if (domain === 'this_ruleset_is_made_by_sukkaw.ruleset.skk.moe') { - continue; + return; } const tld = tldts.getPublicSuffix(domain, looseTldtsOpt); if (tld) { extraWhiteTLDs.add(tld); } - } + }, 'ruleset'); - for await (const line of readFileByLine(path.join(SOURCE_DIR, 'non_ip', 'global.conf'))) { - const [type, domain] = line.split(','); - switch (type) { - case 'DOMAIN': - trie.add(domain); - break; - case 'DOMAIN-SUFFIX': - trie.add(domain, true); - break; - default: - break; - } - } + await runAgainstSourceFile(path.join(OUTPUT_SURGE_DIR, 'non_ip', 'global.conf'), (domain, includeAllSubDomain) => { + trie.add(domain, includeAllSubDomain); + }, 'ruleset'); ICP_TLD.forEach(tld => trie.whitelist(tld, true)); extraWhiteTLDs.forEach(tld => trie.whitelist(tld, true)); diff --git a/Build/validate-reject-stats.ts b/Build/validate-reject-stats.ts index 51542e4e..8dd5951b 100644 --- a/Build/validate-reject-stats.ts +++ b/Build/validate-reject-stats.ts @@ -1,9 +1,8 @@ import path from 'node:path'; -import { readFileByLine } from './lib/fetch-text-by-line'; import { OUTPUT_SURGE_DIR } from './constants/dir'; -import { processLine } from './lib/process-line'; import tldts from 'tldts'; import { loosTldOptWithPrivateDomains } from './constants/loose-tldts-opt'; +import runAgainstSourceFile from './lib/run-against-source-file'; (async () => { const rejectDomainCountMap = await runAgainstDomainset(new Map(), path.join(OUTPUT_SURGE_DIR, 'domainset', 'reject.conf')); @@ -17,22 +16,22 @@ import { loosTldOptWithPrivateDomains } from './constants/loose-tldts-opt'; })(); async function runAgainstDomainset(rejectDomainCountMap: Map, file: string) { - for await (const line of readFileByLine(file)) { - if (!processLine(line)) { - continue; - } - const apexDomain = tldts.getDomain(line, loosTldOptWithPrivateDomains); - if (!apexDomain) { - continue; - } + await runAgainstSourceFile( + file, + (domain: string) => { + const apexDomain = tldts.getDomain(domain, loosTldOptWithPrivateDomains); + if (!apexDomain) { + return; + } - rejectDomainCountMap.set( - apexDomain, - rejectDomainCountMap.has(apexDomain) - ? rejectDomainCountMap.get(apexDomain)! + 1 - : 1 - ); - } + rejectDomainCountMap.set( + apexDomain, + rejectDomainCountMap.has(apexDomain) + ? rejectDomainCountMap.get(apexDomain)! + 1 + : 1 + ); + } + ); return rejectDomainCountMap; }