Chore: new util run against source file
Some checks failed
Build / Build (push) Has been cancelled
Build / Diff output (push) Has been cancelled
Build / Deploy to Cloudflare Pages (3.114.6) (push) Has been cancelled
Build / Deploy to GitHub and GitLab (push) Has been cancelled

This commit is contained in:
SukkaW 2025-04-27 23:33:56 +08:00
parent 505f7544ed
commit 2d706f4775
8 changed files with 132 additions and 177 deletions

View File

@ -0,0 +1,41 @@
import { never } from 'foxts/guard';
import { readFileByLine } from './fetch-text-by-line';
import { processLine } from './process-line';
/**
 * Streams a local source file line by line and reports every domain entry it contains.
 *
 * Each raw line is first passed through `processLine`; lines for which it returns a
 * falsy value are skipped. The remaining lines are interpreted according to the format:
 *
 * - `'ruleset'`: lines look like `RULE-TYPE,value[,...]`. Only `DOMAIN` (reported with
 *   `includeAllSubDomain = false`) and `DOMAIN-SUFFIX` (reported with
 *   `includeAllSubDomain = true`) are forwarded; every other rule type is ignored.
 * - `'domainset'`: each line is a domain. A leading `.` means "include all subdomains"
 *   and is stripped before the callback is invoked.
 *
 * Entries whose domain part is missing or empty (e.g. `DOMAIN-SUFFIX,` or a bare `.`)
 * are skipped instead of being forwarded to the callback.
 *
 * @param filePath - Path of the file to read.
 * @param callback - Invoked synchronously once per domain entry. Its return value is
 *   ignored; in particular, a returned promise is NOT awaited.
 * @param type - Format of the file. When omitted, it is guessed once from the first
 *   meaningful line (a comma means `'ruleset'`, otherwise `'domainset'`) and that guess
 *   is reused for the rest of the file.
 * @returns A promise that resolves once the whole file has been consumed.
 */
export default async function runAgainstSourceFile(
  filePath: string,
  callback: (domain: string, includeAllSubDomain: boolean) => void,
  type?: 'ruleset' | 'domainset'
): Promise<void> {
  // Keep the caller-supplied parameter untouched; the detected format lives in a local.
  let format = type;

  for await (const line of readFileByLine(filePath)) {
    const l = processLine(line);
    if (!l) {
      continue;
    }

    // Auto-detection happens only once, on the first meaningful line.
    if (format == null) {
      format = l.includes(',') ? 'ruleset' : 'domainset';
    }

    switch (format) {
      case 'ruleset': {
        const [ruleType, domain] = l.split(',', 3);
        // A rule without a value carries no domain to report.
        if (!domain) {
          break;
        }
        if (ruleType === 'DOMAIN') {
          callback(domain, false);
        } else if (ruleType === 'DOMAIN-SUFFIX') {
          callback(domain, true);
        }
        break;
      }
      case 'domainset': {
        const includeAllSubDomain = l[0] === '.';
        const domain = includeAllSubDomain ? l.slice(1) : l;
        // A bare "." would otherwise yield an empty domain.
        if (domain) {
          callback(domain, includeAllSubDomain);
        }
        break;
      }
      default:
        // Exhaustiveness check: fails to type-check if a new format is added to the union.
        never(format);
    }
  }
}

View File

@ -1,9 +1,9 @@
import { fetchRemoteTextByLine, readFileByLine } from './lib/fetch-text-by-line'; import { fetchRemoteTextByLine } from './lib/fetch-text-by-line';
import tldts from 'tldts'; import tldts from 'tldts';
import { HostnameSmolTrie } from './lib/trie'; import { HostnameSmolTrie } from './lib/trie';
import path from 'node:path'; import path from 'node:path';
import { SOURCE_DIR } from './constants/dir'; import { SOURCE_DIR } from './constants/dir';
import { processLine } from './lib/process-line'; import runAgainstSourceFile from './lib/run-against-source-file';
(async () => { (async () => {
const lines1 = await Array.fromAsync(await fetchRemoteTextByLine('https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', true)); const lines1 = await Array.fromAsync(await fetchRemoteTextByLine('https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', true));
@ -31,23 +31,12 @@ import { processLine } from './lib/process-line';
}); });
} }
for await (const line of readFileByLine(path.join(SOURCE_DIR, 'domainset', 'reject.conf'))) { await runAgainstSourceFile(path.join(SOURCE_DIR, 'domainset', 'reject.conf'), (domain, includeAllSubDomain) => {
const l = processLine(line); trie.whitelist(domain, includeAllSubDomain);
if (l) { }, 'domainset');
trie.whitelist(l); await runAgainstSourceFile(path.join(SOURCE_DIR, 'non_ip', 'reject.conf'), (domain, includeAllSubDomain) => {
} trie.whitelist(domain, includeAllSubDomain);
} }, 'ruleset');
for await (const line of readFileByLine(path.join(SOURCE_DIR, 'non_ip', 'reject.conf'))) {
const l = processLine(line);
if (l) {
const [type, domain] = l.split(',', 3);
if (type === 'DOMAIN') {
trie.whitelist(domain, false);
} else if (type === 'DOMAIN-SUFFIX') {
trie.whitelist(domain, true);
}
}
}
console.log(trie.dump().map(i => '.' + i).join('\n')); console.log(trie.dump().map(i => '.' + i).join('\n'));
})(); })();

View File

@ -1,20 +1,23 @@
import path from 'node:path'; import path from 'node:path';
import { readFileByLine } from './lib/fetch-text-by-line';
import { processFilterRulesWithPreload } from './lib/parse-filter/filters'; import { processFilterRulesWithPreload } from './lib/parse-filter/filters';
import { processHosts } from './lib/parse-filter/hosts'; import { processHosts } from './lib/parse-filter/hosts';
import { processLine } from './lib/process-line';
import { HostnameSmolTrie } from './lib/trie'; import { HostnameSmolTrie } from './lib/trie';
import { dummySpan } from './trace'; import { dummySpan } from './trace';
import { SOURCE_DIR } from './constants/dir'; import { SOURCE_DIR } from './constants/dir';
import { PREDEFINED_WHITELIST } from './constants/reject-data-source'; import { PREDEFINED_WHITELIST } from './constants/reject-data-source';
import runAgainstSourceFile from './lib/run-against-source-file';
(async () => { (async () => {
const trie = new HostnameSmolTrie(); const trie = new HostnameSmolTrie();
await writeHostsToTrie(trie, 'https://cdn.jsdelivr.net/gh/jerryn70/GoodbyeAds@master/Extension/GoodbyeAds-Xiaomi-Extension.txt', true); await writeHostsToTrie(trie, 'https://cdn.jsdelivr.net/gh/jerryn70/GoodbyeAds@master/Extension/GoodbyeAds-Xiaomi-Extension.txt', true);
await runWhiteOnSource(path.join(SOURCE_DIR, 'domainset', 'reject.conf'), trie); const callback = (domain: string, includeAllSubDomain: boolean) => {
await runWhiteOnSource(path.join(SOURCE_DIR, 'non_ip', 'reject.conf'), trie); trie.whitelist(domain, includeAllSubDomain);
};
await runAgainstSourceFile(path.join(SOURCE_DIR, 'domainset', 'reject.conf'), callback, 'domainset');
await runAgainstSourceFile(path.join(SOURCE_DIR, 'non_ip', 'reject.conf'), callback, 'ruleset');
for (let i = 0, len = PREDEFINED_WHITELIST.length; i < len; i++) { for (let i = 0, len = PREDEFINED_WHITELIST.length; i < len; i++) {
trie.whitelist(PREDEFINED_WHITELIST[i]); trie.whitelist(PREDEFINED_WHITELIST[i]);
@ -25,24 +28,6 @@ import { PREDEFINED_WHITELIST } from './constants/reject-data-source';
console.log('---------------------------'); console.log('---------------------------');
})(); })();
async function runWhiteOnSource(sourceFile: string, trie: HostnameSmolTrie) {
for await (const line of readFileByLine(sourceFile)) {
const l = processLine(line);
if (l) {
if (l.includes(',')) {
const [type, domain] = l.split(',', 3);
if (type === 'DOMAIN') {
trie.whitelist(domain, false);
} else if (type === 'DOMAIN-SUFFIX') {
trie.whitelist(domain, true);
}
} else {
trie.whitelist(l);
}
}
}
}
async function writeHostsToTrie(trie: HostnameSmolTrie, hostsUrl: string, includeAllSubDomain = false) { async function writeHostsToTrie(trie: HostnameSmolTrie, hostsUrl: string, includeAllSubDomain = false) {
const hosts = await processHosts(dummySpan, hostsUrl, [], includeAllSubDomain); const hosts = await processHosts(dummySpan, hostsUrl, [], includeAllSubDomain);

View File

@ -1,11 +1,9 @@
import { readFileByLine } from './lib/fetch-text-by-line';
import { processLine } from './lib/process-line';
import { SOURCE_DIR } from './constants/dir'; import { SOURCE_DIR } from './constants/dir';
import path from 'node:path'; import path from 'node:path';
import { newQueue } from '@henrygd/queue'; import { newQueue } from '@henrygd/queue';
import { isDomainAlive, keyedAsyncMutexWithQueue } from './lib/is-domain-alive'; import { isDomainAlive, keyedAsyncMutexWithQueue } from './lib/is-domain-alive';
import { fdir as Fdir } from 'fdir'; import { fdir as Fdir } from 'fdir';
import runAgainstSourceFile from './lib/run-against-source-file';
const queue = newQueue(24); const queue = newQueue(24);
@ -19,10 +17,20 @@ function onDomain(args: [string, boolean]) {
(async () => { (async () => {
const domainSets = await new Fdir() const domainSets = await new Fdir()
.withFullPaths() .withFullPaths()
.filter((filePath, isDirectory) => {
if (isDirectory) return false;
const extname = path.extname(filePath);
return extname === '.txt' || extname === '.conf';
})
.crawl(SOURCE_DIR + path.sep + 'domainset') .crawl(SOURCE_DIR + path.sep + 'domainset')
.withPromise(); .withPromise();
const domainRules = await new Fdir() const domainRules = await new Fdir()
.withFullPaths() .withFullPaths()
.filter((filePath, isDirectory) => {
if (isDirectory) return false;
const extname = path.extname(filePath);
return extname === '.txt' || extname === '.conf';
})
.crawl(SOURCE_DIR + path.sep + 'non_ip') .crawl(SOURCE_DIR + path.sep + 'non_ip')
.withPromise(); .withPromise();
@ -37,53 +45,29 @@ function onDomain(args: [string, boolean]) {
})(); })();
export async function runAgainstRuleset(filepath: string) { export async function runAgainstRuleset(filepath: string) {
const extname = path.extname(filepath);
if (extname !== '.conf') {
console.log('[skip]', filepath);
return;
}
const promises: Array<Promise<void>> = []; const promises: Array<Promise<void>> = [];
await runAgainstSourceFile(
for await (const l of readFileByLine(filepath)) { filepath,
const line = processLine(l); (domain: string, includeAllSubdomain: boolean) => queue.add(() => keyedAsyncMutexWithQueue(
if (!line) continue; domain,
const [type, domain] = line.split(','); () => isDomainAlive(domain, includeAllSubdomain)
switch (type) { ).then(onDomain))
case 'DOMAIN-SUFFIX': );
case 'DOMAIN': {
promises.push(
queue.add(() => keyedAsyncMutexWithQueue(domain, () => isDomainAlive(domain, type === 'DOMAIN-SUFFIX')))
.then(onDomain)
);
break;
}
// no default
}
}
await Promise.all(promises); await Promise.all(promises);
console.log('[done]', filepath); console.log('[done]', filepath);
} }
export async function runAgainstDomainset(filepath: string) { export async function runAgainstDomainset(filepath: string) {
const extname = path.extname(filepath);
if (extname !== '.conf') {
console.log('[skip]', filepath);
return;
}
const promises: Array<Promise<void>> = []; const promises: Array<Promise<void>> = [];
for await (const l of readFileByLine(filepath)) { await runAgainstSourceFile(
const line = processLine(l); filepath,
if (!line) continue; (domain: string, includeAllSubdomain: boolean) => queue.add(() => keyedAsyncMutexWithQueue(
promises.push( domain,
queue.add(() => keyedAsyncMutexWithQueue(line, () => isDomainAlive(line, line[0] === '.'))) () => isDomainAlive(domain, includeAllSubdomain)
.then(onDomain) ).then(onDomain))
); );
}
await Promise.all(promises); await Promise.all(promises);
console.log('[done]', filepath); console.log('[done]', filepath);
} }

View File

@ -1,11 +1,10 @@
import { readFileByLine } from './lib/fetch-text-by-line';
import { parse } from 'csv-parse/sync'; import { parse } from 'csv-parse/sync';
import { HostnameSmolTrie } from './lib/trie'; import { HostnameSmolTrie } from './lib/trie';
import path from 'node:path'; import path from 'node:path';
import { processLine } from './lib/process-line';
import { SOURCE_DIR } from './constants/dir'; import { SOURCE_DIR } from './constants/dir';
import { parseFelixDnsmasqFromResp } from './lib/parse-dnsmasq'; import { parseFelixDnsmasqFromResp } from './lib/parse-dnsmasq';
import { $$fetch } from './lib/fetch-retry'; import { $$fetch } from './lib/fetch-retry';
import runAgainstSourceFile from './lib/run-against-source-file';
export async function parseDomesticList() { export async function parseDomesticList() {
const trie = new HostnameSmolTrie(await parseFelixDnsmasqFromResp(await $$fetch('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf'))); const trie = new HostnameSmolTrie(await parseFelixDnsmasqFromResp(await $$fetch('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf')));
@ -36,27 +35,24 @@ export async function parseDomesticList() {
const notIncludedDomestic = new Set<string>(top5000); const notIncludedDomestic = new Set<string>(top5000);
const runAgainstRuleset = async (ruleset: string) => { // await Promise.all([
for await (const l of readFileByLine(ruleset)) { await runAgainstSourceFile(
const line = processLine(l); path.resolve(SOURCE_DIR, 'non_ip/domestic.conf'),
if (!line) continue; (domain, includeAllSubdomain) => {
const [type, domain] = line.split(','); if (includeAllSubdomain) {
if (type === 'DOMAIN-SUFFIX') {
if (top5000.has(domain)) { if (top5000.has(domain)) {
notIncludedDomestic.delete(domain); notIncludedDomestic.delete(domain);
} }
} else if (type === 'DOMAIN-KEYWORD') { } else {
for (const d of top5000) { // noop, DOMAIN-KEYWORD handing
if (d.includes(domain)) { // for (const d of top5000) {
notIncludedDomestic.delete(d); // if (d.includes(domain)) {
} // notIncludedDomestic.delete(d);
} // }
// }
} }
} }
}; );
// await Promise.all([
await runAgainstRuleset(path.resolve(SOURCE_DIR, 'non_ip/domestic.conf'));
// ]); // ]);
console.log(notIncludedDomestic.size, notIncludedDomestic); console.log(notIncludedDomestic.size, notIncludedDomestic);

View File

@ -3,11 +3,12 @@ import { fastNormalizeDomain } from './lib/normalize-domain';
import { HostnameSmolTrie } from './lib/trie'; import { HostnameSmolTrie } from './lib/trie';
// import { Readable } from 'stream'; // import { Readable } from 'stream';
import { parse } from 'csv-parse/sync'; import { parse } from 'csv-parse/sync';
import { fetchRemoteTextByLine, readFileByLine } from './lib/fetch-text-by-line'; import { fetchRemoteTextByLine } from './lib/fetch-text-by-line';
import path from 'node:path'; import path from 'node:path';
import { OUTPUT_SURGE_DIR } from './constants/dir'; import { OUTPUT_SURGE_DIR } from './constants/dir';
import { createRetrieKeywordFilter as createKeywordFilter } from 'foxts/retrie'; import { createRetrieKeywordFilter as createKeywordFilter } from 'foxts/retrie';
import { $$fetch } from './lib/fetch-retry'; import { $$fetch } from './lib/fetch-retry';
import runAgainstSourceFile from './lib/run-against-source-file';
export async function parseGfwList() { export async function parseGfwList() {
const whiteSet = new Set<string>(); const whiteSet = new Set<string>();
@ -77,46 +78,20 @@ export async function parseGfwList() {
const keywordSet = new Set<string>(); const keywordSet = new Set<string>();
const runAgainstRuleset = async (ruleset: string) => { const callback = (domain: string, includeAllSubdomain: boolean) => {
for await (const l of readFileByLine(ruleset)) { trie.whitelist(domain, includeAllSubdomain);
const line = processLine(l);
if (!line) continue;
const [type, domain] = line.split(',');
switch (type) {
case 'DOMAIN-SUFFIX': {
trie.whitelist('.' + domain);
break;
}
case 'DOMAIN': {
trie.whitelist(domain);
break;
}
case 'DOMAIN-KEYWORD': {
keywordSet.add(domain);
break;
}
// no default
}
}
}; };
const runAgainstDomainset = async (ruleset: string) => {
for await (const l of readFileByLine(ruleset)) {
const line = processLine(l);
if (!line) continue;
trie.whitelist(line);
}
};
await Promise.all([ await Promise.all([
runAgainstRuleset(path.join(OUTPUT_SURGE_DIR, 'non_ip/global.conf')), runAgainstSourceFile(path.join(OUTPUT_SURGE_DIR, 'non_ip/global.conf'), callback, 'ruleset'),
runAgainstRuleset(path.join(OUTPUT_SURGE_DIR, 'non_ip/reject.conf')), runAgainstSourceFile(path.join(OUTPUT_SURGE_DIR, 'non_ip/reject.conf'), callback, 'ruleset'),
runAgainstRuleset(path.join(OUTPUT_SURGE_DIR, 'non_ip/telegram.conf')), runAgainstSourceFile(path.join(OUTPUT_SURGE_DIR, 'non_ip/telegram.conf'), callback, 'ruleset'),
runAgainstRuleset(path.resolve(OUTPUT_SURGE_DIR, 'non_ip/stream.conf')), runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'non_ip/stream.conf'), callback, 'ruleset'),
runAgainstRuleset(path.resolve(OUTPUT_SURGE_DIR, 'non_ip/ai.conf')), runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'non_ip/ai.conf'), callback, 'ruleset'),
runAgainstRuleset(path.resolve(OUTPUT_SURGE_DIR, 'non_ip/microsoft.conf')), runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'non_ip/microsoft.conf'), callback, 'ruleset'),
runAgainstDomainset(path.resolve(OUTPUT_SURGE_DIR, 'domainset/reject.conf')), runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'domainset/reject.conf'), callback, 'domainset'),
runAgainstDomainset(path.resolve(OUTPUT_SURGE_DIR, 'domainset/reject_extra.conf')), runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'domainset/reject_extra.conf'), callback, 'domainset'),
runAgainstDomainset(path.resolve(OUTPUT_SURGE_DIR, 'domainset/cdn.conf')) runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'domainset/cdn.conf'), callback, 'domainset')
]); ]);
whiteSet.forEach(domain => trie.whitelist(domain)); whiteSet.forEach(domain => trie.whitelist(domain));

View File

@ -1,42 +1,28 @@
import path from 'node:path'; import path from 'node:path';
import { readFileByLine } from './lib/fetch-text-by-line';
import { HostnameSmolTrie } from './lib/trie'; import { HostnameSmolTrie } from './lib/trie';
import { OUTPUT_SURGE_DIR, SOURCE_DIR } from './constants/dir'; import { OUTPUT_SURGE_DIR } from './constants/dir';
import { ICP_TLD } from './constants/domains'; import { ICP_TLD } from './constants/domains';
import tldts from 'tldts-experimental'; import tldts from 'tldts-experimental';
import { looseTldtsOpt } from './constants/loose-tldts-opt'; import { looseTldtsOpt } from './constants/loose-tldts-opt';
import runAgainstSourceFile from './lib/run-against-source-file';
(async () => { (async () => {
const trie = new HostnameSmolTrie(); const trie = new HostnameSmolTrie();
const extraWhiteTLDs = new Set<string>(); const extraWhiteTLDs = new Set<string>();
for await (const line of readFileByLine(path.join(OUTPUT_SURGE_DIR, 'non_ip', 'domestic.conf'))) { await runAgainstSourceFile(path.join(OUTPUT_SURGE_DIR, 'non_ip', 'domestic.conf'), (domain) => {
const [type, domain] = line.split(',');
if (type !== 'DOMAIN' && type !== 'DOMAIN-SUFFIX') {
continue;
}
if (domain === 'this_ruleset_is_made_by_sukkaw.ruleset.skk.moe') { if (domain === 'this_ruleset_is_made_by_sukkaw.ruleset.skk.moe') {
continue; return;
} }
const tld = tldts.getPublicSuffix(domain, looseTldtsOpt); const tld = tldts.getPublicSuffix(domain, looseTldtsOpt);
if (tld) { if (tld) {
extraWhiteTLDs.add(tld); extraWhiteTLDs.add(tld);
} }
} }, 'ruleset');
for await (const line of readFileByLine(path.join(SOURCE_DIR, 'non_ip', 'global.conf'))) { await runAgainstSourceFile(path.join(OUTPUT_SURGE_DIR, 'non_ip', 'global.conf'), (domain, includeAllSubDomain) => {
const [type, domain] = line.split(','); trie.add(domain, includeAllSubDomain);
switch (type) { }, 'ruleset');
case 'DOMAIN':
trie.add(domain);
break;
case 'DOMAIN-SUFFIX':
trie.add(domain, true);
break;
default:
break;
}
}
ICP_TLD.forEach(tld => trie.whitelist(tld, true)); ICP_TLD.forEach(tld => trie.whitelist(tld, true));
extraWhiteTLDs.forEach(tld => trie.whitelist(tld, true)); extraWhiteTLDs.forEach(tld => trie.whitelist(tld, true));

View File

@ -1,9 +1,8 @@
import path from 'node:path'; import path from 'node:path';
import { readFileByLine } from './lib/fetch-text-by-line';
import { OUTPUT_SURGE_DIR } from './constants/dir'; import { OUTPUT_SURGE_DIR } from './constants/dir';
import { processLine } from './lib/process-line';
import tldts from 'tldts'; import tldts from 'tldts';
import { loosTldOptWithPrivateDomains } from './constants/loose-tldts-opt'; import { loosTldOptWithPrivateDomains } from './constants/loose-tldts-opt';
import runAgainstSourceFile from './lib/run-against-source-file';
(async () => { (async () => {
const rejectDomainCountMap = await runAgainstDomainset(new Map<string, number>(), path.join(OUTPUT_SURGE_DIR, 'domainset', 'reject.conf')); const rejectDomainCountMap = await runAgainstDomainset(new Map<string, number>(), path.join(OUTPUT_SURGE_DIR, 'domainset', 'reject.conf'));
@ -17,22 +16,22 @@ import { loosTldOptWithPrivateDomains } from './constants/loose-tldts-opt';
})(); })();
async function runAgainstDomainset(rejectDomainCountMap: Map<string, number>, file: string) { async function runAgainstDomainset(rejectDomainCountMap: Map<string, number>, file: string) {
for await (const line of readFileByLine(file)) { await runAgainstSourceFile(
if (!processLine(line)) { file,
continue; (domain: string) => {
} const apexDomain = tldts.getDomain(domain, loosTldOptWithPrivateDomains);
const apexDomain = tldts.getDomain(line, loosTldOptWithPrivateDomains); if (!apexDomain) {
if (!apexDomain) { return;
continue; }
}
rejectDomainCountMap.set( rejectDomainCountMap.set(
apexDomain, apexDomain,
rejectDomainCountMap.has(apexDomain) rejectDomainCountMap.has(apexDomain)
? rejectDomainCountMap.get(apexDomain)! + 1 ? rejectDomainCountMap.get(apexDomain)! + 1
: 1 : 1
); );
} }
);
return rejectDomainCountMap; return rejectDomainCountMap;
} }