Chore: new util run against source file
Some checks failed
Build / Build (push) Has been cancelled
Build / Diff output (push) Has been cancelled
Build / Deploy to Cloudflare Pages (3.114.6) (push) Has been cancelled
Build / Deploy to GitHub and GitLab (push) Has been cancelled

This commit is contained in:
SukkaW 2025-04-27 23:33:56 +08:00
parent 505f7544ed
commit 2d706f4775
8 changed files with 132 additions and 177 deletions

View File

@ -0,0 +1,41 @@
import { never } from 'foxts/guard';
import { readFileByLine } from './fetch-text-by-line';
import { processLine } from './process-line';
/**
 * Streams a source file line by line and reports every domain entry it contains.
 *
 * Every raw line goes through `processLine` first; lines for which it returns a
 * falsy value are skipped. Two layouts are understood:
 *
 * - `ruleset`: comma separated rules. Only `DOMAIN,<domain>` (reported with
 *   `includeAllSubDomain = false`) and `DOMAIN-SUFFIX,<domain>` (reported with
 *   `includeAllSubDomain = true`) are forwarded; any other rule type is ignored.
 * - `domainset`: one domain per line. A leading `.` means "include all
 *   subdomains" and is stripped before the callback is invoked.
 *
 * Entries whose domain part is missing or empty (e.g. a bare `DOMAIN` rule or a
 * lone `.`) are skipped instead of being forwarded to the callback.
 *
 * @param filePath path of the file to read
 * @param callback invoked once per domain entry. When it returns a promise
 * (or any thenable), that promise is awaited before this function resolves and
 * its rejection is propagated to the caller.
 * @param type layout of the file. When omitted it is detected from the first
 * processed line (a line containing `,` selects `ruleset`, otherwise
 * `domainset`) and then kept for the rest of the file.
 */
export default async function runAgainstSourceFile(
  filePath: string,
  callback: (domain: string, includeAllSubDomain: boolean) => void,
  type?: 'ruleset' | 'domainset'
) {
  // Work started by asynchronous callbacks; awaited once the whole file has been read.
  const pending: Array<Promise<unknown>> = [];

  const emit = (domain: string | undefined, includeAllSubDomain: boolean) => {
    if (!domain) {
      // Malformed entry without a domain part, nothing to report.
      return;
    }

    const result: unknown = callback(domain, includeAllSubDomain);
    if (
      typeof result === 'object'
      && result !== null
      && typeof (result as { then?: unknown }).then === 'function'
    ) {
      const tracked = Promise.resolve(result as PromiseLike<unknown>);
      // Mark the promise as handled right away so that a rejection happening while
      // the file is still being read is not reported as unhandled; the rejection
      // itself is still surfaced by the `Promise.all` below.
      tracked.catch(() => { /* surfaced by Promise.all */ });
      pending.push(tracked);
    }
  };

  for await (const line of readFileByLine(filePath)) {
    const l = processLine(line);
    if (!l) {
      continue;
    }

    if (type == null) {
      // Auto-detect once, based on the first meaningful line.
      type = l.includes(',') ? 'ruleset' : 'domainset';
    }

    if (type === 'ruleset') {
      const [ruleType, domain] = l.split(',', 3);
      if (ruleType === 'DOMAIN') {
        emit(domain, false);
      } else if (ruleType === 'DOMAIN-SUFFIX') {
        emit(domain, true);
      }
    // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition -- exhaustive options
    } else if (type === 'domainset') {
      if (l[0] === '.') {
        emit(l.slice(1), true);
      } else {
        emit(l, false);
      }
    } else {
      never(type);
    }
  }

  await Promise.all(pending);
}

View File

@ -1,9 +1,9 @@
import { fetchRemoteTextByLine, readFileByLine } from './lib/fetch-text-by-line';
import { fetchRemoteTextByLine } from './lib/fetch-text-by-line';
import tldts from 'tldts';
import { HostnameSmolTrie } from './lib/trie';
import path from 'node:path';
import { SOURCE_DIR } from './constants/dir';
import { processLine } from './lib/process-line';
import runAgainstSourceFile from './lib/run-against-source-file';
(async () => {
const lines1 = await Array.fromAsync(await fetchRemoteTextByLine('https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', true));
@ -31,23 +31,12 @@ import { processLine } from './lib/process-line';
});
}
for await (const line of readFileByLine(path.join(SOURCE_DIR, 'domainset', 'reject.conf'))) {
const l = processLine(line);
if (l) {
trie.whitelist(l);
}
}
for await (const line of readFileByLine(path.join(SOURCE_DIR, 'non_ip', 'reject.conf'))) {
const l = processLine(line);
if (l) {
const [type, domain] = l.split(',', 3);
if (type === 'DOMAIN') {
trie.whitelist(domain, false);
} else if (type === 'DOMAIN-SUFFIX') {
trie.whitelist(domain, true);
}
}
}
await runAgainstSourceFile(path.join(SOURCE_DIR, 'domainset', 'reject.conf'), (domain, includeAllSubDomain) => {
trie.whitelist(domain, includeAllSubDomain);
}, 'domainset');
await runAgainstSourceFile(path.join(SOURCE_DIR, 'non_ip', 'reject.conf'), (domain, includeAllSubDomain) => {
trie.whitelist(domain, includeAllSubDomain);
}, 'ruleset');
console.log(trie.dump().map(i => '.' + i).join('\n'));
})();

View File

@ -1,20 +1,23 @@
import path from 'node:path';
import { readFileByLine } from './lib/fetch-text-by-line';
import { processFilterRulesWithPreload } from './lib/parse-filter/filters';
import { processHosts } from './lib/parse-filter/hosts';
import { processLine } from './lib/process-line';
import { HostnameSmolTrie } from './lib/trie';
import { dummySpan } from './trace';
import { SOURCE_DIR } from './constants/dir';
import { PREDEFINED_WHITELIST } from './constants/reject-data-source';
import runAgainstSourceFile from './lib/run-against-source-file';
(async () => {
const trie = new HostnameSmolTrie();
await writeHostsToTrie(trie, 'https://cdn.jsdelivr.net/gh/jerryn70/GoodbyeAds@master/Extension/GoodbyeAds-Xiaomi-Extension.txt', true);
await runWhiteOnSource(path.join(SOURCE_DIR, 'domainset', 'reject.conf'), trie);
await runWhiteOnSource(path.join(SOURCE_DIR, 'non_ip', 'reject.conf'), trie);
const callback = (domain: string, includeAllSubDomain: boolean) => {
trie.whitelist(domain, includeAllSubDomain);
};
await runAgainstSourceFile(path.join(SOURCE_DIR, 'domainset', 'reject.conf'), callback, 'domainset');
await runAgainstSourceFile(path.join(SOURCE_DIR, 'non_ip', 'reject.conf'), callback, 'ruleset');
for (let i = 0, len = PREDEFINED_WHITELIST.length; i < len; i++) {
trie.whitelist(PREDEFINED_WHITELIST[i]);
@ -25,24 +28,6 @@ import { PREDEFINED_WHITELIST } from './constants/reject-data-source';
console.log('---------------------------');
})();
async function runWhiteOnSource(sourceFile: string, trie: HostnameSmolTrie) {
for await (const line of readFileByLine(sourceFile)) {
const l = processLine(line);
if (l) {
if (l.includes(',')) {
const [type, domain] = l.split(',', 3);
if (type === 'DOMAIN') {
trie.whitelist(domain, false);
} else if (type === 'DOMAIN-SUFFIX') {
trie.whitelist(domain, true);
}
} else {
trie.whitelist(l);
}
}
}
}
async function writeHostsToTrie(trie: HostnameSmolTrie, hostsUrl: string, includeAllSubDomain = false) {
const hosts = await processHosts(dummySpan, hostsUrl, [], includeAllSubDomain);

View File

@ -1,11 +1,9 @@
import { readFileByLine } from './lib/fetch-text-by-line';
import { processLine } from './lib/process-line';
import { SOURCE_DIR } from './constants/dir';
import path from 'node:path';
import { newQueue } from '@henrygd/queue';
import { isDomainAlive, keyedAsyncMutexWithQueue } from './lib/is-domain-alive';
import { fdir as Fdir } from 'fdir';
import runAgainstSourceFile from './lib/run-against-source-file';
const queue = newQueue(24);
@ -19,10 +17,20 @@ function onDomain(args: [string, boolean]) {
(async () => {
const domainSets = await new Fdir()
.withFullPaths()
.filter((filePath, isDirectory) => {
if (isDirectory) return false;
const extname = path.extname(filePath);
return extname === '.txt' || extname === '.conf';
})
.crawl(SOURCE_DIR + path.sep + 'domainset')
.withPromise();
const domainRules = await new Fdir()
.withFullPaths()
.filter((filePath, isDirectory) => {
if (isDirectory) return false;
const extname = path.extname(filePath);
return extname === '.txt' || extname === '.conf';
})
.crawl(SOURCE_DIR + path.sep + 'non_ip')
.withPromise();
@ -37,53 +45,29 @@ function onDomain(args: [string, boolean]) {
})();
export async function runAgainstRuleset(filepath: string) {
const extname = path.extname(filepath);
if (extname !== '.conf') {
console.log('[skip]', filepath);
return;
}
const promises: Array<Promise<void>> = [];
for await (const l of readFileByLine(filepath)) {
const line = processLine(l);
if (!line) continue;
const [type, domain] = line.split(',');
switch (type) {
case 'DOMAIN-SUFFIX':
case 'DOMAIN': {
promises.push(
queue.add(() => keyedAsyncMutexWithQueue(domain, () => isDomainAlive(domain, type === 'DOMAIN-SUFFIX')))
.then(onDomain)
);
break;
}
// no default
}
}
await runAgainstSourceFile(
filepath,
(domain: string, includeAllSubdomain: boolean) => queue.add(() => keyedAsyncMutexWithQueue(
domain,
() => isDomainAlive(domain, includeAllSubdomain)
).then(onDomain))
);
await Promise.all(promises);
console.log('[done]', filepath);
}
export async function runAgainstDomainset(filepath: string) {
const extname = path.extname(filepath);
if (extname !== '.conf') {
console.log('[skip]', filepath);
return;
}
const promises: Array<Promise<void>> = [];
for await (const l of readFileByLine(filepath)) {
const line = processLine(l);
if (!line) continue;
promises.push(
queue.add(() => keyedAsyncMutexWithQueue(line, () => isDomainAlive(line, line[0] === '.')))
.then(onDomain)
);
}
await runAgainstSourceFile(
filepath,
(domain: string, includeAllSubdomain: boolean) => queue.add(() => keyedAsyncMutexWithQueue(
domain,
() => isDomainAlive(domain, includeAllSubdomain)
).then(onDomain))
);
await Promise.all(promises);
console.log('[done]', filepath);
}

View File

@ -1,11 +1,10 @@
import { readFileByLine } from './lib/fetch-text-by-line';
import { parse } from 'csv-parse/sync';
import { HostnameSmolTrie } from './lib/trie';
import path from 'node:path';
import { processLine } from './lib/process-line';
import { SOURCE_DIR } from './constants/dir';
import { parseFelixDnsmasqFromResp } from './lib/parse-dnsmasq';
import { $$fetch } from './lib/fetch-retry';
import runAgainstSourceFile from './lib/run-against-source-file';
export async function parseDomesticList() {
const trie = new HostnameSmolTrie(await parseFelixDnsmasqFromResp(await $$fetch('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf')));
@ -36,27 +35,24 @@ export async function parseDomesticList() {
const notIncludedDomestic = new Set<string>(top5000);
const runAgainstRuleset = async (ruleset: string) => {
for await (const l of readFileByLine(ruleset)) {
const line = processLine(l);
if (!line) continue;
const [type, domain] = line.split(',');
if (type === 'DOMAIN-SUFFIX') {
// await Promise.all([
await runAgainstSourceFile(
path.resolve(SOURCE_DIR, 'non_ip/domestic.conf'),
(domain, includeAllSubdomain) => {
if (includeAllSubdomain) {
if (top5000.has(domain)) {
notIncludedDomestic.delete(domain);
}
} else if (type === 'DOMAIN-KEYWORD') {
for (const d of top5000) {
if (d.includes(domain)) {
notIncludedDomestic.delete(d);
}
}
} else {
// noop, DOMAIN-KEYWORD handling
// for (const d of top5000) {
// if (d.includes(domain)) {
// notIncludedDomestic.delete(d);
// }
// }
}
}
};
// await Promise.all([
await runAgainstRuleset(path.resolve(SOURCE_DIR, 'non_ip/domestic.conf'));
);
// ]);
console.log(notIncludedDomestic.size, notIncludedDomestic);

View File

@ -3,11 +3,12 @@ import { fastNormalizeDomain } from './lib/normalize-domain';
import { HostnameSmolTrie } from './lib/trie';
// import { Readable } from 'stream';
import { parse } from 'csv-parse/sync';
import { fetchRemoteTextByLine, readFileByLine } from './lib/fetch-text-by-line';
import { fetchRemoteTextByLine } from './lib/fetch-text-by-line';
import path from 'node:path';
import { OUTPUT_SURGE_DIR } from './constants/dir';
import { createRetrieKeywordFilter as createKeywordFilter } from 'foxts/retrie';
import { $$fetch } from './lib/fetch-retry';
import runAgainstSourceFile from './lib/run-against-source-file';
export async function parseGfwList() {
const whiteSet = new Set<string>();
@ -77,46 +78,20 @@ export async function parseGfwList() {
const keywordSet = new Set<string>();
const runAgainstRuleset = async (ruleset: string) => {
for await (const l of readFileByLine(ruleset)) {
const line = processLine(l);
if (!line) continue;
const [type, domain] = line.split(',');
switch (type) {
case 'DOMAIN-SUFFIX': {
trie.whitelist('.' + domain);
break;
}
case 'DOMAIN': {
trie.whitelist(domain);
break;
}
case 'DOMAIN-KEYWORD': {
keywordSet.add(domain);
break;
}
// no default
}
}
const callback = (domain: string, includeAllSubdomain: boolean) => {
trie.whitelist(domain, includeAllSubdomain);
};
const runAgainstDomainset = async (ruleset: string) => {
for await (const l of readFileByLine(ruleset)) {
const line = processLine(l);
if (!line) continue;
trie.whitelist(line);
}
};
await Promise.all([
runAgainstRuleset(path.join(OUTPUT_SURGE_DIR, 'non_ip/global.conf')),
runAgainstRuleset(path.join(OUTPUT_SURGE_DIR, 'non_ip/reject.conf')),
runAgainstRuleset(path.join(OUTPUT_SURGE_DIR, 'non_ip/telegram.conf')),
runAgainstRuleset(path.resolve(OUTPUT_SURGE_DIR, 'non_ip/stream.conf')),
runAgainstRuleset(path.resolve(OUTPUT_SURGE_DIR, 'non_ip/ai.conf')),
runAgainstRuleset(path.resolve(OUTPUT_SURGE_DIR, 'non_ip/microsoft.conf')),
runAgainstDomainset(path.resolve(OUTPUT_SURGE_DIR, 'domainset/reject.conf')),
runAgainstDomainset(path.resolve(OUTPUT_SURGE_DIR, 'domainset/reject_extra.conf')),
runAgainstDomainset(path.resolve(OUTPUT_SURGE_DIR, 'domainset/cdn.conf'))
runAgainstSourceFile(path.join(OUTPUT_SURGE_DIR, 'non_ip/global.conf'), callback, 'ruleset'),
runAgainstSourceFile(path.join(OUTPUT_SURGE_DIR, 'non_ip/reject.conf'), callback, 'ruleset'),
runAgainstSourceFile(path.join(OUTPUT_SURGE_DIR, 'non_ip/telegram.conf'), callback, 'ruleset'),
runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'non_ip/stream.conf'), callback, 'ruleset'),
runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'non_ip/ai.conf'), callback, 'ruleset'),
runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'non_ip/microsoft.conf'), callback, 'ruleset'),
runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'domainset/reject.conf'), callback, 'domainset'),
runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'domainset/reject_extra.conf'), callback, 'domainset'),
runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'domainset/cdn.conf'), callback, 'domainset')
]);
whiteSet.forEach(domain => trie.whitelist(domain));

View File

@ -1,42 +1,28 @@
import path from 'node:path';
import { readFileByLine } from './lib/fetch-text-by-line';
import { HostnameSmolTrie } from './lib/trie';
import { OUTPUT_SURGE_DIR, SOURCE_DIR } from './constants/dir';
import { OUTPUT_SURGE_DIR } from './constants/dir';
import { ICP_TLD } from './constants/domains';
import tldts from 'tldts-experimental';
import { looseTldtsOpt } from './constants/loose-tldts-opt';
import runAgainstSourceFile from './lib/run-against-source-file';
(async () => {
const trie = new HostnameSmolTrie();
const extraWhiteTLDs = new Set<string>();
for await (const line of readFileByLine(path.join(OUTPUT_SURGE_DIR, 'non_ip', 'domestic.conf'))) {
const [type, domain] = line.split(',');
if (type !== 'DOMAIN' && type !== 'DOMAIN-SUFFIX') {
continue;
}
await runAgainstSourceFile(path.join(OUTPUT_SURGE_DIR, 'non_ip', 'domestic.conf'), (domain) => {
if (domain === 'this_ruleset_is_made_by_sukkaw.ruleset.skk.moe') {
continue;
return;
}
const tld = tldts.getPublicSuffix(domain, looseTldtsOpt);
if (tld) {
extraWhiteTLDs.add(tld);
}
}
}, 'ruleset');
for await (const line of readFileByLine(path.join(SOURCE_DIR, 'non_ip', 'global.conf'))) {
const [type, domain] = line.split(',');
switch (type) {
case 'DOMAIN':
trie.add(domain);
break;
case 'DOMAIN-SUFFIX':
trie.add(domain, true);
break;
default:
break;
}
}
await runAgainstSourceFile(path.join(OUTPUT_SURGE_DIR, 'non_ip', 'global.conf'), (domain, includeAllSubDomain) => {
trie.add(domain, includeAllSubDomain);
}, 'ruleset');
ICP_TLD.forEach(tld => trie.whitelist(tld, true));
extraWhiteTLDs.forEach(tld => trie.whitelist(tld, true));

View File

@ -1,9 +1,8 @@
import path from 'node:path';
import { readFileByLine } from './lib/fetch-text-by-line';
import { OUTPUT_SURGE_DIR } from './constants/dir';
import { processLine } from './lib/process-line';
import tldts from 'tldts';
import { loosTldOptWithPrivateDomains } from './constants/loose-tldts-opt';
import runAgainstSourceFile from './lib/run-against-source-file';
(async () => {
const rejectDomainCountMap = await runAgainstDomainset(new Map<string, number>(), path.join(OUTPUT_SURGE_DIR, 'domainset', 'reject.conf'));
@ -17,22 +16,22 @@ import { loosTldOptWithPrivateDomains } from './constants/loose-tldts-opt';
})();
async function runAgainstDomainset(rejectDomainCountMap: Map<string, number>, file: string) {
for await (const line of readFileByLine(file)) {
if (!processLine(line)) {
continue;
}
const apexDomain = tldts.getDomain(line, loosTldOptWithPrivateDomains);
if (!apexDomain) {
continue;
}
await runAgainstSourceFile(
file,
(domain: string) => {
const apexDomain = tldts.getDomain(domain, loosTldOptWithPrivateDomains);
if (!apexDomain) {
return;
}
rejectDomainCountMap.set(
apexDomain,
rejectDomainCountMap.has(apexDomain)
? rejectDomainCountMap.get(apexDomain)! + 1
: 1
);
}
rejectDomainCountMap.set(
apexDomain,
rejectDomainCountMap.has(apexDomain)
? rejectDomainCountMap.get(apexDomain)! + 1
: 1
);
}
);
return rejectDomainCountMap;
}