From ede1b7e25b858e5b7dd4fa750c39aa0aabb7dfa8 Mon Sep 17 00:00:00 2001 From: SukkaW Date: Fri, 11 Jul 2025 19:03:30 +0800 Subject: [PATCH] Chore: validate domain tools --- Build/validate-domestic.ts | 88 +++++++++++++++++++++----------------- Build/validate-gfwlist.ts | 87 +++++++++++++++++++------------------ 2 files changed, 94 insertions(+), 81 deletions(-) diff --git a/Build/validate-domestic.ts b/Build/validate-domestic.ts index b2247cd9..7b86483e 100644 --- a/Build/validate-domestic.ts +++ b/Build/validate-domestic.ts @@ -1,59 +1,67 @@ -import { parse } from 'csv-parse/sync'; -import { HostnameSmolTrie } from './lib/trie'; import path from 'node:path'; import { SOURCE_DIR } from './constants/dir'; import { parseFelixDnsmasqFromResp } from './lib/parse-dnsmasq'; import { $$fetch } from './lib/fetch-retry'; import runAgainstSourceFile from './lib/run-against-source-file'; +import { getTopOneMillionDomains } from './validate-gfwlist'; +import { HostnameSmolTrie } from './lib/trie'; +import tldts from 'tldts-experimental'; +import { DOMESTICS } from '../Source/non_ip/domestic'; export async function parseDomesticList() { - const trie = new HostnameSmolTrie(await parseFelixDnsmasqFromResp(await $$fetch('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf'))); + const allChinaDomains = new Set(await parseFelixDnsmasqFromResp(await $$fetch('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf'))); - const top5000 = new Set(); + const topDomainTrie = await getTopOneMillionDomains(); - const res = await (await $$fetch('https://radar.cloudflare.com/charts/LargerTopDomainsTable/attachment?id=1077&top=10000', { - headers: { - accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', - 'accept-language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,zh-TW;q=0.6,es;q=0.5', - 'sec-ch-ua': '"Google Chrome";v="123", "Not:A-Brand";v="8", "Chromium";v="123"', - 'sec-ch-ua-mobile': '?0', - 'sec-ch-ua-platform': '"macOS"', - 'sec-fetch-dest': 'document', - 'sec-fetch-mode': 'navigate', - 'sec-fetch-site': 'none', - 'sec-fetch-user': '?1', - 'upgrade-insecure-requests': '1' + const resultTrie = new HostnameSmolTrie(); + + topDomainTrie.dumpWithoutDot((domain) => { + const apexDomain = tldts.getDomain(domain); + + if (apexDomain && allChinaDomains.has(apexDomain)) { + resultTrie.add(apexDomain, false); } - })).text(); - const stream = parse(res); - for await (const [domain] of stream) { - if (trie.has(domain)) { - top5000.add(domain); - } - console.log({ domain }); - } + }); - const notIncludedDomestic = new Set(top5000); + const callback = (domain: string, includeAllSubdomain: boolean) => resultTrie.whitelist(domain, includeAllSubdomain); // await Promise.all([ await runAgainstSourceFile( path.resolve(SOURCE_DIR, 'non_ip/domestic.conf'), - (domain, includeAllSubdomain) => { - if (includeAllSubdomain) { - if (top5000.has(domain)) { - notIncludedDomestic.delete(domain); - } - } else { - // noop, DOMAIN-KEYWORD handing - // for (const d of top5000) { - // if (d.includes(domain)) { - // notIncludedDomestic.delete(d); - // } - // } - } - } + callback ); + await runAgainstSourceFile( + path.resolve(SOURCE_DIR, 'domainset/reject.conf'), + callback + ); + + Object.values(DOMESTICS).forEach(domestic => { + domestic.domains.forEach(domain => { + switch (domain[0]) { + case '+': + case '$': { + resultTrie.whitelist(domain.slice(1), true); + break; + } + default: { + resultTrie.whitelist(domain, true); + break; + } + } + }); + }); + + // noop, DOMAIN-KEYWORD handing + // for (const d of top5000) { + // if (d.includes(domain)) { + // notIncludedDomestic.delete(d); + // } + // } // ]); - console.log(notIncludedDomestic.size, notIncludedDomestic); + console.log(resultTrie.dump().join('\n') + '\n'); +} + +if (require.main === module) { + parseDomesticList().catch(console.error); } diff --git a/Build/validate-gfwlist.ts b/Build/validate-gfwlist.ts index b8c2017c..ea707b21 100644 --- a/Build/validate-gfwlist.ts +++ b/Build/validate-gfwlist.ts @@ -11,13 +11,52 @@ import runAgainstSourceFile from './lib/run-against-source-file'; import { nullthrow } from 'foxts/guard'; import { Buffer } from 'node:buffer'; -export async function parseGfwList() { +export async function getTopOneMillionDomains() { const { parse: csvParser } = await import('csv-parse'); + const topDomainTrie = new HostnameSmolTrie(); + const csvParse = csvParser({ columns: false, skip_empty_lines: true }); + + const topDomainsZipBody = await (await $$fetch('https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip', { + headers: { + accept: '*/*', + 'user-agent': 'curl/8.12.1' + } + })).arrayBuffer(); + let entry: yauzl.Entry | null = null; + for await (const e of await yauzl.fromBuffer(Buffer.from(topDomainsZipBody))) { + if (e.filename === 'top-1m.csv') { + entry = e; + break; + } + } + + const { promise, resolve, reject } = Promise.withResolvers(); + + const readable = await nullthrow(entry, 'top-1m.csv entry not found').openReadStream(); + const parser = readable.pipe(csvParse); + parser.on('readable', () => { + let record; + while ((record = parser.read()) !== null) { + topDomainTrie.add(record[1]); + } + }); + + parser.on('end', () => { + resolve(topDomainTrie); + }); + parser.on('error', (err) => { + reject(err); + }); + + return promise; +} + +export async function parseGfwList() { const whiteSet = new Set(); const gfwListTrie = new HostnameSmolTrie(); - const excludeGfwList = createKeywordFilter([ + const gfwlistIgnoreLineKwfilter = createKeywordFilter([ '.*', '*', '=', @@ -31,7 +70,7 @@ export async function parseGfwList() { const line = processLine(l); if (!line) continue; - if (excludeGfwList(line)) { + if (gfwlistIgnoreLineKwfilter(line)) { continue; } if (line.startsWith('@@||')) { @@ -71,42 +110,7 @@ export async function parseGfwList() { gfwListTrie.add(l); } - const topDomainTrie = new HostnameSmolTrie(); - - const csvParse = csvParser({ columns: false, skip_empty_lines: true }); - const topDomainsZipBody = await (await $$fetch('https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip', { - headers: { - accept: '*/*', - 'user-agent': 'curl/8.12.1' - } - })).arrayBuffer(); - let entry: yauzl.Entry | null = null; - for await (const e of await yauzl.fromBuffer(Buffer.from(topDomainsZipBody))) { - if (e.filename === 'top-1m.csv') { - entry = e; - break; - } - } - - const { promise, resolve, reject } = Promise.withResolvers(); - - const readable = await nullthrow(entry, 'top-1m.csv entry not found').openReadStream(); - const parser = readable.pipe(csvParse); - parser.on('readable', () => { - let record; - while ((record = parser.read()) !== null) { - topDomainTrie.add(record[1]); - } - }); - - parser.on('end', () => { - resolve(topDomainTrie); - }); - parser.on('error', (err) => { - reject(err); - }); - - await promise; + const topDomainTrie = await getTopOneMillionDomains(); const keywordSet = new Set(); @@ -116,18 +120,19 @@ export async function parseGfwList() { }; await Promise.all([ runAgainstSourceFile(path.join(SOURCE_DIR, 'non_ip/global.conf'), callback, 'ruleset', keywordSet), - runAgainstSourceFile(path.join(OUTPUT_SURGE_DIR, 'non_ip/domestic.conf'), callback, 'ruleset', keywordSet), + // runAgainstSourceFile(path.join(OUTPUT_SURGE_DIR, 'non_ip/domestic.conf'), callback, 'ruleset', keywordSet), runAgainstSourceFile(path.join(SOURCE_DIR, 'non_ip/reject.conf'), callback, 'ruleset', keywordSet), runAgainstSourceFile(path.join(SOURCE_DIR, 'non_ip/telegram.conf'), callback, 'ruleset', keywordSet), runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'non_ip/stream.conf'), callback, 'ruleset', keywordSet), runAgainstSourceFile(path.resolve(SOURCE_DIR, 'non_ip/ai.conf'), callback, 'ruleset', keywordSet), runAgainstSourceFile(path.resolve(SOURCE_DIR, 'non_ip/microsoft.conf'), callback, 'ruleset', keywordSet), + runAgainstSourceFile(path.resolve(SOURCE_DIR, 'non_ip/apple_service.conf'), callback, 'ruleset', keywordSet), runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'domainset/reject.conf'), callback, 'domainset'), runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'domainset/reject_extra.conf'), callback, 'domainset'), runAgainstSourceFile(path.resolve(OUTPUT_SURGE_DIR, 'domainset/cdn.conf'), callback, 'domainset') ]); - whiteSet.forEach(domain => gfwListTrie.whitelist(domain)); + whiteSet.forEach(domain => gfwListTrie.whitelist(domain, true)); const kwfilter = createKeywordFilter([...keywordSet]);