From ca169b9db5db3271e03705babfb9fdaf47ada8d5 Mon Sep 17 00:00:00 2001 From: SukkaW Date: Sun, 7 Jan 2024 01:05:20 +0800 Subject: [PATCH] Fix: enable domain check for some reject data source --- Build/build-reject-domainset.ts | 2 +- Build/lib/cached-tld-parse.ts | 3 ++- Build/lib/get-phishing-domains.ts | 4 ++-- Build/lib/normalize-domain.ts | 11 +++++++---- Build/lib/parse-filter.ts | 15 ++++++++++++--- Build/lib/reject-data-source.ts | 16 ++++++++-------- 6 files changed, 32 insertions(+), 19 deletions(-) diff --git a/Build/build-reject-domainset.ts b/Build/build-reject-domainset.ts index f3e3adbd..2c53c11d 100644 --- a/Build/build-reject-domainset.ts +++ b/Build/build-reject-domainset.ts @@ -37,7 +37,7 @@ export const buildRejectDomainSet = task(import.meta.path, async () => { domainSets.add(host); }); })), - ...DOMAIN_LISTS.map(entry => processDomainLists(entry[0], entry[1], entry[2])), + ...DOMAIN_LISTS.map(entry => processDomainLists(entry[0], entry[1], entry[2], entry[3])), ...ADGUARD_FILTERS.map(input => { const promise = typeof input === 'string' ? processFilterRules(input) diff --git a/Build/lib/cached-tld-parse.ts b/Build/lib/cached-tld-parse.ts index 73ccfeae..27b2892b 100644 --- a/Build/lib/cached-tld-parse.ts +++ b/Build/lib/cached-tld-parse.ts @@ -3,6 +3,7 @@ import { createCache } from './cache-apply'; import type { PublicSuffixList } from '@gorhill/publicsuffixlist'; const cache = createCache('cached-tld-parse', true); +const cache2 = createCache('cached-tld-parse2', true); const sharedConfig = { allowPrivateDomains: true }; const sharedConfig2 = { allowPrivateDomains: true, detectIp: false }; @@ -10,7 +11,7 @@ const sharedConfig2 = { allowPrivateDomains: true, detectIp: false }; /** { allowPrivateDomains: true } */ export const parse = (domain: string) => cache.sync(domain, () => tldts.parse(domain, sharedConfig)); /** { allowPrivateDomains: true, detectIp: false } */ -export const parse2 = (domain: string) => cache.sync(domain, () => tldts.parse(domain, sharedConfig2)); +export const parse2 = (domain: string) => cache2.sync(domain, () => tldts.parse(domain, sharedConfig2)); let gothillGetDomainCache: ReturnType | null = null; export const createCachedGorhillGetDomain = (gorhill: PublicSuffixList) => { diff --git a/Build/lib/get-phishing-domains.ts b/Build/lib/get-phishing-domains.ts index f960bbe2..db0a0209 100644 --- a/Build/lib/get-phishing-domains.ts +++ b/Build/lib/get-phishing-domains.ts @@ -85,8 +85,8 @@ const BLACK_TLD = new Set([ export const getPhishingDomains = () => traceAsync('get phishing domains', async () => { const [domainSet, domainSet2, gorhill] = await Promise.all([ - processHosts('https://curbengh.github.io/phishing-filter/phishing-filter-hosts.txt', true, true, TTL.THREE_HOURS()), - processDomainLists('https://phishing.army/download/phishing_army_blocklist.txt', true, TTL.THREE_HOURS()), + processDomainLists('https://curbengh.github.io/phishing-filter/phishing-filter-domains.txt', true, false, TTL.THREE_HOURS()), + processDomainLists('https://phishing.army/download/phishing_army_blocklist.txt', true, true, TTL.THREE_HOURS()), getGorhillPublicSuffixPromise() ]); domainSet2.forEach((domain) => domainSet.add(domain)); diff --git a/Build/lib/normalize-domain.ts b/Build/lib/normalize-domain.ts index 140ebe27..52202c2c 100644 --- a/Build/lib/normalize-domain.ts +++ b/Build/lib/normalize-domain.ts @@ -5,11 +5,14 @@ export const normalizeDomain = (domain: string) => { if (isProbablyIpv4(domain)) return null; const parsed = tldts.parse2(domain); - if (parsed.isIp) return null; + // if (parsed.isIp) return null; + if (!parsed.hostname) return null; if (!parsed.isIcann && !parsed.isPrivate) return null; - const h = parsed.hostname; - if (!h) return null; + let h = parsed.hostname; + if (h[0] === '.') h = h.slice(1); + if (h.endsWith('.')) h = h.slice(0, -1); - return h[0] === '.' ? h.slice(1) : h; + if (h) return h; + return null; }; diff --git a/Build/lib/parse-filter.ts b/Build/lib/parse-filter.ts index a7e04778..ee828a83 100644 --- a/Build/lib/parse-filter.ts +++ b/Build/lib/parse-filter.ts @@ -11,17 +11,23 @@ import { normalizeDomain } from './normalize-domain'; import { fetchAssets } from './fetch-assets'; import { deserializeSet, fsCache, serializeSet } from './cache-filesystem'; -const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null +const DEBUG_DOMAIN_TO_FIND: string | null = '.j3.4z0vc.chileinsumos.cl'; // example.com | null let foundDebugDomain = false; -export function processDomainLists(domainListsUrl: string, includeAllSubDomain = false, ttl: number | null = null) { +export function processDomainLists(domainListsUrl: string, includeAllSubDomain = false, skipDomainCheck = false, ttl: number | null = null) { return traceAsync(`- processDomainLists: ${domainListsUrl}`, () => fsCache.apply( domainListsUrl, async () => { const domainSets = new Set(); for await (const line of await fetchRemoteTextByLine(domainListsUrl)) { - const domainToAdd = processLine(line); + let domainToAdd = processLine(line); + if (!domainToAdd) continue; + + if (!skipDomainCheck) { + domainToAdd = normalizeDomain(domainToAdd); + } + if (!domainToAdd) continue; if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) { @@ -123,6 +129,9 @@ export async function processFilterRules( const flag = result[1]; const hostname = result[0]; + // if (hostname.endsWith('.')) { + // hostname = hostname.slice(0, -1); + // } if (DEBUG_DOMAIN_TO_FIND) { if (hostname.includes(DEBUG_DOMAIN_TO_FIND)) { diff --git a/Build/lib/reject-data-source.ts b/Build/lib/reject-data-source.ts index 13a052ad..95f0df52 100644 --- a/Build/lib/reject-data-source.ts +++ b/Build/lib/reject-data-source.ts @@ -31,20 +31,20 @@ export const HOSTS = [ export const DOMAIN_LISTS = [ // CoinBlockerList // Although the hosts file is still actively maintained, the hosts_browser file is not updated since 2021-07, so we set a 14 days cache ttl - ['https://zerodot1.gitlab.io/CoinBlockerLists/list_browser.txt', true, TTL.TWO_WEEKS()], + ['https://zerodot1.gitlab.io/CoinBlockerLists/list_browser.txt', true, true, TTL.TWO_WEEKS()], // BarbBlock // The barbblock list has never been updated since 2019-05, so we set a 14 days cache ttl - ['https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt', true, TTL.TWO_WEEKS()], + ['https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt', true, true, TTL.TWO_WEEKS()], // DigitalSide Threat-Intel - OSINT Hub // Update once per day - ['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true, TTL.ONE_DAY()], + ['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true, true, TTL.ONE_DAY()], // AdGuard CNAME Filter Combined // Update on a 7 days basis, so we add a 3 hours cache ttl - ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', true, TTL.THREE_DAYS()], - ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_trackers_justdomains.txt', true, TTL.THREE_DAYS()], - ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs_justdomains.txt', true, TTL.THREE_DAYS()], - ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites_justdomains.txt', true, TTL.THREE_DAYS()], - ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers_justdomains.txt', true, TTL.THREE_DAYS()] + ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', true, true, TTL.THREE_DAYS()], + ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_trackers_justdomains.txt', true, true, TTL.THREE_DAYS()], + ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs_justdomains.txt', true, true, TTL.THREE_DAYS()], + ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites_justdomains.txt', true, true, TTL.THREE_DAYS()], + ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers_justdomains.txt', true, true, TTL.THREE_DAYS()] ] as const; export const ADGUARD_FILTERS = [