Update Reject Infra & Data Source

This commit is contained in:
SukkaW 2024-09-30 08:05:47 +08:00
parent c158217dda
commit 0d1fddcb81
5 changed files with 98 additions and 69 deletions

View File

@ -4,7 +4,7 @@ import process from 'node:process';
import { processHosts, processFilterRules, processDomainLists } from './lib/parse-filter';
import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS, HOSTS_EXTRA, DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_EXTRA, PHISHING_DOMAIN_LISTS_EXTRA } from './constants/reject-data-source';
import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS, HOSTS_EXTRA, DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_EXTRA, PHISHING_DOMAIN_LISTS_EXTRA, PHISHING_HOSTS_EXTRA, ADGUARD_FILTERS_WHITELIST } from './constants/reject-data-source';
import { compareAndWriteFile } from './lib/create-file';
import { readFileByLine, readFileIntoProcessedArray } from './lib/fetch-text-by-line';
import { task } from './trace';
@ -44,7 +44,8 @@ export const buildRejectDomainSet = task(require.main === module, __filename)(as
...HOSTS_EXTRA.map(host => ` - ${host[0]}`),
...DOMAIN_LISTS_EXTRA.map(domainList => ` - ${domainList[0]}`),
...ADGUARD_FILTERS_EXTRA.map(filter => ` - ${Array.isArray(filter) ? filter[0] : filter}`),
...PHISHING_DOMAIN_LISTS_EXTRA.map(domainList => ` - ${domainList[0]}`)
...PHISHING_DOMAIN_LISTS_EXTRA.map(domainList => ` - ${domainList[0]}`),
...PHISHING_HOSTS_EXTRA.map(host => ` - ${host[0]}`)
]);
const appendArrayToRejectOutput = rejectOutput.addFromDomainset.bind(rejectOutput);
@ -91,18 +92,25 @@ export const buildRejectDomainSet = task(require.main === module, __filename)(as
appendArrayToRejectExtraOutput(black);
})
),
([
'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exceptions.txt',
'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exclusions.txt'
].map(
input => processFilterRules(childSpan, input).then(({ white, black }) => {
setAddFromArray(filterRuleWhitelistDomainSets, white);
setAddFromArray(filterRuleWhitelistDomainSets, black);
})
)),
ADGUARD_FILTERS_WHITELIST.map(entry => processFilterRules(childSpan, ...entry).then(({ white, black }) => {
setAddFromArray(filterRuleWhitelistDomainSets, white);
setAddFromArray(filterRuleWhitelistDomainSets, black);
})),
getPhishingDomains(childSpan).then(appendArrayToRejectExtraOutput),
readFileIntoProcessedArray(path.join(SOURCE_DIR, 'domainset/reject_sukka.conf')).then(appendArrayToRejectOutput)
readFileIntoProcessedArray(path.join(SOURCE_DIR, 'domainset/reject_sukka.conf')).then(appendArrayToRejectOutput),
// Dedupe domainSets
span.traceChildAsync('collect black keywords/suffixes', async () => {
/** Collect DOMAIN-KEYWORD and DOMAIN-SUFFIX rules from non_ip/reject.conf for deduplication */
for await (const line of readFileByLine(path.resolve(__dirname, '../Source/non_ip/reject.conf'))) {
const [type, value] = line.split(',');
if (type === 'DOMAIN-KEYWORD') {
rejectOutput.addDomainKeyword(value); // Add for later deduplication
rejectExtraOutput.addDomainKeyword(value); // Add for later deduplication
} else if (type === 'DOMAIN-SUFFIX') {
filterRuleWhitelistDomainSets.add('.' + value);
}
}
})
].flat());
// eslint-disable-next-line sukka/no-single-return -- not single return
return shouldStop;
@ -112,22 +120,6 @@ export const buildRejectDomainSet = task(require.main === module, __filename)(as
process.exit(1);
}
// Dedupe domainSets
await span.traceChildAsync('collect black keywords/suffixes', async () => {
/** Collect DOMAIN-KEYWORD from non_ip/reject.conf for deduplication */
for await (const line of readFileByLine(path.resolve(__dirname, '../Source/non_ip/reject.conf'))) {
const [type, value] = line.split(',');
if (type === 'DOMAIN-KEYWORD') {
rejectOutput.addDomainKeyword(value); // Add for later deduplication
rejectExtraOutput.addDomainKeyword(value); // Add for later deduplication
} else if (type === 'DOMAIN-SUFFIX') {
rejectOutput.whitelistDomain('.' + value); // Add for later deduplication
rejectExtraOutput.whitelistDomain('.' + value); // Add for later deduplication
}
}
});
await Promise.all([
rejectOutput.done(),
rejectExtraOutput.done()

View File

@ -3,14 +3,12 @@ import { TTL } from '../lib/cache-filesystem';
/**
 * One hosts / domain-list data source, as a labeled tuple.
 *
 * - `main`: primary URL of the list.
 * - `mirrors`: alternative URLs for the same list, or `null` when there are none
 *   (presumably used as fallbacks by the fetch helper; confirm there).
 * - `includeAllSubDomain`: forwarded to the list processors; for hosts files, a
 *   `true` value makes each parsed domain get emitted with a leading `.`.
 * - `ttl`: cache TTL for the downloaded list, built with the `TTL` helpers
 *   (e.g. `TTL.THREE_HOURS()`).
 */
type HostsSource = [main: string, mirrors: string[] | null, includeAllSubDomain: boolean, ttl: number];
export const HOSTS: HostsSource[] = [
// the nocoin list is not actively maintained, but it is updated daily while it is being maintained, so we set a 3-day cache ttl
['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt', null, true, TTL.THREE_DAYS()],
// has not been updated for more than a year, so we set a 14-day cache ttl
['https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hosts/spy.txt', null, true, TTL.TWO_WEEKS()],
['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Xiaomi-Extension.txt', null, false, TTL.ONE_WEEK()],
['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Huawei-AdBlock.txt', null, false, TTL.ONE_WEEK()],
['https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', null, true, TTL.THREE_HOURS()],
['https://raw.githubusercontent.com/durablenapkin/block/refs/heads/master/tvstream.txt', null, true, TTL.THREE_HOURS()]
['https://raw.githubusercontent.com/durablenapkin/block/master/tvstream.txt', null, true, TTL.THREE_HOURS()]
];
export const HOSTS_EXTRA: HostsSource[] = [
@ -24,7 +22,7 @@ export const HOSTS_EXTRA: HostsSource[] = [
// Dan Pollock's hosts file, 0.0.0.0 version is 30 KiB smaller
['https://someonewhocares.org/hosts/zero/hosts', null, true, TTL.THREE_HOURS()],
// ad-wars is not actively maintained, so we set a long cache ttl (7 days before this change, 14 days after it)
['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', null, false, TTL.ONE_WEEK()]
['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', null, false, TTL.TWO_WEEKS()]
];
export const DOMAIN_LISTS: HostsSource[] = [
@ -86,9 +84,12 @@ export const PHISHING_DOMAIN_LISTS_EXTRA: HostsSource[] = [
'https://phishing.army/download/phishing_army_blocklist.txt',
[],
true, TTL.THREE_HOURS()
],
]
];
export const PHISHING_HOSTS_EXTRA: HostsSource[] = [
[
'https://raw.githubusercontent.com/durablenapkin/scamblocklist/refs/heads/master/hosts.txt',
'https://raw.githubusercontent.com/durablenapkin/scamblocklist/master/hosts.txt',
[],
true, TTL.TWLVE_HOURS()
]
@ -97,14 +98,16 @@ export const PHISHING_DOMAIN_LISTS_EXTRA: HostsSource[] = [
/**
 * One AdGuard-syntax filter list source, as a labeled tuple.
 *
 * - `main`: primary URL of the filter list.
 * - `mirrors`: alternative URLs for the same list, or `null` when there are none
 *   (presumably used as fallbacks by the fetch helper; confirm there).
 * - `ttl`: cache TTL for the downloaded list, built with the `TTL` helpers
 *   (e.g. `TTL.THREE_HOURS()`).
 * - `allowThirdParty`: optional flag; entries that omit it have only three elements.
 *   NOTE(review): its exact effect is decided by the filter-rule parser; verify there.
 */
type AdGuardFilterSource = [main: string, mirrors: string[] | null, ttl: number, allowThirdParty?: boolean];
export const ADGUARD_FILTERS: AdGuardFilterSource[] = [
// the nocoin AdGuard filter list is better maintained than its hosts file
['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/nocoin.txt', [], TTL.TWO_WEEKS()],
// EasyList
[
'https://easylist.to/easylist/easylist.txt',
[
'https://ublockorigin.github.io/uAssetsCDN/thirdparties/easylist.txt',
'https://ublockorigin.pages.dev/thirdparties/easylist.txt',
'https://easylist-downloads.adblockplus.org/easylist.txt',
'https://secure.fanboy.co.nz/easylist.txt',
'https://ublockorigin.github.io/uAssetsCDN/thirdparties/easylist.txt',
'https://ublockorigin.pages.dev/thirdparties/easylist.txt',
'https://raw.githubusercontent.com/easylist/easylist/gh-pages/easylist.txt'
],
TTL.TWLVE_HOURS()
@ -113,11 +116,11 @@ export const ADGUARD_FILTERS: AdGuardFilterSource[] = [
[
'https://easylist.to/easylist/easyprivacy.txt',
[
'https://secure.fanboy.co.nz/easyprivacy.txt',
'https://raw.githubusercontent.com/easylist/easylist/gh-pages/easyprivacy.txt',
'https://easylist-downloads.adblockplus.org/easyprivacy.txt',
'https://secure.fanboy.co.nz/easyprivacy.txt',
'https://ublockorigin.github.io/uAssetsCDN/thirdparties/easyprivacy.txt',
'https://ublockorigin.pages.dev/thirdparties/easyprivacy.txt'
'https://ublockorigin.pages.dev/thirdparties/easyprivacy.txt',
'https://raw.githubusercontent.com/easylist/easylist/gh-pages/easyprivacy.txt'
],
TTL.TWLVE_HOURS()
],
@ -150,23 +153,46 @@ export const ADGUARD_FILTERS: AdGuardFilterSource[] = [
],
TTL.TEN_DAYS()
],
// Brave First Party & First Party CNAME
['https://raw.githubusercontent.com/brave/adblock-lists/master/brave-lists/brave-firstparty.txt', null, TTL.ONE_DAY()]
// uBlock Origin Unbreak
[
'https://ublockorigin.github.io/uAssetsCDN/filters/unbreak.min.txt',
[
'https://ublockorigin.pages.dev/filters/unbreak.min.txt'
],
TTL.THREE_HOURS()
]
];
/**
 * AdGuard filter lists that are used only for whitelisting.
 *
 * The reject domain-set build passes each entry to `processFilterRules` and adds
 * BOTH the resulting `white` and `black` domains to the whitelist set, so nothing
 * listed here ends up being blocked.
 *
 * Each entry is `[main URL, mirror URLs, cache TTL]`; the GitHub Pages URL is the
 * main source and the raw.githubusercontent.com URL is listed as its mirror.
 */
export const ADGUARD_FILTERS_WHITELIST: AdGuardFilterSource[] = [
// AdGuard SDNS filter: exceptions
[
'https://adguardteam.github.io/AdGuardSDNSFilter/Filters/exceptions.txt',
[
'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exceptions.txt'
],
TTL.THREE_HOURS()
],
// AdGuard SDNS filter: exclusions
[
'https://adguardteam.github.io/AdGuardSDNSFilter/Filters/exclusions.txt',
[
'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exclusions.txt'
],
TTL.THREE_HOURS()
]
];
export const ADGUARD_FILTERS_EXTRA: AdGuardFilterSource[] = [
// AdGuard Annoyances filter
['https://filters.adtidy.org/android/filters/14_optimized.txt', null, TTL.THREE_HOURS(), true],
// AdGuard Cookie Notices
['https://filters.adtidy.org/extension/ublock/filters/18_optimized.txt', null, TTL.THREE_HOURS(), true],
// EasyList Germany filter
[
'https://easylist.to/easylistgermany/easylistgermany.txt',
[
'https://easylist-downloads.adblockplus.org/easylistgermany.txt'
],
TTL.TWLVE_HOURS()
],
['https://filters.adtidy.org/extension/ublock/filters/14_optimized.txt', null, TTL.THREE_HOURS(), true],
// AdGuard Cookie Notices, included in Annoyances filter
// ['https://filters.adtidy.org/extension/ublock/filters/18_optimized.txt', null, TTL.THREE_HOURS(), true],
// EasyList Germany filter, not even included in extra for now
// [
// 'https://easylist.to/easylistgermany/easylistgermany.txt',
// [
// 'https://easylist-downloads.adblockplus.org/easylistgermany.txt'
// ],
// TTL.TWLVE_HOURS()
// ],
// AdGuard Japanese filter
['https://filters.adtidy.org/extension/ublock/filters/7_optimized.txt', null, TTL.THREE_HOURS()],
// uBlock Origin Filter List
@ -177,8 +203,8 @@ export const ADGUARD_FILTERS_EXTRA: AdGuardFilterSource[] = [
],
TTL.THREE_HOURS()
],
// AdGuard Popup Overlay
['https://filters.adtidy.org/extension/ublock/filters/19_optimized.txt', null, TTL.THREE_HOURS(), true],
// AdGuard Popup Overlay - included in Annoyances filter
// ['https://filters.adtidy.org/extension/ublock/filters/19_optimized.txt', null, TTL.THREE_HOURS(), true],
// AdGuard Mobile Banner
// almost all generic rule
// ['https://filters.adtidy.org/extension/ublock/filters/20_optimized.txt', null, TTL.THREE_HOURS()],
@ -205,14 +231,6 @@ export const ADGUARD_FILTERS_EXTRA: AdGuardFilterSource[] = [
// 'https://ublockorigin.pages.dev/filters/resource-abuse.txt'
// ]
// ],
// uBlock Origin Unbreak
[
'https://ublockorigin.github.io/uAssetsCDN/filters/unbreak.min.txt',
[
'https://ublockorigin.pages.dev/filters/unbreak.min.txt'
],
TTL.THREE_HOURS()
],
// uBlock Origin Annoyances
[
'https://ublockorigin.github.io/uAssetsCDN/filters/annoyances.min.txt',
@ -229,6 +247,8 @@ export const ADGUARD_FILTERS_EXTRA: AdGuardFilterSource[] = [
],
TTL.THREE_HOURS()
],
// Dandelion Sprout's Annoyances
['https://filters.adtidy.org/extension/ublock/filters/250_optimized.txt', null, TTL.THREE_HOURS(), true],
// EasyList - Newsletters
[
'https://ublockorigin.github.io/uAssetsCDN/thirdparties/easylist-newsletters.txt',
@ -253,6 +273,12 @@ export const ADGUARD_FILTERS_EXTRA: AdGuardFilterSource[] = [
'https://secure.fanboy.co.nz/fanboy-cookiemonster_ubo.txt'
],
TTL.TWLVE_HOURS()
],
// Bypass Paywall Cleaner
[
'https://gitflic.ru/project/magnolia1234/bypass-paywalls-clean-filters/blob/raw?file=bpc-paywall-filter.txt',
[],
TTL.ONE_DAY()
]
];
@ -269,6 +295,7 @@ export const PREDEFINED_WHITELIST = [
'.ip6-allhosts',
'.mcastprefix',
'.skk.moe',
'.cdn.cloudflare.net', // Surge/Clash doesn't support CNAME
'analytics.google.com',
'.cloud.answerhub.com',
'ae01.alicdn.com',

View File

@ -1,10 +1,10 @@
import { processDomainLists } from './parse-filter';
import { processDomainLists, processHosts } from './parse-filter';
import * as tldts from 'tldts-experimental';
import { dummySpan } from '../trace';
import type { Span } from '../trace';
import { appendArrayInPlaceCurried } from './append-array-in-place';
import { PHISHING_DOMAIN_LISTS_EXTRA } from '../constants/reject-data-source';
import { PHISHING_DOMAIN_LISTS_EXTRA, PHISHING_HOSTS_EXTRA } from '../constants/reject-data-source';
import { loosTldOptWithPrivateDomains } from '../constants/loose-tldts-opt';
import picocolors from 'picocolors';
import createKeywordFilter from './aho-corasick';
@ -162,6 +162,8 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
(await Promise.all(PHISHING_DOMAIN_LISTS_EXTRA.map(entry => processDomainLists(curSpan, ...entry, cacheKey))))
.forEach(appendArrayInPlaceCurried(domainArr));
(await Promise.all(PHISHING_HOSTS_EXTRA.map(entry => processHosts(curSpan, ...entry, cacheKey))))
.forEach(appendArrayInPlaceCurried(domainArr));
return domainArr;
});

View File

@ -45,7 +45,11 @@ const domainListLineCb = (l: string, set: string[], includeAllSubDomain: boolean
const cacheKey = createCacheKey(__filename);
export function processDomainLists(span: Span, domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false, ttl: number | null = null, extraCacheKey: (input: string) => string = identity) {
export function processDomainLists(
span: Span,
domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false,
ttl: number | null = null, extraCacheKey: (input: string) => string = identity
) {
return span.traceChild(`process domainlist: ${domainListsUrl}`).traceAsyncFn((childSpan) => fsFetchCache.apply(
extraCacheKey(cacheKey(domainListsUrl)),
async () => {
@ -100,9 +104,13 @@ const hostsLineCb = (l: string, set: string[], includeAllSubDomain: boolean, met
set.push(includeAllSubDomain ? `.${domain}` : domain);
};
export function processHosts(span: Span, hostsUrl: string, mirrors: string[] | null, includeAllSubDomain = false, ttl: number | null = null) {
export function processHosts(
span: Span,
hostsUrl: string, mirrors: string[] | null, includeAllSubDomain = false,
ttl: number | null = null, extraCacheKey: (input: string) => string = identity
) {
return span.traceChild(`processhosts: ${hostsUrl}`).traceAsyncFn((childSpan) => fsFetchCache.apply(
cacheKey(hostsUrl),
extraCacheKey(cacheKey(hostsUrl)),
async () => {
const domainSets: string[] = [];

View File

@ -214,8 +214,8 @@ export class RulesetOutput extends RuleOutput<Preprocessed> {
}
}
console.error(picocolors.bold('Parsed Failed'));
if (parsedFailures.length > 0) {
console.error(picocolors.bold('Parsed Failed'));
console.table(parsedFailures);
}