Chore: simplify build infra / remove reject_phishing

This commit is contained in:
SukkaW
2023-12-09 23:23:55 +08:00
parent 42e9b4310f
commit 1928c052a9
24 changed files with 259 additions and 332 deletions

View File

@@ -1,6 +1,7 @@
// @ts-check
import { readFileByLine } from './fetch-remote-text-by-line';
import { readFileByLine } from './fetch-text-by-line';
import { surgeDomainsetToClashDomainset, surgeRulesetToClashClassicalTextRuleset } from './clash';
import { traceAsync } from './trace-runner';
export async function compareAndWriteFile(linesA: string[], filePath: string) {
let isEqual = true;
@@ -21,7 +22,7 @@ export async function compareAndWriteFile(linesA: string[], filePath: string) {
const lineA = linesA[index];
index++;
if (typeof lineA !== 'string') {
if (lineA == null) {
// The file becomes smaller
isEqual = false;
break;
@@ -37,7 +38,7 @@ export async function compareAndWriteFile(linesA: string[], filePath: string) {
}
}
if (index !== linesALen) {
if (isEqual && index !== linesALen) {
// The file becomes larger
isEqual = false;
}
@@ -48,13 +49,11 @@ export async function compareAndWriteFile(linesA: string[], filePath: string) {
return;
}
console.log(`Writing ${filePath}...`);
await traceAsync(`Writing ${filePath}`, async () => {
if (linesALen < 10000) {
return Bun.write(file, `${linesA.join('\n')}\n`);
}
const start = Bun.nanoseconds();
if (linesALen < 10000) {
await Bun.write(file, `${linesA.join('\n')}\n`);
} else {
const writer = file.writer();
for (let i = 0; i < linesALen; i++) {
@@ -62,11 +61,9 @@ export async function compareAndWriteFile(linesA: string[], filePath: string) {
writer.write('\n');
}
writer.flush();
await writer.end();
}
console.log(`Done writing ${filePath} in ${(Bun.nanoseconds() - start) / 1e6}ms`);
await writer.flush();
return writer.end();
});
}
export const withBannerArray = (title: string, description: string[], date: Date, content: string[]) => {

View File

@@ -83,6 +83,7 @@ function createFetchRetry($fetch: typeof fetch): typeof fetch {
} catch (err: unknown) {
if (err instanceof Error) {
if (err.name === 'AbortError') {
console.log('[fetch abort]', url.toString());
return bail(err);
}
}

View File

@@ -78,6 +78,6 @@ export async function *createReadlineInterfaceFromResponse(resp: Response): Asyn
}
}
export function fetchRemoteTextAndCreateReadlineInterface(url: string | URL) {
export function fetchRemoteTextAndReadByLine(url: string | URL) {
return fetchWithRetry(url, defaultRequestInit).then(res => createReadlineInterfaceFromResponse(res as Response));
}

View File

@@ -0,0 +1,178 @@
import fsp from 'fs/promises';
import path from 'path';
import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
import { processHosts } from './parse-filter';
import { traceAsync, traceSync } from './trace-runner';
import * as tldts from 'tldts';
import { createTrie } from './trie';
import { createCachedGorhillGetDomain } from './cached-tld-parse';
import { processLine } from './process-line';
/**
 * Hosting / link-sharing domains that must never be treated as phishing.
 *
 * Each entry is deleted from the downloaded phishing set, together with
 * whatever the trie lookup for `.<entry>` returns, before any scoring runs.
 */
const WHITELIST_DOMAIN = new Set<string>([
  // IPFS / NFT gateways
  'w3s.link', 'dweb.link', 'nftstorage.link',
  // Site builders
  'square.site', 'business.site',
  // Firebase URL Shortener
  'page.link',
  'fleek.cool',
  'notion.site'
]);
/**
 * Public suffixes that raise a domain's phishing score.
 *
 * An entry whose public suffix is not in this set receives none of the
 * suffix/length/subdomain weights applied by `getPhishingDomains`.
 * Multi-label entries (`com.cn`, `rf.gd`, `sa.com`, `za.com`) are matched
 * as whole suffixes, exactly like the single-label ones.
 */
const BLACK_TLD = new Set<string>([
  'autos',
  'bar', 'biz', 'bond', 'business', 'buzz',
  'cc', 'cf', 'cfd', 'click', 'cloud', 'club', 'cn', 'codes', 'com.cn', 'cool', 'cyou',
  'fit', 'fun',
  'ga', 'gd', 'gq', 'group',
  'host',
  'icu', 'id', 'info', 'ink',
  'life', 'live', 'link', 'ltd',
  'ml', 'mobi',
  'one', 'online',
  'pro', 'pl', 'pw',
  'rest', 'rf.gd',
  'sa.com', 'sbs', 'shop', 'site', 'space', 'store',
  'tech', 'tk', 'tokyo', 'top',
  'vip', 'vn',
  'website', 'win',
  'xyz',
  'za.com'
]);
/**
 * Downloads the phishing hosts list and works out which apex domains look
 * like dedicated phishing domains.
 *
 * Steps:
 *  1. Fetch the hosts list and the public suffix list in parallel, while
 *     deleting the legacy `reject_phishing` output files.
 *  2. Drop every whitelisted domain from the fetched set.
 *  3. Accumulate a heuristic score per apex domain across all remaining
 *     entries (brand-mocking prefixes, blacklisted suffix, entry length,
 *     nested subdomains).
 *  4. Keep the apex domains whose accumulated score is at least 5.
 *
 * @returns a readonly tuple `[results, domainSet]`: the apex domains that
 * reached the threshold, and the fetched domain set after whitelist removal.
 */
export const getPhishingDomains = () => traceAsync('get phishing domains', async () => {
  const fetched = await Promise.all([
    processHosts('https://curbengh.github.io/phishing-filter/phishing-filter-hosts.txt', true, true),
    // processDomainLists('https://phishing.army/download/phishing_army_blocklist.txt', true),
    // processFilterRules(
    //   'https://curbengh.github.io/phishing-filter/phishing-filter-agh.txt',
    //   [
    //     'https://phishing-filter.pages.dev/phishing-filter-agh.txt'
    //     // Prefer mirror, since malware-filter.gitlab.io has not been updated for a while
    //     // 'https://malware-filter.gitlab.io/malware-filter/phishing-filter-agh.txt'
    //   ]
    // ),
    getGorhillPublicSuffixPromise(),
    // Remove old files
    fsp.rm(path.resolve(import.meta.dir, '../../List/domainset/reject_phishing.conf'), { force: true }),
    fsp.rm(path.resolve(import.meta.dir, '../../Clash/domainset/reject_phishing.txt'), { force: true })
  ]);
  const domainSet = fetched[0];
  const gorhill = fetched[1];

  traceSync.skip('* whitelisting phishing domains', () => {
    const whitelistTrie = createTrie(domainSet);
    for (const allowed of WHITELIST_DOMAIN) {
      // NOTE(review): `find` presumably returns the stored entries ending in
      // `.<allowed>` — confirm against ./trie.
      whitelistTrie.find(`.${allowed}`, false).forEach(hit => domainSet.delete(hit));
      // The bare domain is deleted unconditionally (no membership check first).
      domainSet.delete(allowed);
    }
  });

  // Running score per apex domain, accumulated over every entry of that apex.
  const scoreByApex: Record<string, number> = {};
  const addWeight = (apex: string, weight: number): void => {
    scoreByApex[apex] += weight;
  };
  // Extra weight for an entry longer than 19 characters on a blacklisted suffix.
  const lengthWeight = (len: number): number => {
    if (len > 44) return 3.5;
    if (len > 34) return 2.5;
    if (len > 29) return 1.5;
    if (len > 24) return 0.75;
    return 0.25;
  };

  const getDomain = createCachedGorhillGetDomain(gorhill);

  traceSync.skip('* process phishing domain set', () => {
    for (const entry of domainSet) {
      const line = processLine(entry);
      if (!line) continue;

      const apex = getDomain(line);
      if (!apex) continue;

      scoreByApex[apex] ||= 0;

      // NOTE(review): the prefix checks below expect a leading dot on every
      // entry (processHosts is called with includeAllSubDomain = true) —
      // confirm against ./parse-filter.
      const mocksCoJp = line.includes('-co-jp');
      if (mocksCoJp) {
        addWeight(apex, 0.5);
      }

      if (line.startsWith('.amaz')) {
        addWeight(apex, 0.5);
        if (line.startsWith('.amazon-')) {
          addWeight(apex, 4.5);
        }
        if (mocksCoJp) {
          addWeight(apex, 4);
        }
      } else if (line.startsWith('.customer')) {
        addWeight(apex, 0.25);
      }

      // The public suffix lookup gets the entry without its leading dot.
      const suffix = gorhill.getPublicSuffix(line.startsWith('.') ? line.slice(1) : line);
      if (!(suffix && BLACK_TLD.has(suffix))) continue;

      addWeight(apex, 1);

      const len = line.length;
      if (len <= 19) continue;

      // Add more weight if the domain is long enough
      addWeight(apex, lengthWeight(len));

      // Only apexes still below the threshold get the subdomain check.
      // NOTE(review): `line` is passed with its leading dot intact here,
      // unlike the suffix lookup above — confirm tldts handles that as intended.
      if (scoreByApex[apex] < 5) {
        const sub = tldts.getSubdomain(line, { detectIp: false });
        if (sub?.includes('.')) {
          addWeight(apex, 1.5);
        }
      }
    }
  });

  const results = traceSync.skip('* get final phishing results', () => {
    const flagged: string[] = [];
    for (const [apex, score] of Object.entries(scoreByApex)) {
      if (score >= 5) {
        flagged.push(apex);
      }
    }
    return flagged;
  });

  return [results, domainSet] as const;
});

View File

@@ -1,4 +1,4 @@
import { fetchRemoteTextAndCreateReadlineInterface } from './fetch-remote-text-by-line';
import { fetchRemoteTextAndReadByLine } from './fetch-text-by-line';
import { parse } from 'tldts';
const isDomainLoose = (domain: string): boolean => {
@@ -8,7 +8,7 @@ const isDomainLoose = (domain: string): boolean => {
export const parseFelixDnsmasq = async (url: string | URL): Promise<string[]> => {
const res: string[] = [];
for await (const line of await fetchRemoteTextAndCreateReadlineInterface(url)) {
for await (const line of await fetchRemoteTextAndReadByLine(url)) {
if (line.startsWith('server=/') && line.endsWith('/114.114.114.114')) {
const domain = line.replace('server=/', '').replace('/114.114.114.114', '');
if (isDomainLoose(domain)) {

View File

@@ -1,12 +1,13 @@
// @ts-check
import { defaultRequestInit, fetchWithRetry } from './fetch-retry';
import * as tldts from './cached-tld-parse';
import { fetchRemoteTextAndCreateReadlineInterface } from './fetch-remote-text-by-line';
import { fetchRemoteTextAndReadByLine } from './fetch-text-by-line';
import { NetworkFilter } from '@cliqz/adblocker';
import { processLine } from './process-line';
import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
import type { PublicSuffixList } from 'gorhill-publicsuffixlist';
import { isProbablyIpv4 } from './is-fast-ip';
import { traceAsync } from './trace-runner';
const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null
let foundDebugDomain = false;
@@ -42,7 +43,7 @@ export async function processDomainLists(domainListsUrl: string | URL, includeAl
const domainSets = new Set<string>();
for await (const line of await fetchRemoteTextAndCreateReadlineInterface(domainListsUrl)) {
for await (const line of await fetchRemoteTextAndReadByLine(domainListsUrl)) {
const domainToAdd = processLine(line);
if (!domainToAdd) {
continue;
@@ -64,145 +65,134 @@ export async function processDomainLists(domainListsUrl: string | URL, includeAl
}
export async function processHosts(hostsUrl: string | URL, includeAllSubDomain = false, skipDomainCheck = false) {
console.time(`- processHosts: ${hostsUrl.toString()}`);
if (typeof hostsUrl === 'string') {
hostsUrl = new URL(hostsUrl);
}
const domainSets = new Set<string>();
for await (const l of await fetchRemoteTextAndCreateReadlineInterface(hostsUrl)) {
const line = processLine(l);
if (!line) {
continue;
return traceAsync(`- processHosts: ${hostsUrl.toString()}`, async () => {
if (typeof hostsUrl === 'string') {
hostsUrl = new URL(hostsUrl);
}
const [, ...domains] = line.split(' ');
const _domain = domains.join(' ').trim();
const domainSets = new Set<string>();
if (DEBUG_DOMAIN_TO_FIND && _domain.includes(DEBUG_DOMAIN_TO_FIND)) {
warnOnce(hostsUrl.href, false, DEBUG_DOMAIN_TO_FIND);
foundDebugDomain = true;
}
for await (const l of await fetchRemoteTextAndReadByLine(hostsUrl)) {
const line = processLine(l);
if (!line) {
continue;
}
const domain = skipDomainCheck ? _domain : normalizeDomain(_domain);
const [, ...domains] = line.split(' ');
const _domain = domains.join(' ').trim();
if (domain) {
if (includeAllSubDomain) {
domainSets.add(`.${domain}`);
} else {
domainSets.add(domain);
if (DEBUG_DOMAIN_TO_FIND && _domain.includes(DEBUG_DOMAIN_TO_FIND)) {
warnOnce(hostsUrl.href, false, DEBUG_DOMAIN_TO_FIND);
foundDebugDomain = true;
}
const domain = skipDomainCheck ? _domain : normalizeDomain(_domain);
if (domain) {
if (includeAllSubDomain) {
domainSets.add(`.${domain}`);
} else {
domainSets.add(domain);
}
}
}
}
console.timeEnd(` - processHosts: ${hostsUrl.toString()}`);
return domainSets;
return domainSets;
});
}
export async function processFilterRules(
filterRulesUrl: string | URL,
fallbackUrls?: ReadonlyArray<string | URL> | undefined
): Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean }> {
const runStart = Bun.nanoseconds();
const whitelistDomainSets = new Set<string>();
const blacklistDomainSets = new Set<string>();
let downloadTime = 0;
const gorhill = await getGorhillPublicSuffixPromise();
await traceAsync(`- processFilterRules: ${filterRulesUrl.toString()}`, async () => {
const gorhill = await getGorhillPublicSuffixPromise();
/**
* @param {string} line
*/
const lineCb = (line: string) => {
const result = parse(line, gorhill);
if (!result) {
return;
}
/**
* @param {string} line
*/
const lineCb = (line: string) => {
const result = parse(line, gorhill);
if (!result) {
return;
}
const flag = result[1];
const hostname = result[0];
const flag = result[1];
const hostname = result[0];
if (DEBUG_DOMAIN_TO_FIND) {
if (hostname.includes(DEBUG_DOMAIN_TO_FIND)) {
warnOnce(filterRulesUrl.toString(), flag === 0 || flag === -1, DEBUG_DOMAIN_TO_FIND);
foundDebugDomain = true;
if (DEBUG_DOMAIN_TO_FIND) {
if (hostname.includes(DEBUG_DOMAIN_TO_FIND)) {
warnOnce(filterRulesUrl.toString(), flag === 0 || flag === -1, DEBUG_DOMAIN_TO_FIND);
foundDebugDomain = true;
console.log({ result, flag });
console.log({ result, flag });
}
}
switch (flag) {
case 0:
if (hostname[0] !== '.') {
whitelistDomainSets.add(`.${hostname}`);
} else {
whitelistDomainSets.add(hostname);
}
break;
case -1:
whitelistDomainSets.add(hostname);
break;
case 1:
blacklistDomainSets.add(hostname);
break;
case 2:
if (hostname[0] !== '.') {
blacklistDomainSets.add(`.${hostname}`);
} else {
blacklistDomainSets.add(hostname);
}
break;
default:
throw new Error(`Unknown flag: ${flag as any}`);
}
};
if (!fallbackUrls || fallbackUrls.length === 0) {
for await (const line of await fetchRemoteTextAndReadByLine(filterRulesUrl)) {
// don't trim here
lineCb(line);
}
} else {
let filterRules;
try {
const controller = new AbortController();
/** @type string[] */
filterRules = (
await Promise.any(
[filterRulesUrl, ...fallbackUrls].map(async url => {
const r = await fetchWithRetry(url, { signal: controller.signal, ...defaultRequestInit });
const text = await r.text();
console.log('[fetch finish]', url.toString());
controller.abort();
return text;
})
)
).split('\n');
} catch (e) {
console.log(`Download Rule for [${filterRulesUrl.toString()}] failed`);
throw e;
}
for (let i = 0, len = filterRules.length; i < len; i++) {
lineCb(filterRules[i]);
}
}
switch (flag) {
case 0:
if (hostname[0] !== '.') {
whitelistDomainSets.add(`.${hostname}`);
} else {
whitelistDomainSets.add(hostname);
}
break;
case -1:
whitelistDomainSets.add(hostname);
break;
case 1:
blacklistDomainSets.add(hostname);
break;
case 2:
if (hostname[0] !== '.') {
blacklistDomainSets.add(`.${hostname}`);
} else {
blacklistDomainSets.add(hostname);
}
break;
default:
throw new Error(`Unknown flag: ${flag as any}`);
}
};
if (!fallbackUrls || fallbackUrls.length === 0) {
downloadTime = 0;
let last = Bun.nanoseconds();
for await (const line of await fetchRemoteTextAndCreateReadlineInterface(filterRulesUrl)) {
const now = Bun.nanoseconds();
downloadTime += Bun.nanoseconds() - last;
last = now;
// don't trim here
lineCb(line);
}
} else {
let filterRules;
const downloadStart = Bun.nanoseconds();
try {
const controller = new AbortController();
/** @type string[] */
filterRules = (
await Promise.any(
[filterRulesUrl, ...fallbackUrls].map(async url => {
const r = await fetchWithRetry(url, { signal: controller.signal, ...defaultRequestInit });
const text = await r.text();
controller.abort();
return text;
})
)
).split('\n');
} catch (e) {
console.log(`Download Rule for [${filterRulesUrl.toString()}] failed`);
throw e;
}
downloadTime = Bun.nanoseconds() - downloadStart;
for (let i = 0, len = filterRules.length; i < len; i++) {
lineCb(filterRules[i]);
}
}
console.log(` ┬ processFilterRules (${filterRulesUrl.toString()}): ${((Bun.nanoseconds() - runStart) / 1e6).toFixed(3)}ms`);
console.log(` └── download time: ${(downloadTime / 1e6).toFixed(3)}ms`);
});
return {
white: whitelistDomainSets,

View File

@@ -11,12 +11,13 @@ export const HOSTS = [
// Curben's UrlHaus Malicious URL Blocklist
// 'https://curbengh.github.io/urlhaus-filter/urlhaus-filter-agh-online.txt',
// 'https://urlhaus-filter.pages.dev/urlhaus-filter-agh-online.txt',
// 'https://ublockorigin.github.io/uAssets/thirdparties/urlhaus-filter/urlhaus-filter-online.txt',
// 'https://ublockorigin.github.io/uAssetsCDN/thirdparties/urlhaus-filter/urlhaus-filter-online.txt',
['https://curbengh.github.io/urlhaus-filter/urlhaus-filter-hosts.txt', true, true],
// Curben's Phishing URL Blocklist
// Covered by lib/get-phishing-domains.ts
// 'https://curbengh.github.io/phishing-filter/phishing-filter-agh.txt'
// 'https://phishing-filter.pages.dev/phishing-filter-agh.txt'
['https://curbengh.github.io/phishing-filter/phishing-filter-hosts.txt', true, true],
// ['https://curbengh.github.io/phishing-filter/phishing-filter-hosts.txt', true, true],
// Curben's PUP Domains Blocklist
// 'https://curbengh.github.io/pup-filter/pup-filter-agh.txt'
// 'https://pup-filter.pages.dev/pup-filter-agh.txt'
@@ -33,7 +34,7 @@ export const ADGUARD_FILTERS = [
'https://easylist-downloads.adblockplus.org/easylist.txt',
'https://raw.githubusercontent.com/easylist/easylist/gh-pages/easylist.txt',
'https://secure.fanboy.co.nz/easylist.txt',
'https://ublockorigin.github.io/uAssets/thirdparties/easylist.txt',
'https://ublockorigin.github.io/uAssetsCDN/thirdparties/easylist.txt',
'https://ublockorigin.pages.dev/thirdparties/easylist.txt'
]
],
@@ -44,7 +45,7 @@ export const ADGUARD_FILTERS = [
'https://secure.fanboy.co.nz/easyprivacy.txt',
'https://raw.githubusercontent.com/easylist/easylist/gh-pages/easyprivacy.txt',
'https://easylist-downloads.adblockplus.org/easyprivacy.txt',
'https://ublockorigin.github.io/uAssets/thirdparties/easyprivacy.txt',
'https://ublockorigin.github.io/uAssetsCDN/thirdparties/easyprivacy.txt',
'https://ublockorigin.pages.dev/thirdparties/easyprivacy.txt'
]
],
@@ -52,7 +53,7 @@ export const ADGUARD_FILTERS = [
[
'https://adguardteam.github.io/AdGuardSDNSFilter/Filters/filter.txt',
[
'https://filters.adtidy.org/extension/chromium/filters/15.txt'
'https://filters.adtidy.org/extension/ublock/filters/15.txt'
]
],
// AdGuard CNAME Filter Combined
@@ -63,41 +64,36 @@ export const ADGUARD_FILTERS = [
'https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers.txt',
// uBlock Origin Filter List
[
'https://ublockorigin.github.io/uAssets/filters/filters.min.txt',
'https://ublockorigin.github.io/uAssetsCDN/filters/filters.min.txt',
[
'https://ublockorigin.github.io/uAssetsCDN/filters/filters.min.txt',
'https://ublockorigin.pages.dev/filters/filters.min.txt'
]
],
// uBlock Origin Badware Risk List
[
'https://ublockorigin.github.io/uAssets/filters/badware.min.txt',
'https://ublockorigin.github.io/uAssetsCDN/filters/badware.min.txt',
[
'https://ublockorigin.github.io/uAssetsCDN/filters/badware.min.txt',
'https://ublockorigin.pages.dev/filters/badware.min.txt'
]
],
// uBlock Origin Privacy List
[
'https://ublockorigin.github.io/uAssets/filters/privacy.min.txt',
'https://ublockorigin.github.io/uAssetsCDN/filters/privacy.min.txt',
[
'https://ublockorigin.github.io/uAssetsCDN/filters/privacy.min.txt',
'https://ublockorigin.pages.dev/filters/privacy.min.txt'
]
],
// uBlock Origin Resource Abuse: merged in uBlock Origin Privacy List
// [
// 'https://ublockorigin.github.io/uAssets/filters/resource-abuse.txt',
// 'https://ublockorigin.github.io/uAssetsCDN/filters/resource-abuse.txt',
// [
// 'https://ublockorigin.github.io/uAssetsCDN/filters/resource-abuse.txt',
// 'https://ublockorigin.pages.dev/filters/resource-abuse.txt'
// ]
// ],
// uBlock Origin Unbreak
[
'https://ublockorigin.github.io/uAssets/filters/unbreak.min.txt',
'https://ublockorigin.github.io/uAssetsCDN/filters/unbreak.min.txt',
[
'https://ublockorigin.github.io/uAssetsCDN/filters/unbreak.min.txt',
'https://ublockorigin.pages.dev/filters/unbreak.min.txt'
]
],

View File

@@ -6,4 +6,8 @@ describe('stable-sort-domain', () => {
it('.ks.cn, .tag.unclaimedproperty.ks.gov', () => {
expect(domainSorter('.ks.cn', '.tag.unclaimedproperty.ks.gov')).toBe(-1);
});
it('.fgnzdb.xyz, .hub.fghtem.com', () => {
expect(domainSorter('.fgnzdb.xyz', '.hub.fghtem.com')).toBe(1);
});
});

View File

@@ -1,19 +1,21 @@
import path from 'path';
import picocolors from 'picocolors';
const traceSync = <T>(prefix: string, fn: () => T): T => {
function traceSync<T>(prefix: string, fn: () => T): T {
const start = Bun.nanoseconds();
const result = fn();
const end = Bun.nanoseconds();
console.log(`${prefix}: ${((end - start) / 1e6).toFixed(3)}ms`);
console.log(`${picocolors.gray(`[${((end - start) / 1e6).toFixed(3)}ms]`)} ${prefix}`);
return result;
};
}
traceSync.skip = <T>(prefix: string, fn: () => T): T => fn();
export { traceSync };
const traceAsync = async <T>(prefix: string, fn: () => Promise<T>): Promise<T> => {
const start = Bun.nanoseconds();
const result = await fn();
const end = Bun.nanoseconds();
console.log(`${prefix}: ${((end - start) / 1e6).toFixed(3)}ms`);
console.log(`${picocolors.gray(`[${((end - start) / 1e6).toFixed(3)}ms]`)} ${prefix}`);
return result;
};
export { traceAsync };
@@ -31,7 +33,7 @@ const task = <T>(importMetaPath: string, fn: () => Promise<T>, customname: strin
const start = Bun.nanoseconds();
await fn();
const end = Bun.nanoseconds();
console.log(`✅ [${taskName}] Executed successfully: ${((end - start) / 1e6).toFixed(3)}ms`);
console.log(`✅ [${taskName}] [${((end - start) / 1e6).toFixed(3)}ms] Executed successfully`);
return { start, end, taskName } as TaskResult;
};