Fix: work around normalizeDomain issue

This commit is contained in:
SukkaW
2024-08-04 23:00:47 +08:00
parent b3d3052630
commit 32ef8ef7b6
6 changed files with 280 additions and 258 deletions

View File

@@ -0,0 +1,9 @@
import { describe, it } from 'mocha';
import { normalizeDomain } from './normalize-domain';
// Smoke test for normalizeDomain: runs it on one sample hostname and prints
// the result. There is no assertion, so this only fails if the call throws.
describe('normalizeDomain', () => {
  const sampleHostname = 'mine.torrent.pw';

  it(sampleHostname, () => {
    const normalized = normalizeDomain(sampleHostname);
    console.log(normalized);
  });
});

View File

@@ -1,11 +1,11 @@
import { parse as tldtsParse } from 'tldts-experimental';
import { isProbablyIpv4 } from './is-fast-ip';
// https://github.com/remusao/tldts/issues/2121
// import tldts from 'tldts-experimental';
import tldts from 'tldts';
export const normalizeDomain = (domain: string) => {
if (!domain) return null;
if (isProbablyIpv4(domain)) return null;
const parsed = tldtsParse(domain, { allowPrivateDomains: true, detectIp: false });
// if (parsed.isIp) return null;
const parsed = tldts.parse(domain, { allowPrivateDomains: true, allowIcannDomains: true, detectIp: true });
if (parsed.isIp) return null;
if (!parsed.hostname) return null;
// Private invalid domain (things like .tor, .dn42, etc)
if (!parsed.isIcann && !parsed.isPrivate) return null;

View File

@@ -10,10 +10,11 @@ import { fetchAssets } from './fetch-assets';
import { deserializeArray, fsFetchCache, serializeArray } from './cache-filesystem';
import type { Span } from '../trace';
import createKeywordFilter from './aho-corasick';
import { looseTldtsOpt } from '../constants/loose-tldts-opt';
// Debug configuration for this module.
// DEBUG_DOMAIN_TO_FIND: either a domain string (e.g. example.com) or null.
// Presumably the domain to look for while debugging — TODO confirm against its usages.
const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null
// Presumably set once the debug domain has been encountered — usage is not visible here.
let foundDebugDomain = false;
// NOTE(review): this diff hunk shows both the previous and the new definition of
// `temporaryBypass`. The previous one was derived from DEBUG_DOMAIN_TO_FIND; the new
// one is hard-coded to `true` regardless of that setting. Confirm this is an
// intentional part of the workaround and not leftover debugging.
const temporaryBypass = DEBUG_DOMAIN_TO_FIND !== null;
const temporaryBypass = true;
const domainListLineCb = (l: string, set: string[], includeAllSubDomain: boolean, meta: string) => {
let line = processLine(l);
@@ -554,7 +555,7 @@ function parse($line: string, result: [string, ParseType]): [hostname: string, f
: (lineEndsWithCaretVerticalBar ? -2 : undefined) // replace('^|', '')
);
const suffix = tldts.getPublicSuffix(sliced);
const suffix = tldts.getPublicSuffix(sliced, looseTldtsOpt);
if (!suffix) {
// This exclude domain-like resource like `1.1.4.514.js`
result[1] = ParseType.Null;
@@ -608,7 +609,7 @@ function parse($line: string, result: [string, ParseType]): [hostname: string, f
}
result[0] = `[parse-filter E0004] (black) invalid domain: ${JSON.stringify({
line, sliced, sliceStart, sliceEnd
line, sliced, sliceStart, sliceEnd, domain
})}`;
result[1] = ParseType.ErrorMessage;
return result;
@@ -629,7 +630,7 @@ function parse($line: string, result: [string, ParseType]): [hostname: string, f
) {
const _domain = line.slice(0, -1);
const suffix = tldts.getPublicSuffix(_domain);
const suffix = tldts.getPublicSuffix(_domain, looseTldtsOpt);
if (!suffix) {
// This exclude domain-like resource like `_social_tracking.js^`
result[1] = ParseType.Null;
@@ -685,7 +686,7 @@ function parse($line: string, result: [string, ParseType]): [hostname: string, f
sliceEnd = -9;
}
const sliced = (sliceStart !== 0 || sliceEnd !== undefined) ? line.slice(sliceStart, sliceEnd) : line;
const suffix = tldts.getPublicSuffix(sliced);
const suffix = tldts.getPublicSuffix(sliced, looseTldtsOpt);
/**
* Fast exclude definitely not domain-like resource
*
@@ -708,7 +709,7 @@ function parse($line: string, result: [string, ParseType]): [hostname: string, f
return result;
}
result[0] = `[parse-filter E0010] can not parse: ${line}`;
result[0] = `[parse-filter ${tryNormalizeDomain === null ? 'E0010' : 'E0011'}] can not parse: ${JSON.stringify({ line, tryNormalizeDomain, sliced })}`;
result[1] = ParseType.ErrorMessage;
return result;
}