From 3eb861ef8a85a5712153b631cd4591a271ed02c2 Mon Sep 17 00:00:00 2001 From: LightZirconite Date: Mon, 3 Nov 2025 21:43:21 +0100 Subject: [PATCH] feat: Add semantic deduplication and threshold configuration for improved query filtering --- src/accounts.example.jsonc | 10 ++--- src/config.jsonc | 2 + src/functions/activities/Search.ts | 68 ++++++++++++++++++++++++------ src/interface/Config.ts | 1 + tests/search.test.ts | 31 ++++++++++++++ 5 files changed, 93 insertions(+), 19 deletions(-) diff --git a/src/accounts.example.jsonc b/src/accounts.example.jsonc index 7014e85..fe61680 100644 --- a/src/accounts.example.jsonc +++ b/src/accounts.example.jsonc @@ -23,7 +23,7 @@ "email": "secondary_account@outlook.com", "password": "strong-password-2", "totp": "BASE32SECRETSECOND", - "recoveryRequired": false, + "recoveryRequired": true, "recoveryEmail": "secondary.backup@example.com", "proxy": { "proxyAxios": true, @@ -35,7 +35,7 @@ }, { // Account #3 — dedicated proxy with credentials - "enabled": true, + "enabled": false, "email": "with_proxy@outlook.com", "password": "strong-password-3", "totp": "BASE32SECRETTHIRD", @@ -51,11 +51,11 @@ }, { // Account #4 — recovery optional, no proxying through Axios layer - "enabled": true, + "enabled": false, "email": "no_proxy@outlook.com", "password": "strong-password-4", "totp": "BASE32SECRETFOUR", - "recoveryRequired": false, + "recoveryRequired": true, "recoveryEmail": "no.proxy.backup@example.com", "proxy": { "proxyAxios": false, @@ -67,7 +67,7 @@ }, { // Account #5 — enabled with TOTP omitted (will rely on recovery email) - "enabled": true, + "enabled": false, "email": "totp_optional@outlook.com", "password": "strong-password-5", "totp": "", diff --git a/src/config.jsonc b/src/config.jsonc index 15b6f2b..46161a3 100644 --- a/src/config.jsonc +++ b/src/config.jsonc @@ -48,6 +48,8 @@ "scrollRandomResults": true, "clickRandomResults": true, "retryMobileSearchAmount": 2, + "semanticDedup": true, // Filter queries with high word similarity (Jaccard). Reduces repetitive patterns. + "semanticDedupThreshold": 0.65, // Similarity threshold (0-1). Lower = more strict filtering. "delay": { "min": "3min", "max": "5min" diff --git a/src/functions/activities/Search.ts b/src/functions/activities/Search.ts index 4385a72..9be4a58 100644 --- a/src/functions/activities/Search.ts +++ b/src/functions/activities/Search.ts @@ -65,17 +65,25 @@ export class Search extends Workers { } googleSearchQueries = this.bot.utils.shuffleArray(googleSearchQueries) - // Deduplicate topics (exact match) - const seen = new Set() - googleSearchQueries = googleSearchQueries.filter(q => { - if (seen.has(q.topic.toLowerCase())) return false - seen.add(q.topic.toLowerCase()) - return true - }) - - // Semantic deduplication: filter queries with high Jaccard similarity + + // Combined deduplication: exact + semantic in single pass for performance if (this.bot.config.searchSettings.semanticDedup !== false) { - googleSearchQueries = this.semanticDeduplication(googleSearchQueries, 0.65) + const threshold = this.bot.config.searchSettings.semanticDedupThreshold ?? 0.65 + const validThreshold = Math.max(0, Math.min(1, threshold)) // clamp [0,1] + const originalCount = googleSearchQueries.length + googleSearchQueries = this.combinedDeduplication(googleSearchQueries, validThreshold) + const filtered = originalCount - googleSearchQueries.length + if (filtered > 0) { + this.bot.log(this.bot.isMobile, 'SEARCH-DEDUP', `Query dedup: removed ${filtered} duplicates (${originalCount} → ${googleSearchQueries.length})`) + } + } else { + // Fallback: exact dedup only if semantic disabled + const seen = new Set() + googleSearchQueries = googleSearchQueries.filter(q => { + if (seen.has(q.topic.toLowerCase())) return false + seen.add(q.topic.toLowerCase()) + return true + }) } // Go to bing @@ -144,8 +152,13 @@ export class Search extends Workers { // Get related search terms to the Google search queries const relatedTerms = await this.getRelatedTerms(query?.topic) if (relatedTerms.length > 3) { + // Filter related terms with semantic dedup to avoid Bing-provided duplicates + const filteredRelated = this.bot.config.searchSettings.semanticDedup !== false + ? this.semanticDedupStrings(relatedTerms, this.bot.config.searchSettings.semanticDedupThreshold ?? 0.65) + : relatedTerms + // Search for the first 2 related terms - for (const term of relatedTerms.slice(1, 3)) { + for (const term of filteredRelated.slice(1, 3)) { this.bot.log(this.bot.isMobile, 'SEARCH-BING-EXTRA', `${missingPoints} Points Remaining | Query: ${term}`) searchCounters = await this.bingSearch(page, term) @@ -473,17 +486,44 @@ export class Search extends Workers { } /** - * Semantic deduplication: filter queries with high similarity - * Prevents repetitive search patterns that may trigger detection + * Combined exact + semantic deduplication in single pass (performance optimized) + * Filters both case-insensitive exact duplicates and semantically similar queries */ - private semanticDeduplication(queries: GoogleSearch[], threshold = 0.65): GoogleSearch[] { + private combinedDeduplication(queries: GoogleSearch[], threshold = 0.65): GoogleSearch[] { const result: GoogleSearch[] = [] + const seen = new Set() // Track exact duplicates (case-insensitive) + for (const query of queries) { + const lower = query.topic.toLowerCase() + + // Check exact duplicate first (faster) + if (seen.has(lower)) continue + + // Check semantic similarity with existing results const isSimilar = result.some(existing => this.jaccardSimilarity(query.topic, existing.topic) > threshold ) + if (!isSimilar) { result.push(query) + seen.add(lower) + } + } + + return result + } + + /** + * Semantic deduplication for string arrays (used for related terms) + */ + private semanticDedupStrings(terms: string[], threshold = 0.65): string[] { + const result: string[] = [] + for (const term of terms) { + const isSimilar = result.some(existing => + this.jaccardSimilarity(term, existing) > threshold + ) + if (!isSimilar) { + result.push(term) } } return result diff --git a/src/interface/Config.ts b/src/interface/Config.ts index d7159f3..5514341 100644 --- a/src/interface/Config.ts +++ b/src/interface/Config.ts @@ -55,6 +55,7 @@ export interface ConfigSearchSettings { localFallbackCount?: number; // Number of local fallback queries to sample when trends fail extraFallbackRetries?: number; // Additional mini-retry loops with fallback terms semanticDedup?: boolean; // Filter queries with high semantic similarity (default: true) + semanticDedupThreshold?: number; // Jaccard similarity threshold 0-1 (default: 0.65, lower = stricter) } export interface ConfigSearchDelay { diff --git a/tests/search.test.ts b/tests/search.test.ts index 6cdec94..55d33ff 100644 --- a/tests/search.test.ts +++ b/tests/search.test.ts @@ -176,3 +176,34 @@ test('Jaccard similarity correctly identifies similar queries', () => { assert.ok(sim1 > 0.3, 'Similar queries should have high Jaccard score') assert.ok(sim2 < 0.3, 'Dissimilar queries should have low Jaccard score') }) + +test('Threshold validation: clamps invalid values', () => { + const testCases = [ + { input: -0.5, expected: 0 }, + { input: 1.5, expected: 1 }, + { input: 0.5, expected: 0.5 }, + { input: 0, expected: 0 }, + { input: 1, expected: 1 } + ] + + for (const { input, expected } of testCases) { + const clamped = Math.max(0, Math.min(1, input)) + assert.equal(clamped, expected, `Threshold ${input} should clamp to ${expected}`) + } +}) + +test('Related terms semantic dedup reduces redundancy', () => { + const relatedTerms = [ + 'weather forecast today', + 'weather forecast tomorrow', + 'weather prediction today', + 'completely different query' + ] + + const filtered = semanticDeduplication(relatedTerms, 0.5) + + // "weather forecast today" and "weather forecast tomorrow" share 2/4 words (Jaccard ~0.5) + assert.ok(filtered.length <= relatedTerms.length, 'Should filter some related terms') + assert.ok(filtered.includes('completely different query'), 'Should keep unique queries') +}) +