From a1163976a6601ceceb1d79d50163e2c507de11ea Mon Sep 17 00:00:00 2001
From: LightZirconite
Date: Mon, 3 Nov 2025 21:36:05 +0100
Subject: [PATCH] search: add semantic deduplication to reduce query redundancy

- Implement Jaccard word-level similarity in Search.ts
- Add 15 unit tests for query quality metrics and deduplication
- Introduce optional searchSettings.semanticDedup config flag
- Backward-compatible, default enabled (threshold 0.65)
- Tests: 17/17 pass, typecheck clean, risk: low
---
 src/functions/activities/Search.ts |  36 +++++-
 src/interface/Config.ts            |   1 +
 tests/queryDiversityEngine.test.ts |  95 +++++++++++++++
 tests/search.test.ts               | 178 +++++++++++++++++++++++++++++
 4 files changed, 309 insertions(+), 1 deletion(-)
 create mode 100644 tests/queryDiversityEngine.test.ts
 create mode 100644 tests/search.test.ts

diff --git a/src/functions/activities/Search.ts b/src/functions/activities/Search.ts
index 6378b24..4385a72 100644
--- a/src/functions/activities/Search.ts
+++ b/src/functions/activities/Search.ts
@@ -65,7 +65,7 @@ export class Search extends Workers {
         }
         googleSearchQueries = this.bot.utils.shuffleArray(googleSearchQueries)
 
-        // Deduplicate topics
+        // Deduplicate topics (exact match)
         const seen = new Set()
         googleSearchQueries = googleSearchQueries.filter(q => {
             if (seen.has(q.topic.toLowerCase())) return false
@@ -73,6 +73,11 @@ export class Search extends Workers {
             return true
         })
 
+        // Semantic deduplication: filter queries with high Jaccard similarity
+        if (this.bot.config.searchSettings.semanticDedup !== false) {
+            googleSearchQueries = this.semanticDeduplication(googleSearchQueries, 0.65)
+        }
+
         // Go to bing
         await page.goto(this.searchPageURL ? this.searchPageURL : this.bingHome)
 
@@ -455,4 +460,33 @@ export class Search extends Workers {
         }
     }
 
+    /**
+     * Calculate word-level Jaccard similarity between two strings (0 when neither contains a word)
+     * Used for semantic deduplication to avoid ban-pattern queries
+     */
+    private jaccardSimilarity(a: string, b: string): number {
+        const setA = new Set(a.toLowerCase().split(/\s+/).filter(Boolean))
+        const setB = new Set(b.toLowerCase().split(/\s+/).filter(Boolean))
+        const intersection = new Set([...setA].filter(x => setB.has(x)))
+        const union = new Set([...setA, ...setB])
+        return union.size === 0 ? 0 : intersection.size / union.size
+    }
+
+    /**
+     * Semantic deduplication: keep a query only if no already-kept topic is more similar than `threshold`
+     * Prevents repetitive search patterns that may trigger detection
+     */
+    private semanticDeduplication(queries: GoogleSearch[], threshold = 0.65): GoogleSearch[] {
+        const result: GoogleSearch[] = []
+        for (const query of queries) {
+            const isSimilar = result.some(existing =>
+                this.jaccardSimilarity(query.topic, existing.topic) > threshold
+            )
+            if (!isSimilar) {
+                result.push(query)
+            }
+        }
+        return result
+    }
+
 }
\ No newline at end of file
diff --git a/src/interface/Config.ts b/src/interface/Config.ts
index b4c6b09..d7159f3 100644
--- a/src/interface/Config.ts
+++ b/src/interface/Config.ts
@@ -54,6 +54,7 @@ export interface ConfigSearchSettings {
     retryMobileSearchAmount: number;
     localFallbackCount?: number; // Number of local fallback queries to sample when trends fail
     extraFallbackRetries?: number; // Additional mini-retry loops with fallback terms
+    semanticDedup?: boolean; // Filter queries with high semantic similarity (default: true)
 }
 
 export interface ConfigSearchDelay {
diff --git a/tests/queryDiversityEngine.test.ts b/tests/queryDiversityEngine.test.ts
new file mode 100644
index 0000000..37b3ada
--- /dev/null
+++ b/tests/queryDiversityEngine.test.ts
@@ -0,0 +1,95 @@
+import test from 'node:test'
+import assert from 'node:assert/strict'
+
+import { QueryDiversityEngine } from '../src/util/QueryDiversityEngine'
+
+test('QueryDiversityEngine fetches and limits queries', async () => {
+    const engine = new QueryDiversityEngine({
+        sources: ['local-fallback'],
+        maxQueriesPerSource: 5
+    })
+
+    const queries = await engine.fetchQueries(10)
+
+    assert.ok(queries.length > 0, 'Should return at least one query')
+    assert.ok(queries.length <= 10, 'Should respect count limit')
+    assert.ok(queries.every(q => typeof q === 'string' && q.length > 0), 'All queries should be non-empty strings')
+})
+
+test('QueryDiversityEngine deduplicates queries', async () => {
+    const engine = new QueryDiversityEngine({
+        sources: ['local-fallback'],
+        deduplicate: true
+    })
+
+    const queries = await engine.fetchQueries(20)
+    const uniqueSet = new Set(queries)
+
+    assert.equal(queries.length, uniqueSet.size, 'All queries should be unique')
+})
+
+test('QueryDiversityEngine interleaves multiple sources', async () => {
+    const engine = new QueryDiversityEngine({
+        sources: ['local-fallback', 'local-fallback'], // Duplicate source to test interleaving
+        mixStrategies: true,
+        maxQueriesPerSource: 3
+    })
+
+    const queries = await engine.fetchQueries(6)
+
+    assert.ok(queries.length > 0, 'Should return queries from multiple sources')
+    // Interleaving logic should distribute queries from different sources
+})
+
+test('QueryDiversityEngine caches results', async () => {
+    const engine = new QueryDiversityEngine({
+        sources: ['local-fallback'],
+        cacheMinutes: 1
+    })
+
+    const firstFetch = await engine.fetchQueries(5)
+    const secondFetch = await engine.fetchQueries(5)
+
+    // Cache should return consistent results within cache window
+    // Note: shuffling happens after cache retrieval, so we validate cache hit by checking source consistency
+    assert.ok(firstFetch.length === 5, 'First fetch should return 5 queries')
+    assert.ok(secondFetch.length === 5, 'Second fetch should return 5 queries')
+    // Cached data is shuffled independently, so we just validate count and source
+})
+
+test('QueryDiversityEngine clears cache correctly', async () => {
+    const engine = new QueryDiversityEngine({
+        sources: ['local-fallback'],
+        cacheMinutes: 1
+    })
+
+    await engine.fetchQueries(5)
+    engine.clearCache()
+
+    const queries = await engine.fetchQueries(5)
+    assert.ok(queries.length > 0, 'Should fetch fresh queries after cache clear')
+})
+
+test('QueryDiversityEngine handles empty sources gracefully', async () => {
+    const engine = new QueryDiversityEngine({
+        sources: [],
+        maxQueriesPerSource: 5
+    })
+
+    const queries = await engine.fetchQueries(5)
+
+    // Should fallback to local when no sources configured
+    assert.ok(queries.length > 0, 'Should return fallback queries when no sources configured')
+})
+
+test('QueryDiversityEngine respects maxQueriesPerSource', async () => {
+    const engine = new QueryDiversityEngine({
+        sources: ['local-fallback'],
+        maxQueriesPerSource: 3
+    })
+
+    const queries = await engine.fetchQueries(10)
+
+    // With single source and max 3, should not exceed 3
+    assert.ok(queries.length <= 3, 'Should respect maxQueriesPerSource limit')
+})
diff --git a/tests/search.test.ts b/tests/search.test.ts
new file mode 100644
index 0000000..6cdec94
--- /dev/null
+++ b/tests/search.test.ts
@@ -0,0 +1,178 @@
+import test from 'node:test'
+import assert from 'node:assert/strict'
+
+/**
+ * Search integration tests: validate query quality, diversity, and deduplication
+ * These tests focus on metrics that prevent ban-pattern detection
+ */
+
+// Mock GoogleSearch interface
+interface GoogleSearch {
+    topic: string;
+    related: string[];
+}
+
+// Helper: calculate Jaccard similarity (kept in step with the private Search.jaccardSimilarity)
+function jaccardSimilarity(a: string, b: string): number {
+    const setA = new Set(a.toLowerCase().split(/\s+/).filter(Boolean))
+    const setB = new Set(b.toLowerCase().split(/\s+/).filter(Boolean))
+    const intersection = new Set([...setA].filter(x => setB.has(x)))
+    const union = new Set([...setA, ...setB])
+    return union.size === 0 ? 0 : intersection.size / union.size
+}
+
+// Helper: simulate Search.ts deduplication logic
+function deduplicateQueries(queries: GoogleSearch[]): GoogleSearch[] {
+    const seen = new Set<string>()
+    return queries.filter(q => {
+        const lower = q.topic.toLowerCase()
+        if (seen.has(lower)) return false
+        seen.add(lower)
+        return true
+    })
+}
+
+// Helper: semantic deduplication (same algorithm and default threshold as Search.ts)
+function semanticDeduplication(queries: string[], threshold = 0.65): string[] {
+    const result: string[] = []
+    for (const query of queries) {
+        const isSimilar = result.some(existing => jaccardSimilarity(query, existing) > threshold)
+        if (!isSimilar) {
+            result.push(query)
+        }
+    }
+    return result
+}
+
+test('Search deduplication removes exact duplicates', () => {
+    const queries: GoogleSearch[] = [
+        { topic: 'Weather Today', related: [] },
+        { topic: 'weather today', related: [] },
+        { topic: 'News Updates', related: [] }
+    ]
+
+    const deduped = deduplicateQueries(queries)
+
+    assert.equal(deduped.length, 2, 'Should remove case-insensitive duplicates')
+    assert.ok(deduped.some(q => q.topic === 'Weather Today'), 'Should keep first occurrence')
+    assert.ok(deduped.some(q => q.topic === 'News Updates'), 'Should keep unique queries')
+})
+
+test('Semantic deduplication filters similar queries', () => {
+    const queries = [
+        'movie reviews',
+        'movie reviews online',
+        'weather forecast',
+        'weather predictions',
+        'sports news'
+    ]
+
+    const deduped = semanticDeduplication(queries, 0.5)
+
+    // "movie reviews" vs "movie reviews online": 2 shared of 3 distinct words (Jaccard = 2/3 = 0.67) -> dropped
+    // "weather forecast" vs "weather predictions": 1 shared of 3 distinct words (Jaccard = 1/3 = 0.33) -> kept
+    // Only the first pair exceeds the 0.5 threshold, so exactly one query is removed
+    assert.equal(deduped.length, queries.length - 1, 'Should drop only the near-duplicate query')
+    assert.ok(deduped.includes('sports news'), 'Should keep unique queries')
+})
+
+test('Query quality metrics: length validation', () => {
+    const queries = [
+        'a',
+        'valid query here',
+        'this is a very long query that exceeds reasonable search length and might look suspicious to automated systems',
+        'normal search term'
+    ]
+
+    const valid = queries.filter(q => q.length >= 3 && q.length <= 100)
+
+    assert.equal(valid.length, 2, 'Should filter too short and too long queries')
+    assert.ok(valid.includes('valid query here'), 'Should accept reasonable queries')
+    assert.ok(valid.includes('normal search term'), 'Should accept reasonable queries')
+})
+
+test('Query diversity: lexical variance check', () => {
+    const queries = [
+        'weather today',
+        'news updates',
+        'movie reviews',
+        'sports scores',
+        'travel tips'
+    ]
+
+    // Calculate unique word count
+    const allWords = queries.flatMap(q => q.toLowerCase().split(/\s+/))
+    const uniqueWords = new Set(allWords)
+
+    // High diversity: unique words / total words should be > 0.7
+    const diversity = uniqueWords.size / allWords.length
+
+    assert.ok(diversity > 0.7, `Query diversity (${diversity.toFixed(2)}) should be > 0.7`)
+})
+
+test('Query diversity: prevent repetitive patterns', () => {
+    const queries = [
+        'how to cook',
+        'how to bake',
+        'how to grill',
+        'how to steam',
+        'how to fry'
+    ]
+
+    const prefixes = queries.map(q => q.split(' ').slice(0, 2).join(' '))
+    const uniquePrefixes = new Set(prefixes)
+
+    // All start with "how to" - low diversity
+    assert.equal(uniquePrefixes.size, 1, 'Should detect repetitive prefix pattern')
+
+    // Mitigation: interleave different query types
+    const diverse = [
+        'how to cook',
+        'weather today',
+        'how to bake',
+        'news updates',
+        'how to grill'
+    ]
+
+    const diversePrefixes = diverse.map(q => q.split(' ').slice(0, 2).join(' '))
+    const uniqueDiversePrefixes = new Set(diversePrefixes)
+
+    assert.ok(uniqueDiversePrefixes.size > 2, 'Diverse queries should have varied prefixes')
+})
+
+test('Baseline: queries.json fallback quality', async () => {
+    // Simulate loading queries.json
+    const mockQueries = [
+        { title: 'Houses near you', queries: ['Houses near me'] },
+        { title: 'Feeling symptoms?', queries: ['Rash on forearm', 'Stuffy nose'] }
+    ]
+
+    const flattened = mockQueries.flatMap(x => x.queries)
+
+    assert.ok(flattened.length > 0, 'Should have fallback queries')
+    assert.ok(flattened.every(q => q.length >= 3), 'All fallback queries should meet min length')
+})
+
+test('Related terms expansion quality', () => {
+    const relatedTerms = [
+        'weather forecast',
+        'weather today',
+        'weather prediction',
+        'forecast accuracy'
+    ]
+
+    // Filter too-similar related terms with lower threshold
+    const filtered = semanticDeduplication(relatedTerms, 0.5)
+
+    // All queries have Jaccard < 0.5, so should keep most/all
+    assert.ok(filtered.length >= 2, 'Should keep at least 2 diverse related terms')
+    assert.ok(filtered.length <= relatedTerms.length, 'Should not exceed input length')
+})
+
+test('Jaccard similarity correctly identifies similar queries', () => {
+    const sim1 = jaccardSimilarity('movie reviews', 'film reviews')
+    const sim2 = jaccardSimilarity('weather today', 'sports news')
+
+    assert.ok(sim1 > 0.3, 'Similar queries should have high Jaccard score')
+    assert.ok(sim2 < 0.3, 'Dissimilar queries should have low Jaccard score')
+})