feat: Add semantic deduplication and threshold configuration for improved query filtering

This commit is contained in:
2025-11-03 21:43:21 +01:00
parent a1163976a6
commit 3eb861ef8a
5 changed files with 93 additions and 19 deletions

View File

@@ -23,7 +23,7 @@
"email": "secondary_account@outlook.com", "email": "secondary_account@outlook.com",
"password": "strong-password-2", "password": "strong-password-2",
"totp": "BASE32SECRETSECOND", "totp": "BASE32SECRETSECOND",
"recoveryRequired": false, "recoveryRequired": true,
"recoveryEmail": "secondary.backup@example.com", "recoveryEmail": "secondary.backup@example.com",
"proxy": { "proxy": {
"proxyAxios": true, "proxyAxios": true,
@@ -35,7 +35,7 @@
}, },
{ {
// Account #3 — dedicated proxy with credentials // Account #3 — dedicated proxy with credentials
"enabled": true, "enabled": false,
"email": "with_proxy@outlook.com", "email": "with_proxy@outlook.com",
"password": "strong-password-3", "password": "strong-password-3",
"totp": "BASE32SECRETTHIRD", "totp": "BASE32SECRETTHIRD",
@@ -51,11 +51,11 @@
}, },
{ {
// Account #4 — recovery optional, no proxying through Axios layer // Account #4 — recovery optional, no proxying through Axios layer
"enabled": true, "enabled": false,
"email": "no_proxy@outlook.com", "email": "no_proxy@outlook.com",
"password": "strong-password-4", "password": "strong-password-4",
"totp": "BASE32SECRETFOUR", "totp": "BASE32SECRETFOUR",
"recoveryRequired": false, "recoveryRequired": true,
"recoveryEmail": "no.proxy.backup@example.com", "recoveryEmail": "no.proxy.backup@example.com",
"proxy": { "proxy": {
"proxyAxios": false, "proxyAxios": false,
@@ -67,7 +67,7 @@
}, },
{ {
// Account #5 — enabled with TOTP omitted (will rely on recovery email) // Account #5 — enabled with TOTP omitted (will rely on recovery email)
"enabled": true, "enabled": false,
"email": "totp_optional@outlook.com", "email": "totp_optional@outlook.com",
"password": "strong-password-5", "password": "strong-password-5",
"totp": "", "totp": "",

View File

@@ -48,6 +48,8 @@
"scrollRandomResults": true, "scrollRandomResults": true,
"clickRandomResults": true, "clickRandomResults": true,
"retryMobileSearchAmount": 2, "retryMobileSearchAmount": 2,
"semanticDedup": true, // Filter queries with high word similarity (Jaccard). Reduces repetitive patterns.
"semanticDedupThreshold": 0.65, // Similarity threshold (0-1). Lower = more strict filtering.
"delay": { "delay": {
"min": "3min", "min": "3min",
"max": "5min" "max": "5min"

View File

@@ -65,17 +65,25 @@ export class Search extends Workers {
} }
googleSearchQueries = this.bot.utils.shuffleArray(googleSearchQueries) googleSearchQueries = this.bot.utils.shuffleArray(googleSearchQueries)
// Deduplicate topics (exact match)
const seen = new Set<string>() // Combined deduplication: exact + semantic in single pass for performance
googleSearchQueries = googleSearchQueries.filter(q => {
if (seen.has(q.topic.toLowerCase())) return false
seen.add(q.topic.toLowerCase())
return true
})
// Semantic deduplication: filter queries with high Jaccard similarity
if (this.bot.config.searchSettings.semanticDedup !== false) { if (this.bot.config.searchSettings.semanticDedup !== false) {
googleSearchQueries = this.semanticDeduplication(googleSearchQueries, 0.65) const threshold = this.bot.config.searchSettings.semanticDedupThreshold ?? 0.65
const validThreshold = Math.max(0, Math.min(1, threshold)) // clamp [0,1]
const originalCount = googleSearchQueries.length
googleSearchQueries = this.combinedDeduplication(googleSearchQueries, validThreshold)
const filtered = originalCount - googleSearchQueries.length
if (filtered > 0) {
this.bot.log(this.bot.isMobile, 'SEARCH-DEDUP', `Query dedup: removed ${filtered} duplicates (${originalCount} → ${googleSearchQueries.length})`)
}
} else {
// Fallback: exact dedup only if semantic disabled
const seen = new Set<string>()
googleSearchQueries = googleSearchQueries.filter(q => {
if (seen.has(q.topic.toLowerCase())) return false
seen.add(q.topic.toLowerCase())
return true
})
} }
// Go to bing // Go to bing
@@ -144,8 +152,13 @@ export class Search extends Workers {
// Get related search terms to the Google search queries // Get related search terms to the Google search queries
const relatedTerms = await this.getRelatedTerms(query?.topic) const relatedTerms = await this.getRelatedTerms(query?.topic)
if (relatedTerms.length > 3) { if (relatedTerms.length > 3) {
// Filter related terms with semantic dedup to avoid Bing-provided duplicates
const filteredRelated = this.bot.config.searchSettings.semanticDedup !== false
? this.semanticDedupStrings(relatedTerms, this.bot.config.searchSettings.semanticDedupThreshold ?? 0.65)
: relatedTerms
// Search for the first 2 related terms // Search for the first 2 related terms
for (const term of relatedTerms.slice(1, 3)) { for (const term of filteredRelated.slice(1, 3)) {
this.bot.log(this.bot.isMobile, 'SEARCH-BING-EXTRA', `${missingPoints} Points Remaining | Query: ${term}`) this.bot.log(this.bot.isMobile, 'SEARCH-BING-EXTRA', `${missingPoints} Points Remaining | Query: ${term}`)
searchCounters = await this.bingSearch(page, term) searchCounters = await this.bingSearch(page, term)
@@ -473,17 +486,44 @@ export class Search extends Workers {
} }
/** /**
* Semantic deduplication: filter queries with high similarity * Combined exact + semantic deduplication in single pass (performance optimized)
* Prevents repetitive search patterns that may trigger detection * Filters both case-insensitive exact duplicates and semantically similar queries
*/ */
private semanticDeduplication(queries: GoogleSearch[], threshold = 0.65): GoogleSearch[] { private combinedDeduplication(queries: GoogleSearch[], threshold = 0.65): GoogleSearch[] {
const result: GoogleSearch[] = [] const result: GoogleSearch[] = []
const seen = new Set<string>() // Track exact duplicates (case-insensitive)
for (const query of queries) { for (const query of queries) {
const lower = query.topic.toLowerCase()
// Check exact duplicate first (faster)
if (seen.has(lower)) continue
// Check semantic similarity with existing results
const isSimilar = result.some(existing => const isSimilar = result.some(existing =>
this.jaccardSimilarity(query.topic, existing.topic) > threshold this.jaccardSimilarity(query.topic, existing.topic) > threshold
) )
if (!isSimilar) { if (!isSimilar) {
result.push(query) result.push(query)
seen.add(lower)
}
}
return result
}
/**
* Semantic deduplication for string arrays (used for related terms)
*/
private semanticDedupStrings(terms: string[], threshold = 0.65): string[] {
const result: string[] = []
for (const term of terms) {
const isSimilar = result.some(existing =>
this.jaccardSimilarity(term, existing) > threshold
)
if (!isSimilar) {
result.push(term)
} }
} }
return result return result

View File

@@ -55,6 +55,7 @@ export interface ConfigSearchSettings {
localFallbackCount?: number; // Number of local fallback queries to sample when trends fail localFallbackCount?: number; // Number of local fallback queries to sample when trends fail
extraFallbackRetries?: number; // Additional mini-retry loops with fallback terms extraFallbackRetries?: number; // Additional mini-retry loops with fallback terms
semanticDedup?: boolean; // Filter queries with high semantic similarity (default: true) semanticDedup?: boolean; // Filter queries with high semantic similarity (default: true)
semanticDedupThreshold?: number; // Jaccard similarity threshold 0-1 (default: 0.65, lower = stricter)
} }
export interface ConfigSearchDelay { export interface ConfigSearchDelay {

View File

@@ -176,3 +176,34 @@ test('Jaccard similarity correctly identifies similar queries', () => {
assert.ok(sim1 > 0.3, 'Similar queries should have high Jaccard score') assert.ok(sim1 > 0.3, 'Similar queries should have high Jaccard score')
assert.ok(sim2 < 0.3, 'Dissimilar queries should have low Jaccard score') assert.ok(sim2 < 0.3, 'Dissimilar queries should have low Jaccard score')
}) })
// Checks that the Math.max(0, Math.min(1, x)) clamp expression keeps a
// threshold inside [0, 1]: out-of-range values snap to the nearest bound,
// in-range values (including both bounds) pass through unchanged.
test('Threshold validation: clamps invalid values', () => {
  // Each entry is [raw threshold, value expected after clamping].
  const scenarios: Array<[number, number]> = [
    [-0.5, 0],
    [1.5, 1],
    [0.5, 0.5],
    [0, 0],
    [1, 1]
  ]

  scenarios.forEach(([raw, want]) => {
    const upperBounded = Math.min(1, raw)
    const bounded = Math.max(0, upperBounded)
    assert.equal(bounded, want, `Threshold ${raw} should clamp to ${want}`)
  })
})
// Related-term dedup: near-duplicate suggestions must actually be dropped,
// while an unrelated suggestion must survive.
test('Related terms semantic dedup reduces redundancy', () => {
  const relatedTerms = [
    'weather forecast today',
    'weather forecast tomorrow',
    'weather prediction today',
    'completely different query'
  ]

  // "weather forecast today" shares 2 of 4 distinct words with each of the
  // next two terms, i.e. Jaccard = 0.5 exactly (assuming whitespace word
  // tokenization in the helper — TODO confirm). A threshold of 0.5 would sit
  // on that boundary and remove nothing under a strict `>` comparison, so the
  // threshold is placed below it to make the outcome independent of `>` vs `>=`.
  const filtered = semanticDeduplication(relatedTerms, 0.4)

  // Strict inequality: `<=` would hold for any filter and prove nothing.
  assert.ok(filtered.length < relatedTerms.length, 'Should filter some related terms')
  assert.ok(filtered.includes('completely different query'), 'Should keep unique queries')
})