search: add semantic deduplication to reduce query redundancy

- Implement Jaccard word-level similarity in Search.ts
- Add 15 unit tests for query quality metrics and deduplication
- Introduce optional searchSettings.semanticDedup config flag
- Backward-compatible: enabled by default unless searchSettings.semanticDedup is explicitly set to false (similarity threshold 0.65)
- Tests: 17/17 pass, typecheck clean, risk: low
This commit is contained in:
2025-11-03 21:36:05 +01:00
parent 39b62a4190
commit a1163976a6
4 changed files with 309 additions and 1 deletions

View File

@@ -65,7 +65,7 @@ export class Search extends Workers {
}
googleSearchQueries = this.bot.utils.shuffleArray(googleSearchQueries)
// Deduplicate topics
// Deduplicate topics (exact match)
const seen = new Set<string>()
googleSearchQueries = googleSearchQueries.filter(q => {
if (seen.has(q.topic.toLowerCase())) return false
@@ -73,6 +73,11 @@ export class Search extends Workers {
return true
})
// Semantic deduplication: filter queries with high Jaccard similarity
if (this.bot.config.searchSettings.semanticDedup !== false) {
googleSearchQueries = this.semanticDeduplication(googleSearchQueries, 0.65)
}
// Go to bing
await page.goto(this.searchPageURL ? this.searchPageURL : this.bingHome)
@@ -455,4 +460,33 @@ export class Search extends Workers {
}
}
/**
 * Calculate the word-level Jaccard similarity between two strings.
 * Used for semantic deduplication to avoid ban-pattern queries.
 *
 * Both inputs are lower-cased and split on runs of whitespace. Empty tokens
 * (which `split` yields for an empty string or for leading/trailing
 * whitespace) are discarded so they are never counted as a shared word.
 *
 * @param a First string to compare
 * @param b Second string to compare
 * @returns Size of the word intersection divided by the size of the word
 *          union, in the range [0, 1]; 0 when neither string contains a word
 */
private jaccardSimilarity(a: string, b: string): number {
    // Drop '' tokens: ' foo'.split(/\s+/) is ['', 'foo'] and ''.split(/\s+/) is ['']
    const tokenize = (text: string): Set<string> =>
        new Set(text.toLowerCase().split(/\s+/).filter(word => word.length > 0))
    const setA = tokenize(a)
    const setB = tokenize(b)
    const intersection = new Set([...setA].filter(x => setB.has(x)))
    const union = new Set([...setA, ...setB])
    // Guard against 0/0 when both inputs are empty or whitespace-only
    return union.size === 0 ? 0 : intersection.size / union.size
}
/**
 * Semantic deduplication: drop queries that are too similar to one already kept.
 * Prevents repetitive search patterns that may trigger detection.
 *
 * Queries are visited in input order; a query is kept only if its topic's
 * Jaccard similarity to every previously kept topic does not exceed `threshold`.
 *
 * @param queries Candidate queries, in priority order
 * @param threshold Similarity above which a query counts as a duplicate
 * @returns A new array with the surviving queries in their original order
 */
private semanticDeduplication(queries: GoogleSearch[], threshold = 0.65): GoogleSearch[] {
    return queries.reduce<GoogleSearch[]>((kept, candidate) => {
        // Index of the first already-kept query that is too close to this one, or -1
        const clashIndex = kept.findIndex(
            prior => this.jaccardSimilarity(candidate.topic, prior.topic) > threshold
        )
        if (clashIndex === -1) {
            kept.push(candidate)
        }
        return kept
    }, [])
}
}