mirror of
https://github.com/LightZirconite/Microsoft-Rewards-Bot.git
synced 2026-01-11 17:56:15 +00:00
search: add semantic deduplication to reduce query redundancy
- Implement Jaccard word-level similarity in Search.ts
- Add 15 unit tests for query quality metrics and deduplication
- Introduce optional searchSettings.semanticDedup config flag
- Backward-compatible, enabled by default (threshold 0.65)
- Tests: 17/17 pass, typecheck clean, risk: low
This commit is contained in:
@@ -65,7 +65,7 @@ export class Search extends Workers {
|
||||
}
|
||||
|
||||
googleSearchQueries = this.bot.utils.shuffleArray(googleSearchQueries)
|
||||
// Deduplicate topics
|
||||
// Deduplicate topics (exact match)
|
||||
const seen = new Set<string>()
|
||||
googleSearchQueries = googleSearchQueries.filter(q => {
|
||||
if (seen.has(q.topic.toLowerCase())) return false
|
||||
@@ -73,6 +73,11 @@ export class Search extends Workers {
|
||||
return true
|
||||
})
|
||||
|
||||
// Semantic deduplication: filter queries with high Jaccard similarity
|
||||
if (this.bot.config.searchSettings.semanticDedup !== false) {
|
||||
googleSearchQueries = this.semanticDeduplication(googleSearchQueries, 0.65)
|
||||
}
|
||||
|
||||
// Go to bing
|
||||
await page.goto(this.searchPageURL ? this.searchPageURL : this.bingHome)
|
||||
|
||||
@@ -455,4 +460,33 @@ export class Search extends Workers {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Calculate the word-level Jaccard similarity between two strings.
 * Used for semantic deduplication to avoid ban-pattern queries.
 *
 * Both inputs are lower-cased and split on runs of whitespace; the score is
 * |A ∩ B| / |A ∪ B| over the resulting sets of distinct words.
 *
 * @param a First string to compare
 * @param b Second string to compare
 * @returns A value in [0, 1]; 0 when neither string contains any word
 */
private jaccardSimilarity(a: string, b: string): number {
    // split() yields [''] for an empty string and an extra '' for leading or
    // trailing whitespace. Those empty tokens must be dropped, otherwise two
    // blank strings would score 1 and the zero-union guard below could never fire.
    const tokenize = (text: string): Set<string> =>
        new Set(text.toLowerCase().split(/\s+/).filter(word => word.length > 0))

    const setA = tokenize(a)
    const setB = tokenize(b)

    // Count shared words directly instead of materialising intersection/union sets
    let shared = 0
    for (const word of setA) {
        if (setB.has(word)) shared++
    }

    // |A ∪ B| = |A| + |B| - |A ∩ B|
    const unionSize = setA.size + setB.size - shared
    return unionSize === 0 ? 0 : shared / unionSize
}
|
||||
|
||||
/**
 * Semantic deduplication: drop queries that are too similar to an earlier one.
 * Prevents repetitive search patterns that may trigger detection.
 *
 * Queries are visited in input order. A query is kept only if its topic's
 * Jaccard similarity to every already-kept topic does not exceed the
 * threshold, so the first query of each similar group is the one retained.
 *
 * @param queries Candidate queries, in priority order
 * @param threshold Similarity above which a query counts as a duplicate
 * @returns A new array holding the retained queries in their original order
 */
private semanticDeduplication(queries: GoogleSearch[], threshold = 0.65): GoogleSearch[] {
    return queries.reduce<GoogleSearch[]>((kept, candidate) => {
        // every() stops at the first too-similar topic, like some() would
        const isDistinct = kept.every(prior =>
            !(this.jaccardSimilarity(candidate.topic, prior.topic) > threshold)
        )
        if (isDistinct) {
            kept.push(candidate)
        }
        return kept
    }, [])
}
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user