mirror of
https://github.com/LightZirconite/Microsoft-Rewards-Bot.git
synced 2026-01-10 01:06:17 +00:00
feat: Add semantic deduplication and threshold configuration for improved query filtering
This commit is contained in:
@@ -23,7 +23,7 @@
|
||||
"email": "secondary_account@outlook.com",
|
||||
"password": "strong-password-2",
|
||||
"totp": "BASE32SECRETSECOND",
|
||||
"recoveryRequired": false,
|
||||
"recoveryRequired": true,
|
||||
"recoveryEmail": "secondary.backup@example.com",
|
||||
"proxy": {
|
||||
"proxyAxios": true,
|
||||
@@ -35,7 +35,7 @@
|
||||
},
|
||||
{
|
||||
// Account #3 — dedicated proxy with credentials
|
||||
"enabled": true,
|
||||
"enabled": false,
|
||||
"email": "with_proxy@outlook.com",
|
||||
"password": "strong-password-3",
|
||||
"totp": "BASE32SECRETTHIRD",
|
||||
@@ -51,11 +51,11 @@
|
||||
},
|
||||
{
|
||||
// Account #4 — recovery required (recoveryRequired: true), no proxying through Axios layer
|
||||
"enabled": true,
|
||||
"enabled": false,
|
||||
"email": "no_proxy@outlook.com",
|
||||
"password": "strong-password-4",
|
||||
"totp": "BASE32SECRETFOUR",
|
||||
"recoveryRequired": false,
|
||||
"recoveryRequired": true,
|
||||
"recoveryEmail": "no.proxy.backup@example.com",
|
||||
"proxy": {
|
||||
"proxyAxios": false,
|
||||
@@ -67,7 +67,7 @@
|
||||
},
|
||||
{
|
||||
// Account #5 — TOTP omitted (will rely on recovery email)
|
||||
"enabled": true,
|
||||
"enabled": false,
|
||||
"email": "totp_optional@outlook.com",
|
||||
"password": "strong-password-5",
|
||||
"totp": "",
|
||||
|
||||
@@ -48,6 +48,8 @@
|
||||
"scrollRandomResults": true,
|
||||
"clickRandomResults": true,
|
||||
"retryMobileSearchAmount": 2,
|
||||
"semanticDedup": true, // Filter queries with high word similarity (Jaccard). Reduces repetitive patterns.
|
||||
"semanticDedupThreshold": 0.65, // Similarity threshold (0-1). Lower = more strict filtering.
|
||||
"delay": {
|
||||
"min": "3min",
|
||||
"max": "5min"
|
||||
|
||||
@@ -65,17 +65,25 @@ export class Search extends Workers {
|
||||
}
|
||||
|
||||
googleSearchQueries = this.bot.utils.shuffleArray(googleSearchQueries)
|
||||
// Deduplicate topics (exact match)
|
||||
const seen = new Set<string>()
|
||||
googleSearchQueries = googleSearchQueries.filter(q => {
|
||||
if (seen.has(q.topic.toLowerCase())) return false
|
||||
seen.add(q.topic.toLowerCase())
|
||||
return true
|
||||
})
|
||||
|
||||
// Semantic deduplication: filter queries with high Jaccard similarity
|
||||
|
||||
// Combined deduplication: exact + semantic in single pass for performance
|
||||
if (this.bot.config.searchSettings.semanticDedup !== false) {
|
||||
googleSearchQueries = this.semanticDeduplication(googleSearchQueries, 0.65)
|
||||
const threshold = this.bot.config.searchSettings.semanticDedupThreshold ?? 0.65
|
||||
const validThreshold = Math.max(0, Math.min(1, threshold)) // clamp [0,1]
|
||||
const originalCount = googleSearchQueries.length
|
||||
googleSearchQueries = this.combinedDeduplication(googleSearchQueries, validThreshold)
|
||||
const filtered = originalCount - googleSearchQueries.length
|
||||
if (filtered > 0) {
|
||||
this.bot.log(this.bot.isMobile, 'SEARCH-DEDUP', `Query dedup: removed ${filtered} duplicates (${originalCount} → ${googleSearchQueries.length})`)
|
||||
}
|
||||
} else {
|
||||
// Fallback: exact dedup only if semantic disabled
|
||||
const seen = new Set<string>()
|
||||
googleSearchQueries = googleSearchQueries.filter(q => {
|
||||
if (seen.has(q.topic.toLowerCase())) return false
|
||||
seen.add(q.topic.toLowerCase())
|
||||
return true
|
||||
})
|
||||
}
|
||||
|
||||
// Go to bing
|
||||
@@ -144,8 +152,13 @@ export class Search extends Workers {
|
||||
// Get related search terms to the Google search queries
|
||||
const relatedTerms = await this.getRelatedTerms(query?.topic)
|
||||
if (relatedTerms.length > 3) {
|
||||
// Filter related terms with semantic dedup to avoid Bing-provided duplicates
|
||||
const filteredRelated = this.bot.config.searchSettings.semanticDedup !== false
|
||||
? this.semanticDedupStrings(relatedTerms, this.bot.config.searchSettings.semanticDedupThreshold ?? 0.65)
|
||||
: relatedTerms
|
||||
|
||||
// Search for the first 2 related terms
|
||||
for (const term of relatedTerms.slice(1, 3)) {
|
||||
for (const term of filteredRelated.slice(1, 3)) {
|
||||
this.bot.log(this.bot.isMobile, 'SEARCH-BING-EXTRA', `${missingPoints} Points Remaining | Query: ${term}`)
|
||||
|
||||
searchCounters = await this.bingSearch(page, term)
|
||||
@@ -473,17 +486,44 @@ export class Search extends Workers {
|
||||
}
|
||||
|
||||
/**
|
||||
* Semantic deduplication: filter queries with high similarity
|
||||
* Prevents repetitive search patterns that may trigger detection
|
||||
* Combined exact + semantic deduplication in single pass (performance optimized)
|
||||
* Filters both case-insensitive exact duplicates and semantically similar queries
|
||||
*/
|
||||
private semanticDeduplication(queries: GoogleSearch[], threshold = 0.65): GoogleSearch[] {
|
||||
// Returns the entries of `queries` that survive deduplication, in input order; the first occurrence wins.
// A query is skipped when its lower-cased topic matches one already kept, or when its Jaccard score
// against any kept query is strictly greater than `threshold` (a score equal to `threshold` is kept).
private combinedDeduplication(queries: GoogleSearch[], threshold = 0.65): GoogleSearch[] {
|
||||
const result: GoogleSearch[] = []
|
||||
const seen = new Set<string>() // Lower-cased topics of the queries kept so far (exact-duplicate check)
|
||||
|
||||
for (const query of queries) {
|
||||
const lower = query.topic.toLowerCase()
|
||||
|
||||
// Exact duplicate of a kept query: skip before running any similarity comparison
|
||||
if (seen.has(lower)) continue
|
||||
|
||||
// Compare the candidate against every query kept so far (work per candidate grows with result.length).
|
||||
// NOTE(review): the original-case topics are passed here, not `lower`; whether case is normalised
|
||||
// depends on jaccardSimilarity, which is not visible in this hunk — confirm.
|
||||
const isSimilar = result.some(existing =>
|
||||
this.jaccardSimilarity(query.topic, existing.topic) > threshold
|
||||
)
|
||||
|
||||
if (!isSimilar) {
|
||||
result.push(query)
|
||||
// Only kept queries are recorded; a query dropped as similar is never added to `seen`
|
||||
seen.add(lower)
|
||||
}
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
/**
|
||||
* Semantic deduplication for string arrays (used for related terms)
|
||||
*/
|
||||
private semanticDedupStrings(terms: string[], threshold = 0.65): string[] {
|
||||
const result: string[] = []
|
||||
for (const term of terms) {
|
||||
const isSimilar = result.some(existing =>
|
||||
this.jaccardSimilarity(term, existing) > threshold
|
||||
)
|
||||
if (!isSimilar) {
|
||||
result.push(term)
|
||||
}
|
||||
}
|
||||
return result
|
||||
|
||||
@@ -55,6 +55,7 @@ export interface ConfigSearchSettings {
|
||||
localFallbackCount?: number; // Number of local fallback queries to sample when trends fail
|
||||
extraFallbackRetries?: number; // Additional mini-retry loops with fallback terms
|
||||
semanticDedup?: boolean; // Filter queries with high semantic similarity (default: true)
|
||||
semanticDedupThreshold?: number; // Jaccard similarity threshold 0-1 (default: 0.65, lower = stricter)
|
||||
}
|
||||
|
||||
export interface ConfigSearchDelay {
|
||||
|
||||
@@ -176,3 +176,34 @@ test('Jaccard similarity correctly identifies similar queries', () => {
|
||||
assert.ok(sim1 > 0.3, 'Similar queries should have high Jaccard score')
|
||||
assert.ok(sim2 < 0.3, 'Dissimilar queries should have low Jaccard score')
|
||||
})
|
||||
|
||||
// Checks the [0, 1] clamping expression against below-range, above-range, in-range and boundary inputs.
// NOTE(review): the clamp is re-implemented inline below rather than imported, so this test keeps passing
// even if the clamp used by the search code changes. NaN is not covered: Math.min(1, NaN) is NaN, so a
// NaN input would pass through this expression unchanged.
test('Threshold validation: clamps invalid values', () => {
|
||||
const testCases = [
|
||||
{ input: -0.5, expected: 0 },
|
||||
{ input: 1.5, expected: 1 },
|
||||
{ input: 0.5, expected: 0.5 },
|
||||
{ input: 0, expected: 0 },
|
||||
{ input: 1, expected: 1 }
|
||||
]
|
||||
|
||||
for (const { input, expected } of testCases) {
|
||||
// Inner min caps the value at 1, outer max raises it to at least 0
|
||||
const clamped = Math.max(0, Math.min(1, input))
|
||||
assert.equal(clamped, expected, `Threshold ${input} should clamp to ${expected}`)
|
||||
}
|
||||
})
|
||||
|
||||
// Runs semanticDeduplication over four related-term strings at threshold 0.5 and checks that the
// unrelated term survives and that the result is no longer than the input.
test('Related terms semantic dedup reduces redundancy', () => {
|
||||
const relatedTerms = [
|
||||
'weather forecast today',
|
||||
'weather forecast tomorrow',
|
||||
'weather prediction today',
|
||||
'completely different query'
|
||||
]
|
||||
|
||||
const filtered = semanticDeduplication(relatedTerms, 0.5)
|
||||
|
||||
// Assuming Jaccard over whitespace-separated word sets: "weather forecast today" vs "weather forecast tomorrow"
// share 2 of 4 distinct words (exactly 0.5), as do "weather forecast today" vs "weather prediction today";
// "weather forecast tomorrow" vs "weather prediction today" share 1 of 5 (0.2).
// NOTE(review): no pair scores above 0.5, so if semanticDeduplication (not shown here) compares with a strict `>`,
// nothing is removed at threshold 0.5. The `<=` check below holds even when nothing is filtered — TODO confirm
// the intended threshold and consider asserting a strict reduction.
|
||||
assert.ok(filtered.length <= relatedTerms.length, 'Should filter some related terms')
|
||||
assert.ok(filtered.includes('completely different query'), 'Should keep unique queries')
|
||||
})
|
||||
|
||||
|
||||
Reference in New Issue
Block a user