Compute suitable band and row counts from the given similarity threshold

This commit is contained in:
Yohann D'ANELLO 2021-10-29 14:26:25 +02:00
parent 1d5bd54b4f
commit 78efd7fb2a
Signed by: ynerant
GPG Key ID: 3A75C55819C8CF85
1 changed files with 16 additions and 7 deletions

View File

@ -6,10 +6,6 @@ import sys
import numpy as np import numpy as np
SHINGLE_SIZE = 5 # Known as k SHINGLE_SIZE = 5 # Known as k
PERMUTATIONS_COUNT = 640000
BAND_COUNT = 160000
ROWS_PER_BAND = 4
assert PERMUTATIONS_COUNT == BAND_COUNT * ROWS_PER_BAND
def parse_args(argv: dict = None) -> argparse.Namespace: def parse_args(argv: dict = None) -> argparse.Namespace:
@ -85,12 +81,25 @@ def parse(stream, similarity: float) -> None:
docs = [normalize(doc) for doc in docs] # Remove special characters and normalize accents docs = [normalize(doc) for doc in docs] # Remove special characters and normalize accents
shingles = compute_shingles(docs, SHINGLE_SIZE) shingles = compute_shingles(docs, SHINGLE_SIZE)
signature = compute_signature_matrix(shingles, PERMUTATIONS_COUNT)
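# Compute b (bands) and r (rows per band) such that 2s/3 < t < s, where s is the requested similarity and t = (1/b)^(1/r)
# Start from 2 rows and 16 bands, then double bands (to lower t) or rows (to raise t) until t falls in that window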
rows = 2
bands = 16
threshold = (1 / bands) ** (1 / rows)
while not (2 * similarity / 3 < threshold < similarity):
if threshold >= similarity:
bands *= 2
else:
rows *= 2
threshold = (1 / bands) ** (1 / rows)
signature = compute_signature_matrix(shingles, bands * rows)
candidate_pairs = set() candidate_pairs = set()
for band_id in range(BAND_COUNT): for band_id in range(bands):
band = signature[band_id * ROWS_PER_BAND:(band_id + 1) * ROWS_PER_BAND] band = signature[band_id * rows:(band_id + 1) * rows]
buckets = {} buckets = {}