Compute good values for the band count and row count according to the given similarity threshold
This commit is contained in:
parent
1d5bd54b4f
commit
78efd7fb2a
|
@ -6,10 +6,6 @@ import sys
|
|||
import numpy as np
|
||||
|
||||
SHINGLE_SIZE = 5 # Known as k
|
||||
PERMUTATIONS_COUNT = 640000
|
||||
BAND_COUNT = 160000
|
||||
ROWS_PER_BAND = 4
|
||||
assert PERMUTATIONS_COUNT == BAND_COUNT * ROWS_PER_BAND
|
||||
|
||||
|
||||
def parse_args(argv: dict = None) -> argparse.Namespace:
|
||||
|
@ -85,12 +81,25 @@ def parse(stream, similarity: float) -> None:
|
|||
docs = [normalize(doc) for doc in docs] # Remove special characters and normalize accents
|
||||
|
||||
shingles = compute_shingles(docs, SHINGLE_SIZE)
|
||||
signature = compute_signature_matrix(shingles, PERMUTATIONS_COUNT)
|
||||
|
||||
# Compute b (bands) and r (rows) such that 2s/3 < t < s, where s is the requested similarity and t = (1/b)^(1/r)
|
||||
# Use at least 2 rows and 16 bands to have good values
|
||||
rows = 2
|
||||
bands = 16
|
||||
threshold = (1 / bands) ** (1 / rows)
|
||||
while not (2 * similarity / 3 < threshold < similarity):
|
||||
if threshold >= similarity:
|
||||
bands *= 2
|
||||
else:
|
||||
rows *= 2
|
||||
threshold = (1 / bands) ** (1 / rows)
|
||||
|
||||
signature = compute_signature_matrix(shingles, bands * rows)
|
||||
|
||||
candidate_pairs = set()
|
||||
|
||||
for band_id in range(BAND_COUNT):
|
||||
band = signature[band_id * ROWS_PER_BAND:(band_id + 1) * ROWS_PER_BAND]
|
||||
for band_id in range(bands):
|
||||
band = signature[band_id * rows:(band_id + 1) * rows]
|
||||
|
||||
buckets = {}
|
||||
|
||||
|
|
Loading…
Reference in New Issue