diff --git a/algods/algods.py b/algods/algods.py index eb372e4..86db147 100644 --- a/algods/algods.py +++ b/algods/algods.py @@ -6,10 +6,6 @@ import sys import numpy as np SHINGLE_SIZE = 5 # Known as k -PERMUTATIONS_COUNT = 640000 -BAND_COUNT = 160000 -ROWS_PER_BAND = 4 -assert PERMUTATIONS_COUNT == BAND_COUNT * ROWS_PER_BAND def parse_args(argv: dict = None) -> argparse.Namespace: @@ -85,12 +81,25 @@ def parse(stream, similarity: float) -> None: docs = [normalize(doc) for doc in docs] # Remove special characters and normalize accents shingles = compute_shingles(docs, SHINGLE_SIZE) - signature = compute_signature_matrix(shingles, PERMUTATIONS_COUNT) + + # Compute bands b and rows r such that 2s/3 < t < s, where t = (1/b)^(1/r) and s is the requested similarity + # Use at least 2 rows and 16 bands to have good values + rows = 2 + bands = 16 + threshold = (1 / bands) ** (1 / rows) + while not (2 * similarity / 3 < threshold < similarity): + if threshold >= similarity: + bands *= 2 + else: + rows *= 2 + threshold = (1 / bands) ** (1 / rows) + + signature = compute_signature_matrix(shingles, bands * rows) candidate_pairs = set() - for band_id in range(BAND_COUNT): - band = signature[band_id * ROWS_PER_BAND:(band_id + 1) * ROWS_PER_BAND] + for band_id in range(bands): + band = signature[band_id * rows:(band_id + 1) * rows] buckets = {}