From 78efd7fb2adaea0425fcb8cd80d431210e2abb5c Mon Sep 17 00:00:00 2001 From: Yohann D'ANELLO Date: Fri, 29 Oct 2021 14:26:25 +0200 Subject: [PATCH] Compute good values for band count and rows count according to given similarity threshold --- algods/algods.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/algods/algods.py b/algods/algods.py index eb372e4..86db147 100644 --- a/algods/algods.py +++ b/algods/algods.py @@ -6,10 +6,6 @@ import sys import numpy as np SHINGLE_SIZE = 5 # Known as k -PERMUTATIONS_COUNT = 640000 -BAND_COUNT = 160000 -ROWS_PER_BAND = 4 -assert PERMUTATIONS_COUNT == BAND_COUNT * ROWS_PER_BAND def parse_args(argv: dict = None) -> argparse.Namespace: @@ -85,12 +81,25 @@ def parse(stream, similarity: float) -> None: docs = [normalize(doc) for doc in docs] # Remove special characters and normalize accents shingles = compute_shingles(docs, SHINGLE_SIZE) - signature = compute_signature_matrix(shingles, PERMUTATIONS_COUNT) + + # Compute b and r such that 2s/3 < t < s + # Use at least 2 rows and 16 bands to have good values + rows = 2 + bands = 16 + threshold = (1 / bands) ** (1 / rows) + while not (2 * similarity / 3 < threshold < similarity): + if threshold >= similarity: + bands *= 2 + else: + rows *= 2 + threshold = (1 / bands) ** (1 / rows) + + signature = compute_signature_matrix(shingles, bands * rows) candidate_pairs = set() - for band_id in range(BAND_COUNT): - band = signature[band_id * ROWS_PER_BAND:(band_id + 1) * ROWS_PER_BAND] + for band_id in range(bands): + band = signature[band_id * rows:(band_id + 1) * rows] buckets = {}