Compute good values for the band count and row count according to the given similarity threshold
This commit is contained in:
parent
1d5bd54b4f
commit
78efd7fb2a
|
@ -6,10 +6,6 @@ import sys
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
SHINGLE_SIZE = 5 # Known as k
|
SHINGLE_SIZE = 5 # Known as k
|
||||||
PERMUTATIONS_COUNT = 640000
|
|
||||||
BAND_COUNT = 160000
|
|
||||||
ROWS_PER_BAND = 4
|
|
||||||
assert PERMUTATIONS_COUNT == BAND_COUNT * ROWS_PER_BAND
|
|
||||||
|
|
||||||
|
|
||||||
def parse_args(argv: dict = None) -> argparse.Namespace:
|
def parse_args(argv: dict = None) -> argparse.Namespace:
|
||||||
|
@ -85,12 +81,25 @@ def parse(stream, similarity: float) -> None:
|
||||||
docs = [normalize(doc) for doc in docs] # Remove special characters and normalize accents
|
docs = [normalize(doc) for doc in docs] # Remove special characters and normalize accents
|
||||||
|
|
||||||
shingles = compute_shingles(docs, SHINGLE_SIZE)
|
shingles = compute_shingles(docs, SHINGLE_SIZE)
|
||||||
signature = compute_signature_matrix(shingles, PERMUTATIONS_COUNT)
|
|
||||||
|
# Compute b (bands) and r (rows) such that 2s/3 < t < s, where t = (1/b)^(1/r) and s is the requested similarity
|
||||||
|
# Use at least 2 rows and 16 bands to have good values
|
||||||
|
rows = 2
|
||||||
|
bands = 16
|
||||||
|
threshold = (1 / bands) ** (1 / rows)
|
||||||
|
while not (2 * similarity / 3 < threshold < similarity):
|
||||||
|
if threshold >= similarity:
|
||||||
|
bands *= 2
|
||||||
|
else:
|
||||||
|
rows *= 2
|
||||||
|
threshold = (1 / bands) ** (1 / rows)
|
||||||
|
|
||||||
|
signature = compute_signature_matrix(shingles, bands * rows)
|
||||||
|
|
||||||
candidate_pairs = set()
|
candidate_pairs = set()
|
||||||
|
|
||||||
for band_id in range(BAND_COUNT):
|
for band_id in range(bands):
|
||||||
band = signature[band_id * ROWS_PER_BAND:(band_id + 1) * ROWS_PER_BAND]
|
band = signature[band_id * rows:(band_id + 1) * rows]
|
||||||
|
|
||||||
buckets = {}
|
buckets = {}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue