From 0d9ec8ec1aa065885116461762f69c178c825c39 Mon Sep 17 00:00:00 2001 From: Yohann D'ANELLO Date: Wed, 27 Oct 2021 19:45:58 +0200 Subject: [PATCH] Implement LSH --- algods/algods.py | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/algods/algods.py b/algods/algods.py index 8e55dae..10a716b 100644 --- a/algods/algods.py +++ b/algods/algods.py @@ -1,14 +1,14 @@ import argparse -import random -import time import unicodedata import sys import numpy as np - SHINGLE_SIZE = 5 # Known as k -PERMUTATIONS_COUNT = 3 +PERMUTATIONS_COUNT = 800 +BAND_COUNT = 400 +ROWS_PER_BAND = 2 +assert PERMUTATIONS_COUNT == BAND_COUNT * ROWS_PER_BAND def parse_args(argv: dict = None) -> argparse.Namespace: @@ -64,13 +64,8 @@ def compute_signature_matrix(shingles: np.ndarray, permutations_count: int) -> n signature_matrix = np.inf * np.ones((permutations_count, docs_count)) for permutation_id in range(permutations_count): - permutation = np.random.permutation(shingles_count) - - for doc_id, doc_shingles in enumerate(shingles.T): - for x in permutation: - if doc_shingles[x]: - signature_matrix[permutation_id, doc_id] = x - break + permutation = np.random.permutation(shingles) + signature_matrix[permutation_id] = permutation.argmax(0) return signature_matrix @@ -91,9 +86,34 @@ def parse(stream, similarity: float) -> None: shingles = compute_shingles(docs, SHINGLE_SIZE) signature = compute_signature_matrix(shingles, PERMUTATIONS_COUNT) + candidate_pairs = set() + + for band_id in range(BAND_COUNT): + band = signature[band_id * ROWS_PER_BAND:(band_id + 1) * ROWS_PER_BAND] + + buckets = {} + + for doc in range(len(docs)): + sign_doc = tuple(band[:, doc]) + buckets.setdefault(sign_doc, set()) + buckets[sign_doc].add(doc) + + for bucket in buckets.values(): + for doc_a in bucket: + for doc_b in bucket: + if doc_a != doc_b: + doc_a, doc_b = min(doc_a, doc_b), max(doc_a, doc_b) + candidate_pairs.add((doc_a, doc_b)) + + candidate_pairs = sorted(candidate_pairs) + for doc_a, doc_b in candidate_pairs: + print(doc_a, doc_b) + + exit(1) + + # Calculate true jaccard similarity shingles_sets = [set(x for x in range(len(shingles)) if shingles[x, doc_id]) for doc_id in range(len(docs))] for i, doc1 in enumerate(shingles_sets): - t = time.time() for j in range(i + 1, len(docs)): doc2 = shingles_sets[j] d = jaccard_similarity(doc1, doc2)