Implement LSH (locality-sensitive hashing)

2021-10-27 19:45:58 +02:00
parent 769c62ee93
commit 0d9ec8ec1a
1 changed file with 32 additions and 12 deletions
--- a/algods/algods.py
+++ b/algods/algods.py
@@ -1,14 +1,14 @@
 import argparse
 import random
 import time
 import unicodedata
 import sys
 import numpy as np
 SHINGLE_SIZE = 5  # Known as k
-PERMUTATIONS_COUNT = 3
+PERMUTATIONS_COUNT = 800
 BAND_COUNT = 400
 ROWS_PER_BAND = 2
 assert PERMUTATIONS_COUNT == BAND_COUNT * ROWS_PER_BAND
 def parse_args(argv: dict = None) -> argparse.Namespace:
@@ -64,13 +64,8 @@ def compute_signature_matrix(shingles: np.ndarray, permutations_count: int) -> n
    signature_matrix = np.inf * np.ones((permutations_count, docs_count))
    for permutation_id in range(permutations_count):
-        permutation = np.random.permutation(shingles_count)
+        permutation = np.random.permutation(shingles)
-
+        signature_matrix[permutation_id] = permutation.argmax(0)
        for doc_id, doc_shingles in enumerate(shingles.T):
            for x in permutation:
                if doc_shingles[x]:
                    signature_matrix[permutation_id, doc_id] = x
                    break
    return signature_matrix
@@ -91,9 +86,34 @@ def parse(stream, similarity: float) -> None:
    shingles = compute_shingles(docs, SHINGLE_SIZE)
    signature = compute_signature_matrix(shingles, PERMUTATIONS_COUNT)
    candidate_pairs = set()
    for band_id in range(BAND_COUNT):
        band = signature[band_id * ROWS_PER_BAND:(band_id + 1) * ROWS_PER_BAND]
        buckets = {}
        for doc in range(len(docs)):
            sign_doc = tuple(band[:, doc])
            buckets.setdefault(sign_doc, set())
            buckets[sign_doc].add(doc)
        for bucket in buckets.values():
            for doc_a in bucket:
                for doc_b in bucket:
                    if doc_a != doc_b:
                        doc_a, doc_b = min(doc_a, doc_b), max(doc_a, doc_b)
                        candidate_pairs.add((doc_a, doc_b))
    candidate_pairs = sorted(candidate_pairs)
    for doc_a, doc_b in candidate_pairs:
        print(doc_a, doc_b)
    exit(1)
    # Calculate true jaccard similarity
    shingles_sets = [set(x for x in range(len(shingles)) if shingles[x, doc_id]) for doc_id in range(len(docs))]
    for i, doc1 in enumerate(shingles_sets):
        t = time.time()
        for j in range(i + 1, len(docs)):
            doc2 = shingles_sets[j]
            d = jaccard_similarity(doc1, doc2)