Implement LSH
This commit is contained in:
parent
769c62ee93
commit
0d9ec8ec1a
|
@ -1,14 +1,14 @@
|
|||
import argparse
|
||||
import random
|
||||
import time
|
||||
import unicodedata
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
SHINGLE_SIZE = 5 # Known as k
|
||||
PERMUTATIONS_COUNT = 3
|
||||
PERMUTATIONS_COUNT = 800
|
||||
BAND_COUNT = 400
|
||||
ROWS_PER_BAND = 2
|
||||
assert PERMUTATIONS_COUNT == BAND_COUNT * ROWS_PER_BAND
|
||||
|
||||
|
||||
def parse_args(argv: dict = None) -> argparse.Namespace:
|
||||
|
@ -64,13 +64,8 @@ def compute_signature_matrix(shingles: np.ndarray, permutations_count: int) -> n
|
|||
signature_matrix = np.inf * np.ones((permutations_count, docs_count))
|
||||
|
||||
for permutation_id in range(permutations_count):
|
||||
permutation = np.random.permutation(shingles_count)
|
||||
|
||||
for doc_id, doc_shingles in enumerate(shingles.T):
|
||||
for x in permutation:
|
||||
if doc_shingles[x]:
|
||||
signature_matrix[permutation_id, doc_id] = x
|
||||
break
|
||||
permutation = np.random.permutation(shingles)
|
||||
signature_matrix[permutation_id] = permutation.argmax(0)
|
||||
|
||||
return signature_matrix
|
||||
|
||||
|
@ -91,9 +86,34 @@ def parse(stream, similarity: float) -> None:
|
|||
shingles = compute_shingles(docs, SHINGLE_SIZE)
|
||||
signature = compute_signature_matrix(shingles, PERMUTATIONS_COUNT)
|
||||
|
||||
candidate_pairs = set()
|
||||
|
||||
for band_id in range(BAND_COUNT):
|
||||
band = signature[band_id * ROWS_PER_BAND:(band_id + 1) * ROWS_PER_BAND]
|
||||
|
||||
buckets = {}
|
||||
|
||||
for doc in range(len(docs)):
|
||||
sign_doc = tuple(band[:, doc])
|
||||
buckets.setdefault(sign_doc, set())
|
||||
buckets[sign_doc].add(doc)
|
||||
|
||||
for bucket in buckets.values():
|
||||
for doc_a in bucket:
|
||||
for doc_b in bucket:
|
||||
if doc_a != doc_b:
|
||||
doc_a, doc_b = min(doc_a, doc_b), max(doc_a, doc_b)
|
||||
candidate_pairs.add((doc_a, doc_b))
|
||||
|
||||
candidate_pairs = sorted(candidate_pairs)
|
||||
for doc_a, doc_b in candidate_pairs:
|
||||
print(doc_a, doc_b)
|
||||
|
||||
exit(1)
|
||||
|
||||
# Calculate the true Jaccard similarity
|
||||
shingles_sets = [set(x for x in range(len(shingles)) if shingles[x, doc_id]) for doc_id in range(len(docs))]
|
||||
for i, doc1 in enumerate(shingles_sets):
|
||||
t = time.time()
|
||||
for j in range(i + 1, len(docs)):
|
||||
doc2 = shingles_sets[j]
|
||||
d = jaccard_similarity(doc1, doc2)
|
||||
|
|
Loading…
Reference in New Issue