def compute_signature_matrix(shingles: np.ndarray, permutations_count: int) -> np.ndarray:
    """Compute the MinHash signature matrix of a shingle/document matrix.

    For each random permutation of the shingle indices, the signature of a
    document is the index of the first shingle, in permutation order, whose
    entry for that document is non-zero.

    Args:
        shingles: 2-D array of shape ``(shingles_count, docs_count)``; entry
            ``[s, d]`` is truthy (non-zero) when shingle ``s`` occurs in
            document ``d``.
        permutations_count: number of random permutations to draw, i.e. the
            number of rows of the signature matrix.

    Returns:
        A float array of shape ``(permutations_count, docs_count)``. A
        document without any shingle keeps ``np.inf`` in every row.

    Note:
        Permutations are drawn with ``np.random.permutation``, so the result
        depends on NumPy's global random state (seed it for reproducibility).
    """
    shingles_count, docs_count = shingles.shape

    # Both values are independent of the permutation: compute them once.
    present = shingles != 0
    non_empty = present.any(axis=0)

    signature_matrix = np.full((permutations_count, docs_count), np.inf)

    for permutation_id in range(permutations_count):
        permutation = np.random.permutation(shingles_count)
        if shingles_count == 0:
            # argmax is undefined along an empty axis; every entry stays inf.
            continue

        # After reordering the rows, argmax over a boolean column yields the
        # position of its first True, i.e. the first shingle of the
        # permutation that the document contains.
        first_hit = present[permutation].argmax(axis=0)

        # Columns without any shingle report position 0 from argmax, which
        # would be a false hit: only overwrite the non-empty documents.
        signature_matrix[permutation_id, non_empty] = permutation[first_hit[non_empty]]

    return signature_matrix