def compute_signature_matrix(shingles: np.ndarray, permutations_count: int) -> np.ndarray:
    """Build a MinHash signature matrix from a shingle/document matrix.

    For every random permutation of the shingle indices, the signature of a
    document is the index of the first shingle, in permuted order, whose
    entry for that document is truthy.

    Args:
        shingles: 2-D array of shape ``(shingles_count, docs_count)``; a
            truthy cell ``[s, d]`` means shingle ``s`` occurs in document
            ``d``.
        permutations_count: Number of random permutations to draw, i.e. the
            number of rows in the returned signature matrix.

    Returns:
        Float array of shape ``(permutations_count, docs_count)``. A column
        belonging to a document with no truthy shingle stays at ``np.inf``.

    Raises:
        ValueError: If ``shingles`` is not 2-D (shape unpacking fails).
    """
    shingles_count, docs_count = shingles.shape

    signature_matrix = np.full((permutations_count, docs_count), np.inf)

    # Only documents with at least one shingle can get a finite signature;
    # restricting to them also keeps argmax from reporting a bogus index 0
    # for an all-False column.
    presence = shingles.astype(bool)
    non_empty = presence.any(axis=0)
    occupied = presence[:, non_empty]

    for permutation_id in range(permutations_count):
        # Drawn unconditionally so the global RNG stream is consumed the same
        # way regardless of the input's contents.
        permutation = np.random.permutation(shingles_count)

        if occupied.size == 0:
            # No shingles or no non-empty documents: nothing to fill in, and
            # argmax over an empty axis would raise.
            continue

        # Row k of the gathered array is the shingle visited k-th, so argmax
        # (first True) is the position of the first hit per document.
        first_hit = occupied[permutation].argmax(axis=0)
        signature_matrix[permutation_id, non_empty] = permutation[first_hit]

    return signature_matrix