Compute signature matrix using NumPy

This commit is contained in:
Yohann D'ANELLO 2021-10-17 11:27:45 +02:00
parent 99fada5b52
commit 677ad5e8b7
Signed by: ynerant
GPG Key ID: 3A75C55819C8CF85
1 changed file with 13 additions and 4 deletions

View File

@@ -58,11 +58,20 @@ def min_hash(doc: str, perm: list[str]) -> str:
def compute_signature_matrix(shingles: np.ndarray, permutations_count: int) -> np.ndarray:
    """
    Compute the min-hash signature matrix of a shingle/document matrix.

    For each of ``permutations_count`` random permutations of the shingle
    indices, the signature of a document is the index of the first shingle
    (in permutation order) that the document contains.

    :param shingles: 2-D array of shape ``(shingles_count, docs_count)``.
        Entry ``[s, d]`` is truthy when document ``d`` contains shingle ``s``.
    :param permutations_count: Number of random permutations to draw, i.e.
        the number of rows of the returned matrix.
    :return: Float array of shape ``(permutations_count, docs_count)``.
        A document that contains no shingle at all keeps ``np.inf`` in its
        whole column.
    """
    shingles_count, docs_count = shingles.shape
    signature_matrix = np.full((permutations_count, docs_count), np.inf)
    if shingles_count == 0:
        # No shingle can ever be hit; argmax below would fail on an empty axis.
        return signature_matrix

    # Only truthiness matters, so work on a boolean view of the input.
    present = shingles.astype(bool)
    # Documents without any shingle never get a hit and keep the inf sentinel.
    non_empty = present.any(axis=0)

    for permutation_id in range(permutations_count):
        permutation = np.random.permutation(shingles_count)
        # Reorder the rows following the permutation; argmax over a boolean
        # column gives the position of its first True, i.e. the first hit.
        first_hit = present[permutation].argmax(axis=0)
        # Store the shingle index itself, not its rank in the permutation.
        signature_matrix[permutation_id, non_empty] = permutation[first_hit[non_empty]]
    return signature_matrix
def jaccard_similarity(doc1: set, doc2: set) -> float: