Compute signature matrix using NumPy
This commit is contained in:
parent
99fada5b52
commit
677ad5e8b7
|
@ -58,11 +58,20 @@ def min_hash(doc: str, perm: list[str]) -> str:
|
|||
|
||||
|
||||
def compute_signature_matrix(shingles: np.ndarray, permutations_count: int) -> np.ndarray:
    """Build a MinHash signature matrix from a shingle/document incidence matrix.

    For every random permutation of the shingle rows, each document's
    signature entry is the row index of the first shingle (in permuted order)
    that the document contains.

    Args:
        shingles: 2-D array of shape ``(shingles_count, docs_count)``; a truthy
            entry at ``[s, d]`` means document ``d`` contains shingle ``s``.
        permutations_count: Number of random permutations, i.e. the number of
            rows in the resulting signature matrix.

    Returns:
        Float array of shape ``(permutations_count, docs_count)``. A document
        with no shingles at all keeps ``np.inf`` in its whole column.

    Note:
        Permutations are drawn from NumPy's global random state (one
        ``np.random.permutation`` call per signature row), so seed it with
        ``np.random.seed`` for reproducible output.
    """
    shingles_count, docs_count = shingles.shape

    signature_matrix = np.full((permutations_count, docs_count), np.inf)

    # Truthiness mask, mirroring an ``if doc_shingles[x]`` test per cell.
    present = shingles.astype(bool)
    has_shingle = present.any(axis=0)
    # argmax is undefined on zero rows and returns a misleading 0 for all-False
    # columns, so only run it when at least one document is non-empty.
    any_non_empty = bool(has_shingle.any())

    for permutation_id in range(permutations_count):
        # Always draw, even if unused, to keep the RNG stream independent of
        # the data.
        permutation = np.random.permutation(shingles_count)
        if not any_non_empty:
            continue

        # Reorder rows by the permutation; argmax over a boolean column gives
        # the position of its first True, i.e. the first shingle encountered.
        first_position = present[permutation].argmax(axis=0)
        signature_matrix[permutation_id] = np.where(
            has_shingle, permutation[first_position], np.inf
        )

    return signature_matrix
|
||||
|
||||
|
||||
def jaccard_similarity(doc1: set, doc2: set) -> float:
|
||||
|
|
Loading…
Reference in New Issue