Compute signature matrix using NumPy
This commit is contained in:
parent
99fada5b52
commit
677ad5e8b7
|
@ -58,11 +58,20 @@ def min_hash(doc: str, perm: list[str]) -> str:
|
||||||
|
|
||||||
|
|
||||||
def compute_signature_matrix(shingles: np.ndarray, permutations_count: int) -> np.ndarray:
    """Compute a MinHash signature matrix from a shingle/document matrix.

    Each signature row is produced by one random permutation of the shingle
    indices. The entry for a document is the index of the first shingle, in
    permuted order, that the document contains.

    Args:
        shingles: 2-D array of shape ``(shingles_count, docs_count)``. A
            truthy entry at ``[s, d]`` means document ``d`` contains
            shingle ``s``.
        permutations_count: Number of random permutations, i.e. the number
            of rows in the returned signature matrix.

    Returns:
        A float array of shape ``(permutations_count, docs_count)``. Columns
        of documents that contain no shingle at all stay at ``np.inf``.

    Raises:
        ValueError: If ``shingles`` is not 2-D (the shape unpacking fails).
    """
    shingles_count, docs_count = shingles.shape
    signature_matrix = np.full((permutations_count, docs_count), np.inf)

    # Only truthiness of each entry matters, so work on a boolean view.
    membership = shingles.astype(bool)

    # Documents without any shingle keep the np.inf sentinel; leave them out
    # so that argmax (which returns 0 for an all-False column) cannot
    # fabricate a match for them.
    populated = np.flatnonzero(membership.any(axis=0))
    occupied = membership[:, populated]

    for permutation_id in range(permutations_count):
        # Always draw the permutation, even when there is nothing to fill,
        # so the global NumPy random state advances once per signature row.
        permutation = np.random.permutation(shingles_count)
        if populated.size == 0:
            continue

        # Reorder the shingle rows by the permutation; argmax over a boolean
        # column yields the position of its first True value.
        first_positions = occupied[permutation].argmax(axis=0)

        # Map positions in permuted order back to original shingle indices.
        signature_matrix[permutation_id, populated] = permutation[first_positions]

    return signature_matrix
|
||||||
|
|
||||||
|
|
||||||
def jaccard_similarity(doc1: set, doc2: set) -> float:
|
def jaccard_similarity(doc1: set, doc2: set) -> float:
|
||||||
|
|
Loading…
Reference in New Issue