Compute true Jaccard similarity for candidate pairs

This commit is contained in:
Yohann D'ANELLO 2021-10-28 08:09:56 +02:00
parent 0d9ec8ec1a
commit 1d5bd54b4f
Signed by: ynerant
GPG Key ID: 3A75C55819C8CF85
1 changed file with 12 additions and 16 deletions

View File

@@ -1,13 +1,14 @@
import argparse
from tqdm import tqdm
import unicodedata
import sys
import numpy as np
SHINGLE_SIZE = 5 # Known as k
PERMUTATIONS_COUNT = 800
BAND_COUNT = 400
ROWS_PER_BAND = 2
PERMUTATIONS_COUNT = 640000
BAND_COUNT = 160000
ROWS_PER_BAND = 4
assert PERMUTATIONS_COUNT == BAND_COUNT * ROWS_PER_BAND
@@ -63,7 +64,7 @@ def compute_signature_matrix(shingles: np.ndarray, permutations_count: int) -> n
signature_matrix = np.inf * np.ones((permutations_count, docs_count))
for permutation_id in range(permutations_count):
for permutation_id in tqdm(range(permutations_count)):
permutation = np.random.permutation(shingles)
signature_matrix[permutation_id] = permutation.argmax(0)
@@ -106,19 +107,14 @@ def parse(stream, similarity: float) -> None:
candidate_pairs.add((doc_a, doc_b))
candidate_pairs = sorted(candidate_pairs)
for doc_a, doc_b in candidate_pairs:
print(doc_a, doc_b)
exit(1)
# Calculate true jaccard similarity
shingles_sets = [set(x for x in range(len(shingles)) if shingles[x, doc_id]) for doc_id in range(len(docs))]
for i, doc1 in enumerate(shingles_sets):
for j in range(i + 1, len(docs)):
doc2 = shingles_sets[j]
d = jaccard_similarity(doc1, doc2)
# Compute true jaccard similarity
shingles_a = set(x for x in range(len(shingles)) if shingles[x, doc_a])
shingles_b = set(x for x in range(len(shingles)) if shingles[x, doc_b])
d = jaccard_similarity(shingles_a, shingles_b)
if d >= similarity:
print(f"{i} {j} {d:.06f}")
print(f"{doc_a} {doc_b} {d:.06f}")
def main():