Compute true Jaccard similarity for candidate pairs

This commit is contained in:
Yohann D'ANELLO 2021-10-28 08:09:56 +02:00
parent 0d9ec8ec1a
commit 1d5bd54b4f
Signed by: ynerant
GPG Key ID: 3A75C55819C8CF85
1 changed file with 12 additions and 16 deletions

View File

@@ -1,13 +1,14 @@
import argparse import argparse
from tqdm import tqdm
import unicodedata import unicodedata
import sys import sys
import numpy as np import numpy as np
SHINGLE_SIZE = 5 # Known as k SHINGLE_SIZE = 5 # Known as k
PERMUTATIONS_COUNT = 800 PERMUTATIONS_COUNT = 640000
BAND_COUNT = 400 BAND_COUNT = 160000
ROWS_PER_BAND = 2 ROWS_PER_BAND = 4
assert PERMUTATIONS_COUNT == BAND_COUNT * ROWS_PER_BAND assert PERMUTATIONS_COUNT == BAND_COUNT * ROWS_PER_BAND
@@ -63,7 +64,7 @@ def compute_signature_matrix(shingles: np.ndarray, permutations_count: int) -> n
signature_matrix = np.inf * np.ones((permutations_count, docs_count)) signature_matrix = np.inf * np.ones((permutations_count, docs_count))
for permutation_id in range(permutations_count): for permutation_id in tqdm(range(permutations_count)):
permutation = np.random.permutation(shingles) permutation = np.random.permutation(shingles)
signature_matrix[permutation_id] = permutation.argmax(0) signature_matrix[permutation_id] = permutation.argmax(0)
@@ -106,19 +107,14 @@ def parse(stream, similarity: float) -> None:
candidate_pairs.add((doc_a, doc_b)) candidate_pairs.add((doc_a, doc_b))
candidate_pairs = sorted(candidate_pairs) candidate_pairs = sorted(candidate_pairs)
for doc_a, doc_b in candidate_pairs: for doc_a, doc_b in candidate_pairs:
print(doc_a, doc_b) # Compute true jaccard similarity
shingles_a = set(x for x in range(len(shingles)) if shingles[x, doc_a])
exit(1) shingles_b = set(x for x in range(len(shingles)) if shingles[x, doc_b])
d = jaccard_similarity(shingles_a, shingles_b)
# Calculate true jaccard similarity if d >= similarity:
shingles_sets = [set(x for x in range(len(shingles)) if shingles[x, doc_id]) for doc_id in range(len(docs))] print(f"{doc_a} {doc_b} {d:.06f}")
for i, doc1 in enumerate(shingles_sets):
for j in range(i + 1, len(docs)):
doc2 = shingles_sets[j]
d = jaccard_similarity(doc1, doc2)
if d >= similarity:
print(f"{i} {j} {d:.06f}")
def main(): def main():