Compute true Jaccard similarity for candidate pairs
This commit is contained in:
parent
0d9ec8ec1a
commit
1d5bd54b4f
|
@ -1,13 +1,14 @@
|
||||||
import argparse
|
import argparse
|
||||||
|
from tqdm import tqdm
|
||||||
import unicodedata
|
import unicodedata
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
SHINGLE_SIZE = 5 # Known as k
|
SHINGLE_SIZE = 5 # Known as k
|
||||||
PERMUTATIONS_COUNT = 800
|
PERMUTATIONS_COUNT = 640000
|
||||||
BAND_COUNT = 400
|
BAND_COUNT = 160000
|
||||||
ROWS_PER_BAND = 2
|
ROWS_PER_BAND = 4
|
||||||
assert PERMUTATIONS_COUNT == BAND_COUNT * ROWS_PER_BAND
|
assert PERMUTATIONS_COUNT == BAND_COUNT * ROWS_PER_BAND
|
||||||
|
|
||||||
|
|
||||||
|
@ -63,7 +64,7 @@ def compute_signature_matrix(shingles: np.ndarray, permutations_count: int) -> n
|
||||||
|
|
||||||
signature_matrix = np.inf * np.ones((permutations_count, docs_count))
|
signature_matrix = np.inf * np.ones((permutations_count, docs_count))
|
||||||
|
|
||||||
for permutation_id in range(permutations_count):
|
for permutation_id in tqdm(range(permutations_count)):
|
||||||
permutation = np.random.permutation(shingles)
|
permutation = np.random.permutation(shingles)
|
||||||
signature_matrix[permutation_id] = permutation.argmax(0)
|
signature_matrix[permutation_id] = permutation.argmax(0)
|
||||||
|
|
||||||
|
@ -106,19 +107,14 @@ def parse(stream, similarity: float) -> None:
|
||||||
candidate_pairs.add((doc_a, doc_b))
|
candidate_pairs.add((doc_a, doc_b))
|
||||||
|
|
||||||
candidate_pairs = sorted(candidate_pairs)
|
candidate_pairs = sorted(candidate_pairs)
|
||||||
|
|
||||||
for doc_a, doc_b in candidate_pairs:
|
for doc_a, doc_b in candidate_pairs:
|
||||||
print(doc_a, doc_b)
|
# Compute true Jaccard similarity
|
||||||
|
shingles_a = set(x for x in range(len(shingles)) if shingles[x, doc_a])
|
||||||
exit(1)
|
shingles_b = set(x for x in range(len(shingles)) if shingles[x, doc_b])
|
||||||
|
d = jaccard_similarity(shingles_a, shingles_b)
|
||||||
# Calculate true Jaccard similarity
|
|
||||||
shingles_sets = [set(x for x in range(len(shingles)) if shingles[x, doc_id]) for doc_id in range(len(docs))]
|
|
||||||
for i, doc1 in enumerate(shingles_sets):
|
|
||||||
for j in range(i + 1, len(docs)):
|
|
||||||
doc2 = shingles_sets[j]
|
|
||||||
d = jaccard_similarity(doc1, doc2)
|
|
||||||
if d >= similarity:
|
if d >= similarity:
|
||||||
print(f"{i} {j} {d:.06f}")
|
print(f"{doc_a} {doc_b} {d:.06f}")
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
Loading…
Reference in New Issue