diff --git a/algods/algods.py b/algods/algods.py index 10a716b..eb372e4 100644 --- a/algods/algods.py +++ b/algods/algods.py @@ -1,13 +1,14 @@ import argparse +from tqdm import tqdm import unicodedata import sys import numpy as np SHINGLE_SIZE = 5 # Known as k -PERMUTATIONS_COUNT = 800 -BAND_COUNT = 400 -ROWS_PER_BAND = 2 +PERMUTATIONS_COUNT = 640000 +BAND_COUNT = 160000 +ROWS_PER_BAND = 4 assert PERMUTATIONS_COUNT == BAND_COUNT * ROWS_PER_BAND @@ -63,7 +64,7 @@ def compute_signature_matrix(shingles: np.ndarray, permutations_count: int) -> n signature_matrix = np.inf * np.ones((permutations_count, docs_count)) - for permutation_id in range(permutations_count): + for permutation_id in tqdm(range(permutations_count)): permutation = np.random.permutation(shingles) signature_matrix[permutation_id] = permutation.argmax(0) @@ -106,19 +107,14 @@ def parse(stream, similarity: float) -> None: candidate_pairs.add((doc_a, doc_b)) candidate_pairs = sorted(candidate_pairs) + for doc_a, doc_b in candidate_pairs: - print(doc_a, doc_b) - - exit(1) - - # Calculate true jaccard similarity - shingles_sets = [set(x for x in range(len(shingles)) if shingles[x, doc_id]) for doc_id in range(len(docs))] - for i, doc1 in enumerate(shingles_sets): - for j in range(i + 1, len(docs)): - doc2 = shingles_sets[j] - d = jaccard_similarity(doc1, doc2) - if d >= similarity: - print(f"{i} {j} {d:.06f}") + # Compute true jaccard similarity + shingles_a = set(x for x in range(len(shingles)) if shingles[x, doc_a]) + shingles_b = set(x for x in range(len(shingles)) if shingles[x, doc_b]) + d = jaccard_similarity(shingles_a, shingles_b) + if d >= similarity: + print(f"{doc_a} {doc_b} {d:.06f}") def main():