From 769c62ee93962e0976ee6281713079aa744e1fbe Mon Sep 17 00:00:00 2001 From: Yohann D'ANELLO Date: Sun, 17 Oct 2021 11:48:56 +0200 Subject: [PATCH] Keep true Jaccard similarity --- algods/algods.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/algods/algods.py b/algods/algods.py index 0a3367c..8e55dae 100644 --- a/algods/algods.py +++ b/algods/algods.py @@ -1,5 +1,6 @@ import argparse import random +import time import unicodedata import sys @@ -32,7 +33,7 @@ def normalize(doc: str) -> str: def compute_shingles(docs: list[str], single_size: int) -> np.ndarray: - shingle_matrix = np.zeros((2, len(docs))) + shingle_matrix = np.zeros((2, len(docs)), dtype=bool) shingle_id = {} for doc_id, doc in enumerate(docs): @@ -42,9 +43,9 @@ def compute_shingles(docs: list[str], single_size: int) -> np.ndarray: shingle_id[sh] = len(shingle_id) if shingle_id[sh] >= len(shingle_matrix): # Extend matrix, double its size - shingle_matrix = np.append(shingle_matrix, np.zeros(shingle_matrix.shape), axis=0) + shingle_matrix = np.append(shingle_matrix, np.zeros(shingle_matrix.shape, dtype=bool), axis=0) - shingle_matrix[shingle_id[sh], doc_id] = 1 + shingle_matrix[shingle_id[sh], doc_id] = True shingle_matrix = shingle_matrix[:len(shingle_id)] @@ -90,6 +91,15 @@ def parse(stream, similarity: float) -> None: shingles = compute_shingles(docs, SHINGLE_SIZE) signature = compute_signature_matrix(shingles, PERMUTATIONS_COUNT) + shingles_sets = [set(x for x in range(len(shingles)) if shingles[x, doc_id]) for doc_id in range(len(docs))] + for i, doc1 in enumerate(shingles_sets): + t = time.time() + for j in range(i + 1, len(docs)): + doc2 = shingles_sets[j] + d = jaccard_similarity(doc1, doc2) + if d >= similarity: + print(f"{i} {j} {d:.06f}") + def main(): ns = parse_args()