From 99fada5b52d4635b45bd8d0fba51baa0158badd8 Mon Sep 17 00:00:00 2001 From: Yohann D'ANELLO Date: Sun, 17 Oct 2021 10:50:50 +0200 Subject: [PATCH] Compute signature matrix using NumPy --- algods/algods.py | 58 ++++++++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 21 deletions(-) diff --git a/algods/algods.py b/algods/algods.py index 169f277..4cab606 100644 --- a/algods/algods.py +++ b/algods/algods.py @@ -1,8 +1,13 @@ import argparse -import re +import random import unicodedata import sys -from typing import Generator + +import numpy as np + + +SHINGLE_SIZE = 5 # Known as k +PERMUTATIONS_COUNT = 3 def parse_args(argv: dict = None) -> argparse.Namespace: @@ -26,21 +31,38 @@ def normalize(doc: str) -> str: ).casefold().encode('ascii', 'ignore').decode('ascii') -def compute_shingles(docs: list[str], single_size: int) -> Generator[set[int], any, None]: +def compute_shingles(docs: list[str], single_size: int) -> np.ndarray: + shingle_matrix = np.zeros((2, len(docs))) shingle_id = {} - id_shingle = [] - ids = 0 - for d in docs: - char_shing = [d[i:i + single_size] for i in range(len(d) - single_size + 1)] - sid = set() + for doc_id, doc in enumerate(docs): + char_shing = [doc[i:i + single_size] for i in range(len(doc) - single_size + 1)] for sh in char_shing: if sh not in shingle_id: - shingle_id[sh] = ids - id_shingle.append(sh) - ids = ids + 1 - sid.add(shingle_id[sh]) - yield sid + shingle_id[sh] = len(shingle_id) + if shingle_id[sh] >= len(shingle_matrix): + # Extend matrix, double its size + shingle_matrix = np.append(shingle_matrix, np.zeros(shingle_matrix.shape), axis=0) + + shingle_matrix[shingle_id[sh], doc_id] = 1 + + shingle_matrix = shingle_matrix[:len(shingle_id)] + + return shingle_matrix + + +def min_hash(doc: str, perm: list[str]) -> str: + for d in perm: + if d in doc: + return d + + +def compute_signature_matrix(shingles: np.ndarray, permutations_count: int) -> np.ndarray: + permutation_matrix = np.zeros((permutations_count, len(shingles))) + for i in range(permutations_count): + permutation_matrix[i] = np.random.permutation(len(shingles)) + + return permutation_matrix @ shingles def jaccard_similarity(doc1: set, doc2: set) -> float: @@ -56,14 +78,8 @@ def parse(stream, similarity: float) -> None: docs = [line.rstrip('\n') for line in stream] docs = [normalize(doc) for doc in docs] # Remove special characters and normalize accents - shingles = list(compute_shingles(docs, 5)) - - for i, doc1 in enumerate(shingles): - for j in range(i + 1, len(shingles)): - doc2 = shingles[j] - d = jaccard_similarity(doc1, doc2) - if d >= similarity: - print(f"{i} {j} {d:.06f}") + shingles = compute_shingles(docs, SHINGLE_SIZE) + signature = compute_signature_matrix(shingles, PERMUTATIONS_COUNT) def main():