From 57d53a742f8f26f01580d440d3430c2e8991ff9c Mon Sep 17 00:00:00 2001
From: Yohann D'ANELLO
Date: Fri, 29 Oct 2021 16:03:20 +0200
Subject: [PATCH] Comment code

---
 algods/algods.py | 195 +++++++++++++++++++++++++++++++++++++----------
 setup.cfg        |   1 +
 2 files changed, 156 insertions(+), 40 deletions(-)

diff --git a/algods/algods.py b/algods/algods.py
index de8c06d..43857e3 100644
--- a/algods/algods.py
+++ b/algods/algods.py
@@ -8,12 +8,23 @@ SHINGLE_SIZE = 5  # Known as k
 def parse_args(argv: dict = None) -> argparse.Namespace:
+    """
+    Parse arguments from the command line using argparse.
+    This returns an argparse namespace with the parsed arguments.
+    Raises an error if an argument is invalid.
+    The --help option is implicitly added.
+    """
     if argv is None:
         argv = sys.argv
 
-    parser = argparse.ArgumentParser(description='Exercise 1')
-    parser.add_argument('input', nargs='?', type=argparse.FileType('r'), help='Documents to read.', default=sys.stdin)
+    parser = argparse.ArgumentParser(description='Document similarity')
+    # Input document option. This can be any file descriptor, including standard input.
+    parser.add_argument('input', nargs='?', type=argparse.FileType('r'),
+                        help='Documents to read.', default=sys.stdin)
+    # Similarity threshold.
     parser.add_argument('similarity', nargs='?', type=float, help='Similarity threshold.', default=0.05)
+    # Optional. Displays a progress bar while generating and applying permutations,
+    # which is the most expensive step.
     parser.add_argument('--progress', '-p', '--tqdm', action='store_true',
                         help='Display progress bar while calculating signature matrix.')
 
@@ -23,6 +34,7 @@ def parse_args(argv: dict = None) -> argparse.Namespace:
 def normalize(doc: str) -> str:
     """
     Remove accents from letters, remove non-ascii letters, keep only letters and digits.
+    For instance, "I l0ve Pokémons & co." gives "il0vepokemonsco".
     """
     return ''.join(char for char in unicodedata.normalize(
         'NFKD', doc.casefold().replace('æ', 'ae').replace('œ', 'oe'))
@@ -31,52 +43,169 @@ def normalize(doc: str) -> str:
 def compute_shingles(docs: list[str], single_size: int) -> np.ndarray:
+    """
+    Transform a list of documents into a shingle matrix.
+    This takes as input a list of documents (strings) that are well-formatted (without any special character),
+    and the length of the shingles.
+    It outputs a NumPy boolean matrix where M(i, j) = True states that shingle i appears in document j.
+
+    Since we don't know the shingle count in advance, we regularly extend the size of the matrix, without
+    ever letting an index overflow.
+    """
+    # Initialize the shingle matrix with room for 2 shingles
     shingle_matrix = np.zeros((2, len(docs)), dtype=bool)
     shingle_id = {}
 
     for doc_id, doc in enumerate(docs):
+        # Compute the shingles of a single document
         char_shing = [doc[i:i + single_size] for i in range(len(doc) - single_size + 1)]
         for sh in char_shing:
+            # The given shingle is unknown. Register it.
             if sh not in shingle_id:
                 shingle_id[sh] = len(shingle_id)
                 if shingle_id[sh] >= len(shingle_matrix):
-                    # Extend matrix, double its size
+                    # Matrix is too small, so we double its size
                     shingle_matrix = np.append(shingle_matrix, np.zeros(shingle_matrix.shape, dtype=bool), axis=0)
 
+            # Store the information that the shingle is in the document
             shingle_matrix[shingle_id[sh], doc_id] = True
 
+    # Reduce matrix size to the useful content
     shingle_matrix = shingle_matrix[:len(shingle_id)]
 
     return shingle_matrix
 
 
-def min_hash(doc: str, perm: list[str]) -> str:
-    for d in perm:
-        if d in doc:
-            return d
+def compute_optimal_matrix_size(threshold: float) -> tuple[int, int]:
+    """
+    Compute the numbers of bands and rows for the signature matrix,
+    such that these values give a good approximation of the similarity of two
+    documents, using LSH.
+
+    Recall that the threshold for a signature matrix with b bands of r rows
+    is given by t = (1 / b) ** (1 / r).
+
+    We want this value to be lower than the expected threshold to avoid
+    false negatives, but we also want it to stay near the expected
+    value, since we want to avoid false positives as well.
+
+    Then, we ensure that the estimated threshold is between
+    2/3 * threshold and threshold.
+
+    To achieve that, we start from initial values, then we add bands if the
+    threshold is too high, or add rows per band if it is too low.
+    """
+    # Compute b and r such that 2s/3 < t < s
+    # Use at least 2 rows and 16 bands to get good values
+    rows = 2
+    bands = 16
+    est_threshold = (1 / bands) ** (1 / rows)
+    # While the estimated threshold is not acceptable
+    while not (2 * threshold / 3 < est_threshold < threshold):
+        # Threshold too high: add bands to lower it
+        if est_threshold >= threshold:
+            bands *= 2
+        # Threshold too low: add rows to raise it
+        else:
+            rows *= 2
+        est_threshold = (1 / bands) ** (1 / rows)
+
+    # The estimated threshold is now near the required threshold
+    return bands, rows
 
 
 def compute_signature_matrix(shingles: np.ndarray, permutations_count: int, display_tqdm: bool = False) -> np.ndarray:
+    """
+    Implementation of the MinHash algorithm.
+
+    We compute a signature matrix of the shingles, generated by random permutations.
+    The shingles parameter stands for the shingle boolean matrix (shingle x document),
+    where shingles[i, j] = True states that shingle i appears in document j.
+
+    The permutations_count argument indicates the number of random permutations to generate.
+
+    The output is the signature matrix, which has dimensions (permutations_count x docs_count).
+    Each permutation is generated randomly; we then take each document's first shingle under it.
+
+    While the permutation generation can be done quickly, finding the first shingle of a
+    document under a permutation (which can be achieved with an argmax over a boolean column) may be
+    quite expensive and take some time. If supported and the option is enabled, a progress bar can
+    be displayed.
+    """
     shingles_count, docs_count = shingles.shape
 
+    # Initialize matrix
     signature_matrix = np.inf * np.ones((permutations_count, docs_count))
 
     permutations_iterator = range(permutations_count)
+    # If supported, load tqdm to display the progress bar
    if display_tqdm:
         try:
             from tqdm import tqdm
-            permutations_iterator = tqdm(permutations_iterator)
+            permutations_iterator = tqdm(permutations_iterator, unit="perm.")
         except ImportError:
             print("tqdm is not installed. Please install tqdm before using --tqdm option.")
 
     for permutation_id in permutations_iterator:
+        # Generate a random permutation of the shingles
+        # This is not the most expensive task
         permutation = np.random.permutation(shingles)
+        # For each document, get the smallest shingle after permutation
+        # This is the expensive operation
         signature_matrix[permutation_id] = permutation.argmax(0)
 
     return signature_matrix
 
 
+def find_candidate_pairs(signature: np.ndarray, bands: int, rows: int) -> set[tuple[int, int]]:
+    """
+    Implementation of the LSH algorithm.
+
+    Given a signature matrix and the numbers of bands and rows per band, we want to
+    find candidate document pairs that may be similar.
+
+    We already know that the probability that two documents have the same signature
+    on a given row is equal to their Jaccard similarity.
+
+    Two documents are called a candidate pair if they have the same signature on
+    all rows of at least one band.
+
+    The output is a set of pairs of document ids.
+    """
+    _, docs_count = signature.shape
+
+    candidate_pairs = set()
+
+    for band_id in range(bands):
+        # Extract the current band
+        band = signature[band_id * rows:(band_id + 1) * rows]
+
+        buckets = {}
+
+        # Put documents into buckets
+        # A bucket is keyed by the tuple of a document's signature values within the band
+        for doc in range(docs_count):
+            sign_doc = tuple(band[:, doc])
+            buckets.setdefault(sign_doc, set())
+            buckets[sign_doc].add(doc)
+
+        # Documents sharing a bucket become candidate pairs
+        for bucket in buckets.values():
+            for doc_a in bucket:
+                for doc_b in bucket:
+                    if doc_a != doc_b:
+                        # Sort the documents for a nice output
+                        doc_a, doc_b = min(doc_a, doc_b), max(doc_a, doc_b)
+                        candidate_pairs.add((doc_a, doc_b))
+
+    return candidate_pairs
+
+
 def jaccard_similarity(doc1: set, doc2: set) -> float:
+    """
+    Compute the Jaccard similarity of two sets.
+    This is defined as 0 if both sets are empty, and |A ∩ B| / |A ∪ B| in the general case.
+    """
     if not doc1 or not doc2:
         return 0.0
 
@@ -86,46 +215,30 @@ def jaccard_similarity(doc1: set, doc2: set) -> float:
 def parse(stream, similarity: float, display_tqdm: bool = False) -> None:
-    docs = [line.rstrip('\n') for line in stream]
+    """
+    Given a stream of documents (separated by line feeds) and a similarity threshold,
+    we display on standard output an estimation of the document pairs that
+    have a Jaccard similarity higher than the requested threshold.
+
+    We use k-shingling, MinHash and LSH to compute the estimation.
+    """
+    docs = [line.rstrip('\n') for line in stream]  # Read the stream
     docs = [normalize(doc) for doc in docs]  # Remove special characters and normalize accents
 
+    # Compute k-shingles
     shingles = compute_shingles(docs, SHINGLE_SIZE)
 
-    # Compute b and r such that s/2 < t < s
-    # Use at least 2 rows and 16 bands to have good values
-    rows = 2
-    bands = 16
-    threshold = (1 / bands) ** (1 / rows)
-    while not (2 * similarity / 3 < threshold < similarity):
-        if threshold >= similarity:
-            bands *= 2
-        else:
-            rows *= 2
-        threshold = (1 / bands) ** (1 / rows)
-
+    # Compute the best numbers of bands and rows (bands * rows gives the permutations count)
+    bands, rows = compute_optimal_matrix_size(similarity)
+    # Compute the signature matrix using MinHash
     signature = compute_signature_matrix(shingles, bands * rows, display_tqdm)
 
-    candidate_pairs = set()
-
-    for band_id in range(bands):
-        band = signature[band_id * rows:(band_id + 1) * rows]
-
-        buckets = {}
-
-        for doc in range(len(docs)):
-            sign_doc = tuple(band[:, doc])
-            buckets.setdefault(sign_doc, set())
-            buckets[sign_doc].add(doc)
-
-        for bucket in buckets.values():
-            for doc_a in bucket:
-                for doc_b in bucket:
-                    if doc_a != doc_b:
-                        doc_a, doc_b = min(doc_a, doc_b), max(doc_a, doc_b)
-                        candidate_pairs.add((doc_a, doc_b))
-
+    # Guess candidate pairs using LSH
+    candidate_pairs = find_candidate_pairs(signature, bands, rows)
+    # Sort pairs for a nice output
     candidate_pairs = sorted(candidate_pairs)
 
+    # For each candidate pair, compute the true Jaccard similarity and display it
     for doc_a, doc_b in candidate_pairs:
         # Compute true jaccard similarity
         shingles_a = set(x for x in range(len(shingles)) if shingles[x, doc_a])
@@ -136,9 +249,11 @@ def parse(stream, similarity: float, display_tqdm: bool = False) -> None:
 def main():
+    # Parse arguments from the command line
     ns = parse_args()
     if not (0 < ns.similarity <= 1):
         raise ValueError(f"Invalid similiarity value: {ns.similarity}")
+    # Analyse the documents
     parse(ns.input, ns.similarity, ns.progress)
 
 
diff --git a/setup.cfg b/setup.cfg
index d91ff28..d9ccea8 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,6 @@
 [metadata]
 name = algods
+version = 1.0
 long_description = file: README.md
 long_description_content_type = text/markdown
 description = Django application to manage a network.
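
Note on the band/row selection: the loop documented in compute_optimal_matrix_size can be sanity-checked in isolation. The snippet below is a minimal standalone sketch, not part of the patch (the name pick_bands_and_rows is illustrative only); it mirrors the t = (1 / b) ** (1 / r) reasoning from the docstring. With the default similarity threshold of 0.05 from parse_args, it should settle on 512 bands of 2 rows, i.e. 1024 permutations, with an estimated threshold of roughly 0.044.

    # Standalone sketch mirroring compute_optimal_matrix_size; illustrative only.

    def pick_bands_and_rows(threshold: float) -> tuple[int, int]:
        rows, bands = 2, 16
        est_threshold = (1 / bands) ** (1 / rows)
        # Keep doubling until 2/3 * threshold < t < threshold, as in the patched function
        while not (2 * threshold / 3 < est_threshold < threshold):
            if est_threshold >= threshold:
                bands *= 2  # t too high: more bands lower it
            else:
                rows *= 2   # t too low: more rows raise it
            est_threshold = (1 / bands) ** (1 / rows)
        return bands, rows

    if __name__ == "__main__":
        bands, rows = pick_bands_and_rows(0.05)
        # Expected: bands=512, rows=2, i.e. 1024 permutations, t close to 0.044
        print(bands, rows, (1 / bands) ** (1 / rows))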