From eb754a30e5a008e4fd1c8ebba248daa324d35c48 Mon Sep 17 00:00:00 2001 From: Yohann D'ANELLO Date: Fri, 29 Oct 2021 20:15:14 +0200 Subject: [PATCH] Draw graphs --- algods/algods.py | 102 +++++++++++++++++++++++++++++++++++++++++++---- setup.cfg | 3 ++ 2 files changed, 97 insertions(+), 8 deletions(-) diff --git a/algods/algods.py b/algods/algods.py index 7139af8..fa48458 100644 --- a/algods/algods.py +++ b/algods/algods.py @@ -3,6 +3,7 @@ import unicodedata import sys from typing import Optional +from matplotlib import pyplot as plt import numpy as np SHINGLE_SIZE = 5 # Known as k @@ -31,6 +32,7 @@ def parse_args(argv: dict = None) -> argparse.Namespace: # which is the most expensive state. parser.add_argument('--progress', '-p', '--tqdm', action='store_true', help='Display progress bar while calculating signature matrix.') + parser.add_argument('--graph', '-g', action='store_true', help="Draw graphs.") return parser.parse_args(argv[1:]) @@ -148,7 +150,7 @@ def compute_signature_matrix(shingles: np.ndarray, permutations_count: int, disp if display_tqdm: try: from tqdm import tqdm - permutations_iterator = tqdm(permutations_iterator, unit="perm.") + permutations_iterator = tqdm(permutations_iterator, unit="perm.", position=1) except ImportError: print("tqdm is not installed. Please install tqdm before using --tqdm option.") @@ -207,6 +209,19 @@ def find_candidate_pairs(signature: np.ndarray, bands: int, rows: int) -> set[tu return candidate_pairs +def shingle_set(shingles: np.ndarray, doc_id: int) -> set[int]: + """ + Return the set of all shingle id from a document. + To don't recompute multiple times this, this is cached. + """ + if not hasattr(shingle_set, '_cache'): + shingle_set._cache = {} + if doc_id not in shingle_set._cache: + shingle_set._cache[doc_id] = set(x for x in range(len(shingles)) if shingles[x, doc_id]) + + return shingle_set._cache[doc_id] + + def jaccard_similarity(doc1: set, doc2: set) -> float: """ Compute jaccard similarity of two sets. @@ -220,7 +235,7 @@ def jaccard_similarity(doc1: set, doc2: set) -> float: return len(inter) / len(union) -def parse(stream, similarity: float, *, stats: bool = False, display_tqdm: bool = False) \ +def parse(stream, similarity: float, *, stats: bool = False, display_tqdm: bool = False, verbose: bool = True) \ -> Optional[tuple[int, int, int, int]]: """ Given a stream of documents (separated by line feeds) and a similarity threshold, @@ -234,7 +249,11 @@ def parse(stream, similarity: float, *, stats: bool = False, display_tqdm: bool # Compute k-shingles shingles = compute_shingles(docs, SHINGLE_SIZE) + return parse_shingles(docs, shingles, similarity, stats=stats, display_tqdm=display_tqdm, verbose=verbose) + +def parse_shingles(docs: list[str], shingles: np.ndarray, similarity: float, *, stats: bool = False, + display_tqdm: bool = False, verbose: bool = True) -> Optional[tuple[int, int, int, int]]: # Compute best values for permutations count bands, rows = compute_optimal_matrix_size(similarity) # Compute signature matrix using MinHash @@ -250,14 +269,14 @@ def parse(stream, similarity: float, *, stats: bool = False, display_tqdm: bool fp = 0 # For each document pair, compute true Jaccard similarity and display it - shingles_set = [set(x for x in range(len(shingles)) if shingles[x, doc]) for doc in range(len(docs))] for doc_a, doc_b in candidate_pairs: # Compute true jaccard similarity - shingles_a = shingles_set[doc_a] - shingles_b = shingles_set[doc_b] + shingles_a = shingle_set(shingles, doc_a) + shingles_b = shingle_set(shingles, doc_b) d = jaccard_similarity(shingles_a, shingles_b) if d >= similarity: - print(f"{doc_a} {doc_b} {d:.06f}") + if verbose: + print(f"{doc_a} {doc_b} {d:.06f}") tp += 1 else: fp += 1 @@ -270,8 +289,8 @@ def parse(stream, similarity: float, *, stats: bool = False, display_tqdm: bool for doc_a in range(len(docs)): for doc_b in range(doc_a + 1, len(docs)): # Compute true jaccard similarity - shingles_a = shingles_set[doc_a] - shingles_b = shingles_set[doc_b] + shingles_a = shingle_set(shingles, doc_a) + shingles_b = shingle_set(shingles, doc_b) d = jaccard_similarity(shingles_a, shingles_b) if d >= similarity and (doc_a, doc_b) not in candidate_pairs: fn += 1 @@ -285,6 +304,10 @@ def main(): # Parse arguments from command line ns = parse_args() + if ns.graph: + # Don't use the program to compute something + return graph(ns.input, ns.progress) + if not (0 < ns.similarity <= 1): raise ValueError(f"Invalid similiarity value: {ns.similarity}") @@ -302,3 +325,66 @@ def main(): fp_rate = fp / (fp + tn) print(f"True positive rate: {tp_rate:.06f}", file=sys.stderr) print(f"False positive rate: {fp_rate:.06f}", file=sys.stderr) + + +def graph(stream, display_tqdm: bool = False) -> None: + """ + Draw statistic graphs about false-positive and true positive rates using matplotlib. + """ + docs = [line.rstrip('\n') for line in stream] # Read stream + docs = [normalize(doc) for doc in docs] # Remove special characters and normalize accents + + # Compute k-shingles + shingles = compute_shingles(docs, SHINGLE_SIZE) + + step = 0.05 + n = int(1 // step) + + tps, fps, tns, fns = [], [], [], [] + + step_iterator = range(1, n + 1) + if display_tqdm: + from tqdm import tqdm + step_iterator = tqdm(step_iterator, position=1) + + for i in step_iterator: + t = i * step + tp, fp, tn, fn = parse_shingles(docs, shingles, t, stats=True, display_tqdm=display_tqdm, verbose=False) + tps.append(tp) + fps.append(fp) + tns.append(tn) + fns.append(fn) + + tps = np.array(tps) + fps = np.array(fps) + tns = np.array(tns) + fns = np.array(fns) + + tps_rate = tps / (tps + fns) + fps_rate = fps / (fps + tns) + + print("tps = np.array(", tps, ")") + print("fps = np.array(", fps, ")") + print("tns = np.array(", tns, ")") + print("fns = np.array(", fns, ")") + + x_axis = step * np.array(range(1, n + 1)) + + plt.plot(x_axis, tps_rate, '*') + plt.xlabel("Threshold value") + plt.ylabel("True positive rate") + plt.title("True positive rate per threshold value") + plt.show() + + plt.plot(x_axis, fps_rate, '*') + plt.xlabel("Threshold value") + plt.ylabel("False positive rate") + plt.title("False positive rate per threshold value") + plt.show() + + plt.plot(x_axis, np.log(fps_rate), '*') + plt.xlabel("Threshold value") + plt.ylabel("False positive rate (log scale)") + plt.title("False positive rate per threshold value (logarithmic scale)") + + plt.show() diff --git a/setup.cfg b/setup.cfg index d9ccea8..5092a9a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -22,6 +22,9 @@ classifiers = packages = find: include_package_data = True python_requires = >=3.6 +install_requires = + numpy + matplotlib [options.entry_points] console_scripts =