From eb754a30e5a008e4fd1c8ebba248daa324d35c48 Mon Sep 17 00:00:00 2001
From: Yohann D'ANELLO <ynerant@crans.org>
Date: Fri, 29 Oct 2021 20:15:14 +0200
Subject: [PATCH] Draw graphs

---
 algods/algods.py | 102 +++++++++++++++++++++++++++++++++++++++++++----
 setup.cfg        |   3 ++
 2 files changed, 97 insertions(+), 8 deletions(-)

diff --git a/algods/algods.py b/algods/algods.py
index 7139af8..fa48458 100644
--- a/algods/algods.py
+++ b/algods/algods.py
@@ -3,6 +3,7 @@ import unicodedata
 import sys
 from typing import Optional
 
+from matplotlib import pyplot as plt
 import numpy as np
 
 SHINGLE_SIZE = 5  # Known as k
@@ -31,6 +32,7 @@ def parse_args(argv: dict = None) -> argparse.Namespace:
     # which is the most expensive state.
     parser.add_argument('--progress', '-p', '--tqdm', action='store_true',
                         help='Display progress bar while calculating signature matrix.')
+    parser.add_argument('--graph', '-g', action='store_true', help="Draw graphs.")
 
     return parser.parse_args(argv[1:])
 
@@ -148,7 +150,7 @@ def compute_signature_matrix(shingles: np.ndarray, permutations_count: int, disp
     if display_tqdm:
         try:
             from tqdm import tqdm
-            permutations_iterator = tqdm(permutations_iterator, unit="perm.")
+            permutations_iterator = tqdm(permutations_iterator, unit="perm.", position=1)
         except ImportError:
             print("tqdm is not installed. Please install tqdm before using --tqdm option.")
 
@@ -207,6 +209,19 @@ def find_candidate_pairs(signature: np.ndarray, bands: int, rows: int) -> set[tu
     return candidate_pairs
 
 
+def shingle_set(shingles: np.ndarray, doc_id: int) -> set[int]:
+    """
+    Return the set of all shingle ids present in a document.
+    Results are cached per shingle matrix, so a new matrix never reads stale sets.
+    """
+    if getattr(shingle_set, '_matrix', None) is not shingles:
+        shingle_set._matrix, shingle_set._cache = shingles, {}
+    if doc_id not in shingle_set._cache:
+        shingle_set._cache[doc_id] = set(np.flatnonzero(shingles[:, doc_id]).tolist())
+
+    return shingle_set._cache[doc_id]
+
+
 def jaccard_similarity(doc1: set, doc2: set) -> float:
     """
     Compute jaccard similarity of two sets.
@@ -220,7 +235,7 @@ def jaccard_similarity(doc1: set, doc2: set) -> float:
     return len(inter) / len(union)
 
 
-def parse(stream, similarity: float, *, stats: bool = False, display_tqdm: bool = False) \
+def parse(stream, similarity: float, *, stats: bool = False, display_tqdm: bool = False, verbose: bool = True) \
         -> Optional[tuple[int, int, int, int]]:
     """
     Given a stream of documents (separated by line feeds) and a similarity threshold,
@@ -234,7 +249,11 @@ def parse(stream, similarity: float, *, stats: bool = False, display_tqdm: bool
 
     # Compute k-shingles
     shingles = compute_shingles(docs, SHINGLE_SIZE)
+    return parse_shingles(docs, shingles, similarity, stats=stats, display_tqdm=display_tqdm, verbose=verbose)
 
+
+def parse_shingles(docs: list[str], shingles: np.ndarray, similarity: float, *, stats: bool = False,
+                   display_tqdm: bool = False, verbose: bool = True) -> Optional[tuple[int, int, int, int]]:
     # Compute best values for permutations count
     bands, rows = compute_optimal_matrix_size(similarity)
     # Compute signature matrix using MinHash
@@ -250,14 +269,14 @@ def parse(stream, similarity: float, *, stats: bool = False, display_tqdm: bool
     fp = 0
 
     # For each document pair, compute true Jaccard similarity and display it
-    shingles_set = [set(x for x in range(len(shingles)) if shingles[x, doc]) for doc in range(len(docs))]
     for doc_a, doc_b in candidate_pairs:
         # Compute true jaccard similarity
-        shingles_a = shingles_set[doc_a]
-        shingles_b = shingles_set[doc_b]
+        shingles_a = shingle_set(shingles, doc_a)
+        shingles_b = shingle_set(shingles, doc_b)
         d = jaccard_similarity(shingles_a, shingles_b)
         if d >= similarity:
-            print(f"{doc_a} {doc_b} {d:.06f}")
+            if verbose:
+                print(f"{doc_a} {doc_b} {d:.06f}")
             tp += 1
         else:
             fp += 1
@@ -270,8 +289,8 @@ def parse(stream, similarity: float, *, stats: bool = False, display_tqdm: bool
         for doc_a in range(len(docs)):
             for doc_b in range(doc_a + 1, len(docs)):
                 # Compute true jaccard similarity
-                shingles_a = shingles_set[doc_a]
-                shingles_b = shingles_set[doc_b]
+                shingles_a = shingle_set(shingles, doc_a)
+                shingles_b = shingle_set(shingles, doc_b)
                 d = jaccard_similarity(shingles_a, shingles_b)
                 if d >= similarity and (doc_a, doc_b) not in candidate_pairs:
                     fn += 1
@@ -285,6 +304,10 @@ def main():
     # Parse arguments from command line
     ns = parse_args()
 
+    if ns.graph:
+        # Graph mode only draws the statistics plots; the regular similarity search is skipped
+        return graph(ns.input, ns.progress)
+
     if not (0 < ns.similarity <= 1):
         raise ValueError(f"Invalid similiarity value: {ns.similarity}")
 
@@ -302,3 +325,66 @@ def main():
         fp_rate = fp / (fp + tn)
         print(f"True positive rate: {tp_rate:.06f}", file=sys.stderr)
         print(f"False positive rate: {fp_rate:.06f}", file=sys.stderr)
+
+
+def graph(stream, display_tqdm: bool = False) -> None:
+    """
+    Draw statistic graphs about false-positive and true-positive rates using matplotlib.
+    The documents read from ``stream`` are evaluated at every threshold from ``step`` to 1 (inclusive);
+    the raw counters are printed, then three figures are shown one after the other.
+    """
+    docs = [line.rstrip('\n') for line in stream]  # Read stream
+    docs = [normalize(doc) for doc in docs]  # Remove special characters and normalize accents
+
+    # Compute k-shingles
+    shingles = compute_shingles(docs, SHINGLE_SIZE)
+
+    step = 0.05
+    # Not ``1 // step``: float floor division yields 19.0 here, which would skip threshold 1.0
+    n = round(1 / step)
+
+    tps, fps, tns, fns = [], [], [], []
+
+    step_iterator = range(1, n + 1)
+    if display_tqdm:
+        try:
+            from tqdm import tqdm
+            # Outer bar on line 0: the signature matrix bar is drawn at position 1
+            step_iterator = tqdm(step_iterator, position=0)
+        except ImportError:
+            print("tqdm is not installed. Please install tqdm before using --tqdm option.")
+            display_tqdm = False  # Don't repeat this warning for every threshold
+
+    for i in step_iterator:
+        t = i * step
+        tp, fp, tn, fn = parse_shingles(docs, shingles, t, stats=True, display_tqdm=display_tqdm, verbose=False)
+        tps.append(tp)
+        fps.append(fp)
+        tns.append(tn)
+        fns.append(fn)
+
+    tps, fps, tns, fns = np.array(tps), np.array(fps), np.array(tns), np.array(fns)
+    # A zero denominator gives NaN and log(0) gives -inf: keep those values, only silence the warnings
+    with np.errstate(divide="ignore", invalid="ignore"):
+        tps_rate = tps / (tps + fns)
+        fps_rate = fps / (fps + tns)
+        log_fps_rate = np.log(fps_rate)
+
+    print("tps = np.array(", tps, ")")
+    print("fps = np.array(", fps, ")")
+    print("tns = np.array(", tns, ")")
+    print("fns = np.array(", fns, ")")
+
+    x_axis = step * np.arange(1, n + 1)
+    # One figure per curve, shown one after the other: (values, y label, title)
+    for rates, label, title in (
+        (tps_rate, "True positive rate", "True positive rate per threshold value"),
+        (fps_rate, "False positive rate", "False positive rate per threshold value"),
+        (log_fps_rate, "False positive rate (log scale)",
+         "False positive rate per threshold value (logarithmic scale)"),
+    ):
+        plt.plot(x_axis, rates, '*')
+        plt.xlabel("Threshold value")
+        plt.ylabel(label)
+        plt.title(title)
+        plt.show()
diff --git a/setup.cfg b/setup.cfg
index d9ccea8..5092a9a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -22,6 +22,9 @@ classifiers =
 packages = find:
 include_package_data = True
 python_requires = >=3.6
+install_requires =
+    numpy
+    matplotlib
 
 [options.entry_points]
 console_scripts =