From d767e2029b1795ed4650b6118cd6fce244c3cda2 Mon Sep 17 00:00:00 2001
From: Yohann D'ANELLO
Date: Fri, 29 Oct 2021 17:00:07 +0200
Subject: [PATCH] Print true/false positive/negative stats

Add a --stats/-s flag. When it is set, parse() also compares every
document pair with the candidate pairs and returns the
(tp, fp, tn, fn) counts; otherwise it returns None. main() prints the
counts and the true/false positive rates on stderr. A rate whose
denominator is zero is reported as 0 instead of raising
ZeroDivisionError.
---
 algods/algods.py | 51 +++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 47 insertions(+), 4 deletions(-)

diff --git a/algods/algods.py b/algods/algods.py
index 43857e3..ab6bc10 100644
--- a/algods/algods.py
+++ b/algods/algods.py
@@ -1,6 +1,7 @@
 import argparse
 import unicodedata
 import sys
+from typing import Optional
 
 import numpy as np
 
@@ -23,6 +24,9 @@ def parse_args(argv: dict = None) -> argparse.Namespace:
                         help='Documents to read.', default=sys.stdin)
     # Give similarity threshold.
     parser.add_argument('similarity', nargs='?', type=float, help='Similarity threshold.', default=0.05)
+    # Optional. Give statistics about true and false positive/negative rates, which take some time.
+    parser.add_argument('--stats', '-s', action='store_true',
+                        help='Display some statistics.')
     # Optional. Let to display a progress bar while generating and applying permutations,
     # which is the most expensive state.
     parser.add_argument('--progress', '-p', '--tqdm', action='store_true',
@@ -214,7 +218,8 @@ def jaccard_similarity(doc1: set, doc2: set) -> float:
     return len(inter) / len(union)
 
 
-def parse(stream, similarity: float, display_tqdm: bool = False) -> None:
+def parse(stream, similarity: float, *, stats: bool = False, display_tqdm: bool = False) \
+        -> Optional[tuple[int, int, int, int]]:
     """
     Given a stream of documents (separated by line feeds) and a similarity threshold,
     we display in standard output an estimation of document pairs that
@@ -238,14 +243,40 @@ def parse(stream, similarity: float, display_tqdm: bool = False) -> None:
     # Sort pairs for a nice output
     candidate_pairs = sorted(candidate_pairs)
 
+    # Compute true and false positive counts
+    tp = 0
+    fp = 0
+
     # For each document pair, compute true Jaccard similarity and display it
+    shingles_set = [set(x for x in range(len(shingles)) if shingles[x, doc]) for doc in range(len(docs))]
     for doc_a, doc_b in candidate_pairs:
         # Compute true jaccard similarity
-        shingles_a = set(x for x in range(len(shingles)) if shingles[x, doc_a])
-        shingles_b = set(x for x in range(len(shingles)) if shingles[x, doc_b])
+        shingles_a = shingles_set[doc_a]
+        shingles_b = shingles_set[doc_b]
         d = jaccard_similarity(shingles_a, shingles_b)
         if d >= similarity:
             print(f"{doc_a} {doc_b} {d:.06f}")
+            tp += 1
+        else:
+            fp += 1
+
+    if stats:
+        # Compute true and false negative counts, for validation only
+        tn = 0
+        fn = 0
+
+        for doc_a in range(len(docs)):
+            for doc_b in range(doc_a + 1, len(docs)):
+                # Compute true jaccard similarity
+                shingles_a = shingles_set[doc_a]
+                shingles_b = shingles_set[doc_b]
+                d = jaccard_similarity(shingles_a, shingles_b)
+                if d >= similarity and (doc_a, doc_b) not in candidate_pairs:
+                    fn += 1
+                elif d < similarity and (doc_a, doc_b) not in candidate_pairs:
+                    tn += 1
+
+        return tp, fp, tn, fn
 
 
 def main():
@@ -256,4 +287,16 @@ def main():
         raise ValueError(f"Invalid similiarity value: {ns.similarity}")
 
     # Analyse documents
-    parse(ns.input, ns.similarity, ns.progress)
+    output = parse(ns.input, ns.similarity, stats=ns.stats, display_tqdm=ns.progress)
+
+    if ns.stats:
+        tp, fp, tn, fn = output
+        print(f"True positive: {tp}", file=sys.stderr)
+        print(f"False positive: {fp}", file=sys.stderr)
+        print(f"True negative: {tn}", file=sys.stderr)
+        print(f"False negative: {fn}", file=sys.stderr)
+
+        tp_rate = tp / (tp + fn) if tp + fn else 0.0
+        fp_rate = fp / (fp + tn) if fp + tn else 0.0
+        print(f"True positive rate: {tp_rate:.06f}", file=sys.stderr)
+        print(f"False positive rate: {fp_rate:.06f}", file=sys.stderr)