Draw graphs
This commit is contained in:
parent
f438ae6c79
commit
eb754a30e5
102
algods/algods.py
102
algods/algods.py
|
@ -3,6 +3,7 @@ import unicodedata
|
||||||
import sys
|
import sys
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
from matplotlib import pyplot as plt
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
SHINGLE_SIZE = 5 # Known as k
|
SHINGLE_SIZE = 5 # Known as k
|
||||||
|
@ -31,6 +32,7 @@ def parse_args(argv: dict = None) -> argparse.Namespace:
|
||||||
# which is the most expensive state.
|
# which is the most expensive state.
|
||||||
parser.add_argument('--progress', '-p', '--tqdm', action='store_true',
|
parser.add_argument('--progress', '-p', '--tqdm', action='store_true',
|
||||||
help='Display progress bar while calculating signature matrix.')
|
help='Display progress bar while calculating signature matrix.')
|
||||||
|
parser.add_argument('--graph', '-g', action='store_true', help="Draw graphs.")
|
||||||
|
|
||||||
return parser.parse_args(argv[1:])
|
return parser.parse_args(argv[1:])
|
||||||
|
|
||||||
|
@ -148,7 +150,7 @@ def compute_signature_matrix(shingles: np.ndarray, permutations_count: int, disp
|
||||||
if display_tqdm:
|
if display_tqdm:
|
||||||
try:
|
try:
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
permutations_iterator = tqdm(permutations_iterator, unit="perm.")
|
permutations_iterator = tqdm(permutations_iterator, unit="perm.", position=1)
|
||||||
except ImportError:
|
except ImportError:
|
||||||
print("tqdm is not installed. Please install tqdm before using --tqdm option.")
|
print("tqdm is not installed. Please install tqdm before using --tqdm option.")
|
||||||
|
|
||||||
|
@ -207,6 +209,19 @@ def find_candidate_pairs(signature: np.ndarray, bands: int, rows: int) -> set[tu
|
||||||
return candidate_pairs
|
return candidate_pairs
|
||||||
|
|
||||||
|
|
||||||
|
def shingle_set(shingles: np.ndarray, doc_id: int) -> set[int]:
    """
    Return the set of all shingle ids contained in a document.

    A shingle id is a row index ``x`` of ``shingles`` such that
    ``shingles[x, doc_id]`` is truthy.

    The result is cached to avoid recomputing it for every document pair.
    The cache is only valid for one shingle matrix: it is reset whenever a
    different matrix object is given, so that stale sets computed from a
    previous matrix are never returned.

    :param shingles: Shingle matrix, indexed as ``shingles[shingle_id, doc_id]``.
    :param doc_id: Column index of the document in the shingle matrix.
    :return: The cached set of shingle ids of the document. The same set
             object is returned on every call; callers must not mutate it.
    """
    # The cache holds a reference to the matrix it was built from. Comparing by
    # identity is cheap and, because the reference keeps the matrix alive,
    # immune to id() reuse after garbage collection.
    if getattr(shingle_set, '_cache_owner', None) is not shingles or not hasattr(shingle_set, '_cache'):
        shingle_set._cache_owner = shingles
        shingle_set._cache = {}

    cache = shingle_set._cache
    if doc_id not in cache:
        # Row indices of the non-zero entries of the document column,
        # converted to plain Python ints.
        cache[doc_id] = set(np.flatnonzero(shingles[:, doc_id]).tolist())

    return cache[doc_id]
|
||||||
|
|
||||||
|
|
||||||
def jaccard_similarity(doc1: set, doc2: set) -> float:
|
def jaccard_similarity(doc1: set, doc2: set) -> float:
|
||||||
"""
|
"""
|
||||||
Compute jaccard similarity of two sets.
|
Compute jaccard similarity of two sets.
|
||||||
|
@ -220,7 +235,7 @@ def jaccard_similarity(doc1: set, doc2: set) -> float:
|
||||||
return len(inter) / len(union)
|
return len(inter) / len(union)
|
||||||
|
|
||||||
|
|
||||||
def parse(stream, similarity: float, *, stats: bool = False, display_tqdm: bool = False) \
|
def parse(stream, similarity: float, *, stats: bool = False, display_tqdm: bool = False, verbose: bool = True) \
|
||||||
-> Optional[tuple[int, int, int, int]]:
|
-> Optional[tuple[int, int, int, int]]:
|
||||||
"""
|
"""
|
||||||
Given a stream of documents (separated by line feeds) and a similarity threshold,
|
Given a stream of documents (separated by line feeds) and a similarity threshold,
|
||||||
|
@ -234,7 +249,11 @@ def parse(stream, similarity: float, *, stats: bool = False, display_tqdm: bool
|
||||||
|
|
||||||
# Compute k-shingles
|
# Compute k-shingles
|
||||||
shingles = compute_shingles(docs, SHINGLE_SIZE)
|
shingles = compute_shingles(docs, SHINGLE_SIZE)
|
||||||
|
return parse_shingles(docs, shingles, similarity, stats=stats, display_tqdm=display_tqdm, verbose=verbose)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_shingles(docs: list[str], shingles: np.ndarray, similarity: float, *, stats: bool = False,
|
||||||
|
display_tqdm: bool = False, verbose: bool = True) -> Optional[tuple[int, int, int, int]]:
|
||||||
# Compute best values for permutations count
|
# Compute best values for permutations count
|
||||||
bands, rows = compute_optimal_matrix_size(similarity)
|
bands, rows = compute_optimal_matrix_size(similarity)
|
||||||
# Compute signature matrix using MinHash
|
# Compute signature matrix using MinHash
|
||||||
|
@ -250,14 +269,14 @@ def parse(stream, similarity: float, *, stats: bool = False, display_tqdm: bool
|
||||||
fp = 0
|
fp = 0
|
||||||
|
|
||||||
# For each document pair, compute true Jaccard similarity and display it
|
# For each document pair, compute true Jaccard similarity and display it
|
||||||
shingles_set = [set(x for x in range(len(shingles)) if shingles[x, doc]) for doc in range(len(docs))]
|
|
||||||
for doc_a, doc_b in candidate_pairs:
|
for doc_a, doc_b in candidate_pairs:
|
||||||
# Compute true jaccard similarity
|
# Compute true jaccard similarity
|
||||||
shingles_a = shingles_set[doc_a]
|
shingles_a = shingle_set(shingles, doc_a)
|
||||||
shingles_b = shingles_set[doc_b]
|
shingles_b = shingle_set(shingles, doc_b)
|
||||||
d = jaccard_similarity(shingles_a, shingles_b)
|
d = jaccard_similarity(shingles_a, shingles_b)
|
||||||
if d >= similarity:
|
if d >= similarity:
|
||||||
print(f"{doc_a} {doc_b} {d:.06f}")
|
if verbose:
|
||||||
|
print(f"{doc_a} {doc_b} {d:.06f}")
|
||||||
tp += 1
|
tp += 1
|
||||||
else:
|
else:
|
||||||
fp += 1
|
fp += 1
|
||||||
|
@ -270,8 +289,8 @@ def parse(stream, similarity: float, *, stats: bool = False, display_tqdm: bool
|
||||||
for doc_a in range(len(docs)):
|
for doc_a in range(len(docs)):
|
||||||
for doc_b in range(doc_a + 1, len(docs)):
|
for doc_b in range(doc_a + 1, len(docs)):
|
||||||
# Compute true jaccard similarity
|
# Compute true jaccard similarity
|
||||||
shingles_a = shingles_set[doc_a]
|
shingles_a = shingle_set(shingles, doc_a)
|
||||||
shingles_b = shingles_set[doc_b]
|
shingles_b = shingle_set(shingles, doc_b)
|
||||||
d = jaccard_similarity(shingles_a, shingles_b)
|
d = jaccard_similarity(shingles_a, shingles_b)
|
||||||
if d >= similarity and (doc_a, doc_b) not in candidate_pairs:
|
if d >= similarity and (doc_a, doc_b) not in candidate_pairs:
|
||||||
fn += 1
|
fn += 1
|
||||||
|
@ -285,6 +304,10 @@ def main():
|
||||||
# Parse arguments from command line
|
# Parse arguments from command line
|
||||||
ns = parse_args()
|
ns = parse_args()
|
||||||
|
|
||||||
|
if ns.graph:
|
||||||
|
# Graph mode: draw the graphs and return without running the normal computation
|
||||||
|
return graph(ns.input, ns.progress)
|
||||||
|
|
||||||
if not (0 < ns.similarity <= 1):
|
if not (0 < ns.similarity <= 1):
|
||||||
raise ValueError(f"Invalid similiarity value: {ns.similarity}")
|
raise ValueError(f"Invalid similiarity value: {ns.similarity}")
|
||||||
|
|
||||||
|
@ -302,3 +325,66 @@ def main():
|
||||||
fp_rate = fp / (fp + tn)
|
fp_rate = fp / (fp + tn)
|
||||||
print(f"True positive rate: {tp_rate:.06f}", file=sys.stderr)
|
print(f"True positive rate: {tp_rate:.06f}", file=sys.stderr)
|
||||||
print(f"False positive rate: {fp_rate:.06f}", file=sys.stderr)
|
print(f"False positive rate: {fp_rate:.06f}", file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
|
def graph(stream, display_tqdm: bool = False) -> None:
    """
    Draw statistic graphs about false-positive and true-positive rates using matplotlib.

    The documents are read from ``stream`` (one per line), normalized and
    shingled once. ``parse_shingles`` is then run in statistics mode for each
    threshold ``step, 2 * step, ..., n * step``. The collected counts are
    printed to the standard output, then three graphs are shown one after the
    other: true positive rate, false positive rate, and natural logarithm of
    the false positive rate, each against the threshold value.

    :param stream: Iterable of lines, one document per line.
    :param display_tqdm: Display progress bars. This requires tqdm; if it is
                         not installed, a message is printed and the graphs
                         are computed without progress bars.
    """
    docs = [line.rstrip('\n') for line in stream]  # Read stream
    docs = [normalize(doc) for doc in docs]  # Remove special characters and normalize accents

    # Compute k-shingles once, they are reused for every threshold
    shingles = compute_shingles(docs, SHINGLE_SIZE)

    step = 0.05
    # NOTE(review): with floats, 1 // 0.05 == 19.0, so the evaluated thresholds
    # stop at 0.95 and 1.0 is never tested -- confirm this is intended.
    n = int(1 // step)

    step_iterator = range(1, n + 1)
    if display_tqdm:
        try:
            from tqdm import tqdm
        except ImportError:
            print("tqdm is not installed. Please install tqdm before using --tqdm option.")
            # Don't ask the inner computation for a progress bar it can't display either
            display_tqdm = False
        else:
            # Outer bar on the first line: the signature matrix bar uses position 1
            step_iterator = tqdm(step_iterator, position=0)

    tps, fps, tns, fns = [], [], [], []
    for i in step_iterator:
        t = i * step
        tp, fp, tn, fn = parse_shingles(docs, shingles, t, stats=True, display_tqdm=display_tqdm, verbose=False)
        tps.append(tp)
        fps.append(fp)
        tns.append(tn)
        fns.append(fn)

    tps = np.array(tps)
    fps = np.array(fps)
    tns = np.array(tns)
    fns = np.array(fns)

    # A threshold without any real positive (or negative) pair gives a zero
    # denominator, and a zero rate has no finite logarithm. The resulting
    # nan/inf values are kept as they are, only the numpy warnings are silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        tps_rate = tps / (tps + fns)
        fps_rate = fps / (fps + tns)
        log_fps_rate = np.log(fps_rate)

    print("tps = np.array(", tps, ")")
    print("fps = np.array(", fps, ")")
    print("tns = np.array(", tns, ")")
    print("fns = np.array(", fns, ")")

    x_axis = step * np.arange(1, n + 1)

    _plot_rate(x_axis, tps_rate, "True positive rate", "True positive rate per threshold value")
    _plot_rate(x_axis, fps_rate, "False positive rate", "False positive rate per threshold value")
    _plot_rate(x_axis, log_fps_rate, "False positive rate (log scale)",
               "False positive rate per threshold value (logarithmic scale)")


def _plot_rate(x_axis, rates, ylabel: str, title: str) -> None:
    """
    Show one graph of ``rates`` against the threshold values, drawn with star markers.
    """
    plt.plot(x_axis, rates, '*')
    plt.xlabel("Threshold value")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()
|
||||||
|
|
|
@ -22,6 +22,9 @@ classifiers =
|
||||||
packages = find:
|
packages = find:
|
||||||
include_package_data = True
|
include_package_data = True
|
||||||
python_requires = >=3.6
|
python_requires = >=3.6
|
||||||
|
install_requires =
|
||||||
|
numpy
|
||||||
|
matplotlib
|
||||||
|
|
||||||
[options.entry_points]
|
[options.entry_points]
|
||||||
console_scripts =
|
console_scripts =
|
||||||
|
|
Loading…
Reference in New Issue