Comment code
This commit is contained in:
parent
6eba4961f3
commit
57d53a742f
195
algods/algods.py
195
algods/algods.py
|
@ -8,12 +8,23 @@ SHINGLE_SIZE = 5 # Known as k
|
||||||
|
|
||||||
|
|
||||||
def parse_args(argv: dict = None) -> argparse.Namespace:
|
def parse_args(argv: dict = None) -> argparse.Namespace:
|
||||||
|
"""
|
||||||
|
Parse arguments from the command line using argparse.
|
||||||
|
This returns an Argparse namespace with the parsed arguments.
|
||||||
|
Raises an error if an argument is invalid.
|
||||||
|
--help option is implicitly added.
|
||||||
|
"""
|
||||||
if argv is None:
|
if argv is None:
|
||||||
argv = sys.argv
|
argv = sys.argv
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description='Exercise 1')
|
parser = argparse.ArgumentParser(description='Document similarity')
|
||||||
parser.add_argument('input', nargs='?', type=argparse.FileType('r'), help='Documents to read.', default=sys.stdin)
|
# Input document option. Can be any readable file, including standard input.
|
||||||
|
parser.add_argument('input', nargs='?', type=argparse.FileType('r'),
|
||||||
|
help='Documents to read.', default=sys.stdin)
|
||||||
|
# Give similarity threshold.
|
||||||
parser.add_argument('similarity', nargs='?', type=float, help='Similarity threshold.', default=0.05)
|
parser.add_argument('similarity', nargs='?', type=float, help='Similarity threshold.', default=0.05)
|
||||||
|
# Optional. Displays a progress bar while generating and applying permutations,
|
||||||
|
# which is the most expensive stage.
|
||||||
parser.add_argument('--progress', '-p', '--tqdm', action='store_true',
|
parser.add_argument('--progress', '-p', '--tqdm', action='store_true',
|
||||||
help='Display progress bar while calculating signature matrix.')
|
help='Display progress bar while calculating signature matrix.')
|
||||||
|
|
||||||
|
@ -23,6 +34,7 @@ def parse_args(argv: dict = None) -> argparse.Namespace:
|
||||||
def normalize(doc: str) -> str:
|
def normalize(doc: str) -> str:
|
||||||
"""
|
"""
|
||||||
Remove accents from letters, remove non-ascii letters, keep only letters and digits.
|
Remove accents from letters, remove non-ascii letters, keep only letters and digits.
|
||||||
|
For instance, "I l0ve Pokémons & co." gives "il0vepokemonsco".
|
||||||
"""
|
"""
|
||||||
return ''.join(char for char in unicodedata.normalize(
|
return ''.join(char for char in unicodedata.normalize(
|
||||||
'NFKD', doc.casefold().replace('æ', 'ae').replace('œ', 'oe'))
|
'NFKD', doc.casefold().replace('æ', 'ae').replace('œ', 'oe'))
|
||||||
|
@ -31,52 +43,169 @@ def normalize(doc: str) -> str:
|
||||||
|
|
||||||
|
|
||||||
def compute_shingles(docs: list[str], single_size: int) -> np.ndarray:
|
def compute_shingles(docs: list[str], single_size: int) -> np.ndarray:
|
||||||
|
"""
|
||||||
|
Transform a list of documents into a shingle matrix.
|
||||||
|
This takes as input a list of documents (strings) that are well-formatted (without any special characters),
|
||||||
|
and the length of the shingles.
|
||||||
|
It outputs a Numpy boolean matrix where M(i, j) = True states that the shingle i appears in the document j.
|
||||||
|
|
||||||
|
Since we don't know the shingle count in advance, we regularly extend the size of the matrix, without
|
||||||
|
generating an overflow.
|
||||||
|
"""
|
||||||
|
# Initialize the shingle matrix with 2 shingles
|
||||||
shingle_matrix = np.zeros((2, len(docs)), dtype=bool)
|
shingle_matrix = np.zeros((2, len(docs)), dtype=bool)
|
||||||
shingle_id = {}
|
shingle_id = {}
|
||||||
|
|
||||||
for doc_id, doc in enumerate(docs):
|
for doc_id, doc in enumerate(docs):
|
||||||
|
# Compute different shingles for a single document
|
||||||
char_shing = [doc[i:i + single_size] for i in range(len(doc) - single_size + 1)]
|
char_shing = [doc[i:i + single_size] for i in range(len(doc) - single_size + 1)]
|
||||||
for sh in char_shing:
|
for sh in char_shing:
|
||||||
|
# The given shingle is unknown. Register it
|
||||||
if sh not in shingle_id:
|
if sh not in shingle_id:
|
||||||
shingle_id[sh] = len(shingle_id)
|
shingle_id[sh] = len(shingle_id)
|
||||||
if shingle_id[sh] >= len(shingle_matrix):
|
if shingle_id[sh] >= len(shingle_matrix):
|
||||||
# Extend matrix, double its size
|
# Matrix is too small, so we double its size
|
||||||
shingle_matrix = np.append(shingle_matrix, np.zeros(shingle_matrix.shape, dtype=bool), axis=0)
|
shingle_matrix = np.append(shingle_matrix, np.zeros(shingle_matrix.shape, dtype=bool), axis=0)
|
||||||
|
|
||||||
|
# Store the information that the shingle is in the document
|
||||||
shingle_matrix[shingle_id[sh], doc_id] = True
|
shingle_matrix[shingle_id[sh], doc_id] = True
|
||||||
|
|
||||||
|
# Reduce matrix size to useful content
|
||||||
shingle_matrix = shingle_matrix[:len(shingle_id)]
|
shingle_matrix = shingle_matrix[:len(shingle_id)]
|
||||||
|
|
||||||
return shingle_matrix
|
return shingle_matrix
|
||||||
|
|
||||||
|
|
||||||
def min_hash(doc: str, perm: list[str]) -> str:
|
def compute_optimal_matrix_size(threshold: float) -> tuple[int, int]:
|
||||||
for d in perm:
|
"""
|
||||||
if d in doc:
|
Compute bands and rows number for the signature matrix
|
||||||
return d
|
such that these values allow an approximation of the similarity of two
|
||||||
|
documents, using LSH.
|
||||||
|
|
||||||
|
Recall that the threshold for a signature matrix with b bands of r rows
|
||||||
|
is given by t = (1 / b) ** (1 / r).
|
||||||
|
|
||||||
|
We want this value to be lower than the expected threshold to avoid
|
||||||
|
false negatives, but we want this value to stay near the expected
|
||||||
|
value since we also want to avoid false positives.
|
||||||
|
|
||||||
|
Then, we ensure that the estimated threshold is between
|
||||||
|
2/3*threshold and threshold.
|
||||||
|
|
||||||
|
To achieve that, we start from some values, then we add bands if the
|
||||||
|
threshold is too high, or add some rows per band if it is too low.
|
||||||
|
"""
|
||||||
|
# Compute b and r such that 2s/3 < t < s
|
||||||
|
# Use at least 2 rows and 16 bands to have good values
|
||||||
|
rows = 2
|
||||||
|
bands = 16
|
||||||
|
est_threshold = (1 / bands) ** (1 / rows)
|
||||||
|
# Threshold is not acceptable
|
||||||
|
while not (2 * threshold / 3 < est_threshold < threshold):
|
||||||
|
# Add bands
|
||||||
|
if est_threshold >= threshold:
|
||||||
|
bands *= 2
|
||||||
|
# Add rows
|
||||||
|
else:
|
||||||
|
rows *= 2
|
||||||
|
est_threshold = (1 / bands) ** (1 / rows)
|
||||||
|
|
||||||
|
# Estimated threshold is now near required threshold
|
||||||
|
return bands, rows
|
||||||
|
|
||||||
|
|
||||||
def compute_signature_matrix(shingles: np.ndarray, permutations_count: int, display_tqdm: bool = False) -> np.ndarray:
|
def compute_signature_matrix(shingles: np.ndarray, permutations_count: int, display_tqdm: bool = False) -> np.ndarray:
|
||||||
|
"""
|
||||||
|
Implementation of the min-hash algorithm.
|
||||||
|
|
||||||
|
We compute a signature matrix of shingles generated by random permutations.
|
||||||
|
The shingles parameter stands for the shingle boolean matrix (shingle x document)
|
||||||
|
where shingles[i, j] = True states that shingle i appears in document j.
|
||||||
|
|
||||||
|
The permutations_count argument indicates the number of random permutations to generate.
|
||||||
|
|
||||||
|
The output is the signature matrix, that has for dimensions (permutations_count x docs_count).
|
||||||
|
Each permutation is generated randomly; then, for each document, we take its first shingle in that permutation.
|
||||||
|
|
||||||
|
While the permutation generation can be done quickly, the check of the first shingle of a
|
||||||
|
document in a permutation (which can be achieved with an argmax in a boolean row) may be
|
||||||
|
quite expensive, and take some time. If supported and option enabled, a progress bar can
|
||||||
|
be displayed.
|
||||||
|
"""
|
||||||
shingles_count, docs_count = shingles.shape
|
shingles_count, docs_count = shingles.shape
|
||||||
|
|
||||||
|
# Initialize matrix
|
||||||
signature_matrix = np.inf * np.ones((permutations_count, docs_count))
|
signature_matrix = np.inf * np.ones((permutations_count, docs_count))
|
||||||
|
|
||||||
permutations_iterator = range(permutations_count)
|
permutations_iterator = range(permutations_count)
|
||||||
|
# If supported, load tqdm to display the progress bar
|
||||||
if display_tqdm:
|
if display_tqdm:
|
||||||
try:
|
try:
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
permutations_iterator = tqdm(permutations_iterator)
|
permutations_iterator = tqdm(permutations_iterator, unit="perm.")
|
||||||
except ImportError:
|
except ImportError:
|
||||||
print("tqdm is not installed. Please install tqdm before using --tqdm option.")
|
print("tqdm is not installed. Please install tqdm before using --tqdm option.")
|
||||||
|
|
||||||
for permutation_id in permutations_iterator:
|
for permutation_id in permutations_iterator:
|
||||||
|
# Generate random permutation of shingles
|
||||||
|
# This is not the most expensive task
|
||||||
permutation = np.random.permutation(shingles)
|
permutation = np.random.permutation(shingles)
|
||||||
|
# For each document, get the smallest shingle after permutation
|
||||||
|
# This is the expensive operation
|
||||||
signature_matrix[permutation_id] = permutation.argmax(0)
|
signature_matrix[permutation_id] = permutation.argmax(0)
|
||||||
|
|
||||||
return signature_matrix
|
return signature_matrix
|
||||||
|
|
||||||
|
|
||||||
|
def find_candidate_pairs(signature: np.ndarray, bands: int, rows: int) -> set[tuple[int, int]]:
|
||||||
|
"""
|
||||||
|
Implementation of the LSH algorithm.
|
||||||
|
|
||||||
|
Given a signature matrix and band and rows per band numbers, we want to
|
||||||
|
find candidate document pairs that are likely to be similar.
|
||||||
|
|
||||||
|
We already know that the probability that two documents have the same signature
|
||||||
|
is the same as their similarity.
|
||||||
|
|
||||||
|
Two documents are called a candidate pair if they have the same signature on
|
||||||
|
all rows of at least one band.
|
||||||
|
|
||||||
|
The output is a set of pairs of document ids.
|
||||||
|
"""
|
||||||
|
_, docs_count = signature.shape
|
||||||
|
|
||||||
|
candidate_pairs = set()
|
||||||
|
|
||||||
|
for band_id in range(bands):
|
||||||
|
# Get interesting band
|
||||||
|
band = signature[band_id * rows:(band_id + 1) * rows]
|
||||||
|
|
||||||
|
buckets = {}
|
||||||
|
|
||||||
|
# Put documents into buckets
|
||||||
|
# A bucket is keyed by the tuple of a document's signature values over all rows of the band
|
||||||
|
for doc in range(docs_count):
|
||||||
|
sign_doc = tuple(band[:, doc])
|
||||||
|
buckets.setdefault(sign_doc, set())
|
||||||
|
buckets[sign_doc].add(doc)
|
||||||
|
|
||||||
|
# Find documents in the same bucket
|
||||||
|
for bucket in buckets.values():
|
||||||
|
for doc_a in bucket:
|
||||||
|
for doc_b in bucket:
|
||||||
|
if doc_a != doc_b:
|
||||||
|
# Sort documents for nice output
|
||||||
|
doc_a, doc_b = min(doc_a, doc_b), max(doc_a, doc_b)
|
||||||
|
candidate_pairs.add((doc_a, doc_b))
|
||||||
|
|
||||||
|
return candidate_pairs
|
||||||
|
|
||||||
|
|
||||||
def jaccard_similarity(doc1: set, doc2: set) -> float:
|
def jaccard_similarity(doc1: set, doc2: set) -> float:
|
||||||
|
"""
|
||||||
|
Compute jaccard similarity of two sets.
|
||||||
|
This is defined as 0 if either set is empty, and |A ∩ B| / |A ∪ B| in the general case.
|
||||||
|
"""
|
||||||
if not doc1 or not doc2:
|
if not doc1 or not doc2:
|
||||||
return 0.0
|
return 0.0
|
||||||
|
|
||||||
|
@ -86,46 +215,30 @@ def jaccard_similarity(doc1: set, doc2: set) -> float:
|
||||||
|
|
||||||
|
|
||||||
def parse(stream, similarity: float, display_tqdm: bool = False) -> None:
|
def parse(stream, similarity: float, display_tqdm: bool = False) -> None:
|
||||||
docs = [line.rstrip('\n') for line in stream]
|
"""
|
||||||
|
Given a stream of documents (separated by line feeds) and a similarity threshold,
|
||||||
|
we display in standard output an estimation of document pairs that
|
||||||
|
have a Jaccard similarity higher than the requested threshold.
|
||||||
|
|
||||||
|
We use k-shingling, MinHash and LSH to compute the estimation.
|
||||||
|
"""
|
||||||
|
docs = [line.rstrip('\n') for line in stream] # Read stream
|
||||||
docs = [normalize(doc) for doc in docs] # Remove special characters and normalize accents
|
docs = [normalize(doc) for doc in docs] # Remove special characters and normalize accents
|
||||||
|
|
||||||
|
# Compute k-shingles
|
||||||
shingles = compute_shingles(docs, SHINGLE_SIZE)
|
shingles = compute_shingles(docs, SHINGLE_SIZE)
|
||||||
|
|
||||||
# Compute b and r such that s/2 < t < s
|
# Compute best values for permutations count
|
||||||
# Use at least 2 rows and 16 bands to have good values
|
bands, rows = compute_optimal_matrix_size(similarity)
|
||||||
rows = 2
|
# Compute signature matrix using MinHash
|
||||||
bands = 16
|
|
||||||
threshold = (1 / bands) ** (1 / rows)
|
|
||||||
while not (2 * similarity / 3 < threshold < similarity):
|
|
||||||
if threshold >= similarity:
|
|
||||||
bands *= 2
|
|
||||||
else:
|
|
||||||
rows *= 2
|
|
||||||
threshold = (1 / bands) ** (1 / rows)
|
|
||||||
|
|
||||||
signature = compute_signature_matrix(shingles, bands * rows, display_tqdm)
|
signature = compute_signature_matrix(shingles, bands * rows, display_tqdm)
|
||||||
|
|
||||||
candidate_pairs = set()
|
# Guess candidate pairs using LSH
|
||||||
|
candidate_pairs = find_candidate_pairs(signature, bands, rows)
|
||||||
for band_id in range(bands):
|
# Sort pairs for a nice output
|
||||||
band = signature[band_id * rows:(band_id + 1) * rows]
|
|
||||||
|
|
||||||
buckets = {}
|
|
||||||
|
|
||||||
for doc in range(len(docs)):
|
|
||||||
sign_doc = tuple(band[:, doc])
|
|
||||||
buckets.setdefault(sign_doc, set())
|
|
||||||
buckets[sign_doc].add(doc)
|
|
||||||
|
|
||||||
for bucket in buckets.values():
|
|
||||||
for doc_a in bucket:
|
|
||||||
for doc_b in bucket:
|
|
||||||
if doc_a != doc_b:
|
|
||||||
doc_a, doc_b = min(doc_a, doc_b), max(doc_a, doc_b)
|
|
||||||
candidate_pairs.add((doc_a, doc_b))
|
|
||||||
|
|
||||||
candidate_pairs = sorted(candidate_pairs)
|
candidate_pairs = sorted(candidate_pairs)
|
||||||
|
|
||||||
|
# For each document pair, compute true Jaccard similarity and display it
|
||||||
for doc_a, doc_b in candidate_pairs:
|
for doc_a, doc_b in candidate_pairs:
|
||||||
# Compute true jaccard similarity
|
# Compute true jaccard similarity
|
||||||
shingles_a = set(x for x in range(len(shingles)) if shingles[x, doc_a])
|
shingles_a = set(x for x in range(len(shingles)) if shingles[x, doc_a])
|
||||||
|
@ -136,9 +249,11 @@ def parse(stream, similarity: float, display_tqdm: bool = False) -> None:
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
# Parse arguments from command line
|
||||||
ns = parse_args()
|
ns = parse_args()
|
||||||
|
|
||||||
if not (0 < ns.similarity <= 1):
|
if not (0 < ns.similarity <= 1):
|
||||||
raise ValueError(f"Invalid similiarity value: {ns.similarity}")
|
raise ValueError(f"Invalid similiarity value: {ns.similarity}")
|
||||||
|
|
||||||
|
# Analyse documents
|
||||||
parse(ns.input, ns.similarity, ns.progress)
|
parse(ns.input, ns.similarity, ns.progress)
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
[metadata]
|
[metadata]
|
||||||
name = algods
|
name = algods
|
||||||
|
version = 1.0
|
||||||
long_description = file: README.md
|
long_description = file: README.md
|
||||||
long_description_content_type = text/markdown
|
long_description_content_type = text/markdown
|
||||||
description = Django application to manage a network.
|
description = Django application to manage a network.
|
||||||
|
|
Loading…
Reference in New Issue