algo-ds/algods/algods.py

107 lines
3.3 KiB
Python
Raw Normal View History

2021-10-08 13:13:52 +00:00
import argparse
2021-10-17 08:50:50 +00:00
import random
2021-10-17 09:48:56 +00:00
import time
2021-10-15 12:36:47 +00:00
import unicodedata
2021-10-08 13:13:52 +00:00
import sys
2021-10-17 08:50:50 +00:00
import numpy as np
# Length, in characters, of the shingles that parse() asks compute_shingles() to extract.
SHINGLE_SIZE = 5 # Known as k
# Number of random permutations that parse() passes to compute_signature_matrix().
PERMUTATIONS_COUNT = 3
2021-10-08 13:13:52 +00:00
def parse_args(argv: "list[str] | None" = None) -> argparse.Namespace:
    """
    Parse the command-line arguments of the program.

    :param argv: Full argument vector, program name included (the first item
        is skipped). Defaults to ``sys.argv`` when omitted.
    :return: Namespace holding ``input`` (an open text stream, standard input
        by default) and ``similarity`` (a float threshold, 0.05 by default).
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(description='Exercise 1')
    parser.add_argument('input', nargs='?', type=argparse.FileType('r'),
                        help='Documents to read.', default=sys.stdin)
    parser.add_argument('similarity', nargs='?', type=float,
                        help='Similarity threshold.', default=0.05)
    # argv[0] is the program name; argparse must only see the real arguments.
    return parser.parse_args(argv[1:])
2021-10-15 12:36:47 +00:00
def normalize(doc: str) -> str:
    """
    Reduce a document to case-folded ASCII letters and digits: 'æ' and 'œ' are
    expanded, accents are stripped and every other character is discarded.
    """
    kept_categories = ('Lu', 'Ll', 'Nd')
    expanded = doc.casefold().replace('æ', 'ae').replace('œ', 'oe')
    # NFKD splits accented letters into a base letter plus combining marks;
    # the marks are not in the kept categories, so they are filtered out.
    decomposed = unicodedata.normalize('NFKD', expanded)
    kept = [char for char in decomposed if unicodedata.category(char) in kept_categories]
    # Compatibility decomposition may produce uppercase letters, hence the
    # second case folding before dropping whatever is still not ASCII.
    folded = ''.join(kept).casefold()
    return folded.encode('ascii', 'ignore').decode('ascii')
2021-10-17 08:50:50 +00:00
def compute_shingles(docs: list[str], single_size: int) -> np.ndarray:
    """
    Build the boolean shingle/document matrix of a list of documents.

    Every distinct substring of ``single_size`` characters gets a row, numbered
    in order of first appearance; cell (row, column) is True when the document
    at index ``column`` contains that substring.
    """
    # Start with two rows; the matrix is doubled each time a new row does not fit.
    matrix = np.zeros((2, len(docs)), dtype=bool)
    row_of = {}
    for column, text in enumerate(docs):
        for start in range(len(text) - single_size + 1):
            piece = text[start:start + single_size]
            # len(row_of) is evaluated before insertion: it is the next free row.
            row = row_of.setdefault(piece, len(row_of))
            if row >= matrix.shape[0]:
                matrix = np.vstack((matrix, np.zeros_like(matrix)))
            matrix[row, column] = True
    # Drop the rows that were allocated but never assigned to a shingle.
    return matrix[:len(row_of)]
def min_hash(doc: str, perm: list[str]) -> "str | None":
    """
    Return the first element of ``perm`` that is contained in ``doc``.

    :param doc: Document to look into (for a str, ``in`` is a substring test).
    :param perm: Candidates, in the order in which they are tried.
    :return: The first candidate found in ``doc``, or None when none is found.
    """
    return next((candidate for candidate in perm if candidate in doc), None)
def compute_signature_matrix(shingles: np.ndarray, permutations_count: int) -> np.ndarray:
    """
    Compute the signature matrix of a shingle/document matrix.

    For each random permutation of the shingle rows, the signature of a
    document is the index of the first shingle, in permutation order, that the
    document contains.

    :param shingles: 2-D matrix with one row per shingle and one column per
        document; a truthy cell means the document contains the shingle.
    :param permutations_count: Number of random permutations to draw, i.e. the
        number of rows of the result.
    :return: Float matrix of shape (permutations_count, documents count).
        Columns of documents that contain no shingle keep the value ``inf``.
    """
    shingles_count, docs_count = shingles.shape
    signature_matrix = np.full((permutations_count, docs_count), np.inf)
    if shingles_count == 0:
        # argmax is undefined along an empty axis, and nothing can match anyway.
        return signature_matrix

    present = np.asarray(shingles, dtype=bool)
    has_shingle = present.any(axis=0)
    for permutation_id in range(permutations_count):
        permutation = np.random.permutation(shingles_count)
        # Rows reordered by the permutation: argmax on booleans yields, per
        # document, the position of its first contained shingle in that order.
        first_position = present[permutation].argmax(axis=0)
        # argmax also answers 0 for all-False columns, so those are masked out.
        signature_matrix[permutation_id, has_shingle] = permutation[first_position[has_shingle]]
    return signature_matrix
2021-10-15 12:36:47 +00:00
def jaccard_similarity(doc1: set, doc2: set) -> float:
    """
    Return the Jaccard index of two sets: the size of their intersection
    divided by the size of their union. It is 0.0 as soon as one set is empty.
    """
    if doc1 and doc2:
        common_count = len(doc1.intersection(doc2))
        total_count = len(doc1.union(doc2))
        return common_count / total_count
    return 0.0
2021-10-08 13:13:52 +00:00
def parse(stream, similarity: float) -> None:
    """
    Read one document per line and print every pair of similar documents.

    Each pair of documents whose Jaccard similarity, computed on their shingle
    sets, is at least ``similarity`` is printed as ``i j similarity`` where
    ``i < j`` are the 0-based line indices and the similarity has six decimals.

    :param stream: Iterable of text lines, one document per line.
    :param similarity: Minimum Jaccard similarity for a pair to be printed.
    """
    # Remove special characters and normalize accents
    docs = [normalize(line.rstrip('\n')) for line in stream]

    shingles = compute_shingles(docs, SHINGLE_SIZE)
    # NOTE(review): the signature matrix is computed but not consumed below;
    # every pair of documents is still compared on its full shingle sets.
    signature = compute_signature_matrix(shingles, PERMUTATIONS_COUNT)

    # One column per document: the indices of its truthy cells are the ids of
    # the shingles it contains. tolist() turns them into plain Python ints.
    shingles_sets = [set(np.flatnonzero(column).tolist()) for column in shingles.T]
    for i, doc1 in enumerate(shingles_sets):
        for j in range(i + 1, len(shingles_sets)):
            d = jaccard_similarity(doc1, shingles_sets[j])
            if d >= similarity:
                print(f"{i} {j} {d:.06f}")
2021-10-08 13:13:52 +00:00
2021-10-08 12:53:40 +00:00
def main():
    """
    Entry point: parse the command line, then process the input documents.
    """
    ns = parse_args()
    try:
        parse(ns.input, ns.similarity)
    finally:
        # argparse.FileType opens the file but never closes it; standard input
        # is left alone since it does not belong to this program.
        if ns.input is not sys.stdin:
            ns.input.close()