"""Report pairs of similar documents using character-shingle Jaccard similarity.

Every line of the input stream is treated as one document. Documents are
normalized, split into fixed-size character shingles, and every pair whose
Jaccard similarity reaches the threshold is printed.
"""
import argparse
import itertools
import re
import sys
import unicodedata
from typing import Generator, Optional

# Unicode general categories kept by normalize(): upper/lowercase letters
# and decimal digits.
_KEPT_CATEGORIES = frozenset({'Lu', 'Ll', 'Nd'})


def parse_args(argv: Optional[list[str]] = None) -> argparse.Namespace:
    """Parse the command line.

    Args:
        argv: Full argument vector *including* the program name at index 0;
            defaults to ``sys.argv``.

    Returns:
        Namespace with ``input`` (a readable text stream, stdin by default)
        and ``similarity`` (float threshold, 0.05 by default).
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(description='Exercise 1')
    parser.add_argument('input', nargs='?', type=argparse.FileType('r'),
                        help='Documents to read.', default=sys.stdin)
    parser.add_argument('similarity', nargs='?', type=float,
                        help='Similarity threshold.', default=0.05)
    # argv[0] is the program name, which argparse must not see.
    return parser.parse_args(argv[1:])


def normalize(doc: str) -> str:
    """Reduce *doc* to lowercase ASCII letters and digits.

    Accents are stripped via NFKD decomposition, the ligatures 'æ' and 'œ'
    are spelled out, and every character that is not a letter or decimal
    digit (including whitespace and punctuation) is removed. Anything still
    non-ASCII after that is dropped.
    """
    folded = doc.casefold().replace('æ', 'ae').replace('œ', 'oe')
    decomposed = unicodedata.normalize('NFKD', folded)
    kept = ''.join(
        char for char in decomposed
        if unicodedata.category(char) in _KEPT_CATEGORIES
    )
    # Fold case again: the decomposition step runs after the first fold and
    # its output is not guaranteed to be case-folded.
    return kept.casefold().encode('ascii', 'ignore').decode('ascii')


def compute_shingles(docs: list[str],
                     single_size: int) -> Generator[set[int], None, None]:
    """Yield, for each document, the set of ids of its character shingles.

    A shingle is a substring of ``single_size`` consecutive characters. Ids
    are assigned in order of first appearance across all documents, so equal
    shingles in different documents share an id. A document shorter than
    ``single_size`` yields an empty set.

    Args:
        docs: Documents to shingle.
        single_size: Length of each shingle, in characters.
    """
    shingle_id: dict[str, int] = {}
    for doc in docs:
        doc_ids = set()
        for start in range(len(doc) - single_size + 1):
            shingle = doc[start:start + single_size]
            # A new shingle gets the next free id, i.e. the current dict size.
            doc_ids.add(shingle_id.setdefault(shingle, len(shingle_id)))
        yield doc_ids


def jaccard_similarity(doc1: set, doc2: set) -> float:
    """Return |doc1 ∩ doc2| / |doc1 ∪ doc2|, or 0.0 if either set is empty."""
    if not doc1 or not doc2:
        return 0.0
    return len(doc1 & doc2) / len(doc1 | doc2)


def parse(stream, similarity: float, shingle_size: int = 5) -> None:
    """Print every pair of lines in *stream* that is similar enough.

    Each line is one document. For every pair ``i < j`` (zero-based line
    indices) whose Jaccard similarity is at least ``similarity``, one line
    ``"i j score"`` is written to stdout with the score to six decimals.

    Args:
        stream: Iterable of text lines.
        similarity: Minimum Jaccard similarity for a pair to be reported.
        shingle_size: Shingle length in characters.
    """
    # Strip the line terminator, then remove special characters and accents.
    docs = [normalize(line.rstrip('\n')) for line in stream]
    shingles = list(compute_shingles(docs, shingle_size))

    # combinations() yields (0,1), (0,2), ..., (1,2), ... — i ascending,
    # then j ascending.
    for (i, doc1), (j, doc2) in itertools.combinations(enumerate(shingles), 2):
        d = jaccard_similarity(doc1, doc2)
        if d >= similarity:
            print(f"{i} {j} {d:.06f}")


def main():
    """Command-line entry point: parse arguments and report similar pairs."""
    ns = parse_args()
    try:
        parse(ns.input, ns.similarity)
    finally:
        # Close a file opened by argparse, but leave the process's stdin alone.
        if ns.input is not sys.stdin:
            ns.input.close()