From d60653461a7f0cfeeec7c4716b6345916c01fc09 Mon Sep 17 00:00:00 2001
From: Yohann D'ANELLO
Date: Fri, 15 Oct 2021 14:36:47 +0200
Subject: [PATCH] Parsing is working

---
Review notes (this area is not part of the commit message):
* compute_shingles: the generator send type is annotated as None instead of
  the builtin function any; the id_shingle list, which was filled but never
  read, and the separate ids counter are dropped (the next id is the current
  size of the shingle table).
* parse: the lines are read inside "with stream:" again, so the stream is
  closed once it has been consumed, as it was before this patch.
* compute_shingles, jaccard_similarity and parse are documented.
* The hunk header and the diffstat are recomputed for this content; the commit
  id and the blob ids on the index line are left as originally submitted.

 algods/algods.py | 67 ++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 62 insertions(+), 5 deletions(-)

diff --git a/algods/algods.py b/algods/algods.py
index 4e3dc72..169f277 100644
--- a/algods/algods.py
+++ b/algods/algods.py
@@ -1,5 +1,8 @@
 import argparse
+import re
+import unicodedata
 import sys
+from typing import Generator
 
 
 def parse_args(argv: dict = None) -> argparse.Namespace:
@@ -13,15 +16,69 @@ def parse_args(argv: dict = None) -> argparse.Namespace:
     return parser.parse_args(argv[1:])
 
 
+def normalize(doc: str) -> str:
+    """
+    Remove accents from letters, remove non-ascii letters, keep only letters and digits.
+    """
+    return ''.join(char for char in unicodedata.normalize(
+        'NFKD', doc.casefold().replace('æ', 'ae').replace('œ', 'oe'))
+        if unicodedata.category(char) in ['Lu', 'Ll', 'Nd']
+    ).casefold().encode('ascii', 'ignore').decode('ascii')
+
+
+def compute_shingles(docs: list[str], single_size: int) -> Generator[set[int], None, None]:
+    """
+    Yield, for each document, the set of ids of its character shingles.
+
+    Shingles are the substrings of length single_size. Ids are handed out in order of
+    first appearance and shared by all documents, so equal shingles get the same id.
+    A document shorter than single_size yields an empty set.
+    """
+    shingle_id = {}
+
+    for d in docs:
+        char_shing = [d[i:i + single_size] for i in range(len(d) - single_size + 1)]
+        sid = set()
+        for sh in char_shing:
+            if sh not in shingle_id:
+                shingle_id[sh] = len(shingle_id)  # Next unused id
+            sid.add(shingle_id[sh])
+        yield sid
+
+
+def jaccard_similarity(doc1: set, doc2: set) -> float:
+    """
+    Return the Jaccard similarity of two sets: size of intersection over size of union.
+
+    Return 0.0 when either set is empty, which also avoids a division by zero.
+    """
+    if not doc1 or not doc2:
+        return 0.0
+
+    inter = doc1.intersection(doc2)
+    union = doc1.union(doc2)
+    return len(inter) / len(union)
+
+
 def parse(stream, similarity: float) -> None:
+    """
+    Read one document per line from stream and print the pairs of similar documents.
+
+    Each pair whose Jaccard similarity over 5-character shingles is at least similarity
+    is printed as "i j value", where i < j are the 0-based line numbers of the documents.
+    """
     with stream:
-        for doc in stream:
-            doc = doc.replace('\n', '')
-            parse_doc(doc)
+        docs = [line.rstrip('\n') for line in stream]
+    docs = [normalize(doc) for doc in docs]  # Remove special characters and normalize accents
 
+    shingles = list(compute_shingles(docs, 5))
 
-def parse_doc(doc: str) -> None:
-    pass
+    for i, doc1 in enumerate(shingles):
+        for j in range(i + 1, len(shingles)):
+            doc2 = shingles[j]
+            d = jaccard_similarity(doc1, doc2)
+            if d >= similarity:
+                print(f"{i} {j} {d:.06f}")
 
 
 def main():