Parsing is working
This commit is contained in:
parent
ba7170e91c
commit
d60653461a
|
@ -1,5 +1,8 @@
|
||||||
import argparse
|
import argparse
|
||||||
|
import re
|
||||||
|
import unicodedata
|
||||||
import sys
|
import sys
|
||||||
|
from typing import Generator
|
||||||
|
|
||||||
|
|
||||||
def parse_args(argv: dict = None) -> argparse.Namespace:
|
def parse_args(argv: dict = None) -> argparse.Namespace:
|
||||||
|
@ -13,15 +16,54 @@ def parse_args(argv: dict = None) -> argparse.Namespace:
|
||||||
return parser.parse_args(argv[1:])
|
return parser.parse_args(argv[1:])
|
||||||
|
|
||||||
|
|
||||||
|
def normalize(doc: str) -> str:
    """
    Reduce a document to a lowercase run of ASCII letters and digits.

    The text is case-folded, the ligatures 'æ' and 'œ' are spelled out, and
    the result is NFKD-decomposed so accents become separate combining marks.
    Only characters whose Unicode category is an upper/lowercase letter or a
    decimal digit are kept, so whitespace, punctuation and the combining
    marks are dropped. Whatever is still not ASCII is discarded at the end.
    """
    folded = doc.casefold().replace('æ', 'ae').replace('œ', 'oe')
    decomposed = unicodedata.normalize('NFKD', folded)

    kept = []
    for symbol in decomposed:
        if unicodedata.category(symbol) in ['Lu', 'Ll', 'Nd']:
            kept.append(symbol)

    # Compatibility decomposition can reintroduce capitals (e.g. '™' -> 'TM'),
    # hence the second case-fold before stripping non-ASCII characters.
    ascii_bytes = ''.join(kept).casefold().encode('ascii', 'ignore')
    return ascii_bytes.decode('ascii')
|
||||||
|
|
||||||
|
|
||||||
|
def compute_shingles(docs: list[str], single_size: int) -> Generator[set[int], None, None]:
    """
    Yield, for every document, the set of ids of its character shingles.

    A shingle is a run of ``single_size`` consecutive characters. Each
    distinct shingle gets an integer id, numbered from 0 in order of first
    appearance; the numbering is shared by all documents so the yielded sets
    can be compared with one another.

    :param docs: documents to shingle, in the order they should be yielded.
    :param single_size: number of characters per shingle.
    :return: a generator producing one set of shingle ids per document. A
        document shorter than ``single_size`` produces an empty set.
    """
    shingle_id: dict[str, int] = {}

    for d in docs:
        # setdefault() evaluates len(shingle_id) before inserting, so an
        # unseen shingle receives the next free id and a known one keeps its
        # id. range() is empty when the document is shorter than one shingle.
        yield {
            shingle_id.setdefault(d[i:i + single_size], len(shingle_id))
            for i in range(len(d) - single_size + 1)
        }
|
||||||
|
|
||||||
|
|
||||||
|
def jaccard_similarity(doc1: set, doc2: set) -> float:
    """
    Return the Jaccard similarity of two sets: |intersection| / |union|.

    When either set is empty the similarity is reported as 0.0.
    """
    if doc1 and doc2:
        shared = len(doc1.intersection(doc2))
        combined = len(doc1.union(doc2))
        return shared / combined

    return 0.0
|
||||||
|
|
||||||
|
|
||||||
def parse(stream, similarity: float, shingle_size: int = 5) -> None:
    """
    Print every pair of similar documents read from ``stream``.

    Each line of ``stream`` is one document. Lines are normalized, turned
    into sets of character shingles, and every pair of documents is compared
    with the Jaccard similarity. For each pair scoring at least
    ``similarity``, one line ``"<i> <j> <score>"`` is printed, where ``i < j``
    are the zero-based line indices and the score has six decimals.

    :param stream: iterable of text lines that is also a context manager
        (e.g. an open file); it is closed once all lines have been read.
    :param similarity: minimum Jaccard similarity for a pair to be printed.
    :param shingle_size: number of characters per shingle (default 5).
    """
    # Read everything up front and release the stream immediately.
    with stream:
        # Strip the line terminator, then remove special characters and
        # normalize accents.
        docs = [normalize(line.rstrip('\n')) for line in stream]

    shingles = list(compute_shingles(docs, shingle_size))

    # Compare each unordered pair exactly once (j always greater than i).
    for i, doc1 in enumerate(shingles):
        for j in range(i + 1, len(shingles)):
            d = jaccard_similarity(doc1, shingles[j])
            if d >= similarity:
                print(f"{i} {j} {d:.06f}")
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
Loading…
Reference in New Issue