Compute signature matrix using NumPy
This commit is contained in:
parent
d60653461a
commit
99fada5b52
|
@ -1,8 +1,13 @@
|
||||||
import argparse
|
import argparse
|
||||||
import re
|
import random
|
||||||
import unicodedata
|
import unicodedata
|
||||||
import sys
|
import sys
|
||||||
from typing import Generator
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
SHINGLE_SIZE = 5 # Length, in characters, of each shingle cut from a document (known as k)
|
||||||
|
# Number of random permutations drawn when building the signature matrix
PERMUTATIONS_COUNT = 3
|
||||||
|
|
||||||
|
|
||||||
def parse_args(argv: dict = None) -> argparse.Namespace:
|
def parse_args(argv: dict = None) -> argparse.Namespace:
|
||||||
|
@ -26,21 +31,38 @@ def normalize(doc: str) -> str:
|
||||||
).casefold().encode('ascii', 'ignore').decode('ascii')
|
).casefold().encode('ascii', 'ignore').decode('ascii')
|
||||||
|
|
||||||
|
|
||||||
def compute_shingles(docs: list[str], single_size: int) -> np.ndarray:
    """Build the shingle/document incidence matrix for *docs*.

    Every document is cut into overlapping character shingles of length
    ``single_size``.  Each distinct shingle is given a row id in order of
    first appearance (scanning the documents in order, left to right).

    Args:
        docs: The documents to shingle, one string per document.
        single_size: Length of each shingle in characters; must be >= 1.

    Returns:
        A float array of shape ``(distinct_shingles, len(docs))`` whose
        entry ``[s, d]`` is 1 when shingle ``s`` occurs in document ``d``
        and 0 otherwise.  Documents shorter than ``single_size`` contribute
        an all-zero column.

    Raises:
        ValueError: If ``single_size`` is smaller than 1.
    """
    if single_size < 1:
        # A non-positive size would silently register empty or truncated
        # shingles instead of real k-grams.
        raise ValueError(f"single_size must be >= 1, got {single_size}")

    shingle_id: dict[str, int] = {}
    rows: list[int] = []
    cols: list[int] = []

    for doc_id, doc in enumerate(docs):
        for start in range(len(doc) - single_size + 1):
            sh = doc[start:start + single_size]
            # len(shingle_id) is evaluated before the insertion, so a new
            # shingle receives the next free row id.
            rows.append(shingle_id.setdefault(sh, len(shingle_id)))
            cols.append(doc_id)

    # Allocate once at the exact final size instead of growing and trimming.
    shingle_matrix = np.zeros((len(shingle_id), len(docs)))
    # Explicit integer dtype keeps the index arrays valid even when empty.
    shingle_matrix[np.asarray(rows, dtype=np.intp), np.asarray(cols, dtype=np.intp)] = 1
    return shingle_matrix
|
||||||
|
|
||||||
|
|
||||||
|
def min_hash(doc: str, perm: list[str]) -> "str | None":
    """Return the first entry of *perm* that occurs in *doc*.

    Args:
        doc: The document text to search in.
        perm: Candidate strings, in the order in which they should be tried.

    Returns:
        The earliest element of ``perm`` that is a substring of ``doc``, or
        ``None`` when no element of ``perm`` occurs in ``doc`` (including
        when ``perm`` is empty).
    """
    # The miss case is spelled out rather than falling off the end of the loop.
    return next((candidate for candidate in perm if candidate in doc), None)
|
||||||
|
|
||||||
|
|
||||||
|
def compute_signature_matrix(shingles: np.ndarray, permutations_count: int) -> np.ndarray:
    """Compute a MinHash signature matrix from a shingle/document matrix.

    For each of ``permutations_count`` random permutations of the shingle
    rows, the signature of a document is the smallest permuted row index
    among the shingles that document contains.  (A plain matrix product of
    the permutations with ``shingles`` would give the *sum* of those indices,
    which is not a min-hash.)

    Args:
        shingles: 2-D array of shape ``(shingle_count, doc_count)``; a
            non-zero entry ``[s, d]`` means shingle ``s`` occurs in
            document ``d``.
        permutations_count: Number of random permutations to draw, i.e. the
            number of rows in the returned signature matrix.

    Returns:
        A float array of shape ``(permutations_count, doc_count)``.  A
        document without any shingle gets ``inf`` in every row.
    """
    present = np.asarray(shingles) != 0
    shingle_count = len(present)
    signature = np.full((permutations_count, present.shape[1]), np.inf)

    for i in range(permutations_count):
        permutation = np.random.permutation(shingle_count)
        # Broadcast the permuted index of each shingle across the documents,
        # masking out shingles a document does not contain.
        ranked = np.where(present, permutation[:, np.newaxis], np.inf)
        # `initial` keeps the reduction defined when there are no shingles.
        signature[i] = ranked.min(axis=0, initial=np.inf)

    return signature
|
||||||
|
|
||||||
|
|
||||||
def jaccard_similarity(doc1: set, doc2: set) -> float:
|
def jaccard_similarity(doc1: set, doc2: set) -> float:
|
||||||
|
@ -56,14 +78,8 @@ def parse(stream, similarity: float) -> None:
|
||||||
docs = [line.rstrip('\n') for line in stream]
|
docs = [line.rstrip('\n') for line in stream]
|
||||||
docs = [normalize(doc) for doc in docs] # Remove special characters and normalize accents
|
docs = [normalize(doc) for doc in docs] # Remove special characters and normalize accents
|
||||||
|
|
||||||
shingles = list(compute_shingles(docs, 5))
|
shingles = compute_shingles(docs, SHINGLE_SIZE)
|
||||||
|
signature = compute_signature_matrix(shingles, PERMUTATIONS_COUNT)
|
||||||
for i, doc1 in enumerate(shingles):
|
|
||||||
for j in range(i + 1, len(shingles)):
|
|
||||||
doc2 = shingles[j]
|
|
||||||
d = jaccard_similarity(doc1, doc2)
|
|
||||||
if d >= similarity:
|
|
||||||
print(f"{i} {j} {d:.06f}")
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
Loading…
Reference in New Issue