Keep true Jaccard similarity
This commit is contained in:
parent
677ad5e8b7
commit
769c62ee93
|
@ -1,5 +1,6 @@
|
||||||
import argparse
|
import argparse
|
||||||
import random
|
import random
|
||||||
|
import time
|
||||||
import unicodedata
|
import unicodedata
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
@ -32,7 +33,7 @@ def normalize(doc: str) -> str:
|
||||||
|
|
||||||
|
|
||||||
def compute_shingles(docs: list[str], single_size: int) -> np.ndarray:
|
def compute_shingles(docs: list[str], single_size: int) -> np.ndarray:
|
||||||
shingle_matrix = np.zeros((2, len(docs)))
|
shingle_matrix = np.zeros((2, len(docs)), dtype=bool)
|
||||||
shingle_id = {}
|
shingle_id = {}
|
||||||
|
|
||||||
for doc_id, doc in enumerate(docs):
|
for doc_id, doc in enumerate(docs):
|
||||||
|
@ -42,9 +43,9 @@ def compute_shingles(docs: list[str], single_size: int) -> np.ndarray:
|
||||||
shingle_id[sh] = len(shingle_id)
|
shingle_id[sh] = len(shingle_id)
|
||||||
if shingle_id[sh] >= len(shingle_matrix):
|
if shingle_id[sh] >= len(shingle_matrix):
|
||||||
# Extend matrix, double its size
|
# Extend matrix, double its size
|
||||||
shingle_matrix = np.append(shingle_matrix, np.zeros(shingle_matrix.shape), axis=0)
|
shingle_matrix = np.append(shingle_matrix, np.zeros(shingle_matrix.shape, dtype=bool), axis=0)
|
||||||
|
|
||||||
shingle_matrix[shingle_id[sh], doc_id] = 1
|
shingle_matrix[shingle_id[sh], doc_id] = True
|
||||||
|
|
||||||
shingle_matrix = shingle_matrix[:len(shingle_id)]
|
shingle_matrix = shingle_matrix[:len(shingle_id)]
|
||||||
|
|
||||||
|
@ -90,6 +91,15 @@ def parse(stream, similarity: float) -> None:
|
||||||
shingles = compute_shingles(docs, SHINGLE_SIZE)
|
shingles = compute_shingles(docs, SHINGLE_SIZE)
|
||||||
signature = compute_signature_matrix(shingles, PERMUTATIONS_COUNT)
|
signature = compute_signature_matrix(shingles, PERMUTATIONS_COUNT)
|
||||||
|
|
||||||
|
shingles_sets = [set(x for x in range(len(shingles)) if shingles[x, doc_id]) for doc_id in range(len(docs))]
|
||||||
|
for i, doc1 in enumerate(shingles_sets):
|
||||||
|
t = time.time()
|
||||||
|
for j in range(i + 1, len(docs)):
|
||||||
|
doc2 = shingles_sets[j]
|
||||||
|
d = jaccard_similarity(doc1, doc2)
|
||||||
|
if d >= similarity:
|
||||||
|
print(f"{i} {j} {d:.06f}")
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
ns = parse_args()
|
ns = parse_args()
|
||||||
|
|
Loading…
Reference in New Issue