#!/usr/bin/env python3
"""Generate the daily word-similarity ranking.

A word is drawn at random from ``lemmes.txt``; every allowed word is then
scored by its word2vec similarity to that word.  The ranking is written to
``history/YYYY-MM-DD.txt`` and ``cemantix.txt`` is re-pointed (as a symlink)
at the file that was just written.  All paths are relative to the directory
containing this script.
"""
from datetime import date
import os.path
from random import choice

from gensim.models import KeyedVectors

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Pre-trained word2vec vectors, in binary format
BIN_FILE = os.path.join(
    BASE_DIR,
    'frwiki.skip.size500.win10.neg15.sample1e-5.min15.bin',
)


def _load_words(path: str) -> list:
    """Return the allowed words listed in *path*, one per line.

    Lines starting with ``#`` are comments and are skipped, as are empty
    lines (an empty string is not a word and cannot be scored).
    """
    # Explicit UTF-8: the words must match the model vocabulary, which gensim
    # decodes as UTF-8 by default, regardless of the current locale.
    with open(path, encoding='utf-8') as f:
        stripped = (line.rstrip('\n') for line in f)
        return [w for w in stripped if w and not w.startswith('#')]


def _write_ranking(path: str, rank: list) -> None:
    """Write ``word score`` lines to *path*, score being 100 * similarity."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(f"{w} {100 * d:.02f}\n" for w, d in rank)


def _repoint_symlink(target: str, link: str) -> None:
    """Make *link* a symlink to *target*, replacing a previous file or link."""
    # os.path.isfile() follows symlinks and so reports False for a dangling
    # link; check islink() as well, otherwise os.symlink() below would fail
    # with FileExistsError.
    if os.path.islink(link) or os.path.isfile(link):
        os.unlink(link)
    os.symlink(target, link)


def main() -> None:
    """Pick today's word, rank all allowed words against it, publish it."""
    # Open pre-trained dataset
    wv = KeyedVectors.load_word2vec_format(
        BIN_FILE,
        binary=True,
    )

    # Allowed words
    words = _load_words(os.path.join(BASE_DIR, 'lemmes.txt'))

    # Choose a word to search
    word = choice(words)

    # Compute all similarity values
    similarities = {w: wv.similarity(word, w) for w in words}

    # Order words from most to least similar
    rank = sorted(
        similarities.items(),
        key=lambda item: item[1],
        reverse=True,
    )

    history_dir = os.path.join(BASE_DIR, 'history')
    os.makedirs(history_dir, exist_ok=True)

    # Store output in a file named after today's date (YYYY-MM-DD.txt)
    output = os.path.join(history_dir, f'{date.today().isoformat()}.txt')
    _write_ranking(output, rank)

    # Publish: the link lives next to this script, not in the current
    # working directory, so it is the same path that gets checked/removed.
    _repoint_symlink(output, os.path.join(BASE_DIR, 'cemantix.txt'))


if __name__ == '__main__':
    main()