Add Python script that generates a new word and the similarity values

Signed-off-by: Yohann D'ANELLO <ynerant@crans.org>
This commit is contained in:
Yohann D'ANELLO 2022-05-04 17:42:35 +02:00
parent f6cf031a22
commit 3835010510
Signed by: ynerant
GPG Key ID: 3A75C55819C8CF85
3 changed files with 51500 additions and 1 deletions

3
.gitignore vendored
View File

@ -2,4 +2,5 @@
cemantix-charm
.ssh
cemantics.txt
cemantix.txt
history/

51443
lemmes.txt Normal file

File diff suppressed because it is too large Load Diff

55
new_word.py Executable file
View File

@ -0,0 +1,55 @@
#!/usr/bin/env python3
from datetime import date
import os.path
from random import choice
from gensim.models import KeyedVectors
# Directory that holds this script; every data file is resolved against it.
BASE_DIR = os.path.split(os.path.abspath(__file__))[0]

# Path to the pre-trained model generated by word2vec, in binary format.
BIN_FILE = os.path.join(BASE_DIR, 'frwiki.skip.size500.win10.neg15.sample1e-5.min15.bin')
def main() -> None:
    """Pick a new word to guess and store its similarity ranking.

    Loads the word2vec model at ``BIN_FILE``, reads the allowed words from
    ``lemmes.txt``, picks one of them at random and computes its similarity
    with every allowed word. The ranking, most similar first, is written to
    ``history/YYYY-MM-DD.txt`` (one ``<word> <similarity * 100>`` line per
    word), and ``cemantix.txt`` is replaced by a symlink to that file.

    All paths are resolved against ``BASE_DIR``, so the result does not
    depend on the current working directory.
    """
    history_dir = os.path.join(BASE_DIR, 'history')
    link_path = os.path.join(BASE_DIR, 'cemantix.txt')

    # Open pre-trained dataset
    wv = KeyedVectors.load_word2vec_format(
        BIN_FILE,
        binary=True,
    )

    # Allowed words: one per line. Lines starting with '#' are comments, and
    # blank lines are skipped so the empty string is never used as a word.
    with open(os.path.join(BASE_DIR, 'lemmes.txt'), encoding='utf-8') as f:
        stripped = (line.replace('\n', '') for line in f)
        words = [w for w in stripped if w and not w.startswith('#')]

    # Choose a word to search
    word = choice(words)

    # Compute all similarity values
    similarities = {w: wv.similarity(word, w) for w in words}

    # Order by decreasing similarity (the chosen word itself comes first)
    rank = sorted(similarities.items(), key=lambda item: -item[1])

    # exist_ok avoids the check-then-create race of isdir() + mkdir()
    os.makedirs(history_dir, exist_ok=True)

    # Store output
    filename = f'{date.today().isoformat()}.txt'
    target = os.path.join(history_dir, filename)
    with open(target, 'w', encoding='utf-8') as f:
        f.writelines(f"{w} {100 * d:.02f}\n" for w, d in rank)

    # lexists() also reports a dangling symlink, which isfile() would miss
    # (it follows the link) and which would make os.symlink() fail.
    if os.path.lexists(link_path):
        os.unlink(link_path)
    # Create the link next to the script, where it was just looked up,
    # rather than in whatever directory the script was started from.
    os.symlink(target, link_path)
# Only generate a new word when executed as a script, not when imported.
if __name__ == '__main__':
    main()