Add Python script that generates a new word and the similarity values
Signed-off-by: Yohann D'ANELLO <ynerant@crans.org>
This commit is contained in:
parent
f6cf031a22
commit
3835010510
3
.gitignore
vendored
3
.gitignore
vendored
@ -2,4 +2,5 @@
|
|||||||
|
|
||||||
cemantix-charm
|
cemantix-charm
|
||||||
.ssh
|
.ssh
|
||||||
cemantics.txt
|
cemantix.txt
|
||||||
|
history/
|
||||||
|
51443
lemmes.txt
Normal file
51443
lemmes.txt
Normal file
File diff suppressed because it is too large
Load Diff
55
new_word.py
Executable file
55
new_word.py
Executable file
@ -0,0 +1,55 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
from datetime import date
|
||||||
|
import os.path
|
||||||
|
from random import choice
|
||||||
|
|
||||||
|
from gensim.models import KeyedVectors
|
||||||
|
|
||||||
|
|
||||||
|
# Absolute path of the directory that holds this script; the paths below are
# built relative to it.
BASE_DIR = os.path.split(os.path.abspath(__file__))[0]

# Pre-trained model produced by word2vec, stored in its binary format
_MODEL_FILENAME = 'frwiki.skip.size500.win10.neg15.sample1e-5.min15.bin'
BIN_FILE = os.path.join(BASE_DIR, _MODEL_FILENAME)
|
||||||
|
|
||||||
|
def main() -> None:
    """Draw a new secret word and store its similarity ranking.

    The word2vec model at ``BIN_FILE`` is loaded, a word is picked at random
    from ``lemmes.txt`` and its similarity with every word of that list is
    computed.  The ranking, sorted by decreasing similarity, is written to
    ``history/YYYY-MM-DD.txt`` (one ``<word> <score>`` pair per line, the
    score being the similarity multiplied by 100 with two decimals), and the
    ``cemantix.txt`` symlink is pointed at that file.

    Every path, including the symlink itself, is resolved against
    ``BASE_DIR`` so the result does not depend on the current working
    directory.
    """
    # Open pre-trained dataset
    wv = KeyedVectors.load_word2vec_format(BIN_FILE, binary=True)

    # Allowed words: one per line; blank lines and lines starting with '#'
    # are skipped.  The encoding is explicit so accented lemmas are decoded
    # the same way whatever the locale (gensim decodes the model vocabulary
    # as UTF-8 by default).
    with open(os.path.join(BASE_DIR, 'lemmes.txt'), encoding='utf-8') as f:
        words = [
            w for w in (line.rstrip('\n') for line in f)
            if w and not w.startswith('#')
        ]

    # Choose a word to search
    word = choice(words)

    # Compute all similarity values
    # NOTE(review): assumes every lemma is present in the model vocabulary
    # -- confirm against how lemmes.txt was generated.
    similarities = {w: wv.similarity(word, w) for w in words}

    # Rank by decreasing similarity; ties keep their order from lemmes.txt
    rank = sorted(
        similarities.items(),
        key=lambda item: item[1],
        reverse=True,
    )

    # exist_ok avoids the check-then-create race of isdir() + mkdir()
    history_dir = os.path.join(BASE_DIR, 'history')
    os.makedirs(history_dir, exist_ok=True)

    # Store output
    today = date.today()
    output_path = os.path.join(history_dir, f'{today.isoformat()}.txt')
    with open(output_path, 'w', encoding='utf-8') as f:
        for w, d in rank:
            f.write(f"{w} {100 * d:.02f}\n")

    # Refresh the symlink next to the script.  islink() is checked as well
    # because isfile() is False for a dangling link, which would otherwise
    # be left in place and make os.symlink() fail.
    link_path = os.path.join(BASE_DIR, 'cemantix.txt')
    if os.path.islink(link_path) or os.path.isfile(link_path):
        os.unlink(link_path)
    os.symlink(output_path, link_path)
|
||||||
|
|
||||||
|
|
||||||
|
# Only generate a new word when executed as a script, not when imported.
# main() returns None, so the process still exits with status 0.
if __name__ == '__main__':
    raise SystemExit(main())
|
Loading…
Reference in New Issue
Block a user