Add Python script that generates a new word and the similarity values

Signed-off-by: Yohann D'ANELLO <ynerant@crans.org>
2022-05-04 17:42:35 +02:00
parent f6cf031a22
commit 3835010510
3 changed files with 51500 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@ -2,4 +2,5 @@

 cemantix-charm
 .ssh
-cemantics.txt
+cemantix.txt
+history/
--- a/lemmes.txt
+++ b/lemmes.txt
--- a/new_word.py
+++ b/new_word.py
@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+
+from datetime import date
+import os.path
+from random import choice
+
+from gensim.models import KeyedVectors
+
+
+# Directory that holds this script; data files are resolved relative to it
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# Path of the model file generated by word2vec, in binary format
+BIN_FILE = os.path.join(
+    BASE_DIR, 'frwiki.skip.size500.win10.neg15.sample1e-5.min15.bin'
+)
+
+def main() -> None:
+    """Pick a random word and write its similarity ranking.
+
+    Loads the word2vec model at ``BIN_FILE``, reads the candidate words
+    from ``lemmes.txt``, chooses one at random, and writes every candidate
+    with its similarity to the chosen word (multiplied by 100, highest
+    first) to ``history/YYYY-MM-DD.txt``.  ``cemantix.txt`` is then
+    re-created as a symbolic link to that file.  All of these paths are
+    located in ``BASE_DIR``.
+
+    Raises:
+        ValueError: If ``lemmes.txt`` yields no word known to the model.
+    """
+    # Open pre-trained dataset
+    wv = KeyedVectors.load_word2vec_format(
+            BIN_FILE,
+            binary=True,
+    )
+
+    # Allowed words: one per line, lines starting with '#' are comments.
+    # Blank lines and words missing from the model are dropped, because
+    # ``wv.similarity`` raises KeyError for out-of-vocabulary words.
+    # The encoding is explicit so the result does not depend on the locale.
+    with open(os.path.join(BASE_DIR, 'lemmes.txt'), encoding='utf-8') as f:
+        stripped = (line.strip() for line in f)
+        words = [w for w in stripped
+                 if w and not w.startswith('#') and w in wv]
+    if not words:
+        raise ValueError('lemmes.txt contains no word known to the model')
+
+    # Choose a word to search
+    word = choice(words)
+
+    # Compute all similarity values
+    similarities = {w: wv.similarity(word, w) for w in words}
+
+    # Order candidates from most to least similar
+    rank = sorted(
+        similarities.items(),
+        key=lambda item: item[1],
+        reverse=True,
+    )
+
+    # exist_ok avoids the check-then-create race of isdir() + mkdir()
+    history_dir = os.path.join(BASE_DIR, 'history')
+    os.makedirs(history_dir, exist_ok=True)
+
+    # Store output
+    target = os.path.join(history_dir, f'{date.today().isoformat()}.txt')
+    with open(target, 'w', encoding='utf-8') as f:
+        f.writelines(f"{w} {100 * d:.02f}\n" for w, d in rank)
+
+    # Re-point the link.  It is created inside BASE_DIR rather than the
+    # current working directory, so the script behaves the same wherever
+    # it is launched from.  lexists() is used instead of isfile() so that
+    # a dangling link left by a deleted history file is also removed.
+    link = os.path.join(BASE_DIR, 'cemantix.txt')
+    if os.path.lexists(link):
+        os.unlink(link)
+    os.symlink(target, link)
+
+
+# Run the generator only when executed as a script, not when imported
+if __name__ == "__main__":
+    main()