Add Python script that generates a new word and the similarity values
Signed-off-by: Yohann D'ANELLO <ynerant@crans.org>
This commit is contained in:
		
							
								
								
									
										3
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							@@ -2,4 +2,5 @@
 | 
			
		||||
 | 
			
		||||
cemantix-charm
 | 
			
		||||
.ssh
 | 
			
		||||
cemantics.txt
 | 
			
		||||
cemantix.txt
 | 
			
		||||
history/
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										51443
									
								
								lemmes.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										51443
									
								
								lemmes.txt
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										55
									
								
								new_word.py
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										55
									
								
								new_word.py
									
									
									
									
									
										Executable file
									
								
							@@ -0,0 +1,55 @@
 | 
			
		||||
#!/usr/bin/env python3
 | 
			
		||||
 | 
			
		||||
from datetime import date
 | 
			
		||||
import os.path
 | 
			
		||||
from random import choice
 | 
			
		||||
 | 
			
		||||
from gensim.models import KeyedVectors
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Directory holding this script; every data file is resolved relative to it.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Path to the word2vec output (binary format) expected next to this script.
BIN_FILE = os.path.join(
    BASE_DIR, 'frwiki.skip.size500.win10.neg15.sample1e-5.min15.bin'
)
 | 
			
		||||
 | 
			
		||||
def main() -> None:
    """Pick a random word and store its similarity ranking.

    Steps:

    1. Load the pre-trained word2vec vectors from ``BIN_FILE``.
    2. Read the allowed words from ``lemmes.txt`` (one per line; lines
       starting with ``#`` and blank lines are ignored).
    3. Choose one word at random and compute its similarity with every
       allowed word.
    4. Write the ranking, most similar first, to
       ``history/YYYY-MM-DD.txt`` as ``<word> <similarity * 100>`` lines.
    5. Re-point the ``cemantix.txt`` symlink, located next to this script,
       at the file written in step 4.

    All paths are resolved relative to ``BASE_DIR`` so the script behaves
    the same whatever the current working directory is.
    """
    # Open pre-trained dataset
    wv = KeyedVectors.load_word2vec_format(
        BIN_FILE,
        binary=True,
    )

    # Allowed words. The encoding is explicit so accented words are read the
    # same way on every locale (assumes lemmes.txt is UTF-8 -- TODO confirm).
    with open(os.path.join(BASE_DIR, 'lemmes.txt'), encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        # Skip comments and blank lines: an empty string is not a word and
        # must not be handed to the similarity lookup.
        words = [w for w in stripped if w and not w.startswith('#')]

    # Choose a word to search
    word = choice(words)

    # Compute all similarity values
    similarities = {w: wv.similarity(word, w) for w in words}

    # Compute scores by order (highest similarity first; the sort is stable,
    # so ties keep the order of lemmes.txt)
    rank = sorted(
        similarities.items(),
        key=lambda item: item[1],
        reverse=True,
    )

    # exist_ok avoids the check-then-create race of isdir() + mkdir()
    history_dir = os.path.join(BASE_DIR, 'history')
    os.makedirs(history_dir, exist_ok=True)

    # Store output
    today = date.today()
    filename = f'{today.isoformat()}.txt'
    output_path = os.path.join(history_dir, filename)
    with open(output_path, 'w', encoding='utf-8') as f:
        for w, d in rank:
            f.write(f"{w} {100 * d:.02f}\n")

    # Refresh the symlink next to the script. lexists() is used instead of
    # isfile() so that a dangling link left by a deleted history file is
    # removed too; otherwise os.symlink() would fail with FileExistsError.
    link_path = os.path.join(BASE_DIR, 'cemantix.txt')
    if os.path.lexists(link_path):
        os.unlink(link_path)
    # The link itself is created under BASE_DIR, matching the path checked
    # above, rather than in the current working directory.
    os.symlink(output_path, link_path)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if __name__ == '__main__':
 | 
			
		||||
    main()
 | 
			
		||||
		Reference in New Issue
	
	Block a user