1
0
mirror of https://gitlab.crans.org/mediatek/med.git synced 2024-11-30 06:53:03 +00:00

Normalize before removing wrong letters

This commit is contained in:
Yohann D'ANELLO 2020-10-02 16:59:58 +02:00
parent 8fa724e848
commit 3edc3ffa02

View File

@@ -31,7 +31,6 @@ def generate_side_identifier(title, authors, subtitle=None):
title_normalized = re.sub(r'^THE ', '', title_normalized) title_normalized = re.sub(r'^THE ', '', title_normalized)
title_normalized = re.sub(r'Œ', 'OE', title_normalized) title_normalized = re.sub(r'Œ', 'OE', title_normalized)
title_normalized = title_normalized.replace(' ', '') title_normalized = title_normalized.replace(' ', '')
title_normalized = re.sub("[^A-Z0-9$]", "", title_normalized)
title_normalized = ''.join( title_normalized = ''.join(
char char
for char in unicodedata.normalize( for char in unicodedata.normalize(
@@ -39,6 +38,7 @@ def generate_side_identifier(title, authors, subtitle=None):
if all(not unicodedata.category(char).startswith(cat) if all(not unicodedata.category(char).startswith(cat)
for cat in {'M', 'P', 'Z', 'C'}) or char == ' ' for cat in {'M', 'P', 'Z', 'C'}) or char == ' '
).casefold().upper() ).casefold().upper()
title_normalized = re.sub("[^A-Z0-9$]", "", title_normalized)
authors = authors.copy() authors = authors.copy()
def sort(author): def sort(author):