1
0
mirror of https://gitlab.crans.org/mediatek/med.git synced 2024-11-26 20:47:10 +00:00

Add script to regenerate side identifiers

This commit is contained in:
Yohann D'ANELLO 2020-09-28 10:51:21 +02:00
parent be76bf4857
commit 9ecd876923
3 changed files with 121 additions and 70 deletions

View File

@ -1,5 +1,5 @@
# -*- mode: python; coding: utf-8 -*- # -*- mode: python; coding: utf-8 -*-
# Copyright (C) 2017-2019 by BDE ENS Paris-Saclay # Copyright (C) 2017-2020 by BDE ENS Paris-Saclay
# SPDX-License-Identifier: GPL-3.0-or-later # SPDX-License-Identifier: GPL-3.0-or-later
import json import json
@ -7,6 +7,7 @@ import re
import unicodedata import unicodedata
import urllib.request import urllib.request
from django.db.models import QuerySet
from django.forms import ModelForm from django.forms import ModelForm
from django.utils.translation import gettext_lazy as _ from django.utils.translation import gettext_lazy as _
@ -14,6 +15,60 @@ from .models import Auteur, BD
from .scraper import BedetequeScraper from .scraper import BedetequeScraper
# Leading articles dropped from a title. Each one is tried once, in this
# order, so "DE LA FONTAINE" loses both "DE " and "LA ".
_LEADING_ARTICLES = ('DE ', 'LE ', 'LA ', 'LES ', "L'", 'THE ')


def _fold_to_plain_upper(text, keep=' '):
    """Return ``text`` upper-cased with accents and unwanted characters removed.

    The text is case-folded and NFKD-decomposed, then every character whose
    Unicode category starts with M (marks, i.e. the detached accents),
    P (punctuation), Z (separators) or C (control) is dropped, unless it is
    listed in ``keep``.
    """
    decomposed = unicodedata.normalize('NFKD', text.casefold())
    return ''.join(
        char for char in decomposed
        if char in keep or unicodedata.category(char)[0] not in 'MPZC'
    ).casefold().upper()


def generate_side_identifier(title, authors, subtitle=None):
    """Build the side identifier of a medium, e.g. ``"CAM ETR 02"``.

    The identifier is made of the first three letters of the primary
    author's name, the first three characters of the title once its leading
    article is removed, and the volume number when the subtitle starts with
    one.

    :param title: title of the medium.
    :param authors: any iterable (list, tuple, QuerySet, ...) of objects
        exposing ``name``, ``note`` and ``pk``. It is never modified.
    :param subtitle: optional subtitle; if its first word is numeric (a
        trailing dot is allowed), it is appended, zero-padded to two
        characters.
    :return: the upper-case side identifier.
    :raises IndexError: if ``authors`` is empty.
    """
    # Rank numerically: highest note first, then surname, then primary key.
    # A string key such as str(-note) would compare "-1" < "-2" and rank a
    # lower note ahead of a higher one. sorted() also copies, so the
    # caller's sequence keeps its order.
    ranked_authors = sorted(
        authors,
        key=lambda author: (
            -author.note,
            author.name.split(' ')[-1],
            author.pk,
        ),
    )
    primary_author = ranked_authors[0]

    # The typographic apostrophe is written as an escape so that it
    # survives tools that mangle non-ASCII source text.
    title_normalized = title.upper().replace('\u2019', "'")
    # Keep the apostrophe for now: the elided article "L'" can only be
    # recognised while it is still there.
    title_normalized = _fold_to_plain_upper(title_normalized, keep=" '")
    for article in _LEADING_ARTICLES:
        # A stray leading quote must not hide an article.
        title_normalized = title_normalized.lstrip("'")
        if title_normalized.startswith(article):
            title_normalized = title_normalized[len(article):]
    title_normalized = title_normalized.replace('Œ', 'OE')
    # Drops the spaces and the remaining apostrophes as well.
    title_normalized = re.sub(r'[^A-Z0-9$]', '', title_normalized)

    author_name = primary_author.name.upper()
    # "First Last" -> "Last"; a name containing a comma is kept whole.
    if ',' not in author_name and ' ' in author_name:
        author_name = author_name.split(' ')[-1]
    author_name = _fold_to_plain_upper(author_name)
    author_name = re.sub(r'[^A-Z]', '', author_name)

    side_identifier = "{:.3} {:.3}".format(author_name, title_normalized)

    if subtitle:
        subtitle = subtitle.replace('</span>', '')
        # [^>]* stops at the end of the opening tag; a greedy ".*" would
        # swallow everything up to the last ">" of the subtitle.
        subtitle = re.sub(r'<span[^>]*>', '', subtitle)
        start = subtitle.split(' ')[0].replace('.', '')
        if start.isnumeric():
            side_identifier += " {:0>2}".format(start)

    # Normalize side identifier, in order to remove accents
    return _fold_to_plain_upper(side_identifier)
class MediaAdminForm(ModelForm): class MediaAdminForm(ModelForm):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
@ -188,54 +243,12 @@ class MediaAdminForm(ModelForm):
) )
if self.cleaned_data['authors']: if self.cleaned_data['authors']:
authors = self.cleaned_data['authors'] side_identifier = generate_side_identifier(
old_authors = authors.copy() self.cleaned_data["title"],
self.cleaned_data["authors"],
self.cleaned_data["subtitle"],
)
def sort(author):
return str(-author.note) + "." \
+ str(old_authors.index(author)) \
+ "." + author.name
authors.sort(key=sort)
author_name = self.cleaned_data['authors'][0].name
if ',' not in author_name and ' ' in author_name:
author_name = author_name.split(' ')[-1]
title_normalized = self.cleaned_data['title'].upper()
title_normalized = re.sub(r'^LE ', '', title_normalized)
title_normalized = re.sub(r'^LA ', '', title_normalized)
title_normalized = re.sub(r'^LES ', '', title_normalized)
title_normalized = re.sub(r'^L\'', '', title_normalized)
title_normalized = re.sub(r'^THE ', '', title_normalized)
title_normalized = re.sub(r'Œ', 'OE', title_normalized)
side_identifier = "{:.3} {:.3}".format(
author_name,
title_normalized.replace(' ', ''), )
if self.cleaned_data['subtitle']:
self.cleaned_data['subtitle'] = re.sub(
r'</span>',
'',
self.cleaned_data['subtitle']
)
self.cleaned_data['subtitle'] = re.sub(
r'<span.*>',
'',
self.cleaned_data['subtitle']
)
start = self.cleaned_data['subtitle'].split(' ')[0] \
.replace('.', '')
if start.isnumeric():
side_identifier += " {:0>2}".format(start, )
# Normalize side identifier, in order to remove accents
side_identifier = ''.join(
char
for char in unicodedata.normalize(
'NFKD', side_identifier.casefold())
if all(not unicodedata.category(char).startswith(cat)
for cat in {'M', 'P', 'Z', 'C'}) or char == ' '
).casefold().upper()
self.cleaned_data['side_identifier'] = side_identifier self.cleaned_data['side_identifier'] = side_identifier
return self.cleaned_data return self.cleaned_data

View File

@ -1,10 +1,10 @@
import re
import unicodedata
from argparse import FileType from argparse import FileType
from sys import stdin from sys import stdin
from django.core.management import BaseCommand from django.core.management import BaseCommand
from media.models import Auteur, Roman
from media.forms import generate_side_identifier
from media.models import Roman, Auteur
class Command(BaseCommand): class Command(BaseCommand):
@ -29,27 +29,9 @@ class Command(BaseCommand):
continue continue
title = book[1] title = book[1]
title_normalized = title.upper()
title_normalized = title_normalized.replace('', '\'')
title_normalized = ''.join(
char
for char in unicodedata.normalize(
'NFKD', title_normalized.casefold())
if all(not unicodedata.category(char).startswith(cat)
for cat in {'M', 'P', 'Z', 'C'}) or char == ' '
).casefold().upper()
title_normalized = re.sub(r'^DE ', '', title_normalized)
title_normalized = re.sub(r'^LE ', '', title_normalized)
title_normalized = re.sub(r'^LA ', '', title_normalized)
title_normalized = re.sub(r'^LES ', '', title_normalized)
title_normalized = re.sub(r'^L\'', '', title_normalized)
title_normalized = re.sub(r'^THE ', '', title_normalized)
title_normalized = re.sub(r'Œ', 'OE', title_normalized)
title_normalized = title_normalized.replace(' ', '')
authors = [Auteur.objects.get_or_create(name=n)[0] authors = [Auteur.objects.get_or_create(name=n)[0]
for n in book[0].split(';')] for n in book[0].split(';')]
side_identifier = "{:.3} {:.3}" \ side_identifier = generate_side_identifier(title, authors)
.format(authors[0].name.upper(), title_normalized, )
roman = Roman.objects.create( roman = Roman.objects.create(
title=title, title=title,
side_identifier=side_identifier, side_identifier=side_identifier,

View File

@ -0,0 +1,56 @@
from django.core.management import BaseCommand
from django.db import transaction
from media.forms import generate_side_identifier
from media.models import BD, Manga, Roman
class Command(BaseCommand):
    """Regenerate the side identifiers of all media of one type.

    Each stored side identifier is compared with the one computed by
    ``generate_side_identifier``. When they differ, the stored value is
    replaced, after confirmation unless the non-interactive flag is given.
    """

    # Maps the value of --type to the model whose objects are processed.
    medium_classes = {
        'bd': BD,
        'manga': Manga,
        'roman': Roman,
    }

    def add_arguments(self, parser):
        """Declare the ``--type`` and ``--noninteractivemode`` options."""
        parser.add_argument('--type', '-t',
                            type=str,
                            default='bd',
                            choices=list(self.medium_classes),
                            help="Type of medium where the sides need to be regenerated.")
        parser.add_argument('--noninteractivemode', '-n', action="store_true",
                            help="Disable the interaction mode and replace existing side identifiers.")

    def _confirm_replacement(self, obj, current_side_identifier, generated_side_identifier):
        """Ask on the terminal until 'y' or 'n' is answered; return True for 'y'."""
        answer = ''
        while answer not in ('y', 'n'):
            # [:1] rather than [0]: an empty answer (plain Enter) must ask
            # again instead of raising IndexError.
            answer = input(f"For medium {obj}, current side: {current_side_identifier}, generated side: "
                           f"{generated_side_identifier}, would you like to replace ? [y/n]").strip().lower()[:1]
        return answer == 'y'

    @transaction.atomic
    def handle(self, *args, **options):
        """Recompute every side identifier and save those that changed."""
        medium_class = self.medium_classes[options["type"]]
        interactive_mode = not options["noninteractivemode"]

        replaced = 0
        for obj in medium_class.objects.all():
            # Evaluate the relation once and reuse the result.
            authors = list(obj.authors.all())
            if not authors:
                # Report the medium and skip it: an identifier cannot be
                # generated without an author.
                self.stdout.write(str(obj))
                continue

            current_side_identifier = obj.side_identifier
            # Not every medium type is expected to have a subtitle attribute.
            subtitle = getattr(obj, "subtitle", None)
            generated_side_identifier = generate_side_identifier(obj.title, authors, subtitle)
            if current_side_identifier == generated_side_identifier:
                continue

            if interactive_mode and not self._confirm_replacement(
                    obj, current_side_identifier, generated_side_identifier):
                continue

            self.stdout.write(self.style.WARNING(f"Replace side of {obj} from {current_side_identifier} "
                                                 f"to {generated_side_identifier}..."))
            obj.side_identifier = generated_side_identifier
            obj.save()
            replaced += 1

        if replaced:
            self.stdout.write(self.style.SUCCESS(f"{replaced} side identifiers were replaced."))
        else:
            self.stdout.write(self.style.WARNING("Nothing changed."))