Add script to regenerate side identifiers

2020-09-28 10:51:21 +02:00 · 2020-09-28 10:51:21 +02:00 · 9ecd876923
parent be76bf4857
commit 9ecd876923
3 changed files with 121 additions and 70 deletions
--- a/media/forms.py
+++ b/media/forms.py
@ -1,5 +1,5 @@
 # -*- mode: python; coding: utf-8 -*-
-# Copyright (C) 2017-2019 by BDE ENS Paris-Saclay
+# Copyright (C) 2017-2020 by BDE ENS Paris-Saclay
 # SPDX-License-Identifier: GPL-3.0-or-later

 import json
@ -7,6 +7,7 @@ import re
 import unicodedata
 import urllib.request

+from django.db.models import QuerySet
 from django.forms import ModelForm
 from django.utils.translation import gettext_lazy as _

@ -14,6 +15,60 @@ from .models import Auteur, BD
 from .scraper import BedetequeScraper


+def _strip_marks(text, keep=' '):
+    """Return ``text`` upper-cased without accents, punctuation, separators or control characters.
+
+    Characters listed in ``keep`` are preserved even when they belong to one of
+    the dropped Unicode categories (M, P, Z, C).
+    """
+    return ''.join(
+        char for char in unicodedata.normalize('NFKD', text.casefold())
+        if char in keep or unicodedata.category(char)[0] not in 'MPZC'
+    ).casefold().upper()
+
+
+def generate_side_identifier(title, authors, subtitle=None):
+    """
+    Build the side identifier of a medium: ``AUT TIT`` or ``AUT TIT NN``.
+
+    ``AUT`` is the start of the primary author's last name, ``TIT`` the start
+    of the title once its leading article is dropped, and ``NN`` the number
+    that opens the subtitle (if any), padded to two digits.
+
+    :param title: title of the medium.
+    :param authors: non-empty iterable (list, tuple, QuerySet...) of objects
+        exposing ``note``, ``name`` and ``pk``; it is left unmodified.
+    :param subtitle: optional subtitle, possibly wrapped in ``<span>`` tags.
+    :return: the side identifier, in upper case and without accents.
+    :raises ValueError: if ``authors`` is empty.
+    """
+    # list() evaluates a QuerySet and copies a list, so the caller's
+    # collection is never reordered.
+    authors = list(authors)
+    if not authors:
+        raise ValueError("At least one author is required to build a side identifier.")
+
+    # Apostrophes must survive this first pass, otherwise the "L'" article
+    # could never be recognised below.
+    title_normalized = _strip_marks(title.upper().replace('’', '\''), keep=" '")
+    for article in (r'^DE ', r'^LE ', r'^LA ', r'^LES ', r'^L\'', r'^THE '):
+        title_normalized = re.sub(article, '', title_normalized)
+    title_normalized = title_normalized.replace('Œ', 'OE')
+    # Also removes the remaining spaces and apostrophes.
+    title_normalized = re.sub("[^A-Z0-9$]", "", title_normalized)
+
+    # Highest note first; ties are broken on the last name, then on the
+    # primary key. The note is compared as a number, not as a string.
+    primary_author = min(
+        authors,
+        key=lambda author: (-author.note, author.name.split(" ")[-1], author.pk),
+    )
+    author_name = primary_author.name.upper()
+    # "First Last" keeps only the last word; "Last, First" is kept as is.
+    if ',' not in author_name and ' ' in author_name:
+        author_name = author_name.split(' ')[-1]
+    author_name = re.sub("[^A-Z]", "", _strip_marks(author_name))
+
+    side_identifier = "{:.3} {:.3}".format(author_name, title_normalized)
+    if subtitle:
+        # Drop opening and closing span tags only, never the text around them.
+        subtitle = re.sub(r'</?span[^>]*>', '', subtitle)
+        start = subtitle.split(' ')[0].replace('.', '')
+        if start.isnumeric():
+            side_identifier += " {:0>2}".format(start)
+
+    # Final pass over the whole identifier: the subtitle number is the only
+    # part that has not been normalised yet.
+    return _strip_marks(side_identifier)
+
+
 class MediaAdminForm(ModelForm):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
@ -188,54 +243,12 @@ class MediaAdminForm(ModelForm):
                    )

                if self.cleaned_data['authors']:
-                    authors = self.cleaned_data['authors']
-                    old_authors = authors.copy()
-
-                    def sort(author):
-                        return str(-author.note) + "." \
-                            + str(old_authors.index(author)) \
-                            + "." + author.name
-
-                    authors.sort(key=sort)
-                    author_name = self.cleaned_data['authors'][0].name
-                    if ',' not in author_name and ' ' in author_name:
-                        author_name = author_name.split(' ')[-1]
-                    title_normalized = self.cleaned_data['title'].upper()
-                    title_normalized = re.sub(r'^LE ', '', title_normalized)
-                    title_normalized = re.sub(r'^LA ', '', title_normalized)
-                    title_normalized = re.sub(r'^LES ', '', title_normalized)
-                    title_normalized = re.sub(r'^L\'', '', title_normalized)
-                    title_normalized = re.sub(r'^THE ', '', title_normalized)
-                    title_normalized = re.sub(r'Œ', 'OE', title_normalized)
-                    side_identifier = "{:.3} {:.3}".format(
-                        author_name,
-                        title_normalized.replace(' ', ''), )
-
-                    if self.cleaned_data['subtitle']:
-                        self.cleaned_data['subtitle'] = re.sub(
-                            r'</span>',
-                            '',
-                            self.cleaned_data['subtitle']
+                    side_identifier = generate_side_identifier(
+                        self.cleaned_data["title"],
+                        self.cleaned_data["authors"],
+                        self.cleaned_data["subtitle"],
                    )
-                        self.cleaned_data['subtitle'] = re.sub(
-                            r'<span.*>',
-                            '',
-                            self.cleaned_data['subtitle']
-                        )
-                        start = self.cleaned_data['subtitle'].split(' ')[0] \
-                                    .replace('.', '')

-                        if start.isnumeric():
-                            side_identifier += " {:0>2}".format(start, )
-
-                    # Normalize side identifier, in order to remove accents
-                    side_identifier = ''.join(
-                        char
-                        for char in unicodedata.normalize(
-                            'NFKD', side_identifier.casefold())
-                        if all(not unicodedata.category(char).startswith(cat)
-                               for cat in {'M', 'P', 'Z', 'C'}) or char == ' '
-                    ).casefold().upper()
                    self.cleaned_data['side_identifier'] = side_identifier

        return self.cleaned_data
--- a/media/management/commands/import_no_isbn_roman.py
+++ b/media/management/commands/import_no_isbn_roman.py
@ -1,10 +1,10 @@
-import re
-import unicodedata
 from argparse import FileType
 from sys import stdin

 from django.core.management import BaseCommand
-from media.models import Auteur, Roman
+
+from media.forms import generate_side_identifier
+from media.models import Roman, Auteur


 class Command(BaseCommand):
@ -29,27 +29,9 @@ class Command(BaseCommand):
                continue

            title = book[1]
-            title_normalized = title.upper()
-            title_normalized = title_normalized.replace('’', '\'')
-            title_normalized = ''.join(
-                char
-                for char in unicodedata.normalize(
-                    'NFKD', title_normalized.casefold())
-                if all(not unicodedata.category(char).startswith(cat)
-                       for cat in {'M', 'P', 'Z', 'C'}) or char == ' '
-            ).casefold().upper()
-            title_normalized = re.sub(r'^DE ', '', title_normalized)
-            title_normalized = re.sub(r'^LE ', '', title_normalized)
-            title_normalized = re.sub(r'^LA ', '', title_normalized)
-            title_normalized = re.sub(r'^LES ', '', title_normalized)
-            title_normalized = re.sub(r'^L\'', '', title_normalized)
-            title_normalized = re.sub(r'^THE ', '', title_normalized)
-            title_normalized = re.sub(r'Œ', 'OE', title_normalized)
-            title_normalized = title_normalized.replace(' ', '')
            authors = [Auteur.objects.get_or_create(name=n)[0]
                       for n in book[0].split(';')]
-            side_identifier = "{:.3} {:.3}" \
-                .format(authors[0].name.upper(), title_normalized, )
+            side_identifier = generate_side_identifier(title, authors)
            roman = Roman.objects.create(
                title=title,
                side_identifier=side_identifier,
--- a/media/management/commands/regenerate_side_identifiers.py
+++ b/media/management/commands/regenerate_side_identifiers.py
@ -0,0 +1,56 @@
+from django.core.management import BaseCommand
+from django.db import transaction
+
+from media.forms import generate_side_identifier
+from media.models import BD, Manga, Roman
+
+
+class Command(BaseCommand):
+    """Regenerate the side identifiers of every medium of one type."""
+
+    help = "Regenerate the side identifiers of the media of a given type."
+
+    # Maps the value of --type to the model whose objects are processed.
+    MEDIUM_CLASSES = {'bd': BD, 'manga': Manga, 'roman': Roman}
+
+    def add_arguments(self, parser):
+        """Declare the ``--type`` and ``--noninteractivemode`` options."""
+        parser.add_argument('--type', '-t',
+                            type=str,
+                            default='bd',
+                            choices=list(self.MEDIUM_CLASSES),
+                            help="Type of medium where the sides need to be regenerated.")
+        parser.add_argument('--noninteractivemode', '-n', action="store_true",
+                            help="Disable the interaction mode and replace existing side identifiers.")
+
+    @transaction.atomic
+    def handle(self, *args, **options):
+        """
+        Compare stored and generated side identifiers and save the accepted replacements.
+
+        In interactive mode (the default) every replacement is confirmed on the
+        terminal; with ``--noninteractivemode`` all of them are applied.
+        The whole run is wrapped in ``transaction.atomic``.
+        """
+        medium_class = self.MEDIUM_CLASSES[options["type"]]
+        interactive_mode = not options["noninteractivemode"]
+        replaced = 0
+
+        for obj in medium_class.objects.all():
+            # Evaluate the relation once and reuse the result below.
+            authors = list(obj.authors.all())
+            if not authors:
+                # No identifier can be derived without an author: report the
+                # medium and move on instead of crashing in the generator.
+                self.stdout.write(self.style.WARNING(f"{obj} has no author, skipping."))
+                continue
+
+            current_side_identifier = obj.side_identifier
+            # Not every medium type is guaranteed to expose a subtitle.
+            generated_side_identifier = generate_side_identifier(
+                obj.title, authors, getattr(obj, "subtitle", None))
+            if current_side_identifier == generated_side_identifier:
+                continue
+
+            answer = 'y'
+            if interactive_mode:
+                answer = ''
+                while answer not in ('y', 'n'):
+                    # [:1] rather than [0]: an empty answer prompts again
+                    # instead of raising IndexError.
+                    answer = input(f"For medium {obj}, current side: {current_side_identifier}, generated side: "
+                                   f"{generated_side_identifier}, would you like to replace ? [y/n]"
+                                   ).strip().lower()[:1]
+            if answer == 'y':
+                self.stdout.write(self.style.WARNING(f"Replace side of {obj} from {current_side_identifier} "
+                                                     f"to {generated_side_identifier}..."))
+                obj.side_identifier = generated_side_identifier
+                obj.save()
+                replaced += 1
+
+        if replaced:
+            self.stdout.write(self.style.SUCCESS(f"{replaced} side identifiers were replaced."))
+        else:
+            self.stdout.write(self.style.WARNING("Nothing changed."))