med/media/forms.py

# -*- mode: python; coding: utf-8 -*-
# Copyright (C) 2017-2021 by BDE ENS Paris-Saclay
# SPDX-License-Identifier: GPL-3.0-or-later

import json
import os
import re
import unicodedata
from urllib.error import HTTPError
import urllib.request

from django.core.exceptions import ValidationError
from django.db.models import QuerySet
from django.forms import ModelForm
from django.utils.translation import gettext_lazy as _

from .models import Author, Comic
from .scraper import BedetequeScraper


# Leading articles dropped from titles; each one is tried once, in this order.
_LEADING_ARTICLES = (
    'DE ', 'DES ', 'LE ', 'LA ', 'LES ', 'L\'', 'UN ', 'UNE ', 'THE ',
)


def _strip_marks(text):
    """
    Upper-case ``text`` and drop its accents and punctuation.

    After an NFKD decomposition, every character whose Unicode category is
    a mark (M), a punctuation (P), a separator (Z) or in the "other" group
    (C) is removed, except the plain space which is kept.
    """
    decomposed = unicodedata.normalize('NFKD', text.casefold())
    return ''.join(
        char for char in decomposed
        if char == ' ' or unicodedata.category(char)[0] not in 'MPZC'
    ).casefold().upper()


def generate_side_identifier(title, authors, subtitle=None):
    """
    Build the side identifier of a medium.

    The result looks like ``"AAA TTT"`` or ``"AAA TTT NN"``:

    * ``AAA``: up to three letters of the primary author's name (its last
      word when the name has spaces and no comma);
    * ``TTT``: up to three characters of the title once the leading
      article, accents, spaces and punctuation are removed;
    * ``NN``: the number starting the subtitle (an optional leading ``R``
      is ignored), padded with zeros to two characters. It is omitted when
      the subtitle does not start with a number.

    :param title: title of the medium
    :param authors: iterable (list, QuerySet, tuple...) of objects with the
        ``name``, ``note`` and ``pk`` attributes; it is not modified
    :param subtitle: optional subtitle, possibly wrapped in ``<span>`` tags
    :return the side identifier, in upper case and without accents
    :raises IndexError: if ``authors`` is empty
    """
    normalized_title = title.upper().replace('’', '\'')
    for article in _LEADING_ARTICLES:
        if normalized_title.startswith(article):
            normalized_title = normalized_title[len(article):]
    normalized_title = normalized_title.replace('Œ', 'OE').replace(' ', '')
    normalized_title = _strip_marks(normalized_title)
    normalized_title = re.sub("[^A-Z0-9$]", "", normalized_title)

    def priority(author):
        # NOTE(review): the key is a string, so notes are compared
        # lexicographically with their sign ("-0...05" sorts before
        # "-0...10", and both sort before a note of 0). Kept unchanged
        # because it decides which author names the identifier - confirm
        # this is the intended ranking.
        return "{:042d}".format(-author.note) \
            + author.name.split(" ")[-1] \
            + ".{:042d}".format(author.pk)

    # Work on a private list: accepts any iterable and leaves the caller's
    # collection untouched.
    primary_author = sorted(list(authors), key=priority)[0]
    author_name = primary_author.name.upper()
    if ',' not in author_name and ' ' in author_name:
        author_name = author_name.split(' ')[-1]
    author_name = re.sub("[^A-Z]", "", _strip_marks(author_name))

    side_identifier = "{:.3} {:.3}".format(author_name, normalized_title)

    if subtitle:
        subtitle = subtitle.replace('</span>', '')
        # "[^>]*" stops at the end of the opening tag; a greedy ".*" would
        # swallow the text up to the last ">" of the subtitle.
        subtitle = re.sub(r'<span[^>]*>', '', subtitle)
        start = subtitle.split(' ')[0].replace('.', '')
        start = re.sub("^R?", "", start)
        if start.isnumeric():
            side_identifier += " {:0>2}".format(start)

    # Normalize the whole identifier once more to remove remaining accents
    return _strip_marks(side_identifier)


class MediaAdminForm(ModelForm):
    """
    Form for media that can fill its fields from an ISBN.

    When the submitted data contains ``_isbn`` or ``_isbn_addanother``,
    ``clean`` asks ISBNdb, Bedeteque, Google Books and Open Library in turn
    and copies the first answer into ``cleaned_data``.
    """

    def __init__(self, *args, **kwargs):
        """
        Attach the ISBN and side identifier widget templates.

        For a saved instance with at least one author, the identifier
        computed by ``generate_side_identifier`` is exposed to the template
        through the ``data-generated-side-identifier`` attribute.
        """
        super().__init__(*args, **kwargs)
        isbn_field = self.fields.get('isbn')
        if isbn_field:
            isbn_field.widget.template_name = "media/isbn_button.html"
            isbn_field.widget.attrs.update({'autofocus': 'autofocus'})

        side_identifier_field = self.fields.get('side_identifier')
        if side_identifier_field and self.instance and self.instance.pk:
            instance = self.instance
            authors = list(instance.authors.all())
            # Without author there is no primary author to build the
            # identifier from, so the default widget is kept.
            if authors:
                subtitle = getattr(instance, "subtitle", None)
                side_identifier_field.widget.attrs.update(
                    {'data-generated-side-identifier':
                     generate_side_identifier(
                         instance.title, authors, subtitle)})
                side_identifier_field.widget.template_name =\
                    "media/generate_side_identifier.html"

    @staticmethod
    def _fetch_json(request):
        """
        Fetch an URL (or ``urllib.request.Request``) and decode its JSON body
        :return the decoded document, or None if the download or the
        decoding failed
        """
        try:
            with urllib.request.urlopen(request) as response:
                return json.loads(response.read().decode())
        except (OSError, ValueError):
            # HTTPError and URLError derive from OSError; JSON and UTF-8
            # decoding errors derive from ValueError.
            return None

    def _add_authors(self, names):
        """
        Append the authors named ``names`` to ``cleaned_data['authors']``,
        creating the ones that are not in the database yet
        """
        authors = list(self.cleaned_data.get('authors') or [])
        authors.extend(
            Author.objects.get_or_create(name=name)[0] for name in names)
        self.cleaned_data['authors'] = authors

    @staticmethod
    def _parse_openlibrary_date(raw):
        """
        Convert an Open Library publication date to ``YYYY-MM-DD``

        Handled layouts: ``2009``, ``March 2009``, ``Mar 3, 2009`` and
        ``March 3, 2009``; a missing month or day becomes ``01``.
        :return the converted date, or None for any other layout
        """
        # "Mai" is kept because the previous month table contained it.
        month_to_number = dict(
            Jan="01", Feb="02", Mar="03", Apr="04", May="05", Mai="05",
            Jun="06", Jul="07", Aug="08", Sep="09", Oct="10", Nov="11",
            Dec="12",
        )
        parts = raw.replace(',', '').split()
        if len(parts) == 1:
            date = parts[0]
            # "2009" -> "2009-01-01", "2009-03" -> "2009-03-01"
            while len(date) in (4, 7):
                date += "-01"
            return date
        if len(parts) not in (2, 3):
            return None

        month = month_to_number.get(parts[0][:3].capitalize())
        day = parts[1] if len(parts) == 3 else "1"
        year = parts[-1]
        if month is None or not day.isdigit() or not year.isdigit():
            return None
        return "{}-{}-{:0>2}".format(year, month, day)

    def download_data_isbndb(self, isbn):
        """
        Download data from ISBNdb
        :return True if success
        """
        # NOTE(review): the API key is sent both in the query string and in
        # the Authorization header; the query string copy may end up in
        # logs - confirm whether it is still required.
        api_url = "https://api2.isbndb.com/book/" + str(isbn)\
                  + "?Authorization=" + os.getenv("ISBNDB_KEY", "")
        req = urllib.request.Request(api_url)
        req.add_header("Authorization", os.getenv("ISBNDB_KEY", ""))

        payload = self._fetch_json(req)
        book = payload.get("book") if isinstance(payload, dict) else None
        if not isinstance(book, dict):
            return False

        publish_date = str(book.get("date_published") or "1970-01-01")[:10]
        # Complete partial dates: "2009" and "2009-03" lack month and/or day
        while len(publish_date) in (4, 7):
            publish_date += "-01"

        self.cleaned_data["title"] = book.get("title") or ""
        self.cleaned_data["publish_date"] = publish_date
        self.cleaned_data["number_of_pages"] = book.get("pages") or 0
        self.cleaned_data["authors"] = [
            Author.objects.get_or_create(name=author_name)[0]
            for author_name in book.get("authors") or []
        ]
        self.cleaned_data["external_url"] = book.get("image") or ""
        return True

    def download_data_bedeteque(self, isbn):
        """
        Download data from bedeteque
        :return True if success
        """
        scraper = BedetequeScraper()
        results = scraper.search_by_isbn(isbn)
        if not results:
            return False
        # If results, then take the first one
        self.cleaned_data.update(scraper.scrap_comic_info(results[0]))
        return True

    def download_data_google(self, isbn):
        """
        Download data from google books
        :return True if success
        """
        api_url = "https://www.googleapis.com/books/v1/volumes?q=ISBN:{}"\
            .format(isbn)
        data = self._fetch_json(api_url)
        if not isinstance(data, dict) or not data.get('totalItems'):
            return False

        # Only keep a volume that really carries the requested ISBN
        for item in data.get('items', []):
            identifiers = item.get("volumeInfo", {})\
                .get("industryIdentifiers", [])
            if any(identifier.get("identifier") == isbn
                   for identifier in identifiers):
                self.parse_data_google(item)
                return True
        return False

    def parse_data_google(self, data):
        """
        Copy a Google Books volume into ``cleaned_data``
        :param data: one entry of the ``items`` list of the API answer
        """
        info = data['volumeInfo']
        if 'canonicalVolumeLink' in info:
            self.cleaned_data['external_url'] = info['canonicalVolumeLink']
        if 'title' in info:
            self.cleaned_data['title'] = info['title']
        # The subtitle is a key of volumeInfo, not of the volume itself
        if 'subtitle' in info:
            self.cleaned_data['subtitle'] = info['subtitle']

        if 'pageCount' in info:
            self.cleaned_data['number_of_pages'] = info['pageCount']
        elif not self.cleaned_data.get('number_of_pages'):
            self.cleaned_data['number_of_pages'] = 0

        published = info.get('publishedDate')
        if published:
            if "-" not in published:
                published += "-01-01"
            elif len(published) == 7:
                published += "-01"
            self.cleaned_data['publish_date'] = published[:10]

        self._add_authors(info.get('authors', []))

    def download_data_openlibrary(self, isbn):
        """
        Download data from openlibrary
        :return True if success
        """
        api_url = "https://openlibrary.org/api/books?bibkeys=ISBN:{}" \
                  "&format=json&jscmd=data".format(isbn)
        data = self._fetch_json(api_url)
        if not isinstance(data, dict):
            return False

        book = data.get('ISBN:{}'.format(isbn))
        if book and 'url' in book:
            # Fill the data
            self.parse_data_openlibrary(book)
            return True
        return False

    def parse_data_openlibrary(self, data):
        """
        Copy an Open Library book description into ``cleaned_data``
        :param data: the value stored under ``ISBN:<isbn>`` in the API answer
        """
        self.cleaned_data['external_url'] = data['url']
        if 'title' in data:
            self.cleaned_data['title'] = data['title']
        if 'subtitle' in data:
            self.cleaned_data['subtitle'] = data['subtitle']

        if 'number_of_pages' in data:
            self.cleaned_data['number_of_pages'] = data['number_of_pages']
        elif not self.cleaned_data.get('number_of_pages'):
            self.cleaned_data['number_of_pages'] = 0

        if 'publish_date' in data:
            publish_date = self._parse_openlibrary_date(data['publish_date'])
            # An unknown layout leaves the current publish date untouched
            if publish_date is not None:
                self.cleaned_data['publish_date'] = publish_date

        self._add_authors(
            author['name'] for author in data.get('authors', []))

    def clean(self):
        """
        If user fetch ISBN data, then download data before validating the form
        """
        super().clean()

        if "_isbn" not in self.data \
                and "_isbn_addanother" not in self.data:
            return self.cleaned_data

        isbn = self.cleaned_data.get('isbn')
        if "_isbn_addanother" in self.data:
            # Act as if the "save and add another" button had been pressed
            self.data = self.data.copy()
            self.data['_addanother'] = 42
            # NOTE(review): self.request is not set in this class;
            # presumably the admin attaches it - TODO confirm.
            self.request.POST = self.data
        if not isbn:
            return self.cleaned_data

        # Try each source in turn and stop at the first one that answers
        downloaders = (
            self.download_data_isbndb,
            self.download_data_bedeteque,
            self.download_data_google,
            self.download_data_openlibrary,
        )
        if not any(download(isbn) for download in downloaders):
            self.add_error('isbn', _("This ISBN is not found."))
            return self.cleaned_data

        if self.cleaned_data.get('title'):
            self.cleaned_data['title'] = re.sub(
                r'\(AUT\) ', '', self.cleaned_data['title'])

        if self.cleaned_data.get('authors'):
            self.cleaned_data['side_identifier'] = generate_side_identifier(
                self.cleaned_data["title"],
                self.cleaned_data["authors"],
                self.cleaned_data.get("subtitle"),
            )

        return self.cleaned_data

    def _clean_fields(self):
        """
        Clean the ISBN first, then every field

        While a cleaned ISBN is available, the other fields are stored
        without being validated.
        """
        # First clean ISBN field
        isbn_field = self.fields.get('isbn')
        if isbn_field is not None:
            raw_isbn = isbn_field.widget.value_from_datadict(
                self.data, self.files, self.add_prefix('isbn'))
            try:
                self.cleaned_data['isbn'] = isbn_field.clean(raw_isbn)
            except ValidationError:
                # Not reported here: without a cleaned ISBN, the loop below
                # validates every field, 'isbn' included, and records it.
                pass

        for name, field in self.fields.items():
            # value_from_datadict() gets the data from the data dictionaries.
            # Each widget type knows how to retrieve its own data, because some
            # widgets split data over several HTML fields.
            if field.disabled:
                value = self.get_initial_for_field(field, name)
            else:
                value = field.widget.value_from_datadict(
                    self.data, self.files, self.add_prefix(name))
            try:
                # We don't want to check a field when we enter an ISBN.
                # NOTE(review): the key tested is the field name "isbn", not
                # the "_isbn" button - confirm this is intended.
                if "isbn" not in self.data \
                        or not self.cleaned_data.get('isbn'):
                    value = field.clean(value)
                self.cleaned_data[name] = value
                if hasattr(self, 'clean_%s' % name):
                    value = getattr(self, 'clean_%s' % name)()
                    self.cleaned_data[name] = value
            except ValidationError as e:
                self.add_error(name, e)

    class Meta:
        model = Comic
        fields = ('isbn', 'title', 'subtitle', 'external_url',
                  'side_identifier', 'authors', 'number_of_pages',
                  'publish_date', 'present', )