med/media/forms.py

# -*- mode: python; coding: utf-8 -*-
# Copyright (C) 2017-2019 by BDE ENS Paris-Saclay
# SPDX-License-Identifier: GPL-3.0-or-later

import json
import re
import unicodedata
import urllib.parse
import urllib.request

from django.forms import ModelForm

from .models import Auteur
from .scraper import BedetequeScraper


class MediaAdminForm(ModelForm):
    """
    Model form able to pre-fill its fields from online catalogues,
    looked up by ISBN.

    When the POST data contains ``_continue`` and an ISBN was entered, the
    remaining fields are not validated; their values are fetched from
    bedeteque first, then from OpenLibrary as a fallback.

    NOTE(review): ``self.request`` is read by ``clean`` and
    ``_clean_fields`` but never assigned in this class; presumably the
    admin attaches it to the form before validation -- confirm.
    """

    # Seconds to wait for OpenLibrary before giving up on the request.
    OPENLIBRARY_TIMEOUT = 10

    # English month names, in calendar order, as used in the
    # "Month D, YYYY" / "Month YYYY" dates parsed from OpenLibrary.
    _OPENLIBRARY_MONTHS = (
        'January', 'February', 'March', 'April', 'May', 'June', 'July',
        'August', 'September', 'October', 'November', 'December',
    )

    # Leading articles removed (in this order, one after the other) from
    # the upper-cased title before building the side identifier.
    _LEADING_ARTICLES = ('LE ', 'LA ', 'LES ', "L'", 'THE ')

    def __init__(self, *args, **kwargs):
        """
        Build the form and, when it has an ``isbn`` field, render that
        field with the ISBN button template and give it the focus.
        """
        super().__init__(*args, **kwargs)
        isbn_field = self.fields.get('isbn')
        if isbn_field:
            isbn_field.widget.template_name = "media/isbn_button.html"
            isbn_field.widget.attrs.update({'autofocus': 'autofocus'})

    def download_data_bedeteque(self, isbn):
        """
        Download data from bedeteque and merge it into ``cleaned_data``.

        :param isbn: ISBN to search for
        :return: True if success, False when the search gave no result
        """
        scraper = BedetequeScraper()
        results = scraper.search_by_isbn(isbn)
        if not results:
            return False
        # If results, then take the most accurate (the first one)
        self.cleaned_data.update(scraper.scrap_bd_info(results[0]))
        return True

    def download_data_openlibrary(self, isbn):
        """
        Download data from openlibrary and merge it into ``cleaned_data``.

        Network failures, timeouts and malformed responses are treated as
        "no result" instead of aborting the form validation.

        :param isbn: ISBN to search for
        :return: True if success
        """
        isbn = str(isbn)
        # The ISBN is raw user input at this point: escape it so it cannot
        # alter the query string.
        api_url = "https://openlibrary.org/api/books?bibkeys=ISBN:{}" \
                  "&format=json&jscmd=data" \
            .format(urllib.parse.quote(isbn, safe=''))
        try:
            with urllib.request.urlopen(
                    api_url, timeout=self.OPENLIBRARY_TIMEOUT) as response:
                data = json.loads(response.read().decode())
        except (OSError, ValueError):
            # OSError covers URLError/HTTPError and socket timeouts;
            # ValueError covers undecodable bytes and invalid JSON.
            return False

        book = data.get('ISBN:' + isbn) if isinstance(data, dict) else None
        if book and 'url' in book:
            # Fill the data
            self.parse_data_openlibrary(book)
            return True
        return False

    @classmethod
    def _parse_openlibrary_date(cls, raw_date):
        """
        Convert an OpenLibrary publication date to ``YYYY-MM-DD``.

        Supported inputs are a single token (assumed to be a year, mapped
        to January 1st), ``Month YYYY`` (mapped to the 1st of the month)
        and ``Month D, YYYY``.

        :param raw_date: date string as found in the OpenLibrary record
        :return: the formatted date, or None if the format is not
                 recognised
        """
        parts = raw_date.replace(',', '').split()
        if len(parts) == 1:
            return parts[0] + "-01-01"
        try:
            if len(parts) == 2:
                month_name, year = parts
                day = 1
            elif len(parts) == 3:
                month_name, day, year = parts
                day = int(day)
            else:
                return None
            month = cls._OPENLIBRARY_MONTHS.index(month_name) + 1
        except ValueError:
            # Unknown month name or non-numeric day
            return None
        return "{}-{:02d}-{:02d}".format(year, month, day)

    def parse_data_openlibrary(self, data):
        """
        Copy the relevant entries of an OpenLibrary record into
        ``cleaned_data``.

        :param data: record of one book; must contain the ``url`` key
        """
        self.cleaned_data['external_url'] = data['url']
        if 'title' in data:
            self.cleaned_data['title'] = data['title']
        if 'subtitle' in data:
            self.cleaned_data['subtitle'] = data['subtitle']

        if 'number_of_pages' in data:
            self.cleaned_data['number_of_pages'] = data['number_of_pages']
        elif not self.cleaned_data.get('number_of_pages'):
            self.cleaned_data['number_of_pages'] = 0

        if 'publish_date' in data:
            publish_date = self._parse_openlibrary_date(data['publish_date'])
            # Leave the field untouched when the date cannot be understood
            if publish_date is not None:
                self.cleaned_data['publish_date'] = publish_date

        if 'authors' in data:
            # NOTE(review): the key tested here is 'author' while the key
            # written is 'authors'; as written, the list is reset whenever
            # cleaned_data has no 'author' entry. Confirm this is intended
            # before changing it.
            if 'author' not in self.cleaned_data:
                self.cleaned_data['authors'] = []
            for author in data['authors']:
                author_obj = Auteur.objects.get_or_create(
                    name=author['name'])[0]
                self.cleaned_data['authors'].append(author_obj)

    @staticmethod
    def _strip_span_tags(text):
        """Remove opening and closing ``<span>`` tags, keeping their text."""
        return re.sub(r'</?span[^>]*>', '', text)

    @classmethod
    def _build_side_identifier(cls, author_name, title, subtitle):
        """
        Build the side identifier: three characters of the author name,
        three characters of the title and, if the subtitle starts with a
        number, that number padded to two digits.

        :param author_name: name of the first author
        :param title: title of the media (may be empty)
        :param subtitle: subtitle of the media, or a falsy value
        :return: the identifier, upper-cased and without accents
        """
        # "Firstname Lastname" -> keep the last word; names containing a
        # comma are used as they are.
        if ',' not in author_name and ' ' in author_name:
            author_name = author_name.split(' ')[-1]

        title_normalized = title.upper()
        for article in cls._LEADING_ARTICLES:
            if title_normalized.startswith(article):
                title_normalized = title_normalized[len(article):]
        title_normalized = title_normalized.replace('Œ', 'OE')

        side_identifier = "{:.3} {:.3}".format(
            author_name, title_normalized.replace(' ', ''))

        if subtitle:
            start = subtitle.split(' ')[0].replace('.', '')
            if start.isnumeric():
                side_identifier += " {:0>2}".format(start)

        # Normalize side identifier, in order to remove accents: decompose
        # the characters, then drop marks, punctuation, separators and
        # control characters, keeping only the plain spaces.
        side_identifier = ''.join(
            char
            for char in unicodedata.normalize(
                'NFKD', side_identifier.casefold())
            if char == ' ' or unicodedata.category(char)[0] not in 'MPZC'
        )
        return side_identifier.casefold().upper()

    def clean(self):
        """
        If user fetch ISBN data, then download data before validating the
        form, tidy the title and subtitle, sort the authors and compute
        the side identifier.

        :return: the cleaned data
        """
        super().clean()

        if "_continue" not in self.request.POST:
            return self.cleaned_data
        isbn = self.cleaned_data.get('isbn')
        if not isbn:
            return self.cleaned_data

        # ISBN is present, try with bedeteque, then with OpenLibrary
        if not self.download_data_bedeteque(isbn):
            self.download_data_openlibrary(isbn)

        title = self.cleaned_data.get('title')
        if title:
            self.cleaned_data['title'] = re.sub(r'\(AUT\) ', '', title)

        authors = self.cleaned_data.get('authors')
        if authors:
            # Highest note first; the sort is stable, so authors with the
            # same note keep their original relative order.
            authors = sorted(authors, key=lambda author: -author.note)
            self.cleaned_data['authors'] = authors

            subtitle = self.cleaned_data.get('subtitle')
            if subtitle:
                subtitle = self._strip_span_tags(subtitle)
                self.cleaned_data['subtitle'] = subtitle

            self.cleaned_data['side_identifier'] = \
                self._build_side_identifier(
                    authors[0].name,
                    self.cleaned_data.get('title') or '',
                    subtitle)

        return self.cleaned_data

    def _clean_fields(self):
        """
        Fill ``cleaned_data`` field by field, like the parent form class,
        except that fields are not validated once an ISBN has been read
        from a request whose POST data contains ``_continue``.
        """
        from django.core.exceptions import ValidationError

        isbn_fetch = "_continue" in self.request.POST
        for name, field in self.fields.items():
            # value_from_datadict() gets the data from the data dictionaries.
            # Each widget type knows how to retrieve its own data, because some
            # widgets split data over several HTML fields.
            if field.disabled:
                value = self.get_initial_for_field(field, name)
            else:
                value = field.widget.value_from_datadict(
                    self.data, self.files, self.add_prefix(name))
            try:
                # We don't want to check a field when we enter an ISBN.
                # The ISBN lookup is evaluated for every field because
                # cleaned_data is being filled by this very loop.
                if not isbn_fetch or not self.cleaned_data.get('isbn'):
                    value = field.clean(value)
                self.cleaned_data[name] = value
                clean_method = getattr(self, 'clean_%s' % name, None)
                if clean_method is not None:
                    value = clean_method()
                    self.cleaned_data[name] = value
            except ValidationError as e:
                self.add_error(name, e)