med/media/forms.py

# -*- mode: python; coding: utf-8 -*-
# Copyright (C) 2017-2019 by BDE ENS Paris-Saclay
# SPDX-License-Identifier: GPL-3.0-or-later

import json
import logging
import re
import unicodedata
import urllib.parse
import urllib.request

from django.forms import ModelForm
from django.utils.translation import gettext_lazy as _

from .models import Auteur, BD
from .scraper import BedetequeScraper


class MediaAdminForm(ModelForm):
    def __init__(self, *args, **kwargs):
        """
        Build the form, then customise the widget of the "isbn" field when
        the form has such a field
        """
        super().__init__(*args, **kwargs)
        isbn_field = self.fields.get('isbn')
        if not isbn_field:
            return
        # Render the field with the media/isbn_button.html template and ask
        # the browser to focus it.
        widget = isbn_field.widget
        widget.template_name = "media/isbn_button.html"
        widget.attrs['autofocus'] = 'autofocus'

    def download_data_bedeteque(self, isbn):
        """
        Search the ISBN with BedetequeScraper and merge the scraped
        information into ``cleaned_data``
        :param isbn: ISBN to search for
        :return True if success, False when the search gave no result
        """
        bedeteque = BedetequeScraper()
        matches = bedeteque.search_by_isbn(isbn)
        if matches:
            # The first result is taken as the most accurate one
            self.cleaned_data.update(bedeteque.scrap_bd_info(matches[0]))
            return True
        return False

    def download_data_google(self, isbn):
        """
        Download data from google books and store it in ``cleaned_data``

        A failed request (network error, HTTP error, timeout) or an
        unreadable answer is logged and reported as "not found", so that the
        caller can try another source instead of failing with an unhandled
        exception.

        :param isbn: ISBN to search for
        :return True if success
        """
        # The ISBN is typed by the user: escape it before building the URL.
        api_url = "https://www.googleapis.com/books/v1/volumes?q=ISBN:{}"\
            .format(urllib.parse.quote(str(isbn)))
        try:
            # The timeout keeps the form from hanging on a stalled server
            with urllib.request.urlopen(api_url, timeout=10) as url:
                data = json.loads(url.read().decode())
        except (OSError, ValueError) as error:
            # OSError covers URLError, HTTPError and timeouts; ValueError
            # covers undecodable bytes and malformed JSON.
            logging.getLogger(__name__).warning(
                "Google Books lookup failed for ISBN %s: %s", isbn, error)
            return False

        if not isinstance(data, dict) or not data.get('totalItems'):
            return False
        items = data.get('items')
        if not items:
            return False
        # Fill the data with the first volume of the answer
        self.parse_data_google(items[0])
        return True

    def parse_data_google(self, data):
        """
        Copy the fields of a Google Books volume into ``cleaned_data``

        :param data: one entry of the ``items`` list of the API answer; only
            its ``volumeInfo`` dictionary is read
        """
        info = data['volumeInfo']
        if 'canonicalVolumeLink' in info:
            self.cleaned_data['external_url'] = info['canonicalVolumeLink']
        if 'title' in info:
            self.cleaned_data['title'] = info['title']
        # The subtitle is read from volumeInfo, so that is where its presence
        # must be tested (testing the enclosing item never matched).
        if 'subtitle' in info:
            self.cleaned_data['subtitle'] = info['subtitle']

        if 'pageCount' in info:
            self.cleaned_data['number_of_pages'] = info['pageCount']
        elif not self.cleaned_data.get('number_of_pages'):
            self.cleaned_data['number_of_pages'] = 0

        published = info.get('publishedDate')
        if published:
            # Pad "YYYY" and "YYYY-MM" up to a full "YYYY-MM-DD" date, the
            # same way the OpenLibrary parser completes a year-only date.
            parts = str(published).split('-')
            parts.extend(['01'] * (3 - len(parts)))
            self.cleaned_data['publish_date'] = '-'.join(parts)

        if not self.cleaned_data.get('authors'):
            self.cleaned_data['authors'] = list()

        for author in info.get('authors', ()):
            author_obj = Auteur.objects.get_or_create(name=author)[0]
            self.cleaned_data['authors'].append(author_obj)

    def download_data_openlibrary(self, isbn):
        """
        Download data from openlibrary and store it in ``cleaned_data``

        A failed request (network error, HTTP error, timeout) or an
        unreadable answer is logged and reported as "not found" instead of
        raising out of the form validation.

        :param isbn: ISBN to search for
        :return True if success
        """
        # The ISBN is typed by the user: escape it before building the URL.
        api_url = "https://openlibrary.org/api/books?bibkeys=ISBN:{}" \
                  "&format=json&jscmd=data".format(
                      urllib.parse.quote(str(isbn)))
        try:
            # The timeout keeps the form from hanging on a stalled server
            with urllib.request.urlopen(api_url, timeout=10) as url:
                data = json.loads(url.read().decode())
        except (OSError, ValueError) as error:
            # OSError covers URLError, HTTPError and timeouts; ValueError
            # covers undecodable bytes and malformed JSON.
            logging.getLogger(__name__).warning(
                "OpenLibrary lookup failed for ISBN %s: %s", isbn, error)
            return False

        if not isinstance(data, dict):
            return False
        # The answer is keyed by the requested bibkey
        book = data.get('ISBN:{}'.format(isbn))
        if book and 'url' in book:
            # Fill the data
            self.parse_data_openlibrary(book)
            return True
        return False

    def parse_data_openlibrary(self, data):
        """
        Copy the fields of an OpenLibrary book record into ``cleaned_data``

        :param data: the record of the book; it must contain the "url" key,
            every other key is optional
        """
        self.cleaned_data['external_url'] = data['url']
        if 'title' in data:
            self.cleaned_data['title'] = data['title']
        if 'subtitle' in data:
            self.cleaned_data['subtitle'] = data['subtitle']

        if 'number_of_pages' in data:
            self.cleaned_data['number_of_pages'] = data['number_of_pages']
        elif not self.cleaned_data.get('number_of_pages'):
            self.cleaned_data['number_of_pages'] = 0

        if 'publish_date' in data:
            publish_date = self._parse_openlibrary_date(data['publish_date'])
            # An unrecognised date is ignored rather than raising
            if publish_date:
                self.cleaned_data['publish_date'] = publish_date

        if not self.cleaned_data.get('authors'):
            self.cleaned_data['authors'] = list()

        for author in data.get('authors', ()):
            author_obj = Auteur.objects.get_or_create(
                name=author['name'])[0]
            self.cleaned_data['authors'].append(author_obj)

    @staticmethod
    def _parse_openlibrary_date(raw_date):
        """
        Turn a free-form OpenLibrary date into a "YYYY-MM-DD" string

        Recognised shapes: "2004", "2004-05", "2004-05-12", "May 2004",
        "May 12, 2004" and "12 May 2004". Month names are matched on their
        first three letters, whatever the case; the French "Mai" is accepted
        too. A missing month or day defaults to 1.

        :param raw_date: the date as given by OpenLibrary
        :return the normalized date, or None if the shape is not recognised
        """
        month_to_number = dict(
            Jan=1, Feb=2, Mar=3, Apr=4, May=5, Mai=5, Jun=6,
            Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12,
        )
        tokens = str(raw_date).replace(',', '').split()

        if len(tokens) == 1:
            # Numeric date, possibly reduced to the year or year and month
            if not re.fullmatch(r'[0-9]{4}(-[0-9]{1,2}){0,2}', tokens[0]):
                return None
            numbers = [int(part) for part in tokens[0].split('-')]
            numbers.extend([1] * (3 - len(numbers)))
            return "{:04d}-{:02d}-{:02d}".format(*numbers)

        if len(tokens) == 2:
            month_name, day, year = tokens[0], '1', tokens[1]
        elif len(tokens) == 3 and tokens[0].isdecimal():
            day, month_name, year = tokens
        elif len(tokens) == 3:
            month_name, day, year = tokens
        else:
            return None

        month = month_to_number.get(month_name[:3].capitalize())
        if month is None or not day.isdecimal() or not year.isdecimal():
            return None
        return "{:04d}-{:02d}-{:02d}".format(int(year), month, int(day))

    def clean(self):
        """
        If user fetch ISBN data, then download data before validating the form

        Nothing is downloaded unless the submitted data contains "_isbn" or
        "_isbn_addanother" and an ISBN was entered. The sources are tried in
        this order until one succeeds: bedeteque, Google Books, OpenLibrary.
        If none succeeds, an error is attached to the "isbn" field.
        On success the title is cleaned up and, when at least one author is
        known, the authors are ordered, the subtitle is stripped from its
        <span> tags and the side identifier is computed.

        :return the cleaned data
        """
        super().clean()

        if "_isbn" not in self.data \
                and "_isbn_addanother" not in self.data:
            return self.cleaned_data

        isbn = self.cleaned_data.get('isbn')
        if "_isbn_addanother" in self.data:
            # Work on a copy of the submitted data so that the extra key can
            # be added, then hand the modified data back to the request.
            # NOTE(review): self.request is not set anywhere in this class,
            # presumably the admin attaches it to the form -- confirm.
            self.data = self.data.copy()
            self.data['_addanother'] = 42
            self.request.POST = self.data
        if not isbn:
            return self.cleaned_data

        # ISBN is present: each source is only queried when the previous
        # ones failed.
        found = self.download_data_bedeteque(isbn) \
            or self.download_data_google(isbn) \
            or self.download_data_openlibrary(isbn)
        if not found:
            self.add_error('isbn', _("This ISBN is not found."))
            return self.cleaned_data

        title = self.cleaned_data.get('title')
        if title:
            title = re.sub(r'\(AUT\) ', '', title)
            self.cleaned_data['title'] = title

        authors = self.cleaned_data.get('authors')
        if authors:
            # Highest note first. The key is numeric: a string key would
            # compare the notes character by character and put an author
            # with note 1 before one with note 10. list.sort() is stable,
            # so authors with the same note keep their original order.
            authors.sort(key=lambda author: -author.note)

            subtitle = self.cleaned_data.get('subtitle')
            if subtitle:
                subtitle = re.sub(r'</span>', '', subtitle)
                # [^>]* stops at the end of the tag; a greedy .* would also
                # delete the text up to the last ">" of the subtitle.
                subtitle = re.sub(r'<span[^>]*>', '', subtitle)
                self.cleaned_data['subtitle'] = subtitle

            self.cleaned_data['side_identifier'] = \
                self._make_side_identifier(
                    authors[0].name, title or '', subtitle)

        return self.cleaned_data

    @staticmethod
    def _make_side_identifier(author_name, title, subtitle):
        """
        Build the side identifier of a medium

        The identifier is made of the first three characters of the author
        name (its last word only, when the name has spaces and no comma),
        the first three characters of the title once its leading article and
        its spaces are removed, and the number which starts the subtitle if
        there is one, padded to two characters. Accents, punctuation,
        separators and control characters are then removed and the result is
        upper-cased.

        :param author_name: name of the main author
        :param title: title of the medium
        :param subtitle: subtitle of the medium, may be empty or None
        :return the side identifier
        """
        if ',' not in author_name and ' ' in author_name:
            author_name = author_name.split(' ')[-1]

        title_normalized = title.upper()
        # Each article is tried once, in this order
        for article in ('LE ', 'LA ', 'LES ', "L'", 'THE '):
            if title_normalized.startswith(article):
                title_normalized = title_normalized[len(article):]
        title_normalized = title_normalized.replace('Œ', 'OE')

        side_identifier = "{:.3} {:.3}".format(
            author_name, title_normalized.replace(' ', ''))

        if subtitle:
            start = subtitle.split(' ')[0].replace('.', '')
            if start.isnumeric():
                side_identifier += " {:0>2}".format(start)

        # NFKD splits accented letters into a base letter and combining
        # marks; the marks (M), the punctuation (P), the separators other
        # than the plain space (Z) and the control characters (C) are
        # then dropped.
        decomposed = unicodedata.normalize(
            'NFKD', side_identifier.casefold())
        kept = ''.join(
            char for char in decomposed
            if char == ' ' or unicodedata.category(char)[0] not in 'MPZC'
        )
        return kept.casefold().upper()

    def _clean_fields(self):
        """
        Fill ``cleaned_data`` field by field, without validating the fields
        once an ISBN has been read

        For each field the raw value is read from the submitted data (the
        initial value is used for a disabled field). The value only goes
        through ``field.clean`` while "isbn" is absent from the submitted
        data or no ISBN is stored in ``cleaned_data`` yet; otherwise the raw
        value is stored unchanged. A ``clean_<name>`` method, when the form
        has one, is called in both cases. A ValidationError is attached to
        the field it comes from.
        """
        # Imported once per call instead of once per field
        from django.core.exceptions import ValidationError

        for name, field in self.fields.items():
            # value_from_datadict() gets the data from the data dictionaries.
            # Each widget type knows how to retrieve its own data, because some
            # widgets split data over several HTML fields.
            if field.disabled:
                value = self.get_initial_for_field(field, name)
            else:
                value = field.widget.value_from_datadict(
                    self.data, self.files, self.add_prefix(name))
            try:
                # We don't want to check a field when we enter an ISBN.
                # The ISBN is looked up in cleaned_data, which this loop
                # fills, so the fields placed before "isbn" are still
                # validated.
                # NOTE(review): the key tested here is "isbn" while clean()
                # tests "_isbn" / "_isbn_addanother" -- confirm that skipping
                # validation whenever an ISBN is submitted is intended.
                if "isbn" not in self.data \
                        or not self.cleaned_data.get('isbn'):
                    value = field.clean(value)
                self.cleaned_data[name] = value
                if hasattr(self, 'clean_%s' % name):
                    value = getattr(self, 'clean_%s' % name)()
                    self.cleaned_data[name] = value
            except ValidationError as e:
                self.add_error(name, e)

    class Meta:
        # The form is bound to the BD model
        model = BD
        # '__all__' selects every field of the model
        fields = '__all__'
Massive cleanup (1) 2019-08-02 14:57:53 +02:00			`# -- mode: python; coding: utf-8 --`
			`# Copyright (C) 2017-2019 by BDE ENS Paris-Saclay`
			`# SPDX-License-Identifier: GPL-3.0-or-later`

Add ISBN data downloader 2019-08-11 10:40:39 +02:00			`import json`
Side identifiers cannot begin with le/la/les/l'/the 2020-02-10 17:51:09 +01:00			`import re`
Remove special chars from side identifiers 2020-02-10 16:47:05 +01:00			`import unicodedata`
FUCK LINTERS TEST 2020-02-10 21:12:12 +01:00			`import urllib.request`
Add ISBN data downloader 2019-08-11 10:40:39 +02:00
Massive cleanup (1) 2019-08-02 14:57:53 +02:00			`from django.forms import ModelForm`
Scrap from Google books API 2020-05-12 17:08:40 +02:00			`from django.utils.translation import gettext_lazy as _`
Massive cleanup (1) 2019-08-02 14:57:53 +02:00
Ajout des CD et des vinyles 2020-05-22 18:04:41 +02:00			`from .models import Auteur, BD`
Add bedeteque scraper 2019-08-19 12:17:49 +02:00			`from .scraper import BedetequeScraper`

Add ISBN data downloader 2019-08-11 10:40:39 +02:00
			`class MediaAdminForm(ModelForm):`
			`def __init__(self, args, *kwargs):`
			`super().__init__(args, *kwargs)`
Fix isbn widget 2019-08-15 16:30:44 +02:00			`isbn_field = self.fields.get('isbn')`
			`if isbn_field:`
			`isbn_field.widget.template_name = "media/isbn_button.html"`
			`isbn_field.widget.attrs.update({'autofocus': 'autofocus'})`
Add ISBN data downloader 2019-08-11 10:40:39 +02:00
Add bedeteque scraper 2019-08-19 12:17:49 +02:00			`def download_data_bedeteque(self, isbn):`
			`"""`
			`Download data from bedeteque`
			`:return True if success`
Add ISBN data downloader 2019-08-11 10:40:39 +02:00			`"""`
Add bedeteque scraper 2019-08-19 12:17:49 +02:00			`scraper = BedetequeScraper()`
			`r = scraper.search_by_isbn(isbn)`
			`if not r:`
			`return False`
			`# If results, then take the most accurate`
			`data = scraper.scrap_bd_info(r[0])`
			`self.cleaned_data.update(data)`
			`return True`

Scrap from Google books API 2020-05-12 17:08:40 +02:00			`def download_data_google(self, isbn):`
			`"""`
			`Download data from google books`
			`:return True if success`
			`"""`
Fix linters 2020-05-12 17:34:53 +02:00			`api_url = "https://www.googleapis.com/books/v1/volumes?q=ISBN:{}"\`
			`.format(isbn)`
Scrap from Google books API 2020-05-12 17:08:40 +02:00			`with urllib.request.urlopen(api_url) as url:`
			`data = json.loads(url.read().decode())`

			`if data and data['totalItems']:`
			`data = data['items'][0]`
			`# Fill the data`
			`self.parse_data_google(data)`
			`return True`
			`return False`

			`def parse_data_google(self, data):`
			`info = data['volumeInfo']`
			`self.cleaned_data['external_url'] = info['canonicalVolumeLink']`
			`if 'title' in info:`
			`self.cleaned_data['title'] = info['title']`
			`if 'subtitle' in data:`
			`self.cleaned_data['subtitle'] = info['subtitle']`

			`if 'pageCount' in info:`
			`self.cleaned_data['number_of_pages'] = \`
			`info['pageCount']`
			`elif not self.cleaned_data['number_of_pages']:`
			`self.cleaned_data['number_of_pages'] = 0`

			`if 'publishedDate' in info:`
			`self.cleaned_data['publish_date'] = info['publishedDate']`

Fix synthax 2020-05-21 17:07:50 +02:00			`if 'authors' not in self.cleaned_data \`
			`or not self.cleaned_data['authors']:`
Scrap from Google books API 2020-05-12 17:08:40 +02:00			`self.cleaned_data['authors'] = list()`

			`if 'authors' in info:`
			`for author in info['authors']:`
			`author_obj = Auteur.objects.get_or_create(`
			`name=author)[0]`
			`self.cleaned_data['authors'].append(author_obj)`

			`print(self.cleaned_data)`

Add bedeteque scraper 2019-08-19 12:17:49 +02:00			`def download_data_openlibrary(self, isbn):`
			`"""`
			`Download data from openlibrary`
			`:return True if success`
Add ISBN data downloader 2019-08-11 10:40:39 +02:00			`"""`
			`api_url = "https://openlibrary.org/api/books?bibkeys=ISBN:{}" \`
			`"&format=json&jscmd=data".format(isbn)`
			`with urllib.request.urlopen(api_url) as url:`
			`data = json.loads(url.read().decode())`
Clean code 2020-02-10 11:47:59 +01:00
Add ISBN data downloader 2019-08-11 10:40:39 +02:00			`if data and data['ISBN:' + isbn]:`
			`data = data['ISBN:' + isbn]`
Fix data dict for isbn 2019-08-11 11:19:24 +02:00			`if 'url' in data:`
Add bedeteque scraper 2019-08-19 12:17:49 +02:00			`# Fill the data`
Less complexity for download_data_openlibrary 2020-02-10 12:08:48 +01:00			`self.parse_data_openlibrary(data)`
			`return True`
			`return False`
Clean code 2020-02-10 11:47:59 +01:00
Less complexity for download_data_openlibrary 2020-02-10 12:08:48 +01:00			`def parse_data_openlibrary(self, data):`
			`self.cleaned_data['external_url'] = data['url']`
			`if 'title' in data:`
			`self.cleaned_data['title'] = data['title']`
			`if 'subtitle' in data:`
			`self.cleaned_data['subtitle'] = data['subtitle']`
Clean code 2020-02-10 11:47:59 +01:00
Less complexity for download_data_openlibrary 2020-02-10 12:08:48 +01:00			`if 'number_of_pages' in data:`
			`self.cleaned_data['number_of_pages'] = \`
			`data['number_of_pages']`
			`elif not self.cleaned_data['number_of_pages']:`
			`self.cleaned_data['number_of_pages'] = 0`
Clean code 2020-02-10 11:47:59 +01:00
Less complexity for download_data_openlibrary 2020-02-10 12:08:48 +01:00			`if 'publish_date' in data:`
			`months = ['January', 'February', "March", "April", "Mai",`
			`"June", "July", "August", "September",`
			`"October", "November", "December"]`
			`split = data['publish_date'].replace(',', '').split(' ')`
Fixed publish date when there is only the year 2020-02-10 19:33:34 +01:00			`if len(split) == 1:`
			`self.cleaned_data['publish_date'] = split[0] + "-01-01"`
			`else:`
Fix openlibrary scrap, fix linters 2020-05-12 14:56:31 +02:00			`month_to_number = dict(`
			`Jan="01",`
			`Feb="02",`
			`Mar="03",`
			`Apr="04",`
			`May="05",`
			`Jun="06",`
			`Jul="07",`
			`Aug="08",`
			`Sep="09",`
			`Oct="10",`
			`Nov="11",`
			`Dec="12",`
			`)`
Scrap from Google books API 2020-05-12 17:08:40 +02:00			`if split[0][:3] in month_to_number:`
Fix openlibrary scrap, fix linters 2020-05-12 14:56:31 +02:00			`self.cleaned_data['publish_date']\`
			`= split[2] + "-" \`
Scrap from Google books API 2020-05-12 17:08:40 +02:00			`+ month_to_number[split[0][:3]] + "-" + split[1]`
Fix openlibrary scrap, fix linters 2020-05-12 14:56:31 +02:00			`else:`
			`self.cleaned_data['publish_date'] = "{}-{:02d}-{:02d}" \`
			`.format(split[2], months.index(split[0])`
			`+ 1, int(split[1]), )`
Less complexity for download_data_openlibrary 2020-02-10 12:08:48 +01:00
Fix synthax 2020-05-21 17:07:50 +02:00			`if 'authors' not in self.cleaned_data \`
			`or not self.cleaned_data['authors']:`
Scrap from Google books API 2020-05-12 17:08:40 +02:00			`self.cleaned_data['authors'] = list()`

Less complexity for download_data_openlibrary 2020-02-10 12:08:48 +01:00			`if 'authors' in data:`
			`for author in data['authors']:`
			`author_obj = Auteur.objects.get_or_create(`
			`name=author['name'])[0]`
			`self.cleaned_data['authors'].append(author_obj)`
Add ISBN data downloader 2019-08-11 10:40:39 +02:00
			`def clean(self):`
			`"""`
			`If user fetch ISBN data, then download data before validating the form`
			`"""`
Required fields are not checked when the ISBN is typed 2020-02-10 03:08:08 +01:00			`super().clean()`

Split comic strips and mangas 2020-05-21 16:56:41 +02:00			`if "_isbn" in self.data\`
			`or "_isbn_addanother" in self.data:`
Add ISBN data downloader 2019-08-11 10:40:39 +02:00			`isbn = self.cleaned_data.get('isbn')`
Split comic strips and mangas 2020-05-21 16:56:41 +02:00			`if "_isbn_addanother" in self.data:`
			`self.data = self.data.copy()`
			`self.data['_addanother'] = 42`
Romans were missing 2020-05-22 21:08:44 +02:00			`self.request.POST = self.data`
Add ISBN data downloader 2019-08-11 10:40:39 +02:00			`if isbn:`
Add bedeteque scraper 2019-08-19 12:17:49 +02:00			`# ISBN is present, try with bedeteque`
			`scrap_result = self.download_data_bedeteque(isbn)`
			`if not scrap_result:`
Split comic strips and mangas 2020-05-21 16:56:41 +02:00			`# Try with Google`
Scrap from Google books API 2020-05-12 17:08:40 +02:00			`scrap_result = self.download_data_google(isbn)`
			`if not scrap_result:`
			`# Try with OpenLibrary`
			`if not self.download_data_openlibrary(isbn):`
Fix linters 2020-05-12 17:34:53 +02:00			`self.add_error('isbn',`
			`_("This ISBN is not found."))`
Scrap from Google books API 2020-05-12 17:08:40 +02:00			`return self.cleaned_data`
Add ISBN data downloader 2019-08-11 10:40:39 +02:00
Remove `(AUT)` fields from titles 2020-02-10 18:23:13 +01:00			`if self.cleaned_data['title']:`
Fix indentation 2020-02-11 21:12:00 +01:00			`self.cleaned_data['title'] = re.sub(`
			`r'\(AUT\) ',`
			`'',`
			`self.cleaned_data['title']`
			`)`
Remove `(AUT)` fields from titles 2020-02-10 18:23:13 +01:00
Fix authors & openlibrary scrap 2020-02-10 11:29:26 +01:00			`if self.cleaned_data['authors']:`
Python 3.5 support 2020-02-10 17:29:10 +01:00			`authors = self.cleaned_data['authors']`
Fix author sort 2020-02-10 17:35:07 +01:00			`old_authors = authors.copy()`
FUCK LINTERS TEST 2020-02-10 21:12:12 +01:00
Add author note 2020-02-10 17:27:30 +01:00			`def sort(author):`
Fix author sort 2020-02-10 17:35:35 +01:00			`return str(-author.note) + "." \`
FUCK LINTERS TEST (ERDNAXE C'EST RELOUUU) 2020-02-10 21:21:32 +01:00			`+ str(old_authors.index(author)) \`
			`+ "." + author.name`
FUCK LINTERS TEST 2020-02-10 21:12:12 +01:00
Add author note 2020-02-10 17:27:30 +01:00			`authors.sort(key=sort)`
Fix authors & openlibrary scrap 2020-02-10 11:29:26 +01:00			`author_name = self.cleaned_data['authors'][0].name`
			`if ',' not in author_name and ' ' in author_name:`
The last name of the author is considered for the side identifier 2020-02-10 16:48:57 +01:00			`author_name = author_name.split(' ')[-1]`
Side identifiers cannot begin with le/la/les/l'/the 2020-02-10 17:51:09 +01:00			`title_normalized = self.cleaned_data['title'].upper()`
			`title_normalized = re.sub(r'^LE ', '', title_normalized)`
			`title_normalized = re.sub(r'^LA ', '', title_normalized)`
			`title_normalized = re.sub(r'^LES ', '', title_normalized)`
			`title_normalized = re.sub(r'^L\'', '', title_normalized)`
			`title_normalized = re.sub(r'^THE ', '', title_normalized)`
Œ => OE 2020-02-10 18:18:42 +01:00			`title_normalized = re.sub(r'Œ', 'OE', title_normalized)`
Fix authors & openlibrary scrap 2020-02-10 11:29:26 +01:00			`side_identifier = "{:.3} {:.3}".format(`
Remove special chars from side identifiers 2020-02-10 16:47:05 +01:00			`author_name,`
Side identifiers cannot begin with le/la/les/l'/the 2020-02-10 17:51:09 +01:00			`title_normalized.replace(' ', ''), )`
Fix authors & openlibrary scrap 2020-02-10 11:29:26 +01:00
			`if self.cleaned_data['subtitle']:`
Fix indentation 2020-02-11 21:12:00 +01:00			`self.cleaned_data['subtitle'] = re.sub(`
			`r'</span>',`
			`'',`
			`self.cleaned_data['subtitle']`
			`)`
			`self.cleaned_data['subtitle'] = re.sub(`
			`r'<span.*>',`
			`'',`
			`self.cleaned_data['subtitle']`
			`)`
Clean code 2020-02-10 11:47:59 +01:00			`start = self.cleaned_data['subtitle'].split(' ')[0] \`
Fix indentation 2020-02-11 21:12:00 +01:00			`.replace('.', '')`
Fix authors & openlibrary scrap 2020-02-10 11:29:26 +01:00
			`if start.isnumeric():`
			`side_identifier += " {:0>2}".format(start, )`

Remove special chars from side identifiers 2020-02-10 16:47:05 +01:00			`# Normalize side identifier, in order to remove accents`
			`side_identifier = ''.join(`
			`char`
Clean code 2020-02-10 17:07:06 +01:00			`for char in unicodedata.normalize(`
			`'NFKD', side_identifier.casefold())`
Remove special chars from side identifiers 2020-02-10 16:47:05 +01:00			`if all(not unicodedata.category(char).startswith(cat)`
			`for cat in {'M', 'P', 'Z', 'C'}) or char == ' '`
			`).casefold().upper()`
Fix authors & openlibrary scrap 2020-02-10 11:29:26 +01:00			`self.cleaned_data['side_identifier'] = side_identifier`

Required fields are not checked when the ISBN is typed 2020-02-10 03:08:08 +01:00			`return self.cleaned_data`

			`def _clean_fields(self):`
			`for name, field in self.fields.items():`
			`# value_from_datadict() gets the data from the data dictionaries.`
			`# Each widget type knows how to retrieve its own data, because some`
			`# widgets split data over several HTML fields.`
			`if field.disabled:`
			`value = self.get_initial_for_field(field, name)`
			`else:`
Pipelines 2020-02-10 04:59:39 +01:00			`value = field.widget.value_from_datadict(`
			`self.data, self.files, self.add_prefix(name))`
Required fields are not checked when the ISBN is typed 2020-02-10 03:08:08 +01:00			`from django.core.exceptions import ValidationError`
			`try:`
			`# We don't want to check a field when we enter an ISBN.`
Split comic strips and mangas 2020-05-21 16:56:41 +02:00			`if "isbn" not in self.data \`
Pipelines 2020-02-10 04:59:39 +01:00			`or not self.cleaned_data.get('isbn'):`
Required fields are not checked when the ISBN is typed 2020-02-10 03:08:08 +01:00			`value = field.clean(value)`
			`self.cleaned_data[name] = value`
			`if hasattr(self, 'clean_%s' % name):`
			`value = getattr(self, 'clean_%s' % name)()`
			`self.cleaned_data[name] = value`
			`except ValidationError as e:`
			`self.add_error(name, e)`
Split comic strips and mangas 2020-05-21 16:56:41 +02:00
			`class Meta:`
Ajout des CD et des vinyles 2020-05-22 18:04:41 +02:00			`model = BD`
Split comic strips and mangas 2020-05-21 16:56:41 +02:00			`fields = '__all__'`