med/media/forms.py

# -*- mode: python; coding: utf-8 -*-
# Copyright (C) 2017-2021 by BDE ENS Paris-Saclay
# SPDX-License-Identifier: GPL-3.0-or-later

import json
import os
import re
import unicodedata
from urllib.error import HTTPError
import urllib.request

from django.db.models import QuerySet
from django.forms import ModelForm
from django.utils.translation import gettext_lazy as _

from .models import Author, Comic
from .scraper import BedetequeScraper


def _normalize_for_identifier(text):
    """
    Return ``text`` upper-cased, without accents nor special characters.

    The text is NFKD-decomposed, then every character whose Unicode category
    is a mark (M), a punctuation (P), a separator (Z) or an "other" (C) is
    dropped. Plain spaces are the only separators that are kept.
    """
    decomposed = unicodedata.normalize('NFKD', text.casefold())
    kept = ''.join(
        char for char in decomposed
        if char == ' '
        or not unicodedata.category(char).startswith(('M', 'P', 'Z', 'C'))
    )
    return kept.casefold().upper()


def generate_side_identifier(title, authors: "QuerySet | list | tuple",
                             subtitle=None):
    """
    Build the side identifier of a medium.

    The identifier is made of the three first letters of the name of the
    primary author, the three first characters of the title (leading
    articles excluded) and, when the subtitle starts with a number
    (optionally prefixed by ``R``), this number padded to two digits.

    :param title: title of the medium
    :param authors: iterable of authors (objects with ``name``, ``note`` and
        ``pk`` attributes); it is never modified. When it is empty, the
        identifier only contains the title part.
    :param subtitle: optional subtitle, may contain ``<span>`` tags
    :return the side identifier, in upper case
    """
    # Work on a private list: this evaluates querysets and accepts any
    # iterable without touching the object given by the caller.
    authors = list(authors)

    title_normalized = title.upper().replace('’', '\'')
    # Leading articles are removed one after the other, in this order, so
    # "DE LA ..." loses both words.
    for article in ('DE ', 'DES ', 'LE ', 'LA ', 'LES ', 'L\'',
                    'UN ', 'UNE ', 'THE '):
        if title_normalized.startswith(article):
            title_normalized = title_normalized[len(article):]
    title_normalized = title_normalized.replace('Œ', 'OE').replace(' ', '')
    title_normalized = _normalize_for_identifier(title_normalized)
    title_normalized = re.sub("[^A-Z0-9$]", "", title_normalized)

    if authors:
        def sort_key(author):
            # NOTE(review): this key is compared as text, so notes are not
            # ordered numerically (e.g. note 5 sorts before note 10). It is
            # kept as is in order not to change existing identifiers.
            return "{:042d}".format(-author.note) \
                + author.name.split(" ")[-1] \
                + ".{:042d}".format(author.pk)

        # min() returns the first smallest element, like a stable sort
        primary_author = min(authors, key=sort_key)
        author_name = primary_author.name.upper()
        if ',' not in author_name and ' ' in author_name:
            # "Firstname Lastname": only keep the last word
            author_name = author_name.split(' ')[-1]
        author_name = re.sub(
            "[^A-Z]", "", _normalize_for_identifier(author_name))
        side_identifier = "{:.3} {:.3}".format(author_name, title_normalized)
    else:
        # No author: the identifier has no author part
        side_identifier = "{:.3}".format(title_normalized)

    if subtitle:
        subtitle = subtitle.replace('</span>', '')
        # Only remove the tag itself, not the text up to the last ">"
        subtitle = re.sub(r'<span[^>]*>', '', subtitle)
        start = subtitle.split(' ')[0].replace('.', '')
        start = re.sub("^R?", "", start)

        if start.isnumeric():
            side_identifier += " {:0>2}".format(start)

    # Normalize side identifier, in order to remove accents
    return _normalize_for_identifier(side_identifier)


class MediaAdminForm(ModelForm):
    def __init__(self, *args, **kwargs):
        """
        Build the form, then plug the custom templates of the ISBN and side
        identifier widgets.
        """
        super().__init__(*args, **kwargs)

        isbn_field = self.fields.get('isbn')
        if isbn_field:
            isbn_widget = isbn_field.widget
            isbn_widget.template_name = "media/isbn_button.html"
            isbn_widget.attrs.update({'autofocus': 'autofocus'})

        side_identifier_field = self.fields.get('side_identifier')
        # The identifier is only suggested for objects that are already saved
        if not (side_identifier_field and self.instance
                and self.instance.pk):
            return

        instance = self.instance
        generated = generate_side_identifier(
            instance.title,
            instance.authors.all(),
            # Not every instance has a subtitle attribute
            getattr(instance, "subtitle", None),
        )
        side_identifier_widget = side_identifier_field.widget
        side_identifier_widget.attrs.update(
            {'data-generated-side-identifier': generated})
        side_identifier_widget.template_name = \
            "media/generate_side_identifier.html"

    def download_data_isbndb(self, isbn):
        """
        Download data from ISBNdb and store it in ``cleaned_data``.

        The API key is read from the ``ISBNDB_KEY`` environment variable. It
        is only sent in the ``Authorization`` header, never in the URL.

        :return True if success, False if the book is not found or if the
            service can't be reached or sends an unexpected answer
        """
        req = urllib.request.Request(
            "https://api2.isbndb.com/book/" + str(isbn))
        req.add_header("Authorization", os.getenv("ISBNDB_KEY", ""))
        try:
            # The timeout prevents the admin request from hanging forever
            with urllib.request.urlopen(req, timeout=10) as response:
                data = json.loads(response.read().decode())["book"]
        except HTTPError:
            # Unknown ISBN, bad API key, ...
            return False
        except (OSError, ValueError, KeyError):
            # Network failure or malformed answer: let the caller try
            # another source
            return False
        if not isinstance(data, dict):
            return False

        # Missing year part are replaced by the first month/day
        publish_date = str(data.get("date_published") or "1970-01-01")[:10]
        while len(publish_date) in (4, 7):
            publish_date += "-01"

        self.cleaned_data["title"] = data.get("title", "")
        self.cleaned_data["publish_date"] = publish_date
        self.cleaned_data["number_of_pages"] = data.get("pages", 0)
        self.cleaned_data["authors"] = [
            Author.objects.get_or_create(name=author_name)[0]
            for author_name in data.get("authors") or []
        ]
        self.cleaned_data["external_url"] = data.get("image", "")
        return True

    def download_data_bedeteque(self, isbn):
        """
        Search the ISBN on bedeteque and merge what the scraper returns for
        the first result into ``cleaned_data``.
        :return True if success
        """
        scraper = BedetequeScraper()
        results = scraper.search_by_isbn(isbn)
        if results:
            # The first result is considered as the most accurate
            self.cleaned_data.update(scraper.scrap_comic_info(results[0]))
            return True
        return False

    def download_data_google(self, isbn):
        """
        Download data from google books

        Only a volume whose industry identifiers contain exactly the queried
        ISBN is imported.

        :return True if success, False if no volume matches or if the
            service can't be reached or sends an unexpected answer
        """
        api_url = "https://www.googleapis.com/books/v1/volumes?q=ISBN:{}"\
            .format(isbn)
        try:
            with urllib.request.urlopen(api_url, timeout=10) as response:
                data = json.loads(response.read().decode())
        except (OSError, ValueError):
            # Network failure or invalid JSON: let the caller try another
            # source instead of crashing the form validation
            return False

        if not isinstance(data, dict) or not data.get('totalItems'):
            return False

        for item in data.get('items', []):
            # Some volumes don't have any industry identifier
            identifiers = item.get("volumeInfo", {})\
                .get("industryIdentifiers", [])
            if any(entry.get("identifier") == isbn
                   for entry in identifiers):
                # Fill the data
                self.parse_data_google(item)
                return True
        return False

    def parse_data_google(self, data):
        """
        Fill ``cleaned_data`` from a volume of the Google books API.

        :param data: one item of the API answer; the fields are read from
            its ``volumeInfo`` entry
        """
        info = data['volumeInfo']
        if 'canonicalVolumeLink' in info:
            self.cleaned_data['external_url'] = info['canonicalVolumeLink']
        if 'title' in info:
            self.cleaned_data['title'] = info['title']
        # The subtitle is stored in volumeInfo, like the title
        if 'subtitle' in info:
            self.cleaned_data['subtitle'] = info['subtitle']

        if 'pageCount' in info:
            self.cleaned_data['number_of_pages'] = info['pageCount']
        elif not self.cleaned_data.get('number_of_pages'):
            self.cleaned_data['number_of_pages'] = 0

        published_date = info.get('publishedDate')
        if published_date:
            if "-" not in published_date:
                # Only the year is known
                published_date += "-01-01"
            elif len(published_date) == 7:
                # No day of month, set the first day of the month
                published_date += "-01"
            self.cleaned_data['publish_date'] = published_date[:10]

        # Copy the current authors in a list, in order to be able to append
        # the fetched ones whatever the type of the current value is
        authors = list(self.cleaned_data.get('authors') or [])
        for author_name in info.get('authors', []):
            authors.append(
                Author.objects.get_or_create(name=author_name)[0])
        self.cleaned_data['authors'] = authors

    def download_data_openlibrary(self, isbn):
        """
        Download data from openlibrary

        :return True if success, False if the book is not found or if the
            service can't be reached or sends an unexpected answer
        """
        api_url = "https://openlibrary.org/api/books?bibkeys=ISBN:{}" \
                  "&format=json&jscmd=data".format(isbn)
        try:
            with urllib.request.urlopen(api_url, timeout=10) as response:
                data = json.loads(response.read().decode())
        except (OSError, ValueError):
            # Network failure or invalid JSON: the ISBN is reported as not
            # found instead of crashing the form validation
            return False

        # The answer is indexed by the requested bibkey
        book = data.get('ISBN:' + isbn) if isinstance(data, dict) else None
        if book and 'url' in book:
            # Fill the data
            self.parse_data_openlibrary(book)
            return True
        return False

    def parse_data_openlibrary(self, data):
        """
        Fill ``cleaned_data`` from a book of the OpenLibrary API.

        The publish date is converted to ``YYYY-MM-DD``. The supported
        formats are ``YYYY``, ``Month YYYY`` and ``Month DD, YYYY``; a date
        that can't be understood is ignored.

        :param data: the book entry of the API answer, it must contain the
            ``url`` key
        """
        self.cleaned_data['external_url'] = data['url']
        if 'title' in data:
            self.cleaned_data['title'] = data['title']
        if 'subtitle' in data:
            self.cleaned_data['subtitle'] = data['subtitle']

        if 'number_of_pages' in data:
            self.cleaned_data['number_of_pages'] = data['number_of_pages']
        elif not self.cleaned_data.get('number_of_pages'):
            self.cleaned_data['number_of_pages'] = 0

        raw_date = data.get('publish_date')
        if raw_date:
            # Months are recognized by their three first letters.
            # "Mai" was accepted by the previous implementation, keep it.
            month_to_number = dict(
                Jan="01", Feb="02", Mar="03", Apr="04", May="05", Jun="06",
                Jul="07", Aug="08", Sep="09", Oct="10", Nov="11", Dec="12",
                Mai="05",
            )
            parts = str(raw_date).replace(',', '').split()
            publish_date = None
            if len(parts) == 1:
                # Only the year is known
                publish_date = parts[0] + "-01-01"
            elif len(parts) >= 2:
                month = month_to_number.get(parts[0][:3])
                if month and len(parts) == 2:
                    # "Month YYYY": set the first day of the month
                    publish_date = "{}-{}-01".format(parts[1], month)
                elif month and parts[1].isdecimal():
                    # "Month DD, YYYY"
                    publish_date = "{}-{}-{:02d}".format(
                        parts[2], month, int(parts[1]))
            if publish_date:
                self.cleaned_data['publish_date'] = publish_date

        # Copy the current authors in a list, in order to be able to append
        # the fetched ones whatever the type of the current value is
        authors = list(self.cleaned_data.get('authors') or [])
        for author in data.get('authors', []):
            authors.append(
                Author.objects.get_or_create(name=author['name'])[0])
        self.cleaned_data['authors'] = authors

    def clean(self):
        """
        If user fetch ISBN data, then download data before validating the form

        The sources are tried in this order, until one of them succeeds:
        ISBNdb, bedeteque, Google books, OpenLibrary. Then the side
        identifier is generated if the authors are known.

        :return the cleaned data
        """
        super().clean()

        if "_isbn" not in self.data \
                and "_isbn_addanother" not in self.data:
            # Regular save, nothing to fetch
            return self.cleaned_data

        isbn = self.cleaned_data.get('isbn')
        if "_isbn_addanother" in self.data:
            # Act as if the "save and add another" button was clicked
            self.data = self.data.copy()
            self.data['_addanother'] = 42
            # NOTE(review): self.request is not set in this class,
            # presumably the admin attaches it to the form; to be confirmed
            self.request.POST = self.data
        if not isbn:
            return self.cleaned_data

        downloaders = (
            self.download_data_isbndb,
            self.download_data_bedeteque,
            self.download_data_google,
            self.download_data_openlibrary,
        )
        # any() stops at the first source that succeeds
        if not any(download(isbn) for download in downloaders):
            self.add_error('isbn', _("This ISBN is not found."))
            return self.cleaned_data

        title = self.cleaned_data.get('title')
        if title:
            self.cleaned_data['title'] = title.replace('(AUT) ', '')

        authors = self.cleaned_data.get('authors')
        if authors:
            # The form may be used without subtitle field, so the subtitle
            # is optional
            self.cleaned_data['side_identifier'] = generate_side_identifier(
                self.cleaned_data.get('title') or '',
                authors,
                self.cleaned_data.get('subtitle'),
            )

        return self.cleaned_data

    def _clean_fields(self):
        """
        Fill ``cleaned_data`` field by field.

        The validation of a field (``field.clean``) is skipped as soon as
        the ``isbn`` key is in the submitted data and a cleaned ISBN is
        already known: the raw value is then stored as is. The
        ``clean_<name>`` hooks are always called.
        """
        from django.core.exceptions import ValidationError

        for name, field in self.fields.items():
            if field.disabled:
                value = self.get_initial_for_field(field, name)
            else:
                # value_from_datadict() gets the data from the data
                # dictionaries. Each widget type knows how to retrieve its
                # own data, because some widgets split data over several
                # HTML fields.
                value = field.widget.value_from_datadict(
                    self.data, self.files, self.add_prefix(name))
            try:
                # We don't want to check a field when we enter an ISBN.
                # NOTE(review): the key tested here is "isbn" while clean()
                # looks for the "_isbn" button; confirm this is intended.
                isbn_entered = "isbn" in self.data \
                    and self.cleaned_data.get('isbn')
                if not isbn_entered:
                    value = field.clean(value)
                self.cleaned_data[name] = value
                if hasattr(self, 'clean_%s' % name):
                    custom_clean = getattr(self, 'clean_%s' % name)
                    value = custom_clean()
                    self.cleaned_data[name] = value
            except ValidationError as error:
                self.add_error(name, error)

    class Meta:
        # Model edited by the form
        model = Comic
        # Fields of the form, in this order
        fields = (
            'isbn',
            'title',
            'subtitle',
            'external_url',
            'side_identifier',
            'authors',
            'number_of_pages',
            'publish_date',
            'present',
        )
-												Massive cleanup (1)

											
										
										
											2019-08-02 14:57:53 +02:00
+								# -*- mode: python; coding: utf-8 -*-
-												Prepare the website for 2021

											
										
										
											2020-12-27 18:22:52 +01:00
+								# Copyright (C) 2017-2021 by BDE ENS Paris-Saclay
-												Massive cleanup (1)

											
										
										
											2019-08-02 14:57:53 +02:00
+								# SPDX-License-Identifier: GPL-3.0-or-later
-												Add ISBN data downloader

											
										
										
											2019-08-11 10:40:39 +02:00
+								import json
-												Temporary use ISBNDB to scrap data

											
										
										
											2020-10-03 15:36:28 +02:00
+								import os
-												Side identifiers cannot begin with le/la/les/l'/the

											
										
										
											2020-02-10 17:51:09 +01:00
+								import re
-												Remove special chars from side identifiers

											
										
										
											2020-02-10 16:47:05 +01:00
+								import unicodedata
-												Too much 404...

											
										
										
											2020-10-03 15:51:25 +02:00
+								from urllib.error import HTTPError
-												FUCK LINTERS TEST

											
										
										
											2020-02-10 21:12:12 +01:00
+								import urllib.request
-												Add ISBN data downloader

											
										
										
											2019-08-11 10:40:39 +02:00
-												Add script to regenerate side identifiers

											
										
										
											2020-09-28 10:51:21 +02:00
+								from django.db.models import QuerySet
-												Massive cleanup (1)

											
										
										
											2019-08-02 14:57:53 +02:00
+								from django.forms import ModelForm
-												Scrap from Google books API

											
										
										
											2020-05-12 17:08:40 +02:00
+								from django.utils.translation import gettext_lazy as _
-												Massive cleanup (1)

											
										
										
											2019-08-02 14:57:53 +02:00
-												Translate models in english

											
										
										
											2021-10-23 18:31:03 +02:00
+								from .models import Author, Comic
-												Add bedeteque scraper

											
										
										
											2019-08-19 12:17:49 +02:00
+								from .scraper import BedetequeScraper
-												Add ISBN data downloader

											
										
										
											2019-08-11 10:40:39 +02:00
-												Add script to regenerate side identifiers

											
										
										
											2020-09-28 10:51:21 +02:00
+								def generate_side_identifier(title, authors, subtitle=None):
 								    if isinstance(authors, QuerySet):
 								        authors = list(authors)
 								    title_normalized = title.upper()
 								    title_normalized = title_normalized.replace('’', '\'')
 								    title_normalized = re.sub(r'^DE ', '', title_normalized)
-												Remove Des from title

											
										
										
											2020-10-03 11:03:22 +02:00
+								    title_normalized = re.sub(r'^DES ', '', title_normalized)
-												Add script to regenerate side identifiers

											
										
										
											2020-09-28 10:51:21 +02:00
+								    title_normalized = re.sub(r'^LE ', '', title_normalized)
 								    title_normalized = re.sub(r'^LA ', '', title_normalized)
 								    title_normalized = re.sub(r'^LES ', '', title_normalized)
 								    title_normalized = re.sub(r'^L\'', '', title_normalized)
-												Normalize after replacing UN

											
										
										
											2020-10-02 16:20:24 +02:00
+								    title_normalized = re.sub(r'^UN ', '', title_normalized)
 								    title_normalized = re.sub(r'^UNE ', '', title_normalized)
-												Add script to regenerate side identifiers

											
										
										
											2020-09-28 10:51:21 +02:00
+								    title_normalized = re.sub(r'^THE ', '', title_normalized)
 								    title_normalized = re.sub(r'Œ', 'OE', title_normalized)
 								    title_normalized = title_normalized.replace(' ', '')
-												Normalize after replacing L'

											
										
										
											2020-10-02 13:54:16 +02:00
+								    title_normalized = ''.join(
 								        char
 								        for char in unicodedata.normalize(
 								            'NFKD', title_normalized.casefold())
 								        if all(not unicodedata.category(char).startswith(cat)
 								               for cat in {'M', 'P', 'Z', 'C'}) or char == ' '
 								    ).casefold().upper()
-												Normalize before removing wrong letters

											
										
										
											2020-10-02 16:59:58 +02:00
+								    title_normalized = re.sub("[^A-Z0-9$]", "", title_normalized)
-												Add script to regenerate side identifiers

											
										
										
											2020-09-28 10:51:21 +02:00
+								    authors = authors.copy()
 								    def sort(author):
-												Linting

											
										
										
											2020-12-28 23:12:27 +01:00
+								        return "{:042d}".format(-author.note) + author.name.split(" ")[-1]\
 								               + ".{:042d}".format(author.pk)
-												Add script to regenerate side identifiers

											
										
										
											2020-09-28 10:51:21 +02:00
 								    authors.sort(key=sort)
 								    primary_author = authors[0]
 								    author_name = primary_author.name.upper()
 								    if ',' not in author_name and ' ' in author_name:
 								        author_name = author_name.split(' ')[-1]
 								    author_name = ''.join(
 								        char for char in unicodedata.normalize('NFKD', author_name.casefold())
-												Linting

											
										
										
											2020-12-28 23:12:27 +01:00
+								        if all(not unicodedata.category(char).startswith(cat)
 								               for cat in {'M', 'P', 'Z', 'C'}) or char == ' '
-												Add script to regenerate side identifiers

											
										
										
											2020-09-28 10:51:21 +02:00
+								    ).casefold().upper()
 								    author_name = re.sub("[^A-Z]", "", author_name)
 								    side_identifier = "{:.3} {:.3}".format(author_name, title_normalized, )
 								    if subtitle:
 								        subtitle = re.sub(r'</span>', '', subtitle)
 								        subtitle = re.sub(r'<span.*>', '', subtitle)
 								        start = subtitle.split(' ')[0].replace('.', '')
-												Allow to have a R in front of subtitles

											
										
										
											2020-10-02 16:43:04 +02:00
+								        start = re.sub("^R?", "", start)
-												Add script to regenerate side identifiers

											
										
										
											2020-09-28 10:51:21 +02:00
 								        if start.isnumeric():
 								            side_identifier += " {:0>2}".format(start, )
 								    # Normalize side identifier, in order to remove accents
-												Linting

											
										
										
											2020-12-28 23:12:27 +01:00
+								    side_identifier = ''.join(
 								        char for char in unicodedata.normalize('NFKD',
 								                                               side_identifier.casefold())
 								        if all(not unicodedata.category(char).startswith(cat)
 								               for cat in {'M', 'P', 'Z', 'C'})
 								        or char == ' ').casefold().upper()
-												Add script to regenerate side identifiers

											
										
										
											2020-09-28 10:51:21 +02:00
 								    return side_identifier
-												Add ISBN data downloader

											
										
										
											2019-08-11 10:40:39 +02:00
+								class MediaAdminForm(ModelForm):
 								    def __init__(self, *args, **kwargs):
 								        super().__init__(*args, **kwargs)
-												Fix isbn widget

											
										
										
											2019-08-15 16:30:44 +02:00
+								        isbn_field = self.fields.get('isbn')
 								        if isbn_field:
 								            isbn_field.widget.template_name = "media/isbn_button.html"
 								            isbn_field.widget.attrs.update({'autofocus': 'autofocus'})
-												Add button to generate a side identifier

											
										
										
											2020-10-02 13:12:40 +02:00
+								        side_identifier_field = self.fields.get('side_identifier')
 								        if side_identifier_field and self.instance and self.instance.pk:
 								            instance = self.instance
-												Linting

											
										
										
											2020-12-28 23:12:27 +01:00
+								            title, authors, subtitle = instance.title,\
 								                instance.authors.all(), None
-												Add button to generate a side identifier

											
										
										
											2020-10-02 13:12:40 +02:00
+								            if hasattr(instance, "subtitle"):
 								                subtitle = instance.subtitle
 								            side_identifier_field.widget.attrs.update(
-												Linting

											
										
										
											2020-12-28 23:12:27 +01:00
+								                {'data-generated-side-identifier':
 								                 generate_side_identifier(title, authors, subtitle)})
 								            side_identifier_field.widget.template_name =\
 								                "media/generate_side_identifier.html"
-												Add ISBN data downloader

											
										
										
											2019-08-11 10:40:39 +02:00
-												Temporary use ISBNDB to scrap data

											
										
										
											2020-10-03 15:36:28 +02:00
+								    def download_data_isbndb(self, isbn):
-												Linting

											
										
										
											2020-12-28 23:12:27 +01:00
+								        api_url = "https://api2.isbndb.com/book/" + str(isbn)\
 								                  + "?Authorization=" + os.getenv("ISBNDB_KEY", "")
-												Incorrect use of Authorization header

											
										
										
											2020-10-03 15:40:33 +02:00
+								        req = urllib.request.Request(api_url)
-												Fix CI, add django22-py39 CI step

											
										
										
											2020-12-28 22:55:48 +01:00
+								        req.add_header("Authorization", os.getenv("ISBNDB_KEY", ""))
-												Too much 404...

											
										
										
											2020-10-03 15:51:25 +02:00
+								        try:
 								            with urllib.request.urlopen(req) as url:
 								                data: dict = json.loads(url.read().decode())["book"]
 								        except HTTPError:
 								            return False
-												Incorrect use of Authorization header

											
										
										
											2020-10-03 15:41:49 +02:00
+								        print(data)
-												Wrong use of ISBNDB API

											
										
										
											2020-10-03 15:47:22 +02:00
+								        data.setdefault("title", "")
 								        data.setdefault("date_published", "1970-01-01")
 								        data.setdefault("pages", 0)
 								        data.setdefault("authors", [])
 								        data.setdefault("image", "")
-												Incorrect use of Authorization header

											
										
										
											2020-10-03 15:41:49 +02:00
+								        self.cleaned_data["title"] = data["title"]
-												Better publish date

											
										
										
											2020-10-03 15:55:56 +02:00
+								        self.cleaned_data["publish_date"] = data["date_published"][:10]
-												Linting

											
										
										
											2020-12-28 23:12:27 +01:00
+								        while len(self.cleaned_data["publish_date"]) == 4 \
 								                or len(self.cleaned_data["publish_date"]) == 7:
-												Better publish date

											
										
										
											2020-10-03 15:55:56 +02:00
+								            self.cleaned_data["publish_date"] += "-01"
-												Temporary use ISBNDB to scrap data

											
										
										
											2020-10-03 15:36:28 +02:00
+								        self.cleaned_data["number_of_pages"] = data["pages"]
 								        self.cleaned_data["authors"] = \
-												Translate models in english

											
										
										
											2021-10-23 18:31:03 +02:00
+								            list(Author.objects.get_or_create(name=author_name)[0]
-												Linting

											
										
										
											2020-12-28 23:12:27 +01:00
+								                 for author_name in data["authors"])
-												Temporary use ISBNDB to scrap data

											
										
										
											2020-10-03 15:36:28 +02:00
+								        self.cleaned_data["external_url"] = data["image"]
 								        return True
-												Add bedeteque scraper

											
										
										
											2019-08-19 12:17:49 +02:00
+								    def download_data_bedeteque(self, isbn):
 								        """
 								        Download data from bedeteque
 								        :return True if success
-												Add ISBN data downloader

											
										
										
											2019-08-11 10:40:39 +02:00
+								        """
-												Add bedeteque scraper

											
										
										
											2019-08-19 12:17:49 +02:00
+								        scraper = BedetequeScraper()
 								        r = scraper.search_by_isbn(isbn)
 								        if not r:
 								            return False
 								        # If results, then take the most accurate
-												Fix tests

											
										
										
											2021-10-23 18:33:51 +02:00
+								        data = scraper.scrap_comic_info(r[0])
-												Add bedeteque scraper

											
										
										
											2019-08-19 12:17:49 +02:00
+								        self.cleaned_data.update(data)
 								        return True
-												Scrap from Google books API

											
										
										
											2020-05-12 17:08:40 +02:00
+								    def download_data_google(self, isbn):
 								        """
 								        Download data from google books
 								        :return True if success
 								        """
-												Fix linters

											
										
										
											2020-05-12 17:34:53 +02:00
+								        api_url = "https://www.googleapis.com/books/v1/volumes?q=ISBN:{}"\
 								            .format(isbn)
-												Scrap from Google books API

											
										
										
											2020-05-12 17:08:40 +02:00
+								        with urllib.request.urlopen(api_url) as url:
 								            data = json.loads(url.read().decode())
 								        if data and data['totalItems']:
-												Ensure that the fetched roman ISBN is the queried ISBN

											
										
										
											2020-10-03 15:10:43 +02:00
+								            fetched_item = None
 								            for item in data['items']:
 								                for identifiers in item["volumeInfo"]["industryIdentifiers"]:
 								                    if identifiers["identifier"] == isbn:
 								                        fetched_item = item
 								                        break
 								                if fetched_item:
 								                    break
 								            if not fetched_item:
 								                return False
-												Scrap from Google books API

											
										
										
											2020-05-12 17:08:40 +02:00
+								            # Fill the data
-												Ensure that the fetched roman ISBN is the queried ISBN

											
										
										
											2020-10-03 15:10:43 +02:00
+								            self.parse_data_google(fetched_item)
-												Scrap from Google books API

											
										
										
											2020-05-12 17:08:40 +02:00
+								            return True
 								        return False
 								    def parse_data_google(self, data):
 								        info = data['volumeInfo']
 								        self.cleaned_data['external_url'] = info['canonicalVolumeLink']
 								        if 'title' in info:
 								            self.cleaned_data['title'] = info['title']
 								        if 'subtitle' in data:
 								            self.cleaned_data['subtitle'] = info['subtitle']
 								        if 'pageCount' in info:
 								            self.cleaned_data['number_of_pages'] = \
 								                info['pageCount']
 								        elif not self.cleaned_data['number_of_pages']:
 								            self.cleaned_data['number_of_pages'] = 0
 								        if 'publishedDate' in info:
-												Better roman import

											
										
										
											2020-10-02 17:12:02 +02:00
+								            if "-" not in info["publishedDate"]:
 								                info["publishedDate"] += "-01-01"
-												If no day of month, set the first day of the month

											
										
										
											2020-10-03 15:15:35 +02:00
+								            elif len(info["publishedDate"]) == 7:
 								                info["publishedDate"] += "-01"
-												Get only the ten first characters for the date

											
										
										
											2020-10-02 17:42:56 +02:00
+								            self.cleaned_data['publish_date'] = info['publishedDate'][:10]
-												Scrap from Google books API

											
										
										
											2020-05-12 17:08:40 +02:00
-												Fix synthax

											
										
										
											2020-05-21 17:07:50 +02:00
+								        if 'authors' not in self.cleaned_data \
 								                or not self.cleaned_data['authors']:
-												Scrap from Google books API

											
										
										
											2020-05-12 17:08:40 +02:00
+								            self.cleaned_data['authors'] = list()
 								        if 'authors' in info:
 								            for author in info['authors']:
-												Translate models in english

											
										
										
											2021-10-23 18:31:03 +02:00
+								                author_obj = Author.objects.get_or_create(
-												Scrap from Google books API

											
										
										
											2020-05-12 17:08:40 +02:00
+								                    name=author)[0]
 								                self.cleaned_data['authors'].append(author_obj)
 								        print(self.cleaned_data)
-												Add bedeteque scraper

											
										
										
											2019-08-19 12:17:49 +02:00
+								    def download_data_openlibrary(self, isbn):
 								        """
 								        Download data from openlibrary
 								        :return True if success
-												Add ISBN data downloader

											
										
										
											2019-08-11 10:40:39 +02:00
+								        """
 								        api_url = "https://openlibrary.org/api/books?bibkeys=ISBN:{}" \
 								                  "&format=json&jscmd=data".format(isbn)
 								        with urllib.request.urlopen(api_url) as url:
 								            data = json.loads(url.read().decode())
-												Clean code

											
										
										
											2020-02-10 11:47:59 +01:00
-												Add ISBN data downloader

											
										
										
											2019-08-11 10:40:39 +02:00
+								        if data and data['ISBN:' + isbn]:
 								            data = data['ISBN:' + isbn]
-												Fix data dict for isbn

											
										
										
											2019-08-11 11:19:24 +02:00
+								            if 'url' in data:
-												Add bedeteque scraper

											
										
										
											2019-08-19 12:17:49 +02:00
+								                # Fill the data
-												Less complexity for download_data_openlibrary

											
										
										
											2020-02-10 12:08:48 +01:00
+								                self.parse_data_openlibrary(data)
 								                return True
 								        return False
-												Clean code

											
										
										
											2020-02-10 11:47:59 +01:00
-												Less complexity for download_data_openlibrary

											
										
										
											2020-02-10 12:08:48 +01:00
+								    def parse_data_openlibrary(self, data):
 								        self.cleaned_data['external_url'] = data['url']
 								        if 'title' in data:
 								            self.cleaned_data['title'] = data['title']
 								        if 'subtitle' in data:
 								            self.cleaned_data['subtitle'] = data['subtitle']
-												Clean code

											
										
										
											2020-02-10 11:47:59 +01:00
-												Less complexity for download_data_openlibrary

											
										
										
											2020-02-10 12:08:48 +01:00
+								        if 'number_of_pages' in data:
 								            self.cleaned_data['number_of_pages'] = \
 								                data['number_of_pages']
 								        elif not self.cleaned_data['number_of_pages']:
 								            self.cleaned_data['number_of_pages'] = 0
-												Clean code

											
										
										
											2020-02-10 11:47:59 +01:00
-												Less complexity for download_data_openlibrary

											
										
										
											2020-02-10 12:08:48 +01:00
+								        if 'publish_date' in data:
 								            months = ['January', 'February', "March", "April", "Mai",
 								                      "June", "July", "August", "September",
 								                      "October", "November", "December"]
 								            split = data['publish_date'].replace(',', '').split(' ')
-												Fixed publish date when there is only the year

											
										
										
											2020-02-10 19:33:34 +01:00
+								            if len(split) == 1:
 								                self.cleaned_data['publish_date'] = split[0] + "-01-01"
 								            else:
-												Fix openlibrary scrap, fix linters

											
										
										
											2020-05-12 14:56:31 +02:00
+								                month_to_number = dict(
 								                    Jan="01",
 								                    Feb="02",
 								                    Mar="03",
 								                    Apr="04",
 								                    May="05",
 								                    Jun="06",
 								                    Jul="07",
 								                    Aug="08",
 								                    Sep="09",
 								                    Oct="10",
 								                    Nov="11",
 								                    Dec="12",
 								                )
-												Scrap from Google books API

											
										
										
											2020-05-12 17:08:40 +02:00
+								                if split[0][:3] in month_to_number:
-												Fix openlibrary scrap, fix linters

											
										
										
											2020-05-12 14:56:31 +02:00
+								                    self.cleaned_data['publish_date']\
 								                        = split[2] + "-" \
-												Scrap from Google books API

											
										
										
											2020-05-12 17:08:40 +02:00
+								                        + month_to_number[split[0][:3]] + "-" + split[1]
-												Fix openlibrary scrap, fix linters

											
										
										
											2020-05-12 14:56:31 +02:00
+								                else:
 								                    self.cleaned_data['publish_date'] = "{}-{:02d}-{:02d}" \
 								                        .format(split[2], months.index(split[0])
 								                                + 1, int(split[1]), )
-												Less complexity for download_data_openlibrary

											
										
										
											2020-02-10 12:08:48 +01:00
-												Fix synthax

											
										
										
											2020-05-21 17:07:50 +02:00
+								        if 'authors' not in self.cleaned_data \
 								                or not self.cleaned_data['authors']:
-												Scrap from Google books API

											
										
										
											2020-05-12 17:08:40 +02:00
+								            self.cleaned_data['authors'] = list()
-												Less complexity for download_data_openlibrary

											
										
										
											2020-02-10 12:08:48 +01:00
+								        if 'authors' in data:
 								            for author in data['authors']:
-												Translate models in english

											
										
										
											2021-10-23 18:31:03 +02:00
+								                author_obj = Author.objects.get_or_create(
-												Less complexity for download_data_openlibrary

											
										
										
											2020-02-10 12:08:48 +01:00
+								                    name=author['name'])[0]
 								                self.cleaned_data['authors'].append(author_obj)
-												Add ISBN data downloader

											
										
										
											2019-08-11 10:40:39 +02:00
 								    def clean(self):
 								        """
 								        If user fetch ISBN data, then download data before validating the form
 								        """
-												Required fields are not checked when the ISBN is typed

											
										
										
											2020-02-10 03:08:08 +01:00
+								        super().clean()
-												Split comic strips and mangas

											
										
										
											2020-05-21 16:56:41 +02:00
+								        if "_isbn" in self.data\
 								                or "_isbn_addanother" in self.data:
-												Add ISBN data downloader

											
										
										
											2019-08-11 10:40:39 +02:00
+								            isbn = self.cleaned_data.get('isbn')
-												Split comic strips and mangas

											
										
										
											2020-05-21 16:56:41 +02:00
+								            if "_isbn_addanother" in self.data:
 								                self.data = self.data.copy()
 								                self.data['_addanother'] = 42
-												Romans were missing

											
										
										
											2020-05-22 21:08:44 +02:00
+								                self.request.POST = self.data
-												Add ISBN data downloader

											
										
										
											2019-08-11 10:40:39 +02:00
+								            if isbn:
-												Temporary use ISBNDB to scrap data

											
										
										
											2020-10-03 15:36:28 +02:00
+								                scrap_result = self.download_data_isbndb(isbn)
-												Add bedeteque scraper

											
										
										
											2019-08-19 12:17:49 +02:00
+								                if not scrap_result:
-												Temporary use ISBNDB to scrap data

											
										
										
											2020-10-03 15:36:28 +02:00
+								                    # ISBN is present, try with bedeteque
 								                    scrap_result = self.download_data_bedeteque(isbn)
-												Scrap from Google books API

											
										
										
											2020-05-12 17:08:40 +02:00
+								                    if not scrap_result:
-												Temporary use ISBNDB to scrap data

											
										
										
											2020-10-03 15:36:28 +02:00
+								                        # Try with Google
 								                        scrap_result = self.download_data_google(isbn)
 								                        if not scrap_result:
 								                            # Try with OpenLibrary
 								                            if not self.download_data_openlibrary(isbn):
 								                                self.add_error('isbn',
 								                                               _("This ISBN is not found."))
 								                                return self.cleaned_data
-												Add ISBN data downloader

											
										
										
											2019-08-11 10:40:39 +02:00
-												Remove `(AUT)` fields from titles

											
										
										
											2020-02-10 18:23:13 +01:00
+								                if self.cleaned_data['title']:
-												Fix indentation

											
										
										
											2020-02-11 21:12:00 +01:00
+								                    self.cleaned_data['title'] = re.sub(
 								                        r'\(AUT\) ',
 								                        '',
 								                        self.cleaned_data['title']
 								                    )
-												Remove `(AUT)` fields from titles

											
										
										
											2020-02-10 18:23:13 +01:00
-												Fix authors & openlibrary scrap

											
										
										
											2020-02-10 11:29:26 +01:00
+								                if self.cleaned_data['authors']:
-												Add script to regenerate side identifiers

											
										
										
											2020-09-28 10:51:21 +02:00
+								                    side_identifier = generate_side_identifier(
 								                        self.cleaned_data["title"],
 								                        self.cleaned_data["authors"],
 								                        self.cleaned_data["subtitle"],
 								                    )
-												Fix authors & openlibrary scrap

											
										
										
											2020-02-10 11:29:26 +01:00
+								                    self.cleaned_data['side_identifier'] = side_identifier
-												Required fields are not checked when the ISBN is typed

											
										
										
											2020-02-10 03:08:08 +01:00
+								        return self.cleaned_data
 								    def _clean_fields(self):
 								        for name, field in self.fields.items():
 								            # value_from_datadict() gets the data from the data dictionaries.
 								            # Each widget type knows how to retrieve its own data, because some
 								            # widgets split data over several HTML fields.
 								            if field.disabled:
 								                value = self.get_initial_for_field(field, name)
 								            else:
-												Pipelines

											
										
										
											2020-02-10 04:59:39 +01:00
+								                value = field.widget.value_from_datadict(
 								                    self.data, self.files, self.add_prefix(name))
-												Required fields are not checked when the ISBN is typed

											
										
										
											2020-02-10 03:08:08 +01:00
+								            from django.core.exceptions import ValidationError
 								            try:
 								                # We don't want to check a field when we enter an ISBN.
-												Split comic strips and mangas

											
										
										
											2020-05-21 16:56:41 +02:00
+								                if "isbn" not in self.data \
-												Pipelines

											
										
										
											2020-02-10 04:59:39 +01:00
+								                        or not self.cleaned_data.get('isbn'):
-												Required fields are not checked when the ISBN is typed

											
										
										
											2020-02-10 03:08:08 +01:00
+								                    value = field.clean(value)
 								                self.cleaned_data[name] = value
 								                if hasattr(self, 'clean_%s' % name):
 								                    value = getattr(self, 'clean_%s' % name)()
 								                    self.cleaned_data[name] = value
 								            except ValidationError as e:
 								                self.add_error(name, e)
-												Split comic strips and mangas

											
										
										
											2020-05-21 16:56:41 +02:00
 								    class Meta:
-												Translate models in english

											
										
										
											2021-10-23 18:31:03 +02:00
+								        model = Comic
-												Linting

											
										
										
											2020-12-28 23:12:27 +01:00
+								        fields = ('isbn', 'title', 'subtitle', 'external_url',
 								                  'side_identifier', 'authors', 'number_of_pages',
 								                  'publish_date', 'present', )