# -*- mode: python; coding: utf-8 -*-
# Copyright (C) 2017-2021 by BDE ENS Paris-Saclay
# SPDX-License-Identifier: GPL-3.0-or-later
import json
import os
import re
import unicodedata
import urllib.request
from urllib.error import HTTPError

from django.core.exceptions import ValidationError
from django.db.models import QuerySet
from django.forms import ModelForm
from django.utils.translation import gettext_lazy as _

from .models import Author, Comic
from .scraper import BedetequeScraper


def _strip_marks(text):
    """
    NFKD-decompose *text* and drop accents (marks), punctuation,
    separators and control characters (spaces are kept), then return
    the result uppercased.
    """
    decomposed = unicodedata.normalize('NFKD', text.casefold())
    kept = [char for char in decomposed
            if char == ' ' or unicodedata.category(char)[0] not in 'MPZC']
    return ''.join(kept).casefold().upper()


def generate_side_identifier(title, authors, subtitle=None):
    """
    Compute the shelf identifier of a media item.

    It is built from the first three letters of the primary author's last
    name and the first three letters of the normalized title, plus, when
    the subtitle starts with a volume number, that number zero-padded,
    e.g. "TOL SEI 02".

    :param title: title of the media
    :param authors: list or QuerySet of ``Author`` objects
    :param subtitle: optional subtitle; a leading number is appended
        to the identifier as a volume number
    :return: the normalized, accent-free side identifier string
    """
    if isinstance(authors, QuerySet):
        authors = list(authors)

    title_normalized = title.upper()
    title_normalized = title_normalized.replace('’', '\'')
    # Strip one leading French/English article, applied in this order
    # (a later pattern may match what an earlier removal uncovered).
    for article in (r'^DE ', r'^DES ', r'^LE ', r'^LA ', r'^LES ',
                    r"^L'", r'^UN ', r'^UNE ', r'^THE '):
        title_normalized = re.sub(article, '', title_normalized)
    title_normalized = re.sub('Œ', 'OE', title_normalized)
    title_normalized = title_normalized.replace(' ', '')
    title_normalized = _strip_marks(title_normalized)
    title_normalized = re.sub("[^A-Z0-9$]", "", title_normalized)

    authors = authors.copy()

    def sort_key(author):
        # Highest note first, then the last word of the name, with the
        # primary key as a stable tie-breaker.
        return ("{:042d}".format(-author.note)
                + author.name.split(" ")[-1]
                + ".{:042d}".format(author.pk))

    authors.sort(key=sort_key)
    primary_author = authors[0]

    author_name = primary_author.name.upper()
    if ',' not in author_name and ' ' in author_name:
        # "First Last" form: keep only the last name.
        author_name = author_name.split(' ')[-1]
    author_name = _strip_marks(author_name)
    author_name = re.sub("[^A-Z]", "", author_name)

    side_identifier = "{:.3} {:.3}".format(author_name, title_normalized)

    if subtitle:
        start = subtitle.split(' ')[0].replace('.', '')
        # Drop an optional leading "R" before testing for a volume number.
        start = re.sub("^R?", "", start)
        if start.isnumeric():
            side_identifier += " {:0>2}".format(start)

    # Normalize the whole identifier once more to remove any remaining
    # accents.
    return _strip_marks(side_identifier)


class MediaAdminForm(ModelForm):
    """
    Admin form for media that can prefill its fields from an ISBN by
    querying, in order: ISBNdb, Bedeteque, Google Books, OpenLibrary.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        isbn_field = self.fields.get('isbn')
        if isbn_field:
            # Render the ISBN field with its "fetch data" button and
            # focus it by default.
            isbn_field.widget.template_name = "media/isbn_button.html"
            isbn_field.widget.attrs.update({'autofocus': 'autofocus'})
        side_identifier_field = self.fields.get('side_identifier')
        if side_identifier_field and self.instance and self.instance.pk:
            instance = self.instance
            title, authors, subtitle = instance.title, \
                instance.authors.all(), None
            if hasattr(instance, "subtitle"):
                subtitle = instance.subtitle
            # Expose the suggested identifier to the widget template.
            side_identifier_field.widget.attrs.update(
                {'data-generated-side-identifier':
                    generate_side_identifier(title, authors, subtitle)})
            side_identifier_field.widget.template_name = \
                "media/generate_side_identifier.html"

    def download_data_isbndb(self, isbn):
        """
        Download data from ISBNdb.

        :return: True if successful
        """
        api_url = "https://api2.isbndb.com/book/" + str(isbn)
        req = urllib.request.Request(api_url)
        # The API key is sent only in the Authorization header; it must
        # not be leaked as a URL query parameter.
        req.add_header("Authorization", os.getenv("ISBNDB_KEY", ""))
        try:
            with urllib.request.urlopen(req) as url:
                data: dict = json.loads(url.read().decode())["book"]
        except HTTPError:
            return False

        data.setdefault("title", "")
        data.setdefault("date_published", "1970-01-01")
        data.setdefault("pages", 0)
        data.setdefault("authors", [])
        data.setdefault("image", "")
        self.cleaned_data["title"] = data["title"]
        self.cleaned_data["publish_date"] = data["date_published"][:10]
        # Complete partial dates ("YYYY" or "YYYY-MM") down to a full day.
        while len(self.cleaned_data["publish_date"]) == 4 \
                or len(self.cleaned_data["publish_date"]) == 7:
            self.cleaned_data["publish_date"] += "-01"
        self.cleaned_data["number_of_pages"] = data["pages"]
        self.cleaned_data["authors"] = [
            Author.objects.get_or_create(name=author_name)[0]
            for author_name in data["authors"]]
        self.cleaned_data["external_url"] = data["image"]
        return True

    def download_data_bedeteque(self, isbn):
        """
        Download data from bedeteque.

        :return: True if successful
        """
        scraper = BedetequeScraper()
        r = scraper.search_by_isbn(isbn)
        if not r:
            return False
        # If there are results, take the most accurate one.
        data = scraper.scrap_comic_info(r[0])
        self.cleaned_data.update(data)
        return True

    def download_data_google(self, isbn):
        """
        Download data from Google Books.

        :return: True if successful
        """
        api_url = "https://www.googleapis.com/books/v1/volumes?q=ISBN:{}"\
            .format(isbn)
        with urllib.request.urlopen(api_url) as url:
            data = json.loads(url.read().decode())
        if not data or not data['totalItems']:
            return False
        # Keep only the item whose industry identifiers contain the
        # requested ISBN.
        fetched_item = None
        for item in data['items']:
            for identifiers in item["volumeInfo"]["industryIdentifiers"]:
                if identifiers["identifier"] == isbn:
                    fetched_item = item
                    break
            if fetched_item:
                break
        if not fetched_item:
            return False
        self.parse_data_google(fetched_item)
        return True

    def parse_data_google(self, data):
        """Fill ``cleaned_data`` from a Google Books volume entry."""
        info = data['volumeInfo']
        self.cleaned_data['external_url'] = info['canonicalVolumeLink']
        if 'title' in info:
            self.cleaned_data['title'] = info['title']
        # Fixed: the subtitle key lives in volumeInfo, not in the
        # top-level entry (the old test `'subtitle' in data` never hit).
        if 'subtitle' in info:
            self.cleaned_data['subtitle'] = info['subtitle']
        if 'pageCount' in info:
            self.cleaned_data['number_of_pages'] = info['pageCount']
        elif not self.cleaned_data['number_of_pages']:
            self.cleaned_data['number_of_pages'] = 0
        if 'publishedDate' in info:
            # Complete partial dates ("YYYY" or "YYYY-MM") to a full day.
            if "-" not in info["publishedDate"]:
                info["publishedDate"] += "-01-01"
            elif len(info["publishedDate"]) == 7:
                info["publishedDate"] += "-01"
            self.cleaned_data['publish_date'] = info['publishedDate'][:10]
        if not self.cleaned_data.get('authors'):
            self.cleaned_data['authors'] = list()
        for author in info.get('authors', []):
            author_obj = Author.objects.get_or_create(name=author)[0]
            self.cleaned_data['authors'].append(author_obj)

    def download_data_openlibrary(self, isbn):
        """
        Download data from OpenLibrary.

        :return: True if successful
        """
        api_url = "https://openlibrary.org/api/books?bibkeys=ISBN:{}" \
                  "&format=json&jscmd=data".format(isbn)
        with urllib.request.urlopen(api_url) as url:
            data = json.loads(url.read().decode())
        if data and data.get('ISBN:' + isbn):
            data = data['ISBN:' + isbn]
            if 'url' in data:
                # Fill the data
                self.parse_data_openlibrary(data)
                return True
        return False

    def parse_data_openlibrary(self, data):
        """Fill ``cleaned_data`` from an OpenLibrary record."""
        self.cleaned_data['external_url'] = data['url']
        if 'title' in data:
            self.cleaned_data['title'] = data['title']
        if 'subtitle' in data:
            self.cleaned_data['subtitle'] = data['subtitle']
        if 'number_of_pages' in data:
            self.cleaned_data['number_of_pages'] = data['number_of_pages']
        elif not self.cleaned_data['number_of_pages']:
            self.cleaned_data['number_of_pages'] = 0
        if 'publish_date' in data:
            # Fixed: the fifth month was "Mai" (French), which made
            # months.index("May") raise ValueError.
            months = ["January", "February", "March", "April", "May",
                      "June", "July", "August", "September", "October",
                      "November", "December"]
            split = data['publish_date'].replace(',', '').split(' ')
            if len(split) == 1:
                # Only a year is given.
                self.cleaned_data['publish_date'] = split[0] + "-01-01"
            else:
                # Dates come as "Month DD YYYY"; try abbreviated month
                # names first, then full names.
                month_to_number = dict(
                    Jan="01", Feb="02", Mar="03", Apr="04",
                    May="05", Jun="06", Jul="07", Aug="08",
                    Sep="09", Oct="10", Nov="11", Dec="12",
                )
                if split[0][:3] in month_to_number:
                    self.cleaned_data['publish_date'] \
                        = split[2] + "-" \
                        + month_to_number[split[0][:3]] + "-" + split[1]
                else:
                    self.cleaned_data['publish_date'] = "{}-{:02d}-{:02d}" \
                        .format(split[2], months.index(split[0]) + 1,
                                int(split[1]), )
        if not self.cleaned_data.get('authors'):
            self.cleaned_data['authors'] = list()
        for author in data.get('authors', []):
            author_obj = Author.objects.get_or_create(
                name=author['name'])[0]
            self.cleaned_data['authors'].append(author_obj)

    def clean(self):
        """
        If the user requested an ISBN fetch, download the data from the
        providers (ISBNdb, then Bedeteque, Google Books, OpenLibrary)
        before validating the form.
        """
        super().clean()
        if "_isbn" in self.data \
                or "_isbn_addanother" in self.data:
            isbn = self.cleaned_data.get('isbn')
            if "_isbn_addanother" in self.data:
                # Re-display the form as if "save and add another"
                # had been pressed.
                self.data = self.data.copy()
                self.data['_addanother'] = 42
                self.request.POST = self.data
            if isbn:
                # Try each provider in turn until one succeeds.
                scrap_result = self.download_data_isbndb(isbn)
                if not scrap_result:
                    scrap_result = self.download_data_bedeteque(isbn)
                if not scrap_result:
                    scrap_result = self.download_data_google(isbn)
                if not scrap_result \
                        and not self.download_data_openlibrary(isbn):
                    self.add_error('isbn', _("This ISBN is not found."))
                    return self.cleaned_data

                if self.cleaned_data['title']:
                    # Strip the "(AUT) " marker from the title.
                    self.cleaned_data['title'] = re.sub(
                        r'\(AUT\) ', '', self.cleaned_data['title'])
                if self.cleaned_data['authors']:
                    self.cleaned_data['side_identifier'] = \
                        generate_side_identifier(
                            self.cleaned_data["title"],
                            self.cleaned_data["authors"],
                            self.cleaned_data["subtitle"],
                        )
        return self.cleaned_data

    def _clean_fields(self):
        """
        Clean the ISBN field first: when an ISBN lookup is requested, the
        other fields are about to be overwritten by the downloaded data,
        so their individual validation is skipped.
        """
        isbn_field = self.fields['isbn']
        isbn = isbn_field.widget.value_from_datadict(
            self.data, self.files, self.add_prefix('isbn'))
        isbn = isbn_field.clean(isbn)
        self.cleaned_data['isbn'] = isbn
        for name, field in self.fields.items():
            # value_from_datadict() gets the data from the data
            # dictionaries. Each widget type knows how to retrieve its
            # own data, because some widgets split data over several
            # HTML fields.
            if field.disabled:
                value = self.get_initial_for_field(field, name)
            else:
                value = field.widget.value_from_datadict(
                    self.data, self.files, self.add_prefix(name))
            try:
                # We don't want to check a field when we enter an ISBN.
                if "isbn" not in self.data \
                        or not self.cleaned_data.get('isbn'):
                    value = field.clean(value)
                    self.cleaned_data[name] = value
                if hasattr(self, 'clean_%s' % name):
                    value = getattr(self, 'clean_%s' % name)()
                    self.cleaned_data[name] = value
            except ValidationError as e:
                self.add_error(name, e)

    class Meta:
        model = Comic
        fields = ('isbn', 'title', 'subtitle', 'external_url',
                  'side_identifier', 'authors', 'number_of_pages',
                  'publish_date', 'present', )